From: adilger Date: Fri, 12 Sep 2003 10:43:36 +0000 (+0000) Subject: Make server bulk RPC timeouts shorter than the client timeouts, so we don't X-Git-Tag: v1_7_100~3236 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=2de13382b974b8130d49d81888854c17db67b8a4 Make server bulk RPC timeouts shorter than the client timeouts, so we don't have cascading failures. Server bulk timeout is 1/4 of the client timeout. Also fix /proc variables to be int, as that is what the functions expect. b=1845 --- diff --git a/lustre/include/linux/obd_support.h b/lustre/include/linux/obd_support.h index 28a9a3d..b00de37 100644 --- a/lustre/include/linux/obd_support.h +++ b/lustre/include/linux/obd_support.h @@ -36,11 +36,11 @@ /* global variables */ extern atomic_t obd_memory; extern int obd_memmax; -extern unsigned long obd_fail_loc; -extern unsigned long obd_timeout; +extern unsigned int obd_fail_loc; +extern unsigned int obd_timeout; extern unsigned long obd_max_dirty_pages; extern char obd_lustre_upcall[128]; -extern unsigned long obd_sync_filter; +extern unsigned int obd_sync_filter; #define OBD_FAIL_MDS 0x100 #define OBD_FAIL_MDS_HANDLE_UNPACK 0x101 diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 364cf84..3c5337e 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -114,7 +114,7 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file, GOTO(cleanup_buf, rc); } - lwi = LWI_TIMEOUT(obd_timeout * HZ, mds_bulk_timeout, desc); + lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, mds_bulk_timeout, desc); rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete (desc), &lwi); if (rc) { LASSERT (rc == -ETIMEDOUT); diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 2efee5b..7ee897e 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -77,11 +77,10 @@ struct lprocfs_vars lprocfs_version[] = {{"version", obd_proc_read_version, NULL int proc_version; /* The following are visible and mutable through /proc/sys/lustre/. */ -unsigned long obd_fail_loc; -unsigned long obd_timeout = 100; -unsigned long obd_bulk_timeout = 1; +unsigned int obd_fail_loc; +unsigned int obd_timeout = 100; char obd_lustre_upcall[128] = "/usr/lib/lustre/lustre_upcall"; -unsigned long obd_sync_filter; /* = 0, don't sync by default */ +unsigned int obd_sync_filter; /* = 0, don't sync by default */ #ifdef __KERNEL__ /* opening /dev/obd */ diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 20db486..5f357580 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -412,6 +412,9 @@ static int ost_brw_read(struct ptlrpc_request *req) if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_READ_BULK)) GOTO(out, rc = -EIO); + OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK | OBD_FAIL_ONCE, + (obd_timeout + 1) / 4); + body = lustre_swab_reqbuf(req, 0, sizeof(*body), lustre_swab_ost_body); if (body == NULL) { CERROR("Missing/short ost_body\n"); @@ -494,17 +497,17 @@ static int ost_brw_read(struct ptlrpc_request *req) if (rc == 0) { rc = ptlrpc_bulk_put(desc); if (rc == 0) { - lwi = LWI_TIMEOUT(obd_timeout * HZ, ost_bulk_timeout, - desc); + lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, + ost_bulk_timeout, desc); rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete(desc), &lwi); if (rc) { LASSERT(rc == -ETIMEDOUT); - CERROR ("timeout waiting for bulk PUT\n"); + DEBUG_REQ(D_ERROR, req, "timeout on bulk PUT"); ptlrpc_abort_bulk(desc); } } else { - CERROR("ptlrpc_bulk_put failed RC: %d\n", rc); + DEBUG_REQ(D_ERROR, req, "bulk PUT failed: rc %d\n", rc); } comms_error = rc != 0; } @@ -574,7 +577,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) /* pause before transaction has been started */ OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK | OBD_FAIL_ONCE, - obd_timeout +1); + (obd_timeout + 1) / 4); swab = lustre_msg_swabbed(req->rq_reqmsg); body = lustre_swab_reqbuf(req, 0, sizeof(*body), lustre_swab_ost_body); @@ -654,17 +657,17 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) if (rc == 0) { rc = ptlrpc_bulk_get(desc); if (rc == 0) { - lwi = LWI_TIMEOUT(obd_timeout * HZ, ost_bulk_timeout, - desc); + lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, + ost_bulk_timeout, desc); rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete(desc), &lwi); if (rc) { LASSERT(rc == -ETIMEDOUT); - CERROR("timeout waiting for bulk GET\n"); + DEBUG_REQ(D_ERROR, req, "timeout on bulk GET"); ptlrpc_abort_bulk(desc); } } else { - CERROR("ptlrpc_bulk_get failed RC: %d\n", rc); + DEBUG_REQ(D_ERROR, req, "bulk GET failed: rc %d\n", rc); } comms_error = rc != 0; } diff --git a/lustre/ptlbd/rpc.c b/lustre/ptlbd/rpc.c index 9829900..f817802 100644 --- a/lustre/ptlbd/rpc.c +++ b/lustre/ptlbd/rpc.c @@ -275,7 +275,7 @@ int ptlbd_srv_rw_req(ptlbd_cmd_t cmd, __u16 index, GOTO(out_reply, rc); } - lwi = LWI_TIMEOUT(obd_timeout * HZ, NULL, desc); + lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, desc); rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete(desc), &lwi); if (rc != 0) { LASSERT(rc == -ETIMEDOUT);