From: Alexey Lyashkov Date: Thu, 26 Jan 2012 12:39:12 +0000 (+0400) Subject: LU-1039 ptlrpc: handle bulk IO errors correctly. X-Git-Tag: 2.1.58~7 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=c9590221dc43dd5e7a7ede389f0a7d9cf566e5bf LU-1039 ptlrpc: handle bulk IO errors correctly. don't panic on incorrect bulk transfer, correctly handle a bulk request reorder. LustreError: 12927:0:(client.c:1696:ptlrpc_check_set()) LBUG Pid: 12927, comm: ptlrpcd-brw Call Trace: [] libcfs_debug_dumpstack+0x55/0x80 [libcfs] [] lbug_with_loc+0x75/0xe0 [libcfs] [] ptlrpc_check_set+0x1a66/0x1b90 [ptlrpc] [] ? ptlrpcd_check+0x46/0x290 [ptlrpc] [] ? ptlrpcd_check+0x46/0x290 [ptlrpc] [] ptlrpcd_check+0x1f8/0x290 [ptlrpc] [] ptlrpcd+0x32b/0x3b0 [ptlrpc] [] ? finish_task_switch+0x48/0x110 [] ? default_wake_function+0x0/0x20 [] ? ptlrpcd+0x0/0x3b0 [ptlrpc] [] child_rip+0xa/0x20 [] ? _spin_unlock_irq+0x30/0x40 [] ? restore_args+0x0/0x30 [] ? ptlrpcd+0x0/0x3b0 [ptlrpc] [] ? child_rip+0x0/0x20 Xyratex-bug-id: MRP-303 Change-Id: Ibb19c33e92dbecf5d029c7e6c567f65fb764f444 Signed-off-by: Alexey Lyashkov Signed-off-by: Oleg Drokin Reviewed-on: http://review.whamcloud.com/2023 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Johann Lombardi --- diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 0b29238..db9e46f 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -359,6 +359,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT 0x512 #define OBD_FAIL_PTLRPC_DROP_REQ_OPC 0x513 #define OBD_FAIL_PTLRPC_FINISH_REPLAY 0x514 +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2 0x515 #define OBD_FAIL_OBD_PING_NET 0x600 #define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 4cd4b07..cbe543b 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -783,7 +783,8 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) /* Check if client was evicted while we were doing i/o before touching network */ if (rc == 0) { - rc = target_bulk_io(exp, desc, &lwi); + if (likely(!CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2))) + rc = target_bulk_io(exp, desc, &lwi); no_reply = rc != 0; } @@ -800,7 +801,7 @@ out_lock: out_tls: ost_tls_put(req); out_bulk: - if (desc) + if (desc && !CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) ptlrpc_free_bulk(desc); out: LASSERT(rc <= 0); @@ -824,6 +825,20 @@ out: exp->exp_connection->c_remote_uuid.uuid, libcfs_id2str(req->rq_peer)); } + /* send a bulk after reply to simulate a network delay or reordering + * by a router */ + if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2))) { + cfs_waitq_t waitq; + struct l_wait_info lwi1; + + CDEBUG(D_INFO, "reorder BULK\n"); + cfs_waitq_init(&waitq); + + lwi1 = LWI_TIMEOUT_INTR(cfs_time_seconds(3), NULL, NULL, NULL); + l_wait_event(waitq, 0, &lwi1); + rc = target_bulk_io(exp, desc, &lwi); + ptlrpc_free_bulk(desc); + } RETURN(rc); } diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 0a2ab28..30c7cfe 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -1690,7 +1690,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) * process the reply. Similarly if the RPC returned * an error, and therefore the bulk will never arrive. */ - if (req->rq_bulk == NULL || req->rq_status != 0) { + if (req->rq_bulk == NULL || req->rq_status < 0) { ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); GOTO(interpret, req->rq_status); } @@ -1708,7 +1708,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) * was good after getting the REPLY for her GET or * the ACK for her PUT. */ DEBUG_REQ(D_ERROR, req, "bulk transfer failed"); - LBUG(); + req->rq_status = -EIO; } ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index 5c37070..6cb8e8e 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -189,6 +189,12 @@ void client_bulk_callback (lnet_event_t *ev) ev->type == LNET_EVENT_UNLINK); LASSERT (ev->unlinked); + if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB, CFS_FAIL_ONCE)) + ev->status = -EIO; + + if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2,CFS_FAIL_ONCE)) + ev->status = -EIO; + CDEBUG((ev->status == 0) ? D_NET : D_ERROR, "event type %d, status %d, desc %p\n", ev->type, ev->status, desc); diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 7aa65ed..fae6a60 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -8586,6 +8586,26 @@ test_223 () { } run_test 223 "osc reenqueue if without AGL lock granted =======================" +test_224a() { # LU-1039, MRP-303 + #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508 + $LCTL set_param fail_loc=0x508 + dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 conv=fsync + $LCTL set_param fail_loc=0 + df $DIR +} +run_test 224a "Don't panic on bulk IO failure" + +test_224b() { # LU-1039, MRP-303 + dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 + cancel_lru_locks osc + #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2 0x515 + $LCTL set_param fail_loc=0x515 + dd of=/dev/null if=$DIR/$tfile bs=4096 count=1 + $LCTL set_param fail_loc=0 + df $DIR +} +run_test 224b "Don't panic on bulk IO failure" + # # tests that do cleanup/setup should be run at the end #