From: Alexander Boyko Date: Fri, 19 Jul 2019 11:07:42 +0000 (-0400) Subject: LU-12567 ptlrpc: handle reply and resend reorder X-Git-Tag: 2.14.56~191 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=f7f31f8f969f410cca0b4b8b02f81391148e01f2 LU-12567 ptlrpc: handle reply and resend reorder ptlrpc can't detect a bulk transfer timeout if rpc and bulk are reordered on router. We should fail a bulk for situations where bulk is not completed (after bulk timeout LNET_EVENT_UNLINK is set). HPE-bug-id: LUS-7445, LUS-7569 Signed-off-by: Alexander Boyko Signed-off-by: Alexey Lyashkov Change-Id: Iaf099d31f8fbc68c3edbfcff77ae424862e0adc1 Reviewed-on: https://review.whamcloud.com/35571 Reviewed-by: Andreas Dilger Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index af53016..04fe734 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -2157,7 +2157,10 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) * was good after getting the REPLY for her GET or * the ACK for her PUT. 
*/ - DEBUG_REQ(D_ERROR, req, "bulk transfer failed"); + DEBUG_REQ(D_ERROR, req, "bulk transfer failed %d/%d/%d", + req->rq_status, + req->rq_bulk->bd_nob, + req->rq_bulk->bd_nob_transferred); req->rq_status = -EIO; } diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index 90a1587..fe07a49 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -225,10 +225,9 @@ void client_bulk_callback(struct lnet_event *ev) spin_lock(&req->rq_lock); req->rq_net_err = 1; spin_unlock(&req->rq_lock); + desc->bd_failure = 1; } - if (ev->status != 0) - desc->bd_failure = 1; /* NB don't unlock till after wakeup; desc can disappear under us * otherwise */ diff --git a/lustre/target/tgt_handler.c b/lustre/target/tgt_handler.c index 882591e..62e6f2e8 100644 --- a/lustre/target/tgt_handler.c +++ b/lustre/target/tgt_handler.c @@ -2481,9 +2481,22 @@ out_lock: * to reorder. */ if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) && desc) { + /* Calculate checksum before request transfer; originally + * it is done by target_bulk_io() */ + rc = sptlrpc_svc_wrap_bulk(req, desc); + if (OCD_HAS_FLAG(&exp->exp_connect_data, BULK_MBITS)) + req->rq_mbits = lustre_msg_get_mbits(req->rq_reqmsg); + else /* old version, bulk matchbits is rq_xid */ + req->rq_mbits = req->rq_xid; + + req->rq_status = rc; + target_committed_to_req(req); + target_send_reply(req, 0, 0); + CDEBUG(D_INFO, "reorder BULK\n"); + OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2, + cfs_fail_val ? 
: 3); - ssleep(3); target_bulk_io(exp, desc); ptlrpc_free_bulk(desc); } diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index a847d88..5c88214 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -18834,26 +18834,48 @@ run_test 223 "osc reenqueue if without AGL lock granted =======================" test_224a() { # LU-1039, MRP-303 [ $PARALLEL == "yes" ] && skip "skip parallel run" - #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508 $LCTL set_param fail_loc=0x508 - dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 conv=fsync + dd if=/dev/zero of=$DIR/$tfile bs=1M count=1 conv=fsync $LCTL set_param fail_loc=0 df $DIR } run_test 224a "Don't panic on bulk IO failure" -test_224b() { # LU-1039, MRP-303 +test_224bd_sub() { # LU-1039, MRP-303 [ $PARALLEL == "yes" ] && skip "skip parallel run" + local timeout=$1 - dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 + shift + dd if=/dev/urandom of=$TMP/$tfile bs=1M count=1 + + $LFS setstripe -c 1 -i 0 $DIR/$tfile + + dd if=$TMP/$tfile of=$DIR/$tfile bs=1M count=1 cancel_lru_locks osc + set_checksums 0 + stack_trap "set_checksums $ORIG_CSUM" EXIT + local at_max_saved=0 + + # adaptive timeouts may prevent seeing the issue + if at_is_enabled; then + at_max_saved=$(at_max_get mds) + at_max_set 0 mds client + stack_trap "at_max_set $at_max_saved mds client" EXIT + fi + #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2 0x515 - $LCTL set_param fail_loc=0x515 - dd of=/dev/null if=$DIR/$tfile bs=4096 count=1 - $LCTL set_param fail_loc=0 + do_facet ost1 $LCTL set_param fail_val=$timeout fail_loc=0x80000515 + dd of=$TMP/$tfile.new if=$DIR/$tfile bs=1M count=1 || "$@" + + do_facet ost1 $LCTL set_param fail_loc=0 + cmp $TMP/$tfile $TMP/$tfile.new || error "file contents wrong" df $DIR } + +test_224b() { + test_224bd_sub 3 error "dd failed" +} run_test 224b "Don't panic on bulk IO failure" test_224c() { # LU-6441 @@ -18894,6 +18916,11 @@ test_224c() { # LU-6441 } run_test 224c "Don't hang if one of md lost during large bulk RPC" 
+test_224d() { # LU-11169 + test_224bd_sub $((TIMEOUT + 2)) error "dd failed" +} +run_test 224d "Don't corrupt data on bulk IO timeout" + MDSSURVEY=${MDSSURVEY:-$(which mds-survey 2>/dev/null || true)} test_225a () { [ $PARALLEL == "yes" ] && skip "skip parallel run"