From: Alexander Boyko Date: Fri, 19 Jul 2019 11:07:42 +0000 (-0400) Subject: LU-12567 ptlrpc: handle reply and resend reorder X-Git-Tag: 2.14.56~191 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=f7f31f8f969f410cca0b4b8b02f81391148e01f2 LU-12567 ptlrpc: handle reply and resend reorder ptlrpc can't detect a bulk transfer timeout if rpc and bulk are reordered on router. We should fail a bulk for situations where bulk is not completed (after bulk timeout LNET_EVENT_UNLINK is set). HPE-bug-id: LUS-7445, LUS-7569 Signed-off-by: Alexander Boyko Signed-off-by: Alexey Lyashkov Change-Id: Iaf099d31f8fbc68c3edbfcff77ae424862e0adc1 Reviewed-on: https://review.whamcloud.com/35571 Reviewed-by: Andreas Dilger Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index af53016..04fe734 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -2157,7 +2157,10 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) * was good after getting the REPLY for her GET or * the ACK for her PUT. 
*/ - DEBUG_REQ(D_ERROR, req, "bulk transfer failed"); + DEBUG_REQ(D_ERROR, req, "bulk transfer failed %d/%d/%d", + req->rq_status, + req->rq_bulk->bd_nob, + req->rq_bulk->bd_nob_transferred); req->rq_status = -EIO; } diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index 90a1587..fe07a49 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -225,10 +225,9 @@ void client_bulk_callback(struct lnet_event *ev) spin_lock(&req->rq_lock); req->rq_net_err = 1; spin_unlock(&req->rq_lock); + desc->bd_failure = 1; } - if (ev->status != 0) - desc->bd_failure = 1; /* NB don't unlock till after wakeup; desc can disappear under us * otherwise */ diff --git a/lustre/target/tgt_handler.c b/lustre/target/tgt_handler.c index 882591e..62e6f2e8 100644 --- a/lustre/target/tgt_handler.c +++ b/lustre/target/tgt_handler.c @@ -2481,9 +2481,22 @@ out_lock: * to reorder. */ if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) && desc) { + /* Calculate checksum before request transfer; originally + * it is done by target_bulk_io() */ + rc = sptlrpc_svc_wrap_bulk(req, desc); + if (OCD_HAS_FLAG(&exp->exp_connect_data, BULK_MBITS)) + req->rq_mbits = lustre_msg_get_mbits(req->rq_reqmsg); + else /* old version, bulk matchbits is rq_xid */ + req->rq_mbits = req->rq_xid; + + req->rq_status = rc; + target_committed_to_req(req); + target_send_reply(req, 0, 0); + CDEBUG(D_INFO, "reorder BULK\n"); + OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2, + cfs_fail_val ? 
: 3); - ssleep(3); target_bulk_io(exp, desc); ptlrpc_free_bulk(desc); } diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index a847d88..5c88214 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -18834,26 +18834,48 @@ run_test 223 "osc reenqueue if without AGL lock granted =======================" test_224a() { # LU-1039, MRP-303 [ $PARALLEL == "yes" ] && skip "skip parallel run" - #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508 $LCTL set_param fail_loc=0x508 - dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 conv=fsync + dd if=/dev/zero of=$DIR/$tfile bs=1M count=1 conv=fsync $LCTL set_param fail_loc=0 df $DIR } run_test 224a "Don't panic on bulk IO failure" -test_224b() { # LU-1039, MRP-303 +test_224bd_sub() { # LU-1039, MRP-303 [ $PARALLEL == "yes" ] && skip "skip parallel run" + local timeout=$1 - dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 + shift + dd if=/dev/urandom of=$TMP/$tfile bs=1M count=1 + + $LFS setstripe -c 1 -i 0 $DIR/$tfile + + dd if=$TMP/$tfile of=$DIR/$tfile bs=1M count=1 cancel_lru_locks osc + set_checksums 0 + stack_trap "set_checksums $ORIG_CSUM" EXIT + local at_max_saved=0 + + # adaptive timeouts may prevent seeing the issue + if at_is_enabled; then + at_max_saved=$(at_max_get mds) + at_max_set 0 mds client + stack_trap "at_max_set $at_max_saved mds client" EXIT + fi + #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2 0x515 - $LCTL set_param fail_loc=0x515 - dd of=/dev/null if=$DIR/$tfile bs=4096 count=1 - $LCTL set_param fail_loc=0 + do_facet ost1 $LCTL set_param fail_val=$timeout fail_loc=0x80000515 + dd of=$TMP/$tfile.new if=$DIR/$tfile bs=1M count=1 || "$@" + + do_facet ost1 $LCTL set_param fail_loc=0 + cmp $TMP/$tfile $TMP/$tfile.new || error "file contents wrong" df $DIR } + +test_224b() { + test_224bd_sub 3 error "dd failed" +} run_test 224b "Don't panic on bulk IO failure" test_224c() { # LU-6441 @@ -18894,6 +18916,11 @@ test_224c() { # LU-6441 } run_test 224c "Don't hang if one of md lost during large bulk RPC" 
+test_224d() { # LU-11169 + test_224bd_sub $((TIMEOUT + 2)) error "dd failed" +} +run_test 224d "Don't corrupt data on bulk IO timeout" + MDSSURVEY=${MDSSURVEY:-$(which mds-survey 2>/dev/null || true)} test_225a () { [ $PARALLEL == "yes" ] && skip "skip parallel run"