From: Alexander Boyko <alexander_boyko@xyratex.com>
Date: Wed, 29 Oct 2014 21:59:12 +0000 (-0700)
Subject: LU-5079 ptlrpc: fix early reply timeout for recovery
X-Git-Tag: 2.5.3.90~70
X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=refs%2Fchanges%2F65%2F12365%2F4;p=fs%2Flustre-release.git

LU-5079 ptlrpc: fix early reply timeout for recovery

Commit 8b2f9c0e408 http://review.whamcloud.com/9100 changed
the deadline calculation from current time to request arrival.
During recovery, the new deadline could be less than calculated
at request arrival time. And even worse, the deadline may be
in past. For the first case unneeded early reply would be sent
to client. For the second, client requests would be dropped by
timeout and client reconnect happend.

Do at_measured() for recovery in the same way like general early
reply base on the current time. And set new timeout to the end
of recovery.

Test to check recovery deadline bug.

The patch also speeds up replay-vbr test 7* by decreasing the
at_max value in test_7_cycle().

This patch is back-ported from the following one:
Lustre-commit: 1d889090f2e2902d861d1fab0227c4343127cc42
Lustre-change: http://review.whamcloud.com/11213

Test-Parameters: alwaysuploadlogs \
envdefinitions=SLOW=yes,ENABLE_QUOTA=yes \
mdtfilesystemtype=ldiskfs mdsfilesystemtype=ldiskfs ostfilesystemtype=ldiskfs \
clientdistro=el6 ossdistro=el6 mdsdistro=el6 \
mdtcount=1 mdssizegb=2 ostcount=7 ostsizegb=2 \
testlist=replay-vbr,replay-vbr

Xyratex-bug-id: MRP-1988

Change-Id: I7a5d60ca8e52a805546cc9934eb42bddeaf8ff4e
Signed-off-by: Alexander Boyko <alexander_boyko@xyratex.com>
Signed-off-by: Jian Yu <jian.yu@intel.com>
Reviewed-on: http://review.whamcloud.com/12365
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: James Simmons <uja.ornl@gmail.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
---

diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h
index ace96fa..a43bfbd 100644
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -419,6 +419,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type,
 #define OBD_FAIL_TGT_LAST_REPLAY         0x710
 #define OBD_FAIL_TGT_CLIENT_ADD          0x711
 #define OBD_FAIL_TGT_RCVG_FLAG           0x712
+#define OBD_FAIL_TGT_REPLAY_DELAY2	 0x714
 
 #define OBD_FAIL_MDC_REVALIDATE_PAUSE    0x800
 #define OBD_FAIL_MDC_ENQUEUE_PAUSE       0x801
diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c
index eb8a58b..bfae4cc 100644
--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -1829,17 +1829,19 @@ repeat:
 
 static struct ptlrpc_request *target_next_replay_req(struct obd_device *obd)
 {
-        struct ptlrpc_request *req = NULL;
-        ENTRY;
+	struct ptlrpc_request *req = NULL;
+	ENTRY;
 
-        CDEBUG(D_HA, "Waiting for transno "LPD64"\n",
-               obd->obd_next_recovery_transno);
+	CDEBUG(D_HA, "Waiting for transno "LPD64"\n",
+		obd->obd_next_recovery_transno);
 
-        if (target_recovery_overseer(obd, check_for_next_transno,
-                                     exp_req_replay_healthy)) {
-                abort_req_replay_queue(obd);
-                abort_lock_replay_queue(obd);
-        }
+	CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_DELAY2, cfs_fail_val);
+
+	if (target_recovery_overseer(obd, check_for_next_transno,
+				     exp_req_replay_healthy)) {
+		abort_req_replay_queue(obd);
+		abort_lock_replay_queue(obd);
+	}
 
 	spin_lock(&obd->obd_recovery_task_lock);
 	if (!cfs_list_empty(&obd->obd_req_replay_queue)) {
diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c
index a74d51d..22bc111 100644
--- a/lustre/ptlrpc/service.c
+++ b/lustre/ptlrpc/service.c
@@ -1313,16 +1313,18 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
                 RETURN(-ENOSYS);
         }
 
-        if (req->rq_export &&
-            lustre_msg_get_flags(req->rq_reqmsg) &
-            (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) {
-                /* During recovery, we don't want to send too many early
-                 * replies, but on the other hand we want to make sure the
-                 * client has enough time to resend if the rpc is lost. So
-                 * during the recovery period send at least 4 early replies,
-                 * spacing them every at_extra if we can. at_estimate should
-                 * always equal this fixed value during recovery. */
-		at_measured(&svcpt->scp_at_estimate, min(at_extra,
+	if (req->rq_export &&
+	    lustre_msg_get_flags(req->rq_reqmsg) &
+	    (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) {
+		/* During recovery, we don't want to send too many early
+		 * replies, but on the other hand we want to make sure the
+		 * client has enough time to resend if the rpc is lost. So
+		 * during the recovery period send at least 4 early replies,
+		 * spacing them every at_extra if we can. at_estimate should
+		 * always equal this fixed value during recovery. */
+		at_measured(&svcpt->scp_at_estimate,
+			    cfs_time_current_sec() -
+			    req->rq_arrival_time.tv_sec + min(at_extra,
 			    req->rq_export->exp_obd->obd_recovery_timeout / 4));
 	} else {
 		/* We want to extend the request deadline by at_extra seconds,
@@ -1336,17 +1338,17 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
 			    cfs_time_current_sec() -
 			    req->rq_arrival_time.tv_sec);
 
-		/* Check to see if we've actually increased the deadline -
-		 * we may be past adaptive_max */
-		if (req->rq_deadline >= req->rq_arrival_time.tv_sec +
-		    at_get(&svcpt->scp_at_estimate)) {
-			DEBUG_REQ(D_WARNING, req, "Couldn't add any time "
-				  "(%ld/%ld), not sending early reply\n",
-				  olddl, req->rq_arrival_time.tv_sec +
-				  at_get(&svcpt->scp_at_estimate) -
-				  cfs_time_current_sec());
-			RETURN(-ETIMEDOUT);
-		}
+	}
+	/* Check to see if we've actually increased the deadline -
+	 * we may be past adaptive_max */
+	if (req->rq_deadline >= req->rq_arrival_time.tv_sec +
+	    at_get(&svcpt->scp_at_estimate)) {
+		DEBUG_REQ(D_WARNING, req, "Couldn't add any time "
+			  "(%ld/%ld), not sending early reply\n",
+			  olddl, req->rq_arrival_time.tv_sec +
+			  at_get(&svcpt->scp_at_estimate) -
+			  cfs_time_current_sec());
+		RETURN(-ETIMEDOUT);
 	}
 
 	reqcopy = ptlrpc_request_cache_alloc(GFP_NOFS);
diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh
index 42c39fd..52d894e 100755
--- a/lustre/tests/replay-ost-single.sh
+++ b/lustre/tests/replay-ost-single.sh
@@ -396,6 +396,34 @@ test_8e() {
 }
 run_test 8e "Verify that ptlrpc resends request on -EINPROGRESS"
 
+test_9() {
+	local server_version=$(lustre_version_code ost1)
+
+	[[ $server_version -ge $(version_code 2.6.55) ]] ||
+		[[ $server_version -ge $(version_code 2.5.27) &&
+		   $server_version -lt $(version_code 2.5.50) ]] ||
+		[[ $server_version -ge $(version_code 2.5.3) &&
+		   $server_version -lt $(version_code 2.5.11) ]] ||
+		{ skip "Need OST version 2.5.4+ or 2.5.27+ or 2.6.55+"; return; }
+
+	$SETSTRIPE -i 0 -c 1 $DIR/$tfile
+	replay_barrier ost1
+	# do IO
+	dd if=/dev/zero of=$DIR/$tfile count=1 bs=1M > /dev/null ||
+		error "failed to write"
+	# failover, replay and resend replayed waiting request
+	#define OBD_FAIL_TGT_REPLAY_DELAY2       0x714
+	do_facet ost1 $LCTL set_param fail_loc=0x00000714
+	do_facet ost1 $LCTL set_param fail_val=$TIMEOUT
+	fail ost1
+	do_facet ost1 $LCTL set_param fail_loc=0
+	do_facet ost1 "dmesg | tail -n 100" |\
+		sed -n '/no req deadline/,$ p' | grep -q 'Already past' &&
+		return 1
+	return 0
+}
+run_test 9 "Verify that no req deadline happened during recovery"
+
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status
diff --git a/lustre/tests/replay-vbr.sh b/lustre/tests/replay-vbr.sh
index 4018a9b..f633a0d 100644
--- a/lustre/tests/replay-vbr.sh
+++ b/lustre/tests/replay-vbr.sh
@@ -711,6 +711,12 @@ test_7_cycle() {
     do_facet $SINGLEMDS "$LCTL set_param mdd.${!var}.sync_permission=0"
     do_facet $SINGLEMDS "$LCTL set_param mdt.${!var}.commit_on_sharing=0"
 
+	local at_max_saved=0
+	if at_is_enabled; then
+		at_max_saved=$(at_max_get $SINGLEMDS)
+		at_max_set $TIMEOUT mds client
+	fi
+
     do_node $CLIENT1 mkdir -p $DIR/$tdir
     replay_barrier $SINGLEMDS
     # first operation
@@ -727,11 +733,12 @@ test_7_cycle() {
     # should fail as conflict expected
     client_evicted $CLIENT1 || rc=1
 
-    wait_recovery_complete $SINGLEMDS
+	wait_recovery_complete $SINGLEMDS
 	wait_mds_ost_sync || error "wait_mds_ost_sync failed"
 
-    rm -rf $DIR/$tdir
-    return $rc
+	[[ $at_max_saved -eq 0 ]] || at_max_set $at_max_saved mds client
+	rm -rf $DIR/$tdir
+	return $rc
 }
 
 test_7a() {