From 978458e05db4cad21e3ee32384168f53fd3e2d72 Mon Sep 17 00:00:00 2001
From: Fan Yong <fan.yong@intel.com>
Date: Fri, 14 Aug 2015 11:01:50 +0800
Subject: [PATCH] LU-7190 lfsck: tolerate MDT-OST communication failures

During the 2nd phase scanning, the layout LFSCK slave engine on the
OST periodically queries the master engine status from the MDT.
Sometimes the query RPC fails, which may be caused by network
trouble or by problems on the MDS node. So that the LFSCK can go
ahead, the slave engine will not wait forever; instead, it assumes
that the master engine has exited without notifying (or failed to
notify) the slave engine. The slave engine then exits as well and
cleans up the LFSCK environment on the OST, including the OST-object
access bitmap that is used to find orphan OST-objects.

On the other hand, the assumption that the master engine has exited
may be wrong. If the master engine has not exited, and the network
trouble between the MDS and OSS is resolved after the slave engine
has exited, then the master engine will try to find orphan
OST-objects during its 2nd phase scanning. But because the slave
engine has already exited and released the OST-object access bitmap,
the master engine has no way to find the orphan OST-objects.

To avoid the above trouble, we make a compromise: when the slave
engine on the OST fails to query the master engine status, it does
not exit at once; instead, it retries several times. If the network
trouble is resolved during that interval, the LFSCK goes ahead;
otherwise, the slave engine exits as it originally did.

Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: Ifa06552c61d885297a54ab6bfdc92d48c8f56fa3
Reviewed-on: http://review.whamcloud.com/16667
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
---
 lustre/lfsck/lfsck_layout.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/lustre/lfsck/lfsck_layout.c b/lustre/lfsck/lfsck_layout.c
index 903b4d3..0206c5b 100644
--- a/lustre/lfsck/lfsck_layout.c
+++ b/lustre/lfsck/lfsck_layout.c
@@ -72,6 +72,8 @@ struct lfsck_layout_slave_target {
 	__u64			llst_gen;
 	atomic_t		llst_ref;
 	__u32			llst_index;
+	/* How many times we have failed to get the master status. */
+	int			llst_failures;
 };
 
 struct lfsck_layout_slave_data {
@@ -3442,11 +3444,22 @@ lfsck_layout_slave_async_interpret(const struct lu_env *env,
 	bool				      done  = false;
 
 	if (rc != 0) {
-		/* It is quite probably caused by target crash,
-		 * to make the LFSCK can go ahead, assume that
-		 * the target finished the LFSCK prcoessing. */
-		done = true;
+		/* It is probably caused by network trouble or a target crash.
+		 * Retry several times (the limit depends on obd_timeout, and
+		 * is no fewer than 3). But so that the LFSCK can go ahead, we
+		 * should not retry forever: if the failures persist after
+		 * the retries, assume that the target has exited the LFSCK
+		 * processing and stop trying. */
+		if (rc == -ENOTCONN || rc == -ESHUTDOWN) {
+			int max_try = max_t(int, obd_timeout / 30, 3);
+
+			if (++(llst->llst_failures) > max_try)
+				done = true;
+		} else {
+			done = true;
+		}
 	} else {
+		llst->llst_failures = 0;
 		lr = req_capsule_server_get(&req->rq_pill, &RMF_LFSCK_REPLY);
 		if (lr->lr_status != LS_SCANNING_PHASE1 &&
 		    lr->lr_status != LS_SCANNING_PHASE2)
@@ -3455,8 +3468,9 @@ lfsck_layout_slave_async_interpret(const struct lu_env *env,
 
 	if (done) {
 		CDEBUG(D_LFSCK, "%s: layout LFSCK slave gets the MDT %x "
-		       "status %d\n", lfsck_lfsck2name(com->lc_lfsck),
-		       llst->llst_index, lr != NULL ? lr->lr_status : rc);
+		       "status %d, failures_try %d\n", lfsck_lfsck2name(com->lc_lfsck),
+		       llst->llst_index, lr != NULL ? lr->lr_status : rc,
+		       llst->llst_failures);
 
 		lfsck_layout_llst_del(llsd, llst);
 	}
-- 
1.8.3.1