From 2fe2d1e82005746180309d9b79057a418a729e54 Mon Sep 17 00:00:00 2001
From: Di Wang
Date: Tue, 8 Dec 2015 09:28:14 -0800
Subject: [PATCH] LU-7531 osp: allow a few requests during recovery

Allow OSP requests during recovery so that recovery threads are not
blocked when the remote target is also recovering; otherwise the two
targets might deadlock.

Add replay-single.sh test 117 to verify this case.

Signed-off-by: Di Wang
Change-Id: Iad3b6fd382d76c9bc042096c51cfac0a0d33091d
Reviewed-on: http://review.whamcloud.com/17539
Tested-by: Jenkins
Reviewed-by: Alex Zhuravlev
Tested-by: Maloo
Reviewed-by: Lai Siyao
Reviewed-by: Oleg Drokin
---
 lustre/ldlm/ldlm_lib.c        |  7 ++++++-
 lustre/ldlm/ldlm_request.c    | 15 +++++++++++++++
 lustre/osp/osp_md_object.c    | 10 ++++++++++
 lustre/osp/osp_object.c       |  8 ++++++++
 lustre/tests/replay-single.sh | 35 +++++++++++++++++++++++++++++++++++
 5 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c
index dd518e5..13e3e59 100644
--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -2447,14 +2447,19 @@ static int target_start_recovery_thread(struct lu_target *lut,
 	struct obd_device *obd = lut->lut_obd;
 	int rc = 0;
 	struct target_recovery_data *trd = &obd->obd_recovery_data;
+	int index;
 
 	memset(trd, 0, sizeof(*trd));
 	init_completion(&trd->trd_starting);
 	init_completion(&trd->trd_finishing);
 	trd->trd_recovery_handler = handler;
 
+	rc = server_name2index(obd->obd_name, &index, NULL);
+	if (rc < 0)
+		return rc;
+
 	if (!IS_ERR(kthread_run(target_recovery_thread,
-				lut, "tgt_recov"))) {
+				lut, "tgt_recover_%d", index))) {
 		wait_for_completion(&trd->trd_starting);
 		LASSERT(obd->obd_recovering != 0);
 	} else {
diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c
index b0a99ce..8c7b8fa 100644
--- a/lustre/ldlm/ldlm_request.c
+++ b/lustre/ldlm/ldlm_request.c
@@ -1224,6 +1224,21 @@ int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels,
 		GOTO(out, rc);
 	}
 
+	/* If an OSP wants to cancel a cross-MDT lock, do not block it
+	 * during recovery; otherwise the lock is never released, and
+	 * if the remote target is also in recovery and needs this
+	 * lock, the two targets might deadlock.
+	 */
+	if (exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS &&
+	    exp->exp_obd->obd_lu_dev != NULL &&
+	    exp->exp_obd->obd_lu_dev->ld_site != NULL) {
+		struct lu_device *top_dev;
+
+		top_dev = exp->exp_obd->obd_lu_dev->ld_site->ls_top_dev;
+		if (top_dev != NULL &&
+		    top_dev->ld_obd->obd_recovering)
+			req->rq_allow_replay = 1;
+	}
+
 	req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
 	req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
 	ptlrpc_at_set_req_timeout(req);
diff --git a/lustre/osp/osp_md_object.c b/lustre/osp/osp_md_object.c
index 69c815c..643e31a 100644
--- a/lustre/osp/osp_md_object.c
+++ b/lustre/osp/osp_md_object.c
@@ -885,6 +885,7 @@ static int osp_md_object_lock(const struct lu_env *env,
 	struct ldlm_res_id	*res_id;
 	struct dt_device	*dt_dev = lu2dt_dev(dt->do_lu.lo_dev);
 	struct osp_device	*osp = dt2osp_dev(dt_dev);
+	struct lu_device	*top_device;
 	struct ptlrpc_request	*req;
 	int			rc = 0;
 	__u64			flags = 0;
@@ -907,6 +908,15 @@ static int osp_md_object_lock(const struct lu_env *env,
 	if (IS_ERR(req))
 		RETURN(PTR_ERR(req));
 
+	/* During recovery the OSP must be allowed to send the enqueue
+	 * without checking the recovery status, in case the other
+	 * target is being recovered at the same time; if we waited
+	 * here for the import to recover, the two targets might
+	 * deadlock. */
+	top_device = dt_dev->dd_lu_dev.ld_site->ls_top_dev;
+	if (top_device->ld_obd->obd_recovering)
+		req->rq_allow_replay = 1;
+
 	rc = ldlm_cli_enqueue(osp->opd_exp, &req, einfo, res_id,
 			      (const union ldlm_policy_data *)policy,
 			      &flags, NULL, 0, LVB_T_NONE, lh, 0);
diff --git a/lustre/osp/osp_object.c b/lustre/osp/osp_object.c
index 033044d..c3d8408 100644
--- a/lustre/osp/osp_object.c
+++ b/lustre/osp/osp_object.c
@@ -1690,6 +1690,7 @@ static int osp_it_fetch(const struct lu_env *env, struct osp_it *it)
 	struct lu_device	*dev	= it->ooi_obj->do_lu.lo_dev;
 	struct osp_device	*osp	= lu2osp_dev(dev);
 	struct page		**pages;
+	struct lu_device	*top_device;
 	struct ptlrpc_request	*req	= NULL;
 	struct ptlrpc_bulk_desc *desc;
 	struct idx_info		*ii;
@@ -1725,6 +1726,13 @@ static int osp_it_fetch(const struct lu_env *env, struct osp_it *it)
 		RETURN(rc);
 	}
 
+	/* Allow this request during recovery; otherwise, if the
+	 * remote target is also in recovery, the two targets
+	 * might deadlock. */
+	top_device = dev->ld_site->ls_top_dev;
+	if (top_device->ld_obd->obd_recovering)
+		req->rq_allow_replay = 1;
+
 	req->rq_request_portal = OUT_PORTAL;
 	ii = req_capsule_client_get(&req->rq_pill, &RMF_IDX_INFO);
 	memset(ii, 0, sizeof(*ii));
diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh
index 7855b67..a775227 100755
--- a/lustre/tests/replay-single.sh
+++ b/lustre/tests/replay-single.sh
@@ -4294,6 +4294,41 @@ test_116b() {
 }
 run_test 116b "large update log slave MDT recovery"
 
+test_117() {
+	[ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
+	([ $FAILURE_MODE == "HARD" ] &&
+	 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
+		skip "MDTs need to be on different hosts for HARD fail mode" &&
+		return 0
+	local index
+	local mds_facets
+
+	mkdir -p $DIR/$tdir
+	$LFS setdirstripe -i0 -c$MDSCOUNT $DIR/$tdir/remote_dir
+	$LFS setdirstripe -i1 -c$MDSCOUNT $DIR/$tdir/remote_dir_1
+	sleep 2
+
+	# Set a replay barrier (read-only) on every MDT, so the client
+	# will resend its requests to all MDTs and they will be
+	# replayed at the same time. This verifies that recovery
+	# does not deadlock in this case; see LU-7531.
+	for ((index = 0; index < MDSCOUNT; index++)); do
+		replay_barrier mds$((index + 1))
+		if [ -z "$mds_facets" ]; then
+			mds_facets="mds$((index + 1))"
+		else
+			mds_facets="${mds_facets},mds$((index + 1))"
+		fi
+	done
+
+	rm -rf $DIR/$tdir/remote_dir
+	rm -rf $DIR/$tdir/remote_dir_1
+
+	fail $mds_facets
+
+	rm -rf $DIR/$tdir || error "rmdir failed"
+}
+run_test 117 "DNE: cross MDT unlink, fail all MDTs"
 
 complete $SECONDS
 check_and_cleanup_lustre
-- 
1.8.3.1
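The same guard appears three times in this patch: before sending an OSP request, walk from the sending device up to the top-level device of the local stack, and if that target is itself still recovering, set rq_allow_replay on the outgoing request so it is not blocked waiting for the remote import to finish recovery. The standalone C sketch below is illustrative only, not code from the patch: the struct definitions are simplified stand-ins for the real Lustre types (struct obd_device, struct lu_site, struct lu_device, struct ptlrpc_request) carrying only the fields the check touches, and osp_allow_during_recovery() is a hypothetical helper name.

/* Minimal model of the recovery guard this patch adds in
 * ldlm_cli_cancel_req(), osp_md_object_lock(), and osp_it_fetch().
 * Stand-in types; only the fields used by the check are modeled. */
#include <stdio.h>

struct obd_device {
	int obd_recovering;		/* target is in recovery */
};

struct lu_device;

struct lu_site {
	struct lu_device *ls_top_dev;	/* top device of the local stack */
};

struct lu_device {
	struct obd_device *ld_obd;
	struct lu_site *ld_site;
};

struct ptlrpc_request {
	unsigned int rq_allow_replay:1;	/* let request through recovery */
};

/* Hypothetical helper mirroring the guard: if the local top-level
 * target is recovering, allow this request despite recovery. */
static void osp_allow_during_recovery(struct lu_device *dev,
				      struct ptlrpc_request *req)
{
	struct lu_device *top_device = dev->ld_site->ls_top_dev;

	if (top_device != NULL && top_device->ld_obd->obd_recovering)
		req->rq_allow_replay = 1;
}

int main(void)
{
	struct obd_device top_obd = { .obd_recovering = 1 };
	struct lu_device top = { .ld_obd = &top_obd, .ld_site = NULL };
	struct lu_site site = { .ls_top_dev = &top };
	struct lu_device osp_dev = { .ld_obd = NULL, .ld_site = &site };
	struct ptlrpc_request req = { 0 };

	osp_allow_during_recovery(&osp_dev, &req);
	printf("rq_allow_replay = %d\n", (int)req.rq_allow_replay);
	return 0;
}

Built with any C99 compiler, this prints rq_allow_replay = 1 when the top-level device is recovering, which is exactly the condition under which the patch lets lock cancels, enqueues, and index fetches proceed instead of queuing them behind recovery.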