From: Fan Yong
Date: Tue, 21 Jun 2016 19:12:26 +0000 (+0800)
Subject: LU-8472 scrub: try to avoid recovery during OI scrub
X-Git-Tag: 2.8.59~33
X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=a41c6fad4672a60166088b9ad8aeb4f1b51c38e7;ds=sidebyside

LU-8472 scrub: try to avoid recovery during OI scrub

It is a known issue that FID-based operations will hit -EINPROGRESS or
-EREMCHG failures if the related OI mapping is invalid (in most cases
because of a file-level backup/restore). On the other hand, the
recovery of cross-MDT modifications triggers FID-based operation(s)
before OI scrub has rebuilt the related OI mappings. So during
sanity-scrub tests, the scripts should avoid cross-MDT recovery by
syncing all transactions before the file-level backup. Also print a
more explicit warning message when recovery fails because of bad OI
mappings.

Another fix sets the LOC_F_NEW flag for objects to be created via
out_create().

Test-Parameters: mdtfilesystemtype=ldiskfs mdsfilesystemtype=ldiskfs ostfilesystemtype=ldiskfs mdscount=2 mdtcount=4 testlist=sanity-scrub,sanity-scrub,sanity-scrub
Signed-off-by: Fan Yong
Change-Id: I6e8bc9c5d587be72ecd7e33fa7e9959fe5b34006
Reviewed-on: http://review.whamcloud.com/21918
Tested-by: Jenkins
Tested-by: Maloo
Reviewed-by: Jian Yu
Reviewed-by: Andreas Dilger
Reviewed-by: Oleg Drokin
---

diff --git a/lustre/target/out_handler.c b/lustre/target/out_handler.c
index 2a3f610..d9ba4dc 100644
--- a/lustre/target/out_handler.c
+++ b/lustre/target/out_handler.c
@@ -1057,7 +1057,7 @@ int out_handle(struct tgt_session_info *tsi)
 	reply->ourp_count = updates;
 	tti->tti_u.update.tti_update_reply = reply;
 	tti->tti_mult_trans = !req_is_replay(tgt_ses_req(tsi));
-
+
 	/* Walk through updates in the request to execute them */
 	for (i = 0; i < update_buf_count; i++) {
 		struct tgt_handler *h;
@@ -1069,9 +1069,16 @@ int out_handle(struct tgt_session_info *tsi)
 		our = update_bufs[i];
 		update_count = our->ourq_count;
 		for (j = 0; j < update_count; j++) {
+			struct lu_object_conf conf;
+
 			update = object_update_request_get(our, j, NULL);
+			if (update->ou_type == OUT_CREATE)
+				conf.loc_flags = LOC_F_NEW;
+			else
+				conf.loc_flags = 0;

-			dt_obj = dt_locate(env, dt, &update->ou_fid);
+			dt_obj = dt_locate_at(env, dt, &update->ou_fid,
+				dt->dd_lu_dev.ld_site->ls_top_dev, &conf);
 			if (IS_ERR(dt_obj))
 				GOTO(out, rc = PTR_ERR(dt_obj));

diff --git a/lustre/target/update_recovery.c b/lustre/target/update_recovery.c
index 3ffecba1..85764cf 100644
--- a/lustre/target/update_recovery.c
+++ b/lustre/target/update_recovery.c
@@ -1167,6 +1167,17 @@ static int update_recovery_exec(const struct lu_env *env,
 		dt_obj = dt_locate(env, tdtd->tdtd_dt, fid);
 		if (IS_ERR(dt_obj)) {
 			rc = PTR_ERR(dt_obj);
+			if (rc == -EREMCHG)
+				LCONSOLE_WARN("%.16s: hit invalid OI mapping "
+					      "for "DFID" during recovering, "
+					      "that may because auto scrub is "
+					      "disabled on related MDT, and "
+					      "will cause recovery failure. "
+					      "Please enable auto scrub and "
+					      "retry the recovery.\n",
+					      tdtd->tdtd_lut->lut_obd->obd_name,
+					      PFID(fid));
+
 			break;
 		}
 		sub_dt_obj = dt_object_child(dt_obj);

diff --git a/lustre/tests/sanity-scrub.sh b/lustre/tests/sanity-scrub.sh
index a137f87..29ba554 100644
--- a/lustre/tests/sanity-scrub.sh
+++ b/lustre/tests/sanity-scrub.sh
@@ -143,6 +143,18 @@ scrub_prep() {
 	done
 	echo "prepared $(date)."
 	cleanup_mount $MOUNT > /dev/null || error "Fail to stop client!"
+
+	# sync local transactions on every MDT
+	do_nodes $(comma_list $(mdts_nodes)) \
+		"$LCTL set_param -n osd*.*MDT*.force_sync=1"
+
+	# wait for a while to cancel update logs after transactions committed.
+	sleep 3
+
+	# sync again to guarantee all things done.
+	do_nodes $(comma_list $(mdts_nodes)) \
+		"$LCTL set_param -n osd*.*MDT*.force_sync=1"
+
 	for n in $(seq $MDSCOUNT); do
 		echo "stop mds$n"
 		stop mds$n > /dev/null || error "Fail to stop MDS$n!"
@@ -667,16 +679,24 @@ test_5() {
 	$LCTL set_param fail_val=3 fail_loc=0x190

 	local n
+	declare -a pids
+
 	for n in $(seq $MDSCOUNT); do
-		stat $DIR/$tdir/mds$n/${tfile}800 ||
-			error "(17) Failed to stat mds$n/${tfile}800"
+		stat $DIR/$tdir/mds$n/${tfile}800 &
+		pids[$n]=$!
 	done

-	scrub_check_status 18 scanning
+	sleep 3
+
+	scrub_check_status 17 scanning
 	do_nodes $(comma_list $(mdts_nodes)) \
 		$LCTL set_param fail_loc=0 fail_val=0

+	for n in $(seq $MDSCOUNT); do
+		wait ${pids[$n]} || error "(18) Fail to stat mds$n/${tfile}800"
+	done
+
 	scrub_check_status 19 completed
 	scrub_check_flags 20 ""
 }
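
For reference, the transaction-sync step added to scrub_prep() above can
be expressed as a standalone helper. This is only a minimal sketch,
assuming the Lustre test-framework helpers (do_nodes, comma_list,
mdts_nodes and $LCTL) are already sourced as sanity-scrub.sh does; the
sync_mdt_trans name is hypothetical and not part of the patch:

sync_mdt_trans() {
	# hypothetical helper mirroring the scrub_prep() change above
	local mdts=$(comma_list $(mdts_nodes))

	# sync local transactions on every MDT
	do_nodes $mdts "$LCTL set_param -n osd*.*MDT*.force_sync=1"

	# wait for a while so update logs of the committed transactions
	# can be cancelled, then sync again to cover that work as well
	sleep 3
	do_nodes $mdts "$LCTL set_param -n osd*.*MDT*.force_sync=1"
}

A caller would invoke such a helper before taking the file-level backup,
so that no cross-MDT updates remain to be replayed when the MDTs are
restarted and OI scrub has not yet rebuilt the OI mappings.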