LU-8472 scrub: try to avoid recovery during OI scrub

author Fan Yong <fan.yong@intel.com>

Tue, 21 Jun 2016 19:12:26 +0000 (03:12 +0800)

committer Oleg Drokin <oleg.drokin@intel.com>

Mon, 26 Sep 2016 15:19:10 +0000 (15:19 +0000)
author Fan Yong <fan.yong@intel.com>
Tue, 21 Jun 2016 19:12:26 +0000 (03:12 +0800)
committer Oleg Drokin <oleg.drokin@intel.com>
Mon, 26 Sep 2016 15:19:10 +0000 (15:19 +0000)
diff --git a/lustre/target/out_handler.c b/lustre/target/out_handler.c

index 2a3f610..d9ba4dc 100644 (file)
--- a/lustre/target/out_handler.c
+++ b/lustre/target/out_handler.c
@@ -1057,7 +1057,7 @@ int out_handle(struct tgt_session_info *tsi)
         reply->ourp_count = updates;
         tti->tti_u.update.tti_update_reply = reply;
         tti->tti_mult_trans = !req_is_replay(tgt_ses_req(tsi));
- 
+
         /* Walk through updates in the request to execute them */
         for (i = 0; i < update_buf_count; i++) {
                 struct tgt_handler      *h;
@@ -1069,9 +1069,16 @@ int out_handle(struct tgt_session_info *tsi)
                 our = update_bufs[i];
                 update_count = our->ourq_count;
                 for (j = 0; j < update_count; j++) {
+                       struct lu_object_conf conf;
+
                         update = object_update_request_get(our, j, NULL);
+                       if (update->ou_type == OUT_CREATE)
+                               conf.loc_flags = LOC_F_NEW;
+                       else
+                               conf.loc_flags = 0;
  
-                       dt_obj = dt_locate(env, dt, &update->ou_fid);
+                       dt_obj = dt_locate_at(env, dt, &update->ou_fid,
+                               dt->dd_lu_dev.ld_site->ls_top_dev, &conf);
                         if (IS_ERR(dt_obj))
                                 GOTO(out, rc = PTR_ERR(dt_obj));
  
diff --git a/lustre/target/update_recovery.c b/lustre/target/update_recovery.c

index 3ffecba..85764cf 100644 (file)
--- a/lustre/target/update_recovery.c
+++ b/lustre/target/update_recovery.c
@@ -1167,6 +1167,17 @@ static int update_recovery_exec(const struct lu_env *env,
                 dt_obj = dt_locate(env, tdtd->tdtd_dt, fid);
                 if (IS_ERR(dt_obj)) {
                         rc = PTR_ERR(dt_obj);
+                       if (rc == -EREMCHG)
+                               LCONSOLE_WARN("%.16s: hit invalid OI mapping "
+                                             "for "DFID" during recovering, "
+                                             "that may because auto scrub is "
+                                             "disabled on related MDT, and "
+                                             "will cause recovery failure. "
+                                             "Please enable auto scrub and "
+                                             "retry the recovery.\n",
+                                             tdtd->tdtd_lut->lut_obd->obd_name,
+                                             PFID(fid));
+
                         break;
                 }
                 sub_dt_obj = dt_object_child(dt_obj);
diff --git a/lustre/tests/sanity-scrub.sh b/lustre/tests/sanity-scrub.sh

index a137f87..29ba554 100644 (file)
--- a/lustre/tests/sanity-scrub.sh
+++ b/lustre/tests/sanity-scrub.sh
@@ -143,6 +143,18 @@ scrub_prep() {
         done
         echo "prepared $(date)."
         cleanup_mount $MOUNT > /dev/null || error "Fail to stop client!"
+
+       # sync local transactions on every MDT
+       do_nodes $(comma_list $(mdts_nodes)) \
+               "$LCTL set_param -n osd*.*MDT*.force_sync=1"
+
+       # wait for a while to cancel update logs after transactions committed.
+       sleep 3
+
+       # sync again to guarantee all things done.
+       do_nodes $(comma_list $(mdts_nodes)) \
+               "$LCTL set_param -n osd*.*MDT*.force_sync=1"
+
         for n in $(seq $MDSCOUNT); do
                 echo "stop mds$n"
                 stop mds$n > /dev/null || error "Fail to stop MDS$n!"
@@ -667,16 +679,24 @@ test_5() {
                 $LCTL set_param fail_val=3 fail_loc=0x190
  
         local n
+       declare -a pids
+
         for n in $(seq $MDSCOUNT); do
-               stat $DIR/$tdir/mds$n/${tfile}800 ||
-                       error "(17) Failed to stat mds$n/${tfile}800"
+               stat $DIR/$tdir/mds$n/${tfile}800 &
+               pids[$n]=$!
         done
  
-       scrub_check_status 18 scanning
+       sleep 3
+
+       scrub_check_status 17 scanning
  
         do_nodes $(comma_list $(mdts_nodes)) \
                 $LCTL set_param fail_loc=0 fail_val=0
  
+       for n in $(seq $MDSCOUNT); do
+               wait ${pids[$n]} || error "(18) Fail to stat mds$n/${tfile}800"
+       done
+
         scrub_check_status 19 completed
         scrub_check_flags 20 ""
  }
author	Fan Yong <fan.yong@intel.com>
	Tue, 21 Jun 2016 19:12:26 +0000 (03:12 +0800)
committer	Oleg Drokin <oleg.drokin@intel.com>
	Mon, 26 Sep 2016 15:19:10 +0000 (15:19 +0000)
lustre/target/out_handler.c		patch | blob | history
lustre/target/update_recovery.c		patch | blob | history
lustre/tests/sanity-scrub.sh		patch | blob | history