Whamcloud - gitweb
LU-8472 scrub: try to avoid recovery during OI scrub 18/21918/4
author Fan Yong <fan.yong@intel.com>
Tue, 21 Jun 2016 19:12:26 +0000 (03:12 +0800)
committer Oleg Drokin <oleg.drokin@intel.com>
Mon, 26 Sep 2016 15:19:10 +0000 (15:19 +0000)
It is a known issue that FID-based operations will hit an -EINPROGRESS
or -EREMCHG failure if the related OI mapping is invalid (in most
cases because of a file-level backup/restore).

On the other hand, recovery of cross-MDT modifications will trigger
FID-based operation(s) before OI scrub has rebuilt the related
OI mappings.

So during the sanity-scrub tests, the scripts should avoid cross-MDT
recovery by syncing all transactions before the file-level backup.

Also add a warning message about the recovery failure when it is
caused by bad OI mappings.

Another fix sets the LOC_F_NEW flag for an object that is to be
created via out_create().

Test-Parameters: mdtfilesystemtype=ldiskfs mdsfilesystemtype=ldiskfs ostfilesystemtype=ldiskfs mdscount=2 mdtcount=4 testlist=sanity-scrub,sanity-scrub,sanity-scrub
Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: I6e8bc9c5d587be72ecd7e33fa7e9959fe5b34006
Reviewed-on: http://review.whamcloud.com/21918
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Jian Yu <jian.yu@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/target/out_handler.c
lustre/target/update_recovery.c
lustre/tests/sanity-scrub.sh

index 2a3f610..d9ba4dc 100644 (file)
@@ -1057,7 +1057,7 @@ int out_handle(struct tgt_session_info *tsi)
        reply->ourp_count = updates;
        tti->tti_u.update.tti_update_reply = reply;
        tti->tti_mult_trans = !req_is_replay(tgt_ses_req(tsi));
+
        /* Walk through updates in the request to execute them */
        for (i = 0; i < update_buf_count; i++) {
                struct tgt_handler      *h;
@@ -1069,9 +1069,16 @@ int out_handle(struct tgt_session_info *tsi)
                our = update_bufs[i];
                update_count = our->ourq_count;
                for (j = 0; j < update_count; j++) {
+                       struct lu_object_conf conf;
+
                        update = object_update_request_get(our, j, NULL);
+                       if (update->ou_type == OUT_CREATE)
+                               conf.loc_flags = LOC_F_NEW;
+                       else
+                               conf.loc_flags = 0;
 
-                       dt_obj = dt_locate(env, dt, &update->ou_fid);
+                       dt_obj = dt_locate_at(env, dt, &update->ou_fid,
+                               dt->dd_lu_dev.ld_site->ls_top_dev, &conf);
                        if (IS_ERR(dt_obj))
                                GOTO(out, rc = PTR_ERR(dt_obj));
 
index 3ffecba..85764cf 100644 (file)
@@ -1167,6 +1167,17 @@ static int update_recovery_exec(const struct lu_env *env,
                dt_obj = dt_locate(env, tdtd->tdtd_dt, fid);
                if (IS_ERR(dt_obj)) {
                        rc = PTR_ERR(dt_obj);
+                       if (rc == -EREMCHG)
+                               LCONSOLE_WARN("%.16s: hit invalid OI mapping "
+                                             "for "DFID" during recovering, "
+                                             "that may because auto scrub is "
+                                             "disabled on related MDT, and "
+                                             "will cause recovery failure. "
+                                             "Please enable auto scrub and "
+                                             "retry the recovery.\n",
+                                             tdtd->tdtd_lut->lut_obd->obd_name,
+                                             PFID(fid));
+
                        break;
                }
                sub_dt_obj = dt_object_child(dt_obj);
index a137f87..29ba554 100644 (file)
@@ -143,6 +143,18 @@ scrub_prep() {
        done
        echo "prepared $(date)."
        cleanup_mount $MOUNT > /dev/null || error "Fail to stop client!"
+
+       # sync local transactions on every MDT
+       do_nodes $(comma_list $(mdts_nodes)) \
+               "$LCTL set_param -n osd*.*MDT*.force_sync=1"
+
+       # wait for a while to cancel update logs after transactions committed.
+       sleep 3
+
+       # sync again to guarantee all things done.
+       do_nodes $(comma_list $(mdts_nodes)) \
+               "$LCTL set_param -n osd*.*MDT*.force_sync=1"
+
        for n in $(seq $MDSCOUNT); do
                echo "stop mds$n"
                stop mds$n > /dev/null || error "Fail to stop MDS$n!"
@@ -667,16 +679,24 @@ test_5() {
                $LCTL set_param fail_val=3 fail_loc=0x190
 
        local n
+       declare -a pids
+
        for n in $(seq $MDSCOUNT); do
-               stat $DIR/$tdir/mds$n/${tfile}800 ||
-                       error "(17) Failed to stat mds$n/${tfile}800"
+               stat $DIR/$tdir/mds$n/${tfile}800 &
+               pids[$n]=$!
        done
 
-       scrub_check_status 18 scanning
+       sleep 3
+
+       scrub_check_status 17 scanning
 
        do_nodes $(comma_list $(mdts_nodes)) \
                $LCTL set_param fail_loc=0 fail_val=0
 
+       for n in $(seq $MDSCOUNT); do
+               wait ${pids[$n]} || error "(18) Fail to stat mds$n/${tfile}800"
+       done
+
        scrub_check_status 19 completed
        scrub_check_flags 20 ""
 }