From: Fan Yong
Date: Tue, 21 Jun 2016 19:12:26 +0000 (+0800)
Subject: LU-8472 scrub: try to avoid recovery during OI scrub
X-Git-Tag: 2.8.59~33
X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=a41c6fad4672a60166088b9ad8aeb4f1b51c38e7;ds=sidebyside

LU-8472 scrub: try to avoid recovery during OI scrub

It is a known issue that FID-based operations will hit -EINPROGRESS or
-EREMCHG failures if the related OI mapping is invalid (in most cases
because of a file-level backup/restore). On the other hand, the
recovery of cross-MDT modifications triggers FID-based operation(s)
before OI scrub has rebuilt the related OI mappings. So during
sanity-scrub tests, the scripts should avoid cross-MDT recovery by
syncing all transactions before the file-level backup. Also print a
more explicit warning message when recovery fails because of bad OI
mappings.

Another fix sets the LOC_F_NEW flag for objects to be created via
out_create().

Test-Parameters: mdtfilesystemtype=ldiskfs mdsfilesystemtype=ldiskfs ostfilesystemtype=ldiskfs mdscount=2 mdtcount=4 testlist=sanity-scrub,sanity-scrub,sanity-scrub
Signed-off-by: Fan Yong
Change-Id: I6e8bc9c5d587be72ecd7e33fa7e9959fe5b34006
Reviewed-on: http://review.whamcloud.com/21918
Tested-by: Jenkins
Tested-by: Maloo
Reviewed-by: Jian Yu
Reviewed-by: Andreas Dilger
Reviewed-by: Oleg Drokin
---

diff --git a/lustre/target/out_handler.c b/lustre/target/out_handler.c
index 2a3f610..d9ba4dc 100644
--- a/lustre/target/out_handler.c
+++ b/lustre/target/out_handler.c
@@ -1057,7 +1057,7 @@ int out_handle(struct tgt_session_info *tsi)
 	reply->ourp_count = updates;
 	tti->tti_u.update.tti_update_reply = reply;
 	tti->tti_mult_trans = !req_is_replay(tgt_ses_req(tsi));
-
+
 	/* Walk through updates in the request to execute them */
 	for (i = 0; i < update_buf_count; i++) {
 		struct tgt_handler *h;
@@ -1069,9 +1069,16 @@ int out_handle(struct tgt_session_info *tsi)
 		our = update_bufs[i];
 		update_count = our->ourq_count;
 		for (j = 0; j < update_count; j++) {
+			struct lu_object_conf conf;
+
 			update = object_update_request_get(our, j, NULL);
+			if (update->ou_type == OUT_CREATE)
+				conf.loc_flags = LOC_F_NEW;
+			else
+				conf.loc_flags = 0;

-			dt_obj = dt_locate(env, dt, &update->ou_fid);
+			dt_obj = dt_locate_at(env, dt, &update->ou_fid,
+				dt->dd_lu_dev.ld_site->ls_top_dev, &conf);
 			if (IS_ERR(dt_obj))
 				GOTO(out, rc = PTR_ERR(dt_obj));

diff --git a/lustre/target/update_recovery.c b/lustre/target/update_recovery.c
index 3ffecba1..85764cf 100644
--- a/lustre/target/update_recovery.c
+++ b/lustre/target/update_recovery.c
@@ -1167,6 +1167,17 @@ static int update_recovery_exec(const struct lu_env *env,
 		dt_obj = dt_locate(env, tdtd->tdtd_dt, fid);
 		if (IS_ERR(dt_obj)) {
 			rc = PTR_ERR(dt_obj);
+			if (rc == -EREMCHG)
+				LCONSOLE_WARN("%.16s: hit invalid OI mapping "
+					      "for "DFID" during recovering, "
+					      "that may because auto scrub is "
+					      "disabled on related MDT, and "
+					      "will cause recovery failure. "
+					      "Please enable auto scrub and "
+					      "retry the recovery.\n",
+					      tdtd->tdtd_lut->lut_obd->obd_name,
+					      PFID(fid));
+
 			break;
 		}
 		sub_dt_obj = dt_object_child(dt_obj);

diff --git a/lustre/tests/sanity-scrub.sh b/lustre/tests/sanity-scrub.sh
index a137f87..29ba554 100644
--- a/lustre/tests/sanity-scrub.sh
+++ b/lustre/tests/sanity-scrub.sh
@@ -143,6 +143,18 @@ scrub_prep() {
 	done
 	echo "prepared $(date)."
 	cleanup_mount $MOUNT > /dev/null || error "Fail to stop client!"
+
+	# sync local transactions on every MDT
+	do_nodes $(comma_list $(mdts_nodes)) \
+		"$LCTL set_param -n osd*.*MDT*.force_sync=1"
+
+	# wait for a while to cancel update logs after transactions committed.
+	sleep 3
+
+	# sync again to guarantee all things done.
+	do_nodes $(comma_list $(mdts_nodes)) \
+		"$LCTL set_param -n osd*.*MDT*.force_sync=1"
+
 	for n in $(seq $MDSCOUNT); do
 		echo "stop mds$n"
 		stop mds$n > /dev/null || error "Fail to stop MDS$n!"
@@ -667,16 +679,24 @@ test_5() {
 	$LCTL set_param fail_val=3 fail_loc=0x190

 	local n
+	declare -a pids
+
 	for n in $(seq $MDSCOUNT); do
-		stat $DIR/$tdir/mds$n/${tfile}800 ||
-			error "(17) Failed to stat mds$n/${tfile}800"
+		stat $DIR/$tdir/mds$n/${tfile}800 &
+		pids[$n]=$!
 	done

-	scrub_check_status 18 scanning
+	sleep 3
+
+	scrub_check_status 17 scanning
 	do_nodes $(comma_list $(mdts_nodes)) \
 		$LCTL set_param fail_loc=0 fail_val=0

+	for n in $(seq $MDSCOUNT); do
+		wait ${pids[$n]} || error "(18) Fail to stat mds$n/${tfile}800"
+	done
+
 	scrub_check_status 19 completed
 	scrub_check_flags 20 ""
 }
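
For reference, the transaction-sync step added to scrub_prep() above can
be expressed as a standalone helper. This is only a minimal sketch,
assuming the Lustre test-framework helpers (do_nodes, comma_list,
mdts_nodes and $LCTL) are already sourced as sanity-scrub.sh does; the
sync_mdt_trans name is hypothetical and not part of the patch:

sync_mdt_trans() {
	# hypothetical helper mirroring the scrub_prep() change above
	local mdts=$(comma_list $(mdts_nodes))

	# sync local transactions on every MDT
	do_nodes $mdts "$LCTL set_param -n osd*.*MDT*.force_sync=1"

	# wait for a while so update logs of the committed transactions
	# can be cancelled, then sync again to cover that work as well
	sleep 3
	do_nodes $mdts "$LCTL set_param -n osd*.*MDT*.force_sync=1"
}

A caller would invoke such a helper before taking the file-level backup,
so that no cross-MDT updates remain to be replayed when the MDTs are
restarted and OI scrub has not yet rebuilt the OI mappings.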