From cecde8bdb4913fd4405d425b0bf3aead03181e9d Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Fri, 13 May 2016 23:44:20 +0800 Subject: [PATCH] LU-8218 osd: handle stale OI mapping for non-restore case Sometimes, the user may remove the MDT-object under ldiskfs mode directly but leave the related OI mapping there. Such a case can also happen if the MDT-object is lost because of disk corruption. In such a case, the OSD ldiskfs should have the ability to distinguish it from the case of MDT file-level backup/restore; otherwise, the upper layer user will get -EREMCHG (78) when locating such an MDT-object with the FID. Signed-off-by: Fan Yong Change-Id: Iede2542968c21755158637089d20a694f12b309e Reviewed-on: http://review.whamcloud.com/20659 Tested-by: Jenkins Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Lai Siyao Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 1 + lustre/lod/lod_object.c | 6 +- lustre/osd-ldiskfs/osd_handler.c | 152 +++++++++++++++++++++++---------------- lustre/tests/sanity-lfsck.sh | 61 ++++++++++++++++ 4 files changed, 155 insertions(+), 65 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 70e21dd..f1412d3 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -580,6 +580,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c #define OBD_FAIL_LFSCK_ASSISTANT_DIRECT 0x162d +#define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e #define OBD_FAIL_LFSCK_NOTIFY_NET 0x16f0 #define OBD_FAIL_LFSCK_QUERY_NET 0x16f1 diff --git a/lustre/lod/lod_object.c b/lustre/lod/lod_object.c index 87dd8b6..06f31e5 100644 --- a/lustre/lod/lod_object.c +++ b/lustre/lod/lod_object.c @@ -3520,7 +3520,8 @@ static int lod_declare_object_destroy(const struct lu_env *env, if (rc) RETURN(rc); - if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ)) + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ) || + OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ2)) RETURN(0); /* 
declare destroy all striped objects */ @@ -3592,7 +3593,8 @@ static int lod_object_destroy(const struct lu_env *env, if (rc != 0) RETURN(rc); - if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ)) + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ) || + OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ2)) RETURN(0); /* destroy all striped objects */ diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index 8169304..a15d9b8 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -593,21 +593,25 @@ check_oi: * Generally, when the device is mounted, it will * auto check whether the system is restored from * file-level backup or not. We trust such detect - * to distinguish the 1st case from the 2nd case. */ - if (rc == 0) { - if (!IS_ERR(inode) && inode->i_generation != 0 && - inode->i_generation == id->oii_gen) - /* "id->oii_gen != OSD_OII_NOGEN" is for - * "@cached == false" case. */ - rc = -ENOENT; - else - rc = -EREMCHG; - } else { + * to distinguish the 1st case from the 2nd case: + * if the OI files are consistent but may contain + * stale OI mappings because of case 2, if iget() + * returns -ENOENT or -ESTALE, then it should be + * the case 2. */ + if (rc != 0) /* If the OI mapping was in OI file before the * osd_iget_check(), but now, it is disappear, * then it must be removed by race. That is a * normal race case. 
*/ - } + GOTO(put, rc); + + if ((!IS_ERR(inode) && inode->i_generation != 0 && + inode->i_generation == id->oii_gen) || + (IS_ERR(inode) && !(dev->od_scrub.os_file.sf_flags & + SF_INCONSISTENT))) + rc = -ENOENT; + else + rc = -EREMCHG; } else { if (id->oii_gen == OSD_OII_NOGEN) osd_id_gen(id, inode->i_ino, inode->i_generation); @@ -955,11 +959,14 @@ static int osd_fid_lookup(const struct lu_env *env, struct osd_object *obj, struct osd_device *dev; struct osd_idmap_cache *oic; struct osd_inode_id *id; + struct osd_inode_id *tid; struct inode *inode = NULL; struct osd_scrub *scrub; struct scrub_file *sf; __u32 flags = SS_CLEAR_DRYRUN | SS_CLEAR_FAILOUT | SS_AUTO_FULL; + __u32 saved_ino; + __u32 saved_gen; int result = 0; int rc1 = 0; bool cached = true; @@ -1024,7 +1031,7 @@ iget: if (IS_ERR(inode)) { result = PTR_ERR(inode); if (result == -ENOENT || result == -ESTALE) - GOTO(out, result = -ENOENT); + GOTO(out, result = 0); if (result == -EREMCHG) { @@ -1108,63 +1115,79 @@ join: LASSERT(obj->oo_inode->i_sb == osd_sb(dev)); result = osd_check_lma(env, obj); - if (result != 0) { - if (result == -ENODATA) { - if (cached) { - result = osd_oi_lookup(info, dev, fid, id, - OI_CHECK_FLD); - if (result != 0) { - /* result == -ENOENT means that the OI - * mapping has been removed by race, - * the target inode belongs to other - * object. - * - * Others error also can be returned - * directly. */ - iput(inode); - obj->oo_inode = NULL; - GOTO(out, result); - } else { - /* result == 0 means the cached OI - * mapping is still in the OI file, - * the target the inode is valid. */ - } - } else { - /* The current OI mapping is from the OI file, - * since the inode has been found via - * osd_iget_check(), no need recheck OI. 
*/ - } - - goto found; - } - - iput(inode); - inode = NULL; - obj->oo_inode = NULL; - if (result != -EREMCHG) - GOTO(out, result); + if (result == 0) + goto found; - if (cached) { - result = osd_oi_lookup(info, dev, fid, id, - OI_CHECK_FLD); - /* result == -ENOENT means the cached OI mapping - * has been removed from the OI file by race, - * above target inode belongs to other object. - * - * Others error also can be returned directly. */ - if (result != 0) - GOTO(out, result); + tid = &info->oti_id3; + LASSERT(tid != id); - /* result == 0, goto trigger */ - } else { + if (result == -ENODATA) { + if (!cached) /* The current OI mapping is from the OI file, * since the inode has been found via * osd_iget_check(), no need recheck OI. */ + goto found; + + result = osd_oi_lookup(info, dev, fid, tid, OI_CHECK_FLD); + if (result == 0) { + LASSERTF(tid->oii_ino == id->oii_ino && + tid->oii_gen == id->oii_gen, + "OI mapping changed(1): %u/%u => %u/%u", + tid->oii_ino, tid->oii_gen, + id->oii_ino, id->oii_gen); + + LASSERTF(tid->oii_ino == inode->i_ino && + tid->oii_gen == inode->i_generation, + "locate wrong inode(1): %u/%u => %ld/%u", + tid->oii_ino, tid->oii_gen, + inode->i_ino, inode->i_generation); + + /* "result == 0" means the cached OI mapping is still in + * the OI file, so the target inode is valid. */ + goto found; } - goto trigger; + /* "result == -ENOENT" means that the OI mapping has been + * removed by race, the target inode belongs to other object. + * + * Other errors can be returned directly. */ + if (result == -ENOENT) + result = 0; } + saved_ino = inode->i_ino; + saved_gen = inode->i_generation; + iput(inode); + inode = NULL; + obj->oo_inode = NULL; + + if (result != -EREMCHG) + GOTO(out, result); + + if (!cached) + /* The current OI mapping is from the OI file, + * since the inode has been found via + * osd_iget_check(), no need recheck OI. 
*/ + goto trigger; + + result = osd_oi_lookup(info, dev, fid, tid, OI_CHECK_FLD); + /* "result == -ENOENT" means the cached OI mapping has been removed from + * the OI file by race, above target inode belongs to other object. + * + * Others error can be returned directly. */ + if (result != 0) + GOTO(out, result = (result == -ENOENT ? 0 : result)); + + LASSERTF(tid->oii_ino == id->oii_ino && tid->oii_gen == id->oii_gen, + "OI mapping changed(2): %u/%u => %u/%u", + tid->oii_ino, tid->oii_gen, id->oii_ino, id->oii_gen); + + LASSERTF(tid->oii_ino == saved_ino && tid->oii_gen == saved_gen, + "locate wrong inode(2): %u/%u => %u/%u", + tid->oii_ino, tid->oii_gen, saved_ino, saved_gen); + + goto trigger; + found: obj->oo_compat_dot_created = 1; obj->oo_compat_dotdot_created = 1; @@ -3115,7 +3138,8 @@ static int osd_declare_object_destroy(const struct lu_env *env, osd_dto_credits_noquota[DTO_OBJECT_DELETE]); /* Recycle idle OI leaf may cause additional three OI blocks * to be changed. */ - osd_trans_declare_op(env, oh, OSD_OT_DELETE, + if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ2)) + osd_trans_declare_op(env, oh, OSD_OT_DELETE, osd_dto_credits_noquota[DTO_INDEX_DELETE] + 3); /* one less inode */ rc = osd_declare_inode_qid(env, i_uid_read(inode), i_gid_read(inode), @@ -3174,8 +3198,10 @@ static int osd_object_destroy(const struct lu_env *env, osd_trans_exec_op(env, th, OSD_OT_DESTROY); ldiskfs_set_inode_state(inode, LDISKFS_STATE_LUSTRE_DESTROY); - result = osd_oi_delete(osd_oti_get(env), osd, fid, oh->ot_handle, - OI_CHECK_FLD); + + if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ2)) + result = osd_oi_delete(osd_oti_get(env), osd, fid, + oh->ot_handle, OI_CHECK_FLD); osd_trans_exec_check(env, th, OSD_OT_DESTROY); /* XXX: add to ext3 orphan list */ diff --git a/lustre/tests/sanity-lfsck.sh b/lustre/tests/sanity-lfsck.sh index 8eca8e8..80f38c1 100644 --- a/lustre/tests/sanity-lfsck.sh +++ b/lustre/tests/sanity-lfsck.sh @@ -2493,6 +2493,67 @@ test_18f() { } run_test 18f 
"Skip the failed OST(s) when handle orphan OST-objects" +test_18g() { + echo "#####" + echo "The target MDT-object is lost, but related OI mapping is there" + echo "The LFSCK should recreate the lost MDT-object without affected" + echo "by the stale OI mapping." + echo "#####" + + check_mount_and_prep + $LFS mkdir -i 0 $DIR/$tdir/a1 + $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1 + dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT + local fid1=$($LFS path2fid $DIR/$tdir/a1/f1) + echo ${fid1} + $LFS getstripe $DIR/$tdir/a1/f1 + cancel_lru_locks osc + + echo "Inject failure to simulate lost MDT-object but keep OI mapping" + #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e + do_facet mds1 $LCTL set_param fail_loc=0x162e + rm -f $DIR/$tdir/a1/f1 + + do_facet mds1 $LCTL set_param fail_loc=0 + cancel_lru_locks mdc + cancel_lru_locks osc + + echo "Trigger layout LFSCK on all devices to find out orphan OST-object" + $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!" + + for k in $(seq $MDSCOUNT); do + # The LFSCK status query interval is 30 seconds. For the case + # of some LFSCK_NOTIFY RPC failure/loss, we will wait enough + # time to guarantee the status sync up. 
+ wait_update_facet mds${k} "$LCTL get_param -n \ + mdd.$(facet_svc mds${k}).lfsck_layout | + awk '/^status/ { print \\\$2 }'" "completed" $LTIME || + error "(2) MDS${k} is not the expected 'completed'" + done + + for k in $(seq $OSTCOUNT); do + local cur_status=$(do_facet ost${k} $LCTL get_param -n \ + obdfilter.$(facet_svc ost${k}).lfsck_layout | + awk '/^status/ { print $2 }') + [ "$cur_status" == "completed" ] || + error "(3) OST${k} Expect 'completed', but got '$cur_status'" + done + + local repaired=$(do_facet mds1 $LCTL get_param -n \ + mdd.$(facet_svc mds1).lfsck_layout | + awk '/^repaired_orphan/ { print $2 }') + [ $repaired -eq $OSTCOUNT ] || + error "(4) Expect $OSTCOUNT fixed, but got: $repaired" + + echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace" + mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 || + error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0" + + $LFS path2fid $DIR/$tdir/a1/f1 + $LFS getstripe $DIR/$tdir/a1/f1 +} +run_test 18g "Find out orphan OST-object and repair it (7)" + $LCTL set_param debug=-cache > /dev/null test_19a() { -- 1.8.3.1