Whamcloud - gitweb
LU-8218 osd: handle stale OI mapping for non-restore case 59/20659/8
author Fan Yong <fan.yong@intel.com>
Fri, 13 May 2016 15:44:20 +0000 (23:44 +0800)
committer Oleg Drokin <oleg.drokin@intel.com>
Thu, 22 Sep 2016 03:04:29 +0000 (03:04 +0000)
Sometimes, the user may remove the MDT-object directly under ldiskfs
mode but leave the related OI mapping in place. The same situation can
also arise if the MDT-object is lost because of disk corruption. In
such cases, the ldiskfs OSD should be able to distinguish this from
the case of MDT file-level backup/restore; otherwise, the upper-layer
user will get -EREMCHG (78) when locating such an MDT-object by its
FID.

Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: Iede2542968c21755158637089d20a694f12b309e
Reviewed-on: http://review.whamcloud.com/20659
Tested-by: Jenkins
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Lai Siyao <lai.siyao@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/include/obd_support.h
lustre/lod/lod_object.c
lustre/osd-ldiskfs/osd_handler.c
lustre/tests/sanity-lfsck.sh

index 70e21dd..f1412d3 100644 (file)
@@ -580,6 +580,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV   0x162b
 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME  0x162c
 #define OBD_FAIL_LFSCK_ASSISTANT_DIRECT        0x162d
 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV   0x162b
 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME  0x162c
 #define OBD_FAIL_LFSCK_ASSISTANT_DIRECT        0x162d
+#define OBD_FAIL_LFSCK_LOST_MDTOBJ2    0x162e
 
 #define OBD_FAIL_LFSCK_NOTIFY_NET      0x16f0
 #define OBD_FAIL_LFSCK_QUERY_NET       0x16f1
 
 #define OBD_FAIL_LFSCK_NOTIFY_NET      0x16f0
 #define OBD_FAIL_LFSCK_QUERY_NET       0x16f1
index 87dd8b6..06f31e5 100644 (file)
@@ -3520,7 +3520,8 @@ static int lod_declare_object_destroy(const struct lu_env *env,
        if (rc)
                RETURN(rc);
 
        if (rc)
                RETURN(rc);
 
-       if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ))
+       if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ) ||
+           OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ2))
                RETURN(0);
 
        /* declare destroy all striped objects */
                RETURN(0);
 
        /* declare destroy all striped objects */
@@ -3592,7 +3593,8 @@ static int lod_object_destroy(const struct lu_env *env,
        if (rc != 0)
                RETURN(rc);
 
        if (rc != 0)
                RETURN(rc);
 
-       if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ))
+       if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ) ||
+           OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ2))
                RETURN(0);
 
        /* destroy all striped objects */
                RETURN(0);
 
        /* destroy all striped objects */
index 8169304..a15d9b8 100644 (file)
@@ -593,21 +593,25 @@ check_oi:
                 *      Generally, when the device is mounted, it will
                 *      auto check whether the system is restored from
                 *      file-level backup or not. We trust such detect
                 *      Generally, when the device is mounted, it will
                 *      auto check whether the system is restored from
                 *      file-level backup or not. We trust such detect
-                *      to distinguish the 1st case from the 2nd case. */
-               if (rc == 0) {
-                       if (!IS_ERR(inode) && inode->i_generation != 0 &&
-                           inode->i_generation == id->oii_gen)
-                               /* "id->oii_gen != OSD_OII_NOGEN" is for
-                                * "@cached == false" case. */
-                               rc = -ENOENT;
-                       else
-                               rc = -EREMCHG;
-               } else {
+                *      to distinguish the 1st case from the 2nd case:
+                *      if the OI files are consistent but may contain
+                *      stale OI mappings because of case 2, if iget()
+                *      returns -ENOENT or -ESTALE, then it should be
+                *      the case 2. */
+               if (rc != 0)
                        /* If the OI mapping was in OI file before the
                         * osd_iget_check(), but now, it is disappear,
                         * then it must be removed by race. That is a
                         * normal race case. */
                        /* If the OI mapping was in OI file before the
                         * osd_iget_check(), but now, it is disappear,
                         * then it must be removed by race. That is a
                         * normal race case. */
-               }
+                       GOTO(put, rc);
+
+               if ((!IS_ERR(inode) && inode->i_generation != 0 &&
+                    inode->i_generation == id->oii_gen) ||
+                   (IS_ERR(inode) && !(dev->od_scrub.os_file.sf_flags &
+                                       SF_INCONSISTENT)))
+                       rc = -ENOENT;
+               else
+                       rc = -EREMCHG;
        } else {
                if (id->oii_gen == OSD_OII_NOGEN)
                        osd_id_gen(id, inode->i_ino, inode->i_generation);
        } else {
                if (id->oii_gen == OSD_OII_NOGEN)
                        osd_id_gen(id, inode->i_ino, inode->i_generation);
@@ -955,11 +959,14 @@ static int osd_fid_lookup(const struct lu_env *env, struct osd_object *obj,
        struct osd_device      *dev;
        struct osd_idmap_cache *oic;
        struct osd_inode_id    *id;
        struct osd_device      *dev;
        struct osd_idmap_cache *oic;
        struct osd_inode_id    *id;
+       struct osd_inode_id    *tid;
        struct inode           *inode = NULL;
        struct osd_scrub       *scrub;
        struct scrub_file      *sf;
        __u32                   flags = SS_CLEAR_DRYRUN | SS_CLEAR_FAILOUT |
                                        SS_AUTO_FULL;
        struct inode           *inode = NULL;
        struct osd_scrub       *scrub;
        struct scrub_file      *sf;
        __u32                   flags = SS_CLEAR_DRYRUN | SS_CLEAR_FAILOUT |
                                        SS_AUTO_FULL;
+       __u32                   saved_ino;
+       __u32                   saved_gen;
        int                     result  = 0;
        int                     rc1     = 0;
        bool                    cached  = true;
        int                     result  = 0;
        int                     rc1     = 0;
        bool                    cached  = true;
@@ -1024,7 +1031,7 @@ iget:
        if (IS_ERR(inode)) {
                result = PTR_ERR(inode);
                if (result == -ENOENT || result == -ESTALE)
        if (IS_ERR(inode)) {
                result = PTR_ERR(inode);
                if (result == -ENOENT || result == -ESTALE)
-                       GOTO(out, result = -ENOENT);
+                       GOTO(out, result = 0);
 
                if (result == -EREMCHG) {
 
 
                if (result == -EREMCHG) {
 
@@ -1108,63 +1115,79 @@ join:
        LASSERT(obj->oo_inode->i_sb == osd_sb(dev));
 
        result = osd_check_lma(env, obj);
        LASSERT(obj->oo_inode->i_sb == osd_sb(dev));
 
        result = osd_check_lma(env, obj);
-       if (result != 0) {
-               if (result == -ENODATA) {
-                       if (cached) {
-                               result = osd_oi_lookup(info, dev, fid, id,
-                                                      OI_CHECK_FLD);
-                               if (result != 0) {
-                                       /* result == -ENOENT means that the OI
-                                        * mapping has been removed by race,
-                                        * the target inode belongs to other
-                                        * object.
-                                        *
-                                        * Others error also can be returned
-                                        * directly. */
-                                       iput(inode);
-                                       obj->oo_inode = NULL;
-                                       GOTO(out, result);
-                               } else {
-                                       /* result == 0 means the cached OI
-                                        * mapping is still in the OI file,
-                                        * the target the inode is valid. */
-                               }
-                       } else {
-                               /* The current OI mapping is from the OI file,
-                                * since the inode has been found via
-                                * osd_iget_check(), no need recheck OI. */
-                       }
-
-                       goto found;
-               }
-
-               iput(inode);
-               inode = NULL;
-               obj->oo_inode = NULL;
-               if (result != -EREMCHG)
-                       GOTO(out, result);
+       if (result == 0)
+               goto found;
 
 
-               if (cached) {
-                       result = osd_oi_lookup(info, dev, fid, id,
-                                              OI_CHECK_FLD);
-                       /* result == -ENOENT means the cached OI mapping
-                        * has been removed from the OI file by race,
-                        * above target inode belongs to other object.
-                        *
-                        * Others error also can be returned directly. */
-                       if (result != 0)
-                               GOTO(out, result);
+       tid = &info->oti_id3;
+       LASSERT(tid != id);
 
 
-                       /* result == 0, goto trigger */
-               } else {
+       if (result == -ENODATA) {
+               if (!cached)
                        /* The current OI mapping is from the OI file,
                         * since the inode has been found via
                         * osd_iget_check(), no need recheck OI. */
                        /* The current OI mapping is from the OI file,
                         * since the inode has been found via
                         * osd_iget_check(), no need recheck OI. */
+                       goto found;
+
+               result = osd_oi_lookup(info, dev, fid, tid, OI_CHECK_FLD);
+               if (result == 0) {
+                       LASSERTF(tid->oii_ino == id->oii_ino &&
+                                tid->oii_gen == id->oii_gen,
+                                "OI mapping changed(1): %u/%u => %u/%u",
+                                tid->oii_ino, tid->oii_gen,
+                                id->oii_ino, id->oii_gen);
+
+                       LASSERTF(tid->oii_ino == inode->i_ino &&
+                                tid->oii_gen == inode->i_generation,
+                                "locate wrong inode(1): %u/%u => %ld/%u",
+                                tid->oii_ino, tid->oii_gen,
+                                inode->i_ino, inode->i_generation);
+
+                       /* "result == 0" means the cached OI mapping is still in
+                        * the OI file, so the target the inode is valid. */
+                       goto found;
                }
 
                }
 
-               goto trigger;
+               /* "result == -ENOENT" means that the OI mapping has been
+                * removed by race, the target inode belongs to other object.
+                *
+                * Other errors can be returned directly. */
+               if (result == -ENOENT)
+                       result = 0;
        }
 
        }
 
+       saved_ino = inode->i_ino;
+       saved_gen = inode->i_generation;
+       iput(inode);
+       inode = NULL;
+       obj->oo_inode = NULL;
+
+       if (result != -EREMCHG)
+               GOTO(out, result);
+
+       if (!cached)
+               /* The current OI mapping is from the OI file,
+                * since the inode has been found via
+                * osd_iget_check(), no need recheck OI. */
+               goto trigger;
+
+       result = osd_oi_lookup(info, dev, fid, tid, OI_CHECK_FLD);
+       /* "result == -ENOENT" means the cached OI mapping has been removed from
+        * the OI file by race, above target inode belongs to other object.
+        *
+        * Other errors can be returned directly. */
+       if (result != 0)
+               GOTO(out, result = (result == -ENOENT ? 0 : result));
+
+       LASSERTF(tid->oii_ino == id->oii_ino && tid->oii_gen == id->oii_gen,
+                "OI mapping changed(2): %u/%u => %u/%u",
+                tid->oii_ino, tid->oii_gen, id->oii_ino, id->oii_gen);
+
+       LASSERTF(tid->oii_ino == saved_ino && tid->oii_gen == saved_gen,
+                "locate wrong inode(2): %u/%u => %u/%u",
+                tid->oii_ino, tid->oii_gen, saved_ino, saved_gen);
+
+       goto trigger;
+
 found:
        obj->oo_compat_dot_created = 1;
        obj->oo_compat_dotdot_created = 1;
 found:
        obj->oo_compat_dot_created = 1;
        obj->oo_compat_dotdot_created = 1;
@@ -3115,7 +3138,8 @@ static int osd_declare_object_destroy(const struct lu_env *env,
                             osd_dto_credits_noquota[DTO_OBJECT_DELETE]);
        /* Recycle idle OI leaf may cause additional three OI blocks
         * to be changed. */
                             osd_dto_credits_noquota[DTO_OBJECT_DELETE]);
        /* Recycle idle OI leaf may cause additional three OI blocks
         * to be changed. */
-       osd_trans_declare_op(env, oh, OSD_OT_DELETE,
+       if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ2))
+               osd_trans_declare_op(env, oh, OSD_OT_DELETE,
                             osd_dto_credits_noquota[DTO_INDEX_DELETE] + 3);
        /* one less inode */
        rc = osd_declare_inode_qid(env, i_uid_read(inode), i_gid_read(inode),
                             osd_dto_credits_noquota[DTO_INDEX_DELETE] + 3);
        /* one less inode */
        rc = osd_declare_inode_qid(env, i_uid_read(inode), i_gid_read(inode),
@@ -3174,8 +3198,10 @@ static int osd_object_destroy(const struct lu_env *env,
        osd_trans_exec_op(env, th, OSD_OT_DESTROY);
 
        ldiskfs_set_inode_state(inode, LDISKFS_STATE_LUSTRE_DESTROY);
        osd_trans_exec_op(env, th, OSD_OT_DESTROY);
 
        ldiskfs_set_inode_state(inode, LDISKFS_STATE_LUSTRE_DESTROY);
-       result = osd_oi_delete(osd_oti_get(env), osd, fid, oh->ot_handle,
-                              OI_CHECK_FLD);
+
+       if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ2))
+               result = osd_oi_delete(osd_oti_get(env), osd, fid,
+                                      oh->ot_handle, OI_CHECK_FLD);
 
        osd_trans_exec_check(env, th, OSD_OT_DESTROY);
        /* XXX: add to ext3 orphan list */
 
        osd_trans_exec_check(env, th, OSD_OT_DESTROY);
        /* XXX: add to ext3 orphan list */
index 8eca8e8..80f38c1 100644 (file)
@@ -2493,6 +2493,67 @@ test_18f() {
 }
 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
 
 }
 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
 
+test_18g() {
+       echo "#####"
+       echo "The target MDT-object is lost, but related OI mapping is there"
+       echo "The LFSCK should recreate the lost MDT-object without affected"
+       echo "by the stale OI mapping."
+       echo "#####"
+
+       check_mount_and_prep
+       $LFS mkdir -i 0 $DIR/$tdir/a1
+       $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
+       dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
+       local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
+       echo ${fid1}
+       $LFS getstripe $DIR/$tdir/a1/f1
+       cancel_lru_locks osc
+
+       echo "Inject failure to simulate lost MDT-object but keep OI mapping"
+       #define OBD_FAIL_LFSCK_LOST_MDTOBJ2     0x162e
+       do_facet mds1 $LCTL set_param fail_loc=0x162e
+       rm -f $DIR/$tdir/a1/f1
+
+       do_facet mds1 $LCTL set_param fail_loc=0
+       cancel_lru_locks mdc
+       cancel_lru_locks osc
+
+       echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
+       $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
+
+       for k in $(seq $MDSCOUNT); do
+               # The LFSCK status query interval is 30 seconds. For the case
+               # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
+               # time to guarantee the status sync up.
+               wait_update_facet mds${k} "$LCTL get_param -n \
+                       mdd.$(facet_svc mds${k}).lfsck_layout |
+                       awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
+                       error "(2) MDS${k} is not the expected 'completed'"
+       done
+
+       for k in $(seq $OSTCOUNT); do
+               local cur_status=$(do_facet ost${k} $LCTL get_param -n \
+                               obdfilter.$(facet_svc ost${k}).lfsck_layout |
+                               awk '/^status/ { print $2 }')
+               [ "$cur_status" == "completed" ] ||
+               error "(3) OST${k} Expect 'completed', but got '$cur_status'"
+       done
+
+       local repaired=$(do_facet mds1 $LCTL get_param -n \
+                        mdd.$(facet_svc mds1).lfsck_layout |
+                        awk '/^repaired_orphan/ { print $2 }')
+       [ $repaired -eq $OSTCOUNT ] ||
+               error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
+
+       echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
+       mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
+       error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
+
+       $LFS path2fid $DIR/$tdir/a1/f1
+       $LFS getstripe $DIR/$tdir/a1/f1
+}
+run_test 18g "Find out orphan OST-object and repair it (7)"
+
 $LCTL set_param debug=-cache > /dev/null
 
 test_19a() {
 $LCTL set_param debug=-cache > /dev/null
 
 test_19a() {