Whamcloud - gitweb
LU-15868 lfsck: don't crash upon dir migration failure 81/47381/7
authorLai Siyao <lai.siyao@whamcloud.com>
Tue, 17 May 2022 11:11:25 +0000 (07:11 -0400)
committerOleg Drokin <green@whamcloud.com>
Mon, 18 Jul 2022 05:35:13 +0000 (05:35 +0000)
Running LFSCK against directories whose migration failed may crash,
because the lost+found directory may not be initialized correctly
and this error is skipped on purpose. Add checks in the code that
dereferences it.

lfsck_verify_lpf() may dereference NULL "child2".

lmv_name_to_stripe_index() should support stripe LMV, which is used
by LFSCK to verify name hash.

Add OBD_FAIL_OUT_EIO to simulate sub transaction failure.

Add sanity-lfsck 15d to verify LFSCK won't crash upon directory
migration failure.

Update sanity-lfsck 4 and 5 to start mds1 with OI scrub enabled and
wait for the mds1 OI scrub to finish; otherwise LFSCK may fail to
verify lost+found later.

Test-Parameters: mdscount=2 mdtcount=4 testlist=sanity-lfsck \
env=ONLY=15d,ONLY_REPEAT=100
Signed-off-by: Lai Siyao <lai.siyao@whamcloud.com>
Change-Id: I1b1872da2b4ef8f7403effc4d1d3e298c6a0b7e6
Reviewed-on: https://review.whamcloud.com/47381
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Hongchao Zhang <hongchao@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/lustre_lmv.h
lustre/include/obd_support.h
lustre/lfsck/lfsck_engine.c
lustre/lfsck/lfsck_lib.c
lustre/lfsck/lfsck_namespace.c
lustre/target/out_handler.c
lustre/tests/sanity-lfsck.sh

index aafaed3..5d5afc8 100644 (file)
@@ -370,14 +370,16 @@ __lmv_name_to_stripe_index(__u32 hash_type, __u32 stripe_count,
 static inline int lmv_name_to_stripe_index(struct lmv_mds_md_v1 *lmv,
                                           const char *name, int namelen)
 {
-       if (lmv->lmv_magic == LMV_MAGIC_V1)
+       if (lmv->lmv_magic == LMV_MAGIC_V1 ||
+           lmv->lmv_magic == LMV_MAGIC_STRIPE)
                return __lmv_name_to_stripe_index(lmv->lmv_hash_type,
                                                  lmv->lmv_stripe_count,
                                                  lmv->lmv_migrate_hash,
                                                  lmv->lmv_migrate_offset,
                                                  name, namelen, true);
 
-       if (lmv->lmv_magic == cpu_to_le32(LMV_MAGIC_V1))
+       if (lmv->lmv_magic == cpu_to_le32(LMV_MAGIC_V1) ||
+           lmv->lmv_magic == cpu_to_le32(LMV_MAGIC_STRIPE))
                return __lmv_name_to_stripe_index(
                                        le32_to_cpu(lmv->lmv_hash_type),
                                        le32_to_cpu(lmv->lmv_stripe_count),
index 96cd590..1199860 100644 (file)
@@ -678,6 +678,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_INVALIDATE_UPDATE     0x1705
 #define OBD_FAIL_OUT_UPDATE_DROP        0x1707
 #define OBD_FAIL_OUT_OBJECT_MISS       0x1708
+#define OBD_FAIL_OUT_EIO               0x1709
 
 /* MIGRATE */
 #define OBD_FAIL_MIGRATE_ENTRIES               0x1801
index 2c2d90b..d6fb21d 100644 (file)
@@ -1036,13 +1036,13 @@ int lfsck_master_engine(void *args)
            (!list_empty(&lfsck->li_list_scan) ||
             !list_empty(&lfsck->li_list_double_scan))) {
                rc = lfsck_verify_lpf(env, lfsck);
-               /* Fail to verify the .lustre/lost+found/MDTxxxx/ may be not
-                * fatal, because the .lustre/lost+found/ maybe not accessed
-                * by the LFSCK if it does not add orphans or others to such
-                * directory. So go ahead until hit failure when really uses
-                * the directory. */
+               /* FIXME: if OI files are missing this will fail and should
+                * return an error, but to satisfy sanity-lfsck tests 4 and 5
+                * lost+found is left uninitialized here, so any code that
+                * dereferences it needs to check it first.
+                */
                if (rc != 0)
-                       CDEBUG(D_LFSCK, "%s: master engine fail to verify the "
+                       CERROR("%s: master engine fail to verify the "
                               ".lustre/lost+found/, go ahead: rc = %d\n",
                               lfsck_lfsck2name(lfsck), rc);
        }
index 4302f4b..fdfab4a 100644 (file)
@@ -1499,11 +1499,8 @@ int lfsck_verify_lpf(const struct lu_env *env, struct lfsck_instance *lfsck)
                goto find_child1;
        }
 
-       if (unlikely(!dt_try_as_dir(env, child2))) {
-               lfsck_object_put(env, child2);
-               child2 = NULL;
-               rc = -ENOTDIR;
-       }
+       if (unlikely(!dt_try_as_dir(env, child2)))
+               GOTO(put, rc = -ENOTDIR);
 
 find_child1:
        if (fid_is_zero(&bk->lb_lpf_fid))
index 1dd801f..23c300a 100644 (file)
@@ -1464,7 +1464,8 @@ static int lfsck_namespace_create_orphan_dir(const struct lu_env *env,
                GOTO(log, rc = 1);
 
        if (dt_object_remote(orphan)) {
-               LASSERT(lfsck->li_lpf_root_obj != NULL);
+               if (lfsck->li_lpf_root_obj == NULL)
+                       GOTO(log, rc = -EBADF);
 
                idx = lfsck_find_mdt_idx_by_fid(env, lfsck, cfid);
                if (idx < 0)
index d530f82..c09f221 100644 (file)
@@ -1219,7 +1219,10 @@ int out_handle(struct tgt_session_info *tsi)
                                }
                        }
 
-                       rc = h->th_act(tsi);
+                       if (OBD_FAIL_CHECK(OBD_FAIL_OUT_EIO))
+                               rc = -EIO;
+                       else
+                               rc = h->th_act(tsi);
 next:
                        reply_index++;
                        dt_object_put(env, dt_obj);
index e2173fe..29e7e82 100644 (file)
@@ -1988,6 +1988,52 @@ test_15c() {
 }
 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
 
+test_15d() {
+       (( $MDSCOUNT > 1 )) || skip "needs >= 2 MDTs"
+
+       check_mount_and_prep
+       rm -rf $DIR/$tdir
+       $LFS mkdir -c -1 $DIR/$tdir || error "create $tdir failed"
+       $LFS setdirstripe -D -i -1 -c 1 $DIR/$tdir ||
+               error "setdirstripe failed"
+
+       createmany -o $DIR/$tdir/f 100 || error "create sub files failed"
+       createmany -d $DIR/$tdir/s 100 || error "create sub dirs failed"
+
+       echo "Migrate $DIR/$tdir to MDT1"
+       $LFS migrate -m 1 $DIR/$tdir &
+       pid=$!
+
+       sleep 2
+       # fail sub transactions on random MDTs, which may cause some file
+       # inaccessible
+       #define OBD_FAIL_OUT_EIO                0x1709
+       for ((i = 0; i < $MDSCOUNT; i++)); do
+               do_facet mds$i $LCTL set_param fail_loc=0x1709
+               sleep 0.1
+               do_facet mds$i $LCTL set_param fail_loc=0
+       done
+
+       wait $pid
+
+       # LFSCK can't fully fix migrating directories, and may leave some
+       # files inaccessible, but it shouldn't cause crash
+       $START_NAMESPACE -A -r ||
+               error "Fail to start LFSCK for namespace"
+
+       wait_all_targets_blocked namespace completed 1
+
+       # resume migration may fail because some file may be inaccessible, but
+       # it shouldn't cause crash
+       $LFS migrate -m 1 $DIR/$tdir
+
+       # rm $tdir to avoid cleanup failure in the end
+       rm -rf $DIR/$tdir/*
+       $LFS rm_entry $DIR/$tdir/*
+       rm -rf $DIR/$tdir || error "rm $tdir failed"
+}
+run_test 15d "LFSCK don't crash upon dir migration failure"
+
 test_16() {
        (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
                skip "MDS older than 2.5.55, LU-3594"