From 54a2d4662b58e2ba4224b0e6b487a0a7cd2f28bb Mon Sep 17 00:00:00 2001 From: Lai Siyao Date: Tue, 17 May 2022 07:11:25 -0400 Subject: [PATCH] LU-15868 lfsck: don't crash upon dir migration failure LFSCK against directories whose migration failed may crash, because the lost+found directory may not be initialized correctly and this error is skipped on purpose. Add checks in the code that dereferences it. lfsck_verify_lpf() may dereference NULL "child2". lmv_name_to_stripe_index() should support stripe LMV, which is used by LFSCK to verify name hash. Add OBD_FAIL_OUT_EIO to simulate sub transaction failure. Add sanity-lfsck 15d to verify LFSCK won't crash upon directory migration failure. Update sanity-lfsck 4 and 5 to start mds1 with OI scrub enabled, and wait for the mds1 OI scrub to finish, otherwise LFSCK may fail to verify lost+found later. Test-Parameters: mdscount=2 mdtcount=4 testlist=sanity-lfsck \ env=ONLY=15d,ONLY_REPEAT=100 Signed-off-by: Lai Siyao Change-Id: I1b1872da2b4ef8f7403effc4d1d3e298c6a0b7e6 Reviewed-on: https://review.whamcloud.com/47381 Reviewed-by: Andreas Dilger Tested-by: jenkins Tested-by: Maloo Reviewed-by: Hongchao Zhang Reviewed-by: Oleg Drokin --- lustre/include/lustre_lmv.h | 6 ++++-- lustre/include/obd_support.h | 1 + lustre/lfsck/lfsck_engine.c | 12 +++++------ lustre/lfsck/lfsck_lib.c | 7 ++----- lustre/lfsck/lfsck_namespace.c | 3 ++- lustre/target/out_handler.c | 5 ++++- lustre/tests/sanity-lfsck.sh | 46 ++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 65 insertions(+), 15 deletions(-) diff --git a/lustre/include/lustre_lmv.h b/lustre/include/lustre_lmv.h index aafaed3..5d5afc8 100644 --- a/lustre/include/lustre_lmv.h +++ b/lustre/include/lustre_lmv.h @@ -370,14 +370,16 @@ __lmv_name_to_stripe_index(__u32 hash_type, __u32 stripe_count, static inline int lmv_name_to_stripe_index(struct lmv_mds_md_v1 *lmv, const char *name, int namelen) { - if (lmv->lmv_magic == LMV_MAGIC_V1) + if (lmv->lmv_magic == LMV_MAGIC_V1 || + 
lmv->lmv_magic == LMV_MAGIC_STRIPE) return __lmv_name_to_stripe_index(lmv->lmv_hash_type, lmv->lmv_stripe_count, lmv->lmv_migrate_hash, lmv->lmv_migrate_offset, name, namelen, true); - if (lmv->lmv_magic == cpu_to_le32(LMV_MAGIC_V1)) + if (lmv->lmv_magic == cpu_to_le32(LMV_MAGIC_V1) || + lmv->lmv_magic == cpu_to_le32(LMV_MAGIC_STRIPE)) return __lmv_name_to_stripe_index( le32_to_cpu(lmv->lmv_hash_type), le32_to_cpu(lmv->lmv_stripe_count), diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 96cd590..1199860 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -678,6 +678,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_INVALIDATE_UPDATE 0x1705 #define OBD_FAIL_OUT_UPDATE_DROP 0x1707 #define OBD_FAIL_OUT_OBJECT_MISS 0x1708 +#define OBD_FAIL_OUT_EIO 0x1709 /* MIGRATE */ #define OBD_FAIL_MIGRATE_ENTRIES 0x1801 diff --git a/lustre/lfsck/lfsck_engine.c b/lustre/lfsck/lfsck_engine.c index 2c2d90b..d6fb21d 100644 --- a/lustre/lfsck/lfsck_engine.c +++ b/lustre/lfsck/lfsck_engine.c @@ -1036,13 +1036,13 @@ int lfsck_master_engine(void *args) (!list_empty(&lfsck->li_list_scan) || !list_empty(&lfsck->li_list_double_scan))) { rc = lfsck_verify_lpf(env, lfsck); - /* Fail to verify the .lustre/lost+found/MDTxxxx/ may be not - * fatal, because the .lustre/lost+found/ maybe not accessed - * by the LFSCK if it does not add orphans or others to such - * directory. So go ahead until hit failure when really uses - * the directory. */ + /* FIXME: once OI files are missing, this will fail; it should + * return an error, but to satisfy sanity-lfsck tests 4 & 5, + * leave it uninitialized here; any code that dereferences it + * needs to check it first. 
+ */ if (rc != 0) - CDEBUG(D_LFSCK, "%s: master engine fail to verify the " + CERROR("%s: master engine fail to verify the " ".lustre/lost+found/, go ahead: rc = %d\n", lfsck_lfsck2name(lfsck), rc); } diff --git a/lustre/lfsck/lfsck_lib.c b/lustre/lfsck/lfsck_lib.c index 4302f4b..fdfab4a 100644 --- a/lustre/lfsck/lfsck_lib.c +++ b/lustre/lfsck/lfsck_lib.c @@ -1499,11 +1499,8 @@ int lfsck_verify_lpf(const struct lu_env *env, struct lfsck_instance *lfsck) goto find_child1; } - if (unlikely(!dt_try_as_dir(env, child2))) { - lfsck_object_put(env, child2); - child2 = NULL; - rc = -ENOTDIR; - } + if (unlikely(!dt_try_as_dir(env, child2))) + GOTO(put, rc = -ENOTDIR); find_child1: if (fid_is_zero(&bk->lb_lpf_fid)) diff --git a/lustre/lfsck/lfsck_namespace.c b/lustre/lfsck/lfsck_namespace.c index 1dd801f..23c300a 100644 --- a/lustre/lfsck/lfsck_namespace.c +++ b/lustre/lfsck/lfsck_namespace.c @@ -1464,7 +1464,8 @@ static int lfsck_namespace_create_orphan_dir(const struct lu_env *env, GOTO(log, rc = 1); if (dt_object_remote(orphan)) { - LASSERT(lfsck->li_lpf_root_obj != NULL); + if (lfsck->li_lpf_root_obj == NULL) + GOTO(log, rc = -EBADF); idx = lfsck_find_mdt_idx_by_fid(env, lfsck, cfid); if (idx < 0) diff --git a/lustre/target/out_handler.c b/lustre/target/out_handler.c index d530f82..c09f221 100644 --- a/lustre/target/out_handler.c +++ b/lustre/target/out_handler.c @@ -1219,7 +1219,10 @@ int out_handle(struct tgt_session_info *tsi) } } - rc = h->th_act(tsi); + if (OBD_FAIL_CHECK(OBD_FAIL_OUT_EIO)) + rc = -EIO; + else + rc = h->th_act(tsi); next: reply_index++; dt_object_put(env, dt_obj); diff --git a/lustre/tests/sanity-lfsck.sh b/lustre/tests/sanity-lfsck.sh index e2173fe..29e7e82 100644 --- a/lustre/tests/sanity-lfsck.sh +++ b/lustre/tests/sanity-lfsck.sh @@ -1988,6 +1988,52 @@ test_15c() { } run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)" +test_15d() { + (( $MDSCOUNT > 1 )) || skip "needs >= 2 MDTs" + + check_mount_and_prep + rm -rf $DIR/$tdir 
+ $LFS mkdir -c -1 $DIR/$tdir || error "create $tdir failed" + $LFS setdirstripe -D -i -1 -c 1 $DIR/$tdir || + error "setdirstripe failed" + + createmany -o $DIR/$tdir/f 100 || error "create sub files failed" + createmany -d $DIR/$tdir/s 100 || error "create sub dirs failed" + + echo "Migrate $DIR/$tdir to MDT1" + $LFS migrate -m 1 $DIR/$tdir & + pid=$! + + sleep 2 + # fail sub transactions on random MDTs, which may cause some file + # inaccessible + #define OBD_FAIL_OUT_EIO 0x1709 + for ((i = 0; i < $MDSCOUNT; i++)); do + do_facet mds$i $LCTL set_param fail_loc=0x1709 + sleep 0.1 + do_facet mds$i $LCTL set_param fail_loc=0 + done + + wait $pid + + # LFSCK can't fully fix migrating directories, and may leave some + # files inaccessible, but it shouldn't cause crash + $START_NAMESPACE -A -r || + error "Fail to start LFSCK for namespace" + + wait_all_targets_blocked namespace completed 1 + + # resume migration may fail because some file may be inaccessible, but + # it shouldn't cause crash + $LFS migrate -m 1 $DIR/$tdir + + # rm $tdir to avoid cleanup failure in the end + rm -rf $DIR/$tdir/* + $LFS rm_entry $DIR/$tdir/* + rm -rf $DIR/$tdir || error "rm $tdir failed" +} +run_test 15d "LFSCK don't crash upon dir migration failure" + test_16() { (( $MDS1_VERSION > $(version_code 2.5.55) )) || skip "MDS older than 2.5.55, LU-3594" -- 1.8.3.1