From 151650e468ab423e831c30d635ea380e0434a122 Mon Sep 17 00:00:00 2001 From: Lai Siyao Date: Fri, 28 Apr 2023 05:22:03 -0400 Subject: [PATCH] LU-16717 mdt: resume dir migration with bad_type LFSCK may set hash type to "none,bad_type" upon migration failure; set it back to "fnv_1a_64,migrating,bad_type,fixed" to allow migration resumption. fnv_1a_64 is set because it's the default hash type, and since the hash type used in the original migration command is no longer known, just try with it. LFSCK now only adds the "bad_type" flag on such a directory, so that such a migration can always be resumed in the future. Add sanity 230z. Signed-off-by: Lai Siyao Change-Id: I19606aefcb9115e6724843785aea89a1c380e23f Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/50797 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Hongchao Zhang Reviewed-by: Oleg Drokin --- lustre/lfsck/lfsck_striped_dir.c | 6 +++++- lustre/mdt/mdt_reint.c | 32 ++++++++++++++++++++++++++++++++ lustre/tests/sanity.sh | 30 ++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 1 deletion(-) diff --git a/lustre/lfsck/lfsck_striped_dir.c b/lustre/lfsck/lfsck_striped_dir.c index 493b42b..b0c77dc 100644 --- a/lustre/lfsck/lfsck_striped_dir.c +++ b/lustre/lfsck/lfsck_striped_dir.c @@ -1567,7 +1567,11 @@ int lfsck_namespace_repair_bad_name_hash(const struct lu_env *env, GOTO(log, rc = 1); *lmv2 = llmv->ll_lmv; - lmv2->lmv_hash_type = LMV_HASH_TYPE_UNKNOWN | LMV_HASH_FLAG_BAD_TYPE; + /* only set BAD_TYPE here, do not clear hash type or MIGRATION flag, + * so that user can resume dir migration if this is caused by dir + * migration failure. 
+ */ + lmv2->lmv_hash_type |= LMV_HASH_FLAG_BAD_TYPE; rc = lfsck_namespace_set_lmv_master(env, com, parent, lmv2, lfsck_dto2fid(shard), llmv->ll_lmv.lmv_master_mdt_index, diff --git a/lustre/mdt/mdt_reint.c b/lustre/mdt/mdt_reint.c index 67bf2f4..e0123de 100644 --- a/lustre/mdt/mdt_reint.c +++ b/lustre/mdt/mdt_reint.c @@ -2114,6 +2114,17 @@ close: return rc ?: rc2; } +/* LFSCK used to clear hash type and MIGRATION flag upon migration failure */ +static inline bool lmv_is_failed_migration(const struct lmv_mds_md_v1 *lmv) +{ + return le32_to_cpu(lmv->lmv_hash_type) == + (LMV_HASH_TYPE_UNKNOWN | LMV_HASH_FLAG_BAD_TYPE) && + lmv_is_known_hash_type(le32_to_cpu(lmv->lmv_migrate_hash)) && + le32_to_cpu(lmv->lmv_migrate_offset) > 0 && + le32_to_cpu(lmv->lmv_migrate_offset) < + le32_to_cpu(lmv->lmv_stripe_count); +} + /* * migrate file in below steps: * 1. lock source and target stripes @@ -2335,6 +2346,27 @@ lock_parent: else if (lmv_is_restriping(&ma->ma_lmv->lmv_md_v1)) /* race with restripe/auto-split */ GOTO(unlock_source, rc = -EBUSY); + else if (lmv_is_failed_migration(&ma->ma_lmv->lmv_md_v1)) { + struct lu_buf *buf = &info->mti_buf; + struct lmv_mds_md_v1 *lmv = &ma->ma_lmv->lmv_md_v1; + __u32 version = le32_to_cpu(lmv->lmv_layout_version); + + /* migration failed before, and LFSCK cleared hash type + * and flags, fake it to resume migration. 
+ */ + lmv->lmv_hash_type = + cpu_to_le32(LMV_HASH_TYPE_FNV_1A_64 | + LMV_HASH_FLAG_MIGRATION | + LMV_HASH_FLAG_BAD_TYPE | + LMV_HASH_FLAG_FIXED); + lmv->lmv_layout_version = cpu_to_le32(version + 1); + buf->lb_buf = lmv; + buf->lb_len = sizeof(*lmv); + rc = mo_xattr_set(env, mdt_object_child(sobj), buf, + XATTR_NAME_LMV, LU_XATTR_REPLACE); + mo_invalidate(env, mdt_object_child(sobj)); + GOTO(unlock_source, rc = -EALREADY); + } } /* if migration HSM is allowed */ diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 0e48b7e..a54ff02 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -21896,6 +21896,36 @@ test_230y() { } run_test 230y "unlink dir with bad hash type" +test_230z() { + (( MDSCOUNT > 1 )) || skip "needs >= 2 MDTs" + (( MDS1_VERSION >= $(version_code 2.15.55.45) )) || + skip "Need MDS version at least 2.15.55.45" + + local pid + + test_mkdir -c -1 $DIR/$tdir || error "mkdir $tdir failed" + $LFS getdirstripe $DIR/$tdir + createmany -d $DIR/$tdir/d 100 || error "createmany failed" + $LFS migrate -m 1 -c 2 -H fnv_1a_64 $DIR/$tdir & + pid=$! + sleep 1 + + #OBD_FAIL_MIGRATE_BAD_HASH 0x1802 + do_facet mds2 lctl set_param fail_loc=0x1802 + + wait $pid + do_facet mds2 lctl set_param fail_loc=0 + $LFS getdirstripe $DIR/$tdir + + # resume migration + $LFS migrate -m 1 -c 2 -H fnv_1a_64 $DIR/$tdir || + error "resume migration failed" + $LFS getdirstripe $DIR/$tdir + [ $($LFS getdirstripe -H $DIR/$tdir) == "fnv_1a_64,fixed" ] || + error "migration is not finished" +} +run_test 230z "resume dir migration with bad hash type" + test_231a() { # For simplicity this test assumes that max_pages_per_rpc -- 1.8.3.1