From 8dbf0494798862b54e21fd02fb0439ce9633b4cb Mon Sep 17 00:00:00 2001 From: Keguang Xu Date: Sat, 18 Jan 2025 22:36:01 +0800 Subject: [PATCH] LU-17427 mdt: reduce hold time for BFL rename lock In non-parallel rename, the order of resource locking follows a fixed order: first global BFL, then (up to) 4 FID locks. This means the BFL can be held for a long time if any client holding one of the FID locks is non-responsive for some reason. Especially in a big cluster where hundreds or thousands clients exist doing various workloads. To reduce hold time for BFL lock, we restructure the locking into two phases, 1. get the 4 child locks first (to cancel the majority of lock holders) 2. try to get the BFL resource lock if it is uncontended 3a. if BFL is contended then drop child locks and wait for it 3b. re-get the child locks if they were dropped in 3a Note in phase 1 there's no BFL umbrella anymore, which in turn may introduce deadlock, hence some order logic is added, e.g. mdt_rename_determine_lock_order() order by fid if objs belong to different parents. > perf results: 1. two active clients with only renames: max_dirs without-patch with 500x500 14418 14276 150x150 20419 20210 2. slow clients scenario, QPS=10 and 5s pause where P(X)=1% for master and P(X)=0.1% for patch (not quite sure if the P is reasonable or not), Metric (ms/req) without-patch with Average Latency 205.76 21.50 Signed-off-by: Keguang Xu Change-Id: I806f52a7c23acdb0f9cb7e7b4c8e306db2fad8d5 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/57741 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Lai Siyao Reviewed-by: Oleg Drokin --- lustre/ldlm/ldlm_lockd.c | 3 +- lustre/mdt/mdt_handler.c | 1 + lustre/mdt/mdt_internal.h | 2 + lustre/mdt/mdt_lproc.c | 3 + lustre/mdt/mdt_reint.c | 179 ++++++++++++++++++++++++++++++++++------------ lustre/tests/sanityn.sh | 94 ++++++++++++++++++++++++ 6 files changed, 235 insertions(+), 47 deletions(-) diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 697ad4f..eacd59d 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -1560,7 +1560,8 @@ retry: } } - if (rc != 0 && !(flags & LDLM_FL_RESENT)) { + if ((rc != 0 || err == ELDLM_LOCK_ABORTED) && + !(flags & LDLM_FL_RESENT)) { if (lock->l_export) { ldlm_lock_cancel(lock); } else { diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index 258d990..05f9ca4 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -6548,6 +6548,7 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m, m->mdt_enable_remote_dir_gid = 0; m->mdt_enable_pin_gid = 0; m->mdt_enable_remote_rename = 1; + m->mdt_enable_rename_trylock = 1; m->mdt_enable_striped_dir = 1; m->mdt_enable_dmv_implicit_inherit = 1; m->mdt_dir_restripe_nsonly = 1; diff --git a/lustre/mdt/mdt_internal.h b/lustre/mdt/mdt_internal.h index 12c3a52..ed954cb 100644 --- a/lustre/mdt/mdt_internal.h +++ b/lustre/mdt/mdt_internal.h @@ -286,6 +286,7 @@ struct mdt_device { mdt_enable_parallel_rename_crossdir:1, mdt_enable_remote_dir:1, mdt_enable_remote_rename:1, + mdt_enable_rename_trylock:1, mdt_enable_striped_dir:1, mdt_readonly:1, mdt_skip_lfsck:1, @@ -1317,6 +1318,7 @@ enum mdt_stat_idx { LPROC_MDT_RENAME_PAR_FILE, LPROC_MDT_RENAME_PAR_DIR, LPROC_MDT_RENAME_CROSSDIR, + LPROC_MDT_RENAME_TRYLOCK, LPROC_MDT_IO_READ, LPROC_MDT_IO_WRITE, LPROC_MDT_IO_READ_BYTES, diff --git a/lustre/mdt/mdt_lproc.c b/lustre/mdt/mdt_lproc.c index b2f87b7..ae223f8 100644 --- a/lustre/mdt/mdt_lproc.c +++ b/lustre/mdt/mdt_lproc.c @@ -846,6 +846,7 @@ MDT_BOOL_RW_ATTR(migrate_hsm_allowed); MDT_BOOL_RW_ATTR(enable_strict_som); MDT_BOOL_RW_ATTR(enable_dmv_implicit_inherit); MDT_BOOL_RW_ATTR(enable_dmv_xattr); +MDT_BOOL_RW_ATTR(enable_rename_trylock); static ssize_t enable_pin_gid_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -1497,6 +1498,7 @@ static struct attribute *mdt_attrs[] = { &lustre_attr_dir_restripe_nsonly.attr, &lustre_attr_checksum_t10pi_enforce.attr, &lustre_attr_max_mod_rpcs_in_flight.attr, + &lustre_attr_enable_rename_trylock.attr, NULL, }; @@ -1619,6 +1621,7 @@ static const char * const mdt_stats[] = { [LPROC_MDT_RENAME_PAR_FILE] = "parallel_rename_file", [LPROC_MDT_RENAME_PAR_DIR] = "parallel_rename_dir", [LPROC_MDT_RENAME_CROSSDIR] = "crossdir_rename", + [LPROC_MDT_RENAME_TRYLOCK] = "rename_trylocks", [LPROC_MDT_IO_READ_BYTES] = "read_bytes", [LPROC_MDT_IO_WRITE_BYTES] = "write_bytes", [LPROC_MDT_IO_READ] = "read", diff --git a/lustre/mdt/mdt_reint.c b/lustre/mdt/mdt_reint.c index 442776d..6d59544 100644 --- a/lustre/mdt/mdt_reint.c +++ b/lustre/mdt/mdt_reint.c @@ -1532,13 +1532,20 @@ put_parent: return rc; } -/** - * Get BFL lock for rename or migrate process. - **/ +/* + * Get BFL(Big Filesystem Lock) for rename or migrate process. + * + * \param trylock If set, return immediately if the + * lock cannot be acquired. + * + * \retval 0 on success(lock taken), + * -ev negative errno upon error + */ static int mdt_rename_lock(struct mdt_thread_info *info, - struct mdt_lock_handle *lh) + struct mdt_lock_handle *lh, bool trylock) { enum mds_ibits_locks ibits = MDS_INODELOCK_UPDATE; + enum mds_ibits_locks trybits = MDS_INODELOCK_NONE; struct lu_fid *fid = &info->mti_tmp_fid1; struct mdt_object *obj; int rc; @@ -1550,12 +1557,22 @@ static int mdt_rename_lock(struct mdt_thread_info *info, RETURN(PTR_ERR(obj)); mdt_lock_reg_init(lh, LCK_EX); + ibits = trylock ? MDS_INODELOCK_NONE : MDS_INODELOCK_UPDATE; + trybits = trylock ? MDS_INODELOCK_UPDATE : MDS_INODELOCK_NONE; rc = mdt_object_lock_internal(info, obj, &LUSTRE_BFL_FID, lh, - &ibits, 0, false); + &ibits, trybits, false); mdt_object_put(info->mti_env, obj); + if (trylock && (ibits & MDS_INODELOCK_UPDATE)) + RETURN(0); RETURN(rc); } +static inline int mdt_rename_lock_try(struct mdt_thread_info *info, + struct mdt_lock_handle *lh) +{ + return mdt_rename_lock(info, lh, true); +} + static void mdt_rename_unlock(struct mdt_thread_info *info, struct mdt_lock_handle *lh) { @@ -2246,7 +2263,7 @@ int mdt_reint_migrate(struct mdt_thread_info *info, * req is NULL if this is called by directory auto-split. */ if (req && !req_is_replay(req)) { - rc = mdt_rename_lock(info, rename_lh); + rc = mdt_rename_lock(info, rename_lh, false); if (rc != 0) { CERROR("%s: can't lock FS for rename: rc = %d\n", mdt_obd_name(info->mti_mdt), rc); @@ -2528,10 +2545,11 @@ unlock_rename: /* * determine lock order of sobj and tobj * - * there are two situations we need to lock tobj before sobj: + * there are three situations we need to lock tobj before sobj: * 1. sobj is child of tobj * 2. sobj and tobj are stripes of a directory, and stripe index of sobj is * larger than that of tobj + * 3. apart from the first two situations, tobj.fid < sobj.fid * * \retval 1 lock tobj before sobj * \retval 0 lock sobj before tobj @@ -2542,6 +2560,7 @@ static int mdt_rename_determine_lock_order(struct mdt_thread_info *info, struct mdt_object *tobj) { struct md_attr *ma = &info->mti_attr; + struct mdt_reint_record *rr = &info->mti_rr; struct lu_fid *spfid = &info->mti_tmp_fid1; struct lu_fid *tpfid = &info->mti_tmp_fid2; struct lmv_mds_md_v1 *lmv; @@ -2577,8 +2596,12 @@ static int mdt_rename_determine_lock_order(struct mdt_thread_info *info, if (rc) return rc; + /* + * should we order by fid if 1) sobj/tobj belong to different + * parents, or 2) they are not stripes of a directory + */ if (!lu_fid_eq(spfid, tpfid)) - return 0; + goto order_by_fid; /* check whether sobj and tobj are sibling stripes */ rc = mdt_stripe_get(info, sobj, ma, XATTR_NAME_LMV); @@ -2586,11 +2609,11 @@ static int mdt_rename_determine_lock_order(struct mdt_thread_info *info, return rc; if (!(ma->ma_valid & MA_LMV)) - return 0; + goto order_by_fid; lmv = &ma->ma_lmv->lmv_md_v1; if (!(le32_to_cpu(lmv->lmv_magic) & LMV_MAGIC_STRIPE)) - return 0; + goto order_by_fid; sindex = le32_to_cpu(lmv->lmv_master_mdt_index); ma->ma_valid = 0; @@ -2611,6 +2634,12 @@ static int mdt_rename_determine_lock_order(struct mdt_thread_info *info, return -EINVAL; return sindex < tindex ? 0 : 1; + +order_by_fid: + /* To avoid AB/BA deadlock given two phase locking */ + if (lu_fid_cmp(rr->rr_fid1, rr->rr_fid2) > 0) + return 1; + return 0; } /* Helper function for mdt_reint_rename so we don't need to opencode @@ -2630,7 +2659,6 @@ static int mdt_lock_two_dirs(struct mdt_thread_info *info, if (rc) return rc; - mdt_version_get_save(info, mfirstdir, 0); CFS_FAIL_TIMEOUT(OBD_FAIL_MDS_RENAME, 5); if (mfirstdir != mseconddir) { @@ -2645,7 +2673,6 @@ static int mdt_lock_two_dirs(struct mdt_thread_info *info, CFS_FAIL_TIMEOUT(OBD_FAIL_MDS_PDO_LOCK2, 10); } } - mdt_version_get_save(info, mseconddir, 1); if (rc != 0) mdt_object_unlock(info, mfirstdir, lh_firstdirp, rc); @@ -2682,7 +2709,9 @@ static int mdt_reint_rename(struct mdt_thread_info *info, ktime_t kstart = ktime_get(); enum mdt_stat_idx msi = 0; bool remote; - bool bfl = false; + bool need_bfl = false; + bool preempt_done = false; + bool got_bfl = false; int rc; ENTRY; @@ -2697,6 +2726,14 @@ static int mdt_reint_rename(struct mdt_thread_info *info, !fid_is_md_operative(rr->rr_fid2)) RETURN(-EPERM); +lock_bfl: + msrcdir = mtgtdir = NULL; + mold = mnew = NULL; + lh_oldp = lh_lookup = lh_newp = NULL; + lh_srcdirp = lh_tgtdirp = NULL; + fid_zero(old_fid); + fid_zero(new_fid); + /* find both parents. */ msrcdir = mdt_parent_find_check(info, rr->rr_fid1, 0); if (IS_ERR(msrcdir)) @@ -2744,21 +2781,23 @@ static int mdt_reint_rename(struct mdt_thread_info *info, if (!mdt->mdt_enable_remote_rename && remote) GOTO(out_put_tgtdir, rc = -EXDEV); - if (remote || - (S_ISDIR(ma->ma_attr.la_mode) && - (msrcdir != mtgtdir || - !mdt->mdt_enable_parallel_rename_dir)) || - (!S_ISDIR(ma->ma_attr.la_mode) && - (!mdt->mdt_enable_parallel_rename_file || - (msrcdir != mtgtdir && - !mdt->mdt_enable_parallel_rename_crossdir)))) { - rc = mdt_rename_lock(info, rename_lh); + need_bfl |= remote || + (S_ISDIR(ma->ma_attr.la_mode) && + (msrcdir != mtgtdir || + !mdt->mdt_enable_parallel_rename_dir)) || + (!S_ISDIR(ma->ma_attr.la_mode) && + (!mdt->mdt_enable_parallel_rename_file || + (msrcdir != mtgtdir && + !mdt->mdt_enable_parallel_rename_crossdir))); + if (need_bfl && preempt_done) { + rc = mdt_rename_lock(info, rename_lh, false); if (rc != 0) { CERROR("%s: cannot lock for rename: rc = %d\n", mdt_obd_name(mdt), rc); GOTO(out_put_tgtdir, rc); } - bfl = true; + got_bfl = true; + msi = 0; } else { if (S_ISDIR(ma->ma_attr.la_mode)) msi = LPROC_MDT_RENAME_PAR_DIR; @@ -2775,7 +2814,6 @@ static int mdt_reint_rename(struct mdt_thread_info *info, } } -lock_parents: rc = mdt_rename_determine_lock_order(info, msrcdir, mtgtdir); if (rc < 0) GOTO(out_unlock_rename, rc); @@ -2811,6 +2849,9 @@ lock_parents: if (rc != 0) GOTO(out_unlock_rename, rc); + mdt_version_get_save(info, msrcdir, 0); + mdt_version_get_save(info, mtgtdir, 1); + CFS_FAIL_TIMEOUT(OBD_FAIL_MDS_RENAME4, 5); CFS_FAIL_TIMEOUT(OBD_FAIL_MDS_RENAME2, 5); @@ -2841,24 +2882,9 @@ lock_parents: GOTO(out_put_old, rc = -EXDEV); /* we used msrcdir as a hint to take BFL, but it may be wrong */ - if (unlikely(!bfl && !req_is_replay(req) && - !S_ISDIR(ma->ma_attr.la_mode) && - mdt_object_remote(mold))) { - LASSERT(!remote); - mdt_object_put(info->mti_env, mold); - mdt_object_unlock(info, mtgtdir, lh_tgtdirp, rc); - mdt_object_unlock(info, msrcdir, lh_srcdirp, rc); - - rc = mdt_rename_lock(info, rename_lh); - if (rc != 0) { - CERROR("%s: cannot re-lock for rename: rc = %d\n", - mdt_obd_name(mdt), rc); - GOTO(out_put_tgtdir, rc); - } - bfl = true; - msi = 0; - goto lock_parents; - } + need_bfl |= !req_is_replay(req) && + !S_ISDIR(ma->ma_attr.la_mode) && + mdt_object_remote(mold); /* Check if @mtgtdir is subdir of @mold, before locking child * to avoid reverse locking. @@ -2933,9 +2959,10 @@ lock_parents: /* We will lock in child fid order here to avoid a * deadlock related to hardlinks thats only possible with * regular files. LU-15491 + * + * To avoid deadlock on dirs given two phase locking */ - if (!S_ISDIR(lu_object_attr(&mold->mot_obj)) && - lu_fid_cmp(old_fid, new_fid) > 0) { + if (lu_fid_cmp(old_fid, new_fid) > 0) { child_reverse_lock = true; rc = mdt_object_check_lock(info, mtgtdir, mnew, lh_newp, MDS_INODELOCK_LOOKUP | @@ -2974,7 +3001,6 @@ lock_parents: * the rename onto victim will hold the layout * lock. See LU-4002. */ - if (!child_reverse_lock) { rc = mdt_object_check_lock(info, mtgtdir, mnew, lh_newp, MDS_INODELOCK_LOOKUP | @@ -3002,6 +3028,66 @@ lock_parents: mdt_enoent_version_save(info, 3); } + /* + * To reduce the hold time and contention on the BFL + * resource lock, here we use two phase locking: + * 1. get the 4 child locks first (to cancel the majority + * of lock holders) + * 2. try to get the BFL resource lock if it is uncontended + * 3a. if BFL is contended then drop child locks and wait for it + * 3b. re-get the child locks if they were dropped in 3a + * See LU-17427 + */ + if (need_bfl && !got_bfl && mdt->mdt_enable_rename_trylock) { + struct lu_fid old_fid_backup = *old_fid; + + rc = mdt_rename_lock_try(info, rename_lh); + *old_fid = old_fid_backup; + if (rc == 0) { + got_bfl = true; + msi = LPROC_MDT_RENAME_TRYLOCK; + + if (mtgtdir != msrcdir) { + /* Check if @mtgtdir is subdir of @mold */ + rc = mdo_is_subdir(info->mti_env, + mdt_object_child(mtgtdir), + old_fid); + if (rc) + GOTO(out_unlock_new, + rc = (rc == 1) ? -EINVAL : rc); + /* Check if @msrcdir is subdir of @mnew */ + if (mnew) { + rc = mdo_is_subdir(info->mti_env, + mdt_object_child(msrcdir), + new_fid); + if (rc) + GOTO(out_unlock_new, + rc = (rc == 1) ? -EINVAL : rc); + } + } + } + } + if (need_bfl && !got_bfl) { + /* drop child locks if we didn't get BFL with trylock above */ + if (mnew != NULL) + mdt_object_unlock(info, mnew, lh_newp, 1); + mdt_object_unlock(info, NULL, lh_lookup, 1); + mdt_object_unlock(info, mold, lh_oldp, 1); + + if (mnew != NULL) + mdt_object_put(info->mti_env, mnew); + mdt_object_put(info->mti_env, mold); + + mdt_object_unlock(info, mtgtdir, lh_tgtdirp, 1); + mdt_object_unlock(info, msrcdir, lh_srcdirp, 1); + + mdt_object_put(info->mti_env, mtgtdir); + mdt_object_put(info->mti_env, msrcdir); + + preempt_done = true; + goto lock_bfl; + } + /* step 5: rename it */ mdt_reint_init_ma(info, ma); @@ -3047,7 +3133,8 @@ out_unlock_parents: mdt_object_unlock(info, mtgtdir, lh_tgtdirp, rc); mdt_object_unlock(info, msrcdir, lh_srcdirp, rc); out_unlock_rename: - mdt_rename_unlock(info, rename_lh); + if (got_bfl) + mdt_rename_unlock(info, rename_lh); out_put_tgtdir: mdt_object_put(info->mti_env, mtgtdir); out_put_srcdir: diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh index 9bf16b9..e779270 100755 --- a/lustre/tests/sanityn.sh +++ b/lustre/tests/sanityn.sh @@ -3799,6 +3799,100 @@ test_55d() } run_test 55d "rename file vs link" +test_55e() +{ + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" + + # set children remote to parent, to trigger bfl + $LFS mkdir -i0 -c2 $DIR/$tdir || error "fail to create striped dir" + $LFS mkdir -i1 $DIR/$tdir/A || error "fail to create subdir A" + $LFS mkdir -i1 $DIR/$tdir/B || error "fail to create subdir B" + + #define OBD_FAIL_MDS_RENAME3 0x155 + # before any lock GOT, racing for parent locks + do_facet mds1 $LCTL set_param fail_loc=0x155 + stack_trap "do_facet mds1 $LCTL set_param fail_loc=0x0" + + mv -T $DIR/$tdir/A $DIR/$tdir/B & + PID1=$! + mv -T $DIR2/$tdir/B $DIR2/$tdir/A & + PID2=$! + + sleep 10 + wait $PID1; STATUS1=$? + wait $PID2; STATUS2=$? + + SUCCESS=$(( !$STATUS1 + !$STATUS2 )) + (( SUCCESS >= 1 )) || + error "expect at least one succ, actual: $SUCCESS" + rm -rf $DIR/$tdir + echo "We survived after AB/BA race test" +} +run_test 55e "rename race AB/BA under the same parent dir" + +test_55f() +{ + mkdir_on_mdt0 $DIR/$tdir + mkdir -p $DIR/$tdir/P1/A $DIR/$tdir/P2/B + + #define OBD_FAIL_MDS_RENAME3 0x155 + # before any lock GOT, racing for parent locks + do_facet mds1 $LCTL set_param fail_loc=0x155 + stack_trap "do_facet mds1 $LCTL set_param fail_loc=0x0" + + mv -T $DIR/$tdir/P1/A $DIR/$tdir/P2/B & + PID1=$! + mv -T $DIR2/$tdir/P2/B $DIR2/$tdir/P1/A & + PID2=$! + + sleep 10 + wait $PID1; STATUS1=$? + wait $PID2; STATUS2=$? + + SUCCESS=$(( !$STATUS1 + !$STATUS2 )) + (( SUCCESS >= 1 )) || + error "expect at least one succ, actual: $SUCCESS" + rm -rf $DIR/$tdir + echo "We survived after P1A_P2B/P2B_P1A race test" +} +run_test 55f "rename: (P1/A -> P2/B) race with (P2/B -> P1/A)" + +test_55g() +{ + mkdir_on_mdt0 $DIR/$tdir + mkdir -p $DIR/$tdir/a1/a2/a3/a4 + mkdir -p $DIR/$tdir/b1/b2/b3/b4 + + local param_file=$TMP/$tfile-params + save_lustre_params mds1 \ + "mdt.*.enable_rename_trylock" > $param_file + do_facet mds1 $LCTL set_param mdt.*.enable_rename_trylock=1 + #define OBD_FAIL_MDS_RENAME3 0x155 + do_facet mds1 $LCTL set_param fail_loc=0x155 + stack_trap "do_facet mds1 $LCTL set_param fail_loc=0x0" + + mv $DIR/$tdir/a1/a2 $DIR/$tdir/b1/b2/b3/b4/ & + PID1=$! + mv $DIR2/$tdir/b1/b2 $DIR2/$tdir/a1/a2/a3/a4/ & + PID2=$! + + sleep 10 + wait $PID1; STATUS1=$? + wait $PID2; STATUS2=$? + + SUCCESS=$(( !$STATUS1 + !$STATUS2 )) + (( SUCCESS >= 1 )) || + error "expect at least one succ, actual: $SUCCESS" + [[ -e $DIR/$tdir/b1/b2/b3/b4/a2 ]] || + [[ -e $DIR2/$tdir/a1/a2/a3/a4/b2 ]] || + error "expect at least one valid dir" + + restore_lustre_params <$param_file + rm -rf $DIR/$tdir + echo "We survived race with trylock test" +} +run_test 55g "rename: race with trylock" + test_56a() { $LFS setstripe -c 1 $MOUNT/$tfile || error "creating $MOUNT/$tfile" stack_trap "rm -f $MOUNT/$tfile" -- 1.8.3.1