return rc;
}
-/**
- * Get BFL lock for rename or migrate process.
- **/
+/*
+ * Get BFL(Big Filesystem Lock) for rename or migrate process.
+ *
+ * \param trylock If set, return immediately if the
+ * lock cannot be acquired.
+ *
+ * \retval 0 on success(lock taken),
+ * -ev negative errno upon error
+ */
static int mdt_rename_lock(struct mdt_thread_info *info,
- struct mdt_lock_handle *lh)
+ struct mdt_lock_handle *lh, bool trylock)
{
enum mds_ibits_locks ibits = MDS_INODELOCK_UPDATE;
+ enum mds_ibits_locks trybits = MDS_INODELOCK_NONE;
struct lu_fid *fid = &info->mti_tmp_fid1;
struct mdt_object *obj;
int rc;
RETURN(PTR_ERR(obj));
mdt_lock_reg_init(lh, LCK_EX);
+ ibits = trylock ? MDS_INODELOCK_NONE : MDS_INODELOCK_UPDATE;
+ trybits = trylock ? MDS_INODELOCK_UPDATE : MDS_INODELOCK_NONE;
rc = mdt_object_lock_internal(info, obj, &LUSTRE_BFL_FID, lh,
- &ibits, 0, false);
+ &ibits, trybits, false);
mdt_object_put(info->mti_env, obj);
+ if (trylock && (ibits & MDS_INODELOCK_UPDATE))
+ RETURN(0);
RETURN(rc);
}
+static inline int mdt_rename_lock_try(struct mdt_thread_info *info,
+ struct mdt_lock_handle *lh)
+{
+ return mdt_rename_lock(info, lh, true);
+}
+
static void mdt_rename_unlock(struct mdt_thread_info *info,
struct mdt_lock_handle *lh)
{
* req is NULL if this is called by directory auto-split.
*/
if (req && !req_is_replay(req)) {
- rc = mdt_rename_lock(info, rename_lh);
+ rc = mdt_rename_lock(info, rename_lh, false);
if (rc != 0) {
CERROR("%s: can't lock FS for rename: rc = %d\n",
mdt_obd_name(info->mti_mdt), rc);
/*
* determine lock order of sobj and tobj
*
- * there are two situations we need to lock tobj before sobj:
+ * there are three situations we need to lock tobj before sobj:
* 1. sobj is child of tobj
* 2. sobj and tobj are stripes of a directory, and stripe index of sobj is
* larger than that of tobj
+ * 3. apart from the first two situations, tobj.fid < sobj.fid
*
* \retval 1 lock tobj before sobj
* \retval 0 lock sobj before tobj
struct mdt_object *tobj)
{
struct md_attr *ma = &info->mti_attr;
+ struct mdt_reint_record *rr = &info->mti_rr;
struct lu_fid *spfid = &info->mti_tmp_fid1;
struct lu_fid *tpfid = &info->mti_tmp_fid2;
struct lmv_mds_md_v1 *lmv;
if (rc)
return rc;
+ /*
+ * should we order by fid if 1) sobj/tobj belong to different
+ * parents, or 2) they are not stripes of a directory
+ */
if (!lu_fid_eq(spfid, tpfid))
- return 0;
+ goto order_by_fid;
/* check whether sobj and tobj are sibling stripes */
rc = mdt_stripe_get(info, sobj, ma, XATTR_NAME_LMV);
return rc;
if (!(ma->ma_valid & MA_LMV))
- return 0;
+ goto order_by_fid;
lmv = &ma->ma_lmv->lmv_md_v1;
if (!(le32_to_cpu(lmv->lmv_magic) & LMV_MAGIC_STRIPE))
- return 0;
+ goto order_by_fid;
sindex = le32_to_cpu(lmv->lmv_master_mdt_index);
ma->ma_valid = 0;
return -EINVAL;
return sindex < tindex ? 0 : 1;
+
+order_by_fid:
+ /* To avoid AB/BA deadlock given two phase locking */
+ if (lu_fid_cmp(rr->rr_fid1, rr->rr_fid2) > 0)
+ return 1;
+ return 0;
}
/* Helper function for mdt_reint_rename so we don't need to opencode
if (rc)
return rc;
- mdt_version_get_save(info, mfirstdir, 0);
CFS_FAIL_TIMEOUT(OBD_FAIL_MDS_RENAME, 5);
if (mfirstdir != mseconddir) {
CFS_FAIL_TIMEOUT(OBD_FAIL_MDS_PDO_LOCK2, 10);
}
}
- mdt_version_get_save(info, mseconddir, 1);
if (rc != 0)
mdt_object_unlock(info, mfirstdir, lh_firstdirp, rc);
ktime_t kstart = ktime_get();
enum mdt_stat_idx msi = 0;
bool remote;
- bool bfl = false;
+ bool need_bfl = false;
+ bool preempt_done = false;
+ bool got_bfl = false;
int rc;
ENTRY;
!fid_is_md_operative(rr->rr_fid2))
RETURN(-EPERM);
+lock_bfl:
+ msrcdir = mtgtdir = NULL;
+ mold = mnew = NULL;
+ lh_oldp = lh_lookup = lh_newp = NULL;
+ lh_srcdirp = lh_tgtdirp = NULL;
+ fid_zero(old_fid);
+ fid_zero(new_fid);
+
/* find both parents. */
msrcdir = mdt_parent_find_check(info, rr->rr_fid1, 0);
if (IS_ERR(msrcdir))
if (!mdt->mdt_enable_remote_rename && remote)
GOTO(out_put_tgtdir, rc = -EXDEV);
- if (remote ||
- (S_ISDIR(ma->ma_attr.la_mode) &&
- (msrcdir != mtgtdir ||
- !mdt->mdt_enable_parallel_rename_dir)) ||
- (!S_ISDIR(ma->ma_attr.la_mode) &&
- (!mdt->mdt_enable_parallel_rename_file ||
- (msrcdir != mtgtdir &&
- !mdt->mdt_enable_parallel_rename_crossdir)))) {
- rc = mdt_rename_lock(info, rename_lh);
+ need_bfl |= remote ||
+ (S_ISDIR(ma->ma_attr.la_mode) &&
+ (msrcdir != mtgtdir ||
+ !mdt->mdt_enable_parallel_rename_dir)) ||
+ (!S_ISDIR(ma->ma_attr.la_mode) &&
+ (!mdt->mdt_enable_parallel_rename_file ||
+ (msrcdir != mtgtdir &&
+ !mdt->mdt_enable_parallel_rename_crossdir)));
+ if (need_bfl && preempt_done) {
+ rc = mdt_rename_lock(info, rename_lh, false);
if (rc != 0) {
CERROR("%s: cannot lock for rename: rc = %d\n",
mdt_obd_name(mdt), rc);
GOTO(out_put_tgtdir, rc);
}
- bfl = true;
+ got_bfl = true;
+ msi = 0;
} else {
if (S_ISDIR(ma->ma_attr.la_mode))
msi = LPROC_MDT_RENAME_PAR_DIR;
}
}
-lock_parents:
rc = mdt_rename_determine_lock_order(info, msrcdir, mtgtdir);
if (rc < 0)
GOTO(out_unlock_rename, rc);
if (rc != 0)
GOTO(out_unlock_rename, rc);
+ mdt_version_get_save(info, msrcdir, 0);
+ mdt_version_get_save(info, mtgtdir, 1);
+
CFS_FAIL_TIMEOUT(OBD_FAIL_MDS_RENAME4, 5);
CFS_FAIL_TIMEOUT(OBD_FAIL_MDS_RENAME2, 5);
GOTO(out_put_old, rc = -EXDEV);
/* we used msrcdir as a hint to take BFL, but it may be wrong */
- if (unlikely(!bfl && !req_is_replay(req) &&
- !S_ISDIR(ma->ma_attr.la_mode) &&
- mdt_object_remote(mold))) {
- LASSERT(!remote);
- mdt_object_put(info->mti_env, mold);
- mdt_object_unlock(info, mtgtdir, lh_tgtdirp, rc);
- mdt_object_unlock(info, msrcdir, lh_srcdirp, rc);
-
- rc = mdt_rename_lock(info, rename_lh);
- if (rc != 0) {
- CERROR("%s: cannot re-lock for rename: rc = %d\n",
- mdt_obd_name(mdt), rc);
- GOTO(out_put_tgtdir, rc);
- }
- bfl = true;
- msi = 0;
- goto lock_parents;
- }
+ need_bfl |= !req_is_replay(req) &&
+ !S_ISDIR(ma->ma_attr.la_mode) &&
+ mdt_object_remote(mold);
/* Check if @mtgtdir is subdir of @mold, before locking child
* to avoid reverse locking.
/* We will lock in child fid order here to avoid a
* deadlock related to hardlinks thats only possible with
* regular files. LU-15491
+ *
+ * To avoid deadlock on dirs given two phase locking
*/
- if (!S_ISDIR(lu_object_attr(&mold->mot_obj)) &&
- lu_fid_cmp(old_fid, new_fid) > 0) {
+ if (lu_fid_cmp(old_fid, new_fid) > 0) {
child_reverse_lock = true;
rc = mdt_object_check_lock(info, mtgtdir, mnew, lh_newp,
MDS_INODELOCK_LOOKUP |
* the rename onto victim will hold the layout
* lock. See LU-4002.
*/
-
if (!child_reverse_lock) {
rc = mdt_object_check_lock(info, mtgtdir, mnew, lh_newp,
MDS_INODELOCK_LOOKUP |
mdt_enoent_version_save(info, 3);
}
+ /*
+ * To reduce the hold time and contention on the BFL
+ * resource lock, here we use two phase locking:
+ * 1. get the 4 child locks first (to cancel the majority
+ * of lock holders)
+ * 2. try to get the BFL resource lock if it is uncontended
+ * 3a. if BFL is contended then drop child locks and wait for it
+ * 3b. re-get the child locks if they were dropped in 3a
+ * See LU-17427
+ */
+ if (need_bfl && !got_bfl && mdt->mdt_enable_rename_trylock) {
+ struct lu_fid old_fid_backup = *old_fid;
+
+ rc = mdt_rename_lock_try(info, rename_lh);
+ *old_fid = old_fid_backup;
+ if (rc == 0) {
+ got_bfl = true;
+ msi = LPROC_MDT_RENAME_TRYLOCK;
+
+ if (mtgtdir != msrcdir) {
+ /* Check if @mtgtdir is subdir of @mold */
+ rc = mdo_is_subdir(info->mti_env,
+ mdt_object_child(mtgtdir),
+ old_fid);
+ if (rc)
+ GOTO(out_unlock_new,
+ rc = (rc == 1) ? -EINVAL : rc);
+ /* Check if @msrcdir is subdir of @mnew */
+ if (mnew) {
+ rc = mdo_is_subdir(info->mti_env,
+ mdt_object_child(msrcdir),
+ new_fid);
+ if (rc)
+ GOTO(out_unlock_new,
+ rc = (rc == 1) ? -EINVAL : rc);
+ }
+ }
+ }
+ }
+ if (need_bfl && !got_bfl) {
+ /* drop child locks if we didn't get BFL with trylock above */
+ if (mnew != NULL)
+ mdt_object_unlock(info, mnew, lh_newp, 1);
+ mdt_object_unlock(info, NULL, lh_lookup, 1);
+ mdt_object_unlock(info, mold, lh_oldp, 1);
+
+ if (mnew != NULL)
+ mdt_object_put(info->mti_env, mnew);
+ mdt_object_put(info->mti_env, mold);
+
+ mdt_object_unlock(info, mtgtdir, lh_tgtdirp, 1);
+ mdt_object_unlock(info, msrcdir, lh_srcdirp, 1);
+
+ mdt_object_put(info->mti_env, mtgtdir);
+ mdt_object_put(info->mti_env, msrcdir);
+
+ preempt_done = true;
+ goto lock_bfl;
+ }
+
/* step 5: rename it */
mdt_reint_init_ma(info, ma);
mdt_object_unlock(info, mtgtdir, lh_tgtdirp, rc);
mdt_object_unlock(info, msrcdir, lh_srcdirp, rc);
out_unlock_rename:
- mdt_rename_unlock(info, rename_lh);
+ if (got_bfl)
+ mdt_rename_unlock(info, rename_lh);
out_put_tgtdir:
mdt_object_put(info->mti_env, mtgtdir);
out_put_srcdir:
}
run_test 55d "rename file vs link"
+test_55e()
+{
+ [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
+
+ # set children remote to parent, to trigger bfl
+ $LFS mkdir -i0 -c2 $DIR/$tdir || error "fail to create striped dir"
+ $LFS mkdir -i1 $DIR/$tdir/A || error "fail to create subdir A"
+ $LFS mkdir -i1 $DIR/$tdir/B || error "fail to create subdir B"
+
+ #define OBD_FAIL_MDS_RENAME3 0x155
+ # before any lock GOT, racing for parent locks
+ do_facet mds1 $LCTL set_param fail_loc=0x155
+ stack_trap "do_facet mds1 $LCTL set_param fail_loc=0x0"
+
+ mv -T $DIR/$tdir/A $DIR/$tdir/B &
+ PID1=$!
+ mv -T $DIR2/$tdir/B $DIR2/$tdir/A &
+ PID2=$!
+
+ sleep 10
+ wait $PID1; STATUS1=$?
+ wait $PID2; STATUS2=$?
+
+ SUCCESS=$(( !$STATUS1 + !$STATUS2 ))
+ (( SUCCESS >= 1 )) ||
+ error "expect at least one succ, actual: $SUCCESS"
+ rm -rf $DIR/$tdir
+ echo "We survived after AB/BA race test"
+}
+run_test 55e "rename race AB/BA under the same parent dir"
+
+test_55f()
+{
+ mkdir_on_mdt0 $DIR/$tdir
+ mkdir -p $DIR/$tdir/P1/A $DIR/$tdir/P2/B
+
+ #define OBD_FAIL_MDS_RENAME3 0x155
+ # before any lock GOT, racing for parent locks
+ do_facet mds1 $LCTL set_param fail_loc=0x155
+ stack_trap "do_facet mds1 $LCTL set_param fail_loc=0x0"
+
+ mv -T $DIR/$tdir/P1/A $DIR/$tdir/P2/B &
+ PID1=$!
+ mv -T $DIR2/$tdir/P2/B $DIR2/$tdir/P1/A &
+ PID2=$!
+
+ sleep 10
+ wait $PID1; STATUS1=$?
+ wait $PID2; STATUS2=$?
+
+ SUCCESS=$(( !$STATUS1 + !$STATUS2 ))
+ (( SUCCESS >= 1 )) ||
+ error "expect at least one succ, actual: $SUCCESS"
+ rm -rf $DIR/$tdir
+ echo "We survived after P1A_P2B/P2B_P1A race test"
+}
+run_test 55f "rename: (P1/A -> P2/B) race with (P2/B -> P1/A)"
+
+test_55g()
+{
+ mkdir_on_mdt0 $DIR/$tdir
+ mkdir -p $DIR/$tdir/a1/a2/a3/a4
+ mkdir -p $DIR/$tdir/b1/b2/b3/b4
+
+ local param_file=$TMP/$tfile-params
+ save_lustre_params mds1 \
+ "mdt.*.enable_rename_trylock" > $param_file
+ do_facet mds1 $LCTL set_param mdt.*.enable_rename_trylock=1
+ #define OBD_FAIL_MDS_RENAME3 0x155
+ do_facet mds1 $LCTL set_param fail_loc=0x155
+ stack_trap "do_facet mds1 $LCTL set_param fail_loc=0x0"
+
+ mv $DIR/$tdir/a1/a2 $DIR/$tdir/b1/b2/b3/b4/ &
+ PID1=$!
+ mv $DIR2/$tdir/b1/b2 $DIR2/$tdir/a1/a2/a3/a4/ &
+ PID2=$!
+
+ sleep 10
+ wait $PID1; STATUS1=$?
+ wait $PID2; STATUS2=$?
+
+ SUCCESS=$(( !$STATUS1 + !$STATUS2 ))
+ (( SUCCESS >= 1 )) ||
+ error "expect at least one succ, actual: $SUCCESS"
+ [[ -e $DIR/$tdir/b1/b2/b3/b4/a2 ]] ||
+ [[ -e $DIR2/$tdir/a1/a2/a3/a4/b2 ]] ||
+ error "expect at least one valid dir"
+
+ restore_lustre_params <$param_file
+ rm -rf $DIR/$tdir
+ echo "We survived race with trylock test"
+}
+run_test 55g "rename: race with trylock"
+
test_56a() {
$LFS setstripe -c 1 $MOUNT/$tfile || error "creating $MOUNT/$tfile"
stack_trap "rm -f $MOUNT/$tfile"