From 4e308ef74f271ec7e19917e3c0f88586537582c3 Mon Sep 17 00:00:00 2001 From: Hongchao Zhang Date: Sat, 5 Apr 2014 12:41:03 +0800 Subject: [PATCH] LU-4725 mdt: child-parent lock ordering in rename change rename so that it always has parent-child lock ordering, otherwise it may deadlock with other operations. Signed-off-by: Vitaly Fertman Signed-off-by: Hongchao Zhang Change-Id: If676da82ca50a20a4bb3aadef0f81c9c5ed3cbcb Xyratex-bug-id: MRP-1700 Reviewed-on: http://review.whamcloud.com/9538 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: wangdi Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 3 + lustre/mdt/mdt_reint.c | 247 ++++++++++++++++++++++++++++++------------- lustre/tests/sanityn.sh | 103 ++++++++++++++++++ 3 files changed, 281 insertions(+), 72 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 7365f60..f66a509 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -243,6 +243,9 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_MDS_HSM_ACTION_NET 0x150 #define OBD_FAIL_MDS_CHANGELOG_INIT 0x151 #define OBD_FAIL_MDS_HSM_SWAP_LAYOUTS 0x152 +#define OBD_FAIL_MDS_RENAME 0x153 +#define OBD_FAIL_MDS_RENAME2 0x154 +#define OBD_FAIL_MDS_RENAME3 0x155 /* layout lock */ #define OBD_FAIL_MDS_NO_LL_GETATTR 0x170 diff --git a/lustre/mdt/mdt_reint.c b/lustre/mdt/mdt_reint.c index 948a7b3..1cdb175 100644 --- a/lustre/mdt/mdt_reint.c +++ b/lustre/mdt/mdt_reint.c @@ -1248,17 +1248,18 @@ static void mdt_rename_unlock(struct lustre_handle *lh) * target. Source should not be ancestor of target dir. May be other rename * checks can be moved here later. */ -static int mdt_rename_sanity(struct mdt_thread_info *info, struct lu_fid *fid) +static int mdt_rename_sanity(struct mdt_thread_info *info, + const struct lu_fid *dir_fid, + const struct lu_fid *fid) { - struct mdt_reint_record *rr = &info->mti_rr; - struct lu_fid dst_fid = *rr->rr_fid2; struct mdt_object *dst; + struct lu_fid dst_fid = *dir_fid; int rc = 0; ENTRY; /* If the source and target are in the same directory, they can not * be parent/child relationship, so subdir check is not needed */ - if (lu_fid_eq(rr->rr_fid1, rr->rr_fid2)) + if (lu_fid_eq(dir_fid, fid)) return 0; do { @@ -1270,7 +1271,13 @@ static int mdt_rename_sanity(struct mdt_thread_info *info, struct lu_fid *fid) &dst_fid); mdt_object_put(info->mti_env, dst); if (rc != -EREMOTE && rc < 0) { - CERROR("Failed mdo_is_subdir(), rc %d\n", rc); + CERROR("%s: failed subdir check in "DFID" for " + DFID": rc = %d\n", + mdt_obd_name(info->mti_mdt), + PFID(dir_fid), PFID(fid), rc); + /* Return EINVAL only if a parent is the @fid */ + if (rc == -EINVAL) + rc = -EIO; } else { /* check the found fid */ if (lu_fid_eq(&dst_fid, fid)) @@ -1570,6 +1577,144 @@ out_put_parent: RETURN(rc); } +static struct mdt_object *mdt_object_find_check(struct mdt_thread_info *info, + const struct lu_fid *fid, + int idx) +{ + struct mdt_object *dir; + int rc; + ENTRY; + + dir = mdt_object_find(info->mti_env, info->mti_mdt, fid); + if (IS_ERR(dir)) + RETURN(dir); + + /* check early, the real version will be saved after locking */ + rc = mdt_version_get_check(info, dir, idx); + if (rc) + GOTO(out_put, rc); + + RETURN(dir); +out_put: + mdt_object_put(info->mti_env, dir); + return ERR_PTR(rc); +} + +static int mdt_object_lock_save(struct mdt_thread_info *info, + struct mdt_object *dir, + struct mdt_lock_handle *lh, + int idx) +{ + int rc; + + /* we lock the target dir if it is local */ + rc = mdt_object_lock(info, dir, lh, MDS_INODELOCK_UPDATE, + MDT_LOCAL_LOCK); + if (rc != 0) + return rc; + + /* get and save correct version after locking */ + mdt_version_get_save(info, dir, idx); + return 0; +} + + +static int mdt_rename_parents_lock(struct mdt_thread_info *info, + struct mdt_object **srcp, + struct mdt_object **tgtp) +{ + struct mdt_reint_record *rr = &info->mti_rr; + const struct lu_fid *fid_src = rr->rr_fid1; + const struct lu_fid *fid_tgt = rr->rr_fid2; + struct mdt_lock_handle *lh_src = &info->mti_lh[MDT_LH_PARENT]; + struct mdt_lock_handle *lh_tgt = &info->mti_lh[MDT_LH_CHILD]; + struct mdt_object *src; + struct mdt_object *tgt; + int reverse = 0; + int rc; + ENTRY; + + /* Check if the @src is not a child of the @tgt, otherwise a + * reverse locking must take place. + * + * Note: cannot be called after object_find, because if the object + * is destroyed in between it gets stuck in lu_object_find_at(), + * waiting for the last ref. */ + rc = mdt_rename_sanity(info, fid_src, fid_tgt); + if (rc == -EINVAL) + reverse = 1; + else if (rc) + RETURN(rc); + + /* find both parents. */ + src = mdt_object_find_check(info, fid_src, 0); + if (IS_ERR(src)) + RETURN(PTR_ERR(src)); + + OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_RENAME3, 5); + + if (lu_fid_eq(fid_src, fid_tgt)) { + tgt = src; + mdt_object_get(info->mti_env, tgt); + } else { + tgt = mdt_object_find_check(info, fid_tgt, 1); + if (IS_ERR(tgt)) + GOTO(err_src_put, rc = PTR_ERR(tgt)); + + if (unlikely(mdt_object_remote(tgt))) { + CDEBUG(D_INFO, "Source dir "DFID" target dir "DFID + "on different MDTs\n", PFID(fid_src), + PFID(fid_tgt)); + GOTO(err_tgt_put, rc = -EXDEV); + } + } + + /* lock parents in the proper order. */ + if (reverse) { + rc = mdt_object_lock_save(info, tgt, lh_tgt, 1); + if (rc) + GOTO(err_tgt_put, rc); + + OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_RENAME, 5); + + rc = mdt_object_lock_save(info, src, lh_src, 0); + } else { + rc = mdt_object_lock_save(info, src, lh_src, 0); + if (rc) + GOTO(err_tgt_put, rc); + + OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_RENAME, 5); + + if (tgt != src) + rc = mdt_object_lock_save(info, tgt, lh_tgt, 1); + else if (lh_src->mlh_pdo_hash != lh_tgt->mlh_pdo_hash) { + rc = mdt_pdir_hash_lock(info, lh_tgt, tgt, + MDS_INODELOCK_UPDATE); + OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_PDO_LOCK2, 10); + } + } + if (rc) + GOTO(err_unlock, rc); + + if (lu_object_is_dying(&tgt->mot_header)) + GOTO(err_unlock, rc = -ENOENT); + + *srcp = src; + *tgtp = tgt; + RETURN(0); + +err_unlock: + /* The order does not matter as the handle is checked inside, + * as well as not used handle. */ + mdt_object_unlock(info, src, lh_src, rc); + mdt_object_unlock(info, tgt, lh_tgt, rc); +err_tgt_put: + mdt_object_put(info->mti_env, tgt); +err_src_put: + mdt_object_put(info->mti_env, src); + RETURN(rc); +} + /* * VBR: rename versions in reply: 0 - src parent; 1 - tgt parent; * 2 - src child; 3 - tgt child. @@ -1592,8 +1737,8 @@ static int mdt_reint_rename_internal(struct mdt_thread_info *info, struct mdt_reint_record *rr = &info->mti_rr; struct md_attr *ma = &info->mti_attr; struct ptlrpc_request *req = mdt_info_req(info); - struct mdt_object *msrcdir; - struct mdt_object *mtgtdir; + struct mdt_object *msrcdir = NULL; + struct mdt_object *mtgtdir = NULL; struct mdt_object *mold; struct mdt_object *mnew = NULL; struct mdt_lock_handle *lh_srcdirp; @@ -1609,78 +1754,39 @@ static int mdt_reint_rename_internal(struct mdt_thread_info *info, PFID(rr->rr_fid1), PNAME(&rr->rr_name), PFID(rr->rr_fid2), PNAME(&rr->rr_tgt_name)); - /* step 1: lock the source dir. */ lh_srcdirp = &info->mti_lh[MDT_LH_PARENT]; mdt_lock_pdo_init(lh_srcdirp, LCK_PW, &rr->rr_name); - msrcdir = mdt_object_find_lock(info, rr->rr_fid1, lh_srcdirp, - MDS_INODELOCK_UPDATE); - if (IS_ERR(msrcdir)) - RETURN(PTR_ERR(msrcdir)); - - rc = mdt_version_get_check_save(info, msrcdir, 0); - if (rc) - GOTO(out_unlock_source, rc); - - /* step 2: find & lock the target dir. */ lh_tgtdirp = &info->mti_lh[MDT_LH_CHILD]; mdt_lock_pdo_init(lh_tgtdirp, LCK_PW, &rr->rr_tgt_name); - if (lu_fid_eq(rr->rr_fid1, rr->rr_fid2)) { - mdt_object_get(info->mti_env, msrcdir); - mtgtdir = msrcdir; - if (lh_tgtdirp->mlh_pdo_hash != lh_srcdirp->mlh_pdo_hash) { - rc = mdt_pdir_hash_lock(info, lh_tgtdirp, mtgtdir, - MDS_INODELOCK_UPDATE); - if (rc != 0) - GOTO(out_unlock_source, rc); - OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_PDO_LOCK2, 10); - } - } else { - mtgtdir = mdt_object_find(info->mti_env, info->mti_mdt, - rr->rr_fid2); - if (IS_ERR(mtgtdir)) - GOTO(out_unlock_source, rc = PTR_ERR(mtgtdir)); - /* check early, the real version will be saved after locking */ - rc = mdt_version_get_check(info, mtgtdir, 1); - if (rc) - GOTO(out_put_target, rc); + /* step 1&2: lock the source and target dirs. */ + rc = mdt_rename_parents_lock(info, &msrcdir, &mtgtdir); + if (rc) + RETURN(rc); - if (unlikely(mdt_object_remote(mtgtdir))) { - CDEBUG(D_INFO, "Source dir "DFID" target dir "DFID - "on different MDTs\n", PFID(rr->rr_fid1), - PFID(rr->rr_fid2)); - GOTO(out_put_target, rc = -EXDEV); - } else { - if (likely(mdt_object_exists(mtgtdir))) { - /* we lock the target dir if it is local */ - rc = mdt_object_lock(info, mtgtdir, lh_tgtdirp, - MDS_INODELOCK_UPDATE, - MDT_LOCAL_LOCK); - if (rc != 0) - GOTO(out_put_target, rc); - /* get and save correct version after locking */ - mdt_version_get_save(info, mtgtdir, 1); - } else { - GOTO(out_put_target, rc = -ESTALE); - } - } - } + OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_RENAME2, 5); /* step 3: find & lock the old object. */ fid_zero(old_fid); rc = mdt_lookup_version_check(info, msrcdir, &rr->rr_name, old_fid, 2); if (rc != 0) - GOTO(out_unlock_target, rc); + GOTO(out_unlock_parents, rc); if (lu_fid_eq(old_fid, rr->rr_fid1) || lu_fid_eq(old_fid, rr->rr_fid2)) - GOTO(out_unlock_target, rc = -EINVAL); + GOTO(out_unlock_parents, rc = -EINVAL); if (!fid_is_md_operative(old_fid)) - GOTO(out_unlock_target, rc = -EPERM); + GOTO(out_unlock_parents, rc = -EPERM); mold = mdt_object_find(info->mti_env, info->mti_mdt, old_fid); if (IS_ERR(mold)) - GOTO(out_unlock_target, rc = PTR_ERR(mold)); + GOTO(out_unlock_parents, rc = PTR_ERR(mold)); + + /* Check if @mtgtdir is subdir of @mold, before locking child + * to avoid reverse locking. */ + rc = mdt_rename_sanity(info, rr->rr_fid2, old_fid); + if (rc) + GOTO(out_put_old, rc); tgt_vbr_obj_set(info->mti_env, mdt_obj2dt(mold)); /* save version after locking */ @@ -1728,6 +1834,12 @@ static int mdt_reint_rename_internal(struct mdt_thread_info *info, if (rc != 0) GOTO(out_put_new, rc); + /* Check if @msrcdir is subdir of @mnew, before locking child + * to avoid reverse locking. */ + rc = mdt_rename_sanity(info, rr->rr_fid1, new_fid); + if (rc) + GOTO(out_unlock_old, rc); + /* We used to acquire MDS_INODELOCK_FULL here but we * can't do this now because a running HSM restore on * the rename onto victim will hold the layout @@ -1772,11 +1884,6 @@ static int mdt_reint_rename_internal(struct mdt_thread_info *info, mdt_fail_write(info->mti_env, info->mti_mdt->mdt_bottom, OBD_FAIL_MDS_REINT_RENAME_WRITE); - /* Check if @dst is subdir of @src. */ - rc = mdt_rename_sanity(info, old_fid); - if (rc) - GOTO(out_unlock_new, rc); - if (mnew != NULL) mutex_lock(&mnew->mot_lov_mutex); @@ -1799,7 +1906,6 @@ static int mdt_reint_rename_internal(struct mdt_thread_info *info, } EXIT; -out_unlock_new: if (mnew != NULL) mdt_object_unlock(info, mnew, lh_newp, rc); out_unlock_old: @@ -1809,11 +1915,8 @@ out_put_new: mdt_object_put(info->mti_env, mnew); out_put_old: mdt_object_put(info->mti_env, mold); -out_unlock_target: - mdt_object_unlock(info, mtgtdir, lh_tgtdirp, rc); -out_put_target: - mdt_object_put(info->mti_env, mtgtdir); -out_unlock_source: +out_unlock_parents: + mdt_object_unlock_put(info, mtgtdir, lh_tgtdirp, rc); mdt_object_unlock_put(info, msrcdir, lh_srcdirp, rc); return rc; } diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh index 8918f5d..2f5ab1b 100644 --- a/lustre/tests/sanityn.sh +++ b/lustre/tests/sanityn.sh @@ -2370,6 +2370,109 @@ test_51d() { } run_test 51d "layout lock: losing layout lock should clean up memory map region" +test_54_part1() +{ + echo "==> rename vs getattr vs setxattr should not deadlock" + mkdir -p $DIR/d1/d2/d3 || error "(1) mkdir failed" + + do_facet mds $LCTL set_param fail_loc=$1 + + mv -T $DIR/d1/d2/d3 $DIR/d1/d3 & + PID1=$! + sleep 1 + + stat $DIR/d1/d2 & + PID2=$! + sleep 1 + + setfattr -n user.attr1 -v value1 $DIR2/d1 || error "(2) setfattr failed" + wait $PID1 || error "(3) mv failed" + wait $PID2 || error "(4) stat failed" + echo + + rm -rf $DIR/d1 +} + +test_54_part2() { + echo "==> rename vs getattr vs open vs getattr should not deadlock" + mkdir -p $DIR/d1/d2/d3 || error "(1) mkdir failed" + + do_facet mds $LCTL set_param fail_loc=$1 + + mv -T $DIR/d1/d2/d3 $DIR/d1/d3 & + PID1=$! + sleep 1 + + stat $DIR/d1/d2 & + PID2=$! + sleep 1 + + $MULTIOP $DIR2/d1/d2 Oc & + PID3=$! + sleep 1 + + stat $DIR/d1 || error "(2) stat failed" + + wait $PID1 || error "(3) mv failed" + wait $PID2 || error "(4) stat failed" + wait $PID3 && error "(5) multiop failed" + echo + rm -rf $DIR/d1 +} + +test_54() { + local p="$TMP/$TESTSUITE-$TESTNAME.parameters" + save_lustre_params client "llite.*.xattr_cache" > $p + lctl set_param llite.*.xattr_cache 1 || + { skip "xattr cache is not supported"; return 0; } + +#define OBD_FAIL_MDS_RENAME 0x153 +#define OBD_FAIL_MDS_RENAME2 0x154 + test_54_part1 0x80000153 || error 10 + test_54_part1 0x80000154 || error 11 + test_54_part2 0x80000153 || error 12 + test_54_part2 0x80000154 || error 13 + + restore_lustre_params < $p + rm -f $p +} +run_test 54 "rename locking" + +test_55a() { + mkdir -p $DIR/d1/d2 $DIR/d3 || error "(1) mkdir failed" + +#define OBD_FAIL_MDS_RENAME 0x153 + do_facet mds $LCTL set_param fail_loc=0x80000153 + + mv -T $DIR/d1/d2 $DIR/d3/d2 & + PID1=$! + sleep 1 + + rm -r $DIR2/d3 + wait $PID1 && error "(2) mv succeeded" + + rm -rf $DIR/d1 +} +run_test 55a "rename vs unlink target dir" + +test_55b() +{ + mkdir -p $DIR/d1/d2 $DIR/d3 || error "(1) mkdir failed" + +#define OBD_FAIL_MDS_RENAME 0x155 + do_facet mds $LCTL set_param fail_loc=0x80000155 + + mv -T $DIR/d1/d2 $DIR/d3/d2 & + PID1=$! + sleep 1 + + rm -r $DIR2/d1 + wait $PID1 && error "(2) mv succeeded" + + rm -rf $DIR/d1 +} +run_test 55b "rename vs unlink source dir" + test_60() { [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.3.0) ]] || { skip "Need MDS version at least 2.3.0"; return; } -- 1.8.3.1