From 33dc40d58ef6eb8b384fce1da9f8d21cad4ef6d8 Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Fri, 31 Aug 2018 18:03:36 +0900 Subject: [PATCH] LU-10262 mdt: mdt_reint_open: check EEXIST without lock Many applications blindly open files with O_CREAT, and the mds gets a write lock to the parent directory for these even if the file already exists. Checking for file existence first lets us take a PR lock if file already existed even if O_CREAT was specified. This opens up multiple races between the first lookup and the actual locking, in each of them drop the resources we aquired and retry from scratch to keep things as far from complicated as possible, with mixed success. Update (eaujames): - rebase the patch - update tests Performance tests results: The array below presents the average "open" syscall latency for 20 files in a single directory accessed by 400 different clients. _______________________________________________________________ | Test cases | without patch | with patch | %improvement | |___________________|_______________|____________|______________| | readonly | 0.960s | 0.973s | -1.40% | |___________________|_______________|____________|______________| | readonly cached | 0.372s | 0.372s | +0.01% | |___________________|_______________|____________|______________| | O_CREAT+precreate | 1.645s | 0.968s | +41.13% | |___________________|_______________|____________|______________| | O_CREAT cached | 0.632s | 0.623s | +1.34% | |___________________|_______________|____________|______________| | O_CREAT | 1.261s | 1.093s | +13.32% | |___________________|_______________|____________|______________| (for more detail, see the ticket comments section) This patch optimizes concurent opens with O_CREAT flag when dentry are not cached by clients. Change-Id: I247b579d14d20036f89033c99ece457d70ba19e7 Signed-off-by: Dominique Martinet Reviewed-on: https://review.whamcloud.com/33098 Reviewed-by: Andreas Dilger Tested-by: jenkins Tested-by: Maloo Reviewed-by: Etienne AUJAMES Reviewed-by: Yingjin Qian Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 2 + lustre/mdt/mdt_open.c | 35 +++++++--- lustre/mdt/mdt_reint.c | 4 ++ lustre/tests/sanityn.sh | 161 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 192 insertions(+), 10 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 847c443..b871dad 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -242,6 +242,8 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDS_RMFID_NET 0x166 #define OBD_FAIL_MDS_CREATE_RACE 0x167 #define OBD_FAIL_MDS_STATFS_SPOOF 0x168 +#define OBD_FAIL_MDS_REINT_OPEN 0x169 +#define OBD_FAIL_MDS_REINT_OPEN2 0x16a /* layout lock */ #define OBD_FAIL_MDS_NO_LL_GETATTR 0x170 diff --git a/lustre/mdt/mdt_open.c b/lustre/mdt/mdt_open.c index 5b474a4..ef12eb1 100644 --- a/lustre/mdt/mdt_open.c +++ b/lustre/mdt/mdt_open.c @@ -1313,6 +1313,7 @@ int mdt_reint_open(struct mdt_thread_info *info, struct mdt_lock_handle *lhc) int result, rc; int created = 0; int object_locked = 0; + enum ldlm_mode lock_mode = LCK_PR; u32 msg_flags; ktime_t kstart = ktime_get(); @@ -1390,25 +1391,27 @@ int mdt_reint_open(struct mdt_thread_info *info, struct mdt_lock_handle *lhc) if (result < 0) GOTO(out, result); - lh = &info->mti_lh[MDT_LH_PARENT]; - mdt_lock_pdo_init(lh, (open_flags & MDS_OPEN_CREAT) ? LCK_PW : LCK_PR, - &rr->rr_name); - parent = mdt_object_find(info->mti_env, mdt, rr->rr_fid1); if (IS_ERR(parent)) GOTO(out, result = PTR_ERR(parent)); - result = mdt_object_lock(info, parent, lh, MDS_INODELOCK_UPDATE); - if (result != 0) { + /* get and check version of parent */ + result = mdt_version_get_check(info, parent, 0); + if (result) { mdt_object_put(info->mti_env, parent); GOTO(out, result); } - /* get and check version of parent */ - result = mdt_version_get_check(info, parent, 0); - if (result) - GOTO(out_parent, result); + OBD_RACE(OBD_FAIL_MDS_REINT_OPEN); +again_pw: + lh = &info->mti_lh[MDT_LH_PARENT]; + mdt_lock_pdo_init(lh, lock_mode, &rr->rr_name); + result = mdt_object_lock(info, parent, lh, MDS_INODELOCK_UPDATE); + if (result != 0) { + mdt_object_put(info->mti_env, parent); + GOTO(out, result); + } fid_zero(child_fid); result = -ENOENT; @@ -1424,12 +1427,24 @@ int mdt_reint_open(struct mdt_thread_info *info, struct mdt_lock_handle *lhc) if (result != 0 && result != -ENOENT) GOTO(out_parent, result); + OBD_RACE(OBD_FAIL_MDS_REINT_OPEN2); + if (result == -ENOENT) { mdt_set_disposition(info, ldlm_rep, DISP_LOOKUP_NEG); if (!(open_flags & MDS_OPEN_CREAT)) GOTO(out_parent, result); if (mdt_rdonly(req->rq_export)) GOTO(out_parent, result = -EROFS); + + if (lock_mode == LCK_PR) { + /* first pass: get write lock and restart */ + mdt_object_unlock(info, parent, lh, 1); + mdt_clear_disposition(info, ldlm_rep, DISP_LOOKUP_NEG); + mdt_lock_handle_init(lh); + lock_mode = LCK_PW; + goto again_pw; + } + *child_fid = *info->mti_rr.rr_fid2; LASSERTF(fid_is_sane(child_fid), "fid="DFID"\n", PFID(child_fid)); diff --git a/lustre/mdt/mdt_reint.c b/lustre/mdt/mdt_reint.c index 35eac50..f6c2e0b 100644 --- a/lustre/mdt/mdt_reint.c +++ b/lustre/mdt/mdt_reint.c @@ -1071,6 +1071,8 @@ static int mdt_reint_unlink(struct mdt_thread_info *info, GOTO(put_parent, rc); } + OBD_RACE(OBD_FAIL_MDS_REINT_OPEN); + OBD_RACE(OBD_FAIL_MDS_REINT_OPEN2); relock: parent_lh = &info->mti_lh[MDT_LH_PARENT]; mdt_lock_pdo_init(parent_lh, LCK_PW, &rr->rr_name); @@ -2630,6 +2632,8 @@ static int mdt_reint_rename(struct mdt_thread_info *info, lh_srcdirp = &info->mti_lh[MDT_LH_PARENT]; lh_tgtdirp = &info->mti_lh[MDT_LH_CHILD]; + OBD_RACE(OBD_FAIL_MDS_REINT_OPEN); + OBD_RACE(OBD_FAIL_MDS_REINT_OPEN2); relock: mdt_lock_pdo_init(lh_srcdirp, LCK_PW, &rr->rr_name); mdt_lock_pdo_init(lh_tgtdirp, LCK_PW, &rr->rr_tgt_name); diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh index d09a62f..55c9312 100755 --- a/lustre/tests/sanityn.sh +++ b/lustre/tests/sanityn.sh @@ -1873,6 +1873,57 @@ test_41h() { } run_test 41h "pdirops: create vs readdir ==============" +sub_test_41i() { + local PID1 PID2 + local fail_loc="$1" + local ret=0 + + do_nodes $(comma_list $(mdts_nodes)) \ + "lctl set_param -n fail_loc=${fail_loc} || true" &>/dev/null + + $MULTIOP $DIR1/$tfile oO_CREAT:O_EXCL:c 2>/dev/null & + PID1=$! + sleep 0.2 + $MULTIOP $DIR2/$tfile oO_CREAT:O_EXCL:c 2>/dev/null & + PID2=$! + + if ! wait $PID1 && ! wait $PID2; then + echo "Both creates failed (1 should fail, 1 should succeed)" + ret=1 + elif wait $PID1 && wait $PID2; then + echo "Both creates succeeded (1 should fail, 1 should succeed)" + ret=2 + fi + + #Clean + do_nodes $(comma_list $(mdts_nodes)) \ + "lctl set_param -n fail_loc=0x0 || true" &>/dev/null + rm -f $DIR/$tfile + + return $ret +} + +test_41i() { + [[ $MDS1_VERSION -le $(version_code 2.13.56) ]] || + skip "Need MDS version newer than 2.13.56" + local msg fail_loc + +#define OBD_FAIL_ONCE|OBD_FAIL_MDS_REINT_OPEN 0x169 +#define OBD_FAIL_ONCE|OBD_FAIL_MDS_REINT_OPEN2 0x16a + for fail_loc in "0x80000169" "0x8000016a"; do + echo "Begin 100 tests with fail_loc=$fail_loc" + printf "Progress: " + for i in {1..100}; do + printf "*" + msg=$(sub_test_41i "$fail_loc") || + { echo; error "iter=$i : $msg"; } + done + echo + done +} +run_test 41i "reint_open: create vs create" + + # test 42: unlink and blocking operations test_42a() { pdo_lru_clear @@ -2208,6 +2259,62 @@ test_43j() { } run_test 43j "racy mkdir return EEXIST ==============" +sub_test_43k() { + local PID1 PID2 + local fail_loc="$1" + local ret=0 + + # We test in a separate directory to be able to unblock server thread in + # cfs_race() if LCK_PW is taken on the parent by mdt_reint_unlink. + test_mkdir $DIR2/$tdir + touch $DIR2/$tdir/$tfile + + do_nodes $(comma_list $(mdts_nodes)) \ + "lctl set_param -n fail_loc=${fail_loc} || true" &>/dev/null + echo content > $DIR1/$tdir/$tfile & PID1=$! + pdo_sched + multiop $DIR2/$tdir/$tfile u & PID2=$! + + wait $PID1 || + { ret=$?; \ + echo -n "overwriting $tfile should succeed (err=$ret); "; } + wait $PID2 || + { ret=$?; \ + echo -n "unlinking $tfile should succeed (err=$ret);"; } + + #Clean + do_nodes $(comma_list $(mdts_nodes)) \ + "lctl set_param -n fail_loc=0x0 || true" &>/dev/null + rm -rf $DIR/$tdir + + return $ret +} + +test_43k() { + [[ $MDS1_VERSION -le $(version_code 2.13.56) ]] || + skip "Need MDS version newer than 2.13.56" + local msg fail_loc + +#define OBD_FAIL_ONCE|OBD_FAIL_MDS_REINT_OPEN 0x169 +#define OBD_FAIL_ONCE|OBD_FAIL_MDS_REINT_OPEN2 0x16a + for fail_loc in "0x80000169" "0x8000016a"; do + echo "Begin 100 tests with fail_loc=$fail_loc" + printf "Progress: " + for i in {1..100}; do + printf "*" + msg=$(sub_test_43k "$fail_loc") || + { echo; error "iter=$i : $msg"; } + done + echo + done + + #Clean + reset_fail_loc + + return 0 +} +run_test 43k "unlink vs create" + # test 44: rename tgt and blocking operations test_44a() { pdo_lru_clear @@ -2555,6 +2662,60 @@ test_45i() { } run_test 45i "pdirops: rename src vs remote mkdir" +sub_test_45j() { + local PID1 PID2 + local fail_loc="$1" + local ret=0 + + # We test in a sparate directory to be able to unblock server thread in + # cfs_race if LCK_PW is taken on the parent by mdt_reint_rename. + test_mkdir $DIR2/$tdir + echo file1 > $DIR2/$tdir/$tfile + echo file2 > $DIR2/$tdir/$tfile-2 + + do_nodes $(comma_list $(mdts_nodes)) \ + "lctl set_param -n fail_loc=${fail_loc} || true" &>/dev/null + + cat $DIR1/$tdir/$tfile >/dev/null & + PID1=$! + pdo_sched + mrename $DIR2/$tdir/$tfile-2 $DIR2/$tdir/$tfile > /dev/null & + PID2=$! + + wait $PID1 || + { ret=$?; echo -n "cat $tfile should succeed (err=$ret); "; } + wait $PID2 || + { ret=$?; \ + echo -n "mrename $tfile-2 to $tfile failed (err=$ret);"; } + + #Clean + do_nodes $(comma_list $(mdts_nodes)) \ + "lctl set_param -n fail_loc=0x0 || true" &>/dev/null + rm -rf $DIR/$tdir + + return $ret +} + +test_45j() { + [[ $MDS1_VERSION -le $(version_code 2.13.56) ]] || + skip "Need MDS version newer than 2.13.56" + local msg fail_loc + +#define OBD_FAIL_ONCE|OBD_FAIL_MDS_REINT_OPEN 0x169 +#define OBD_FAIL_ONCE|OBD_FAIL_MDS_REINT_OPEN2 0x16a + for fail_loc in "0x80000169" "0x8000016a"; do + echo "Begin 100 tests with fail_loc=$fail_loc" + printf "Progress: " + for i in {1..100}; do + printf "*" + msg=$(sub_test_45j "$fail_loc") || + { echo; error "iter=$i : $msg"; } + done + echo + done +} +run_test 45j "read vs rename ==============" + # test 46: link and blocking operations test_46a() { pdo_lru_clear -- 1.8.3.1