Whamcloud - gitweb
LU-10262 mdt: mdt_reint_open: check EEXIST without lock 98/33098/17
authorDominique Martinet <dominique.martinet@cea.fr>
Fri, 31 Aug 2018 09:03:36 +0000 (18:03 +0900)
committerOleg Drokin <green@whamcloud.com>
Wed, 9 Dec 2020 07:48:43 +0000 (07:48 +0000)
Many applications blindly open files with O_CREAT, and the mds gets a
write lock to the parent directory for these even if the file already
exists.
Checking for file existence first lets us take a PR lock if file
already existed even if O_CREAT was specified.

This opens up multiple races between the first lookup and the actual
locking, in each of them drop the resources we aquired and retry from
scratch to keep things as far from complicated as possible, with mixed
success.

Update (eaujames):
 - rebase the patch
 - update tests

Performance tests results:

The array below presents the average "open" syscall latency for 20
files in a single directory accessed by 400 different clients.
 _______________________________________________________________
| Test cases        | without patch | with patch | %improvement |
|___________________|_______________|____________|______________|
| readonly          | 0.960s        | 0.973s     | -1.40%       |
|___________________|_______________|____________|______________|
| readonly cached   | 0.372s        | 0.372s     | +0.01%       |
|___________________|_______________|____________|______________|
| O_CREAT+precreate | 1.645s        | 0.968s     | +41.13%      |
|___________________|_______________|____________|______________|
| O_CREAT cached    | 0.632s        | 0.623s     | +1.34%       |
|___________________|_______________|____________|______________|
| O_CREAT           | 1.261s        | 1.093s     | +13.32%      |
|___________________|_______________|____________|______________|
(for more detail, see the ticket comments section)

This patch optimizes concurent opens with O_CREAT flag when dentry are
not cached by clients.

Change-Id: I247b579d14d20036f89033c99ece457d70ba19e7
Signed-off-by: Dominique Martinet <dominique.martinet@cea.fr>
Reviewed-on: https://review.whamcloud.com/33098
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Etienne AUJAMES <eaujames@ddn.com>
Reviewed-by: Yingjin Qian <qian@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/mdt/mdt_open.c
lustre/mdt/mdt_reint.c
lustre/tests/sanityn.sh

index 847c443..b871dad 100644 (file)
@@ -242,6 +242,8 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_MDS_RMFID_NET          0x166
 #define OBD_FAIL_MDS_CREATE_RACE        0x167
 #define OBD_FAIL_MDS_STATFS_SPOOF       0x168
+#define OBD_FAIL_MDS_REINT_OPEN                 0x169
+#define OBD_FAIL_MDS_REINT_OPEN2        0x16a
 
 /* layout lock */
 #define OBD_FAIL_MDS_NO_LL_GETATTR      0x170
index 5b474a4..ef12eb1 100644 (file)
@@ -1313,6 +1313,7 @@ int mdt_reint_open(struct mdt_thread_info *info, struct mdt_lock_handle *lhc)
        int result, rc;
        int created = 0;
        int object_locked = 0;
+       enum ldlm_mode lock_mode = LCK_PR;
        u32 msg_flags;
        ktime_t kstart = ktime_get();
 
@@ -1390,25 +1391,27 @@ int mdt_reint_open(struct mdt_thread_info *info, struct mdt_lock_handle *lhc)
        if (result < 0)
                GOTO(out, result);
 
-       lh = &info->mti_lh[MDT_LH_PARENT];
-       mdt_lock_pdo_init(lh, (open_flags & MDS_OPEN_CREAT) ? LCK_PW : LCK_PR,
-                         &rr->rr_name);
-
        parent = mdt_object_find(info->mti_env, mdt, rr->rr_fid1);
        if (IS_ERR(parent))
                GOTO(out, result = PTR_ERR(parent));
 
-       result = mdt_object_lock(info, parent, lh, MDS_INODELOCK_UPDATE);
-       if (result != 0) {
+       /* get and check version of parent */
+       result = mdt_version_get_check(info, parent, 0);
+       if (result) {
                mdt_object_put(info->mti_env, parent);
                GOTO(out, result);
        }
 
-       /* get and check version of parent */
-       result = mdt_version_get_check(info, parent, 0);
-       if (result)
-               GOTO(out_parent, result);
+       OBD_RACE(OBD_FAIL_MDS_REINT_OPEN);
+again_pw:
+       lh = &info->mti_lh[MDT_LH_PARENT];
+       mdt_lock_pdo_init(lh, lock_mode, &rr->rr_name);
 
+       result = mdt_object_lock(info, parent, lh, MDS_INODELOCK_UPDATE);
+       if (result != 0) {
+               mdt_object_put(info->mti_env, parent);
+               GOTO(out, result);
+       }
        fid_zero(child_fid);
 
        result = -ENOENT;
@@ -1424,12 +1427,24 @@ int mdt_reint_open(struct mdt_thread_info *info, struct mdt_lock_handle *lhc)
        if (result != 0 && result != -ENOENT)
                GOTO(out_parent, result);
 
+       OBD_RACE(OBD_FAIL_MDS_REINT_OPEN2);
+
        if (result == -ENOENT) {
                mdt_set_disposition(info, ldlm_rep, DISP_LOOKUP_NEG);
                if (!(open_flags & MDS_OPEN_CREAT))
                        GOTO(out_parent, result);
                if (mdt_rdonly(req->rq_export))
                        GOTO(out_parent, result = -EROFS);
+
+               if (lock_mode == LCK_PR) {
+                       /* first pass: get write lock and restart */
+                       mdt_object_unlock(info, parent, lh, 1);
+                       mdt_clear_disposition(info, ldlm_rep, DISP_LOOKUP_NEG);
+                       mdt_lock_handle_init(lh);
+                       lock_mode = LCK_PW;
+                       goto again_pw;
+               }
+
                *child_fid = *info->mti_rr.rr_fid2;
                LASSERTF(fid_is_sane(child_fid), "fid="DFID"\n",
                         PFID(child_fid));
index 35eac50..f6c2e0b 100644 (file)
@@ -1071,6 +1071,8 @@ static int mdt_reint_unlink(struct mdt_thread_info *info,
                        GOTO(put_parent, rc);
        }
 
+       OBD_RACE(OBD_FAIL_MDS_REINT_OPEN);
+       OBD_RACE(OBD_FAIL_MDS_REINT_OPEN2);
 relock:
        parent_lh = &info->mti_lh[MDT_LH_PARENT];
        mdt_lock_pdo_init(parent_lh, LCK_PW, &rr->rr_name);
@@ -2630,6 +2632,8 @@ static int mdt_reint_rename(struct mdt_thread_info *info,
        lh_srcdirp = &info->mti_lh[MDT_LH_PARENT];
        lh_tgtdirp = &info->mti_lh[MDT_LH_CHILD];
 
+       OBD_RACE(OBD_FAIL_MDS_REINT_OPEN);
+       OBD_RACE(OBD_FAIL_MDS_REINT_OPEN2);
 relock:
        mdt_lock_pdo_init(lh_srcdirp, LCK_PW, &rr->rr_name);
        mdt_lock_pdo_init(lh_tgtdirp, LCK_PW, &rr->rr_tgt_name);
index d09a62f..55c9312 100755 (executable)
@@ -1873,6 +1873,57 @@ test_41h() {
 }
 run_test 41h "pdirops: create vs readdir =============="
 
+sub_test_41i() {
+       local PID1 PID2
+       local fail_loc="$1"
+       local ret=0
+
+       do_nodes $(comma_list $(mdts_nodes)) \
+               "lctl set_param -n fail_loc=${fail_loc} || true" &>/dev/null
+
+       $MULTIOP $DIR1/$tfile oO_CREAT:O_EXCL:c 2>/dev/null &
+       PID1=$!
+       sleep 0.2
+       $MULTIOP $DIR2/$tfile oO_CREAT:O_EXCL:c 2>/dev/null &
+       PID2=$!
+
+       if ! wait $PID1 && ! wait $PID2; then
+               echo "Both creates failed (1 should fail, 1 should succeed)"
+               ret=1
+       elif wait $PID1 && wait $PID2; then
+               echo "Both creates succeeded (1 should fail, 1 should succeed)"
+               ret=2
+       fi
+
+       #Clean
+       do_nodes $(comma_list $(mdts_nodes)) \
+               "lctl set_param -n fail_loc=0x0 || true" &>/dev/null
+       rm -f $DIR/$tfile
+
+       return $ret
+}
+
+test_41i() {
+       [[ $MDS1_VERSION -le $(version_code 2.13.56) ]] ||
+               skip "Need MDS version newer than 2.13.56"
+       local msg fail_loc
+
+#define OBD_FAIL_ONCE|OBD_FAIL_MDS_REINT_OPEN         0x169
+#define OBD_FAIL_ONCE|OBD_FAIL_MDS_REINT_OPEN2        0x16a
+       for fail_loc in "0x80000169" "0x8000016a"; do
+               echo "Begin 100 tests with fail_loc=$fail_loc"
+               printf "Progress: "
+               for i in {1..100}; do
+                       printf "*"
+                       msg=$(sub_test_41i "$fail_loc") ||
+                               { echo; error "iter=$i : $msg"; }
+               done
+               echo
+       done
+}
+run_test 41i "reint_open: create vs create"
+
+
 # test 42: unlink and blocking operations
 test_42a() {
        pdo_lru_clear
@@ -2208,6 +2259,62 @@ test_43j() {
 }
 run_test 43j "racy mkdir return EEXIST =============="
 
+sub_test_43k() {
+       local PID1 PID2
+       local fail_loc="$1"
+       local ret=0
+
+       # We test in a separate directory to be able to unblock server thread in
+       # cfs_race() if LCK_PW is taken on the parent by mdt_reint_unlink.
+       test_mkdir $DIR2/$tdir
+       touch $DIR2/$tdir/$tfile
+
+       do_nodes $(comma_list $(mdts_nodes)) \
+               "lctl set_param -n fail_loc=${fail_loc} || true" &>/dev/null
+       echo content > $DIR1/$tdir/$tfile & PID1=$!
+       pdo_sched
+       multiop $DIR2/$tdir/$tfile u & PID2=$!
+
+       wait $PID1 ||
+               { ret=$?; \
+               echo -n "overwriting $tfile should succeed (err=$ret); "; }
+       wait $PID2 ||
+               { ret=$?; \
+               echo -n "unlinking $tfile should succeed (err=$ret);"; }
+
+       #Clean
+       do_nodes $(comma_list $(mdts_nodes)) \
+               "lctl set_param -n fail_loc=0x0 || true" &>/dev/null
+       rm -rf $DIR/$tdir
+
+       return $ret
+}
+
+test_43k() {
+       [[ $MDS1_VERSION -le $(version_code 2.13.56) ]] ||
+               skip "Need MDS version newer than 2.13.56"
+       local msg fail_loc
+
+#define OBD_FAIL_ONCE|OBD_FAIL_MDS_REINT_OPEN         0x169
+#define OBD_FAIL_ONCE|OBD_FAIL_MDS_REINT_OPEN2        0x16a
+       for fail_loc in "0x80000169" "0x8000016a"; do
+               echo "Begin 100 tests with fail_loc=$fail_loc"
+               printf "Progress: "
+               for i in {1..100}; do
+                       printf "*"
+                       msg=$(sub_test_43k "$fail_loc") ||
+                               { echo; error "iter=$i : $msg"; }
+               done
+               echo
+       done
+
+       #Clean
+       reset_fail_loc
+
+       return 0
+}
+run_test 43k "unlink vs create"
+
 # test 44: rename tgt and blocking operations
 test_44a() {
        pdo_lru_clear
@@ -2555,6 +2662,60 @@ test_45i() {
 }
 run_test 45i "pdirops: rename src vs remote mkdir"
 
+sub_test_45j() {
+       local PID1 PID2
+       local fail_loc="$1"
+       local ret=0
+
+       # We test in a sparate directory to be able to unblock server thread in
+       # cfs_race if LCK_PW is taken on the parent by mdt_reint_rename.
+       test_mkdir $DIR2/$tdir
+       echo file1 > $DIR2/$tdir/$tfile
+       echo file2 > $DIR2/$tdir/$tfile-2
+
+       do_nodes $(comma_list $(mdts_nodes)) \
+               "lctl set_param -n fail_loc=${fail_loc} || true" &>/dev/null
+
+       cat $DIR1/$tdir/$tfile >/dev/null &
+       PID1=$!
+       pdo_sched
+       mrename $DIR2/$tdir/$tfile-2 $DIR2/$tdir/$tfile > /dev/null &
+       PID2=$!
+
+       wait $PID1 ||
+               { ret=$?; echo -n "cat $tfile should succeed (err=$ret); "; }
+       wait $PID2 ||
+               { ret=$?; \
+               echo -n "mrename $tfile-2 to $tfile failed (err=$ret);"; }
+
+       #Clean
+       do_nodes $(comma_list $(mdts_nodes)) \
+               "lctl set_param -n fail_loc=0x0 || true" &>/dev/null
+       rm -rf $DIR/$tdir
+
+       return $ret
+}
+
+test_45j() {
+       [[ $MDS1_VERSION -le $(version_code 2.13.56) ]] ||
+               skip "Need MDS version newer than 2.13.56"
+       local msg fail_loc
+
+#define OBD_FAIL_ONCE|OBD_FAIL_MDS_REINT_OPEN         0x169
+#define OBD_FAIL_ONCE|OBD_FAIL_MDS_REINT_OPEN2        0x16a
+       for fail_loc in "0x80000169" "0x8000016a"; do
+               echo "Begin 100 tests with fail_loc=$fail_loc"
+               printf "Progress: "
+               for i in {1..100}; do
+                       printf "*"
+                       msg=$(sub_test_45j "$fail_loc") ||
+                               { echo; error "iter=$i : $msg"; }
+               done
+               echo
+       done
+}
+run_test 45j "read vs rename =============="
+
 # test 46: link and blocking operations
 test_46a() {
        pdo_lru_clear