Whamcloud - gitweb
LU-10262 mdt: mdt_reint_open: check EEXIST without lock 72/41172/2
authorDominique Martinet <dominique.martinet@cea.fr>
Fri, 31 Aug 2018 09:03:36 +0000 (18:03 +0900)
committerOleg Drokin <green@whamcloud.com>
Thu, 4 Mar 2021 08:35:34 +0000 (08:35 +0000)
Many applications blindly open files with O_CREAT, and the mds gets a
write lock to the parent directory for these even if the file already
exists.
Checking for file existence first lets us take a PR lock if file
already existed even if O_CREAT was specified.

This opens up multiple races between the first lookup and the actual
locking, in each of them drop the resources we aquired and retry from
scratch to keep things as far from complicated as possible, with mixed
success.

Update (eaujames):
 - rebase the patch
 - update tests

Performance tests results:

The array below presents the average "open" syscall latency for 20
files in a single directory accessed by 400 different clients.
 _______________________________________________________________
| Test cases        | without patch | with patch | %improvement |
|___________________|_______________|____________|______________|
| readonly          | 0.960s        | 0.973s     | -1.40%       |
|___________________|_______________|____________|______________|
| readonly cached   | 0.372s        | 0.372s     | +0.01%       |
|___________________|_______________|____________|______________|
| O_CREAT+precreate | 1.645s        | 0.968s     | +41.13%      |
|___________________|_______________|____________|______________|
| O_CREAT cached    | 0.632s        | 0.623s     | +1.34%       |
|___________________|_______________|____________|______________|
| O_CREAT           | 1.261s        | 1.093s     | +13.32%      |
|___________________|_______________|____________|______________|
(for more detail, see the ticket comments section)

This patch optimizes concurent opens with O_CREAT flag when dentry are
not cached by clients.

Lustre-change: https://review.whamcloud.com/33098
Lustre-commit: 33dc40d58ef6eb8b384fce1da9f8d21cad4ef6d8

Change-Id: I247b579d14d20036f89033c99ece457d70ba19e7
Signed-off-by: Dominique Martinet <dominique.martinet@cea.fr>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Etienne AUJAMES <eaujames@ddn.com>
Reviewed-by: Yingjin Qian <qian@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Signed-off-by: Etienne AUJAMES <eaujames@ddn.com>
Reviewed-on: https://review.whamcloud.com/41172
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lustre/include/obd_support.h
lustre/mdt/mdt_open.c
lustre/mdt/mdt_reint.c
lustre/tests/sanityn.sh

index 1327843..84e5be7 100644 (file)
@@ -252,6 +252,8 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_MDS_HSM_CDT_DELAY      0x164
 #define OBD_FAIL_MDS_ORPHAN_DELETE      0x165
 #define OBD_FAIL_MDS_RMFID_NET          0x166
+#define OBD_FAIL_MDS_REINT_OPEN                 0x169
+#define OBD_FAIL_MDS_REINT_OPEN2        0x16a
 
 /* layout lock */
 #define OBD_FAIL_MDS_NO_LL_GETATTR      0x170
index 3ef8da4..093dbfd 100644 (file)
@@ -1305,6 +1305,7 @@ int mdt_reint_open(struct mdt_thread_info *info, struct mdt_lock_handle *lhc)
        int result, rc;
        int created = 0;
        int object_locked = 0;
+       enum ldlm_mode lock_mode = LCK_PR;
        u32 msg_flags;
 
        ENTRY;
@@ -1387,25 +1388,27 @@ int mdt_reint_open(struct mdt_thread_info *info, struct mdt_lock_handle *lhc)
                GOTO(out, result);
 
 again:
-       lh = &info->mti_lh[MDT_LH_PARENT];
-       mdt_lock_pdo_init(lh, (open_flags & MDS_OPEN_CREAT) ? LCK_PW : LCK_PR,
-                         &rr->rr_name);
-
        parent = mdt_object_find(info->mti_env, mdt, rr->rr_fid1);
        if (IS_ERR(parent))
                GOTO(out, result = PTR_ERR(parent));
 
-       result = mdt_object_lock(info, parent, lh, MDS_INODELOCK_UPDATE);
-       if (result != 0) {
+       /* get and check version of parent */
+       result = mdt_version_get_check(info, parent, 0);
+       if (result) {
                mdt_object_put(info->mti_env, parent);
                GOTO(out, result);
        }
 
-       /* get and check version of parent */
-       result = mdt_version_get_check(info, parent, 0);
-       if (result)
-               GOTO(out_parent, result);
+       OBD_RACE(OBD_FAIL_MDS_REINT_OPEN);
+again_pw:
+       lh = &info->mti_lh[MDT_LH_PARENT];
+       mdt_lock_pdo_init(lh, lock_mode, &rr->rr_name);
 
+       result = mdt_object_lock(info, parent, lh, MDS_INODELOCK_UPDATE);
+       if (result != 0) {
+               mdt_object_put(info->mti_env, parent);
+               GOTO(out, result);
+       }
        fid_zero(child_fid);
 
        result = -ENOENT;
@@ -1421,6 +1424,7 @@ again:
        if (result != 0 && result != -ENOENT && result != -ESTALE)
                GOTO(out_parent, result);
 
+       OBD_RACE(OBD_FAIL_MDS_REINT_OPEN2);
        if (result == -ENOENT || result == -ESTALE) {
                /* If the object is dead, let's check if the object
                 * is being migrated to a new object */
@@ -1464,6 +1468,16 @@ again:
                        GOTO(out_parent, result);
                if (mdt_rdonly(req->rq_export))
                        GOTO(out_parent, result = -EROFS);
+
+               if (lock_mode == LCK_PR) {
+                       /* first pass: get write lock and restart */
+                       mdt_object_unlock(info, parent, lh, 1);
+                       mdt_clear_disposition(info, ldlm_rep, DISP_LOOKUP_NEG);
+                       mdt_lock_handle_init(lh);
+                       lock_mode = LCK_PW;
+                       goto again_pw;
+               }
+
                *child_fid = *info->mti_rr.rr_fid2;
                LASSERTF(fid_is_sane(child_fid), "fid="DFID"\n",
                         PFID(child_fid));
index 42657c6..051fe5f 100644 (file)
@@ -875,6 +875,8 @@ static int mdt_reint_unlink(struct mdt_thread_info *info,
                        GOTO(put_parent, rc);
        }
 
+       OBD_RACE(OBD_FAIL_MDS_REINT_OPEN);
+       OBD_RACE(OBD_FAIL_MDS_REINT_OPEN2);
 relock:
        parent_lh = &info->mti_lh[MDT_LH_PARENT];
        mdt_lock_pdo_init(parent_lh, LCK_PW, &rr->rr_name);
@@ -2410,6 +2412,8 @@ static int mdt_reint_rename(struct mdt_thread_info *info,
        lh_srcdirp = &info->mti_lh[MDT_LH_PARENT];
        lh_tgtdirp = &info->mti_lh[MDT_LH_CHILD];
 
+       OBD_RACE(OBD_FAIL_MDS_REINT_OPEN);
+       OBD_RACE(OBD_FAIL_MDS_REINT_OPEN2);
 relock:
        mdt_lock_pdo_init(lh_srcdirp, LCK_PW, &rr->rr_name);
        mdt_lock_pdo_init(lh_tgtdirp, LCK_PW, &rr->rr_tgt_name);
index 6f4d639..c8847e7 100755 (executable)
@@ -1766,6 +1766,57 @@ test_41h() {
 }
 run_test 41h "pdirops: create vs readdir =============="
 
+sub_test_41i() {
+       local PID1 PID2
+       local fail_loc="$1"
+       local ret=0
+
+       do_nodes $(comma_list $(mdts_nodes)) \
+               "lctl set_param -n fail_loc=${fail_loc} || true" &>/dev/null
+
+       $MULTIOP $DIR1/$tfile oO_CREAT:O_EXCL:c 2>/dev/null &
+       PID1=$!
+       sleep 0.2
+       $MULTIOP $DIR2/$tfile oO_CREAT:O_EXCL:c 2>/dev/null &
+       PID2=$!
+
+       if ! wait $PID1 && ! wait $PID2; then
+               echo "Both creates failed (1 should fail, 1 should succeed)"
+               ret=1
+       elif wait $PID1 && wait $PID2; then
+               echo "Both creates succeeded (1 should fail, 1 should succeed)"
+               ret=2
+       fi
+
+       #Clean
+       do_nodes $(comma_list $(mdts_nodes)) \
+               "lctl set_param -n fail_loc=0x0 || true" &>/dev/null
+       rm -f $DIR/$tfile
+
+       return $ret
+}
+
+test_41i() {
+       [[ $MDS1_VERSION -le $(version_code 2.13.56) ]] ||
+               skip "Need MDS version newer than 2.13.56"
+       local msg fail_loc
+
+#define OBD_FAIL_ONCE|OBD_FAIL_MDS_REINT_OPEN         0x169
+#define OBD_FAIL_ONCE|OBD_FAIL_MDS_REINT_OPEN2        0x16a
+       for fail_loc in "0x80000169" "0x8000016a"; do
+               echo "Begin 100 tests with fail_loc=$fail_loc"
+               printf "Progress: "
+               for i in {1..100}; do
+                       printf "*"
+                       msg=$(sub_test_41i "$fail_loc") ||
+                               { echo; error "iter=$i : $msg"; }
+               done
+               echo
+       done
+}
+run_test 41i "reint_open: create vs create"
+
+
 # test 42: unlink and blocking operations
 test_42a() {
        pdo_lru_clear
@@ -2021,6 +2072,62 @@ test_43i() {
 }
 run_test 43i "pdirops: unlink vs remote mkdir"
 
+sub_test_43k() {
+       local PID1 PID2
+       local fail_loc="$1"
+       local ret=0
+
+       # We test in a separate directory to be able to unblock server thread in
+       # cfs_race() if LCK_PW is taken on the parent by mdt_reint_unlink.
+       test_mkdir $DIR2/$tdir
+       touch $DIR2/$tdir/$tfile
+
+       do_nodes $(comma_list $(mdts_nodes)) \
+               "lctl set_param -n fail_loc=${fail_loc} || true" &>/dev/null
+       echo content > $DIR1/$tdir/$tfile & PID1=$!
+       sleep 0.5
+       multiop $DIR2/$tdir/$tfile u & PID2=$!
+
+       wait $PID1 ||
+               { ret=$?; \
+               echo -n "overwriting $tfile should succeed (err=$ret); "; }
+       wait $PID2 ||
+               { ret=$?; \
+               echo -n "unlinking $tfile should succeed (err=$ret);"; }
+
+       #Clean
+       do_nodes $(comma_list $(mdts_nodes)) \
+               "lctl set_param -n fail_loc=0x0 || true" &>/dev/null
+       rm -rf $DIR/$tdir
+
+       return $ret
+}
+
+test_43k() {
+       [[ $MDS1_VERSION -le $(version_code 2.13.56) ]] ||
+               skip "Need MDS version newer than 2.13.56"
+       local msg fail_loc
+
+#define OBD_FAIL_ONCE|OBD_FAIL_MDS_REINT_OPEN         0x169
+#define OBD_FAIL_ONCE|OBD_FAIL_MDS_REINT_OPEN2        0x16a
+       for fail_loc in "0x80000169" "0x8000016a"; do
+               echo "Begin 100 tests with fail_loc=$fail_loc"
+               printf "Progress: "
+               for i in {1..100}; do
+                       printf "*"
+                       msg=$(sub_test_43k "$fail_loc") ||
+                               { echo; error "iter=$i : $msg"; }
+               done
+               echo
+       done
+
+       #Clean
+       reset_fail_loc
+
+       return 0
+}
+run_test 43k "unlink vs create"
+
 # test 44: rename tgt and blocking operations
 test_44a() {
        pdo_lru_clear
@@ -2304,6 +2411,60 @@ test_45i() {
 }
 run_test 45i "pdirops: rename src vs remote mkdir"
 
+sub_test_45j() {
+       local PID1 PID2
+       local fail_loc="$1"
+       local ret=0
+
+       # We test in a sparate directory to be able to unblock server thread in
+       # cfs_race if LCK_PW is taken on the parent by mdt_reint_rename.
+       test_mkdir $DIR2/$tdir
+       echo file1 > $DIR2/$tdir/$tfile
+       echo file2 > $DIR2/$tdir/$tfile-2
+
+       do_nodes $(comma_list $(mdts_nodes)) \
+               "lctl set_param -n fail_loc=${fail_loc} || true" &>/dev/null
+
+       cat $DIR1/$tdir/$tfile >/dev/null &
+       PID1=$!
+       sleep 0.5
+       mrename $DIR2/$tdir/$tfile-2 $DIR2/$tdir/$tfile > /dev/null &
+       PID2=$!
+
+       wait $PID1 ||
+               { ret=$?; echo -n "cat $tfile should succeed (err=$ret); "; }
+       wait $PID2 ||
+               { ret=$?; \
+               echo -n "mrename $tfile-2 to $tfile failed (err=$ret);"; }
+
+       #Clean
+       do_nodes $(comma_list $(mdts_nodes)) \
+               "lctl set_param -n fail_loc=0x0 || true" &>/dev/null
+       rm -rf $DIR/$tdir
+
+       return $ret
+}
+
+test_45j() {
+       [[ $MDS1_VERSION -le $(version_code 2.13.56) ]] ||
+               skip "Need MDS version newer than 2.13.56"
+       local msg fail_loc
+
+#define OBD_FAIL_ONCE|OBD_FAIL_MDS_REINT_OPEN         0x169
+#define OBD_FAIL_ONCE|OBD_FAIL_MDS_REINT_OPEN2        0x16a
+       for fail_loc in "0x80000169" "0x8000016a"; do
+               echo "Begin 100 tests with fail_loc=$fail_loc"
+               printf "Progress: "
+               for i in {1..100}; do
+                       printf "*"
+                       msg=$(sub_test_45j "$fail_loc") ||
+                               { echo; error "iter=$i : $msg"; }
+               done
+               echo
+       done
+}
+run_test 45j "read vs rename =============="
+
 # test 46: link and blocking operations
 test_46a() {
        pdo_lru_clear