Whamcloud - gitweb
LU-11926 ldlm: Lost lease lock on migrate error 82/34182/4
authorAndriy Skulysh <c17819@cray.com>
Tue, 4 Dec 2018 13:27:58 +0000 (15:27 +0200)
committerOleg Drokin <green@whamcloud.com>
Thu, 21 Mar 2019 03:43:41 +0000 (03:43 +0000)
All the file operations have the following locking order - parent,
child. If a lock for a child is returned to the client, the following
operations on this file are done by the child fid.

However, the migrate is an exception - it takes the lease lock first and
takes the PW parent lock next during the MDS_REINT.

At the same time, if there is a parallel racing operation (open) which
has taken a lock on parent (conflicting with the next MDS_REINT) and
is trying to take a lock on child - it is blocked until
the lease cancel comes.

The lease cancel is piggy-backed on the MDS_REINT RPC and is handled only
at the end of the operation, whereas the operation itself tries to take the
conflicting parent lock first - thus a deadlock occurs.

At the same time, the lease lock is not supposed to block anything; it
is just an indicator on the server that no other conflicting
operation has occurred during the migration. Thus,
set LDLM_FL_CANCEL_ON_BLOCK on it so that the conflicting operation
will not be blocked.

In this case, the MDS_REINT will return -EAGAIN as the lease
is cancelled and the client will retry its migration.

Change-Id: Ib6cdc24ffe4ecb99d314a5466bcbb066a1d04dc1
Cray-bug-id: LUS-6811
Signed-off-by: Andriy Skulysh <c17819@cray.com>
Reviewed-by: Vitaly Fertman <c17818@cray.com>
Reviewed-by: Alexander Boyko <c17825@cray.com>
Reviewed-on: https://review.whamcloud.com/34182
Tested-by: Jenkins
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alexandr Boyko <c17825@cray.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/ldlm/ldlm_lockd.c
lustre/ldlm/ldlm_request.c
lustre/llite/file.c
lustre/mdt/mdt_handler.c
lustre/mdt/mdt_open.c
lustre/tests/sanity.sh

index 09dfbc7..bf32221 100644 (file)
@@ -387,6 +387,7 @@ extern char obd_jobid_var[];
 
 #define OBD_FAIL_LDLM_GRANT_CHECK        0x32a
 #define OBD_FAIL_LDLM_PROLONG_PAUSE     0x32b
+#define OBD_FAIL_LDLM_LOCAL_CANCEL_PAUSE 0x32c
 
 /* LOCKLESS IO */
 #define OBD_FAIL_LDLM_SET_CONTENTION     0x385
index 058585d..764669e 100644 (file)
@@ -1385,7 +1385,7 @@ existing_lock:
                          * granted lock will be cancelled immediately after
                          * sending completion AST.
                          */
-                        if (dlm_rep->lock_flags & LDLM_FL_CANCEL_ON_BLOCK) {
+                       if (ldlm_is_cancel_on_block(lock)) {
                                 unlock_res_and_lock(lock);
                                 ldlm_lock_cancel(lock);
                                 lock_res_and_lock(lock);
@@ -1756,9 +1756,6 @@ void ldlm_handle_bl_callback(struct ldlm_namespace *ns,
        }
        ldlm_set_cbpending(lock);
 
-       if (ldlm_is_cancel_on_block(lock))
-               ldlm_set_cancel(lock);
-
         do_ast = (!lock->l_readers && !lock->l_writers);
         unlock_res_and_lock(lock);
 
index a6310d9..0568a00 100644 (file)
@@ -478,6 +478,9 @@ int ldlm_cli_enqueue_local(const struct lu_env *env,
         if (*flags & LDLM_FL_ATOMIC_CB)
                ldlm_set_atomic_cb(lock);
 
+       if (*flags & LDLM_FL_CANCEL_ON_BLOCK)
+               ldlm_set_cancel_on_block(lock);
+
         if (policy != NULL)
                 lock->l_policy_data = *policy;
         if (client_cookie != NULL)
@@ -1546,6 +1549,10 @@ int ldlm_cli_cancel(const struct lustre_handle *lockh,
        ldlm_set_canceling(lock);
        unlock_res_and_lock(lock);
 
+       if (cancel_flags & LCF_LOCAL)
+               OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_LOCAL_CANCEL_PAUSE,
+                                cfs_fail_val);
+
        rc = ldlm_cli_cancel_local(lock);
        if (rc == LDLM_FL_LOCAL_ONLY || cancel_flags & LCF_LOCAL) {
                LDLM_LOCK_RELEASE(lock);
index 155236c..b429b2b 100644 (file)
@@ -4160,7 +4160,9 @@ again:
        if (rc == 0) {
                LASSERT(request != NULL);
                ll_update_times(request, parent);
+       }
 
+       if (rc == 0 || rc == -EAGAIN) {
                body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
                LASSERT(body != NULL);
 
@@ -4181,7 +4183,7 @@ again:
                request = NULL;
        }
 
-       /* Try again if the file layout has changed. */
+       /* Try again if the lease has cancelled. */
        if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
                goto again;
 
index 8064065..c2bb878 100644 (file)
@@ -2937,6 +2937,11 @@ static int mdt_object_local_lock(struct mdt_thread_info *info,
        /* Only enqueue LOOKUP lock for remote object */
        LASSERT(ergo(mdt_object_remote(o), *ibits == MDS_INODELOCK_LOOKUP));
 
+       /* Lease lock are granted with LDLM_FL_CANCEL_ON_BLOCK */
+       if (lh->mlh_type == MDT_REG_LOCK && lh->mlh_reg_mode == LCK_EX &&
+           *ibits == MDS_INODELOCK_OPEN)
+               dlmflags |= LDLM_FL_CANCEL_ON_BLOCK;
+
        if (lh->mlh_type == MDT_PDO_LOCK) {
                 /* check for exists after object is locked */
                 if (mdt_object_exists(o) == 0) {
index 602bc6a..40025f6 100644 (file)
@@ -902,7 +902,7 @@ static int mdt_object_open_lock(struct mdt_thread_info *info,
 
                CDEBUG(D_INODE, "normal open:"DFID" lease count: %d, lm: %d\n",
                        PFID(mdt_object_fid(obj)),
-                       atomic_read(&obj->mot_open_count), lm);
+                       atomic_read(&obj->mot_lease_count), lm);
        }
 
        mdt_lock_reg_init(lhc, lm);
index 9c8714b..db750bf 100755 (executable)
@@ -18846,6 +18846,31 @@ test_317() {
 }
 run_test 317 "Verify blocks get correctly update after truncate"
 
+test_319() {
+       [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+
+       local before=$(date +%s)
+       local evict
+       local mdir=$DIR/$tdir
+       local file=$mdir/xxx
+
+       $LFS mkdir -i0 $mdir || error "mkdir $mdir fails"
+       touch $file
+
+#define OBD_FAIL_LDLM_LOCAL_CANCEL_PAUSE 0x32c
+       $LCTL set_param fail_val=5 fail_loc=0x8000032c
+       $LFS mv -m1 $file &
+
+       sleep 1
+       dd if=$file of=/dev/null
+       wait
+       evict=$($LCTL get_param mdc.$FSNAME-MDT*.state |
+         awk -F"[ [,]" '/EVICTED ]$/ { if (mx<$5) {mx=$5;} } END { print mx }')
+
+       [ -z "$evict" ] || [[ $evict -le $before ]] || error "eviction happened"
+}
+run_test 319 "lost lease lock on migrate error"
+
 test_fake_rw() {
        local read_write=$1
        if [ "$read_write" = "write" ]; then