Whamcloud - gitweb
LU-7791 ldlm: signal vs CP callback race 97/44297/2
authorAndriy Skulysh <c17819@cray.com>
Tue, 3 May 2016 07:41:56 +0000 (10:41 +0300)
committerOleg Drokin <green@whamcloud.com>
Tue, 10 Aug 2021 06:34:58 +0000 (06:34 +0000)
In case of interrupted wait for a CP AST
failed_lock_cleanup() sets LDLM_FL_LOCAL_ONLY, so
the client wouldn't cancel the lock on CP AST.

A lock isn't canceled on the server on reception

Lustre-change: https://review.whamcloud.com/19898
Lustre-commit: 7fff052c930da4822c3b2a13d130da7473a20a58

Cray-bug-id: LUS-2021
Change-Id: Id1e365b41f1fb8a0f9a32c0c929457b22ceba8ef
Signed-off-by: Andriy Skulysh <c17819@cray.com>
Reviewed-by: Alexander Boyko <c17825@cray.com>
Reviewed-by: Andrew Perepechko <c17827@cray.com>
Reviewed-by: Alexandr Boyko <c17825@cray.com>
Reviewed-on: https://review.whamcloud.com/44297
Reviewed-by: Patrick Farrell <pfarrell@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/ldlm/ldlm_lockd.c
lustre/ldlm/ldlm_request.c
lustre/tests/sanityn.sh

index 32121b5..b5aed43 100644 (file)
@@ -391,6 +391,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_LDLM_SRV_GL_AST        0x326
 #define OBD_FAIL_LDLM_WATERMARK_LOW     0x327
 #define OBD_FAIL_LDLM_WATERMARK_HIGH    0x328
+#define OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL 0x329
 
 #define OBD_FAIL_LDLM_GRANT_CHECK        0x32a
 #define OBD_FAIL_LDLM_PROLONG_PAUSE     0x32b
index bb45f4f..137ba42 100644 (file)
@@ -1802,12 +1802,26 @@ void ldlm_handle_bl_callback(struct ldlm_namespace *ns,
         EXIT;
 }
 
+static int ldlm_callback_reply(struct ptlrpc_request *req, int rc)
+{
+       if (req->rq_no_reply)
+               return 0;
+
+       req->rq_status = rc;
+       if (!req->rq_packed_final) {
+               rc = lustre_pack_reply(req, 1, NULL, NULL);
+               if (rc)
+                       return rc;
+       }
+       return ptlrpc_reply(req);
+}
+
 /**
  * Callback handler for receiving incoming completion ASTs.
  *
  * This only can happen on client side.
  */
-static void ldlm_handle_cp_callback(struct ptlrpc_request *req,
+static int ldlm_handle_cp_callback(struct ptlrpc_request *req,
                                     struct ldlm_namespace *ns,
                                     struct ldlm_request *dlm_req,
                                     struct ldlm_lock *lock)
@@ -1823,6 +1837,8 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req,
        if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) {
                long to = cfs_time_seconds(1);
 
+               ldlm_callback_reply(req, 0);
+
                while (to > 0) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        to = schedule_timeout(to);
@@ -1867,6 +1883,12 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req,
                lock_res_and_lock(lock);
        }
 
+       if (ldlm_is_failed(lock)) {
+               unlock_res_and_lock(lock);
+               LDLM_LOCK_RELEASE(lock);
+               RETURN(-EINVAL);
+       }
+
        if (ldlm_is_destroyed(lock) ||
            ldlm_is_granted(lock)) {
                /* bug 11300: the lock has already been granted */
@@ -1935,6 +1957,8 @@ out:
                wake_up(&lock->l_waitq);
        }
        LDLM_LOCK_RELEASE(lock);
+
+       return 0;
 }
 
 /**
@@ -1991,20 +2015,6 @@ static void ldlm_handle_gl_callback(struct ptlrpc_request *req,
         EXIT;
 }
 
-static int ldlm_callback_reply(struct ptlrpc_request *req, int rc)
-{
-        if (req->rq_no_reply)
-                return 0;
-
-        req->rq_status = rc;
-        if (!req->rq_packed_final) {
-                rc = lustre_pack_reply(req, 1, NULL, NULL);
-                if (rc)
-                        return rc;
-        }
-        return ptlrpc_reply(req);
-}
-
 static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi,
                               enum ldlm_cancel_flags cancel_flags)
 {
@@ -2311,30 +2321,31 @@ static int ldlm_callback_handler(struct ptlrpc_request *req)
                 CDEBUG(D_INODE, "blocking ast\n");
                 req_capsule_extend(&req->rq_pill, &RQF_LDLM_BL_CALLBACK);
                if (!ldlm_is_cancel_on_block(lock)) {
-                        rc = ldlm_callback_reply(req, 0);
-                        if (req->rq_no_reply || rc)
-                                ldlm_callback_errmsg(req, "Normal process", rc,
-                                                     &dlm_req->lock_handle[0]);
-                }
-                if (ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock))
-                        ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock);
-                break;
-        case LDLM_CP_CALLBACK:
-                CDEBUG(D_INODE, "completion ast\n");
-                req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK);
-                ldlm_callback_reply(req, 0);
-                ldlm_handle_cp_callback(req, ns, dlm_req, lock);
-                break;
-        case LDLM_GL_CALLBACK:
-                CDEBUG(D_INODE, "glimpse ast\n");
-                req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK);
-                ldlm_handle_gl_callback(req, ns, dlm_req, lock);
-                break;
-        default:
-                LBUG();                         /* checked above */
-        }
+                       rc = ldlm_callback_reply(req, 0);
+                       if (req->rq_no_reply || rc)
+                               ldlm_callback_errmsg(req, "Normal process", rc,
+                                                    &dlm_req->lock_handle[0]);
+               }
+               if (ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock))
+                       ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock);
+               break;
+       case LDLM_CP_CALLBACK:
+               CDEBUG(D_INODE, "completion ast\n");
+               req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK);
+               rc = ldlm_handle_cp_callback(req, ns, dlm_req, lock);
+               if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE))
+                       ldlm_callback_reply(req, rc);
+               break;
+       case LDLM_GL_CALLBACK:
+               CDEBUG(D_INODE, "glimpse ast\n");
+               req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK);
+               ldlm_handle_gl_callback(req, ns, dlm_req, lock);
+               break;
+       default:
+               LBUG(); /* checked above */
+       }
 
-        RETURN(0);
+       RETURN(0);
 }
 
 #ifdef HAVE_SERVER_SUPPORT
index 69736ab..a761407 100644 (file)
@@ -1147,9 +1147,12 @@ static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock)
         if (lock->l_conn_export) {
                 bool local_only;
 
-                LDLM_DEBUG(lock, "client-side cancel");
-                /* Set this flag to prevent others from getting new references*/
-                lock_res_and_lock(lock);
+               LDLM_DEBUG(lock, "client-side cancel");
+               OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL,
+                                cfs_fail_val);
+
+               /* Set this flag to prevent others from getting new references*/
+               lock_res_and_lock(lock);
                ldlm_set_cbpending(lock);
                local_only = !!(lock->l_flags &
                                (LDLM_FL_LOCAL_ONLY|LDLM_FL_CANCEL_ON_BLOCK));
index c8847e7..3e17f99 100755 (executable)
@@ -4612,6 +4612,42 @@ test_93() {
 }
 run_test 93 "alloc_rr should not allocate on same ost"
 
+test_94() {
+       $LCTL set_param osc.*.idle_timeout=0
+       dd if=/dev/zero of=$DIR2/$tfile bs=4k count=2 conv=fsync
+
+       local before=$(date +%s)
+       local evict
+
+       $LCTL mark write
+#define OBD_FAIL_LDLM_PAUSE_CANCEL       0x312
+       $LCTL set_param fail_val=5 fail_loc=0x80000312
+       dd if=/dev/zero of=$DIR/$tfile conv=notrunc oflag=append bs=4k count=1 &
+       local pid=$!
+       sleep 2
+
+#define OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL 0x329
+       $LCTL set_param fail_val=6 fail_loc=0x80000329
+       $LCTL mark kill $pid
+       kill -ALRM $pid
+
+       dd if=/dev/zero of=$DIR2/$tfile conv=notrunc oflag=append bs=4k count=1
+
+       wait $pid
+       dd if=/dev/zero of=$DIR/$tfile bs=4k count=1 conv=fsync
+
+       evict=$(do_facet client $LCTL get_param \
+               osc.$FSNAME-OST*-osc-*/state |
+           awk -F"[ [,]" '/EVICTED ]$/ { if (t<$5) {t=$5;} } END { print t }')
+
+       [ -z "$evict" ] || [[ $evict -le $before ]] ||
+               (do_facet client $LCTL get_param \
+                       osc.$FSNAME-OST*-osc-*/state;
+                   error "eviction happened: $evict before:$before")
+       $LCTL set_param osc.*.idle_timeout=debug
+}
+run_test 94 "signal vs CP callback race"
+
 # Data-on-MDT tests
 test_100a() {
        skip "Reserved for glimpse-ahead" && return