Whamcloud - gitweb
LU-7791 ldlm: signal vs CP callback race 98/19898/9
authorAndriy Skulysh <c17819@cray.com>
Tue, 3 May 2016 07:41:56 +0000 (10:41 +0300)
committerOleg Drokin <green@whamcloud.com>
Tue, 28 Jan 2020 06:02:00 +0000 (06:02 +0000)
In case of interrupted wait for a CP AST
failed_lock_cleanup() sets LDLM_FL_LOCAL_ONLY, so
the client wouldn't cancel the lock on CP AST.

A lock isn't canceled on the server on reception

Cray-bug-id: LUS-2021
Change-Id: Id1e365b41f1fb8a0f9a32c0c929457b22ceba8ef
Signed-off-by: Andriy Skulysh <c17819@cray.com>
Reviewed-by: Alexander Boyko <c17825@cray.com>
Reviewed-by: Andrew Perepechko <c17827@cray.com>
Reviewed-on: https://review.whamcloud.com/19898
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alexandr Boyko <c17825@cray.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/ldlm/ldlm_lockd.c
lustre/ldlm/ldlm_request.c
lustre/tests/sanityn.sh

index a728bef..5cf86de 100644 (file)
@@ -386,6 +386,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_LDLM_SRV_GL_AST        0x326
 #define OBD_FAIL_LDLM_WATERMARK_LOW     0x327
 #define OBD_FAIL_LDLM_WATERMARK_HIGH    0x328
 #define OBD_FAIL_LDLM_SRV_GL_AST        0x326
 #define OBD_FAIL_LDLM_WATERMARK_LOW     0x327
 #define OBD_FAIL_LDLM_WATERMARK_HIGH    0x328
+#define OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL 0x329
 
 #define OBD_FAIL_LDLM_GRANT_CHECK        0x32a
 #define OBD_FAIL_LDLM_PROLONG_PAUSE     0x32b
 
 #define OBD_FAIL_LDLM_GRANT_CHECK        0x32a
 #define OBD_FAIL_LDLM_PROLONG_PAUSE     0x32b
index e208689..faf7545 100644 (file)
@@ -1884,15 +1884,29 @@ void ldlm_handle_bl_callback(struct ldlm_namespace *ns,
        EXIT;
 }
 
        EXIT;
 }
 
+static int ldlm_callback_reply(struct ptlrpc_request *req, int rc)
+{
+       if (req->rq_no_reply)
+               return 0;
+
+       req->rq_status = rc;
+       if (!req->rq_packed_final) {
+               rc = lustre_pack_reply(req, 1, NULL, NULL);
+               if (rc)
+                       return rc;
+       }
+       return ptlrpc_reply(req);
+}
+
 /**
  * Callback handler for receiving incoming completion ASTs.
  *
  * This only can happen on client side.
  */
 /**
  * Callback handler for receiving incoming completion ASTs.
  *
  * This only can happen on client side.
  */
-static void ldlm_handle_cp_callback(struct ptlrpc_request *req,
-                                   struct ldlm_namespace *ns,
-                                   struct ldlm_request *dlm_req,
-                                   struct ldlm_lock *lock)
+static int ldlm_handle_cp_callback(struct ptlrpc_request *req,
+                                    struct ldlm_namespace *ns,
+                                    struct ldlm_request *dlm_req,
+                                    struct ldlm_lock *lock)
 {
        struct list_head ast_list;
        int lvb_len;
 {
        struct list_head ast_list;
        int lvb_len;
@@ -1906,6 +1920,8 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req,
        if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) {
                long to = cfs_time_seconds(1);
 
        if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) {
                long to = cfs_time_seconds(1);
 
+               ldlm_callback_reply(req, 0);
+
                while (to > 0) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        schedule_timeout(to);
                while (to > 0) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        schedule_timeout(to);
@@ -1949,6 +1965,12 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req,
                lock_res_and_lock(lock);
        }
 
                lock_res_and_lock(lock);
        }
 
+       if (ldlm_is_failed(lock)) {
+               unlock_res_and_lock(lock);
+               LDLM_LOCK_RELEASE(lock);
+               RETURN(-EINVAL);
+       }
+
        if (ldlm_is_destroyed(lock) ||
            ldlm_is_granted(lock)) {
                /* b=11300: the lock has already been granted */
        if (ldlm_is_destroyed(lock) ||
            ldlm_is_granted(lock)) {
                /* b=11300: the lock has already been granted */
@@ -2021,6 +2043,8 @@ out:
                wake_up(&lock->l_waitq);
        }
        LDLM_LOCK_RELEASE(lock);
                wake_up(&lock->l_waitq);
        }
        LDLM_LOCK_RELEASE(lock);
+
+       return 0;
 }
 
 /**
 }
 
 /**
@@ -2077,20 +2101,6 @@ static void ldlm_handle_gl_callback(struct ptlrpc_request *req,
        EXIT;
 }
 
        EXIT;
 }
 
-static int ldlm_callback_reply(struct ptlrpc_request *req, int rc)
-{
-       if (req->rq_no_reply)
-               return 0;
-
-       req->rq_status = rc;
-       if (!req->rq_packed_final) {
-               rc = lustre_pack_reply(req, 1, NULL, NULL);
-               if (rc)
-                       return rc;
-       }
-       return ptlrpc_reply(req);
-}
-
 static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi,
                               enum ldlm_cancel_flags cancel_flags)
 {
 static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi,
                               enum ldlm_cancel_flags cancel_flags)
 {
@@ -2426,8 +2436,9 @@ static int ldlm_callback_handler(struct ptlrpc_request *req)
        case LDLM_CP_CALLBACK:
                CDEBUG(D_INODE, "completion ast\n");
                req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK);
        case LDLM_CP_CALLBACK:
                CDEBUG(D_INODE, "completion ast\n");
                req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK);
-               ldlm_callback_reply(req, 0);
-               ldlm_handle_cp_callback(req, ns, dlm_req, lock);
+               rc = ldlm_handle_cp_callback(req, ns, dlm_req, lock);
+               if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE))
+                       ldlm_callback_reply(req, rc);
                break;
        case LDLM_GL_CALLBACK:
                CDEBUG(D_INODE, "glimpse ast\n");
                break;
        case LDLM_GL_CALLBACK:
                CDEBUG(D_INODE, "glimpse ast\n");
index 691f10c..143d0a9 100644 (file)
@@ -1226,6 +1226,9 @@ static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock)
                bool local_only;
 
                LDLM_DEBUG(lock, "client-side cancel");
                bool local_only;
 
                LDLM_DEBUG(lock, "client-side cancel");
+               OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL,
+                                cfs_fail_val);
+
                /* Set this flag to prevent others from getting new references*/
                lock_res_and_lock(lock);
                ldlm_set_cbpending(lock);
                /* Set this flag to prevent others from getting new references*/
                lock_res_and_lock(lock);
                ldlm_set_cbpending(lock);
index 6be8d80..72a3971 100755 (executable)
@@ -4514,6 +4514,42 @@ test_93() {
 }
 run_test 93 "alloc_rr should not allocate on same ost"
 
 }
 run_test 93 "alloc_rr should not allocate on same ost"
 
+test_94() {
+       $LCTL set_param osc.*.idle_timeout=0
+       dd if=/dev/zero of=$DIR2/$tfile bs=4k count=2 conv=fsync
+
+       local before=$(date +%s)
+       local evict
+
+       $LCTL mark write
+#define OBD_FAIL_LDLM_PAUSE_CANCEL       0x312
+       $LCTL set_param fail_val=5 fail_loc=0x80000312
+       dd if=/dev/zero of=$DIR/$tfile conv=notrunc oflag=append bs=4k count=1 &
+       local pid=$!
+       sleep 2
+
+#define OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL 0x329
+       $LCTL set_param fail_val=6 fail_loc=0x80000329
+       $LCTL mark kill $pid
+       kill -ALRM $pid
+
+       dd if=/dev/zero of=$DIR2/$tfile conv=notrunc oflag=append bs=4k count=1
+
+       wait $pid
+       dd if=/dev/zero of=$DIR/$tfile bs=4k count=1 conv=fsync
+
+       evict=$(do_facet client $LCTL get_param \
+               osc.$FSNAME-OST*-osc-*/state |
+           awk -F"[ [,]" '/EVICTED ]$/ { if (t<$5) {t=$5;} } END { print t }')
+
+       [ -z "$evict" ] || [[ $evict -le $before ]] ||
+               (do_facet client $LCTL get_param \
+                       osc.$FSNAME-OST*-osc-*/state;
+                   error "eviction happened: $evict before:$before")
+       $LCTL set_param osc.*.idle_timeout=debug
+}
+run_test 94 "signal vs CP callback race"
+
 # Data-on-MDT tests
 test_100a() {
        skip "Reserved for glimpse-ahead" && return
 # Data-on-MDT tests
 test_100a() {
        skip "Reserved for glimpse-ahead" && return