Whamcloud - gitweb
LU-7791 ldlm: signal vs CP callback race 98/18498/5
author Andriy Skulysh <andriy.skulysh@seagate.com>
Tue, 1 Mar 2016 22:15:55 +0000 (00:15 +0200)
committer Oleg Drokin <oleg.drokin@intel.com>
Thu, 21 Apr 2016 02:28:04 +0000 (02:28 +0000)
A lock isn't canceled on the server when the client receives
a signal while it is blocked waiting for the lock to be
granted; as a result, the client is evicted for not
responding to LDLM_FL_AST_SENT.

Seagate-bug-id: MRP-3314
Change-Id: Ie025bb58b13fb2d741119f74c87439f917983268
Signed-off-by: Andriy Skulysh <andriy.skulysh@seagate.com>
Reviewed-on: http://review.whamcloud.com/18498
Tested-by: Jenkins
Reviewed-by: Vitaly Fertman <vitaly.fertman@seagate.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Chris Horn <hornc@cray.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/include/obd_support.h
lustre/ldlm/ldlm_lockd.c
lustre/ldlm/ldlm_request.c
lustre/tests/sanityn.sh

index 971f8e3..919be18 100644 (file)
@@ -366,6 +366,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_LDLM_SRV_GL_AST        0x326
 #define OBD_FAIL_LDLM_WATERMARK_LOW     0x327
 #define OBD_FAIL_LDLM_WATERMARK_HIGH    0x328
+#define OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL 0x329
 
 /* LOCKLESS IO */
 #define OBD_FAIL_LDLM_SET_CONTENTION     0x385
index e848e5c..42d76b8 100644 (file)
@@ -1793,7 +1793,7 @@ void ldlm_handle_bl_callback(struct ldlm_namespace *ns,
  *
  * This only can happen on client side.
  */
-static void ldlm_handle_cp_callback(struct ptlrpc_request *req,
+static int ldlm_handle_cp_callback(struct ptlrpc_request *req,
                                     struct ldlm_namespace *ns,
                                     struct ldlm_request *dlm_req,
                                     struct ldlm_lock *lock)
@@ -1837,6 +1837,11 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req,
        }
 
        lock_res_and_lock(lock);
+       if (ldlm_is_failed(lock)) {
+               unlock_res_and_lock(lock);
+               LDLM_LOCK_RELEASE(lock);
+               RETURN(-EINVAL);
+       }
        if (ldlm_is_destroyed(lock) ||
            lock->l_granted_mode == lock->l_req_mode) {
                /* bug 11300: the lock has already been granted */
@@ -1916,6 +1921,8 @@ out:
                wake_up(&lock->l_waitq);
        }
        LDLM_LOCK_RELEASE(lock);
+
+       return 0;
 }
 
 /**
@@ -2322,10 +2329,10 @@ static int ldlm_callback_handler(struct ptlrpc_request *req)
                         ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock);
                 break;
         case LDLM_CP_CALLBACK:
-                CDEBUG(D_INODE, "completion ast\n");
-                req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK);
-                ldlm_callback_reply(req, 0);
-                ldlm_handle_cp_callback(req, ns, dlm_req, lock);
+               CDEBUG(D_INODE, "completion ast\n");
+               req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK);
+               rc = ldlm_handle_cp_callback(req, ns, dlm_req, lock);
+               ldlm_callback_reply(req, rc);
                 break;
         case LDLM_GL_CALLBACK:
                 CDEBUG(D_INODE, "glimpse ast\n");
index 1e2a6cd..1d61fed 100644 (file)
@@ -1110,7 +1110,10 @@ static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock)
         if (lock->l_conn_export) {
                 bool local_only;
 
-                LDLM_DEBUG(lock, "client-side cancel");
+               LDLM_DEBUG(lock, "client-side cancel");
+               OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL,
+                                cfs_fail_val);
+
                 /* Set this flag to prevent others from getting new references*/
                 lock_res_and_lock(lock);
                ldlm_set_cbpending(lock);
index 096bebc..6b00d2f 100644 (file)
@@ -3567,6 +3567,40 @@ test_92() {
 }
 run_test 92 "create remote directory under orphan directory"
 
+test_93() {
+       dd if=/dev/zero of=$DIR2/$tfile bs=4k count=2 conv=fsync
+
+       local before=$(date +%s)
+       local evict
+
+       $LCTL mark write
+#define OBD_FAIL_LDLM_PAUSE_CANCEL       0x312
+       $LCTL set_param fail_val=5 fail_loc=0x80000312
+       dd if=/dev/zero of=$DIR/$tfile conv=notrunc oflag=append bs=4k count=1 &
+       local pid=$!
+       sleep 2
+
+#define OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL 0x329
+       $LCTL set_param fail_val=6 fail_loc=0x80000329
+       $LCTL mark kill $pid
+       kill -ALRM $pid
+
+       dd if=/dev/zero of=$DIR2/$tfile conv=notrunc oflag=append bs=4k count=1
+
+       wait $pid
+       dd if=/dev/zero of=$DIR/$tfile bs=4k count=1 conv=fsync
+
+       evict=$(do_facet client $LCTL get_param \
+               osc.$FSNAME-OST*-osc-*/state | \
+           awk -F"[ [,]" '/EVICTED ]$/ { if (t<$5) {t=$5;} } END { print t }')
+
+       [ -z "$evict" ] || [[ $evict -le $before ]] ||
+               (do_facet client $LCTL get_param \
+                       osc.$FSNAME-OST*-osc-*/state;
+                   error "eviction happened: $evict before:$before")
+}
+run_test 93 "signal vs CP callback race"
+
 log "cleanup: ======================================================"
 
 # kill and wait in each test only guarentee script finish, but command in script