Whamcloud - gitweb
LU-8175 ldlm: conflicting PW & PR extent locks on a client 45/20345/5
author Andriy Skulysh <andriy.skulysh@seagate.com>
Thu, 14 Jul 2016 10:43:31 +0000 (13:43 +0300)
committer Oleg Drokin <oleg.drokin@intel.com>
Fri, 2 Sep 2016 02:24:43 +0000 (02:24 +0000)
A PW lock isn't replayed once it is marked
LDLM_FL_CANCELING, and a glimpse lock doesn't wait for
conflicting locks on the client. So the server will
grant a PR lock in response to the glimpse lock request,
which conflicts with the PW lock in LDLM_FL_CANCELING
state on the client.

A lock in LDLM_FL_CANCELING state may still have pending IO,
so it should keep being replayed until LDLM_FL_BL_DONE is set,
to prevent the server from granting a conflicting lock.

Change-Id: I99a1d81a8932ac7b7b3346558446f9d638156309
Seagate-bug-id: MRP-3311
Signed-off-by: Andriy Skulysh <andriy.skulysh@seagate.com>
Reviewed-on: http://review.whamcloud.com/20345
Tested-by: Jenkins
Reviewed-by: Jinshan Xiong <jinshan.xiong@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/include/obd_support.h
lustre/ldlm/ldlm_extent.c
lustre/ldlm/ldlm_request.c
lustre/osc/osc_request.c
lustre/tests/recovery-small.sh
lustre/tests/replay-ost-single.sh

index b064f91..cc8a6ec 100644 (file)
@@ -370,6 +370,8 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_LDLM_WATERMARK_LOW     0x327
 #define OBD_FAIL_LDLM_WATERMARK_HIGH    0x328
 
+#define OBD_FAIL_LDLM_GRANT_CHECK        0x32a
+
 /* LOCKLESS IO */
 #define OBD_FAIL_LDLM_SET_CONTENTION     0x385
 
index 721e458..b51265f 100644 (file)
@@ -1035,6 +1035,26 @@ void ldlm_extent_add_lock(struct ldlm_resource *res,
         /* even though we use interval tree to manage the extent lock, we also
          * add the locks into grant list, for debug purpose, .. */
         ldlm_resource_add_lock(res, &res->lr_granted, lock);
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_GRANT_CHECK)) {
+               struct ldlm_lock *lck;
+
+               list_for_each_entry_reverse(lck, &res->lr_granted,
+                                           l_res_link) {
+                       if (lck == lock)
+                               continue;
+                       if (lockmode_compat(lck->l_granted_mode,
+                                           lock->l_granted_mode))
+                               continue;
+                       if (ldlm_extent_overlap(&lck->l_req_extent,
+                                               &lock->l_req_extent)) {
+                               CDEBUG(D_ERROR, "granting conflicting lock %p "
+                                               "%p\n", lck, lock);
+                               ldlm_resource_dump(D_ERROR, res);
+                               LBUG();
+                       }
+               }
+       }
 }
 
 /** Remove cancelled lock from resource interval tree. */
index 279baee..f196849 100644 (file)
@@ -2191,7 +2191,7 @@ static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure)
          * bug 17614: locks being actively cancelled. Get a reference
          * on a lock so that it does not disapear under us (e.g. due to cancel)
          */
-        if (!(lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_CANCELING))) {
+       if (!(lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_BL_DONE))) {
                list_add(&lock->l_pending_chain, list);
                 LDLM_LOCK_GET(lock);
         }
@@ -2260,7 +2260,7 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
 
 
         /* Bug 11974: Do not replay a lock which is actively being canceled */
-       if (ldlm_is_canceling(lock)) {
+       if (ldlm_is_bl_done(lock)) {
                 LDLM_DEBUG(lock, "Not replaying canceled lock:");
                 RETURN(0);
         }
index e980ad4..aafe470 100644 (file)
@@ -1909,7 +1909,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
        DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %ur/%uw in flight",
                  page_count, aa, cli->cl_r_in_flight,
                  cli->cl_w_in_flight);
-       OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_IO, 4);
+       OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_IO, cfs_fail_val);
 
        ptlrpcd_add_req(req);
        rc = 0;
index 165112a..4f7421f 100755 (executable)
@@ -2510,6 +2510,7 @@ test_131() {
        # another IO under the same lock
        #define OBD_FAIL_OSC_DELAY_IO            0x414
        $LCTL set_param fail_loc=0x80000414
+       $LCTL set_param fail_val=4 fail_loc=0x80000414
        dd if=/dev/zero of=$DIR/$tfile count=1 conv=notrunc oflag=dsync &
        local pid=$!
        sleep 1
index c02d9fa..5bf8610 100755 (executable)
@@ -413,6 +413,25 @@ test_9() {
 }
 run_test 9 "Verify that no req deadline happened during recovery"
 
+test_10() {
+       rm -f $TDIR/$tfile
+
+       dd if=/dev/zero of=$TDIR/$tfile count=10 || error "dd failed"
+
+       #define OBD_FAIL_OSC_DELAY_IO            0x414
+       $LCTL set_param fail_val=60 fail_loc=0x414
+       cancel_lru_locks OST0000-osc &
+       sleep 2
+       facet_failover ost1 || error "failover: $?"
+
+       #define OBD_FAIL_LDLM_GRANT_CHECK        0x32a
+       $LCTL set_param fail_loc=0x32a
+       stat $TDIR/$tfile
+
+       wait
+}
+run_test 10 "conflicting PW & PR locks on a client"
+
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status