From: Andriy Skulysh Date: Thu, 14 Jul 2016 10:43:31 +0000 (+0300) Subject: LU-8175 ldlm: conflicting PW & PR extent locks on a client X-Git-Tag: 2.8.58~43 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=80a818b80373bebd1438a74aeebda102b4885e53 LU-8175 ldlm: conflicting PW & PR extent locks on a client A PW lock isn't replayed once it is marked LDLM_FL_CANCELING, and a glimpse lock doesn't wait for conflicting locks on the client. So the server will grant a PR lock in response to the glimpse lock request, which conflicts with the PW lock in LDLM_FL_CANCELING state on the client. A lock in LDLM_FL_CANCELING state may still have pending IO, so it should be replayed until LDLM_FL_BL_DONE is set, to prevent the server from granting a conflicting lock. Change-Id: I99a1d81a8932ac7b7b3346558446f9d638156309 Seagate-bug-id: MRP-3311 Signed-off-by: Andriy Skulysh Reviewed-on: http://review.whamcloud.com/20345 Tested-by: Jenkins Reviewed-by: Jinshan Xiong Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index b064f91..cc8a6ec 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -370,6 +370,8 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LDLM_WATERMARK_LOW 0x327 #define OBD_FAIL_LDLM_WATERMARK_HIGH 0x328 +#define OBD_FAIL_LDLM_GRANT_CHECK 0x32a + /* LOCKLESS IO */ #define OBD_FAIL_LDLM_SET_CONTENTION 0x385 diff --git a/lustre/ldlm/ldlm_extent.c b/lustre/ldlm/ldlm_extent.c index 721e458..b51265f 100644 --- a/lustre/ldlm/ldlm_extent.c +++ b/lustre/ldlm/ldlm_extent.c @@ -1035,6 +1035,26 @@ void ldlm_extent_add_lock(struct ldlm_resource *res, /* even though we use interval tree to manage the extent lock, we also * add the locks into grant list, for debug purpose, .. 
*/ ldlm_resource_add_lock(res, &res->lr_granted, lock); + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_GRANT_CHECK)) { + struct ldlm_lock *lck; + + list_for_each_entry_reverse(lck, &res->lr_granted, + l_res_link) { + if (lck == lock) + continue; + if (lockmode_compat(lck->l_granted_mode, + lock->l_granted_mode)) + continue; + if (ldlm_extent_overlap(&lck->l_req_extent, + &lock->l_req_extent)) { + CDEBUG(D_ERROR, "granting conflicting lock %p " + "%p\n", lck, lock); + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + } + } } /** Remove cancelled lock from resource interval tree. */ diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index 279baee..f196849 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -2191,7 +2191,7 @@ static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure) * bug 17614: locks being actively cancelled. Get a reference * on a lock so that it does not disapear under us (e.g. due to cancel) */ - if (!(lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_CANCELING))) { + if (!(lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_BL_DONE))) { list_add(&lock->l_pending_chain, list); LDLM_LOCK_GET(lock); } @@ -2260,7 +2260,7 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock) /* Bug 11974: Do not replay a lock which is actively being canceled */ - if (ldlm_is_canceling(lock)) { + if (ldlm_is_bl_done(lock)) { LDLM_DEBUG(lock, "Not replaying canceled lock:"); RETURN(0); } diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index e980ad4..aafe470 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -1909,7 +1909,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, DEBUG_REQ(D_INODE, req, "%d pages, aa %p. 
now %ur/%uw in flight", page_count, aa, cli->cl_r_in_flight, cli->cl_w_in_flight); - OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_IO, 4); + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_IO, cfs_fail_val); ptlrpcd_add_req(req); rc = 0; diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 165112a..4f7421f 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -2510,6 +2510,7 @@ test_131() { # another IO under the same lock #define OBD_FAIL_OSC_DELAY_IO 0x414 $LCTL set_param fail_loc=0x80000414 + $LCTL set_param fail_val=4 fail_loc=0x80000414 dd if=/dev/zero of=$DIR/$tfile count=1 conv=notrunc oflag=dsync & local pid=$! sleep 1 diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh index c02d9fa..5bf8610 100755 --- a/lustre/tests/replay-ost-single.sh +++ b/lustre/tests/replay-ost-single.sh @@ -413,6 +413,25 @@ test_9() { } run_test 9 "Verify that no req deadline happened during recovery" +test_10() { + rm -f $TDIR/$tfile + + dd if=/dev/zero of=$TDIR/$tfile count=10 || error "dd failed" + + #define OBD_FAIL_OSC_DELAY_IO 0x414 + $LCTL set_param fail_val=60 fail_loc=0x414 + cancel_lru_locks OST0000-osc & + sleep 2 + facet_failover ost1 || error "failover: $?" + + #define OBD_FAIL_LDLM_GRANT_CHECK 0x32a + $LCTL set_param fail_loc=0x32a + stat $TDIR/$tfile + + wait +} +run_test 10 "conflicting PW & PR locks on a client" + complete $SECONDS check_and_cleanup_lustre exit_status