From: Vitaly Fertman
Date: Mon, 30 Jun 2014 19:36:58 +0000 (+0400)
Subject: LU-5266 ldlm: granting the same lock twice on recovery
X-Git-Tag: 2.6.0-RC1~5
X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=63851b5816bb30687fbf3750380d6b448e9400f1;hp=17fdeba9ae419078b0ea6cddd84e74c647317b33

LU-5266 ldlm: granting the same lock twice on recovery

ASSERTION( lock->l_granted_mode != lock->l_req_mode ) is hit
when an enqueue carrying LDLM_FL_REPLAY is resent. Don't add
the lock to the waiting list, as it has already been added or
granted.

Change-Id: Ib8e5d2c7588f6cacd1723529e70d29f63742caad
Xyratex-bug-id: MRP-1944
Signed-off-by: Andriy Skulysh
Signed-off-by: Vitaly Fertman
Reviewed-on: http://review.whamcloud.com/10903
Tested-by: Jenkins
Tested-by: Maloo
Reviewed-by: Oleg Drokin
Tested-by: Andreas Dilger
Reviewed-by: Andreas Dilger
Reviewed-by: John L. Hammond
---

diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c
index d6b7de5..2237ae1 100644
--- a/lustre/ldlm/ldlm_lock.c
+++ b/lustre/ldlm/ldlm_lock.c
@@ -1756,6 +1756,8 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns,
 		ldlm_grant_lock(lock, NULL);
 		GOTO(out, rc = ELDLM_OK);
 #ifdef HAVE_SERVER_SUPPORT
+	} else if (*flags & LDLM_FL_RESENT) {
+		GOTO(out, rc = ELDLM_OK);
 	} else if (*flags & LDLM_FL_REPLAY) {
 		if (*flags & LDLM_FL_BLOCK_CONV) {
 			ldlm_resource_add_lock(res, &res->lr_converting, lock);
diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c
index 307a823..4d9e72f 100644
--- a/lustre/ldlm/ldlm_lockd.c
+++ b/lustre/ldlm/ldlm_lockd.c
@@ -900,6 +900,11 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data)
 	total_enqueue_wait = cfs_time_sub(cfs_time_current_sec(),
 					  lock->l_last_activity);
 
+	if (OBD_FAIL_PRECHECK(OBD_FAIL_OST_LDLM_REPLY_NET)) {
+		LDLM_DEBUG(lock, "dropping CP AST");
+		RETURN(0);
+	}
+
 	req = ptlrpc_request_alloc(lock->l_export->exp_imp_reverse,
 				   &RQF_LDLM_CP_CALLBACK);
 	if (req == NULL)
@@ -1232,10 +1237,8 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns,
 	}
 #endif
 
-	if (unlikely(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))
-		flags |= LDLM_FL_RESENT;
-
-	if (unlikely(flags & (LDLM_FL_REPLAY | LDLM_FL_RESENT))) {
+	if (unlikely((flags & LDLM_FL_REPLAY) ||
+		     (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))) {
 		/* Find an existing lock in the per-export lock hash */
 		/* In the function below, .hs_keycmp resolves to
 		 * ldlm_export_lock_keycmp() */
@@ -1245,9 +1248,8 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns,
 		if (lock != NULL) {
 			DEBUG_REQ(D_DLMTRACE, req, "found existing lock cookie "
 				  LPX64, lock->l_handle.h_cookie);
+			flags |= LDLM_FL_RESENT;
 			GOTO(existing_lock, rc = 0);
-		} else {
-			flags &= ~LDLM_FL_RESENT;
 		}
 	}
 
diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh
index b08f1cc..7b73099 100755
--- a/lustre/tests/replay-dual.sh
+++ b/lustre/tests/replay-dual.sh
@@ -863,6 +863,32 @@ run_test 23d "c1 rmdir d1, M0 drop update reply and fail M0/M1, c2 mkdir d1"
 
 # end commit on sharing tests
 
+test_24() {
+	cancel_lru_locks osc
+
+	$SETSTRIPE -i 0 -c 1 $DIR/$tfile
+
+	# get lock for the 1st client
+	dd if=/dev/zero of=$DIR/$tfile count=1 >/dev/null ||
+		error "failed to write data"
+
+	# get waiting locks for the 2nd client
+	drop_ldlm_cancel "multiop $DIR2/$tfile Ow512" &
+	sleep 1
+
+#define OBD_FAIL_OST_LDLM_REPLY_NET	 0x213
+	# failover, replay and resend replayed waiting locks
+	do_facet ost1 lctl set_param fail_loc=0x80000213
+	fail ost1
+
+	# multiop does not finish because the CP AST is skipped;
+	# it is ok to kill it in the test, because the CP AST has
+	# already been re-sent and it would not hang forever in real life
+	killall multiop
+	wait
+}
+run_test 24 "replay|resend"
+
 complete $SECONDS
 SLEEP=$((`date +%s` - $NOW))
 [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP
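
For context: the failure mode fixed above is that a resent enqueue reaches
ldlm_lock_enqueue() for a lock the server already granted or queued on the
first pass, so the grant path runs a second time and trips
ASSERTION( lock->l_granted_mode != lock->l_req_mode ). The standalone C
sketch below models the short-circuit the patch adds; it is not Lustre code,
and all toy_* names and TOY_FL_RESENT are hypothetical stand-ins:

/*
 * Standalone sketch, not Lustre code: a toy model of why the
 * LDLM_FL_RESENT short-circuit prevents granting a lock twice.
 * All toy_* names and TOY_FL_RESENT are hypothetical.
 */
#include <assert.h>
#include <stdio.h>

enum toy_mode { MODE_NONE = 0, MODE_PW = 1 };

#define TOY_FL_RESENT 0x1u

struct toy_lock {
	enum toy_mode granted_mode;	/* mode already granted, if any */
	enum toy_mode req_mode;		/* mode the client asked for */
};

/* Grant path; checks the same invariant as the upstream
 * ASSERTION(lock->l_granted_mode != lock->l_req_mode). */
static void toy_grant(struct toy_lock *lk)
{
	assert(lk->granted_mode != lk->req_mode);
	lk->granted_mode = lk->req_mode;
}

/* Enqueue with the fix applied: a resent request that matched an
 * existing lock returns success instead of granting again. */
static int toy_enqueue(struct toy_lock *lk, unsigned int flags)
{
	if (flags & TOY_FL_RESENT)
		return 0;	/* already granted or queued on the first pass */
	toy_grant(lk);
	return 0;
}

int main(void)
{
	struct toy_lock lk = { .granted_mode = MODE_NONE, .req_mode = MODE_PW };

	toy_enqueue(&lk, 0);			/* first enqueue: grants PW */
	toy_enqueue(&lk, TOY_FL_RESENT);	/* resend: returns early, no double grant */
	printf("granted_mode=%d\n", lk.granted_mode);
	return 0;
}

Built with a plain cc sketch.c && ./a.out, this prints granted_mode=1;
removing the TOY_FL_RESENT check makes the second call re-enter toy_grant()
and abort on the assertion, mirroring the crash described in the commit
message. Assuming the usual Lustre test-framework conventions, the new test
case should also be runnable in isolation with something like
ONLY=24 sh replay-dual.sh.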