From: Vitaly Fertman
Date: Mon, 29 Sep 2014 23:21:21 +0000 (-0400)
Subject: LU-5266 ldlm: granting the same lock twice on recovery
X-Git-Tag: 2.5.3.90~85
X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=f64522b6fb49784c100aa9c2d71046ff2514f5ca

LU-5266 ldlm: granting the same lock twice on recovery

ASSERTION( lock->l_granted_mode != lock->l_req_mode ) is hit
when a lock enqueue carrying LDLM_FL_REPLAY is resent. Don't add
the lock to the waiting list, as it is already on that list or
already granted.

Lustre-commit: 63851b5816bb30687fbf3750380d6b448e9400f1
Lustre-change: http://review.whamcloud.com/10903

Change-Id: Ib8e5d2c7588f6cacd1723529e70d29f63742caad
Xyratex-bug-id: MRP-1944
Signed-off-by: Andriy Skulysh
Signed-off-by: Vitaly Fertman
Reviewed-on: http://review.whamcloud.com/10903
Tested-by: Jenkins
Tested-by: Maloo
Reviewed-by: Oleg Drokin
Tested-by: Andreas Dilger
Reviewed-by: Andreas Dilger
Reviewed-by: John L. Hammond
---

diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c
index 7e82777..49c30a7 100644
--- a/lustre/ldlm/ldlm_lock.c
+++ b/lustre/ldlm/ldlm_lock.c
@@ -1687,6 +1687,9 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns,
 		}
 	}
 
+	if (*flags & LDLM_FL_RESENT)
+		RETURN(ELDLM_OK);
+
 	/* For a replaying lock, it might be already in granted list. So
 	 * unlinking the lock will cause the interval node to be freed, we
 	 * have to allocate the interval node early otherwise we can't regrant
diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c
index 80b2951..a89b2f8 100644
--- a/lustre/ldlm/ldlm_lockd.c
+++ b/lustre/ldlm/ldlm_lockd.c
@@ -901,6 +901,11 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data)
 	total_enqueue_wait = cfs_time_sub(cfs_time_current_sec(),
 					  lock->l_last_activity);
 
+	if (OBD_FAIL_PRECHECK(OBD_FAIL_OST_LDLM_REPLY_NET)) {
+		LDLM_DEBUG(lock, "dropping CP AST");
+		RETURN(0);
+	}
+
 	req = ptlrpc_request_alloc(lock->l_export->exp_imp_reverse,
 				   &RQF_LDLM_CP_CALLBACK);
 	if (req == NULL)
@@ -1233,10 +1238,8 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns,
 	}
 #endif
 
-	if (unlikely(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))
-		flags |= LDLM_FL_RESENT;
-
-	if (unlikely(flags & (LDLM_FL_REPLAY | LDLM_FL_RESENT))) {
+	if (unlikely((flags & LDLM_FL_REPLAY) ||
+		     (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))) {
 		/* Find an existing lock in the per-export lock hash */
 		/* In the function below, .hs_keycmp resolves to
 		 * ldlm_export_lock_keycmp() */
@@ -1246,9 +1249,8 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns,
 		if (lock != NULL) {
 			DEBUG_REQ(D_DLMTRACE, req, "found existing lock cookie "
 				  LPX64, lock->l_handle.h_cookie);
+			flags |= LDLM_FL_RESENT;
 			GOTO(existing_lock, rc = 0);
-		} else {
-			flags &= ~LDLM_FL_RESENT;
 		}
 	}
 
diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh
index f7b1dda..3bc1b6c 100755
--- a/lustre/tests/replay-dual.sh
+++ b/lustre/tests/replay-dual.sh
@@ -881,6 +881,32 @@ run_test 24 "reconstruct on non-existing object"
 
 # end commit on sharing tests
 
+test_24() {
+	cancel_lru_locks osc
+
+	$SETSTRIPE -i 0 -c 1 $DIR/$tfile
+
+	# get lock for the 1st client
+	dd if=/dev/zero of=$DIR/$tfile count=1 >/dev/null ||
+		error "failed to write data"
+
+	# get waiting locks for the 2nd client
+	drop_ldlm_cancel "multiop $DIR2/$tfile Ow512" &
+	sleep 1
+
+#define OBD_FAIL_OST_LDLM_REPLY_NET	0x213
+	# failover, replay and resend replayed waiting locks
+	do_facet ost1 lctl set_param fail_loc=0x80000213
+	fail ost1
+
+	# multiop does not finish because the CP AST is skipped;
+	# it is ok to kill it in the test, because the CP AST is already
+	# re-sent and it does not hang forever in real life
+	killall multiop
+	wait
+}
+run_test 24 "replay|resend"
+
 complete $SECONDS
 SLEEP=$((`date +%s` - $NOW))
 [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP
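
For readers tracing the fix, below is a minimal standalone C sketch (not Lustre
source: the types, flag values and helper names are simplified stand-ins for
ldlm_lock_enqueue() and ldlm_handle_enqueue0()) of the behaviour the patch
establishes: when a resent or replayed enqueue matches an existing lock, it is
tagged LDLM_FL_RESENT and the enqueue path returns ELDLM_OK early, so the
already-granted lock is never re-queued and the assertion cannot fire.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define LDLM_FL_REPLAY  0x1ULL	/* stand-in value, not the real flag bit */
#define LDLM_FL_RESENT  0x2ULL	/* stand-in value, not the real flag bit */
#define ELDLM_OK        0

enum ldlm_mode { LCK_NONE = 0, LCK_PW = 1 };

struct ldlm_lock {
	enum ldlm_mode l_granted_mode;
	enum ldlm_mode l_req_mode;
};

/* Grant path: before the patch, a resent enqueue re-entered this code and
 * tripped the assertion because the lock was already granted. */
static int lock_enqueue(struct ldlm_lock *lock, uint64_t *flags)
{
	/* The fix: a resent lock is already granted or already waiting,
	 * so return early and leave the granted/waiting lists alone. */
	if (*flags & LDLM_FL_RESENT)
		return ELDLM_OK;

	assert(lock->l_granted_mode != lock->l_req_mode);
	lock->l_granted_mode = lock->l_req_mode;	/* "grant" the lock */
	return ELDLM_OK;
}

/* Server-side handler: a replayed or resent request that matches an
 * existing lock in the per-export hash is tagged LDLM_FL_RESENT before
 * the enqueue code runs again. */
static int handle_enqueue(struct ldlm_lock *existing, uint64_t flags)
{
	if (existing != NULL)
		flags |= LDLM_FL_RESENT;
	return lock_enqueue(existing, &flags);
}

int main(void)
{
	struct ldlm_lock lock = { .l_granted_mode = LCK_NONE,
				  .l_req_mode = LCK_PW };
	uint64_t flags = 0;

	lock_enqueue(&lock, &flags);		/* first enqueue: lock granted */
	handle_enqueue(&lock, LDLM_FL_REPLAY);	/* resend: early return, no assert */
	printf("resent enqueue handled without granting the lock twice\n");
	return 0;
}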