LU-5266 ldlm: granting the same lock twice on recovery

author Vitaly Fertman <vitaly_fertman@xyratex.com>

Mon, 29 Sep 2014 23:21:21 +0000 (19:21 -0400)

committer Oleg Drokin <oleg.drokin@intel.com>

Tue, 7 Oct 2014 15:42:28 +0000 (11:42 -0400)
author Vitaly Fertman <vitaly_fertman@xyratex.com>
Mon, 29 Sep 2014 23:21:21 +0000 (19:21 -0400)
committer Oleg Drokin <oleg.drokin@intel.com>
Tue, 7 Oct 2014 15:42:28 +0000 (11:42 -0400)
diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c

index 7e82777..49c30a7 100644 (file)
--- a/lustre/ldlm/ldlm_lock.c
+++ b/lustre/ldlm/ldlm_lock.c
@@ -1687,6 +1687,9 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns,
                  }
          }
  
+       if (*flags & LDLM_FL_RESENT)
+               RETURN(ELDLM_OK);
+
         /* For a replaying lock, it might be already in granted list. So
          * unlinking the lock will cause the interval node to be freed, we
          * have to allocate the interval node early otherwise we can't regrant
diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c

index 80b2951..a89b2f8 100644 (file)
--- a/lustre/ldlm/ldlm_lockd.c
+++ b/lustre/ldlm/ldlm_lockd.c
@@ -901,6 +901,11 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data)
          total_enqueue_wait = cfs_time_sub(cfs_time_current_sec(),
                                            lock->l_last_activity);
  
+       if (OBD_FAIL_PRECHECK(OBD_FAIL_OST_LDLM_REPLY_NET)) {
+               LDLM_DEBUG(lock, "dropping CP AST");
+               RETURN(0);
+       }
+
          req = ptlrpc_request_alloc(lock->l_export->exp_imp_reverse,
                                      &RQF_LDLM_CP_CALLBACK);
          if (req == NULL)
@@ -1233,10 +1238,8 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns,
          }
  #endif
  
-       if (unlikely(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))
-               flags |= LDLM_FL_RESENT;
-
-       if (unlikely(flags & (LDLM_FL_REPLAY | LDLM_FL_RESENT))) {
+       if (unlikely((flags & LDLM_FL_REPLAY) ||
+                    (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))) {
                  /* Find an existing lock in the per-export lock hash */
                 /* In the function below, .hs_keycmp resolves to
                  * ldlm_export_lock_keycmp() */
@@ -1246,9 +1249,8 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns,
                  if (lock != NULL) {
                          DEBUG_REQ(D_DLMTRACE, req, "found existing lock cookie "
                                    LPX64, lock->l_handle.h_cookie);
+                       flags |= LDLM_FL_RESENT;
                          GOTO(existing_lock, rc = 0);
-               } else {
-                       flags &= ~LDLM_FL_RESENT;
                 }
          }
  
diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh

index f7b1dda..3bc1b6c 100755 (executable)
--- a/lustre/tests/replay-dual.sh
+++ b/lustre/tests/replay-dual.sh
@@ -881,6 +881,32 @@ run_test 24 "reconstruct on non-existing object"
  
  # end commit on sharing tests
  
+test_24() {
+       cancel_lru_locks osc
+
+       $SETSTRIPE -i 0 -c 1 $DIR/$tfile
+
+       # get lock for the 1st client
+       dd if=/dev/zero of=$DIR/$tfile count=1 >/dev/null ||
+               error "failed to write data"
+
+       # get waiting locks for the 2nd client
+       drop_ldlm_cancel "multiop $DIR2/$tfile Ow512" &
+       sleep 1
+
+#define OBD_FAIL_OST_LDLM_REPLY_NET      0x213
+       # failover, replay and resend replayed waiting locks
+       do_facet ost1 lctl set_param fail_loc=0x80000213
+       fail ost1
+
+       # multiop does not finish because CP AST is skipped;
+       # it is ok to kill it in the test, because CP AST is already re-sent
+       # and it does not hang forever in real life
+       killall multiop
+       wait
+}
+run_test 24 "replay|resend"
+
  complete $SECONDS
  SLEEP=$((`date +%s` - $NOW))
  [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP
author	Vitaly Fertman <vitaly_fertman@xyratex.com>
	Mon, 29 Sep 2014 23:21:21 +0000 (19:21 -0400)
committer	Oleg Drokin <oleg.drokin@intel.com>
	Tue, 7 Oct 2014 15:42:28 +0000 (11:42 -0400)
lustre/ldlm/ldlm_lock.c		patch \| blob \| history
lustre/ldlm/ldlm_lockd.c		patch \| blob \| history
lustre/tests/replay-dual.sh		patch \| blob \| history