LU-14027 ldlm: Do not hang if recovery restarted during lock replay

author Oleg Drokin <green@whamcloud.com>

Wed, 14 Oct 2020 03:55:02 +0000 (23:55 -0400)

committer Oleg Drokin <green@whamcloud.com>

Thu, 19 Nov 2020 15:11:14 +0000 (15:11 +0000)
author Oleg Drokin <green@whamcloud.com>
Wed, 14 Oct 2020 03:55:02 +0000 (23:55 -0400)
committer Oleg Drokin <green@whamcloud.com>
Thu, 19 Nov 2020 15:11:14 +0000 (15:11 +0000)
diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h

index 3edec56..462ead5 100644 (file)
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -388,6 +388,7 @@ extern char obd_jobid_var[];
  #define OBD_FAIL_LDLM_GRANT_CHECK        0x32a
  #define OBD_FAIL_LDLM_PROLONG_PAUSE     0x32b
  #define OBD_FAIL_LDLM_LOCAL_CANCEL_PAUSE 0x32c
+#define OBD_FAIL_LDLM_LOCK_REPLAY       0x32d
  
  /* LOCKLESS IO */
  #define OBD_FAIL_LDLM_SET_CONTENTION     0x385
diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c

index baa75f5..a3386bc 100644 (file)
--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -2750,6 +2750,11 @@ static int target_recovery_thread(void *arg)
                 LASSERT(trd->trd_processing_task == current->pid);
                 DEBUG_REQ(D_HA, req, "processing lock from %s:",
                           libcfs_nid2str(req->rq_peer.nid));
+               if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_LOCK_REPLAY)) {
+                       req->rq_status = -ENODEV;
+                       target_request_copy_put(req);
+                       continue;
+               }
                 handle_recovery_req(thread, req,
                                     trd->trd_recovery_handler);
                 target_request_copy_put(req);
diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c

index 46b3751..baccc46 100644 (file)
--- a/lustre/ldlm/ldlm_request.c
+++ b/lustre/ldlm/ldlm_request.c
@@ -2580,9 +2580,11 @@ int __ldlm_replay_locks(struct obd_import *imp, bool rate_limit)
  
         list_for_each_entry_safe(lock, next, &list, l_pending_chain) {
                 list_del_init(&lock->l_pending_chain);
-               if (rc) {
+               /* If we got disconnected in the middle, clean up and let
+                * reconnection happen again. LU-14027 */
+               if (rc || (imp->imp_state != LUSTRE_IMP_REPLAY_LOCKS)) {
                         LDLM_LOCK_RELEASE(lock);
-                       continue; /* or try to do the rest? */
+                       continue;
                 }
                 rc = replay_one_lock(imp, lock);
                 LDLM_LOCK_RELEASE(lock);
diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh

index 5eac2d7..9c12920 100755 (executable)
--- a/lustre/tests/replay-single.sh
+++ b/lustre/tests/replay-single.sh
@@ -4939,6 +4939,51 @@ test_134() {
  }
  run_test 134 "replay creation of a file created in a pool"
  
+# LU-14027: do not hang if recovery is restarted during lock replay
+test_135() {
+       mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
+
+       # All files to ost1
+       $LFS setstripe -S $((128 * 1024)) -i 0 $DIR/$tdir
+
+       replay_barrier ost1
+
+       # Create 20 files so we have 20 ost locks
+       for i in $(seq 20) ; do
+               echo blah > $DIR/$tdir/file.${i}
+       done
+
+       shutdown_facet ost1
+       reboot_facet ost1
+       change_active ost1
+       wait_for_facet ost1
+
+       #define OBD_FAIL_LDLM_LOCK_REPLAY       0x32d
+       # Make sure lock replay server side never completes and errors out.
+       do_facet ost1 "$LCTL set_param fail_val=20"
+       do_facet ost1 "$LCTL set_param fail_loc=0x32d"
+
+       mount_facet ost1
+
+       # Now make sure we notice: the background sync must still be blocked
+       (sync;sync;sync) &
+       local PID=$!
+       sleep 20 # should we do something proactive to make reconnects go?
+       kill -0 $PID || error "Unexpected sync success"
+
+       shutdown_facet ost1
+       reboot_facet ost1
+       change_active ost1
+       wait_for_facet ost1
+
+       do_facet ost1 "$LCTL set_param fail_loc=0"
+       mount_facet ost1
+       echo blah > $DIR/$tdir/file.test2
+
+       rm -rf $DIR/$tdir
+}
+run_test 135 "Server failure in lock replay phase"
+
  complete $SECONDS
  check_and_cleanup_lustre
  exit_status
author	Oleg Drokin <green@whamcloud.com>
	Wed, 14 Oct 2020 03:55:02 +0000 (23:55 -0400)
committer	Oleg Drokin <green@whamcloud.com>
	Thu, 19 Nov 2020 15:11:14 +0000 (15:11 +0000)
lustre/include/obd_support.h		patch \| blob \| history
lustre/ldlm/ldlm_lib.c		patch \| blob \| history
lustre/ldlm/ldlm_request.c		patch \| blob \| history
lustre/tests/replay-single.sh		patch \| blob \| history