LU-13614 ldlm: revert LU-11762

author Vladimir Saveliev <c17830@cray.com>

Tue, 28 Jul 2020 22:28:22 +0000 (01:28 +0300)

committer Oleg Drokin <green@whamcloud.com>

Mon, 12 Oct 2020 05:48:46 +0000 (05:48 +0000)
author Vladimir Saveliev <c17830@cray.com>
Tue, 28 Jul 2020 22:28:22 +0000 (01:28 +0300)
committer Oleg Drokin <green@whamcloud.com>
Mon, 12 Oct 2020 05:48:46 +0000 (05:48 +0000)
diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h

index 2548973..bed99d1 100644 (file)
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -647,6 +647,7 @@ extern char obd_jobid_var[];
  #define OBD_FAIL_LARGE_STRIPE          0x1703
  #define OBD_FAIL_OUT_ENOSPC             0x1704
  #define OBD_FAIL_INVALIDATE_UPDATE     0x1705
+#define OBD_FAIL_OUT_UPDATE_DROP        0x1707
  
  /* MIGRATE */
  #define OBD_FAIL_MIGRATE_ENTRIES               0x1801
diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c

index 74d769b..d6d793b 100644 (file)
--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -2266,8 +2266,7 @@ repeat:
                 /** evict exports which didn't finish recovery yet */
                 class_disconnect_stale_exports(obd, exp_finished);
                 return 1;
-       } else if (obd->obd_recovery_expired &&
-                  obd->obd_recovery_timeout < obd->obd_recovery_time_hard) {
+       } else if (obd->obd_recovery_expired) {
                 obd->obd_recovery_expired = 0;
  
                 /** If some clients died being recovered, evict them */
diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c

index 9193550..339fe68 100644 (file)
--- a/lustre/ptlrpc/niobuf.c
+++ b/lustre/ptlrpc/niobuf.c
@@ -713,6 +713,12 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC))
                 RETURN(0);
  
+       if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DELAY_RECOV) &&
+                    lustre_msg_get_opc(request->rq_reqmsg) == MDS_CONNECT &&
+                    strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) == 0)) {
+               RETURN(0);
+       }
+
         LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST);
         LASSERT(request->rq_wait_ctx == 0);
  
diff --git a/lustre/target/tgt_handler.c b/lustre/target/tgt_handler.c

index 83ecb5f..5b49504 100644 (file)
--- a/lustre/target/tgt_handler.c
+++ b/lustre/target/tgt_handler.c
@@ -419,6 +419,11 @@ static int tgt_handle_request0(struct tgt_session_info *tsi,
                      OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_MULTI_NET)))
                 RETURN(0);
  
+       /* drop OUT_UPDATE rpc */
+       if (unlikely(lustre_msg_get_opc(req->rq_reqmsg) == OUT_UPDATE &&
+                    OBD_FAIL_CHECK(OBD_FAIL_OUT_UPDATE_DROP)))
+               RETURN(0);
+
         rc = tgt_request_preprocess(tsi, h, req);
         /* pack reply if reply format is fixed */
         if (rc == 0 && h->th_flags & HAS_REPLY) {
diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh

index 6ce8a57..60b78f2 100755 (executable)
--- a/lustre/tests/recovery-small.sh
+++ b/lustre/tests/recovery-small.sh
@@ -3036,6 +3036,50 @@ test_143() {
  }
  run_test 143 "orphan cleanup thread shouldn't be blocked even delete failed"
  
+# Fail over mds1 while mds_connect from mds2 and OUT_UPDATE rpcs on mds3 are
+# blocked through fail_loc, wait for mds1's recovery_time_soft to report the
+# recovery_time_hard value, then clear both fail_locs and verify that the
+# file created after the replay barrier still exists.
+test_145() {
+       [ $MDSCOUNT -lt 3 ] && skip "needs >= 3 MDTs"
+       [ "$(facet_active_host mds2)" = "$(facet_active_host mds3)" ] &&
+               skip "needs mds2 and mds3 on separate nodes"
+
+       replay_barrier mds1
+
+       touch $DIR/$tfile
+
+#define OBD_FAIL_PTLRPC_DELAY_RECOV      0x507
+       echo block mds_connect from mds2
+       do_facet mds2 "$LCTL set_param fail_loc=0x507"
+
+#define OBD_FAIL_OUT_UPDATE_DROP       0x1707
+       echo block recovery updates from mds3
+       do_facet mds3 "$LCTL set_param fail_loc=0x1707"
+
+       local hard_timeout=\
+$(do_facet mds1 $LCTL get_param -n mdt.$FSNAME-MDT0000.recovery_time_hard)
+
+       fail mds1 &
+
+       local get_soft_timeout_cmd=\
+"$LCTL get_param -n mdt.$FSNAME-MDT0000.recovery_time_soft 2>/dev/null"
+
+       echo wait until mds1 recovery_time_soft is $hard_timeout
+       wait_update $(facet_host mds1) "$get_soft_timeout_cmd" \
+"$hard_timeout" $hard_timeout
+
+       echo unblock mds_connect from mds2
+       do_facet mds2 "$LCTL set_param fail_loc=0"
+
+       echo unblock recovery updates from mds3
+       do_facet mds3 "$LCTL set_param fail_loc=0"
+
+       wait
+       [ -f $DIR/$tfile ] || error "$DIR/$tfile does not exist"
+}
+run_test 145 "connect mdtlovs and process update logs after recovery expire"
+
  complete $SECONDS
  check_and_cleanup_lustre
  exit_status
diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh

index 726d773..1e1124a 100755 (executable)
--- a/lustre/tests/replay-single.sh
+++ b/lustre/tests/replay-single.sh
@@ -8,11 +8,12 @@ init_test_env $@
  init_logging
  
  ALWAYS_EXCEPT="$REPLAY_SINGLE_EXCEPT "
+# bug number for skipped test: LU-13614
+ALWAYS_EXCEPT+="               59"
  
  if [ "$mds1_FSTYPE" = zfs ]; then
         # bug number for skipped test: LU-11388
         ALWAYS_EXCEPT+="               131b"
-       # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
  fi
  if $SHARED_KEY; then
         # bug number for skipped tests: LU-9795 (all below)
@@ -21,6 +22,7 @@ if $SHARED_KEY; then
         ALWAYS_EXCEPT="$ALWAYS_EXCEPT   85b     86      88      89      90"
         ALWAYS_EXCEPT="$ALWAYS_EXCEPT   93a     100a    100b    120"
  fi
+# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
  
  build_test_filter
author	Vladimir Saveliev <c17830@cray.com>
	Tue, 28 Jul 2020 22:28:22 +0000 (01:28 +0300)
committer	Oleg Drokin <green@whamcloud.com>
	Mon, 12 Oct 2020 05:48:46 +0000 (05:48 +0000)
lustre/include/obd_support.h		patch \| blob \| history
lustre/ldlm/ldlm_lib.c		patch \| blob \| history
lustre/ptlrpc/niobuf.c		patch \| blob \| history
lustre/target/tgt_handler.c		patch \| blob \| history
lustre/tests/recovery-small.sh		patch \| blob \| history
lustre/tests/replay-single.sh		patch \| blob \| history