Whamcloud - gitweb
LU-13614 ldlm: revert LU-11762 32/39532/7
author: Vladimir Saveliev <c17830@cray.com>
Tue, 28 Jul 2020 22:28:22 +0000 (01:28 +0300)
committer: Oleg Drokin <green@whamcloud.com>
Mon, 12 Oct 2020 05:48:46 +0000 (05:48 +0000)
Commit fe5c801657 introduced a problem for recovery.

When the recovery timeout reaches the hard recovery timeout,
target_recovery_overseer() leaves the obd_recovery_expired flag set. That
makes check_for_next_transno() not wait until the next replay request
arrives, which trips the assertion:
LASSERT(atomic_read(&obd->obd_req_replay_clients) == 0);

A test to illustrate the issue is added.

replay-single.sh:test_59 is added to the ALWAYS_EXCEPT list:
  it was broken harmlessly before this patch and this patch made that
  test really fail due to that defect.

Fixes: fe5c80165 ("LU-11762 ldlm: ensure the recovery timer is armed")
HPE-bug-id: LUS-8299
Signed-off-by: Vladimir Saveliev <c17830@cray.com>
Change-Id: Ia694a519b5d73620be3014e92fd671d388550979
Reviewed-on: https://review.whamcloud.com/39532
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/ldlm/ldlm_lib.c
lustre/ptlrpc/niobuf.c
lustre/target/tgt_handler.c
lustre/tests/recovery-small.sh
lustre/tests/replay-single.sh

index 2548973..bed99d1 100644 (file)
@@ -647,6 +647,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_LARGE_STRIPE          0x1703
 #define OBD_FAIL_OUT_ENOSPC             0x1704
 #define OBD_FAIL_INVALIDATE_UPDATE     0x1705
+#define OBD_FAIL_OUT_UPDATE_DROP        0x1707
 
 /* MIGRATE */
 #define OBD_FAIL_MIGRATE_ENTRIES               0x1801
index 74d769b..d6d793b 100644 (file)
@@ -2266,8 +2266,7 @@ repeat:
                /** evict exports which didn't finish recovery yet */
                class_disconnect_stale_exports(obd, exp_finished);
                return 1;
-       } else if (obd->obd_recovery_expired &&
-                  obd->obd_recovery_timeout < obd->obd_recovery_time_hard) {
+       } else if (obd->obd_recovery_expired) {
                obd->obd_recovery_expired = 0;
 
                /** If some clients died being recovered, evict them */
index 9193550..339fe68 100644 (file)
@@ -713,6 +713,12 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
        if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC))
                RETURN(0);
 
+       if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DELAY_RECOV) &&
+                    lustre_msg_get_opc(request->rq_reqmsg) == MDS_CONNECT &&
+                    strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) == 0)) {
+               RETURN(0);
+       }
+
        LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST);
        LASSERT(request->rq_wait_ctx == 0);
 
index 83ecb5f..5b49504 100644 (file)
@@ -419,6 +419,11 @@ static int tgt_handle_request0(struct tgt_session_info *tsi,
                     OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_MULTI_NET)))
                RETURN(0);
 
+       /* drop OUT_UPDATE rpc */
+       if (unlikely(lustre_msg_get_opc(req->rq_reqmsg) == OUT_UPDATE &&
+                    OBD_FAIL_CHECK(OBD_FAIL_OUT_UPDATE_DROP)))
+               RETURN(0);
+
        rc = tgt_request_preprocess(tsi, h, req);
        /* pack reply if reply format is fixed */
        if (rc == 0 && h->th_flags & HAS_REPLY) {
index 6ce8a57..60b78f2 100755 (executable)
@@ -3036,6 +3036,46 @@ test_143() {
 }
 run_test 143 "orphan cleanup thread shouldn't be blocked even delete failed"
 
+test_145() {
+       [ $MDSCOUNT -lt 3 ] && skip "needs >= 3 MDTs"
+       [ $(facet_active_host mds2) = $(facet_active_host mds3) ] &&
+               skip "needs mds2 and mds3 on separate nodes"
+
+       replay_barrier mds1
+
+       touch $DIR/$tfile
+
+#define OBD_FAIL_PTLRPC_DELAY_RECOV      0x507
+       echo block mds_connect from mds2
+       do_facet mds2 "$LCTL set_param fail_loc=0x507"
+
+#define OBD_FAIL_OUT_UPDATE_DROP       0x1707
+       echo block recovery updates from mds3
+       do_facet mds3 "$LCTL set_param fail_loc=0x1707"
+
+       local hard_timeout=\
+$(do_facet mds1 $LCTL get_param -n mdt.$FSNAME-MDT0000.recovery_time_hard)
+
+       fail mds1 &
+
+       local get_soft_timeout_cmd=\
+"$LCTL get_param -n mdt.$FSNAME-MDT0000.recovery_time_soft 2>/dev/null"
+
+       echo wait until mds1 recovery_time_soft is $hard_timeout
+       wait_update $(facet_host mds1) "$get_soft_timeout_cmd" \
+"$hard_timeout" $hard_timeout
+
+       echo unblock mds_connect from mds2
+       do_facet mds2 "$LCTL set_param fail_loc=0"
+
+       echo upblock recovery updates from mds3
+       do_facet mds3 "$LCTL set_param fail_loc=0"
+
+       wait
+       [ -f $DIR/$tfile ] || error "$DIR/$tfile does not exist"
+}
+run_test 145 "connect mdtlovs and process update logs after recovery expire"
+
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status
index 726d773..1e1124a 100755 (executable)
@@ -8,11 +8,12 @@ init_test_env $@
 init_logging
 
 ALWAYS_EXCEPT="$REPLAY_SINGLE_EXCEPT "
+# bug number for skipped test: LU-13614
+ALWAYS_EXCEPT+="               59"
 
 if [ "$mds1_FSTYPE" = zfs ]; then
        # bug number for skipped test: LU-11388
        ALWAYS_EXCEPT+="               131b"
-       # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
 fi
 if $SHARED_KEY; then
        # bug number for skipped tests: LU-9795 (all below)
@@ -21,6 +22,7 @@ if $SHARED_KEY; then
        ALWAYS_EXCEPT="$ALWAYS_EXCEPT   85b     86      88      89      90"
        ALWAYS_EXCEPT="$ALWAYS_EXCEPT   93a     100a    100b    120"
 fi
+# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
 
 build_test_filter