Commit
fe5c801657 introduced a problem for recovery.
When the recovery timeout reaches the hard recovery timeout,
target_recovery_overseer() leaves the obd_recovery_expired flag set. That
makes check_for_next_transno() not wait until the next replay request
arrives, which leads to the assertion:
LASSERT(atomic_read(&obd->obd_req_replay_clients) == 0);
A test to illustrate the issue is added.
replay-single.sh:test_59 is added to the ALWAYS_EXCEPT list:
it was harmlessly broken before this patch, and this patch makes that
test really fail because of that defect.
Fixes: fe5c80165 ("LU-11762 ldlm: ensure the recovery timer is armed")
HPE-bug-id: LUS-8299
Signed-off-by: Vladimir Saveliev <c17830@cray.com>
Change-Id: Ia694a519b5d73620be3014e92fd671d388550979
Reviewed-on: https://review.whamcloud.com/39532
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
#define OBD_FAIL_LARGE_STRIPE 0x1703
#define OBD_FAIL_OUT_ENOSPC 0x1704
#define OBD_FAIL_INVALIDATE_UPDATE 0x1705
+#define OBD_FAIL_OUT_UPDATE_DROP 0x1707
/* MIGRATE */
#define OBD_FAIL_MIGRATE_ENTRIES 0x1801
/** evict exports which didn't finish recovery yet */
class_disconnect_stale_exports(obd, exp_finished);
return 1;
- } else if (obd->obd_recovery_expired &&
- obd->obd_recovery_timeout < obd->obd_recovery_time_hard) {
+ } else if (obd->obd_recovery_expired) {
obd->obd_recovery_expired = 0;
/** If some clients died being recovered, evict them */
if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC))
RETURN(0);
+ if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DELAY_RECOV) &&
+ lustre_msg_get_opc(request->rq_reqmsg) == MDS_CONNECT &&
+ strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) == 0)) {
+ RETURN(0);
+ }
+
LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST);
LASSERT(request->rq_wait_ctx == 0);
OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_MULTI_NET)))
RETURN(0);
+ /* drop OUT_UPDATE rpc */
+ if (unlikely(lustre_msg_get_opc(req->rq_reqmsg) == OUT_UPDATE &&
+ OBD_FAIL_CHECK(OBD_FAIL_OUT_UPDATE_DROP)))
+ RETURN(0);
+
rc = tgt_request_preprocess(tsi, h, req);
/* pack reply if reply format is fixed */
if (rc == 0 && h->th_flags & HAS_REPLY) {
}
run_test 143 "orphan cleanup thread shouldn't be blocked even delete failed"
+# Fail over mds1 while mds2 has fail_loc 0x507 and mds3 has fail_loc 0x1707
+# set, wait for mds1's recovery_time_soft to reach recovery_time_hard, then
+# clear both fail_locs and check that the file created after the replay
+# barrier still exists once the failover has finished.
+test_145() {
+ # requires three MDTs, with mds2 and mds3 active on different hosts
+ [ $MDSCOUNT -lt 3 ] && skip "needs >= 3 MDTs"
+ [ $(facet_active_host mds2) = $(facet_active_host mds3) ] &&
+ skip "needs mds2 and mds3 on separate nodes"
+
+ replay_barrier mds1
+
+ # created after the barrier; its existence is verified at the end
+ touch $DIR/$tfile
+
+#define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507
+ echo block mds_connect from mds2
+ do_facet mds2 "$LCTL set_param fail_loc=0x507"
+
+#define OBD_FAIL_OUT_UPDATE_DROP 0x1707
+ echo block recovery updates from mds3
+ do_facet mds3 "$LCTL set_param fail_loc=0x1707"
+
+ # read the hard limit before mds1 is failed
+ local hard_timeout=\
+$(do_facet mds1 $LCTL get_param -n mdt.$FSNAME-MDT0000.recovery_time_hard)
+
+ # run the failover in the background; it is collected by "wait" below
+ fail mds1 &
+
+ local get_soft_timeout_cmd=\
+"$LCTL get_param -n mdt.$FSNAME-MDT0000.recovery_time_soft 2>/dev/null"
+
+ # poll mds1 until recovery_time_soft equals recovery_time_hard
+ # NOTE(review): the last wait_update argument is presumably the maximum
+ # wait in seconds - confirm against test-framework.sh
+ echo wait until mds1 recovery_time_soft is $hard_timeout
+ wait_update $(facet_host mds1) "$get_soft_timeout_cmd" \
+"$hard_timeout" $hard_timeout
+
+ echo unblock mds_connect from mds2
+ do_facet mds2 "$LCTL set_param fail_loc=0"
+
+ echo unblock recovery updates from mds3
+ do_facet mds3 "$LCTL set_param fail_loc=0"
+
+ # wait for the background "fail mds1" to complete
+ wait
+ [ -f $DIR/$tfile ] || error "$DIR/$tfile does not exist"
+}
+run_test 145 "connect mdtlovs and process update logs after recovery expire"
+
complete $SECONDS
check_and_cleanup_lustre
exit_status
init_logging
ALWAYS_EXCEPT="$REPLAY_SINGLE_EXCEPT "
+# bug number for skipped test: LU-13614
+ALWAYS_EXCEPT+=" 59"
if [ "$mds1_FSTYPE" = zfs ]; then
# bug number for skipped test: LU-11388
ALWAYS_EXCEPT+=" 131b"
- # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
fi
if $SHARED_KEY; then
# bug number for skipped tests: LU-9795 (all below)
ALWAYS_EXCEPT="$ALWAYS_EXCEPT 85b 86 88 89 90"
ALWAYS_EXCEPT="$ALWAYS_EXCEPT 93a 100a 100b 120"
fi
+# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
build_test_filter