From: Chris Horn Date: Thu, 4 May 2023 22:48:28 +0000 (-0500) Subject: LU-16483 tests: replay-single test_200 fixes X-Git-Tag: 2.15.57~22 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=fdfdf5c05cf64294068a5cbfe818b64bd9e577f9;p=fs%2Flustre-release.git LU-16483 tests: replay-single test_200 fixes Modify test to ensure idle disconnect is enabled for all targets except OST0000. This prevents an issue where an idle ping is sent to another target instead of OST0000. Re-work test to check the debug log for all relevant messages. rcli is not set correctly when RCLIENTS contains multiple hostnames. Fix it by not surrounding RCLIENTS with double quotes. Added a debug statement to ptl_send_rpc(), and moved an existing one, to faciliate debugging any future test failures. Test-Parameters: trivial clients=3 testlist=replay-single env=ONLY=200,ONLY_REPEAT=100 Fixes: eb1f4a5 ("LU-16483 ptlrpc: Track highest reply XID") Signed-off-by: Chris Horn Change-Id: If0a214092dad1e40f1b9e785e179ef67f686b85a Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/50891 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Alex Deiter Reviewed-by: Oleg Drokin --- diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 77c5c5c..8047bc5 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -942,12 +942,15 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) request->rq_deadline = request->rq_sent + request->rq_timeout + ptlrpc_at_get_net_latency(request); + DEBUG_REQ(D_INFO, request, "send flags=%x", + lustre_msg_get_flags(request->rq_reqmsg)); + if (unlikely(opc == OBD_PING && - CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND_FAIL, cfs_fail_val))) + CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND_FAIL, cfs_fail_val))) { + DEBUG_REQ(D_INFO, request, "Simulate delay send failure"); GOTO(skip_send, rc); + } - DEBUG_REQ(D_INFO, request, "send flags=%x", - lustre_msg_get_flags(request->rq_reqmsg)); rc = ptl_send_buf(&request->rq_req_md_h, request->rq_reqbuf, request->rq_reqdata_len, LNET_NOACK_REQ, &request->rq_req_cbid, diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 0c1bb11..a4c2386 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -5017,10 +5017,21 @@ test_200() { local rcli old - rcli=$(echo "$RCLIENTS" | cut -d ' ' -f 1) - old=$(do_node "$rcli" "$LCTL get_param -n osc.*OST0000*.idle_timeout") + rcli=$(echo $RCLIENTS | cut -d ' ' -f 1) - if ((old != 0)); then + echo "Selected \"$rcli\" from \"$RCLIENTS\"" + + # We want idle disconnect enabled on all targets except OST0000. Test + # assumes all nodes are configured the same. + old=$(do_node $rcli "$LCTL get_param -n *.*.idle_timeout 2>/dev/null | + head -n 1") + [[ -n $old ]] || error "Cannot determine current idle_timeout" + + if ((old == 0)); then + do_node "$rcli" "$LCTL set_param *.*.idle_timeout=10" + do_node "$rcli" "$LCTL set_param osc.*OST0000*.idle_timeout=0" + stack_trap "do_node $rcli $LCTL set_param *.*.idle_timeout=$old" + else do_node "$rcli" "$LCTL set_param osc.*OST0000*.idle_timeout=0" stack_trap "do_node $rcli $LCTL set_param osc.*OST0000*.idle_timeout=$old" fi @@ -5056,7 +5067,7 @@ test_200() { # unsigned int ping_interval = (OBD_TIMEOUT_DEFAULT > 4) ? # (OBD_TIMEOUT_DEFAULT / 4) : 1; # Delay a ping for 3 intervals - local timeout delay ts1 ts2 elapsed + local timeout delay timeout=$(do_node "$rcli" "$LCTL get_param -n timeout") (( timeout > 4 )) && delay=$((3 * timeout / 4)) || delay=3 @@ -5064,8 +5075,6 @@ test_200() { do_node "$rcli" "$LCTL clear" log "delay ping ${delay}s" - ts1=$(date +%s) - #define OBD_FAIL_PTLRPC_DELAY_SEND_FAIL 0x535 do_node "$rcli" "$LCTL set_param fail_loc=0x80000535 fail_val=${delay}" # Send ping that will be delayed for ${delay} seconds @@ -5073,49 +5082,52 @@ test_200() { # thread is delayed instead do_node "$rcli" "lctl set_param osc.*OST0000*.ping=1" - # Make sure we hit the fail_loc - wait_update --quiet "$rcli" \ - "dmesg | tail -n 10 | \ - grep -c 'fail_timeout id 535 sleeping for'" \ - "1" "${timeout}" - - (( $? != 0 )) && error "Did not hit fail_loc" - - ts2=$(date +%s) - - # If we raced with pinger thread then we need to wait ${delay} seconds - # for the ping to fail - elapsed=$((ts2 - ts1)) - - if (( elapsed < delay )); then - sleep $(((delay - elapsed) + 1)) - fi - - do_node "$rcli" "$LCTL dk" > "$TMP/lustre-log-${TESTNAME}.log" - - local nsent - - nsent=$(grep -c "ptl_send_rpc.*o400->.*OST0000" \ - "$TMP/lustre-log-${TESTNAME}.log") + local logfile="$TMP/lustre-log-${TESTNAME}.log" + + local nsent expired + local waited=0 + local begin=$SECONDS + local max_wait=$((delay + 1)) + local reason="" + local sleep="" + while (( $waited <= $max_wait )); do + [[ -z $sleep ]] || sleep $sleep + sleep=1 + waited=$((SECONDS - begin)) + do_node "$rcli" "$LCTL dk" >> ${logfile} + + if ! grep -q 'fail_timeout id 535 sleeping for' $logfile; then + reason="Did not hit fail_loc" + continue + fi - if (( nsent <= 1)); then - cat "$TMP/lustre-log-${TESTNAME}.log" - rm -f "$TMP/lustre-log-${TESTNAME}.log" - error "Did not send more than 1 obd ping" - fi + if ! grep -q 'cfs_fail_timeout id 535 awake' $logfile; then + reason="Delayed send did not wake" + continue + fi - local expired + nsent=$(grep -c "ptl_send_rpc.*o400->.*OST0000" $logfile) + if (( nsent <= 1 )); then + reason="Did not send more than 1 obd ping" + continue + fi - expired=$(grep "Request sent has timed out .* o400->" \ - "$TMP/lustre-log-${TESTNAME}.log") + expired=$(grep "Request sent has timed out .* o400->" $logfile) + if [[ -z $expired ]]; then + reason="RPC did not time out" + continue + fi + reason="" + break + done - if [[ -z $expired ]]; then - cat "$TMP/lustre-log-${TESTNAME}.log" - rm -f "$TMP/lustre-log-${TESTNAME}.log" - error "RPC did not time out" + if [[ -n $reason ]]; then + cat $logfile + rm -f $logfile + error "$reason" else echo "${expired}" - rm -f "$TMP/lustre-log-${TESTNAME}.log" + rm -f $logfile fi declare -a conns2