Whamcloud - gitweb
LU-16483 tests: replay-single test_200 fixes 91/50891/11
authorChris Horn <chris.horn@hpe.com>
Thu, 4 May 2023 22:48:28 +0000 (17:48 -0500)
committerOleg Drokin <green@whamcloud.com>
Thu, 27 Jul 2023 07:19:50 +0000 (07:19 +0000)
Modify test to ensure idle disconnect is enabled for all targets
except OST0000. This prevents an issue where an idle ping is sent to
another target instead of OST0000.

Re-work test to check the debug log for all relevant messages.

rcli is not set correctly when RCLIENTS contains multiple hostnames.
Fix it by not surrounding RCLIENTS with double quotes.

Added a debug statement to ptl_send_rpc(), and moved an existing one,
to faciliate debugging any future test failures.

Test-Parameters: trivial clients=3 testlist=replay-single env=ONLY=200,ONLY_REPEAT=100
Fixes: eb1f4a5 ("LU-16483 ptlrpc: Track highest reply XID")
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: If0a214092dad1e40f1b9e785e179ef67f686b85a
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/50891
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Alex Deiter <alex.deiter@gmail.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/ptlrpc/niobuf.c
lustre/tests/replay-single.sh

index 77c5c5c..8047bc5 100644 (file)
@@ -942,12 +942,15 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
        request->rq_deadline = request->rq_sent + request->rq_timeout +
                ptlrpc_at_get_net_latency(request);
 
+       DEBUG_REQ(D_INFO, request, "send flags=%x",
+                 lustre_msg_get_flags(request->rq_reqmsg));
+
        if (unlikely(opc == OBD_PING &&
-           CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND_FAIL, cfs_fail_val)))
+           CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND_FAIL, cfs_fail_val))) {
+               DEBUG_REQ(D_INFO, request, "Simulate delay send failure");
                GOTO(skip_send, rc);
+       }
 
-       DEBUG_REQ(D_INFO, request, "send flags=%x",
-                 lustre_msg_get_flags(request->rq_reqmsg));
        rc = ptl_send_buf(&request->rq_req_md_h,
                          request->rq_reqbuf, request->rq_reqdata_len,
                          LNET_NOACK_REQ, &request->rq_req_cbid,
index 0c1bb11..a4c2386 100755 (executable)
@@ -5017,10 +5017,21 @@ test_200() {
 
        local rcli old
 
-       rcli=$(echo "$RCLIENTS" | cut -d ' ' -f 1)
-       old=$(do_node "$rcli" "$LCTL get_param -n osc.*OST0000*.idle_timeout")
+       rcli=$(echo $RCLIENTS | cut -d ' ' -f 1)
 
-       if ((old != 0)); then
+       echo "Selected \"$rcli\" from \"$RCLIENTS\""
+
+       # We want idle disconnect enabled on all targets except OST0000. Test
+       # assumes all nodes are configured the same.
+       old=$(do_node $rcli "$LCTL get_param -n *.*.idle_timeout 2>/dev/null |
+             head -n 1")
+       [[ -n $old ]] || error "Cannot determine current idle_timeout"
+
+       if ((old == 0)); then
+               do_node "$rcli" "$LCTL set_param *.*.idle_timeout=10"
+               do_node "$rcli" "$LCTL set_param osc.*OST0000*.idle_timeout=0"
+               stack_trap "do_node $rcli $LCTL set_param *.*.idle_timeout=$old"
+       else
                do_node "$rcli" "$LCTL set_param osc.*OST0000*.idle_timeout=0"
                stack_trap "do_node $rcli $LCTL set_param osc.*OST0000*.idle_timeout=$old"
        fi
@@ -5056,7 +5067,7 @@ test_200() {
        # unsigned int ping_interval = (OBD_TIMEOUT_DEFAULT > 4) ?
        #                              (OBD_TIMEOUT_DEFAULT / 4) : 1;
        # Delay a ping for 3 intervals
-       local timeout delay ts1 ts2 elapsed
+       local timeout delay
 
        timeout=$(do_node "$rcli" "$LCTL get_param -n timeout")
        (( timeout > 4 )) && delay=$((3 * timeout / 4)) || delay=3
@@ -5064,8 +5075,6 @@ test_200() {
        do_node "$rcli" "$LCTL clear"
        log "delay ping ${delay}s"
 
-       ts1=$(date +%s)
-
        #define OBD_FAIL_PTLRPC_DELAY_SEND_FAIL    0x535
        do_node "$rcli" "$LCTL set_param fail_loc=0x80000535 fail_val=${delay}"
        # Send ping that will be delayed for ${delay} seconds
@@ -5073,49 +5082,52 @@ test_200() {
        # thread is delayed instead
        do_node "$rcli" "lctl set_param osc.*OST0000*.ping=1"
 
-       # Make sure we hit the fail_loc
-       wait_update --quiet "$rcli" \
-               "dmesg | tail -n 10 | \
-                grep -c 'fail_timeout id 535 sleeping for'" \
-               "1" "${timeout}"
-
-       (( $? != 0 )) && error "Did not hit fail_loc"
-
-       ts2=$(date +%s)
-
-       # If we raced with pinger thread then we need to wait ${delay} seconds
-       # for the ping to fail
-       elapsed=$((ts2 - ts1))
-
-       if (( elapsed < delay )); then
-               sleep $(((delay - elapsed) + 1))
-       fi
-
-       do_node "$rcli" "$LCTL dk" > "$TMP/lustre-log-${TESTNAME}.log"
-
-       local nsent
-
-       nsent=$(grep -c "ptl_send_rpc.*o400->.*OST0000" \
-               "$TMP/lustre-log-${TESTNAME}.log")
+       local logfile="$TMP/lustre-log-${TESTNAME}.log"
+
+       local nsent expired
+       local waited=0
+       local begin=$SECONDS
+       local max_wait=$((delay + 1))
+       local reason=""
+       local sleep=""
+       while (( $waited <= $max_wait )); do
+               [[ -z $sleep ]] || sleep $sleep
+               sleep=1
+               waited=$((SECONDS - begin))
+               do_node "$rcli" "$LCTL dk" >> ${logfile}
+
+               if ! grep -q 'fail_timeout id 535 sleeping for' $logfile; then
+                       reason="Did not hit fail_loc"
+                       continue
+               fi
 
-       if (( nsent <= 1)); then
-               cat "$TMP/lustre-log-${TESTNAME}.log"
-               rm -f "$TMP/lustre-log-${TESTNAME}.log"
-               error "Did not send more than 1 obd ping"
-       fi
+               if ! grep -q 'cfs_fail_timeout id 535 awake' $logfile; then
+                       reason="Delayed send did not wake"
+                       continue
+               fi
 
-       local expired
+               nsent=$(grep -c "ptl_send_rpc.*o400->.*OST0000" $logfile)
+               if (( nsent <= 1 )); then
+                       reason="Did not send more than 1 obd ping"
+                       continue
+               fi
 
-       expired=$(grep "Request sent has timed out .* o400->" \
-                 "$TMP/lustre-log-${TESTNAME}.log")
+               expired=$(grep "Request sent has timed out .* o400->" $logfile)
+               if [[ -z $expired ]]; then
+                       reason="RPC did not time out"
+                       continue
+               fi
+               reason=""
+               break
+       done
 
-       if [[ -z $expired ]]; then
-               cat "$TMP/lustre-log-${TESTNAME}.log"
-               rm -f "$TMP/lustre-log-${TESTNAME}.log"
-               error "RPC did not time out"
+       if [[ -n $reason ]]; then
+               cat $logfile
+               rm -f $logfile
+               error "$reason"
        else
                echo "${expired}"
-               rm -f "$TMP/lustre-log-${TESTNAME}.log"
+               rm -f $logfile
        fi
 
        declare -a conns2