Whamcloud - gitweb
LU-16483 ptlrpc: Track highest reply XID 07/49807/10
authorChris Horn <chris.horn@hpe.com>
Fri, 27 Jan 2023 20:47:25 +0000 (14:47 -0600)
committerOleg Drokin <green@whamcloud.com>
Tue, 18 Apr 2023 03:22:21 +0000 (03:22 +0000)
Keep track of the highest XID that we've received a reply for.
When an OBD_PING expires, do not disconnect the import if the failed
XID is less than or equal to the last reply XID. This avoids situation
where a lost OBD_PING rpc causes a reconnect even though we've
completed other RPCs in the meantime.

HPE-bug-id: LUS-11474
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: I7e66bcc1368fa41ec86ffd843abac676f8d29254
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49807
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/lustre_import.h
lustre/include/obd_support.h
lustre/ptlrpc/client.c
lustre/ptlrpc/events.c
lustre/ptlrpc/niobuf.c
lustre/tests/replay-single.sh

index 7b97c55..71b9d14 100644 (file)
@@ -203,8 +203,10 @@ struct obd_import {
 
        /** List of not replied requests */
        struct list_head        imp_unreplied_list;
-       /** Known maximal replied XID */
+       /** XID below which we know all replies have been received */
        __u64                   imp_known_replied_xid;
+       /** highest XID for which we have received a reply */
+       __u64                   imp_highest_replied_xid;
 
        /** obd device for this import */
        struct obd_device       *imp_obd;
index 732dfa6..4106083 100644 (file)
@@ -474,6 +474,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_NET_ERROR_RPC          0x532
 #define OBD_FAIL_PTLRPC_IDLE_RACE       0x533
 #define OBD_FAIL_PTLRPC_ENQ_RESEND      0x534
+#define OBD_FAIL_PTLRPC_DELAY_SEND_FAIL         0x535
 
 #define OBD_FAIL_OBD_PING_NET            0x600
 /*     OBD_FAIL_OBD_LOG_CANCEL_NET      0x601 obsolete since 1.5 */
index deac037..657aa63 100644 (file)
@@ -2289,13 +2289,15 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink)
        struct obd_import *imp = req->rq_import;
        unsigned int debug_mask = D_RPCTRACE;
        int rc = 0;
+       __u32 opc;
 
        ENTRY;
        spin_lock(&req->rq_lock);
        req->rq_timedout = 1;
        spin_unlock(&req->rq_lock);
 
-       if (ptlrpc_console_allow(req, lustre_msg_get_opc(req->rq_reqmsg),
+       opc = lustre_msg_get_opc(req->rq_reqmsg);
+       if (ptlrpc_console_allow(req, opc,
                                 lustre_msg_get_status(req->rq_reqmsg)))
                debug_mask = D_WARNING;
        DEBUG_REQ(debug_mask, req, "Request sent has %s: [sent %lld/real %lld]",
@@ -2352,7 +2354,9 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink)
                rc = 1;
        }
 
-       ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg));
+       if (opc != OBD_PING || req->rq_xid > imp->imp_highest_replied_xid)
+               ptlrpc_fail_import(imp,
+                                  lustre_msg_get_conn_cnt(req->rq_reqmsg));
 
        RETURN(rc);
 }
index d616b74..33ce2b0 100644 (file)
@@ -177,6 +177,9 @@ void reply_in_callback(struct lnet_event *ev)
        if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING)
                req->rq_import->imp_last_reply_time = ktime_get_real_seconds();
 
+       if (req->rq_xid > req->rq_import->imp_highest_replied_xid)
+               req->rq_import->imp_highest_replied_xid = req->rq_xid;
+
 out_wake:
        /* NB don't unlock till after wakeup; req can disappear under us
         * since we don't have our own ref */
index 64dfa70..d1c3b1c 100644 (file)
@@ -942,6 +942,10 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
        request->rq_deadline = request->rq_sent + request->rq_timeout +
                ptlrpc_at_get_net_latency(request);
 
+       if (unlikely(opc == OBD_PING &&
+           OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND_FAIL, cfs_fail_val)))
+               GOTO(skip_send, rc);
+
        DEBUG_REQ(D_INFO, request, "send flags=%x",
                  lustre_msg_get_flags(request->rq_reqmsg));
        rc = ptl_send_buf(&request->rq_req_md_h,
@@ -954,6 +958,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
        if (likely(rc == 0))
                GOTO(out, rc);
 
+skip_send:
        request->rq_req_unlinked = 1;
        ptlrpc_req_finished(request);
        if (noreply)
index 15abd0f..50ca21c 100755 (executable)
@@ -5018,6 +5018,136 @@ test_136() {
 }
 run_test 136 "MDS to disconnect all OSPs first, then cleanup ldlm"
 
+test_200() {
+       [[ -z $RCLIENTS ]] && skip "Need remote client"
+
+       local rcli old
+
+       rcli=$(echo "$RCLIENTS" | cut -d ' ' -f 1)
+       old=$(do_node "$rcli" "$LCTL get_param -n osc.*OST0000*.idle_timeout")
+
+       if ((old != 0)); then
+               do_node "$rcli" "$LCTL set_param osc.*OST0000*.idle_timeout=0"
+               stack_trap "do_node $rcli $LCTL set_param osc.*OST0000*.idle_timeout=$old"
+       fi
+
+       # Ensure OST0 import is idle
+       wait_update "$rcli" \
+               "$LCTL get_param osc.*OST0000*.import | \
+                awk 'BEGIN{ count=0 } /idle: 0/ { count+=1 } \
+                     END { print count }'" \
+               "0" "30"
+
+       (( $? != 0 )) && error "OST0000 not idle after 30s"
+
+       # Send ping to ensure import is FULL
+       do_node "$rcli" "lctl set_param osc.*OST0000*.ping=1" ||
+               error "OBD ping failed"
+
+       declare -a conns
+
+       conns=( $(do_node "$rcli" "$LCTL get_param *.*.import" |
+                 awk '/connection_attempts:/{print $NF}' | xargs echo) )
+
+       local saved_debug
+
+       saved_debug=$(do_node "$rcli" \
+                     "cat /sys/module/libcfs/parameters/libcfs_debug" \
+                     2>/dev/null)
+       [[ -z $saved_debug ]] && error "Failed to get existing debug"
+       stack_trap "do_node $rcli $LCTL set_param debug=$saved_debug"
+       do_node "$rcli" "$LCTL set_param debug=+info+rpctrace"
+
+       # From lustre/obdclass/class_obd.c
+       # unsigned int ping_interval = (OBD_TIMEOUT_DEFAULT > 4) ?
+       #                              (OBD_TIMEOUT_DEFAULT / 4) : 1;
+       # Delay a ping for 3 intervals
+       local timeout delay ts1 ts2 elapsed
+
+       timeout=$(do_node "$rcli" "$LCTL get_param -n timeout")
+       (( timeout > 4 )) && delay=$((3 * timeout / 4)) || delay=3
+
+       do_node "$rcli" "$LCTL clear"
+       log "delay ping ${delay}s"
+
+       ts1=$(date +%s)
+
+       #define OBD_FAIL_PTLRPC_DELAY_SEND_FAIL    0x535
+       do_node "$rcli" "$LCTL set_param fail_loc=0x80000535 fail_val=${delay}"
+       # Send ping that will be delayed for ${delay} seconds
+       # This races with the pinger, but it is okay if a ping sent by pinger
+       # thread is delayed instead
+       do_node "$rcli" "lctl set_param osc.*OST0000*.ping=1"
+
+       # Make sure we hit the fail_loc
+       wait_update --quiet "$rcli" \
+               "dmesg | tail -n 10 | \
+                grep -c 'fail_timeout id 535 sleeping for'" \
+               "1" "${timeout}"
+
+       (( $? != 0 )) && error "Did not hit fail_loc"
+
+       ts2=$(date +%s)
+
+       # If we raced with pinger thread then we need to wait ${delay} seconds
+       # for the ping to fail
+       elapsed=$((ts2 - ts1))
+
+       if (( elapsed < delay )); then
+               sleep $(((delay - elapsed) + 1))
+       fi
+
+       do_node "$rcli" "$LCTL dk" > "$TMP/lustre-log-${TESTNAME}.log"
+
+       local nsent
+
+       nsent=$(grep -c "ptl_send_rpc.*o400->.*OST0000" \
+               "$TMP/lustre-log-${TESTNAME}.log")
+
+       if (( nsent <= 1)); then
+               cat "$TMP/lustre-log-${TESTNAME}.log"
+               rm -f "$TMP/lustre-log-${TESTNAME}.log"
+               error "Did not send more than 1 obd ping"
+       fi
+
+       local expired
+
+       expired=$(grep "Request sent has timed out .* o400->" \
+                 "$TMP/lustre-log-${TESTNAME}.log")
+
+       if [[ -z $expired ]]; then
+               cat "$TMP/lustre-log-${TESTNAME}.log"
+               rm -f "$TMP/lustre-log-${TESTNAME}.log"
+               error "RPC did not time out"
+       else
+               echo "${expired}"
+               rm -f "$TMP/lustre-log-${TESTNAME}.log"
+       fi
+
+       declare -a conns2
+
+       conns2=( $(do_node "$rcli" "$LCTL get_param *.*.import" |
+                  awk '/connection_attempts:/{print $NF}' | xargs echo) )
+
+       echo "conns: ${conns[*]}"
+
+       echo "conns2: ${conns2[*]}"
+
+       (( ${#conns[@]} != ${#conns2[@]} )) &&
+               error "Expected ${#conns[@]} imports found ${#conns2[@]}"
+
+       local i
+
+       for ((i = 0; i < ${#conns[@]}; i++)); do
+               if (( conns[i] != conns2[i] )); then
+                       error "New connection attempt ${conns[i]} -> ${conns2[i]}"
+               fi
+       done
+
+       return 0
+}
+run_test 200 "Dropping one OBD_PING should not cause disconnect"
+
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status