Whamcloud - gitweb
LU-16483 ptlrpc: Track highest reply XID
authorChris Horn <chris.horn@hpe.com>
Fri, 27 Jan 2023 20:47:25 +0000 (14:47 -0600)
committerAndreas Dilger <adilger@whamcloud.com>
Thu, 14 Sep 2023 07:26:07 +0000 (07:26 +0000)
Keep track of the highest XID that we've received a reply for.
When an OBD_PING expires, do not disconnect the import if the failed
XID is less than or equal to the last reply XID. This avoids situation
where a lost OBD_PING rpc causes a reconnect even though we've
completed other RPCs in the meantime.

Lustre-change: https://review.whamcloud.com/49807
Lustre-commit: eb1f4a5222039be9f728839ec8f9cde904a1273f

LU-16483 tests: replay-single test_200 fixes

Modify test to ensure idle disconnect is enabled for all targets
except OST0000. This prevents an issue where an idle ping is sent to
another target instead of OST0000.

Re-work test to check the debug log for all relevant messages.

rcli is not set correctly when RCLIENTS contains multiple hostnames.
Fix it by not surrounding RCLIENTS with double quotes.

Added a debug statement to ptl_send_rpc(), and moved an existing one,
to faciliate debugging any future test failures.

Lustre-change: https://review.whamcloud.com/50891
Lustre-commit: fdfdf5c05cf64294068a5cbfe818b64bd9e577f9

HPE-bug-id: LUS-11474
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: I7e66bcc1368fa41ec86ffd843abac676f8d29254
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/52321
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Frank Sehr <fsehr@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lustre/include/lustre_import.h
lustre/include/obd_support.h
lustre/ptlrpc/client.c
lustre/ptlrpc/events.c
lustre/ptlrpc/niobuf.c
lustre/tests/replay-single.sh

index 1a2a1a7..066972a 100644 (file)
@@ -204,8 +204,10 @@ struct obd_import {
 
        /** List of not replied requests */
        struct list_head        imp_unreplied_list;
-       /** Known maximal replied XID */
+       /** XID below which we know all replies have been received */
        __u64                   imp_known_replied_xid;
+       /** highest XID for which we have received a reply */
+       __u64                   imp_highest_replied_xid;
 
        /** obd device for this import */
        struct obd_device       *imp_obd;
index ad953fe..76bca28 100644 (file)
@@ -470,6 +470,9 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_PTLRPC_ROUND_XID       0x530
 #define OBD_FAIL_PTLRPC_CONNECT_RACE    0x531
 #define OBD_FAIL_NET_ERROR_RPC          0x532
+#define OBD_FAIL_PTLRPC_IDLE_RACE       0x533
+#define OBD_FAIL_PTLRPC_ENQ_RESEND      0x534
+#define OBD_FAIL_PTLRPC_DELAY_SEND_FAIL         0x535
 
 #define OBD_FAIL_OBD_PING_NET            0x600
 /*     OBD_FAIL_OBD_LOG_CANCEL_NET      0x601 obsolete since 1.5 */
index b9fa24a..f77fa9f 100644 (file)
@@ -2301,13 +2301,15 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink)
        struct obd_import *imp = req->rq_import;
        unsigned int debug_mask = D_RPCTRACE;
        int rc = 0;
+       __u32 opc;
 
        ENTRY;
        spin_lock(&req->rq_lock);
        req->rq_timedout = 1;
        spin_unlock(&req->rq_lock);
 
-       if (ptlrpc_console_allow(req, lustre_msg_get_opc(req->rq_reqmsg),
+       opc = lustre_msg_get_opc(req->rq_reqmsg);
+       if (ptlrpc_console_allow(req, opc,
                                 lustre_msg_get_status(req->rq_reqmsg)))
                debug_mask = D_WARNING;
        DEBUG_REQ(debug_mask, req, "Request sent has %s: [sent %lld/real %lld]",
@@ -2364,7 +2366,9 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink)
                rc = 1;
        }
 
-       ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg));
+       if (opc != OBD_PING || req->rq_xid > imp->imp_highest_replied_xid)
+               ptlrpc_fail_import(imp,
+                                  lustre_msg_get_conn_cnt(req->rq_reqmsg));
 
        RETURN(rc);
 }
index ec684b6..1b7cb38 100644 (file)
@@ -178,6 +178,9 @@ void reply_in_callback(struct lnet_event *ev)
        if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING)
                req->rq_import->imp_last_reply_time = ktime_get_real_seconds();
 
+       if (req->rq_xid > req->rq_import->imp_highest_replied_xid)
+               req->rq_import->imp_highest_replied_xid = req->rq_xid;
+
 out_wake:
        /* NB don't unlock till after wakeup; req can disappear under us
         * since we don't have our own ref */
index 27180bb..d4c0b71 100644 (file)
@@ -700,6 +700,7 @@ int ptlrpc_error(struct ptlrpc_request *req)
 int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
 {
        int rc;
+       __u32 opc;
        int mpflag = 0;
        struct lnet_handle_md bulk_cookie;
        struct ptlrpc_connection *connection;
@@ -768,6 +769,8 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
                          "Allocating new XID for resend on EINPROGRESS");
        }
 
+       opc = lustre_msg_get_opc(request->rq_reqmsg);
+
        if (request->rq_bulk != NULL) {
                ptlrpc_set_bulk_mbits(request);
                lustre_msg_set_mbits(request->rq_reqmsg, request->rq_mbits);
@@ -919,6 +922,13 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
 
        DEBUG_REQ(D_INFO, request, "send flags=%x",
                  lustre_msg_get_flags(request->rq_reqmsg));
+
+       if (unlikely(opc == OBD_PING &&
+           OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND_FAIL, cfs_fail_val))) {
+               DEBUG_REQ(D_INFO, request, "Simulate delay send failure");
+               GOTO(skip_send, rc);
+       }
+
        rc = ptl_send_buf(&request->rq_req_md_h,
                          request->rq_reqbuf, request->rq_reqdata_len,
                          LNET_NOACK_REQ, &request->rq_req_cbid,
@@ -928,6 +938,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
        if (likely(rc == 0))
                GOTO(out, rc);
 
+skip_send:
        request->rq_req_unlinked = 1;
        ptlrpc_req_finished(request);
        if (noreply)
index ad1bc0a..c37f5c5 100755 (executable)
@@ -5048,6 +5048,148 @@ test_136() {
 }
 run_test 136 "MDS to disconnect all OSPs first, then cleanup ldlm"
 
+test_200() {
+       [[ -z $RCLIENTS ]] && skip "Need remote client"
+
+       local rcli old
+
+       rcli=$(echo $RCLIENTS | cut -d ' ' -f 1)
+
+       echo "Selected \"$rcli\" from \"$RCLIENTS\""
+
+       # We want idle disconnect enabled on all targets except OST0000. Test
+       # assumes all nodes are configured the same.
+       old=$(do_node $rcli "$LCTL get_param -n *.*.idle_timeout 2>/dev/null |
+             head -n 1")
+       [[ -n $old ]] || error "Cannot determine current idle_timeout"
+
+       if ((old == 0)); then
+               do_node "$rcli" "$LCTL set_param *.*.idle_timeout=10"
+               do_node "$rcli" "$LCTL set_param osc.*OST0000*.idle_timeout=0"
+               stack_trap "do_node $rcli $LCTL set_param *.*.idle_timeout=$old"
+       else
+               do_node "$rcli" "$LCTL set_param osc.*OST0000*.idle_timeout=0"
+               stack_trap "do_node $rcli $LCTL set_param osc.*OST0000*.idle_timeout=$old"
+       fi
+
+       # Ensure OST0 import is idle
+       wait_update "$rcli" \
+               "$LCTL get_param osc.*OST0000*.import | \
+                awk 'BEGIN{ count=0 } /idle: 0/ { count+=1 } \
+                     END { print count }'" \
+               "0" "30"
+
+       (( $? != 0 )) && error "OST0000 not idle after 30s"
+
+       # Send ping to ensure import is FULL
+       do_node "$rcli" "lctl set_param osc.*OST0000*.ping=1" ||
+               error "OBD ping failed"
+
+       declare -a conns
+
+       conns=( $(do_node "$rcli" "$LCTL get_param *.*.import" |
+                 awk '/connection_attempts:/{print $NF}' | xargs echo) )
+
+       local saved_debug
+
+       saved_debug=$(do_node "$rcli" \
+                     "cat /sys/module/libcfs/parameters/libcfs_debug" \
+                     2>/dev/null)
+       [[ -z $saved_debug ]] && error "Failed to get existing debug"
+       stack_trap "do_node $rcli $LCTL set_param debug=$saved_debug"
+       do_node "$rcli" "$LCTL set_param debug=+info+rpctrace"
+
+       # From lustre/obdclass/class_obd.c
+       # unsigned int ping_interval = (OBD_TIMEOUT_DEFAULT > 4) ?
+       #                              (OBD_TIMEOUT_DEFAULT / 4) : 1;
+       # Delay a ping for 3 intervals
+       local timeout delay
+
+       timeout=$(do_node "$rcli" "$LCTL get_param -n timeout")
+       (( timeout > 4 )) && delay=$((3 * timeout / 4)) || delay=3
+
+       do_node "$rcli" "$LCTL clear"
+       log "delay ping ${delay}s"
+
+       #define OBD_FAIL_PTLRPC_DELAY_SEND_FAIL    0x535
+       do_node "$rcli" "$LCTL set_param fail_loc=0x80000535 fail_val=${delay}"
+       # Send ping that will be delayed for ${delay} seconds
+       # This races with the pinger, but it is okay if a ping sent by pinger
+       # thread is delayed instead
+       do_node "$rcli" "lctl set_param osc.*OST0000*.ping=1"
+
+       local logfile="$TMP/lustre-log-${TESTNAME}.log"
+
+       local nsent expired
+       local waited=0
+       local begin=$SECONDS
+       local max_wait=$((delay + 1))
+       local reason=""
+       local sleep=""
+       while (( $waited <= $max_wait )); do
+               [[ -z $sleep ]] || sleep $sleep
+               sleep=1
+               waited=$((SECONDS - begin))
+               do_node "$rcli" "$LCTL dk" >> ${logfile}
+
+               if ! grep -q 'fail_timeout id 535 sleeping for' $logfile; then
+                       reason="Did not hit fail_loc"
+                       continue
+               fi
+
+               if ! grep -q 'cfs_fail_timeout id 535 awake' $logfile; then
+                       reason="Delayed send did not wake"
+                       continue
+               fi
+
+               nsent=$(grep -c "ptl_send_rpc.*o400->.*OST0000" $logfile)
+               if (( nsent <= 1 )); then
+                       reason="Did not send more than 1 obd ping"
+                       continue
+               fi
+
+               expired=$(grep "Request sent has timed out .* o400->" $logfile)
+               if [[ -z $expired ]]; then
+                       reason="RPC did not time out"
+                       continue
+               fi
+               reason=""
+               break
+       done
+
+       if [[ -n $reason ]]; then
+               cat $logfile
+               rm -f $logfile
+               error "$reason"
+       else
+               echo "${expired}"
+               rm -f $logfile
+       fi
+
+       declare -a conns2
+
+       conns2=( $(do_node "$rcli" "$LCTL get_param *.*.import" |
+                  awk '/connection_attempts:/{print $NF}' | xargs echo) )
+
+       echo "conns: ${conns[*]}"
+
+       echo "conns2: ${conns2[*]}"
+
+       (( ${#conns[@]} != ${#conns2[@]} )) &&
+               error "Expected ${#conns[@]} imports found ${#conns2[@]}"
+
+       local i
+
+       for ((i = 0; i < ${#conns[@]}; i++)); do
+               if (( conns[i] != conns2[i] )); then
+                       error "New connection attempt ${conns[i]} -> ${conns2[i]}"
+               fi
+       done
+
+       return 0
+}
+run_test 200 "Dropping one OBD_PING should not cause disconnect"
+
 complete_test $SECONDS
 check_and_cleanup_lustre
 exit_status