From eb1f4a5222039be9f728839ec8f9cde904a1273f Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Fri, 27 Jan 2023 14:47:25 -0600 Subject: [PATCH] LU-16483 ptlrpc: Track highest reply XID Keep track of the highest XID that we've received a reply for. When an OBD_PING expires, do not disconnect the import if the failed XID is less than or equal to the last reply XID. This avoids situation where a lost OBD_PING rpc causes a reconnect even though we've completed other RPCs in the meantime. HPE-bug-id: LUS-11474 Signed-off-by: Chris Horn Change-Id: I7e66bcc1368fa41ec86ffd843abac676f8d29254 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49807 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Serguei Smirnov Reviewed-by: Oleg Drokin --- lustre/include/lustre_import.h | 4 +- lustre/include/obd_support.h | 1 + lustre/ptlrpc/client.c | 8 ++- lustre/ptlrpc/events.c | 3 + lustre/ptlrpc/niobuf.c | 5 ++ lustre/tests/replay-single.sh | 130 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 148 insertions(+), 3 deletions(-) diff --git a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h index 7b97c55..71b9d14 100644 --- a/lustre/include/lustre_import.h +++ b/lustre/include/lustre_import.h @@ -203,8 +203,10 @@ struct obd_import { /** List of not replied requests */ struct list_head imp_unreplied_list; - /** Known maximal replied XID */ + /** XID below which we know all replies have been received */ __u64 imp_known_replied_xid; + /** highest XID for which we have received a reply */ + __u64 imp_highest_replied_xid; /** obd device for this import */ struct obd_device *imp_obd; diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 732dfa6..4106083 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -474,6 +474,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_NET_ERROR_RPC 0x532 #define OBD_FAIL_PTLRPC_IDLE_RACE 0x533 #define OBD_FAIL_PTLRPC_ENQ_RESEND 0x534 +#define OBD_FAIL_PTLRPC_DELAY_SEND_FAIL 0x535 #define OBD_FAIL_OBD_PING_NET 0x600 /* OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 obsolete since 1.5 */ diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index deac037..657aa63 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -2289,13 +2289,15 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) struct obd_import *imp = req->rq_import; unsigned int debug_mask = D_RPCTRACE; int rc = 0; + __u32 opc; ENTRY; spin_lock(&req->rq_lock); req->rq_timedout = 1; spin_unlock(&req->rq_lock); - if (ptlrpc_console_allow(req, lustre_msg_get_opc(req->rq_reqmsg), + opc = lustre_msg_get_opc(req->rq_reqmsg); + if (ptlrpc_console_allow(req, opc, lustre_msg_get_status(req->rq_reqmsg))) debug_mask = D_WARNING; DEBUG_REQ(debug_mask, req, "Request sent has %s: [sent %lld/real %lld]", @@ -2352,7 +2354,9 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) rc = 1; } - ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg)); + if (opc != OBD_PING || req->rq_xid > imp->imp_highest_replied_xid) + ptlrpc_fail_import(imp, + lustre_msg_get_conn_cnt(req->rq_reqmsg)); RETURN(rc); } diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index d616b74..33ce2b0 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -177,6 +177,9 @@ void reply_in_callback(struct lnet_event *ev) if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) req->rq_import->imp_last_reply_time = ktime_get_real_seconds(); + if (req->rq_xid > req->rq_import->imp_highest_replied_xid) + req->rq_import->imp_highest_replied_xid = req->rq_xid; + out_wake: /* NB don't unlock till after wakeup; req can disappear under us * since we don't have our own ref */ diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 64dfa70..d1c3b1c 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -942,6 +942,10 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) request->rq_deadline = request->rq_sent + request->rq_timeout + ptlrpc_at_get_net_latency(request); + if (unlikely(opc == OBD_PING && + OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND_FAIL, cfs_fail_val))) + GOTO(skip_send, rc); + DEBUG_REQ(D_INFO, request, "send flags=%x", lustre_msg_get_flags(request->rq_reqmsg)); rc = ptl_send_buf(&request->rq_req_md_h, @@ -954,6 +958,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) if (likely(rc == 0)) GOTO(out, rc); +skip_send: request->rq_req_unlinked = 1; ptlrpc_req_finished(request); if (noreply) diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 15abd0f..50ca21c 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -5018,6 +5018,136 @@ test_136() { } run_test 136 "MDS to disconnect all OSPs first, then cleanup ldlm" +test_200() { + [[ -z $RCLIENTS ]] && skip "Need remote client" + + local rcli old + + rcli=$(echo "$RCLIENTS" | cut -d ' ' -f 1) + old=$(do_node "$rcli" "$LCTL get_param -n osc.*OST0000*.idle_timeout") + + if ((old != 0)); then + do_node "$rcli" "$LCTL set_param osc.*OST0000*.idle_timeout=0" + stack_trap "do_node $rcli $LCTL set_param osc.*OST0000*.idle_timeout=$old" + fi + + # Ensure OST0 import is idle + wait_update "$rcli" \ + "$LCTL get_param osc.*OST0000*.import | \ + awk 'BEGIN{ count=0 } /idle: 0/ { count+=1 } \ + END { print count }'" \ + "0" "30" + + (( $? != 0 )) && error "OST0000 not idle after 30s" + + # Send ping to ensure import is FULL + do_node "$rcli" "lctl set_param osc.*OST0000*.ping=1" || + error "OBD ping failed" + + declare -a conns + + conns=( $(do_node "$rcli" "$LCTL get_param *.*.import" | + awk '/connection_attempts:/{print $NF}' | xargs echo) ) + + local saved_debug + + saved_debug=$(do_node "$rcli" \ + "cat /sys/module/libcfs/parameters/libcfs_debug" \ + 2>/dev/null) + [[ -z $saved_debug ]] && error "Failed to get existing debug" + stack_trap "do_node $rcli $LCTL set_param debug=$saved_debug" + do_node "$rcli" "$LCTL set_param debug=+info+rpctrace" + + # From lustre/obdclass/class_obd.c + # unsigned int ping_interval = (OBD_TIMEOUT_DEFAULT > 4) ? + # (OBD_TIMEOUT_DEFAULT / 4) : 1; + # Delay a ping for 3 intervals + local timeout delay ts1 ts2 elapsed + + timeout=$(do_node "$rcli" "$LCTL get_param -n timeout") + (( timeout > 4 )) && delay=$((3 * timeout / 4)) || delay=3 + + do_node "$rcli" "$LCTL clear" + log "delay ping ${delay}s" + + ts1=$(date +%s) + + #define OBD_FAIL_PTLRPC_DELAY_SEND_FAIL 0x535 + do_node "$rcli" "$LCTL set_param fail_loc=0x80000535 fail_val=${delay}" + # Send ping that will be delayed for ${delay} seconds + # This races with the pinger, but it is okay if a ping sent by pinger + # thread is delayed instead + do_node "$rcli" "lctl set_param osc.*OST0000*.ping=1" + + # Make sure we hit the fail_loc + wait_update --quiet "$rcli" \ + "dmesg | tail -n 10 | \ + grep -c 'fail_timeout id 535 sleeping for'" \ + "1" "${timeout}" + + (( $? != 0 )) && error "Did not hit fail_loc" + + ts2=$(date +%s) + + # If we raced with pinger thread then we need to wait ${delay} seconds + # for the ping to fail + elapsed=$((ts2 - ts1)) + + if (( elapsed < delay )); then + sleep $(((delay - elapsed) + 1)) + fi + + do_node "$rcli" "$LCTL dk" > "$TMP/lustre-log-${TESTNAME}.log" + + local nsent + + nsent=$(grep -c "ptl_send_rpc.*o400->.*OST0000" \ + "$TMP/lustre-log-${TESTNAME}.log") + + if (( nsent <= 1)); then + cat "$TMP/lustre-log-${TESTNAME}.log" + rm -f "$TMP/lustre-log-${TESTNAME}.log" + error "Did not send more than 1 obd ping" + fi + + local expired + + expired=$(grep "Request sent has timed out .* o400->" \ + "$TMP/lustre-log-${TESTNAME}.log") + + if [[ -z $expired ]]; then + cat "$TMP/lustre-log-${TESTNAME}.log" + rm -f "$TMP/lustre-log-${TESTNAME}.log" + error "RPC did not time out" + else + echo "${expired}" + rm -f "$TMP/lustre-log-${TESTNAME}.log" + fi + + declare -a conns2 + + conns2=( $(do_node "$rcli" "$LCTL get_param *.*.import" | + awk '/connection_attempts:/{print $NF}' | xargs echo) ) + + echo "conns: ${conns[*]}" + + echo "conns2: ${conns2[*]}" + + (( ${#conns[@]} != ${#conns2[@]} )) && + error "Expected ${#conns[@]} imports found ${#conns2[@]}" + + local i + + for ((i = 0; i < ${#conns[@]}; i++)); do + if (( conns[i] != conns2[i] )); then + error "New connection attempt ${conns[i]} -> ${conns2[i]}" + fi + done + + return 0 +} +run_test 200 "Dropping one OBD_PING should not cause disconnect" + complete $SECONDS check_and_cleanup_lustre exit_status -- 1.8.3.1