From fc4b744d8c427e707076d7781281ff89bff2257d Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Fri, 27 Jan 2023 14:47:25 -0600 Subject: [PATCH] LU-16483 ptlrpc: Track highest reply XID Keep track of the highest XID that we've received a reply for. When an OBD_PING expires, do not disconnect the import if the failed XID is less than or equal to the last reply XID. This avoids situation where a lost OBD_PING rpc causes a reconnect even though we've completed other RPCs in the meantime. Lustre-change: https://review.whamcloud.com/49807 Lustre-commit: eb1f4a5222039be9f728839ec8f9cde904a1273f LU-16483 tests: replay-single test_200 fixes Modify test to ensure idle disconnect is enabled for all targets except OST0000. This prevents an issue where an idle ping is sent to another target instead of OST0000. Re-work test to check the debug log for all relevant messages. rcli is not set correctly when RCLIENTS contains multiple hostnames. Fix it by not surrounding RCLIENTS with double quotes. Added a debug statement to ptl_send_rpc(), and moved an existing one, to faciliate debugging any future test failures. Lustre-change: https://review.whamcloud.com/50891 Lustre-commit: fdfdf5c05cf64294068a5cbfe818b64bd9e577f9 HPE-bug-id: LUS-11474 Signed-off-by: Chris Horn Change-Id: I7e66bcc1368fa41ec86ffd843abac676f8d29254 Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/52321 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Frank Sehr Reviewed-by: Andreas Dilger --- lustre/include/lustre_import.h | 4 +- lustre/include/obd_support.h | 3 + lustre/ptlrpc/client.c | 8 ++- lustre/ptlrpc/events.c | 3 + lustre/ptlrpc/niobuf.c | 11 ++++ lustre/tests/replay-single.sh | 142 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 168 insertions(+), 3 deletions(-) diff --git a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h index 1a2a1a7..066972a 100644 --- a/lustre/include/lustre_import.h +++ b/lustre/include/lustre_import.h @@ -204,8 +204,10 @@ struct obd_import { /** List of not replied requests */ struct list_head imp_unreplied_list; - /** Known maximal replied XID */ + /** XID below which we know all replies have been received */ __u64 imp_known_replied_xid; + /** highest XID for which we have received a reply */ + __u64 imp_highest_replied_xid; /** obd device for this import */ struct obd_device *imp_obd; diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index ad953fe..76bca28 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -470,6 +470,9 @@ extern char obd_jobid_var[]; #define OBD_FAIL_PTLRPC_ROUND_XID 0x530 #define OBD_FAIL_PTLRPC_CONNECT_RACE 0x531 #define OBD_FAIL_NET_ERROR_RPC 0x532 +#define OBD_FAIL_PTLRPC_IDLE_RACE 0x533 +#define OBD_FAIL_PTLRPC_ENQ_RESEND 0x534 +#define OBD_FAIL_PTLRPC_DELAY_SEND_FAIL 0x535 #define OBD_FAIL_OBD_PING_NET 0x600 /* OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 obsolete since 1.5 */ diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index b9fa24a..f77fa9f 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -2301,13 +2301,15 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) struct obd_import *imp = req->rq_import; unsigned int debug_mask = D_RPCTRACE; int rc = 0; + __u32 opc; ENTRY; spin_lock(&req->rq_lock); req->rq_timedout = 1; spin_unlock(&req->rq_lock); - if (ptlrpc_console_allow(req, lustre_msg_get_opc(req->rq_reqmsg), + opc = lustre_msg_get_opc(req->rq_reqmsg); + if (ptlrpc_console_allow(req, opc, lustre_msg_get_status(req->rq_reqmsg))) debug_mask = D_WARNING; DEBUG_REQ(debug_mask, req, "Request sent has %s: [sent %lld/real %lld]", @@ -2364,7 +2366,9 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) rc = 1; } - ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg)); + if (opc != OBD_PING || req->rq_xid > imp->imp_highest_replied_xid) + ptlrpc_fail_import(imp, + lustre_msg_get_conn_cnt(req->rq_reqmsg)); RETURN(rc); } diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index ec684b6..1b7cb38 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -178,6 +178,9 @@ void reply_in_callback(struct lnet_event *ev) if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) req->rq_import->imp_last_reply_time = ktime_get_real_seconds(); + if (req->rq_xid > req->rq_import->imp_highest_replied_xid) + req->rq_import->imp_highest_replied_xid = req->rq_xid; + out_wake: /* NB don't unlock till after wakeup; req can disappear under us * since we don't have our own ref */ diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 27180bb..d4c0b71 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -700,6 +700,7 @@ int ptlrpc_error(struct ptlrpc_request *req) int ptl_send_rpc(struct ptlrpc_request *request, int noreply) { int rc; + __u32 opc; int mpflag = 0; struct lnet_handle_md bulk_cookie; struct ptlrpc_connection *connection; @@ -768,6 +769,8 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) "Allocating new XID for resend on EINPROGRESS"); } + opc = lustre_msg_get_opc(request->rq_reqmsg); + if (request->rq_bulk != NULL) { ptlrpc_set_bulk_mbits(request); lustre_msg_set_mbits(request->rq_reqmsg, request->rq_mbits); @@ -919,6 +922,13 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) DEBUG_REQ(D_INFO, request, "send flags=%x", lustre_msg_get_flags(request->rq_reqmsg)); + + if (unlikely(opc == OBD_PING && + OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND_FAIL, cfs_fail_val))) { + DEBUG_REQ(D_INFO, request, "Simulate delay send failure"); + GOTO(skip_send, rc); + } + rc = ptl_send_buf(&request->rq_req_md_h, request->rq_reqbuf, request->rq_reqdata_len, LNET_NOACK_REQ, &request->rq_req_cbid, @@ -928,6 +938,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) if (likely(rc == 0)) GOTO(out, rc); +skip_send: request->rq_req_unlinked = 1; ptlrpc_req_finished(request); if (noreply) diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index ad1bc0a..c37f5c5 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -5048,6 +5048,148 @@ test_136() { } run_test 136 "MDS to disconnect all OSPs first, then cleanup ldlm" +test_200() { + [[ -z $RCLIENTS ]] && skip "Need remote client" + + local rcli old + + rcli=$(echo $RCLIENTS | cut -d ' ' -f 1) + + echo "Selected \"$rcli\" from \"$RCLIENTS\"" + + # We want idle disconnect enabled on all targets except OST0000. Test + # assumes all nodes are configured the same. + old=$(do_node $rcli "$LCTL get_param -n *.*.idle_timeout 2>/dev/null | + head -n 1") + [[ -n $old ]] || error "Cannot determine current idle_timeout" + + if ((old == 0)); then + do_node "$rcli" "$LCTL set_param *.*.idle_timeout=10" + do_node "$rcli" "$LCTL set_param osc.*OST0000*.idle_timeout=0" + stack_trap "do_node $rcli $LCTL set_param *.*.idle_timeout=$old" + else + do_node "$rcli" "$LCTL set_param osc.*OST0000*.idle_timeout=0" + stack_trap "do_node $rcli $LCTL set_param osc.*OST0000*.idle_timeout=$old" + fi + + # Ensure OST0 import is idle + wait_update "$rcli" \ + "$LCTL get_param osc.*OST0000*.import | \ + awk 'BEGIN{ count=0 } /idle: 0/ { count+=1 } \ + END { print count }'" \ + "0" "30" + + (( $? != 0 )) && error "OST0000 not idle after 30s" + + # Send ping to ensure import is FULL + do_node "$rcli" "lctl set_param osc.*OST0000*.ping=1" || + error "OBD ping failed" + + declare -a conns + + conns=( $(do_node "$rcli" "$LCTL get_param *.*.import" | + awk '/connection_attempts:/{print $NF}' | xargs echo) ) + + local saved_debug + + saved_debug=$(do_node "$rcli" \ + "cat /sys/module/libcfs/parameters/libcfs_debug" \ + 2>/dev/null) + [[ -z $saved_debug ]] && error "Failed to get existing debug" + stack_trap "do_node $rcli $LCTL set_param debug=$saved_debug" + do_node "$rcli" "$LCTL set_param debug=+info+rpctrace" + + # From lustre/obdclass/class_obd.c + # unsigned int ping_interval = (OBD_TIMEOUT_DEFAULT > 4) ? + # (OBD_TIMEOUT_DEFAULT / 4) : 1; + # Delay a ping for 3 intervals + local timeout delay + + timeout=$(do_node "$rcli" "$LCTL get_param -n timeout") + (( timeout > 4 )) && delay=$((3 * timeout / 4)) || delay=3 + + do_node "$rcli" "$LCTL clear" + log "delay ping ${delay}s" + + #define OBD_FAIL_PTLRPC_DELAY_SEND_FAIL 0x535 + do_node "$rcli" "$LCTL set_param fail_loc=0x80000535 fail_val=${delay}" + # Send ping that will be delayed for ${delay} seconds + # This races with the pinger, but it is okay if a ping sent by pinger + # thread is delayed instead + do_node "$rcli" "lctl set_param osc.*OST0000*.ping=1" + + local logfile="$TMP/lustre-log-${TESTNAME}.log" + + local nsent expired + local waited=0 + local begin=$SECONDS + local max_wait=$((delay + 1)) + local reason="" + local sleep="" + while (( $waited <= $max_wait )); do + [[ -z $sleep ]] || sleep $sleep + sleep=1 + waited=$((SECONDS - begin)) + do_node "$rcli" "$LCTL dk" >> ${logfile} + + if ! grep -q 'fail_timeout id 535 sleeping for' $logfile; then + reason="Did not hit fail_loc" + continue + fi + + if ! grep -q 'cfs_fail_timeout id 535 awake' $logfile; then + reason="Delayed send did not wake" + continue + fi + + nsent=$(grep -c "ptl_send_rpc.*o400->.*OST0000" $logfile) + if (( nsent <= 1 )); then + reason="Did not send more than 1 obd ping" + continue + fi + + expired=$(grep "Request sent has timed out .* o400->" $logfile) + if [[ -z $expired ]]; then + reason="RPC did not time out" + continue + fi + reason="" + break + done + + if [[ -n $reason ]]; then + cat $logfile + rm -f $logfile + error "$reason" + else + echo "${expired}" + rm -f $logfile + fi + + declare -a conns2 + + conns2=( $(do_node "$rcli" "$LCTL get_param *.*.import" | + awk '/connection_attempts:/{print $NF}' | xargs echo) ) + + echo "conns: ${conns[*]}" + + echo "conns2: ${conns2[*]}" + + (( ${#conns[@]} != ${#conns2[@]} )) && + error "Expected ${#conns[@]} imports found ${#conns2[@]}" + + local i + + for ((i = 0; i < ${#conns[@]}; i++)); do + if (( conns[i] != conns2[i] )); then + error "New connection attempt ${conns[i]} -> ${conns2[i]}" + fi + done + + return 0 +} +run_test 200 "Dropping one OBD_PING should not cause disconnect" + complete_test $SECONDS check_and_cleanup_lustre exit_status -- 1.8.3.1