From 48c34c71de65e8a251a218bc9ecb7c5ed522d786 Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Tue, 1 Nov 2022 14:33:18 -0600 Subject: [PATCH] LU-16290 lnet: Signal completion on ping send failure Call complete() on the ping_data::completion if we get LNET_EVENT_SEND with non-zero status. Otherwise the thread which issued the ping is stuck waiting for the full ping timeout. A pd_unlinked member is added to struct ping_data to indicate whether the associated MD has been unlinked. This is checked by lnet_ping() to determine whether it needs to explicitly call LNetMDUnlink(). Lastly, in cases where we do not receive a reply, we now return the value of pd.rc, if it is non-zero, rather than -EIO. This can provide more information about the underlying ping failure. HPE-bug-id: LUS-11317 Test-Parameters: trivial testlist=sanity-lnet Signed-off-by: Chris Horn Change-Id: I1bc573cf7397e319993fa8aabb31c5f3b59768e7 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49020 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Serguei Smirnov Reviewed-by: Frank Sehr Reviewed-by: Oleg Drokin --- lnet/lnet/api-ni.c | 13 ++++++++++--- lustre/tests/sanity-lnet.sh | 31 +++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 637a9f6..7f8e04e 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -5450,6 +5450,7 @@ EXPORT_SYMBOL(LNetGetId); struct ping_data { int rc; int replied; + int pd_unlinked; struct lnet_handle_md mdh; struct completion completion; }; @@ -5470,7 +5471,12 @@ lnet_ping_event_handler(struct lnet_event *event) pd->replied = 1; pd->rc = event->mlength; } + if (event->unlinked) + pd->pd_unlinked = 1; + + if (event->unlinked || + (event->type == LNET_EVENT_SEND && event->status)) complete(&pd->completion); } @@ -5543,13 +5549,14 @@ static int lnet_ping(struct lnet_process_id id4, struct lnet_nid *src_nid, /* NB must wait for the UNLINK event below... 
*/ } - if (wait_for_completion_timeout(&pd.completion, timeout) == 0) { - /* Ensure completion in finite time... */ + /* Ensure completion in finite time... */ + wait_for_completion_timeout(&pd.completion, timeout); + if (!pd.pd_unlinked) { LNetMDUnlink(pd.mdh); wait_for_completion(&pd.completion); } if (!pd.replied) { - rc = -EIO; + rc = pd.rc ?: -EIO; goto fail_ping_buffer_decref; } diff --git a/lustre/tests/sanity-lnet.sh b/lustre/tests/sanity-lnet.sh index d3eb6cf..f42050a 100755 --- a/lustre/tests/sanity-lnet.sh +++ b/lustre/tests/sanity-lnet.sh @@ -3071,6 +3071,37 @@ test_251() { } run_test 251 "Define multiple kfi networks on single interface" +test_252() { + setup_health_test false || return $? + + local rc=0 + + do_rpc_nodes $RNODE unload_modules_local || rc=$? + + if [[ $rc -ne 0 ]]; then + cleanup_health_test || return $? + + error "Failed to unload modules on $RNODE rc=$rc" + else + RLOADED=false + fi + + local ts1=$(date +%s) + + do_lnetctl ping --timeout 15 ${RNIDS[0]} && + error "Expected ping ${RNIDS[0]} to fail" + + local ts2=$(date +%s) + + local delta=$(echo "$ts2 - $ts1" | bc) + + [[ $delta -lt 15 ]] || + error "Ping took longer than expected to fail: $delta" + + cleanup_health_test +} +run_test 252 "Ping to down peer should unlink quickly" + test_300() { # LU-13274 local header -- 1.8.3.1