Whamcloud - gitweb
LU-16290 lnet: Signal completion on ping send failure 20/49020/3
author Chris Horn <chris.horn@hpe.com>
Tue, 1 Nov 2022 20:33:18 +0000 (14:33 -0600)
committer Oleg Drokin <green@whamcloud.com>
Mon, 14 Nov 2022 08:26:47 +0000 (08:26 +0000)
Call complete() on the ping_data::completion if we get
LNET_EVENT_SEND with non-zero status. Otherwise the thread which
issued the ping is stuck waiting for the full ping timeout.

A pd_unlinked member is added to struct ping_data to indicate whether
the associated MD has been unlinked. This is checked by lnet_ping() to
determine whether it needs to explicitly call LNetMDUnlink().

Lastly, in cases where we do not receive a reply, we now return the
value of pd.rc, if it is non-zero, rather than -EIO. This can provide
more information about the underlying ping failure.

HPE-bug-id: LUS-11317
Test-Parameters: trivial testlist=sanity-lnet
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: I1bc573cf7397e319993fa8aabb31c5f3b59768e7
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49020
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Frank Sehr <fsehr@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/lnet/api-ni.c
lustre/tests/sanity-lnet.sh

index 637a9f6..7f8e04e 100644 (file)
@@ -5450,6 +5450,7 @@ EXPORT_SYMBOL(LNetGetId);
 struct ping_data {
        int rc;
        int replied;
+       int pd_unlinked;
        struct lnet_handle_md mdh;
        struct completion completion;
 };
@@ -5470,7 +5471,12 @@ lnet_ping_event_handler(struct lnet_event *event)
                pd->replied = 1;
                pd->rc = event->mlength;
        }
+
        if (event->unlinked)
+               pd->pd_unlinked = 1;
+
+       if (event->unlinked ||
+           (event->type == LNET_EVENT_SEND && event->status))
                complete(&pd->completion);
 }
 
@@ -5543,13 +5549,14 @@ static int lnet_ping(struct lnet_process_id id4, struct lnet_nid *src_nid,
                /* NB must wait for the UNLINK event below... */
        }
 
-       if (wait_for_completion_timeout(&pd.completion, timeout) == 0) {
-               /* Ensure completion in finite time... */
+       /* Ensure completion in finite time... */
+       wait_for_completion_timeout(&pd.completion, timeout);
+       if (!pd.pd_unlinked) {
                LNetMDUnlink(pd.mdh);
                wait_for_completion(&pd.completion);
        }
        if (!pd.replied) {
-               rc = -EIO;
+               rc = pd.rc ?: -EIO;
                goto fail_ping_buffer_decref;
        }
 
index d3eb6cf..f42050a 100755 (executable)
@@ -3071,6 +3071,37 @@ test_251() {
 }
 run_test 251 "Define multiple kfi networks on single interface"
 
+test_252() {
+       setup_health_test false || return $?
+
+       local rc=0
+
+       do_rpc_nodes $RNODE unload_modules_local || rc=$?
+
+       if [[ $rc -ne 0 ]]; then
+               cleanup_health_test || return $?
+
+               error "Failed to unload modules on $RNODE rc=$rc"
+       else
+               RLOADED=false
+       fi
+
+       local ts1=$(date +%s)
+
+       do_lnetctl ping --timeout 15 ${RNIDS[0]} &&
+               error "Expected ping ${RNIDS[0]} to fail"
+
+       local ts2=$(date +%s)
+
+       local delta=$(echo "$ts2 - $ts1" | bc)
+
+       [[ $delta -lt 15 ]] ||
+               error "Ping took longer than expected to fail: $delta"
+
+       cleanup_health_test
+}
+run_test 252 "Ping to down peer should unlink quickly"
+
 test_300() {
        # LU-13274
        local header