Whamcloud - gitweb
LU-16290 lnet: Signal completion on ping send failure
author Chris Horn <chris.horn@hpe.com>
Tue, 1 Nov 2022 20:33:18 +0000 (14:33 -0600)
committer Andreas Dilger <adilger@whamcloud.com>
Sat, 29 Jul 2023 07:56:50 +0000 (07:56 +0000)
Call complete() on the ping_data::completion if we get
LNET_EVENT_SEND with non-zero status. Otherwise the thread which
issued the ping is stuck waiting for the full ping timeout.

A pd_unlinked member is added to struct ping_data to indicate whether
the associated MD has been unlinked. This is checked by lnet_ping() to
determine whether it needs to explicitly call LNetMDUnlink().

Lastly, in cases where we do not receive a reply, we now return the
value of pd.rc, if it is non-zero, rather than -EIO. This can provide
more information about the underlying ping failure.

Lustre-change: https://review.whamcloud.com/49020
Lustre-commit: 48c34c71de65e8a251a218bc9ecb7c5ed522d786

HPE-bug-id: LUS-11317
Test-Parameters: trivial testlist=sanity-lnet
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: I1bc573cf7397e319993fa8aabb31c5f3b59768e7
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/51700
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Cyril Bordage <cbordage@whamcloud.com>
lnet/lnet/api-ni.c
lustre/tests/sanity-lnet.sh

index 1cc2e07..9d54e67 100644 (file)
@@ -4278,6 +4278,7 @@ EXPORT_SYMBOL(LNetGetId);
 struct ping_data {
        int rc;
        int replied;
+       int pd_unlinked;
        struct lnet_handle_md mdh;
        struct completion completion;
 };
@@ -4298,7 +4299,12 @@ lnet_ping_event_handler(struct lnet_event *event)
                pd->replied = 1;
                pd->rc = event->mlength;
        }
+
        if (event->unlinked)
+               pd->pd_unlinked = 1;
+
+       if (event->unlinked ||
+           (event->type == LNET_EVENT_SEND && event->status))
                complete(&pd->completion);
 }
 
@@ -4362,13 +4368,14 @@ static int lnet_ping(struct lnet_process_id id, lnet_nid_t src_nid,
                /* NB must wait for the UNLINK event below... */
        }
 
-       if (wait_for_completion_timeout(&pd.completion, timeout) == 0) {
-               /* Ensure completion in finite time... */
+       /* Ensure completion in finite time... */
+       wait_for_completion_timeout(&pd.completion, timeout);
+       if (!pd.pd_unlinked) {
                LNetMDUnlink(pd.mdh);
                wait_for_completion(&pd.completion);
        }
        if (!pd.replied) {
-               rc = -EIO;
+               rc = pd.rc ?: -EIO;
                goto fail_ping_buffer_decref;
        }
 
index 647ea07..f6d6583 100755 (executable)
@@ -2789,6 +2789,37 @@ do_expired_message_drop_test() {
        return 0
 }
 
+test_252() {
+       setup_health_test false || return $?
+
+       local rc=0
+
+       do_rpc_nodes $RNODE unload_modules_local || rc=$?
+
+       if [[ $rc -ne 0 ]]; then
+               cleanup_health_test || return $?
+
+               error "Failed to unload modules on $RNODE rc=$rc"
+       else
+               RLOADED=false
+       fi
+
+       local ts1=$(date +%s)
+
+       do_lnetctl ping --timeout 15 ${RNIDS[0]} &&
+               error "Expected ping ${RNIDS[0]} to fail"
+
+       local ts2=$(date +%s)
+
+       local delta=$(echo "$ts2 - $ts1" | bc)
+
+       [[ $delta -lt 15 ]] ||
+               error "Ping took longer than expected to fail: $delta"
+
+       cleanup_health_test
+}
+run_test 252 "Ping to down peer should unlink quickly"
+
 test_253() {
        setup_health_test false || return $?