Whamcloud - gitweb
LU-15398 lnet: Avoid peer NI recovery for local interface
author Chris Horn <chris.horn@hpe.com>
Thu, 23 Dec 2021 20:15:27 +0000 (14:15 -0600)
committer Andreas Dilger <adilger@whamcloud.com>
Sat, 23 Mar 2024 20:31:50 +0000 (20:31 +0000)
If an MR (Multi-Rail) peer has an MR peer entry for itself (this can
happen if the entry is manually created or discovery is run on itself
for some reason), then it is possible for it to put its own interfaces
into peer NI recovery. Problems with local interfaces should be handled
via local NI recovery.

Lustre-change: https://review.whamcloud.com/45933
Lustre-commit: fb5d7036ec356c825f2aadece68cd7c4af487680

Test-Parameters: trivial testlist=sanity-lnet
HPE-bug-id: LUS-10661
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: I5b28195979a6113fa863b5795a4528b072610891
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/51683
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Frank Sehr <fsehr@whamcloud.com>
Reviewed-by: Cyril Bordage <cbordage@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lnet/lnet/lib-msg.c
lustre/tests/sanity-lnet.sh

index 6592cf3..f040e37 100644 (file)
@@ -877,6 +877,12 @@ lnet_health_check(struct lnet_msg *msg)
                        if (!lnet_isrouter(lpni))
                                handle_remote_health = false;
                }
+               /* Do not put my interfaces into peer NI recovery. They should
+                * be handled with local NI recovery.
+                */
+               if (handle_remote_health && lpni &&
+                   lnet_nid2ni_locked(lpni->lpni_nid, 0))
+                       handle_remote_health = false;
                lnet_net_unlock(0);
        }
 
index 4a27d00..2dc364b 100755 (executable)
@@ -1974,7 +1974,7 @@ check_nid_in_recovq() {
        local found=false
        local nid=""
 
-       echo "Check recovery queue"
+       echo "Check \"$1\" recovery queue"
        echo "$recovq"
        if [[ $(grep -c 'nid-'<<<$recovq) -ne $expect ]]; then
                error "Expect $expect NIDs found: \"$recovq\""
@@ -2411,6 +2411,46 @@ test_215() {
 }
 run_test 215 "Test lnetctl ping --source option"
 
+# Verify that a failed send to a peer NI which is really one of this
+# host's own NIDs does not put that NID on the peer NI recovery queue.
+#
+# Two nets (tcp, tcp1) are configured on the same interface and the node
+# runs discovery on its own first NID. Drop rules are then installed for
+# every local src/dst NID pair so that the next ping is expected to fail.
+# Afterwards the "-p" recovery queue must list 0 NIDs and the "-l" queue
+# 1 NID (presumably the peer and local NI queues respectively).
+test_216() {
+       local rc=0
+
+       reinit_dlc || return $?
+
+       add_net "tcp" "${INTERFACES[0]}" || return $?
+       add_net "tcp1" "${INTERFACES[0]}" || return $?
+
+       local nids=( $($LCTL list_nids | xargs echo) )
+
+       # Both nets must have produced a NID; nids[1] is used below.
+       (( ${#nids[@]} >= 2 )) ||
+               error "Expect at least 2 NIDs found: \"${nids[*]}\""
+
+       do_lnetctl discover "${nids[0]}" ||
+               error "Initial discovery failed"
+
+       do_lnetctl ping --source "${nids[0]}" "${nids[0]}" ||
+               error "Initial ping failed $?"
+
+       do_lnetctl ping --source "${nids[1]}" "${nids[1]}" ||
+               error "Initial ping failed $?"
+
+       # Install a drop rule for every src/dst combination of local NIDs.
+       local src dst
+       for src in "${nids[@]}"; do
+               for dst in "${nids[@]}"; do
+                       if ! $LCTL net_drop_add -r 1 -s "$src" -d "$dst" \
+                                       -e network_timeout; then
+                               # Do not leave a partial rule set behind.
+                               $LCTL net_drop_del -a
+                               error "Failed to add drop rule $src $dst"
+                       fi
+               done
+       done
+
+       # Capture the status instead of failing here: the rules must be
+       # removed before the result is evaluated.
+       do_lnetctl ping "${nids[0]}" || rc=$?
+
+       $LCTL net_drop_del -a
+
+       [[ $rc -eq 0 ]] &&
+               error "expected ping to fail"
+
+       check_nid_in_recovq "-p" 0
+       check_nid_in_recovq "-l" 1
+
+       return 0
+}
+run_test 216 "Failed send to peer NI owned by local host should not trigger peer NI recovery"
+
 test_217() {
        reinit_dlc || return $?