Whamcloud - gitweb
LU-15398 lnet: Avoid peer NI recovery for local interface 33/45933/10
author Chris Horn <chris.horn@hpe.com>
Thu, 23 Dec 2021 20:15:27 +0000 (14:15 -0600)
committer Oleg Drokin <green@whamcloud.com>
Mon, 31 Jan 2022 01:34:02 +0000 (01:34 +0000)
If an MR peer has an MR peer entry for itself (this can happen if the
entry is manually created or discovery is run on itself for some reason),
then it is possible for it to put its own interfaces into peer NI
recovery. Problems with local interfaces should be handled via local NI
recovery.

Test-Parameters: trivial testlist=sanity-lnet
HPE-bug-id: LUS-10661
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: I5b28195979a6113fa863b5795a4528b072610891
Reviewed-on: https://review.whamcloud.com/45933
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Andriy Skulysh <andriy.skulysh@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/lnet/lib-msg.c
lustre/tests/sanity-lnet.sh

index 462163f..ed979bc 100644 (file)
@@ -878,6 +878,12 @@ lnet_health_check(struct lnet_msg *msg)
                        if (!lnet_isrouter(lpni))
                                handle_remote_health = false;
                }
+               /* Do not put my interfaces into peer NI recovery. They should
+                * be handled with local NI recovery.
+                */
+               if (handle_remote_health && lpni &&
+                   lnet_nid_to_ni_locked(&lpni->lpni_nid, 0))
+                       handle_remote_health = false;
                lnet_net_unlock(0);
        }
 
index 947a451..860f712 100755 (executable)
@@ -1834,7 +1834,7 @@ check_nid_in_recovq() {
        local found=false
        local nid=""
 
-       echo "Check recovery queue"
+       echo "Check \"$1\" recovery queue"
        echo "$recovq"
        if [[ $(grep -c 'nid-'<<<$recovq) -ne $expect ]]; then
                error "Expect $expect NIDs found: \"$recovq\""
@@ -2295,6 +2295,46 @@ test_215() {
 }
 run_test 215 "Test lnetctl ping --source option"
 
+test_216() {
+       local rc=0
+
+       reinit_dlc || return $?
+
+       add_net "tcp" "${INTERFACES[0]}" || return $?
+       add_net "tcp1" "${INTERFACES[0]}" || return $?
+
+       local nids=( $($LCTL list_nids | xargs echo) )
+
+       do_lnetctl discover ${nids[0]} ||
+               error "Initial discovery failed"
+
+       do_lnetctl ping --source ${nids[0]} ${nids[0]} ||
+               error "Initial ping failed $?"
+
+       do_lnetctl ping --source ${nids[1]} ${nids[1]} ||
+               error "Initial ping failed $?"
+
+       local src dst
+       for src in ${nids[@]}; do
+               for dst in ${nids[@]}; do
+                       $LCTL net_drop_add -r 1 -s $src -d $dst -e network_timeout
+               done
+       done
+
+       do_lnetctl ping ${nids[0]} || rc=$?
+
+       $LCTL net_drop_del -a
+
+       [[ $rc -eq 0 ]] &&
+               error "expected ping to fail"
+
+       check_nid_in_recovq "-p" 0
+       check_nid_in_recovq "-l" 1
+
+       return 0
+}
+run_test 216 "Failed send to peer NI owned by local host should not trigger peer NI recovery"
+
 test_230() {
        # LU-12815
        echo "Check valid values; Should succeed"