From fb5d7036ec356c825f2aadece68cd7c4af487680 Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Thu, 23 Dec 2021 14:15:27 -0600 Subject: [PATCH] LU-15398 lnet: Avoid peer NI recovery for local interface If a MR peer has a MR peer entry for itself (can happen if manually created or discovery is run on itself for some reason), then it is possible for it to put its own interfaces into peer recovery. Problems with local interfaces should be handled via local NI recovery. Test-Parameters: trivial testlist=sanity-lnet HPE-bug-id: LUS-10661 Signed-off-by: Chris Horn Change-Id: I5b28195979a6113fa863b5795a4528b072610891 Reviewed-on: https://review.whamcloud.com/45933 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Serguei Smirnov Reviewed-by: Andriy Skulysh Reviewed-by: Oleg Drokin --- lnet/lnet/lib-msg.c | 6 ++++++ lustre/tests/sanity-lnet.sh | 42 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index 462163f..ed979bc 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -878,6 +878,12 @@ lnet_health_check(struct lnet_msg *msg) if (!lnet_isrouter(lpni)) handle_remote_health = false; } + /* Do not put my interfaces into peer NI recovery. They should + * be handled with local NI recovery. + */ + if (handle_remote_health && lpni && + lnet_nid_to_ni_locked(&lpni->lpni_nid, 0)) + handle_remote_health = false; lnet_net_unlock(0); } diff --git a/lustre/tests/sanity-lnet.sh b/lustre/tests/sanity-lnet.sh index 947a451..860f712 100755 --- a/lustre/tests/sanity-lnet.sh +++ b/lustre/tests/sanity-lnet.sh @@ -1834,7 +1834,7 @@ check_nid_in_recovq() { local found=false local nid="" - echo "Check recovery queue" + echo "Check \"$1\" recovery queue" echo "$recovq" if [[ $(grep -c 'nid-'<<<$recovq) -ne $expect ]]; then error "Expect $expect NIDs found: \"$recovq\"" @@ -2295,6 +2295,46 @@ test_215() { } run_test 215 "Test lnetctl ping --source option" +test_216() { + local rc=0 + + reinit_dlc || return $? + + add_net "tcp" "${INTERFACES[0]}" || return $? + add_net "tcp1" "${INTERFACES[0]}" || return $? + + local nids=( $($LCTL list_nids | xargs echo) ) + + do_lnetctl discover ${nids[0]} || + error "Initial discovery failed" + + do_lnetctl ping --source ${nids[0]} ${nids[0]} || + error "Initial ping failed $?" + + do_lnetctl ping --source ${nids[1]} ${nids[1]} || + error "Initial ping failed $?" + + local src dst + for src in ${nids[@]}; do + for dst in ${nids[@]}; do + $LCTL net_drop_add -r 1 -s $src -d $dst -e network_timeout + done + done + + do_lnetctl ping ${nids[0]} || rc=$? + + $LCTL net_drop_del -a + + [[ $rc -eq 0 ]] && + error "expected ping to fail" + + check_nid_in_recovq "-p" 0 + check_nid_in_recovq "-l" 1 + + return 0 +} +run_test 216 "Failed send to peer NI owned by local host should not trigger peer NI recovery" + test_230() { # LU-12815 echo "Check valid values; Should succeed" -- 1.8.3.1