From 72087a1b0cf0673d61b4d5056d97a647e5cdad92 Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Thu, 23 Dec 2021 14:15:27 -0600 Subject: [PATCH] LU-15398 lnet: Avoid peer NI recovery for local interface If a MR peer has a MR peer entry for itself (can happen if manually created or discovery is run on itself for some reason), then it is possible for it to put its own interfaces into peer recovery. Problems with local interfaces should be handled via local NI recovery. Lustre-change: https://review.whamcloud.com/45933 Lustre-commit: fb5d7036ec356c825f2aadece68cd7c4af487680 Test-Parameters: trivial testlist=sanity-lnet HPE-bug-id: LUS-10661 Signed-off-by: Chris Horn Change-Id: I5b28195979a6113fa863b5795a4528b072610891 Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/51683 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Frank Sehr Reviewed-by: Cyril Bordage Reviewed-by: Andreas Dilger --- lnet/lnet/lib-msg.c | 6 ++++++ lustre/tests/sanity-lnet.sh | 42 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index 6592cf3..f040e37 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -877,6 +877,12 @@ lnet_health_check(struct lnet_msg *msg) if (!lnet_isrouter(lpni)) handle_remote_health = false; } + /* Do not put my interfaces into peer NI recovery. They should + * be handled with local NI recovery. + */ + if (handle_remote_health && lpni && + lnet_nid2ni_locked(lpni->lpni_nid, 0)) + handle_remote_health = false; lnet_net_unlock(0); } diff --git a/lustre/tests/sanity-lnet.sh b/lustre/tests/sanity-lnet.sh index 4a27d00..2dc364b 100755 --- a/lustre/tests/sanity-lnet.sh +++ b/lustre/tests/sanity-lnet.sh @@ -1974,7 +1974,7 @@ check_nid_in_recovq() { local found=false local nid="" - echo "Check recovery queue" + echo "Check \"$1\" recovery queue" echo "$recovq" if [[ $(grep -c 'nid-'<<<$recovq) -ne $expect ]]; then error "Expect $expect NIDs found: \"$recovq\"" @@ -2411,6 +2411,46 @@ test_215() { } run_test 215 "Test lnetctl ping --source option" +test_216() { + local rc=0 + + reinit_dlc || return $? + + add_net "tcp" "${INTERFACES[0]}" || return $? + add_net "tcp1" "${INTERFACES[0]}" || return $? + + local nids=( $($LCTL list_nids | xargs echo) ) + + do_lnetctl discover ${nids[0]} || + error "Initial discovery failed" + + do_lnetctl ping --source ${nids[0]} ${nids[0]} || + error "Initial ping failed $?" + + do_lnetctl ping --source ${nids[1]} ${nids[1]} || + error "Initial ping failed $?" + + local src dst + for src in ${nids[@]}; do + for dst in ${nids[@]}; do + $LCTL net_drop_add -r 1 -s $src -d $dst -e network_timeout + done + done + + do_lnetctl ping ${nids[0]} || rc=$? + + $LCTL net_drop_del -a + + [[ $rc -eq 0 ]] && + error "expected ping to fail" + + check_nid_in_recovq "-p" 0 + check_nid_in_recovq "-l" 1 + + return 0 +} +run_test 216 "Failed send to peer NI owned by local host should not trigger peer NI recovery" + test_217() { reinit_dlc || return $? -- 1.8.3.1