From 72726a311814bc0c0eefb22a769c9ebf7912839e Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Thu, 27 Jun 2024 10:40:19 -0600 Subject: [PATCH] LU-14810 lnet: Do not issue multiple PUSHes A PUSH ACK may be delayed in the network. Meanwhile, some event could cause the peer to go through discovery again (e.g. a config change or an NI state change). The discovery state machine doesn't consider whether there is an outstanding PUSH, so it may issue another one for the same peer. When the delayed ACK arrives it will then clear PUSH_SENT, so discovery no longer knows that there is an outstanding PUSH. If discovery is then stopped, it doesn't unlink the push MD, and this can cause an assert in lnet_assert_handler_unused() because the push event handler is still in use. Modify the discovery state machine to check for PUSH_SENT when determining whether a peer needs a PUSH. sanity-lnet test_304 can reproduce this issue under an IPv6 configuration if modules are unloaded at the end of the test. Test-Parameters: trivial Signed-off-by: Chris Horn Change-Id: Ic3f7a8b44f85a18afb939fdbfa1f9bc5dc64d93d Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/55559 Tested-by: jenkins Tested-by: Maloo Reviewed-by: James Simmons Reviewed-by: Serguei Smirnov Reviewed-by: Cyril Bordage Reviewed-by: Frank Sehr Reviewed-by: Oleg Drokin --- lnet/include/lnet/lib-lnet.h | 2 ++ lustre/tests/sanity-lnet.sh | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index e2d251b..81bf473 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -1134,6 +1134,8 @@ lnet_peer_needs_push(struct lnet_peer *lp) return false; if (lp->lp_state & LNET_PEER_MARK_DELETED) return false; + if (lp->lp_state & LNET_PEER_PUSH_SENT) + return false; if (lp->lp_state & LNET_PEER_FORCE_PUSH) return true; if (lp->lp_state & LNET_PEER_NO_DISCOVERY) diff --git a/lustre/tests/sanity-lnet.sh b/lustre/tests/sanity-lnet.sh index 43e2665..d9ee55c 
100755 --- a/lustre/tests/sanity-lnet.sh +++ b/lustre/tests/sanity-lnet.sh @@ -4159,7 +4159,8 @@ EOF # LNET_PEER_DISCOVERED(4) | LNET_PEER_NIDS_UPTODATE(8) (( $locked_peer_state != "1048849")) && error "Wrong peer state \"$locked_peer_state\" expected 1048849" - return 0 + + cleanup_lnet } run_test 304 "Check locked primary peer nid consolidation" -- 1.8.3.1