From: Chris Horn Date: Tue, 4 Feb 2025 16:37:01 +0000 (-0700) Subject: LU-18697 lnet: lnet_peer_del_nid refcount loss X-Git-Tag: 2.15.7-RC1~3 X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=c00bb50624a3eedad16759075cebeedf17030ba4;p=fs%2Flustre-release.git LU-18697 lnet: lnet_peer_del_nid refcount loss This is a regression introduced in the b2_15 port of Lustre-change: https://review.whamcloud.com/50106 Lustre-commit: aacb16191a72bc6db1155030849efb0d6971a572 In lnet_peer_del_nid(), the call to lnet_peer_ni_find_locked() takes a reference on the lnet_peer_ni, but this reference is not dropped if the peer state has LNET_PEER_LOCK_PRIMARY bit set and the nid being deleted is the primary NID of the peer. A test case is added to exercise this code path. HPE-bug-id: LUS-12709 Fixes: c9badd8648 ("LU-14668 lnet: Lock primary NID logic") Test-Parameters: trivial testlist=sanity-lnet Signed-off-by: Chris Horn Change-Id: Ib717672189824e61a184ddcb9127d2921f2a66db Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/57977 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Frank Sehr Reviewed-by: Serguei Smirnov Reviewed-by: Cyril Bordage Reviewed-by: Oleg Drokin --- diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index 7064452..3a0fe0f 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -547,7 +547,6 @@ lnet_peer_del_nid(struct lnet_peer *lp, lnet_nid_t nid4, unsigned int flags) } } - lpni = lnet_peer_ni_find_locked(&nid); /* If we're asked to lock down the primary NID we shouldn't be * deleting it */ @@ -557,6 +556,7 @@ lnet_peer_del_nid(struct lnet_peer *lp, lnet_nid_t nid4, unsigned int flags) goto out; } + lpni = lnet_peer_ni_find_locked(&nid); if (!lpni) { rc = -ENOENT; goto out; @@ -3847,17 +3847,17 @@ __must_hold(&lp->lp_lock) if (rc) goto fail_unlink; - CDEBUG(D_NET, "peer %s\n", libcfs_nidstr(&lp->lp_primary_nid)); - spin_lock(&lp->lp_lock); + + CDEBUG(D_NET, "peer %s(%p) state %#x\n", + libcfs_nidstr(&lp->lp_primary_nid), lp, 
lp->lp_state); + return 0; fail_unlink: LNetMDUnlink(lp->lp_push_mdh); LNetInvalidateMDHandle(&lp->lp_push_mdh); fail_error: - CDEBUG(D_NET, "peer %s(%p): %d\n", libcfs_nidstr(&lp->lp_primary_nid), - lp, rc); /* * The errors that get us here are considered hard errors and * cause Discovery to terminate. So we clear PUSH_SENT, but do @@ -3867,6 +3867,8 @@ fail_error: */ spin_lock(&lp->lp_lock); lp->lp_state &= ~(LNET_PEER_PUSH_SENT | LNET_PEER_PUSH_FAILED); + CDEBUG(D_NET, "peer %s(%p) state %#x: %d\n", + libcfs_nidstr(&lp->lp_primary_nid), lp, lp->lp_state, rc); return rc; } diff --git a/lustre/tests/sanity-lnet.sh b/lustre/tests/sanity-lnet.sh index a8d33ee..321d6f9 100755 --- a/lustre/tests/sanity-lnet.sh +++ b/lustre/tests/sanity-lnet.sh @@ -3153,6 +3153,56 @@ EOF } run_test 304 "Check locked primary peer nid consolidation" +test_350() { + reinit_dlc || return $? + + do_lnetctl net add --net ${NETTYPE} --if ${INTERFACES[0]} || + error "Failed to add net rc=$?" + do_lnetctl net add --net ${NETTYPE}2 --if ${INTERFACES[0]} || + error "Failed to add net rc=$?" + + local nid1=$($LCTL list_nids | head -n 1) + local nid2=$($LCTL list_nids | tail -n 1) + + [[ -n $nid1 && -n $nid2 ]] || error "Failed to get nids" + + local pnid=${nid1}3 + + do_lnetctl peer add --prim ${pnid} --lock_prim --nid $nid1,$nid2 || + error "Failed to add peer rc=$?" + +#define LNET_PEER_MULTI_RAIL BIT(0) +#define LNET_PEER_LOCK_PRIMARY BIT(20) + local state=1048577 + + do_lnetctl peer set --state $state --nid $pnid || + error "Failed to set peer state rc=$?" + + local actual=$($LNETCTL peer show -v 3 --nid $pnid | + awk '/peer state/{print $NF}') + + ((actual == state)) || + error "Expect peer state $state but found $actual" + + do_lnetctl discover $pnid || error "Discovery failed rc=$?" 
+ + cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml +peer: + - primary nid: ${pnid} + Multi-Rail: True + peer ni: + - nid: ${nid1} + state: NA + - nid: ${pnid} + state: NA + - nid: ${nid2} + state: NA +EOF + $LNETCTL peer show > $TMP/sanity-lnet-$testnum-actual.yaml + compare_yaml_files || error "Unexpected peer config" + $LUSTRE_RMMOD +} +run_test 350 "Check refcount loss when locked primary NID doesn't exist" complete $SECONDS cleanup_testsuite