Whamcloud - gitweb
LU-18697 lnet: lnet_peer_del_nid refcount loss 77/57977/3
author: Chris Horn <chris.horn@hpe.com>
Tue, 4 Feb 2025 16:37:01 +0000 (09:37 -0700)
committer: Oleg Drokin <green@whamcloud.com>
Wed, 21 May 2025 03:42:43 +0000 (03:42 +0000)
This is a regression introduced in the b2_15 port of:

Lustre-change: https://review.whamcloud.com/50106
Lustre-commit: aacb16191a72bc6db1155030849efb0d6971a572

In lnet_peer_del_nid(), the call to lnet_peer_ni_find_locked() takes
a reference on the lnet_peer_ni, but this reference is not dropped
if the peer state has LNET_PEER_LOCK_PRIMARY bit set and the nid
being deleted is the primary NID of the peer.

A test case is added to exercise this code path.

HPE-bug-id: LUS-12709
Fixes: c9badd8648 ("LU-14668 lnet: Lock primary NID logic")
Test-Parameters: trivial testlist=sanity-lnet
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: Ib717672189824e61a184ddcb9127d2921f2a66db
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/57977
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Frank Sehr <fsehr@whamcloud.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Cyril Bordage <cbordage@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/lnet/peer.c
lustre/tests/sanity-lnet.sh

index 7064452..3a0fe0f 100644 (file)
@@ -547,7 +547,6 @@ lnet_peer_del_nid(struct lnet_peer *lp, lnet_nid_t nid4, unsigned int flags)
                }
        }
 
-       lpni = lnet_peer_ni_find_locked(&nid);
        /* If we're asked to lock down the primary NID we shouldn't be
         * deleting it
         */
@@ -557,6 +556,7 @@ lnet_peer_del_nid(struct lnet_peer *lp, lnet_nid_t nid4, unsigned int flags)
                goto out;
        }
 
+       lpni = lnet_peer_ni_find_locked(&nid);
        if (!lpni) {
                rc = -ENOENT;
                goto out;
@@ -3847,17 +3847,17 @@ __must_hold(&lp->lp_lock)
        if (rc)
                goto fail_unlink;
 
-       CDEBUG(D_NET, "peer %s\n", libcfs_nidstr(&lp->lp_primary_nid));
-
        spin_lock(&lp->lp_lock);
+
+       CDEBUG(D_NET, "peer %s(%p) state %#x\n",
+              libcfs_nidstr(&lp->lp_primary_nid), lp, lp->lp_state);
+
        return 0;
 
 fail_unlink:
        LNetMDUnlink(lp->lp_push_mdh);
        LNetInvalidateMDHandle(&lp->lp_push_mdh);
 fail_error:
-       CDEBUG(D_NET, "peer %s(%p): %d\n", libcfs_nidstr(&lp->lp_primary_nid),
-              lp, rc);
        /*
         * The errors that get us here are considered hard errors and
         * cause Discovery to terminate. So we clear PUSH_SENT, but do
@@ -3867,6 +3867,8 @@ fail_error:
         */
        spin_lock(&lp->lp_lock);
        lp->lp_state &= ~(LNET_PEER_PUSH_SENT | LNET_PEER_PUSH_FAILED);
+       CDEBUG(D_NET, "peer %s(%p) state %#x: %d\n",
+              libcfs_nidstr(&lp->lp_primary_nid), lp, lp->lp_state, rc);
        return rc;
 }
 
index a8d33ee..321d6f9 100755 (executable)
@@ -3153,6 +3153,56 @@ EOF
 }
 run_test 304 "Check locked primary peer nid consolidation"
 
+test_350() {
+       reinit_dlc || return $?
+
+       do_lnetctl net add --net ${NETTYPE} --if ${INTERFACES[0]} ||
+               error "Failed to add net rc=$?"
+       do_lnetctl net add --net ${NETTYPE}2 --if ${INTERFACES[0]} ||
+               error "Failed to add net rc=$?"
+
+       local nid1=$($LCTL list_nids | head -n 1)
+       local nid2=$($LCTL list_nids | tail -n 1)
+
+       [[ -n $nid1 && -n $nid2 ]] || error "Failed to get nids"
+
+       local pnid=${nid1}3
+
+       do_lnetctl peer add --prim ${pnid} --lock_prim --nid $nid1,$nid2 ||
+               error "Failed to add peer rc=$?"
+
+#define LNET_PEER_MULTI_RAIL            BIT(0)
+#define LNET_PEER_LOCK_PRIMARY          BIT(20)
+       local state=1048577 # BIT(0) | BIT(20) == 0x100001
+
+       do_lnetctl peer set --state $state --nid $pnid ||
+               error "Failed to set peer state rc=$?"
+
+       local actual=$($LNETCTL peer show -v 3 --nid $pnid |
+                      awk '/peer state/{print $NF}')
+
+       ((actual == state)) ||
+               error "Expect peer state $state but found $actual"
+
+       do_lnetctl discover $pnid || error "Discovery failed rc=$?"
+
+       cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
+peer:
+    - primary nid: ${pnid}
+      Multi-Rail: True
+      peer ni:
+        - nid: ${nid1}
+          state: NA
+        - nid: ${pnid}
+          state: NA
+        - nid: ${nid2}
+          state: NA
+EOF
+       $LNETCTL peer show > $TMP/sanity-lnet-$testnum-actual.yaml
+       compare_yaml_files || error "Unexpected peer config"
+       $LUSTRE_RMMOD # NOTE(review): presumably the module unload is what exposes a leaked lpni ref -- confirm
+}
+run_test 350 "Check refcount loss when locked primary NID doesn't exist"
 complete $SECONDS
 
 cleanup_testsuite