Whamcloud - gitweb
LU-13895 lnet: Prevent discovery on deleted peer 05/39605/7
author: Chris Horn <chris.horn@hpe.com>
Thu, 6 Aug 2020 21:21:29 +0000 (16:21 -0500)
committer: Oleg Drokin <green@whamcloud.com>
Wed, 10 Mar 2021 08:01:47 +0000 (08:01 +0000)
We needn't perform any discovery activities on a peer that has had
lnet_peer_del() called on it.

Test-Parameters: trivial
HPE-bug-id: LUS-9192
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: I5c89dc89038d2c8bf4d2a29029af7720963b81a2
Reviewed-on: https://review.whamcloud.com/39605
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: James Simmons <jsimmons@infradead.org>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/include/lnet/lib-lnet.h
lnet/include/lnet/lib-types.h
lnet/lnet/peer.c

index 4fdcd87..b719414 100644 (file)
@@ -969,6 +969,8 @@ lnet_peer_needs_push(struct lnet_peer *lp)
 {
        if (!(lp->lp_state & LNET_PEER_MULTI_RAIL))
                return false;
+       if (lp->lp_state & LNET_PEER_MARK_DELETED)
+               return false;
        if (lp->lp_state & LNET_PEER_FORCE_PUSH)
                return true;
        if (lp->lp_state & LNET_PEER_NO_DISCOVERY)
index 22a1bdd..41565ee 100644 (file)
@@ -764,6 +764,8 @@ struct lnet_peer {
 
 /* peer is marked for deletion */
 #define LNET_PEER_MARK_DELETION                BIT(18)
+/* lnet_peer_del()/lnet_peer_del_locked() has been called on the peer */
+#define LNET_PEER_MARK_DELETED         BIT(19)
 
 struct lnet_peer_net {
        /* chain on lp_peer_nets */
index ff2ce69..e3eb262 100644 (file)
@@ -459,6 +459,10 @@ lnet_peer_del_locked(struct lnet_peer *peer)
 
        CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(peer->lp_primary_nid));
 
+       spin_lock(&peer->lp_lock);
+       peer->lp_state |= LNET_PEER_MARK_DELETED;
+       spin_unlock(&peer->lp_lock);
+
        lpni = lnet_get_next_peer_ni_locked(peer, NULL, lpni);
        while (lpni != NULL) {
                lpni2 = lnet_get_next_peer_ni_locked(peer, NULL, lpni);
@@ -471,9 +475,41 @@ lnet_peer_del_locked(struct lnet_peer *peer)
        return rc2;
 }
 
+/*
+ * Either discovery of this peer is taking too long or the peer is being
+ * deleted. Cancel any Ping or Push that discovery is waiting on by
+ * unlinking the relevant MDs. The lnet_discovery_event_handler() will
+ * proceed from here and complete the cleanup.
+ */
+static void lnet_peer_cancel_discovery(struct lnet_peer *lp)
+{
+       struct lnet_handle_md ping_mdh;
+       struct lnet_handle_md push_mdh;
+
+       LNetInvalidateMDHandle(&ping_mdh);
+       LNetInvalidateMDHandle(&push_mdh);
+
+       spin_lock(&lp->lp_lock);
+       if (lp->lp_state & LNET_PEER_PING_SENT) {
+               ping_mdh = lp->lp_ping_mdh;
+               LNetInvalidateMDHandle(&lp->lp_ping_mdh);
+       }
+       if (lp->lp_state & LNET_PEER_PUSH_SENT) {
+               push_mdh = lp->lp_push_mdh;
+               LNetInvalidateMDHandle(&lp->lp_push_mdh);
+       }
+       spin_unlock(&lp->lp_lock);
+
+       if (!LNetMDHandleIsInvalid(ping_mdh))
+               LNetMDUnlink(ping_mdh);
+       if (!LNetMDHandleIsInvalid(push_mdh))
+               LNetMDUnlink(push_mdh);
+}
+
 static int
 lnet_peer_del(struct lnet_peer *peer)
 {
+       lnet_peer_cancel_discovery(peer);
        lnet_net_lock(LNET_LOCK_EX);
        lnet_peer_del_locked(peer);
        lnet_net_unlock(LNET_LOCK_EX);
@@ -2970,6 +3006,10 @@ __must_hold(&lp->lp_lock)
        CDEBUG(D_NET, "peer %s(%p) state %#x\n",
               libcfs_nid2str(lp->lp_primary_nid), lp, lp->lp_state);
 
+       /* no-op if lnet_peer_del() has already been called on this peer */
+       if (lp->lp_state & LNET_PEER_MARK_DELETED)
+               return 0;
+
        if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING)
                return -ESHUTDOWN;
 
@@ -3402,37 +3442,6 @@ static void lnet_peer_discovery_error(struct lnet_peer *lp, int error)
 }
 
 /*
- * Discovering this peer is taking too long. Cancel any Ping or Push
- * that discovery is waiting on by unlinking the relevant MDs. The
- * lnet_discovery_event_handler() will proceed from here and complete
- * the cleanup.
- */
-static void lnet_peer_cancel_discovery(struct lnet_peer *lp)
-{
-       struct lnet_handle_md ping_mdh;
-       struct lnet_handle_md push_mdh;
-
-       LNetInvalidateMDHandle(&ping_mdh);
-       LNetInvalidateMDHandle(&push_mdh);
-
-       spin_lock(&lp->lp_lock);
-       if (lp->lp_state & LNET_PEER_PING_SENT) {
-               ping_mdh = lp->lp_ping_mdh;
-               LNetInvalidateMDHandle(&lp->lp_ping_mdh);
-       }
-       if (lp->lp_state & LNET_PEER_PUSH_SENT) {
-               push_mdh = lp->lp_push_mdh;
-               LNetInvalidateMDHandle(&lp->lp_push_mdh);
-       }
-       spin_unlock(&lp->lp_lock);
-
-       if (!LNetMDHandleIsInvalid(ping_mdh))
-               LNetMDUnlink(ping_mdh);
-       if (!LNetMDHandleIsInvalid(push_mdh))
-               LNetMDUnlink(push_mdh);
-}
-
-/*
  * Wait for work to be queued or some other change that must be
  * attended to. Returns non-zero if the discovery thread should shut
  * down.
@@ -3588,7 +3597,8 @@ static int lnet_peer_discovery(void *arg)
                        CDEBUG(D_NET, "peer %s(%p) state %#x\n",
                                libcfs_nid2str(lp->lp_primary_nid), lp,
                                lp->lp_state);
-                       if (lp->lp_state & LNET_PEER_MARK_DELETION)
+                       if (lp->lp_state & (LNET_PEER_MARK_DELETION |
+                                           LNET_PEER_MARK_DELETED))
                                rc = lnet_peer_deletion(lp);
                        else if (lp->lp_state & LNET_PEER_DATA_PRESENT)
                                rc = lnet_peer_data_present(lp);