Whamcloud - gitweb
LU-12739 lnet: Don't queue msg when discovery has completed
author Chris Horn <hornc@cray.com>
Mon, 9 Sep 2019 17:54:08 +0000 (12:54 -0500)
committer Oleg Drokin <green@linuxhacker.ru>
Wed, 14 Sep 2022 02:53:03 +0000 (22:53 -0400)
In lnet_initiate_peer_discovery(), it is possible for the peer object
to change after the call to lnet_discover_peer_locked(), and it is
also possible for the peer to complete discovery between the first
call to lnet_peer_is_uptodate() and our placing the lnet_msg onto
the peer's lp_dc_pendq. After the call to lnet_discover_peer_locked(),
check whether the (potentially new) peer object is up to date while
holding the lp_lock. If the peer is up to date, then we needn't
queue the message. Otherwise, we continue to hold the lock while
placing the message on the peer's lp_dc_pendq.

Lustre-change: https://review.whamcloud.com/36139
Lustre-commit: 4ef62976448d6821df9aab3e720fd8d9d0bdefce

Test-Parameters: trivial testlist=sanity-lnet
Cray-bug-id: LUS-7596
Signed-off-by: Chris Horn <hornc@cray.com>
Change-Id: Ib3da7447588479bb35afcc3fe176b9120d915a89

lnet/include/lnet/lib-lnet.h
lnet/lnet/lib-move.c
lnet/lnet/peer.c

index 048531b..3e572d3 100644 (file)
@@ -994,6 +994,7 @@ lnet_peer_ni_is_primary(struct lnet_peer_ni *lpni)
 }
 
 bool lnet_peer_is_uptodate(struct lnet_peer *lp);
+bool lnet_peer_is_uptodate_locked(struct lnet_peer *lp);
 
 static inline bool
 lnet_peer_needs_push(struct lnet_peer *lp)
index b178fd0..9bcc5cd 100644 (file)
@@ -2574,9 +2574,9 @@ again:
        msg->msg_src_nid_param = src_nid;
 
        /*
-        * Now that we have a peer_ni, check if we want to discover
-        * the peer. Traffic to the LNET_RESERVED_PORTAL should not
-        * trigger discovery.
+        * If necessary, perform discovery on the peer that owns this peer_ni.
+        * Note, this can result in the ownership of this peer_ni changing
+        * to another peer object.
         */
        peer = lpni->lpni_peer_net->lpn_peer;
        if (lnet_msg_discovery(msg) && !lnet_peer_is_uptodate(peer)) {
@@ -2589,20 +2589,23 @@ again:
                }
                /* The peer may have changed. */
                peer = lpni->lpni_peer_net->lpn_peer;
-               /* queue message and return */
-               msg->msg_rtr_nid_param = rtr_nid;
-               msg->msg_sending = 0;
                spin_lock(&peer->lp_lock);
-               list_add_tail(&msg->msg_list, &peer->lp_dc_pendq);
-               spin_unlock(&peer->lp_lock);
-               lnet_peer_ni_decref_locked(lpni);
-               primary_nid = peer->lp_primary_nid;
-               lnet_net_unlock(cpt);
+               if (!lnet_peer_is_uptodate_locked(peer)) {
+                       /* queue message and return */
+                       msg->msg_rtr_nid_param = rtr_nid;
+                       msg->msg_sending = 0;
+                       list_add_tail(&msg->msg_list, &peer->lp_dc_pendq);
+                       lnet_peer_ni_decref_locked(lpni);
+                       primary_nid = peer->lp_primary_nid;
+                       spin_unlock(&peer->lp_lock);
+                       lnet_net_unlock(cpt);
 
-               CDEBUG(D_NET, "%s pending discovery\n",
-                      libcfs_nid2str(primary_nid));
+                       CDEBUG(D_NET, "%s pending discovery\n",
+                              libcfs_nid2str(primary_nid));
 
-               return LNET_DC_WAIT;
+                       return LNET_DC_WAIT;
+               }
+               spin_unlock(&peer->lp_lock);
        }
        lnet_peer_ni_decref_locked(lpni);
 
index ea70e38..27aa94b 100644 (file)
@@ -1744,6 +1744,17 @@ out_mutex_unlock:
        return lpni;
 }
 
+bool
+lnet_peer_is_uptodate(struct lnet_peer *lp)
+{
+       bool rc;
+
+       spin_lock(&lp->lp_lock);
+       rc = lnet_peer_is_uptodate_locked(lp);
+       spin_unlock(&lp->lp_lock);
+       return rc;
+}
+
 /*
  * Is a peer uptodate from the point of view of discovery?
  *
@@ -1753,11 +1764,11 @@ out_mutex_unlock:
  * Otherwise look at whether the peer needs rediscovering.
  */
 bool
-lnet_peer_is_uptodate(struct lnet_peer *lp)
+lnet_peer_is_uptodate_locked(struct lnet_peer *lp)
+__must_hold(&lp->lp_lock)
 {
        bool rc;
 
-       spin_lock(&lp->lp_lock);
        if (lp->lp_state & (LNET_PEER_DISCOVERING |
                            LNET_PEER_FORCE_PING |
                            LNET_PEER_FORCE_PUSH)) {
@@ -1779,7 +1790,6 @@ lnet_peer_is_uptodate(struct lnet_peer *lp)
        } else {
                rc = false;
        }
-       spin_unlock(&lp->lp_lock);
 
        return rc;
 }