Whamcloud - gitweb
LU-12739 lnet: Don't queue msg when discovery has completed 39/36139/3
author Chris Horn <hornc@cray.com>
Mon, 9 Sep 2019 17:54:08 +0000 (12:54 -0500)
committer Oleg Drokin <green@whamcloud.com>
Fri, 27 Sep 2019 23:12:25 +0000 (23:12 +0000)
In lnet_initiate_peer_discovery(), it is possible for the peer object
to change after the call to lnet_discover_peer_locked(), and it is
also possible for the peer to complete discovery between the first
call to lnet_peer_is_uptodate() and our placing the lnet_msg onto
the peer's lp_dc_pendq. After the call to lnet_discover_peer_locked(),
check whether the (potentially new) peer object is up to date while
holding the lp_lock. If the peer is up to date, then we needn't
queue the message. Otherwise, we continue to hold the lock to place
the message on the peer's lp_dc_pendq.

Cray-bug-id: LUS-7596
Signed-off-by: Chris Horn <hornc@cray.com>
Change-Id: Ib3da7447588479bb35afcc3fe176b9120d915a89
Reviewed-on: https://review.whamcloud.com/36139
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alexandr Boyko <c17825@cray.com>
Reviewed-by: Amir Shehata <ashehata@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/include/lnet/lib-lnet.h
lnet/lnet/lib-move.c
lnet/lnet/peer.c

index 304f4f6..428ae91 100644 (file)
@@ -992,6 +992,7 @@ lnet_peer_ni_is_primary(struct lnet_peer_ni *lpni)
 }
 
 bool lnet_peer_is_uptodate(struct lnet_peer *lp);
+bool lnet_peer_is_uptodate_locked(struct lnet_peer *lp);
 bool lnet_is_discovery_disabled(struct lnet_peer *lp);
 bool lnet_peer_gw_discovery(struct lnet_peer *lp);
 
index 1706ef7..88f20e7 100644 (file)
@@ -2009,15 +2009,21 @@ lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni,
        }
        /* The peer may have changed. */
        peer = lpni->lpni_peer_net->lpn_peer;
+       spin_lock(&peer->lp_lock);
+       if (lnet_peer_is_uptodate_locked(peer)) {
+               spin_unlock(&peer->lp_lock);
+               lnet_peer_ni_decref_locked(lpni);
+               return 0;
+       }
        /* queue message and return */
        msg->msg_rtr_nid_param = rtr_nid;
        msg->msg_sending = 0;
        msg->msg_txpeer = NULL;
-       spin_lock(&peer->lp_lock);
        list_add_tail(&msg->msg_list, &peer->lp_dc_pendq);
+       primary_nid = peer->lp_primary_nid;
        spin_unlock(&peer->lp_lock);
+
        lnet_peer_ni_decref_locked(lpni);
-       primary_nid = peer->lp_primary_nid;
 
        CDEBUG(D_NET, "msg %p delayed. %s pending discovery\n",
                msg, libcfs_nid2str(primary_nid));
@@ -2671,11 +2677,10 @@ again:
        msg->msg_src_nid_param = src_nid;
 
        /*
-        * Now that we have a peer_ni, check if we want to discover
-        * the peer. Traffic to the LNET_RESERVED_PORTAL should not
-        * trigger discovery.
+        * If necessary, perform discovery on the peer that owns this peer_ni.
+        * Note, this can result in the ownership of this peer_ni changing
+        * to another peer object.
         */
-       peer = lpni->lpni_peer_net->lpn_peer;
        rc = lnet_initiate_peer_discovery(lpni, msg, rtr_nid, cpt);
        if (rc) {
                lnet_peer_ni_decref_locked(lpni);
@@ -2684,6 +2689,8 @@ again:
        }
        lnet_peer_ni_decref_locked(lpni);
 
+       peer = lpni->lpni_peer_net->lpn_peer;
+
        /*
         * Identify the different send cases
         */
index d107819..e9505fd 100644 (file)
@@ -1844,6 +1844,17 @@ lnet_peer_gw_discovery(struct lnet_peer *lp)
        return rc;
 }
 
+bool
+lnet_peer_is_uptodate(struct lnet_peer *lp) /* wrapper: runs the _locked check under lp->lp_lock */
+{
+       bool rc; /* result of the locked check, returned after the lock is dropped */
+
+       spin_lock(&lp->lp_lock); /* lnet_peer_is_uptodate_locked() is annotated __must_hold(&lp->lp_lock) */
+       rc = lnet_peer_is_uptodate_locked(lp);
+       spin_unlock(&lp->lp_lock);
+       return rc;
+}
+
 /*
  * Is a peer uptodate from the point of view of discovery?
  *
@@ -1853,11 +1864,11 @@ lnet_peer_gw_discovery(struct lnet_peer *lp)
  * Otherwise look at whether the peer needs rediscovering.
  */
 bool
-lnet_peer_is_uptodate(struct lnet_peer *lp)
+lnet_peer_is_uptodate_locked(struct lnet_peer *lp)
+__must_hold(&lp->lp_lock)
 {
        bool rc;
 
-       spin_lock(&lp->lp_lock);
        if (lp->lp_state & (LNET_PEER_DISCOVERING |
                            LNET_PEER_FORCE_PING |
                            LNET_PEER_FORCE_PUSH)) {
@@ -1874,7 +1885,6 @@ lnet_peer_is_uptodate(struct lnet_peer *lp)
        } else {
                rc = false;
        }
-       spin_unlock(&lp->lp_lock);
 
        return rc;
 }