From 4ef62976448d6821df9aab3e720fd8d9d0bdefce Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Mon, 9 Sep 2019 12:54:08 -0500 Subject: [PATCH] LU-12739 lnet: Don't queue msg when discovery has completed In lnet_initiate_peer_discovery(), it is possible for the peer object to change after the call to lnet_discover_peer_locked(), and it is also possible for the peer to complete discovery between the first call to lnet_peer_is_uptodate() and our placing the lnet_msg onto the peer's lp_dc_pendq. After the call to lnet_discover_peer_locked(), check whether the (potentially new) peer object is up to date while holding the lp_lock. If the peer is up to date, we need not queue the message. Otherwise, we continue to hold the lock while placing the message on the peer's lp_dc_pendq. Cray-bug-id: LUS-7596 Signed-off-by: Chris Horn Change-Id: Ib3da7447588479bb35afcc3fe176b9120d915a89 Reviewed-on: https://review.whamcloud.com/36139 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Alexandr Boyko Reviewed-by: Amir Shehata Reviewed-by: Oleg Drokin --- lnet/include/lnet/lib-lnet.h | 1 + lnet/lnet/lib-move.c | 19 +++++++++++++------ lnet/lnet/peer.c | 16 +++++++++++++--- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index 304f4f6..428ae91 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -992,6 +992,7 @@ lnet_peer_ni_is_primary(struct lnet_peer_ni *lpni) } bool lnet_peer_is_uptodate(struct lnet_peer *lp); +bool lnet_peer_is_uptodate_locked(struct lnet_peer *lp); bool lnet_is_discovery_disabled(struct lnet_peer *lp); bool lnet_peer_gw_discovery(struct lnet_peer *lp); diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 1706ef7..88f20e7 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -2009,15 +2009,21 @@ lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni, } /* The peer may have changed. 
*/ peer = lpni->lpni_peer_net->lpn_peer; + spin_lock(&peer->lp_lock); + if (lnet_peer_is_uptodate_locked(peer)) { + spin_unlock(&peer->lp_lock); + lnet_peer_ni_decref_locked(lpni); + return 0; + } /* queue message and return */ msg->msg_rtr_nid_param = rtr_nid; msg->msg_sending = 0; msg->msg_txpeer = NULL; - spin_lock(&peer->lp_lock); list_add_tail(&msg->msg_list, &peer->lp_dc_pendq); + primary_nid = peer->lp_primary_nid; spin_unlock(&peer->lp_lock); + lnet_peer_ni_decref_locked(lpni); - primary_nid = peer->lp_primary_nid; CDEBUG(D_NET, "msg %p delayed. %s pending discovery\n", msg, libcfs_nid2str(primary_nid)); @@ -2671,11 +2677,10 @@ again: msg->msg_src_nid_param = src_nid; /* - * Now that we have a peer_ni, check if we want to discover - * the peer. Traffic to the LNET_RESERVED_PORTAL should not - * trigger discovery. + * If necessary, perform discovery on the peer that owns this peer_ni. + * Note, this can result in the ownership of this peer_ni changing + * to another peer object. */ - peer = lpni->lpni_peer_net->lpn_peer; rc = lnet_initiate_peer_discovery(lpni, msg, rtr_nid, cpt); if (rc) { lnet_peer_ni_decref_locked(lpni); @@ -2684,6 +2689,8 @@ again: } lnet_peer_ni_decref_locked(lpni); + peer = lpni->lpni_peer_net->lpn_peer; + /* * Identify the different send cases */ diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index d107819..e9505fd 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -1844,6 +1844,17 @@ lnet_peer_gw_discovery(struct lnet_peer *lp) return rc; } +bool +lnet_peer_is_uptodate(struct lnet_peer *lp) +{ + bool rc; + + spin_lock(&lp->lp_lock); + rc = lnet_peer_is_uptodate_locked(lp); + spin_unlock(&lp->lp_lock); + return rc; +} + /* * Is a peer uptodate from the point of view of discovery? * @@ -1853,11 +1864,11 @@ lnet_peer_gw_discovery(struct lnet_peer *lp) * Otherwise look at whether the peer needs rediscovering. 
*/ bool -lnet_peer_is_uptodate(struct lnet_peer *lp) +lnet_peer_is_uptodate_locked(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) { bool rc; - spin_lock(&lp->lp_lock); if (lp->lp_state & (LNET_PEER_DISCOVERING | LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH)) { @@ -1874,7 +1885,6 @@ lnet_peer_is_uptodate(struct lnet_peer *lp) } else { rc = false; } - spin_unlock(&lp->lp_lock); return rc; } -- 1.8.3.1