From 439520f762b093edba9af2f4ab63011eafab28d5 Mon Sep 17 00:00:00 2001
From: Amir Shehata
Date: Tue, 11 Jun 2019 11:25:27 -0700
Subject: [PATCH] LU-12424 lnet: prevent loop in LNetPrimaryNID()

If discovery is disabled locally or at the remote end, then attempt
discovery only once. Do not update the internal database when
discovery is disabled and do not repeat discovery.

This change prevents LNet from getting hung waiting for
discovery to complete.

Signed-off-by: Amir Shehata
Change-Id: I4543b0f71e6cf297a1a5f058ebcc6bf74b8ac328
Reviewed-on: https://review.whamcloud.com/35191
Reviewed-by: Olaf Weber
Tested-by: Jenkins
Reviewed-by: Chris Horn
Tested-by: Maloo
Reviewed-by: Oleg Drokin
---
 lnet/lnet/peer.c | 75 +++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 44 insertions(+), 31 deletions(-)

diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c
index f58e0e0..2ae8956 100644
--- a/lnet/lnet/peer.c
+++ b/lnet/lnet/peer.c
@@ -1148,6 +1148,35 @@ lnet_peer_primary_nid_locked(lnet_nid_t nid)
 	return primary_nid;
 }
 
+bool
+lnet_is_discovery_disabled_locked(struct lnet_peer *lp)
+{
+	if (lnet_peer_discovery_disabled)
+		return true;
+
+	if (!(lp->lp_state & LNET_PEER_MULTI_RAIL) ||
+	    (lp->lp_state & LNET_PEER_NO_DISCOVERY)) {
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Peer Discovery
+ */
+bool
+lnet_is_discovery_disabled(struct lnet_peer *lp)
+{
+	bool rc = false;
+
+	spin_lock(&lp->lp_lock);
+	rc = lnet_is_discovery_disabled_locked(lp);
+	spin_unlock(&lp->lp_lock);
+
+	return rc;
+}
+
 lnet_nid_t
 LNetPrimaryNID(lnet_nid_t nid)
 {
@@ -1164,11 +1193,16 @@ LNetPrimaryNID(lnet_nid_t nid)
 		goto out_unlock;
 	}
 	lp = lpni->lpni_peer_net->lpn_peer;
+
 	while (!lnet_peer_is_uptodate(lp)) {
 		rc = lnet_discover_peer_locked(lpni, cpt, true);
 		if (rc)
 			goto out_decref;
 		lp = lpni->lpni_peer_net->lpn_peer;
+
+		/* Only try once if discovery is disabled */
+		if (lnet_is_discovery_disabled(lp))
+			break;
 	}
 	primary_nid = lp->lp_primary_nid;
 out_decref:
@@ -1796,35 +1830,6 @@ out_mutex_unlock:
 }
 
 bool
-lnet_is_discovery_disabled_locked(struct lnet_peer *lp)
-{
-	if (lnet_peer_discovery_disabled)
-		return true;
-
-	if (!(lp->lp_state & LNET_PEER_MULTI_RAIL) ||
-	    (lp->lp_state & LNET_PEER_NO_DISCOVERY)) {
-		return true;
-	}
-
-	return false;
-}
-
-/*
- * Peer Discovery
- */
-bool
-lnet_is_discovery_disabled(struct lnet_peer *lp)
-{
-	bool rc = false;
-
-	spin_lock(&lp->lp_lock);
-	rc = lnet_is_discovery_disabled_locked(lp);
-	spin_unlock(&lp->lp_lock);
-
-	return rc;
-}
-
-bool
 lnet_peer_gw_discovery(struct lnet_peer *lp)
 {
 	bool rc = false;
@@ -2168,8 +2173,6 @@ again:
 			break;
 
 		lnet_peer_queue_for_discovery(lp);
-		if (lnet_is_discovery_disabled(lp))
-			break;
 		/*
 		 * if caller requested a non-blocking operation then
 		 * return immediately. Once discovery is complete then the
@@ -2187,6 +2190,16 @@ again:
 		lnet_peer_decref_locked(lp);
 		/* Peer may have changed */
 		lp = lpni->lpni_peer_net->lpn_peer;
+
+		/*
+		 * Wait for discovery to complete, but don't repeat if
+		 * discovery is disabled. This is done to ensure we can
+		 * use discovery as a standard ping as well for backwards
+		 * compatibility with routers which do not have discovery
+		 * or have discovery disabled
+		 */
+		if (lnet_is_discovery_disabled(lp))
+			break;
 	}
 	finish_wait(&lp->lp_dc_waitq, &wait);
 
-- 
1.8.3.1