From 7cc0b292ecdbc578b3ca4d6312b7c74874712fdc Mon Sep 17 00:00:00 2001
From: Amir Shehata
Date: Tue, 11 Jun 2019 11:25:27 -0700
Subject: [PATCH] LU-12424 lnet: prevent loop in LNetPrimaryNID()

If discovery is disabled locally or at the remote end, then attempt
discovery only once. Do not update the internal database when
discovery is disabled and do not repeat discovery.

This change prevents LNet from getting hung waiting for
discovery to complete.

Signed-off-by: Amir Shehata
Change-Id: I4543b0f71e6cf297a1a5f058ebcc6bf74b8ac328
Reviewed-on: https://review.whamcloud.com/35191
Reviewed-by: Olaf Weber
Tested-by: Jenkins
Reviewed-by: Chris Horn
Tested-by: Maloo
Reviewed-by: Oleg Drokin
Reviewed-on: https://review.whamcloud.com/38890
Reviewed-by: Chris Horn
Tested-by: jenkins
---
 lnet/lnet/peer.c | 49 +++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 45 insertions(+), 4 deletions(-)

diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c
index b23bb02..1259dc4 100644
--- a/lnet/lnet/peer.c
+++ b/lnet/lnet/peer.c
@@ -1077,6 +1077,35 @@ lnet_peer_primary_nid_locked(lnet_nid_t nid)
 	return primary_nid;
 }
 
+bool
+lnet_is_discovery_disabled_locked(struct lnet_peer *lp)
+{
+	if (lnet_peer_discovery_disabled)
+		return true;
+
+	if (!(lp->lp_state & LNET_PEER_MULTI_RAIL) ||
+	    (lp->lp_state & LNET_PEER_NO_DISCOVERY)) {
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Peer Discovery
+ */
+bool
+lnet_is_discovery_disabled(struct lnet_peer *lp)
+{
+	bool rc = false;
+
+	spin_lock(&lp->lp_lock);
+	rc = lnet_is_discovery_disabled_locked(lp);
+	spin_unlock(&lp->lp_lock);
+
+	return rc;
+}
+
 lnet_nid_t
 LNetPrimaryNID(lnet_nid_t nid)
 {
@@ -1093,11 +1122,16 @@ LNetPrimaryNID(lnet_nid_t nid)
 		goto out_unlock;
 	}
 	lp = lpni->lpni_peer_net->lpn_peer;
+
 	while (!lnet_peer_is_uptodate(lp)) {
 		rc = lnet_discover_peer_locked(lpni, cpt, true);
 		if (rc)
 			goto out_decref;
 		lp = lpni->lpni_peer_net->lpn_peer;
+
+		/* Only try once if discovery is disabled */
+		if (lnet_is_discovery_disabled(lp))
+			break;
 	}
 	primary_nid = lp->lp_primary_nid;
 out_decref:
@@ -1701,10 +1735,6 @@ out_mutex_unlock:
 }
 
 /*
- * Peer Discovery
- */
-
-/*
  * Is a peer uptodate from the point of view of discovery?
  *
  * If it is currently being processed, obviously not.
@@ -2036,6 +2066,7 @@ again:
 		if (lnet_peer_is_uptodate(lp))
 			break;
 		lnet_peer_queue_for_discovery(lp);
+
 		/*
 		 * if caller requested a non-blocking operation then
 		 * return immediately. Once discovery is complete then the
@@ -2053,6 +2084,16 @@ again:
 		lnet_peer_decref_locked(lp);
 		/* Peer may have changed */
 		lp = lpni->lpni_peer_net->lpn_peer;
+
+		/*
+		 * Wait for discovery to complete, but don't repeat if
+		 * discovery is disabled. This is done to ensure we can
+		 * use discovery as a standard ping as well for backwards
+		 * compatibility with routers which do not have discovery
+		 * or have discovery disabled
+		 */
+		if (lnet_is_discovery_disabled(lp))
+			break;
 	}
 	finish_wait(&lp->lp_dc_waitq, &wait);
 
-- 
1.8.3.1