Whamcloud - gitweb
LU-12424 lnet: prevent loop in LNetPrimaryNID() 90/38890/4
author Amir Shehata <ashehata@whamcloud.com>
Tue, 11 Jun 2019 18:25:27 +0000 (11:25 -0700)
committer Oleg Drokin <green@whamcloud.com>
Fri, 7 Aug 2020 21:12:34 +0000 (21:12 +0000)
If discovery is disabled locally or at the remote end, then attempt
discovery only once. Do not update the internal database when
discovery is disabled and do not repeat discovery.

This change prevents LNet from getting hung waiting for
discovery to complete.

Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: I4543b0f71e6cf297a1a5f058ebcc6bf74b8ac328
Reviewed-on: https://review.whamcloud.com/35191
Reviewed-by: Olaf Weber <olaf.weber@hpe.com>
Tested-by: Jenkins
Reviewed-by: Chris Horn <hornc@cray.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/38890
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Tested-by: jenkins <devops@whamcloud.com>
lnet/lnet/peer.c

index b23bb02..1259dc4 100644 (file)
@@ -1077,6 +1077,35 @@ lnet_peer_primary_nid_locked(lnet_nid_t nid)
        return primary_nid;
 }
 
+bool
+lnet_is_discovery_disabled_locked(struct lnet_peer *lp)
+{
+       if (lnet_peer_discovery_disabled)
+               return true;
+
+       if (!(lp->lp_state & LNET_PEER_MULTI_RAIL) ||
+           (lp->lp_state & LNET_PEER_NO_DISCOVERY)) {
+               return true;
+       }
+
+       return false;
+}
+
+/*
+ * Peer Discovery
+ */
+bool
+lnet_is_discovery_disabled(struct lnet_peer *lp)
+{
+       bool rc = false;
+
+       spin_lock(&lp->lp_lock);
+       rc = lnet_is_discovery_disabled_locked(lp);
+       spin_unlock(&lp->lp_lock);
+
+       return rc;
+}
+
 lnet_nid_t
 LNetPrimaryNID(lnet_nid_t nid)
 {
@@ -1093,11 +1122,16 @@ LNetPrimaryNID(lnet_nid_t nid)
                goto out_unlock;
        }
        lp = lpni->lpni_peer_net->lpn_peer;
+
        while (!lnet_peer_is_uptodate(lp)) {
                rc = lnet_discover_peer_locked(lpni, cpt, true);
                if (rc)
                        goto out_decref;
                lp = lpni->lpni_peer_net->lpn_peer;
+
+               /* Only try once if discovery is disabled */
+               if (lnet_is_discovery_disabled(lp))
+                       break;
        }
        primary_nid = lp->lp_primary_nid;
 out_decref:
@@ -1701,10 +1735,6 @@ out_mutex_unlock:
 }
 
 /*
- * Peer Discovery
- */
-
-/*
  * Is a peer uptodate from the point of view of discovery?
  *
  * If it is currently being processed, obviously not.
@@ -2036,6 +2066,7 @@ again:
                if (lnet_peer_is_uptodate(lp))
                        break;
                lnet_peer_queue_for_discovery(lp);
+
                /*
                 * if caller requested a non-blocking operation then
                 * return immediately. Once discovery is complete then the
@@ -2053,6 +2084,16 @@ again:
                lnet_peer_decref_locked(lp);
                /* Peer may have changed */
                lp = lpni->lpni_peer_net->lpn_peer;
+
+               /*
+                * Wait for discovery to complete, but don't repeat if
+                * discovery is disabled. This is done to ensure we can
+                * also use discovery as a standard ping, for backwards
+                * compatibility with routers which do not have discovery
+                * or have discovery disabled.
+                */
+               if (lnet_is_discovery_disabled(lp))
+                       break;
        }
        finish_wait(&lp->lp_dc_waitq, &wait);