Whamcloud - gitweb
LU-15169 Revert "LU-14668 lnet: Lock primary NID logic" 86/45386/3
authorChris Horn <chris.horn@hpe.com>
Tue, 26 Oct 2021 20:23:37 +0000 (15:23 -0500)
committerOleg Drokin <green@whamcloud.com>
Mon, 13 Dec 2021 03:51:42 +0000 (03:51 +0000)
This patch breaks client mounts under certain LNet configurations.

This reverts commit 024f9303bc6f32a3113357c864765c4f9c93ed03.

Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: Ic1f1d07694fe49df14c803a9434d673e61c7dd67
Reviewed-on: https://review.whamcloud.com/45386
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andriy Skulysh <andriy.skulysh@hpe.com>
Reviewed-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
Reviewed-by: Amir Shehata <ashehata@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/lnet/peer.c

index a06b99d..4725a01 100644 (file)
@@ -545,15 +545,6 @@ lnet_peer_del_nid(struct lnet_peer *lp, lnet_nid_t nid4, unsigned int flags)
                }
        }
 
-       /* If we're asked to lock down the primary NID we shouldn't be
-        * deleting it
-        */
-       if (lp->lp_state & LNET_PEER_LOCK_PRIMARY &&
-           nid_same(&primary_nid, &nid)) {
-               rc = -EPERM;
-               goto out;
-       }
-
        lpni = lnet_peer_ni_find_locked(&nid);
        if (!lpni) {
                rc = -ENOENT;
@@ -1462,18 +1453,13 @@ LNetPrimaryNID(lnet_nid_t nid)
         * down then this discovery can introduce long delays into the mount
         * process, so skip it if it isn't necessary.
         */
-       if (!lnet_peer_discovery_disabled && !lnet_peer_is_uptodate(lp)) {
+       while (!lnet_peer_discovery_disabled && !lnet_peer_is_uptodate(lp)) {
                spin_lock(&lp->lp_lock);
                /* force a full discovery cycle */
-               lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH |
-                               LNET_PEER_LOCK_PRIMARY;
+               lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH;
                spin_unlock(&lp->lp_lock);
 
-               /* start discovery in the background. Messages to that
-                * peer will not go through until the discovery is
-                * complete
-                */
-               rc = lnet_discover_peer_locked(lpni, cpt, false);
+               rc = lnet_discover_peer_locked(lpni, cpt, true);
                if (rc)
                        goto out_decref;
                /* The lpni (or lp) for this NID may have changed and our ref is
@@ -1487,6 +1473,14 @@ LNetPrimaryNID(lnet_nid_t nid)
                        goto out_unlock;
                }
                lp = lpni->lpni_peer_net->lpn_peer;
+
+               /* If we find that the peer has discovery disabled then we will
+                * not modify whatever primary NID is currently set for this
+                * peer. Thus, we can break out of this loop even if the peer
+                * is not fully up to date.
+                */
+               if (lnet_is_discovery_disabled(lp))
+                       break;
        }
        primary_nid = lnet_nid_to_nid4(&lp->lp_primary_nid);
 out_decref:
@@ -1523,9 +1517,9 @@ lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id)
  */
 static int
 lnet_peer_attach_peer_ni(struct lnet_peer *lp,
-                        struct lnet_peer_net *lpn,
-                        struct lnet_peer_ni *lpni,
-                        unsigned flags)
+                               struct lnet_peer_net *lpn,
+                               struct lnet_peer_ni *lpni,
+                               unsigned flags)
 {
        struct lnet_peer_table *ptable;
        bool new_lpn = false;
@@ -1592,8 +1586,6 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp,
                        lnet_peer_clr_non_mr_pref_nids(lp);
                }
        }
-       if (flags & LNET_PEER_LOCK_PRIMARY)
-               lp->lp_state |= LNET_PEER_LOCK_PRIMARY;
        spin_unlock(&lp->lp_lock);
 
        lp->lp_nnis++;
@@ -1755,27 +1747,9 @@ lnet_peer_add_nid(struct lnet_peer *lp, lnet_nid_t nid4, unsigned int flags)
                }
                /* If this is the primary NID, destroy the peer. */
                if (lnet_peer_ni_is_primary(lpni)) {
-                       struct lnet_peer *lp2 =
+                       struct lnet_peer *rtr_lp =
                                lpni->lpni_peer_net->lpn_peer;
-                       int rtr_refcount = lp2->lp_rtr_refcount;
-
-                       /* If the new peer that this NID belongs to is
-                        * a primary NID for another peer which we're
-                        * suppose to preserve the Primary for then we
-                        * don't want to mess with it. But the
-                        * configuration is wrong at this point, so we
-                        * should flag both of these peers as in a bad
-                        * state
-                        */
-                       if (lp2->lp_state & LNET_PEER_LOCK_PRIMARY) {
-                               spin_lock(&lp->lp_lock);
-                               lp->lp_state |= LNET_PEER_BAD_CONFIG;
-                               spin_unlock(&lp->lp_lock);
-                               spin_lock(&lp2->lp_lock);
-                               lp2->lp_state |= LNET_PEER_BAD_CONFIG;
-                               spin_unlock(&lp2->lp_lock);
-                               goto out_free_lpni;
-                       }
+                       int rtr_refcount = rtr_lp->lp_rtr_refcount;
                        /*
                         * if we're trying to delete a router it means
                         * we're moving this peer NI to a new peer so must
@@ -1783,9 +1757,9 @@ lnet_peer_add_nid(struct lnet_peer *lp, lnet_nid_t nid4, unsigned int flags)
                         */
                        if (rtr_refcount > 0) {
                                flags |= LNET_PEER_RTR_NI_FORCE_DEL;
-                               lnet_rtr_transfer_to_peer(lp2, lp);
+                               lnet_rtr_transfer_to_peer(rtr_lp, lp);
                        }
-                       lnet_peer_del(lp2);
+                       lnet_peer_del(lpni->lpni_peer_net->lpn_peer);
                        lnet_peer_ni_decref_locked(lpni);
                        lpni = lnet_peer_ni_alloc(&nid);
                        if (!lpni) {
@@ -1843,8 +1817,7 @@ lnet_peer_set_primary_nid(struct lnet_peer *lp, lnet_nid_t nid,
        if (lnet_nid_to_nid4(&lp->lp_primary_nid) == nid)
                goto out;
 
-       if (!(lp->lp_state & LNET_PEER_LOCK_PRIMARY))
-               lnet_nid4_to_nid(nid, &lp->lp_primary_nid);
+       lnet_nid4_to_nid(nid, &lp->lp_primary_nid);
 
        rc = lnet_peer_add_nid(lp, nid, flags);
        if (rc) {
@@ -1852,14 +1825,6 @@ lnet_peer_set_primary_nid(struct lnet_peer *lp, lnet_nid_t nid,
                goto out;
        }
 out:
-       /* if this is a configured peer or the primary for that peer has
-        * been locked, then we don't want to flag this scenario as
-        * a failure
-        */
-       if (lp->lp_state & LNET_PEER_CONFIGURED ||
-           lp->lp_state & LNET_PEER_LOCK_PRIMARY)
-               return 0;
-
        CDEBUG(D_NET, "peer %s NID %s: %d\n",
               libcfs_nidstr(&old), libcfs_nid2str(nid), rc);