Whamcloud - gitweb
LU-14668 lnet: Lock primary NID logic
[fs/lustre-release.git] / lnet / lnet / peer.c
index 33bd351..bc6e250 100644 (file)
@@ -540,6 +540,16 @@ lnet_peer_del_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags)
                        goto out;
                }
        }
+
+       /* If we're asked to lock down the primary NID we shouldn't be
+        * deleting it
+        */
+       if (lp->lp_state & LNET_PEER_LOCK_PRIMARY &&
+           primary_nid == nid) {
+               rc = -EPERM;
+               goto out;
+       }
+
        lpni = lnet_find_peer_ni_locked(nid);
        if (!lpni) {
                rc = -ENOENT;
@@ -1399,13 +1409,18 @@ LNetPrimaryNID(lnet_nid_t nid)
         * down then this discovery can introduce long delays into the mount
         * process, so skip it if it isn't necessary.
         */
-       while (!lnet_peer_discovery_disabled && !lnet_peer_is_uptodate(lp)) {
+       if (!lnet_peer_discovery_disabled && !lnet_peer_is_uptodate(lp)) {
                spin_lock(&lp->lp_lock);
                /* force a full discovery cycle */
-               lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH;
+               lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH |
+                               LNET_PEER_LOCK_PRIMARY;
                spin_unlock(&lp->lp_lock);
 
-               rc = lnet_discover_peer_locked(lpni, cpt, true);
+               /* start discovery in the background. Messages to that
+                * peer will not go through until the discovery is
+                * complete
+                */
+               rc = lnet_discover_peer_locked(lpni, cpt, false);
                if (rc)
                        goto out_decref;
                /* The lpni (or lp) for this NID may have changed and our ref is
@@ -1419,14 +1434,6 @@ LNetPrimaryNID(lnet_nid_t nid)
                        goto out_unlock;
                }
                lp = lpni->lpni_peer_net->lpn_peer;
-
-               /* If we find that the peer has discovery disabled then we will
-                * not modify whatever primary NID is currently set for this
-                * peer. Thus, we can break out of this loop even if the peer
-                * is not fully up to date.
-                */
-               if (lnet_is_discovery_disabled(lp))
-                       break;
        }
        primary_nid = lp->lp_primary_nid;
 out_decref:
@@ -1463,9 +1470,9 @@ lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id)
  */
 static int
 lnet_peer_attach_peer_ni(struct lnet_peer *lp,
-                               struct lnet_peer_net *lpn,
-                               struct lnet_peer_ni *lpni,
-                               unsigned flags)
+                        struct lnet_peer_net *lpn,
+                        struct lnet_peer_ni *lpni,
+                        unsigned flags)
 {
        struct lnet_peer_table *ptable;
        bool new_lpn = false;
@@ -1532,6 +1539,8 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp,
                        lnet_peer_clr_non_mr_pref_nids(lp);
                }
        }
+       if (flags & LNET_PEER_LOCK_PRIMARY)
+               lp->lp_state |= LNET_PEER_LOCK_PRIMARY;
        spin_unlock(&lp->lp_lock);
 
        lp->lp_nnis++;
@@ -1686,9 +1695,27 @@ lnet_peer_add_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags)
                }
                /* If this is the primary NID, destroy the peer. */
                if (lnet_peer_ni_is_primary(lpni)) {
-                       struct lnet_peer *rtr_lp =
+                       struct lnet_peer *lp2 =
                                lpni->lpni_peer_net->lpn_peer;
-                       int rtr_refcount = rtr_lp->lp_rtr_refcount;
+                       int rtr_refcount = lp2->lp_rtr_refcount;
+
+                       /* If the new peer that this NID belongs to is
+                        * a primary NID for another peer which we're
+                        * supposed to preserve the Primary for then we
+                        * don't want to mess with it. But the
+                        * configuration is wrong at this point, so we
+                        * should flag both of these peers as in a bad
+                        * state
+                        */
+                       if (lp2->lp_state & LNET_PEER_LOCK_PRIMARY) {
+                               spin_lock(&lp->lp_lock);
+                               lp->lp_state |= LNET_PEER_BAD_CONFIG;
+                               spin_unlock(&lp->lp_lock);
+                               spin_lock(&lp2->lp_lock);
+                               lp2->lp_state |= LNET_PEER_BAD_CONFIG;
+                               spin_unlock(&lp2->lp_lock);
+                               goto out_free_lpni;
+                       }
                        /*
                         * if we're trying to delete a router it means
                         * we're moving this peer NI to a new peer so must
@@ -1696,9 +1723,9 @@ lnet_peer_add_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags)
                         */
                        if (rtr_refcount > 0) {
                                flags |= LNET_PEER_RTR_NI_FORCE_DEL;
-                               lnet_rtr_transfer_to_peer(rtr_lp, lp);
+                               lnet_rtr_transfer_to_peer(lp2, lp);
                        }
-                       lnet_peer_del(lpni->lpni_peer_net->lpn_peer);
+                       lnet_peer_del(lp2);
                        lnet_peer_ni_decref_locked(lpni);
                        lpni = lnet_peer_ni_alloc(nid);
                        if (!lpni) {
@@ -1755,7 +1782,8 @@ lnet_peer_set_primary_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags)
        if (lp->lp_primary_nid == nid)
                goto out;
 
-       lp->lp_primary_nid = nid;
+       if (!(lp->lp_state & LNET_PEER_LOCK_PRIMARY))
+               lp->lp_primary_nid = nid;
 
        rc = lnet_peer_add_nid(lp, nid, flags);
        if (rc) {
@@ -1763,8 +1791,17 @@ lnet_peer_set_primary_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags)
                goto out;
        }
 out:
+       /* if this is a configured peer or the primary for that peer has
+        * been locked, then we don't want to flag this scenario as
+        * a failure
+        */
+       if (lp->lp_state & LNET_PEER_CONFIGURED ||
+           lp->lp_state & LNET_PEER_LOCK_PRIMARY)
+               return 0;
+
        CDEBUG(D_NET, "peer %s NID %s: %d\n",
               libcfs_nid2str(old), libcfs_nid2str(nid), rc);
+
        return rc;
 }