Whamcloud - gitweb
LU-17379 lnet: parallelize peer discovery via LNetAddPeer 33/53933/10
author Serguei Smirnov <ssmirnov@whamcloud.com>
Tue, 6 Feb 2024 03:24:01 +0000 (19:24 -0800)
committer Oleg Drokin <green@whamcloud.com>
Tue, 30 Apr 2024 06:54:12 +0000 (06:54 +0000)
Initiate peer discovery via its non-primary NIDs
as they are being added in LNetAddPeer by pretending
that they belong to different peers. This may be
useful if some of the comma-separated NIDs in the
mount command (including the first listed NID) are down.
If discovery is performed in the background and there's
at least one reachable NID in the list, the discovery
will succeed and peer records will get consolidated.

If primary NID locking is enabled, the first NID in the list
provided by Lustre to LNetAddPeer always gets locked as primary,
even if it doesn't get discovered.

Test-Parameters: trivial testlist=sanity-lnet
Signed-off-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Change-Id: I449cb9898c0242db874555a62fe8099352e913e6
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53933
Tested-by: Maloo <maloo@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Reviewed-by: Frank Sehr <fsehr@whamcloud.com>
lnet/include/lnet/lib-types.h
lnet/lnet/peer.c

index b5de29e..fa493b5 100644 (file)
@@ -1433,6 +1433,9 @@ struct lnet_peer {
 
        /* timestamp of primary nid lock */
        __u64                   lp_prim_lock_ts;
+
+       /* merge and assign this NID as primary when discovery completes */
+       struct lnet_nid         lp_merge_primary_nid;
 };
 
 /*
index 92f8d44..f444eb3 100644 (file)
@@ -243,6 +243,7 @@ lnet_peer_alloc(struct lnet_nid *nid)
        lp->lp_primary_nid = *nid;
        lp->lp_disc_src_nid = LNET_ANY_NID;
        lp->lp_disc_dst_nid = LNET_ANY_NID;
+       lp->lp_merge_primary_nid = LNET_ANY_NID;
        if (lnet_peers_start_down())
                lp->lp_alive = false;
        else
@@ -1331,6 +1332,19 @@ lnet_is_discovery_disabled(struct lnet_peer *lp)
        return rc;
 }
 
+static void /* look up the peer NI for @nid and start discovery on it; does nothing if no peer NI is found */
+lnet_discover_peer_nid(struct lnet_nid *nid)
+{
+       int cpt = lnet_net_lock_current(); /* cpt is handed back to lnet_net_unlock() at the end */
+       struct lnet_peer_ni *lpni = lnet_peer_ni_find_locked(nid); /* may be NULL, checked below */
+
+       if (lpni) {
+               lnet_discover_peer_locked(lpni, cpt, false); /* return value is deliberately not inspected here */
+               lnet_peer_ni_decref_locked(lpni); /* NOTE(review): presumably drops the reference taken by the find above - confirm */
+       }
+       lnet_net_unlock(cpt);
+}
+
 int
 LNetAddPeer(struct lnet_nid *nids, u32 num_nids)
 {
@@ -1351,6 +1365,8 @@ LNetAddPeer(struct lnet_nid *nids, u32 num_nids)
        mr = lnet_peer_discovery_disabled == 0;
 
        rc = 0;
+       CDEBUG(D_NET, "num_nids %d\n", num_nids);
+
        for (i = 0; i < num_nids; i++) {
                if (nid_is_lo0(&nids[i]))
                        continue;
@@ -1370,13 +1386,26 @@ LNetAddPeer(struct lnet_nid *nids, u32 num_nids)
                                pnid = lp->lp_primary_nid;
                                /* Drop refcount from lookup */
                                lnet_peer_decref_locked(lp);
+                       } else if (mr && !rc) {
+                               lnet_discover_peer_nid(&pnid);
                        }
                } else if (lnet_peer_discovery_disabled) {
                        rc = lnet_add_peer_ni(&nids[i], &LNET_ANY_NID, mr,
                                              flags);
-               } else {
-                       rc = lnet_add_peer_ni(&pnid, &nids[i], mr,
-                                             flags);
+               } else if (!nid_same(&pnid, &nids[i])) {
+                       rc = lnet_add_peer_ni(&nids[i], &LNET_ANY_NID,
+                                             mr, 0);
+                       if (!rc) {
+                               if (lock_prim_nid) {
+                                       struct lnet_peer *lp;
+                                       lp = lnet_find_peer(&nids[i]);
+                                       if (lp) {
+                                               lp->lp_merge_primary_nid = pnid;
+                                               lnet_peer_decref_locked(lp);
+                                       }
+                               }
+                               lnet_discover_peer_nid(&nids[i]);
+                       }
                }
 
                if (rc && rc != -EEXIST)
@@ -3597,7 +3626,6 @@ __must_hold(&lp->lp_lock)
        flags = LNET_PEER_DISCOVERED;
        if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL)
                flags |= LNET_PEER_MULTI_RAIL;
-
        /*
         * Check whether the primary NID in the message matches the
         * primary NID of the peer. If it does, update the peer, if
@@ -3616,6 +3644,16 @@ __must_hold(&lp->lp_lock)
                lnet_ping_buffer_decref(pbuf);
                goto out;
        }
+       /* If lp_merge_primary_nid is set, assign it as primary,
+        * which causes the peers to merge.
+        */
+       if (!LNET_NID_IS_ANY(&lp->lp_merge_primary_nid)) {
+
+               rc = lnet_peer_set_primary_nid(lp, &lp->lp_merge_primary_nid,
+                                              flags);
+               lp->lp_merge_primary_nid = LNET_ANY_NID;
+       }
+
        if (nid_is_lo0(&lp->lp_primary_nid)) {
                rc = lnet_peer_set_primary_nid(lp, &nid, flags);
                if (rc)