Whamcloud - gitweb
LU-18815 mgc: don't fail/lbug on many NIDs 02/58502/11
authorMikhail Pershin <mpershin@whamcloud.com>
Fri, 14 Mar 2025 14:11:32 +0000 (17:11 +0300)
committerOleg Drokin <green@whamcloud.com>
Tue, 8 Jul 2025 03:55:45 +0000 (03:55 +0000)
Keep the server starting on a node with more than 32 NIDs,
allowing the first 32 NIDs per target.
Account for the '-o network' mount option so that other
networks are not used as server import peers.

Signed-off-by: Mikhail Pershin <mpershin@whamcloud.com>
Change-Id: If4c997be3480eba8b75888a070fb5a721b71b894
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/58502
Reviewed-by: Sebastien Buisson <sbuisson@ddn.com>
Reviewed-by: Marc Vef <mvef@whamcloud.com>
Reviewed-by: James Simmons <jsimmons@infradead.org>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lustre/obdclass/lustre_peer.c
lustre/obdclass/obd_mount.c
lustre/tests/conf-sanity.sh

index b1664af..f6d216d 100644 (file)
@@ -61,7 +61,7 @@ int class_add_uuid(const char *uuid, struct lnet_nid *nid)
 {
        struct uuid_nid_data *data, *entry;
        int found = 0;
-       int rc;
+       int rc = 0;
 
        LASSERT(nid->nid_type != 0);  /* valid newconfig NID is never zero */
 
@@ -87,7 +87,10 @@ int class_add_uuid(const char *uuid, struct lnet_nid *nid)
                                        break;
 
                        if (i == entry->un_nid_count) {
-                               LASSERT(entry->un_nid_count < MTI_NIDS_MAX);
+                               if (i == MTI_NIDS_MAX) {
+                                       rc = -EOVERFLOW;
+                                       break;
+                               }
                                entry->un_nids[entry->un_nid_count++] = *nid;
                        }
                        break;
@@ -97,6 +100,12 @@ int class_add_uuid(const char *uuid, struct lnet_nid *nid)
                list_add(&data->un_list, &g_uuid_list);
        spin_unlock(&g_uuid_lock);
 
+       if (rc) {
+               CWARN("%s: can't add NID %s: rc = %d\n", uuid,
+                      libcfs_nidstr(nid), rc);
+               /* continue with already added NIDs */
+       }
+
        if (found) {
                CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid,
                       libcfs_nidstr(nid), entry->un_nid_count);
index 35030fc..15995fc 100644 (file)
@@ -402,14 +402,19 @@ int lustre_start_mgc(struct super_block *sb)
        /* Add the primary NIDs for the MGS */
        i = 0;
        if (IS_SERVER(lsi)) {
+               char *nidnet = lsi->lsi_lmd->lmd_nidnet;
+
                /* All mgsnode are listed in lmd_mgs at this moment */
                ptr = lsi->lsi_lmd->lmd_mgs;
                CDEBUG(D_MOUNT, "mgs NIDs %s.\n", ptr);
                if (IS_MGS(lsi)) {
-                       /* Use local NIDs (including LO) */
                        struct lnet_processid id;
 
+                       /* Use local NIDs (including LO) */
                        while ((rc = LNetGetId(i++, &id, true)) != -ENOENT) {
+                               if (nidnet && libcfs_str2net(nidnet) !=
+                                             LNET_NID_NET(&id.nid))
+                                       continue;
                                rc = do_lcfg_nid(mgcname, &id.nid,
                                                LCFG_ADD_UUID, nidstr);
                        }
@@ -425,6 +430,10 @@ int lustre_start_mgc(struct super_block *sb)
                         * by commas.
                         */
                        while (class_parse_nid(ptr, &nid, &ptr) == 0) {
+                               if (nidnet && libcfs_str2net(nidnet) !=
+                                             LNET_NID_NET(&nid))
+                                       continue;
+
                                rc = do_lcfg_nid(mgcname, &nid,
                                                 LCFG_ADD_UUID, nidstr);
                                if (rc == 0)
index 04a6fb2..191e8db 100755 (executable)
@@ -6575,6 +6575,36 @@ test_73b() {
 }
 run_test 73b "Large failnode NID list in mountdata"
 
+cleanup_73c() {
+       LOAD_MODULES_REMOTE=true cleanup
+}
+
+test_73c() {
+       (( $OST1_VERSION >= $(version_code 2.16.54) )) ||
+               skip "Need OST version at least 2.16.54 to don't LBUG"
+
+       cleanup
+       LOAD_MODULES_REMOTE=true load_modules
+
+       INTERFACES=( $(lnet_if_list) )
+       local inf=${INTERFACES[0]}
+
+       do_facet ost1 "$LNETCTL lnet configure" ||
+               error "unable to configure lnet on ost1"
+
+       stack_trap "cleanup_73c"
+
+       for ((n = 100; n <= 135; n++)); do
+               do_facet ost1 "$LNETCTL net add --net ${NETTYPE}$n --if $inf" ||
+                       skip "unable to configure net #$n on ost1"
+       done
+
+       echo "restart with 35 nets"
+       start_mgsmds
+       start_ost || error "unable to start ost1"
+}
+run_test 73c "Server mount doesn't fail with > 32 nets"
+
 test_73d() { #LU-18896
        (( $OST1_VERSION >= $(version_code 2.16.53) )) ||
                skip "need OST >= 2.16.53 for LU-18896 fix"