From: Mikhail Pershin Date: Fri, 14 Mar 2025 14:11:32 +0000 (+0300) Subject: LU-18815 mgc: don't fail/lbug on many NIDs X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=0922d5907365c3e999a8b2c9f236f59c3b75e8a8;p=fs%2Flustre-release.git LU-18815 mgc: don't fail/lbug on many NIDs Keep the server starting on a node with more than 32 NIDs, allowing the first 32 NIDs per target. Honor the '-o network' mount option so that other networks are not used as server import peers. Signed-off-by: Mikhail Pershin Change-Id: If4c997be3480eba8b75888a070fb5a721b71b894 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/58502 Reviewed-by: Sebastien Buisson Reviewed-by: Marc Vef Reviewed-by: James Simmons Reviewed-by: Oleg Drokin Tested-by: jenkins Tested-by: Maloo --- diff --git a/lustre/obdclass/lustre_peer.c b/lustre/obdclass/lustre_peer.c index b1664af..f6d216d 100644 --- a/lustre/obdclass/lustre_peer.c +++ b/lustre/obdclass/lustre_peer.c @@ -61,7 +61,7 @@ int class_add_uuid(const char *uuid, struct lnet_nid *nid) { struct uuid_nid_data *data, *entry; int found = 0; - int rc; + int rc = 0; LASSERT(nid->nid_type != 0); /* valid newconfig NID is never zero */ @@ -87,7 +87,10 @@ int class_add_uuid(const char *uuid, struct lnet_nid *nid) break; if (i == entry->un_nid_count) { - LASSERT(entry->un_nid_count < MTI_NIDS_MAX); + if (i == MTI_NIDS_MAX) { + rc = -EOVERFLOW; + break; + } entry->un_nids[entry->un_nid_count++] = *nid; } break; @@ -97,6 +100,12 @@ int class_add_uuid(const char *uuid, struct lnet_nid *nid) list_add(&data->un_list, &g_uuid_list); spin_unlock(&g_uuid_lock); + if (rc) { + CWARN("%s: can't add NID %s: rc = %d\n", uuid, + libcfs_nidstr(nid), rc); + /* continue with already added NIDs */ + } + if (found) { CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid, libcfs_nidstr(nid), entry->un_nid_count); diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c index 35030fc..15995fc 100644 --- a/lustre/obdclass/obd_mount.c +++ b/lustre/obdclass/obd_mount.c @@ 
-402,14 +402,19 @@ int lustre_start_mgc(struct super_block *sb) /* Add the primary NIDs for the MGS */ i = 0; if (IS_SERVER(lsi)) { + char *nidnet = lsi->lsi_lmd->lmd_nidnet; + /* All mgsnode are listed in lmd_mgs at this moment */ ptr = lsi->lsi_lmd->lmd_mgs; CDEBUG(D_MOUNT, "mgs NIDs %s.\n", ptr); if (IS_MGS(lsi)) { - /* Use local NIDs (including LO) */ struct lnet_processid id; + /* Use local NIDs (including LO) */ while ((rc = LNetGetId(i++, &id, true)) != -ENOENT) { + if (nidnet && libcfs_str2net(nidnet) != + LNET_NID_NET(&id.nid)) + continue; rc = do_lcfg_nid(mgcname, &id.nid, LCFG_ADD_UUID, nidstr); } @@ -425,6 +430,10 @@ int lustre_start_mgc(struct super_block *sb) * by commas. */ while (class_parse_nid(ptr, &nid, &ptr) == 0) { + if (nidnet && libcfs_str2net(nidnet) != + LNET_NID_NET(&nid)) + continue; + rc = do_lcfg_nid(mgcname, &nid, LCFG_ADD_UUID, nidstr); if (rc == 0) diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 04a6fb2..191e8db 100755 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -6575,6 +6575,36 @@ test_73b() { } run_test 73b "Large failnode NID list in mountdata" +cleanup_73c() { + LOAD_MODULES_REMOTE=true cleanup +} + +test_73c() { + (( $OST1_VERSION >= $(version_code 2.16.54) )) || + skip "Need OST version at least 2.16.54 to don't LBUG" + + cleanup + LOAD_MODULES_REMOTE=true load_modules + + INTERFACES=( $(lnet_if_list) ) + local inf=${INTERFACES[0]} + + do_facet ost1 "$LNETCTL lnet configure" || + error "unable to configure lnet on ost1" + + stack_trap "cleanup_73c" + + for ((n = 100; n <= 135; n++)); do + do_facet ost1 "$LNETCTL net add --net ${NETTYPE}$n --if $inf" || + skip "unable to configure net #$n on ost1" + done + + echo "restart with 35 nets" + start_mgsmds + start_ost || error "unable to start ost1" +} +run_test 73c "Server mount doesn't fail with > 32 nets" + test_73d() { #LU-18896 (( $OST1_VERSION >= $(version_code 2.16.53) )) || skip "need OST >= 2.16.53 for LU-18896 fix"