From 0efa1b99e26e454bb6dd71574541e30a10030936 Mon Sep 17 00:00:00 2001
From: Mikhail Pershin
Date: Sat, 3 May 2025 17:52:19 +0300
Subject: [PATCH] LU-18973 mgc: account failovers from all sources

Once set up initially MGC is not updating import failovers
from other mounts. That causes problems with MGC on MGS - it
is always set up with only @lo interface, so if MGS failed over
to other node, all targets/clients on primary node are unable
to find MGS, because MGC has only @lo peer.

Patch reworks lustre_start_mgc() code to account all failover
peers from each user of that MGC. It adds new failover NIDs
even if MGC exists already.

Patch also reorganizes the way peers are identified. It
uses peer UUID as 'Primary NID' string instead of naming it
as 'MGC_##' so same NIDs don't produce new mappings and don't
pollute import with duplicated connections. That makes
LCFG_DEL_UUID obsolete as well, because lustre_stop_mgc() was
its last user.

Signed-off-by: Mikhail Pershin
Change-Id: Icea5b74a16972e8a5f2737257086074630e652a8
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/59076
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Sebastien Buisson
Reviewed-by: Marc Vef
Reviewed-by: Oleg Drokin
---
 lustre/obdclass/obd_mount.c | 167 ++++++++++++++++----------------------------
 lustre/tests/conf-sanity.sh |  31 ++++++++
 2 files changed, 92 insertions(+), 106 deletions(-)

diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c
index 1165b5d..35030fc 100644
--- a/lustre/obdclass/obd_mount.c
+++ b/lustre/obdclass/obd_mount.c
@@ -199,44 +199,58 @@ SERVER_ONLY_EXPORT_SYMBOL(lustre_start_simple);
 
 static DEFINE_MUTEX(mgc_start_lock);
 
-/* 9 for '_%x' (INT_MAX as hex is 8 chars - '7FFFFFFF') and 1 for '\0' */
-#define NIDUUID_SUFFIX_MAX_LEN 10
-static inline int mgc_niduuid_create(char **niduuid, char *nidstr)
+/**
+ * Add the failover MGS nodes listed in @ptr to the MGC import of @obd.
+ * Return true if nid_is_nid4() is false for any of the parsed NIDs.
+ */
+static bool lustre_add_mgc_failnodes(struct obd_device *obd, char *ptr)
 {
-	size_t niduuid_len = strlen(nidstr) + strlen(LUSTRE_MGC_OBDNAME) +
-			     NIDUUID_SUFFIX_MAX_LEN;
-
-	LASSERT(niduuid);
+	struct obd_import *imp = obd->u.cli.cl_import;
+	char node[LNET_NIDSTR_SIZE], nidstr[LNET_NIDSTR_SIZE];
+	struct lnet_nid nid;
+	int rc;
+	bool large_nids = false;
 
-	/* See comment in niduuid_create() */
-	if (niduuid_len > UUID_MAX) {
-		nidstr += niduuid_len - UUID_MAX;
-		niduuid_len = strlen(LUSTRE_MGC_OBDNAME) +
-			      strlen(nidstr) + NIDUUID_SUFFIX_MAX_LEN;
-	}
+	LASSERT(imp);
 
-	OBD_ALLOC(*niduuid, niduuid_len);
-	if (!*niduuid)
-		return -ENOMEM;
+	/* Add any failover MGS NIDs */
+	while (ptr) {
+		int count = 0;
 
-	snprintf(*niduuid, niduuid_len, "%s%s", LUSTRE_MGC_OBDNAME, nidstr);
-	return 0;
-}
+		while (class_parse_nid_quiet(ptr, &nid, &ptr) == 0) {
+			large_nids |= !nid_is_nid4(&nid);
 
-static inline void mgc_niduuid_destroy(char **niduuid)
-{
-	if (*niduuid) {
-		char *tmp = strchr(*niduuid, '_');
+			/* New failover node */
+			if (!count) /* construct node UUID from primary NID */
+				libcfs_nidstr_r(&nid, node, LNET_NIDSTR_SIZE);
 
-		/* If the "_%x" suffix hasn't been added yet then the size
-		 * calculation below should still be correct
-		 */
-		if (tmp)
-			*tmp = '\0';
-
-		OBD_FREE(*niduuid, strlen(*niduuid) + NIDUUID_SUFFIX_MAX_LEN);
+			rc = class_add_uuid(node, &nid);
+			if (rc) { /* report via nidstr, node is the peer UUID */
+				libcfs_nidstr_r(&nid, nidstr, sizeof(nidstr));
+				CWARN("%s: can't add failover NID %s, rc = %d\n",
+				      obd->obd_name, nidstr, rc);
+			} else {
+				count++;
+			}
+			if (*ptr == ':')
+				break;
+		}
+		/* if new peer mapping was created */
+		if (count > 0) {
+			struct obd_uuid uuid;
+
+			obd_str2uuid(&uuid, node);
+			rc = obd_add_conn(imp, &uuid, 0);
+			if (rc)
+				CWARN("%s: can't add failover peer %s, rc = %d\n",
+				      obd->obd_name, node, rc);
+		} else {
+			/* at ":/fsname" */
+			break;
+		}
 	}
-	*niduuid = NULL;
+
+	return large_nids;
 }
 
 /**
@@ -256,10 +270,10 @@ int lustre_start_mgc(struct super_block *sb)
 	uuid_t uuidc;
 	struct lnet_nid nid;
 	char nidstr[LNET_NIDSTR_SIZE];
-	char *mgcname = NULL, 
*niduuid = NULL, *mgssec = NULL; + char *mgcname = NULL, *mgssec = NULL; bool large_nids = false; - char *ptr, *niduuid_suffix; - int rc = 0, i = 0, j; + char *ptr; + int rc = 0, i = 0; size_t len; ENTRY; @@ -306,8 +320,7 @@ int lustre_start_mgc(struct super_block *sb) libcfs_nidstr_r(&nid, nidstr, sizeof(nidstr)); len = strlen(LUSTRE_MGC_OBDNAME) + strlen(nidstr) + 1; OBD_ALLOC(mgcname, len); - rc = mgc_niduuid_create(&niduuid, nidstr); - if (rc || mgcname == NULL) + if (!mgcname) GOTO(out_free, rc = -ENOMEM); snprintf(mgcname, len, "%s%s", LUSTRE_MGC_OBDNAME, nidstr); @@ -357,6 +370,8 @@ int lustre_start_mgc(struct super_block *sb) } } + lustre_add_mgc_failnodes(obd, ptr); + recov_bk = 0; /* * If we are restarting the MGS, don't try to keep the MGC's @@ -386,9 +401,8 @@ int lustre_start_mgc(struct super_block *sb) /* Add the primary NIDs for the MGS */ i = 0; - niduuid_suffix = niduuid + strlen(niduuid); - snprintf(niduuid_suffix, NIDUUID_SUFFIX_MAX_LEN, "_%x", i); if (IS_SERVER(lsi)) { + /* All mgsnode are listed in lmd_mgs at this moment */ ptr = lsi->lsi_lmd->lmd_mgs; CDEBUG(D_MOUNT, "mgs NIDs %s.\n", ptr); if (IS_MGS(lsi)) { @@ -397,16 +411,11 @@ int lustre_start_mgc(struct super_block *sb) while ((rc = LNetGetId(i++, &id, true)) != -ENOENT) { rc = do_lcfg_nid(mgcname, &id.nid, - LCFG_ADD_UUID, - niduuid); + LCFG_ADD_UUID, nidstr); } } else { - /* Use mgsnode= nids */ - /* mount -o mgsnode=nid */ - if (lsi->lsi_lmd->lmd_mgs) { - ptr = lsi->lsi_lmd->lmd_mgs; - } else if (class_find_param(ptr, PARAM_MGSNODE, - &ptr) != 0) { + /* Target must have at least one mgsnode */ + if (!ptr) { CERROR("No MGS NIDs given.\n"); GOTO(out_free, rc = -EINVAL); } @@ -417,8 +426,7 @@ int lustre_start_mgc(struct super_block *sb) */ while (class_parse_nid(ptr, &nid, &ptr) == 0) { rc = do_lcfg_nid(mgcname, &nid, - LCFG_ADD_UUID, - niduuid); + LCFG_ADD_UUID, nidstr); if (rc == 0) ++i; /* Stop at the first failover NID */ @@ -430,8 +438,7 @@ int lustre_start_mgc(struct super_block 
*sb) /* Use NIDs from mount line: uml1,1@elan:uml2,2@elan:/lustre */ ptr = lsi->lsi_lmd->lmd_dev; while (class_parse_nid(ptr, &nid, &ptr) == 0) { - rc = do_lcfg_nid(mgcname, &nid, LCFG_ADD_UUID, - niduuid); + rc = do_lcfg_nid(mgcname, &nid, LCFG_ADD_UUID, nidstr); if (rc == 0) ++i; /* Stop at the first failover NID */ @@ -443,7 +450,6 @@ int lustre_start_mgc(struct super_block *sb) CERROR("No valid MGS NIDs found.\n"); GOTO(out_free, rc = -EINVAL); } - lsi->lsi_lmd->lmd_mgs_failnodes = 1; /* Random uuid for MGC allows easier reconnects */ OBD_ALLOC_PTR(uuid); @@ -456,46 +462,18 @@ int lustre_start_mgc(struct super_block *sb) /* Start the MGC */ rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME, (char *)uuid->uuid, LUSTRE_MGS_OBDNAME, - niduuid, NULL, lsi->lsi_lmd->lmd_nidnet); + nidstr, NULL, lsi->lsi_lmd->lmd_nidnet); if (rc) GOTO(out_free, rc); - /* Add any failover MGS NIDs */ - i = 1; - while (ptr && ((*ptr == ':' || - class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0))) { - /* New failover node */ - snprintf(niduuid_suffix, NIDUUID_SUFFIX_MAX_LEN, "_%x", i); - j = 0; - while (class_parse_nid_quiet(ptr, &nid, &ptr) == 0) { - if (!nid_is_nid4(&nid)) - large_nids = true; - - rc = do_lcfg_nid(mgcname, &nid, LCFG_ADD_UUID, - niduuid); - if (rc == 0) - ++j; - if (*ptr == ':') - break; - } - if (j > 0) { - rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN, - niduuid, NULL, NULL, NULL); - if (rc == 0) - ++i; - } else { - /* at ":/fsname" */ - break; - } - } - lsi->lsi_lmd->lmd_mgs_failnodes = i; - obd = class_name2obd(mgcname); if (!obd) { CERROR("Can't find mgcobd %s\n", mgcname); GOTO(out_free, rc = -ENOTCONN); } + large_nids = lustre_add_mgc_failnodes(obd, ptr); + rc = obd_set_info_async(NULL, obd->obd_self_export, strlen(KEY_MGSSEC), KEY_MGSSEC, strlen(mgssec), mgssec, NULL); @@ -551,7 +529,6 @@ out_free: OBD_FREE_PTR(uuid); OBD_FREE_PTR(data); OBD_FREE(mgcname, len); - mgc_niduuid_destroy(&niduuid); RETURN(rc); } @@ -561,9 +538,7 @@ SERVER_ONLY int lustre_stop_mgc(struct 
super_block *sb)
 {
 	struct lustre_sb_info *lsi = s2lsi(sb);
 	struct obd_device *obd;
-	char *niduuid = NULL, *niduuid_suffix;
-	char nidstr[LNET_NIDSTR_SIZE];
-	int i, rc = 0;
+	int rc = 0;
 
 	ENTRY;
 
@@ -574,16 +549,6 @@ SERVER_ONLY int lustre_stop_mgc(struct super_block *sb)
 		RETURN(-ENOENT);
 	lsi->lsi_mgc = NULL;
 
-	/* Reconstruct the NID uuid from the obd_name */
-	strscpy(nidstr, &obd->obd_name[0] + strlen(LUSTRE_MGC_OBDNAME),
-		sizeof(nidstr));
-
-	rc = mgc_niduuid_create(&niduuid, nidstr);
-	if (rc)
-		RETURN(-ENOMEM);
-
-	niduuid_suffix = niduuid + strlen(niduuid);
-
 	mutex_lock(&mgc_start_lock);
 	LASSERT(atomic_read(&obd->u.cli.cl_mgc_refcount) > 0);
 	if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) {
@@ -616,20 +581,10 @@ SERVER_ONLY int lustre_stop_mgc(struct super_block *sb)
 	if (rc)
 		GOTO(out, rc);
 
-	for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) {
-		snprintf(niduuid_suffix, NIDUUID_SUFFIX_MAX_LEN, "_%x", i);
-		rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID,
-			     niduuid, NULL, NULL, NULL);
-		if (rc)
-			CERROR("del MDC UUID %s failed: rc = %d\n",
-			       niduuid, rc);
-	}
 out:
 	/* class_import_put will get rid of the additional connections */
 	mutex_unlock(&mgc_start_lock);
 
-	mgc_niduuid_destroy(&niduuid);
-
 	RETURN(rc);
 }
 SERVER_ONLY_EXPORT_SYMBOL(lustre_stop_mgc);
diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh
index 7699250..2e8b86d 100755
--- a/lustre/tests/conf-sanity.sh
+++ b/lustre/tests/conf-sanity.sh
@@ -12052,6 +12052,37 @@ test_155() {
 }
 run_test 155 "gap in seq allocation from ofd after restarting"
 
+test_160() {
+	((OST1_VERSION >= $(version_code 2.16.55) )) ||
+		skip "need OST >= 2.16.55 to have MGC with all failovers"
+
+	stopall
+	reformat
+
+	local mgs_nid=$(do_facet mgs $LCTL list_nids | head -1)
+	local failover_nid="192.168.252.160@${NETTYPE}"
+	local mgs_nodes=$mgs_nid:$failover_nid
+	local tmp_mnt="$TMP/lmount"
+	local count
+
+	start_mgsmds
+	start_ost
+
+	stack_trap "cleanup; reformat"
+
+	do_facet mgs "mkdir -p $tmp_mnt"
+	do_facet mgs "$MOUNT_CMD $mgs_nodes:/$FSNAME $tmp_mnt" ||
+		error "Fail to mount local client on MGS"
+	do_facet mgs "umount $tmp_mnt"
+
+	do_facet mgs "$LCTL get_param mgc.MGC${mgs_nid}.import"
+	count=$(do_facet mgs "$LCTL get_param mgc.MGC${mgs_nid}.import" |
+		grep -cF "$failover_nid")
+	(( count > 0 )) ||
+		error "MGC misses failover MGS nid"
+}
+run_test 160 "MGC updates failnodes from all participants"
+
 cleanup_200() {
 	local modopts=$1
 	stopall
-- 
1.8.3.1