Whamcloud - gitweb
LU-18973 mgc: account failovers from all sources 76/59076/9
author Mikhail Pershin <mpershin@whamcloud.com>
Sat, 3 May 2025 14:52:19 +0000 (17:52 +0300)
committer Oleg Drokin <green@whamcloud.com>
Tue, 17 Jun 2025 02:49:23 +0000 (02:49 +0000)
Once set up initially, the MGC does not update its import
failovers from other mounts. That causes problems with the
MGC on an MGS node - it is always set up with only the @lo
interface, so if the MGS fails over to another node, all
targets/clients on the primary node are unable to find the
MGS, because the MGC has only the @lo peer.

The patch reworks the lustre_start_mgc() code to account for
all failover peers from each user of that MGC. It adds new
failover NIDs even if the MGC already exists.

The patch also reorganizes how peers are identified.
It uses the 'Primary NID' string as the peer UUID instead of
naming it 'MGC<PrimaryNID>_##', so the same NIDs don't
produce new mappings and don't pollute the import with
duplicate connections.

That also makes LCFG_DEL_UUID obsolete, because
lustre_stop_mgc() was its last user.

Signed-off-by: Mikhail Pershin <mpershin@whamcloud.com>
Change-Id: Icea5b74a16972e8a5f2737257086074630e652a8
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/59076
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Sebastien Buisson <sbuisson@ddn.com>
Reviewed-by: Marc Vef <mvef@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/obdclass/obd_mount.c
lustre/tests/conf-sanity.sh

index 1165b5d..35030fc 100644 (file)
@@ -199,44 +199,58 @@ SERVER_ONLY_EXPORT_SYMBOL(lustre_start_simple);
 
 static DEFINE_MUTEX(mgc_start_lock);
 
-/* 9 for '_%x' (INT_MAX as hex is 8 chars - '7FFFFFFF') and 1 for '\0' */
-#define NIDUUID_SUFFIX_MAX_LEN 10
-static inline int mgc_niduuid_create(char **niduuid, char *nidstr)
+/**
+ * Parse MGS failover nodes from provided NID list and add
+ * them to existing MGC import
+ */
+static bool lustre_add_mgc_failnodes(struct obd_device *obd, char *ptr)
 {
-       size_t niduuid_len = strlen(nidstr) + strlen(LUSTRE_MGC_OBDNAME) +
-                            NIDUUID_SUFFIX_MAX_LEN;
-
-       LASSERT(niduuid);
+       struct obd_import *imp = obd->u.cli.cl_import;
+       char node[LNET_NIDSTR_SIZE];
+       struct lnet_nid nid;
+       int rc;
+       bool large_nids = false;
 
-       /* See comment in niduuid_create() */
-       if (niduuid_len > UUID_MAX) {
-               nidstr += niduuid_len - UUID_MAX;
-               niduuid_len = strlen(LUSTRE_MGC_OBDNAME) +
-                             strlen(nidstr) + NIDUUID_SUFFIX_MAX_LEN;
-       }
+       LASSERT(imp);
 
-       OBD_ALLOC(*niduuid, niduuid_len);
-       if (!*niduuid)
-               return -ENOMEM;
+       /* Add any failover MGS NIDs */
+       while (ptr) {
+               int count = 0;
 
-       snprintf(*niduuid, niduuid_len, "%s%s", LUSTRE_MGC_OBDNAME, nidstr);
-       return 0;
-}
+               while (class_parse_nid_quiet(ptr, &nid, &ptr) == 0) {
+                       large_nids |= !nid_is_nid4(&nid);
 
-static inline void mgc_niduuid_destroy(char **niduuid)
-{
-       if (*niduuid) {
-               char *tmp = strchr(*niduuid, '_');
+                       /* New failover node */
+                       if (!count) /* construct node UUID from primary NID */
+                               libcfs_nidstr_r(&nid, node, LNET_NIDSTR_SIZE);
 
-               /* If the "_%x" suffix hasn't been added yet then the size
-                * calculation below should still be correct
-                */
-               if (tmp)
-                       *tmp = '\0';
-
-               OBD_FREE(*niduuid, strlen(*niduuid) + NIDUUID_SUFFIX_MAX_LEN);
+                       rc = class_add_uuid(node, &nid);
+                       if (rc) {
+                               libcfs_nidstr_r(&nid, node, LNET_NIDSTR_SIZE);
+                               CWARN("%s: can't add failover NID %s, rc = %d\n",
+                                     obd->obd_name, node, rc);
+                       } else {
+                               count++;
+                       }
+                       if (*ptr == ':')
+                               break;
+               }
+               /* if new peer mapping was created */
+               if (count > 0) {
+                       struct obd_uuid uuid;
+
+                       obd_str2uuid(&uuid, node);
+                       rc = obd_add_conn(imp, &uuid, 0);
+                       if (rc)
+                               CWARN("%s: can't add failover peer %s, rc = %d\n",
+                                     obd->obd_name, node, rc);
+               } else {
+                       /* at ":/fsname" */
+                       break;
+               }
        }
-       *niduuid = NULL;
+
+       return large_nids;
 }
 
 /**
@@ -256,10 +270,10 @@ int lustre_start_mgc(struct super_block *sb)
        uuid_t uuidc;
        struct lnet_nid nid;
        char nidstr[LNET_NIDSTR_SIZE];
-       char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL;
+       char *mgcname = NULL, *mgssec = NULL;
        bool large_nids = false;
-       char *ptr, *niduuid_suffix;
-       int rc = 0, i = 0, j;
+       char *ptr;
+       int rc = 0, i = 0;
        size_t len;
 
        ENTRY;
@@ -306,8 +320,7 @@ int lustre_start_mgc(struct super_block *sb)
        libcfs_nidstr_r(&nid, nidstr, sizeof(nidstr));
        len = strlen(LUSTRE_MGC_OBDNAME) + strlen(nidstr) + 1;
        OBD_ALLOC(mgcname, len);
-       rc = mgc_niduuid_create(&niduuid, nidstr);
-       if (rc || mgcname == NULL)
+       if (!mgcname)
                GOTO(out_free, rc = -ENOMEM);
 
        snprintf(mgcname, len, "%s%s", LUSTRE_MGC_OBDNAME, nidstr);
@@ -357,6 +370,8 @@ int lustre_start_mgc(struct super_block *sb)
                        }
                }
 
+               lustre_add_mgc_failnodes(obd, ptr);
+
                recov_bk = 0;
                /*
                 * If we are restarting the MGS, don't try to keep the MGC's
@@ -386,9 +401,8 @@ int lustre_start_mgc(struct super_block *sb)
 
        /* Add the primary NIDs for the MGS */
        i = 0;
-       niduuid_suffix = niduuid + strlen(niduuid);
-       snprintf(niduuid_suffix, NIDUUID_SUFFIX_MAX_LEN, "_%x", i);
        if (IS_SERVER(lsi)) {
+               /* All mgsnode are listed in lmd_mgs at this moment */
                ptr = lsi->lsi_lmd->lmd_mgs;
                CDEBUG(D_MOUNT, "mgs NIDs %s.\n", ptr);
                if (IS_MGS(lsi)) {
@@ -397,16 +411,11 @@ int lustre_start_mgc(struct super_block *sb)
 
                        while ((rc = LNetGetId(i++, &id, true)) != -ENOENT) {
                                rc = do_lcfg_nid(mgcname, &id.nid,
-                                                LCFG_ADD_UUID,
-                                                niduuid);
+                                               LCFG_ADD_UUID, nidstr);
                        }
                } else {
-                       /* Use mgsnode= nids */
-                       /* mount -o mgsnode=nid */
-                       if (lsi->lsi_lmd->lmd_mgs) {
-                               ptr = lsi->lsi_lmd->lmd_mgs;
-                       } else if (class_find_param(ptr, PARAM_MGSNODE,
-                                                   &ptr) != 0) {
+                       /* Target must have at least one mgsnode */
+                       if (!ptr) {
                                CERROR("No MGS NIDs given.\n");
                                GOTO(out_free, rc = -EINVAL);
                        }
@@ -417,8 +426,7 @@ int lustre_start_mgc(struct super_block *sb)
                         */
                        while (class_parse_nid(ptr, &nid, &ptr) == 0) {
                                rc = do_lcfg_nid(mgcname, &nid,
-                                                LCFG_ADD_UUID,
-                                                niduuid);
+                                                LCFG_ADD_UUID, nidstr);
                                if (rc == 0)
                                        ++i;
                                /* Stop at the first failover NID */
@@ -430,8 +438,7 @@ int lustre_start_mgc(struct super_block *sb)
                /* Use NIDs from mount line: uml1,1@elan:uml2,2@elan:/lustre */
                ptr = lsi->lsi_lmd->lmd_dev;
                while (class_parse_nid(ptr, &nid, &ptr) == 0) {
-                       rc = do_lcfg_nid(mgcname, &nid, LCFG_ADD_UUID,
-                                        niduuid);
+                       rc = do_lcfg_nid(mgcname, &nid, LCFG_ADD_UUID, nidstr);
                        if (rc == 0)
                                ++i;
                        /* Stop at the first failover NID */
@@ -443,7 +450,6 @@ int lustre_start_mgc(struct super_block *sb)
                CERROR("No valid MGS NIDs found.\n");
                GOTO(out_free, rc = -EINVAL);
        }
-       lsi->lsi_lmd->lmd_mgs_failnodes = 1;
 
        /* Random uuid for MGC allows easier reconnects */
        OBD_ALLOC_PTR(uuid);
@@ -456,46 +462,18 @@ int lustre_start_mgc(struct super_block *sb)
        /* Start the MGC */
        rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME,
                                 (char *)uuid->uuid, LUSTRE_MGS_OBDNAME,
-                                niduuid, NULL, lsi->lsi_lmd->lmd_nidnet);
+                                nidstr, NULL, lsi->lsi_lmd->lmd_nidnet);
        if (rc)
                GOTO(out_free, rc);
 
-       /* Add any failover MGS NIDs */
-       i = 1;
-       while (ptr && ((*ptr == ':' ||
-              class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0))) {
-               /* New failover node */
-               snprintf(niduuid_suffix, NIDUUID_SUFFIX_MAX_LEN, "_%x", i);
-               j = 0;
-               while (class_parse_nid_quiet(ptr, &nid, &ptr) == 0) {
-                       if (!nid_is_nid4(&nid))
-                               large_nids = true;
-
-                       rc = do_lcfg_nid(mgcname, &nid, LCFG_ADD_UUID,
-                                        niduuid);
-                       if (rc == 0)
-                               ++j;
-                       if (*ptr == ':')
-                               break;
-               }
-               if (j > 0) {
-                       rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN,
-                                    niduuid, NULL, NULL, NULL);
-                       if (rc == 0)
-                               ++i;
-               } else {
-                       /* at ":/fsname" */
-                       break;
-               }
-       }
-       lsi->lsi_lmd->lmd_mgs_failnodes = i;
-
        obd = class_name2obd(mgcname);
        if (!obd) {
                CERROR("Can't find mgcobd %s\n", mgcname);
                GOTO(out_free, rc = -ENOTCONN);
        }
 
+       large_nids = lustre_add_mgc_failnodes(obd, ptr);
+
        rc = obd_set_info_async(NULL, obd->obd_self_export,
                                strlen(KEY_MGSSEC), KEY_MGSSEC,
                                strlen(mgssec), mgssec, NULL);
@@ -551,7 +529,6 @@ out_free:
        OBD_FREE_PTR(uuid);
        OBD_FREE_PTR(data);
        OBD_FREE(mgcname, len);
-       mgc_niduuid_destroy(&niduuid);
 
        RETURN(rc);
 }
@@ -561,9 +538,7 @@ SERVER_ONLY int lustre_stop_mgc(struct super_block *sb)
 {
        struct lustre_sb_info *lsi = s2lsi(sb);
        struct obd_device *obd;
-       char *niduuid = NULL, *niduuid_suffix;
-       char nidstr[LNET_NIDSTR_SIZE];
-       int i, rc = 0;
+       int rc = 0;
 
        ENTRY;
 
@@ -574,16 +549,6 @@ SERVER_ONLY int lustre_stop_mgc(struct super_block *sb)
                RETURN(-ENOENT);
        lsi->lsi_mgc = NULL;
 
-       /* Reconstruct the NID uuid from the obd_name */
-       strscpy(nidstr, &obd->obd_name[0] + strlen(LUSTRE_MGC_OBDNAME),
-               sizeof(nidstr));
-
-       rc = mgc_niduuid_create(&niduuid, nidstr);
-       if (rc)
-               RETURN(-ENOMEM);
-
-       niduuid_suffix = niduuid + strlen(niduuid);
-
        mutex_lock(&mgc_start_lock);
        LASSERT(atomic_read(&obd->u.cli.cl_mgc_refcount) > 0);
        if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) {
@@ -616,20 +581,10 @@ SERVER_ONLY int lustre_stop_mgc(struct super_block *sb)
        if (rc)
                GOTO(out, rc);
 
-       for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) {
-               snprintf(niduuid_suffix, NIDUUID_SUFFIX_MAX_LEN, "_%x", i);
-               rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID,
-                            niduuid, NULL, NULL, NULL);
-               if (rc)
-                       CERROR("del MDC UUID %s failed: rc = %d\n",
-                              niduuid, rc);
-       }
 out:
        /* class_import_put will get rid of the additional connections */
        mutex_unlock(&mgc_start_lock);
 
-       mgc_niduuid_destroy(&niduuid);
-
        RETURN(rc);
 }
 SERVER_ONLY_EXPORT_SYMBOL(lustre_stop_mgc);
index 7699250..2e8b86d 100755 (executable)
@@ -12052,6 +12052,37 @@ test_155() {
 }
 run_test 155 "gap in seq allocation from ofd after restarting"
 
+test_160() {
+       ((OST1_VERSION >= $(version_code 2.16.55) )) ||
+               skip "need OST >= 2.16.55 to have MGC with all failovers"
+
+       stopall
+       reformat
+
+       local mgs_nid=$(do_facet mgs $LCTL list_nids | head -1)
+       local failover_nid="192.168.252.160@${NETTYPE}"
+       local mgs_nodes=$mgs_nid:$failover_nid
+       local tmp_mnt="$TMP/lmount"
+       local count;
+
+       start_mgsmds
+       start_ost
+
+       stack_trap "cleanup; reformat"
+
+       do_facet mgs "mkdir -p $tmp_mnt"
+       do_facet mgs "$MOUNT_CMD $mgs_nodes:/$FSNAME $tmp_mnt" ||
+               error "Fail to mount local client on MGS"
+       do_facet mgs "umount $tmp_mnt"
+
+       do_facet mgs "$LCTL get_param mgc.MGC${mgs_nid}.import"
+       count=$(do_facet mgs "$LCTL get_param mgc.MGC${mgs_nid}.import" |
+               grep -c "$failover_nid")
+       (( count > 0 )) ||
+               error "MGC misses failover MGS nid"
+}
+run_test 160 "MGC updates failnodes from all participants"
+
 cleanup_200() {
        local modopts=$1
        stopall