From 87afd2d7c12289a85254580d5edca951335895e4 Mon Sep 17 00:00:00 2001 From: nathan Date: Fri, 2 Jun 2006 19:24:51 +0000 Subject: [PATCH] Branch b1_5 b=10586 Removed the few bits of code assuming a single MGC. Now, name MGCs with the first NID of the MGS they point to; start a new MGC if the name doesn't exist. This allows maximal re-use of the MGC (all servers/clients pointing to the same MGS), but also allows connects to new MGSs as needed. --- lustre/include/obd_class.h | 2 +- lustre/ldlm/ldlm_lib.c | 3 +- lustre/mgc/mgc_request.c | 31 +++++------ lustre/obdclass/obd_mount.c | 129 ++++++++++++++++++++++++++++++-------------- lustre/ptlrpc/import.c | 9 +++- lustre/tests/insanity.sh | 6 +-- 6 files changed, 120 insertions(+), 60 deletions(-) diff --git a/lustre/include/obd_class.h b/lustre/include/obd_class.h index 05110b7..d78cb10 100644 --- a/lustre/include/obd_class.h +++ b/lustre/include/obd_class.h @@ -99,7 +99,6 @@ void class_decref(struct obd_device *obd); #define CFG_F_COMPAT146 0x08 /* Using old-style log */ #define CFG_F_EXCLUDE 0x10 /* OST exclusion list */ - /* Passed as data param to class_config_parse_llog */ struct config_llog_instance { char * cfg_instance; @@ -120,6 +119,7 @@ struct config_llog_data { struct config_llog_instance cld_cfg; struct list_head cld_list_chain; atomic_t cld_refcount; + struct obd_export *cld_mgcexp; unsigned int cld_stopping:1; }; diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 614a91c..a839917 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -444,7 +444,8 @@ int client_disconnect_export(struct obd_export *exp) if (obd->obd_namespace != NULL) { /* obd_no_recov == local only */ ldlm_cli_cancel_unused(obd->obd_namespace, NULL, - obd->obd_no_recov, NULL); + obd->obd_no_recov ? LDLM_FL_LOCAL_ONLY:0, + NULL); ldlm_namespace_free(obd->obd_namespace, obd->obd_no_recov); obd->obd_namespace = NULL; } diff --git a/lustre/mgc/mgc_request.c b/lustre/mgc/mgc_request.c index f54011e..d3d922c 100644 --- a/lustre/mgc/mgc_request.c +++ b/lustre/mgc/mgc_request.c @@ -45,8 +45,6 @@ #include #include -/* There's only 1 MGC on a node. Anybody using this must have a ref lock */ -static struct obd_device *the_mgc; int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id) { @@ -105,8 +103,7 @@ static void config_log_put(struct config_llog_data *cld) atomic_read(&cld->cld_refcount)); if (atomic_dec_and_test(&cld->cld_refcount)) { CDEBUG(D_MGC, "dropping config log %s\n", cld->cld_logname); - LASSERT(the_mgc); - class_export_put(the_mgc->obd_self_export); + class_export_put(cld->cld_mgcexp); OBD_FREE(cld->cld_logname, strlen(cld->cld_logname) + 1); if (cld->cld_cfg.cfg_instance != NULL) OBD_FREE(cld->cld_cfg.cfg_instance, @@ -162,6 +159,7 @@ static int config_log_add(char *logname, struct config_llog_instance *cfg, struct super_block *sb) { struct config_llog_data *cld; + struct lustre_sb_info *lsi = s2lsi(sb); int rc; ENTRY; @@ -181,7 +179,10 @@ static int config_log_add(char *logname, struct config_llog_instance *cfg, cld->cld_cfg.cfg_flags = 0; cld->cld_cfg.cfg_sb = sb; atomic_set(&cld->cld_refcount, 1); - class_export_get(the_mgc->obd_self_export); + + /* Keep the mgc around until we are done */ + cld->cld_mgcexp = class_export_get(lsi->lsi_mgc->obd_self_export); + if (cfg->cfg_instance != NULL) { OBD_ALLOC(cld->cld_cfg.cfg_instance, strlen(cfg->cfg_instance) + 1); @@ -223,7 +224,7 @@ static int config_log_end(char *logname, struct config_llog_instance *cfg) RETURN(rc); } -/* Failsafe */ +/* Failsafe FIXME remove this */ static void config_log_end_all(void) { struct list_head *tmp, *n; @@ -233,7 +234,7 @@ static void config_log_end_all(void) spin_lock(&config_list_lock); list_for_each_safe(tmp, n, &config_llog_list) { cld = list_entry(tmp, struct config_llog_data, cld_list_chain); - CERROR("conflog failsafe %s\n", cld->cld_logname); + CERROR("\n\nconflog failsafe %s\n\n\n", cld->cld_logname); list_del(&cld->cld_list_chain); config_log_put(cld); } @@ -345,7 +346,9 @@ static int mgc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) case OBD_CLEANUP_EARLY: break; case OBD_CLEANUP_EXPORTS: - config_log_end_all(); + if (obd->obd_type->typ_refcnt <= 2) + /* Only for the last mgc */ + config_log_end_all(); break; case OBD_CLEANUP_SELF_EXP: rc = obd_llog_finish(obd, 0); @@ -366,11 +369,11 @@ static int mgc_cleanup(struct obd_device *obd) LASSERT(cli->cl_mgc_vfsmnt == NULL); - the_mgc = NULL; - /* COMPAT_146 - old config logs may have added profiles we don't know about */ - class_del_profiles(); + if (obd->obd_type->typ_refcnt <= 2) + /* Only for the last mgc */ + class_del_profiles(); ptlrpcd_decref(); @@ -395,7 +398,6 @@ static int mgc_setup(struct obd_device *obd, obd_count len, void *buf) GOTO(err_cleanup, rc); } - the_mgc = obd; RETURN(rc); err_cleanup: @@ -440,8 +442,6 @@ static int mgc_async_requeue(void *data) lwi = LWI_TIMEOUT(3 * HZ + (ll_rand() & 0xff), NULL, NULL); l_wait_event(waitq, 0, &lwi); - /* We're holding a lock on the mgc, but not necessarily the lsi */ - LASSERT(the_mgc); #if 0 /* Re-send server info every time, in case MGS needs to regen its logs (for write_conf). Do we need this? It's extra RPCs for @@ -450,7 +450,8 @@ static int mgc_async_requeue(void *data) /* Unsafe - we don't know that the lsi hasn't been destroyed */ server_register_target(cld->cld_cfg.cfg_sb); #endif - rc = mgc_process_log(the_mgc, cld); + + rc = mgc_process_log(cld->cld_mgcexp->exp_obd, cld); out: /* Whether we enqueued again or not in mgc_process_log, we're done with the ref from the old mgc_blocking_ast */ diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c index 9a3495b..cbe6d85 100644 --- a/lustre/obdclass/obd_mount.c +++ b/lustre/obdclass/obd_mount.c @@ -25,7 +25,7 @@ #define DEBUG_SUBSYSTEM S_MGMT -#define D_MOUNT D_SUPER|D_CONFIG /*|D_WARNING */ +#define D_MOUNT D_SUPER|D_CONFIG /*|D_WARNING */ #define PRINT_CMD LCONSOLE #define PRINT_MASK D_SUPER @@ -201,7 +201,7 @@ static int server_deregister_mount(char *name) RETURN(0); } -/* obd's look up a registered mount using their name. This is just +/* obd's look up a registered mount using their obdname. This is just for initial obd setup to find the mount struct. It should not be called every time you want to mntget. */ struct lustre_mount_info *server_get_mount(char *name) @@ -589,22 +589,53 @@ static int lustre_start_mgc(struct super_block *sb) struct obd_uuid *uuid; class_uuid_t uuidc; lnet_nid_t nid; - char niduuid[10]; + char *mgcname, *niduuid; char *ptr; int recov_bk = 0; - int rc = 0, i = 0, j; + int rc = 0, i = 0, j, len; ENTRY; LASSERT(lsi->lsi_lmd); - obd = class_name2obd(LUSTRE_MGC_OBDNAME); + /* Find the first non-lo MGS nid for our MGC name */ + if (lsi->lsi_flags & LSI_SERVER) { + ptr = lsi->lsi_ldd->ldd_params; + /* Use mgsnode= nids */ + if ((class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0) && + (class_parse_nid(ptr, &nid, &ptr) == 0)) { + i++; + } else if (IS_MGS(lsi->lsi_ldd)) { + lnet_process_id_t id; + while ((rc = LNetGetId(i++, &id)) != -ENOENT) { + if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) + continue; + nid = id.nid; + i++; + break; + } + } + } else { /* client */ + /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */ + ptr = lsi->lsi_lmd->lmd_dev; + if (class_parse_nid(ptr, &nid, &ptr) == 0) + i++; + } + if (i == 0) { + CERROR("No valid MGS nids found.\n"); + RETURN(-EINVAL); + } + + len = strlen(LUSTRE_MGC_OBDNAME) + strlen(libcfs_nid2str(nid)) + 1; + OBD_ALLOC(mgcname, len); + OBD_ALLOC(niduuid, len + 2); + if (!mgcname || !niduuid) + GOTO(out_free, rc = -ENOMEM); + sprintf(mgcname, "%s%s", LUSTRE_MGC_OBDNAME, libcfs_nid2str(nid)); + + obd = class_name2obd(mgcname); if (obd) { + /* Re-using an existing MGC */ atomic_inc(&obd->u.cli.cl_mgc_refcount); - /* There's only one MGC, but users could give different - MGS nids on the mount line. So now do we add new MGS uuids - or not? Since there's only one MGS per site, the MGS uuids - _should_ all be the same. Maybe check here? - */ /* If we are restarting the MGS, don't try to keep the MGC's old connection, or registration will fail. */ @@ -618,7 +649,7 @@ static int lustre_start_mgc(struct super_block *sb) (using its local copy of the log), but we do want to connect if at all possible. */ recov_bk++; - CDEBUG(D_MOUNT, "Set MGS reconnect %d\n", recov_bk); + CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname,recov_bk); rc = obd_set_info_async(obd->obd_self_export, strlen(KEY_INIT_RECOV_BACKUP), KEY_INIT_RECOV_BACKUP, @@ -626,27 +657,29 @@ static int lustre_start_mgc(struct super_block *sb) GOTO(out, rc = 0); } - CDEBUG(D_MOUNT, "Start MGC '%s'\n", LUSTRE_MGC_OBDNAME); + CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname); /* Add the primary nids for the MGS */ + i = 0; + sprintf(niduuid, "%s%x", mgcname, i); if (lsi->lsi_flags & LSI_SERVER) { ptr = lsi->lsi_ldd->ldd_params; if (IS_MGS(lsi->lsi_ldd)) { /* Use local nids (including LO) */ lnet_process_id_t id; while ((rc = LNetGetId(i++, &id)) != -ENOENT) { - rc = do_lcfg(LUSTRE_MGC_OBDNAME, id.nid, - LCFG_ADD_UUID, "mgsnid0", 0,0,0); + rc = do_lcfg(mgcname, id.nid, + LCFG_ADD_UUID, niduuid, 0,0,0); } } else { /* Use mgsnode= nids */ if (class_find_param(ptr, PARAM_MGSNODE, &ptr) != 0) { CERROR("No MGS nids given.\n"); - RETURN(-EINVAL); + GOTO(out_free, rc = -EINVAL); } while (class_parse_nid(ptr, &nid, &ptr) == 0) { - rc = do_lcfg(LUSTRE_MGC_OBDNAME, nid, - LCFG_ADD_UUID, "mgsnid0", 0,0,0); + rc = do_lcfg(mgcname, nid, + LCFG_ADD_UUID, niduuid, 0,0,0); i++; } } @@ -654,8 +687,8 @@ static int lustre_start_mgc(struct super_block *sb) /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */ ptr = lsi->lsi_lmd->lmd_dev; while (class_parse_nid(ptr, &nid, &ptr) == 0) { - rc = do_lcfg(LUSTRE_MGC_OBDNAME, nid, - LCFG_ADD_UUID, "mgsnid0", 0,0,0); + rc = do_lcfg(mgcname, nid, + LCFG_ADD_UUID, niduuid, 0,0,0); i++; /* Stop at the first failover nid */ if (*ptr == ':') @@ -664,7 +697,7 @@ static int lustre_start_mgc(struct super_block *sb) } if (i == 0) { CERROR("No valid MGS nids found.\n"); - RETURN(-EINVAL); + GOTO(out_free, rc = -EINVAL); } lsi->lsi_lmd->lmd_mgs_failnodes = 1; @@ -674,29 +707,29 @@ static int lustre_start_mgc(struct super_block *sb) class_uuid_unparse(uuidc, uuid); /* Start the MGC */ - rc = lustre_start_simple(LUSTRE_MGC_OBDNAME, LUSTRE_MGC_NAME, + rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME, (char *)uuid->uuid, LUSTRE_MGS_OBDNAME, - "mgsnid0"); + niduuid); OBD_FREE_PTR(uuid); if (rc) - RETURN(rc); + GOTO(out_free, rc); /* Add any failover MGS nids */ i = 1; while ((*ptr == ':' || class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0)) { /* New failover node */ - sprintf(niduuid, "mgsnid%d", i); + sprintf(niduuid, "%s%x", mgcname, i); j = 0; while (class_parse_nid(ptr, &nid, &ptr) == 0) { j++; - rc = do_lcfg(LUSTRE_MGC_OBDNAME, nid, + rc = do_lcfg(mgcname, nid, LCFG_ADD_UUID, niduuid, 0,0,0); if (*ptr == ':') break; } if (j > 0) { - rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_ADD_CONN, + rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN, niduuid, 0, 0, 0); i++; } else { @@ -706,10 +739,10 @@ static int lustre_start_mgc(struct super_block *sb) } lsi->lsi_lmd->lmd_mgs_failnodes = i; - obd = class_name2obd(LUSTRE_MGC_OBDNAME); + obd = class_name2obd(mgcname); if (!obd) { - CERROR("Can't find mgcobd %s\n", LUSTRE_MGC_OBDNAME); - RETURN(-ENOTCONN); + CERROR("Can't find mgcobd %s\n", mgcname); + GOTO(out_free, rc = -ENOTCONN); } /* Try all connections, but only once. */ @@ -740,6 +773,11 @@ out: /* Keep the mgc info in the sb. Note that many lsi's can point to the same mgc.*/ lsi->lsi_mgc = obd; +out_free: + if (mgcname) + OBD_FREE(mgcname, len); + if (niduuid) + OBD_FREE(niduuid, len + 2); RETURN(rc); } @@ -747,8 +785,8 @@ static int lustre_stop_mgc(struct super_block *sb) { struct lustre_sb_info *lsi = s2lsi(sb); struct obd_device *obd; - char niduuid[10]; - int i, rc; + char *niduuid, *ptr = 0; + int i, rc, len; ENTRY; if (!lsi) @@ -776,20 +814,32 @@ static int lustre_stop_mgc(struct super_block *sb) if (obd->u.cli.cl_mgc_mgsexp) obd_disconnect(obd->u.cli.cl_mgc_mgsexp); + /* Save the obdname for cleaning the nid uuids */ + len = strlen(obd->obd_name) + 3; + OBD_ALLOC(niduuid, len); + if (niduuid) { + strcpy(niduuid, obd->obd_name); + ptr = niduuid + strlen(niduuid); + } + rc = class_manual_cleanup(obd); - if (rc) + if (rc) RETURN(rc); - + + /* Clean the nid uuids */ + if (!niduuid) + RETURN(-ENOMEM); for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) { - sprintf(niduuid, "mgsnid%d", i); + sprintf(ptr, "%x", i); rc = do_lcfg(obd->obd_name, 0, LCFG_DEL_UUID, niduuid, 0, 0, 0); if (rc) CERROR("del MDC UUID %s failed: rc = %d\n", niduuid, rc); } + OBD_FREE(niduuid, len); /* class_import_put will get rid of the additional connections */ - + RETURN(0); } @@ -1273,17 +1323,18 @@ static void server_wait_finished(struct vfsmount *mnt) { wait_queue_head_t waitq; struct l_wait_info lwi; - int retries = 10; + int retries = 120; init_waitqueue_head(&waitq); - while ((atomic_read(&mnt->mnt_count) > 1) && retries--) { + while ((atomic_read(&mnt->mnt_count) > 1) && (retries > 0)) { LCONSOLE_WARN("Mount still busy with %d refs, waiting for " "%d secs...\n", - atomic_read(&mnt->mnt_count), 2 * retries); + atomic_read(&mnt->mnt_count), retries); /* Wait for a bit */ - lwi = LWI_TIMEOUT(2 * HZ, NULL, NULL); + retries -= 5; + lwi = LWI_TIMEOUT(5 * HZ, NULL, NULL); l_wait_event(waitq, 0, &lwi); } if (atomic_read(&mnt->mnt_count) > 1) { diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 892e589..88f1f30 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -626,7 +626,14 @@ finish: " bits?\n"); exp = class_conn2export(&imp->imp_dlm_handle); - LASSERT(exp); + if (!exp) { + /* This could happen if export is cleaned during the + connect attempt */ + spin_unlock_irqrestore(&imp->imp_lock, flags); + CERROR("Missing export for %s\n", + imp->imp_obd->obd_name); + GOTO(out, rc = -ENODEV); + } exp->exp_connect_flags = ocd->ocd_connect_flags; class_export_put(exp); diff --git a/lustre/tests/insanity.sh b/lustre/tests/insanity.sh index 873f23d..0e512cc 100755 --- a/lustre/tests/insanity.sh +++ b/lustre/tests/insanity.sh @@ -184,11 +184,11 @@ test_0() { echo "Waiting for df pid: $DFPID" wait $DFPID || { echo "df returned $?" && return 1; } - facet_failover ost1 + facet_failover ost1 || return 4 echo "Waiting for df pid: $DFPID" wait $DFPID || { echo "df returned $?" && return 2; } - facet_failover ost2 + facet_failover ost2 || return 5 echo "Waiting for df pid: $DFPID" wait $DFPID || { echo "df returned $?" && return 3; } return 0 @@ -224,7 +224,7 @@ test_2() { echo "Reintegrating OST" reboot_facet ost1 wait_for ost1 - start_ost 1 + start_ost 1 || return 2 wait_for mds start mds $MDSDEV $MDS_MOUNT_OPTS || return $? -- 1.8.3.1