/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * * lustre/obdclass/obd_mount.c * Client/server mount routines * * Copyright (c) 2006 Cluster File Systems, Inc. * Author: Nathan Rutman * * This file is part of Lustre, http://www.lustre.org/ * * Lustre is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. * * Lustre is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Lustre; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #define DEBUG_SUBSYSTEM S_CLASS #define D_MOUNT D_SUPER|D_CONFIG /*|D_WARNING */ #define PRINT_CMD CDEBUG #define PRINT_MASK D_SUPER|D_CONFIG #include #include #include #include #include #include #include #include #include static int (*client_fill_super)(struct super_block *sb) = NULL; /*********** mount lookup *********/ DECLARE_MUTEX(lustre_mount_info_lock); struct list_head server_mount_info_list = LIST_HEAD_INIT(server_mount_info_list); static struct lustre_mount_info *server_find_mount(const char *name) { struct list_head *tmp; struct lustre_mount_info *lmi; ENTRY; list_for_each(tmp, &server_mount_info_list) { lmi = list_entry(tmp, struct lustre_mount_info, lmi_list_chain); if (strcmp(name, lmi->lmi_name) == 0) RETURN(lmi); } RETURN(NULL); } /* we must register an obd for a mount before we call the setup routine. *_setup will call lustre_get_mount to get the mnt struct by obd_name, since we can't pass the pointer to setup. 
*/ static int server_register_mount(const char *name, struct super_block *sb, struct vfsmount *mnt) { struct lustre_mount_info *lmi; char *name_cp; ENTRY; LASSERT(mnt); LASSERT(sb); OBD_ALLOC(lmi, sizeof(*lmi)); if (!lmi) RETURN(-ENOMEM); OBD_ALLOC(name_cp, strlen(name) + 1); if (!name_cp) { OBD_FREE(lmi, sizeof(*lmi)); RETURN(-ENOMEM); } strcpy(name_cp, name); down(&lustre_mount_info_lock); if (server_find_mount(name)) { up(&lustre_mount_info_lock); OBD_FREE(lmi, sizeof(*lmi)); OBD_FREE(name_cp, strlen(name) + 1); CERROR("Already registered %s\n", name); RETURN(-EEXIST); } lmi->lmi_name = name_cp; lmi->lmi_sb = sb; lmi->lmi_mnt = mnt; list_add(&lmi->lmi_list_chain, &server_mount_info_list); up(&lustre_mount_info_lock); CDEBUG(D_MOUNT, "reg_mnt %p from %s, vfscount=%d\n", lmi->lmi_mnt, name, atomic_read(&lmi->lmi_mnt->mnt_count)); RETURN(0); } /* when an obd no longer needs a mount */ static int server_deregister_mount(const char *name) { struct lustre_mount_info *lmi; ENTRY; down(&lustre_mount_info_lock); lmi = server_find_mount(name); if (!lmi) { up(&lustre_mount_info_lock); CERROR("%s not registered\n", name); RETURN(-ENOENT); } CDEBUG(D_MOUNT, "dereg_mnt %p from %s, vfscount=%d\n", lmi->lmi_mnt, name, atomic_read(&lmi->lmi_mnt->mnt_count)); OBD_FREE(lmi->lmi_name, strlen(lmi->lmi_name) + 1); list_del(&lmi->lmi_list_chain); OBD_FREE(lmi, sizeof(*lmi)); up(&lustre_mount_info_lock); RETURN(0); } /* obd's look up a registered mount using their obdname. This is just for initial obd setup to find the mount struct. It should not be called every time you want to mntget. 
*/ struct lustre_mount_info *server_get_mount(const char *name) { struct lustre_mount_info *lmi; struct lustre_sb_info *lsi; ENTRY; down(&lustre_mount_info_lock); lmi = server_find_mount(name); up(&lustre_mount_info_lock); if (!lmi) { CERROR("Can't find mount for %s\n", name); RETURN(NULL); } lsi = s2lsi(lmi->lmi_sb); mntget(lmi->lmi_mnt); atomic_inc(&lsi->lsi_mounts); CDEBUG(D_MOUNT, "get_mnt %p from %s, refs=%d, vfscount=%d\n", lmi->lmi_mnt, name, atomic_read(&lsi->lsi_mounts), atomic_read(&lmi->lmi_mnt->mnt_count)); RETURN(lmi); } /* * Used by mdt to get mount_info from obdname. * There are no blocking when using the mount_info. * Do not use server_get_mount for this purpose. */ struct lustre_mount_info *server_get_mount_2(const char *name) { struct lustre_mount_info *lmi; ENTRY; down(&lustre_mount_info_lock); lmi = server_find_mount(name); up(&lustre_mount_info_lock); if (!lmi) CERROR("Can't find mount for %s\n", name); RETURN(lmi); } static void unlock_mntput(struct vfsmount *mnt) { if (kernel_locked()) { unlock_kernel(); mntput(mnt); lock_kernel(); } else { mntput(mnt); } } static int lustre_put_lsi(struct super_block *sb); /* to be called from obd_cleanup methods */ int server_put_mount(const char *name, struct vfsmount *mnt) { struct lustre_mount_info *lmi; struct lustre_sb_info *lsi; int count = atomic_read(&mnt->mnt_count) - 1; ENTRY; /* This might be the last one, can't deref after this */ unlock_mntput(mnt); down(&lustre_mount_info_lock); lmi = server_find_mount(name); up(&lustre_mount_info_lock); if (!lmi) { CERROR("Can't find mount for %s\n", name); RETURN(-ENOENT); } lsi = s2lsi(lmi->lmi_sb); LASSERT(lmi->lmi_mnt == mnt); CDEBUG(D_MOUNT, "put_mnt %p from %s, refs=%d, vfscount=%d\n", lmi->lmi_mnt, name, atomic_read(&lsi->lsi_mounts), count); if (lustre_put_lsi(lmi->lmi_sb)) { CDEBUG(D_MOUNT, "Last put of mnt %p from %s, vfscount=%d\n", lmi->lmi_mnt, name, count); /* last mount is the One True Mount */ if (count > 1) CERROR("%s: mount busy, 
vfscount=%d!\n", name, count); } /* this obd should never need the mount again */ server_deregister_mount(name); RETURN(0); } /* Corresponding to server_get_mount_2 */ int server_put_mount_2(const char *name, struct vfsmount *mnt) { ENTRY; RETURN(0); } /******* mount helper utilities *********/ #if 0 static void ldd_print(struct lustre_disk_data *ldd) { PRINT_CMD(PRINT_MASK, " disk data:\n"); PRINT_CMD(PRINT_MASK, "server: %s\n", ldd->ldd_svname); PRINT_CMD(PRINT_MASK, "uuid: %s\n", (char *)ldd->ldd_uuid); PRINT_CMD(PRINT_MASK, "fs: %s\n", ldd->ldd_fsname); PRINT_CMD(PRINT_MASK, "index: %04x\n", ldd->ldd_svindex); PRINT_CMD(PRINT_MASK, "config: %d\n", ldd->ldd_config_ver); PRINT_CMD(PRINT_MASK, "flags: %#x\n", ldd->ldd_flags); PRINT_CMD(PRINT_MASK, "diskfs: %s\n", MT_STR(ldd)); PRINT_CMD(PRINT_MASK, "options: %s\n", ldd->ldd_mount_opts); PRINT_CMD(PRINT_MASK, "params: %s\n", ldd->ldd_params); PRINT_CMD(PRINT_MASK, "comment: %s\n", ldd->ldd_userdata); } #endif static int ldd_parse(struct lvfs_run_ctxt *mount_ctxt, struct lustre_disk_data *ldd) { struct lvfs_run_ctxt saved; struct file *file; loff_t off = 0; unsigned long len; int rc; ENTRY; push_ctxt(&saved, mount_ctxt, NULL); file = filp_open(MOUNT_DATA_FILE, O_RDONLY, 0644); if (IS_ERR(file)) { rc = PTR_ERR(file); CERROR("cannot open %s: rc = %d\n", MOUNT_DATA_FILE, rc); GOTO(out, rc); } len = i_size_read(file->f_dentry->d_inode); CDEBUG(D_MOUNT, "Have %s, size %lu\n", MOUNT_DATA_FILE, len); if (len != sizeof(*ldd)) { CERROR("disk data size does not match: see %lu expect "LPSZ"\n", len, sizeof(*ldd)); GOTO(out_close, rc = -EINVAL); } rc = lustre_fread(file, ldd, len, &off); if (rc != len) { CERROR("error reading %s: read %d of %lu\n", MOUNT_DATA_FILE, rc, len); GOTO(out_close, rc = -EINVAL); } rc = 0; if (ldd->ldd_magic != LDD_MAGIC) { /* FIXME add swabbing support */ CERROR("Bad magic in %s: %x!=%x\n", MOUNT_DATA_FILE, ldd->ldd_magic, LDD_MAGIC); GOTO(out_close, rc = -EINVAL); } if (ldd->ldd_feature_incompat & 
~LDD_INCOMPAT_SUPP) { CERROR("%s: unsupported incompat filesystem feature(s) %x\n", ldd->ldd_svname, ldd->ldd_feature_incompat & ~LDD_INCOMPAT_SUPP); GOTO(out_close, rc = -EINVAL); } if (ldd->ldd_feature_rocompat & ~LDD_ROCOMPAT_SUPP) { CERROR("%s: unsupported read-only filesystem feature(s) %x\n", ldd->ldd_svname, ldd->ldd_feature_rocompat & ~LDD_ROCOMPAT_SUPP); /* Do something like remount filesystem read-only */ GOTO(out_close, rc = -EINVAL); } out_close: filp_close(file, 0); out: pop_ctxt(&saved, mount_ctxt, NULL); RETURN(rc); } static int ldd_write(struct lvfs_run_ctxt *mount_ctxt, struct lustre_disk_data *ldd) { struct lvfs_run_ctxt saved; struct file *file; loff_t off = 0; unsigned long len = sizeof(struct lustre_disk_data); int rc = 0; ENTRY; LASSERT(ldd->ldd_magic == LDD_MAGIC); ldd->ldd_config_ver++; push_ctxt(&saved, mount_ctxt, NULL); file = filp_open(MOUNT_DATA_FILE, O_RDWR, 0644); if (IS_ERR(file)) { rc = PTR_ERR(file); CERROR("cannot open %s: rc = %d\n", MOUNT_DATA_FILE, rc); GOTO(out, rc); } rc = lustre_fwrite(file, ldd, len, &off); if (rc != len) { CERROR("error writing %s: read %d of %lu\n", MOUNT_DATA_FILE, rc, len); GOTO(out_close, rc = -EINVAL); } rc = 0; out_close: filp_close(file, 0); out: pop_ctxt(&saved, mount_ctxt, NULL); RETURN(rc); } /**************** config llog ********************/ /* Get a config log from the MGS and process it. This func is called for both clients and servers. Continue to process new statements appended to the logs (whenever the config lock is revoked) until lustre_end_log is called. 
*/ int lustre_process_log(struct super_block *sb, char *logname, struct config_llog_instance *cfg) { struct lustre_cfg *lcfg; struct lustre_cfg_bufs bufs; struct lustre_sb_info *lsi = s2lsi(sb); struct obd_device *mgc = lsi->lsi_mgc; int rc; ENTRY; LASSERT(mgc); LASSERT(cfg); /* mgc_process_config */ lustre_cfg_bufs_reset(&bufs, mgc->obd_name); lustre_cfg_bufs_set_string(&bufs, 1, logname); lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg)); lustre_cfg_bufs_set(&bufs, 3, &sb, sizeof(sb)); lcfg = lustre_cfg_new(LCFG_LOG_START, &bufs); rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); lustre_cfg_free(lcfg); if (rc == -EINVAL) LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s'" "failed from the MGS (%d). Make sure this " "client and the MGS are running compatible " "versions of Lustre.\n", mgc->obd_name, logname, rc); if (rc) LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' " "failed (%d). This may be the result of " "communication errors between this node and " "the MGS, a bad configuration, or other " "errors. 
See the syslog for more " "information.\n", mgc->obd_name, logname, rc); /* class_obd_list(); */ RETURN(rc); } /* Stop watching this config log for updates */ int lustre_end_log(struct super_block *sb, char *logname, struct config_llog_instance *cfg) { struct lustre_cfg *lcfg; struct lustre_cfg_bufs bufs; struct lustre_sb_info *lsi = s2lsi(sb); struct obd_device *mgc = lsi->lsi_mgc; int rc; ENTRY; if (!mgc) RETURN(-ENOENT); /* mgc_process_config */ lustre_cfg_bufs_reset(&bufs, mgc->obd_name); lustre_cfg_bufs_set_string(&bufs, 1, logname); if (cfg) lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg)); lcfg = lustre_cfg_new(LCFG_LOG_END, &bufs); rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); lustre_cfg_free(lcfg); RETURN(rc); } /**************** obd start *******************/ int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd, char *s1, char *s2, char *s3, char *s4) { struct lustre_cfg_bufs bufs; struct lustre_cfg * lcfg = NULL; int rc; CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname, cmd, s1, s2, s3, s4); lustre_cfg_bufs_reset(&bufs, cfgname); if (s1) lustre_cfg_bufs_set_string(&bufs, 1, s1); if (s2) lustre_cfg_bufs_set_string(&bufs, 2, s2); if (s3) lustre_cfg_bufs_set_string(&bufs, 3, s3); if (s4) lustre_cfg_bufs_set_string(&bufs, 4, s4); lcfg = lustre_cfg_new(cmd, &bufs); lcfg->lcfg_nid = nid; rc = class_process_config(lcfg); lustre_cfg_free(lcfg); return(rc); } static int lustre_start_simple(char *obdname, char *type, char *uuid, char *s1, char *s2) { int rc; CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type); rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, 0, 0); if (rc) { CERROR("%s attach error %d\n", obdname, rc); return(rc); } rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, 0, 0); if (rc) { CERROR("%s setup error %d\n", obdname, rc); do_lcfg(obdname, 0, LCFG_DETACH, 0, 0, 0, 0); } return rc; } /* Set up a MGS to serve startup logs */ static int server_start_mgs(struct super_block *sb) { struct lustre_sb_info *lsi = s2lsi(sb); struct vfsmount 
*mnt = lsi->lsi_srv_mnt; struct lustre_mount_info *lmi; int rc = 0; ENTRY; LASSERT(mnt); /* It is impossible to have more than 1 MGS per node, since MGC wouldn't know which to connect to */ lmi = server_find_mount(LUSTRE_MGS_OBDNAME); if (lmi) { lsi = s2lsi(lmi->lmi_sb); LCONSOLE_ERROR_MSG(0x15d, "The MGS service was already started" " from server %s\n", lsi->lsi_ldd->ldd_svname); RETURN(-EALREADY); } CDEBUG(D_CONFIG, "Start MGS service %s\n", LUSTRE_MGS_OBDNAME); rc = server_register_mount(LUSTRE_MGS_OBDNAME, sb, mnt); if (!rc && ((rc = lustre_start_simple(LUSTRE_MGS_OBDNAME, LUSTRE_MGS_NAME, LUSTRE_MGS_OBDNAME, 0, 0)))) server_deregister_mount(LUSTRE_MGS_OBDNAME); if (rc) LCONSOLE_ERROR_MSG(0x15e, "Failed to start MGS '%s' (%d). " "Is the 'mgs' module loaded?\n", LUSTRE_MGS_OBDNAME, rc); RETURN(rc); } static int server_stop_mgs(struct super_block *sb) { struct obd_device *obd; int rc; ENTRY; CDEBUG(D_MOUNT, "Stop MGS service %s\n", LUSTRE_MGS_OBDNAME); /* There better be only one MGS */ obd = class_name2obd(LUSTRE_MGS_OBDNAME); if (!obd) { CDEBUG(D_CONFIG, "mgs %s not running\n", LUSTRE_MGS_OBDNAME); RETURN(-EALREADY); } /* The MGS should always stop when we say so */ obd->obd_force = 1; rc = class_manual_cleanup(obd); RETURN(rc); } DECLARE_MUTEX(mgc_start_lock); /* Set up a mgcobd to process startup logs */ static int lustre_start_mgc(struct super_block *sb) { struct lustre_handle mgc_conn = {0, }; struct obd_connect_data ocd = { 0 }; struct lustre_sb_info *lsi = s2lsi(sb); struct obd_device *obd; struct obd_export *exp; struct obd_uuid *uuid; class_uuid_t uuidc; lnet_nid_t nid; char *mgcname, *niduuid; char *ptr; int recov_bk; int rc = 0, i = 0, j, len; ENTRY; LASSERT(lsi->lsi_lmd); /* Find the first non-lo MGS nid for our MGC name */ if (lsi->lsi_flags & LSI_SERVER) { ptr = lsi->lsi_ldd->ldd_params; /* Use mgsnode= nids */ if ((class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0) && (class_parse_nid(ptr, &nid, &ptr) == 0)) { i++; } else if (IS_MGS(lsi->lsi_ldd)) { 
lnet_process_id_t id; while ((rc = LNetGetId(i++, &id)) != -ENOENT) { if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) continue; nid = id.nid; i++; break; } } } else { /* client */ /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */ ptr = lsi->lsi_lmd->lmd_dev; if (class_parse_nid(ptr, &nid, &ptr) == 0) i++; } if (i == 0) { CERROR("No valid MGS nids found.\n"); RETURN(-EINVAL); } len = strlen(LUSTRE_MGC_OBDNAME) + strlen(libcfs_nid2str(nid)) + 1; OBD_ALLOC(mgcname, len); OBD_ALLOC(niduuid, len + 2); if (!mgcname || !niduuid) GOTO(out_free, rc = -ENOMEM); sprintf(mgcname, "%s%s", LUSTRE_MGC_OBDNAME, libcfs_nid2str(nid)); mutex_down(&mgc_start_lock); obd = class_name2obd(mgcname); if (obd) { /* Re-using an existing MGC */ atomic_inc(&obd->u.cli.cl_mgc_refcount); recov_bk = 0; /* If we are restarting the MGS, don't try to keep the MGC's old connection, or registration will fail. */ if ((lsi->lsi_flags & LSI_SERVER) && IS_MGS(lsi->lsi_ldd)) { CDEBUG(D_MOUNT, "New MGS with live MGC\n"); recov_bk = 1; } /* Try all connections, but only once (again). We don't want to block another target from starting (using its local copy of the log), but we do want to connect if at all possible. 
*/ recov_bk++; CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname,recov_bk); rc = obd_set_info_async(obd->obd_self_export, strlen(KEY_INIT_RECOV_BACKUP), KEY_INIT_RECOV_BACKUP, sizeof(recov_bk), &recov_bk, NULL); GOTO(out, rc = 0); } CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname); /* Add the primary nids for the MGS */ i = 0; sprintf(niduuid, "%s_%x", mgcname, i); if (lsi->lsi_flags & LSI_SERVER) { ptr = lsi->lsi_ldd->ldd_params; if (IS_MGS(lsi->lsi_ldd)) { /* Use local nids (including LO) */ lnet_process_id_t id; while ((rc = LNetGetId(i++, &id)) != -ENOENT) { rc = do_lcfg(mgcname, id.nid, LCFG_ADD_UUID, niduuid, 0,0,0); } } else { /* Use mgsnode= nids */ if (class_find_param(ptr, PARAM_MGSNODE, &ptr) != 0) { CERROR("No MGS nids given.\n"); GOTO(out_free, rc = -EINVAL); } while (class_parse_nid(ptr, &nid, &ptr) == 0) { rc = do_lcfg(mgcname, nid, LCFG_ADD_UUID, niduuid, 0,0,0); i++; } } } else { /* client */ /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */ ptr = lsi->lsi_lmd->lmd_dev; while (class_parse_nid(ptr, &nid, &ptr) == 0) { rc = do_lcfg(mgcname, nid, LCFG_ADD_UUID, niduuid, 0,0,0); i++; /* Stop at the first failover nid */ if (*ptr == ':') break; } } if (i == 0) { CERROR("No valid MGS nids found.\n"); GOTO(out_free, rc = -EINVAL); } lsi->lsi_lmd->lmd_mgs_failnodes = 1; /* Random uuid for MGC allows easier reconnects */ OBD_ALLOC_PTR(uuid); ll_generate_random_uuid(uuidc); class_uuid_unparse(uuidc, uuid); /* Start the MGC */ rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME, (char *)uuid->uuid, LUSTRE_MGS_OBDNAME, niduuid); OBD_FREE_PTR(uuid); if (rc) GOTO(out_free, rc); /* Add any failover MGS nids */ i = 1; while ((*ptr == ':' || class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0)) { /* New failover node */ sprintf(niduuid, "%s_%x", mgcname, i); j = 0; while (class_parse_nid(ptr, &nid, &ptr) == 0) { j++; rc = do_lcfg(mgcname, nid, LCFG_ADD_UUID, niduuid, 0,0,0); if (*ptr == ':') break; } if (j > 0) { rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN, 
niduuid, 0, 0, 0); i++; } else { /* at ":/fsname" */ break; } } lsi->lsi_lmd->lmd_mgs_failnodes = i; obd = class_name2obd(mgcname); if (!obd) { CERROR("Can't find mgcobd %s\n", mgcname); GOTO(out_free, rc = -ENOTCONN); } /* Keep a refcount of servers/clients who started with "mount", so we know when we can get rid of the mgc. */ atomic_set(&obd->u.cli.cl_mgc_refcount, 1); /* Try all connections, but only once. */ recov_bk = 1; rc = obd_set_info_async(obd->obd_self_export, strlen(KEY_INIT_RECOV_BACKUP), KEY_INIT_RECOV_BACKUP, sizeof(recov_bk), &recov_bk, NULL); if (rc) /* nonfatal */ CERROR("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc); /* We connect to the MGS at setup, and don't disconnect until cleanup */ rc = obd_connect(NULL, &mgc_conn, obd, &(obd->obd_uuid), &ocd); if (rc) { CERROR("connect failed %d\n", rc); GOTO(out, rc); } exp = class_conn2export(&mgc_conn); obd->u.cli.cl_mgc_mgsexp = exp; out: /* Keep the mgc info in the sb. Note that many lsi's can point to the same mgc.*/ lsi->lsi_mgc = obd; out_free: mutex_up(&mgc_start_lock); if (mgcname) OBD_FREE(mgcname, len); if (niduuid) OBD_FREE(niduuid, len + 2); RETURN(rc); } static int lustre_stop_mgc(struct super_block *sb) { struct lustre_sb_info *lsi = s2lsi(sb); struct obd_device *obd; char *niduuid, *ptr = 0; int i, rc = 0, len; ENTRY; if (!lsi) RETURN(-ENOENT); obd = lsi->lsi_mgc; if (!obd) RETURN(-ENOENT); lsi->lsi_mgc = NULL; mutex_down(&mgc_start_lock); if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) { /* This is not fatal, every client that stops will call in here. */ CDEBUG(D_MOUNT, "mgc still has %d references.\n", atomic_read(&obd->u.cli.cl_mgc_refcount)); GOTO(out, rc = -EBUSY); } /* MGC must always stop */ obd->obd_force = 1; /* client_disconnect_export uses the no_recov flag to decide whether it should disconnect or just invalidate. (The MGC has no recoverable data in any case.) 
*/ obd->obd_no_recov = 1; if (obd->u.cli.cl_mgc_mgsexp) obd_disconnect(obd->u.cli.cl_mgc_mgsexp); /* Save the obdname for cleaning the nid uuids, which are obdname_XX */ len = strlen(obd->obd_name) + 6; OBD_ALLOC(niduuid, len); if (niduuid) { strcpy(niduuid, obd->obd_name); ptr = niduuid + strlen(niduuid); } rc = class_manual_cleanup(obd); if (rc) GOTO(out, rc); /* Clean the nid uuids */ if (!niduuid) RETURN(-ENOMEM); for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) { sprintf(ptr, "_%x", i); rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID, niduuid, 0, 0, 0); if (rc) CERROR("del MDC UUID %s failed: rc = %d\n", niduuid, rc); } OBD_FREE(niduuid, len); /* class_import_put will get rid of the additional connections */ out: mutex_up(&mgc_start_lock); RETURN(rc); } /* Since there's only one mgc per node, we have to change it's fs to get access to the right disk. */ static int server_mgc_set_fs(struct obd_device *mgc, struct super_block *sb) { struct lustre_sb_info *lsi = s2lsi(sb); int rc; ENTRY; CDEBUG(D_MOUNT, "Set mgc disk for %s\n", lsi->lsi_lmd->lmd_dev); /* cl_mgc_sem in mgc insures we sleep if the mgc_fs is busy */ rc = obd_set_info_async(mgc->obd_self_export, strlen("set_fs"), "set_fs", sizeof(*sb), sb, NULL); if (rc) { CERROR("can't set_fs %d\n", rc); } RETURN(rc); } static int server_mgc_clear_fs(struct obd_device *mgc) { int rc; ENTRY; CDEBUG(D_MOUNT, "Unassign mgc disk\n"); rc = obd_set_info_async(mgc->obd_self_export, strlen("clear_fs"), "clear_fs", 0, NULL, NULL); RETURN(rc); } DECLARE_MUTEX(server_start_lock); /* Stop MDS/OSS if nobody is using them */ static int server_stop_servers(int lddflags, int lsiflags) { struct obd_device *obd = NULL; struct obd_type *type = NULL; int rc = 0; ENTRY; mutex_down(&server_start_lock); /* Either an MDT or an OST or neither */ /* if this was an MDT, and there are no more MDT's, clean up the MDS */ if ((lddflags & LDD_F_SV_TYPE_MDT) && (obd = class_name2obd(LUSTRE_MDS_OBDNAME))) { /*FIXME pre-rename, should 
eventually be LUSTRE_MDT_NAME*/ type = class_search_type(LUSTRE_MDS_NAME); } /* if this was an OST, and there are no more OST's, clean up the OSS */ if ((lddflags & LDD_F_SV_TYPE_OST) && (obd = class_name2obd(LUSTRE_OSS_OBDNAME))) { type = class_search_type(LUSTRE_OST_NAME); } if (obd && (!type || !type->typ_refcnt)) { int err; obd->obd_force = 1; /* obd_fail doesn't mean much on a server obd */ err = class_manual_cleanup(obd); if (!rc) rc = err; } mutex_up(&server_start_lock); RETURN(rc); } int server_mti_print(char *title, struct mgs_target_info *mti) { PRINT_CMD(PRINT_MASK, "mti %s\n", title); PRINT_CMD(PRINT_MASK, "server: %s\n", mti->mti_svname); PRINT_CMD(PRINT_MASK, "fs: %s\n", mti->mti_fsname); PRINT_CMD(PRINT_MASK, "uuid: %s\n", mti->mti_uuid); PRINT_CMD(PRINT_MASK, "ver: %d flags: %#x\n", mti->mti_config_ver, mti->mti_flags); return(0); } static int mti_set_sec_opts(struct mgs_target_info *mti, struct lustre_mount_data *lmd) { char *s1, *s2; if (lmd->lmd_sec_mdt == NULL && lmd->lmd_sec_cli == NULL) { /* just let on-disk params do its work. but we have an * assumption that any changes of on-disk data by tune2fs * should lead to server rewrite log. */ return 0; } /* filter out existing sec options */ s1 = mti->mti_params; while (*s1) { int clear; while (*s1 == ' ') s1++; if (strncmp(s1, PARAM_SEC_RPC_MDT, sizeof(PARAM_SEC_RPC_MDT) - 1) == 0 || strncmp(s1, PARAM_SEC_RPC_CLI, sizeof(PARAM_SEC_RPC_CLI) - 1) == 0) clear = 1; else clear = 0; s2 = strchr(s1, ' '); if (s2 == NULL) { if (clear) *s1 = '\0'; break; } s2++; if (clear) memmove(s1, s2, strlen(s2) + 1); else s1 = s2; } /* append sec options from lmd */ /* FIXME add flag LDD_F_UPDATE after mountconf start supporting * log updating. 
*/ if (lmd->lmd_sec_mdt) { if (strlen(mti->mti_params) + strlen(lmd->lmd_sec_mdt) + sizeof(PARAM_SEC_RPC_MDT) + 1 >= sizeof(mti->mti_params)) { CERROR("security params too big for mti\n"); return -ENOMEM; } strcat(mti->mti_params, " "PARAM_SEC_RPC_MDT); strcat(mti->mti_params, lmd->lmd_sec_mdt); //mti->mti_flags |= LDD_F_UPDATE; } if (lmd->lmd_sec_cli) { if (strlen(mti->mti_params) + strlen(lmd->lmd_sec_cli) + sizeof(PARAM_SEC_RPC_CLI) + 2 > sizeof(mti->mti_params)) { CERROR("security params too big for mti\n"); return -ENOMEM; } strcat(mti->mti_params, " "PARAM_SEC_RPC_CLI); strcat(mti->mti_params, lmd->lmd_sec_cli); //mti->mti_flags |= LDD_F_UPDATE; } return 0; } static int server_sb2mti(struct super_block *sb, struct mgs_target_info *mti) { struct lustre_sb_info *lsi = s2lsi(sb); struct lustre_disk_data *ldd = lsi->lsi_ldd; struct lustre_mount_data *lmd = lsi->lsi_lmd; lnet_process_id_t id; int i = 0; ENTRY; if (!(lsi->lsi_flags & LSI_SERVER)) RETURN(-EINVAL); strncpy(mti->mti_fsname, ldd->ldd_fsname, sizeof(mti->mti_fsname)); strncpy(mti->mti_svname, ldd->ldd_svname, sizeof(mti->mti_svname)); mti->mti_nid_count = 0; while (LNetGetId(i++, &id) != -ENOENT) { if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) continue; mti->mti_nids[mti->mti_nid_count] = id.nid; mti->mti_nid_count++; if (mti->mti_nid_count >= MTI_NIDS_MAX) { CWARN("Only using first %d nids for %s\n", mti->mti_nid_count, mti->mti_svname); break; } } mti->mti_lustre_ver = LUSTRE_VERSION_CODE; mti->mti_config_ver = 0; mti->mti_flags = ldd->ldd_flags; mti->mti_stripe_index = ldd->ldd_svindex; memcpy(mti->mti_uuid, ldd->ldd_uuid, sizeof(mti->mti_uuid)); if (strlen(ldd->ldd_params) > sizeof(mti->mti_params)) { CERROR("params too big for mti\n"); RETURN(-ENOMEM); } memcpy(mti->mti_params, ldd->ldd_params, sizeof(mti->mti_params)); RETURN(mti_set_sec_opts(mti, lmd)); } /* Register an old or new target with the MGS. 
If needed MGS will construct startup logs and assign index */ int server_register_target(struct super_block *sb) { struct lustre_sb_info *lsi = s2lsi(sb); struct obd_device *mgc = lsi->lsi_mgc; struct lustre_disk_data *ldd = lsi->lsi_ldd; struct mgs_target_info *mti = NULL; int rc; ENTRY; LASSERT(mgc); if (!(lsi->lsi_flags & LSI_SERVER)) RETURN(-EINVAL); OBD_ALLOC_PTR(mti); if (!mti) RETURN(-ENOMEM); rc = server_sb2mti(sb, mti); if (rc) GOTO(out, rc); CDEBUG(D_MOUNT, "Registration %s, fs=%s, %s, index=%04x, flags=%#x\n", mti->mti_svname, mti->mti_fsname, libcfs_nid2str(mti->mti_nids[0]), mti->mti_stripe_index, mti->mti_flags); /* Register the target */ /* FIXME use mgc_process_config instead */ rc = obd_set_info_async(mgc->u.cli.cl_mgc_mgsexp, strlen("register_target"), "register_target", sizeof(*mti), mti, NULL); if (rc) { CERROR("registration with the MGS failed (%d)\n", rc); GOTO(out, rc); } /* Always update our flags */ ldd->ldd_flags = mti->mti_flags & ~LDD_F_REWRITE_LDD; /* If this flag is set, it means the MGS wants us to change our on-disk data. (So far this means just the index.) 
*/ if (mti->mti_flags & LDD_F_REWRITE_LDD) { char *label; int err; CDEBUG(D_MOUNT, "Changing on-disk index from %#x to %#x " "for %s\n", ldd->ldd_svindex, mti->mti_stripe_index, mti->mti_svname); ldd->ldd_svindex = mti->mti_stripe_index; strncpy(ldd->ldd_svname, mti->mti_svname, sizeof(ldd->ldd_svname)); /* or ldd_make_sv_name(ldd); */ ldd_write(&mgc->obd_lvfs_ctxt, ldd); err = fsfilt_set_label(mgc, lsi->lsi_srv_mnt->mnt_sb, mti->mti_svname); if (err) CERROR("Label set error %d\n", err); label = fsfilt_get_label(mgc, lsi->lsi_srv_mnt->mnt_sb); if (label) CDEBUG(D_MOUNT, "Disk label changed to %s\n", label); /* Flush the new ldd to disk */ fsfilt_sync(mgc, lsi->lsi_srv_mnt->mnt_sb); } out: if (mti) OBD_FREE_PTR(mti); RETURN(rc); } /* Start targets */ static int server_start_targets(struct super_block *sb, struct vfsmount *mnt) { struct obd_device *obd; struct lustre_sb_info *lsi = s2lsi(sb); struct config_llog_instance cfg; int rc; ENTRY; CDEBUG(D_MOUNT, "starting target %s\n", lsi->lsi_ldd->ldd_svname); #if 0 /* If we're an MDT, make sure the global MDS is running */ if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_MDT) { /* make sure the MDS is started */ mutex_down(&server_start_lock); obd = class_name2obd(LUSTRE_MDS_OBDNAME); if (!obd) { rc = lustre_start_simple(LUSTRE_MDS_OBDNAME, /* FIXME pre-rename, should eventually be LUSTRE_MDS_NAME */ LUSTRE_MDT_NAME, LUSTRE_MDS_OBDNAME"_uuid", 0, 0); if (rc) { mutex_up(&server_start_lock); CERROR("failed to start MDS: %d\n", rc); RETURN(rc); } } mutex_up(&server_start_lock); } #endif /* If we're an OST, make sure the global OSS is running */ if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_OST) { /* make sure OSS is started */ mutex_down(&server_start_lock); obd = class_name2obd(LUSTRE_OSS_OBDNAME); if (!obd) { rc = lustre_start_simple(LUSTRE_OSS_OBDNAME, LUSTRE_OSS_NAME, LUSTRE_OSS_OBDNAME"_uuid", 0, 0); if (rc) { mutex_up(&server_start_lock); CERROR("failed to start OSS: %d\n", rc); RETURN(rc); } } mutex_up(&server_start_lock); 
} /* Set the mgc fs to our server disk. This allows the MGC to read and write configs locally. */ rc = server_mgc_set_fs(lsi->lsi_mgc, sb); if (rc) RETURN(rc); /* Register with MGS */ rc = server_register_target(sb); if (rc && (lsi->lsi_ldd->ldd_flags & (LDD_F_NEED_INDEX | LDD_F_UPDATE | LDD_F_UPGRADE14))){ CERROR("Required registration failed for %s: %d\n", lsi->lsi_ldd->ldd_svname, rc); if (rc == -EIO) { LCONSOLE_ERROR_MSG(0x15f, "Communication error with " "the MGS. Is the MGS running?\n"); } GOTO(out_mgc, rc); } if (rc == -EINVAL) { LCONSOLE_ERROR_MSG(0x160, "The MGS is refusing to allow this " "server (%s) to start. Please see messages" " on the MGS node.\n", lsi->lsi_ldd->ldd_svname); GOTO(out_mgc, rc); } /* Let the target look up the mount using the target's name (we can't pass the sb or mnt through class_process_config.) */ rc = server_register_mount(lsi->lsi_ldd->ldd_svname, sb, mnt); if (rc) GOTO(out_mgc, rc); /* Start targets using the llog named for the target */ memset(&cfg, 0, sizeof(cfg)); rc = lustre_process_log(sb, lsi->lsi_ldd->ldd_svname, &cfg); if (rc) { CERROR("failed to start server %s: %d\n", lsi->lsi_ldd->ldd_svname, rc); GOTO(out_mgc, rc); } out_mgc: /* Release the mgc fs for others to use */ server_mgc_clear_fs(lsi->lsi_mgc); if (!rc) { obd = class_name2obd(lsi->lsi_ldd->ldd_svname); if (!obd) { CERROR("no server named %s was started\n", lsi->lsi_ldd->ldd_svname); RETURN(-ENXIO); } if ((lsi->lsi_lmd->lmd_flags & LMD_FLG_ABORT_RECOV) && (OBP(obd, iocontrol))) { obd_iocontrol(OBD_IOC_ABORT_RECOVERY, obd->obd_self_export, 0, NULL, NULL); } /* log has been fully processed */ obd_notify(obd, NULL, OBD_NOTIFY_CONFIG, (void *)CONFIG_LOG); } RETURN(rc); } /***************** lustre superblock **************/ struct lustre_sb_info *lustre_init_lsi(struct super_block *sb) { struct lustre_sb_info *lsi = NULL; ENTRY; OBD_ALLOC(lsi, sizeof(*lsi)); if (!lsi) RETURN(NULL); OBD_ALLOC(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd)); if (!lsi->lsi_lmd) { OBD_FREE(lsi, 
sizeof(*lsi)); RETURN(NULL); } lsi->lsi_lmd->lmd_exclude_count = 0; s2lsi_nocast(sb) = lsi; /* we take 1 extra ref for our setup */ atomic_set(&lsi->lsi_mounts, 1); /* Default umount style */ lsi->lsi_flags = LSI_UMOUNT_FAILOVER; lsi->lsi_lmd->lmd_nllu = NOBODY_UID; lsi->lsi_lmd->lmd_nllg = NOBODY_GID; RETURN(lsi); } static int lustre_free_lsi(struct super_block *sb) { struct lustre_sb_info *lsi = s2lsi(sb); ENTRY; if (!lsi) RETURN(0); CDEBUG(D_MOUNT, "Freeing lsi\n"); /* someone didn't call server_put_mount. */ LASSERT(atomic_read(&lsi->lsi_mounts) == 0); if (lsi->lsi_ldd != NULL) OBD_FREE(lsi->lsi_ldd, sizeof(*lsi->lsi_ldd)); if (lsi->lsi_lmd != NULL) { if (lsi->lsi_lmd->lmd_dev != NULL) OBD_FREE(lsi->lsi_lmd->lmd_dev, strlen(lsi->lsi_lmd->lmd_dev) + 1); if (lsi->lsi_lmd->lmd_profile != NULL) OBD_FREE(lsi->lsi_lmd->lmd_profile, strlen(lsi->lsi_lmd->lmd_profile) + 1); if (lsi->lsi_lmd->lmd_sec_mdt != NULL) OBD_FREE(lsi->lsi_lmd->lmd_sec_mdt, strlen(lsi->lsi_lmd->lmd_sec_mdt) + 1); if (lsi->lsi_lmd->lmd_sec_cli != NULL) OBD_FREE(lsi->lsi_lmd->lmd_sec_cli, strlen(lsi->lsi_lmd->lmd_sec_cli) + 1); if (lsi->lsi_lmd->lmd_opts != NULL) OBD_FREE(lsi->lsi_lmd->lmd_opts, strlen(lsi->lsi_lmd->lmd_opts) + 1); if (lsi->lsi_lmd->lmd_exclude_count) OBD_FREE(lsi->lsi_lmd->lmd_exclude, sizeof(lsi->lsi_lmd->lmd_exclude[0]) * lsi->lsi_lmd->lmd_exclude_count); OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd)); } LASSERT(lsi->lsi_llsbi == NULL); OBD_FREE(lsi, sizeof(*lsi)); s2lsi_nocast(sb) = NULL; RETURN(0); } /* The lsi has one reference for every server that is using the disk - e.g. 
MDT, MGS, and potentially MGC */
/* Drop one lsi_mounts reference on @sb's lsi. When the count reaches
   zero the lsi is freed with lustre_free_lsi() and 1 is returned;
   otherwise returns 0. */
static int lustre_put_lsi(struct super_block *sb)
{
        struct lustre_sb_info *lsi = s2lsi(sb);
        ENTRY;

        LASSERT(lsi);

        CDEBUG(D_MOUNT, "put %p %d\n", sb, atomic_read(&lsi->lsi_mounts));

        if (atomic_dec_and_test(&lsi->lsi_mounts)) {
                lustre_free_lsi(sb);
                RETURN(1);
        }
        RETURN(0);
}

/*************** server mount ******************/

/* Kernel mount using mount options in MOUNT_DATA_FILE */
/* Works in two steps: a pre-mount of lmd_dev (ldiskfs, with a fallback
   chosen by kernel version) that exists only to read MOUNT_DATA_FILE via
   ldd_parse(), followed by the real mount using the filesystem type and
   options found there plus any mount-line options. On success the parsed
   disk data is stored in lsi->lsi_ldd and the vfsmount is returned; on
   failure the disk data is freed and an ERR_PTR is returned. */
static struct vfsmount *server_kernel_mount(struct super_block *sb)
{
        struct lvfs_run_ctxt mount_ctxt;
        struct lustre_sb_info *lsi = s2lsi(sb);
        struct lustre_disk_data *ldd;
        struct lustre_mount_data *lmd = lsi->lsi_lmd;
        struct vfsmount *mnt;
        char *options = NULL;
        unsigned long page, s_flags;
        int rc;
        ENTRY;

        OBD_ALLOC(ldd, sizeof(*ldd));
        if (!ldd)
                RETURN(ERR_PTR(-ENOMEM));

        /* In the past, we have always used flags = 0.
           Note ext3/ldiskfs can't be mounted ro. */
        s_flags = sb->s_flags;

        /* Pre-mount ldiskfs to read the MOUNT_DATA_FILE */
        CDEBUG(D_MOUNT, "Pre-mount ldiskfs %s\n", lmd->lmd_dev);
        mnt = ll_kern_mount("ldiskfs", s_flags, lmd->lmd_dev, 0);
        if (IS_ERR(mnt)) {
                rc = PTR_ERR(mnt);
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
                /* 2.6 kernels: if ldiskfs fails, try ldiskfs2 */
                mnt = ll_kern_mount("ldiskfs2", s_flags, lmd->lmd_dev, 0);
                if (IS_ERR(mnt)) {
                        int rc2 = PTR_ERR(mnt);
                        CERROR("premount %s:%#lx ldiskfs failed: %d, ldiskfs2 "
                               "failed: %d. Is the ldiskfs module available?\n",
                               lmd->lmd_dev, s_flags, rc, rc2);
                        /* the first (ldiskfs) error is the one returned */
                        GOTO(out_free, rc);
                }
#else
                /* 2.4 kernels: if ldiskfs fails, try ext3 */
                mnt = ll_kern_mount("ext3", s_flags, lmd->lmd_dev, 0);
                if (IS_ERR(mnt)) {
                        rc = PTR_ERR(mnt);
                        CERROR("premount ext3 failed: rc = %d\n", rc);
                        GOTO(out_free, rc);
                }
#endif
        }

        /* Run ldd_parse() relative to the root of the pre-mount. */
        OBD_SET_CTXT_MAGIC(&mount_ctxt);
        mount_ctxt.pwdmnt = mnt;
        mount_ctxt.pwd = mnt->mnt_root;
        mount_ctxt.fs = get_ds();

        rc = ldd_parse(&mount_ctxt, ldd);
        /* The pre-mount is released whether or not the parse worked. */
        unlock_mntput(mnt);

        if (rc) {
                CERROR("premount parse options failed: rc = %d\n", rc);
                GOTO(out_free, rc);
        }

        /* Done with our pre-mount, now do the real mount. */

        /* Glom up mount options */
        page = __get_free_page(GFP_KERNEL);
        if (!page)
                GOTO(out_free, rc = -ENOMEM);

        options = (char *)page;
        memset(options, 0, CFS_PAGE_SIZE);
        /* Copy at most CFS_PAGE_SIZE - 2 bytes so the zeroed page always
           ends in a NUL and has room for the "," added below. */
        strncpy(options, ldd->ldd_mount_opts, CFS_PAGE_SIZE - 2);

        /* Add in any mount-line options */
        if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0)) {
                /* space left, computed before the separator is added */
                int len = CFS_PAGE_SIZE - strlen(options) - 2;
                if (*options != 0)
                        strcat(options, ",");
                strncat(options, lmd->lmd_opts, len);
        }

        /* Special permanent mount flags */
        if (IS_OST(ldd))
                s_flags |= MS_NOATIME | MS_NODIRATIME;

        CDEBUG(D_MOUNT, "kern_mount: %s %s %s\n",
               MT_STR(ldd), lmd->lmd_dev, options);
        mnt = ll_kern_mount(MT_STR(ldd), s_flags, lmd->lmd_dev,
                            (void *)options);
        free_page(page);
        if (IS_ERR(mnt)) {
                rc = PTR_ERR(mnt);
                CERROR("ll_kern_mount failed: rc = %d\n", rc);
                GOTO(out_free, rc);
        }

        lsi->lsi_ldd = ldd; /* freed at lsi cleanup */

        CDEBUG(D_SUPER, "%s: mnt = %p\n", lmd->lmd_dev, mnt);
        RETURN(mnt);

out_free:
        OBD_FREE(ldd, sizeof(*ldd));
        lsi->lsi_ldd = NULL;
        RETURN(ERR_PTR(rc));
}

/* Poll @mnt's reference count in 5 second steps, for at most 120 seconds,
   until only one reference is left. Logs a console warning on every
   step and an error if the mount is still busy when the retries run
   out. */
static void server_wait_finished(struct vfsmount *mnt)
{
        wait_queue_head_t waitq;
        struct l_wait_info lwi;
        int retries = 120;

        init_waitqueue_head(&waitq);

        while ((atomic_read(&mnt->mnt_count) > 1) && (retries > 0)) {
                LCONSOLE_WARN("Mount still busy with %d refs, waiting for "
                              "%d secs...\n",
                              atomic_read(&mnt->mnt_count), retries);

                /* Wait for a bit */
                retries -= 5;
                lwi = LWI_TIMEOUT(5 * HZ, NULL, NULL);
                /* the condition is constant 0, so this only ends by
                   timeout */
                l_wait_event(waitq, 0, &lwi);
        }
        if (atomic_read(&mnt->mnt_count) > 1) {
                CERROR("Mount %p is still busy (%d refs), giving up.\n",
                       mnt, atomic_read(&mnt->mnt_count));
        }
}

static void server_put_super(struct super_block *sb)
{
        struct lustre_sb_info *lsi = s2lsi(sb);
        struct obd_device *obd;
        struct vfsmount *mnt = lsi->lsi_srv_mnt;
        char *tmpname, *extraname = NULL;
        int tmpname_sz;
        int lddflags = lsi->lsi_ldd->ldd_flags;
        int lsiflags = lsi->lsi_flags;
        int rc;
        ENTRY;

        LASSERT(lsiflags & LSI_SERVER);

        tmpname_sz = strlen(lsi->lsi_ldd->ldd_svname) + 1;
        OBD_ALLOC(tmpname, tmpname_sz);
memcpy(tmpname, lsi->lsi_ldd->ldd_svname, tmpname_sz); CDEBUG(D_MOUNT, "server put_super %s\n", tmpname); /* Stop the target */ if (IS_MDT(lsi->lsi_ldd) || IS_OST(lsi->lsi_ldd)) { struct lustre_profile *lprof = NULL; /* tell the mgc to drop the config log */ lustre_end_log(sb, lsi->lsi_ldd->ldd_svname, NULL); /* COMPAT_146 - profile may get deleted in mgc_cleanup. If there are any setup/cleanup errors, save the lov name for safety cleanup later. */ lprof = class_get_profile(lsi->lsi_ldd->ldd_svname); if (lprof && lprof->lp_dt) { OBD_ALLOC(extraname, strlen(lprof->lp_dt) + 1); strcpy(extraname, lprof->lp_dt); } obd = class_name2obd(lsi->lsi_ldd->ldd_svname); if (obd) { CDEBUG(D_MOUNT, "stopping %s\n", obd->obd_name); if (lsi->lsi_flags & LSI_UMOUNT_FORCE) obd->obd_force = 1; if (lsi->lsi_flags & LSI_UMOUNT_FAILOVER) obd->obd_fail = 1; /* We can't seem to give an error return code to .put_super, so we better make sure we clean up! */ obd->obd_force = 1; class_manual_cleanup(obd); } else { CERROR("no obd %s\n", lsi->lsi_ldd->ldd_svname); server_deregister_mount(lsi->lsi_ldd->ldd_svname); } } /* If they wanted the mgs to stop separately from the mdt, they should have put it on a different device. */ if (IS_MGS(lsi->lsi_ldd)) { /* stop the mgc before the mgs so the connection gets cleaned up */ lustre_stop_mgc(sb); server_stop_mgs(sb); } /* Clean the mgc and sb */ rc = lustre_common_put_super(sb); /* FIXME how can I report a failure to umount? */ /* Wait for the targets to really clean up - can't exit (and let the sb get destroyed) while the mount is still in use */ server_wait_finished(mnt); /* drop the One True Mount */ unlock_mntput(mnt); /* Stop the servers (MDS, OSS) if no longer needed. We must wait until the target is really gone so that our type refcount check is right. 
*/ server_stop_servers(lddflags, lsiflags); /* In case of startup or cleanup err, stop related obds */ if (extraname) { obd = class_name2obd(extraname); if (obd) { CWARN("Cleaning orphaned obd %s\n", extraname); obd->obd_force = 1; class_manual_cleanup(obd); } OBD_FREE(extraname, strlen(extraname) + 1); } LCONSOLE_WARN("server umount %s complete\n", tmpname); OBD_FREE(tmpname, tmpname_sz); EXIT; } #ifdef HAVE_UMOUNTBEGIN_VFSMOUNT static void server_umount_begin(struct vfsmount *vfsmnt, int flags) { struct super_block *sb = vfsmnt->mnt_sb; #else static void server_umount_begin(struct super_block *sb) { #endif struct lustre_sb_info *lsi = s2lsi(sb); ENTRY; #ifdef HAVE_UMOUNTBEGIN_VFSMOUNT if (!(flags & MNT_FORCE)) { EXIT; return; } #endif CDEBUG(D_MOUNT, "umount -f\n"); /* umount = failover umount -f = force no third way to do non-force, non-failover */ lsi->lsi_flags &= ~LSI_UMOUNT_FAILOVER; lsi->lsi_flags |= LSI_UMOUNT_FORCE; EXIT; } #ifndef HAVE_STATFS_DENTRY_PARAM static int server_statfs (struct super_block *sb, struct kstatfs *buf) { #else static int server_statfs (struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; #endif struct vfsmount *mnt = s2lsi(sb)->lsi_srv_mnt; ENTRY; if (mnt && mnt->mnt_sb && mnt->mnt_sb->s_op->statfs) { #ifdef HAVE_STATFS_DENTRY_PARAM int rc = mnt->mnt_sb->s_op->statfs(mnt->mnt_root, buf); #else int rc = mnt->mnt_sb->s_op->statfs(mnt->mnt_sb, buf); #endif if (!rc) { buf->f_type = sb->s_magic; RETURN(0); } } /* just return 0 */ buf->f_type = sb->s_magic; buf->f_bsize = sb->s_blocksize; buf->f_blocks = 1; buf->f_bfree = 0; buf->f_bavail = 0; buf->f_files = 1; buf->f_ffree = 0; buf->f_namelen = NAME_MAX; RETURN(0); } static struct super_operations server_ops = { .put_super = server_put_super, .umount_begin = server_umount_begin, /* umount -f */ .statfs = server_statfs, }; #define log2(n) ffz(~(n)) #define LUSTRE_SUPER_MAGIC 0x0BD00BD1 static int server_fill_super_common(struct super_block *sb) { struct 
inode *root = 0; ENTRY; CDEBUG(D_MOUNT, "Server sb, dev=%d\n", (int)sb->s_dev); sb->s_blocksize = 4096; sb->s_blocksize_bits = log2(sb->s_blocksize); sb->s_magic = LUSTRE_SUPER_MAGIC; sb->s_maxbytes = 0; //PAGE_CACHE_MAXBYTES; sb->s_flags |= MS_RDONLY; sb->s_op = &server_ops; root = new_inode(sb); if (!root) { CERROR("Can't make root inode\n"); RETURN(-EIO); } /* returns -EIO for every operation */ /* make_bad_inode(root); -- badness - can't umount */ /* apparently we need to be a directory for the mount to finish */ root->i_mode = S_IFDIR; sb->s_root = d_alloc_root(root); if (!sb->s_root) { CERROR("Can't make root dentry\n"); iput(root); RETURN(-EIO); } RETURN(0); } static int server_fill_super(struct super_block *sb) { struct lustre_sb_info *lsi = s2lsi(sb); struct vfsmount *mnt; int rc; ENTRY; /* the One True Mount */ mnt = server_kernel_mount(sb); if (IS_ERR(mnt)) { rc = PTR_ERR(mnt); CERROR("Unable to mount device %s: %d\n", lsi->lsi_lmd->lmd_dev, rc); lustre_put_lsi(sb); GOTO(out, rc); } lsi->lsi_srv_mnt = mnt; LASSERT(lsi->lsi_ldd); CDEBUG(D_MOUNT, "Found service %s for fs '%s' on device %s\n", lsi->lsi_ldd->ldd_svname, lsi->lsi_ldd->ldd_fsname, lsi->lsi_lmd->lmd_dev); if (class_name2obd(lsi->lsi_ldd->ldd_svname)) { LCONSOLE_ERROR_MSG(0x161, "The target named %s is already " "running. Double-mount may have compromised" " the disk journal.\n", lsi->lsi_ldd->ldd_svname); unlock_mntput(mnt); lustre_put_lsi(sb); GOTO(out, rc = -EALREADY); } /* start MGS before MGC */ if (IS_MGS(lsi->lsi_ldd)) { rc = server_start_mgs(sb); if (rc) GOTO(out_mnt, rc); } rc = lustre_start_mgc(sb); if (rc) GOTO(out_mnt, rc); /* Set up all obd devices for service */ if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) && (IS_OST(lsi->lsi_ldd) || IS_MDT(lsi->lsi_ldd))) { rc = server_start_targets(sb, mnt); if (rc < 0) { CERROR("Unable to start targets: %d\n", rc); GOTO(out_mnt, rc); } /* FIXME overmount client here, or can we just start a client log and client_fill_super on this sb? 
We need to make sure server_put_super gets called too - ll_put_super calls lustre_common_put_super; check there for LSI_SERVER flag, call s_p_s if so. Probably should start client from new thread so we can return. Client will not finish until all servers are connected. Note - MGS-only server does NOT get a client, since there is no lustre fs associated - the MGS is for all lustre fs's */ } rc = server_fill_super_common(sb); if (rc) GOTO(out_mnt, rc); LCONSOLE_WARN("Server %s on device %s has started\n", lsi->lsi_ldd->ldd_svname, lsi->lsi_lmd->lmd_dev); RETURN(0); out_mnt: server_put_super(sb); out: RETURN(rc); } /* Get the index from the obd name. rc = server type, or rc < 0 on error if endptr isn't NULL it is set to end of name */ int server_name2index(char *svname, __u32 *idx, char **endptr) { unsigned long index; int rc; char *dash = strchr(svname, '-'); if (!dash) return(-EINVAL); if (strncmp(dash + 1, "MDT", 3) == 0) rc = LDD_F_SV_TYPE_MDT; else if (strncmp(dash + 1, "OST", 3) == 0) rc = LDD_F_SV_TYPE_OST; else return(-EINVAL); index = simple_strtoul(dash + 4, endptr, 16); *idx = index; return rc; } /*************** mount common betweeen server and client ***************/ /* Common umount */ int lustre_common_put_super(struct super_block *sb) { int rc; ENTRY; CDEBUG(D_MOUNT, "dropping sb %p\n", sb); /* Drop a ref to the MGC */ rc = lustre_stop_mgc(sb); if (rc && (rc != -ENOENT)) { if (rc != -EBUSY) { CERROR("Can't stop MGC: %d\n", rc); RETURN(rc); } /* BUSY just means that there's some other obd that needs the mgc. Let him clean it up. 
*/ CDEBUG(D_MOUNT, "MGC still in use\n"); } /* Drop a ref to the mounted disk */ lustre_put_lsi(sb); RETURN(rc); } #if 0 static void lmd_print(struct lustre_mount_data *lmd) { int i; PRINT_CMD(PRINT_MASK, " mount data:\n"); if (lmd_is_client(lmd)) PRINT_CMD(PRINT_MASK, "profile: %s\n", lmd->lmd_profile); PRINT_CMD(PRINT_MASK, "device: %s\n", lmd->lmd_dev); PRINT_CMD(PRINT_MASK, "flags: %x\n", lmd->lmd_flags); if (lmd->lmd_sec_mdt) PRINT_CMD(PRINT_MASK, "sec_mdt: %s\n", lmd->lmd_sec_mdt); if (lmd->lmd_sec_cli) PRINT_CMD(PRINT_MASK, "sec_cli: %s\n", lmd->lmd_sec_cli); if (lmd->lmd_opts) PRINT_CMD(PRINT_MASK, "options: %s\n", lmd->lmd_opts); for (i = 0; i < lmd->lmd_exclude_count; i++) { PRINT_CMD(PRINT_MASK, "exclude %d: OST%04x\n", i, lmd->lmd_exclude[i]); } } #endif /* Is this server on the exclusion list */ int lustre_check_exclusion(struct super_block *sb, char *svname) { struct lustre_sb_info *lsi = s2lsi(sb); struct lustre_mount_data *lmd = lsi->lsi_lmd; __u32 index; int i, rc; ENTRY; rc = server_name2index(svname, &index, NULL); if (rc != LDD_F_SV_TYPE_OST) /* Only exclude OSTs */ RETURN(0); CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname, index, lmd->lmd_exclude_count, lmd->lmd_dev); for(i = 0; i < lmd->lmd_exclude_count; i++) { if (index == lmd->lmd_exclude[i]) { CWARN("Excluding %s (on exclusion list)\n", svname); RETURN(1); } } RETURN(0); } /* mount -v -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */ static int lmd_make_exclusion(struct lustre_mount_data *lmd, char *ptr) { char *s1 = ptr, *s2; __u32 index, *exclude_list; int rc = 0, devmax; ENTRY; /* The shortest an ost name can be is 8 chars: -OST0000. We don't actually know the fsname at this time, so in fact a user could specify any fsname. 
*/ devmax = strlen(ptr) / 8 + 1; /* temp storage until we figure out how many we have */ OBD_ALLOC(exclude_list, sizeof(index) * devmax); if (!exclude_list) RETURN(-ENOMEM); /* we enter this fn pointing at the '=' */ while (*s1 && *s1 != ' ' && *s1 != ',') { s1++; rc = server_name2index(s1, &index, &s2); if (rc < 0) { CERROR("Can't parse server name '%s'\n", s1); break; } if (rc == LDD_F_SV_TYPE_OST) exclude_list[lmd->lmd_exclude_count++] = index; else CDEBUG(D_MOUNT, "ignoring exclude %.7s\n", s1); s1 = s2; /* now we are pointing at ':' (next exclude) or ',' (end of excludes) */ if (lmd->lmd_exclude_count >= devmax) break; } if (rc >= 0) /* non-err */ rc = 0; if (lmd->lmd_exclude_count) { /* permanent, freed in lustre_free_lsi */ OBD_ALLOC(lmd->lmd_exclude, sizeof(index) * lmd->lmd_exclude_count); if (lmd->lmd_exclude) { memcpy(lmd->lmd_exclude, exclude_list, sizeof(index) * lmd->lmd_exclude_count); } else { rc = -ENOMEM; lmd->lmd_exclude_count = 0; } } OBD_FREE(exclude_list, sizeof(index) * devmax); RETURN(rc); } static int lmd_set_sec_opts(char **set, char *opts, int length) { if (*set) OBD_FREE(*set, strlen(*set) + 1); OBD_ALLOC(*set, length + 1); if (*set == NULL) return -ENOMEM; memcpy(*set, opts, length); (*set)[length] = '\0'; return 0; } static int lmd_parse_sec_opts(struct lustre_mount_data *lmd, char *ptr) { char *tail; char **set = NULL; int length; /* check peer name */ if (strncmp(ptr, "sec_mdt=", 8) == 0) { set = &lmd->lmd_sec_mdt; ptr += 8; } else if (strncmp(ptr, "sec_cli=", 8) == 0) { set = &lmd->lmd_sec_cli; ptr += 8; } else if (strncmp(ptr, "sec=", 4) == 0) { /* leave 'set' be null */ ptr += 4; } else { CERROR("invalid security options: %s\n", ptr); return -EINVAL; } tail = strchr(ptr, ','); if (tail == NULL) length = strlen(ptr); else length = tail - ptr; if (set) { if (lmd_set_sec_opts(set, ptr, length)) return -EINVAL; } else { if (lmd->lmd_sec_mdt == NULL && lmd_set_sec_opts(&lmd->lmd_sec_mdt, ptr, length)) return -EINVAL; if 
(lmd->lmd_sec_cli == NULL && lmd_set_sec_opts(&lmd->lmd_sec_cli, ptr, length)) return -EINVAL; } return 0; } /* mount -v -t lustre uml1:uml2:/lustre-client /mnt/lustre */ static int lmd_parse(char *options, struct lustre_mount_data *lmd) { char *s1, *s2, *devname = NULL; struct lustre_mount_data *raw = (struct lustre_mount_data *)options; int rc = 0; ENTRY; LASSERT(lmd); if (!options) { LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that " "/sbin/mount.lustre is installed.\n"); RETURN(-EINVAL); } /* Options should be a string - try to detect old lmd data */ if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) { LCONSOLE_ERROR_MSG(0x163, "You're using an old version of " "/sbin/mount.lustre. Please install " "version %s\n", LUSTRE_VERSION_STRING); RETURN(-EINVAL); } lmd->lmd_magic = LMD_MAGIC; /* Set default flags here */ s1 = options; while (*s1) { int clear = 0; /* Skip whitespace and extra commas */ while (*s1 == ' ' || *s1 == ',') s1++; /* Client options are parsed in ll_options: eg. flock, user_xattr, acl */ /* Parse non-ldiskfs options here. Rather than modifying ldiskfs, we just zero these out here */ if (strncmp(s1, "abort_recov", 11) == 0) { lmd->lmd_flags |= LMD_FLG_ABORT_RECOV; clear++; } else if (strncmp(s1, "nosvc", 5) == 0) { lmd->lmd_flags |= LMD_FLG_NOSVC; clear++; /* ost exclusion list */ } else if (strncmp(s1, "exclude=", 8) == 0) { rc = lmd_make_exclusion(lmd, s1 + 7); if (rc) goto invalid; clear++; } else if (strncmp(s1, "nllu=", 5) == 0) { lmd->lmd_nllu = simple_strtoul(s1 + 5, NULL, 10); clear++; } else if (strncmp(s1, "nllg=", 5) == 0) { lmd->lmd_nllg = simple_strtoul(s1 + 5, NULL, 10); clear++; } else if (strncmp(s1, "sec", 3) == 0) { rc = lmd_parse_sec_opts(lmd, s1); if (rc) goto invalid; clear++; } /* Linux 2.4 doesn't pass the device, so we stuck it at the end of the options. */ else if (strncmp(s1, "device=", 7) == 0) { devname = s1 + 7; /* terminate options right before device. device must be the last one. 
*/ *s1 = '\0'; break; } /* Find next opt */ s2 = strchr(s1, ','); if (s2 == NULL) { if (clear) *s1 = '\0'; break; } s2++; if (clear) memmove(s1, s2, strlen(s2) + 1); else s1 = s2; } if (!devname) { LCONSOLE_ERROR_MSG(0x164, "Can't find the device name " "(need mount option 'device=...')\n"); goto invalid; } s1 = strrchr(devname, ':'); if (s1) { lmd->lmd_flags = LMD_FLG_CLIENT; /* Remove leading /s from fsname */ while (*++s1 == '/') ; /* Freed in lustre_free_lsi */ OBD_ALLOC(lmd->lmd_profile, strlen(s1) + 8); if (!lmd->lmd_profile) RETURN(-ENOMEM); sprintf(lmd->lmd_profile, "%s-client", s1); } /* Freed in lustre_free_lsi */ OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1); if (!lmd->lmd_dev) RETURN(-ENOMEM); strcpy(lmd->lmd_dev, devname); /* Save mount options */ s1 = options + strlen(options) - 1; while (s1 >= options && (*s1 == ',' || *s1 == ' ')) *s1-- = 0; if (*options != 0) { /* Freed in lustre_free_lsi */ OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1); if (!lmd->lmd_opts) RETURN(-ENOMEM); strcpy(lmd->lmd_opts, options); } lmd->lmd_magic = LMD_MAGIC; RETURN(rc); invalid: CERROR("Bad mount options %s\n", options); RETURN(-EINVAL); } /* Common mount */ int lustre_fill_super(struct super_block *sb, void *data, int silent) { struct lustre_mount_data *lmd; struct lustre_sb_info *lsi; int rc; ENTRY; CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb); lsi = lustre_init_lsi(sb); if (!lsi) RETURN(-ENOMEM); lmd = lsi->lsi_lmd; /* Figure out the lmd from the mount options */ if (lmd_parse((char *)data, lmd)) { lustre_put_lsi(sb); RETURN(-EINVAL); } if (lmd_is_client(lmd)) { CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile); if (!client_fill_super) { LCONSOLE_ERROR_MSG(0x165, "Nothing registered for " "client mount! 
Is the 'lustre' " "module loaded?\n"); rc = -ENODEV; } else { rc = lustre_start_mgc(sb); if (rc) { lustre_stop_mgc(sb); goto out; } /* Connect and start */ /* (should always be ll_fill_super) */ rc = (*client_fill_super)(sb); /* c_f_s will call lustre_common_put_super on failure */ } } else { CDEBUG(D_MOUNT, "Mounting server from %s\n", lmd->lmd_dev); lsi->lsi_flags |= LSI_SERVER; rc = server_fill_super(sb); /* s_f_s calls lustre_start_mgc after the mount because we need the MGS nids which are stored on disk. Plus, we may need to start the MGS first. */ /* s_f_s will call server_put_super on failure */ } out: if (rc){ CERROR("Unable to mount %s (%d)\n", s2lsi(sb) ? lmd->lmd_dev : "", rc); } else { CDEBUG(D_SUPER, "mount %s complete\n", lmd->lmd_dev); } RETURN(rc); } /* We can't call ll_fill_super by name because it lives in a module that must be loaded after this one. */ void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb)) { client_fill_super = cfs; } /***************** FS registration ******************/ #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) /* 2.5 and later */ #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18)) struct super_block * lustre_get_sb(struct file_system_type *fs_type, int flags, const char *devname, void * data) { /* calls back in fill super */ /* we could append devname= onto options (*data) here, but 2.4 doesn't get devname. So we do it in mount_lustre.c */ return get_sb_nodev(fs_type, flags, data, lustre_fill_super); } #else int lustre_get_sb(struct file_system_type *fs_type, int flags, const char *devname, void * data, struct vfsmount *mnt) { /* calls back in fill super */ /* we could append devname= onto options (*data) here, but 2.4 doesn't get devname. 
So we do it in mount_lustre.c */ return get_sb_nodev(fs_type, flags, data, lustre_fill_super, mnt); } #endif struct file_system_type lustre_fs_type = { .owner = THIS_MODULE, .name = "lustre", .get_sb = lustre_get_sb, .kill_sb = kill_anon_super, .fs_flags = FS_BINARY_MOUNTDATA, }; #else /* 2.4 */ static struct super_block *lustre_read_super(struct super_block *sb, void *data, int silent) { int rc; ENTRY; rc = lustre_fill_super(sb, data, silent); if (rc) RETURN(NULL); RETURN(sb); } static struct file_system_type lustre_fs_type = { .owner = THIS_MODULE, .name = "lustre", .fs_flags = FS_NFSEXP_FSID, .read_super = lustre_read_super, }; #endif int lustre_register_fs(void) { return register_filesystem(&lustre_fs_type); } int lustre_unregister_fs(void) { return unregister_filesystem(&lustre_fs_type); } EXPORT_SYMBOL(lustre_register_client_fill_super); EXPORT_SYMBOL(lustre_common_put_super); EXPORT_SYMBOL(lustre_process_log); EXPORT_SYMBOL(lustre_end_log); EXPORT_SYMBOL(server_get_mount); EXPORT_SYMBOL(server_get_mount_2); EXPORT_SYMBOL(server_put_mount); EXPORT_SYMBOL(server_put_mount_2); EXPORT_SYMBOL(server_register_target); EXPORT_SYMBOL(server_name2index); EXPORT_SYMBOL(server_mti_print); EXPORT_SYMBOL(do_lcfg);