1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lustre/obdclass/obd_mount.c
38 * Client/server mount routines
40 * Author: Nathan Rutman <nathan@clusterfs.com>
44 #define DEBUG_SUBSYSTEM S_CLASS
45 #define D_MOUNT D_SUPER|D_CONFIG /*|D_WARNING */
/* D_MOUNT above is the mask passed to the mount-related CDEBUG calls in this
 * file.  PRINT_CMD and PRINT_MASK are what ldd_print() and server_mti_print()
 * use for their field dumps: CDEBUG with the D_SUPER|D_CONFIG mask. */
46 #define PRINT_CMD CDEBUG
47 #define PRINT_MASK D_SUPER|D_CONFIG
51 #include <lustre_fsfilt.h>
52 #include <obd_class.h>
53 #include <lustre/lustre_user.h>
54 #include <linux/version.h>
55 #include <lustre_log.h>
56 #include <lustre_disk.h>
57 #include <lustre_param.h>
/* Two function-pointer hooks taking a super_block; both start out NULL.
 * NOTE(review): presumably installed by the client module; the code that
 * sets or calls them is not visible in this part of the file. */
59 static int (*client_fill_super)(struct super_block *sb) = NULL;
60 static void (*kill_super_cb)(struct super_block *sb) = NULL;
62 /*********** mount lookup *********/
/* server_mount_info_list holds one lustre_mount_info per registered name.
 * lustre_mount_info_lock is held by server_register_mount(),
 * server_deregister_mount() and the server_get/put_mount helpers while they
 * search or modify the list. */
64 CFS_DECLARE_MUTEX(lustre_mount_info_lock);
65 static CFS_LIST_HEAD(server_mount_info_list);
/* Walk server_mount_info_list looking for the entry whose lmi_name equals
 * name.  No locking is done in the lines shown here; most callers in this
 * file take lustre_mount_info_lock around the call. */
67 static struct lustre_mount_info *server_find_mount(const char *name)
70 struct lustre_mount_info *lmi;
73 cfs_list_for_each(tmp, &server_mount_info_list) {
74 lmi = cfs_list_entry(tmp, struct lustre_mount_info,
/* exact, case-sensitive comparison of the registered name */
76 if (strcmp(name, lmi->lmi_name) == 0)
82 /* we must register an obd for a mount before we call the setup routine.
83 *_setup will call lustre_get_mount to get the mnt struct
84 by obd_name, since we can't pass the pointer to setup. */
/* Allocates a lustre_mount_info and a private copy of name, then, with
 * lustre_mount_info_lock held, either rejects a name that is already on
 * server_mount_info_list (freeing both allocations and logging an error)
 * or links the new entry onto the list. */
85 static int server_register_mount(const char *name, struct super_block *sb,
88 struct lustre_mount_info *lmi;
95 OBD_ALLOC(lmi, sizeof(*lmi));
/* name_cp is sized strlen(name) + 1, so the strcpy below cannot overrun it */
98 OBD_ALLOC(name_cp, strlen(name) + 1);
/* releases the lmi allocated above (the guarding test is not visible in
 * this view) */
100 OBD_FREE(lmi, sizeof(*lmi));
103 strcpy(name_cp, name);
105 cfs_down(&lustre_mount_info_lock);
/* the duplicate check and the list insertion share one lock hold */
107 if (server_find_mount(name)) {
108 cfs_up(&lustre_mount_info_lock);
109 OBD_FREE(lmi, sizeof(*lmi));
110 OBD_FREE(name_cp, strlen(name) + 1);
111 CERROR("Already registered %s\n", name);
/* the entry takes ownership of name_cp; server_deregister_mount() frees it */
114 lmi->lmi_name = name_cp;
117 cfs_list_add(&lmi->lmi_list_chain, &server_mount_info_list);
119 cfs_up(&lustre_mount_info_lock);
/* debug trace only: reports the vfsmount reference count after registering */
121 CDEBUG(D_MOUNT, "reg_mnt %p from %s, vfscount=%d\n",
122 lmi->lmi_mnt, name, cfs_atomic_read(&lmi->lmi_mnt->mnt_count));
127 /* when an obd no longer needs a mount */
/* Looks name up under lustre_mount_info_lock.  An unknown name is logged as
 * an error; otherwise the name copy is freed, the entry is unlinked from the
 * list and freed, all before the lock is released. */
128 static int server_deregister_mount(const char *name)
130 struct lustre_mount_info *lmi;
133 cfs_down(&lustre_mount_info_lock);
134 lmi = server_find_mount(name);
/* not-found path: drop the lock and report (the test itself is not visible
 * in this view) */
136 cfs_up(&lustre_mount_info_lock);
137 CERROR("%s not registered\n", name);
141 CDEBUG(D_MOUNT, "dereg_mnt %p from %s, vfscount=%d\n",
142 lmi->lmi_mnt, name, cfs_atomic_read(&lmi->lmi_mnt->mnt_count));
/* free size mirrors the strlen(name) + 1 used when the copy was allocated */
144 OBD_FREE(lmi->lmi_name, strlen(lmi->lmi_name) + 1);
145 cfs_list_del(&lmi->lmi_list_chain);
146 OBD_FREE(lmi, sizeof(*lmi));
147 cfs_up(&lustre_mount_info_lock);
152 /* obd's look up a registered mount using their obdname. This is just
153 for initial obd setup to find the mount struct. It should not be
154 called every time you want to mntget. */
/* On a successful lookup this takes two references: mntget() on the
 * vfsmount and an increment of lsi_mounts on the superblock info. */
155 struct lustre_mount_info *server_get_mount(const char *name)
157 struct lustre_mount_info *lmi;
158 struct lustre_sb_info *lsi;
/* NOTE(review): the lock only covers the list search; lmi is dereferenced
 * below after the lock has been released.  Confirm nothing can deregister
 * the entry in between. */
161 cfs_down(&lustre_mount_info_lock);
162 lmi = server_find_mount(name);
163 cfs_up(&lustre_mount_info_lock);
165 CERROR("Can't find mount for %s\n", name);
168 lsi = s2lsi(lmi->lmi_sb);
169 mntget(lmi->lmi_mnt);
170 cfs_atomic_inc(&lsi->lsi_mounts);
172 CDEBUG(D_MOUNT, "get_mnt %p from %s, refs=%d, vfscount=%d\n",
173 lmi->lmi_mnt, name, cfs_atomic_read(&lsi->lsi_mounts),
174 cfs_atomic_read(&lmi->lmi_mnt->mnt_count));
180 * Used by mdt to get mount_info from obdname.
181 * There is no blocking when using the mount_info.
182 * Do not use server_get_mount for this purpose.
/* Lookup-only variant: searches the list under lustre_mount_info_lock and
 * logs an error when name is unknown.  In the lines visible here it takes
 * neither a vfsmount reference nor an lsi_mounts reference. */
184 struct lustre_mount_info *server_get_mount_2(const char *name)
186 struct lustre_mount_info *lmi;
189 cfs_down(&lustre_mount_info_lock);
190 lmi = server_find_mount(name);
191 cfs_up(&lustre_mount_info_lock);
193 CERROR("Can't find mount for %s\n", name);
/* Helper taking a vfsmount.  Only the kernel_locked() test is visible in
 * this view.  NOTE(review): presumably drops the mount reference with the
 * big kernel lock released when it was held - confirm against the full
 * body. */
198 static void unlock_mntput(struct vfsmount *mnt)
200 if (kernel_locked()) {
/* forward declaration: defined further down, used by server_put_mount() */
209 static int lustre_put_lsi(struct super_block *sb);
211 /* to be called from obd_cleanup methods */
/* Looks up the mount registered under name, asserts that it matches mnt,
 * drops one lsi reference through lustre_put_lsi() and finally deregisters
 * the name. */
212 int server_put_mount(const char *name, struct vfsmount *mnt)
214 struct lustre_mount_info *lmi;
215 struct lustre_sb_info *lsi;
/* mnt_count minus one, sampled up front so the log messages below do not
 * need to touch mnt again */
216 int count = atomic_read(&mnt->mnt_count) - 1;
219 /* This might be the last one, can't deref after this */
222 cfs_down(&lustre_mount_info_lock);
223 lmi = server_find_mount(name);
224 cfs_up(&lustre_mount_info_lock);
226 CERROR("Can't find mount for %s\n", name);
229 lsi = s2lsi(lmi->lmi_sb);
230 LASSERT(lmi->lmi_mnt == mnt);
232 CDEBUG(D_MOUNT, "put_mnt %p from %s, refs=%d, vfscount=%d\n",
233 lmi->lmi_mnt, name, cfs_atomic_read(&lsi->lsi_mounts), count);
/* a non-zero return from lustre_put_lsi() is logged as the last put */
235 if (lustre_put_lsi(lmi->lmi_sb)) {
236 CDEBUG(D_MOUNT, "Last put of mnt %p from %s, vfscount=%d\n",
237 lmi->lmi_mnt, name, count);
238 /* last mount is the One True Mount */
/* the condition selecting this busy report is not visible in this view */
240 CERROR("%s: mount busy, vfscount=%d!\n", name, count);
243 /* this obd should never need the mount again */
244 server_deregister_mount(name);
249 /* Corresponding to server_get_mount_2 */
/* Only the signature is visible in this view; it takes the same
 * (name, mnt) arguments as server_put_mount(). */
250 int server_put_mount_2(const char *name, struct vfsmount *mnt)
256 /******* mount helper utilities *********/
/* Dump the fields of a lustre_disk_data, one per line, through PRINT_CMD
 * with PRINT_MASK. */
259 static void ldd_print(struct lustre_disk_data *ldd)
261 PRINT_CMD(PRINT_MASK, " disk data:\n");
262 PRINT_CMD(PRINT_MASK, "server: %s\n", ldd->ldd_svname);
263 PRINT_CMD(PRINT_MASK, "uuid: %s\n", (char *)ldd->ldd_uuid);
264 PRINT_CMD(PRINT_MASK, "fs: %s\n", ldd->ldd_fsname);
265 PRINT_CMD(PRINT_MASK, "index: %04x\n", ldd->ldd_svindex);
266 PRINT_CMD(PRINT_MASK, "config: %d\n", ldd->ldd_config_ver);
267 PRINT_CMD(PRINT_MASK, "flags: %#x\n", ldd->ldd_flags);
/* MT_STR(ldd) yields the backing filesystem type string; it is also passed
 * as the filesystem type to ll_kern_mount() in server_kernel_mount() */
268 PRINT_CMD(PRINT_MASK, "diskfs: %s\n", MT_STR(ldd));
269 PRINT_CMD(PRINT_MASK, "options: %s\n", ldd->ldd_mount_opts);
270 PRINT_CMD(PRINT_MASK, "params: %s\n", ldd->ldd_params);
271 PRINT_CMD(PRINT_MASK, "comment: %s\n", ldd->ldd_userdata);
/* Read MOUNT_DATA_FILE into *ldd while running inside mount_ctxt, then
 * validate it: the file size must equal sizeof(*ldd), the magic must be
 * LDD_MAGIC, and no incompat or rocompat feature bit outside the supported
 * masks may be set.  Every validation failure jumps to out_close with
 * rc = -EINVAL. */
275 static int ldd_parse(struct lvfs_run_ctxt *mount_ctxt,
276 struct lustre_disk_data *ldd)
278 struct lvfs_run_ctxt saved;
/* switch to the mount context; undone by the pop_ctxt() at the end */
285 push_ctxt(&saved, mount_ctxt, NULL);
287 file = filp_open(MOUNT_DATA_FILE, O_RDONLY, 0644);
290 CERROR("cannot open %s: rc = %d\n", MOUNT_DATA_FILE, rc);
/* the on-disk record is expected to be exactly one lustre_disk_data */
294 len = i_size_read(file->f_dentry->d_inode);
295 CDEBUG(D_MOUNT, "Have %s, size %lu\n", MOUNT_DATA_FILE, len);
296 if (len != sizeof(*ldd)) {
297 CERROR("disk data size does not match: see %lu expect %u\n",
298 len, (int)sizeof(*ldd));
299 GOTO(out_close, rc = -EINVAL);
302 rc = lustre_fread(file, ldd, len, &off);
304 CERROR("error reading %s: read %d of %lu\n",
305 MOUNT_DATA_FILE, rc, len);
306 GOTO(out_close, rc = -EINVAL);
310 if (ldd->ldd_magic != LDD_MAGIC) {
311 /* FIXME add swabbing support */
312 CERROR("Bad magic in %s: %x!=%x\n", MOUNT_DATA_FILE,
313 ldd->ldd_magic, LDD_MAGIC);
314 GOTO(out_close, rc = -EINVAL);
/* any incompat bit outside LDD_INCOMPAT_SUPP rejects the disk data */
317 if (ldd->ldd_feature_incompat & ~LDD_INCOMPAT_SUPP) {
318 CERROR("%s: unsupported incompat filesystem feature(s) %x\n",
320 ldd->ldd_feature_incompat & ~LDD_INCOMPAT_SUPP);
321 GOTO(out_close, rc = -EINVAL);
/* unknown rocompat bits are currently rejected the same way */
323 if (ldd->ldd_feature_rocompat & ~LDD_ROCOMPAT_SUPP) {
324 CERROR("%s: unsupported read-only filesystem feature(s) %x\n",
326 ldd->ldd_feature_rocompat & ~LDD_ROCOMPAT_SUPP);
327 /* Do something like remount filesystem read-only */
328 GOTO(out_close, rc = -EINVAL);
334 pop_ctxt(&saved, mount_ctxt, NULL);
/* Write *ldd back to MOUNT_DATA_FILE while running inside mount_ctxt.
 * The caller must pass a structure whose magic is already LDD_MAGIC
 * (asserted).  ldd_config_ver is incremented on every call, before the
 * file is even opened. */
338 static int ldd_write(struct lvfs_run_ctxt *mount_ctxt,
339 struct lustre_disk_data *ldd)
341 struct lvfs_run_ctxt saved;
/* always writes one whole lustre_disk_data */
344 unsigned long len = sizeof(struct lustre_disk_data);
348 LASSERT(ldd->ldd_magic == LDD_MAGIC);
350 ldd->ldd_config_ver++;
352 push_ctxt(&saved, mount_ctxt, NULL);
354 file = filp_open(MOUNT_DATA_FILE, O_RDWR, 0644);
357 CERROR("cannot open %s: rc = %d\n", MOUNT_DATA_FILE, rc);
361 rc = lustre_fwrite(file, ldd, len, &off);
/* NOTE(review): the message below says read although this is the write
 * path; it looks copied from ldd_parse(). */
363 CERROR("error writing %s: read %d of %lu\n",
364 MOUNT_DATA_FILE, rc, len);
365 GOTO(out_close, rc = -EINVAL);
373 pop_ctxt(&saved, mount_ctxt, NULL);
378 /**************** config llog ********************/
380 /* Get a config log from the MGS and process it.
381 This func is called for both clients and servers.
   The request is packed as an LCFG_LOG_START lustre_cfg carrying the MGC
   name, logname, the cfg instance and the address of sb, and is handed to
   the MGC recorded in the superblock info through obd_process_config().
382 Continue to process new statements appended to the logs
383 (whenever the config lock is revoked) until lustre_end_log
385 int lustre_process_log(struct super_block *sb, char *logname,
386 struct config_llog_instance *cfg)
388 struct lustre_cfg *lcfg;
389 struct lustre_cfg_bufs bufs;
390 struct lustre_sb_info *lsi = s2lsi(sb);
391 struct obd_device *mgc = lsi->lsi_mgc;
398 /* mgc_process_config */
/* buffer 0: MGC name, 1: log name, 2: cfg instance, 3: the sb pointer
 * itself (note &sb with sizeof(sb), i.e. a pointer-sized payload) */
399 lustre_cfg_bufs_reset(&bufs, mgc->obd_name);
400 lustre_cfg_bufs_set_string(&bufs, 1, logname);
401 lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg));
402 lustre_cfg_bufs_set(&bufs, 3, &sb, sizeof(sb));
403 lcfg = lustre_cfg_new(LCFG_LOG_START, &bufs);
404 rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
405 lustre_cfg_free(lcfg);
/* Two different console errors follow; the rc tests that select between
 * them are not visible in this view.
 * NOTE(review): the first message concatenates the quoted log name and
 * the word failed without a separating space. */
408 LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s'"
409 "failed from the MGS (%d). Make sure this "
410 "client and the MGS are running compatible "
411 "versions of Lustre.\n",
412 mgc->obd_name, logname, rc);
415 LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' "
416 "failed (%d). This may be the result of "
417 "communication errors between this node and "
418 "the MGS, a bad configuration, or other "
419 "errors. See the syslog for more "
420 "information.\n", mgc->obd_name, logname,
423 /* class_obd_list(); */
427 /* Stop watching this config log for updates */
/* Counterpart of lustre_process_log(): sends an LCFG_LOG_END lustre_cfg to
 * the MGC recorded in the superblock info. */
428 int lustre_end_log(struct super_block *sb, char *logname,
429 struct config_llog_instance *cfg)
431 struct lustre_cfg *lcfg;
432 struct lustre_cfg_bufs bufs;
433 struct lustre_sb_info *lsi = s2lsi(sb);
434 struct obd_device *mgc = lsi->lsi_mgc;
441 /* mgc_process_config */
/* buffer 0: MGC name, 1: log name, 2: cfg instance (the guard around
 * buffer 2, if any, is not visible in this view) */
442 lustre_cfg_bufs_reset(&bufs, mgc->obd_name);
443 lustre_cfg_bufs_set_string(&bufs, 1, logname);
445 lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg));
446 lcfg = lustre_cfg_new(LCFG_LOG_END, &bufs);
447 rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
448 lustre_cfg_free(lcfg);
452 /**************** obd start *******************/
/* Build a lustre_cfg for command cmd addressed to cfgname, with up to four
 * string arguments and a nid, run it through class_process_config() and
 * free it again. */
454 int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
455 char *s1, char *s2, char *s3, char *s4)
457 struct lustre_cfg_bufs bufs;
458 struct lustre_cfg * lcfg = NULL;
461 CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname,
462 cmd, s1, s2, s3, s4);
/* buffer 0 is the target name; s1..s4 land in buffers 1..4 (callers in
 * this file pass 0 for unused slots; any guards are not visible here) */
464 lustre_cfg_bufs_reset(&bufs, cfgname);
466 lustre_cfg_bufs_set_string(&bufs, 1, s1);
468 lustre_cfg_bufs_set_string(&bufs, 2, s2);
470 lustre_cfg_bufs_set_string(&bufs, 3, s3);
472 lustre_cfg_bufs_set_string(&bufs, 4, s4);
/* NOTE(review): lcfg is dereferenced right after lustre_cfg_new() with no
 * visible NULL check - confirm the allocator cannot fail here. */
474 lcfg = lustre_cfg_new(cmd, &bufs);
475 lcfg->lcfg_nid = nid;
476 rc = class_process_config(lcfg);
477 lustre_cfg_free(lcfg);
/* Start an obd in two config steps: LCFG_ATTACH with (type, uuid), then
 * LCFG_SETUP with (s1, s2).  When setup fails the obd is detached again. */
481 static int lustre_start_simple(char *obdname, char *type, char *uuid,
485 CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type);
487 rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, 0, 0);
489 CERROR("%s attach error %d\n", obdname, rc);
492 rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, 0, 0);
/* setup failure: report it and undo the attach; the detach result is
 * deliberately not stored */
494 CERROR("%s setup error %d\n", obdname, rc);
495 do_lcfg(obdname, 0, LCFG_DETACH, 0, 0, 0, 0);
500 /* Set up a MGS to serve startup logs */
/* Refuses to start when a mount is already registered under
 * LUSTRE_MGS_OBDNAME; otherwise registers this mount under that name and
 * starts the MGS obd with lustre_start_simple(). */
501 static int server_start_mgs(struct super_block *sb)
503 struct lustre_sb_info *lsi = s2lsi(sb);
504 struct vfsmount *mnt = lsi->lsi_srv_mnt;
505 struct lustre_mount_info *lmi;
510 /* It is impossible to have more than 1 MGS per node, since
511 MGC wouldn't know which to connect to */
/* NOTE(review): this lookup is made without lustre_mount_info_lock in the
 * lines visible here, unlike the other callers of server_find_mount(). */
512 lmi = server_find_mount(LUSTRE_MGS_OBDNAME);
/* lsi is re-pointed at the superblock that already owns the MGS so the
 * message can name that service */
514 lsi = s2lsi(lmi->lmi_sb);
515 LCONSOLE_ERROR_MSG(0x15d, "The MGS service was already started"
517 lsi->lsi_ldd->ldd_svname);
521 CDEBUG(D_CONFIG, "Start MGS service %s\n", LUSTRE_MGS_OBDNAME);
523 rc = server_register_mount(LUSTRE_MGS_OBDNAME, sb, mnt);
526 rc = lustre_start_simple(LUSTRE_MGS_OBDNAME, LUSTRE_MGS_NAME,
527 LUSTRE_MGS_OBDNAME, 0, 0);
528 /* Do NOT call server_deregister_mount() here. This leads to
529 * inability to clean up cleanly and free lsi and other stuff when
530 * mgs calls server_put_mount() in error handling case. -umka */
534 LCONSOLE_ERROR_MSG(0x15e, "Failed to start MGS '%s' (%d). "
535 "Is the 'mgs' module loaded?\n",
536 LUSTRE_MGS_OBDNAME, rc);
/* Stop the MGS obd: look it up by LUSTRE_MGS_OBDNAME, log at debug level
 * when it is not running, otherwise clean it up through
 * class_manual_cleanup().  sb is not referenced in the lines visible
 * here. */
540 static int server_stop_mgs(struct super_block *sb)
542 struct obd_device *obd;
546 CDEBUG(D_MOUNT, "Stop MGS service %s\n", LUSTRE_MGS_OBDNAME);
548 /* There better be only one MGS */
549 obd = class_name2obd(LUSTRE_MGS_OBDNAME);
551 CDEBUG(D_CONFIG, "mgs %s not running\n", LUSTRE_MGS_OBDNAME);
555 /* The MGS should always stop when we say so */
557 rc = class_manual_cleanup(obd);
/* mgc_start_lock is taken by lustre_start_mgc() and lustre_stop_mgc()
 * around MGC lookup, creation and cl_mgc_refcount changes. */
561 CFS_DECLARE_MUTEX(mgc_start_lock);
563 /** Set up a mgc obd to process startup logs
 *
 * The MGC name is LUSTRE_MGC_OBDNAME followed by the string form of the
 * first usable MGS nid.  If a non-stopping obd of that name already exists
 * it is reused and its cl_mgc_refcount is incremented; otherwise the MGS
 * nids are added as uuids, a new MGC is started with a random uuid, any
 * failover nids are added, the refcount is set to 1 and the MGC is
 * connected to the MGS.
 *
565 * \param sb [in] super block of the mgc obd
567 * \retval 0 success, otherwise error code
569 static int lustre_start_mgc(struct super_block *sb)
571 struct obd_connect_data *data = NULL;
572 struct lustre_sb_info *lsi = s2lsi(sb);
573 struct obd_device *obd;
574 struct obd_export *exp;
575 struct obd_uuid *uuid;
578 char *mgcname, *niduuid, *mgssec;
581 int rc = 0, i = 0, j, len;
584 LASSERT(lsi->lsi_lmd);
586 /* Find the first non-lo MGS nid for our MGC name */
587 if (lsi->lsi_flags & LSI_SERVER) {
588 ptr = lsi->lsi_ldd->ldd_params;
589 /* Use mgsnode= nids */
590 if ((class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0) &&
591 (class_parse_nid(ptr, &nid, &ptr) == 0)) {
593 } else if (IS_MGS(lsi->lsi_ldd)) {
594 lnet_process_id_t id;
595 while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
596 if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
603 } else { /* client */
604 /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
605 ptr = lsi->lsi_lmd->lmd_dev;
606 if (class_parse_nid(ptr, &nid, &ptr) == 0)
610 CERROR("No valid MGS nids found.\n");
/* len covers the name prefix, the nid string and the terminating NUL.
 * NOTE(review): niduuid gets only two extra bytes, which fits the
 * underscore plus ONE hex digit written by the %s_%x sprintf calls below;
 * confirm the index i can never reach 16. */
614 len = strlen(LUSTRE_MGC_OBDNAME) + strlen(libcfs_nid2str(nid)) + 1;
615 OBD_ALLOC(mgcname, len);
616 OBD_ALLOC(niduuid, len + 2);
/* NOTE(review): this jump to out_free happens before mgc_start_lock is
 * taken a few lines below; confirm the out_free path does not release
 * the mutex. */
617 if (!mgcname || !niduuid)
618 GOTO(out_free, rc = -ENOMEM);
619 sprintf(mgcname, "%s%s", LUSTRE_MGC_OBDNAME, libcfs_nid2str(nid));
/* an absent mgssec mount option is passed on as the empty string */
621 mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : "";
623 cfs_mutex_down(&mgc_start_lock);
/* reuse path: an MGC of this name exists and is not being stopped */
625 obd = class_name2obd(mgcname);
626 if (obd && !obd->obd_stopping) {
627 rc = obd_set_info_async(obd->obd_self_export,
628 strlen(KEY_MGSSEC), KEY_MGSSEC,
629 strlen(mgssec), mgssec, NULL);
633 /* Re-using an existing MGC */
634 cfs_atomic_inc(&obd->u.cli.cl_mgc_refcount);
637 /* If we are restarting the MGS, don't try to keep the MGC's
638 old connection, or registration will fail. */
639 if ((lsi->lsi_flags & LSI_SERVER) && IS_MGS(lsi->lsi_ldd)) {
640 CDEBUG(D_MOUNT, "New MGS with live MGC\n");
644 /* Try all connections, but only once (again).
645 We don't want to block another target from starting
646 (using its local copy of the log), but we do want to connect
647 if at all possible. */
649 CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname,recov_bk);
650 rc = obd_set_info_async(obd->obd_self_export,
651 sizeof(KEY_INIT_RECOV_BACKUP),
652 KEY_INIT_RECOV_BACKUP,
653 sizeof(recov_bk), &recov_bk, NULL);
/* creation path: no reusable MGC, so configure and start a new one */
657 CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname);
659 /* Add the primary nids for the MGS */
661 sprintf(niduuid, "%s_%x", mgcname, i);
662 if (lsi->lsi_flags & LSI_SERVER) {
663 ptr = lsi->lsi_ldd->ldd_params;
664 if (IS_MGS(lsi->lsi_ldd)) {
665 /* Use local nids (including LO) */
666 lnet_process_id_t id;
667 while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
668 rc = do_lcfg(mgcname, id.nid,
669 LCFG_ADD_UUID, niduuid, 0,0,0);
672 /* Use mgsnode= nids */
673 if (class_find_param(ptr, PARAM_MGSNODE, &ptr) != 0) {
674 CERROR("No MGS nids given.\n");
675 GOTO(out_free, rc = -EINVAL);
677 while (class_parse_nid(ptr, &nid, &ptr) == 0) {
678 rc = do_lcfg(mgcname, nid,
679 LCFG_ADD_UUID, niduuid, 0,0,0);
683 } else { /* client */
684 /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
685 ptr = lsi->lsi_lmd->lmd_dev;
686 while (class_parse_nid(ptr, &nid, &ptr) == 0) {
687 rc = do_lcfg(mgcname, nid,
688 LCFG_ADD_UUID, niduuid, 0,0,0);
690 /* Stop at the first failover nid */
696 CERROR("No valid MGS nids found.\n");
697 GOTO(out_free, rc = -EINVAL);
/* one MGS node known so far; updated again after the failover loop */
699 lsi->lsi_lmd->lmd_mgs_failnodes = 1;
701 /* Random uuid for MGC allows easier reconnects */
703 ll_generate_random_uuid(uuidc);
704 class_uuid_unparse(uuidc, uuid);
707 rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME,
708 (char *)uuid->uuid, LUSTRE_MGS_OBDNAME,
714 /* Add any failover MGS nids */
/* ptr still points into the nid list parsed above: a colon or another
 * mgsnode= parameter introduces the next failover node */
716 while ((*ptr == ':' ||
717 class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0)) {
718 /* New failover node */
719 sprintf(niduuid, "%s_%x", mgcname, i);
721 while (class_parse_nid(ptr, &nid, &ptr) == 0) {
723 rc = do_lcfg(mgcname, nid,
724 LCFG_ADD_UUID, niduuid, 0,0,0);
729 rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN,
/* remembered so lustre_stop_mgc() knows how many nid uuids to delete */
737 lsi->lsi_lmd->lmd_mgs_failnodes = i;
/* look the freshly started MGC up again by name */
739 obd = class_name2obd(mgcname);
741 CERROR("Can't find mgcobd %s\n", mgcname);
742 GOTO(out_free, rc = -ENOTCONN);
745 rc = obd_set_info_async(obd->obd_self_export,
746 strlen(KEY_MGSSEC), KEY_MGSSEC,
747 strlen(mgssec), mgssec, NULL);
751 /* Keep a refcount of servers/clients who started with "mount",
752 so we know when we can get rid of the mgc. */
753 cfs_atomic_set(&obd->u.cli.cl_mgc_refcount, 1);
755 /* Try all connections, but only once. */
757 rc = obd_set_info_async(obd->obd_self_export,
758 sizeof(KEY_INIT_RECOV_BACKUP),
759 KEY_INIT_RECOV_BACKUP,
760 sizeof(recov_bk), &recov_bk, NULL);
/* failing to set the recovery-backup key is only warned about */
763 CWARN("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc);
764 /* We connect to the MGS at setup, and don't disconnect until cleanup */
767 GOTO(out, rc = -ENOMEM);
768 data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_FID |
770 data->ocd_version = LUSTRE_VERSION_CODE;
771 rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL);
774 CERROR("connect failed %d\n", rc);
/* the export is kept on the MGC; lustre_stop_mgc() disconnects it */
778 obd->u.cli.cl_mgc_mgsexp = exp;
781 /* Keep the mgc info in the sb. Note that many lsi's can point
785 cfs_mutex_up(&mgc_start_lock);
788 OBD_FREE(mgcname, len);
790 OBD_FREE(niduuid, len + 2);
/* Drop one reference on the MGC.  While other users remain the call ends
 * with rc = -EBUSY.  The last user disconnects the MGS export, cleans the
 * obd up and deletes the nid uuids created by lustre_start_mgc().
 * Everything runs under mgc_start_lock. */
794 static int lustre_stop_mgc(struct super_block *sb)
796 struct lustre_sb_info *lsi = s2lsi(sb);
797 struct obd_device *obd;
798 char *niduuid = 0, *ptr = 0;
799 int i, rc = 0, len = 0;
809 cfs_mutex_down(&mgc_start_lock);
810 LASSERT(cfs_atomic_read(&obd->u.cli.cl_mgc_refcount) > 0);
811 if (!cfs_atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) {
812 /* This is not fatal, every client that stops
813 will call in here. */
814 CDEBUG(D_MOUNT, "mgc still has %d references.\n",
815 cfs_atomic_read(&obd->u.cli.cl_mgc_refcount));
816 GOTO(out, rc = -EBUSY);
819 /* The MGC has no recoverable data in any case.
820 * force shutdown set in umount_begin */
821 obd->obd_no_recov = 1;
823 if (obd->u.cli.cl_mgc_mgsexp) {
824 /* An error is not fatal, if we are unable to send the
825 disconnect mgs ping evictor cleans up the export */
826 rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp);
828 CDEBUG(D_MOUNT, "disconnect failed %d\n", rc);
831 /* Save the obdname for cleaning the nid uuids, which are
833 len = strlen(obd->obd_name) + 6;
834 OBD_ALLOC(niduuid, len);
836 strcpy(niduuid, obd->obd_name);
837 ptr = niduuid + strlen(niduuid);
840 rc = class_manual_cleanup(obd);
844 /* Clean the nid uuids */
/* The obd name was copied into niduuid before the cleanup above, with six
 * spare bytes for the suffix and NUL.  ptr marks the end of the name, so
 * each pass of the loop overwrites the suffix in place. */
846 GOTO(out, rc = -ENOMEM);
/* one uuid per MGS node recorded in lmd_mgs_failnodes at start time */
848 for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) {
849 sprintf(ptr, "_%x", i);
850 rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID,
853 CERROR("del MDC UUID %s failed: rc = %d\n",
858 OBD_FREE(niduuid, len);
860 /* class_import_put will get rid of the additional connections */
861 cfs_mutex_up(&mgc_start_lock);
865 /* Since there's only one mgc per node, we have to change its fs to get
866 access to the right disk. */
/* Sends KEY_SET_FS with the super_block to the MGC through
 * obd_set_info_async() on its self export and logs an error on failure. */
867 static int server_mgc_set_fs(struct obd_device *mgc, struct super_block *sb)
869 struct lustre_sb_info *lsi = s2lsi(sb);
873 CDEBUG(D_MOUNT, "Set mgc disk for %s\n", lsi->lsi_lmd->lmd_dev);
875 /* cl_mgc_sem in mgc ensures we sleep if the mgc_fs is busy */
/* the key length is sizeof(KEY_SET_FS) here, whereas the KEY_MGSSEC calls
 * in lustre_start_mgc() pass strlen() of the key */
876 rc = obd_set_info_async(mgc->obd_self_export,
877 sizeof(KEY_SET_FS), KEY_SET_FS,
878 sizeof(*sb), sb, NULL);
880 CERROR("can't set_fs %d\n", rc);
/* Counterpart of server_mgc_set_fs(): sends KEY_CLEAR_FS to the MGC self
 * export (the value arguments are not visible in this view). */
886 static int server_mgc_clear_fs(struct obd_device *mgc)
891 CDEBUG(D_MOUNT, "Unassign mgc disk\n");
893 rc = obd_set_info_async(mgc->obd_self_export,
894 sizeof(KEY_CLEAR_FS), KEY_CLEAR_FS,
/* server_start_lock is held while the shared MDS/OSS obds are started in
 * server_start_targets() and stopped in server_stop_servers(). */
899 CFS_DECLARE_MUTEX(server_start_lock);
901 /* Stop MDS/OSS if nobody is using them */
/* Picks the shared service obd matching the target type bits in lddflags
 * and cleans it up when its obd type is missing or has a zero typ_refcnt.
 * lsiflags is not referenced in the lines visible here. */
902 static int server_stop_servers(int lddflags, int lsiflags)
904 struct obd_device *obd = NULL;
905 struct obd_type *type = NULL;
909 cfs_mutex_down(&server_start_lock);
911 /* Either an MDT or an OST or neither */
912 /* if this was an MDT, and there are no more MDT's, clean up the MDS */
913 if ((lddflags & LDD_F_SV_TYPE_MDT) &&
914 (obd = class_name2obd(LUSTRE_MDS_OBDNAME))) {
915 /*FIXME pre-rename, should eventually be LUSTRE_MDT_NAME*/
916 type = class_search_type(LUSTRE_MDS_NAME);
918 /* if this was an OST, and there are no more OST's, clean up the OSS */
919 if ((lddflags & LDD_F_SV_TYPE_OST) &&
920 (obd = class_name2obd(LUSTRE_OSS_OBDNAME))) {
921 type = class_search_type(LUSTRE_OST_NAME);
/* obd stays NULL when neither branch matched, so nothing is stopped */
924 if (obd && (!type || !type->typ_refcnt)) {
927 /* obd_fail doesn't mean much on a server obd */
928 err = class_manual_cleanup(obd);
933 cfs_mutex_up(&server_start_lock);
/* Dump the identifying fields of an mgs_target_info through PRINT_CMD with
 * PRINT_MASK, prefixed by the caller-supplied title. */
938 int server_mti_print(char *title, struct mgs_target_info *mti)
940 PRINT_CMD(PRINT_MASK, "mti %s\n", title);
941 PRINT_CMD(PRINT_MASK, "server: %s\n", mti->mti_svname);
942 PRINT_CMD(PRINT_MASK, "fs: %s\n", mti->mti_fsname);
943 PRINT_CMD(PRINT_MASK, "uuid: %s\n", mti->mti_uuid);
944 PRINT_CMD(PRINT_MASK, "ver: %d flags: %#x\n",
945 mti->mti_config_ver, mti->mti_flags);
/* Fill an mgs_target_info from the on-disk data of a server superblock:
 * filesystem and server names, the local nids reported by LNetGetId()
 * (at most MTI_NIDS_MAX), version, flags, index, uuid and params. */
949 static int server_sb2mti(struct super_block *sb, struct mgs_target_info *mti)
951 struct lustre_sb_info *lsi = s2lsi(sb);
952 struct lustre_disk_data *ldd = lsi->lsi_ldd;
953 lnet_process_id_t id;
/* only meaningful for server mounts */
957 if (!(lsi->lsi_flags & LSI_SERVER))
/* NOTE(review): strncpy with the full destination size leaves the field
 * unterminated when the source is at least that long - confirm the ldd
 * names are always shorter than the mti fields. */
960 strncpy(mti->mti_fsname, ldd->ldd_fsname,
961 sizeof(mti->mti_fsname));
962 strncpy(mti->mti_svname, ldd->ldd_svname,
963 sizeof(mti->mti_svname));
965 mti->mti_nid_count = 0;
966 while (LNetGetId(i++, &id) != -ENOENT) {
/* loopback nids are tested for here; the action taken is not visible in
 * this view (presumably they are skipped) */
967 if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
969 mti->mti_nids[mti->mti_nid_count] = id.nid;
970 mti->mti_nid_count++;
971 if (mti->mti_nid_count >= MTI_NIDS_MAX) {
972 CWARN("Only using first %d nids for %s\n",
973 mti->mti_nid_count, mti->mti_svname);
978 mti->mti_lustre_ver = LUSTRE_VERSION_CODE;
979 mti->mti_config_ver = 0;
980 mti->mti_flags = ldd->ldd_flags;
981 mti->mti_stripe_index = ldd->ldd_svindex;
982 memcpy(mti->mti_uuid, ldd->ldd_uuid, sizeof(mti->mti_uuid));
/* NOTE(review): a params string of exactly sizeof(mti->mti_params)
 * characters passes this test, and the memcpy below then copies it
 * without its terminating NUL. */
983 if (strlen(ldd->ldd_params) > sizeof(mti->mti_params)) {
984 CERROR("params too big for mti\n");
987 memcpy(mti->mti_params, ldd->ldd_params, sizeof(mti->mti_params));
991 /* Register an old or new target with the MGS. If needed MGS will construct
992 startup logs and assign index */
/* Builds an mgs_target_info from the superblock, sends it to the MGS
 * through the MGC export under KEY_REGISTER_TARGET, copies the returned
 * flags back into the disk data and, when the MGS sets LDD_F_REWRITE_LDD,
 * rewrites the on-disk index and server name and relabels the disk. */
993 int server_register_target(struct super_block *sb)
995 struct lustre_sb_info *lsi = s2lsi(sb);
996 struct obd_device *mgc = lsi->lsi_mgc;
997 struct lustre_disk_data *ldd = lsi->lsi_ldd;
998 struct mgs_target_info *mti = NULL;
1004 if (!(lsi->lsi_flags & LSI_SERVER))
1010 rc = server_sb2mti(sb, mti);
1014 CDEBUG(D_MOUNT, "Registration %s, fs=%s, %s, index=%04x, flags=%#x\n",
1015 mti->mti_svname, mti->mti_fsname,
1016 libcfs_nid2str(mti->mti_nids[0]), mti->mti_stripe_index,
1019 /* Register the target */
1020 /* FIXME use mgc_process_config instead */
/* mti is both the request and the reply: its flags, stripe index and
 * server name are read back below */
1021 rc = obd_set_info_async(mgc->u.cli.cl_mgc_mgsexp,
1022 sizeof(KEY_REGISTER_TARGET), KEY_REGISTER_TARGET,
1023 sizeof(*mti), mti, NULL);
1027 /* Always update our flags */
/* the rewrite request bit itself is never stored in the disk data */
1028 ldd->ldd_flags = mti->mti_flags & ~LDD_F_REWRITE_LDD;
1030 /* If this flag is set, it means the MGS wants us to change our
1031 on-disk data. (So far this means just the index.) */
1032 if (mti->mti_flags & LDD_F_REWRITE_LDD) {
1035 CDEBUG(D_MOUNT, "Changing on-disk index from %#x to %#x "
1036 "for %s\n", ldd->ldd_svindex, mti->mti_stripe_index,
1038 ldd->ldd_svindex = mti->mti_stripe_index;
1039 strncpy(ldd->ldd_svname, mti->mti_svname,
1040 sizeof(ldd->ldd_svname));
1041 /* or ldd_make_sv_name(ldd); */
/* NOTE(review): the return value of ldd_write() is not checked here */
1042 ldd_write(&mgc->obd_lvfs_ctxt, ldd);
1043 err = fsfilt_set_label(mgc, lsi->lsi_srv_mnt->mnt_sb,
/* a label failure is only logged; it is kept in err, separate from rc */
1046 CERROR("Label set error %d\n", err);
1047 label = fsfilt_get_label(mgc, lsi->lsi_srv_mnt->mnt_sb);
1049 CDEBUG(D_MOUNT, "Disk label changed to %s\n", label);
1051 /* Flush the new ldd to disk */
1052 fsfilt_sync(mgc, lsi->lsi_srv_mnt->mnt_sb);
/* Start the target stored on this disk.  In order: make sure the shared
 * MDS and/or OSS obd is running, point the MGC at this disk, register with
 * the MGS, register the mount under the target name, process the config
 * log named after the target, release the MGC fs, then look the target
 * obd up and notify it that the log has been processed. */
1062 static int server_start_targets(struct super_block *sb, struct vfsmount *mnt)
1064 struct obd_device *obd;
1065 struct lustre_sb_info *lsi = s2lsi(sb);
1066 struct config_llog_instance cfg;
1070 CDEBUG(D_MOUNT, "starting target %s\n", lsi->lsi_ldd->ldd_svname);
1073 /* If we're an MDT, make sure the global MDS is running */
1074 if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_MDT) {
1075 /* make sure the MDS is started */
/* server_start_lock is held across the lookup and the start */
1076 cfs_mutex_down(&server_start_lock);
1077 obd = class_name2obd(LUSTRE_MDS_OBDNAME);
1079 rc = lustre_start_simple(LUSTRE_MDS_OBDNAME,
1080 /* FIXME pre-rename, should eventually be LUSTRE_MDS_NAME */
1082 LUSTRE_MDS_OBDNAME"_uuid",
/* failure path releases the lock before reporting */
1085 cfs_mutex_up(&server_start_lock);
1086 CERROR("failed to start MDS: %d\n", rc);
1090 cfs_mutex_up(&server_start_lock);
1094 /* If we're an OST, make sure the global OSS is running */
1095 if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_OST) {
1096 /* make sure OSS is started */
1097 cfs_mutex_down(&server_start_lock);
1098 obd = class_name2obd(LUSTRE_OSS_OBDNAME);
1100 rc = lustre_start_simple(LUSTRE_OSS_OBDNAME,
1102 LUSTRE_OSS_OBDNAME"_uuid",
1105 cfs_mutex_up(&server_start_lock);
1106 CERROR("failed to start OSS: %d\n", rc);
1110 cfs_mutex_up(&server_start_lock);
1113 /* Set the mgc fs to our server disk. This allows the MGC
1114 to read and write configs locally. */
1115 rc = server_mgc_set_fs(lsi->lsi_mgc, sb);
1119 /* Register with MGS */
1120 rc = server_register_target(sb);
/* a registration failure is reported as required only when the disk data
 * asks for a new index, an update or a 1.4 upgrade */
1121 if (rc && (lsi->lsi_ldd->ldd_flags &
1122 (LDD_F_NEED_INDEX | LDD_F_UPDATE | LDD_F_UPGRADE14))){
1123 CERROR("Required registration failed for %s: %d\n",
1124 lsi->lsi_ldd->ldd_svname, rc);
1126 LCONSOLE_ERROR_MSG(0x15f, "Communication error with "
1127 "the MGS. Is the MGS running?\n");
/* -EINVAL from registration means the MGS rejected this server */
1131 if (rc == -EINVAL) {
1132 LCONSOLE_ERROR_MSG(0x160, "The MGS is refusing to allow this "
1133 "server (%s) to start. Please see messages"
1134 " on the MGS node.\n",
1135 lsi->lsi_ldd->ldd_svname);
1138 /* non-fatal error of registration with MGS */
1140 CDEBUG(D_MOUNT, "Cannot register with MGS: %d\n", rc);
1142 /* Let the target look up the mount using the target's name
1143 (we can't pass the sb or mnt through class_process_config.) */
1144 rc = server_register_mount(lsi->lsi_ldd->ldd_svname, sb, mnt);
1148 /* Start targets using the llog named for the target */
/* a zeroed config_llog_instance is passed along with the log name */
1149 memset(&cfg, 0, sizeof(cfg));
1150 rc = lustre_process_log(sb, lsi->lsi_ldd->ldd_svname, &cfg);
1152 CERROR("failed to start server %s: %d\n",
1153 lsi->lsi_ldd->ldd_svname, rc);
1154 /* Do NOT call server_deregister_mount() here. This makes it
1155 * impossible to find mount later in cleanup time and leaves
1156 * @lsi and other stuff leaked. -umka */
1161 /* Release the mgc fs for others to use */
1162 server_mgc_clear_fs(lsi->lsi_mgc);
/* processing the log is expected to have created an obd named after the
 * target */
1165 obd = class_name2obd(lsi->lsi_ldd->ldd_svname);
1167 CERROR("no server named %s was started\n",
1168 lsi->lsi_ldd->ldd_svname);
/* abort recovery on request, but only if the obd has an iocontrol method */
1172 if ((lsi->lsi_lmd->lmd_flags & LMD_FLG_ABORT_RECOV) &&
1173 (OBP(obd, iocontrol))) {
1174 obd_iocontrol(OBD_IOC_ABORT_RECOVERY,
1175 obd->obd_self_export, 0, NULL, NULL);
1178 /* log has been fully processed */
1179 obd_notify(obd, NULL, OBD_NOTIFY_CONFIG, (void *)CONFIG_LOG);
1185 /***************** lustre superblock **************/
/* Create the lustre_sb_info for sb together with its lustre_mount_data,
 * attach it to the superblock, start lsi_mounts at 1 and default the
 * umount style to failover.  The allocation of lsi itself is not visible
 * in this view. */
1187 struct lustre_sb_info *lustre_init_lsi(struct super_block *sb)
1189 struct lustre_sb_info *lsi;
1195 OBD_ALLOC_PTR(lsi->lsi_lmd);
1196 if (!lsi->lsi_lmd) {
1201 lsi->lsi_lmd->lmd_exclude_count = 0;
/* publish the lsi on the superblock; lustre_free_lsi() resets this to
 * NULL */
1202 s2lsi_nocast(sb) = lsi;
1203 /* we take 1 extra ref for our setup */
1204 cfs_atomic_set(&lsi->lsi_mounts, 1);
1206 /* Default umount style */
1207 lsi->lsi_flags = LSI_UMOUNT_FAILOVER;
/* Free the lustre_sb_info attached to sb together with everything it owns
 * (disk data, mount data and the mount data strings), then clear the
 * superblock pointer.  Asserts that no mount references remain and that
 * the client part (lsi_llsbi) is already gone. */
1212 static int lustre_free_lsi(struct super_block *sb)
1214 struct lustre_sb_info *lsi = s2lsi(sb);
1217 LASSERT(lsi != NULL);
1218 CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi);
1220 /* someone didn't call server_put_mount. */
1221 LASSERT(cfs_atomic_read(&lsi->lsi_mounts) == 0);
1223 if (lsi->lsi_ldd != NULL)
1224 OBD_FREE(lsi->lsi_ldd, sizeof(*lsi->lsi_ldd));
1226 if (lsi->lsi_lmd != NULL) {
/* each string is freed with size strlen + 1, so it must have been
 * allocated with exactly that size */
1227 if (lsi->lsi_lmd->lmd_dev != NULL)
1228 OBD_FREE(lsi->lsi_lmd->lmd_dev,
1229 strlen(lsi->lsi_lmd->lmd_dev) + 1);
1230 if (lsi->lsi_lmd->lmd_profile != NULL)
1231 OBD_FREE(lsi->lsi_lmd->lmd_profile,
1232 strlen(lsi->lsi_lmd->lmd_profile) + 1);
1233 if (lsi->lsi_lmd->lmd_mgssec != NULL)
1234 OBD_FREE(lsi->lsi_lmd->lmd_mgssec,
1235 strlen(lsi->lsi_lmd->lmd_mgssec) + 1);
1236 if (lsi->lsi_lmd->lmd_opts != NULL)
1237 OBD_FREE(lsi->lsi_lmd->lmd_opts,
1238 strlen(lsi->lsi_lmd->lmd_opts) + 1);
/* the exclude array is sized by its element count */
1239 if (lsi->lsi_lmd->lmd_exclude_count)
1240 OBD_FREE(lsi->lsi_lmd->lmd_exclude,
1241 sizeof(lsi->lsi_lmd->lmd_exclude[0]) *
1242 lsi->lsi_lmd->lmd_exclude_count);
1243 OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd));
1246 LASSERT(lsi->lsi_llsbi == NULL);
1247 OBD_FREE(lsi, sizeof(*lsi));
1248 s2lsi_nocast(sb) = NULL;
1253 /* The lsi has one reference for every server that is using the disk -
1254 e.g. MDT, MGS, and potentially MGC */
/* Drops one lsi_mounts reference and frees the lsi when the count reaches
 * zero.  The return statements are not visible in this view;
 * server_put_mount() treats a non-zero result as the last put. */
1255 static int lustre_put_lsi(struct super_block *sb)
1257 struct lustre_sb_info *lsi = s2lsi(sb);
1260 LASSERT(lsi != NULL);
1262 CDEBUG(D_MOUNT, "put %p %d\n", sb, cfs_atomic_read(&lsi->lsi_mounts));
1263 if (cfs_atomic_dec_and_test(&lsi->lsi_mounts)) {
1264 lustre_free_lsi(sb);
1270 /*************** server mount ******************/
1272 /* Kernel mount using mount options in MOUNT_DATA_FILE */
/* Two-stage mount of the backing device.  A first mount as ldiskfs with
 * only the mount-line options is used to read and validate
 * MOUNT_DATA_FILE through ldd_parse(); the real mount then uses the
 * filesystem type and options from the disk data, with the mount-line
 * options appended.  On success the parsed disk data is stored in the lsi
 * and the vfsmount is handed back; on failure the page and the disk data
 * are freed and ERR_PTR(rc) is returned. */
1273 static struct vfsmount *server_kernel_mount(struct super_block *sb)
1275 struct lvfs_run_ctxt mount_ctxt;
1276 struct lustre_sb_info *lsi = s2lsi(sb);
1277 struct lustre_disk_data *ldd;
1278 struct lustre_mount_data *lmd = lsi->lsi_lmd;
1279 struct vfsmount *mnt;
1280 char *options = NULL;
1281 unsigned long page, s_flags;
1282 struct page *__page;
1286 OBD_ALLOC(ldd, sizeof(*ldd));
1288 RETURN(ERR_PTR(-ENOMEM));
1290 /* In the past, we have always used flags = 0.
1291 Note ext3/ldiskfs can't be mounted ro. */
1292 s_flags = sb->s_flags;
1294 /* allocate memory for options */
/* one zeroed page serves as the options buffer for both mounts */
1295 OBD_PAGE_ALLOC(__page, CFS_ALLOC_STD);
1297 GOTO(out_free, rc = -ENOMEM);
1298 page = (unsigned long)cfs_page_address(__page);
1299 options = (char *)page;
1300 memset(options, 0, CFS_PAGE_SIZE);
1302 /* mount-line options must be added for pre-mount because it may
1303 * contain mount options such as journal_dev which are required
1304 * to mount successfully the underlying filesystem */
1305 if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0))
1306 strncat(options, lmd->lmd_opts, CFS_PAGE_SIZE - 1);
1308 /* Pre-mount ldiskfs to read the MOUNT_DATA_FILE */
1309 CDEBUG(D_MOUNT, "Pre-mount ldiskfs %s\n", lmd->lmd_dev);
1310 mnt = ll_kern_mount("ldiskfs", s_flags, lmd->lmd_dev, (void *)options);
1313 CERROR("premount %s:%#lx ldiskfs failed: %d "
1314 "Is the ldiskfs module available?\n",
1315 lmd->lmd_dev, s_flags, rc );
/* build a run context rooted at the pre-mounted filesystem so that
 * ldd_parse() opens MOUNT_DATA_FILE relative to it */
1319 OBD_SET_CTXT_MAGIC(&mount_ctxt);
1320 mount_ctxt.pwdmnt = mnt;
1321 mount_ctxt.pwd = mnt->mnt_root;
1322 mount_ctxt.fs = get_ds();
1324 rc = ldd_parse(&mount_ctxt, ldd);
1328 CERROR("premount parse options failed: rc = %d\n", rc);
1332 /* Done with our pre-mount, now do the real mount. */
1334 /* Glom up mount options */
/* start over with the options stored on disk; the buffer was re-zeroed, so
 * the bounded copy below always stays NUL-terminated */
1335 memset(options, 0, CFS_PAGE_SIZE);
1336 strncpy(options, ldd->ldd_mount_opts, CFS_PAGE_SIZE - 2);
1338 /* Add in any mount-line options */
1339 if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0)) {
/* space left after reserving room for the comma and the NUL */
1340 int len = CFS_PAGE_SIZE - strlen(options) - 2;
1342 strcat(options, ",");
1343 strncat(options, lmd->lmd_opts, len);
1346 /* Special permanent mount flags */
1348 s_flags |= MS_NOATIME | MS_NODIRATIME;
1350 CDEBUG(D_MOUNT, "kern_mount: %s %s %s\n",
1351 MT_STR(ldd), lmd->lmd_dev, options);
1352 mnt = ll_kern_mount(MT_STR(ldd), s_flags, lmd->lmd_dev,
1356 CERROR("ll_kern_mount failed: rc = %d\n", rc);
/* with abort-recovery requested, LAST_RCVD is truncated right after the
 * real mount */
1360 if (lmd->lmd_flags & LMD_FLG_ABORT_RECOV)
1361 simple_truncate(mnt->mnt_sb->s_root, mnt, LAST_RCVD,
1364 OBD_PAGE_FREE(__page);
1365 lsi->lsi_ldd = ldd; /* freed at lsi cleanup */
1366 CDEBUG(D_SUPER, "%s: mnt = %p\n", lmd->lmd_dev, mnt);
/* error exit: release the options page and the disk data, leave lsi_ldd
 * unset */
1371 OBD_PAGE_FREE(__page);
1372 OBD_FREE(ldd, sizeof(*ldd));
1373 lsi->lsi_ldd = NULL;
1374 RETURN(ERR_PTR(rc));
1377 /* Wait here forever until the mount refcount is 0 before completing umount,
1378 * else we risk dereferencing a null pointer.
1379 * LNET may take e.g. 165s before killing zombies.
 *
 * The visible loop keeps waiting while mnt->mnt_count is above 1 (not 0),
 * sleeping in slices of cfs_time_seconds(3) and printing a console warning
 * whenever the waited counter is a non-zero multiple of 30.
 * NOTE(review): how waited is advanced and how an interrupted sleep leaves
 * the loop is not visible here - confirm.
1381 static void server_wait_finished(struct vfsmount *mnt)
1385 cfs_sigset_t blocked;
1387 cfs_waitq_init(&waitq);
1389 while (atomic_read(&mnt->mnt_count) > 1) {
1390 if (waited && (waited % 30 == 0))
1391 LCONSOLE_WARN("Mount still busy with %d refs after "
1393 atomic_read(&mnt->mnt_count),
1395 /* Cannot use l_event_wait() for an interruptible sleep. */
/* The signal set is switched around the sleep and the saved set is handed
 * back to cfs_block_sigs() afterwards; presumably only SIGKILL can wake the
 * sleeper early - confirm against l_w_e_set_sigs(). */
1397 blocked = l_w_e_set_sigs(sigmask(SIGKILL));
1398 cfs_waitq_wait_event_interruptible_timeout(
1400 (atomic_read(&mnt->mnt_count) == 1),
1401 cfs_time_seconds(3),
1403 cfs_block_sigs(blocked);
/* NOTE(review): the condition guarding this emergency message is not
 * visible here; the text indicates it reports an interrupted umount. */
1405 LCONSOLE_EMERG("Danger: interrupted umount %s with "
1408 atomic_read(&mnt->mnt_count));
/* put_super handler for server mounts (installed through server_ops).
 * Visible sequence: stop the target named ldd_svname, stop the MGS unless
 * LMD_FLG_NOMGS is set, drop the MGC and sb via lustre_common_put_super(),
 * wait for the backing mount with server_wait_finished(), stop the shared
 * servers, then clean up an obd whose name was saved in extraname.
 * NOTE(review): tmpname and extraname are written right after OBD_ALLOC
 * with no visible NULL check - confirm allocation failure is handled. */
1415 static void server_put_super(struct super_block *sb)
1417 struct lustre_sb_info *lsi = s2lsi(sb);
1418 struct obd_device *obd;
1419 struct vfsmount *mnt = lsi->lsi_srv_mnt;
1420 char *tmpname, *extraname = NULL;
/* Both flag words are copied up front because they are used again after
 * lustre_common_put_super(); presumably the lsi may be gone by then. */
1422 int lddflags = lsi->lsi_ldd->ldd_flags;
1423 int lsiflags = lsi->lsi_flags;
1426 LASSERT(lsiflags & LSI_SERVER);
/* Private copy of the service name, kept for the final console message. */
1428 tmpname_sz = strlen(lsi->lsi_ldd->ldd_svname) + 1;
1429 OBD_ALLOC(tmpname, tmpname_sz);
1430 memcpy(tmpname, lsi->lsi_ldd->ldd_svname, tmpname_sz);
1431 CDEBUG(D_MOUNT, "server put_super %s\n", tmpname);
/* An MDT mounted with LMD_FLG_NOSVC is reported under the name MGS. */
1432 if (IS_MDT(lsi->lsi_ldd) && (lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC))
1433 snprintf(tmpname, tmpname_sz, "MGS");
1435 /* Stop the target */
1436 if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
1437 (IS_MDT(lsi->lsi_ldd) || IS_OST(lsi->lsi_ldd))) {
1438 struct lustre_profile *lprof = NULL;
1440 /* tell the mgc to drop the config log */
1441 lustre_end_log(sb, lsi->lsi_ldd->ldd_svname, NULL);
1443 /* COMPAT_146 - profile may get deleted in mgc_cleanup.
1444 If there are any setup/cleanup errors, save the lov
1445 name for safety cleanup later. */
1446 lprof = class_get_profile(lsi->lsi_ldd->ldd_svname);
1447 if (lprof && lprof->lp_dt) {
/* Copy of lp_dt; it is looked up and freed near the end of this function. */
1448 OBD_ALLOC(extraname, strlen(lprof->lp_dt) + 1);
1449 strcpy(extraname, lprof->lp_dt);
1452 obd = class_name2obd(lsi->lsi_ldd->ldd_svname);
1454 CDEBUG(D_MOUNT, "stopping %s\n", obd->obd_name);
1455 if (lsi->lsi_flags & LSI_UMOUNT_FAILOVER)
1457 /* We can't seem to give an error return code
1458 * to .put_super, so we better make sure we clean up! */
1460 class_manual_cleanup(obd);
1462 CERROR("no obd %s\n", lsi->lsi_ldd->ldd_svname);
1463 server_deregister_mount(lsi->lsi_ldd->ldd_svname);
1467 /* If they wanted the mgs to stop separately from the mdt, they
1468 should have put it on a different device. */
1469 if (IS_MGS(lsi->lsi_ldd)) {
1470 /* if MDS start with --nomgs, don't stop MGS then */
1471 if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS))
1472 server_stop_mgs(sb);
1475 /* Clean the mgc and sb */
1476 lustre_common_put_super(sb);
1478 /* Wait for the targets to really clean up - can't exit (and let the
1479 sb get destroyed) while the mount is still in use */
1480 server_wait_finished(mnt);
1482 /* drop the One True Mount */
1485 /* Stop the servers (MDS, OSS) if no longer needed. We must wait
1486 until the target is really gone so that our type refcount check
1488 server_stop_servers(lddflags, lsiflags);
1490 /* In case of startup or cleanup err, stop related obds */
/* extraname holds the lp_dt name saved from the profile earlier on. */
1492 obd = class_name2obd(extraname);
1494 CWARN("Cleaning orphaned obd %s\n", extraname);
1496 class_manual_cleanup(obd);
1498 OBD_FREE(extraname, strlen(extraname) + 1);
1501 LCONSOLE_WARN("server umount %s complete\n", tmpname);
1502 OBD_FREE(tmpname, tmpname_sz);
/* umount_begin handler (installed through server_ops). Marks the lsi for a
 * forced umount by clearing LSI_UMOUNT_FAILOVER and setting
 * LSI_UMOUNT_FORCE. HAVE_UMOUNTBEGIN_VFSMOUNT selects between the
 * vfsmount-based and the super_block-based signature. */
1506 #ifdef HAVE_UMOUNTBEGIN_VFSMOUNT
1507 static void server_umount_begin(struct vfsmount *vfsmnt, int flags)
1509 struct super_block *sb = vfsmnt->mnt_sb;
1511 static void server_umount_begin(struct super_block *sb)
1514 struct lustre_sb_info *lsi = s2lsi(sb);
/* Only the vfsmount variant receives flags and can test for MNT_FORCE.
 * NOTE(review): the body of the non-force branch is not visible here. */
1517 #ifdef HAVE_UMOUNTBEGIN_VFSMOUNT
1518 if (!(flags & MNT_FORCE)) {
1524 CDEBUG(D_MOUNT, "umount -f\n");
1525 /* umount = failover
1527 no third way to do non-force, non-failover */
1528 lsi->lsi_flags &= ~LSI_UMOUNT_FAILOVER;
1529 lsi->lsi_flags |= LSI_UMOUNT_FORCE;
/* statfs handler (installed through server_ops). When the backing mount
 * lsi_srv_mnt has a statfs operation the request is forwarded to it and
 * f_type is then overwritten with sb->s_magic; otherwise @buf is filled
 * from @sb (f_type, f_bsize) and NAME_MAX.
 * HAVE_STATFS_DENTRY_PARAM selects whether the handler, and the forwarded
 * call, take a dentry or a super_block. */
1533 #ifndef HAVE_STATFS_DENTRY_PARAM
1534 static int server_statfs (struct super_block *sb, cfs_kstatfs_t *buf)
1537 static int server_statfs (struct dentry *dentry, cfs_kstatfs_t *buf)
1539 struct super_block *sb = dentry->d_sb;
1541 struct vfsmount *mnt = s2lsi(sb)->lsi_srv_mnt;
/* Forward to the backing filesystem when it can answer. */
1544 if (mnt && mnt->mnt_sb && mnt->mnt_sb->s_op->statfs) {
1545 #ifdef HAVE_STATFS_DENTRY_PARAM
1546 int rc = mnt->mnt_sb->s_op->statfs(mnt->mnt_root, buf);
1548 int rc = mnt->mnt_sb->s_op->statfs(mnt->mnt_sb, buf);
/* Report the magic of this superblock rather than that of the backing one. */
1551 buf->f_type = sb->s_magic;
/* Fallback values taken from this superblock.
 * NOTE(review): the remaining fields set on this path are not visible here. */
1557 buf->f_type = sb->s_magic;
1558 buf->f_bsize = sb->s_blocksize;
1564 buf->f_namelen = NAME_MAX;
/* Superblock operations for server mounts; installed on the superblock by
 * server_fill_super_common(). */
1568 static struct super_operations server_ops =
1570 .put_super = server_put_super,
1571 .umount_begin = server_umount_begin, /* umount -f */
1572 .statfs = server_statfs,
/* log2() is used below to derive s_blocksize_bits from s_blocksize;
 * presumably cfs_ffz on the complement yields the index of the lowest set
 * bit - confirm against cfs_ffz().
 * LUSTRE_SUPER_MAGIC is the value stored in sb->s_magic for server mounts. */
1575 #define log2(n) cfs_ffz(~(n))
1576 #define LUSTRE_SUPER_MAGIC 0x0BD00BD1
/* Fills in the generic fields of a server superblock: 4096 byte blocks,
 * LUSTRE_SUPER_MAGIC, read-only flag, server_ops, and a root dentry backed
 * by a freshly allocated directory inode.
 * NOTE(review): the error checks and return values after new_inode() and
 * d_alloc_root() are not visible here. */
1578 static int server_fill_super_common(struct super_block *sb)
1580 struct inode *root = 0;
1583 CDEBUG(D_MOUNT, "Server sb, dev=%d\n", (int)sb->s_dev);
1585 sb->s_blocksize = 4096;
1586 sb->s_blocksize_bits = log2(sb->s_blocksize);
1587 sb->s_magic = LUSTRE_SUPER_MAGIC;
1588 sb->s_maxbytes = 0; //PAGE_CACHE_MAXBYTES;
1589 sb->s_flags |= MS_RDONLY;
1590 sb->s_op = &server_ops;
/* Placeholder root inode for the superblock. */
1592 root = new_inode(sb);
1594 CERROR("Can't make root inode\n");
1598 /* returns -EIO for every operation */
1599 /* make_bad_inode(root); -- badness - can't umount */
1600 /* apparently we need to be a directory for the mount to finish */
1601 root->i_mode = S_IFDIR;
1603 sb->s_root = d_alloc_root(root);
1605 CERROR("Can't make root dentry\n");
/* Brings up a server mount on @sb. Visible sequence: mount the backing
 * device with server_kernel_mount(), refuse a target name that is already
 * registered, start the MGS (unless LMD_FLG_NOMGS), start the MGC, start
 * the targets (unless LMD_FLG_NOSVC), then fill in the generic superblock
 * fields. The visible failure path at the end calls server_put_super().
 * NOTE(review): the rc checks and goto labels are not visible here -
 * confirm which failures take which exit. */
1613 static int server_fill_super(struct super_block *sb)
1615 struct lustre_sb_info *lsi = s2lsi(sb);
1616 struct vfsmount *mnt;
1620 /* the One True Mount */
1621 mnt = server_kernel_mount(sb);
1624 CERROR("Unable to mount device %s: %d\n",
1625 lsi->lsi_lmd->lmd_dev, rc);
/* Remember the backing mount; server_put_super() and server_statfs() read
 * it back from the lsi. */
1629 lsi->lsi_srv_mnt = mnt;
/* server_kernel_mount() sets lsi_ldd on its success path. */
1631 LASSERT(lsi->lsi_ldd);
1632 CDEBUG(D_MOUNT, "Found service %s for fs '%s' on device %s\n",
1633 lsi->lsi_ldd->ldd_svname, lsi->lsi_ldd->ldd_fsname,
1634 lsi->lsi_lmd->lmd_dev);
/* An obd with this service name already exists: report a double mount. */
1636 if (class_name2obd(lsi->lsi_ldd->ldd_svname)) {
1637 LCONSOLE_ERROR_MSG(0x161, "The target named %s is already "
1638 "running. Double-mount may have compromised"
1639 " the disk journal.\n",
1640 lsi->lsi_ldd->ldd_svname);
1646 /* Start MGS before MGC */
1647 if (IS_MGS(lsi->lsi_ldd) && !(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)){
1648 rc = server_start_mgs(sb);
1653 rc = lustre_start_mgc(sb);
1657 /* Set up all obd devices for service */
1658 if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
1659 (IS_OST(lsi->lsi_ldd) || IS_MDT(lsi->lsi_ldd))) {
1660 rc = server_start_targets(sb, mnt);
1662 CERROR("Unable to start targets: %d\n", rc);
1665 /* FIXME overmount client here,
1666 or can we just start a client log and client_fill_super on this sb?
1667 We need to make sure server_put_super gets called too - ll_put_super
1668 calls lustre_common_put_super; check there for LSI_SERVER flag,
1670 Probably should start client from new thread so we can return.
1671 Client will not finish until all servers are connected.
1672 Note - MGS-only server does NOT get a client, since there is no
1673 lustre fs associated - the MGS is for all lustre fs's */
1676 rc = server_fill_super_common(sb);
/* Same naming rule as in server_put_super(): an MDT mounted with
 * LMD_FLG_NOSVC is announced as MGS. */
1680 LCONSOLE_WARN("Server %s on device %s has started\n",
1681 ((lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
1682 (IS_MDT(lsi->lsi_ldd))) ? "MGS" : lsi->lsi_ldd->ldd_svname,
1683 lsi->lsi_lmd->lmd_dev);
1687 /* We jump here in case of failure while starting targets or MGS.
1688 * In this case we can't just put @mnt and have to do real cleanup
1689 * with stopping targets, etc. */
1690 server_put_super(sb);
1694 /* Get the index from the obd name.
1695 rc = server type, or
1697 if endptr isn't NULL it is set to end of name */
/* The name is split at its last dash. The three characters after the dash
 * select the type (MDT or OST) and the text after them is either the word
 * all, which returns the type ORed with LDD_F_SV_ALL, or a hexadecimal
 * index parsed with simple_strtoul().
 * NOTE(review): the handling of a missing dash or an unknown type, and the
 * store through @idx, are not visible here. */
1698 int server_name2index(char *svname, __u32 *idx, char **endptr)
1700 unsigned long index;
1702 char *dash = strrchr(svname, '-');
1706 /* interpret <fsname>-MDTXXXXX-mdc as mdt, the better way is to pass
1707 * in the fsname, then determine the server index */
1708 if (!strcmp(LUSTRE_MDC_NAME, dash + 1)) {
/* Walk back towards the start of the name looking for an earlier dash. */
1710 for (; dash > svname && *dash != '-'; dash--);
1715 if (strncmp(dash + 1, "MDT", 3) == 0)
1716 rc = LDD_F_SV_TYPE_MDT;
1717 else if (strncmp(dash + 1, "OST", 3) == 0)
1718 rc = LDD_F_SV_TYPE_OST;
/* dash + 4 skips the dash and the three letter type tag. */
1721 if (strcmp(dash + 4, "all") == 0)
1722 return rc | LDD_F_SV_ALL;
/* The index digits are read as base 16. */
1724 index = simple_strtoul(dash + 4, endptr, 16);
1729 /*************** mount common between server and client ***************/
/* Shared umount step for client and server superblocks: stops the MGC
 * attached to @sb and then drops the reference on the mounted disk.
 * A result of -ENOENT from lustre_stop_mgc() is not treated as an error.
 * NOTE(review): the branch structure around the two messages and the final
 * put call are not visible here. */
1732 int lustre_common_put_super(struct super_block *sb)
1737 CDEBUG(D_MOUNT, "dropping sb %p\n", sb);
1739 /* Drop a ref to the MGC */
1740 rc = lustre_stop_mgc(sb);
1741 if (rc && (rc != -ENOENT)) {
1743 CERROR("Can't stop MGC: %d\n", rc);
1746 /* BUSY just means that there's some other obd that
1747 needs the mgc. Let him clean it up. */
1748 CDEBUG(D_MOUNT, "MGC still in use\n");
1750 /* Drop a ref to the mounted disk */
/* Debug dump of a lustre_mount_data through PRINT_CMD at PRINT_MASK level:
 * profile (client mounts only), device, flags, options and every entry of
 * the exclusion list. */
1757 static void lmd_print(struct lustre_mount_data *lmd)
1761 PRINT_CMD(PRINT_MASK, " mount data:\n");
1762 if (lmd_is_client(lmd))
1763 PRINT_CMD(PRINT_MASK, "profile: %s\n", lmd->lmd_profile);
1764 PRINT_CMD(PRINT_MASK, "device: %s\n", lmd->lmd_dev);
1765 PRINT_CMD(PRINT_MASK, "flags: %x\n", lmd->lmd_flags);
/* NOTE(review): a guard on lmd_opts may precede this line - not visible. */
1767 PRINT_CMD(PRINT_MASK, "options: %s\n", lmd->lmd_opts);
/* Each excluded index is printed as a four digit hexadecimal OST number. */
1768 for (i = 0; i < lmd->lmd_exclude_count; i++) {
1769 PRINT_CMD(PRINT_MASK, "exclude %d: OST%04x\n", i,
1770 lmd->lmd_exclude[i]);
1775 /* Is this server on the exclusion list */
/* Resolves @svname with server_name2index() and, for OST names only,
 * compares the index against every entry of lmd_exclude on the mount data
 * of @sb. A match is logged with CWARN.
 * NOTE(review): the return values are not visible here. */
1776 int lustre_check_exclusion(struct super_block *sb, char *svname)
1778 struct lustre_sb_info *lsi = s2lsi(sb);
1779 struct lustre_mount_data *lmd = lsi->lsi_lmd;
1784 rc = server_name2index(svname, &index, NULL);
1785 if (rc != LDD_F_SV_TYPE_OST)
1786 /* Only exclude OSTs */
1789 CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname,
1790 index, lmd->lmd_exclude_count, lmd->lmd_dev);
/* Linear scan of the exclusion list. */
1792 for(i = 0; i < lmd->lmd_exclude_count; i++) {
1793 if (index == lmd->lmd_exclude[i]) {
1794 CWARN("Excluding %s (on exclusion list)\n", svname);
1801 /* mount -v -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */
/* Parses the value of the exclude option starting at @ptr and stores the
 * OST indices found in lmd->lmd_exclude / lmd->lmd_exclude_count.
 * Indices are first collected in a temporary array sized from the string
 * length and then copied into an exactly sized permanent array. Names that
 * do not resolve to an OST are logged and skipped.
 * NOTE(review): how s1 advances between names and the return value are not
 * visible here. */
1802 static int lmd_make_exclusion(struct lustre_mount_data *lmd, char *ptr)
1804 char *s1 = ptr, *s2;
1805 __u32 index, *exclude_list;
1809 /* The shortest an ost name can be is 8 chars: -OST0000.
1810 We don't actually know the fsname at this time, so in fact
1811 a user could specify any fsname. */
/* Upper bound on the number of names that can fit in the string. */
1812 devmax = strlen(ptr) / 8 + 1;
1814 /* temp storage until we figure out how many we have */
1815 OBD_ALLOC(exclude_list, sizeof(index) * devmax);
1819 /* we enter this fn pointing at the '=' */
1820 while (*s1 && *s1 != ' ' && *s1 != ',') {
/* s2 receives the end of the parsed name from server_name2index(). */
1822 rc = server_name2index(s1, &index, &s2);
1824 CERROR("Can't parse server name '%s'\n", s1);
1827 if (rc == LDD_F_SV_TYPE_OST)
1828 exclude_list[lmd->lmd_exclude_count++] = index;
1830 CDEBUG(D_MOUNT, "ignoring exclude %.7s\n", s1);
1832 /* now we are pointing at ':' (next exclude)
1833 or ',' (end of excludes) */
/* Never collect more entries than the temporary array can hold. */
1834 if (lmd->lmd_exclude_count >= devmax)
1837 if (rc >= 0) /* non-err */
1840 if (lmd->lmd_exclude_count) {
1841 /* permanent, freed in lustre_free_lsi */
1842 OBD_ALLOC(lmd->lmd_exclude, sizeof(index) *
1843 lmd->lmd_exclude_count);
1844 if (lmd->lmd_exclude) {
1845 memcpy(lmd->lmd_exclude, exclude_list,
1846 sizeof(index) * lmd->lmd_exclude_count);
/* NOTE(review): presumably the branch taken when the permanent allocation
 * failed - the enclosing else is not visible here. */
1849 lmd->lmd_exclude_count = 0;
/* The temporary array is released with the size it was allocated with. */
1852 OBD_FREE(exclude_list, sizeof(index) * devmax);
/* Stores the value of the mgssec option in lmd->lmd_mgssec as a freshly
 * allocated NUL-terminated string. @ptr points at the value; it ends at
 * the next comma or at the end of the string. A value left over from an
 * earlier occurrence of the option is freed first.
 * NOTE(review): the return values are not visible here. */
1856 static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr)
1861 if (lmd->lmd_mgssec != NULL) {
1862 OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1);
1863 lmd->lmd_mgssec = NULL;
/* Length of the value: up to the next comma when there is one. */
1866 tail = strchr(ptr, ',');
1868 length = strlen(ptr);
1870 length = tail - ptr;
1872 OBD_ALLOC(lmd->lmd_mgssec, length + 1);
1873 if (lmd->lmd_mgssec == NULL)
1876 memcpy(lmd->lmd_mgssec, ptr, length);
1877 lmd->lmd_mgssec[length] = '\0';
1881 /* mount -v -t lustre uml1:uml2:/lustre-client /mnt/lustre */
/* Parses the mount option string @options into @lmd. The options
 * abort_recov, nosvc, nomgs, mgssec=, exclude= and device= are recognised
 * here; the device name decides whether this is a client mount, and the
 * option text that remains is saved in lmd->lmd_opts. The string is edited
 * in place (see the memmove below).
 * NOTE(review): the loop header, the clear flag handling and the return
 * values are not visible here. */
1882 static int lmd_parse(char *options, struct lustre_mount_data *lmd)
1884 char *s1, *s2, *devname = NULL;
1885 struct lustre_mount_data *raw = (struct lustre_mount_data *)options;
1891 LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that "
1892 "/sbin/mount.lustre is installed.\n");
1896 /* Options should be a string - try to detect old lmd data */
1897 if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) {
1898 LCONSOLE_ERROR_MSG(0x163, "You're using an old version of "
1899 "/sbin/mount.lustre. Please install "
1900 "version %s\n", LUSTRE_VERSION_STRING);
1903 lmd->lmd_magic = LMD_MAGIC;
1905 /* Set default flags here */
1910 /* Skip whitespace and extra commas */
1911 while (*s1 == ' ' || *s1 == ',')
1914 /* Client options are parsed in ll_options: eg. flock,
1917 /* Parse non-ldiskfs options here. Rather than modifying
1918 ldiskfs, we just zero these out here */
1919 if (strncmp(s1, "abort_recov", 11) == 0) {
1920 lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
1922 } else if (strncmp(s1, "nosvc", 5) == 0) {
1923 lmd->lmd_flags |= LMD_FLG_NOSVC;
1925 } else if (strncmp(s1, "nomgs", 5) == 0) {
1926 lmd->lmd_flags |= LMD_FLG_NOMGS;
1928 } else if (strncmp(s1, "mgssec=", 7) == 0) {
1929 rc = lmd_parse_mgssec(lmd, s1 + 7);
1933 /* ost exclusion list */
1934 } else if (strncmp(s1, "exclude=", 8) == 0) {
/* s1 + 7 is the equals sign; lmd_make_exclusion() expects to start there. */
1935 rc = lmd_make_exclusion(lmd, s1 + 7);
1940 /* Linux 2.4 doesn't pass the device, so we stuck it at the
1941 end of the options. */
1942 else if (strncmp(s1, "device=", 7) == 0) {
1944 /* terminate options right before device. device
1945 must be the last one. */
/* Find the end of the current option and, presumably when the option was
 * one of those handled above, shift the rest of the string over it. */
1951 s2 = strchr(s1, ',');
1959 memmove(s1, s2, strlen(s2) + 1);
1965 LCONSOLE_ERROR_MSG(0x164, "Can't find the device name "
1966 "(need mount option 'device=...')\n");
/* A device name containing a colon followed by a slash is a client mount. */
1970 s1 = strstr(devname, ":/");
/* NOTE(review): plain assignment, so flag bits set by the option parsing
 * above are discarded for client mounts - confirm this is intended. */
1973 lmd->lmd_flags = LMD_FLG_CLIENT;
1974 /* Remove leading /s from fsname */
1975 while (*++s1 == '/') ;
1976 /* Freed in lustre_free_lsi */
/* 8 extra bytes: the 7 character -client suffix plus the final NUL. */
1977 OBD_ALLOC(lmd->lmd_profile, strlen(s1) + 8);
1978 if (!lmd->lmd_profile)
1980 sprintf(lmd->lmd_profile, "%s-client", s1);
1983 /* Freed in lustre_free_lsi */
1984 OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1);
1987 strcpy(lmd->lmd_dev, devname);
1989 /* Save mount options */
/* Walk back from the end of the string over trailing commas and blanks. */
1990 s1 = options + strlen(options) - 1;
1991 while (s1 >= options && (*s1 == ',' || *s1 == ' '))
1993 if (*options != 0) {
1994 /* Freed in lustre_free_lsi */
1995 OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1);
1998 strcpy(lmd->lmd_opts, options);
2001 lmd->lmd_magic = LMD_MAGIC;
2006 CERROR("Bad mount options %s\n", options);
/* fill_super callback handed to get_sb_nodev() by lustre_get_sb().
 * Creates the lsi for @sb, parses the mount data in @data with lmd_parse()
 * and then dispatches: client mounts start the MGC and call the registered
 * client_fill_super hook, server mounts set LSI_SERVER and call
 * server_fill_super().
 * NOTE(review): where lmd is assigned, the rc checks and the return value
 * are not visible here. */
2012 int lustre_fill_super(struct super_block *sb, void *data, int silent)
2014 struct lustre_mount_data *lmd;
2015 struct lustre_sb_info *lsi;
2019 CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb);
2021 lsi = lustre_init_lsi(sb);
2027 * Disable lockdep during mount, because mount locking patterns are
2032 /* Figure out the lmd from the mount options */
2033 if (lmd_parse((char *)data, lmd)) {
2035 GOTO(out, rc = -EINVAL);
2038 if (lmd_is_client(lmd)) {
2039 CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile);
/* The hook is NULL until lustre_register_client_fill_super() is called. */
2040 if (!client_fill_super) {
2041 LCONSOLE_ERROR_MSG(0x165, "Nothing registered for "
2042 "client mount! Is the 'lustre' "
2043 "module loaded?\n");
2047 rc = lustre_start_mgc(sb);
2052 /* Connect and start */
2053 /* (should always be ll_fill_super) */
2054 rc = (*client_fill_super)(sb);
2055 /* c_f_s will call lustre_common_put_super on failure */
2058 CDEBUG(D_MOUNT, "Mounting server from %s\n", lmd->lmd_dev);
2059 lsi->lsi_flags |= LSI_SERVER;
2060 rc = server_fill_super(sb);
2061 /* s_f_s calls lustre_start_mgc after the mount because we need
2062 the MGS nids which are stored on disk. Plus, we may
2063 need to start the MGS first. */
2064 /* s_f_s will call server_put_super on failure */
2067 /* If error happens in fill_super() call, @lsi will be killed there.
2068 * This is why we do not put it here. */
/* The lsi is looked up again before lmd->lmd_dev is read for the message. */
2072 CERROR("Unable to mount %s (%d)\n",
2073 s2lsi(sb) ? lmd->lmd_dev : "", rc);
2075 CDEBUG(D_SUPER, "Mount %s complete\n",
2083 /* We can't call ll_fill_super by name because it lives in a module that
2084 must be loaded after this one. */
/* Stores @cfs in the file-level client_fill_super hook, which
 * lustre_fill_super() calls for client mounts. */
2085 void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb))
2087 client_fill_super = cfs;
/* Stores @cfs in the file-level kill_super_cb hook, which
 * lustre_kill_super() calls for superblocks without LSI_SERVER set. */
2090 void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb))
2092 kill_super_cb = cfs;
2095 /***************** FS registration ******************/
/* get_sb entry point of lustre_fs_type. Both variants delegate to
 * get_sb_nodev() with lustre_fill_super as the fill callback; kernels from
 * 2.6.18 on use the variant that also takes and forwards the vfsmount.
 * @devname is not used by either variant. */
2097 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18))
2098 struct super_block * lustre_get_sb(struct file_system_type *fs_type,
2099 int flags, const char *devname, void * data)
2101 return get_sb_nodev(fs_type, flags, data, lustre_fill_super);
2104 int lustre_get_sb(struct file_system_type *fs_type,
2105 int flags, const char *devname, void * data,
2106 struct vfsmount *mnt)
2108 return get_sb_nodev(fs_type, flags, data, lustre_fill_super, mnt);
/* kill_sb entry point of lustre_fs_type. Calls the registered
 * kill_super_cb hook when one is set, the lsi still exists and the
 * superblock is not a server mount, then hands @sb to kill_anon_super(). */
2112 void lustre_kill_super(struct super_block *sb)
2114 struct lustre_sb_info *lsi = s2lsi(sb);
2116 if (kill_super_cb && lsi && !(lsi->lsi_flags & LSI_SERVER))
2117 (*kill_super_cb)(sb);
2119 kill_anon_super(sb);
/* Filesystem type descriptor passed to register_filesystem() and
 * unregister_filesystem() below.
 * NOTE(review): the name field and the flag guarded by FS_HAS_FIEMAP are
 * not visible here. */
2122 struct file_system_type lustre_fs_type = {
2123 .owner = THIS_MODULE,
2125 .get_sb = lustre_get_sb,
2126 .kill_sb = lustre_kill_super,
2127 .fs_flags = FS_BINARY_MOUNTDATA | FS_REQUIRES_DEV |
2128 #ifdef FS_HAS_FIEMAP
2131 LL_RENAME_DOES_D_MOVE,
/* Registers lustre_fs_type with the kernel and returns the result of
 * register_filesystem() unchanged. */
2134 int lustre_register_fs(void)
2136 return register_filesystem(&lustre_fs_type);
/* Unregisters lustre_fs_type and returns the result of
 * unregister_filesystem() unchanged. */
2139 int lustre_unregister_fs(void)
2141 return unregister_filesystem(&lustre_fs_type);
/* Symbols exported to other kernel modules: the registration hooks and
 * common put_super, the config log helpers, the server mount lookup and
 * registration helpers, and do_lcfg. */
2144 EXPORT_SYMBOL(lustre_register_client_fill_super);
2145 EXPORT_SYMBOL(lustre_register_kill_super_cb);
2146 EXPORT_SYMBOL(lustre_common_put_super);
2147 EXPORT_SYMBOL(lustre_process_log);
2148 EXPORT_SYMBOL(lustre_end_log);
2149 EXPORT_SYMBOL(server_get_mount);
2150 EXPORT_SYMBOL(server_get_mount_2);
2151 EXPORT_SYMBOL(server_put_mount);
2152 EXPORT_SYMBOL(server_put_mount_2);
2153 EXPORT_SYMBOL(server_register_target);
2154 EXPORT_SYMBOL(server_name2index);
2155 EXPORT_SYMBOL(server_mti_print);
2156 EXPORT_SYMBOL(do_lcfg);