1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lustre/obdclass/obd_mount.c
38 * Client/server mount routines
40 * Author: Nathan Rutman <nathan@clusterfs.com>
/* Debug subsystem tag for the messages emitted from this file. */
44 #define DEBUG_SUBSYSTEM S_CLASS
/* Debug mask used by the mount-related CDEBUG() calls in this file.
 * Parenthesized so the '|' cannot re-associate with a neighbouring
 * operator at the point of use (e.g. "D_MOUNT & x"). */
45 #define D_MOUNT (D_SUPER|D_CONFIG) /*|D_WARNING */
/* Printer macro and mask shared by ldd_print() and server_mti_print(). */
46 #define PRINT_CMD CDEBUG
47 #define PRINT_MASK (D_SUPER|D_CONFIG)
51 #include <lustre_fsfilt.h>
52 #include <obd_class.h>
53 #include <lustre/lustre_user.h>
54 #include <linux/version.h>
55 #include <lustre_log.h>
56 #include <lustre_disk.h>
57 #include <lustre_param.h>
/* File-local function pointer taking (sb, mnt); starts out NULL.  The code
 * that assigns and calls it is not visible in this part of the file. */
59 static int (*client_fill_super)(struct super_block *sb,
60 struct vfsmount *mnt) = NULL;
/* File-local function pointer taking a super_block; likewise starts NULL. */
61 static void (*kill_super_cb)(struct super_block *sb) = NULL;
63 /*********** mount lookup *********/
/* Taken around server_find_mount() lookups and around add/delete on
 * server_mount_info_list by the register/deregister/get/put helpers. */
65 CFS_DECLARE_MUTEX(lustre_mount_info_lock);
/* Registered server mounts; entries are struct lustre_mount_info linked
 * through their lmi_list_chain member. */
66 static CFS_LIST_HEAD(server_mount_info_list);
/*
 * Look up a registered server mount by name.
 *
 * Walks server_mount_info_list and compares name against each entry's
 * lmi_name with strcmp().  The register/deregister/get/put helpers in this
 * file take lustre_mount_info_lock around the call.
 * NOTE(review): the return statements are not visible in this listing;
 * callers test the result as a possibly-NULL pointer -- confirm.
 */
68 static struct lustre_mount_info *server_find_mount(const char *name)
71 struct lustre_mount_info *lmi;
74 cfs_list_for_each(tmp, &server_mount_info_list) {
75 lmi = cfs_list_entry(tmp, struct lustre_mount_info,
77 if (strcmp(name, lmi->lmi_name) == 0)
83 /* we must register an obd for a mount before we call the setup routine.
84 *_setup will call lustre_get_mount to get the mnt struct
85 by obd_name, since we can't pass the pointer to setup. */
/*
 * Register a mount under name so it can later be found with
 * server_find_mount().
 *
 * Allocates a lustre_mount_info plus a private copy of name, then, under
 * lustre_mount_info_lock, rejects a duplicate name or links the new entry
 * into server_mount_info_list.
 * NOTE(review): the remaining parameters, the allocation-failure checks and
 * the return values are on lines not visible in this listing.
 */
86 static int server_register_mount(const char *name, struct super_block *sb,
89 struct lustre_mount_info *lmi;
96 OBD_ALLOC(lmi, sizeof(*lmi));
/* private copy of the name, sized for the terminating NUL */
99 OBD_ALLOC(name_cp, strlen(name) + 1);
/* presumably the name_cp allocation-failure path: give lmi back */
101 OBD_FREE(lmi, sizeof(*lmi));
104 strcpy(name_cp, name);
106 cfs_down(&lustre_mount_info_lock);
/* duplicate name: drop the lock, free both allocations and complain */
108 if (server_find_mount(name)) {
109 cfs_up(&lustre_mount_info_lock);
110 OBD_FREE(lmi, sizeof(*lmi));
111 OBD_FREE(name_cp, strlen(name) + 1);
112 CERROR("Already registered %s\n", name);
115 lmi->lmi_name = name_cp;
118 cfs_list_add(&lmi->lmi_list_chain, &server_mount_info_list);
120 cfs_up(&lustre_mount_info_lock);
122 CDEBUG(D_MOUNT, "reg_mnt %p from %s, vfscount=%d\n",
123 lmi->lmi_mnt, name, cfs_atomic_read(&lmi->lmi_mnt->mnt_count));
128 /* when an obd no longer needs a mount */
/*
 * Remove the registration for name and free it.
 *
 * The lookup, unlink and both frees all happen under
 * lustre_mount_info_lock.
 * NOTE(review): the not-found test and the return values are on lines not
 * visible in this listing.
 */
129 static int server_deregister_mount(const char *name)
131 struct lustre_mount_info *lmi;
134 cfs_down(&lustre_mount_info_lock);
135 lmi = server_find_mount(name);
/* presumably the not-found path: unlock and report */
137 cfs_up(&lustre_mount_info_lock);
138 CERROR("%s not registered\n", name);
142 CDEBUG(D_MOUNT, "dereg_mnt %p from %s, vfscount=%d\n",
143 lmi->lmi_mnt, name, cfs_atomic_read(&lmi->lmi_mnt->mnt_count));
/* free the name copy first (its length is needed for the size), then
 * unlink the entry and free it */
145 OBD_FREE(lmi->lmi_name, strlen(lmi->lmi_name) + 1);
146 cfs_list_del(&lmi->lmi_list_chain);
147 OBD_FREE(lmi, sizeof(*lmi));
148 cfs_up(&lustre_mount_info_lock);
153 /* obd's look up a registered mount using their obdname. This is just
154 for initial obd setup to find the mount struct. It should not be
155 called every time you want to mntget. */
/*
 * On a successful lookup this takes a vfsmount reference with mntget() and
 * increments lsi_mounts on the lustre_sb_info of the mount's superblock.
 * NOTE(review): lustre_mount_info_lock is released right after the lookup,
 * before lmi is dereferenced below -- confirm nothing can deregister the
 * entry in between.  The not-found test and return statements are on lines
 * not visible in this listing.
 */
156 struct lustre_mount_info *server_get_mount(const char *name)
158 struct lustre_mount_info *lmi;
159 struct lustre_sb_info *lsi;
162 cfs_down(&lustre_mount_info_lock);
163 lmi = server_find_mount(name);
164 cfs_up(&lustre_mount_info_lock);
166 CERROR("Can't find mount for %s\n", name);
169 lsi = s2lsi(lmi->lmi_sb);
/* one VFS reference plus one lsi reference per successful get */
170 mntget(lmi->lmi_mnt);
171 cfs_atomic_inc(&lsi->lsi_mounts);
173 CDEBUG(D_MOUNT, "get_mnt %p from %s, refs=%d, vfscount=%d\n",
174 lmi->lmi_mnt, name, cfs_atomic_read(&lsi->lsi_mounts),
175 cfs_atomic_read(&lmi->lmi_mnt->mnt_count));
/**
181 * Used by mdt to get mount_info from obdname.
182 * There is no blocking when using the mount_info.
183 * Do not use server_get_mount for this purpose.
 */
/*
 * Look up the mount registered under name under lustre_mount_info_lock.
 * Unlike server_get_mount(), no mntget() or lsi_mounts increment appears on
 * the visible lines.
 * NOTE(review): the not-found test and the return statement are not visible
 * in this listing.
 */
185 struct lustre_mount_info *server_get_mount_2(const char *name)
187 struct lustre_mount_info *lmi;
190 cfs_down(&lustre_mount_info_lock);
191 lmi = server_find_mount(name);
192 cfs_up(&lustre_mount_info_lock);
194 CERROR("Can't find mount for %s\n", name);
/*
 * Helper taking a vfsmount; it branches on kernel_locked().
 * NOTE(review): the rest of the body is not visible in this listing; going
 * by the name it presumably drops the mount reference -- confirm.
 */
199 static void unlock_mntput(struct vfsmount *mnt)
201 if (kernel_locked()) {
/* Forward declaration: lustre_put_lsi() is defined further down the file. */
210 static int lustre_put_lsi(struct super_block *sb);
212 /* to be called from obd_cleanup methods */
/*
 * Release the references an obd took on the mount registered under name.
 *
 * Samples the VFS mount count up front (for logging only), looks up the
 * registration, drops one lsi reference via lustre_put_lsi() and finally
 * deregisters name.  mnt must be the vfsmount that was registered
 * (asserted below).
 * NOTE(review): the not-found test, the mntput call and the return values
 * are on lines not visible in this listing.
 */
213 int server_put_mount(const char *name, struct vfsmount *mnt)
215 struct lustre_mount_info *lmi;
216 struct lustre_sb_info *lsi;
/* use the cfs_ wrapper like every other mnt_count read in this file */
217 int count = cfs_atomic_read(&mnt->mnt_count) - 1;
220 /* This might be the last one, can't deref after this */
223 cfs_down(&lustre_mount_info_lock);
224 lmi = server_find_mount(name);
225 cfs_up(&lustre_mount_info_lock);
226 CERROR("Can't find mount for %s\n", name);
230 lsi = s2lsi(lmi->lmi_sb);
231 LASSERT(lmi->lmi_mnt == mnt);
233 CDEBUG(D_MOUNT, "put_mnt %p from %s, refs=%d, vfscount=%d\n",
234 lmi->lmi_mnt, name, cfs_atomic_read(&lsi->lsi_mounts), count);
/* a non-zero return from lustre_put_lsi() selects the "last put" path */
236 if (lustre_put_lsi(lmi->lmi_sb)) {
237 CDEBUG(D_MOUNT, "Last put of mnt %p from %s, vfscount=%d\n",
238 lmi->lmi_mnt, name, count);
239 /* last mount is the One True Mount */
241 CERROR("%s: mount busy, vfscount=%d!\n", name, count);
244 /* this obd should never need the mount again */
245 server_deregister_mount(name);
250 /* Corresponding to server_get_mount_2 */
/* NOTE(review): the body of this function is not visible in this listing. */
251 int server_put_mount_2(const char *name, struct vfsmount *mnt)
257 /******* mount helper utilities *********/
/*
 * Dump the fields of a lustre_disk_data through PRINT_CMD (CDEBUG) at
 * PRINT_MASK level: server name, uuid, fs name, index, config version,
 * flags, disk fs type, mount options, params and user comment.
 */
260 static void ldd_print(struct lustre_disk_data *ldd)
262 PRINT_CMD(PRINT_MASK, " disk data:\n");
263 PRINT_CMD(PRINT_MASK, "server: %s\n", ldd->ldd_svname);
/* ldd_uuid is cast to char * so it can be printed with %s */
264 PRINT_CMD(PRINT_MASK, "uuid: %s\n", (char *)ldd->ldd_uuid);
265 PRINT_CMD(PRINT_MASK, "fs: %s\n", ldd->ldd_fsname);
266 PRINT_CMD(PRINT_MASK, "index: %04x\n", ldd->ldd_svindex);
267 PRINT_CMD(PRINT_MASK, "config: %d\n", ldd->ldd_config_ver);
268 PRINT_CMD(PRINT_MASK, "flags: %#x\n", ldd->ldd_flags);
269 PRINT_CMD(PRINT_MASK, "diskfs: %s\n", MT_STR(ldd));
270 PRINT_CMD(PRINT_MASK, "options: %s\n", ldd->ldd_mount_opts);
271 PRINT_CMD(PRINT_MASK, "params: %s\n", ldd->ldd_params);
272 PRINT_CMD(PRINT_MASK, "comment: %s\n", ldd->ldd_userdata);
/*
 * Read MOUNT_DATA_FILE into ldd and validate it.
 *
 * Runs inside mount_ctxt (push_ctxt/pop_ctxt).  The file size must equal
 * sizeof(*ldd); after reading, the magic number and the incompat/rocompat
 * feature masks are checked.  Each visible validation failure jumps to
 * out_close with rc = -EINVAL.
 * NOTE(review): local declarations, the IS_ERR/short-read tests, the labels
 * and the final return are on lines not visible in this listing.
 */
276 static int ldd_parse(struct lvfs_run_ctxt *mount_ctxt,
277 struct lustre_disk_data *ldd)
279 struct lvfs_run_ctxt saved;
286 push_ctxt(&saved, mount_ctxt, NULL);
288 file = filp_open(MOUNT_DATA_FILE, O_RDONLY, 0644);
291 CERROR("cannot open %s: rc = %d\n", MOUNT_DATA_FILE, rc);
/* the on-disk file must be exactly one struct lustre_disk_data */
295 len = i_size_read(file->f_dentry->d_inode);
296 CDEBUG(D_MOUNT, "Have %s, size %lu\n", MOUNT_DATA_FILE, len);
297 if (len != sizeof(*ldd)) {
298 CERROR("disk data size does not match: see %lu expect %u\n",
299 len, (int)sizeof(*ldd));
300 GOTO(out_close, rc = -EINVAL);
303 rc = lustre_fread(file, ldd, len, &off);
305 CERROR("error reading %s: read %d of %lu\n",
306 MOUNT_DATA_FILE, rc, len);
307 GOTO(out_close, rc = -EINVAL);
311 if (ldd->ldd_magic != LDD_MAGIC) {
312 /* FIXME add swabbing support */
313 CERROR("Bad magic in %s: %x!=%x\n", MOUNT_DATA_FILE,
314 ldd->ldd_magic, LDD_MAGIC);
315 GOTO(out_close, rc = -EINVAL);
/* refuse any feature bit outside the supported incompat/rocompat sets */
318 if (ldd->ldd_feature_incompat & ~LDD_INCOMPAT_SUPP) {
319 CERROR("%s: unsupported incompat filesystem feature(s) %x\n",
321 ldd->ldd_feature_incompat & ~LDD_INCOMPAT_SUPP);
322 GOTO(out_close, rc = -EINVAL);
324 if (ldd->ldd_feature_rocompat & ~LDD_ROCOMPAT_SUPP) {
325 CERROR("%s: unsupported read-only filesystem feature(s) %x\n",
327 ldd->ldd_feature_rocompat & ~LDD_ROCOMPAT_SUPP);
328 /* Do something like remount filesystem read-only */
329 GOTO(out_close, rc = -EINVAL);
335 pop_ctxt(&saved, mount_ctxt, NULL);
339 static int ldd_write(struct lvfs_run_ctxt *mount_ctxt,
340 struct lustre_disk_data *ldd)
342 struct lvfs_run_ctxt saved;
345 unsigned long len = sizeof(struct lustre_disk_data);
349 LASSERT(ldd->ldd_magic == LDD_MAGIC);
351 ldd->ldd_config_ver++;
353 push_ctxt(&saved, mount_ctxt, NULL);
355 file = filp_open(MOUNT_DATA_FILE, O_RDWR, 0644);
358 CERROR("cannot open %s: rc = %d\n", MOUNT_DATA_FILE, rc);
362 rc = lustre_fwrite(file, ldd, len, &off);
364 CERROR("error writing %s: read %d of %lu\n",
365 MOUNT_DATA_FILE, rc, len);
366 GOTO(out_close, rc = -EINVAL);
374 pop_ctxt(&saved, mount_ctxt, NULL);
379 /**************** config llog ********************/
381 /** Get a config log from the MGS and process it.
382 * This func is called for both clients and servers.
383 * Continue to process new statements appended to the logs
384 * (whenever the config lock is revoked) until lustre_end_log
 * is called.
386 * @param sb The superblock is used by the MGC to write to the local copy of
 * the config log
388 * @param logname The name of the llog to replicate from the MGS
389 * @param cfg Since the same mgc may be used to follow multiple config logs
390 * (e.g. ost1, ost2, client), the config_llog_instance keeps the state for
391 * this log, and is added to the mgc's list of logs to follow.
 */
/*
 * Ask the MGC attached to sb to start processing the config log logname.
 *
 * Builds an LCFG_LOG_START lustre_cfg whose buffers carry the log name (1),
 * the caller's config_llog_instance (2) and the super_block pointer (3),
 * and hands it to obd_process_config() on lsi_mgc.  On failure one of two
 * console errors is printed.
 * NOTE(review): the tests selecting between the two error messages and the
 * return statement are on lines not visible in this listing.
 */
393 int lustre_process_log(struct super_block *sb, char *logname,
394 struct config_llog_instance *cfg)
396 struct lustre_cfg *lcfg;
397 struct lustre_cfg_bufs bufs;
398 struct lustre_sb_info *lsi = s2lsi(sb);
399 struct obd_device *mgc = lsi->lsi_mgc;
406 /* mgc_process_config */
407 lustre_cfg_bufs_reset(&bufs, mgc->obd_name);
408 lustre_cfg_bufs_set_string(&bufs, 1, logname);
409 lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg));
/* buffer 3 holds the sb pointer value itself, hence &sb / sizeof(sb) */
410 lustre_cfg_bufs_set(&bufs, 3, &sb, sizeof(sb));
411 lcfg = lustre_cfg_new(LCFG_LOG_START, &bufs);
412 rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
413 lustre_cfg_free(lcfg);
/* the first literal now ends with a space so the concatenated message
 * reads "log '<name>' failed" rather than "log '<name>'failed" */
416 LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s' "
417 "failed from the MGS (%d). Make sure this "
418 "client and the MGS are running compatible "
419 "versions of Lustre.\n",
420 mgc->obd_name, logname, rc);
423 LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' "
424 "failed (%d). This may be the result of "
425 "communication errors between this node and "
426 "the MGS, a bad configuration, or other "
427 "errors. See the syslog for more "
428 "information.\n", mgc->obd_name, logname,
431 /* class_obd_list(); */
435 /* Stop watching this config log for updates */
/*
 * Counterpart of lustre_process_log(): sends an LCFG_LOG_END lustre_cfg
 * carrying the log name (buffer 1) and the config_llog_instance (buffer 2)
 * to the MGC attached to sb.
 * NOTE(review): a line is missing from this listing just before the
 * buffer-2 call, so that call may be conditional; the return statement is
 * not visible either.
 */
436 int lustre_end_log(struct super_block *sb, char *logname,
437 struct config_llog_instance *cfg)
439 struct lustre_cfg *lcfg;
440 struct lustre_cfg_bufs bufs;
441 struct lustre_sb_info *lsi = s2lsi(sb);
442 struct obd_device *mgc = lsi->lsi_mgc;
449 /* mgc_process_config */
450 lustre_cfg_bufs_reset(&bufs, mgc->obd_name);
451 lustre_cfg_bufs_set_string(&bufs, 1, logname);
453 lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg));
454 lcfg = lustre_cfg_new(LCFG_LOG_END, &bufs);
455 rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
456 lustre_cfg_free(lcfg);
460 /**************** obd start *******************/
462 /** lustre_cfg_bufs are a holdover from 1.4; we can still set these up from
463 * lctl (and do for echo cli/srv).
 */
/*
 * Build a lustre_cfg for command cmd addressed to cfgname, with up to four
 * string buffers (s1..s4) and nid stored in lcfg_nid, then run it through
 * class_process_config() and free it.
 * NOTE(review): each lustre_cfg_bufs_set_string() call is preceded by a line
 * missing from this listing, so the calls may be conditional on their
 * argument.  No check of the lustre_cfg_new() result is visible before it
 * is dereferenced -- confirm.  The return statement is not visible.
 */
465 int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
466 char *s1, char *s2, char *s3, char *s4)
468 struct lustre_cfg_bufs bufs;
469 struct lustre_cfg * lcfg = NULL;
472 CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname,
473 cmd, s1, s2, s3, s4);
475 lustre_cfg_bufs_reset(&bufs, cfgname);
477 lustre_cfg_bufs_set_string(&bufs, 1, s1);
479 lustre_cfg_bufs_set_string(&bufs, 2, s2);
481 lustre_cfg_bufs_set_string(&bufs, 3, s3);
483 lustre_cfg_bufs_set_string(&bufs, 4, s4);
485 lcfg = lustre_cfg_new(cmd, &bufs);
486 lcfg->lcfg_nid = nid;
487 rc = class_process_config(lcfg);
488 lustre_cfg_free(lcfg);
492 /** Call class_attach and class_setup. These methods in turn call
493 * obd type-specific methods.
 */
/*
 * Start one obd: LCFG_ATTACH with (type, uuid), then LCFG_SETUP with
 * (s1, s2), both via do_lcfg().  If setup fails the obd is detached again
 * with LCFG_DETACH.
 * NOTE(review): the rest of the parameter list, the rc tests and the return
 * statement are on lines not visible in this listing.
 */
495 static int lustre_start_simple(char *obdname, char *type, char *uuid,
499 CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type);
501 rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, 0, 0);
503 CERROR("%s attach error %d\n", obdname, rc);
506 rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, 0, 0);
508 CERROR("%s setup error %d\n", obdname, rc);
/* undo the attach; the result of the detach is not checked */
509 do_lcfg(obdname, 0, LCFG_DETACH, 0, 0, 0, 0);
514 /* Set up a MGS to serve startup logs */
/*
 * Refuses to start when a mount is already registered under
 * LUSTRE_MGS_OBDNAME; otherwise registers sb and its lsi_srv_mnt under that
 * name and starts the MGS obd with lustre_start_simple().
 * NOTE(review): no lustre_mount_info_lock acquisition is visible around the
 * server_find_mount() call here, unlike the other callers -- confirm.  The
 * rc tests and return statements are not visible in this listing.
 */
515 static int server_start_mgs(struct super_block *sb)
517 struct lustre_sb_info *lsi = s2lsi(sb);
518 struct vfsmount *mnt = lsi->lsi_srv_mnt;
519 struct lustre_mount_info *lmi;
524 /* It is impossible to have more than 1 MGS per node, since
525 MGC wouldn't know which to connect to */
526 lmi = server_find_mount(LUSTRE_MGS_OBDNAME);
/* lsi is re-pointed at the already-running MGS's sb for the message */
528 lsi = s2lsi(lmi->lmi_sb);
529 LCONSOLE_ERROR_MSG(0x15d, "The MGS service was already started"
531 lsi->lsi_ldd->ldd_svname);
535 CDEBUG(D_CONFIG, "Start MGS service %s\n", LUSTRE_MGS_OBDNAME);
537 rc = server_register_mount(LUSTRE_MGS_OBDNAME, sb, mnt);
540 rc = lustre_start_simple(LUSTRE_MGS_OBDNAME, LUSTRE_MGS_NAME,
541 LUSTRE_MGS_OBDNAME, 0, 0);
542 /* Do NOT call server_deregister_mount() here. This leads to
543 * inability cleanup cleanly and free lsi and other stuff when
544 * mgs calls server_put_mount() in error handling case. -umka */
548 LCONSOLE_ERROR_MSG(0x15e, "Failed to start MGS '%s' (%d). "
549 "Is the 'mgs' module loaded?\n",
550 LUSTRE_MGS_OBDNAME, rc);
/*
 * Stop the MGS obd: look it up by LUSTRE_MGS_OBDNAME and run
 * class_manual_cleanup() on it.  sb is not referenced on the visible lines.
 * NOTE(review): the not-found test and the return statements are on lines
 * not visible in this listing.
 */
554 static int server_stop_mgs(struct super_block *sb)
556 struct obd_device *obd;
560 CDEBUG(D_MOUNT, "Stop MGS service %s\n", LUSTRE_MGS_OBDNAME);
562 /* There better be only one MGS */
563 obd = class_name2obd(LUSTRE_MGS_OBDNAME);
565 CDEBUG(D_CONFIG, "mgs %s not running\n", LUSTRE_MGS_OBDNAME);
569 /* The MGS should always stop when we say so */
571 rc = class_manual_cleanup(obd);
/* Serializes MGC start-up and shutdown: taken in lustre_start_mgc() and
 * lustre_stop_mgc(). */
575 CFS_DECLARE_MUTEX(mgc_start_lock);
577 /** Set up a mgc obd to process startup logs
 *
579 * \param sb [in] super block of the mgc obd
 *
581 * \retval 0 success, otherwise error code
 */
/*
 * Visible outline of this function:
 *  1. pick the first MGS nid: for servers from the mgsnode= param, or the
 *     first non-loopback local nid when this target is the MGS; for clients
 *     from the mount device string;
 *  2. derive the MGC obd name as LUSTRE_MGC_OBDNAME + nid string;
 *  3. under mgc_start_lock, either re-use a live MGC of that name (bump
 *     cl_mgc_refcount, refresh KEY_MGSSEC / KEY_INIT_RECOV_BACKUP) or start
 *     a new one: add the primary nids as uuids, start the obd with a random
 *     uuid, add failover nids/connections, set the refcount to 1 and
 *     connect, storing the export in cl_mgc_mgsexp.
 * NOTE(review): many lines of this function (conditions, labels, some
 * assignments and the return) are missing from this listing; the outline
 * above covers only what is visible.
 */
583 static int lustre_start_mgc(struct super_block *sb)
585 struct obd_connect_data *data = NULL;
586 struct lustre_sb_info *lsi = s2lsi(sb);
587 struct obd_device *obd;
588 struct obd_export *exp;
589 struct obd_uuid *uuid;
592 char *mgcname, *niduuid, *mgssec;
595 int rc = 0, i = 0, j, len;
598 LASSERT(lsi->lsi_lmd);
600 /* Find the first non-lo MGS nid for our MGC name */
601 if (lsi->lsi_flags & LSI_SERVER) {
602 ptr = lsi->lsi_ldd->ldd_params;
603 /* Use mgsnode= nids */
604 if ((class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0) &&
605 (class_parse_nid(ptr, &nid, &ptr) == 0)) {
607 } else if (IS_MGS(lsi->lsi_ldd)) {
608 lnet_process_id_t id;
609 while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
610 if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
617 } else { /* client */
618 /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
619 ptr = lsi->lsi_lmd->lmd_dev;
620 if (class_parse_nid(ptr, &nid, &ptr) == 0)
624 CERROR("No valid MGS nids found.\n");
/* len covers "<MGC prefix><nid string>" plus the terminating NUL */
628 len = strlen(LUSTRE_MGC_OBDNAME) + strlen(libcfs_nid2str(nid)) + 1;
629 OBD_ALLOC(mgcname, len);
/* NOTE(review): niduuid is later formatted as "%s_%x" from mgcname and i;
 * len + 2 leaves room for '_' and a single hex digit, i.e. i <= 0xf.
 * Confirm the failover index can never reach 16. */
630 OBD_ALLOC(niduuid, len + 2);
631 if (!mgcname || !niduuid)
632 GOTO(out_free, rc = -ENOMEM);
633 sprintf(mgcname, "%s%s", LUSTRE_MGC_OBDNAME, libcfs_nid2str(nid));
/* an unset mgssec is passed on as the empty string */
635 mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : "";
637 cfs_mutex_down(&mgc_start_lock);
639 obd = class_name2obd(mgcname);
640 if (obd && !obd->obd_stopping) {
641 rc = obd_set_info_async(obd->obd_self_export,
642 strlen(KEY_MGSSEC), KEY_MGSSEC,
643 strlen(mgssec), mgssec, NULL);
647 /* Re-using an existing MGC */
648 cfs_atomic_inc(&obd->u.cli.cl_mgc_refcount);
651 /* If we are restarting the MGS, don't try to keep the MGC's
652 old connection, or registration will fail. */
653 if ((lsi->lsi_flags & LSI_SERVER) && IS_MGS(lsi->lsi_ldd)) {
654 CDEBUG(D_MOUNT, "New MGS with live MGC\n");
658 /* Try all connections, but only once (again).
659 We don't want to block another target from starting
660 (using its local copy of the log), but we do want to connect
661 if at all possible. */
663 CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname,recov_bk);
664 rc = obd_set_info_async(obd->obd_self_export,
665 sizeof(KEY_INIT_RECOV_BACKUP),
666 KEY_INIT_RECOV_BACKUP,
667 sizeof(recov_bk), &recov_bk, NULL);
671 CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname);
673 /* Add the primary nids for the MGS */
675 sprintf(niduuid, "%s_%x", mgcname, i);
676 if (lsi->lsi_flags & LSI_SERVER) {
677 ptr = lsi->lsi_ldd->ldd_params;
678 if (IS_MGS(lsi->lsi_ldd)) {
679 /* Use local nids (including LO) */
680 lnet_process_id_t id;
681 while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
682 rc = do_lcfg(mgcname, id.nid,
683 LCFG_ADD_UUID, niduuid, 0,0,0);
686 /* Use mgsnode= nids */
687 if (class_find_param(ptr, PARAM_MGSNODE, &ptr) != 0) {
688 CERROR("No MGS nids given.\n");
689 GOTO(out_free, rc = -EINVAL);
691 while (class_parse_nid(ptr, &nid, &ptr) == 0) {
692 rc = do_lcfg(mgcname, nid,
693 LCFG_ADD_UUID, niduuid, 0,0,0);
697 } else { /* client */
698 /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
699 ptr = lsi->lsi_lmd->lmd_dev;
700 while (class_parse_nid(ptr, &nid, &ptr) == 0) {
701 rc = do_lcfg(mgcname, nid,
702 LCFG_ADD_UUID, niduuid, 0,0,0);
704 /* Stop at the first failover nid */
710 CERROR("No valid MGS nids found.\n");
711 GOTO(out_free, rc = -EINVAL);
713 lsi->lsi_lmd->lmd_mgs_failnodes = 1;
715 /* Random uuid for MGC allows easier reconnects */
717 ll_generate_random_uuid(uuidc);
718 class_uuid_unparse(uuidc, uuid);
721 rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME,
722 (char *)uuid->uuid, LUSTRE_MGS_OBDNAME,
728 /* Add any failover MGS nids */
730 while ((*ptr == ':' ||
731 class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0)) {
732 /* New failover node */
733 sprintf(niduuid, "%s_%x", mgcname, i);
735 while (class_parse_nid(ptr, &nid, &ptr) == 0) {
737 rc = do_lcfg(mgcname, nid,
738 LCFG_ADD_UUID, niduuid, 0,0,0);
743 rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN,
/* remembered so lustre_stop_mgc() knows how many nid uuids to delete */
751 lsi->lsi_lmd->lmd_mgs_failnodes = i;
753 obd = class_name2obd(mgcname);
755 CERROR("Can't find mgcobd %s\n", mgcname);
756 GOTO(out_free, rc = -ENOTCONN);
759 rc = obd_set_info_async(obd->obd_self_export,
760 strlen(KEY_MGSSEC), KEY_MGSSEC,
761 strlen(mgssec), mgssec, NULL);
765 /* Keep a refcount of servers/clients who started with "mount",
766 so we know when we can get rid of the mgc. */
767 cfs_atomic_set(&obd->u.cli.cl_mgc_refcount, 1);
769 /* Try all connections, but only once. */
771 rc = obd_set_info_async(obd->obd_self_export,
772 sizeof(KEY_INIT_RECOV_BACKUP),
773 KEY_INIT_RECOV_BACKUP,
774 sizeof(recov_bk), &recov_bk, NULL);
777 CWARN("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc);
778 /* We connect to the MGS at setup, and don't disconnect until cleanup */
781 GOTO(out, rc = -ENOMEM);
782 data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_FID |
783 OBD_CONNECT_AT | OBD_CONNECT_FULL20;
784 data->ocd_version = LUSTRE_VERSION_CODE;
785 rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL);
788 CERROR("connect failed %d\n", rc);
792 obd->u.cli.cl_mgc_mgsexp = exp;
795 /* Keep the mgc info in the sb. Note that many lsi's can point
799 cfs_mutex_up(&mgc_start_lock);
802 OBD_FREE(mgcname, len);
804 OBD_FREE(niduuid, len + 2);
/*
 * Drop this superblock's reference on its MGC and, when it was the last
 * one, shut the MGC down.
 *
 * Under mgc_start_lock: decrement cl_mgc_refcount and bail out with -EBUSY
 * while other users remain; otherwise mark the obd no-recovery, disconnect
 * the MGS export, run class_manual_cleanup() and delete the nid uuids that
 * lustre_start_mgc() added (one per index below lmd_mgs_failnodes).
 * NOTE(review): the lookup of obd, several tests, the labels and the return
 * statement are on lines not visible in this listing.
 */
808 static int lustre_stop_mgc(struct super_block *sb)
810 struct lustre_sb_info *lsi = s2lsi(sb);
811 struct obd_device *obd;
812 char *niduuid = 0, *ptr = 0;
813 int i, rc = 0, len = 0;
823 cfs_mutex_down(&mgc_start_lock);
824 LASSERT(cfs_atomic_read(&obd->u.cli.cl_mgc_refcount) > 0);
825 if (!cfs_atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) {
826 /* This is not fatal, every client that stops
827 will call in here. */
828 CDEBUG(D_MOUNT, "mgc still has %d references.\n",
829 cfs_atomic_read(&obd->u.cli.cl_mgc_refcount));
830 GOTO(out, rc = -EBUSY);
833 /* The MGC has no recoverable data in any case.
834 * force shutdown set in umount_begin */
835 obd->obd_no_recov = 1;
837 if (obd->u.cli.cl_mgc_mgsexp) {
838 /* An error is not fatal, if we are unable to send the
839 disconnect mgs ping evictor cleans up the export */
840 rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp);
842 CDEBUG(D_MOUNT, "disconnect failed %d\n", rc);
845 /* Save the obdname for cleaning the nid uuids, which are
 * named <obdname>_<index in hex> (see the sprintf below).  The extra
 * 6 bytes hold '_', up to four hex digits and the NUL. */
847 len = strlen(obd->obd_name) + 6;
848 OBD_ALLOC(niduuid, len);
850 strcpy(niduuid, obd->obd_name);
851 ptr = niduuid + strlen(niduuid);
854 rc = class_manual_cleanup(obd);
858 /* Clean the nid uuids */
860 GOTO(out, rc = -ENOMEM);
862 for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) {
863 sprintf(ptr, "_%x", i);
864 rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID,
/* these are MGC nid uuids; the message used to say "MDC" */
867 CERROR("del MGC UUID %s failed: rc = %d\n",
872 OBD_FREE(niduuid, len);
874 /* class_import_put will get rid of the additional connections */
875 cfs_mutex_up(&mgc_start_lock);
879 /* Since there's only one mgc per node, we have to change it's fs to get
880 access to the right disk. */
/*
 * Hands sb to the MGC through obd_set_info_async() under the KEY_SET_FS
 * key, with sizeof(*sb) as the value length.
 * NOTE(review): the rc test and the return statement are on lines not
 * visible in this listing.
 */
881 static int server_mgc_set_fs(struct obd_device *mgc, struct super_block *sb)
883 struct lustre_sb_info *lsi = s2lsi(sb);
887 CDEBUG(D_MOUNT, "Set mgc disk for %s\n", lsi->lsi_lmd->lmd_dev);
889 /* cl_mgc_sem in mgc insures we sleep if the mgc_fs is busy */
890 rc = obd_set_info_async(mgc->obd_self_export,
891 sizeof(KEY_SET_FS), KEY_SET_FS,
892 sizeof(*sb), sb, NULL);
894 CERROR("can't set_fs %d\n", rc);
/*
 * Counterpart of server_mgc_set_fs(): issues KEY_CLEAR_FS on the MGC's self
 * export via obd_set_info_async().
 * NOTE(review): the remaining arguments of the call and the return
 * statement are on lines not visible in this listing.
 */
900 static int server_mgc_clear_fs(struct obd_device *mgc)
905 CDEBUG(D_MOUNT, "Unassign mgc disk\n");
907 rc = obd_set_info_async(mgc->obd_self_export,
908 sizeof(KEY_CLEAR_FS), KEY_CLEAR_FS,
/* Held while the MDS/OSS service obds are looked up and started in
 * server_start_targets(), and while they are stopped here. */
913 CFS_DECLARE_MUTEX(server_start_lock);
915 /* Stop MDS/OSS if nobody is using them */
/*
 * Depending on the target type bits in lddflags, looks up the MDS or OSS
 * service obd and its obd_type; when the obd exists and the type is missing
 * or has a zero typ_refcnt, the obd is cleaned up with
 * class_manual_cleanup().  Runs under server_start_lock.
 * NOTE(review): the use of lsiflags and the return statement are on lines
 * not visible in this listing.
 */
916 static int server_stop_servers(int lddflags, int lsiflags)
918 struct obd_device *obd = NULL;
919 struct obd_type *type = NULL;
923 cfs_mutex_down(&server_start_lock);
925 /* Either an MDT or an OST or neither */
926 /* if this was an MDT, and there are no more MDT's, clean up the MDS */
927 if ((lddflags & LDD_F_SV_TYPE_MDT) &&
928 (obd = class_name2obd(LUSTRE_MDS_OBDNAME))) {
929 /*FIXME pre-rename, should eventually be LUSTRE_MDT_NAME*/
930 type = class_search_type(LUSTRE_MDS_NAME);
932 /* if this was an OST, and there are no more OST's, clean up the OSS */
933 if ((lddflags & LDD_F_SV_TYPE_OST) &&
934 (obd = class_name2obd(LUSTRE_OSS_OBDNAME))) {
935 type = class_search_type(LUSTRE_OST_NAME);
938 if (obd && (!type || !type->typ_refcnt)) {
941 /* obd_fail doesn't mean much on a server obd */
942 err = class_manual_cleanup(obd);
947 cfs_mutex_up(&server_start_lock);
/*
 * Dump the identifying fields of an mgs_target_info (server name, fs name,
 * uuid, config version and flags) through PRINT_CMD, prefixed by title.
 * NOTE(review): the return statement is not visible in this listing.
 */
952 int server_mti_print(char *title, struct mgs_target_info *mti)
954 PRINT_CMD(PRINT_MASK, "mti %s\n", title);
955 PRINT_CMD(PRINT_MASK, "server: %s\n", mti->mti_svname);
956 PRINT_CMD(PRINT_MASK, "fs: %s\n", mti->mti_fsname);
957 PRINT_CMD(PRINT_MASK, "uuid: %s\n", mti->mti_uuid);
958 PRINT_CMD(PRINT_MASK, "ver: %d flags: %#x\n",
959 mti->mti_config_ver, mti->mti_flags);
963 static int server_sb2mti(struct super_block *sb, struct mgs_target_info *mti)
965 struct lustre_sb_info *lsi = s2lsi(sb);
966 struct lustre_disk_data *ldd = lsi->lsi_ldd;
967 lnet_process_id_t id;
971 if (!(lsi->lsi_flags & LSI_SERVER))
974 strncpy(mti->mti_fsname, ldd->ldd_fsname,
975 sizeof(mti->mti_fsname));
976 strncpy(mti->mti_svname, ldd->ldd_svname,
977 sizeof(mti->mti_svname));
979 mti->mti_nid_count = 0;
980 while (LNetGetId(i++, &id) != -ENOENT) {
981 if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
984 /* server use --servicenode param, only allow specified
985 * nids be registered */
986 if ((ldd->ldd_flags & LDD_F_NO_PRIMNODE) != 0 &&
987 class_match_nid(ldd->ldd_params,
988 PARAM_FAILNODE, id.nid) < 1)
991 /* match specified network */
992 if (!class_match_net(ldd->ldd_params,
993 PARAM_NETWORK, LNET_NIDNET(id.nid)))
996 mti->mti_nids[mti->mti_nid_count] = id.nid;
997 mti->mti_nid_count++;
998 if (mti->mti_nid_count >= MTI_NIDS_MAX) {
999 CWARN("Only using first %d nids for %s\n",
1000 mti->mti_nid_count, mti->mti_svname);
1005 mti->mti_lustre_ver = LUSTRE_VERSION_CODE;
1006 mti->mti_config_ver = 0;
1007 if (lsi->lsi_lmd->lmd_flags & LMD_FLG_WRITECONF)
1008 ldd->ldd_flags |= LDD_F_WRITECONF;
1009 mti->mti_flags = ldd->ldd_flags;
1010 mti->mti_stripe_index = ldd->ldd_svindex;
1011 memcpy(mti->mti_uuid, ldd->ldd_uuid, sizeof(mti->mti_uuid));
1012 if (strlen(ldd->ldd_params) > sizeof(mti->mti_params)) {
1013 CERROR("params too big for mti\n");
1016 memcpy(mti->mti_params, ldd->ldd_params, sizeof(mti->mti_params));
1020 /* Register an old or new target with the MGS. If needed MGS will construct
1021 startup logs and assign index */
/*
 * Builds an mgs_target_info with server_sb2mti() and sends it to the MGS
 * through obd_set_info_async(KEY_REGISTER_TARGET) on the MGC's MGS export.
 * The flags returned in mti are copied back into ldd_flags (minus
 * LDD_F_REWRITE_LDD); when the MGS set LDD_F_REWRITE_LDD the new index and
 * server name are written to disk, the fs label is updated and the fs is
 * synced.
 * NOTE(review): the allocation and freeing of mti, the rc tests and the
 * return statement are on lines not visible in this listing.
 */
1022 int server_register_target(struct super_block *sb)
1024 struct lustre_sb_info *lsi = s2lsi(sb);
1025 struct obd_device *mgc = lsi->lsi_mgc;
1026 struct lustre_disk_data *ldd = lsi->lsi_ldd;
1027 struct mgs_target_info *mti = NULL;
1033 if (!(lsi->lsi_flags & LSI_SERVER))
1039 rc = server_sb2mti(sb, mti);
1043 CDEBUG(D_MOUNT, "Registration %s, fs=%s, %s, index=%04x, flags=%#x\n",
1044 mti->mti_svname, mti->mti_fsname,
1045 libcfs_nid2str(mti->mti_nids[0]), mti->mti_stripe_index,
1048 /* Register the target */
1049 /* FIXME use mgc_process_config instead */
1050 rc = obd_set_info_async(mgc->u.cli.cl_mgc_mgsexp,
1051 sizeof(KEY_REGISTER_TARGET), KEY_REGISTER_TARGET,
1052 sizeof(*mti), mti, NULL);
1056 /* Always update our flags */
1057 ldd->ldd_flags = mti->mti_flags & ~LDD_F_REWRITE_LDD;
1059 /* If this flag is set, it means the MGS wants us to change our
1060 on-disk data. (So far this means just the index.) */
1061 if (mti->mti_flags & LDD_F_REWRITE_LDD) {
1064 CDEBUG(D_MOUNT, "Changing on-disk index from %#x to %#x "
1065 "for %s\n", ldd->ldd_svindex, mti->mti_stripe_index,
1067 ldd->ldd_svindex = mti->mti_stripe_index;
/* NOTE(review): strncpy() leaves ldd_svname unterminated if mti_svname
 * fills the whole field -- confirm this cannot happen. */
1068 strncpy(ldd->ldd_svname, mti->mti_svname,
1069 sizeof(ldd->ldd_svname));
1070 /* or ldd_make_sv_name(ldd); */
/* NOTE(review): the ldd_write() return value is not checked here */
1071 ldd_write(&mgc->obd_lvfs_ctxt, ldd);
1072 err = fsfilt_set_label(mgc, lsi->lsi_srv_mnt->mnt_sb,
1075 CERROR("Label set error %d\n", err);
1076 label = fsfilt_get_label(mgc, lsi->lsi_srv_mnt->mnt_sb);
1078 CDEBUG(D_MOUNT, "Disk label changed to %s\n", label);
1080 /* Flush the new ldd to disk */
1081 fsfilt_sync(mgc, lsi->lsi_srv_mnt->mnt_sb);
1090 /** Start server targets: MDTs and OSTs
 */
/*
 * Visible sequence:
 *  1. for MDT/OST targets, make sure the node-wide MDS/OSS service obd is
 *     running (started under server_start_lock via lustre_start_simple());
 *  2. point the MGC at this server's disk (server_mgc_set_fs);
 *  3. register with the MGS (server_register_target);
 *  4. register the mount under the target's service name;
 *  5. process the target's config log (lustre_process_log);
 *  6. release the MGC fs, look up the started obd, optionally abort
 *     recovery, and notify it that the config log has been processed.
 * NOTE(review): the rc tests, labels and return statements are on lines not
 * visible in this listing.
 */
1092 static int server_start_targets(struct super_block *sb, struct vfsmount *mnt)
1094 struct obd_device *obd;
1095 struct lustre_sb_info *lsi = s2lsi(sb);
1096 struct config_llog_instance cfg;
1100 CDEBUG(D_MOUNT, "starting target %s\n", lsi->lsi_ldd->ldd_svname);
1103 /* If we're an MDT, make sure the global MDS is running */
1104 if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_MDT) {
1105 /* make sure the MDS is started */
1106 cfs_mutex_down(&server_start_lock);
1107 obd = class_name2obd(LUSTRE_MDS_OBDNAME);
1109 rc = lustre_start_simple(LUSTRE_MDS_OBDNAME,
1110 /* FIXME pre-rename, should eventually be LUSTRE_MDS_NAME */
1112 LUSTRE_MDS_OBDNAME"_uuid",
1115 cfs_mutex_up(&server_start_lock);
1116 CERROR("failed to start MDS: %d\n", rc);
1120 cfs_mutex_up(&server_start_lock);
1124 /* If we're an OST, make sure the global OSS is running */
1125 if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_OST) {
1126 /* make sure OSS is started */
1127 cfs_mutex_down(&server_start_lock);
1128 obd = class_name2obd(LUSTRE_OSS_OBDNAME);
1130 rc = lustre_start_simple(LUSTRE_OSS_OBDNAME,
1132 LUSTRE_OSS_OBDNAME"_uuid",
1135 cfs_mutex_up(&server_start_lock);
1136 CERROR("failed to start OSS: %d\n", rc);
1140 cfs_mutex_up(&server_start_lock);
1143 /* Set the mgc fs to our server disk. This allows the MGC to
1144 * read and write configs locally, in case it can't talk to the MGS. */
1145 rc = server_mgc_set_fs(lsi->lsi_mgc, sb);
1149 /* Register with MGS */
1150 rc = server_register_target(sb);
/* registration failure is reported as required only for targets that
 * still need an index, an update or a 1.4 upgrade */
1151 if (rc && (lsi->lsi_ldd->ldd_flags &
1152 (LDD_F_NEED_INDEX | LDD_F_UPDATE | LDD_F_UPGRADE14))){
1153 CERROR("Required registration failed for %s: %d\n",
1154 lsi->lsi_ldd->ldd_svname, rc);
1156 LCONSOLE_ERROR_MSG(0x15f, "Communication error with "
1157 "the MGS. Is the MGS running?\n");
1161 if (rc == -EINVAL) {
1162 LCONSOLE_ERROR_MSG(0x160, "The MGS is refusing to allow this "
1163 "server (%s) to start. Please see messages"
1164 " on the MGS node.\n",
1165 lsi->lsi_ldd->ldd_svname);
1168 /* non-fatal error of registration with MGS */
1170 CDEBUG(D_MOUNT, "Cannot register with MGS: %d\n", rc);
1172 /* Let the target look up the mount using the target's name
1173 (we can't pass the sb or mnt through class_process_config.) */
1174 rc = server_register_mount(lsi->lsi_ldd->ldd_svname, sb, mnt);
1178 /* Start targets using the llog named for the target */
1179 memset(&cfg, 0, sizeof(cfg));
1180 rc = lustre_process_log(sb, lsi->lsi_ldd->ldd_svname, &cfg);
1182 CERROR("failed to start server %s: %d\n",
1183 lsi->lsi_ldd->ldd_svname, rc);
1184 /* Do NOT call server_deregister_mount() here. This makes it
1185 * impossible to find mount later in cleanup time and leaves
1186 * @lsi and other stuff leaked. -umka */
1191 /* Release the mgc fs for others to use */
1192 server_mgc_clear_fs(lsi->lsi_mgc);
1195 obd = class_name2obd(lsi->lsi_ldd->ldd_svname);
1197 CERROR("no server named %s was started\n",
1198 lsi->lsi_ldd->ldd_svname);
/* abort recovery only when requested on the mount line and the obd
 * provides an iocontrol method */
1202 if ((lsi->lsi_lmd->lmd_flags & LMD_FLG_ABORT_RECOV) &&
1203 (OBP(obd, iocontrol))) {
1204 obd_iocontrol(OBD_IOC_ABORT_RECOVERY,
1205 obd->obd_self_export, 0, NULL, NULL);
1208 /* log has been fully processed */
1209 obd_notify(obd, NULL, OBD_NOTIFY_CONFIG, (void *)CONFIG_LOG);
1215 /***************** lustre superblock **************/
/*
 * Allocate and initialise the lustre_sb_info for sb.
 *
 * Allocates the embedded lustre_mount_data, zeroes its exclude count and
 * recovery timeouts, attaches the lsi to sb, sets lsi_mounts to 1 and
 * selects LSI_UMOUNT_FAILOVER as the default umount style.
 * NOTE(review): the allocation of lsi itself, the failure handling and the
 * return statement are on lines not visible in this listing.
 */
1217 struct lustre_sb_info *lustre_init_lsi(struct super_block *sb)
1219 struct lustre_sb_info *lsi;
1225 OBD_ALLOC_PTR(lsi->lsi_lmd);
1226 if (!lsi->lsi_lmd) {
1231 lsi->lsi_lmd->lmd_exclude_count = 0;
1232 lsi->lsi_lmd->lmd_recovery_time_soft = 0;
1233 lsi->lsi_lmd->lmd_recovery_time_hard = 0;
1234 s2lsi_nocast(sb) = lsi;
1235 /* we take 1 extra ref for our setup */
1236 cfs_atomic_set(&lsi->lsi_mounts, 1);
1238 /* Default umount style */
1239 lsi->lsi_flags = LSI_UMOUNT_FAILOVER;
/*
 * Free the lustre_sb_info attached to sb and everything it owns.
 *
 * Asserts that no mount references remain, then frees the disk data, each
 * string held by the mount data (device, profile, mgssec, opts), the
 * exclude array, the mount data itself and finally the lsi, and clears the
 * pointer stored in sb.
 * NOTE(review): the return statement is not visible in this listing.
 */
1244 static int lustre_free_lsi(struct super_block *sb)
1246 struct lustre_sb_info *lsi = s2lsi(sb);
1249 LASSERT(lsi != NULL);
1250 CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi);
1252 /* someone didn't call server_put_mount. */
1253 LASSERT(cfs_atomic_read(&lsi->lsi_mounts) == 0);
1255 if (lsi->lsi_ldd != NULL)
1256 OBD_FREE(lsi->lsi_ldd, sizeof(*lsi->lsi_ldd));
/* each string is freed with strlen + 1 as its size */
1258 if (lsi->lsi_lmd != NULL) {
1259 if (lsi->lsi_lmd->lmd_dev != NULL)
1260 OBD_FREE(lsi->lsi_lmd->lmd_dev,
1261 strlen(lsi->lsi_lmd->lmd_dev) + 1);
1262 if (lsi->lsi_lmd->lmd_profile != NULL)
1263 OBD_FREE(lsi->lsi_lmd->lmd_profile,
1264 strlen(lsi->lsi_lmd->lmd_profile) + 1);
1265 if (lsi->lsi_lmd->lmd_mgssec != NULL)
1266 OBD_FREE(lsi->lsi_lmd->lmd_mgssec,
1267 strlen(lsi->lsi_lmd->lmd_mgssec) + 1);
1268 if (lsi->lsi_lmd->lmd_opts != NULL)
1269 OBD_FREE(lsi->lsi_lmd->lmd_opts,
1270 strlen(lsi->lsi_lmd->lmd_opts) + 1);
/* the exclude array is sized by its element count */
1271 if (lsi->lsi_lmd->lmd_exclude_count)
1272 OBD_FREE(lsi->lsi_lmd->lmd_exclude,
1273 sizeof(lsi->lsi_lmd->lmd_exclude[0]) *
1274 lsi->lsi_lmd->lmd_exclude_count);
1275 OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd));
/* the client-side sb info must already be gone */
1278 LASSERT(lsi->lsi_llsbi == NULL);
1279 OBD_FREE(lsi, sizeof(*lsi));
1280 s2lsi_nocast(sb) = NULL;
1285 /* The lsi has one reference for every server that is using the disk -
1286 e.g. MDT, MGS, and potentially MGC */
/*
 * Drop one reference on the lsi attached to sb; when lsi_mounts reaches
 * zero the lsi is freed with lustre_free_lsi().
 * NOTE(review): the return statements are not visible in this listing;
 * server_put_mount() treats a non-zero result as "last reference dropped".
 */
1287 static int lustre_put_lsi(struct super_block *sb)
1289 struct lustre_sb_info *lsi = s2lsi(sb);
1292 LASSERT(lsi != NULL);
1294 CDEBUG(D_MOUNT, "put %p %d\n", sb, cfs_atomic_read(&lsi->lsi_mounts));
1295 if (cfs_atomic_dec_and_test(&lsi->lsi_mounts)) {
1296 lustre_free_lsi(sb);
1302 /*************** server mount ******************/
1304 /** Kernel mount using mount options in MOUNT_DATA_FILE.
1305 * Since this file lives on the disk, we pre-mount using a common
1306 * type, read the file, then re-mount using the type specified in the
1309 static struct vfsmount *server_kernel_mount(struct super_block *sb)
1311 struct lvfs_run_ctxt mount_ctxt;
1312 struct lustre_sb_info *lsi = s2lsi(sb);
1313 struct lustre_disk_data *ldd;
1314 struct lustre_mount_data *lmd = lsi->lsi_lmd;
1315 struct vfsmount *mnt;
1316 char *options = NULL;
1317 unsigned long page, s_flags;
1318 struct page *__page;
1322 OBD_ALLOC(ldd, sizeof(*ldd));
1324 RETURN(ERR_PTR(-ENOMEM));
1326 /* In the past, we have always used flags = 0.
1327 Note ext3/ldiskfs can't be mounted ro. */
1328 s_flags = sb->s_flags;
1330 /* allocate memory for options (one page, zeroed below) */
1331 OBD_PAGE_ALLOC(__page, CFS_ALLOC_STD);
1333 GOTO(out_free, rc = -ENOMEM);
1334 page = (unsigned long)cfs_page_address(__page);
1335 options = (char *)page;
1336 memset(options, 0, CFS_PAGE_SIZE);
1338 /* mount-line options must be added for pre-mount because it may
1339 * contain mount options such as journal_dev which are required
1340 * to mount successfully the underlying filesystem */
/* options is all zeroes at this point, so strncat appends at most
 * CFS_PAGE_SIZE - 1 bytes plus the terminator and stays inside the page */
1341 if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0))
1342 strncat(options, lmd->lmd_opts, CFS_PAGE_SIZE - 1);
1344 /* Pre-mount ldiskfs to read the MOUNT_DATA_FILE */
1345 CDEBUG(D_MOUNT, "Pre-mount ldiskfs %s\n", lmd->lmd_dev);
1346 mnt = ll_kern_mount("ldiskfs", s_flags, lmd->lmd_dev, (void *)options);
1349 CERROR("premount %s:%#lx ldiskfs failed: %d "
1350 "Is the ldiskfs module available?\n",
1351 lmd->lmd_dev, s_flags, rc );
/* Point a run context at the root of the pre-mount and hand it to
 * ldd_parse(), which fills in ldd (presumably by reading the on-disk
 * mount data file - confirm against ldd_parse) */
1355 OBD_SET_CTXT_MAGIC(&mount_ctxt);
1356 mount_ctxt.pwdmnt = mnt;
1357 mount_ctxt.pwd = mnt->mnt_root;
1358 mount_ctxt.fs = get_ds();
1360 rc = ldd_parse(&mount_ctxt, ldd);
1364 CERROR("premount parse options failed: rc = %d\n", rc);
1368 /* Done with our pre-mount, now do the real mount. */
1370 /* Glom up mount options */
/* the page is re-zeroed and at most CFS_PAGE_SIZE - 2 bytes are copied, so
 * the result is always terminated and leaves room for the comma below */
1371 memset(options, 0, CFS_PAGE_SIZE);
1372 strncpy(options, ldd->ldd_mount_opts, CFS_PAGE_SIZE - 2);
1374 /* Add in any mount-line options */
1375 if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0)) {
/* len = space left in the page after the comma and the terminator */
1376 int len = CFS_PAGE_SIZE - strlen(options) - 2;
1378 strcat(options, ",");
1379 strncat(options, lmd->lmd_opts, len);
1382 /* Special permanent mount flags */
1384 s_flags |= MS_NOATIME | MS_NODIRATIME;
1386 CDEBUG(D_MOUNT, "kern_mount: %s %s %s\n",
1387 MT_STR(ldd), lmd->lmd_dev, options);
1388 mnt = ll_kern_mount(MT_STR(ldd), s_flags, lmd->lmd_dev,
1392 CERROR("ll_kern_mount failed: rc = %d\n", rc);
/* LMD_FLG_ABORT_RECOV set: LAST_RCVD is passed to simple_truncate() on
 * the newly mounted filesystem */
1396 if (lmd->lmd_flags & LMD_FLG_ABORT_RECOV)
1397 simple_truncate(mnt->mnt_sb->s_root, mnt, LAST_RCVD,
/* Success path: the option page is released and ldd is handed over to lsi.
 * NOTE(review): the return of mnt is not visible in this excerpt. */
1400 OBD_PAGE_FREE(__page);
1401 lsi->lsi_ldd = ldd; /* freed at lsi cleanup */
1402 CDEBUG(D_SUPER, "%s: mnt = %p\n", lmd->lmd_dev, mnt);
/* Failure path: release the option page and ldd, clear lsi_ldd and return
 * the error as an ERR_PTR. The goto labels are not visible here. */
1407 OBD_PAGE_FREE(__page);
1408 OBD_FREE(ldd, sizeof(*ldd));
1409 lsi->lsi_ldd = NULL;
1410 RETURN(ERR_PTR(rc));
1413 /** Wait here forever until the mount refcount is 0 before completing umount,
1414 * else we risk dereferencing a null pointer.
1415 * LNET may take e.g. 165s before killing zombies.
 * The loop below keeps going while mnt->mnt_count is above 1 (so it waits
 * for a count of 1, not 0), sleeping with a timeout of cfs_time_seconds(3)
 * per pass, and prints a console warning whenever the waited counter is a
 * non-zero multiple of 30.
 * NOTE(review): the declarations of waitq and waited, and the end of this
 * comment, are not visible in this excerpt.
1417 static void server_wait_finished(struct vfsmount *mnt)
1421 cfs_sigset_t blocked;
1423 cfs_waitq_init(&waitq);
1425 while (atomic_read(&mnt->mnt_count) > 1) {
1426 if (waited && (waited % 30 == 0))
1427 LCONSOLE_WARN("Mount still busy with %d refs after "
1429 atomic_read(&mnt->mnt_count),
1431 /* Cannot use l_event_wait() for an interruptible sleep. */
/* presumably blocks every signal except SIGKILL for the duration of the
 * sleep; the saved mask is put back by cfs_block_sigs() - TODO confirm */
1433 blocked = cfs_block_sigsinv(sigmask(SIGKILL));
1434 cfs_waitq_wait_event_interruptible_timeout(
1436 (atomic_read(&mnt->mnt_count) == 1),
1437 cfs_time_seconds(3),
1439 cfs_block_sigs(blocked);
/* NOTE(review): the condition guarding this emergency message is not
 * visible in this excerpt */
1441 LCONSOLE_EMERG("Danger: interrupted umount %s with "
1442 "%d refs!\n", mnt->mnt_devname,
1443 atomic_read(&mnt->mnt_count));
1450 /** Start the shutdown of servers at umount.
 * Visible sequence: stop the target obd unless LMD_FLG_NOSVC is set, stop
 * the MGS when the disk carries one and LMD_FLG_NOMGS is not set, drop the
 * MGC and sb through lustre_common_put_super(), wait for the mount to go
 * idle, stop the servers, and finally clean up the obd named by extraname
 * if one was recorded.
 * lddflags and lsiflags are copied up front, before the cleanup calls.
 * NOTE(review): tmpname and extraname are written right after OBD_ALLOC
 * with no NULL check in between - confirm allocation failure is handled.
1452 static void server_put_super(struct super_block *sb)
1454 struct lustre_sb_info *lsi = s2lsi(sb);
1455 struct obd_device *obd;
1456 struct vfsmount *mnt = lsi->lsi_srv_mnt;
1457 char *tmpname, *extraname = NULL;
1459 int lddflags = lsi->lsi_ldd->ldd_flags;
1460 int lsiflags = lsi->lsi_flags;
1463 LASSERT(lsiflags & LSI_SERVER);
1465 tmpname_sz = strlen(lsi->lsi_ldd->ldd_svname) + 1;
1466 OBD_ALLOC(tmpname, tmpname_sz);
1467 memcpy(tmpname, lsi->lsi_ldd->ldd_svname, tmpname_sz);
1468 CDEBUG(D_MOUNT, "server put_super %s\n", tmpname);
1469 if (IS_MDT(lsi->lsi_ldd) && (lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC))
1470 snprintf(tmpname, tmpname_sz, "MGS");
1472 /* Stop the target */
1473 if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
1474 (IS_MDT(lsi->lsi_ldd) || IS_OST(lsi->lsi_ldd))) {
1475 struct lustre_profile *lprof = NULL;
1477 /* tell the mgc to drop the config log */
1478 lustre_end_log(sb, lsi->lsi_ldd->ldd_svname, NULL);
1480 /* COMPAT_146 - profile may get deleted in mgc_cleanup.
1481 If there are any setup/cleanup errors, save the lov
1482 name for safety cleanup later. */
1483 lprof = class_get_profile(lsi->lsi_ldd->ldd_svname);
1484 if (lprof && lprof->lp_dt) {
1485 OBD_ALLOC(extraname, strlen(lprof->lp_dt) + 1);
1486 strcpy(extraname, lprof->lp_dt);
/* look the target obd up by its service name */
1489 obd = class_name2obd(lsi->lsi_ldd->ldd_svname);
1491 CDEBUG(D_MOUNT, "stopping %s\n", obd->obd_name);
1492 if (lsi->lsi_flags & LSI_UMOUNT_FAILOVER)
1494 /* We can't seem to give an error return code
1495 * to .put_super, so we better make sure we clean up! */
1497 class_manual_cleanup(obd);
1499 CERROR("no obd %s\n", lsi->lsi_ldd->ldd_svname);
/* the mount registered under this service name is removed as well */
1500 server_deregister_mount(lsi->lsi_ldd->ldd_svname);
1504 /* If they wanted the mgs to stop separately from the mdt, they
1505 should have put it on a different device. */
1506 if (IS_MGS(lsi->lsi_ldd)) {
1507 /* if MDS start with --nomgs, don't stop MGS then */
1508 if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS))
1509 server_stop_mgs(sb);
1512 /* Clean the mgc and sb */
1513 lustre_common_put_super(sb);
1515 /* Wait for the targets to really clean up - can't exit (and let the
1516 sb get destroyed) while the mount is still in use */
1517 server_wait_finished(mnt);
1519 /* drop the One True Mount */
1522 /* Stop the servers (MDS, OSS) if no longer needed. We must wait
1523 until the target is really gone so that our type refcount check
1525 server_stop_servers(lddflags, lsiflags);
1527 /* In case of startup or cleanup err, stop related obds */
1529 obd = class_name2obd(extraname);
1531 CWARN("Cleaning orphaned obd %s\n", extraname);
1533 class_manual_cleanup(obd);
1535 OBD_FREE(extraname, strlen(extraname) + 1);
1538 LCONSOLE_WARN("server umount %s complete\n", tmpname);
1539 OBD_FREE(tmpname, tmpname_sz);
1543 /** Called only for 'umount -f'
 * Replaces the LSI_UMOUNT_FAILOVER flag with LSI_UMOUNT_FORCE on the lsi of
 * the superblock being unmounted.
 * With HAVE_UMOUNTBEGIN_VFSMOUNT the hook takes a vfsmount plus flags and
 * tests MNT_FORCE; otherwise it takes the super_block directly.
 * NOTE(review): the preprocessor else/endif lines and the body of the
 * MNT_FORCE test are not visible in this excerpt.
1545 #ifdef HAVE_UMOUNTBEGIN_VFSMOUNT
1546 static void server_umount_begin(struct vfsmount *vfsmnt, int flags)
1548 struct super_block *sb = vfsmnt->mnt_sb;
1550 static void server_umount_begin(struct super_block *sb)
1553 struct lustre_sb_info *lsi = s2lsi(sb);
1556 #ifdef HAVE_UMOUNTBEGIN_VFSMOUNT
1557 if (!(flags & MNT_FORCE)) {
1563 CDEBUG(D_MOUNT, "umount -f\n");
1564 /* umount = failover
1566 no third way to do non-force, non-failover */
/* the two umount style flags are swapped here, never both set */
1567 lsi->lsi_flags &= ~LSI_UMOUNT_FAILOVER;
1568 lsi->lsi_flags |= LSI_UMOUNT_FORCE;
/* statfs hook for a server superblock.
 * When the backing mount (lsi_srv_mnt) has a statfs operation the call is
 * forwarded to it and f_type is then set to the magic of this sb. Otherwise
 * f_type, f_bsize and f_namelen are filled in from sb and NAME_MAX.
 * The signature takes a super_block when HAVE_STATFS_DENTRY_PARAM is not
 * defined and a dentry when it is.
 * NOTE(review): return statements, further buf fields and the preprocessor
 * else/endif lines are not visible in this excerpt. */
1572 #ifndef HAVE_STATFS_DENTRY_PARAM
1573 static int server_statfs (struct super_block *sb, cfs_kstatfs_t *buf)
1576 static int server_statfs (struct dentry *dentry, cfs_kstatfs_t *buf)
1578 struct super_block *sb = dentry->d_sb;
1580 struct vfsmount *mnt = s2lsi(sb)->lsi_srv_mnt;
/* every link of the chain is checked before the backing statfs is called */
1583 if (mnt && mnt->mnt_sb && mnt->mnt_sb->s_op->statfs) {
1584 #ifdef HAVE_STATFS_DENTRY_PARAM
1585 int rc = mnt->mnt_sb->s_op->statfs(mnt->mnt_root, buf);
1587 int rc = mnt->mnt_sb->s_op->statfs(mnt->mnt_sb, buf);
/* report the magic of this sb rather than that of the backing filesystem */
1590 buf->f_type = sb->s_magic;
/* fallback values used when no backing statfs is available */
1596 buf->f_type = sb->s_magic;
1597 buf->f_bsize = sb->s_blocksize;
1603 buf->f_namelen = NAME_MAX;
1607 /** The operations we support directly on the superblock:
1608 * mount, umount, and df.
 * Only the put_super, umount_begin and statfs members are set in the lines
 * visible here; they point at the server_* functions of this file.
1610 static struct super_operations server_ops =
1612 .put_super = server_put_super,
1613 .umount_begin = server_umount_begin, /* umount -f */
1614 .statfs = server_statfs,
/* log2(n) is computed as cfs_ffz(~(n)); presumably cfs_ffz returns the index
 * of the lowest zero bit, which makes this exact for powers of two only -
 * TODO confirm against the cfs_ffz definition. */
1617 #define log2(n) cfs_ffz(~(n))
/* magic number stored in s_magic of server superblocks */
1618 #define LUSTRE_SUPER_MAGIC 0x0BD00BD1
/* Fill in the generic superblock fields for a server mount: a fixed block
 * size of 4096, LUSTRE_SUPER_MAGIC, s_maxbytes of 0, the MS_RDONLY flag and
 * server_ops, then create a root inode with directory mode and its root
 * dentry.
 * NOTE(review): the failure checks after new_inode() and d_alloc_root() and
 * the return statements are not visible in this excerpt. */
1620 static int server_fill_super_common(struct super_block *sb)
1622 struct inode *root = 0;
1625 CDEBUG(D_MOUNT, "Server sb, dev=%d\n", (int)sb->s_dev);
1627 sb->s_blocksize = 4096;
/* derived from the block size set on the previous line */
1628 sb->s_blocksize_bits = log2(sb->s_blocksize);
1629 sb->s_magic = LUSTRE_SUPER_MAGIC;
1630 sb->s_maxbytes = 0; //PAGE_CACHE_MAXBYTES;
1631 sb->s_flags |= MS_RDONLY;
1632 sb->s_op = &server_ops;
1634 root = new_inode(sb);
1636 CERROR("Can't make root inode\n");
1640 /* returns -EIO for every operation */
1641 /* make_bad_inode(root); -- badness - can't umount */
1642 /* apparently we need to be a directory for the mount to finish */
1643 root->i_mode = S_IFDIR;
1645 sb->s_root = d_alloc_root(root);
1647 CERROR("Can't make root dentry\n");
1655 /** Fill in the superblock info for a Lustre server.
1656 * Mount the device with the correct options.
1657 * Read the on-disk config file.
1658 * Start the services.
 * Visible order: kernel mount, duplicate-target check, MGS (when present
 * and LMD_FLG_NOMGS is not set), MGC, targets (unless LMD_FLG_NOSVC), then
 * server_fill_super_common().
 * Failures while starting targets or the MGS are cleaned up through
 * server_put_super().
 * NOTE(review): the error checks, goto labels and return statements are
 * not visible in this excerpt.
1660 static int server_fill_super(struct super_block *sb)
1662 struct lustre_sb_info *lsi = s2lsi(sb);
1663 struct vfsmount *mnt;
1667 /* the One True Mount */
1668 mnt = server_kernel_mount(sb);
1671 CERROR("Unable to mount device %s: %d\n",
1672 lsi->lsi_lmd->lmd_dev, rc);
1676 lsi->lsi_srv_mnt = mnt;
/* server_kernel_mount() stores the ldd in lsi on success */
1678 LASSERT(lsi->lsi_ldd);
1679 CDEBUG(D_MOUNT, "Found service %s for fs '%s' on device %s\n",
1680 lsi->lsi_ldd->ldd_svname, lsi->lsi_ldd->ldd_fsname,
1681 lsi->lsi_lmd->lmd_dev);
/* an obd with this service name already exists: report a double mount */
1683 if (class_name2obd(lsi->lsi_ldd->ldd_svname)) {
1684 LCONSOLE_ERROR_MSG(0x161, "The target named %s is already "
1685 "running. Double-mount may have compromised"
1686 " the disk journal.\n",
1687 lsi->lsi_ldd->ldd_svname);
1693 /* Start MGS before MGC */
1694 if (IS_MGS(lsi->lsi_ldd) && !(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)){
1695 rc = server_start_mgs(sb);
1700 /* Start MGC before servers */
1701 rc = lustre_start_mgc(sb);
1705 /* Set up all obd devices for service */
1706 if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
1707 (IS_OST(lsi->lsi_ldd) || IS_MDT(lsi->lsi_ldd))) {
1708 rc = server_start_targets(sb, mnt);
1710 CERROR("Unable to start targets: %d\n", rc);
1713 /* FIXME overmount client here,
1714 or can we just start a client log and client_fill_super on this sb?
1715 We need to make sure server_put_super gets called too - ll_put_super
1716 calls lustre_common_put_super; check there for LSI_SERVER flag,
1718 Probably should start client from new thread so we can return.
1719 Client will not finish until all servers are connected.
1720 Note - MGS-only server does NOT get a client, since there is no
1721 lustre fs associated - the MGS is for all lustre fs's */
/* generic superblock setup shared by every server mount */
1724 rc = server_fill_super_common(sb);
1730 /* We jump here in case of failure while starting targets or MGS.
1731 * In this case we can't just put @mnt and have to do real cleanup
1732 * with stopping targets, etc. */
1733 server_put_super(sb);
1737 /* Get the index from the obd name.
1738 rc = server type, or
1740 if endptr isn't NULL it is set to end of name */
/* The type is taken from the three characters after the last dash in
 * svname (MDT or OST) and the index from the hexadecimal digits that
 * follow. The suffix "all" yields the type ORed with LDD_F_SV_ALL.
 * NOTE(review): the error returns and the store into *idx are not visible
 * in this excerpt. */
1741 int server_name2index(char *svname, __u32 *idx, char **endptr)
1743 unsigned long index;
1745 char *dash = strrchr(svname, '-');
1749 /* interpret <fsname>-MDTXXXXX-mdc as mdt, the better way is to pass
1750 * in the fsname, then determine the server index */
1751 if (!strcmp(LUSTRE_MDC_NAME, dash + 1)) {
/* empty-bodied loop: walks dash backwards to the previous dash, stopping at
 * the start of svname */
1753 for (; dash > svname && *dash != '-'; dash--);
1758 if (strncmp(dash + 1, "MDT", 3) == 0)
1759 rc = LDD_F_SV_TYPE_MDT;
1760 else if (strncmp(dash + 1, "OST", 3) == 0)
1761 rc = LDD_F_SV_TYPE_OST;
/* dash + 4 skips the dash and the three-letter type */
1764 if (strcmp(dash + 4, "all") == 0)
1765 return rc | LDD_F_SV_ALL;
/* base 16; simple_strtoul stores the end of the number through endptr */
1767 index = simple_strtoul(dash + 4, endptr, 16);
1772 /*************** mount common between server and client ***************/
/* Teardown shared by client and server umount. Drops a reference on the MGC
 * through lustre_stop_mgc(); a result of 0 or -ENOENT is not treated as a
 * failure. Per the comments below, a busy MGC is only logged and the
 * mounted-disk reference is dropped afterwards.
 * NOTE(review): the inner error test, the call that drops the disk
 * reference and the return statements are not visible in this excerpt. */
1775 int lustre_common_put_super(struct super_block *sb)
1780 CDEBUG(D_MOUNT, "dropping sb %p\n", sb);
1782 /* Drop a ref to the MGC */
1783 rc = lustre_stop_mgc(sb);
1784 if (rc && (rc != -ENOENT)) {
1786 CERROR("Can't stop MGC: %d\n", rc);
1789 /* BUSY just means that there's some other obd that
1790 needs the mgc. Let him clean it up. */
1791 CDEBUG(D_MOUNT, "MGC still in use\n");
1793 /* Drop a ref to the mounted disk */
/* Dump the parsed mount data through PRINT_CMD with PRINT_MASK: the profile
 * (client mounts only), device, flags, options, the soft and hard recovery
 * times when non-zero, and one line per excluded OST index.
 * NOTE(review): the declaration of i and any guard around the options line
 * are not visible in this excerpt. */
1799 static void lmd_print(struct lustre_mount_data *lmd)
1803 PRINT_CMD(PRINT_MASK, " mount data:\n");
1804 if (lmd_is_client(lmd))
1805 PRINT_CMD(PRINT_MASK, "profile: %s\n", lmd->lmd_profile);
1806 PRINT_CMD(PRINT_MASK, "device: %s\n", lmd->lmd_dev);
1807 PRINT_CMD(PRINT_MASK, "flags: %x\n", lmd->lmd_flags);
1810 PRINT_CMD(PRINT_MASK, "options: %s\n", lmd->lmd_opts);
1812 if (lmd->lmd_recovery_time_soft)
1813 PRINT_CMD(PRINT_MASK, "recovery time soft: %d\n",
1814 lmd->lmd_recovery_time_soft);
1816 if (lmd->lmd_recovery_time_hard)
1817 PRINT_CMD(PRINT_MASK, "recovery time hard: %d\n",
1818 lmd->lmd_recovery_time_hard);
/* indices are printed as four hex digits after the OST prefix */
1820 for (i = 0; i < lmd->lmd_exclude_count; i++) {
1821 PRINT_CMD(PRINT_MASK, "exclude %d: OST%04x\n", i,
1822 lmd->lmd_exclude[i]);
1826 /* Is this server on the exclusion list */
/* Extracts the type and index from svname with server_name2index() and,
 * for OST names only, compares the index against every entry of
 * lmd->lmd_exclude. A match is reported with CWARN.
 * NOTE(review): the return statements and the declarations of rc, i and
 * index are not visible in this excerpt. */
1827 int lustre_check_exclusion(struct super_block *sb, char *svname)
1829 struct lustre_sb_info *lsi = s2lsi(sb);
1830 struct lustre_mount_data *lmd = lsi->lsi_lmd;
1835 rc = server_name2index(svname, &index, NULL);
1836 if (rc != LDD_F_SV_TYPE_OST)
1837 /* Only exclude OSTs */
1840 CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname,
1841 index, lmd->lmd_exclude_count, lmd->lmd_dev);
/* linear scan of the exclusion list */
1843 for(i = 0; i < lmd->lmd_exclude_count; i++) {
1844 if (index == lmd->lmd_exclude[i]) {
1845 CWARN("Excluding %s (on exclusion list)\n", svname);
1852 /* mount -v -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */
/* Parse the exclusion list starting at ptr and record the OST indices in
 * lmd->lmd_exclude, with their number in lmd->lmd_exclude_count. Names
 * that are not OSTs are skipped with a debug message.
 * Indices are collected in a temporary array of devmax entries and then
 * copied into a permanent array sized to the number actually found.
 * NOTE(review): the loop advance, the error exits and the return statement
 * are not visible in this excerpt. */
1853 static int lmd_make_exclusion(struct lustre_mount_data *lmd, char *ptr)
1855 char *s1 = ptr, *s2;
1856 __u32 index, *exclude_list;
1860 /* The shortest an ost name can be is 8 chars: -OST0000.
1861 We don't actually know the fsname at this time, so in fact
1862 a user could specify any fsname. */
/* upper bound on the number of names that can fit in the string */
1863 devmax = strlen(ptr) / 8 + 1;
1865 /* temp storage until we figure out how many we have */
1866 OBD_ALLOC(exclude_list, sizeof(index) * devmax);
1870 /* we enter this fn pointing at the '=' */
/* the list ends at a space, a comma or the end of the string */
1871 while (*s1 && *s1 != ' ' && *s1 != ',') {
1873 rc = server_name2index(s1, &index, &s2);
1875 CERROR("Can't parse server name '%s'\n", s1);
1878 if (rc == LDD_F_SV_TYPE_OST)
1879 exclude_list[lmd->lmd_exclude_count++] = index;
1881 CDEBUG(D_MOUNT, "ignoring exclude %.7s\n", s1);
1883 /* now we are pointing at ':' (next exclude)
1884 or ',' (end of excludes) */
/* guards the temporary array against running past devmax entries */
1885 if (lmd->lmd_exclude_count >= devmax)
1888 if (rc >= 0) /* non-err */
1891 if (lmd->lmd_exclude_count) {
1892 /* permanent, freed in lustre_free_lsi */
1893 OBD_ALLOC(lmd->lmd_exclude, sizeof(index) *
1894 lmd->lmd_exclude_count);
1895 if (lmd->lmd_exclude) {
1896 memcpy(lmd->lmd_exclude, exclude_list,
1897 sizeof(index) * lmd->lmd_exclude_count);
/* presumably the allocation-failure branch: the count is reset so it does
 * not describe a missing array - confirm against the full source */
1900 lmd->lmd_exclude_count = 0;
/* freed with the same size expression that was used to allocate it */
1903 OBD_FREE(exclude_list, sizeof(index) * devmax);
/* Store a NUL-terminated copy of the mgssec value found at ptr in
 * lmd->lmd_mgssec, freeing any value stored earlier. The value appears to
 * end at the first comma in ptr, or at the end of the string when there is
 * none.
 * NOTE(review): the test on tail, the declarations and the return
 * statements are not visible in this excerpt. */
1907 static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr)
1912 if (lmd->lmd_mgssec != NULL) {
1913 OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1);
1914 lmd->lmd_mgssec = NULL;
1917 tail = strchr(ptr, ',');
1919 length = strlen(ptr);
1921 length = tail - ptr;
/* one extra byte for the terminator written below */
1923 OBD_ALLOC(lmd->lmd_mgssec, length + 1);
1924 if (lmd->lmd_mgssec == NULL)
1927 memcpy(lmd->lmd_mgssec, ptr, length);
1928 lmd->lmd_mgssec[length] = '\0';
1932 /** Parse mount line options
1933 * e.g. mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre
1934 * dev is passed as device=uml1:/lustre by mount.lustre
 * Fills lmd with the flags and values of the options recognised here, a
 * copy of the device name, a client profile when the device name contains
 * a colon followed by a slash, and a copy of the option string that is
 * left over.
 * Options are matched by prefix with strncmp.
 * NOTE(review): many lines of this function (loop header, error checks,
 * returns) are not visible in this excerpt.
1936 static int lmd_parse(char *options, struct lustre_mount_data *lmd)
1938 char *s1, *s2, *devname = NULL;
1939 struct lustre_mount_data *raw = (struct lustre_mount_data *)options;
1945 LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that "
1946 "/sbin/mount.lustre is installed.\n");
1950 /* Options should be a string - try to detect old lmd data */
/* the low byte of the magic is ignored in this comparison */
1951 if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) {
1952 LCONSOLE_ERROR_MSG(0x163, "You're using an old version of "
1953 "/sbin/mount.lustre. Please install "
1954 "version %s\n", LUSTRE_VERSION_STRING);
1957 lmd->lmd_magic = LMD_MAGIC;
1959 /* Set default flags here */
/* lower bound applied to both recovery time options below */
1964 int time_min = 2 * (CONNECTION_SWITCH_MAX +
1965 2 * INITIAL_CONNECT_TIMEOUT);
1967 /* Skip whitespace and extra commas */
1968 while (*s1 == ' ' || *s1 == ',')
1971 /* Client options are parsed in ll_options: eg. flock,
1974 /* Parse non-ldiskfs options here. Rather than modifying
1975 ldiskfs, we just zero these out here */
1976 if (strncmp(s1, "abort_recov", 11) == 0) {
1977 lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
1979 } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) {
/* values below time_min are raised to time_min */
1980 lmd->lmd_recovery_time_soft = max_t(int,
1981 simple_strtoul(s1 + 19, NULL, 10), time_min);
1983 } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) {
1984 lmd->lmd_recovery_time_hard = max_t(int,
1985 simple_strtoul(s1 + 19, NULL, 10), time_min);
1987 } else if (strncmp(s1, "nosvc", 5) == 0) {
1988 lmd->lmd_flags |= LMD_FLG_NOSVC;
1990 } else if (strncmp(s1, "nomgs", 5) == 0) {
1991 lmd->lmd_flags |= LMD_FLG_NOMGS;
1993 } else if (strncmp(s1, "writeconf", 9) == 0) {
1994 lmd->lmd_flags |= LMD_FLG_WRITECONF;
1996 } else if (strncmp(s1, "mgssec=", 7) == 0) {
1997 rc = lmd_parse_mgssec(lmd, s1 + 7);
2001 /* ost exclusion list */
2002 } else if (strncmp(s1, "exclude=", 8) == 0) {
/* s1 + 7 is the equals sign itself, which is where lmd_make_exclusion
 * expects to start */
2003 rc = lmd_make_exclusion(lmd, s1 + 7);
2008 /* Linux 2.4 doesn't pass the device, so we stuck it at the
2009 end of the options. */
2010 else if (strncmp(s1, "device=", 7) == 0) {
2012 /* terminate options right before device. device
2013 must be the last one. */
2019 s2 = strchr(s1, ',');
/* shifts the rest of the string, terminator included, down over s1 */
2027 memmove(s1, s2, strlen(s2) + 1);
2033 LCONSOLE_ERROR_MSG(0x164, "Can't find the device name "
2034 "(need mount option 'device=...')\n");
2038 s1 = strstr(devname, ":/");
/* NOTE(review): plain assignment, so any flags set while parsing the
 * options above are replaced here - confirm this is intended */
2041 lmd->lmd_flags = LMD_FLG_CLIENT;
2042 /* Remove leading /s from fsname */
2043 while (*++s1 == '/') ;
2044 /* Freed in lustre_free_lsi */
/* 8 extra bytes: the 7 characters of the -client suffix plus terminator */
2045 OBD_ALLOC(lmd->lmd_profile, strlen(s1) + 8);
2046 if (!lmd->lmd_profile)
2048 sprintf(lmd->lmd_profile, "%s-client", s1);
2051 /* Freed in lustre_free_lsi */
2052 OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1);
2055 strcpy(lmd->lmd_dev, devname);
2057 /* Save mount options */
/* walk back from the end of the string over trailing commas and spaces */
2058 s1 = options + strlen(options) - 1;
2059 while (s1 >= options && (*s1 == ',' || *s1 == ' '))
2061 if (*options != 0) {
2062 /* Freed in lustre_free_lsi */
2063 OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1);
2066 strcpy(lmd->lmd_opts, options);
2070 lmd->lmd_magic = LMD_MAGIC;
2075 CERROR("Bad mount options %s\n", options);
/* Carrier handed to lustre_fill_super() as its data argument: pairs the raw
 * mount data with the vfsmount of the mount in progress.
 * NOTE(review): only the lmd2_mnt member is visible in this excerpt; a
 * lmd2_data member is referenced by lustre_fill_super(). */
2079 struct lustre_mount_data2 {
2081 struct vfsmount *lmd2_mnt;
2084 /** This is the entry point for the mount call into Lustre.
2085 * This is called when a server or client is mounted,
2086 * and this is where we start setting things up.
2087 * @param data Mount options (e.g. -o flock,abort_recov)
 * In the visible code data is read as a struct lustre_mount_data2 whose
 * lmd2_data member is the option string passed to lmd_parse().
 * Client mounts start the MGC and then call the registered
 * client_fill_super hook; server mounts set LSI_SERVER and call
 * server_fill_super().
 * NOTE(review): the assignment of lmd, the error checks and the return
 * statement are not visible in this excerpt.
2089 int lustre_fill_super(struct super_block *sb, void *data, int silent)
2091 struct lustre_mount_data *lmd;
2092 struct lustre_mount_data2 *lmd2 = data;
2093 struct lustre_sb_info *lsi;
2097 CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb);
2099 lsi = lustre_init_lsi(sb);
2105 * Disable lockdep during mount, because mount locking patterns are
2110 /* Figure out the lmd from the mount options */
/* any non-zero result from lmd_parse() is reported as -EINVAL */
2111 if (lmd_parse((char *)(lmd2->lmd2_data), lmd)) {
2113 GOTO(out, rc = -EINVAL);
2116 if (lmd_is_client(lmd)) {
2117 CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile);
/* the hook is installed by lustre_register_client_fill_super() */
2118 if (!client_fill_super) {
2119 LCONSOLE_ERROR_MSG(0x165, "Nothing registered for "
2120 "client mount! Is the 'lustre' "
2121 "module loaded?\n");
2125 rc = lustre_start_mgc(sb);
2130 /* Connect and start */
2131 /* (should always be ll_fill_super) */
2132 rc = (*client_fill_super)(sb, lmd2->lmd2_mnt);
2133 /* c_f_s will call lustre_common_put_super on failure */
2136 CDEBUG(D_MOUNT, "Mounting server from %s\n", lmd->lmd_dev);
2137 lsi->lsi_flags |= LSI_SERVER;
2138 rc = server_fill_super(sb);
2139 /* s_f_s calls lustre_start_mgc after the mount because we need
2140 the MGS nids which are stored on disk. Plus, we may
2141 need to start the MGS first. */
2142 /* s_f_s will call server_put_super on failure */
2145 /* If error happens in fill_super() call, @lsi will be killed there.
2146 * This is why we do not put it here. */
/* s2lsi(sb) is re-read because the lsi may already be gone on failure */
2150 CERROR("Unable to mount %s (%d)\n",
2151 s2lsi(sb) ? lmd->lmd_dev : "", rc);
2153 CDEBUG(D_SUPER, "Mount %s complete\n",
2161 /* We can't call ll_fill_super by name because it lives in a module that
2162 must be loaded after this one. */
/* Install cfs as the client_fill_super hook that lustre_fill_super() calls
 * for client mounts. The previous value is simply overwritten. */
2163 void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb,
2164 struct vfsmount *mnt))
2166 client_fill_super = cfs;
/* Install cfs as the kill_super_cb hook that lustre_kill_super() calls for
 * non-server superblocks. The previous value is simply overwritten. */
2169 void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb))
2171 kill_super_cb = cfs;
2174 /***************** FS registration ******************/
/* get_sb hook of the lustre filesystem type; both variants delegate to
 * get_sb_nodev() with lustre_fill_super as the fill callback.
 * Kernels older than 2.6.18 get the variant that returns a super_block and
 * passes data through unchanged; newer kernels get the variant that wraps
 * data and mnt in a lustre_mount_data2 on the stack.
 * NOTE(review): lustre_fill_super() reads its data argument as a
 * lustre_mount_data2 in the visible code, which does not match the raw
 * data passed by the older variant - confirm against the full source.
 * The preprocessor else/endif lines are not visible in this excerpt. */
2176 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18))
2177 struct super_block * lustre_get_sb(struct file_system_type *fs_type, int flags,
2178 const char *devname, void * data)
2180 return get_sb_nodev(fs_type, flags, data, lustre_fill_super);
2183 int lustre_get_sb(struct file_system_type *fs_type, int flags,
2184 const char *devname, void * data, struct vfsmount *mnt)
2186 struct lustre_mount_data2 lmd2 = {data, mnt};
2188 return get_sb_nodev(fs_type, flags, &lmd2, lustre_fill_super, mnt);
/* kill_sb hook: when a kill_super_cb is registered and sb still has an lsi
 * without the LSI_SERVER flag, the callback runs first; kill_anon_super()
 * is then called in every case. */
2192 void lustre_kill_super(struct super_block *sb)
2194 struct lustre_sb_info *lsi = s2lsi(sb);
2196 if (kill_super_cb && lsi && !(lsi->lsi_flags & LSI_SERVER))
2197 (*kill_super_cb)(sb);
2199 kill_anon_super(sb);
2202 /** Register the "lustre" fs type
 * get_sb is lustre_get_sb and kill_sb is lustre_kill_super; the visible
 * flags are FS_BINARY_MOUNTDATA, FS_REQUIRES_DEV and LL_RENAME_DOES_D_MOVE.
 * NOTE(review): the name member and the flag added under FS_HAS_FIEMAP are
 * not visible in this excerpt.
2204 struct file_system_type lustre_fs_type = {
2205 .owner = THIS_MODULE,
2207 .get_sb = lustre_get_sb,
2208 .kill_sb = lustre_kill_super,
2209 .fs_flags = FS_BINARY_MOUNTDATA | FS_REQUIRES_DEV |
2210 #ifdef FS_HAS_FIEMAP
2213 LL_RENAME_DOES_D_MOVE,
/* Register lustre_fs_type with the kernel; returns the result of
 * register_filesystem() unchanged. */
2216 int lustre_register_fs(void)
2218 return register_filesystem(&lustre_fs_type);
/* Unregister lustre_fs_type; returns the result of unregister_filesystem()
 * unchanged. */
2221 int lustre_unregister_fs(void)
2223 return unregister_filesystem(&lustre_fs_type);
/* Symbols of this file exported with EXPORT_SYMBOL. Several of them
 * (for example lustre_process_log and server_get_mount) are defined outside
 * this excerpt. */
2226 EXPORT_SYMBOL(lustre_register_client_fill_super);
2227 EXPORT_SYMBOL(lustre_register_kill_super_cb);
2228 EXPORT_SYMBOL(lustre_common_put_super);
2229 EXPORT_SYMBOL(lustre_process_log);
2230 EXPORT_SYMBOL(lustre_end_log);
2231 EXPORT_SYMBOL(server_get_mount);
2232 EXPORT_SYMBOL(server_get_mount_2);
2233 EXPORT_SYMBOL(server_put_mount);
2234 EXPORT_SYMBOL(server_put_mount_2);
2235 EXPORT_SYMBOL(server_register_target);
2236 EXPORT_SYMBOL(server_name2index);
2237 EXPORT_SYMBOL(server_mti_print);
2238 EXPORT_SYMBOL(do_lcfg);