1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
30 * Use is subject to license terms.
32 * Copyright (c) 2011 Whamcloud, Inc.
36 * This file is part of Lustre, http://www.lustre.org/
37 * Lustre is a trademark of Sun Microsystems, Inc.
39 * lustre/obdclass/obd_mount.c
41 * Client/server mount routines
43 * Author: Nathan Rutman <nathan@clusterfs.com>
47 #define DEBUG_SUBSYSTEM S_CLASS
48 #define D_MOUNT D_SUPER|D_CONFIG /*|D_WARNING */
49 #define PRINT_CMD CDEBUG
50 #define PRINT_MASK D_SUPER|D_CONFIG
54 #include <lustre_fsfilt.h>
55 #include <obd_class.h>
56 #include <lustre/lustre_user.h>
57 #include <linux/version.h>
58 #include <lustre_log.h>
59 #include <lustre_disk.h>
60 #include <lustre_param.h>
62 static int (*client_fill_super)(struct super_block *sb,
63 struct vfsmount *mnt) = NULL;
64 static void (*kill_super_cb)(struct super_block *sb) = NULL;
66 /*********** mount lookup *********/
68 CFS_DECLARE_MUTEX(lustre_mount_info_lock);
69 static CFS_LIST_HEAD(server_mount_info_list);
71 static struct lustre_mount_info *server_find_mount(const char *name)
74 struct lustre_mount_info *lmi;
77 cfs_list_for_each(tmp, &server_mount_info_list) {
78 lmi = cfs_list_entry(tmp, struct lustre_mount_info,
80 if (strcmp(name, lmi->lmi_name) == 0)
86 /* we must register an obd for a mount before we call the setup routine.
87 *_setup will call lustre_get_mount to get the mnt struct
88 by obd_name, since we can't pass the pointer to setup. */
89 static int server_register_mount(const char *name, struct super_block *sb,
92 struct lustre_mount_info *lmi;
99 OBD_ALLOC(lmi, sizeof(*lmi));
102 OBD_ALLOC(name_cp, strlen(name) + 1);
104 OBD_FREE(lmi, sizeof(*lmi));
107 strcpy(name_cp, name);
109 cfs_down(&lustre_mount_info_lock);
111 if (server_find_mount(name)) {
112 cfs_up(&lustre_mount_info_lock);
113 OBD_FREE(lmi, sizeof(*lmi));
114 OBD_FREE(name_cp, strlen(name) + 1);
115 CERROR("Already registered %s\n", name);
118 lmi->lmi_name = name_cp;
121 cfs_list_add(&lmi->lmi_list_chain, &server_mount_info_list);
123 cfs_up(&lustre_mount_info_lock);
125 CDEBUG(D_MOUNT, "reg_mnt %p from %s, vfscount=%d\n",
126 lmi->lmi_mnt, name, cfs_atomic_read(&lmi->lmi_mnt->mnt_count));
131 /* when an obd no longer needs a mount */
132 static int server_deregister_mount(const char *name)
134 struct lustre_mount_info *lmi;
137 cfs_down(&lustre_mount_info_lock);
138 lmi = server_find_mount(name);
140 cfs_up(&lustre_mount_info_lock);
141 CERROR("%s not registered\n", name);
145 CDEBUG(D_MOUNT, "dereg_mnt %p from %s, vfscount=%d\n",
146 lmi->lmi_mnt, name, cfs_atomic_read(&lmi->lmi_mnt->mnt_count));
148 OBD_FREE(lmi->lmi_name, strlen(lmi->lmi_name) + 1);
149 cfs_list_del(&lmi->lmi_list_chain);
150 OBD_FREE(lmi, sizeof(*lmi));
151 cfs_up(&lustre_mount_info_lock);
156 /* obd's look up a registered mount using their obdname. This is just
157 for initial obd setup to find the mount struct. It should not be
158 called every time you want to mntget. */
159 struct lustre_mount_info *server_get_mount(const char *name)
161 struct lustre_mount_info *lmi;
162 struct lustre_sb_info *lsi;
165 cfs_down(&lustre_mount_info_lock);
166 lmi = server_find_mount(name);
167 cfs_up(&lustre_mount_info_lock);
169 CERROR("Can't find mount for %s\n", name);
172 lsi = s2lsi(lmi->lmi_sb);
173 mntget(lmi->lmi_mnt);
174 cfs_atomic_inc(&lsi->lsi_mounts);
176 CDEBUG(D_MOUNT, "get_mnt %p from %s, refs=%d, vfscount=%d\n",
177 lmi->lmi_mnt, name, cfs_atomic_read(&lsi->lsi_mounts),
178 cfs_atomic_read(&lmi->lmi_mnt->mnt_count));
184 * Used by mdt to get mount_info from obdname.
185 * There is no blocking when using the mount_info.
186 * Do not use server_get_mount for this purpose.
188 struct lustre_mount_info *server_get_mount_2(const char *name)
190 struct lustre_mount_info *lmi;
193 cfs_down(&lustre_mount_info_lock);
194 lmi = server_find_mount(name);
195 cfs_up(&lustre_mount_info_lock);
197 CERROR("Can't find mount for %s\n", name);
202 static void unlock_mntput(struct vfsmount *mnt)
204 if (kernel_locked()) {
213 static int lustre_put_lsi(struct super_block *sb);
215 /* to be called from obd_cleanup methods */
216 int server_put_mount(const char *name, struct vfsmount *mnt)
218 struct lustre_mount_info *lmi;
219 struct lustre_sb_info *lsi;
220 int count = atomic_read(&mnt->mnt_count) - 1;
223 /* This might be the last one, can't deref after this */
226 cfs_down(&lustre_mount_info_lock);
227 lmi = server_find_mount(name);
228 cfs_up(&lustre_mount_info_lock);
230 CERROR("Can't find mount for %s\n", name);
233 lsi = s2lsi(lmi->lmi_sb);
234 LASSERT(lmi->lmi_mnt == mnt);
236 CDEBUG(D_MOUNT, "put_mnt %p from %s, refs=%d, vfscount=%d\n",
237 lmi->lmi_mnt, name, cfs_atomic_read(&lsi->lsi_mounts), count);
239 if (lustre_put_lsi(lmi->lmi_sb)) {
240 CDEBUG(D_MOUNT, "Last put of mnt %p from %s, vfscount=%d\n",
241 lmi->lmi_mnt, name, count);
242 /* last mount is the One True Mount */
244 CERROR("%s: mount busy, vfscount=%d!\n", name, count);
247 /* this obd should never need the mount again */
248 server_deregister_mount(name);
253 /* Corresponding to server_get_mount_2 */
254 int server_put_mount_2(const char *name, struct vfsmount *mnt)
260 /******* mount helper utilities *********/
263 static void ldd_print(struct lustre_disk_data *ldd)
265 PRINT_CMD(PRINT_MASK, " disk data:\n");
266 PRINT_CMD(PRINT_MASK, "server: %s\n", ldd->ldd_svname);
267 PRINT_CMD(PRINT_MASK, "uuid: %s\n", (char *)ldd->ldd_uuid);
268 PRINT_CMD(PRINT_MASK, "fs: %s\n", ldd->ldd_fsname);
269 PRINT_CMD(PRINT_MASK, "index: %04x\n", ldd->ldd_svindex);
270 PRINT_CMD(PRINT_MASK, "config: %d\n", ldd->ldd_config_ver);
271 PRINT_CMD(PRINT_MASK, "flags: %#x\n", ldd->ldd_flags);
272 PRINT_CMD(PRINT_MASK, "diskfs: %s\n", MT_STR(ldd));
273 PRINT_CMD(PRINT_MASK, "options: %s\n", ldd->ldd_mount_opts);
274 PRINT_CMD(PRINT_MASK, "params: %s\n", ldd->ldd_params);
275 PRINT_CMD(PRINT_MASK, "comment: %s\n", ldd->ldd_userdata);
279 static int ldd_parse(struct lvfs_run_ctxt *mount_ctxt,
280 struct lustre_disk_data *ldd)
282 struct lvfs_run_ctxt saved;
289 push_ctxt(&saved, mount_ctxt, NULL);
291 file = filp_open(MOUNT_DATA_FILE, O_RDONLY, 0644);
294 CERROR("cannot open %s: rc = %d\n", MOUNT_DATA_FILE, rc);
298 len = i_size_read(file->f_dentry->d_inode);
299 CDEBUG(D_MOUNT, "Have %s, size %lu\n", MOUNT_DATA_FILE, len);
300 if (len != sizeof(*ldd)) {
301 CERROR("disk data size does not match: see %lu expect %u\n",
302 len, (int)sizeof(*ldd));
303 GOTO(out_close, rc = -EINVAL);
306 rc = lustre_fread(file, ldd, len, &off);
308 CERROR("error reading %s: read %d of %lu\n",
309 MOUNT_DATA_FILE, rc, len);
310 GOTO(out_close, rc = -EINVAL);
314 if (ldd->ldd_magic != LDD_MAGIC) {
315 /* FIXME add swabbing support */
316 CERROR("Bad magic in %s: %x!=%x\n", MOUNT_DATA_FILE,
317 ldd->ldd_magic, LDD_MAGIC);
318 GOTO(out_close, rc = -EINVAL);
321 if (ldd->ldd_feature_incompat & ~LDD_INCOMPAT_SUPP) {
322 CERROR("%s: unsupported incompat filesystem feature(s) %x\n",
324 ldd->ldd_feature_incompat & ~LDD_INCOMPAT_SUPP);
325 GOTO(out_close, rc = -EINVAL);
327 if (ldd->ldd_feature_rocompat & ~LDD_ROCOMPAT_SUPP) {
328 CERROR("%s: unsupported read-only filesystem feature(s) %x\n",
330 ldd->ldd_feature_rocompat & ~LDD_ROCOMPAT_SUPP);
331 /* Do something like remount filesystem read-only */
332 GOTO(out_close, rc = -EINVAL);
338 pop_ctxt(&saved, mount_ctxt, NULL);
342 static int ldd_write(struct lvfs_run_ctxt *mount_ctxt,
343 struct lustre_disk_data *ldd)
345 struct lvfs_run_ctxt saved;
348 unsigned long len = sizeof(struct lustre_disk_data);
352 LASSERT(ldd->ldd_magic == LDD_MAGIC);
354 ldd->ldd_config_ver++;
356 push_ctxt(&saved, mount_ctxt, NULL);
358 file = filp_open(MOUNT_DATA_FILE, O_RDWR, 0644);
361 CERROR("cannot open %s: rc = %d\n", MOUNT_DATA_FILE, rc);
365 rc = lustre_fwrite(file, ldd, len, &off);
367 CERROR("error writing %s: read %d of %lu\n",
368 MOUNT_DATA_FILE, rc, len);
369 GOTO(out_close, rc = -EINVAL);
377 pop_ctxt(&saved, mount_ctxt, NULL);
382 /**************** config llog ********************/
384 /** Get a config log from the MGS and process it.
385 * This func is called for both clients and servers.
386 * Continue to process new statements appended to the logs
387 * (whenever the config lock is revoked) until lustre_end_log
389 * @param sb The superblock is used by the MGC to write to the local copy of
391 * @param logname The name of the llog to replicate from the MGS
392 * @param cfg Since the same mgc may be used to follow multiple config logs
393 * (e.g. ost1, ost2, client), the config_llog_instance keeps the state for
394 * this log, and is added to the mgc's list of logs to follow.
396 int lustre_process_log(struct super_block *sb, char *logname,
397 struct config_llog_instance *cfg)
399 struct lustre_cfg *lcfg;
400 struct lustre_cfg_bufs *bufs;
401 struct lustre_sb_info *lsi = s2lsi(sb);
402 struct obd_device *mgc = lsi->lsi_mgc;
413 /* mgc_process_config */
414 lustre_cfg_bufs_reset(bufs, mgc->obd_name);
415 lustre_cfg_bufs_set_string(bufs, 1, logname);
416 lustre_cfg_bufs_set(bufs, 2, cfg, sizeof(*cfg));
417 lustre_cfg_bufs_set(bufs, 3, &sb, sizeof(sb));
418 lcfg = lustre_cfg_new(LCFG_LOG_START, bufs);
419 rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
420 lustre_cfg_free(lcfg);
425 LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s'"
426 "failed from the MGS (%d). Make sure this "
427 "client and the MGS are running compatible "
428 "versions of Lustre.\n",
429 mgc->obd_name, logname, rc);
432 LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' "
433 "failed (%d). This may be the result of "
434 "communication errors between this node and "
435 "the MGS, a bad configuration, or other "
436 "errors. See the syslog for more "
437 "information.\n", mgc->obd_name, logname,
440 /* class_obd_list(); */
444 /* Stop watching this config log for updates */
445 int lustre_end_log(struct super_block *sb, char *logname,
446 struct config_llog_instance *cfg)
448 struct lustre_cfg *lcfg;
449 struct lustre_cfg_bufs bufs;
450 struct lustre_sb_info *lsi = s2lsi(sb);
451 struct obd_device *mgc = lsi->lsi_mgc;
458 /* mgc_process_config */
459 lustre_cfg_bufs_reset(&bufs, mgc->obd_name);
460 lustre_cfg_bufs_set_string(&bufs, 1, logname);
462 lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg));
463 lcfg = lustre_cfg_new(LCFG_LOG_END, &bufs);
464 rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
465 lustre_cfg_free(lcfg);
469 /**************** obd start *******************/
471 /** lustre_cfg_bufs are a holdover from 1.4; we can still set these up from
472 * lctl (and do for echo cli/srv).
474 int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
475 char *s1, char *s2, char *s3, char *s4)
477 struct lustre_cfg_bufs bufs;
478 struct lustre_cfg * lcfg = NULL;
481 CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname,
482 cmd, s1, s2, s3, s4);
484 lustre_cfg_bufs_reset(&bufs, cfgname);
486 lustre_cfg_bufs_set_string(&bufs, 1, s1);
488 lustre_cfg_bufs_set_string(&bufs, 2, s2);
490 lustre_cfg_bufs_set_string(&bufs, 3, s3);
492 lustre_cfg_bufs_set_string(&bufs, 4, s4);
494 lcfg = lustre_cfg_new(cmd, &bufs);
495 lcfg->lcfg_nid = nid;
496 rc = class_process_config(lcfg);
497 lustre_cfg_free(lcfg);
501 /** Call class_attach and class_setup. These methods in turn call
502 * obd type-specific methods.
504 static int lustre_start_simple(char *obdname, char *type, char *uuid,
508 CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type);
510 rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, 0, 0);
512 CERROR("%s attach error %d\n", obdname, rc);
515 rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, 0, 0);
517 CERROR("%s setup error %d\n", obdname, rc);
518 do_lcfg(obdname, 0, LCFG_DETACH, 0, 0, 0, 0);
523 /* Set up a MGS to serve startup logs */
524 static int server_start_mgs(struct super_block *sb)
526 struct lustre_sb_info *lsi = s2lsi(sb);
527 struct vfsmount *mnt = lsi->lsi_srv_mnt;
528 struct lustre_mount_info *lmi;
533 /* It is impossible to have more than 1 MGS per node, since
534 MGC wouldn't know which to connect to */
535 lmi = server_find_mount(LUSTRE_MGS_OBDNAME);
537 lsi = s2lsi(lmi->lmi_sb);
538 LCONSOLE_ERROR_MSG(0x15d, "The MGS service was already started"
540 lsi->lsi_ldd->ldd_svname);
544 CDEBUG(D_CONFIG, "Start MGS service %s\n", LUSTRE_MGS_OBDNAME);
546 rc = server_register_mount(LUSTRE_MGS_OBDNAME, sb, mnt);
549 rc = lustre_start_simple(LUSTRE_MGS_OBDNAME, LUSTRE_MGS_NAME,
550 LUSTRE_MGS_OBDNAME, 0, 0);
551 /* Do NOT call server_deregister_mount() here. Doing so makes it
552 * impossible to clean up and free lsi and other stuff when the
553 * mgs calls server_put_mount() in the error handling case. -umka */
557 LCONSOLE_ERROR_MSG(0x15e, "Failed to start MGS '%s' (%d). "
558 "Is the 'mgs' module loaded?\n",
559 LUSTRE_MGS_OBDNAME, rc);
563 static int server_stop_mgs(struct super_block *sb)
565 struct obd_device *obd;
569 CDEBUG(D_MOUNT, "Stop MGS service %s\n", LUSTRE_MGS_OBDNAME);
571 /* There better be only one MGS */
572 obd = class_name2obd(LUSTRE_MGS_OBDNAME);
574 CDEBUG(D_CONFIG, "mgs %s not running\n", LUSTRE_MGS_OBDNAME);
578 /* The MGS should always stop when we say so */
580 rc = class_manual_cleanup(obd);
584 CFS_DECLARE_MUTEX(mgc_start_lock);
586 /** Set up a mgc obd to process startup logs
588 * \param sb [in] super block of the mgc obd
590 * \retval 0 success, otherwise error code
592 static int lustre_start_mgc(struct super_block *sb)
594 struct obd_connect_data *data = NULL;
595 struct lustre_sb_info *lsi = s2lsi(sb);
596 struct obd_device *obd;
597 struct obd_export *exp;
598 struct obd_uuid *uuid;
601 char *mgcname, *niduuid, *mgssec;
604 int rc = 0, i = 0, j, len;
607 LASSERT(lsi->lsi_lmd);
609 /* Find the first non-lo MGS nid for our MGC name */
610 if (lsi->lsi_flags & LSI_SERVER) {
611 ptr = lsi->lsi_ldd->ldd_params;
612 /* Use mgsnode= nids */
613 if ((class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0) &&
614 (class_parse_nid(ptr, &nid, &ptr) == 0)) {
616 } else if (IS_MGS(lsi->lsi_ldd)) {
617 lnet_process_id_t id;
618 while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
619 if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
626 } else { /* client */
627 /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
628 ptr = lsi->lsi_lmd->lmd_dev;
629 if (class_parse_nid(ptr, &nid, &ptr) == 0)
633 CERROR("No valid MGS nids found.\n");
637 len = strlen(LUSTRE_MGC_OBDNAME) + strlen(libcfs_nid2str(nid)) + 1;
638 OBD_ALLOC(mgcname, len);
639 OBD_ALLOC(niduuid, len + 2);
640 if (!mgcname || !niduuid)
641 GOTO(out_free, rc = -ENOMEM);
642 sprintf(mgcname, "%s%s", LUSTRE_MGC_OBDNAME, libcfs_nid2str(nid));
644 mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : "";
646 cfs_mutex_down(&mgc_start_lock);
648 obd = class_name2obd(mgcname);
649 if (obd && !obd->obd_stopping) {
650 rc = obd_set_info_async(obd->obd_self_export,
651 strlen(KEY_MGSSEC), KEY_MGSSEC,
652 strlen(mgssec), mgssec, NULL);
656 /* Re-using an existing MGC */
657 cfs_atomic_inc(&obd->u.cli.cl_mgc_refcount);
660 /* If we are restarting the MGS, don't try to keep the MGC's
661 old connection, or registration will fail. */
662 if ((lsi->lsi_flags & LSI_SERVER) && IS_MGS(lsi->lsi_ldd)) {
663 CDEBUG(D_MOUNT, "New MGS with live MGC\n");
667 /* Try all connections, but only once (again).
668 We don't want to block another target from starting
669 (using its local copy of the log), but we do want to connect
670 if at all possible. */
672 CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname,recov_bk);
673 rc = obd_set_info_async(obd->obd_self_export,
674 sizeof(KEY_INIT_RECOV_BACKUP),
675 KEY_INIT_RECOV_BACKUP,
676 sizeof(recov_bk), &recov_bk, NULL);
680 CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname);
682 /* Add the primary nids for the MGS */
684 sprintf(niduuid, "%s_%x", mgcname, i);
685 if (lsi->lsi_flags & LSI_SERVER) {
686 ptr = lsi->lsi_ldd->ldd_params;
687 if (IS_MGS(lsi->lsi_ldd)) {
688 /* Use local nids (including LO) */
689 lnet_process_id_t id;
690 while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
691 rc = do_lcfg(mgcname, id.nid,
692 LCFG_ADD_UUID, niduuid, 0,0,0);
695 /* Use mgsnode= nids */
696 if (class_find_param(ptr, PARAM_MGSNODE, &ptr) != 0) {
697 CERROR("No MGS nids given.\n");
698 GOTO(out_free, rc = -EINVAL);
700 while (class_parse_nid(ptr, &nid, &ptr) == 0) {
701 rc = do_lcfg(mgcname, nid,
702 LCFG_ADD_UUID, niduuid, 0,0,0);
706 } else { /* client */
707 /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
708 ptr = lsi->lsi_lmd->lmd_dev;
709 while (class_parse_nid(ptr, &nid, &ptr) == 0) {
710 rc = do_lcfg(mgcname, nid,
711 LCFG_ADD_UUID, niduuid, 0,0,0);
713 /* Stop at the first failover nid */
719 CERROR("No valid MGS nids found.\n");
720 GOTO(out_free, rc = -EINVAL);
722 lsi->lsi_lmd->lmd_mgs_failnodes = 1;
724 /* Random uuid for MGC allows easier reconnects */
726 ll_generate_random_uuid(uuidc);
727 class_uuid_unparse(uuidc, uuid);
730 rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME,
731 (char *)uuid->uuid, LUSTRE_MGS_OBDNAME,
737 /* Add any failover MGS nids */
739 while ((*ptr == ':' ||
740 class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0)) {
741 /* New failover node */
742 sprintf(niduuid, "%s_%x", mgcname, i);
744 while (class_parse_nid(ptr, &nid, &ptr) == 0) {
746 rc = do_lcfg(mgcname, nid,
747 LCFG_ADD_UUID, niduuid, 0,0,0);
752 rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN,
760 lsi->lsi_lmd->lmd_mgs_failnodes = i;
762 obd = class_name2obd(mgcname);
764 CERROR("Can't find mgcobd %s\n", mgcname);
765 GOTO(out_free, rc = -ENOTCONN);
768 rc = obd_set_info_async(obd->obd_self_export,
769 strlen(KEY_MGSSEC), KEY_MGSSEC,
770 strlen(mgssec), mgssec, NULL);
774 /* Keep a refcount of servers/clients who started with "mount",
775 so we know when we can get rid of the mgc. */
776 cfs_atomic_set(&obd->u.cli.cl_mgc_refcount, 1);
778 /* Try all connections, but only once. */
780 rc = obd_set_info_async(obd->obd_self_export,
781 sizeof(KEY_INIT_RECOV_BACKUP),
782 KEY_INIT_RECOV_BACKUP,
783 sizeof(recov_bk), &recov_bk, NULL);
786 CWARN("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc);
787 /* We connect to the MGS at setup, and don't disconnect until cleanup */
790 GOTO(out, rc = -ENOMEM);
791 data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_FID |
792 OBD_CONNECT_AT | OBD_CONNECT_FULL20;
793 data->ocd_version = LUSTRE_VERSION_CODE;
794 rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL);
797 CERROR("connect failed %d\n", rc);
801 obd->u.cli.cl_mgc_mgsexp = exp;
804 /* Keep the mgc info in the sb. Note that many lsi's can point
808 cfs_mutex_up(&mgc_start_lock);
811 OBD_FREE(mgcname, len);
813 OBD_FREE(niduuid, len + 2);
817 static int lustre_stop_mgc(struct super_block *sb)
819 struct lustre_sb_info *lsi = s2lsi(sb);
820 struct obd_device *obd;
821 char *niduuid = 0, *ptr = 0;
822 int i, rc = 0, len = 0;
832 cfs_mutex_down(&mgc_start_lock);
833 LASSERT(cfs_atomic_read(&obd->u.cli.cl_mgc_refcount) > 0);
834 if (!cfs_atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) {
835 /* This is not fatal, every client that stops
836 will call in here. */
837 CDEBUG(D_MOUNT, "mgc still has %d references.\n",
838 cfs_atomic_read(&obd->u.cli.cl_mgc_refcount));
839 GOTO(out, rc = -EBUSY);
842 /* The MGC has no recoverable data in any case.
843 * force shutdown set in umount_begin */
844 obd->obd_no_recov = 1;
846 if (obd->u.cli.cl_mgc_mgsexp) {
847 /* An error is not fatal: if we are unable to send the
848 disconnect, the mgs ping evictor cleans up the export */
849 rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp);
851 CDEBUG(D_MOUNT, "disconnect failed %d\n", rc);
854 /* Save the obdname for cleaning the nid uuids, which are
856 len = strlen(obd->obd_name) + 6;
857 OBD_ALLOC(niduuid, len);
859 strcpy(niduuid, obd->obd_name);
860 ptr = niduuid + strlen(niduuid);
863 rc = class_manual_cleanup(obd);
867 /* Clean the nid uuids */
869 GOTO(out, rc = -ENOMEM);
871 for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) {
872 sprintf(ptr, "_%x", i);
873 rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID,
876 CERROR("del MDC UUID %s failed: rc = %d\n",
881 OBD_FREE(niduuid, len);
883 /* class_import_put will get rid of the additional connections */
884 cfs_mutex_up(&mgc_start_lock);
888 /* Since there's only one mgc per node, we have to change its fs to get
889 access to the right disk. */
890 static int server_mgc_set_fs(struct obd_device *mgc, struct super_block *sb)
892 struct lustre_sb_info *lsi = s2lsi(sb);
896 CDEBUG(D_MOUNT, "Set mgc disk for %s\n", lsi->lsi_lmd->lmd_dev);
898 /* cl_mgc_sem in mgc ensures we sleep if the mgc_fs is busy */
899 rc = obd_set_info_async(mgc->obd_self_export,
900 sizeof(KEY_SET_FS), KEY_SET_FS,
901 sizeof(*sb), sb, NULL);
903 CERROR("can't set_fs %d\n", rc);
909 static int server_mgc_clear_fs(struct obd_device *mgc)
914 CDEBUG(D_MOUNT, "Unassign mgc disk\n");
916 rc = obd_set_info_async(mgc->obd_self_export,
917 sizeof(KEY_CLEAR_FS), KEY_CLEAR_FS,
922 CFS_DECLARE_MUTEX(server_start_lock);
924 /* Stop MDS/OSS if nobody is using them */
925 static int server_stop_servers(int lddflags, int lsiflags)
927 struct obd_device *obd = NULL;
928 struct obd_type *type = NULL;
932 cfs_mutex_down(&server_start_lock);
934 /* Either an MDT or an OST or neither */
935 /* if this was an MDT, and there are no more MDT's, clean up the MDS */
936 if ((lddflags & LDD_F_SV_TYPE_MDT) &&
937 (obd = class_name2obd(LUSTRE_MDS_OBDNAME))) {
938 /*FIXME pre-rename, should eventually be LUSTRE_MDT_NAME*/
939 type = class_search_type(LUSTRE_MDS_NAME);
941 /* if this was an OST, and there are no more OST's, clean up the OSS */
942 if ((lddflags & LDD_F_SV_TYPE_OST) &&
943 (obd = class_name2obd(LUSTRE_OSS_OBDNAME))) {
944 type = class_search_type(LUSTRE_OST_NAME);
947 if (obd && (!type || !type->typ_refcnt)) {
950 /* obd_fail doesn't mean much on a server obd */
951 err = class_manual_cleanup(obd);
956 cfs_mutex_up(&server_start_lock);
961 int server_mti_print(char *title, struct mgs_target_info *mti)
963 PRINT_CMD(PRINT_MASK, "mti %s\n", title);
964 PRINT_CMD(PRINT_MASK, "server: %s\n", mti->mti_svname);
965 PRINT_CMD(PRINT_MASK, "fs: %s\n", mti->mti_fsname);
966 PRINT_CMD(PRINT_MASK, "uuid: %s\n", mti->mti_uuid);
967 PRINT_CMD(PRINT_MASK, "ver: %d flags: %#x\n",
968 mti->mti_config_ver, mti->mti_flags);
972 static int server_sb2mti(struct super_block *sb, struct mgs_target_info *mti)
974 struct lustre_sb_info *lsi = s2lsi(sb);
975 struct lustre_disk_data *ldd = lsi->lsi_ldd;
976 lnet_process_id_t id;
980 if (!(lsi->lsi_flags & LSI_SERVER))
983 strncpy(mti->mti_fsname, ldd->ldd_fsname,
984 sizeof(mti->mti_fsname));
985 strncpy(mti->mti_svname, ldd->ldd_svname,
986 sizeof(mti->mti_svname));
988 mti->mti_nid_count = 0;
989 while (LNetGetId(i++, &id) != -ENOENT) {
990 if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
993 /* server uses the --servicenode param; only allow the
994 * specified nids to be registered */
995 if ((ldd->ldd_flags & LDD_F_NO_PRIMNODE) != 0 &&
996 class_match_nid(ldd->ldd_params,
997 PARAM_FAILNODE, id.nid) < 1)
1000 /* match specified network */
1001 if (!class_match_net(ldd->ldd_params,
1002 PARAM_NETWORK, LNET_NIDNET(id.nid)))
1005 mti->mti_nids[mti->mti_nid_count] = id.nid;
1006 mti->mti_nid_count++;
1007 if (mti->mti_nid_count >= MTI_NIDS_MAX) {
1008 CWARN("Only using first %d nids for %s\n",
1009 mti->mti_nid_count, mti->mti_svname);
1014 mti->mti_lustre_ver = LUSTRE_VERSION_CODE;
1015 mti->mti_config_ver = 0;
1016 if (lsi->lsi_lmd->lmd_flags & LMD_FLG_WRITECONF)
1017 ldd->ldd_flags |= LDD_F_WRITECONF;
1018 mti->mti_flags = ldd->ldd_flags;
1019 mti->mti_stripe_index = ldd->ldd_svindex;
1020 memcpy(mti->mti_uuid, ldd->ldd_uuid, sizeof(mti->mti_uuid));
1021 if (strlen(ldd->ldd_params) > sizeof(mti->mti_params)) {
1022 CERROR("params too big for mti\n");
1025 memcpy(mti->mti_params, ldd->ldd_params, sizeof(mti->mti_params));
1029 /* Register an old or new target with the MGS. If needed MGS will construct
1030 startup logs and assign index */
1031 int server_register_target(struct super_block *sb)
1033 struct lustre_sb_info *lsi = s2lsi(sb);
1034 struct obd_device *mgc = lsi->lsi_mgc;
1035 struct lustre_disk_data *ldd = lsi->lsi_ldd;
1036 struct mgs_target_info *mti = NULL;
1042 if (!(lsi->lsi_flags & LSI_SERVER))
1048 rc = server_sb2mti(sb, mti);
1052 CDEBUG(D_MOUNT, "Registration %s, fs=%s, %s, index=%04x, flags=%#x\n",
1053 mti->mti_svname, mti->mti_fsname,
1054 libcfs_nid2str(mti->mti_nids[0]), mti->mti_stripe_index,
1057 /* Register the target */
1058 /* FIXME use mgc_process_config instead */
1059 rc = obd_set_info_async(mgc->u.cli.cl_mgc_mgsexp,
1060 sizeof(KEY_REGISTER_TARGET), KEY_REGISTER_TARGET,
1061 sizeof(*mti), mti, NULL);
1065 /* Always update our flags */
1066 ldd->ldd_flags = mti->mti_flags & ~LDD_F_REWRITE_LDD;
1068 /* If this flag is set, it means the MGS wants us to change our
1069 on-disk data. (So far this means just the index.) */
1070 if (mti->mti_flags & LDD_F_REWRITE_LDD) {
1073 CDEBUG(D_MOUNT, "Changing on-disk index from %#x to %#x "
1074 "for %s\n", ldd->ldd_svindex, mti->mti_stripe_index,
1076 ldd->ldd_svindex = mti->mti_stripe_index;
1077 strncpy(ldd->ldd_svname, mti->mti_svname,
1078 sizeof(ldd->ldd_svname));
1079 /* or ldd_make_sv_name(ldd); */
1080 ldd_write(&mgc->obd_lvfs_ctxt, ldd);
1081 err = fsfilt_set_label(mgc, lsi->lsi_srv_mnt->mnt_sb,
1084 CERROR("Label set error %d\n", err);
1085 label = fsfilt_get_label(mgc, lsi->lsi_srv_mnt->mnt_sb);
1087 CDEBUG(D_MOUNT, "Disk label changed to %s\n", label);
1089 /* Flush the new ldd to disk */
1090 fsfilt_sync(mgc, lsi->lsi_srv_mnt->mnt_sb);
1099 /** Start server targets: MDTs and OSTs
1101 static int server_start_targets(struct super_block *sb, struct vfsmount *mnt)
1103 struct obd_device *obd;
1104 struct lustre_sb_info *lsi = s2lsi(sb);
1105 struct config_llog_instance cfg;
1109 CDEBUG(D_MOUNT, "starting target %s\n", lsi->lsi_ldd->ldd_svname);
1112 /* If we're an MDT, make sure the global MDS is running */
1113 if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_MDT) {
1114 /* make sure the MDS is started */
1115 cfs_mutex_down(&server_start_lock);
1116 obd = class_name2obd(LUSTRE_MDS_OBDNAME);
1118 rc = lustre_start_simple(LUSTRE_MDS_OBDNAME,
1119 /* FIXME pre-rename, should eventually be LUSTRE_MDS_NAME */
1121 LUSTRE_MDS_OBDNAME"_uuid",
1124 cfs_mutex_up(&server_start_lock);
1125 CERROR("failed to start MDS: %d\n", rc);
1129 cfs_mutex_up(&server_start_lock);
1133 /* If we're an OST, make sure the global OSS is running */
1134 if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_OST) {
1135 /* make sure OSS is started */
1136 cfs_mutex_down(&server_start_lock);
1137 obd = class_name2obd(LUSTRE_OSS_OBDNAME);
1139 rc = lustre_start_simple(LUSTRE_OSS_OBDNAME,
1141 LUSTRE_OSS_OBDNAME"_uuid",
1144 cfs_mutex_up(&server_start_lock);
1145 CERROR("failed to start OSS: %d\n", rc);
1149 cfs_mutex_up(&server_start_lock);
1152 /* Set the mgc fs to our server disk. This allows the MGC to
1153 * read and write configs locally, in case it can't talk to the MGS. */
1154 rc = server_mgc_set_fs(lsi->lsi_mgc, sb);
1158 /* Register with MGS */
1159 rc = server_register_target(sb);
1160 if (rc && (lsi->lsi_ldd->ldd_flags &
1161 (LDD_F_NEED_INDEX | LDD_F_UPDATE | LDD_F_UPGRADE14))){
1162 CERROR("Required registration failed for %s: %d\n",
1163 lsi->lsi_ldd->ldd_svname, rc);
1165 LCONSOLE_ERROR_MSG(0x15f, "Communication error with "
1166 "the MGS. Is the MGS running?\n");
1170 if (rc == -EINVAL) {
1171 LCONSOLE_ERROR_MSG(0x160, "The MGS is refusing to allow this "
1172 "server (%s) to start. Please see messages"
1173 " on the MGS node.\n",
1174 lsi->lsi_ldd->ldd_svname);
1177 /* non-fatal error of registration with MGS */
1179 CDEBUG(D_MOUNT, "Cannot register with MGS: %d\n", rc);
1181 /* Let the target look up the mount using the target's name
1182 (we can't pass the sb or mnt through class_process_config.) */
1183 rc = server_register_mount(lsi->lsi_ldd->ldd_svname, sb, mnt);
1187 /* Start targets using the llog named for the target */
1188 memset(&cfg, 0, sizeof(cfg));
1189 rc = lustre_process_log(sb, lsi->lsi_ldd->ldd_svname, &cfg);
1191 CERROR("failed to start server %s: %d\n",
1192 lsi->lsi_ldd->ldd_svname, rc);
1193 /* Do NOT call server_deregister_mount() here. This makes it
1194 * impossible to find the mount later at cleanup time and leaves
1195 * @lsi and other stuff leaked. -umka */
1200 /* Release the mgc fs for others to use */
1201 server_mgc_clear_fs(lsi->lsi_mgc);
1204 obd = class_name2obd(lsi->lsi_ldd->ldd_svname);
1206 CERROR("no server named %s was started\n",
1207 lsi->lsi_ldd->ldd_svname);
1211 if ((lsi->lsi_lmd->lmd_flags & LMD_FLG_ABORT_RECOV) &&
1212 (OBP(obd, iocontrol))) {
1213 obd_iocontrol(OBD_IOC_ABORT_RECOVERY,
1214 obd->obd_self_export, 0, NULL, NULL);
1217 /* log has been fully processed */
1218 obd_notify(obd, NULL, OBD_NOTIFY_CONFIG, (void *)CONFIG_LOG);
1224 /***************** lustre superblock **************/
1226 struct lustre_sb_info *lustre_init_lsi(struct super_block *sb)
1228 struct lustre_sb_info *lsi;
1234 OBD_ALLOC_PTR(lsi->lsi_lmd);
1235 if (!lsi->lsi_lmd) {
1240 lsi->lsi_lmd->lmd_exclude_count = 0;
1241 lsi->lsi_lmd->lmd_recovery_time_soft = 0;
1242 lsi->lsi_lmd->lmd_recovery_time_hard = 0;
1243 s2lsi_nocast(sb) = lsi;
1244 /* we take 1 extra ref for our setup */
1245 cfs_atomic_set(&lsi->lsi_mounts, 1);
1247 /* Default umount style */
1248 lsi->lsi_flags = LSI_UMOUNT_FAILOVER;
1253 static int lustre_free_lsi(struct super_block *sb)
1255 struct lustre_sb_info *lsi = s2lsi(sb);
1258 LASSERT(lsi != NULL);
1259 CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi);
1261 /* someone didn't call server_put_mount. */
1262 LASSERT(cfs_atomic_read(&lsi->lsi_mounts) == 0);
1264 if (lsi->lsi_ldd != NULL)
1265 OBD_FREE(lsi->lsi_ldd, sizeof(*lsi->lsi_ldd));
1267 if (lsi->lsi_lmd != NULL) {
1268 if (lsi->lsi_lmd->lmd_dev != NULL)
1269 OBD_FREE(lsi->lsi_lmd->lmd_dev,
1270 strlen(lsi->lsi_lmd->lmd_dev) + 1);
1271 if (lsi->lsi_lmd->lmd_profile != NULL)
1272 OBD_FREE(lsi->lsi_lmd->lmd_profile,
1273 strlen(lsi->lsi_lmd->lmd_profile) + 1);
1274 if (lsi->lsi_lmd->lmd_mgssec != NULL)
1275 OBD_FREE(lsi->lsi_lmd->lmd_mgssec,
1276 strlen(lsi->lsi_lmd->lmd_mgssec) + 1);
1277 if (lsi->lsi_lmd->lmd_opts != NULL)
1278 OBD_FREE(lsi->lsi_lmd->lmd_opts,
1279 strlen(lsi->lsi_lmd->lmd_opts) + 1);
1280 if (lsi->lsi_lmd->lmd_exclude_count)
1281 OBD_FREE(lsi->lsi_lmd->lmd_exclude,
1282 sizeof(lsi->lsi_lmd->lmd_exclude[0]) *
1283 lsi->lsi_lmd->lmd_exclude_count);
1284 OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd));
1287 LASSERT(lsi->lsi_llsbi == NULL);
1288 OBD_FREE(lsi, sizeof(*lsi));
1289 s2lsi_nocast(sb) = NULL;
1294 /* The lsi has one reference for every server that is using the disk -
1295 e.g. MDT, MGS, and potentially MGC */
1296 static int lustre_put_lsi(struct super_block *sb)
1298 struct lustre_sb_info *lsi = s2lsi(sb);
1301 LASSERT(lsi != NULL);
1303 CDEBUG(D_MOUNT, "put %p %d\n", sb, cfs_atomic_read(&lsi->lsi_mounts));
1304 if (cfs_atomic_dec_and_test(&lsi->lsi_mounts)) {
1305 lustre_free_lsi(sb);
1311 /*************** server mount ******************/
/* Mount the backing device of a server and return the resulting vfsmount.
 * Visible flow: allocate a lustre_disk_data (ldd) and one page used as the
 * option string; pre-mount the device as "ldiskfs" with the mount-line
 * options; run ldd_parse() in a context rooted at that mount; rebuild the
 * option string from ldd_mount_opts plus the mount-line options and mount
 * again using the type MT_STR(ldd), with MS_NOATIME | MS_NODIRATIME added.
 * On success the ldd is kept in lsi->lsi_ldd; the failure tail frees the
 * page and the ldd, resets lsi->lsi_ldd and returns ERR_PTR(rc). */
1313 /** Kernel mount using mount options in MOUNT_DATA_FILE.
1314 * Since this file lives on the disk, we pre-mount using a common
1315 * type, read the file, then re-mount using the type specified in the
1318 static struct vfsmount *server_kernel_mount(struct super_block *sb)
1320 struct lvfs_run_ctxt mount_ctxt;
1321 struct lustre_sb_info *lsi = s2lsi(sb);
1322 struct lustre_disk_data *ldd;
1323 struct lustre_mount_data *lmd = lsi->lsi_lmd;
1324 struct vfsmount *mnt;
1325 char *options = NULL;
1326 unsigned long page, s_flags;
1327 struct page *__page;
1331 OBD_ALLOC(ldd, sizeof(*ldd));
1333 RETURN(ERR_PTR(-ENOMEM));
1335 /* In the past, we have always used flags = 0.
1336 Note ext3/ldiskfs can't be mounted ro. */
1337 s_flags = sb->s_flags;
1339 /* allocate memory for options */
1340 OBD_PAGE_ALLOC(__page, CFS_ALLOC_STD);
1342 GOTO(out_free, rc = -ENOMEM);
1343 page = (unsigned long)cfs_page_address(__page);
1344 options = (char *)page;
1345 memset(options, 0, CFS_PAGE_SIZE);
1347 /* mount-line options must be added for pre-mount because it may
1348 * contain mount options such as journal_dev which are required
1349 * to mount successfully the underlying filesystem */
1350 if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0))
1351 strncat(options, lmd->lmd_opts, CFS_PAGE_SIZE - 1);
1353 /* Pre-mount ldiskfs to read the MOUNT_DATA_FILE */
1354 CDEBUG(D_MOUNT, "Pre-mount ldiskfs %s\n", lmd->lmd_dev);
1355 mnt = ll_kern_mount("ldiskfs", s_flags, lmd->lmd_dev, (void *)options);
1358 CERROR("premount %s:%#lx ldiskfs failed: %d "
1359 "Is the ldiskfs module available?\n",
1360 lmd->lmd_dev, s_flags, rc );
/* Run context whose pwd/pwdmnt point at the root of the pre-mount; it is
 * what ldd_parse() receives below. */
1364 OBD_SET_CTXT_MAGIC(&mount_ctxt);
1365 mount_ctxt.pwdmnt = mnt;
1366 mount_ctxt.pwd = mnt->mnt_root;
1367 mount_ctxt.fs = get_ds();
1369 rc = ldd_parse(&mount_ctxt, ldd);
1373 CERROR("premount parse options failed: rc = %d\n", rc);
1377 /* Done with our pre-mount, now do the real mount. */
1379 /* Glom up mount options */
1380 memset(options, 0, CFS_PAGE_SIZE);
/* The page was just zeroed and at most CFS_PAGE_SIZE - 2 bytes are copied,
 * so the string stays NUL-terminated even if it is truncated. */
1381 strncpy(options, ldd->ldd_mount_opts, CFS_PAGE_SIZE - 2);
1383 /* Add in any mount-line options */
1384 if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0)) {
/* len reserves one byte for the "," separator and one for the NUL that
 * strncat appends, keeping the result inside the page. */
1385 int len = CFS_PAGE_SIZE - strlen(options) - 2;
1387 strcat(options, ",");
1388 strncat(options, lmd->lmd_opts, len);
1391 /* Special permanent mount flags */
1393 s_flags |= MS_NOATIME | MS_NODIRATIME;
1395 CDEBUG(D_MOUNT, "kern_mount: %s %s %s\n",
1396 MT_STR(ldd), lmd->lmd_dev, options);
1397 mnt = ll_kern_mount(MT_STR(ldd), s_flags, lmd->lmd_dev,
1401 CERROR("ll_kern_mount failed: rc = %d\n", rc);
/* When abort_recov was given on the mount line, LAST_RCVD is truncated on
 * the newly mounted filesystem. */
1405 if (lmd->lmd_flags & LMD_FLG_ABORT_RECOV)
1406 simple_truncate(mnt->mnt_sb->s_root, mnt, LAST_RCVD,
/* Success tail: the option page is released and the ldd is handed to the
 * lsi. */
1409 OBD_PAGE_FREE(__page);
1410 lsi->lsi_ldd = ldd; /* freed at lsi cleanup */
1411 CDEBUG(D_SUPER, "%s: mnt = %p\n", lmd->lmd_dev, mnt);
/* Failure tail: release the option page and the ldd, and make sure the lsi
 * does not keep a pointer to the freed ldd. */
1416 OBD_PAGE_FREE(__page);
1417 OBD_FREE(ldd, sizeof(*ldd));
1418 lsi->lsi_ldd = NULL;
1419 RETURN(ERR_PTR(rc));
/* Block the umount until @mnt has at most one reference left.
 * Loops while mnt_count is above 1: a console warning is printed when the
 * 'waited' counter is a non-zero multiple of 30, then the caller sleeps in
 * an interruptible wait with a 3 second timeout that ends early once
 * mnt_count equals 1.  cfs_block_sigsinv(sigmask(SIGKILL)) brackets each
 * wait - presumably it blocks every signal except SIGKILL; verify against
 * its definition.  An emergency console message ("interrupted umount") is
 * emitted if the wait is cut short while references remain. */
1422 /** Wait here forever until the mount refcount is 0 before completing umount,
1423 * else we risk dereferencing a null pointer.
1424 * LNET may take e.g. 165s before killing zombies.
1426 static void server_wait_finished(struct vfsmount *mnt)
1430 cfs_sigset_t blocked;
1432 cfs_waitq_init(&waitq);
1434 while (atomic_read(&mnt->mnt_count) > 1) {
1435 if (waited && (waited % 30 == 0))
1436 LCONSOLE_WARN("Mount still busy with %d refs after "
1438 atomic_read(&mnt->mnt_count),
1440 /* Cannot use l_event_wait() for an interruptible sleep. */
1442 blocked = cfs_block_sigsinv(sigmask(SIGKILL));
1443 cfs_waitq_wait_event_interruptible_timeout(
1445 (atomic_read(&mnt->mnt_count) == 1),
1446 cfs_time_seconds(3),
/* Hand back the signal set saved before the wait. */
1448 cfs_block_sigs(blocked);
1450 LCONSOLE_EMERG("Danger: interrupted umount %s with "
1451 "%d refs!\n", mnt->mnt_devname,
1452 atomic_read(&mnt->mnt_count));
/* put_super handler for server mounts (installed through server_ops).
 * Visible sequence: copy the service name into tmpname for the final
 * console message (replaced by "MGS" for an MDT mounted with nosvc); unless
 * nosvc was given, end the config log of an MDT/OST, remember the profile's
 * lp_dt name in extraname and clean up the target obd; stop the MGS when
 * the disk carries one and nomgs was not given; release the MGC and sb via
 * lustre_common_put_super(); wait for the mount to become idle; stop the
 * server services; finally clean up any obd still registered under
 * extraname.
 * ldd_flags and lsi_flags are copied into locals on entry and those copies
 * are what server_stop_servers() receives - presumably because the lsi may
 * no longer exist at that point; confirm.
 * NOTE(review): tmpname is filled by memcpy() on the line directly after
 * OBD_ALLOC() without a NULL check - confirm whether the allocation can
 * fail here. */
1459 /** Start the shutdown of servers at umount.
1461 static void server_put_super(struct super_block *sb)
1463 struct lustre_sb_info *lsi = s2lsi(sb);
1464 struct obd_device *obd;
1465 struct vfsmount *mnt = lsi->lsi_srv_mnt;
1466 char *tmpname, *extraname = NULL;
1468 int lddflags = lsi->lsi_ldd->ldd_flags;
1469 int lsiflags = lsi->lsi_flags;
1472 LASSERT(lsiflags & LSI_SERVER);
1474 tmpname_sz = strlen(lsi->lsi_ldd->ldd_svname) + 1;
1475 OBD_ALLOC(tmpname, tmpname_sz);
1476 memcpy(tmpname, lsi->lsi_ldd->ldd_svname, tmpname_sz);
1477 CDEBUG(D_MOUNT, "server put_super %s\n", tmpname);
1478 if (IS_MDT(lsi->lsi_ldd) && (lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC))
1479 snprintf(tmpname, tmpname_sz, "MGS");
1481 /* Stop the target */
1482 if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
1483 (IS_MDT(lsi->lsi_ldd) || IS_OST(lsi->lsi_ldd))) {
1484 struct lustre_profile *lprof = NULL;
1486 /* tell the mgc to drop the config log */
1487 lustre_end_log(sb, lsi->lsi_ldd->ldd_svname, NULL);
1489 /* COMPAT_146 - profile may get deleted in mgc_cleanup.
1490 If there are any setup/cleanup errors, save the lov
1491 name for safety cleanup later. */
1492 lprof = class_get_profile(lsi->lsi_ldd->ldd_svname);
1493 if (lprof && lprof->lp_dt) {
/* NOTE(review): extraname is written by strcpy() on the line directly
 * after its allocation without a NULL check - confirm. */
1494 OBD_ALLOC(extraname, strlen(lprof->lp_dt) + 1);
1495 strcpy(extraname, lprof->lp_dt);
/* Look the target obd up by its service name. */
1498 obd = class_name2obd(lsi->lsi_ldd->ldd_svname);
1500 CDEBUG(D_MOUNT, "stopping %s\n", obd->obd_name);
1501 if (lsi->lsi_flags & LSI_UMOUNT_FAILOVER)
1503 /* We can't seem to give an error return code
1504 * to .put_super, so we better make sure we clean up! */
1506 class_manual_cleanup(obd);
1508 CERROR("no obd %s\n", lsi->lsi_ldd->ldd_svname);
1509 server_deregister_mount(lsi->lsi_ldd->ldd_svname);
1513 /* If they wanted the mgs to stop separately from the mdt, they
1514 should have put it on a different device. */
1515 if (IS_MGS(lsi->lsi_ldd)) {
1516 /* if MDS start with --nomgs, don't stop MGS then */
1517 if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS))
1518 server_stop_mgs(sb);
1521 /* Clean the mgc and sb */
1522 lustre_common_put_super(sb);
1524 /* Wait for the targets to really clean up - can't exit (and let the
1525 sb get destroyed) while the mount is still in use */
1526 server_wait_finished(mnt);
1528 /* drop the One True Mount */
1531 /* Stop the servers (MDS, OSS) if no longer needed. We must wait
1532 until the target is really gone so that our type refcount check
1534 server_stop_servers(lddflags, lsiflags);
1536 /* In case of startup or cleanup err, stop related obds */
1538 obd = class_name2obd(extraname);
1540 CWARN("Cleaning orphaned obd %s\n", extraname);
1542 class_manual_cleanup(obd);
1544 OBD_FREE(extraname, strlen(extraname) + 1);
/* tmpname still holds the name saved on entry (or "MGS"); it is released
 * with the size recorded in tmpname_sz. */
1547 LCONSOLE_WARN("server umount %s complete\n", tmpname);
1548 OBD_FREE(tmpname, tmpname_sz);
/* umount_begin handler (see server_ops): switch the umount style recorded
 * in lsi_flags from failover to force by clearing LSI_UMOUNT_FAILOVER and
 * setting LSI_UMOUNT_FORCE.  Two signatures exist, selected by
 * HAVE_UMOUNTBEGIN_VFSMOUNT; in the vfsmount variant the superblock is
 * taken from vfsmnt->mnt_sb and the MNT_FORCE bit of @flags is tested
 * first. */
1552 /** Called only for 'umount -f'
1554 #ifdef HAVE_UMOUNTBEGIN_VFSMOUNT
1555 static void server_umount_begin(struct vfsmount *vfsmnt, int flags)
1557 struct super_block *sb = vfsmnt->mnt_sb;
1559 static void server_umount_begin(struct super_block *sb)
1562 struct lustre_sb_info *lsi = s2lsi(sb);
1565 #ifdef HAVE_UMOUNTBEGIN_VFSMOUNT
1566 if (!(flags & MNT_FORCE)) {
1572 CDEBUG(D_MOUNT, "umount -f\n");
1573 /* umount = failover
1575 no third way to do non-force, non-failover */
1576 lsi->lsi_flags &= ~LSI_UMOUNT_FAILOVER;
1577 lsi->lsi_flags |= LSI_UMOUNT_FORCE;
/* statfs handler for the server superblock (see server_ops).
 * When the backing mount (lsi_srv_mnt) has a superblock that implements
 * statfs, that operation is invoked on the backing filesystem and f_type is
 * set to this superblock's magic.  The remaining assignments fill f_type,
 * f_bsize and f_namelen from this superblock and NAME_MAX.
 * The first argument is a super_block or a dentry depending on
 * HAVE_STATFS_DENTRY_PARAM; the same switch selects how the backing
 * statfs is called. */
1581 #ifndef HAVE_STATFS_DENTRY_PARAM
1582 static int server_statfs (struct super_block *sb, cfs_kstatfs_t *buf)
1585 static int server_statfs (struct dentry *dentry, cfs_kstatfs_t *buf)
1587 struct super_block *sb = dentry->d_sb;
1589 struct vfsmount *mnt = s2lsi(sb)->lsi_srv_mnt;
1592 if (mnt && mnt->mnt_sb && mnt->mnt_sb->s_op->statfs) {
1593 #ifdef HAVE_STATFS_DENTRY_PARAM
1594 int rc = mnt->mnt_sb->s_op->statfs(mnt->mnt_root, buf);
1596 int rc = mnt->mnt_sb->s_op->statfs(mnt->mnt_sb, buf);
/* Report this superblock's magic rather than the backing filesystem's. */
1599 buf->f_type = sb->s_magic;
1605 buf->f_type = sb->s_magic;
1606 buf->f_bsize = sb->s_blocksize;
1612 buf->f_namelen = NAME_MAX;
/* Super operations table for server mounts; it is assigned to sb->s_op in
 * server_fill_super_common(). */
1616 /** The operations we support directly on the superblock:
1617 * mount, umount, and df.
1619 static struct super_operations server_ops =
1621 .put_super = server_put_super,
1622 .umount_begin = server_umount_begin, /* umount -f */
1623 .statfs = server_statfs,
/* log2(n) is computed as cfs_ffz(~(n)); presumably the index of the lowest
 * set bit of n, i.e. an exact log2 only for powers of two such as the 4096
 * block size it is applied to - verify against cfs_ffz. */
1626 #define log2(n) cfs_ffz(~(n))
/* Magic number stored in sb->s_magic by server_fill_super_common(). */
1627 #define LUSTRE_SUPER_MAGIC 0x0BD00BD1
/* Fill in the generic fields of a server superblock: a 4096 byte block
 * size (and the matching bit count), LUSTRE_SUPER_MAGIC, s_maxbytes of 0,
 * the MS_RDONLY flag and server_ops as the operations table.  A root inode
 * marked as a directory and a root dentry for it are then created; a
 * CERROR is logged when either of them cannot be made. */
1629 static int server_fill_super_common(struct super_block *sb)
1631 struct inode *root = 0;
1634 CDEBUG(D_MOUNT, "Server sb, dev=%d\n", (int)sb->s_dev);
1636 sb->s_blocksize = 4096;
1637 sb->s_blocksize_bits = log2(sb->s_blocksize);
1638 sb->s_magic = LUSTRE_SUPER_MAGIC;
1639 sb->s_maxbytes = 0; //PAGE_CACHE_MAXBYTES;
1640 sb->s_flags |= MS_RDONLY;
1641 sb->s_op = &server_ops;
1643 root = new_inode(sb);
1645 CERROR("Can't make root inode\n");
1649 /* returns -EIO for every operation */
1650 /* make_bad_inode(root); -- badness - can't umount */
1651 /* apparently we need to be a directory for the mount to finish */
1652 root->i_mode = S_IFDIR;
1654 sb->s_root = d_alloc_root(root);
1656 CERROR("Can't make root dentry\n");
/* Server half of lustre_fill_super().
 * Visible order of operations: mount the device with server_kernel_mount()
 * and keep the result in lsi_srv_mnt; report an error if the service name
 * read from disk already resolves to an obd (double mount); start the MGS
 * when this is an MGS disk and nomgs is not set; start the MGC; start the
 * targets for an OST/MDT unless nosvc is set; finish with
 * server_fill_super_common().  Failures while starting targets or the MGS
 * are unwound through server_put_super(). */
1664 /** Fill in the superblock info for a Lustre server.
1665 * Mount the device with the correct options.
1666 * Read the on-disk config file.
1667 * Start the services.
1669 static int server_fill_super(struct super_block *sb)
1671 struct lustre_sb_info *lsi = s2lsi(sb);
1672 struct vfsmount *mnt;
1676 /* the One True Mount */
1677 mnt = server_kernel_mount(sb);
1680 CERROR("Unable to mount device %s: %d\n",
1681 lsi->lsi_lmd->lmd_dev, rc);
1685 lsi->lsi_srv_mnt = mnt;
/* server_kernel_mount() stores the parsed disk data in lsi_ldd on
 * success, so it must be present from here on. */
1687 LASSERT(lsi->lsi_ldd);
1688 CDEBUG(D_MOUNT, "Found service %s for fs '%s' on device %s\n",
1689 lsi->lsi_ldd->ldd_svname, lsi->lsi_ldd->ldd_fsname,
1690 lsi->lsi_lmd->lmd_dev);
/* An obd already registered under this service name means the target is
 * running elsewhere on this node. */
1692 if (class_name2obd(lsi->lsi_ldd->ldd_svname)) {
1693 LCONSOLE_ERROR_MSG(0x161, "The target named %s is already "
1694 "running. Double-mount may have compromised"
1695 " the disk journal.\n",
1696 lsi->lsi_ldd->ldd_svname);
1702 /* Start MGS before MGC */
1703 if (IS_MGS(lsi->lsi_ldd) && !(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)){
1704 rc = server_start_mgs(sb);
1709 /* Start MGC before servers */
1710 rc = lustre_start_mgc(sb);
1714 /* Set up all obd devices for service */
1715 if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
1716 (IS_OST(lsi->lsi_ldd) || IS_MDT(lsi->lsi_ldd))) {
1717 rc = server_start_targets(sb, mnt);
1719 CERROR("Unable to start targets: %d\n", rc);
1722 /* FIXME overmount client here,
1723 or can we just start a client log and client_fill_super on this sb?
1724 We need to make sure server_put_super gets called too - ll_put_super
1725 calls lustre_common_put_super; check there for LSI_SERVER flag,
1727 Probably should start client from new thread so we can return.
1728 Client will not finish until all servers are connected.
1729 Note - MGS-only server does NOT get a client, since there is no
1730 lustre fs associated - the MGS is for all lustre fs's */
1733 rc = server_fill_super_common(sb);
1739 /* We jump here in case of failure while starting targets or MGS.
1740 * In this case we can't just put @mnt and have to do real cleanup
1741 * with stopping targets, etc. */
1742 server_put_super(sb);
/* Parse a server name such as <fsname>-MDTxxxx or <fsname>-OSTxxxx.
 * The text after the last '-' selects the type (LDD_F_SV_TYPE_MDT or
 * LDD_F_SV_TYPE_OST).  What follows the 3-letter type is either "all", in
 * which case LDD_F_SV_ALL is or-ed into the returned type, or a hexadecimal
 * index parsed with simple_strtoul(), which also sets @endptr.
 * When the last component equals LUSTRE_MDC_NAME the scan steps back to the
 * previous '-' so that the MDT part is parsed instead.
 * Exported for use by other modules. */
1746 /* Get the index from the obd name.
1747 rc = server type, or
1749 if endptr isn't NULL it is set to end of name */
1750 int server_name2index(char *svname, __u32 *idx, char **endptr)
1752 unsigned long index;
1754 char *dash = strrchr(svname, '-');
1758 /* interpret <fsname>-MDTXXXXX-mdc as mdt, the better way is to pass
1759 * in the fsname, then determine the server index */
1760 if (!strcmp(LUSTRE_MDC_NAME, dash + 1)) {
1762 for (; dash > svname && *dash != '-'; dash--);
/* dash + 1 is the start of the 3-letter type tag, dash + 4 the text that
 * follows it. */
1767 if (strncmp(dash + 1, "MDT", 3) == 0)
1768 rc = LDD_F_SV_TYPE_MDT;
1769 else if (strncmp(dash + 1, "OST", 3) == 0)
1770 rc = LDD_F_SV_TYPE_OST;
1773 if (strcmp(dash + 4, "all") == 0)
1774 return rc | LDD_F_SV_ALL;
/* The index is written in base 16. */
1776 index = simple_strtoul(dash + 4, endptr, 16);
1781 /*************** mount common between server and client ***************/
/* Common umount tail shared by client and server superblocks: stop (drop a
 * reference to) the MGC for @sb.  A return of -ENOENT from
 * lustre_stop_mgc() is not treated as a failure; per the comments below, a
 * busy MGC is left for its other user to clean up.  The trailing comment
 * marks where the reference on the mounted disk is dropped.
 * Exported for use by other modules. */
1784 int lustre_common_put_super(struct super_block *sb)
1789 CDEBUG(D_MOUNT, "dropping sb %p\n", sb);
1791 /* Drop a ref to the MGC */
1792 rc = lustre_stop_mgc(sb);
1793 if (rc && (rc != -ENOENT)) {
1795 CERROR("Can't stop MGC: %d\n", rc);
1798 /* BUSY just means that there's some other obd that
1799 needs the mgc. Let him clean it up. */
1800 CDEBUG(D_MOUNT, "MGC still in use\n");
1802 /* Drop a ref to the mounted disk */
/* Dump the parsed mount data through PRINT_CMD(PRINT_MASK, ...): the
 * profile (client mounts only), device, flags, option string, the soft and
 * hard recovery times when they are non-zero, and every excluded OST index
 * (printed as OST%04x). */
1808 static void lmd_print(struct lustre_mount_data *lmd)
1812 PRINT_CMD(PRINT_MASK, " mount data:\n");
1813 if (lmd_is_client(lmd))
1814 PRINT_CMD(PRINT_MASK, "profile: %s\n", lmd->lmd_profile);
1815 PRINT_CMD(PRINT_MASK, "device: %s\n", lmd->lmd_dev);
1816 PRINT_CMD(PRINT_MASK, "flags: %x\n", lmd->lmd_flags);
1819 PRINT_CMD(PRINT_MASK, "options: %s\n", lmd->lmd_opts);
1821 if (lmd->lmd_recovery_time_soft)
1822 PRINT_CMD(PRINT_MASK, "recovery time soft: %d\n",
1823 lmd->lmd_recovery_time_soft);
1825 if (lmd->lmd_recovery_time_hard)
1826 PRINT_CMD(PRINT_MASK, "recovery time hard: %d\n",
1827 lmd->lmd_recovery_time_hard);
1829 for (i = 0; i < lmd->lmd_exclude_count; i++) {
1830 PRINT_CMD(PRINT_MASK, "exclude %d: OST%04x\n", i,
1831 lmd->lmd_exclude[i]);
/* Look @svname up in the exclusion list built from the mount options
 * (lmd_exclude[] of the lsi attached to @sb).  Only names that
 * server_name2index() classifies as LDD_F_SV_TYPE_OST are compared; a
 * matching index is reported with CWARN. */
1835 /* Is this server on the exclusion list */
1836 int lustre_check_exclusion(struct super_block *sb, char *svname)
1838 struct lustre_sb_info *lsi = s2lsi(sb);
1839 struct lustre_mount_data *lmd = lsi->lsi_lmd;
1844 rc = server_name2index(svname, &index, NULL);
1845 if (rc != LDD_F_SV_TYPE_OST)
1846 /* Only exclude OSTs */
1849 CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname,
1850 index, lmd->lmd_exclude_count, lmd->lmd_dev);
/* Linear scan of the stored indices. */
1852 for(i = 0; i < lmd->lmd_exclude_count; i++) {
1853 if (index == lmd->lmd_exclude[i]) {
1854 CWARN("Excluding %s (on exclusion list)\n", svname);
/* Build lmd->lmd_exclude from the value of an "exclude=" mount option.
 * @ptr points at the '=' (lmd_parse passes s1 + 7 for that reason).  Each
 * name is parsed with server_name2index(); OST indices are collected in a
 * temporary array sized from the string length (one slot per 8 characters
 * plus one), other names produce an "ignoring exclude" debug message.
 * The collected indices are then copied into an array of exactly
 * lmd_exclude_count entries, which lustre_free_lsi() releases, and the
 * temporary array is freed. */
1861 /* mount -v -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */
1862 static int lmd_make_exclusion(struct lustre_mount_data *lmd, char *ptr)
1864 char *s1 = ptr, *s2;
1865 __u32 index, *exclude_list;
1869 /* The shortest an ost name can be is 8 chars: -OST0000.
1870 We don't actually know the fsname at this time, so in fact
1871 a user could specify any fsname. */
1872 devmax = strlen(ptr) / 8 + 1;
1874 /* temp storage until we figure out how many we have */
1875 OBD_ALLOC(exclude_list, sizeof(index) * devmax);
1879 /* we enter this fn pointing at the '=' */
1880 while (*s1 && *s1 != ' ' && *s1 != ',') {
1882 rc = server_name2index(s1, &index, &s2);
1884 CERROR("Can't parse server name '%s'\n", s1);
1887 if (rc == LDD_F_SV_TYPE_OST)
1888 exclude_list[lmd->lmd_exclude_count++] = index;
1890 CDEBUG(D_MOUNT, "ignoring exclude %.7s\n", s1);
1892 /* now we are pointing at ':' (next exclude)
1893 or ',' (end of excludes) */
/* devmax is the capacity of the temporary array. */
1894 if (lmd->lmd_exclude_count >= devmax)
1897 if (rc >= 0) /* non-err */
1900 if (lmd->lmd_exclude_count) {
1901 /* permanent, freed in lustre_free_lsi */
1902 OBD_ALLOC(lmd->lmd_exclude, sizeof(index) *
1903 lmd->lmd_exclude_count);
1904 if (lmd->lmd_exclude) {
1905 memcpy(lmd->lmd_exclude, exclude_list,
1906 sizeof(index) * lmd->lmd_exclude_count);
1909 lmd->lmd_exclude_count = 0;
/* The temporary array is freed with the size it was allocated with. */
1912 OBD_FREE(exclude_list, sizeof(index) * devmax);
/* Store the value of a "mgssec=" option in lmd->lmd_mgssec.
 * A previously stored value is freed first.  The value starts at @ptr and
 * runs up to the next ',' or, when there is none, to the end of the string;
 * it is copied into a newly allocated buffer of length + 1 bytes and
 * NUL-terminated.  The buffer is released in lustre_free_lsi(). */
1916 static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr)
1921 if (lmd->lmd_mgssec != NULL) {
1922 OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1);
1923 lmd->lmd_mgssec = NULL;
/* Find where this option's value ends. */
1926 tail = strchr(ptr, ',');
1928 length = strlen(ptr);
1930 length = tail - ptr;
1932 OBD_ALLOC(lmd->lmd_mgssec, length + 1);
1933 if (lmd->lmd_mgssec == NULL)
1936 memcpy(lmd->lmd_mgssec, ptr, length);
1937 lmd->lmd_mgssec[length] = '\0';
/* Parse the option string handed to mount into @lmd.
 * Lustre-specific options are matched by prefix with strncmp():
 *   abort_recov, recovery_time_soft=, recovery_time_hard=, nosvc, nomgs,
 *   writeconf, mgssec=, exclude= and device=.
 * Both recovery times are raised to at least
 * 2 * (CONNECTION_SWITCH_MAX + 2 * INITIAL_CONNECT_TIMEOUT).
 * A device name containing ":/" marks a client mount: lmd_flags becomes
 * LMD_FLG_CLIENT and lmd_profile is set to "<fsname>-client".
 * lmd_profile, lmd_dev and lmd_opts are allocated here and freed in
 * lustre_free_lsi().  Old binary mount data is detected through its magic
 * and rejected with a console error.
 * NOTE(review): lmd_flags is assigned (not or-ed) with LMD_FLG_CLIENT,
 * which replaces any flag bits set by options parsed earlier - confirm
 * that this is intended. */
1941 /** Parse mount line options
1942 * e.g. mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre
1943 * dev is passed as device=uml1:/lustre by mount.lustre
1945 static int lmd_parse(char *options, struct lustre_mount_data *lmd)
1947 char *s1, *s2, *devname = NULL;
1948 struct lustre_mount_data *raw = (struct lustre_mount_data *)options;
1954 LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that "
1955 "/sbin/mount.lustre is installed.\n");
1959 /* Options should be a string - try to detect old lmd data */
1960 if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) {
1961 LCONSOLE_ERROR_MSG(0x163, "You're using an old version of "
1962 "/sbin/mount.lustre. Please install "
1963 "version %s\n", LUSTRE_VERSION_STRING);
1966 lmd->lmd_magic = LMD_MAGIC;
1968 /* Set default flags here */
/* Lower bound applied to both recovery_time_* options below. */
1973 int time_min = 2 * (CONNECTION_SWITCH_MAX +
1974 2 * INITIAL_CONNECT_TIMEOUT);
1976 /* Skip whitespace and extra commas */
1977 while (*s1 == ' ' || *s1 == ',')
1980 /* Client options are parsed in ll_options: eg. flock,
1983 /* Parse non-ldiskfs options here. Rather than modifying
1984 ldiskfs, we just zero these out here */
1985 if (strncmp(s1, "abort_recov", 11) == 0) {
1986 lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
1988 } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) {
1989 lmd->lmd_recovery_time_soft = max_t(int,
1990 simple_strtoul(s1 + 19, NULL, 10), time_min);
1992 } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) {
1993 lmd->lmd_recovery_time_hard = max_t(int,
1994 simple_strtoul(s1 + 19, NULL, 10), time_min);
1996 } else if (strncmp(s1, "nosvc", 5) == 0) {
1997 lmd->lmd_flags |= LMD_FLG_NOSVC;
1999 } else if (strncmp(s1, "nomgs", 5) == 0) {
2000 lmd->lmd_flags |= LMD_FLG_NOMGS;
2002 } else if (strncmp(s1, "writeconf", 9) == 0) {
2003 lmd->lmd_flags |= LMD_FLG_WRITECONF;
2005 } else if (strncmp(s1, "mgssec=", 7) == 0) {
2006 rc = lmd_parse_mgssec(lmd, s1 + 7);
2010 /* ost exclusion list */
2011 } else if (strncmp(s1, "exclude=", 8) == 0) {
/* s1 + 7 is the '=' itself, which is where lmd_make_exclusion() expects to
 * start. */
2012 rc = lmd_make_exclusion(lmd, s1 + 7);
2017 /* Linux 2.4 doesn't pass the device, so we stuck it at the
2018 end of the options. */
2019 else if (strncmp(s1, "device=", 7) == 0) {
2021 /* terminate options right before device. device
2022 must be the last one. */
2028 s2 = strchr(s1, ',');
2036 memmove(s1, s2, strlen(s2) + 1);
2042 LCONSOLE_ERROR_MSG(0x164, "Can't find the device name "
2043 "(need mount option 'device=...')\n");
/* ":/" in the device name is what identifies a client mount. */
2047 s1 = strstr(devname, ":/");
2050 lmd->lmd_flags = LMD_FLG_CLIENT;
2051 /* Remove leading /s from fsname */
2052 while (*++s1 == '/') ;
2053 /* Freed in lustre_free_lsi */
/* The 8 extra bytes hold "-client" plus the terminating NUL. */
2054 OBD_ALLOC(lmd->lmd_profile, strlen(s1) + 8);
2055 if (!lmd->lmd_profile)
2057 sprintf(lmd->lmd_profile, "%s-client", s1);
2060 /* Freed in lustre_free_lsi */
2061 OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1);
2064 strcpy(lmd->lmd_dev, devname);
2066 /* Save mount options */
/* Walk back from the end of the string over trailing ',' and ' '. */
2067 s1 = options + strlen(options) - 1;
2068 while (s1 >= options && (*s1 == ',' || *s1 == ' '))
2070 if (*options != 0) {
2071 /* Freed in lustre_free_lsi */
2072 OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1);
2075 strcpy(lmd->lmd_opts, options);
2079 lmd->lmd_magic = LMD_MAGIC;
2084 CERROR("Bad mount options %s\n", options);
/* Bundle handed to lustre_fill_super() as its opaque data argument: the
 * mount data (read there as lmd2_data) together with the vfsmount of the
 * mount.  lustre_get_sb() builds it from its data and mnt arguments. */
2088 struct lustre_mount_data2 {
2090 struct vfsmount *lmd2_mnt;
/* fill_super callback that lustre_get_sb() hands to get_sb_nodev().
 * Creates the lsi with lustre_init_lsi(), parses the mount options with
 * lmd_parse() (failure yields -EINVAL) and then branches on the mount kind:
 *  - client: requires a callback registered through
 *    lustre_register_client_fill_super(), starts the MGC and calls that
 *    callback with @sb and the vfsmount from @data;
 *  - server: sets LSI_SERVER in lsi_flags and calls server_fill_super().
 * The outcome is logged at the end (CERROR on failure, CDEBUG on success). */
2093 /** This is the entry point for the mount call into Lustre.
2094 * This is called when a server or client is mounted,
2095 * and this is where we start setting things up.
2096 * @param data Mount options (e.g. -o flock,abort_recov)
2098 int lustre_fill_super(struct super_block *sb, void *data, int silent)
2100 struct lustre_mount_data *lmd;
2101 struct lustre_mount_data2 *lmd2 = data;
2102 struct lustre_sb_info *lsi;
2106 CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb);
2108 lsi = lustre_init_lsi(sb);
2114 * Disable lockdep during mount, because mount locking patterns are
2119 /* Figure out the lmd from the mount options */
2120 if (lmd_parse((char *)(lmd2->lmd2_data), lmd)) {
2122 GOTO(out, rc = -EINVAL);
2125 if (lmd_is_client(lmd)) {
2126 CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile);
2127 if (!client_fill_super) {
2128 LCONSOLE_ERROR_MSG(0x165, "Nothing registered for "
2129 "client mount! Is the 'lustre' "
2130 "module loaded?\n");
2134 rc = lustre_start_mgc(sb);
2139 /* Connect and start */
2140 /* (should always be ll_fill_super) */
2141 rc = (*client_fill_super)(sb, lmd2->lmd2_mnt);
2142 /* c_f_s will call lustre_common_put_super on failure */
2145 CDEBUG(D_MOUNT, "Mounting server from %s\n", lmd->lmd_dev);
2146 lsi->lsi_flags |= LSI_SERVER;
2147 rc = server_fill_super(sb);
2148 /* s_f_s calls lustre_start_mgc after the mount because we need
2149 the MGS nids which are stored on disk. Plus, we may
2150 need to start the MGS first. */
2151 /* s_f_s will call server_put_super on failure */
2154 /* If error happens in fill_super() call, @lsi will be killed there.
2155 * This is why we do not put it here. */
/* s2lsi(sb) is tested before lmd is dereferenced: lustre_free_lsi() resets
 * that pointer to NULL when the lsi is destroyed (presumably lmd lives
 * inside the lsi - confirm). */
2159 CERROR("Unable to mount %s (%d)\n",
2160 s2lsi(sb) ? lmd->lmd_dev : "", rc);
2162 CDEBUG(D_SUPER, "Mount %s complete\n",
/* Record @cfs in the file-scope client_fill_super pointer; lustre_fill_super()
 * calls it for client mounts.  Exported for use by other modules. */
2170 /* We can't call ll_fill_super by name because it lives in a module that
2171 must be loaded after this one. */
2172 void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb,
2173 struct vfsmount *mnt))
2175 client_fill_super = cfs;
/* Record @cfs in the file-scope kill_super_cb pointer; lustre_kill_super()
 * invokes it for superblocks that are not servers.  Exported for use by
 * other modules. */
2178 void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb))
2180 kill_super_cb = cfs;
2183 /***************** FS registration ******************/
/* get_sb entry point of lustre_fs_type.  Both variants delegate to
 * get_sb_nodev() with lustre_fill_super as the fill callback.  On kernels
 * >= 2.6.18 the vfsmount is available and is packed together with @data
 * into a struct lustre_mount_data2 that is passed as the callback data.
 * NOTE(review): the < 2.6.18 variant passes @data through unchanged, while
 * lustre_fill_super() reads its data argument as a struct
 * lustre_mount_data2 - verify that path. */
2185 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18))
2186 struct super_block * lustre_get_sb(struct file_system_type *fs_type, int flags,
2187 const char *devname, void * data)
2189 return get_sb_nodev(fs_type, flags, data, lustre_fill_super);
2192 int lustre_get_sb(struct file_system_type *fs_type, int flags,
2193 const char *devname, void * data, struct vfsmount *mnt)
2195 struct lustre_mount_data2 lmd2 = {data, mnt};
2197 return get_sb_nodev(fs_type, flags, &lmd2, lustre_fill_super, mnt);
/* kill_sb handler of lustre_fs_type: when a kill_super_cb is registered
 * and @sb has an lsi without the LSI_SERVER flag, that callback is run
 * first; kill_anon_super() is then called on @sb. */
2201 void lustre_kill_super(struct super_block *sb)
2203 struct lustre_sb_info *lsi = s2lsi(sb);
2205 if (kill_super_cb && lsi && !(lsi->lsi_flags & LSI_SERVER))
2206 (*kill_super_cb)(sb);
2208 kill_anon_super(sb);
/* File system type descriptor passed to register_filesystem() and
 * unregister_filesystem() by lustre_register_fs() / lustre_unregister_fs().
 * It wires get_sb to lustre_get_sb and kill_sb to lustre_kill_super. */
2211 /** Register the "lustre" fs type
2213 struct file_system_type lustre_fs_type = {
2214 .owner = THIS_MODULE,
2216 .get_sb = lustre_get_sb,
2217 .kill_sb = lustre_kill_super,
2218 .fs_flags = FS_BINARY_MOUNTDATA | FS_REQUIRES_DEV |
2219 #ifdef FS_HAS_FIEMAP
2222 LL_RENAME_DOES_D_MOVE,
/* Register lustre_fs_type with the kernel; returns whatever
 * register_filesystem() returns. */
2225 int lustre_register_fs(void)
2227 return register_filesystem(&lustre_fs_type);
/* Unregister lustre_fs_type; returns whatever unregister_filesystem()
 * returns. */
2230 int lustre_unregister_fs(void)
2232 return unregister_filesystem(&lustre_fs_type);
/* Symbols from this file that are exported for use by other modules. */
2235 EXPORT_SYMBOL(lustre_register_client_fill_super);
2236 EXPORT_SYMBOL(lustre_register_kill_super_cb);
2237 EXPORT_SYMBOL(lustre_common_put_super);
2238 EXPORT_SYMBOL(lustre_process_log);
2239 EXPORT_SYMBOL(lustre_end_log);
2240 EXPORT_SYMBOL(server_get_mount);
2241 EXPORT_SYMBOL(server_get_mount_2);
2242 EXPORT_SYMBOL(server_put_mount);
2243 EXPORT_SYMBOL(server_put_mount_2);
2244 EXPORT_SYMBOL(server_register_target);
2245 EXPORT_SYMBOL(server_name2index);
2246 EXPORT_SYMBOL(server_mti_print);
2247 EXPORT_SYMBOL(do_lcfg);