1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lustre/obdclass/obd_mount.c
38 * Client/server mount routines
40 * Author: Nathan Rutman <nathan@clusterfs.com>
44 #define DEBUG_SUBSYSTEM S_CLASS
45 #define D_MOUNT D_SUPER|D_CONFIG /*|D_WARNING */
46 #define PRINT_CMD CDEBUG
47 #define PRINT_MASK D_SUPER|D_CONFIG
51 #include <lustre_fsfilt.h>
52 #include <obd_class.h>
53 #include <lustre/lustre_user.h>
54 #include <linux/version.h>
55 #include <lustre_log.h>
56 #include <lustre_disk.h>
57 #include <lustre_param.h>
59 static int (*client_fill_super)(struct super_block *sb,
60 struct vfsmount *mnt) = NULL;
61 static void (*kill_super_cb)(struct super_block *sb) = NULL;
63 /*********** mount lookup *********/
65 CFS_DECLARE_MUTEX(lustre_mount_info_lock);
66 static CFS_LIST_HEAD(server_mount_info_list);
68 static struct lustre_mount_info *server_find_mount(const char *name)
71 struct lustre_mount_info *lmi;
74 cfs_list_for_each(tmp, &server_mount_info_list) {
75 lmi = cfs_list_entry(tmp, struct lustre_mount_info,
77 if (strcmp(name, lmi->lmi_name) == 0)
83 /* we must register an obd for a mount before we call the setup routine.
84 *_setup will call server_get_mount to get the mnt struct
85 by obd_name, since we can't pass the pointer to setup. */
86 static int server_register_mount(const char *name, struct super_block *sb,
89 struct lustre_mount_info *lmi;
96 OBD_ALLOC(lmi, sizeof(*lmi));
99 OBD_ALLOC(name_cp, strlen(name) + 1);
101 OBD_FREE(lmi, sizeof(*lmi));
104 strcpy(name_cp, name);
106 cfs_down(&lustre_mount_info_lock);
108 if (server_find_mount(name)) {
109 cfs_up(&lustre_mount_info_lock);
110 OBD_FREE(lmi, sizeof(*lmi));
111 OBD_FREE(name_cp, strlen(name) + 1);
112 CERROR("Already registered %s\n", name);
115 lmi->lmi_name = name_cp;
118 cfs_list_add(&lmi->lmi_list_chain, &server_mount_info_list);
120 cfs_up(&lustre_mount_info_lock);
122 CDEBUG(D_MOUNT, "reg_mnt %p from %s, vfscount=%d\n",
123 lmi->lmi_mnt, name, cfs_atomic_read(&lmi->lmi_mnt->mnt_count));
128 /* when an obd no longer needs a mount */
129 static int server_deregister_mount(const char *name)
131 struct lustre_mount_info *lmi;
134 cfs_down(&lustre_mount_info_lock);
135 lmi = server_find_mount(name);
137 cfs_up(&lustre_mount_info_lock);
138 CERROR("%s not registered\n", name);
142 CDEBUG(D_MOUNT, "dereg_mnt %p from %s, vfscount=%d\n",
143 lmi->lmi_mnt, name, cfs_atomic_read(&lmi->lmi_mnt->mnt_count));
145 OBD_FREE(lmi->lmi_name, strlen(lmi->lmi_name) + 1);
146 cfs_list_del(&lmi->lmi_list_chain);
147 OBD_FREE(lmi, sizeof(*lmi));
148 cfs_up(&lustre_mount_info_lock);
153 /* obd's look up a registered mount using their obdname. This is just
154 for initial obd setup to find the mount struct. It should not be
155 called every time you want to mntget. */
156 struct lustre_mount_info *server_get_mount(const char *name)
158 struct lustre_mount_info *lmi;
159 struct lustre_sb_info *lsi;
162 cfs_down(&lustre_mount_info_lock);
163 lmi = server_find_mount(name);
164 cfs_up(&lustre_mount_info_lock);
166 CERROR("Can't find mount for %s\n", name);
169 lsi = s2lsi(lmi->lmi_sb);
170 mntget(lmi->lmi_mnt);
171 cfs_atomic_inc(&lsi->lsi_mounts);
173 CDEBUG(D_MOUNT, "get_mnt %p from %s, refs=%d, vfscount=%d\n",
174 lmi->lmi_mnt, name, cfs_atomic_read(&lsi->lsi_mounts),
175 cfs_atomic_read(&lmi->lmi_mnt->mnt_count));
181 * Used by mdt to get mount_info from obdname.
182 * There is no blocking when using the mount_info.
183 * Do not use server_get_mount for this purpose.
185 struct lustre_mount_info *server_get_mount_2(const char *name)
187 struct lustre_mount_info *lmi;
190 cfs_down(&lustre_mount_info_lock);
191 lmi = server_find_mount(name);
192 cfs_up(&lustre_mount_info_lock);
194 CERROR("Can't find mount for %s\n", name);
199 static void unlock_mntput(struct vfsmount *mnt)
201 if (kernel_locked()) {
210 static int lustre_put_lsi(struct super_block *sb);
212 /* to be called from obd_cleanup methods */
213 int server_put_mount(const char *name, struct vfsmount *mnt)
215 struct lustre_mount_info *lmi;
216 struct lustre_sb_info *lsi;
217 int count = atomic_read(&mnt->mnt_count) - 1;
220 /* This might be the last one, can't deref after this */
223 cfs_down(&lustre_mount_info_lock);
224 lmi = server_find_mount(name);
225 cfs_up(&lustre_mount_info_lock);
227 CERROR("Can't find mount for %s\n", name);
230 lsi = s2lsi(lmi->lmi_sb);
231 LASSERT(lmi->lmi_mnt == mnt);
233 CDEBUG(D_MOUNT, "put_mnt %p from %s, refs=%d, vfscount=%d\n",
234 lmi->lmi_mnt, name, cfs_atomic_read(&lsi->lsi_mounts), count);
236 if (lustre_put_lsi(lmi->lmi_sb)) {
237 CDEBUG(D_MOUNT, "Last put of mnt %p from %s, vfscount=%d\n",
238 lmi->lmi_mnt, name, count);
239 /* last mount is the One True Mount */
241 CERROR("%s: mount busy, vfscount=%d!\n", name, count);
244 /* this obd should never need the mount again */
245 server_deregister_mount(name);
250 /* Corresponding to server_get_mount_2 */
251 int server_put_mount_2(const char *name, struct vfsmount *mnt)
257 /******* mount helper utilities *********/
260 static void ldd_print(struct lustre_disk_data *ldd)
262 PRINT_CMD(PRINT_MASK, " disk data:\n");
263 PRINT_CMD(PRINT_MASK, "server: %s\n", ldd->ldd_svname);
264 PRINT_CMD(PRINT_MASK, "uuid: %s\n", (char *)ldd->ldd_uuid);
265 PRINT_CMD(PRINT_MASK, "fs: %s\n", ldd->ldd_fsname);
266 PRINT_CMD(PRINT_MASK, "index: %04x\n", ldd->ldd_svindex);
267 PRINT_CMD(PRINT_MASK, "config: %d\n", ldd->ldd_config_ver);
268 PRINT_CMD(PRINT_MASK, "flags: %#x\n", ldd->ldd_flags);
269 PRINT_CMD(PRINT_MASK, "diskfs: %s\n", MT_STR(ldd));
270 PRINT_CMD(PRINT_MASK, "options: %s\n", ldd->ldd_mount_opts);
271 PRINT_CMD(PRINT_MASK, "params: %s\n", ldd->ldd_params);
272 PRINT_CMD(PRINT_MASK, "comment: %s\n", ldd->ldd_userdata);
276 static int ldd_parse(struct lvfs_run_ctxt *mount_ctxt,
277 struct lustre_disk_data *ldd)
279 struct lvfs_run_ctxt saved;
286 push_ctxt(&saved, mount_ctxt, NULL);
288 file = filp_open(MOUNT_DATA_FILE, O_RDONLY, 0644);
291 CERROR("cannot open %s: rc = %d\n", MOUNT_DATA_FILE, rc);
295 len = i_size_read(file->f_dentry->d_inode);
296 CDEBUG(D_MOUNT, "Have %s, size %lu\n", MOUNT_DATA_FILE, len);
297 if (len != sizeof(*ldd)) {
298 CERROR("disk data size does not match: see %lu expect %u\n",
299 len, (int)sizeof(*ldd));
300 GOTO(out_close, rc = -EINVAL);
303 rc = lustre_fread(file, ldd, len, &off);
305 CERROR("error reading %s: read %d of %lu\n",
306 MOUNT_DATA_FILE, rc, len);
307 GOTO(out_close, rc = -EINVAL);
311 if (ldd->ldd_magic != LDD_MAGIC) {
312 /* FIXME add swabbing support */
313 CERROR("Bad magic in %s: %x!=%x\n", MOUNT_DATA_FILE,
314 ldd->ldd_magic, LDD_MAGIC);
315 GOTO(out_close, rc = -EINVAL);
318 if (ldd->ldd_feature_incompat & ~LDD_INCOMPAT_SUPP) {
319 CERROR("%s: unsupported incompat filesystem feature(s) %x\n",
321 ldd->ldd_feature_incompat & ~LDD_INCOMPAT_SUPP);
322 GOTO(out_close, rc = -EINVAL);
324 if (ldd->ldd_feature_rocompat & ~LDD_ROCOMPAT_SUPP) {
325 CERROR("%s: unsupported read-only filesystem feature(s) %x\n",
327 ldd->ldd_feature_rocompat & ~LDD_ROCOMPAT_SUPP);
328 /* Do something like remount filesystem read-only */
329 GOTO(out_close, rc = -EINVAL);
335 pop_ctxt(&saved, mount_ctxt, NULL);
339 static int ldd_write(struct lvfs_run_ctxt *mount_ctxt,
340 struct lustre_disk_data *ldd)
342 struct lvfs_run_ctxt saved;
345 unsigned long len = sizeof(struct lustre_disk_data);
349 LASSERT(ldd->ldd_magic == LDD_MAGIC);
351 ldd->ldd_config_ver++;
353 push_ctxt(&saved, mount_ctxt, NULL);
355 file = filp_open(MOUNT_DATA_FILE, O_RDWR, 0644);
358 CERROR("cannot open %s: rc = %d\n", MOUNT_DATA_FILE, rc);
362 rc = lustre_fwrite(file, ldd, len, &off);
364 CERROR("error writing %s: read %d of %lu\n",
365 MOUNT_DATA_FILE, rc, len);
366 GOTO(out_close, rc = -EINVAL);
374 pop_ctxt(&saved, mount_ctxt, NULL);
379 /**************** config llog ********************/
381 /** Get a config log from the MGS and process it.
382 * This func is called for both clients and servers.
383 * Continue to process new statements appended to the logs
384 * (whenever the config lock is revoked) until lustre_end_log
386 * @param sb The superblock is used by the MGC to write to the local copy of
388 * @param logname The name of the llog to replicate from the MGS
389 * @param cfg Since the same mgc may be used to follow multiple config logs
390 * (e.g. ost1, ost2, client), the config_llog_instance keeps the state for
391 * this log, and is added to the mgc's list of logs to follow.
393 int lustre_process_log(struct super_block *sb, char *logname,
394 struct config_llog_instance *cfg)
396 struct lustre_cfg *lcfg;
397 struct lustre_cfg_bufs *bufs;
398 struct lustre_sb_info *lsi = s2lsi(sb);
399 struct obd_device *mgc = lsi->lsi_mgc;
410 /* mgc_process_config */
411 lustre_cfg_bufs_reset(bufs, mgc->obd_name);
412 lustre_cfg_bufs_set_string(bufs, 1, logname);
413 lustre_cfg_bufs_set(bufs, 2, cfg, sizeof(*cfg));
414 lustre_cfg_bufs_set(bufs, 3, &sb, sizeof(sb));
415 lcfg = lustre_cfg_new(LCFG_LOG_START, bufs);
416 rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
417 lustre_cfg_free(lcfg);
422 LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s'"
423 "failed from the MGS (%d). Make sure this "
424 "client and the MGS are running compatible "
425 "versions of Lustre.\n",
426 mgc->obd_name, logname, rc);
429 LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' "
430 "failed (%d). This may be the result of "
431 "communication errors between this node and "
432 "the MGS, a bad configuration, or other "
433 "errors. See the syslog for more "
434 "information.\n", mgc->obd_name, logname,
437 /* class_obd_list(); */
441 /* Stop watching this config log for updates */
442 int lustre_end_log(struct super_block *sb, char *logname,
443 struct config_llog_instance *cfg)
445 struct lustre_cfg *lcfg;
446 struct lustre_cfg_bufs bufs;
447 struct lustre_sb_info *lsi = s2lsi(sb);
448 struct obd_device *mgc = lsi->lsi_mgc;
455 /* mgc_process_config */
456 lustre_cfg_bufs_reset(&bufs, mgc->obd_name);
457 lustre_cfg_bufs_set_string(&bufs, 1, logname);
459 lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg));
460 lcfg = lustre_cfg_new(LCFG_LOG_END, &bufs);
461 rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
462 lustre_cfg_free(lcfg);
466 /**************** obd start *******************/
468 /** lustre_cfg_bufs are a holdover from 1.4; we can still set these up from
469 * lctl (and do for echo cli/srv).
471 int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
472 char *s1, char *s2, char *s3, char *s4)
474 struct lustre_cfg_bufs bufs;
475 struct lustre_cfg * lcfg = NULL;
478 CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname,
479 cmd, s1, s2, s3, s4);
481 lustre_cfg_bufs_reset(&bufs, cfgname);
483 lustre_cfg_bufs_set_string(&bufs, 1, s1);
485 lustre_cfg_bufs_set_string(&bufs, 2, s2);
487 lustre_cfg_bufs_set_string(&bufs, 3, s3);
489 lustre_cfg_bufs_set_string(&bufs, 4, s4);
491 lcfg = lustre_cfg_new(cmd, &bufs);
492 lcfg->lcfg_nid = nid;
493 rc = class_process_config(lcfg);
494 lustre_cfg_free(lcfg);
498 /** Call class_attach and class_setup. These methods in turn call
499 * obd type-specific methods.
501 static int lustre_start_simple(char *obdname, char *type, char *uuid,
505 CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type);
507 rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, 0, 0);
509 CERROR("%s attach error %d\n", obdname, rc);
512 rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, 0, 0);
514 CERROR("%s setup error %d\n", obdname, rc);
515 do_lcfg(obdname, 0, LCFG_DETACH, 0, 0, 0, 0);
520 /* Set up a MGS to serve startup logs */
521 static int server_start_mgs(struct super_block *sb)
523 struct lustre_sb_info *lsi = s2lsi(sb);
524 struct vfsmount *mnt = lsi->lsi_srv_mnt;
525 struct lustre_mount_info *lmi;
530 /* It is impossible to have more than 1 MGS per node, since
531 MGC wouldn't know which to connect to */
532 lmi = server_find_mount(LUSTRE_MGS_OBDNAME);
534 lsi = s2lsi(lmi->lmi_sb);
535 LCONSOLE_ERROR_MSG(0x15d, "The MGS service was already started"
537 lsi->lsi_ldd->ldd_svname);
541 CDEBUG(D_CONFIG, "Start MGS service %s\n", LUSTRE_MGS_OBDNAME);
543 rc = server_register_mount(LUSTRE_MGS_OBDNAME, sb, mnt);
546 rc = lustre_start_simple(LUSTRE_MGS_OBDNAME, LUSTRE_MGS_NAME,
547 LUSTRE_MGS_OBDNAME, 0, 0);
548 /* Do NOT call server_deregister_mount() here. This leads to
549 * an inability to clean up and free lsi and other stuff when
550 * mgs calls server_put_mount() in error handling case. -umka */
554 LCONSOLE_ERROR_MSG(0x15e, "Failed to start MGS '%s' (%d). "
555 "Is the 'mgs' module loaded?\n",
556 LUSTRE_MGS_OBDNAME, rc);
560 static int server_stop_mgs(struct super_block *sb)
562 struct obd_device *obd;
566 CDEBUG(D_MOUNT, "Stop MGS service %s\n", LUSTRE_MGS_OBDNAME);
568 /* There better be only one MGS */
569 obd = class_name2obd(LUSTRE_MGS_OBDNAME);
571 CDEBUG(D_CONFIG, "mgs %s not running\n", LUSTRE_MGS_OBDNAME);
575 /* The MGS should always stop when we say so */
577 rc = class_manual_cleanup(obd);
581 CFS_DECLARE_MUTEX(mgc_start_lock);
583 /** Set up a mgc obd to process startup logs
585 * \param sb [in] super block of the mgc obd
587 * \retval 0 success, otherwise error code
589 static int lustre_start_mgc(struct super_block *sb)
591 struct obd_connect_data *data = NULL;
592 struct lustre_sb_info *lsi = s2lsi(sb);
593 struct obd_device *obd;
594 struct obd_export *exp;
595 struct obd_uuid *uuid;
598 char *mgcname, *niduuid, *mgssec;
601 int rc = 0, i = 0, j, len;
604 LASSERT(lsi->lsi_lmd);
606 /* Find the first non-lo MGS nid for our MGC name */
607 if (lsi->lsi_flags & LSI_SERVER) {
608 ptr = lsi->lsi_ldd->ldd_params;
609 /* Use mgsnode= nids */
610 if ((class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0) &&
611 (class_parse_nid(ptr, &nid, &ptr) == 0)) {
613 } else if (IS_MGS(lsi->lsi_ldd)) {
614 lnet_process_id_t id;
615 while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
616 if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
623 } else { /* client */
624 /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
625 ptr = lsi->lsi_lmd->lmd_dev;
626 if (class_parse_nid(ptr, &nid, &ptr) == 0)
630 CERROR("No valid MGS nids found.\n");
634 len = strlen(LUSTRE_MGC_OBDNAME) + strlen(libcfs_nid2str(nid)) + 1;
635 OBD_ALLOC(mgcname, len);
636 OBD_ALLOC(niduuid, len + 2);
637 if (!mgcname || !niduuid)
638 GOTO(out_free, rc = -ENOMEM);
639 sprintf(mgcname, "%s%s", LUSTRE_MGC_OBDNAME, libcfs_nid2str(nid));
641 mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : "";
643 cfs_mutex_down(&mgc_start_lock);
645 obd = class_name2obd(mgcname);
646 if (obd && !obd->obd_stopping) {
647 rc = obd_set_info_async(obd->obd_self_export,
648 strlen(KEY_MGSSEC), KEY_MGSSEC,
649 strlen(mgssec), mgssec, NULL);
653 /* Re-using an existing MGC */
654 cfs_atomic_inc(&obd->u.cli.cl_mgc_refcount);
657 /* If we are restarting the MGS, don't try to keep the MGC's
658 old connection, or registration will fail. */
659 if ((lsi->lsi_flags & LSI_SERVER) && IS_MGS(lsi->lsi_ldd)) {
660 CDEBUG(D_MOUNT, "New MGS with live MGC\n");
664 /* Try all connections, but only once (again).
665 We don't want to block another target from starting
666 (using its local copy of the log), but we do want to connect
667 if at all possible. */
669 CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname,recov_bk);
670 rc = obd_set_info_async(obd->obd_self_export,
671 sizeof(KEY_INIT_RECOV_BACKUP),
672 KEY_INIT_RECOV_BACKUP,
673 sizeof(recov_bk), &recov_bk, NULL);
677 CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname);
679 /* Add the primary nids for the MGS */
681 sprintf(niduuid, "%s_%x", mgcname, i);
682 if (lsi->lsi_flags & LSI_SERVER) {
683 ptr = lsi->lsi_ldd->ldd_params;
684 if (IS_MGS(lsi->lsi_ldd)) {
685 /* Use local nids (including LO) */
686 lnet_process_id_t id;
687 while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
688 rc = do_lcfg(mgcname, id.nid,
689 LCFG_ADD_UUID, niduuid, 0,0,0);
692 /* Use mgsnode= nids */
693 if (class_find_param(ptr, PARAM_MGSNODE, &ptr) != 0) {
694 CERROR("No MGS nids given.\n");
695 GOTO(out_free, rc = -EINVAL);
697 while (class_parse_nid(ptr, &nid, &ptr) == 0) {
698 rc = do_lcfg(mgcname, nid,
699 LCFG_ADD_UUID, niduuid, 0,0,0);
703 } else { /* client */
704 /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
705 ptr = lsi->lsi_lmd->lmd_dev;
706 while (class_parse_nid(ptr, &nid, &ptr) == 0) {
707 rc = do_lcfg(mgcname, nid,
708 LCFG_ADD_UUID, niduuid, 0,0,0);
710 /* Stop at the first failover nid */
716 CERROR("No valid MGS nids found.\n");
717 GOTO(out_free, rc = -EINVAL);
719 lsi->lsi_lmd->lmd_mgs_failnodes = 1;
721 /* Random uuid for MGC allows easier reconnects */
723 ll_generate_random_uuid(uuidc);
724 class_uuid_unparse(uuidc, uuid);
727 rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME,
728 (char *)uuid->uuid, LUSTRE_MGS_OBDNAME,
734 /* Add any failover MGS nids */
736 while ((*ptr == ':' ||
737 class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0)) {
738 /* New failover node */
739 sprintf(niduuid, "%s_%x", mgcname, i);
741 while (class_parse_nid(ptr, &nid, &ptr) == 0) {
743 rc = do_lcfg(mgcname, nid,
744 LCFG_ADD_UUID, niduuid, 0,0,0);
749 rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN,
757 lsi->lsi_lmd->lmd_mgs_failnodes = i;
759 obd = class_name2obd(mgcname);
761 CERROR("Can't find mgcobd %s\n", mgcname);
762 GOTO(out_free, rc = -ENOTCONN);
765 rc = obd_set_info_async(obd->obd_self_export,
766 strlen(KEY_MGSSEC), KEY_MGSSEC,
767 strlen(mgssec), mgssec, NULL);
771 /* Keep a refcount of servers/clients who started with "mount",
772 so we know when we can get rid of the mgc. */
773 cfs_atomic_set(&obd->u.cli.cl_mgc_refcount, 1);
775 /* Try all connections, but only once. */
777 rc = obd_set_info_async(obd->obd_self_export,
778 sizeof(KEY_INIT_RECOV_BACKUP),
779 KEY_INIT_RECOV_BACKUP,
780 sizeof(recov_bk), &recov_bk, NULL);
783 CWARN("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc);
784 /* We connect to the MGS at setup, and don't disconnect until cleanup */
787 GOTO(out, rc = -ENOMEM);
788 data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_FID |
789 OBD_CONNECT_AT | OBD_CONNECT_FULL20;
790 data->ocd_version = LUSTRE_VERSION_CODE;
791 rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL);
794 CERROR("connect failed %d\n", rc);
798 obd->u.cli.cl_mgc_mgsexp = exp;
801 /* Keep the mgc info in the sb. Note that many lsi's can point
805 cfs_mutex_up(&mgc_start_lock);
808 OBD_FREE(mgcname, len);
810 OBD_FREE(niduuid, len + 2);
814 static int lustre_stop_mgc(struct super_block *sb)
816 struct lustre_sb_info *lsi = s2lsi(sb);
817 struct obd_device *obd;
818 char *niduuid = 0, *ptr = 0;
819 int i, rc = 0, len = 0;
829 cfs_mutex_down(&mgc_start_lock);
830 LASSERT(cfs_atomic_read(&obd->u.cli.cl_mgc_refcount) > 0);
831 if (!cfs_atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) {
832 /* This is not fatal, every client that stops
833 will call in here. */
834 CDEBUG(D_MOUNT, "mgc still has %d references.\n",
835 cfs_atomic_read(&obd->u.cli.cl_mgc_refcount));
836 GOTO(out, rc = -EBUSY);
839 /* The MGC has no recoverable data in any case.
840 * force shutdown set in umount_begin */
841 obd->obd_no_recov = 1;
843 if (obd->u.cli.cl_mgc_mgsexp) {
844 /* An error is not fatal, if we are unable to send the
845 disconnect mgs ping evictor cleans up the export */
846 rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp);
848 CDEBUG(D_MOUNT, "disconnect failed %d\n", rc);
851 /* Save the obdname for cleaning the nid uuids, which are
853 len = strlen(obd->obd_name) + 6;
854 OBD_ALLOC(niduuid, len);
856 strcpy(niduuid, obd->obd_name);
857 ptr = niduuid + strlen(niduuid);
860 rc = class_manual_cleanup(obd);
864 /* Clean the nid uuids */
866 GOTO(out, rc = -ENOMEM);
868 for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) {
869 sprintf(ptr, "_%x", i);
870 rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID,
873 CERROR("del MDC UUID %s failed: rc = %d\n",
878 OBD_FREE(niduuid, len);
880 /* class_import_put will get rid of the additional connections */
881 cfs_mutex_up(&mgc_start_lock);
885 /* Since there's only one mgc per node, we have to change its fs to get
886 access to the right disk. */
887 static int server_mgc_set_fs(struct obd_device *mgc, struct super_block *sb)
889 struct lustre_sb_info *lsi = s2lsi(sb);
893 CDEBUG(D_MOUNT, "Set mgc disk for %s\n", lsi->lsi_lmd->lmd_dev);
895 /* cl_mgc_sem in mgc ensures we sleep if the mgc_fs is busy */
896 rc = obd_set_info_async(mgc->obd_self_export,
897 sizeof(KEY_SET_FS), KEY_SET_FS,
898 sizeof(*sb), sb, NULL);
900 CERROR("can't set_fs %d\n", rc);
906 static int server_mgc_clear_fs(struct obd_device *mgc)
911 CDEBUG(D_MOUNT, "Unassign mgc disk\n");
913 rc = obd_set_info_async(mgc->obd_self_export,
914 sizeof(KEY_CLEAR_FS), KEY_CLEAR_FS,
919 CFS_DECLARE_MUTEX(server_start_lock);
921 /* Stop MDS/OSS if nobody is using them */
922 static int server_stop_servers(int lddflags, int lsiflags)
924 struct obd_device *obd = NULL;
925 struct obd_type *type = NULL;
929 cfs_mutex_down(&server_start_lock);
931 /* Either an MDT or an OST or neither */
932 /* if this was an MDT, and there are no more MDT's, clean up the MDS */
933 if ((lddflags & LDD_F_SV_TYPE_MDT) &&
934 (obd = class_name2obd(LUSTRE_MDS_OBDNAME))) {
935 /*FIXME pre-rename, should eventually be LUSTRE_MDT_NAME*/
936 type = class_search_type(LUSTRE_MDS_NAME);
938 /* if this was an OST, and there are no more OST's, clean up the OSS */
939 if ((lddflags & LDD_F_SV_TYPE_OST) &&
940 (obd = class_name2obd(LUSTRE_OSS_OBDNAME))) {
941 type = class_search_type(LUSTRE_OST_NAME);
944 if (obd && (!type || !type->typ_refcnt)) {
947 /* obd_fail doesn't mean much on a server obd */
948 err = class_manual_cleanup(obd);
953 cfs_mutex_up(&server_start_lock);
958 int server_mti_print(char *title, struct mgs_target_info *mti)
960 PRINT_CMD(PRINT_MASK, "mti %s\n", title);
961 PRINT_CMD(PRINT_MASK, "server: %s\n", mti->mti_svname);
962 PRINT_CMD(PRINT_MASK, "fs: %s\n", mti->mti_fsname);
963 PRINT_CMD(PRINT_MASK, "uuid: %s\n", mti->mti_uuid);
964 PRINT_CMD(PRINT_MASK, "ver: %d flags: %#x\n",
965 mti->mti_config_ver, mti->mti_flags);
969 static int server_sb2mti(struct super_block *sb, struct mgs_target_info *mti)
971 struct lustre_sb_info *lsi = s2lsi(sb);
972 struct lustre_disk_data *ldd = lsi->lsi_ldd;
973 lnet_process_id_t id;
977 if (!(lsi->lsi_flags & LSI_SERVER))
980 strncpy(mti->mti_fsname, ldd->ldd_fsname,
981 sizeof(mti->mti_fsname));
982 strncpy(mti->mti_svname, ldd->ldd_svname,
983 sizeof(mti->mti_svname));
985 mti->mti_nid_count = 0;
986 while (LNetGetId(i++, &id) != -ENOENT) {
987 if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
990 /* server use --servicenode param, only allow specified
991 * nids be registered */
992 if ((ldd->ldd_flags & LDD_F_NO_PRIMNODE) != 0 &&
993 class_match_nid(ldd->ldd_params,
994 PARAM_FAILNODE, id.nid) < 1)
997 /* match specified network */
998 if (!class_match_net(ldd->ldd_params,
999 PARAM_NETWORK, LNET_NIDNET(id.nid)))
1002 mti->mti_nids[mti->mti_nid_count] = id.nid;
1003 mti->mti_nid_count++;
1004 if (mti->mti_nid_count >= MTI_NIDS_MAX) {
1005 CWARN("Only using first %d nids for %s\n",
1006 mti->mti_nid_count, mti->mti_svname);
1011 mti->mti_lustre_ver = LUSTRE_VERSION_CODE;
1012 mti->mti_config_ver = 0;
1013 if (lsi->lsi_lmd->lmd_flags & LMD_FLG_WRITECONF)
1014 ldd->ldd_flags |= LDD_F_WRITECONF;
1015 mti->mti_flags = ldd->ldd_flags;
1016 mti->mti_stripe_index = ldd->ldd_svindex;
1017 memcpy(mti->mti_uuid, ldd->ldd_uuid, sizeof(mti->mti_uuid));
1018 if (strlen(ldd->ldd_params) > sizeof(mti->mti_params)) {
1019 CERROR("params too big for mti\n");
1022 memcpy(mti->mti_params, ldd->ldd_params, sizeof(mti->mti_params));
1026 /* Register an old or new target with the MGS. If needed MGS will construct
1027 startup logs and assign index */
1028 int server_register_target(struct super_block *sb)
1030 struct lustre_sb_info *lsi = s2lsi(sb);
1031 struct obd_device *mgc = lsi->lsi_mgc;
1032 struct lustre_disk_data *ldd = lsi->lsi_ldd;
1033 struct mgs_target_info *mti = NULL;
1039 if (!(lsi->lsi_flags & LSI_SERVER))
1045 rc = server_sb2mti(sb, mti);
1049 CDEBUG(D_MOUNT, "Registration %s, fs=%s, %s, index=%04x, flags=%#x\n",
1050 mti->mti_svname, mti->mti_fsname,
1051 libcfs_nid2str(mti->mti_nids[0]), mti->mti_stripe_index,
1054 /* Register the target */
1055 /* FIXME use mgc_process_config instead */
1056 rc = obd_set_info_async(mgc->u.cli.cl_mgc_mgsexp,
1057 sizeof(KEY_REGISTER_TARGET), KEY_REGISTER_TARGET,
1058 sizeof(*mti), mti, NULL);
1062 /* Always update our flags */
1063 ldd->ldd_flags = mti->mti_flags & ~LDD_F_REWRITE_LDD;
1065 /* If this flag is set, it means the MGS wants us to change our
1066 on-disk data. (So far this means just the index.) */
1067 if (mti->mti_flags & LDD_F_REWRITE_LDD) {
1070 CDEBUG(D_MOUNT, "Changing on-disk index from %#x to %#x "
1071 "for %s\n", ldd->ldd_svindex, mti->mti_stripe_index,
1073 ldd->ldd_svindex = mti->mti_stripe_index;
1074 strncpy(ldd->ldd_svname, mti->mti_svname,
1075 sizeof(ldd->ldd_svname));
1076 /* or ldd_make_sv_name(ldd); */
1077 ldd_write(&mgc->obd_lvfs_ctxt, ldd);
1078 err = fsfilt_set_label(mgc, lsi->lsi_srv_mnt->mnt_sb,
1081 CERROR("Label set error %d\n", err);
1082 label = fsfilt_get_label(mgc, lsi->lsi_srv_mnt->mnt_sb);
1084 CDEBUG(D_MOUNT, "Disk label changed to %s\n", label);
1086 /* Flush the new ldd to disk */
1087 fsfilt_sync(mgc, lsi->lsi_srv_mnt->mnt_sb);
1096 /** Start server targets: MDTs and OSTs
1098 static int server_start_targets(struct super_block *sb, struct vfsmount *mnt)
1100 struct obd_device *obd;
1101 struct lustre_sb_info *lsi = s2lsi(sb);
1102 struct config_llog_instance cfg;
1106 CDEBUG(D_MOUNT, "starting target %s\n", lsi->lsi_ldd->ldd_svname);
1109 /* If we're an MDT, make sure the global MDS is running */
1110 if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_MDT) {
1111 /* make sure the MDS is started */
1112 cfs_mutex_down(&server_start_lock);
1113 obd = class_name2obd(LUSTRE_MDS_OBDNAME);
1115 rc = lustre_start_simple(LUSTRE_MDS_OBDNAME,
1116 /* FIXME pre-rename, should eventually be LUSTRE_MDS_NAME */
1118 LUSTRE_MDS_OBDNAME"_uuid",
1121 cfs_mutex_up(&server_start_lock);
1122 CERROR("failed to start MDS: %d\n", rc);
1126 cfs_mutex_up(&server_start_lock);
1130 /* If we're an OST, make sure the global OSS is running */
1131 if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_OST) {
1132 /* make sure OSS is started */
1133 cfs_mutex_down(&server_start_lock);
1134 obd = class_name2obd(LUSTRE_OSS_OBDNAME);
1136 rc = lustre_start_simple(LUSTRE_OSS_OBDNAME,
1138 LUSTRE_OSS_OBDNAME"_uuid",
1141 cfs_mutex_up(&server_start_lock);
1142 CERROR("failed to start OSS: %d\n", rc);
1146 cfs_mutex_up(&server_start_lock);
1149 /* Set the mgc fs to our server disk. This allows the MGC to
1150 * read and write configs locally, in case it can't talk to the MGS. */
1151 rc = server_mgc_set_fs(lsi->lsi_mgc, sb);
1155 /* Register with MGS */
1156 rc = server_register_target(sb);
1157 if (rc && (lsi->lsi_ldd->ldd_flags &
1158 (LDD_F_NEED_INDEX | LDD_F_UPDATE | LDD_F_UPGRADE14))){
1159 CERROR("Required registration failed for %s: %d\n",
1160 lsi->lsi_ldd->ldd_svname, rc);
1162 LCONSOLE_ERROR_MSG(0x15f, "Communication error with "
1163 "the MGS. Is the MGS running?\n");
1167 if (rc == -EINVAL) {
1168 LCONSOLE_ERROR_MSG(0x160, "The MGS is refusing to allow this "
1169 "server (%s) to start. Please see messages"
1170 " on the MGS node.\n",
1171 lsi->lsi_ldd->ldd_svname);
1174 /* non-fatal error of registration with MGS */
1176 CDEBUG(D_MOUNT, "Cannot register with MGS: %d\n", rc);
1178 /* Let the target look up the mount using the target's name
1179 (we can't pass the sb or mnt through class_process_config.) */
1180 rc = server_register_mount(lsi->lsi_ldd->ldd_svname, sb, mnt);
1184 /* Start targets using the llog named for the target */
1185 memset(&cfg, 0, sizeof(cfg));
1186 rc = lustre_process_log(sb, lsi->lsi_ldd->ldd_svname, &cfg);
1188 CERROR("failed to start server %s: %d\n",
1189 lsi->lsi_ldd->ldd_svname, rc);
1190 /* Do NOT call server_deregister_mount() here. This makes it
1191 * impossible to find mount later in cleanup time and leaves
1192 * @lsi and other stuff leaked. -umka */
1197 /* Release the mgc fs for others to use */
1198 server_mgc_clear_fs(lsi->lsi_mgc);
1201 obd = class_name2obd(lsi->lsi_ldd->ldd_svname);
1203 CERROR("no server named %s was started\n",
1204 lsi->lsi_ldd->ldd_svname);
1208 if ((lsi->lsi_lmd->lmd_flags & LMD_FLG_ABORT_RECOV) &&
1209 (OBP(obd, iocontrol))) {
1210 obd_iocontrol(OBD_IOC_ABORT_RECOVERY,
1211 obd->obd_self_export, 0, NULL, NULL);
1214 /* log has been fully processed */
1215 obd_notify(obd, NULL, OBD_NOTIFY_CONFIG, (void *)CONFIG_LOG);
1221 /***************** lustre superblock **************/
1223 struct lustre_sb_info *lustre_init_lsi(struct super_block *sb)
1225 struct lustre_sb_info *lsi;
1231 OBD_ALLOC_PTR(lsi->lsi_lmd);
1232 if (!lsi->lsi_lmd) {
1237 lsi->lsi_lmd->lmd_exclude_count = 0;
1238 lsi->lsi_lmd->lmd_recovery_time_soft = 0;
1239 lsi->lsi_lmd->lmd_recovery_time_hard = 0;
1240 s2lsi_nocast(sb) = lsi;
1241 /* we take 1 extra ref for our setup */
1242 cfs_atomic_set(&lsi->lsi_mounts, 1);
1244 /* Default umount style */
1245 lsi->lsi_flags = LSI_UMOUNT_FAILOVER;
1250 static int lustre_free_lsi(struct super_block *sb)
1252 struct lustre_sb_info *lsi = s2lsi(sb);
1255 LASSERT(lsi != NULL);
1256 CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi);
1258 /* someone didn't call server_put_mount. */
1259 LASSERT(cfs_atomic_read(&lsi->lsi_mounts) == 0);
1261 if (lsi->lsi_ldd != NULL)
1262 OBD_FREE(lsi->lsi_ldd, sizeof(*lsi->lsi_ldd));
1264 if (lsi->lsi_lmd != NULL) {
1265 if (lsi->lsi_lmd->lmd_dev != NULL)
1266 OBD_FREE(lsi->lsi_lmd->lmd_dev,
1267 strlen(lsi->lsi_lmd->lmd_dev) + 1);
1268 if (lsi->lsi_lmd->lmd_profile != NULL)
1269 OBD_FREE(lsi->lsi_lmd->lmd_profile,
1270 strlen(lsi->lsi_lmd->lmd_profile) + 1);
1271 if (lsi->lsi_lmd->lmd_mgssec != NULL)
1272 OBD_FREE(lsi->lsi_lmd->lmd_mgssec,
1273 strlen(lsi->lsi_lmd->lmd_mgssec) + 1);
1274 if (lsi->lsi_lmd->lmd_opts != NULL)
1275 OBD_FREE(lsi->lsi_lmd->lmd_opts,
1276 strlen(lsi->lsi_lmd->lmd_opts) + 1);
1277 if (lsi->lsi_lmd->lmd_exclude_count)
1278 OBD_FREE(lsi->lsi_lmd->lmd_exclude,
1279 sizeof(lsi->lsi_lmd->lmd_exclude[0]) *
1280 lsi->lsi_lmd->lmd_exclude_count);
1281 OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd));
1284 LASSERT(lsi->lsi_llsbi == NULL);
1285 OBD_FREE(lsi, sizeof(*lsi));
1286 s2lsi_nocast(sb) = NULL;
1291 /* The lsi has one reference for every server that is using the disk -
1292 e.g. MDT, MGS, and potentially MGC */
1293 static int lustre_put_lsi(struct super_block *sb)
1295 struct lustre_sb_info *lsi = s2lsi(sb);
1298 LASSERT(lsi != NULL);
1300 CDEBUG(D_MOUNT, "put %p %d\n", sb, cfs_atomic_read(&lsi->lsi_mounts));
1301 if (cfs_atomic_dec_and_test(&lsi->lsi_mounts)) {
1302 lustre_free_lsi(sb);
1308 /*************** server mount ******************/
1310 /** Kernel mount using mount options in MOUNT_DATA_FILE.
1311 * Since this file lives on the disk, we pre-mount using a common
1312 * type, read the file, then re-mount using the type specified in the
// Steps visible in this function:
//   1. allocate a lustre_disk_data (ldd) and one page used as the option
//      string buffer;
//   2. pre-mount the device as "ldiskfs" with the mount-line options;
//   3. run ldd_parse() in a run context rooted at that mount to fill ldd;
//   4. rebuild the option string from ldd_mount_opts plus the mount-line
//      options and mount again with the type given by MT_STR(ldd).
// On success the ldd is handed to lsi->lsi_ldd and the page is released.
// The error path frees both the page and the ldd, clears lsi->lsi_ldd and
// returns ERR_PTR(rc).
1315 static struct vfsmount *server_kernel_mount(struct super_block *sb)
1317 struct lvfs_run_ctxt mount_ctxt;
1318 struct lustre_sb_info *lsi = s2lsi(sb);
1319 struct lustre_disk_data *ldd;
1320 struct lustre_mount_data *lmd = lsi->lsi_lmd;
1321 struct vfsmount *mnt;
1322 char *options = NULL;
1323 unsigned long page, s_flags;
1324 struct page *__page;
1328 OBD_ALLOC(ldd, sizeof(*ldd));
1330 RETURN(ERR_PTR(-ENOMEM));
1332 /* In the past, we have always used flags = 0.
1333 Note ext3/ldiskfs can't be mounted ro. */
1334 s_flags = sb->s_flags;
1336 /* allocate memory for options */
1337 OBD_PAGE_ALLOC(__page, CFS_ALLOC_STD);
1339 GOTO(out_free, rc = -ENOMEM);
1340 page = (unsigned long)cfs_page_address(__page);
1341 options = (char *)page;
// The buffer is zeroed, so the strncat below appends to an empty string.
1342 memset(options, 0, CFS_PAGE_SIZE);
1344 /* mount-line options must be added for pre-mount because it may
1345 * contain mount options such as journal_dev which are required
1346 * to successfully mount the underlying filesystem */
1347 if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0))
1348 strncat(options, lmd->lmd_opts, CFS_PAGE_SIZE - 1);
1350 /* Pre-mount ldiskfs to read the MOUNT_DATA_FILE */
1351 CDEBUG(D_MOUNT, "Pre-mount ldiskfs %s\n", lmd->lmd_dev);
1352 mnt = ll_kern_mount("ldiskfs", s_flags, lmd->lmd_dev, (void *)options);
1355 CERROR("premount %s:%#lx ldiskfs failed: %d "
1356 "Is the ldiskfs module available?\n",
1357 lmd->lmd_dev, s_flags, rc );
// Build a run context whose working directory is the root of the pre-mount
// so that ldd_parse() resolves its file on that filesystem.
1361 OBD_SET_CTXT_MAGIC(&mount_ctxt);
1362 mount_ctxt.pwdmnt = mnt;
1363 mount_ctxt.pwd = mnt->mnt_root;
1364 mount_ctxt.fs = get_ds();
1366 rc = ldd_parse(&mount_ctxt, ldd);
1370 CERROR("premount parse options failed: rc = %d\n", rc);
1374 /* Done with our pre-mount, now do the real mount. */
1376 /* Glom up mount options */
// Re-zero the page first: strncpy copies at most CFS_PAGE_SIZE - 2 bytes,
// so the result stays NUL-terminated with one byte to spare.
1377 memset(options, 0, CFS_PAGE_SIZE);
1378 strncpy(options, ldd->ldd_mount_opts, CFS_PAGE_SIZE - 2);
1380 /* Add in any mount-line options */
1381 if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0)) {
// len reserves room for the "," separator and the terminating NUL.
1382 int len = CFS_PAGE_SIZE - strlen(options) - 2;
1384 strcat(options, ",");
1385 strncat(options, lmd->lmd_opts, len);
1388 /* Special permanent mount flags */
1390 s_flags |= MS_NOATIME | MS_NODIRATIME;
1392 CDEBUG(D_MOUNT, "kern_mount: %s %s %s\n",
1393 MT_STR(ldd), lmd->lmd_dev, options);
1394 mnt = ll_kern_mount(MT_STR(ldd), s_flags, lmd->lmd_dev,
1398 CERROR("ll_kern_mount failed: rc = %d\n", rc);
// With the abort_recov mount flag set, LAST_RCVD on the new mount is
// truncated (remaining arguments of the call are not visible here).
1402 if (lmd->lmd_flags & LMD_FLG_ABORT_RECOV)
1403 simple_truncate(mnt->mnt_sb->s_root, mnt, LAST_RCVD,
1406 OBD_PAGE_FREE(__page);
1407 lsi->lsi_ldd = ldd; /* freed at lsi cleanup */
1408 CDEBUG(D_SUPER, "%s: mnt = %p\n", lmd->lmd_dev, mnt);
// Error exit: release the option page and the ldd, leave no ldd on the lsi.
1413 OBD_PAGE_FREE(__page);
1414 OBD_FREE(ldd, sizeof(*ldd));
1415 lsi->lsi_ldd = NULL;
1416 RETURN(ERR_PTR(rc));
1419 /** Wait here forever until the mount refcount is 0 before completing umount,
1420 * else we risk dereferencing a null pointer.
1421 * LNET may take e.g. 165s before killing zombies.
// Implementation notes based on the visible code:
//   - the loop runs while mnt->mnt_count is above 1 and each wait completes
//     once the count equals 1, i.e. one remaining reference is expected;
//   - each wait is bounded by cfs_time_seconds(3);
//   - a console warning is printed whenever the "waited" counter is a
//     non-zero multiple of 30;
//   - an emergency console message reports an interrupted umount together
//     with the remaining reference count.
1423 static void server_wait_finished(struct vfsmount *mnt)
1427 cfs_sigset_t blocked;
1429 cfs_waitq_init(&waitq);
1431 while (atomic_read(&mnt->mnt_count) > 1) {
1432 if (waited && (waited % 30 == 0))
1433 LCONSOLE_WARN("Mount still busy with %d refs after "
1435 atomic_read(&mnt->mnt_count),
1437 /* Cannot use l_event_wait() for an interruptible sleep. */
// Presumably blocks every signal except SIGKILL for the duration of the
// wait (TODO confirm cfs_block_sigsinv semantics); the saved mask is
// restored by cfs_block_sigs() right after.
1439 blocked = cfs_block_sigsinv(sigmask(SIGKILL));
1440 cfs_waitq_wait_event_interruptible_timeout(
1442 (atomic_read(&mnt->mnt_count) == 1),
1443 cfs_time_seconds(3),
1445 cfs_block_sigs(blocked);
1447 LCONSOLE_EMERG("Danger: interrupted umount %s with "
1448 "%d refs!\n", mnt->mnt_devname,
1449 atomic_read(&mnt->mnt_count));
1456 /** Start the shutdown of servers at umount.
// Order of operations visible below:
//   1. copy the service name, because the lsi it lives in is cleaned up
//      before the final console message is printed;
//   2. stop the MDT/OST target unless the mount used the nosvc flag;
//   3. stop the MGS if this disk holds one and nomgs was not given;
//   4. lustre_common_put_super(): clean the MGC and the sb;
//   5. wait until the backing mount is no longer in use;
//   6. stop the server services via server_stop_servers();
//   7. clean up an orphaned obd whose name was saved in extraname.
// lddflags, lsiflags and mnt are captured up front because they are used
// after lustre_common_put_super() has run.
1458 static void server_put_super(struct super_block *sb)
1460 struct lustre_sb_info *lsi = s2lsi(sb);
1461 struct obd_device *obd;
1462 struct vfsmount *mnt = lsi->lsi_srv_mnt;
1463 char *tmpname, *extraname = NULL;
1465 int lddflags = lsi->lsi_ldd->ldd_flags;
1466 int lsiflags = lsi->lsi_flags;
1469 LASSERT(lsiflags & LSI_SERVER);
1471 tmpname_sz = strlen(lsi->lsi_ldd->ldd_svname) + 1;
// NOTE(review): the allocation result is used by the memcpy on the very
// next line with no NULL check in between; an allocation failure would be
// dereferenced here. Confirm and guard.
1472 OBD_ALLOC(tmpname, tmpname_sz);
1473 memcpy(tmpname, lsi->lsi_ldd->ldd_svname, tmpname_sz);
1474 CDEBUG(D_MOUNT, "server put_super %s\n", tmpname);
// An MDT mounted with nosvc is reported as "MGS" in the final message.
// snprintf is bounded by tmpname_sz, so a shorter service name truncates it.
1475 if (IS_MDT(lsi->lsi_ldd) && (lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC))
1476 snprintf(tmpname, tmpname_sz, "MGS");
1478 /* Stop the target */
1479 if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
1480 (IS_MDT(lsi->lsi_ldd) || IS_OST(lsi->lsi_ldd))) {
1481 struct lustre_profile *lprof = NULL;
1483 /* tell the mgc to drop the config log */
1484 lustre_end_log(sb, lsi->lsi_ldd->ldd_svname, NULL);
1486 /* COMPAT_146 - profile may get deleted in mgc_cleanup.
1487 If there are any setup/cleanup errors, save the lov
1488 name for safety cleanup later. */
1489 lprof = class_get_profile(lsi->lsi_ldd->ldd_svname);
1490 if (lprof && lprof->lp_dt) {
// NOTE(review): same pattern as tmpname, the strcpy directly follows the
// allocation without a visible NULL check. Confirm and guard.
1491 OBD_ALLOC(extraname, strlen(lprof->lp_dt) + 1);
1492 strcpy(extraname, lprof->lp_dt);
1495 obd = class_name2obd(lsi->lsi_ldd->ldd_svname);
1497 CDEBUG(D_MOUNT, "stopping %s\n", obd->obd_name);
1498 if (lsi->lsi_flags & LSI_UMOUNT_FAILOVER)
1500 /* We can't seem to give an error return code
1501 * to .put_super, so we better make sure we clean up! */
1503 class_manual_cleanup(obd);
// No obd is registered under the service name: log it and drop the mount
// registration for that name instead.
1505 CERROR("no obd %s\n", lsi->lsi_ldd->ldd_svname);
1506 server_deregister_mount(lsi->lsi_ldd->ldd_svname);
1510 /* If they wanted the mgs to stop separately from the mdt, they
1511 should have put it on a different device. */
1512 if (IS_MGS(lsi->lsi_ldd)) {
1513 /* if MDS start with --nomgs, don't stop MGS then */
1514 if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS))
1515 server_stop_mgs(sb);
1518 /* Clean the mgc and sb */
1519 lustre_common_put_super(sb);
1521 /* Wait for the targets to really clean up - can't exit (and let the
1522 sb get destroyed) while the mount is still in use */
1523 server_wait_finished(mnt);
1525 /* drop the One True Mount */
1528 /* Stop the servers (MDS, OSS) if no longer needed. We must wait
1529 until the target is really gone so that our type refcount check
1531 server_stop_servers(lddflags, lsiflags);
1533 /* In case of startup or cleanup err, stop related obds */
1535 obd = class_name2obd(extraname);
1537 CWARN("Cleaning orphaned obd %s\n", extraname);
1539 class_manual_cleanup(obd);
// Sizes passed to OBD_FREE mirror the sizes used at allocation time.
1541 OBD_FREE(extraname, strlen(extraname) + 1);
1544 LCONSOLE_WARN("server umount %s complete\n", tmpname);
1545 OBD_FREE(tmpname, tmpname_sz);
1549 /** Called only for 'umount -f'
// Two prototypes are provided, selected by HAVE_UMOUNTBEGIN_VFSMOUNT: one
// receives the vfsmount plus flags and derives sb from vfsmnt->mnt_sb, the
// other receives the super_block directly.
// Effect: switches the umount style recorded in lsi_flags from
// LSI_UMOUNT_FAILOVER to LSI_UMOUNT_FORCE.
1551 #ifdef HAVE_UMOUNTBEGIN_VFSMOUNT
1552 static void server_umount_begin(struct vfsmount *vfsmnt, int flags)
1554 struct super_block *sb = vfsmnt->mnt_sb;
1556 static void server_umount_begin(struct super_block *sb)
1559 struct lustre_sb_info *lsi = s2lsi(sb);
// Only the vfsmount flavour can tell whether the umount was forced, so the
// MNT_FORCE test exists in that variant only.
1562 #ifdef HAVE_UMOUNTBEGIN_VFSMOUNT
1563 if (!(flags & MNT_FORCE)) {
1569 CDEBUG(D_MOUNT, "umount -f\n");
1570 /* umount = failover
1572 no third way to do non-force, non-failover */
1573 lsi->lsi_flags &= ~LSI_UMOUNT_FAILOVER;
1574 lsi->lsi_flags |= LSI_UMOUNT_FORCE;
// statfs for a server superblock. The prototype depends on
// HAVE_STATFS_DENTRY_PARAM: it takes either the super_block itself or a
// dentry from which sb is taken as dentry->d_sb.
// When the backing mount (lsi_srv_mnt) has a statfs operation the request
// is forwarded to it and f_type is then set to this sb's s_magic.
// Otherwise buf is filled locally from sb (f_type, f_bsize, ... f_namelen).
1578 #ifndef HAVE_STATFS_DENTRY_PARAM
1579 static int server_statfs (struct super_block *sb, cfs_kstatfs_t *buf)
1582 static int server_statfs (struct dentry *dentry, cfs_kstatfs_t *buf)
1584 struct super_block *sb = dentry->d_sb;
1586 struct vfsmount *mnt = s2lsi(sb)->lsi_srv_mnt;
1589 if (mnt && mnt->mnt_sb && mnt->mnt_sb->s_op->statfs) {
// Call the backing filesystem with the argument type its statfs expects.
1590 #ifdef HAVE_STATFS_DENTRY_PARAM
1591 int rc = mnt->mnt_sb->s_op->statfs(mnt->mnt_root, buf);
1593 int rc = mnt->mnt_sb->s_op->statfs(mnt->mnt_sb, buf);
1596 buf->f_type = sb->s_magic;
// Fallback values when no backing statfs is available.
1602 buf->f_type = sb->s_magic;
1603 buf->f_bsize = sb->s_blocksize;
1609 buf->f_namelen = NAME_MAX;
1613 /** The operations we support directly on the superblock:
1614 * mount, umount, and df.
// Installed as sb->s_op by server_fill_super_common().
1616 static struct super_operations server_ops =
1618 .put_super = server_put_super,
1619 .umount_begin = server_umount_begin, /* umount -f */
1620 .statfs = server_statfs,
// log2(n): index of the first zero bit of ~n as reported by cfs_ffz.
// Presumably this is the lowest set bit of n, which equals log2(n) only
// for powers of two such as the 4096 block size it is applied to here.
// TODO confirm cfs_ffz semantics.
1623 #define log2(n) cfs_ffz(~(n))
// Magic number stored in sb->s_magic for server superblocks.
1624 #define LUSTRE_SUPER_MAGIC 0x0BD00BD1
// Fill in the generic superblock fields for a server mount: fixed 4096-byte
// block size, Lustre magic, s_maxbytes of 0, read-only flag and server_ops
// as the operations table. Then create a root inode marked as a directory
// and a root dentry for it. Failures of either step are logged with CERROR.
1626 static int server_fill_super_common(struct super_block *sb)
1628 struct inode *root = 0;
1631 CDEBUG(D_MOUNT, "Server sb, dev=%d\n", (int)sb->s_dev);
1633 sb->s_blocksize = 4096;
1634 sb->s_blocksize_bits = log2(sb->s_blocksize);
1635 sb->s_magic = LUSTRE_SUPER_MAGIC;
1636 sb->s_maxbytes = 0; //PAGE_CACHE_MAXBYTES;
1637 sb->s_flags |= MS_RDONLY;
1638 sb->s_op = &server_ops;
1640 root = new_inode(sb);
1642 CERROR("Can't make root inode\n");
1646 /* returns -EIO for every operation */
1647 /* make_bad_inode(root); -- badness - can't umount */
1648 /* apparently we need to be a directory for the mount to finish */
1649 root->i_mode = S_IFDIR;
1651 sb->s_root = d_alloc_root(root);
1653 CERROR("Can't make root dentry\n");
1661 /** Fill in the superblock info for a Lustre server.
1662 * Mount the device with the correct options.
1663 * Read the on-disk config file.
1664 * Start the services.
// Start-up order visible below: backing mount, duplicate-target check,
// MGS (if present and not disabled by nomgs), MGC, targets (unless nosvc),
// and finally the generic superblock fields. The failure path at the
// bottom tears everything down through server_put_super().
1666 static int server_fill_super(struct super_block *sb)
1668 struct lustre_sb_info *lsi = s2lsi(sb);
1669 struct vfsmount *mnt;
1673 /* the One True Mount */
1674 mnt = server_kernel_mount(sb);
1677 CERROR("Unable to mount device %s: %d\n",
1678 lsi->lsi_lmd->lmd_dev, rc);
1682 lsi->lsi_srv_mnt = mnt;
// server_kernel_mount() stores the parsed disk data in lsi_ldd on success.
1684 LASSERT(lsi->lsi_ldd);
1685 CDEBUG(D_MOUNT, "Found service %s for fs '%s' on device %s\n",
1686 lsi->lsi_ldd->ldd_svname, lsi->lsi_ldd->ldd_fsname,
1687 lsi->lsi_lmd->lmd_dev);
// An obd already registered under this service name means the target is
// running already; report the double mount on the console.
1689 if (class_name2obd(lsi->lsi_ldd->ldd_svname)) {
1690 LCONSOLE_ERROR_MSG(0x161, "The target named %s is already "
1691 "running. Double-mount may have compromised"
1692 " the disk journal.\n",
1693 lsi->lsi_ldd->ldd_svname);
1699 /* Start MGS before MGC */
1700 if (IS_MGS(lsi->lsi_ldd) && !(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)){
1701 rc = server_start_mgs(sb);
1706 /* Start MGC before servers */
1707 rc = lustre_start_mgc(sb);
1711 /* Set up all obd devices for service */
1712 if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
1713 (IS_OST(lsi->lsi_ldd) || IS_MDT(lsi->lsi_ldd))) {
1714 rc = server_start_targets(sb, mnt);
1716 CERROR("Unable to start targets: %d\n", rc);
1719 /* FIXME overmount client here,
1720 or can we just start a client log and client_fill_super on this sb?
1721 We need to make sure server_put_super gets called too - ll_put_super
1722 calls lustre_common_put_super; check there for LSI_SERVER flag,
1724 Probably should start client from new thread so we can return.
1725 Client will not finish until all servers are connected.
1726 Note - MGS-only server does NOT get a client, since there is no
1727 lustre fs associated - the MGS is for all lustre fs's */
1730 rc = server_fill_super_common(sb);
1736 /* We jump here in case of failure while starting targets or MGS.
1737 * In this case we can't just put @mnt and have to do real cleanup
1738 * with stopping targets, etc. */
1739 server_put_super(sb);
1743 /* Get the index from the obd name.
1744 rc = server type, or
1746 if endptr isn't NULL it is set to end of name */
// Parsing rules visible below:
//   - the name is split at its last '-';
//   - if the part after that dash equals LUSTRE_MDC_NAME, the scan moves
//     back to the preceding '-' so the segment before the suffix is used;
//   - the three characters after the dash select the type: "MDT" gives
//     LDD_F_SV_TYPE_MDT, "OST" gives LDD_F_SV_TYPE_OST;
//   - the text after the type is either "all" (type | LDD_F_SV_ALL is
//     returned) or a hexadecimal index parsed with simple_strtoul.
1747 int server_name2index(char *svname, __u32 *idx, char **endptr)
1749 unsigned long index;
1751 char *dash = strrchr(svname, '-');
1755 /* interpret <fsname>-MDTXXXXX-mdc as mdt, the better way is to pass
1756 * in the fsname, then determine the server index */
1757 if (!strcmp(LUSTRE_MDC_NAME, dash + 1)) {
// Empty-bodied loop: walk backwards until the previous '-' or the start of
// the string is reached.
1759 for (; dash > svname && *dash != '-'; dash--);
1764 if (strncmp(dash + 1, "MDT", 3) == 0)
1765 rc = LDD_F_SV_TYPE_MDT;
1766 else if (strncmp(dash + 1, "OST", 3) == 0)
1767 rc = LDD_F_SV_TYPE_OST;
// dash + 4 skips the '-' and the three-letter type tag.
1770 if (strcmp(dash + 4, "all") == 0)
1771 return rc | LDD_F_SV_ALL;
1773 index = simple_strtoul(dash + 4, endptr, 16);
1778 /*************** mount common between server and client ***************/
// Teardown shared by client and server mounts: drop this superblock's
// reference on the MGC. A lustre_stop_mgc() result of 0 or -ENOENT is
// treated as fine; for other results the visible code has two outcomes,
// an error log ("Can't stop MGC") and a debug note that the MGC is still
// in use by another obd. The final step drops a reference on the mounted
// disk (the call itself is not visible in this excerpt).
1781 int lustre_common_put_super(struct super_block *sb)
1786 CDEBUG(D_MOUNT, "dropping sb %p\n", sb);
1788 /* Drop a ref to the MGC */
1789 rc = lustre_stop_mgc(sb);
1790 if (rc && (rc != -ENOENT)) {
1792 CERROR("Can't stop MGC: %d\n", rc);
1795 /* BUSY just means that there's some other obd that
1796 needs the mgc. Let him clean it up. */
1797 CDEBUG(D_MOUNT, "MGC still in use\n");
1799 /* Drop a ref to the mounted disk */
// Dump the parsed mount data through PRINT_CMD at PRINT_MASK level.
// The profile is printed for client mounts only; recovery timeouts are
// printed only when non-zero; every excluded OST index is listed.
1805 static void lmd_print(struct lustre_mount_data *lmd)
1809 PRINT_CMD(PRINT_MASK, " mount data:\n");
1810 if (lmd_is_client(lmd))
1811 PRINT_CMD(PRINT_MASK, "profile: %s\n", lmd->lmd_profile);
1812 PRINT_CMD(PRINT_MASK, "device: %s\n", lmd->lmd_dev);
1813 PRINT_CMD(PRINT_MASK, "flags: %x\n", lmd->lmd_flags);
1816 PRINT_CMD(PRINT_MASK, "options: %s\n", lmd->lmd_opts);
1818 if (lmd->lmd_recovery_time_soft)
1819 PRINT_CMD(PRINT_MASK, "recovery time soft: %d\n",
1820 lmd->lmd_recovery_time_soft);
1822 if (lmd->lmd_recovery_time_hard)
1823 PRINT_CMD(PRINT_MASK, "recovery time hard: %d\n",
1824 lmd->lmd_recovery_time_hard);
1826 for (i = 0; i < lmd->lmd_exclude_count; i++) {
1827 PRINT_CMD(PRINT_MASK, "exclude %d: OST%04x\n", i,
1828 lmd->lmd_exclude[i]);
1832 /* Is this server on the exclusion list */
// @svname is parsed with server_name2index(); names that do not resolve to
// an OST are never excluded. For an OST the index is compared against each
// entry of lmd_exclude and a match is logged with CWARN.
1833 int lustre_check_exclusion(struct super_block *sb, char *svname)
1835 struct lustre_sb_info *lsi = s2lsi(sb);
1836 struct lustre_mount_data *lmd = lsi->lsi_lmd;
1841 rc = server_name2index(svname, &index, NULL);
1842 if (rc != LDD_F_SV_TYPE_OST)
1843 /* Only exclude OSTs */
1846 CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname,
1847 index, lmd->lmd_exclude_count, lmd->lmd_dev);
1849 for(i = 0; i < lmd->lmd_exclude_count; i++) {
1850 if (index == lmd->lmd_exclude[i]) {
1851 CWARN("Excluding %s (on exclusion list)\n", svname);
1858 /* mount -v -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */
// Build lmd->lmd_exclude from the exclude= mount option.
// @ptr points at the '=' of the option. Names are parsed one by one with
// server_name2index(); OST indices are collected in a temporary array and
// then copied into a permanent array sized to the final count.
// NOTE(review): collection starts at the current lmd->lmd_exclude_count,
// which is not reset in the visible code, while the temporary array is
// sized from this option string alone. If exclude= were given twice, the
// first store could land past the end of exclude_list and the earlier
// lmd_exclude array would be overwritten without being freed. Verify.
1859 static int lmd_make_exclusion(struct lustre_mount_data *lmd, char *ptr)
1861 char *s1 = ptr, *s2;
1862 __u32 index, *exclude_list;
1866 /* The shortest an ost name can be is 8 chars: -OST0000.
1867 We don't actually know the fsname at this time, so in fact
1868 a user could specify any fsname. */
// Upper bound on the number of names that fit in the option string.
1869 devmax = strlen(ptr) / 8 + 1;
1871 /* temp storage until we figure out how many we have */
1872 OBD_ALLOC(exclude_list, sizeof(index) * devmax);
1876 /* we enter this fn pointing at the '=' */
1877 while (*s1 && *s1 != ' ' && *s1 != ',') {
1879 rc = server_name2index(s1, &index, &s2);
1881 CERROR("Can't parse server name '%s'\n", s1);
1884 if (rc == LDD_F_SV_TYPE_OST)
1885 exclude_list[lmd->lmd_exclude_count++] = index;
1887 CDEBUG(D_MOUNT, "ignoring exclude %.7s\n", s1);
1889 /* now we are pointing at ':' (next exclude)
1890 or ',' (end of excludes) */
// Capacity check happens after the store above.
1891 if (lmd->lmd_exclude_count >= devmax)
1894 if (rc >= 0) /* non-err */
1897 if (lmd->lmd_exclude_count) {
1898 /* permanent, freed in lustre_free_lsi */
1899 OBD_ALLOC(lmd->lmd_exclude, sizeof(index) *
1900 lmd->lmd_exclude_count);
1901 if (lmd->lmd_exclude) {
1902 memcpy(lmd->lmd_exclude, exclude_list,
1903 sizeof(index) * lmd->lmd_exclude_count);
// Allocation of the permanent array failed: report no exclusions so the
// free path does not try to release a missing array.
1906 lmd->lmd_exclude_count = 0;
1909 OBD_FREE(exclude_list, sizeof(index) * devmax);
// Store the value of the mgssec= option in lmd->lmd_mgssec.
// @ptr points at the first character of the value. The value ends at the
// next ',' or, when there is none, at the end of the string. A previously
// stored value is freed first. The copy is NUL-terminated explicitly
// because memcpy copies exactly `length` bytes.
1913 static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr)
1918 if (lmd->lmd_mgssec != NULL) {
1919 OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1);
1920 lmd->lmd_mgssec = NULL;
1923 tail = strchr(ptr, ',');
1925 length = strlen(ptr);
1927 length = tail - ptr;
1929 OBD_ALLOC(lmd->lmd_mgssec, length + 1);
1930 if (lmd->lmd_mgssec == NULL)
1933 memcpy(lmd->lmd_mgssec, ptr, length);
1934 lmd->lmd_mgssec[length] = '\0';
1938 /** Parse mount line options
1939 * e.g. mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre
1940 * dev is passed as device=uml1:/lustre by mount.lustre
// Summary of the visible behaviour:
//   - rejects missing option data and data that carries the old binary
//     LMD_MAGIC header (both with a console error);
//   - walks the comma/space separated option string, recording the
//     Lustre-specific options in @lmd; recognised options are cut out of
//     the string with memmove so that what remains can be saved as
//     lmd_opts;
//   - requires a device=... option; a device containing ":/" marks a
//     client mount and yields the "<fsname>-client" profile name;
//   - stores copies of the device name and of the remaining options in
//     @lmd (both are released in lustre_free_lsi).
// Note that @options is modified in place.
1942 static int lmd_parse(char *options, struct lustre_mount_data *lmd)
1944 char *s1, *s2, *devname = NULL;
1945 struct lustre_mount_data *raw = (struct lustre_mount_data *)options;
1951 LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that "
1952 "/sbin/mount.lustre is installed.\n");
1956 /* Options should be a string - try to detect old lmd data */
// Only the upper three bytes of the magic are compared.
1957 if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) {
1958 LCONSOLE_ERROR_MSG(0x163, "You're using an old version of "
1959 "/sbin/mount.lustre. Please install "
1960 "version %s\n", LUSTRE_VERSION_STRING);
1963 lmd->lmd_magic = LMD_MAGIC;
1965 /* Set default flags here */
// Lower bound applied to both recovery_time_* options below.
1970 int time_min = 2 * (CONNECTION_SWITCH_MAX +
1971 2 * INITIAL_CONNECT_TIMEOUT);
1973 /* Skip whitespace and extra commas */
1974 while (*s1 == ' ' || *s1 == ',')
1977 /* Client options are parsed in ll_options: eg. flock,
1980 /* Parse non-ldiskfs options here. Rather than modifying
1981 ldiskfs, we just zero these out here */
// Options are matched by prefix only (strncmp with the keyword length).
1982 if (strncmp(s1, "abort_recov", 11) == 0) {
1983 lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
1985 } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) {
// Value is parsed as decimal and raised to time_min if smaller.
1986 lmd->lmd_recovery_time_soft = max_t(int,
1987 simple_strtoul(s1 + 19, NULL, 10), time_min);
1989 } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) {
1990 lmd->lmd_recovery_time_hard = max_t(int,
1991 simple_strtoul(s1 + 19, NULL, 10), time_min);
1993 } else if (strncmp(s1, "nosvc", 5) == 0) {
1994 lmd->lmd_flags |= LMD_FLG_NOSVC;
1996 } else if (strncmp(s1, "nomgs", 5) == 0) {
1997 lmd->lmd_flags |= LMD_FLG_NOMGS;
1999 } else if (strncmp(s1, "writeconf", 9) == 0) {
2000 lmd->lmd_flags |= LMD_FLG_WRITECONF;
2002 } else if (strncmp(s1, "mgssec=", 7) == 0) {
// s1 + 7 is the first character of the value.
2003 rc = lmd_parse_mgssec(lmd, s1 + 7);
2007 /* ost exclusion list */
2008 } else if (strncmp(s1, "exclude=", 8) == 0) {
// s1 + 7 is the '=' itself; lmd_make_exclusion expects to start there.
2009 rc = lmd_make_exclusion(lmd, s1 + 7);
2014 /* Linux 2.4 doesn't pass the device, so we stuck it at the
2015 end of the options. */
2016 else if (strncmp(s1, "device=", 7) == 0) {
2018 /* terminate options right before device. device
2019 must be the last one. */
2025 s2 = strchr(s1, ',');
// Remove the consumed option by shifting the tail of the string (including
// its terminating NUL) over it.
2033 memmove(s1, s2, strlen(s2) + 1);
2039 LCONSOLE_ERROR_MSG(0x164, "Can't find the device name "
2040 "(need mount option 'device=...')\n");
// A ":/" in the device name identifies a client mount.
2044 s1 = strstr(devname, ":/");
2047 lmd->lmd_flags = LMD_FLG_CLIENT;
2048 /* Remove leading /s from fsname */
2049 while (*++s1 == '/') ;
2050 /* Freed in lustre_free_lsi */
// + 8 covers the 7 characters of "-client" plus the terminating NUL.
2051 OBD_ALLOC(lmd->lmd_profile, strlen(s1) + 8);
2052 if (!lmd->lmd_profile)
2054 sprintf(lmd->lmd_profile, "%s-client", s1);
2057 /* Freed in lustre_free_lsi */
2058 OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1);
2061 strcpy(lmd->lmd_dev, devname);
2063 /* Save mount options */
// Scan backwards over trailing commas and spaces of what is left.
2064 s1 = options + strlen(options) - 1;
2065 while (s1 >= options && (*s1 == ',' || *s1 == ' '))
2067 if (*options != 0) {
2068 /* Freed in lustre_free_lsi */
2069 OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1);
2072 strcpy(lmd->lmd_opts, options);
2076 lmd->lmd_magic = LMD_MAGIC;
2081 CERROR("Bad mount options %s\n", options);
// Carrier passed as the single "data" argument of lustre_fill_super():
// lustre_get_sb() initialises it with the raw mount data and the vfsmount,
// and lustre_fill_super() reads them back as lmd2_data and lmd2_mnt.
2085 struct lustre_mount_data2 {
2087 struct vfsmount *lmd2_mnt;
2090 /** This is the entry point for the mount call into Lustre.
2091 * This is called when a server or client is mounted,
2092 * and this is where we start setting things up.
2093 * @param data Mount options (e.g. -o flock,abort_recov)
// Flow visible below: create the lsi for @sb, parse the option string found
// in the lustre_mount_data2 passed through @data, then branch:
//   - client mount: needs a registered client_fill_super callback; starts
//     the MGC and hands over to the callback with the vfsmount;
//   - server mount: marks the lsi with LSI_SERVER and calls
//     server_fill_super().
// A parse failure leaves through the "out" label with -EINVAL.
2095 int lustre_fill_super(struct super_block *sb, void *data, int silent)
2097 struct lustre_mount_data *lmd;
2098 struct lustre_mount_data2 *lmd2 = data;
2099 struct lustre_sb_info *lsi;
2103 CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb);
2105 lsi = lustre_init_lsi(sb);
2111 * Disable lockdep during mount, because mount locking patterns are
2116 /* Figure out the lmd from the mount options */
2117 if (lmd_parse((char *)(lmd2->lmd2_data), lmd)) {
2119 GOTO(out, rc = -EINVAL);
2122 if (lmd_is_client(lmd)) {
2123 CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile);
// The callback is registered by another module through
// lustre_register_client_fill_super(); without it a client cannot mount.
2124 if (!client_fill_super) {
2125 LCONSOLE_ERROR_MSG(0x165, "Nothing registered for "
2126 "client mount! Is the 'lustre' "
2127 "module loaded?\n");
2131 rc = lustre_start_mgc(sb);
2136 /* Connect and start */
2137 /* (should always be ll_fill_super) */
2138 rc = (*client_fill_super)(sb, lmd2->lmd2_mnt);
2139 /* c_f_s will call lustre_common_put_super on failure */
2142 CDEBUG(D_MOUNT, "Mounting server from %s\n", lmd->lmd_dev);
2143 lsi->lsi_flags |= LSI_SERVER;
2144 rc = server_fill_super(sb);
2145 /* s_f_s calls lustre_start_mgc after the mount because we need
2146 the MGS nids which are stored on disk. Plus, we may
2147 need to start the MGS first. */
2148 /* s_f_s will call server_put_super on failure */
2151 /* If error happens in fill_super() call, @lsi will be killed there.
2152 * This is why we do not put it here. */
// The lsi may already be gone on failure, so lmd is only dereferenced when
// s2lsi(sb) is still set.
2156 CERROR("Unable to mount %s (%d)\n",
2157 s2lsi(sb) ? lmd->lmd_dev : "", rc);
2159 CDEBUG(D_SUPER, "Mount %s complete\n",
2167 /* We can't call ll_fill_super by name because it lives in a module that
2168 must be loaded after this one. */
// Store @cfs in the file-scope client_fill_super pointer, which
// lustre_fill_super() calls for client mounts.
2169 void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb,
2170 struct vfsmount *mnt))
2172 client_fill_super = cfs;
// Store @cfs in the file-scope kill_super_cb pointer, which
// lustre_kill_super() invokes for non-server superblocks.
2175 void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb))
2177 kill_super_cb = cfs;
2180 /***************** FS registration ******************/
// get_sb hook of the "lustre" filesystem type. Both variants create a
// device-less superblock with get_sb_nodev() and lustre_fill_super() as
// the fill callback. Kernels older than 2.6.18 pass the raw mount data
// directly; newer kernels also receive the vfsmount, so data and mnt are
// bundled in a stack-allocated lustre_mount_data2.
2182 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18))
2183 struct super_block * lustre_get_sb(struct file_system_type *fs_type, int flags,
2184 const char *devname, void * data)
2186 return get_sb_nodev(fs_type, flags, data, lustre_fill_super);
2189 int lustre_get_sb(struct file_system_type *fs_type, int flags,
2190 const char *devname, void * data, struct vfsmount *mnt)
2192 struct lustre_mount_data2 lmd2 = {data, mnt};
2194 return get_sb_nodev(fs_type, flags, &lmd2, lustre_fill_super, mnt);
// kill_sb hook of the "lustre" filesystem type. The registered
// kill_super_cb runs only when it is set, the superblock still has an lsi
// and that lsi is not flagged LSI_SERVER. kill_anon_super() is then called
// in every case.
2198 void lustre_kill_super(struct super_block *sb)
2200 struct lustre_sb_info *lsi = s2lsi(sb);
2202 if (kill_super_cb && lsi && !(lsi->lsi_flags & LSI_SERVER))
2203 (*kill_super_cb)(sb);
2205 kill_anon_super(sb);
2208 /** Register the "lustre" fs type
// Filesystem type descriptor handed to register_filesystem() and
// unregister_filesystem() below. Mounting goes through lustre_get_sb and
// teardown through lustre_kill_super. One additional flag is compiled in
// only when FS_HAS_FIEMAP is defined.
2210 struct file_system_type lustre_fs_type = {
2211 .owner = THIS_MODULE,
2213 .get_sb = lustre_get_sb,
2214 .kill_sb = lustre_kill_super,
2215 .fs_flags = FS_BINARY_MOUNTDATA | FS_REQUIRES_DEV |
2216 #ifdef FS_HAS_FIEMAP
2219 LL_RENAME_DOES_D_MOVE,
// Register lustre_fs_type with the VFS; returns register_filesystem()'s
// result unchanged.
2222 int lustre_register_fs(void)
2224 return register_filesystem(&lustre_fs_type);
// Remove lustre_fs_type from the VFS; returns unregister_filesystem()'s
// result unchanged.
2227 int lustre_unregister_fs(void)
2229 return unregister_filesystem(&lustre_fs_type);
// Symbols exported from this file for use by other kernel modules:
// callback registration, common teardown, config log handling, server
// mount lookup/registration helpers and configuration command helpers.
2232 EXPORT_SYMBOL(lustre_register_client_fill_super);
2233 EXPORT_SYMBOL(lustre_register_kill_super_cb);
2234 EXPORT_SYMBOL(lustre_common_put_super);
2235 EXPORT_SYMBOL(lustre_process_log);
2236 EXPORT_SYMBOL(lustre_end_log);
2237 EXPORT_SYMBOL(server_get_mount);
2238 EXPORT_SYMBOL(server_get_mount_2);
2239 EXPORT_SYMBOL(server_put_mount);
2240 EXPORT_SYMBOL(server_put_mount_2);
2241 EXPORT_SYMBOL(server_register_target);
2242 EXPORT_SYMBOL(server_name2index);
2243 EXPORT_SYMBOL(server_mti_print);
2244 EXPORT_SYMBOL(do_lcfg);