1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lustre/obdclass/obd_mount.c
38 * Client/server mount routines
40 * Author: Nathan Rutman <nathan@clusterfs.com>
/* Debug settings for this file. D_MOUNT is the mask passed to the CDEBUG()
 * calls below; PRINT_CMD and PRINT_MASK are used by ldd_print() and
 * server_mti_print().
 * NOTE(review): D_MOUNT and PRINT_MASK expand to an unparenthesised
 * D_SUPER|D_CONFIG; that is only safe while every user passes them as a
 * complete macro argument -- confirm before using them in a larger
 * expression. */
44 #define DEBUG_SUBSYSTEM S_CLASS
45 #define D_MOUNT D_SUPER|D_CONFIG /*|D_WARNING */
46 #define PRINT_CMD CDEBUG
47 #define PRINT_MASK D_SUPER|D_CONFIG
51 #include <lustre_fsfilt.h>
52 #include <obd_class.h>
53 #include <lustre/lustre_user.h>
54 #include <linux/version.h>
55 #include <lustre_log.h>
56 #include <lustre_disk.h>
57 #include <lustre_param.h>
/* Optional callbacks, both initialised to NULL. No assignment or call is
 * visible in this part of the file.
 * NOTE(review): presumably installed by the client module at load time --
 * confirm against the registration code. */
59 static int (*client_fill_super)(struct super_block *sb) = NULL;
60 static void (*kill_super_cb)(struct super_block *sb) = NULL;
62 /*********** mount lookup *********/
/* Registry of server mounts, looked up by name. The list is walked by
 * server_find_mount(); lustre_mount_info_lock is taken (cfs_down/cfs_up)
 * around the register, deregister, get and put paths below. */
64 CFS_DECLARE_MUTEX(lustre_mount_info_lock);
65 static CFS_LIST_HEAD(server_mount_info_list);
/* Look up a registered mount by name: walks server_mount_info_list and
 * compares each entry lmi_name against @name with strcmp().
 * Callers below treat a non-NULL result as "found". Most visible callers
 * hold lustre_mount_info_lock around the call; server_start_mgs() shows no
 * locking around it.
 * NOTE(review): the braces, the declaration of tmp and the return
 * statements are not visible in this extract. */
67 static struct lustre_mount_info *server_find_mount(const char *name)
70 struct lustre_mount_info *lmi;
73 cfs_list_for_each(tmp, &server_mount_info_list) {
74 lmi = cfs_list_entry(tmp, struct lustre_mount_info,
76 if (strcmp(name, lmi->lmi_name) == 0)
82 /* we must register an obd for a mount before we call the setup routine.
83 *_setup will call lustre_get_mount to get the mnt struct
84 by obd_name, since we can't pass the pointer to setup. */
/* Register a mount under @name so that it can later be found by name.
 * Allocates a lustre_mount_info plus a private copy of @name, then, under
 * lustre_mount_info_lock, adds the entry to server_mount_info_list unless
 * @name is already registered (in which case both allocations are freed
 * and "Already registered" is logged).
 * NOTE(review): the third parameter, the allocation-failure guards, the
 * assignments of lmi_sb/lmi_mnt and the return statements are not visible
 * in this extract; the final CDEBUG reads lmi->lmi_mnt, so it is presumably
 * set before that point -- confirm. */
85 static int server_register_mount(const char *name, struct super_block *sb,
88 struct lustre_mount_info *lmi;
95 OBD_ALLOC(lmi, sizeof(*lmi));
98 OBD_ALLOC(name_cp, strlen(name) + 1);
100 OBD_FREE(lmi, sizeof(*lmi));
103 strcpy(name_cp, name);
105 cfs_down(&lustre_mount_info_lock);
/* Duplicate name: undo both allocations before reporting the error. */
107 if (server_find_mount(name)) {
108 cfs_up(&lustre_mount_info_lock);
109 OBD_FREE(lmi, sizeof(*lmi));
110 OBD_FREE(name_cp, strlen(name) + 1);
111 CERROR("Already registered %s\n", name);
114 lmi->lmi_name = name_cp;
117 cfs_list_add(&lmi->lmi_list_chain, &server_mount_info_list);
119 cfs_up(&lustre_mount_info_lock);
121 CDEBUG(D_MOUNT, "reg_mnt %p from %s, vfscount=%d\n",
122 lmi->lmi_mnt, name, cfs_atomic_read(&lmi->lmi_mnt->mnt_count));
127 /* when an obd no longer needs a mount */
/* Remove the registry entry for @name. Everything from the lookup to the
 * final free runs under lustre_mount_info_lock. On the not-registered path
 * the lock is released and "%s not registered" is logged; otherwise the
 * name copy is freed, the entry is unlinked from the list and the entry
 * itself is freed.
 * NOTE(review): the miss test and the return statements are not visible in
 * this extract. */
128 static int server_deregister_mount(const char *name)
130 struct lustre_mount_info *lmi;
133 cfs_down(&lustre_mount_info_lock);
134 lmi = server_find_mount(name);
136 cfs_up(&lustre_mount_info_lock);
137 CERROR("%s not registered\n", name);
141 CDEBUG(D_MOUNT, "dereg_mnt %p from %s, vfscount=%d\n",
142 lmi->lmi_mnt, name, cfs_atomic_read(&lmi->lmi_mnt->mnt_count));
/* Free size is recomputed from the stored string, matching the
 * strlen(name) + 1 used at allocation time. */
144 OBD_FREE(lmi->lmi_name, strlen(lmi->lmi_name) + 1);
145 cfs_list_del(&lmi->lmi_list_chain);
146 OBD_FREE(lmi, sizeof(*lmi));
147 cfs_up(&lustre_mount_info_lock);
152 /* obd's look up a registered mount using their obdname. This is just
153 for initial obd setup to find the mount struct. It should not be
154 called every time you want to mntget. */
/* Find the mount registered under @name and take references on it:
 * mntget() on lmi_mnt and an increment of lsi_mounts on the owning
 * superblock info. Logs "Can't find mount" on the failure path.
 * NOTE(review): lustre_mount_info_lock is released right after the lookup,
 * and lmi is dereferenced afterwards; a concurrent
 * server_deregister_mount() frees the entry under that lock -- confirm
 * that callers serialise these paths.
 * NOTE(review): the miss test and return statements are not visible in
 * this extract. */
155 struct lustre_mount_info *server_get_mount(const char *name)
157 struct lustre_mount_info *lmi;
158 struct lustre_sb_info *lsi;
161 cfs_down(&lustre_mount_info_lock);
162 lmi = server_find_mount(name);
163 cfs_up(&lustre_mount_info_lock);
165 CERROR("Can't find mount for %s\n", name);
168 lsi = s2lsi(lmi->lmi_sb);
169 mntget(lmi->lmi_mnt);
170 cfs_atomic_inc(&lsi->lsi_mounts);
172 CDEBUG(D_MOUNT, "get_mnt %p from %s, refs=%d, vfscount=%d\n",
173 lmi->lmi_mnt, name, cfs_atomic_read(&lsi->lsi_mounts),
174 cfs_atomic_read(&lmi->lmi_mnt->mnt_count));
180 * Used by mdt to get mount_info from obdname.
181 * There is no blocking when using the mount_info.
182 * Do not use server_get_mount for this purpose.
/* Look up the mount registered under @name under lustre_mount_info_lock.
 * Unlike server_get_mount(), no mntget() or lsi_mounts increment is visible
 * here. Logs "Can't find mount" on the failure path.
 * NOTE(review): the miss test and return statements are not visible in
 * this extract. */
184 struct lustre_mount_info *server_get_mount_2(const char *name)
186 struct lustre_mount_info *lmi;
189 cfs_down(&lustre_mount_info_lock);
190 lmi = server_find_mount(name);
191 cfs_up(&lustre_mount_info_lock);
193 CERROR("Can't find mount for %s\n", name);
/* Helper taking a vfsmount; only the kernel_locked() test is visible.
 * NOTE(review): judging by the name this presumably drops the big kernel
 * lock around an mntput() call -- the body is not visible in this extract,
 * confirm against the full source. */
198 static void unlock_mntput(struct vfsmount *mnt)
200 if (kernel_locked()) {
209 static int lustre_put_lsi(struct super_block *sb);
211 /* to be called from obd_cleanup methods */
/* Release the reference an obd holds on the mount registered as @name.
 * count is sampled from mnt->mnt_count (minus one) up front and is only
 * used for log messages afterwards. The entry is looked up under
 * lustre_mount_info_lock, asserted to refer to @mnt, the lsi reference is
 * dropped through lustre_put_lsi(), and finally the name is deregistered.
 * NOTE(review): this line uses atomic_read() while the rest of the file
 * uses cfs_atomic_read() -- confirm whether that is intentional.
 * NOTE(review): as in server_get_mount(), lmi is used after the lock is
 * released. The mntput call, the miss test, the "mount busy" condition and
 * the return statements are not visible in this extract. */
212 int server_put_mount(const char *name, struct vfsmount *mnt)
214 struct lustre_mount_info *lmi;
215 struct lustre_sb_info *lsi;
216 int count = atomic_read(&mnt->mnt_count) - 1;
219 /* This might be the last one, can't deref after this */
222 cfs_down(&lustre_mount_info_lock);
223 lmi = server_find_mount(name);
224 cfs_up(&lustre_mount_info_lock);
226 CERROR("Can't find mount for %s\n", name);
229 lsi = s2lsi(lmi->lmi_sb);
230 LASSERT(lmi->lmi_mnt == mnt);
232 CDEBUG(D_MOUNT, "put_mnt %p from %s, refs=%d, vfscount=%d\n",
233 lmi->lmi_mnt, name, cfs_atomic_read(&lsi->lsi_mounts), count);
/* A non-zero result from lustre_put_lsi() is treated as the last put. */
235 if (lustre_put_lsi(lmi->lmi_sb)) {
236 CDEBUG(D_MOUNT, "Last put of mnt %p from %s, vfscount=%d\n",
237 lmi->lmi_mnt, name, count);
238 /* last mount is the One True Mount */
240 CERROR("%s: mount busy, vfscount=%d!\n", name, count);
243 /* this obd should never need the mount again */
244 server_deregister_mount(name);
249 /* Corresponding to server_get_mount_2 */
/* Counterpart of server_get_mount_2(), taking the same (name, mnt) pair as
 * server_put_mount().
 * NOTE(review): only the signature is visible in this extract; the body
 * must be checked in the full source. */
250 int server_put_mount_2(const char *name, struct vfsmount *mnt)
256 /******* mount helper utilities *********/
/* Dump the fields of a lustre_disk_data through PRINT_CMD (CDEBUG) at
 * PRINT_MASK: server name, uuid, fs name, index, config version, flags,
 * disk fs type (MT_STR), mount options, params and user comment.
 * ldd_uuid is cast to char * and printed with %s. */
259 static void ldd_print(struct lustre_disk_data *ldd)
261 PRINT_CMD(PRINT_MASK, " disk data:\n");
262 PRINT_CMD(PRINT_MASK, "server: %s\n", ldd->ldd_svname);
263 PRINT_CMD(PRINT_MASK, "uuid: %s\n", (char *)ldd->ldd_uuid);
264 PRINT_CMD(PRINT_MASK, "fs: %s\n", ldd->ldd_fsname);
265 PRINT_CMD(PRINT_MASK, "index: %04x\n", ldd->ldd_svindex);
266 PRINT_CMD(PRINT_MASK, "config: %d\n", ldd->ldd_config_ver);
267 PRINT_CMD(PRINT_MASK, "flags: %#x\n", ldd->ldd_flags);
268 PRINT_CMD(PRINT_MASK, "diskfs: %s\n", MT_STR(ldd));
269 PRINT_CMD(PRINT_MASK, "options: %s\n", ldd->ldd_mount_opts);
270 PRINT_CMD(PRINT_MASK, "params: %s\n", ldd->ldd_params);
271 PRINT_CMD(PRINT_MASK, "comment: %s\n", ldd->ldd_userdata);
/* Read MOUNT_DATA_FILE into @ldd while running in @mount_ctxt
 * (push_ctxt/pop_ctxt bracket the file access).
 * Validation visible here, each failing with -EINVAL via out_close:
 *   - the file size must equal sizeof(*ldd);
 *   - the read must succeed (exact test not visible);
 *   - ldd_magic must equal LDD_MAGIC (no byte swabbing, see FIXME);
 *   - no incompat feature bits outside LDD_INCOMPAT_SUPP;
 *   - no rocompat feature bits outside LDD_ROCOMPAT_SUPP.
 * NOTE(review): the declarations of file/len/off/rc, the filp_open error
 * test, the out_close label and the return are not visible in this
 * extract. */
275 static int ldd_parse(struct lvfs_run_ctxt *mount_ctxt,
276 struct lustre_disk_data *ldd)
278 struct lvfs_run_ctxt saved;
285 push_ctxt(&saved, mount_ctxt, NULL);
287 file = filp_open(MOUNT_DATA_FILE, O_RDONLY, 0644);
290 CERROR("cannot open %s: rc = %d\n", MOUNT_DATA_FILE, rc);
294 len = i_size_read(file->f_dentry->d_inode);
295 CDEBUG(D_MOUNT, "Have %s, size %lu\n", MOUNT_DATA_FILE, len);
296 if (len != sizeof(*ldd)) {
297 CERROR("disk data size does not match: see %lu expect %u\n",
298 len, (int)sizeof(*ldd));
299 GOTO(out_close, rc = -EINVAL);
302 rc = lustre_fread(file, ldd, len, &off);
304 CERROR("error reading %s: read %d of %lu\n",
305 MOUNT_DATA_FILE, rc, len);
306 GOTO(out_close, rc = -EINVAL);
310 if (ldd->ldd_magic != LDD_MAGIC) {
311 /* FIXME add swabbing support */
312 CERROR("Bad magic in %s: %x!=%x\n", MOUNT_DATA_FILE,
313 ldd->ldd_magic, LDD_MAGIC);
314 GOTO(out_close, rc = -EINVAL);
317 if (ldd->ldd_feature_incompat & ~LDD_INCOMPAT_SUPP) {
318 CERROR("%s: unsupported incompat filesystem feature(s) %x\n",
320 ldd->ldd_feature_incompat & ~LDD_INCOMPAT_SUPP);
321 GOTO(out_close, rc = -EINVAL);
323 if (ldd->ldd_feature_rocompat & ~LDD_ROCOMPAT_SUPP) {
324 CERROR("%s: unsupported read-only filesystem feature(s) %x\n",
326 ldd->ldd_feature_rocompat & ~LDD_ROCOMPAT_SUPP);
327 /* Do something like remount filesystem read-only */
328 GOTO(out_close, rc = -EINVAL);
334 pop_ctxt(&saved, mount_ctxt, NULL);
/* Write @ldd back to MOUNT_DATA_FILE while running in @mount_ctxt.
 * Asserts that ldd_magic is LDD_MAGIC, then bumps ldd_config_ver before the
 * file is opened, so the in-memory version is incremented even when the
 * open or write subsequently fails.
 * A failed write is reported and mapped to -EINVAL via out_close.
 * NOTE(review): the declarations of file/off/rc, the error tests, the
 * out_close label and the return are not visible in this extract. */
338 static int ldd_write(struct lvfs_run_ctxt *mount_ctxt,
339 struct lustre_disk_data *ldd)
341 struct lvfs_run_ctxt saved;
344 unsigned long len = sizeof(struct lustre_disk_data);
348 LASSERT(ldd->ldd_magic == LDD_MAGIC);
350 ldd->ldd_config_ver++;
352 push_ctxt(&saved, mount_ctxt, NULL);
354 file = filp_open(MOUNT_DATA_FILE, O_RDWR, 0644);
357 CERROR("cannot open %s: rc = %d\n", MOUNT_DATA_FILE, rc);
361 rc = lustre_fwrite(file, ldd, len, &off);
/* NOTE(review): the message below says "read %d of %lu" although this is
 * the write path; it looks copied from ldd_parse() -- confirm and reword. */
363 CERROR("error writing %s: read %d of %lu\n",
364 MOUNT_DATA_FILE, rc, len);
365 GOTO(out_close, rc = -EINVAL);
373 pop_ctxt(&saved, mount_ctxt, NULL);
378 /**************** config llog ********************/
380 /** Get a config log from the MGS and process it.
381 * This func is called for both clients and servers.
382 * Continue to process new statements appended to the logs
383 * (whenever the config lock is revoked) until lustre_end_log
385 * @param sb The superblock is used by the MGC to write to the local copy of
387 * @param logname The name of the llog to replicate from the MGS
388 * @param cfg Since the same mgc may be used to follow multiple config logs
389 * (e.g. ost1, ost2, client), the config_llog_instance keeps the state for
390 * this log, and is added to the mgc's list of logs to follow.
/* Ask the MGC attached to @sb (lsi->lsi_mgc) to start processing the
 * config log @logname. Builds an LCFG_LOG_START lustre_cfg with:
 *   buf 0 = mgc obd name, buf 1 = @logname, buf 2 = *@cfg,
 *   buf 3 = the sb pointer value (sizeof(sb) bytes copied from &sb),
 * passes it to obd_process_config() and frees it afterwards.
 * Two console error messages (0x15b, 0x15c) are emitted on failure paths
 * whose conditions are not visible in this extract.
 * NOTE(review): the first message concatenates a literal ending in the
 * closing quote of the log name with a literal starting with the word
 * failed, with no space in between, so the words run together in the
 * output -- confirm and add the space.
 * NOTE(review): no NULL check of the lustre_cfg_new() result is visible
 * before it is used. */
392 int lustre_process_log(struct super_block *sb, char *logname,
393 struct config_llog_instance *cfg)
395 struct lustre_cfg *lcfg;
396 struct lustre_cfg_bufs bufs;
397 struct lustre_sb_info *lsi = s2lsi(sb);
398 struct obd_device *mgc = lsi->lsi_mgc;
405 /* mgc_process_config */
406 lustre_cfg_bufs_reset(&bufs, mgc->obd_name);
407 lustre_cfg_bufs_set_string(&bufs, 1, logname);
408 lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg));
409 lustre_cfg_bufs_set(&bufs, 3, &sb, sizeof(sb));
410 lcfg = lustre_cfg_new(LCFG_LOG_START, &bufs);
411 rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
412 lustre_cfg_free(lcfg);
415 LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s'"
416 "failed from the MGS (%d). Make sure this "
417 "client and the MGS are running compatible "
418 "versions of Lustre.\n",
419 mgc->obd_name, logname, rc);
422 LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' "
423 "failed (%d). This may be the result of "
424 "communication errors between this node and "
425 "the MGS, a bad configuration, or other "
426 "errors. See the syslog for more "
427 "information.\n", mgc->obd_name, logname,
430 /* class_obd_list(); */
434 /* Stop watching this config log for updates */
/* Tell the MGC attached to @sb to stop following the config log @logname.
 * Builds an LCFG_LOG_END lustre_cfg (buf 0 = mgc obd name, buf 1 =
 * @logname, buf 2 = *@cfg), passes it to obd_process_config() and frees it.
 * NOTE(review): a line between setting buf 1 and buf 2 is missing from
 * this extract (possibly a guard on @cfg); the declaration of rc and the
 * return are not visible either. */
435 int lustre_end_log(struct super_block *sb, char *logname,
436 struct config_llog_instance *cfg)
438 struct lustre_cfg *lcfg;
439 struct lustre_cfg_bufs bufs;
440 struct lustre_sb_info *lsi = s2lsi(sb);
441 struct obd_device *mgc = lsi->lsi_mgc;
448 /* mgc_process_config */
449 lustre_cfg_bufs_reset(&bufs, mgc->obd_name);
450 lustre_cfg_bufs_set_string(&bufs, 1, logname);
452 lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg));
453 lcfg = lustre_cfg_new(LCFG_LOG_END, &bufs);
454 rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
455 lustre_cfg_free(lcfg);
459 /**************** obd start *******************/
461 /** lustre_cfg_bufs are a holdover from 1.4; we can still set these up from
462 * lctl (and do for echo cli/srv).
/* Build a lustre_cfg for command @cmd and run it through
 * class_process_config(). buf 0 is @cfgname and bufs 1..4 carry @s1..@s4;
 * @nid is stored in lcfg_nid. The lustre_cfg is freed after processing.
 * NOTE(review): a line is missing before each of the four
 * lustre_cfg_bufs_set_string() calls in this extract, presumably a NULL
 * guard on the corresponding string (callers below pass 0 for unused
 * strings) -- confirm.
 * NOTE(review): no NULL check of the lustre_cfg_new() result is visible
 * before lcfg->lcfg_nid is written. */
464 int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
465 char *s1, char *s2, char *s3, char *s4)
467 struct lustre_cfg_bufs bufs;
468 struct lustre_cfg * lcfg = NULL;
471 CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname,
472 cmd, s1, s2, s3, s4);
474 lustre_cfg_bufs_reset(&bufs, cfgname);
476 lustre_cfg_bufs_set_string(&bufs, 1, s1);
478 lustre_cfg_bufs_set_string(&bufs, 2, s2);
480 lustre_cfg_bufs_set_string(&bufs, 3, s3);
482 lustre_cfg_bufs_set_string(&bufs, 4, s4);
484 lcfg = lustre_cfg_new(cmd, &bufs);
485 lcfg->lcfg_nid = nid;
486 rc = class_process_config(lcfg);
487 lustre_cfg_free(lcfg);
491 /** Call class_attach and class_setup. These methods in turn call
492 * obd type-specific methods.
/* Start an obd in two steps through do_lcfg(): LCFG_ATTACH with
 * (@type, @uuid), then LCFG_SETUP with (s1, s2). On the setup-error path
 * the obd is detached again with LCFG_DETACH, so a failed setup does not
 * leave an attached device behind.
 * NOTE(review): the remaining parameters (s1, s2), the error tests and the
 * return statements are not visible in this extract. */
494 static int lustre_start_simple(char *obdname, char *type, char *uuid,
498 CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type);
500 rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, 0, 0);
502 CERROR("%s attach error %d\n", obdname, rc);
505 rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, 0, 0);
507 CERROR("%s setup error %d\n", obdname, rc);
508 do_lcfg(obdname, 0, LCFG_DETACH, 0, 0, 0, 0);
513 /* Set up a MGS to serve startup logs */
/* Start the MGS service on the disk behind @sb.
 * First checks the mount registry for LUSTRE_MGS_OBDNAME and reports
 * "already started" (0x15d) with the existing server name; otherwise
 * registers the mount (lsi_srv_mnt) under LUSTRE_MGS_OBDNAME and starts
 * the obd with lustre_start_simple(). A failed start is reported with
 * console message 0x15e; per the in-code comment the mount is deliberately
 * left registered in that case.
 * NOTE(review): server_find_mount() is called here without
 * lustre_mount_info_lock being visibly held -- confirm.
 * NOTE(review): the conditions and return statements are not visible in
 * this extract. */
514 static int server_start_mgs(struct super_block *sb)
516 struct lustre_sb_info *lsi = s2lsi(sb);
517 struct vfsmount *mnt = lsi->lsi_srv_mnt;
518 struct lustre_mount_info *lmi;
523 /* It is impossible to have more than 1 MGS per node, since
524 MGC wouldn't know which to connect to */
525 lmi = server_find_mount(LUSTRE_MGS_OBDNAME);
527 lsi = s2lsi(lmi->lmi_sb);
528 LCONSOLE_ERROR_MSG(0x15d, "The MGS service was already started"
530 lsi->lsi_ldd->ldd_svname);
534 CDEBUG(D_CONFIG, "Start MGS service %s\n", LUSTRE_MGS_OBDNAME);
536 rc = server_register_mount(LUSTRE_MGS_OBDNAME, sb, mnt);
539 rc = lustre_start_simple(LUSTRE_MGS_OBDNAME, LUSTRE_MGS_NAME,
540 LUSTRE_MGS_OBDNAME, 0, 0);
541 /* Do NOT call server_deregister_mount() here. This leads to
542 * inability cleanup cleanly and free lsi and other stuff when
543 * mgs calls server_put_mount() in error handling case. -umka */
547 LCONSOLE_ERROR_MSG(0x15e, "Failed to start MGS '%s' (%d). "
548 "Is the 'mgs' module loaded?\n",
549 LUSTRE_MGS_OBDNAME, rc);
/* Stop the MGS service: looks the obd up by LUSTRE_MGS_OBDNAME, logs
 * "not running" on the miss path, otherwise tears it down with
 * class_manual_cleanup().
 * NOTE(review): @sb is not referenced in the visible lines. The miss test,
 * the line between the "should always stop" comment and the cleanup call,
 * and the return statements are not visible in this extract. */
553 static int server_stop_mgs(struct super_block *sb)
555 struct obd_device *obd;
559 CDEBUG(D_MOUNT, "Stop MGS service %s\n", LUSTRE_MGS_OBDNAME);
561 /* There better be only one MGS */
562 obd = class_name2obd(LUSTRE_MGS_OBDNAME);
564 CDEBUG(D_CONFIG, "mgs %s not running\n", LUSTRE_MGS_OBDNAME);
568 /* The MGS should always stop when we say so */
570 rc = class_manual_cleanup(obd);
/* Taken with cfs_mutex_down()/cfs_mutex_up() by lustre_start_mgc() and
 * lustre_stop_mgc() around MGC lookup, refcounting, start and cleanup. */
574 CFS_DECLARE_MUTEX(mgc_start_lock);
576 /** Set up a mgc obd to process startup logs
578 * \param sb [in] super block of the mgc obd
580 * \retval 0 success, otherwise error code
/* Find or start the MGC used by the mount behind @sb.
 *
 * Visible steps:
 *  1. Pick one MGS nid to name the MGC: for servers from the mgsnode=
 *     param or, on an MGS, from the local LNet ids (skipping LOLND); for
 *     clients from the device string in lmd_dev.
 *  2. Build mgcname = LUSTRE_MGC_OBDNAME + nid string and allocate niduuid
 *     with two extra bytes.
 *  3. Under mgc_start_lock: if an obd of that name exists and is not
 *     stopping, push the mgssec setting, bump cl_mgc_refcount and set the
 *     reconnect behaviour (KEY_INIT_RECOV_BACKUP).
 *  4. Otherwise register the primary nids under niduuid (LCFG_ADD_UUID),
 *     start the obd with a random uuid via lustre_start_simple(), add any
 *     failover nids and connections, record the count in
 *     lmd_mgs_failnodes, set cl_mgc_refcount to 1, and connect with
 *     obd_connect(), storing the export in cl_mgc_mgsexp.
 *  5. Release the lock and free mgcname and niduuid.
 *
 * NOTE(review): niduuid is allocated as len + 2 bytes where len already
 * includes the terminator of mgcname, and is filled with "%s_%x" using the
 * index i. That leaves room for exactly one hex digit, so an index of 16
 * or more would overrun the buffer; lustre_stop_mgc() sizes its buffer as
 * strlen + 6 instead -- confirm the intended bound.
 * NOTE(review): many lines (declarations of ptr, nid, recov_bk, uuidc; the
 * branch bodies; labels out and out_free; return) are not visible in this
 * extract, so the exact control flow between the steps above must be
 * checked against the full source. */
582 static int lustre_start_mgc(struct super_block *sb)
584 struct obd_connect_data *data = NULL;
585 struct lustre_sb_info *lsi = s2lsi(sb);
586 struct obd_device *obd;
587 struct obd_export *exp;
588 struct obd_uuid *uuid;
591 char *mgcname, *niduuid, *mgssec;
594 int rc = 0, i = 0, j, len;
597 LASSERT(lsi->lsi_lmd);
599 /* Find the first non-lo MGS nid for our MGC name */
600 if (lsi->lsi_flags & LSI_SERVER) {
601 ptr = lsi->lsi_ldd->ldd_params;
602 /* Use mgsnode= nids */
603 if ((class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0) &&
604 (class_parse_nid(ptr, &nid, &ptr) == 0)) {
606 } else if (IS_MGS(lsi->lsi_ldd)) {
607 lnet_process_id_t id;
608 while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
609 if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
616 } else { /* client */
617 /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
618 ptr = lsi->lsi_lmd->lmd_dev;
619 if (class_parse_nid(ptr, &nid, &ptr) == 0)
623 CERROR("No valid MGS nids found.\n");
627 len = strlen(LUSTRE_MGC_OBDNAME) + strlen(libcfs_nid2str(nid)) + 1;
628 OBD_ALLOC(mgcname, len);
629 OBD_ALLOC(niduuid, len + 2);
630 if (!mgcname || !niduuid)
631 GOTO(out_free, rc = -ENOMEM);
632 sprintf(mgcname, "%s%s", LUSTRE_MGC_OBDNAME, libcfs_nid2str(nid));
634 mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : "";
636 cfs_mutex_down(&mgc_start_lock);
638 obd = class_name2obd(mgcname);
639 if (obd && !obd->obd_stopping) {
640 rc = obd_set_info_async(obd->obd_self_export,
641 strlen(KEY_MGSSEC), KEY_MGSSEC,
642 strlen(mgssec), mgssec, NULL);
646 /* Re-using an existing MGC */
647 cfs_atomic_inc(&obd->u.cli.cl_mgc_refcount);
650 /* If we are restarting the MGS, don't try to keep the MGC's
651 old connection, or registration will fail. */
652 if ((lsi->lsi_flags & LSI_SERVER) && IS_MGS(lsi->lsi_ldd)) {
653 CDEBUG(D_MOUNT, "New MGS with live MGC\n");
657 /* Try all connections, but only once (again).
658 We don't want to block another target from starting
659 (using its local copy of the log), but we do want to connect
660 if at all possible. */
662 CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname,recov_bk);
663 rc = obd_set_info_async(obd->obd_self_export,
664 sizeof(KEY_INIT_RECOV_BACKUP),
665 KEY_INIT_RECOV_BACKUP,
666 sizeof(recov_bk), &recov_bk, NULL);
670 CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname);
672 /* Add the primary nids for the MGS */
674 sprintf(niduuid, "%s_%x", mgcname, i);
675 if (lsi->lsi_flags & LSI_SERVER) {
676 ptr = lsi->lsi_ldd->ldd_params;
677 if (IS_MGS(lsi->lsi_ldd)) {
678 /* Use local nids (including LO) */
679 lnet_process_id_t id;
680 while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
681 rc = do_lcfg(mgcname, id.nid,
682 LCFG_ADD_UUID, niduuid, 0,0,0);
685 /* Use mgsnode= nids */
686 if (class_find_param(ptr, PARAM_MGSNODE, &ptr) != 0) {
687 CERROR("No MGS nids given.\n");
688 GOTO(out_free, rc = -EINVAL);
690 while (class_parse_nid(ptr, &nid, &ptr) == 0) {
691 rc = do_lcfg(mgcname, nid,
692 LCFG_ADD_UUID, niduuid, 0,0,0);
696 } else { /* client */
697 /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
698 ptr = lsi->lsi_lmd->lmd_dev;
699 while (class_parse_nid(ptr, &nid, &ptr) == 0) {
700 rc = do_lcfg(mgcname, nid,
701 LCFG_ADD_UUID, niduuid, 0,0,0);
703 /* Stop at the first failover nid */
709 CERROR("No valid MGS nids found.\n");
710 GOTO(out_free, rc = -EINVAL);
712 lsi->lsi_lmd->lmd_mgs_failnodes = 1;
714 /* Random uuid for MGC allows easier reconnects */
716 ll_generate_random_uuid(uuidc);
717 class_uuid_unparse(uuidc, uuid);
720 rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME,
721 (char *)uuid->uuid, LUSTRE_MGS_OBDNAME,
727 /* Add any failover MGS nids */
729 while ((*ptr == ':' ||
730 class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0)) {
731 /* New failover node */
732 sprintf(niduuid, "%s_%x", mgcname, i);
734 while (class_parse_nid(ptr, &nid, &ptr) == 0) {
736 rc = do_lcfg(mgcname, nid,
737 LCFG_ADD_UUID, niduuid, 0,0,0);
742 rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN,
750 lsi->lsi_lmd->lmd_mgs_failnodes = i;
752 obd = class_name2obd(mgcname);
754 CERROR("Can't find mgcobd %s\n", mgcname);
755 GOTO(out_free, rc = -ENOTCONN);
758 rc = obd_set_info_async(obd->obd_self_export,
759 strlen(KEY_MGSSEC), KEY_MGSSEC,
760 strlen(mgssec), mgssec, NULL);
764 /* Keep a refcount of servers/clients who started with "mount",
765 so we know when we can get rid of the mgc. */
766 cfs_atomic_set(&obd->u.cli.cl_mgc_refcount, 1);
768 /* Try all connections, but only once. */
770 rc = obd_set_info_async(obd->obd_self_export,
771 sizeof(KEY_INIT_RECOV_BACKUP),
772 KEY_INIT_RECOV_BACKUP,
773 sizeof(recov_bk), &recov_bk, NULL);
776 CWARN("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc);
777 /* We connect to the MGS at setup, and don't disconnect until cleanup */
780 GOTO(out, rc = -ENOMEM);
781 data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_FID |
782 OBD_CONNECT_AT | OBD_CONNECT_FULL20;
783 data->ocd_version = LUSTRE_VERSION_CODE;
784 rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL);
787 CERROR("connect failed %d\n", rc);
791 obd->u.cli.cl_mgc_mgsexp = exp;
794 /* Keep the mgc info in the sb. Note that many lsi's can point
798 cfs_mutex_up(&mgc_start_lock);
801 OBD_FREE(mgcname, len);
803 OBD_FREE(niduuid, len + 2);
/* Drop one reference on the MGC used by @sb and tear it down on the last
 * one. Under mgc_start_lock:
 *   - cl_mgc_refcount is asserted positive and decremented; while other
 *     users remain, the function bails out with -EBUSY (logged at D_MOUNT,
 *     not treated as fatal per the in-code comment);
 *   - on the last reference obd_no_recov is set, the MGS export (if any)
 *     is disconnected, the obd name is copied into niduuid for later use,
 *     and the obd is cleaned up with class_manual_cleanup();
 *   - the nid uuids "<obdname>_<hex index>" for indexes below
 *     lmd_mgs_failnodes are then deleted with LCFG_DEL_UUID.
 * niduuid is sized strlen(obd_name) + 6, i.e. room for "_" plus up to four
 * hex digits and the terminator.
 * NOTE(review): the error text for a failed uuid delete says "MDC UUID";
 * since this is the MGC path that looks like a typo -- confirm.
 * NOTE(review): how obd is obtained, the allocation guard, the out label
 * and the return are not visible in this extract. */
807 static int lustre_stop_mgc(struct super_block *sb)
809 struct lustre_sb_info *lsi = s2lsi(sb);
810 struct obd_device *obd;
811 char *niduuid = 0, *ptr = 0;
812 int i, rc = 0, len = 0;
822 cfs_mutex_down(&mgc_start_lock);
823 LASSERT(cfs_atomic_read(&obd->u.cli.cl_mgc_refcount) > 0);
824 if (!cfs_atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) {
825 /* This is not fatal, every client that stops
826 will call in here. */
827 CDEBUG(D_MOUNT, "mgc still has %d references.\n",
828 cfs_atomic_read(&obd->u.cli.cl_mgc_refcount));
829 GOTO(out, rc = -EBUSY);
832 /* The MGC has no recoverable data in any case.
833 * force shotdown set in umount_begin */
834 obd->obd_no_recov = 1;
836 if (obd->u.cli.cl_mgc_mgsexp) {
837 /* An error is not fatal, if we are unable to send the
838 disconnect mgs ping evictor cleans up the export */
839 rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp);
841 CDEBUG(D_MOUNT, "disconnect failed %d\n", rc);
844 /* Save the obdname for cleaning the nid uuids, which are
846 len = strlen(obd->obd_name) + 6;
847 OBD_ALLOC(niduuid, len);
849 strcpy(niduuid, obd->obd_name);
850 ptr = niduuid + strlen(niduuid);
853 rc = class_manual_cleanup(obd);
857 /* Clean the nid uuids */
859 GOTO(out, rc = -ENOMEM);
861 for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) {
862 sprintf(ptr, "_%x", i);
863 rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID,
866 CERROR("del MDC UUID %s failed: rc = %d\n",
871 OBD_FREE(niduuid, len);
873 /* class_import_put will get rid of the additional connections */
874 cfs_mutex_up(&mgc_start_lock);
878 /* Since there's only one mgc per node, we have to change its fs to get
879 access to the right disk. */
/* Point @mgc at the disk behind @sb by sending KEY_SET_FS through
 * obd_set_info_async() on the MGC self export. The value passed is the
 * super_block pointer with a length of sizeof(*sb). A failure is logged
 * with "can't set_fs".
 * NOTE(review): the key length is sizeof(KEY_SET_FS), which counts the
 * terminating NUL if the key is a string literal, whereas the KEY_MGSSEC
 * calls in this file use strlen() -- confirm this matches what the MGC
 * expects.
 * NOTE(review): the declaration of rc, the error test and the return are
 * not visible in this extract. */
880 static int server_mgc_set_fs(struct obd_device *mgc, struct super_block *sb)
882 struct lustre_sb_info *lsi = s2lsi(sb);
886 CDEBUG(D_MOUNT, "Set mgc disk for %s\n", lsi->lsi_lmd->lmd_dev);
888 /* cl_mgc_sem in mgc insures we sleep if the mgc_fs is busy */
889 rc = obd_set_info_async(mgc->obd_self_export,
890 sizeof(KEY_SET_FS), KEY_SET_FS,
891 sizeof(*sb), sb, NULL);
893 CERROR("can't set_fs %d\n", rc);
/* Undo server_mgc_set_fs(): sends KEY_CLEAR_FS to the MGC self export via
 * obd_set_info_async().
 * NOTE(review): the value arguments of the call, the declaration of rc and
 * the return are not visible in this extract. */
899 static int server_mgc_clear_fs(struct obd_device *mgc)
904 CDEBUG(D_MOUNT, "Unassign mgc disk\n");
906 rc = obd_set_info_async(mgc->obd_self_export,
907 sizeof(KEY_CLEAR_FS), KEY_CLEAR_FS,
/* Taken by server_stop_servers() and server_start_targets() around the
 * lookup, start and cleanup of the shared MDS/OSS obds. */
912 CFS_DECLARE_MUTEX(server_start_lock);
914 /* Stop MDS/OSS if nobody is using them */
/* Stop the shared MDS or OSS obd when the target type given by @lddflags
 * no longer has users. Under server_start_lock:
 *   - for LDD_F_SV_TYPE_MDT, looks up the MDS obd and the LUSTRE_MDS_NAME
 *     obd type;
 *   - for LDD_F_SV_TYPE_OST, looks up the OSS obd and the LUSTRE_OST_NAME
 *     obd type;
 *   - if an obd was found and its type is missing or has a zero
 *     typ_refcnt, the obd is torn down with class_manual_cleanup().
 * NOTE(review): @lsiflags is not referenced in the visible lines (lines
 * inside the final branch are missing from this extract). The declarations
 * of rc/err and the return are not visible either. */
915 static int server_stop_servers(int lddflags, int lsiflags)
917 struct obd_device *obd = NULL;
918 struct obd_type *type = NULL;
922 cfs_mutex_down(&server_start_lock);
924 /* Either an MDT or an OST or neither */
925 /* if this was an MDT, and there are no more MDT's, clean up the MDS */
926 if ((lddflags & LDD_F_SV_TYPE_MDT) &&
927 (obd = class_name2obd(LUSTRE_MDS_OBDNAME))) {
928 /*FIXME pre-rename, should eventually be LUSTRE_MDT_NAME*/
929 type = class_search_type(LUSTRE_MDS_NAME);
931 /* if this was an OST, and there are no more OST's, clean up the OSS */
932 if ((lddflags & LDD_F_SV_TYPE_OST) &&
933 (obd = class_name2obd(LUSTRE_OSS_OBDNAME))) {
934 type = class_search_type(LUSTRE_OST_NAME);
937 if (obd && (!type || !type->typ_refcnt)) {
940 /* obd_fail doesn't mean much on a server obd */
941 err = class_manual_cleanup(obd);
946 cfs_mutex_up(&server_start_lock);
/* Dump an mgs_target_info through PRINT_CMD at PRINT_MASK, prefixed by
 * @title: server name, fs name, uuid, config version and flags.
 * NOTE(review): the return statement is not visible in this extract. */
951 int server_mti_print(char *title, struct mgs_target_info *mti)
953 PRINT_CMD(PRINT_MASK, "mti %s\n", title);
954 PRINT_CMD(PRINT_MASK, "server: %s\n", mti->mti_svname);
955 PRINT_CMD(PRINT_MASK, "fs: %s\n", mti->mti_fsname);
956 PRINT_CMD(PRINT_MASK, "uuid: %s\n", mti->mti_uuid);
957 PRINT_CMD(PRINT_MASK, "ver: %d flags: %#x\n",
958 mti->mti_config_ver, mti->mti_flags);
/* Fill @mti (the registration record sent to the MGS) from the on-disk
 * data and mount options of @sb. Only valid for server mounts
 * (LSI_SERVER).
 * Copies fs and server names, collects local LNet nids (skipping LOLND
 * and, when a network param is present, nids that do not match it) up to
 * MTI_NIDS_MAX, then copies version, flags, index, uuid and params.
 * Side effect: when the mount was given LMD_FLG_WRITECONF, LDD_F_WRITECONF
 * is also set in ldd->ldd_flags.
 * NOTE(review): the declaration of i, the continue/break statements and
 * the return values are not visible in this extract. */
962 static int server_sb2mti(struct super_block *sb, struct mgs_target_info *mti)
964 struct lustre_sb_info *lsi = s2lsi(sb);
965 struct lustre_disk_data *ldd = lsi->lsi_ldd;
966 lnet_process_id_t id;
970 if (!(lsi->lsi_flags & LSI_SERVER))
/* NOTE(review): strncpy() with the full destination size does not
 * NUL-terminate when the source fills the buffer -- confirm the source
 * fields are always shorter than the destinations. */
973 strncpy(mti->mti_fsname, ldd->ldd_fsname,
974 sizeof(mti->mti_fsname));
975 strncpy(mti->mti_svname, ldd->ldd_svname,
976 sizeof(mti->mti_svname));
978 mti->mti_nid_count = 0;
979 while (LNetGetId(i++, &id) != -ENOENT) {
980 if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
983 if (class_find_param(ldd->ldd_params,
984 PARAM_NETWORK, NULL) == 0 &&
985 !class_match_net(ldd->ldd_params, id.nid)) {
986 /* can't match specified network */
990 mti->mti_nids[mti->mti_nid_count] = id.nid;
991 mti->mti_nid_count++;
992 if (mti->mti_nid_count >= MTI_NIDS_MAX) {
993 CWARN("Only using first %d nids for %s\n",
994 mti->mti_nid_count, mti->mti_svname);
999 mti->mti_lustre_ver = LUSTRE_VERSION_CODE;
1000 mti->mti_config_ver = 0;
1001 if (lsi->lsi_lmd->lmd_flags & LMD_FLG_WRITECONF)
1002 ldd->ldd_flags |= LDD_F_WRITECONF;
1003 mti->mti_flags = ldd->ldd_flags;
1004 mti->mti_stripe_index = ldd->ldd_svindex;
1005 memcpy(mti->mti_uuid, ldd->ldd_uuid, sizeof(mti->mti_uuid));
/* NOTE(review): the check below rejects only strlen > sizeof; a params
 * string whose length equals sizeof(mti_params) passes and is then copied
 * without its terminator. The memcpy also reads sizeof(mti_params) bytes
 * from ldd_params regardless of the string length -- confirm the two
 * arrays have compatible sizes and whether the test should be >=. */
1006 if (strlen(ldd->ldd_params) > sizeof(mti->mti_params)) {
1007 CERROR("params too big for mti\n");
1010 memcpy(mti->mti_params, ldd->ldd_params, sizeof(mti->mti_params));
1014 /* Register an old or new target with the MGS. If needed MGS will construct
1015 startup logs and assign index */
/* Register the target behind @sb with the MGS and apply what the MGS
 * sends back. Only valid for server mounts (LSI_SERVER).
 * Builds an mgs_target_info with server_sb2mti(), sends it through
 * obd_set_info_async(KEY_REGISTER_TARGET) on the MGC export to the MGS,
 * then copies the returned flags into ldd_flags with LDD_F_REWRITE_LDD
 * masked off. When the MGS set LDD_F_REWRITE_LDD, the on-disk index and
 * server name are updated from the reply, written with ldd_write(), the
 * filesystem label is updated and the filesystem is synced.
 * NOTE(review): the return value of ldd_write() is not checked in the
 * visible code, and the strncpy() into ldd_svname uses the full
 * destination size (no guaranteed terminator).
 * NOTE(review): the allocation and release of mti, the declarations of
 * rc/err/label, the error tests and the return are not visible in this
 * extract. */
1016 int server_register_target(struct super_block *sb)
1018 struct lustre_sb_info *lsi = s2lsi(sb);
1019 struct obd_device *mgc = lsi->lsi_mgc;
1020 struct lustre_disk_data *ldd = lsi->lsi_ldd;
1021 struct mgs_target_info *mti = NULL;
1027 if (!(lsi->lsi_flags & LSI_SERVER))
1033 rc = server_sb2mti(sb, mti);
1037 CDEBUG(D_MOUNT, "Registration %s, fs=%s, %s, index=%04x, flags=%#x\n",
1038 mti->mti_svname, mti->mti_fsname,
1039 libcfs_nid2str(mti->mti_nids[0]), mti->mti_stripe_index,
1042 /* Register the target */
1043 /* FIXME use mgc_process_config instead */
1044 rc = obd_set_info_async(mgc->u.cli.cl_mgc_mgsexp,
1045 sizeof(KEY_REGISTER_TARGET), KEY_REGISTER_TARGET,
1046 sizeof(*mti), mti, NULL);
1050 /* Always update our flags */
1051 ldd->ldd_flags = mti->mti_flags & ~LDD_F_REWRITE_LDD;
1053 /* If this flag is set, it means the MGS wants us to change our
1054 on-disk data. (So far this means just the index.) */
1055 if (mti->mti_flags & LDD_F_REWRITE_LDD) {
1058 CDEBUG(D_MOUNT, "Changing on-disk index from %#x to %#x "
1059 "for %s\n", ldd->ldd_svindex, mti->mti_stripe_index,
1061 ldd->ldd_svindex = mti->mti_stripe_index;
1062 strncpy(ldd->ldd_svname, mti->mti_svname,
1063 sizeof(ldd->ldd_svname));
1064 /* or ldd_make_sv_name(ldd); */
1065 ldd_write(&mgc->obd_lvfs_ctxt, ldd);
1066 err = fsfilt_set_label(mgc, lsi->lsi_srv_mnt->mnt_sb,
1069 CERROR("Label set error %d\n", err);
1070 label = fsfilt_get_label(mgc, lsi->lsi_srv_mnt->mnt_sb);
1072 CDEBUG(D_MOUNT, "Disk label changed to %s\n", label);
1074 /* Flush the new ldd to disk */
1075 fsfilt_sync(mgc, lsi->lsi_srv_mnt->mnt_sb);
1084 /** Start server targets: MDTs and OSTs
/* Bring up the target (MDT and/or OST) stored on the disk behind @sb.
 * Visible sequence:
 *  1. For an MDT, start the shared MDS obd under server_start_lock; for an
 *     OST, start the shared OSS obd the same way. The lock is released on
 *     both the failure and the normal path.
 *  2. Attach the MGC to this disk with server_mgc_set_fs().
 *  3. Register with the MGS. A failure is fatal only when the disk flags
 *     include LDD_F_NEED_INDEX, LDD_F_UPDATE or LDD_F_UPGRADE14; -EINVAL
 *     from the MGS is reported as a refusal; other failures are logged at
 *     D_MOUNT only.
 *  4. Register the mount under the target name so the obd setup code can
 *     look it up, then process the config log named after the target.
 *  5. Release the MGC disk with server_mgc_clear_fs().
 *  6. Look the started obd up by name; if LMD_FLG_ABORT_RECOV was given
 *     and the obd has an iocontrol method, abort recovery; finally notify
 *     the obd that the config log has been processed.
 * NOTE(review): the "is it already running" tests for MDS/OSS, the obd
 * type arguments, the labels and error exits, and the return are not
 * visible in this extract, so which of the steps above sit on error paths
 * must be checked against the full source. */
1086 static int server_start_targets(struct super_block *sb, struct vfsmount *mnt)
1088 struct obd_device *obd;
1089 struct lustre_sb_info *lsi = s2lsi(sb);
1090 struct config_llog_instance cfg;
1094 CDEBUG(D_MOUNT, "starting target %s\n", lsi->lsi_ldd->ldd_svname);
1097 /* If we're an MDT, make sure the global MDS is running */
1098 if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_MDT) {
1099 /* make sure the MDS is started */
1100 cfs_mutex_down(&server_start_lock);
1101 obd = class_name2obd(LUSTRE_MDS_OBDNAME);
1103 rc = lustre_start_simple(LUSTRE_MDS_OBDNAME,
1104 /* FIXME pre-rename, should eventually be LUSTRE_MDS_NAME */
1106 LUSTRE_MDS_OBDNAME"_uuid",
1109 cfs_mutex_up(&server_start_lock);
1110 CERROR("failed to start MDS: %d\n", rc);
1114 cfs_mutex_up(&server_start_lock);
1118 /* If we're an OST, make sure the global OSS is running */
1119 if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_OST) {
1120 /* make sure OSS is started */
1121 cfs_mutex_down(&server_start_lock);
1122 obd = class_name2obd(LUSTRE_OSS_OBDNAME);
1124 rc = lustre_start_simple(LUSTRE_OSS_OBDNAME,
1126 LUSTRE_OSS_OBDNAME"_uuid",
1129 cfs_mutex_up(&server_start_lock);
1130 CERROR("failed to start OSS: %d\n", rc);
1134 cfs_mutex_up(&server_start_lock);
1137 /* Set the mgc fs to our server disk. This allows the MGC to
1138 * read and write configs locally, in case it can't talk to the MGS. */
1139 rc = server_mgc_set_fs(lsi->lsi_mgc, sb);
1143 /* Register with MGS */
1144 rc = server_register_target(sb);
1145 if (rc && (lsi->lsi_ldd->ldd_flags &
1146 (LDD_F_NEED_INDEX | LDD_F_UPDATE | LDD_F_UPGRADE14))){
1147 CERROR("Required registration failed for %s: %d\n",
1148 lsi->lsi_ldd->ldd_svname, rc);
1150 LCONSOLE_ERROR_MSG(0x15f, "Communication error with "
1151 "the MGS. Is the MGS running?\n");
1155 if (rc == -EINVAL) {
1156 LCONSOLE_ERROR_MSG(0x160, "The MGS is refusing to allow this "
1157 "server (%s) to start. Please see messages"
1158 " on the MGS node.\n",
1159 lsi->lsi_ldd->ldd_svname);
1162 /* non-fatal error of registeration with MGS */
1164 CDEBUG(D_MOUNT, "Cannot register with MGS: %d\n", rc);
1166 /* Let the target look up the mount using the target's name
1167 (we can't pass the sb or mnt through class_process_config.) */
1168 rc = server_register_mount(lsi->lsi_ldd->ldd_svname, sb, mnt);
1172 /* Start targets using the llog named for the target */
1173 memset(&cfg, 0, sizeof(cfg));
1174 rc = lustre_process_log(sb, lsi->lsi_ldd->ldd_svname, &cfg);
1176 CERROR("failed to start server %s: %d\n",
1177 lsi->lsi_ldd->ldd_svname, rc);
1178 /* Do NOT call server_deregister_mount() here. This makes it
1179 * impossible to find mount later in cleanup time and leaves
1180 * @lsi and othder stuff leaked. -umka */
1185 /* Release the mgc fs for others to use */
1186 server_mgc_clear_fs(lsi->lsi_mgc);
1189 obd = class_name2obd(lsi->lsi_ldd->ldd_svname);
1191 CERROR("no server named %s was started\n",
1192 lsi->lsi_ldd->ldd_svname);
1196 if ((lsi->lsi_lmd->lmd_flags & LMD_FLG_ABORT_RECOV) &&
1197 (OBP(obd, iocontrol))) {
1198 obd_iocontrol(OBD_IOC_ABORT_RECOVERY,
1199 obd->obd_self_export, 0, NULL, NULL);
1202 /* log has been fully processed */
1203 obd_notify(obd, NULL, OBD_NOTIFY_CONFIG, (void *)CONFIG_LOG);
1209 /***************** lustre superblock **************/
/* Create the lustre_sb_info for @sb together with its lustre_mount_data
 * (lsi_lmd), attach it to the superblock through s2lsi_nocast(), and
 * initialise it: exclude count and both recovery timeouts are zeroed,
 * lsi_mounts starts at 1 (the setup reference, see in-code comment) and
 * lsi_flags defaults to LSI_UMOUNT_FAILOVER.
 * NOTE(review): the allocation of lsi itself, the failure handling inside
 * the lsi_lmd check and the return are not visible in this extract. */
1211 struct lustre_sb_info *lustre_init_lsi(struct super_block *sb)
1213 struct lustre_sb_info *lsi;
1219 OBD_ALLOC_PTR(lsi->lsi_lmd);
1220 if (!lsi->lsi_lmd) {
1225 lsi->lsi_lmd->lmd_exclude_count = 0;
1226 lsi->lsi_lmd->lmd_recovery_time_soft = 0;
1227 lsi->lsi_lmd->lmd_recovery_time_hard = 0;
1228 s2lsi_nocast(sb) = lsi;
1229 /* we take 1 extra ref for our setup */
1230 cfs_atomic_set(&lsi->lsi_mounts, 1);
1232 /* Default umount style */
1233 lsi->lsi_flags = LSI_UMOUNT_FAILOVER;
/* Free the lustre_sb_info attached to @sb and everything it owns, then
 * clear the superblock pointer.
 * Preconditions enforced with LASSERT: lsi is non-NULL, lsi_mounts has
 * dropped to zero, and lsi_llsbi is NULL.
 * Frees lsi_ldd (if set) and, inside lsi_lmd, each of the strings
 * lmd_dev, lmd_profile, lmd_mgssec and lmd_opts (sizes recomputed with
 * strlen + 1) plus the lmd_exclude array when lmd_exclude_count is
 * non-zero, followed by lsi_lmd and lsi themselves.
 * NOTE(review): the return statement is not visible in this extract. */
1238 static int lustre_free_lsi(struct super_block *sb)
1240 struct lustre_sb_info *lsi = s2lsi(sb);
1243 LASSERT(lsi != NULL);
1244 CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi);
1246 /* someone didn't call server_put_mount. */
1247 LASSERT(cfs_atomic_read(&lsi->lsi_mounts) == 0);
1249 if (lsi->lsi_ldd != NULL)
1250 OBD_FREE(lsi->lsi_ldd, sizeof(*lsi->lsi_ldd));
1252 if (lsi->lsi_lmd != NULL) {
1253 if (lsi->lsi_lmd->lmd_dev != NULL)
1254 OBD_FREE(lsi->lsi_lmd->lmd_dev,
1255 strlen(lsi->lsi_lmd->lmd_dev) + 1);
1256 if (lsi->lsi_lmd->lmd_profile != NULL)
1257 OBD_FREE(lsi->lsi_lmd->lmd_profile,
1258 strlen(lsi->lsi_lmd->lmd_profile) + 1);
1259 if (lsi->lsi_lmd->lmd_mgssec != NULL)
1260 OBD_FREE(lsi->lsi_lmd->lmd_mgssec,
1261 strlen(lsi->lsi_lmd->lmd_mgssec) + 1);
1262 if (lsi->lsi_lmd->lmd_opts != NULL)
1263 OBD_FREE(lsi->lsi_lmd->lmd_opts,
1264 strlen(lsi->lsi_lmd->lmd_opts) + 1);
1265 if (lsi->lsi_lmd->lmd_exclude_count)
1266 OBD_FREE(lsi->lsi_lmd->lmd_exclude,
1267 sizeof(lsi->lsi_lmd->lmd_exclude[0]) *
1268 lsi->lsi_lmd->lmd_exclude_count);
1269 OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd));
1272 LASSERT(lsi->lsi_llsbi == NULL);
1273 OBD_FREE(lsi, sizeof(*lsi));
1274 s2lsi_nocast(sb) = NULL;
1279 /* The lsi has one reference for every server that is using the disk -
1280 e.g. MDT, MGS, and potentially MGC */
/* Drop one reference on the lustre_sb_info of @sb; when lsi_mounts reaches
 * zero the lsi is released with lustre_free_lsi().
 * server_put_mount() treats a non-zero return as "that was the last put".
 * NOTE(review): the return statements themselves are not visible in this
 * extract. */
1281 static int lustre_put_lsi(struct super_block *sb)
1283 struct lustre_sb_info *lsi = s2lsi(sb);
1286 LASSERT(lsi != NULL);
1288 CDEBUG(D_MOUNT, "put %p %d\n", sb, cfs_atomic_read(&lsi->lsi_mounts));
1289 if (cfs_atomic_dec_and_test(&lsi->lsi_mounts)) {
1290 lustre_free_lsi(sb);
1296 /*************** server mount ******************/
/*
 * Summary of server_kernel_mount() as visible here:
 *  - allocates a lustre_disk_data (ldd) and one page used as the option
 *    string buffer;
 *  - pre-mounts the device (lmd_dev) as "ldiskfs" with the mount-line
 *    options, then calls ldd_parse() through a run context rooted at that
 *    mount to fill in ldd;
 *  - rebuilds the option string from ldd_mount_opts plus the mount-line
 *    options, adds MS_NOATIME | MS_NODIRATIME, and mounts again using the
 *    type given by MT_STR(ldd);
 *  - with LMD_FLG_ABORT_RECOV set, truncates LAST_RCVD on the new mount.
 * On success the option page is freed and ldd is stored in lsi->lsi_ldd
 * (released at lsi cleanup). On failure the page and ldd are freed,
 * lsi->lsi_ldd is reset to NULL and ERR_PTR(rc) is returned.
 */
1298 /** Kernel mount using mount options in MOUNT_DATA_FILE.
1299 * Since this file lives on the disk, we pre-mount using a common
1300 * type, read the file, then re-mount using the type specified in the
1303 static struct vfsmount *server_kernel_mount(struct super_block *sb)
1305 struct lvfs_run_ctxt mount_ctxt;
1306 struct lustre_sb_info *lsi = s2lsi(sb);
1307 struct lustre_disk_data *ldd;
1308 struct lustre_mount_data *lmd = lsi->lsi_lmd;
1309 struct vfsmount *mnt;
1310 char *options = NULL;
1311 unsigned long page, s_flags;
1312 struct page *__page;
1316 OBD_ALLOC(ldd, sizeof(*ldd));
1318 RETURN(ERR_PTR(-ENOMEM));
1320 /* In the past, we have always used flags = 0.
1321 Note ext3/ldiskfs can't be mounted ro. */
1322 s_flags = sb->s_flags;
1324 /* allocate memory for options */
1325 OBD_PAGE_ALLOC(__page, CFS_ALLOC_STD);
1327 GOTO(out_free, rc = -ENOMEM);
1328 page = (unsigned long)cfs_page_address(__page);
1329 options = (char *)page;
1330 memset(options, 0, CFS_PAGE_SIZE);
1332 /* mount-line options must be added for pre-mount because it may
1333 * contain mount options such as journal_dev which are required
1334 * to mount successfully the underlying filesystem */
/* The buffer was just zeroed, so appending at most CFS_PAGE_SIZE - 1
 * characters keeps the result NUL-terminated inside the page. */
1335 if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0))
1336 strncat(options, lmd->lmd_opts, CFS_PAGE_SIZE - 1);
1338 /* Pre-mount ldiskfs to read the MOUNT_DATA_FILE */
1339 CDEBUG(D_MOUNT, "Pre-mount ldiskfs %s\n", lmd->lmd_dev);
1340 mnt = ll_kern_mount("ldiskfs", s_flags, lmd->lmd_dev, (void *)options);
1343 CERROR("premount %s:%#lx ldiskfs failed: %d "
1344 "Is the ldiskfs module available?\n",
1345 lmd->lmd_dev, s_flags, rc );
/* Run context pointing at the root of the pre-mounted filesystem; it is
 * what ldd_parse() is given to read the on-disk data. */
1349 OBD_SET_CTXT_MAGIC(&mount_ctxt);
1350 mount_ctxt.pwdmnt = mnt;
1351 mount_ctxt.pwd = mnt->mnt_root;
1352 mount_ctxt.fs = get_ds();
1354 rc = ldd_parse(&mount_ctxt, ldd);
1358 CERROR("premount parse options failed: rc = %d\n", rc);
1362 /* Done with our pre-mount, now do the real mount. */
1364 /* Glom up mount options */
/* Copy into the re-zeroed page; stopping two bytes short leaves room for
 * one ',' separator and the terminating NUL. */
1365 memset(options, 0, CFS_PAGE_SIZE);
1366 strncpy(options, ldd->ldd_mount_opts, CFS_PAGE_SIZE - 2);
1368 /* Add in any mount-line options */
1369 if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0)) {
1370 int len = CFS_PAGE_SIZE - strlen(options) - 2;
1372 strcat(options, ",");
1373 strncat(options, lmd->lmd_opts, len);
1376 /* Special permanent mount flags */
1378 s_flags |= MS_NOATIME | MS_NODIRATIME;
1380 CDEBUG(D_MOUNT, "kern_mount: %s %s %s\n",
1381 MT_STR(ldd), lmd->lmd_dev, options);
1382 mnt = ll_kern_mount(MT_STR(ldd), s_flags, lmd->lmd_dev,
1386 CERROR("ll_kern_mount failed: rc = %d\n", rc);
1390 if (lmd->lmd_flags & LMD_FLG_ABORT_RECOV)
1391 simple_truncate(mnt->mnt_sb->s_root, mnt, LAST_RCVD,
/* Success: the option page is no longer needed; ldd is handed to the lsi. */
1394 OBD_PAGE_FREE(__page);
1395 lsi->lsi_ldd = ldd; /* freed at lsi cleanup */
1396 CDEBUG(D_SUPER, "%s: mnt = %p\n", lmd->lmd_dev, mnt);
/* Failure: release both allocations and report rc as an ERR_PTR. */
1401 OBD_PAGE_FREE(__page);
1402 OBD_FREE(ldd, sizeof(*ldd));
1403 lsi->lsi_ldd = NULL;
1404 RETURN(ERR_PTR(rc));
/*
 * server_wait_finished(): loop while @mnt still has more than one
 * reference. Each pass sleeps in an interruptible, timed wait
 * (cfs_time_seconds(3)) for mnt_count to drop to 1, with the signal mask
 * narrowed through l_w_e_set_sigs(sigmask(SIGKILL)) and restored
 * afterwards. A console warning is printed whenever the waited counter is
 * a non-zero multiple of 30, and an emergency message reports an
 * interrupted umount.
 */
1407 /** Wait here forever until the mount refcount is 0 before completing umount,
1408 * else we risk dereferencing a null pointer.
1409 * LNET may take e.g. 165s before killing zombies.
1411 static void server_wait_finished(struct vfsmount *mnt)
1415 cfs_sigset_t blocked;
1417 cfs_waitq_init(&waitq);
1419 while (atomic_read(&mnt->mnt_count) > 1) {
1420 if (waited && (waited % 30 == 0))
1421 LCONSOLE_WARN("Mount still busy with %d refs after "
1423 atomic_read(&mnt->mnt_count),
1425 /* Cannot use l_event_wait() for an interruptible sleep. */
1427 blocked = l_w_e_set_sigs(sigmask(SIGKILL));
1428 cfs_waitq_wait_event_interruptible_timeout(
1430 (atomic_read(&mnt->mnt_count) == 1),
1431 cfs_time_seconds(3),
1433 cfs_block_sigs(blocked);
1435 LCONSOLE_EMERG("Danger: interrupted umount %s with "
1438 atomic_read(&mnt->mnt_count));
/*
 * server_put_super(): umount path for a server superblock. Visible steps:
 *  1. copy the service name into tmpname for the final console message
 *     (replaced by "MGS" for an MDT mounted with LMD_FLG_NOSVC);
 *  2. unless NOSVC is set, for MDT/OST targets: end the config log,
 *     remember the profile's lp_dt name in extraname, look up the target
 *     obd and clean it up manually;
 *  3. deregister the mount and, for an MGS not started with NOMGS, stop
 *     the MGS;
 *  4. lustre_common_put_super(), then wait for the mount to become idle;
 *  5. stop the servers using the ldd/lsi flags saved at entry, and clean
 *     up any obd still registered under extraname.
 * lddflags and lsiflags are copied up front and used after
 * lustre_common_put_super() instead of reading lsi again.
 *
 * NOTE(review): tmpname is written by memcpy() immediately after
 * OBD_ALLOC() with no NULL check, and extraname is written by strcpy() in
 * the same way. Other callers in this file do check OBD_ALLOC results, so
 * an allocation failure here looks like a NULL dereference - confirm and
 * add checks.
 */
1445 /** Start the shutdown of servers at umount.
1447 static void server_put_super(struct super_block *sb)
1449 struct lustre_sb_info *lsi = s2lsi(sb);
1450 struct obd_device *obd;
1451 struct vfsmount *mnt = lsi->lsi_srv_mnt;
1452 char *tmpname, *extraname = NULL;
1454 int lddflags = lsi->lsi_ldd->ldd_flags;
1455 int lsiflags = lsi->lsi_flags;
1458 LASSERT(lsiflags & LSI_SERVER);
1460 tmpname_sz = strlen(lsi->lsi_ldd->ldd_svname) + 1;
1461 OBD_ALLOC(tmpname, tmpname_sz);
1462 memcpy(tmpname, lsi->lsi_ldd->ldd_svname, tmpname_sz);
1463 CDEBUG(D_MOUNT, "server put_super %s\n", tmpname);
1464 if (IS_MDT(lsi->lsi_ldd) && (lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC))
1465 snprintf(tmpname, tmpname_sz, "MGS");
1467 /* Stop the target */
1468 if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
1469 (IS_MDT(lsi->lsi_ldd) || IS_OST(lsi->lsi_ldd))) {
1470 struct lustre_profile *lprof = NULL;
1472 /* tell the mgc to drop the config log */
1473 lustre_end_log(sb, lsi->lsi_ldd->ldd_svname, NULL);
1475 /* COMPAT_146 - profile may get deleted in mgc_cleanup.
1476 If there are any setup/cleanup errors, save the lov
1477 name for safety cleanup later. */
1478 lprof = class_get_profile(lsi->lsi_ldd->ldd_svname);
1479 if (lprof && lprof->lp_dt) {
/* NOTE(review): extraname is not checked for NULL before strcpy(). */
1480 OBD_ALLOC(extraname, strlen(lprof->lp_dt) + 1);
1481 strcpy(extraname, lprof->lp_dt);
1484 obd = class_name2obd(lsi->lsi_ldd->ldd_svname);
1486 CDEBUG(D_MOUNT, "stopping %s\n", obd->obd_name);
1487 if (lsi->lsi_flags & LSI_UMOUNT_FAILOVER)
1489 /* We can't seem to give an error return code
1490 * to .put_super, so we better make sure we clean up! */
1492 class_manual_cleanup(obd);
1494 CERROR("no obd %s\n", lsi->lsi_ldd->ldd_svname);
1495 server_deregister_mount(lsi->lsi_ldd->ldd_svname);
1499 /* If they wanted the mgs to stop separately from the mdt, they
1500 should have put it on a different device. */
1501 if (IS_MGS(lsi->lsi_ldd)) {
1502 /* if MDS start with --nomgs, don't stop MGS then */
1503 if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS))
1504 server_stop_mgs(sb);
1507 /* Clean the mgc and sb */
1508 lustre_common_put_super(sb);
1510 /* Wait for the targets to really clean up - can't exit (and let the
1511 sb get destroyed) while the mount is still in use */
1512 server_wait_finished(mnt);
1514 /* drop the One True Mount */
1517 /* Stop the servers (MDS, OSS) if no longer needed. We must wait
1518 until the target is really gone so that our type refcount check
1520 server_stop_servers(lddflags, lsiflags);
1522 /* In case of startup or cleanup err, stop related obds */
1524 obd = class_name2obd(extraname);
1526 CWARN("Cleaning orphaned obd %s\n", extraname);
1528 class_manual_cleanup(obd);
1530 OBD_FREE(extraname, strlen(extraname) + 1);
1533 LCONSOLE_WARN("server umount %s complete\n", tmpname);
1534 OBD_FREE(tmpname, tmpname_sz);
/*
 * server_umount_begin(): switch the lsi from the default failover-style
 * umount to a forced one by clearing LSI_UMOUNT_FAILOVER and setting
 * LSI_UMOUNT_FORCE. The signature depends on HAVE_UMOUNTBEGIN_VFSMOUNT:
 * with it the hook receives a vfsmount plus flags, takes the superblock
 * from the vfsmount and tests MNT_FORCE; without it the hook receives the
 * superblock directly.
 */
1538 /** Called only for 'umount -f'
1540 #ifdef HAVE_UMOUNTBEGIN_VFSMOUNT
1541 static void server_umount_begin(struct vfsmount *vfsmnt, int flags)
1543 struct super_block *sb = vfsmnt->mnt_sb;
1545 static void server_umount_begin(struct super_block *sb)
1548 struct lustre_sb_info *lsi = s2lsi(sb);
1551 #ifdef HAVE_UMOUNTBEGIN_VFSMOUNT
1552 if (!(flags & MNT_FORCE)) {
1558 CDEBUG(D_MOUNT, "umount -f\n");
1559 /* umount = failover
1561 no third way to do non-force, non-failover */
1562 lsi->lsi_flags &= ~LSI_UMOUNT_FAILOVER;
1563 lsi->lsi_flags |= LSI_UMOUNT_FORCE;
/*
 * statfs for a server superblock. If the backing mount (lsi_srv_mnt) has a
 * superblock that implements statfs, the call is forwarded to it - with
 * the mount's root dentry or its superblock, depending on
 * HAVE_STATFS_DENTRY_PARAM - and f_type is then overwritten with this
 * superblock's magic. The trailing assignments fill f_type, f_bsize and
 * f_namelen from this superblock and NAME_MAX.
 * NOTE(review): those trailing assignments look like the fallback for a
 * missing backing statfs; the branch lines are not visible, so confirm.
 */
1567 #ifndef HAVE_STATFS_DENTRY_PARAM
1568 static int server_statfs (struct super_block *sb, cfs_kstatfs_t *buf)
1571 static int server_statfs (struct dentry *dentry, cfs_kstatfs_t *buf)
1573 struct super_block *sb = dentry->d_sb;
1575 struct vfsmount *mnt = s2lsi(sb)->lsi_srv_mnt;
1578 if (mnt && mnt->mnt_sb && mnt->mnt_sb->s_op->statfs) {
1579 #ifdef HAVE_STATFS_DENTRY_PARAM
1580 int rc = mnt->mnt_sb->s_op->statfs(mnt->mnt_root, buf);
1582 int rc = mnt->mnt_sb->s_op->statfs(mnt->mnt_sb, buf);
1585 buf->f_type = sb->s_magic;
1591 buf->f_type = sb->s_magic;
1592 buf->f_bsize = sb->s_blocksize;
1598 buf->f_namelen = NAME_MAX;
/*
 * Superblock operation table installed by server_fill_super_common():
 * only put_super, umount_begin and statfs are provided.
 */
1602 /** The operations we support directly on the superblock:
1603 * mount, umount, and df.
1605 static struct super_operations server_ops =
1607 .put_super = server_put_super,
1608 .umount_begin = server_umount_begin, /* umount -f */
1609 .statfs = server_statfs,
/* NOTE(review): log2() relies on cfs_ffz() returning the index of the first
 * zero bit, so cfs_ffz(~n) is the lowest set bit of n - exact only for
 * powers of two such as the 4096 used below. Confirm cfs_ffz semantics. */
1612 #define log2(n) cfs_ffz(~(n))
1613 #define LUSTRE_SUPER_MAGIC 0x0BD00BD1
/*
 * Fill in the generic fields of a server superblock: fixed 4096-byte block
 * size, Lustre magic, zero s_maxbytes, read-only flag and server_ops.
 * Then allocate a root inode, mark it as a directory, and attach a root
 * dentry to it. Failures of either allocation are logged.
 */
1615 static int server_fill_super_common(struct super_block *sb)
1617 struct inode *root = 0;
1620 CDEBUG(D_MOUNT, "Server sb, dev=%d\n", (int)sb->s_dev);
1622 sb->s_blocksize = 4096;
1623 sb->s_blocksize_bits = log2(sb->s_blocksize);
1624 sb->s_magic = LUSTRE_SUPER_MAGIC;
1625 sb->s_maxbytes = 0; //PAGE_CACHE_MAXBYTES;
1626 sb->s_flags |= MS_RDONLY;
1627 sb->s_op = &server_ops;
1629 root = new_inode(sb);
1631 CERROR("Can't make root inode\n");
1635 /* returns -EIO for every operation */
1636 /* make_bad_inode(root); -- badness - can't umount */
1637 /* apparently we need to be a directory for the mount to finish */
1638 root->i_mode = S_IFDIR;
1640 sb->s_root = d_alloc_root(root);
1642 CERROR("Can't make root dentry\n");
/*
 * server_fill_super() start-up order, as visible below:
 *  1. server_kernel_mount() mounts the backing device; the vfsmount is
 *     kept in lsi_srv_mnt and lsi_ldd must be set afterwards;
 *  2. refuse to continue if an obd named ldd_svname already exists
 *     (double mount);
 *  3. start the MGS when the disk is an MGS and LMD_FLG_NOMGS is not set;
 *  4. start the MGC;
 *  5. start the targets for OST/MDT disks unless LMD_FLG_NOSVC is set;
 *  6. server_fill_super_common() completes the superblock.
 * The failure path at the end calls server_put_super() for a full cleanup.
 */
1650 /** Fill in the superblock info for a Lustre server.
1651 * Mount the device with the correct options.
1652 * Read the on-disk config file.
1653 * Start the services.
1655 static int server_fill_super(struct super_block *sb)
1657 struct lustre_sb_info *lsi = s2lsi(sb);
1658 struct vfsmount *mnt;
1662 /* the One True Mount */
1663 mnt = server_kernel_mount(sb);
1666 CERROR("Unable to mount device %s: %d\n",
1667 lsi->lsi_lmd->lmd_dev, rc);
1671 lsi->lsi_srv_mnt = mnt;
1673 LASSERT(lsi->lsi_ldd);
1674 CDEBUG(D_MOUNT, "Found service %s for fs '%s' on device %s\n",
1675 lsi->lsi_ldd->ldd_svname, lsi->lsi_ldd->ldd_fsname,
1676 lsi->lsi_lmd->lmd_dev);
1678 if (class_name2obd(lsi->lsi_ldd->ldd_svname)) {
1679 LCONSOLE_ERROR_MSG(0x161, "The target named %s is already "
1680 "running. Double-mount may have compromised"
1681 " the disk journal.\n",
1682 lsi->lsi_ldd->ldd_svname);
1688 /* Start MGS before MGC */
1689 if (IS_MGS(lsi->lsi_ldd) && !(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)){
1690 rc = server_start_mgs(sb);
1695 /* Start MGC before servers */
1696 rc = lustre_start_mgc(sb);
1700 /* Set up all obd devices for service */
1701 if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
1702 (IS_OST(lsi->lsi_ldd) || IS_MDT(lsi->lsi_ldd))) {
1703 rc = server_start_targets(sb, mnt);
1705 CERROR("Unable to start targets: %d\n", rc);
1708 /* FIXME overmount client here,
1709 or can we just start a client log and client_fill_super on this sb?
1710 We need to make sure server_put_super gets called too - ll_put_super
1711 calls lustre_common_put_super; check there for LSI_SERVER flag,
1713 Probably should start client from new thread so we can return.
1714 Client will not finish until all servers are connected.
1715 Note - MGS-only server does NOT get a client, since there is no
1716 lustre fs associated - the MGS is for all lustre fs's */
1719 rc = server_fill_super_common(sb);
1725 /* We jump here in case of failure while starting targets or MGS.
1726 * In this case we can't just put @mnt and have to do real cleanup
1727 * with stopping targets, etc. */
1728 server_put_super(sb);
1732 /* Get the index from the obd name.
1733 rc = server type, or
1735 if endptr isn't NULL it is set to end of name */
/*
 * The type is taken from the three characters after the last '-' in
 * @svname ("MDT" or "OST"); a trailing "-" LUSTRE_MDC_NAME component is
 * skipped first. The text after the type is either "all", which returns
 * the type ORed with LDD_F_SV_ALL, or a hexadecimal index parsed with
 * simple_strtoul(), which also sets *endptr.
 */
1736 int server_name2index(char *svname, __u32 *idx, char **endptr)
1738 unsigned long index;
1740 char *dash = strrchr(svname, '-');
1744 /* interpret <fsname>-MDTXXXXX-mdc as mdt, the better way is to pass
1745 * in the fsname, then determine the server index */
1746 if (!strcmp(LUSTRE_MDC_NAME, dash + 1)) {
/* Empty loop body: walk back to the previous '-' or the start of the name.
 * NOTE(review): if svname is just "-" followed by LUSTRE_MDC_NAME, dash can
 * already be at svname before this scan - confirm that is handled. */
1748 for (; dash > svname && *dash != '-'; dash--);
1753 if (strncmp(dash + 1, "MDT", 3) == 0)
1754 rc = LDD_F_SV_TYPE_MDT;
1755 else if (strncmp(dash + 1, "OST", 3) == 0)
1756 rc = LDD_F_SV_TYPE_OST;
/* dash + 4 skips the '-' and the three-letter type tag matched above. */
1759 if (strcmp(dash + 4, "all") == 0)
1760 return rc | LDD_F_SV_ALL;
1762 index = simple_strtoul(dash + 4, endptr, 16);
1767 /*************** mount common between server and client ***************/
/*
 * Umount work shared by clients and servers: stop (drop a reference on)
 * the MGC for @sb and then release the reference on the mounted disk.
 * An MGC stop result other than 0 or -ENOENT is examined further: one
 * branch logs an error, and a "busy" result is only logged at debug level
 * because another obd still needs the MGC.
 */
1770 int lustre_common_put_super(struct super_block *sb)
1775 CDEBUG(D_MOUNT, "dropping sb %p\n", sb);
1777 /* Drop a ref to the MGC */
1778 rc = lustre_stop_mgc(sb);
1779 if (rc && (rc != -ENOENT)) {
1781 CERROR("Can't stop MGC: %d\n", rc);
1784 /* BUSY just means that there's some other obd that
1785 needs the mgc. Let him clean it up. */
1786 CDEBUG(D_MOUNT, "MGC still in use\n");
1788 /* Drop a ref to the mounted disk */
/*
 * Dump the parsed mount data through PRINT_CMD/PRINT_MASK: the profile
 * (clients only), device, flags, options, the soft and hard recovery
 * times when non-zero, and every entry of the OST exclusion list.
 */
1794 static void lmd_print(struct lustre_mount_data *lmd)
1798 PRINT_CMD(PRINT_MASK, " mount data:\n");
1799 if (lmd_is_client(lmd))
1800 PRINT_CMD(PRINT_MASK, "profile: %s\n", lmd->lmd_profile);
1801 PRINT_CMD(PRINT_MASK, "device: %s\n", lmd->lmd_dev);
1802 PRINT_CMD(PRINT_MASK, "flags: %x\n", lmd->lmd_flags);
1805 PRINT_CMD(PRINT_MASK, "options: %s\n", lmd->lmd_opts);
1807 if (lmd->lmd_recovery_time_soft)
1808 PRINT_CMD(PRINT_MASK, "recovery time soft: %d\n",
1809 lmd->lmd_recovery_time_soft);
1811 if (lmd->lmd_recovery_time_hard)
1812 PRINT_CMD(PRINT_MASK, "recovery time hard: %d\n",
1813 lmd->lmd_recovery_time_hard);
1815 for (i = 0; i < lmd->lmd_exclude_count; i++) {
1816 PRINT_CMD(PRINT_MASK, "exclude %d: OST%04x\n", i,
1817 lmd->lmd_exclude[i]);
1821 /* Is this server on the exclusion list */
/*
 * @svname is decoded with server_name2index(); anything that is not an
 * OST is never excluded. For an OST, its index is compared against each
 * entry of the mount data's lmd_exclude array, and a match is reported
 * with a warning.
 */
1822 int lustre_check_exclusion(struct super_block *sb, char *svname)
1824 struct lustre_sb_info *lsi = s2lsi(sb);
1825 struct lustre_mount_data *lmd = lsi->lsi_lmd;
1830 rc = server_name2index(svname, &index, NULL);
1831 if (rc != LDD_F_SV_TYPE_OST)
1832 /* Only exclude OSTs */
1835 CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname,
1836 index, lmd->lmd_exclude_count, lmd->lmd_dev);
1838 for(i = 0; i < lmd->lmd_exclude_count; i++) {
1839 if (index == lmd->lmd_exclude[i]) {
1840 CWARN("Excluding %s (on exclusion list)\n", svname);
1847 /* mount -v -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */
/*
 * Build lmd->lmd_exclude from the value of an "exclude=" option. @ptr
 * points at the '='. Names are decoded with server_name2index(); only OST
 * indices are recorded, other names are logged and ignored. Indices are
 * collected in a temporary array sized from the option length, then copied
 * into a permanent array of exactly lmd_exclude_count entries. If that
 * permanent allocation fails the count is reset to 0. The temporary array
 * is always freed.
 */
1848 static int lmd_make_exclusion(struct lustre_mount_data *lmd, char *ptr)
1850 char *s1 = ptr, *s2;
1851 __u32 index, *exclude_list;
1855 /* The shortest an ost name can be is 8 chars: -OST0000.
1856 We don't actually know the fsname at this time, so in fact
1857 a user could specify any fsname. */
1858 devmax = strlen(ptr) / 8 + 1;
1860 /* temp storage until we figure out how many we have */
1861 OBD_ALLOC(exclude_list, sizeof(index) * devmax);
1865 /* we enter this fn pointing at the '=' */
1866 while (*s1 && *s1 != ' ' && *s1 != ',') {
1868 rc = server_name2index(s1, &index, &s2);
1870 CERROR("Can't parse server name '%s'\n", s1);
1873 if (rc == LDD_F_SV_TYPE_OST)
1874 exclude_list[lmd->lmd_exclude_count++] = index;
1876 CDEBUG(D_MOUNT, "ignoring exclude %.7s\n", s1);
1878 /* now we are pointing at ':' (next exclude)
1879 or ',' (end of excludes) */
/* Stop before writing past the devmax entries of the temporary array. */
1880 if (lmd->lmd_exclude_count >= devmax)
1883 if (rc >= 0) /* non-err */
1886 if (lmd->lmd_exclude_count) {
1887 /* permanent, freed in lustre_free_lsi */
1888 OBD_ALLOC(lmd->lmd_exclude, sizeof(index) *
1889 lmd->lmd_exclude_count);
1890 if (lmd->lmd_exclude) {
1891 memcpy(lmd->lmd_exclude, exclude_list,
1892 sizeof(index) * lmd->lmd_exclude_count);
1895 lmd->lmd_exclude_count = 0;
1898 OBD_FREE(exclude_list, sizeof(index) * devmax);
/*
 * Store the value of a "mgssec=" option in lmd->lmd_mgssec. @ptr points
 * at the value; it runs up to the next ',' or, when there is none, to the
 * end of the string. Any previously stored value is freed first. The new
 * copy is allocated with length + 1 bytes and explicitly NUL-terminated.
 */
1902 static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr)
1907 if (lmd->lmd_mgssec != NULL) {
1908 OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1);
1909 lmd->lmd_mgssec = NULL;
1912 tail = strchr(ptr, ',');
1914 length = strlen(ptr);
1916 length = tail - ptr;
1918 OBD_ALLOC(lmd->lmd_mgssec, length + 1);
1919 if (lmd->lmd_mgssec == NULL)
1922 memcpy(lmd->lmd_mgssec, ptr, length);
1923 lmd->lmd_mgssec[length] = '\0';
/*
 * lmd_parse(): turn the mount option string into a lustre_mount_data.
 *  - Missing options, or a buffer whose first word matches LMD_MAGIC (old
 *    binary mount data), are reported on the console.
 *  - Recognised options: abort_recov, recovery_time_soft=,
 *    recovery_time_hard= (both raised to at least time_min), nosvc, nomgs,
 *    writeconf, mgssec=, exclude= and device=.
 *  - The device name is mandatory. If it contains ":/" the mount is a
 *    client and the profile becomes "<fsname>-client".
 *  - The device name is copied to lmd_dev; after trailing commas and
 *    spaces are trimmed, a non-empty option string is copied to lmd_opts.
 * lmd_profile, lmd_dev and lmd_opts are allocated here and freed in
 * lustre_free_lsi.
 */
1927 /** Parse mount line options
1928 * e.g. mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre
1929 * dev is passed as device=uml1:/lustre by mount.lustre
1931 static int lmd_parse(char *options, struct lustre_mount_data *lmd)
1933 char *s1, *s2, *devname = NULL;
1934 struct lustre_mount_data *raw = (struct lustre_mount_data *)options;
1940 LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that "
1941 "/sbin/mount.lustre is installed.\n");
1945 /* Options should be a string - try to detect old lmd data */
1946 if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) {
1947 LCONSOLE_ERROR_MSG(0x163, "You're using an old version of "
1948 "/sbin/mount.lustre. Please install "
1949 "version %s\n", LUSTRE_VERSION_STRING);
1952 lmd->lmd_magic = LMD_MAGIC;
1954 /* Set default flags here */
1959 int time_min = 2 * (CONNECTION_SWITCH_MAX +
1960 2 * INITIAL_CONNECT_TIMEOUT);
1962 /* Skip whitespace and extra commas */
1963 while (*s1 == ' ' || *s1 == ',')
1966 /* Client options are parsed in ll_options: eg. flock,
1969 /* Parse non-ldiskfs options here. Rather than modifying
1970 ldiskfs, we just zero these out here */
1971 if (strncmp(s1, "abort_recov", 11) == 0) {
1972 lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
1974 } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) {
1975 lmd->lmd_recovery_time_soft = max_t(int,
1976 simple_strtoul(s1 + 19, NULL, 10), time_min);
1978 } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) {
1979 lmd->lmd_recovery_time_hard = max_t(int,
1980 simple_strtoul(s1 + 19, NULL, 10), time_min);
1982 } else if (strncmp(s1, "nosvc", 5) == 0) {
1983 lmd->lmd_flags |= LMD_FLG_NOSVC;
1985 } else if (strncmp(s1, "nomgs", 5) == 0) {
1986 lmd->lmd_flags |= LMD_FLG_NOMGS;
1988 } else if (strncmp(s1, "writeconf", 9) == 0) {
1989 lmd->lmd_flags |= LMD_FLG_WRITECONF;
1991 } else if (strncmp(s1, "mgssec=", 7) == 0) {
1992 rc = lmd_parse_mgssec(lmd, s1 + 7);
1996 /* ost exclusion list */
1997 } else if (strncmp(s1, "exclude=", 8) == 0) {
/* s1 + 7 is deliberate: lmd_make_exclusion expects to start at the '='. */
1998 rc = lmd_make_exclusion(lmd, s1 + 7);
2003 /* Linux 2.4 doesn't pass the device, so we stuck it at the
2004 end of the options. */
2005 else if (strncmp(s1, "device=", 7) == 0) {
2007 /* terminate options right before device. device
2008 must be the last one. */
2014 s2 = strchr(s1, ',');
2022 memmove(s1, s2, strlen(s2) + 1);
2028 LCONSOLE_ERROR_MSG(0x164, "Can't find the device name "
2029 "(need mount option 'device=...')\n");
2033 s1 = strstr(devname, ":/");
2036 lmd->lmd_flags = LMD_FLG_CLIENT;
2037 /* Remove leading /s from fsname */
2038 while (*++s1 == '/') ;
2039 /* Freed in lustre_free_lsi */
/* + 8 covers the 7 characters of "-client" plus the terminating NUL. */
2040 OBD_ALLOC(lmd->lmd_profile, strlen(s1) + 8);
2041 if (!lmd->lmd_profile)
2043 sprintf(lmd->lmd_profile, "%s-client", s1);
2046 /* Freed in lustre_free_lsi */
2047 OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1);
2050 strcpy(lmd->lmd_dev, devname);
2052 /* Save mount options */
2053 s1 = options + strlen(options) - 1;
2054 while (s1 >= options && (*s1 == ',' || *s1 == ' '))
2056 if (*options != 0) {
2057 /* Freed in lustre_free_lsi */
2058 OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1);
2061 strcpy(lmd->lmd_opts, options);
2065 lmd->lmd_magic = LMD_MAGIC;
2070 CERROR("Bad mount options %s\n", options);
/*
 * lustre_fill_super() flow, as visible below:
 *  - create the lsi for @sb with lustre_init_lsi();
 *  - parse @data with lmd_parse(); a parse failure yields -EINVAL;
 *  - client mounts (lmd_is_client): require a registered client_fill_super
 *    callback, start the MGC, then call the callback, which is responsible
 *    for calling lustre_common_put_super on failure;
 *  - server mounts: set LSI_SERVER and call server_fill_super(), which
 *    calls server_put_super on failure.
 * The lsi is not released here on error because the fill_super callee has
 * already done so. The outcome is logged either way.
 */
2075 /** This is the entry point for the mount call into Lustre.
2076 * This is called when a server or client is mounted,
2077 * and this is where we start setting things up.
2078 * @param data Mount options (e.g. -o flock,abort_recov)
2080 int lustre_fill_super(struct super_block *sb, void *data, int silent)
2082 struct lustre_mount_data *lmd;
2083 struct lustre_sb_info *lsi;
2087 CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb);
2089 lsi = lustre_init_lsi(sb);
2095 * Disable lockdep during mount, because mount locking patterns are
2100 /* Figure out the lmd from the mount options */
2101 if (lmd_parse((char *)data, lmd)) {
2103 GOTO(out, rc = -EINVAL);
2106 if (lmd_is_client(lmd)) {
2107 CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile);
2108 if (!client_fill_super) {
2109 LCONSOLE_ERROR_MSG(0x165, "Nothing registered for "
2110 "client mount! Is the 'lustre' "
2111 "module loaded?\n");
2115 rc = lustre_start_mgc(sb);
2120 /* Connect and start */
2121 /* (should always be ll_fill_super) */
2122 rc = (*client_fill_super)(sb);
2123 /* c_f_s will call lustre_common_put_super on failure */
2126 CDEBUG(D_MOUNT, "Mounting server from %s\n", lmd->lmd_dev);
2127 lsi->lsi_flags |= LSI_SERVER;
2128 rc = server_fill_super(sb);
2129 /* s_f_s calls lustre_start_mgc after the mount because we need
2130 the MGS nids which are stored on disk. Plus, we may
2131 need to start the MGS first. */
2132 /* s_f_s will call server_put_super on failure */
2135 /* If error happens in fill_super() call, @lsi will be killed there.
2136 * This is why we do not put it here. */
/* s2lsi(sb) is re-read because the lsi, and the lmd it owns, may already
 * have been freed by the failing fill_super path. */
2140 CERROR("Unable to mount %s (%d)\n",
2141 s2lsi(sb) ? lmd->lmd_dev : "", rc);
2143 CDEBUG(D_SUPER, "Mount %s complete\n",
2151 /* We can't call ll_fill_super by name because it lives in a module that
2152 must be loaded after this one. */
/* Record @cfs as the callback that lustre_fill_super() invokes for client
 * mounts. The pointer is stored in the file-scope client_fill_super. */
2153 void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb))
2155 client_fill_super = cfs;
/* Record @cfs as the callback that lustre_kill_super() invokes for
 * non-server superblocks. The pointer is stored in kill_super_cb. */
2158 void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb))
2160 kill_super_cb = cfs;
2163 /***************** FS registration ******************/
/*
 * get_sb hook for the "lustre" filesystem type. Both variants delegate to
 * get_sb_nodev() with lustre_fill_super as the fill callback. Kernels
 * older than 2.6.18 return the super_block; newer ones take an extra
 * vfsmount argument and return an int. @devname is not used here.
 */
2165 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18))
2166 struct super_block * lustre_get_sb(struct file_system_type *fs_type,
2167 int flags, const char *devname, void * data)
2169 return get_sb_nodev(fs_type, flags, data, lustre_fill_super);
2172 int lustre_get_sb(struct file_system_type *fs_type,
2173 int flags, const char *devname, void * data,
2174 struct vfsmount *mnt)
2176 return get_sb_nodev(fs_type, flags, data, lustre_fill_super, mnt);
/*
 * kill_sb hook. The registered kill_super_cb is called only when a
 * callback exists, the superblock still has an lsi, and that lsi is not
 * flagged LSI_SERVER. kill_anon_super() then releases the superblock.
 */
2180 void lustre_kill_super(struct super_block *sb)
2182 struct lustre_sb_info *lsi = s2lsi(sb);
2184 if (kill_super_cb && lsi && !(lsi->lsi_flags & LSI_SERVER))
2185 (*kill_super_cb)(sb);
2187 kill_anon_super(sb);
/*
 * Filesystem type descriptor passed to register_filesystem(): mounts go
 * through lustre_get_sb and teardown through lustre_kill_super. fs_flags
 * combines FS_BINARY_MOUNTDATA, FS_REQUIRES_DEV and LL_RENAME_DOES_D_MOVE,
 * plus an FS_HAS_FIEMAP-conditional entry.
 */
2190 /** Register the "lustre" fs type
2192 struct file_system_type lustre_fs_type = {
2193 .owner = THIS_MODULE,
2195 .get_sb = lustre_get_sb,
2196 .kill_sb = lustre_kill_super,
2197 .fs_flags = FS_BINARY_MOUNTDATA | FS_REQUIRES_DEV |
2198 #ifdef FS_HAS_FIEMAP
2201 LL_RENAME_DOES_D_MOVE,
// Register lustre_fs_type with the kernel; returns register_filesystem()'s result.
2204 int lustre_register_fs(void)
2206 return register_filesystem(&lustre_fs_type);
// Unregister lustre_fs_type; returns unregister_filesystem()'s result.
2209 int lustre_unregister_fs(void)
2211 return unregister_filesystem(&lustre_fs_type);
2214 EXPORT_SYMBOL(lustre_register_client_fill_super);
2215 EXPORT_SYMBOL(lustre_register_kill_super_cb);
2216 EXPORT_SYMBOL(lustre_common_put_super);
2217 EXPORT_SYMBOL(lustre_process_log);
2218 EXPORT_SYMBOL(lustre_end_log);
2219 EXPORT_SYMBOL(server_get_mount);
2220 EXPORT_SYMBOL(server_get_mount_2);
2221 EXPORT_SYMBOL(server_put_mount);
2222 EXPORT_SYMBOL(server_put_mount_2);
2223 EXPORT_SYMBOL(server_register_target);
2224 EXPORT_SYMBOL(server_name2index);
2225 EXPORT_SYMBOL(server_mti_print);
2226 EXPORT_SYMBOL(do_lcfg);