4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Whamcloud, Inc.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lustre/obdclass/obd_mount.c
38 * Client/server mount routines
40 * Author: Nathan Rutman <nathan@clusterfs.com>
/* This file's debug subsystem is S_CLASS. */
#define DEBUG_SUBSYSTEM S_CLASS
/* Debug mask used for the mount-related CDEBUG messages in this file.
 * Parenthesised so that the OR cannot regroup with operators that surround
 * the macro at its point of use.  D_WARNING stays disabled, as before. */
#define D_MOUNT (D_SUPER | D_CONFIG /* | D_WARNING */)
/* ldd_print() and server_mti_print() print through PRINT_CMD at PRINT_MASK. */
#define PRINT_CMD CDEBUG
#define PRINT_MASK (D_SUPER | D_CONFIG)
51 #include <lustre_fsfilt.h>
52 #include <obd_class.h>
53 #include <lustre/lustre_user.h>
54 #include <linux/version.h>
55 #include <lustre_log.h>
56 #include <lustre_disk.h>
57 #include <lustre_param.h>
58 #ifdef HAVE_KERNEL_LOCKED
59 #include <linux/smp_lock.h>
/* Function-pointer hooks, both initialised to NULL.  Nothing in the visible
 * part of this file assigns or calls them. */
62 static int (*client_fill_super)(struct super_block *sb,
63 struct vfsmount *mnt) = NULL;
64 static void (*kill_super_cb)(struct super_block *sb) = NULL;
66 /*********** mount lookup *********/
/* Mutex held by the register/deregister/get/put helpers below around their
 * lookups in, and updates of, server_mount_info_list. */
68 CFS_DEFINE_MUTEX(lustre_mount_info_lock);
/* Registered server mounts: struct lustre_mount_info entries linked through
 * lmi_list_chain and looked up by lmi_name. */
69 static CFS_LIST_HEAD(server_mount_info_list);
/* Linear search of server_mount_info_list for the entry whose lmi_name
 * equals name (strcmp).  Takes no lock itself; most callers hold
 * lustre_mount_info_lock around the call.
 * NOTE(review): the statements that hand the result back are not part of
 * this listing.  Callers test the result for "found", so presumably the
 * matching entry or NULL is returned -- confirm. */
71 static struct lustre_mount_info *server_find_mount(const char *name)
74 struct lustre_mount_info *lmi;
77 cfs_list_for_each(tmp, &server_mount_info_list) {
/* recover the containing lustre_mount_info from the list cursor */
78 lmi = cfs_list_entry(tmp, struct lustre_mount_info,
80 if (strcmp(name, lmi->lmi_name) == 0)
86 /* we must register an obd for a mount before we call the setup routine.
87 *_setup will call lustre_get_mount to get the mnt struct
88 by obd_name, since we can't pass the pointer to setup. */
/* Add a registration for obd name to server_mount_info_list.
 *
 * Allocates a lustre_mount_info plus a private copy of name and links the
 * entry onto the list under lustre_mount_info_lock.  If the name is already
 * on the list, both allocations are released and "Already registered" is
 * logged instead.
 * NOTE(review): the remaining parameters, the stores of the sb/mnt fields
 * and the return values are not visible in this listing. */
89 static int server_register_mount(const char *name, struct super_block *sb,
92 struct lustre_mount_info *lmi;
98 OBD_ALLOC(lmi, sizeof(*lmi));
/* + 1 for the terminating NUL that strcpy() copies below */
101 OBD_ALLOC(name_cp, strlen(name) + 1);
/* presumably the name_cp allocation-failure path: undo the first allocation */
103 OBD_FREE(lmi, sizeof(*lmi));
106 strcpy(name_cp, name);
/* the duplicate check and the insertion share one lock hold */
108 cfs_mutex_lock(&lustre_mount_info_lock);
110 if (server_find_mount(name)) {
111 cfs_mutex_unlock(&lustre_mount_info_lock);
112 OBD_FREE(lmi, sizeof(*lmi));
/* name_cp was sized from name, so strlen(name) + 1 is its allocation size */
113 OBD_FREE(name_cp, strlen(name) + 1);
114 CERROR("Already registered %s\n", name);
/* from here on the list entry owns name_cp */
117 lmi->lmi_name = name_cp;
120 cfs_list_add(&lmi->lmi_list_chain, &server_mount_info_list);
122 cfs_mutex_unlock(&lustre_mount_info_lock);
124 CDEBUG(D_MOUNT, "reg_mnt %p from %s, vfscount=%d\n",
126 lmi->lmi_mnt ? mnt_get_count(lmi->lmi_mnt) : -1);
131 /* when an obd no longer needs a mount */
/* Remove the registration for obd name and free it.
 *
 * The lookup, the unlink and both frees happen under
 * lustre_mount_info_lock.  The unlock followed by the "not registered"
 * error is presumably the lookup-failure path; its guarding condition and
 * the return values are not visible in this listing. */
132 static int server_deregister_mount(const char *name)
134 struct lustre_mount_info *lmi;
137 cfs_mutex_lock(&lustre_mount_info_lock);
138 lmi = server_find_mount(name);
140 cfs_mutex_unlock(&lustre_mount_info_lock);
141 CERROR("%s not registered\n", name);
145 CDEBUG(D_MOUNT, "dereg_mnt %p from %s, vfscount=%d\n",
147 lmi->lmi_mnt ? mnt_get_count(lmi->lmi_mnt) : -1);
/* same size as at allocation time (strlen + 1); freed before lmi itself */
149 OBD_FREE(lmi->lmi_name, strlen(lmi->lmi_name) + 1);
150 cfs_list_del(&lmi->lmi_list_chain);
151 OBD_FREE(lmi, sizeof(*lmi));
152 cfs_mutex_unlock(&lustre_mount_info_lock);
157 /* obd's look up a registered mount using their obdname. This is just
158 for initial obd setup to find the mount struct. It should not be
159 called every time you want to mntget. */
/* Look up the mount registered for obd name and take references on it:
 * mntget() on lmi_mnt and an increment of the superblock's lsi_mounts.
 * Logs "Can't find mount" for a failed lookup (the guarding condition and
 * the return statements are not visible in this listing).
 * NOTE(review): lustre_mount_info_lock is dropped right after the lookup,
 * yet lmi is dereferenced afterwards.  server_deregister_mount() frees
 * entries under that lock, so confirm callers cannot race with it. */
160 struct lustre_mount_info *server_get_mount(const char *name)
162 struct lustre_mount_info *lmi;
163 struct lustre_sb_info *lsi;
166 cfs_mutex_lock(&lustre_mount_info_lock);
167 lmi = server_find_mount(name);
168 cfs_mutex_unlock(&lustre_mount_info_lock);
170 CERROR("Can't find mount for %s\n", name);
173 lsi = s2lsi(lmi->lmi_sb);
/* one vfsmount reference and one lsi_mounts count per successful get */
176 mntget(lmi->lmi_mnt);
177 cfs_atomic_inc(&lsi->lsi_mounts);
/* the "- 1" presumably discounts the mntget() taken just above */
179 CDEBUG(D_MOUNT, "get_mnt %p from %s, refs=%d, vfscount=%d\n",
180 lmi->lmi_mnt, name, cfs_atomic_read(&lsi->lsi_mounts),
181 lmi->lmi_mnt ? mnt_get_count(lmi->lmi_mnt) - 1 : -1);
185 EXPORT_SYMBOL(server_get_mount);
188 * Used by mdt to get mount_info from obdname.
189 * There is no blocking when using the mount_info.
190 * Do not use server_get_mount for this purpose.
/* Lookup-only variant: finds the registration for name under
 * lustre_mount_info_lock.  Unlike server_get_mount(), no mntget() and no
 * lsi_mounts increment appear in the visible lines.  The condition guarding
 * the "Can't find mount" error and the return are not visible here. */
192 struct lustre_mount_info *server_get_mount_2(const char *name)
194 struct lustre_mount_info *lmi;
197 cfs_mutex_lock(&lustre_mount_info_lock);
198 lmi = server_find_mount(name);
199 cfs_mutex_unlock(&lustre_mount_info_lock);
201 CERROR("Can't find mount for %s\n", name);
205 EXPORT_SYMBOL(server_get_mount_2);
/* NOTE(review): only the opening of this helper is visible.  When
 * HAVE_KERNEL_LOCKED is defined it tests kernel_locked(); the rest of the
 * body, including whatever it does with mnt, is not part of this listing. */
207 static void unlock_mntput(struct vfsmount *mnt)
209 #ifdef HAVE_KERNEL_LOCKED
210 /* for kernel < 2.6.37 */
211 if (kernel_locked()) {
223 static int lustre_put_lsi(struct super_block *sb);
225 /* to be called from obd_cleanup methods */
/* Release path for the mount registered under obd name.
 *
 * Looks the registration up by name, asserts that it refers to mnt, calls
 * lustre_put_lsi() on its superblock and finally deregisters the name.
 * NOTE(review): as in server_get_mount(), lmi is used after
 * lustre_mount_info_lock has been released.  The conditions around the
 * error messages and the return values are not visible in this listing. */
226 int server_put_mount(const char *name, struct vfsmount *mnt)
228 struct lustre_mount_info *lmi;
229 struct lustre_sb_info *lsi;
233 /* This might be the last one, can't deref after this */
/* sampled up front; count is only used in the messages below */
235 count = mnt_get_count(mnt) - 1;
239 cfs_mutex_lock(&lustre_mount_info_lock);
240 lmi = server_find_mount(name);
241 cfs_mutex_unlock(&lustre_mount_info_lock);
243 CERROR("Can't find mount for %s\n", name);
246 lsi = s2lsi(lmi->lmi_sb);
/* the caller must hand back the same vfsmount that was registered */
247 LASSERT(lmi->lmi_mnt == mnt);
249 CDEBUG(D_MOUNT, "put_mnt %p from %s, refs=%d, vfscount=%d\n",
250 lmi->lmi_mnt, name, cfs_atomic_read(&lsi->lsi_mounts), count);
/* a non-zero result from lustre_put_lsi() is treated as the last put */
252 if (lustre_put_lsi(lmi->lmi_sb)) {
253 CDEBUG(D_MOUNT, "Last put of mnt %p from %s, vfscount=%d\n",
254 lmi->lmi_mnt, name, count);
255 /* last mount is the One True Mount */
257 CERROR("%s: mount busy, vfscount=%d!\n", name, count);
260 /* this obd should never need the mount again */
261 server_deregister_mount(name);
265 EXPORT_SYMBOL(server_put_mount);
267 /* Corresponding to server_get_mount_2 */
/* NOTE(review): the body of this function is not part of this listing, so
 * nothing can be stated here about what it does with name or mnt. */
268 int server_put_mount_2(const char *name, struct vfsmount *mnt)
273 EXPORT_SYMBOL(server_put_mount_2);
275 /******* mount helper utilities *********/
/* Dump the fields of *ldd, one per line, through PRINT_CMD at PRINT_MASK:
 * server name, uuid, fs name, index (hex), config version, flags (hex),
 * disk fs type (MT_STR), mount options, params and user comment. */
278 static void ldd_print(struct lustre_disk_data *ldd)
280 PRINT_CMD(PRINT_MASK, " disk data:\n");
281 PRINT_CMD(PRINT_MASK, "server: %s\n", ldd->ldd_svname);
/* ldd_uuid is cast to char * so that it can be printed with %s */
282 PRINT_CMD(PRINT_MASK, "uuid: %s\n", (char *)ldd->ldd_uuid);
283 PRINT_CMD(PRINT_MASK, "fs: %s\n", ldd->ldd_fsname);
284 PRINT_CMD(PRINT_MASK, "index: %04x\n", ldd->ldd_svindex);
285 PRINT_CMD(PRINT_MASK, "config: %d\n", ldd->ldd_config_ver);
286 PRINT_CMD(PRINT_MASK, "flags: %#x\n", ldd->ldd_flags);
287 PRINT_CMD(PRINT_MASK, "diskfs: %s\n", MT_STR(ldd));
288 PRINT_CMD(PRINT_MASK, "options: %s\n", ldd->ldd_mount_opts);
289 PRINT_CMD(PRINT_MASK, "params: %s\n", ldd->ldd_params);
290 PRINT_CMD(PRINT_MASK, "comment: %s\n", ldd->ldd_userdata);
/* Read MOUNT_DATA_FILE into *ldd and validate it.
 *
 * Runs inside the lvfs context mount_ctxt (push_ctxt/pop_ctxt).  Fails with
 * -EINVAL when the file size is not sizeof(*ldd), on a read error, when the
 * magic is not LDD_MAGIC, and when an incompat or rocompat feature bit lies
 * outside LDD_INCOMPAT_SUPP / LDD_ROCOMPAT_SUPP.
 * NOTE(review): the declarations, the open/read failure conditions, the
 * out_close label and the return are not visible in this listing. */
294 static int ldd_parse(struct lvfs_run_ctxt *mount_ctxt,
295 struct lustre_disk_data *ldd)
297 struct lvfs_run_ctxt saved;
304 push_ctxt(&saved, mount_ctxt, NULL);
306 file = filp_open(MOUNT_DATA_FILE, O_RDONLY, 0644);
309 CERROR("cannot open %s: rc = %d\n", MOUNT_DATA_FILE, rc);
/* the on-disk record must be exactly one struct lustre_disk_data */
313 len = i_size_read(file->f_dentry->d_inode);
314 CDEBUG(D_MOUNT, "Have %s, size %lu\n", MOUNT_DATA_FILE, len);
315 if (len != sizeof(*ldd)) {
316 CERROR("disk data size does not match: see %lu expect %u\n",
317 len, (int)sizeof(*ldd));
318 GOTO(out_close, rc = -EINVAL);
321 rc = lustre_fread(file, ldd, len, &off);
323 CERROR("error reading %s: read %d of %lu\n",
324 MOUNT_DATA_FILE, rc, len);
325 GOTO(out_close, rc = -EINVAL);
329 if (ldd->ldd_magic != LDD_MAGIC) {
330 /* FIXME add swabbing support */
331 CERROR("Bad magic in %s: %x!=%x\n", MOUNT_DATA_FILE,
332 ldd->ldd_magic, LDD_MAGIC);
333 GOTO(out_close, rc = -EINVAL);
/* any feature bit outside the supported sets is rejected */
336 if (ldd->ldd_feature_incompat & ~LDD_INCOMPAT_SUPP) {
337 CERROR("%s: unsupported incompat filesystem feature(s) %x\n",
339 ldd->ldd_feature_incompat & ~LDD_INCOMPAT_SUPP);
340 GOTO(out_close, rc = -EINVAL);
342 if (ldd->ldd_feature_rocompat & ~LDD_ROCOMPAT_SUPP) {
343 CERROR("%s: unsupported read-only filesystem feature(s) %x\n",
345 ldd->ldd_feature_rocompat & ~LDD_ROCOMPAT_SUPP);
346 /* Do something like remount filesystem read-only */
347 GOTO(out_close, rc = -EINVAL);
350 /* svname of the form lustre:OST1234 means never registered */
/* rc is reused as the length of ldd_svname */
351 rc = strlen(ldd->ldd_svname);
/* NOTE(review): no length check is visible.  If ldd_svname is shorter than
 * 8 characters, rc - 8 is negative and this indexes before the array. */
352 if (ldd->ldd_svname[rc - 8] == ':') {
/* rewrite "fs:OST1234" as "fs-OST1234" and mark the target as virgin */
353 ldd->ldd_svname[rc - 8] = '-';
354 ldd->ldd_flags |= LDD_F_VIRGIN;
361 pop_ctxt(&saved, mount_ctxt, NULL);
/* Write *ldd back to MOUNT_DATA_FILE inside the lvfs context mount_ctxt.
 *
 * Asserts that the magic is LDD_MAGIC, bumps ldd_config_ver, opens the
 * file O_RDWR|O_SYNC and writes sizeof(struct lustre_disk_data) bytes.
 * NOTE(review): what happens when ldd_magic == 0, the open/write failure
 * conditions, the close and the return are not visible in this listing. */
365 static int ldd_write(struct lvfs_run_ctxt *mount_ctxt,
366 struct lustre_disk_data *ldd)
368 struct lvfs_run_ctxt saved;
371 unsigned long len = sizeof(struct lustre_disk_data);
375 if (ldd->ldd_magic == 0)
378 LASSERT(ldd->ldd_magic == LDD_MAGIC);
/* every write-back advances the config version stored on disk */
380 ldd->ldd_config_ver++;
382 push_ctxt(&saved, mount_ctxt, NULL);
384 file = filp_open(MOUNT_DATA_FILE, O_RDWR|O_SYNC, 0644);
387 CERROR("cannot open %s: rc = %d\n", MOUNT_DATA_FILE, rc);
391 rc = lustre_fwrite(file, ldd, len, &off);
/* NOTE(review): the message says "read" although this is the write path */
393 CERROR("error writing %s: read %d of %lu\n",
394 MOUNT_DATA_FILE, rc, len);
395 GOTO(out_close, rc = -EINVAL);
403 pop_ctxt(&saved, mount_ctxt, NULL);
408 /**************** config llog ********************/
410 /** Get a config log from the MGS and process it.
411 * This func is called for both clients and servers.
412 * Continue to process new statements appended to the logs
413 * (whenever the config lock is revoked) until lustre_end_log
415 * @param sb The superblock is used by the MGC to write to the local copy of
417 * @param logname The name of the llog to replicate from the MGS
418 * @param cfg Since the same mgc may be used to follow multiple config logs
419 * (e.g. ost1, ost2, client), the config_llog_instance keeps the state for
420 * this log, and is added to the mgc's list of logs to follow.
/* Ask the MGC attached to sb (lsi_mgc) to start processing config log
 * logname.
 *
 * Builds an LCFG_LOG_START lustre_cfg: the bufs are reset with the MGC obd
 * name, buffer 1 is logname, buffer 2 is *cfg and buffer 3 is the sb
 * pointer.  The command goes to obd_process_config() and is freed after.
 * The two console messages report failures.
 * NOTE(review): bufs is a pointer here (lustre_end_log() uses an on-stack
 * struct).  Its allocation and release, the conditions selecting each
 * error message and the return are not visible in this listing. */
422 int lustre_process_log(struct super_block *sb, char *logname,
423 struct config_llog_instance *cfg)
425 struct lustre_cfg *lcfg;
426 struct lustre_cfg_bufs *bufs;
427 struct lustre_sb_info *lsi = s2lsi(sb);
428 struct obd_device *mgc = lsi->lsi_mgc;
439 /* mgc_process_config */
440 lustre_cfg_bufs_reset(bufs, mgc->obd_name);
441 lustre_cfg_bufs_set_string(bufs, 1, logname);
442 lustre_cfg_bufs_set(bufs, 2, cfg, sizeof(*cfg));
/* buffer 3 carries the pointer value itself, not a copy of the super_block */
443 lustre_cfg_bufs_set(bufs, 3, &sb, sizeof(sb));
444 lcfg = lustre_cfg_new(LCFG_LOG_START, bufs);
445 rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
446 lustre_cfg_free(lcfg);
/* NOTE(review): the first literal ends in '%s' and the next starts with
 * "failed", so the message prints without a space between them. */
451 LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s'"
452 "failed from the MGS (%d). Make sure this "
453 "client and the MGS are running compatible "
454 "versions of Lustre.\n",
455 mgc->obd_name, logname, rc);
458 LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' "
459 "failed (%d). This may be the result of "
460 "communication errors between this node and "
461 "the MGS, a bad configuration, or other "
462 "errors. See the syslog for more "
463 "information.\n", mgc->obd_name, logname,
466 /* class_obd_list(); */
469 EXPORT_SYMBOL(lustre_process_log);
471 /* Stop watching this config log for updates */
/* Tell the MGC attached to sb (lsi_mgc) to stop following config log
 * logname.
 *
 * Builds an LCFG_LOG_END lustre_cfg from an on-stack bufs (reset with the
 * MGC obd name, buffer 1 = logname, buffer 2 = *cfg), passes it to
 * obd_process_config() and frees it.
 * NOTE(review): any guards around these calls (for example on mgc or cfg)
 * and the return are not visible in this listing. */
472 int lustre_end_log(struct super_block *sb, char *logname,
473 struct config_llog_instance *cfg)
475 struct lustre_cfg *lcfg;
476 struct lustre_cfg_bufs bufs;
477 struct lustre_sb_info *lsi = s2lsi(sb);
478 struct obd_device *mgc = lsi->lsi_mgc;
485 /* mgc_process_config */
486 lustre_cfg_bufs_reset(&bufs, mgc->obd_name);
487 lustre_cfg_bufs_set_string(&bufs, 1, logname);
489 lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg));
490 lcfg = lustre_cfg_new(LCFG_LOG_END, &bufs);
491 rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
492 lustre_cfg_free(lcfg);
495 EXPORT_SYMBOL(lustre_end_log);
497 /**************** obd start *******************/
499 /** lustre_cfg_bufs are a holdover from 1.4; we can still set these up from
500 * lctl (and do for echo cli/srv).
/* Build and run one lustre_cfg command.
 *
 * The bufs are reset with cfgname, s1..s4 go into string buffers 1..4, the
 * command is created with lustre_cfg_new(cmd, ...), nid is stored in
 * lcfg_nid, and the result is passed to class_process_config() and freed.
 * NOTE(review): whether each lustre_cfg_bufs_set_string() call is guarded
 * by a test on its string is not visible here.  Neither is a check of the
 * lustre_cfg_new() result before lcfg->lcfg_nid is written, nor the
 * return statement. */
502 int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
503 char *s1, char *s2, char *s3, char *s4)
505 struct lustre_cfg_bufs bufs;
506 struct lustre_cfg * lcfg = NULL;
509 CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname,
510 cmd, s1, s2, s3, s4);
512 lustre_cfg_bufs_reset(&bufs, cfgname);
514 lustre_cfg_bufs_set_string(&bufs, 1, s1);
516 lustre_cfg_bufs_set_string(&bufs, 2, s2);
518 lustre_cfg_bufs_set_string(&bufs, 3, s3);
520 lustre_cfg_bufs_set_string(&bufs, 4, s4);
522 lcfg = lustre_cfg_new(cmd, &bufs);
523 lcfg->lcfg_nid = nid;
524 rc = class_process_config(lcfg);
525 lustre_cfg_free(lcfg);
528 EXPORT_SYMBOL(do_lcfg);
530 /** Call class_attach and class_setup. These methods in turn call
531 * obd type-specific methods.
/* Attach obd obdname with the given type and uuid (LCFG_ATTACH), then set
 * it up with s1 and s2 as the setup strings (LCFG_SETUP).  The LCFG_DETACH
 * issued next to the "setup error" message undoes the attach, presumably
 * only when setup failed.
 * NOTE(review): the rest of the parameter list, the failure conditions and
 * the return are not visible in this listing. */
533 static int lustre_start_simple(char *obdname, char *type, char *uuid,
537 CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type);
539 rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, 0, 0);
541 CERROR("%s attach error %d\n", obdname, rc);
544 rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, 0, 0);
546 CERROR("%s setup error %d\n", obdname, rc);
547 do_lcfg(obdname, 0, LCFG_DETACH, 0, 0, 0, 0);
552 /* Set up a MGS to serve startup logs */
/* Start the MGS obd for the server mount behind sb.
 *
 * Registers the mount (lsi_srv_mnt) under LUSTRE_MGS_OBDNAME, then attaches
 * and sets up an obd of type LUSTRE_MGS_NAME with that name through
 * lustre_start_simple().  An existing registration under that name is
 * reported as "already started".
 * NOTE(review): server_find_mount() is called here without taking
 * lustre_mount_info_lock, unlike its other visible callers.  The
 * conditions and returns between the visible lines are not shown. */
553 static int server_start_mgs(struct super_block *sb)
555 struct lustre_sb_info *lsi = s2lsi(sb);
556 struct vfsmount *mnt = lsi->lsi_srv_mnt;
557 struct lustre_mount_info *lmi;
562 /* It is impossible to have more than 1 MGS per node, since
563 MGC wouldn't know which to connect to */
564 lmi = server_find_mount(LUSTRE_MGS_OBDNAME);
/* lsi is re-pointed at the already-registered mount for the message below */
566 lsi = s2lsi(lmi->lmi_sb);
567 LCONSOLE_ERROR_MSG(0x15d, "The MGS service was already started"
569 lsi->lsi_ldd->ldd_svname);
573 CDEBUG(D_CONFIG, "Start MGS service %s\n", LUSTRE_MGS_OBDNAME);
575 rc = server_register_mount(LUSTRE_MGS_OBDNAME, sb, mnt);
/* the obd name doubles as the uuid argument; no setup strings are passed */
578 rc = lustre_start_simple(LUSTRE_MGS_OBDNAME, LUSTRE_MGS_NAME,
579 LUSTRE_MGS_OBDNAME, 0, 0);
580 /* Do NOT call server_deregister_mount() here. This leads to
581 * inability cleanup cleanly and free lsi and other stuff when
582 * mgs calls server_put_mount() in error handling case. -umka */
586 LCONSOLE_ERROR_MSG(0x15e, "Failed to start MGS '%s' (%d). "
587 "Is the 'mgs' module loaded?\n",
588 LUSTRE_MGS_OBDNAME, rc);
/* Stop the MGS obd: look it up by LUSTRE_MGS_OBDNAME and run
 * class_manual_cleanup() on it.  "not running" is logged for the case
 * where the lookup finds nothing (its condition is not visible).  sb is
 * not used in any of the visible lines. */
592 static int server_stop_mgs(struct super_block *sb)
594 struct obd_device *obd;
598 CDEBUG(D_MOUNT, "Stop MGS service %s\n", LUSTRE_MGS_OBDNAME);
600 /* There better be only one MGS */
601 obd = class_name2obd(LUSTRE_MGS_OBDNAME);
603 CDEBUG(D_CONFIG, "mgs %s not running\n", LUSTRE_MGS_OBDNAME);
607 /* The MGS should always stop when we say so */
609 rc = class_manual_cleanup(obd);
/* Held by lustre_start_mgc() and lustre_stop_mgc() while they start,
 * re-use or tear down an MGC obd. */
613 CFS_DEFINE_MUTEX(mgc_start_lock);
615 /** Set up a mgc obd to process startup logs
617 * \param sb [in] super block of the mgc obd
619 * \retval 0 success, otherwise error code
/* Start, or re-use, the MGC obd that talks to the MGS for this mount.
 *
 * 1. Pick an MGS nid.  On a server: the mount option (lmd_mgs), then the
 *    mgsnode= param in ldd_params, then a local nid when this target is
 *    the MGS.  On a client: the first nid of lmd_dev.  The MGC name is
 *    LUSTRE_MGC_OBDNAME followed by that nid.
 * 2. If an obd of that name exists and is not stopping, re-use it: set
 *    KEY_MGSSEC, bump cl_mgc_refcount, reconcile the client's IR flag with
 *    the MGC's, and set KEY_INIT_RECOV_BACKUP.
 * 3. Otherwise add the MGS nids as uuids, start the obd through
 *    lustre_start_simple(), add failover nids and connections, set
 *    cl_mgc_refcount to 1 and connect.
 * Everything after the nid selection runs under mgc_start_lock.
 * NOTE(review): many conditions, labels, assignments and the return are
 * missing from this listing, so the exact control flow between the visible
 * statements has to be confirmed against the full file. */
621 static int lustre_start_mgc(struct super_block *sb)
623 struct obd_connect_data *data = NULL;
624 struct lustre_sb_info *lsi = s2lsi(sb);
625 struct obd_device *obd;
626 struct obd_export *exp;
627 struct obd_uuid *uuid;
630 char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL;
633 int rc = 0, i = 0, j, len;
636 LASSERT(lsi->lsi_lmd);
638 /* Find the first non-lo MGS nid for our MGC name */
639 if (lsi->lsi_flags & LSI_SERVER) {
640 ptr = lsi->lsi_ldd->ldd_params;
641 /* mount -o mgsnode=nid */
642 if (lsi->lsi_lmd->lmd_mgs &&
643 (class_parse_nid(lsi->lsi_lmd->lmd_mgs, &nid, &ptr) == 0)) {
645 /* Use mgsnode= nids */
646 } else if ((class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0) &&
647 (class_parse_nid(ptr, &nid, &ptr) == 0)) {
649 } else if (IS_MGS(lsi->lsi_ldd)) {
650 lnet_process_id_t id;
/* walk the local LNet ids until LNetGetId() reports -ENOENT */
651 while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
652 if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
659 } else { /* client */
660 /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
661 ptr = lsi->lsi_lmd->lmd_dev;
662 if (class_parse_nid(ptr, &nid, &ptr) == 0)
666 CERROR("No valid MGS nids found.\n");
670 cfs_mutex_lock(&mgc_start_lock);
/* len covers LUSTRE_MGC_OBDNAME + nid string + the terminating NUL */
672 len = strlen(LUSTRE_MGC_OBDNAME) + strlen(libcfs_nid2str(nid)) + 1;
673 OBD_ALLOC(mgcname, len);
/* NOTE(review): len + 2 leaves room for '_' and ONE hex digit of the
 * "%s_%x" names built below.  An index above 0xf would not fit, so confirm
 * that the index is bounded. */
674 OBD_ALLOC(niduuid, len + 2);
675 if (!mgcname || !niduuid)
676 GOTO(out_free, rc = -ENOMEM);
677 sprintf(mgcname, "%s%s", LUSTRE_MGC_OBDNAME, libcfs_nid2str(nid));
/* an empty string stands in when no mgssec mount option was given */
679 mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : "";
/* NOTE(review): presumably guards an allocation of data that is not
 * visible in this listing */
683 GOTO(out_free, rc = -ENOMEM);
685 obd = class_name2obd(mgcname);
686 if (obd && !obd->obd_stopping) {
687 rc = obd_set_info_async(NULL, obd->obd_self_export,
688 strlen(KEY_MGSSEC), KEY_MGSSEC,
689 strlen(mgssec), mgssec, NULL);
693 /* Re-using an existing MGC */
694 cfs_atomic_inc(&obd->u.cli.cl_mgc_refcount);
696 /* IR compatibility check, only for clients */
697 if (lmd_is_client(lsi->lsi_lmd)) {
699 int vallen = sizeof(*data);
700 __u32 *flags = &lsi->lsi_lmd->lmd_flags;
702 rc = obd_get_info(NULL, obd->obd_self_export,
703 strlen(KEY_CONN_DATA), KEY_CONN_DATA,
704 &vallen, data, NULL);
706 has_ir = OCD_HAS_FLAG(data, IMP_RECOV);
/* true when the MGC's IR state and this mount's request disagree:
 * has_ir is the MGC side, !(NOIR) is what the mount asked for */
707 if (has_ir ^ !(*flags & LMD_FLG_NOIR)) {
708 /* LMD_FLG_NOIR is for test purpose only */
710 "Trying to mount a client with IR setting "
711 "not compatible with current mgc. "
712 "Force to use current mgc setting that is "
714 has_ir ? "enabled" : "disabled");
/* the mount's flag is forced to follow the existing MGC */
716 *flags &= ~LMD_FLG_NOIR;
718 *flags |= LMD_FLG_NOIR;
723 /* If we are restarting the MGS, don't try to keep the MGC's
724 old connection, or registration will fail. */
725 if ((lsi->lsi_flags & LSI_SERVER) && IS_MGS(lsi->lsi_ldd)) {
726 CDEBUG(D_MOUNT, "New MGS with live MGC\n");
730 /* Try all connections, but only once (again).
731 We don't want to block another target from starting
732 (using its local copy of the log), but we do want to connect
733 if at all possible. */
735 CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname,recov_bk);
736 rc = obd_set_info_async(NULL, obd->obd_self_export,
737 sizeof(KEY_INIT_RECOV_BACKUP),
738 KEY_INIT_RECOV_BACKUP,
739 sizeof(recov_bk), &recov_bk, NULL);
743 CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname);
745 /* Add the primary nids for the MGS */
/* uuid names have the form "<mgcname>_<index in hex>" */
747 sprintf(niduuid, "%s_%x", mgcname, i);
748 if (lsi->lsi_flags & LSI_SERVER) {
749 ptr = lsi->lsi_ldd->ldd_params;
750 if (IS_MGS(lsi->lsi_ldd)) {
751 /* Use local nids (including LO) */
752 lnet_process_id_t id;
753 while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
754 rc = do_lcfg(mgcname, id.nid,
755 LCFG_ADD_UUID, niduuid, 0,0,0);
758 /* Use mgsnode= nids */
759 /* mount -o mgsnode=nid */
760 if (lsi->lsi_lmd->lmd_mgs) {
761 ptr = lsi->lsi_lmd->lmd_mgs;
762 } else if (class_find_param(ptr, PARAM_MGSNODE,
764 CERROR("No MGS nids given.\n");
765 GOTO(out_free, rc = -EINVAL);
/* every nid parsed from ptr is added under the same uuid name */
767 while (class_parse_nid(ptr, &nid, &ptr) == 0) {
768 rc = do_lcfg(mgcname, nid,
769 LCFG_ADD_UUID, niduuid, 0,0,0);
773 } else { /* client */
774 /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
775 ptr = lsi->lsi_lmd->lmd_dev;
776 while (class_parse_nid(ptr, &nid, &ptr) == 0) {
777 rc = do_lcfg(mgcname, nid,
778 LCFG_ADD_UUID, niduuid, 0,0,0);
780 /* Stop at the first failover nid */
786 CERROR("No valid MGS nids found.\n");
787 GOTO(out_free, rc = -EINVAL);
/* lustre_stop_mgc() deletes this many nid uuids; updated again below */
789 lsi->lsi_lmd->lmd_mgs_failnodes = 1;
791 /* Random uuid for MGC allows easier reconnects */
793 ll_generate_random_uuid(uuidc);
794 class_uuid_unparse(uuidc, uuid);
797 rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME,
798 (char *)uuid->uuid, LUSTRE_MGS_OBDNAME,
804 /* Add any failover MGS nids */
/* a ':' or a further mgsnode= param introduces the next failover group */
806 while ((*ptr == ':' ||
807 class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0)) {
808 /* New failover node */
809 sprintf(niduuid, "%s_%x", mgcname, i);
811 while (class_parse_nid(ptr, &nid, &ptr) == 0) {
813 rc = do_lcfg(mgcname, nid,
814 LCFG_ADD_UUID, niduuid, 0,0,0);
819 rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN,
827 lsi->lsi_lmd->lmd_mgs_failnodes = i;
829 obd = class_name2obd(mgcname);
831 CERROR("Can't find mgcobd %s\n", mgcname);
832 GOTO(out_free, rc = -ENOTCONN);
835 rc = obd_set_info_async(NULL, obd->obd_self_export,
836 strlen(KEY_MGSSEC), KEY_MGSSEC,
837 strlen(mgssec), mgssec, NULL);
841 /* Keep a refcount of servers/clients who started with "mount",
842 so we know when we can get rid of the mgc. */
843 cfs_atomic_set(&obd->u.cli.cl_mgc_refcount, 1);
845 /* Try all connections, but only once. */
847 rc = obd_set_info_async(NULL, obd->obd_self_export,
848 sizeof(KEY_INIT_RECOV_BACKUP),
849 KEY_INIT_RECOV_BACKUP,
850 sizeof(recov_bk), &recov_bk, NULL);
853 CWARN("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc);
855 /* We connect to the MGS at setup, and don't disconnect until cleanup */
856 data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT |
857 OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV |
858 OBD_CONNECT_MNE_SWAB;
/* clients mounted with LMD_FLG_NOIR do not advertise IMP_RECOV */
859 if (lmd_is_client(lsi->lsi_lmd) &&
860 lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR)
861 data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV;
862 data->ocd_version = LUSTRE_VERSION_CODE;
863 rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL);
865 CERROR("connect failed %d\n", rc);
/* the export returned by obd_connect() is kept as the MGC's MGS export */
869 obd->u.cli.cl_mgc_mgsexp = exp;
872 /* Keep the mgc info in the sb. Note that many lsi's can point
876 cfs_mutex_unlock(&mgc_start_lock);
881 OBD_FREE(mgcname, len);
883 OBD_FREE(niduuid, len + 2);
/* Drop this mount's reference on its MGC obd.
 *
 * Under mgc_start_lock, cl_mgc_refcount is decremented.  While references
 * remain the function leaves with -EBUSY.  On the last reference it sets
 * obd_no_recov, disconnects the MGS export if there is one, runs
 * class_manual_cleanup() and then deletes the nid uuids named
 * "<obd name>_<index in hex>" for every index below lmd_mgs_failnodes.
 * The name buffer is strlen(obd_name) + 6 bytes, which is room for '_',
 * up to four hex digits and the terminating NUL.
 * NOTE(review): how obd is obtained, the conditions between the visible
 * statements, the out label and the return are not visible here. */
887 static int lustre_stop_mgc(struct super_block *sb)
889 struct lustre_sb_info *lsi = s2lsi(sb);
890 struct obd_device *obd;
891 char *niduuid = 0, *ptr = 0;
892 int i, rc = 0, len = 0;
902 cfs_mutex_lock(&mgc_start_lock);
903 LASSERT(cfs_atomic_read(&obd->u.cli.cl_mgc_refcount) > 0);
904 if (!cfs_atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) {
905 /* This is not fatal, every client that stops
906 will call in here. */
907 CDEBUG(D_MOUNT, "mgc still has %d references.\n",
908 cfs_atomic_read(&obd->u.cli.cl_mgc_refcount));
909 GOTO(out, rc = -EBUSY);
912 /* The MGC has no recoverable data in any case.
913 * force shutdown set in umount_begin */
914 obd->obd_no_recov = 1;
916 if (obd->u.cli.cl_mgc_mgsexp) {
917 /* An error is not fatal, if we are unable to send the
918 disconnect mgs ping evictor cleans up the export */
919 rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp);
921 CDEBUG(D_MOUNT, "disconnect failed %d\n", rc);
924 /* Save the obdname for cleaning the nid uuids, which are
926 len = strlen(obd->obd_name) + 6;
927 OBD_ALLOC(niduuid, len);
929 strcpy(niduuid, obd->obd_name);
930 ptr = niduuid + strlen(niduuid);
933 rc = class_manual_cleanup(obd);
937 /* Clean the nid uuids */
/* presumably taken when the niduuid allocation above failed */
939 GOTO(out, rc = -ENOMEM);
/* ptr points just past the copied obd name, so each pass overwrites the
 * "_<hex index>" suffix in place */
941 for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) {
942 sprintf(ptr, "_%x", i);
943 rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID,
/* NOTE(review): the message says "MDC"; presumably "MGC" was meant */
946 CERROR("del MDC UUID %s failed: rc = %d\n",
951 OBD_FREE(niduuid, len);
953 /* class_import_put will get rid of the additional connections */
954 cfs_mutex_unlock(&mgc_start_lock);
958 /* Since there's only one mgc per node, we have to change its fs to get
959 access to the right disk. */
/* Point the MGC at the disk behind sb by setting KEY_SET_FS on the MGC's
 * self export, with sb as the value and sizeof(*sb) as its length.
 * "can't set_fs" is logged on failure (condition and return not visible). */
960 static int server_mgc_set_fs(struct obd_device *mgc, struct super_block *sb)
962 struct lustre_sb_info *lsi = s2lsi(sb);
966 CDEBUG(D_MOUNT, "Set mgc disk for %s\n", lsi->lsi_lmd->lmd_dev);
968 /* cl_mgc_sem in mgc ensures we sleep if the mgc_fs is busy */
969 rc = obd_set_info_async(NULL, mgc->obd_self_export,
970 sizeof(KEY_SET_FS), KEY_SET_FS,
971 sizeof(*sb), sb, NULL);
973 CERROR("can't set_fs %d\n", rc);
/* Undo server_mgc_set_fs(): set KEY_CLEAR_FS on the MGC's self export.
 * NOTE(review): the value arguments of the call and the return are not
 * visible in this listing. */
979 static int server_mgc_clear_fs(struct obd_device *mgc)
984 CDEBUG(D_MOUNT, "Unassign mgc disk\n");
986 rc = obd_set_info_async(NULL, mgc->obd_self_export,
987 sizeof(KEY_CLEAR_FS), KEY_CLEAR_FS,
/* Held while the shared MDS/OSS obds are started (server_start_targets())
 * or stopped (server_stop_servers()). */
992 CFS_DEFINE_MUTEX(server_start_lock);
994 /* Stop MDS/OSS if nobody is using them */
/* lddflags selects which shared obd to consider: LUSTRE_MDS_OBDNAME for an
 * MDT, LUSTRE_OSS_OBDNAME for an OST.  The obd is cleaned up with
 * class_manual_cleanup() when its obd type is not found or has a zero
 * typ_refcnt.  lsiflags is not used in any visible line.
 * NOTE(review): the statements between the visible lines and the return
 * are not part of this listing. */
995 static int server_stop_servers(int lddflags, int lsiflags)
997 struct obd_device *obd = NULL;
998 struct obd_type *type = NULL;
1002 cfs_mutex_lock(&server_start_lock);
1004 /* Either an MDT or an OST or neither */
1005 /* if this was an MDT, and there are no more MDT's, clean up the MDS */
1006 if ((lddflags & LDD_F_SV_TYPE_MDT) &&
1007 (obd = class_name2obd(LUSTRE_MDS_OBDNAME))) {
1008 /*FIXME pre-rename, should eventually be LUSTRE_MDT_NAME*/
1009 type = class_search_type(LUSTRE_MDS_NAME);
1011 /* if this was an OST, and there are no more OST's, clean up the OSS */
1012 if ((lddflags & LDD_F_SV_TYPE_OST) &&
1013 (obd = class_name2obd(LUSTRE_OSS_OBDNAME))) {
1014 type = class_search_type(LUSTRE_OST_NAME);
/* only stop the shared obd once nothing of its type holds a reference */
1017 if (obd && (!type || !type->typ_refcnt)) {
1020 /* obd_fail doesn't mean much on a server obd */
1021 err = class_manual_cleanup(obd);
1026 cfs_mutex_unlock(&server_start_lock);
/* Dump an mgs_target_info through PRINT_CMD at PRINT_MASK: the caller's
 * title, then server name, fs name, uuid, config version and flags (hex).
 * The return statement is not visible in this listing. */
1031 int server_mti_print(char *title, struct mgs_target_info *mti)
1033 PRINT_CMD(PRINT_MASK, "mti %s\n", title);
1034 PRINT_CMD(PRINT_MASK, "server: %s\n", mti->mti_svname);
1035 PRINT_CMD(PRINT_MASK, "fs: %s\n", mti->mti_fsname);
1036 PRINT_CMD(PRINT_MASK, "uuid: %s\n", mti->mti_uuid);
1037 PRINT_CMD(PRINT_MASK, "ver: %d flags: %#x\n",
1038 mti->mti_config_ver, mti->mti_flags);
1041 EXPORT_SYMBOL(server_mti_print);
/* Fill *mti from the on-disk data (lsi_ldd) of the server mount behind sb.
 *
 * Copies fs name, server name, uuid and params, and collects the local
 * nids.  LOLND nids are tested for, as are the --servicenode restriction
 * (LDD_F_NO_PRIMNODE with PARAM_FAILNODE) and the PARAM_NETWORK match;
 * collection is capped at MTI_NIDS_MAX.  Version, flags and stripe index
 * are also set.  LMD_FLG_WRITECONF on the mount is folded into ldd_flags
 * first, so it reaches mti_flags too.
 * NOTE(review): the statements executed when a nid is filtered out or the
 * cap is hit, the error returns and the final return are not visible. */
1043 static int server_sb2mti(struct super_block *sb, struct mgs_target_info *mti)
1045 struct lustre_sb_info *lsi = s2lsi(sb);
1046 struct lustre_disk_data *ldd = lsi->lsi_ldd;
1047 lnet_process_id_t id;
1051 if (!(lsi->lsi_flags & LSI_SERVER))
/* NOTE(review): strncpy() leaves the destination without a terminating NUL
 * when the source is at least as long as the destination.  No explicit
 * termination is visible after either copy. */
1054 strncpy(mti->mti_fsname, ldd->ldd_fsname,
1055 sizeof(mti->mti_fsname));
1056 strncpy(mti->mti_svname, ldd->ldd_svname,
1057 sizeof(mti->mti_svname));
1059 mti->mti_nid_count = 0;
1060 while (LNetGetId(i++, &id) != -ENOENT) {
1061 if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
1064 /* server use --servicenode param, only allow specified
1065 * nids be registered */
1066 if ((ldd->ldd_flags & LDD_F_NO_PRIMNODE) != 0 &&
1067 class_match_nid(ldd->ldd_params,
1068 PARAM_FAILNODE, id.nid) < 1)
1071 /* match specified network */
1072 if (!class_match_net(ldd->ldd_params,
1073 PARAM_NETWORK, LNET_NIDNET(id.nid)))
1076 mti->mti_nids[mti->mti_nid_count] = id.nid;
1077 mti->mti_nid_count++;
1078 if (mti->mti_nid_count >= MTI_NIDS_MAX) {
1079 CWARN("Only using first %d nids for %s\n",
1080 mti->mti_nid_count, mti->mti_svname);
1085 mti->mti_lustre_ver = LUSTRE_VERSION_CODE;
1086 mti->mti_config_ver = 0;
/* the mount-time writeconf request is recorded in ldd_flags itself */
1087 if (lsi->lsi_lmd->lmd_flags & LMD_FLG_WRITECONF)
1088 ldd->ldd_flags |= LDD_F_WRITECONF;
1089 mti->mti_flags = ldd->ldd_flags;
1090 mti->mti_stripe_index = ldd->ldd_svindex;
1091 memcpy(mti->mti_uuid, ldd->ldd_uuid, sizeof(mti->mti_uuid));
/* NOTE(review): with ">" a string of exactly sizeof(mti_params) characters
 * passes the check, and the memcpy() below then copies it without its NUL.
 * The memcpy() also always reads sizeof(mti_params) bytes from ldd_params,
 * so confirm that ldd_params is at least that large. */
1092 if (strlen(ldd->ldd_params) > sizeof(mti->mti_params)) {
1093 CERROR("params too big for mti\n");
1096 memcpy(mti->mti_params, ldd->ldd_params, sizeof(mti->mti_params));
1100 /* Register an old or new target with the MGS. If needed MGS will construct
1101 startup logs and assign index */
/* Register the target behind sb with the MGS.
 *
 * Builds an mgs_target_info with server_sb2mti(), adds LDD_F_OPC_REG and
 * sends it as KEY_REGISTER_TARGET through the MGC's MGS export.  The three
 * messages after the call report: a refusal by the MGS (LDD_F_ERROR), a
 * failure when writeconf is set, and a failure described as not fatal.
 * Afterwards ldd_flags is refreshed from mti_flags masked with
 * LDD_F_ONDISK_MASK.  When LDD_F_REWRITE_LDD is set, the index and server
 * name from mti are stored in ldd, written with ldd_write(), the disk
 * label is set and read back, and the fs is synced.
 * NOTE(review): allocation and release of mti, the conditions around the
 * error handling and the return are not visible in this listing. */
1102 int server_register_target(struct super_block *sb)
1104 struct lustre_sb_info *lsi = s2lsi(sb);
1105 struct obd_device *mgc = lsi->lsi_mgc;
1106 struct lustre_disk_data *ldd = lsi->lsi_ldd;
1107 struct mgs_target_info *mti = NULL;
1114 if (!(lsi->lsi_flags & LSI_SERVER))
1120 rc = server_sb2mti(sb, mti);
1124 CDEBUG(D_MOUNT, "Registration %s, fs=%s, %s, index=%04x, flags=%#x\n",
1125 mti->mti_svname, mti->mti_fsname,
1126 libcfs_nid2str(mti->mti_nids[0]), mti->mti_stripe_index,
1129 /* if write_conf is true, the registration must succeed */
/* writeconf is 1 when the target still needs an index or has an update */
1130 writeconf = !!(ldd->ldd_flags & (LDD_F_NEED_INDEX | LDD_F_UPDATE));
1131 mti->mti_flags |= LDD_F_OPC_REG;
1133 /* Register the target */
1134 /* FIXME use mgc_process_config instead */
1135 rc = obd_set_info_async(NULL, mgc->u.cli.cl_mgc_mgsexp,
1136 sizeof(KEY_REGISTER_TARGET), KEY_REGISTER_TARGET,
1137 sizeof(*mti), mti, NULL);
1139 if (mti->mti_flags & LDD_F_ERROR) {
1140 LCONSOLE_ERROR_MSG(0x160,
1141 "The MGS is refusing to allow this "
1142 "server (%s) to start. Please see messages"
1143 " on the MGS node.\n", ldd->ldd_svname);
1144 } else if (writeconf) {
1145 LCONSOLE_ERROR_MSG(0x15f,
1146 "Communication to the MGS return error %d. "
1147 "Is the MGS running?\n", rc);
1149 CERROR("Cannot talk to the MGS: %d, not fatal\n", rc);
1150 /* reset the error code for non-fatal error. */
1156 /* Always update our flags */
1157 ldd->ldd_flags = mti->mti_flags & LDD_F_ONDISK_MASK;
1159 /* If this flag is set, it means the MGS wants us to change our
1160 on-disk data. (So far this means just the index.) */
1161 if (mti->mti_flags & LDD_F_REWRITE_LDD) {
1164 CDEBUG(D_MOUNT, "Changing on-disk index from %#x to %#x "
1165 "for %s\n", ldd->ldd_svindex, mti->mti_stripe_index,
1167 ldd->ldd_svindex = mti->mti_stripe_index;
/* NOTE(review): strncpy() does not guarantee NUL termination here */
1168 strncpy(ldd->ldd_svname, mti->mti_svname,
1169 sizeof(ldd->ldd_svname));
1170 /* or ldd_make_sv_name(ldd); */
/* NOTE(review): the result of ldd_write() is ignored */
1171 ldd_write(&mgc->obd_lvfs_ctxt, ldd);
1172 if (lsi->lsi_lmd->lmd_osd_type)
1174 err = fsfilt_set_label(mgc, lsi->lsi_srv_mnt->mnt_sb,
1177 CERROR("Label set error %d\n", err);
1178 label = fsfilt_get_label(mgc, lsi->lsi_srv_mnt->mnt_sb);
1180 CDEBUG(D_MOUNT, "Disk label changed to %s\n", label);
1182 /* Flush the new ldd to disk */
1183 fsfilt_sync(mgc, lsi->lsi_srv_mnt->mnt_sb);
1191 EXPORT_SYMBOL(server_register_target);
1194 * Notify the MGS that this target is ready.
1195 * Used by IR - if the MGS receives this message, it will notify clients.
/* Send a "target ready" message for obd to the MGS.
 *
 * Builds an mgs_target_info with server_sb2mti(), stores the obd's
 * obt_instance in mti_instance, adds LDD_F_OPC_READY and sends it as
 * KEY_REGISTER_TARGET through the MGC's MGS export.  When the call
 * succeeds and the reply flags carry LDD_F_IR_CAPABLE without LDD_F_ERROR,
 * LSI_IR_CAPABLE is set in lsi_flags.
 * NOTE(review): allocation and release of mti, the early exits and the
 * return are not visible in this listing. */
1197 static int server_notify_target(struct super_block *sb, struct obd_device *obd)
1199 struct lustre_sb_info *lsi = s2lsi(sb);
1200 struct obd_device *mgc = lsi->lsi_mgc;
1201 struct mgs_target_info *mti = NULL;
1207 if (!(lsi->lsi_flags & LSI_SERVER))
1213 rc = server_sb2mti(sb, mti);
1217 mti->mti_instance = obd->u.obt.obt_instance;
1218 mti->mti_flags |= LDD_F_OPC_READY;
1220 /* FIXME use mgc_process_config instead */
1221 rc = obd_set_info_async(NULL, mgc->u.cli.cl_mgc_mgsexp,
1222 sizeof(KEY_REGISTER_TARGET),
1223 KEY_REGISTER_TARGET,
1224 sizeof(*mti), mti, NULL);
1226 /* Imperative recovery: if the mgs informs us to use IR? */
1227 if (!rc && !(mti->mti_flags & LDD_F_ERROR) &&
1228 (mti->mti_flags & LDD_F_IR_CAPABLE))
1229 lsi->lsi_flags |= LSI_IR_CAPABLE;
1238 /** Start server targets: MDTs and OSTs
/* Start the server target (MDT or OST) stored on the disk behind sb.
 *
 * Visible steps, in order: make sure the shared MDS or OSS obd exists
 * (under server_start_lock); give the MGC access to this disk when no osd
 * type is set; register with the MGS; register the mount under the
 * target's svname; process the config log named after the target; release
 * the MGC disk; look the new obd up by svname; optionally abort recovery;
 * notify the MGS; compute the recovery timeout; and send
 * OBD_NOTIFY_CONFIG.
 * NOTE(review): the checks between these steps, the arguments hidden in
 * missing lines, the error labels and the return are not visible here. */
1240 static int server_start_targets(struct super_block *sb, struct vfsmount *mnt)
1242 struct obd_device *obd;
1243 struct lustre_sb_info *lsi = s2lsi(sb);
1244 struct config_llog_instance cfg;
1248 CDEBUG(D_MOUNT, "starting target %s\n", lsi->lsi_ldd->ldd_svname);
1251 /* If we're an MDT, make sure the global MDS is running */
1252 if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_MDT) {
1253 /* make sure the MDS is started */
1254 cfs_mutex_lock(&server_start_lock);
1255 obd = class_name2obd(LUSTRE_MDS_OBDNAME);
1257 rc = lustre_start_simple(LUSTRE_MDS_OBDNAME,
1258 /* FIXME pre-rename, should eventually be LUSTRE_MDS_NAME */
1260 LUSTRE_MDS_OBDNAME"_uuid",
1263 cfs_mutex_unlock(&server_start_lock);
1264 CERROR("failed to start MDS: %d\n", rc);
1268 cfs_mutex_unlock(&server_start_lock);
1272 /* If we're an OST, make sure the global OSS is running */
1273 if (IS_OST(lsi->lsi_ldd)) {
1274 /* make sure OSS is started */
1275 cfs_mutex_lock(&server_start_lock);
1276 obd = class_name2obd(LUSTRE_OSS_OBDNAME);
1278 rc = lustre_start_simple(LUSTRE_OSS_OBDNAME,
1280 LUSTRE_OSS_OBDNAME"_uuid",
1283 cfs_mutex_unlock(&server_start_lock);
1284 CERROR("failed to start OSS: %d\n", rc);
1288 cfs_mutex_unlock(&server_start_lock);
1291 /* Set the mgc fs to our server disk. This allows the MGC to
1292 * read and write configs locally, in case it can't talk to the MGS. */
1293 if (lsi->lsi_lmd->lmd_osd_type == NULL) {
1294 rc = server_mgc_set_fs(lsi->lsi_mgc, sb);
1299 /* Register with MGS */
1300 rc = server_register_target(sb);
1304 /* Let the target look up the mount using the target's name
1305 (we can't pass the sb or mnt through class_process_config.) */
1306 rc = server_register_mount(lsi->lsi_ldd->ldd_svname, sb, mnt);
1310 /* Start targets using the llog named for the target */
/* cfg starts out zeroed; any fields set afterwards are not visible here */
1311 memset(&cfg, 0, sizeof(cfg));
1312 rc = lustre_process_log(sb, lsi->lsi_ldd->ldd_svname, &cfg);
1314 CERROR("failed to start server %s: %d\n",
1315 lsi->lsi_ldd->ldd_svname, rc);
1316 /* Do NOT call server_deregister_mount() here. This makes it
1317 * impossible to find mount later in cleanup time and leaves
1318 * @lsi and other stuff leaked. -umka */
1323 /* Release the mgc fs for others to use */
1324 if (lsi->lsi_lmd->lmd_osd_type == NULL)
1325 server_mgc_clear_fs(lsi->lsi_mgc);
/* processing the log is expected to have created an obd named svname */
1328 obd = class_name2obd(lsi->lsi_ldd->ldd_svname);
1330 CERROR("no server named %s was started\n",
1331 lsi->lsi_ldd->ldd_svname);
/* abort recovery only when requested at mount time and the obd type
 * provides an iocontrol method */
1335 if ((lsi->lsi_lmd->lmd_flags & LMD_FLG_ABORT_RECOV) &&
1336 (OBP(obd, iocontrol))) {
1337 obd_iocontrol(OBD_IOC_ABORT_RECOVERY,
1338 obd->obd_self_export, 0, NULL, NULL);
/* the result of the notification is not checked */
1341 server_notify_target(sb, obd);
1343 /* calculate recovery timeout, do it after lustre_process_log */
1344 server_calc_timeout(lsi, obd);
1346 /* log has been fully processed */
1347 obd_notify(obd, NULL, OBD_NOTIFY_CONFIG, (void *)CONFIG_LOG);
1353 /***************** lustre superblock **************/
/**
 * Set up the lustre_sb_info for \a sb.
 *
 * The visible code allocates the embedded lustre_mount_data, resets its
 * exclude count and recovery times, stores the lsi in the superblock,
 * starts lsi_mounts at one reference and selects failover as the default
 * umount style. lustre_fill_super() treats the returned pointer as the
 * new lsi.
 */
1355 struct lustre_sb_info *lustre_init_lsi(struct super_block *sb)
1357 struct lustre_sb_info *lsi;
1363 OBD_ALLOC_PTR(lsi->lsi_lmd);
1364 if (!lsi->lsi_lmd) {
1369 lsi->lsi_lmd->lmd_exclude_count = 0;
1370 lsi->lsi_lmd->lmd_recovery_time_soft = 0;
1371 lsi->lsi_lmd->lmd_recovery_time_hard = 0;
1372 s2lsi_nocast(sb) = lsi;
1373 /* we take 1 extra ref for our setup */
1374 cfs_atomic_set(&lsi->lsi_mounts, 1);
1376 /* Default umount style */
1377 lsi->lsi_flags = LSI_UMOUNT_FAILOVER;
/**
 * Free the lustre_sb_info attached to \a sb.
 *
 * Asserts that no mount reference is left, then releases lsi_ldd, every
 * string owned by lsi_lmd (dev, profile, mgssec, opts, mgs, osd_type), the
 * exclude array, the lmd itself and finally the lsi, after which the
 * superblock's lsi pointer is cleared.
 */
1382 static int lustre_free_lsi(struct super_block *sb)
1384 struct lustre_sb_info *lsi = s2lsi(sb);
1387 LASSERT(lsi != NULL);
1388 CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi);
1390 /* someone didn't call server_put_mount. */
1391 LASSERT(cfs_atomic_read(&lsi->lsi_mounts) == 0);
1393 if (lsi->lsi_ldd != NULL)
1394 OBD_FREE(lsi->lsi_ldd, sizeof(*lsi->lsi_ldd));
1396 if (lsi->lsi_lmd != NULL) {
/* Each string is freed with strlen() + 1, so it must have been allocated
 * with exactly that size (see lmd_parse() and its helpers). */
1397 if (lsi->lsi_lmd->lmd_dev != NULL)
1398 OBD_FREE(lsi->lsi_lmd->lmd_dev,
1399 strlen(lsi->lsi_lmd->lmd_dev) + 1);
1400 if (lsi->lsi_lmd->lmd_profile != NULL)
1401 OBD_FREE(lsi->lsi_lmd->lmd_profile,
1402 strlen(lsi->lsi_lmd->lmd_profile) + 1);
1403 if (lsi->lsi_lmd->lmd_mgssec != NULL)
1404 OBD_FREE(lsi->lsi_lmd->lmd_mgssec,
1405 strlen(lsi->lsi_lmd->lmd_mgssec) + 1);
1406 if (lsi->lsi_lmd->lmd_opts != NULL)
1407 OBD_FREE(lsi->lsi_lmd->lmd_opts,
1408 strlen(lsi->lsi_lmd->lmd_opts) + 1);
/* lmd_exclude is an array of lmd_exclude_count entries built by lmd_make_exclusion() */
1409 if (lsi->lsi_lmd->lmd_exclude_count)
1410 OBD_FREE(lsi->lsi_lmd->lmd_exclude,
1411 sizeof(lsi->lsi_lmd->lmd_exclude[0]) *
1412 lsi->lsi_lmd->lmd_exclude_count);
1413 if (lsi->lsi_lmd->lmd_mgs != NULL)
1414 OBD_FREE(lsi->lsi_lmd->lmd_mgs,
1415 strlen(lsi->lsi_lmd->lmd_mgs) + 1);
1416 if (lsi->lsi_lmd->lmd_osd_type != NULL)
1417 OBD_FREE(lsi->lsi_lmd->lmd_osd_type,
1418 strlen(lsi->lsi_lmd->lmd_osd_type) + 1);
1420 OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd));
1423 LASSERT(lsi->lsi_llsbi == NULL);
1424 OBD_FREE(lsi, sizeof(*lsi));
1425 s2lsi_nocast(sb) = NULL;
1430 /* The lsi has one reference for every server that is using the disk -
1431 e.g. MDT, MGS, and potentially MGC */
/* Drops one lsi_mounts reference; the put that brings the count to zero
 * frees the lsi through lustre_free_lsi(). */
1432 static int lustre_put_lsi(struct super_block *sb)
1434 struct lustre_sb_info *lsi = s2lsi(sb);
1437 LASSERT(lsi != NULL);
1439 CDEBUG(D_MOUNT, "put %p %d\n", sb, cfs_atomic_read(&lsi->lsi_mounts));
1440 if (cfs_atomic_dec_and_test(&lsi->lsi_mounts)) {
1441 lustre_free_lsi(sb);
/**
 * Fill in a newly allocated lustre_disk_data from mount-line options only.
 *
 * server_kernel_mount() calls this when an osd type was given on the mount
 * line. The server name comes from lmd_profile, the server type, fsname
 * and index are derived from that name, and a few LMD_FLG_* mount flags
 * are translated into LDD_F_* bits. Names that do not fit yield
 * -ENAMETOOLONG, a missing or unclassifiable server name yields -EINVAL;
 * the ldd is freed on the error path.
 */
1447 static int lsi_prepare(struct lustre_sb_info *lsi)
1449 struct lustre_disk_data *ldd;
1455 LASSERT(lsi->lsi_lmd);
1457 OBD_ALLOC(ldd, sizeof(*ldd));
1461 strcpy(lsi->lsi_osd_type, LUSTRE_OSD_NAME);
1463 /* The server name is given as a mount line option */
1464 if (lsi->lsi_lmd->lmd_profile == NULL) {
1465 LCONSOLE_ERROR("Can't determine server name\n");
1466 GOTO(err, rc = -EINVAL);
1469 if (strlen(lsi->lsi_lmd->lmd_profile) >= sizeof(ldd->ldd_svname))
1470 GOTO(err, rc = -ENAMETOOLONG);
1472 strcpy(ldd->ldd_svname, lsi->lsi_lmd->lmd_profile);
1474 /* Determine osd type */
1475 if (lsi->lsi_lmd->lmd_osd_type != NULL) {
1476 if (strlen(lsi->lsi_lmd->lmd_osd_type) >=
1477 sizeof(lsi->lsi_osd_type))
1478 GOTO(err, rc = -ENAMETOOLONG);
1480 strcpy(lsi->lsi_osd_type, lsi->lsi_lmd->lmd_osd_type);
/* The "-OST" or "-MDT" marker inside the server name selects the server
 * type; the text before it is the fsname and the hex digits right after
 * it are the target index. */
1483 if ((p = strstr(ldd->ldd_svname, "-OST"))) {
1484 ldd->ldd_flags = LDD_F_SV_TYPE_OST;
1485 } else if ((p = strstr(ldd->ldd_svname, "-MDT"))) {
1486 ldd->ldd_flags = LDD_F_SV_TYPE_MDT;
1488 LCONSOLE_ERROR("Can't determine server type of '%s'\n",
1490 GOTO(err, rc = -EINVAL);
1493 len = p - ldd->ldd_svname;
1494 if (len >= MTI_NAME_MAXLEN)
1495 GOTO(err, rc = -ENAMETOOLONG);
1496 memcpy(ldd->ldd_fsname, ldd->ldd_svname, len);
1497 ldd->ldd_fsname[len] = '\0';
1499 ldd->ldd_svindex = simple_strtoul(p + 4, NULL, 16);
1500 ldd->ldd_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_WRITECONF) ?
1501 LDD_F_WRITECONF : 0;
1505 /* Add mount line flags that used to be in ldd:
1506 * writeconf, mgs, iam, anything else?
1509 lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_WRITECONF) ?
1510 LDD_F_WRITECONF : 0;
1511 lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_MGS) ?
1512 LDD_F_SV_TYPE_MGS : 0;
1513 lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_IAM) ?
1515 lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_NO_PRIMNODE) ?
1516 LDD_F_NO_PRIMNODE : 0;
1522 OBD_FREE(ldd, sizeof(*ldd));
1526 /*************** server mount ******************/
/*
 * server_kernel_mount(): mount the backing disk filesystem for a server.
 *
 * When an osd type was given on the mount line, lsi_prepare() is called
 * instead and ERR_PTR() of its result is returned without any mount here.
 * NOTE(review): on lsi_prepare() success that is ERR_PTR(0), i.e. NULL --
 * confirm the caller copes with a NULL vfsmount.
 *
 * Otherwise the device is first mounted as "ldiskfs" with the mount-line
 * options so that ldd_parse() can read the on-disk data, then mounted again
 * with the filesystem type and options taken from the ldd (plus
 * "no_mbcache", the mount-line options, MS_NOATIME and MS_NODIRATIME).
 * On success the ldd is stored in lsi->lsi_ldd; the failure path frees the
 * options page and the ldd and returns ERR_PTR(rc).
 */
1528 /** Kernel mount using mount options in MOUNT_DATA_FILE.
1529 * Since this file lives on the disk, we pre-mount using a common
1530 * type, read the file, then re-mount using the type specified in the
1533 static struct vfsmount *server_kernel_mount(struct super_block *sb)
1535 struct lvfs_run_ctxt mount_ctxt;
1536 struct lustre_sb_info *lsi = s2lsi(sb);
1537 struct lustre_disk_data *ldd;
1538 struct lustre_mount_data *lmd = lsi->lsi_lmd;
1539 struct vfsmount *mnt;
1540 struct file_system_type *type;
1541 char *options = NULL;
1542 unsigned long page, s_flags;
1543 struct page *__page;
1548 if (lsi->lsi_lmd->lmd_osd_type) {
1549 rc = lsi_prepare(lsi);
1550 RETURN(ERR_PTR(rc));
1553 OBD_ALLOC(ldd, sizeof(*ldd));
1555 RETURN(ERR_PTR(-ENOMEM));
1556 strcpy(lsi->lsi_osd_type, LUSTRE_OSD_NAME);
1558 /* In the past, we have always used flags = 0.
1559 Note ext3/ldiskfs can't be mounted ro. */
1560 s_flags = sb->s_flags;
1562 /* allocate memory for options */
1563 OBD_PAGE_ALLOC(__page, CFS_ALLOC_STD);
1565 GOTO(out_free, rc = -ENOMEM);
1566 page = (unsigned long)cfs_page_address(__page);
1567 options = (char *)page;
1568 memset(options, 0, CFS_PAGE_SIZE);
1570 /* mount-line options must be added for pre-mount because it may
1571 * contain mount options such as journal_dev which are required
1572 * to mount successfully the underlying filesystem */
1573 if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0))
1574 strncat(options, lmd->lmd_opts, CFS_PAGE_SIZE - 1);
1576 /* Pre-mount ldiskfs to read the MOUNT_DATA_FILE */
1577 CDEBUG(D_MOUNT, "Pre-mount ldiskfs %s\n", lmd->lmd_dev);
1578 type = get_fs_type("ldiskfs");
1580 CERROR("premount failed: cannot find ldiskfs module\n");
1581 GOTO(out_free, rc = -ENODEV);
1583 mnt = vfs_kern_mount(type, s_flags, lmd->lmd_dev, (void *)options);
1584 cfs_module_put(type->owner);
1587 CERROR("premount %s:%#lx ldiskfs failed: %d "
1588 "Is the ldiskfs module available?\n",
1589 lmd->lmd_dev, s_flags, rc );
1593 OBD_SET_CTXT_MAGIC(&mount_ctxt);
1594 mount_ctxt.pwdmnt = mnt;
1595 mount_ctxt.pwd = mnt->mnt_root;
1596 mount_ctxt.fs = get_ds();
1598 rc = ldd_parse(&mount_ctxt, ldd);
1602 CERROR("premount parse options failed: rc = %d\n", rc);
1606 /* Done with our pre-mount, now do the real mount. */
1608 /* Glom up mount options */
1609 memset(options, 0, CFS_PAGE_SIZE);
1610 strncpy(options, ldd->ldd_mount_opts, CFS_PAGE_SIZE - 2);
/* len is the room left in the page after the current options, keeping
 * two bytes back for the ',' separator and the terminating NUL */
1612 len = CFS_PAGE_SIZE - strlen(options) - 2;
1614 strcat(options, ",");
1615 strncat(options, "no_mbcache", len);
1617 /* Add in any mount-line options */
1618 if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0)) {
1619 len = CFS_PAGE_SIZE - strlen(options) - 2;
1620 strcat(options, ",");
1621 strncat(options, lmd->lmd_opts, len);
1624 /* Special permanent mount flags */
1626 s_flags |= MS_NOATIME | MS_NODIRATIME;
1628 CDEBUG(D_MOUNT, "kern_mount: %s %s %s\n",
1629 MT_STR(ldd), lmd->lmd_dev, options);
1630 type = get_fs_type(MT_STR(ldd));
1632 CERROR("get_fs_type failed\n");
1633 GOTO(out_free, rc = -ENODEV);
1635 mnt = vfs_kern_mount(type, s_flags, lmd->lmd_dev, (void *)options);
1636 cfs_module_put(type->owner);
1639 CERROR("vfs_kern_mount failed: rc = %d\n", rc);
/* "abort_recov" mount option: truncate the LAST_RCVD file on the new mount */
1643 if (lmd->lmd_flags & LMD_FLG_ABORT_RECOV)
1644 simple_truncate(mnt->mnt_sb->s_root, mnt, LAST_RCVD,
1647 OBD_PAGE_FREE(__page);
1648 lsi->lsi_ldd = ldd; /* freed at lsi cleanup */
1649 CDEBUG(D_SUPER, "%s: mnt = %p\n", lmd->lmd_dev, mnt);
1654 OBD_PAGE_FREE(__page);
1655 OBD_FREE(ldd, sizeof(*ldd));
1656 lsi->lsi_ldd = NULL;
1657 RETURN(ERR_PTR(rc));
/*
 * server_wait_finished(): block umount until the backing mount is idle.
 *
 * The visible loop runs while mnt_get_count(mnt) > 1, i.e. until only one
 * reference is left. Each pass sleeps interruptibly for up to three
 * seconds with the signal mask swapped via cfs_block_sigsinv(SIGKILL) and
 * restored afterwards. A console warning is printed whenever the waited
 * counter is a non-zero multiple of 30, and an emergency message is
 * printed when the wait is interrupted with references still held.
 */
1660 /** Wait here forever until the mount refcount is 0 before completing umount,
1661 * else we risk dereferencing a null pointer.
1662 * LNET may take e.g. 165s before killing zombies.
1664 static void server_wait_finished(struct vfsmount *mnt)
1668 cfs_sigset_t blocked;
1671 cfs_waitq_init(&waitq);
1672 cfs_waitq_wait_event_interruptible_timeout(waitq, 0,
1673 cfs_time_seconds(3), rc);
1678 cfs_waitq_init(&waitq);
1680 while (mnt_get_count(mnt) > 1) {
1681 if (waited && (waited % 30 == 0))
1682 LCONSOLE_WARN("Mount still busy with %d refs after "
1686 /* Cannot use l_event_wait() for an interruptible sleep. */
1688 blocked = cfs_block_sigsinv(sigmask(SIGKILL));
1689 cfs_waitq_wait_event_interruptible_timeout(
1691 (mnt_get_count(mnt) == 1),
1692 cfs_time_seconds(3),
1694 cfs_restore_sigs(blocked);
1696 LCONSOLE_EMERG("Danger: interrupted umount %s with "
1697 "%d refs!\n", mnt_get_devname(mnt),
1698 mnt_get_count(mnt));
/*
 * server_put_super(): .put_super handler for server mounts; it is also
 * called by server_fill_super() to clean up after a failed start.
 *
 * The target name, ldd flags and lsi flags are copied to locals at the top
 * because they are still needed after lustre_common_put_super(), which
 * drops a reference on the mounted disk (presumably freeing the lsi on the
 * last put -- confirm).
 *
 * Order of the visible steps: end the config log and stop the target obd
 * (skipped with the "nosvc" option), stop the MGS unless "nomgs" was
 * given, drop the MGC and sb references, wait for the backing mount to go
 * idle, stop the shared MDS/OSS servers, and finally clean up any
 * orphaned obd recorded in extraname.
 *
 * NOTE(review): tmpname is written by memcpy() directly after OBD_ALLOC()
 * without a NULL check.
 */
1705 /** Start the shutdown of servers at umount.
1707 static void server_put_super(struct super_block *sb)
1709 struct lustre_sb_info *lsi = s2lsi(sb);
1710 struct obd_device *obd;
1711 struct vfsmount *mnt = lsi->lsi_srv_mnt;
1712 char *tmpname, *extraname = NULL;
1714 int lddflags = lsi->lsi_ldd->ldd_flags;
1715 int lsiflags = lsi->lsi_flags;
1718 LASSERT(lsiflags & LSI_SERVER);
1720 tmpname_sz = strlen(lsi->lsi_ldd->ldd_svname) + 1;
1721 OBD_ALLOC(tmpname, tmpname_sz);
1722 memcpy(tmpname, lsi->lsi_ldd->ldd_svname, tmpname_sz);
1723 CDEBUG(D_MOUNT, "server put_super %s\n", tmpname);
1724 if (IS_MDT(lsi->lsi_ldd) && (lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC))
1725 snprintf(tmpname, tmpname_sz, "MGS");
1727 /* Stop the target */
1728 if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
1729 (IS_MDT(lsi->lsi_ldd) || IS_OST(lsi->lsi_ldd))) {
1730 struct lustre_profile *lprof = NULL;
1732 /* tell the mgc to drop the config log */
1733 lustre_end_log(sb, lsi->lsi_ldd->ldd_svname, NULL);
1735 /* COMPAT_146 - profile may get deleted in mgc_cleanup.
1736 If there are any setup/cleanup errors, save the lov
1737 name for safety cleanup later. */
1738 lprof = class_get_profile(lsi->lsi_ldd->ldd_svname);
1739 if (lprof && lprof->lp_dt) {
/* NOTE(review): extraname is passed to strcpy() right after OBD_ALLOC()
 * without a NULL check */
1740 OBD_ALLOC(extraname, strlen(lprof->lp_dt) + 1);
1741 strcpy(extraname, lprof->lp_dt);
1744 obd = class_name2obd(lsi->lsi_ldd->ldd_svname);
1746 CDEBUG(D_MOUNT, "stopping %s\n", obd->obd_name);
1747 if (lsi->lsi_flags & LSI_UMOUNT_FAILOVER)
1749 /* We can't seem to give an error return code
1750 * to .put_super, so we better make sure we clean up! */
1752 class_manual_cleanup(obd);
1754 CERROR("no obd %s\n", lsi->lsi_ldd->ldd_svname);
1755 server_deregister_mount(lsi->lsi_ldd->ldd_svname);
1759 /* If they wanted the mgs to stop separately from the mdt, they
1760 should have put it on a different device. */
1761 if (IS_MGS(lsi->lsi_ldd)) {
1762 /* if MDS start with --nomgs, don't stop MGS then */
1763 if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS))
1764 server_stop_mgs(sb);
1767 /* Clean the mgc and sb */
1768 lustre_common_put_super(sb);
1770 /* Wait for the targets to really clean up - can't exit (and let the
1771 sb get destroyed) while the mount is still in use */
1772 server_wait_finished(mnt);
1774 /* drop the One True Mount */
1778 /* Stop the servers (MDS, OSS) if no longer needed. We must wait
1779 until the target is really gone so that our type refcount check
1781 server_stop_servers(lddflags, lsiflags);
1783 /* In case of startup or cleanup err, stop related obds */
1785 obd = class_name2obd(extraname);
1787 CWARN("Cleaning orphaned obd %s\n", extraname);
1789 class_manual_cleanup(obd);
1791 OBD_FREE(extraname, strlen(extraname) + 1);
1794 LCONSOLE_WARN("server umount %s complete\n", tmpname);
1795 OBD_FREE(tmpname, tmpname_sz);
/*
 * server_umount_begin(): .umount_begin handler for server superblocks.
 *
 * Switches the umount style recorded in lsi_flags from failover to force
 * by clearing LSI_UMOUNT_FAILOVER and setting LSI_UMOUNT_FORCE.
 * HAVE_UMOUNTBEGIN_VFSMOUNT selects between two signatures; the vfsmount
 * variant tests MNT_FORCE in flags first (presumably returning early when
 * it is not set -- confirm).
 */
1799 /** Called only for 'umount -f'
1801 #ifdef HAVE_UMOUNTBEGIN_VFSMOUNT
1802 static void server_umount_begin(struct vfsmount *vfsmnt, int flags)
1804 struct super_block *sb = vfsmnt->mnt_sb;
1806 static void server_umount_begin(struct super_block *sb)
1809 struct lustre_sb_info *lsi = s2lsi(sb);
1812 #ifdef HAVE_UMOUNTBEGIN_VFSMOUNT
1813 if (!(flags & MNT_FORCE)) {
1819 CDEBUG(D_MOUNT, "umount -f\n");
1820 /* umount = failover
1822 no third way to do non-force, non-failover */
1823 lsi->lsi_flags &= ~LSI_UMOUNT_FAILOVER;
1824 lsi->lsi_flags |= LSI_UMOUNT_FORCE;
/**
 * .statfs handler for server superblocks.
 *
 * When the backing mount and its statfs operation are available, the call
 * is forwarded to the backing filesystem and f_type is replaced by this
 * superblock's magic. Otherwise the buffer is filled locally from
 * sb->s_magic, sb->s_blocksize and NAME_MAX.
 */
1828 static int server_statfs (struct dentry *dentry, cfs_kstatfs_t *buf)
1830 struct super_block *sb = dentry->d_sb;
1831 struct vfsmount *mnt = s2lsi(sb)->lsi_srv_mnt;
1834 if (mnt && mnt->mnt_sb && mnt->mnt_sb->s_op->statfs) {
1835 int rc = mnt->mnt_sb->s_op->statfs(mnt->mnt_root, buf);
1837 buf->f_type = sb->s_magic;
/* fallback values when the backing filesystem cannot be asked */
1843 buf->f_type = sb->s_magic;
1844 buf->f_bsize = sb->s_blocksize;
1850 buf->f_namelen = NAME_MAX;
/* Installed as sb->s_op by server_fill_super_common(). */
1854 /** The operations we support directly on the superblock:
1855 * mount, umount, and df.
1857 static struct super_operations server_ops =
1859 .put_super = server_put_super,
1860 .umount_begin = server_umount_begin, /* umount -f */
1861 .statfs = server_statfs,
/* Used below to derive s_blocksize_bits from s_blocksize; presumably the
 * index of the lowest set bit of n (first zero bit of ~n) -- confirm
 * against cfs_ffz(). */
1864 #define log2(n) cfs_ffz(~(n))
/* Value stored in sb->s_magic for server superblocks. */
1865 #define LUSTRE_SUPER_MAGIC 0x0BD00BD1
/**
 * Fill in the generic VFS fields of a server superblock.
 *
 * Sets a 4096-byte block size, the Lustre magic, a zero s_maxbytes, the
 * read-only flag and server_ops, then creates a directory inode and uses
 * it to allocate the root dentry. Errors are logged when the inode or the
 * root dentry cannot be created.
 */
1867 static int server_fill_super_common(struct super_block *sb)
1869 struct inode *root = 0;
1872 CDEBUG(D_MOUNT, "Server sb, dev=%d\n", (int)sb->s_dev);
1874 sb->s_blocksize = 4096;
1875 sb->s_blocksize_bits = log2(sb->s_blocksize);
1876 sb->s_magic = LUSTRE_SUPER_MAGIC;
1877 sb->s_maxbytes = 0; /* we don't allow file IO on server mountpoints */
1878 sb->s_flags |= MS_RDONLY;
1879 sb->s_op = &server_ops;
1881 root = new_inode(sb);
1883 CERROR("Can't make root inode\n");
1887 /* returns -EIO for every operation */
1888 /* make_bad_inode(root); -- badness - can't umount */
1889 /* apparently we need to be a directory for the mount to finish */
1890 root->i_mode = S_IFDIR;
1892 sb->s_root = d_alloc_root(root);
1894 CERROR("Can't make root dentry\n");
/*
 * server_fill_super(): server half of lustre_fill_super().
 *
 * Start-up order in the visible code: kernel mount of the device, MGS
 * (unless "nomgs"), MGC, the MDT/OST targets (unless "nosvc"), and last
 * the generic superblock fields. A failure while starting the MGS or the
 * targets is cleaned up through server_put_super().
 */
1902 /** Fill in the superblock info for a Lustre server.
1903 * Mount the device with the correct options.
1904 * Read the on-disk config file.
1905 * Start the services.
1907 static int server_fill_super(struct super_block *sb)
1909 struct lustre_sb_info *lsi = s2lsi(sb);
1910 struct vfsmount *mnt;
1914 /* the One True Mount */
1915 mnt = server_kernel_mount(sb);
1918 CERROR("Unable to mount device %s: %d\n",
1919 lsi->lsi_lmd->lmd_dev, rc);
1923 lsi->lsi_srv_mnt = mnt;
1925 LASSERT(lsi->lsi_ldd);
1926 CDEBUG(D_MOUNT, "Found service %s for fs '%s' on device %s\n",
1927 lsi->lsi_ldd->ldd_svname, lsi->lsi_ldd->ldd_fsname,
1928 lsi->lsi_lmd->lmd_dev);
/* An obd with this target name already exists: treat it as a double mount */
1930 if (class_name2obd(lsi->lsi_ldd->ldd_svname)) {
1931 LCONSOLE_ERROR_MSG(0x161, "The target named %s is already "
1932 "running. Double-mount may have compromised"
1933 " the disk journal.\n",
1934 lsi->lsi_ldd->ldd_svname);
1940 /* Start MGS before MGC */
1941 if (IS_MGS(lsi->lsi_ldd) && !(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)){
1942 rc = server_start_mgs(sb);
1947 /* Start MGC before servers */
1948 rc = lustre_start_mgc(sb);
1952 /* Set up all obd devices for service */
1953 if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
1954 (IS_OST(lsi->lsi_ldd) || IS_MDT(lsi->lsi_ldd))) {
1955 rc = server_start_targets(sb, mnt);
1957 CERROR("Unable to start targets: %d\n", rc);
1960 /* FIXME overmount client here,
1961 or can we just start a client log and client_fill_super on this sb?
1962 We need to make sure server_put_super gets called too - ll_put_super
1963 calls lustre_common_put_super; check there for LSI_SERVER flag,
1965 Probably should start client from new thread so we can return.
1966 Client will not finish until all servers are connected.
1967 Note - MGS-only server does NOT get a client, since there is no
1968 lustre fs associated - the MGS is for all lustre fs's */
1971 rc = server_fill_super_common(sb);
1977 /* We jump here in case of failure while starting targets or MGS.
1978 * In this case we can't just put @mnt and have to do real cleanup
1979 * with stopping targets, etc. */
1980 server_put_super(sb);
1984 /* Get the index from the obd name.
1985 rc = server type, or
1987 if endptr isn't NULL it is set to end of name */
/* The name is examined from its last '-': "MDT" or "OST" after the dash
 * selects LDD_F_SV_TYPE_MDT or LDD_F_SV_TYPE_OST, and the text after those
 * three letters is parsed as a hexadecimal index. The literal suffix "all"
 * instead returns the type ORed with LDD_F_SV_ALL. */
1988 int server_name2index(char *svname, __u32 *idx, char **endptr)
1990 unsigned long index;
1992 char *dash = strrchr(svname, '-');
1996 /* interpret <fsname>-MDTXXXXX-mdc as mdt, the better way is to pass
1997 * in the fsname, then determine the server index */
1998 if (!strcmp(LUSTRE_MDC_NAME, dash + 1)) {
/* walk back to the previous '-' so that dash points at the MDT part */
2000 for (; dash > svname && *dash != '-'; dash--);
2005 if (strncmp(dash + 1, "MDT", 3) == 0)
2006 rc = LDD_F_SV_TYPE_MDT;
2007 else if (strncmp(dash + 1, "OST", 3) == 0)
2008 rc = LDD_F_SV_TYPE_OST;
2011 if (strcmp(dash + 4, "all") == 0)
2012 return rc | LDD_F_SV_ALL;
2014 index = simple_strtoul(dash + 4, endptr, 16);
2018 EXPORT_SYMBOL(server_name2index);
/*
 * server_calc_timeout(): set the recovery window of \a obd.
 *
 * The soft and hard limits come from the recovery_time_soft and
 * recovery_time_hard mount options, with OBD_RECOVERY_TIME_SOFT and
 * OBD_RECOVERY_TIME_HARD as the fallback values. Imperative recovery
 * counts as usable only when the lsi has LSI_IR_CAPABLE and the mount did
 * not set LMD_FLG_NOIR; the result is mirrored into obd->obd_no_ir.
 *
 * The imperative-recovery adjustment scales both limits by
 * factor / OBD_IR_FACTOR_MAX, keeps the soft limit at or above
 * OBD_RECOVERY_TIME_MIN and keeps the hard limit at or above the soft one.
 * The factor is the obd's own obd_recovery_ir_factor when set, otherwise
 * OBD_IR_FACTOR_DEFAULT.
 */
2021 * Calculate timeout value for a target.
2023 void server_calc_timeout(struct lustre_sb_info *lsi, struct obd_device *obd)
2025 struct lustre_mount_data *lmd;
2029 bool has_ir = !!(lsi->lsi_flags & LSI_IR_CAPABLE);
2030 int min = OBD_RECOVERY_TIME_MIN;
2032 LASSERT(lsi->lsi_flags & LSI_SERVER);
2036 soft = lmd->lmd_recovery_time_soft;
2037 hard = lmd->lmd_recovery_time_hard;
2038 has_ir = has_ir && !(lmd->lmd_flags & LMD_FLG_NOIR);
2039 obd->obd_no_ir = !has_ir;
2043 soft = OBD_RECOVERY_TIME_SOFT;
2045 hard = OBD_RECOVERY_TIME_HARD;
2047 /* target may have ir_factor configured. */
2048 factor = OBD_IR_FACTOR_DEFAULT;
2049 if (obd->obd_recovery_ir_factor)
2050 factor = obd->obd_recovery_ir_factor;
2053 int new_soft = soft;
2054 int new_hard = hard;
2056 /* adjust timeout value by imperative recovery */
2058 new_soft = (soft * factor) / OBD_IR_FACTOR_MAX;
2059 new_hard = (hard * factor) / OBD_IR_FACTOR_MAX;
2061 /* make sure the timeout is not too short */
2062 new_soft = max(min, new_soft);
2063 new_hard = max(new_soft, new_hard);
2065 LCONSOLE_INFO("%s: Imperative Recovery enabled, recovery "
2066 "window shrunk from %d-%d down to %d-%d\n",
2067 obd->obd_name, soft, hard, new_soft, new_hard);
/* the timeout already set on the obd is never lowered here */
2074 obd->obd_recovery_timeout = max(obd->obd_recovery_timeout, soft);
2075 obd->obd_recovery_time_hard = hard;
2076 obd->obd_recovery_ir_factor = factor;
2078 EXPORT_SYMBOL(server_calc_timeout);
2080 /*************** mount common between server and client ***************/
/**
 * Teardown shared by server and client superblocks: drop this mount's
 * reference on the MGC, then its reference on the mounted disk.
 *
 * An -ENOENT result from lustre_stop_mgc() is not treated as an error.
 * Other non-zero results are either logged as a failure or, for the
 * "still in use" case described below, only traced.
 */
2083 int lustre_common_put_super(struct super_block *sb)
2088 CDEBUG(D_MOUNT, "dropping sb %p\n", sb);
2090 /* Drop a ref to the MGC */
2091 rc = lustre_stop_mgc(sb);
2092 if (rc && (rc != -ENOENT)) {
2094 CERROR("Can't stop MGC: %d\n", rc);
2097 /* BUSY just means that there's some other obd that
2098 needs the mgc. Let him clean it up. */
2099 CDEBUG(D_MOUNT, "MGC still in use\n");
2101 /* Drop a ref to the mounted disk */
2106 EXPORT_SYMBOL(lustre_common_put_super);
/**
 * Dump the parsed mount data through PRINT_CMD at PRINT_MASK level.
 *
 * The profile is printed only for client mounts, the recovery times only
 * when non-zero, and each excluded OST index as a four-digit hex number.
 */
2108 static void lmd_print(struct lustre_mount_data *lmd)
2112 PRINT_CMD(PRINT_MASK, " mount data:\n");
2113 if (lmd_is_client(lmd))
2114 PRINT_CMD(PRINT_MASK, "profile: %s\n", lmd->lmd_profile);
2115 PRINT_CMD(PRINT_MASK, "device: %s\n", lmd->lmd_dev);
2116 PRINT_CMD(PRINT_MASK, "flags: %x\n", lmd->lmd_flags);
2119 PRINT_CMD(PRINT_MASK, "options: %s\n", lmd->lmd_opts);
2121 if (lmd->lmd_recovery_time_soft)
2122 PRINT_CMD(PRINT_MASK, "recovery time soft: %d\n",
2123 lmd->lmd_recovery_time_soft);
2125 if (lmd->lmd_recovery_time_hard)
2126 PRINT_CMD(PRINT_MASK, "recovery time hard: %d\n",
2127 lmd->lmd_recovery_time_hard);
2129 for (i = 0; i < lmd->lmd_exclude_count; i++) {
2130 PRINT_CMD(PRINT_MASK, "exclude %d: OST%04x\n", i,
2131 lmd->lmd_exclude[i]);
2135 /* Is this server on the exclusion list */
/* The index parsed from svname by server_name2index() is compared against
 * every entry of the mount's lmd_exclude[] array; names that are not OSTs
 * are never matched. */
2136 int lustre_check_exclusion(struct super_block *sb, char *svname)
2138 struct lustre_sb_info *lsi = s2lsi(sb);
2139 struct lustre_mount_data *lmd = lsi->lsi_lmd;
2144 rc = server_name2index(svname, &index, NULL);
2145 if (rc != LDD_F_SV_TYPE_OST)
2146 /* Only exclude OSTs */
2149 CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname,
2150 index, lmd->lmd_exclude_count, lmd->lmd_dev);
2152 for(i = 0; i < lmd->lmd_exclude_count; i++) {
2153 if (index == lmd->lmd_exclude[i]) {
2154 CWARN("Excluding %s (on exclusion list)\n", svname);
2161 /* mount -v -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */
/* Parses the value of the "exclude=" option into lmd->lmd_exclude.
 * Indices are first collected in a temporary array sized from the option
 * length, then copied into a permanent array of exactly
 * lmd_exclude_count entries. Only OST names are recorded; other server
 * names are skipped with a debug message. If the permanent allocation
 * fails the count is reset to zero. */
2162 static int lmd_make_exclusion(struct lustre_mount_data *lmd, char *ptr)
2164 char *s1 = ptr, *s2;
2165 __u32 index, *exclude_list;
2169 /* The shortest an ost name can be is 8 chars: -OST0000.
2170 We don't actually know the fsname at this time, so in fact
2171 a user could specify any fsname. */
2172 devmax = strlen(ptr) / 8 + 1;
2174 /* temp storage until we figure out how many we have */
2175 OBD_ALLOC(exclude_list, sizeof(index) * devmax);
2179 /* we enter this fn pointing at the '=' */
2180 while (*s1 && *s1 != ' ' && *s1 != ',') {
2182 rc = server_name2index(s1, &index, &s2);
2184 CERROR("Can't parse server name '%s'\n", s1);
2187 if (rc == LDD_F_SV_TYPE_OST)
2188 exclude_list[lmd->lmd_exclude_count++] = index;
2190 CDEBUG(D_MOUNT, "ignoring exclude %.7s\n", s1);
2192 /* now we are pointing at ':' (next exclude)
2193 or ',' (end of excludes) */
/* never store more entries than the temporary array can hold */
2194 if (lmd->lmd_exclude_count >= devmax)
2197 if (rc >= 0) /* non-err */
2200 if (lmd->lmd_exclude_count) {
2201 /* permanent, freed in lustre_free_lsi */
2202 OBD_ALLOC(lmd->lmd_exclude, sizeof(index) *
2203 lmd->lmd_exclude_count);
2204 if (lmd->lmd_exclude) {
2205 memcpy(lmd->lmd_exclude, exclude_list,
2206 sizeof(index) * lmd->lmd_exclude_count);
2209 lmd->lmd_exclude_count = 0;
2212 OBD_FREE(exclude_list, sizeof(index) * devmax);
/**
 * Store the value of the "mgssec=" mount option in lmd->lmd_mgssec.
 *
 * Any previous value is freed first. The value runs from \a ptr up to the
 * next ',' or, when there is none, to the end of the string, and is copied
 * into a newly allocated NUL-terminated buffer of length + 1 bytes.
 */
2216 static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr)
2221 if (lmd->lmd_mgssec != NULL) {
2222 OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1);
2223 lmd->lmd_mgssec = NULL;
2226 tail = strchr(ptr, ',');
2228 length = strlen(ptr);
2230 length = tail - ptr;
2232 OBD_ALLOC(lmd->lmd_mgssec, length + 1);
2233 if (lmd->lmd_mgssec == NULL)
2236 memcpy(lmd->lmd_mgssec, ptr, length);
2237 lmd->lmd_mgssec[length] = '\0';
/**
 * Generic form of lmd_parse_mgssec(): replace the string held in
 * \a *handle with a copy of the option value starting at \a ptr.
 *
 * Both arguments are checked for NULL. An existing string is freed, and
 * the value (up to the next ',' or the end of the string) is copied into a
 * newly allocated NUL-terminated buffer of length + 1 bytes.
 */
2241 static int lmd_parse_string(char **handle, char *ptr)
2246 if ((handle == NULL) || (ptr == NULL))
2249 if (*handle != NULL) {
2250 OBD_FREE(*handle, strlen(*handle) + 1);
2254 tail = strchr(ptr, ',');
2256 length = strlen(ptr);
2258 length = tail - ptr;
2260 OBD_ALLOC(*handle, length + 1);
2261 if (*handle == NULL)
2264 memcpy(*handle, ptr, length);
2265 (*handle)[length] = '\0';
2270 /* Collect multiple values for mgsnid specifiers */
/* The NID list starting at *ptr is measured by calling class_parse_nid()
 * until it stops succeeding. The text is then appended to lmd->lmd_mgs in
 * a newly allocated buffer, joined to any earlier value with ':'. */
2271 static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr)
2279 /* Find end of nidlist */
2280 while (class_parse_nid(tail, &nid, &tail) == 0) {}
2281 length = tail - *ptr;
2283 LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", *ptr);
/* oldlen counts the old string including its terminating NUL */
2287 if (lmd->lmd_mgs != NULL)
2288 oldlen = strlen(lmd->lmd_mgs) + 1;
2290 OBD_ALLOC(mgsnid, oldlen + length + 1);
2294 if (lmd->lmd_mgs != NULL) {
2295 /* Multiple mgsnid= are taken to mean failover locations */
2296 memcpy(mgsnid, lmd->lmd_mgs, oldlen);
/* the old terminating NUL becomes the ':' separator */
2297 mgsnid[oldlen - 1] = ':';
2298 OBD_FREE(lmd->lmd_mgs, oldlen);
2300 memcpy(mgsnid + oldlen, *ptr, length);
2301 mgsnid[oldlen + length] = '\0';
2302 lmd->lmd_mgs = mgsnid;
/*
 * lmd_parse(): turn the mount option string into a lustre_mount_data.
 *
 * Lustre-specific options are recognised by prefix with strncmp() and
 * recorded as LMD_FLG_* bits or strings in \a lmd. The "device=" option
 * supplies the device name. A device containing ":/" marks a client mount,
 * whose profile becomes "<fsname>-client". The remaining option text is
 * saved in lmd_opts with trailing commas and blanks ignored.
 *
 * A buffer that looks like the old binary lmd structure (LMD_MAGIC in the
 * first word) is rejected with a console error.
 */
2308 /** Parse mount line options
2309 * e.g. mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre
2310 * dev is passed as device=uml1:/lustre by mount.lustre
2312 static int lmd_parse(char *options, struct lustre_mount_data *lmd)
2314 char *s1, *s2, *devname = NULL;
2315 struct lustre_mount_data *raw = (struct lustre_mount_data *)options;
2321 LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that "
2322 "/sbin/mount.lustre is installed.\n");
2326 /* Options should be a string - try to detect old lmd data */
2327 if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) {
2328 LCONSOLE_ERROR_MSG(0x163, "You're using an old version of "
2329 "/sbin/mount.lustre. Please install "
2330 "version %s\n", LUSTRE_VERSION_STRING);
2333 lmd->lmd_magic = LMD_MAGIC;
2335 /* Set default flags here */
2340 int time_min = OBD_RECOVERY_TIME_MIN;
2342 /* Skip whitespace and extra commas */
2343 while (*s1 == ' ' || *s1 == ',')
2346 /* Client options are parsed in ll_options: eg. flock,
2349 /* Parse non-ldiskfs options here. Rather than modifying
2350 ldiskfs, we just zero these out here */
2351 if (strncmp(s1, "abort_recov", 11) == 0) {
2352 lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
2354 } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) {
/* values below OBD_RECOVERY_TIME_MIN are raised to that minimum */
2355 lmd->lmd_recovery_time_soft = max_t(int,
2356 simple_strtoul(s1 + 19, NULL, 10), time_min);
2358 } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) {
2359 lmd->lmd_recovery_time_hard = max_t(int,
2360 simple_strtoul(s1 + 19, NULL, 10), time_min);
2362 } else if (strncmp(s1, "noir", 4) == 0) {
2363 lmd->lmd_flags |= LMD_FLG_NOIR; /* test purpose only. */
2365 } else if (strncmp(s1, "nosvc", 5) == 0) {
2366 lmd->lmd_flags |= LMD_FLG_NOSVC;
2368 } else if (strncmp(s1, "nomgs", 5) == 0) {
2369 lmd->lmd_flags |= LMD_FLG_NOMGS;
2371 } else if (strncmp(s1, "noscrub", 7) == 0) {
2372 lmd->lmd_flags |= LMD_FLG_NOSCRUB;
2374 } else if (strncmp(s1, PARAM_MGSNODE,
2375 sizeof(PARAM_MGSNODE) - 1) == 0) {
2376 s2 = s1 + sizeof(PARAM_MGSNODE) - 1;
2377 /* Assume the next mount opt is the first
2378 invalid nid we get to. */
2379 rc = lmd_parse_mgs(lmd, &s2);
2383 } else if (strncmp(s1, "writeconf", 9) == 0) {
2384 lmd->lmd_flags |= LMD_FLG_WRITECONF;
2386 } else if (strncmp(s1, "mgssec=", 7) == 0) {
2387 rc = lmd_parse_mgssec(lmd, s1 + 7);
2391 /* ost exclusion list */
2392 } else if (strncmp(s1, "exclude=", 8) == 0) {
/* s1 + 7 is the '=' itself: lmd_make_exclusion() expects to start on it */
2393 rc = lmd_make_exclusion(lmd, s1 + 7);
2397 } else if (strncmp(s1, "svname=", 7) == 0) {
2398 rc = lmd_parse_string(&lmd->lmd_profile, s1 + 7);
2402 } else if (strncmp(s1, "osd=", 4) == 0) {
2403 rc = lmd_parse_string(&lmd->lmd_osd_type, s1 + 4);
2406 /* with ldiskfs we're still doing ldd parsing
2407 * in the kernel space */
2408 if (!strcmp(lmd->lmd_osd_type, "osd-ldiskfs")) {
2409 OBD_FREE(lmd->lmd_osd_type,
2410 strlen(lmd->lmd_osd_type) + 1);
2411 lmd->lmd_osd_type = NULL;
2415 /* Linux 2.4 doesn't pass the device, so we stuck it at the
2416 end of the options. */
2417 else if (strncmp(s1, "device=", 7) == 0) {
2419 /* terminate options right before device. device
2420 must be the last one. */
2426 s2 = strchr(s1, ',');
2434 memmove(s1, s2, strlen(s2) + 1);
2440 LCONSOLE_ERROR_MSG(0x164, "Can't find the device name "
2441 "(need mount option 'device=...')\n");
2445 s1 = strstr(devname, ":/");
2448 lmd->lmd_flags |= LMD_FLG_CLIENT;
2449 /* Remove leading /s from fsname */
2450 while (*++s1 == '/') ;
2451 /* Freed in lustre_free_lsi; the extra 8 bytes hold "-client" and the NUL */
2452 OBD_ALLOC(lmd->lmd_profile, strlen(s1) + 8);
2453 if (!lmd->lmd_profile)
2455 sprintf(lmd->lmd_profile, "%s-client", s1);
2458 /* Freed in lustre_free_lsi */
2459 OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1);
2462 strcpy(lmd->lmd_dev, devname);
2464 /* Save mount options */
2465 s1 = options + strlen(options) - 1;
2466 while (s1 >= options && (*s1 == ',' || *s1 == ' '))
2468 if (*options != 0) {
2469 /* Freed in lustre_free_lsi */
2470 OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1);
2473 strcpy(lmd->lmd_opts, options);
2477 lmd->lmd_magic = LMD_MAGIC;
2482 CERROR("Bad mount options %s\n", options);
/* Argument bundle handed to lustre_fill_super() as its data pointer: the
 * raw mount data plus the vfsmount, which lustre_mount() leaves NULL and
 * lustre_get_sb() fills in. */
2486 struct lustre_mount_data2 {
2488 struct vfsmount *lmd2_mnt;
/*
 * lustre_fill_super(): fill_super callback given to mount_nodev() and
 * get_sb_nodev().
 *
 * \a data is a struct lustre_mount_data2. Its lmd2_data string is parsed
 * by lmd_parse(), and a parse failure ends the mount with -EINVAL. A
 * client mount requires a registered client_fill_super callback and starts
 * the MGC before calling it with lmd2_mnt. Any other mount is flagged
 * LSI_SERVER and handled by server_fill_super().
 */
2491 /** This is the entry point for the mount call into Lustre.
2492 * This is called when a server or client is mounted,
2493 * and this is where we start setting things up.
2494 * @param data Mount options (e.g. -o flock,abort_recov)
2496 int lustre_fill_super(struct super_block *sb, void *data, int silent)
2498 struct lustre_mount_data *lmd;
2499 struct lustre_mount_data2 *lmd2 = data;
2500 struct lustre_sb_info *lsi;
2504 CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb);
2506 lsi = lustre_init_lsi(sb);
2512 * Disable lockdep during mount, because mount locking patterns are
2518 * LU-639: the obd cleanup of last mount may not finish yet, wait here.
2520 obd_zombie_barrier();
2522 /* Figure out the lmd from the mount options */
2523 if (lmd_parse((char *)(lmd2->lmd2_data), lmd)) {
2525 GOTO(out, rc = -EINVAL);
2528 if (lmd_is_client(lmd)) {
2529 CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile);
2530 if (!client_fill_super) {
2531 LCONSOLE_ERROR_MSG(0x165, "Nothing registered for "
2532 "client mount! Is the 'lustre' "
2533 "module loaded?\n");
2537 rc = lustre_start_mgc(sb);
2542 /* Connect and start */
2543 /* (should always be ll_fill_super) */
2544 rc = (*client_fill_super)(sb, lmd2->lmd2_mnt);
2545 /* c_f_s will call lustre_common_put_super on failure */
2548 CDEBUG(D_MOUNT, "Mounting server from %s\n", lmd->lmd_dev);
2549 lsi->lsi_flags |= LSI_SERVER;
2550 rc = server_fill_super(sb);
2551 /* s_f_s calls lustre_start_mgc after the mount because we need
2552 the MGS nids which are stored on disk. Plus, we may
2553 need to start the MGS first. */
2554 /* s_f_s will call server_put_super on failure */
2557 /* If error happens in fill_super() call, @lsi will be killed there.
2558 * This is why we do not put it here. */
/* lmd is only dereferenced for the message while the sb still has an lsi */
2562 CERROR("Unable to mount %s (%d)\n",
2563 s2lsi(sb) ? lmd->lmd_dev : "", rc);
2565 CDEBUG(D_SUPER, "Mount %s complete\n",
2573 /* We can't call ll_fill_super by name because it lives in a module that
2574 must be loaded after this one. */
/* Records the callback that lustre_fill_super() invokes for client mounts. */
2575 void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb,
2576 struct vfsmount *mnt))
2578 client_fill_super = cfs;
2580 EXPORT_SYMBOL(lustre_register_client_fill_super);
/* Records the callback that lustre_kill_super() invokes for superblocks
 * that are not flagged LSI_SERVER. */
2582 void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb))
2584 kill_super_cb = cfs;
2586 EXPORT_SYMBOL(lustre_register_kill_super_cb);
2588 /***************** FS registration ******************/
2589 #ifdef HAVE_FSTYPE_MOUNT
/* .mount entry point: wraps the raw mount data in a lustre_mount_data2
 * with no vfsmount and lets mount_nodev() call lustre_fill_super(). */
2590 struct dentry *lustre_mount(struct file_system_type *fs_type, int flags,
2591 const char *devname, void *data)
2593 struct lustre_mount_data2 lmd2 = { data, NULL };
2595 return mount_nodev(fs_type, flags, &lmd2, lustre_fill_super);
/* .get_sb entry point used when HAVE_FSTYPE_MOUNT is not defined: same
 * wrapping, but the caller's vfsmount is passed along as well. */
2598 int lustre_get_sb(struct file_system_type *fs_type, int flags,
2599 const char *devname, void * data, struct vfsmount *mnt)
2601 struct lustre_mount_data2 lmd2 = { data, mnt };
2603 return get_sb_nodev(fs_type, flags, &lmd2, lustre_fill_super, mnt);
/**
 * .kill_sb handler of the "lustre" filesystem type.
 *
 * Runs the registered kill_super callback when the superblock still has an
 * lsi that is not flagged LSI_SERVER, then hands the superblock to
 * kill_anon_super().
 */
2607 void lustre_kill_super(struct super_block *sb)
2609 struct lustre_sb_info *lsi = s2lsi(sb);
2611 if (kill_super_cb && lsi && !(lsi->lsi_flags & LSI_SERVER))
2612 (*kill_super_cb)(sb);
2614 kill_anon_super(sb);
/* Descriptor passed to register_filesystem() and unregister_filesystem().
 * The mount entry point is lustre_mount or lustre_get_sb depending on
 * HAVE_FSTYPE_MOUNT; teardown always goes through lustre_kill_super. */
2617 /** Register the "lustre" fs type
2619 struct file_system_type lustre_fs_type = {
2620 .owner = THIS_MODULE,
2622 #ifdef HAVE_FSTYPE_MOUNT
2623 .mount = lustre_mount,
2625 .get_sb = lustre_get_sb,
2627 .kill_sb = lustre_kill_super,
2628 .fs_flags = FS_BINARY_MOUNTDATA | FS_REQUIRES_DEV |
2629 #ifdef FS_HAS_FIEMAP
2632 LL_RENAME_DOES_D_MOVE,
/* Register lustre_fs_type with the VFS; returns register_filesystem()'s result. */
2635 int lustre_register_fs(void)
2637 return register_filesystem(&lustre_fs_type);
/* Remove lustre_fs_type from the VFS; returns unregister_filesystem()'s result. */
2640 int lustre_unregister_fs(void)
2642 return unregister_filesystem(&lustre_fs_type);