4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Whamcloud, Inc.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lustre/obdclass/obd_mount.c
38 * Client/server mount routines
40 * Author: Nathan Rutman <nathan@clusterfs.com>
44 #define DEBUG_SUBSYSTEM S_CLASS
45 #define D_MOUNT D_SUPER|D_CONFIG /*|D_WARNING */
46 #define PRINT_CMD CDEBUG
47 #define PRINT_MASK D_SUPER|D_CONFIG
51 #include <lustre_fsfilt.h>
52 #include <obd_class.h>
53 #include <lustre/lustre_user.h>
54 #include <linux/version.h>
55 #include <lustre_log.h>
56 #include <lustre_disk.h>
57 #include <lustre_param.h>
58 #ifdef HAVE_KERNEL_LOCKED
59 #include <linux/smp_lock.h>
/* Hook taking a superblock and a vfsmount, initially NULL.  Nothing in the
 * visible part of this file assigns or calls it; presumably the client
 * module registers it elsewhere -- TODO confirm. */
62 static int (*client_fill_super)(struct super_block *sb,
63 struct vfsmount *mnt) = NULL;
/* Hook taking a superblock, initially NULL.  By its name it is run when a
 * superblock is killed -- confirm against the code that sets it. */
64 static void (*kill_super_cb)(struct super_block *sb) = NULL;
66 /*********** mount lookup *********/
/* Mutex taken around lookups and updates of server_mount_info_list. */
68 CFS_DEFINE_MUTEX(lustre_mount_info_lock);
/* Registered lustre_mount_info entries, linked through lmi_list_chain. */
69 static CFS_LIST_HEAD(server_mount_info_list);
/* Look up a registered server mount by name.
 *
 * Walks server_mount_info_list and compares @name against each entry's
 * lmi_name with strcmp().  The return statements are not visible in this
 * listing; server_register_mount() tests the result for non-NULL, so
 * presumably the matching entry is returned, or NULL when nothing matches
 * -- confirm.
 *
 * The list is not locked here.  Most callers in this file take
 * lustre_mount_info_lock around the call. */
71 static struct lustre_mount_info *server_find_mount(const char *name)
74 struct lustre_mount_info *lmi;
77 cfs_list_for_each(tmp, &server_mount_info_list) {
78 lmi = cfs_list_entry(tmp, struct lustre_mount_info,
80 if (strcmp(name, lmi->lmi_name) == 0)
86 /* we must register an obd for a mount before we call the setup routine.
87 *_setup will call lustre_get_mount to get the mnt struct
88 by obd_name, since we can't pass the pointer to setup. */
/* Register a mount under @name in server_mount_info_list.
 *
 * Allocates a lustre_mount_info and a private copy of @name, then, under
 * lustre_mount_info_lock, refuses a duplicate name (both allocations are
 * freed and an error is logged) or links the new entry into the list.
 * The assignments of the sb/mnt fields and the return values are on lines
 * missing from this listing. */
89 static int server_register_mount(const char *name, struct super_block *sb,
92 struct lustre_mount_info *lmi;
98 OBD_ALLOC(lmi, sizeof(*lmi));
101 OBD_ALLOC(name_cp, strlen(name) + 1);
/* presumably the name_cp allocation-failure path: undo the lmi allocation */
103 OBD_FREE(lmi, sizeof(*lmi));
/* name_cp was sized strlen(name) + 1 above, so this copy fits */
106 strcpy(name_cp, name);
108 cfs_mutex_lock(&lustre_mount_info_lock);
/* duplicate check and list insertion happen under the same lock hold */
110 if (server_find_mount(name)) {
111 cfs_mutex_unlock(&lustre_mount_info_lock);
112 OBD_FREE(lmi, sizeof(*lmi));
113 OBD_FREE(name_cp, strlen(name) + 1);
114 CERROR("Already registered %s\n", name);
/* the entry takes ownership of the name copy; it is freed in
 * server_deregister_mount() */
117 lmi->lmi_name = name_cp;
120 cfs_list_add(&lmi->lmi_list_chain, &server_mount_info_list);
122 cfs_mutex_unlock(&lustre_mount_info_lock);
/* lmi_mnt may be NULL here, hence the -1 fallback for the count */
124 CDEBUG(D_MOUNT, "reg_mnt %p from %s, vfscount=%d\n",
126 lmi->lmi_mnt ? mnt_get_count(lmi->lmi_mnt) : -1);
131 /* when an obd no longer needs a mount */
/* Remove the mount registered under @name and free it.
 *
 * Under lustre_mount_info_lock: looks the entry up, logs an error and
 * unlocks when the name is not registered (the test and the return value
 * are on lines missing from this listing), otherwise frees the name copy,
 * unlinks the entry from the list and frees the entry. */
132 static int server_deregister_mount(const char *name)
134 struct lustre_mount_info *lmi;
137 cfs_mutex_lock(&lustre_mount_info_lock);
138 lmi = server_find_mount(name);
140 cfs_mutex_unlock(&lustre_mount_info_lock);
141 CERROR("%s not registered\n", name);
145 CDEBUG(D_MOUNT, "dereg_mnt %p from %s, vfscount=%d\n",
147 lmi->lmi_mnt ? mnt_get_count(lmi->lmi_mnt) : -1);
/* the free size matches the strlen(name) + 1 used when the copy was made */
149 OBD_FREE(lmi->lmi_name, strlen(lmi->lmi_name) + 1);
150 cfs_list_del(&lmi->lmi_list_chain);
151 OBD_FREE(lmi, sizeof(*lmi));
152 cfs_mutex_unlock(&lustre_mount_info_lock);
157 /* obd's look up a registered mount using their obdname. This is just
158 for initial obd setup to find the mount struct. It should not be
159 called every time you want to mntget. */
/* Look up the mount registered under @name and take references on it.
 *
 * On a successful lookup this calls mntget() on the entry's vfsmount and
 * increments the lsi_mounts counter of the owning lustre_sb_info.  The
 * not-found test and the return statements are on lines missing from this
 * listing; presumably the entry (or NULL) is returned -- confirm.
 *
 * NOTE(review): the entry is dereferenced after lustre_mount_info_lock has
 * been dropped, so nothing visible here prevents a concurrent
 * server_deregister_mount() from freeing it -- confirm callers serialize. */
160 struct lustre_mount_info *server_get_mount(const char *name)
162 struct lustre_mount_info *lmi;
163 struct lustre_sb_info *lsi;
166 cfs_mutex_lock(&lustre_mount_info_lock);
167 lmi = server_find_mount(name);
168 cfs_mutex_unlock(&lustre_mount_info_lock);
170 CERROR("Can't find mount for %s\n", name);
173 lsi = s2lsi(lmi->lmi_sb);
176 mntget(lmi->lmi_mnt);
177 cfs_atomic_inc(&lsi->lsi_mounts);
/* the logged vfscount subtracts one from the current count */
179 CDEBUG(D_MOUNT, "get_mnt %p from %s, refs=%d, vfscount=%d\n",
180 lmi->lmi_mnt, name, cfs_atomic_read(&lsi->lsi_mounts),
181 lmi->lmi_mnt ? mnt_get_count(lmi->lmi_mnt) - 1 : -1);
185 EXPORT_SYMBOL(server_get_mount);
188 * Used by mdt to get mount_info from obdname.
189 * There is no blocking when using the mount_info.
190 * Do not use server_get_mount for this purpose.
/* Look up the mount registered under @name without taking any reference.
 *
 * Only the locked lookup and a not-found error message are visible; unlike
 * server_get_mount() no mntget() or lsi_mounts increment appears here.
 * The return statement is on a line missing from this listing. */
192 struct lustre_mount_info *server_get_mount_2(const char *name)
194 struct lustre_mount_info *lmi;
197 cfs_mutex_lock(&lustre_mount_info_lock);
198 lmi = server_find_mount(name);
199 cfs_mutex_unlock(&lustre_mount_info_lock);
201 CERROR("Can't find mount for %s\n", name);
205 EXPORT_SYMBOL(server_get_mount_2);
/* Helper taking a vfsmount.  Only the HAVE_KERNEL_LOCKED branch header is
 * visible in this listing: when that macro is defined the code tests
 * kernel_locked().  By its name the function releases the kernel lock
 * around an mntput() -- the body is missing here, confirm against the
 * full source. */
207 static void unlock_mntput(struct vfsmount *mnt)
209 #ifdef HAVE_KERNEL_LOCKED
210 /* for kernel < 2.6.37 */
211 if (kernel_locked()) {
/* Forward declaration; its result is tested below to detect the last put. */
223 static int lustre_put_lsi(struct super_block *sb);
225 /* to be called from obd_cleanup methods */
/* Drop a reference on the mount registered under @name.
 *
 * Samples the vfsmount count first, looks the entry up under
 * lustre_mount_info_lock, asserts that the registered vfsmount is @mnt,
 * calls lustre_put_lsi() on the entry's superblock and finally
 * deregisters @name.  The code between the visible lines (including the
 * actual mntput and the return values) is missing from this listing. */
226 int server_put_mount(const char *name, struct vfsmount *mnt)
228 struct lustre_mount_info *lmi;
229 struct lustre_sb_info *lsi;
233 /* This might be the last one, can't deref after this */
/* count is captured up front and only used for the log messages below */
235 count = mnt_get_count(mnt) - 1;
239 cfs_mutex_lock(&lustre_mount_info_lock);
240 lmi = server_find_mount(name);
241 cfs_mutex_unlock(&lustre_mount_info_lock);
243 CERROR("Can't find mount for %s\n", name);
246 lsi = s2lsi(lmi->lmi_sb);
247 LASSERT(lmi->lmi_mnt == mnt);
249 CDEBUG(D_MOUNT, "put_mnt %p from %s, refs=%d, vfscount=%d\n",
250 lmi->lmi_mnt, name, cfs_atomic_read(&lsi->lsi_mounts), count);
/* a non-zero result from lustre_put_lsi() is treated as the last put */
252 if (lustre_put_lsi(lmi->lmi_sb)) {
253 CDEBUG(D_MOUNT, "Last put of mnt %p from %s, vfscount=%d\n",
254 lmi->lmi_mnt, name, count);
255 /* last mount is the One True Mount */
257 CERROR("%s: mount busy, vfscount=%d!\n", name, count);
260 /* this obd should never need the mount again */
261 server_deregister_mount(name);
265 EXPORT_SYMBOL(server_put_mount);
267 /* Corresponding to server_get_mount_2 */
/* Counterpart of server_get_mount_2().  The body is not visible in this
 * listing, so its effect on @name and @mnt cannot be stated from here. */
268 int server_put_mount_2(const char *name, struct vfsmount *mnt)
273 EXPORT_SYMBOL(server_put_mount_2);
275 /******* mount helper utilities *********/
/* Dump the fields of @ldd to the debug log, one per line.
 * PRINT_CMD is CDEBUG and PRINT_MASK is D_SUPER|D_CONFIG (see the defines
 * at the top of the file), so the output only appears when those debug
 * masks are enabled. */
278 static void ldd_print(struct lustre_disk_data *ldd)
280 PRINT_CMD(PRINT_MASK, " disk data:\n");
281 PRINT_CMD(PRINT_MASK, "server: %s\n", ldd->ldd_svname);
282 PRINT_CMD(PRINT_MASK, "uuid: %s\n", (char *)ldd->ldd_uuid);
283 PRINT_CMD(PRINT_MASK, "fs: %s\n", ldd->ldd_fsname);
284 PRINT_CMD(PRINT_MASK, "index: %04x\n", ldd->ldd_svindex);
285 PRINT_CMD(PRINT_MASK, "config: %d\n", ldd->ldd_config_ver);
286 PRINT_CMD(PRINT_MASK, "flags: %#x\n", ldd->ldd_flags);
287 PRINT_CMD(PRINT_MASK, "diskfs: %s\n", MT_STR(ldd));
288 PRINT_CMD(PRINT_MASK, "options: %s\n", ldd->ldd_mount_opts);
289 PRINT_CMD(PRINT_MASK, "params: %s\n", ldd->ldd_params);
290 PRINT_CMD(PRINT_MASK, "comment: %s\n", ldd->ldd_userdata);
/* Read MOUNT_DATA_FILE into @ldd and validate it.
 *
 * Runs inside @mount_ctxt (push_ctxt/pop_ctxt).  The file must be exactly
 * sizeof(*ldd) bytes, carry LDD_MAGIC, and must not set incompat or
 * read-only-compat feature bits outside LDD_INCOMPAT_SUPP and
 * LDD_ROCOMPAT_SUPP.  Every validation failure visible here jumps to
 * out_close with rc = -EINVAL.  Declarations, the error tests around the
 * open/read calls and the final return are on lines missing from this
 * listing. */
294 static int ldd_parse(struct lvfs_run_ctxt *mount_ctxt,
295 struct lustre_disk_data *ldd)
297 struct lvfs_run_ctxt saved;
304 push_ctxt(&saved, mount_ctxt, NULL);
306 file = filp_open(MOUNT_DATA_FILE, O_RDONLY, 0644);
309 CERROR("cannot open %s: rc = %d\n", MOUNT_DATA_FILE, rc);
313 len = i_size_read(file->f_dentry->d_inode);
314 CDEBUG(D_MOUNT, "Have %s, size %lu\n", MOUNT_DATA_FILE, len);
/* the on-disk record is read as a raw struct, so the size must match */
315 if (len != sizeof(*ldd)) {
316 CERROR("disk data size does not match: see %lu expect %u\n",
317 len, (int)sizeof(*ldd));
318 GOTO(out_close, rc = -EINVAL);
321 rc = lustre_fread(file, ldd, len, &off);
323 CERROR("error reading %s: read %d of %lu\n",
324 MOUNT_DATA_FILE, rc, len);
325 GOTO(out_close, rc = -EINVAL);
329 if (ldd->ldd_magic != LDD_MAGIC) {
330 /* FIXME add swabbing support */
331 CERROR("Bad magic in %s: %x!=%x\n", MOUNT_DATA_FILE,
332 ldd->ldd_magic, LDD_MAGIC);
333 GOTO(out_close, rc = -EINVAL);
/* any incompat bit outside the supported set is rejected */
336 if (ldd->ldd_feature_incompat & ~LDD_INCOMPAT_SUPP) {
337 CERROR("%s: unsupported incompat filesystem feature(s) %x\n",
339 ldd->ldd_feature_incompat & ~LDD_INCOMPAT_SUPP);
340 GOTO(out_close, rc = -EINVAL);
342 if (ldd->ldd_feature_rocompat & ~LDD_ROCOMPAT_SUPP) {
343 CERROR("%s: unsupported read-only filesystem feature(s) %x\n",
345 ldd->ldd_feature_rocompat & ~LDD_ROCOMPAT_SUPP);
346 /* Do something like remount filesystem read-only */
347 GOTO(out_close, rc = -EINVAL);
353 pop_ctxt(&saved, mount_ctxt, NULL);
357 static int ldd_write(struct lvfs_run_ctxt *mount_ctxt,
358 struct lustre_disk_data *ldd)
360 struct lvfs_run_ctxt saved;
363 unsigned long len = sizeof(struct lustre_disk_data);
367 if (ldd->ldd_magic == 0)
370 LASSERT(ldd->ldd_magic == LDD_MAGIC);
372 ldd->ldd_config_ver++;
374 push_ctxt(&saved, mount_ctxt, NULL);
376 file = filp_open(MOUNT_DATA_FILE, O_RDWR|O_SYNC, 0644);
379 CERROR("cannot open %s: rc = %d\n", MOUNT_DATA_FILE, rc);
383 rc = lustre_fwrite(file, ldd, len, &off);
385 CERROR("error writing %s: read %d of %lu\n",
386 MOUNT_DATA_FILE, rc, len);
387 GOTO(out_close, rc = -EINVAL);
395 pop_ctxt(&saved, mount_ctxt, NULL);
400 /**************** config llog ********************/
402 /** Get a config log from the MGS and process it.
403 * This func is called for both clients and servers.
404 * Continue to process new statements appended to the logs
405 * (whenever the config lock is revoked) until lustre_end_log
407 * @param sb The superblock is used by the MGC to write to the local copy of
409 * @param logname The name of the llog to replicate from the MGS
410 * @param cfg Since the same mgc may be used to follow multiple config logs
411 * (e.g. ost1, ost2, client), the config_llog_instance keeps the state for
412 * this log, and is added to the mgc's list of logs to follow.
/* Ask the MGC of @sb to start following the config log @logname.
 *
 * Builds an LCFG_LOG_START lustre_cfg whose buffers carry the MGC obd
 * name, @logname, a copy of *@cfg and the @sb pointer value, passes it to
 * obd_process_config() on the MGC and frees it again.  Two console error
 * messages exist for failures; the conditions selecting them and the
 * return statement are on lines missing from this listing.
 *
 * NOTE(review): the result of lustre_cfg_new() is used without a visible
 * check -- confirm how that helper reports allocation failure. */
414 int lustre_process_log(struct super_block *sb, char *logname,
415 struct config_llog_instance *cfg)
417 struct lustre_cfg *lcfg;
418 struct lustre_cfg_bufs *bufs;
419 struct lustre_sb_info *lsi = s2lsi(sb);
420 struct obd_device *mgc = lsi->lsi_mgc;
431 /* mgc_process_config */
432 lustre_cfg_bufs_reset(bufs, mgc->obd_name);
433 lustre_cfg_bufs_set_string(bufs, 1, logname);
434 lustre_cfg_bufs_set(bufs, 2, cfg, sizeof(*cfg));
/* buffer 3 holds the pointer value itself, not the superblock contents */
435 lustre_cfg_bufs_set(bufs, 3, &sb, sizeof(sb));
436 lcfg = lustre_cfg_new(LCFG_LOG_START, bufs);
437 rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
438 lustre_cfg_free(lcfg);
/* adjacent literals are concatenated, so each piece needs its own
 * trailing space to keep the words apart */
443 LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s' "
444 "failed from the MGS (%d). Make sure this "
445 "client and the MGS are running compatible "
446 "versions of Lustre.\n",
447 mgc->obd_name, logname, rc);
450 LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' "
451 "failed (%d). This may be the result of "
452 "communication errors between this node and "
453 "the MGS, a bad configuration, or other "
454 "errors. See the syslog for more "
455 "information.\n", mgc->obd_name, logname,
458 /* class_obd_list(); */
461 EXPORT_SYMBOL(lustre_process_log);
463 /* Stop watching this config log for updates */
/* Ask the MGC of @sb to stop following the config log @logname.
 *
 * Builds an LCFG_LOG_END lustre_cfg carrying the MGC obd name, @logname
 * and a copy of *@cfg, passes it to obd_process_config() on the MGC and
 * frees it.  Unlike lustre_process_log() the buffer set lives on the
 * stack.  Whether the @cfg buffer is set conditionally, and the return
 * statement, are on lines missing from this listing. */
464 int lustre_end_log(struct super_block *sb, char *logname,
465 struct config_llog_instance *cfg)
467 struct lustre_cfg *lcfg;
468 struct lustre_cfg_bufs bufs;
469 struct lustre_sb_info *lsi = s2lsi(sb);
470 struct obd_device *mgc = lsi->lsi_mgc;
477 /* mgc_process_config */
478 lustre_cfg_bufs_reset(&bufs, mgc->obd_name);
479 lustre_cfg_bufs_set_string(&bufs, 1, logname);
481 lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg));
482 lcfg = lustre_cfg_new(LCFG_LOG_END, &bufs);
483 rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
484 lustre_cfg_free(lcfg);
487 EXPORT_SYMBOL(lustre_end_log);
489 /**************** obd start *******************/
491 /** lustre_cfg_bufs are a holdover from 1.4; we can still set these up from
492 * lctl (and do for echo cli/srv).
/* Build a lustre_cfg for command @cmd and run it through
 * class_process_config().
 *
 * @cfgname goes into the bufs via lustre_cfg_bufs_reset(); @s1..@s4 are
 * stored as string buffers 1..4 (any guards around those calls are on
 * lines missing from this listing).  @nid is stored in lcfg_nid before
 * processing.  The lustre_cfg is freed before returning; the return
 * statement itself is not visible.
 *
 * NOTE(review): lcfg is dereferenced immediately after lustre_cfg_new()
 * with no check in between -- confirm how that helper reports allocation
 * failure. */
494 int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
495 char *s1, char *s2, char *s3, char *s4)
497 struct lustre_cfg_bufs bufs;
498 struct lustre_cfg * lcfg = NULL;
501 CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname,
502 cmd, s1, s2, s3, s4);
504 lustre_cfg_bufs_reset(&bufs, cfgname);
506 lustre_cfg_bufs_set_string(&bufs, 1, s1);
508 lustre_cfg_bufs_set_string(&bufs, 2, s2);
510 lustre_cfg_bufs_set_string(&bufs, 3, s3);
512 lustre_cfg_bufs_set_string(&bufs, 4, s4);
514 lcfg = lustre_cfg_new(cmd, &bufs);
515 lcfg->lcfg_nid = nid;
516 rc = class_process_config(lcfg);
517 lustre_cfg_free(lcfg);
520 EXPORT_SYMBOL(do_lcfg);
522 /** Call class_attach and class_setup. These methods in turn call
523 * obd type-specific methods.
/* Attach and set up the obd @obdname of type @type.
 *
 * Issues LCFG_ATTACH with (@type, @uuid) and then LCFG_SETUP with
 * (s1, s2) through do_lcfg().  The LCFG_DETACH call follows the setup
 * error message, so it presumably undoes the attach when setup fails; the
 * tests on rc are on lines missing from this listing.  The remaining
 * parameters of the signature are also on a missing line. */
525 static int lustre_start_simple(char *obdname, char *type, char *uuid,
529 CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type);
531 rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, 0, 0);
533 CERROR("%s attach error %d\n", obdname, rc);
536 rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, 0, 0);
538 CERROR("%s setup error %d\n", obdname, rc);
539 do_lcfg(obdname, 0, LCFG_DETACH, 0, 0, 0, 0);
544 /* Set up a MGS to serve startup logs */
/* Start the MGS obd on the mount behind @sb.
 *
 * Looks for an existing mount registered as LUSTRE_MGS_OBDNAME and
 * reports an already-started MGS on the console, then registers this
 * mount under that name and starts the obd with lustre_start_simple().
 * The tests between those steps and the return statements are on lines
 * missing from this listing.
 *
 * NOTE(review): the server_find_mount() call below has no visible
 * lustre_mount_info_lock around it, unlike the other callers. */
545 static int server_start_mgs(struct super_block *sb)
547 struct lustre_sb_info *lsi = s2lsi(sb);
548 struct vfsmount *mnt = lsi->lsi_srv_mnt;
549 struct lustre_mount_info *lmi;
554 /* It is impossible to have more than 1 MGS per node, since
555 MGC wouldn't know which to connect to */
556 lmi = server_find_mount(LUSTRE_MGS_OBDNAME);
/* lsi is re-pointed at the existing MGS mount, only for the message */
558 lsi = s2lsi(lmi->lmi_sb);
559 LCONSOLE_ERROR_MSG(0x15d, "The MGS service was already started"
561 lsi->lsi_ldd->ldd_svname);
565 CDEBUG(D_CONFIG, "Start MGS service %s\n", LUSTRE_MGS_OBDNAME);
567 rc = server_register_mount(LUSTRE_MGS_OBDNAME, sb, mnt);
570 rc = lustre_start_simple(LUSTRE_MGS_OBDNAME, LUSTRE_MGS_NAME,
571 LUSTRE_MGS_OBDNAME, 0, 0);
572 /* Do NOT call server_deregister_mount() here. Doing so makes it
573 * impossible to clean up and free lsi and other state when the
574 * mgs calls server_put_mount() in its error handling path. -umka */
578 LCONSOLE_ERROR_MSG(0x15e, "Failed to start MGS '%s' (%d). "
579 "Is the 'mgs' module loaded?\n",
580 LUSTRE_MGS_OBDNAME, rc);
/* Stop the MGS obd.
 *
 * Finds the obd named LUSTRE_MGS_OBDNAME, logs that the MGS is not
 * running when the lookup fails (the test is on a line missing from this
 * listing), and otherwise cleans it up with class_manual_cleanup().
 * @sb is not used on any visible line. */
584 static int server_stop_mgs(struct super_block *sb)
586 struct obd_device *obd;
590 CDEBUG(D_MOUNT, "Stop MGS service %s\n", LUSTRE_MGS_OBDNAME);
592 /* There better be only one MGS */
593 obd = class_name2obd(LUSTRE_MGS_OBDNAME);
595 CDEBUG(D_CONFIG, "mgs %s not running\n", LUSTRE_MGS_OBDNAME);
599 /* The MGS should always stop when we say so */
601 rc = class_manual_cleanup(obd);
/* Taken by lustre_start_mgc() and lustre_stop_mgc() around MGC setup,
 * reuse and teardown, including the cl_mgc_refcount updates. */
605 CFS_DEFINE_MUTEX(mgc_start_lock);
607 /** Set up a mgc obd to process startup logs
609 * \param sb [in] super block of the mgc obd
611 * \retval 0 success, otherwise error code
/* Start, or re-use, the MGC obd for the MGS that @sb should talk to.
 *
 * Visible phases:
 *  1. Pick the first usable MGS nid: for servers from the mount option
 *     lmd_mgs, else from the mgsnode= params, else (on an MGS) from the
 *     local non-loopback nids; for clients from the device string.
 *  2. Under mgc_start_lock, derive the obd name
 *     LUSTRE_MGC_OBDNAME + nid string and look it up.
 *  3. If a live MGC of that name exists: push the mgssec setting, bump
 *     cl_mgc_refcount, reconcile the client's IR flag with the MGC's
 *     connect data and set the reconnect policy.
 *  4. Otherwise: add the primary and failover nid uuids, start the obd,
 *     set cl_mgc_refcount to 1 and connect to the MGS.
 * Many statements (tests, labels, assignments) are on lines missing from
 * this listing, so the exact control flow between these phases cannot be
 * confirmed from here. */
613 static int lustre_start_mgc(struct super_block *sb)
615 struct obd_connect_data *data = NULL;
616 struct lustre_sb_info *lsi = s2lsi(sb);
617 struct obd_device *obd;
618 struct obd_export *exp;
619 struct obd_uuid *uuid;
622 char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL;
625 int rc = 0, i = 0, j, len;
628 LASSERT(lsi->lsi_lmd);
630 /* Find the first non-lo MGS nid for our MGC name */
631 if (lsi->lsi_flags & LSI_SERVER) {
632 ptr = lsi->lsi_ldd->ldd_params;
633 /* mount -o mgsnode=nid */
634 if (lsi->lsi_lmd->lmd_mgs &&
635 (class_parse_nid(lsi->lsi_lmd->lmd_mgs, &nid, &ptr) == 0)) {
637 /* Use mgsnode= nids */
638 } else if ((class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0) &&
639 (class_parse_nid(ptr, &nid, &ptr) == 0)) {
641 } else if (IS_MGS(lsi->lsi_ldd)) {
642 lnet_process_id_t id;
643 while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
644 if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
651 } else { /* client */
652 /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
653 ptr = lsi->lsi_lmd->lmd_dev;
654 if (class_parse_nid(ptr, &nid, &ptr) == 0)
658 CERROR("No valid MGS nids found.\n");
662 cfs_mutex_lock(&mgc_start_lock);
/* len covers the MGC obd name plus its NUL.
 * NOTE(review): niduuid gets len + 2 bytes, which is exactly enough for
 * the name, '_' and ONE hex digit; the "%s_%x" sprintf calls below would
 * overrun it once i needs two hex digits -- confirm i stays below 16. */
664 len = strlen(LUSTRE_MGC_OBDNAME) + strlen(libcfs_nid2str(nid)) + 1;
665 OBD_ALLOC(mgcname, len);
666 OBD_ALLOC(niduuid, len + 2);
667 if (!mgcname || !niduuid)
668 GOTO(out_free, rc = -ENOMEM);
669 sprintf(mgcname, "%s%s", LUSTRE_MGC_OBDNAME, libcfs_nid2str(nid));
/* an absent mgssec option is passed on as the empty string */
671 mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : "";
675 GOTO(out_free, rc = -ENOMEM);
677 obd = class_name2obd(mgcname);
678 if (obd && !obd->obd_stopping) {
679 rc = obd_set_info_async(NULL, obd->obd_self_export,
680 strlen(KEY_MGSSEC), KEY_MGSSEC,
681 strlen(mgssec), mgssec, NULL);
685 /* Re-using an existing MGC */
686 cfs_atomic_inc(&obd->u.cli.cl_mgc_refcount);
688 /* IR compatibility check, only for clients */
689 if (lmd_is_client(lsi->lsi_lmd)) {
691 int vallen = sizeof(*data);
692 __u32 *flags = &lsi->lsi_lmd->lmd_flags;
694 rc = obd_get_info(NULL, obd->obd_self_export,
695 strlen(KEY_CONN_DATA), KEY_CONN_DATA,
696 &vallen, data, NULL);
698 has_ir = OCD_HAS_FLAG(data, IMP_RECOV);
/* true when the mount's NOIR flag disagrees with the existing MGC */
699 if (has_ir ^ !(*flags & LMD_FLG_NOIR)) {
700 /* LMD_FLG_NOIR is for test purpose only */
702 "Trying to mount a client with IR setting "
703 "not compatible with current mgc. "
704 "Force to use current mgc setting that is "
706 has_ir ? "enabled" : "disabled");
708 *flags &= ~LMD_FLG_NOIR;
710 *flags |= LMD_FLG_NOIR;
715 /* If we are restarting the MGS, don't try to keep the MGC's
716 old connection, or registration will fail. */
717 if ((lsi->lsi_flags & LSI_SERVER) && IS_MGS(lsi->lsi_ldd)) {
718 CDEBUG(D_MOUNT, "New MGS with live MGC\n");
722 /* Try all connections, but only once (again).
723 We don't want to block another target from starting
724 (using its local copy of the log), but we do want to connect
725 if at all possible. */
727 CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname,recov_bk);
728 rc = obd_set_info_async(NULL, obd->obd_self_export,
729 sizeof(KEY_INIT_RECOV_BACKUP),
730 KEY_INIT_RECOV_BACKUP,
731 sizeof(recov_bk), &recov_bk, NULL);
735 CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname);
737 /* Add the primary nids for the MGS */
739 sprintf(niduuid, "%s_%x", mgcname, i);
740 if (lsi->lsi_flags & LSI_SERVER) {
741 ptr = lsi->lsi_ldd->ldd_params;
742 if (IS_MGS(lsi->lsi_ldd)) {
743 /* Use local nids (including LO) */
744 lnet_process_id_t id;
745 while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
746 rc = do_lcfg(mgcname, id.nid,
747 LCFG_ADD_UUID, niduuid, 0,0,0);
750 /* Use mgsnode= nids */
751 /* mount -o mgsnode=nid */
752 if (lsi->lsi_lmd->lmd_mgs) {
753 ptr = lsi->lsi_lmd->lmd_mgs;
754 } else if (class_find_param(ptr, PARAM_MGSNODE,
756 CERROR("No MGS nids given.\n");
757 GOTO(out_free, rc = -EINVAL);
759 while (class_parse_nid(ptr, &nid, &ptr) == 0) {
760 rc = do_lcfg(mgcname, nid,
761 LCFG_ADD_UUID, niduuid, 0,0,0);
765 } else { /* client */
766 /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
767 ptr = lsi->lsi_lmd->lmd_dev;
768 while (class_parse_nid(ptr, &nid, &ptr) == 0) {
769 rc = do_lcfg(mgcname, nid,
770 LCFG_ADD_UUID, niduuid, 0,0,0);
772 /* Stop at the first failover nid */
778 CERROR("No valid MGS nids found.\n");
779 GOTO(out_free, rc = -EINVAL);
/* one nid-uuid group so far; updated after the failover nids below.
 * lustre_stop_mgc() uses this count to delete the uuids again. */
781 lsi->lsi_lmd->lmd_mgs_failnodes = 1;
783 /* Random uuid for MGC allows easier reconnects */
785 ll_generate_random_uuid(uuidc);
786 class_uuid_unparse(uuidc, uuid);
789 rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME,
790 (char *)uuid->uuid, LUSTRE_MGS_OBDNAME,
796 /* Add any failover MGS nids */
798 while ((*ptr == ':' ||
799 class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0)) {
800 /* New failover node */
801 sprintf(niduuid, "%s_%x", mgcname, i);
803 while (class_parse_nid(ptr, &nid, &ptr) == 0) {
805 rc = do_lcfg(mgcname, nid,
806 LCFG_ADD_UUID, niduuid, 0,0,0);
811 rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN,
819 lsi->lsi_lmd->lmd_mgs_failnodes = i;
821 obd = class_name2obd(mgcname);
823 CERROR("Can't find mgcobd %s\n", mgcname);
824 GOTO(out_free, rc = -ENOTCONN);
827 rc = obd_set_info_async(NULL, obd->obd_self_export,
828 strlen(KEY_MGSSEC), KEY_MGSSEC,
829 strlen(mgssec), mgssec, NULL);
833 /* Keep a refcount of servers/clients who started with "mount",
834 so we know when we can get rid of the mgc. */
835 cfs_atomic_set(&obd->u.cli.cl_mgc_refcount, 1);
837 /* Try all connections, but only once. */
839 rc = obd_set_info_async(NULL, obd->obd_self_export,
840 sizeof(KEY_INIT_RECOV_BACKUP),
841 KEY_INIT_RECOV_BACKUP,
842 sizeof(recov_bk), &recov_bk, NULL);
845 CWARN("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc);
846 /* We connect to the MGS at setup, and don't disconnect until cleanup */
847 data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_FID |
848 OBD_CONNECT_AT | OBD_CONNECT_FULL20 |
849 OBD_CONNECT_IMP_RECOV;
850 if (lmd_is_client(lsi->lsi_lmd) &&
851 lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR)
852 data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV;
853 data->ocd_version = LUSTRE_VERSION_CODE;
854 rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL);
856 CERROR("connect failed %d\n", rc);
860 obd->u.cli.cl_mgc_mgsexp = exp;
863 /* Keep the mgc info in the sb. Note that many lsi's can point
867 cfs_mutex_unlock(&mgc_start_lock);
872 OBD_FREE(mgcname, len);
874 OBD_FREE(niduuid, len + 2);
/* Drop this superblock's use of its MGC and stop the MGC on the last use.
 *
 * Under mgc_start_lock the cl_mgc_refcount is decremented; while other
 * users remain the function jumps to out with rc = -EBUSY.  On the last
 * reference it marks the obd obd_no_recov, disconnects the MGS export if
 * there is one, cleans the obd up with class_manual_cleanup() and then
 * deletes the nid uuids named <obdname>_<hex index>, one per
 * lmd_mgs_failnodes.  How obd is obtained, and the return statement, are
 * on lines missing from this listing. */
878 static int lustre_stop_mgc(struct super_block *sb)
880 struct lustre_sb_info *lsi = s2lsi(sb);
881 struct obd_device *obd;
882 char *niduuid = 0, *ptr = 0;
883 int i, rc = 0, len = 0;
893 cfs_mutex_lock(&mgc_start_lock);
894 LASSERT(cfs_atomic_read(&obd->u.cli.cl_mgc_refcount) > 0);
895 if (!cfs_atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) {
896 /* This is not fatal, every client that stops
897 will call in here. */
898 CDEBUG(D_MOUNT, "mgc still has %d references.\n",
899 cfs_atomic_read(&obd->u.cli.cl_mgc_refcount));
900 GOTO(out, rc = -EBUSY);
903 /* The MGC has no recoverable data in any case.
904 * force shutdown set in umount_begin */
905 obd->obd_no_recov = 1;
907 if (obd->u.cli.cl_mgc_mgsexp) {
908 /* An error is not fatal, if we are unable to send the
909 disconnect mgs ping evictor cleans up the export */
910 rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp);
912 CDEBUG(D_MOUNT, "disconnect failed %d\n", rc);
915 /* Save the obdname for cleaning the nid uuids, which are
917 len = strlen(obd->obd_name) + 6;
918 OBD_ALLOC(niduuid, len);
920 strcpy(niduuid, obd->obd_name);
921 ptr = niduuid + strlen(niduuid);
924 rc = class_manual_cleanup(obd);
928 /* Clean the nid uuids */
930 GOTO(out, rc = -ENOMEM);
/* ptr addresses the suffix slot behind the copied obd name; the + 6 in
 * len leaves room for '_', up to four hex digits and the NUL */
932 for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) {
933 sprintf(ptr, "_%x", i);
934 rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID,
937 CERROR("del MGC UUID %s failed: rc = %d\n",
942 OBD_FREE(niduuid, len);
944 /* class_import_put will get rid of the additional connections */
945 cfs_mutex_unlock(&mgc_start_lock);
949 /* Since there's only one mgc per node, we have to change its fs to get
950 access to the right disk. */
/* Point @mgc at the disk behind @sb.
 *
 * Sends KEY_SET_FS with the superblock as the value through
 * obd_set_info_async() on the MGC's self export and logs an error when it
 * fails (the rc test and the return are on lines missing from this
 * listing).  Note the key length passed is sizeof(KEY_SET_FS), i.e. it
 * includes the terminating NUL. */
951 static int server_mgc_set_fs(struct obd_device *mgc, struct super_block *sb)
953 struct lustre_sb_info *lsi = s2lsi(sb);
957 CDEBUG(D_MOUNT, "Set mgc disk for %s\n", lsi->lsi_lmd->lmd_dev);
959 /* cl_mgc_sem in mgc ensures we sleep if the mgc_fs is busy */
960 rc = obd_set_info_async(NULL, mgc->obd_self_export,
961 sizeof(KEY_SET_FS), KEY_SET_FS,
962 sizeof(*sb), sb, NULL);
964 CERROR("can't set_fs %d\n", rc);
/* Release the disk previously assigned to @mgc with server_mgc_set_fs().
 *
 * Sends KEY_CLEAR_FS through obd_set_info_async() on the MGC's self
 * export; the value arguments and the return are on lines missing from
 * this listing. */
970 static int server_mgc_clear_fs(struct obd_device *mgc)
975 CDEBUG(D_MOUNT, "Unassign mgc disk\n");
977 rc = obd_set_info_async(NULL, mgc->obd_self_export,
978 sizeof(KEY_CLEAR_FS), KEY_CLEAR_FS,
/* Serializes starting (server_start_targets) and stopping
 * (server_stop_servers) of the node-wide MDS and OSS obds. */
983 CFS_DEFINE_MUTEX(server_start_lock);
985 /* Stop MDS/OSS if nobody is using them */
/* @lddflags selects which service to consider: LDD_F_SV_TYPE_MDT looks up
 * the MDS obd, LDD_F_SV_TYPE_OST the OSS obd.  The obd is cleaned up only
 * when its obd type cannot be found or has a zero typ_refcnt.  The use of
 * @lsiflags and the return statement are on lines missing from this
 * listing. */
986 static int server_stop_servers(int lddflags, int lsiflags)
988 struct obd_device *obd = NULL;
989 struct obd_type *type = NULL;
993 cfs_mutex_lock(&server_start_lock);
995 /* Either an MDT or an OST or neither */
996 /* if this was an MDT, and there are no more MDT's, clean up the MDS */
997 if ((lddflags & LDD_F_SV_TYPE_MDT) &&
998 (obd = class_name2obd(LUSTRE_MDS_OBDNAME))) {
999 /*FIXME pre-rename, should eventually be LUSTRE_MDT_NAME*/
1000 type = class_search_type(LUSTRE_MDS_NAME);
1002 /* if this was an OST, and there are no more OST's, clean up the OSS */
1003 if ((lddflags & LDD_F_SV_TYPE_OST) &&
1004 (obd = class_name2obd(LUSTRE_OSS_OBDNAME))) {
1005 type = class_search_type(LUSTRE_OST_NAME);
/* typ_refcnt of zero means no obd of that type is left */
1008 if (obd && (!type || !type->typ_refcnt)) {
1011 /* obd_fail doesn't mean much on a server obd */
1012 err = class_manual_cleanup(obd);
1017 cfs_mutex_unlock(&server_start_lock);
/* Dump the identifying fields of @mti to the debug log, prefixed by
 * @title.  PRINT_CMD is CDEBUG with mask D_SUPER|D_CONFIG.  The return
 * statement is on a line missing from this listing. */
1022 int server_mti_print(char *title, struct mgs_target_info *mti)
1024 PRINT_CMD(PRINT_MASK, "mti %s\n", title);
1025 PRINT_CMD(PRINT_MASK, "server: %s\n", mti->mti_svname);
1026 PRINT_CMD(PRINT_MASK, "fs: %s\n", mti->mti_fsname);
1027 PRINT_CMD(PRINT_MASK, "uuid: %s\n", mti->mti_uuid);
1028 PRINT_CMD(PRINT_MASK, "ver: %d flags: %#x\n",
1029 mti->mti_config_ver, mti->mti_flags);
1032 EXPORT_SYMBOL(server_mti_print);
1034 static int server_sb2mti(struct super_block *sb, struct mgs_target_info *mti)
1036 struct lustre_sb_info *lsi = s2lsi(sb);
1037 struct lustre_disk_data *ldd = lsi->lsi_ldd;
1038 lnet_process_id_t id;
1042 if (!(lsi->lsi_flags & LSI_SERVER))
1045 strncpy(mti->mti_fsname, ldd->ldd_fsname,
1046 sizeof(mti->mti_fsname));
1047 strncpy(mti->mti_svname, ldd->ldd_svname,
1048 sizeof(mti->mti_svname));
1050 mti->mti_nid_count = 0;
1051 while (LNetGetId(i++, &id) != -ENOENT) {
1052 if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
1055 /* server use --servicenode param, only allow specified
1056 * nids be registered */
1057 if ((ldd->ldd_flags & LDD_F_NO_PRIMNODE) != 0 &&
1058 class_match_nid(ldd->ldd_params,
1059 PARAM_FAILNODE, id.nid) < 1)
1062 /* match specified network */
1063 if (!class_match_net(ldd->ldd_params,
1064 PARAM_NETWORK, LNET_NIDNET(id.nid)))
1067 mti->mti_nids[mti->mti_nid_count] = id.nid;
1068 mti->mti_nid_count++;
1069 if (mti->mti_nid_count >= MTI_NIDS_MAX) {
1070 CWARN("Only using first %d nids for %s\n",
1071 mti->mti_nid_count, mti->mti_svname);
1076 mti->mti_lustre_ver = LUSTRE_VERSION_CODE;
1077 mti->mti_config_ver = 0;
1078 if (lsi->lsi_lmd->lmd_flags & LMD_FLG_WRITECONF)
1079 ldd->ldd_flags |= LDD_F_WRITECONF;
1080 mti->mti_flags = ldd->ldd_flags;
1081 mti->mti_stripe_index = ldd->ldd_svindex;
1082 memcpy(mti->mti_uuid, ldd->ldd_uuid, sizeof(mti->mti_uuid));
1083 if (strlen(ldd->ldd_params) > sizeof(mti->mti_params)) {
1084 CERROR("params too big for mti\n");
1087 memcpy(mti->mti_params, ldd->ldd_params, sizeof(mti->mti_params));
1091 /* Register an old or new target with the MGS. If needed MGS will construct
1092 startup logs and assign index */
/* Register the target behind @sb with the MGS and apply what it answers.
 *
 * Builds an mgs_target_info with server_sb2mti(), marks it
 * LDD_F_OPC_REG and sends it with KEY_REGISTER_TARGET over the MGC's MGS
 * export.  Three messages exist for a failed registration: one when the
 * MGS set LDD_F_ERROR, one when writeconf is true, and a "not fatal" one
 * otherwise (the enclosing rc test is on a line missing from this
 * listing).  The MGS's flags are then masked with LDD_F_ONDISK_MASK into
 * ldd_flags, and if LDD_F_REWRITE_LDD is set the new index and server
 * name are written to disk, the label is updated and the filesystem is
 * synced.  Several tests and the return are on missing lines.
 *
 * NOTE(review): the result of ldd_write() is not checked on the visible
 * line. */
1093 int server_register_target(struct super_block *sb)
1095 struct lustre_sb_info *lsi = s2lsi(sb);
1096 struct obd_device *mgc = lsi->lsi_mgc;
1097 struct lustre_disk_data *ldd = lsi->lsi_ldd;
1098 struct mgs_target_info *mti = NULL;
1105 if (!(lsi->lsi_flags & LSI_SERVER))
1111 rc = server_sb2mti(sb, mti);
1115 CDEBUG(D_MOUNT, "Registration %s, fs=%s, %s, index=%04x, flags=%#x\n",
1116 mti->mti_svname, mti->mti_fsname,
1117 libcfs_nid2str(mti->mti_nids[0]), mti->mti_stripe_index,
1120 /* if write_conf is true, the registration must succeed */
1121 writeconf = !!(ldd->ldd_flags & (LDD_F_NEED_INDEX | LDD_F_UPDATE));
1122 mti->mti_flags |= LDD_F_OPC_REG;
1124 /* Register the target */
1125 /* FIXME use mgc_process_config instead */
1126 rc = obd_set_info_async(NULL, mgc->u.cli.cl_mgc_mgsexp,
1127 sizeof(KEY_REGISTER_TARGET), KEY_REGISTER_TARGET,
1128 sizeof(*mti), mti, NULL);
1130 if (mti->mti_flags & LDD_F_ERROR) {
1131 LCONSOLE_ERROR_MSG(0x160,
1132 "The MGS is refusing to allow this "
1133 "server (%s) to start. Please see messages"
1134 " on the MGS node.\n", ldd->ldd_svname);
1135 } else if (writeconf) {
1136 LCONSOLE_ERROR_MSG(0x15f,
1137 "Communication to the MGS return error %d. "
1138 "Is the MGS running?\n", rc);
1140 CERROR("Cannot talk to the MGS: %d, not fatal\n", rc);
1141 /* reset the error code for non-fatal error. */
1147 /* Always update our flags */
1148 ldd->ldd_flags = mti->mti_flags & LDD_F_ONDISK_MASK;
1150 /* If this flag is set, it means the MGS wants us to change our
1151 on-disk data. (So far this means just the index.) */
1152 if (mti->mti_flags & LDD_F_REWRITE_LDD) {
1155 CDEBUG(D_MOUNT, "Changing on-disk index from %#x to %#x "
1156 "for %s\n", ldd->ldd_svindex, mti->mti_stripe_index,
1158 ldd->ldd_svindex = mti->mti_stripe_index;
1159 strncpy(ldd->ldd_svname, mti->mti_svname,
1160 sizeof(ldd->ldd_svname));
/* strncpy() does not terminate when the source fills the buffer; make
 * sure the name written to disk below is a valid string */
ldd->ldd_svname[sizeof(ldd->ldd_svname) - 1] = '\0';
1161 /* or ldd_make_sv_name(ldd); */
1162 ldd_write(&mgc->obd_lvfs_ctxt, ldd);
1163 if (lsi->lsi_lmd->lmd_osd_type)
1165 err = fsfilt_set_label(mgc, lsi->lsi_srv_mnt->mnt_sb,
1168 CERROR("Label set error %d\n", err);
1169 label = fsfilt_get_label(mgc, lsi->lsi_srv_mnt->mnt_sb);
1171 CDEBUG(D_MOUNT, "Disk label changed to %s\n", label);
1173 /* Flush the new ldd to disk */
1174 fsfilt_sync(mgc, lsi->lsi_srv_mnt->mnt_sb);
1182 EXPORT_SYMBOL(server_register_target);
1185 * Notify the MGS that this target is ready.
1186 * Used by IR - if the MGS receives this message, it will notify clients.
/* Tell the MGS that the target @obd on @sb is ready.
 *
 * Builds an mgs_target_info with server_sb2mti(), adds the target's
 * obt_instance and the LDD_F_OPC_READY flag, and sends it with
 * KEY_REGISTER_TARGET over the MGC's MGS export.  When the call succeeds
 * and the reply carries LDD_F_IR_CAPABLE without LDD_F_ERROR, the
 * superblock is marked LSI_IR_CAPABLE.  Allocation and release of mti and
 * the return are on lines missing from this listing. */
1188 static int server_notify_target(struct super_block *sb, struct obd_device *obd)
1190 struct lustre_sb_info *lsi = s2lsi(sb);
1191 struct obd_device *mgc = lsi->lsi_mgc;
1192 struct mgs_target_info *mti = NULL;
1198 if (!(lsi->lsi_flags & LSI_SERVER))
1204 rc = server_sb2mti(sb, mti);
1208 mti->mti_instance = obd->u.obt.obt_instance;
1209 mti->mti_flags |= LDD_F_OPC_READY;
1211 /* FIXME use mgc_process_config instead */
1212 rc = obd_set_info_async(NULL, mgc->u.cli.cl_mgc_mgsexp,
1213 sizeof(KEY_REGISTER_TARGET),
1214 KEY_REGISTER_TARGET,
1215 sizeof(*mti), mti, NULL);
1217 /* Imperative recovery: if the mgs informs us to use IR? */
1218 if (!rc && !(mti->mti_flags & LDD_F_ERROR) &&
1219 (mti->mti_flags & LDD_F_IR_CAPABLE))
1220 lsi->lsi_flags |= LSI_IR_CAPABLE;
1229 /** Start server targets: MDTs and OSTs
/* Start the server target stored on @sb.
 *
 * Visible sequence:
 *  1. Under server_start_lock, look up the node-wide MDS obd (for MDT
 *     targets) or OSS obd (for OST targets) and start it with
 *     lustre_start_simple(); the test deciding whether a start is needed
 *     is on a line missing from this listing.
 *  2. When no lmd_osd_type is given, point the MGC at this disk.
 *  3. Register the target with the MGS.
 *  4. Register the mount under the target's server name.
 *  5. Process the config log named after the target.
 *  6. Release the MGC's disk again (same lmd_osd_type condition).
 *  7. Look the started obd up by name; optionally abort recovery, notify
 *     the MGS, compute the recovery timeout and send OBD_NOTIFY_CONFIG.
 * Error tests, labels and the return are on missing lines. */
1231 static int server_start_targets(struct super_block *sb, struct vfsmount *mnt)
1233 struct obd_device *obd;
1234 struct lustre_sb_info *lsi = s2lsi(sb);
1235 struct config_llog_instance cfg;
1239 CDEBUG(D_MOUNT, "starting target %s\n", lsi->lsi_ldd->ldd_svname);
1242 /* If we're an MDT, make sure the global MDS is running */
1243 if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_MDT) {
1244 /* make sure the MDS is started */
1245 cfs_mutex_lock(&server_start_lock);
1246 obd = class_name2obd(LUSTRE_MDS_OBDNAME);
1248 rc = lustre_start_simple(LUSTRE_MDS_OBDNAME,
1249 /* FIXME pre-rename, should eventually be LUSTRE_MDS_NAME */
1251 LUSTRE_MDS_OBDNAME"_uuid",
1254 cfs_mutex_unlock(&server_start_lock);
1255 CERROR("failed to start MDS: %d\n", rc);
1259 cfs_mutex_unlock(&server_start_lock);
1263 /* If we're an OST, make sure the global OSS is running */
1264 if (IS_OST(lsi->lsi_ldd)) {
1265 /* make sure OSS is started */
1266 cfs_mutex_lock(&server_start_lock);
1267 obd = class_name2obd(LUSTRE_OSS_OBDNAME);
1269 rc = lustre_start_simple(LUSTRE_OSS_OBDNAME,
1271 LUSTRE_OSS_OBDNAME"_uuid",
1274 cfs_mutex_unlock(&server_start_lock);
1275 CERROR("failed to start OSS: %d\n", rc);
1279 cfs_mutex_unlock(&server_start_lock);
1282 /* Set the mgc fs to our server disk. This allows the MGC to
1283 * read and write configs locally, in case it can't talk to the MGS. */
1284 if (lsi->lsi_lmd->lmd_osd_type == NULL) {
1285 rc = server_mgc_set_fs(lsi->lsi_mgc, sb);
1290 /* Register with MGS */
1291 rc = server_register_target(sb);
1295 /* Let the target look up the mount using the target's name
1296 (we can't pass the sb or mnt through class_process_config.) */
1297 rc = server_register_mount(lsi->lsi_ldd->ldd_svname, sb, mnt);
1301 /* Start targets using the llog named for the target */
1302 memset(&cfg, 0, sizeof(cfg));
1303 rc = lustre_process_log(sb, lsi->lsi_ldd->ldd_svname, &cfg);
1305 CERROR("failed to start server %s: %d\n",
1306 lsi->lsi_ldd->ldd_svname, rc);
1307 /* Do NOT call server_deregister_mount() here. This makes it
1308 * impossible to find mount later in cleanup time and leaves
1309 * @lsi and other stuff leaked. -umka */
1314 /* Release the mgc fs for others to use */
1315 if (lsi->lsi_lmd->lmd_osd_type == NULL)
1316 server_mgc_clear_fs(lsi->lsi_mgc);
1319 obd = class_name2obd(lsi->lsi_ldd->ldd_svname);
1321 CERROR("no server named %s was started\n",
1322 lsi->lsi_ldd->ldd_svname);
/* abort recovery only if requested at mount time and the obd type
 * provides an iocontrol method */
1326 if ((lsi->lsi_lmd->lmd_flags & LMD_FLG_ABORT_RECOV) &&
1327 (OBP(obd, iocontrol))) {
1328 obd_iocontrol(OBD_IOC_ABORT_RECOVERY,
1329 obd->obd_self_export, 0, NULL, NULL);
/* the result of the notification is ignored on this line */
1332 server_notify_target(sb, obd);
1334 /* calculate recovery timeout, do it after lustre_process_log */
1335 server_calc_timeout(lsi, obd);
1337 /* log has been fully processed */
1338 obd_notify(obd, NULL, OBD_NOTIFY_CONFIG, (void *)CONFIG_LOG);
1344 /***************** lustre superblock **************/
/*
 * Set up the per-superblock Lustre info (lsi) for @sb.
 *
 * Visible steps: allocate the mount-data structure lsi->lsi_lmd, zero its
 * exclusion count and soft/hard recovery times, publish the lsi through
 * s2lsi_nocast(sb), take the initial lsi_mounts reference and select
 * failover as the default umount style.
 *
 * NOTE(review): how @lsi itself is allocated and what is returned when an
 * allocation fails should be confirmed against the full function body.
 */
1346 struct lustre_sb_info *lustre_init_lsi(struct super_block *sb)
1348 struct lustre_sb_info *lsi;
1354 OBD_ALLOC_PTR(lsi->lsi_lmd);
1355 if (!lsi->lsi_lmd) {
1360 lsi->lsi_lmd->lmd_exclude_count = 0;
1361 lsi->lsi_lmd->lmd_recovery_time_soft = 0;
1362 lsi->lsi_lmd->lmd_recovery_time_hard = 0;
1363 s2lsi_nocast(sb) = lsi;
1364 /* we take 1 extra ref for our setup */
1365 cfs_atomic_set(&lsi->lsi_mounts, 1);
1367 /* Default umount style */
1368 lsi->lsi_flags = LSI_UMOUNT_FAILOVER;
/*
 * Release the lsi attached to @sb together with everything it owns.
 *
 * Asserts that no mount reference remains (lsi_mounts == 0) and that the
 * client part (lsi_llsbi) is already gone, then frees lsi_ldd, every
 * string hanging off lsi_lmd, the exclusion array, lsi_lmd itself and
 * finally the lsi, and clears the superblock's pointer to it.
 *
 * The lmd strings are freed with a size of strlen() + 1, so each of them
 * must have been allocated with exactly that size.
 */
1373 static int lustre_free_lsi(struct super_block *sb)
1375 struct lustre_sb_info *lsi = s2lsi(sb);
1378 LASSERT(lsi != NULL);
1379 CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi);
1381 /* someone didn't call server_put_mount. */
1382 LASSERT(cfs_atomic_read(&lsi->lsi_mounts) == 0);
1384 if (lsi->lsi_ldd != NULL)
1385 OBD_FREE(lsi->lsi_ldd, sizeof(*lsi->lsi_ldd));
1387 if (lsi->lsi_lmd != NULL) {
1388 if (lsi->lsi_lmd->lmd_dev != NULL)
1389 OBD_FREE(lsi->lsi_lmd->lmd_dev,
1390 strlen(lsi->lsi_lmd->lmd_dev) + 1);
1391 if (lsi->lsi_lmd->lmd_profile != NULL)
1392 OBD_FREE(lsi->lsi_lmd->lmd_profile,
1393 strlen(lsi->lsi_lmd->lmd_profile) + 1);
1394 if (lsi->lsi_lmd->lmd_mgssec != NULL)
1395 OBD_FREE(lsi->lsi_lmd->lmd_mgssec,
1396 strlen(lsi->lsi_lmd->lmd_mgssec) + 1);
1397 if (lsi->lsi_lmd->lmd_opts != NULL)
1398 OBD_FREE(lsi->lsi_lmd->lmd_opts,
1399 strlen(lsi->lsi_lmd->lmd_opts) + 1);
/* the exclusion array is keyed on its count rather than on the pointer */
1400 if (lsi->lsi_lmd->lmd_exclude_count)
1401 OBD_FREE(lsi->lsi_lmd->lmd_exclude,
1402 sizeof(lsi->lsi_lmd->lmd_exclude[0]) *
1403 lsi->lsi_lmd->lmd_exclude_count);
1404 if (lsi->lsi_lmd->lmd_mgs != NULL)
1405 OBD_FREE(lsi->lsi_lmd->lmd_mgs,
1406 strlen(lsi->lsi_lmd->lmd_mgs) + 1);
1407 if (lsi->lsi_lmd->lmd_osd_type != NULL)
1408 OBD_FREE(lsi->lsi_lmd->lmd_osd_type,
1409 strlen(lsi->lsi_lmd->lmd_osd_type) + 1);
1411 OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd));
1414 LASSERT(lsi->lsi_llsbi == NULL);
1415 OBD_FREE(lsi, sizeof(*lsi));
1416 s2lsi_nocast(sb) = NULL;
1421 /* The lsi has one reference for every server that is using the disk -
1422 e.g. MDT, MGS, and potentially MGC */
/*
 * Drop one lsi_mounts reference on the lsi of @sb; when the counter
 * reaches zero the lsi is destroyed through lustre_free_lsi().
 * The debug line prints the count as it was before the decrement.
 */
1423 static int lustre_put_lsi(struct super_block *sb)
1425 struct lustre_sb_info *lsi = s2lsi(sb);
1428 LASSERT(lsi != NULL);
1430 CDEBUG(D_MOUNT, "put %p %d\n", sb, cfs_atomic_read(&lsi->lsi_mounts));
1431 if (cfs_atomic_dec_and_test(&lsi->lsi_mounts)) {
1432 lustre_free_lsi(sb);
/*
 * Build the disk data (ldd) for @lsi purely from the mount-line options.
 *
 * - the server name comes from lmd_profile; a missing name is -EINVAL
 *   and a name that does not fit ldd_svname is -ENAMETOOLONG;
 * - the osd type defaults to LUSTRE_OSD_NAME and is replaced by
 *   lmd_osd_type when that option was given and fits lsi_osd_type;
 * - the server type is taken from the "-OST" / "-MDT" marker inside the
 *   name, the fsname is the text in front of that marker and the index
 *   is the hexadecimal number following it;
 * - LDD_F_WRITECONF is always set in ldd_flags.
 *
 * On the error path the freshly allocated ldd is freed again.
 *
 * NOTE(review): the tail of the function ORs LDD_F_* bits into
 * lsi->lsi_flags, a field that elsewhere in this file carries LSI_* bits
 * (LSI_SERVER, LSI_UMOUNT_FAILOVER, ...). Confirm that the two flag
 * namespaces cannot collide.
 */
1438 static int lsi_prepare(struct lustre_sb_info *lsi)
1440 struct lustre_disk_data *ldd;
1446 LASSERT(lsi->lsi_lmd);
1448 OBD_ALLOC(ldd, sizeof(*ldd));
1452 strcpy(lsi->lsi_osd_type, LUSTRE_OSD_NAME);
1454 /* The server name is given as a mount line option */
1455 if (lsi->lsi_lmd->lmd_profile == NULL) {
1456 LCONSOLE_ERROR("Can't determine server name\n");
1457 GOTO(err, rc = -EINVAL);
/* length is checked first, which keeps the strcpy() below in bounds */
1460 if (strlen(lsi->lsi_lmd->lmd_profile) >= sizeof(ldd->ldd_svname))
1461 GOTO(err, rc = -ENAMETOOLONG);
1463 strcpy(ldd->ldd_svname, lsi->lsi_lmd->lmd_profile);
1465 /* Determine osd type */
1466 if (lsi->lsi_lmd->lmd_osd_type != NULL) {
1467 if (strlen(lsi->lsi_lmd->lmd_osd_type) >=
1468 sizeof(lsi->lsi_osd_type))
1469 GOTO(err, rc = -ENAMETOOLONG);
1471 strcpy(lsi->lsi_osd_type, lsi->lsi_lmd->lmd_osd_type);
/* p is left pointing at the "-OST"/"-MDT" marker inside ldd_svname */
1474 if ((p = strstr(ldd->ldd_svname, "-OST"))) {
1475 ldd->ldd_flags = LDD_F_SV_TYPE_OST;
1476 } else if ((p = strstr(ldd->ldd_svname, "-MDT"))) {
1477 ldd->ldd_flags = LDD_F_SV_TYPE_MDT;
1479 LCONSOLE_ERROR("Can't determine server type of '%s'\n",
1481 GOTO(err, rc = -EINVAL);
/* everything in front of the marker is the filesystem name */
1484 len = p - ldd->ldd_svname;
1485 if (len >= MTI_NAME_MAXLEN)
1486 GOTO(err, rc = -ENAMETOOLONG);
1487 memcpy(ldd->ldd_fsname, ldd->ldd_svname, len);
1488 ldd->ldd_fsname[len] = '\0';
/* skip the 4-character marker; the index is written in hexadecimal */
1490 ldd->ldd_svindex = simple_strtoul(p + 4, NULL, 16);
1491 ldd->ldd_flags |= LDD_F_WRITECONF;
1495 /* Add mount line flags that used to be in ldd:
1496 * writeconf, mgs, iam, anything else?
1499 lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_WRITECONF) ?
1500 LDD_F_WRITECONF : 0;
1501 lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_MGS) ?
1502 LDD_F_SV_TYPE_MGS : 0;
1503 lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_IAM) ?
1505 lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_NO_PRIMNODE) ?
1506 LDD_F_NO_PRIMNODE : 0;
1512 OBD_FREE(ldd, sizeof(*ldd));
1516 /*************** server mount ******************/
/*
 * Mount the backing device of a Lustre server and return its vfsmount.
 *
 * When an osd type was given on the mount line, no kernel mount is done
 * here: the function calls lsi_prepare() and returns ERR_PTR(rc).
 * NOTE(review): if lsi_prepare() succeeds with rc == 0 that expression is
 * a NULL pointer rather than an error pointer - confirm the caller copes.
 *
 * Otherwise the device is mounted twice:
 *  1. a pre-mount as "ldiskfs" using only the mount-line options, so that
 *     ldd_parse() can read the on-disk mount data into @ldd;
 *  2. the real mount with the type MT_STR(ldd), using the on-disk mount
 *     options, "no_mbcache" and the mount-line options, with MS_NOATIME
 *     and MS_NODIRATIME forced on.
 * With abort_recov set, LAST_RCVD is truncated on the new mount.
 *
 * On success @ldd is handed over to lsi->lsi_ldd. The out_free path
 * releases the options page and @ldd and returns ERR_PTR(rc).
 */
1518 /** Kernel mount using mount options in MOUNT_DATA_FILE.
1519 * Since this file lives on the disk, we pre-mount using a common
1520 * type, read the file, then re-mount using the type specified in the
1523 static struct vfsmount *server_kernel_mount(struct super_block *sb)
1525 struct lvfs_run_ctxt mount_ctxt;
1526 struct lustre_sb_info *lsi = s2lsi(sb);
1527 struct lustre_disk_data *ldd;
1528 struct lustre_mount_data *lmd = lsi->lsi_lmd;
1529 struct vfsmount *mnt;
1530 struct file_system_type *type;
1531 char *options = NULL;
1532 unsigned long page, s_flags;
1533 struct page *__page;
1538 if (lsi->lsi_lmd->lmd_osd_type) {
1539 rc = lsi_prepare(lsi);
1540 RETURN(ERR_PTR(rc));
1543 OBD_ALLOC(ldd, sizeof(*ldd));
1545 RETURN(ERR_PTR(-ENOMEM));
1546 strcpy(lsi->lsi_osd_type, LUSTRE_OSD_NAME);
1548 /* In the past, we have always used flags = 0.
1549 Note ext3/ldiskfs can't be mounted ro. */
1550 s_flags = sb->s_flags;
1552 /* allocate memory for options */
1553 OBD_PAGE_ALLOC(__page, CFS_ALLOC_STD);
1555 GOTO(out_free, rc = -ENOMEM);
1556 page = (unsigned long)cfs_page_address(__page);
1557 options = (char *)page;
1558 memset(options, 0, CFS_PAGE_SIZE);
1560 /* mount-line options must be added for pre-mount because it may
1561 * contain mount options such as journal_dev which are required
1562 * to mount successfully the underlying filesystem */
1563 if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0))
1564 strncat(options, lmd->lmd_opts, CFS_PAGE_SIZE - 1);
1566 /* Pre-mount ldiskfs to read the MOUNT_DATA_FILE */
1567 CDEBUG(D_MOUNT, "Pre-mount ldiskfs %s\n", lmd->lmd_dev);
1568 type = get_fs_type("ldiskfs");
1570 CERROR("premount failed: cannot find ldiskfs module\n");
1571 GOTO(out_free, rc = -ENODEV);
1573 mnt = vfs_kern_mount(type, s_flags, lmd->lmd_dev, (void *)options);
/* presumably balances the module reference taken by get_fs_type() */
1574 cfs_module_put(type->owner);
1577 CERROR("premount %s:%#lx ldiskfs failed: %d "
1578 "Is the ldiskfs module available?\n",
1579 lmd->lmd_dev, s_flags, rc );
/* run context rooted at the pre-mount, used by ldd_parse() below */
1583 OBD_SET_CTXT_MAGIC(&mount_ctxt);
1584 mount_ctxt.pwdmnt = mnt;
1585 mount_ctxt.pwd = mnt->mnt_root;
1586 mount_ctxt.fs = get_ds();
1588 rc = ldd_parse(&mount_ctxt, ldd);
1592 CERROR("premount parse options failed: rc = %d\n", rc);
1596 /* Done with our pre-mount, now do the real mount. */
1598 /* Glom up mount options */
/* the page is zeroed first and at most CFS_PAGE_SIZE - 2 bytes are
 * copied, so the option string stays NUL-terminated */
1599 memset(options, 0, CFS_PAGE_SIZE);
1600 strncpy(options, ldd->ldd_mount_opts, CFS_PAGE_SIZE - 2);
1602 len = CFS_PAGE_SIZE - strlen(options) - 2;
1604 strcat(options, ",");
1605 strncat(options, "no_mbcache", len);
1607 /* Add in any mount-line options */
1608 if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0)) {
1609 len = CFS_PAGE_SIZE - strlen(options) - 2;
1610 strcat(options, ",");
1611 strncat(options, lmd->lmd_opts, len);
1614 /* Special permanent mount flags */
1616 s_flags |= MS_NOATIME | MS_NODIRATIME;
1618 CDEBUG(D_MOUNT, "kern_mount: %s %s %s\n",
1619 MT_STR(ldd), lmd->lmd_dev, options);
1620 type = get_fs_type(MT_STR(ldd));
1622 CERROR("get_fs_type failed\n");
1623 GOTO(out_free, rc = -ENODEV);
1625 mnt = vfs_kern_mount(type, s_flags, lmd->lmd_dev, (void *)options);
1626 cfs_module_put(type->owner);
1629 CERROR("vfs_kern_mount failed: rc = %d\n", rc);
1633 if (lmd->lmd_flags & LMD_FLG_ABORT_RECOV)
1634 simple_truncate(mnt->mnt_sb->s_root, mnt, LAST_RCVD,
1637 OBD_PAGE_FREE(__page);
1638 lsi->lsi_ldd = ldd; /* freed at lsi cleanup */
1639 CDEBUG(D_SUPER, "%s: mnt = %p\n", lmd->lmd_dev, mnt);
1644 OBD_PAGE_FREE(__page);
1645 OBD_FREE(ldd, sizeof(*ldd));
1646 lsi->lsi_ldd = NULL;
1647 RETURN(ERR_PTR(rc));
/*
 * Block the umount until @mnt has only one reference left.
 *
 * After an initial 3-second sleep the function loops while
 * mnt_get_count(mnt) > 1, sleeping in interruptible 3-second slices with
 * every signal except SIGKILL blocked. A console warning is printed each
 * time the "waited" counter is a non-zero multiple of 30, and an
 * emergency message is logged when the wait is interrupted while
 * references remain.
 *
 * NOTE(review): the unit in which "waited" advances should be confirmed
 * against the loop body.
 */
1650 /** Wait here forever until the mount refcount is 0 before completing umount,
1651 * else we risk dereferencing a null pointer.
1652 * LNET may take e.g. 165s before killing zombies.
1654 static void server_wait_finished(struct vfsmount *mnt)
1658 cfs_sigset_t blocked;
1661 cfs_waitq_init(&waitq);
1662 cfs_waitq_wait_event_interruptible_timeout(waitq, 0,
1663 cfs_time_seconds(3), rc);
1668 cfs_waitq_init(&waitq);
1670 while (mnt_get_count(mnt) > 1) {
1671 if (waited && (waited % 30 == 0))
1672 LCONSOLE_WARN("Mount still busy with %d refs after "
1676 /* Cannot use l_event_wait() for an interruptible sleep. */
/* only SIGKILL may interrupt the sleep below */
1678 blocked = cfs_block_sigsinv(sigmask(SIGKILL));
1679 cfs_waitq_wait_event_interruptible_timeout(
1681 (mnt_get_count(mnt) == 1),
1682 cfs_time_seconds(3),
1684 cfs_restore_sigs(blocked);
1686 LCONSOLE_EMERG("Danger: interrupted umount %s with "
1687 "%d refs!\n", mnt_get_devname(mnt),
1688 mnt_get_count(mnt));
/*
 * .put_super handler for a Lustre server superblock.
 *
 * Order of work as visible here:
 *  - snapshot the service name (replaced by "MGS" for an MDT mounted
 *    with nosvc) plus the ldd/lsi flags, because the lsi is gone by the
 *    time they are needed at the end;
 *  - unless nosvc was given, for an MDT or OST: end the config log,
 *    remember the profile's lp_dt name for a later safety cleanup, stop
 *    the target obd and deregister the mount;
 *  - stop the MGS if this disk hosts one and nomgs was not given;
 *  - lustre_common_put_super(), then wait until the mount is unused;
 *  - stop the global MDS/OSS services if no longer needed and clean any
 *    orphaned obd recorded in extraname.
 *
 * NOTE(review): tmpname and extraname are written to immediately after
 * OBD_ALLOC() with no NULL check in between; an allocation failure would
 * be dereferenced. Worth confirming and guarding.
 */
1695 /** Start the shutdown of servers at umount.
1697 static void server_put_super(struct super_block *sb)
1699 struct lustre_sb_info *lsi = s2lsi(sb);
1700 struct obd_device *obd;
1701 struct vfsmount *mnt = lsi->lsi_srv_mnt;
1702 char *tmpname, *extraname = NULL;
1704 int lddflags = lsi->lsi_ldd->ldd_flags;
1705 int lsiflags = lsi->lsi_flags;
1708 LASSERT(lsiflags & LSI_SERVER);
1710 tmpname_sz = strlen(lsi->lsi_ldd->ldd_svname) + 1;
1711 OBD_ALLOC(tmpname, tmpname_sz);
1712 memcpy(tmpname, lsi->lsi_ldd->ldd_svname, tmpname_sz);
1713 CDEBUG(D_MOUNT, "server put_super %s\n", tmpname);
1714 if (IS_MDT(lsi->lsi_ldd) && (lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC))
1715 snprintf(tmpname, tmpname_sz, "MGS");
1717 /* Stop the target */
1718 if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
1719 (IS_MDT(lsi->lsi_ldd) || IS_OST(lsi->lsi_ldd))) {
1720 struct lustre_profile *lprof = NULL;
1722 /* tell the mgc to drop the config log */
1723 lustre_end_log(sb, lsi->lsi_ldd->ldd_svname, NULL);
1725 /* COMPAT_146 - profile may get deleted in mgc_cleanup.
1726 If there are any setup/cleanup errors, save the lov
1727 name for safety cleanup later. */
1728 lprof = class_get_profile(lsi->lsi_ldd->ldd_svname);
1729 if (lprof && lprof->lp_dt) {
1730 OBD_ALLOC(extraname, strlen(lprof->lp_dt) + 1);
1731 strcpy(extraname, lprof->lp_dt);
1734 obd = class_name2obd(lsi->lsi_ldd->ldd_svname);
1736 CDEBUG(D_MOUNT, "stopping %s\n", obd->obd_name);
1737 if (lsi->lsi_flags & LSI_UMOUNT_FAILOVER)
1739 /* We can't seem to give an error return code
1740 * to .put_super, so we better make sure we clean up! */
1742 class_manual_cleanup(obd);
1744 CERROR("no obd %s\n", lsi->lsi_ldd->ldd_svname);
1745 server_deregister_mount(lsi->lsi_ldd->ldd_svname);
1749 /* If they wanted the mgs to stop separately from the mdt, they
1750 should have put it on a different device. */
1751 if (IS_MGS(lsi->lsi_ldd)) {
1752 /* if MDS start with --nomgs, don't stop MGS then */
1753 if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS))
1754 server_stop_mgs(sb);
1757 /* Clean the mgc and sb */
1758 lustre_common_put_super(sb);
1760 /* Wait for the targets to really clean up - can't exit (and let the
1761 sb get destroyed) while the mount is still in use */
1762 server_wait_finished(mnt);
1764 /* drop the One True Mount */
1768 /* Stop the servers (MDS, OSS) if no longer needed. We must wait
1769 until the target is really gone so that our type refcount check
1771 server_stop_servers(lddflags, lsiflags);
1773 /* In case of startup or cleanup err, stop related obds */
1775 obd = class_name2obd(extraname);
1777 CWARN("Cleaning orphaned obd %s\n", extraname);
1779 class_manual_cleanup(obd);
1781 OBD_FREE(extraname, strlen(extraname) + 1);
1784 LCONSOLE_WARN("server umount %s complete\n", tmpname);
1785 OBD_FREE(tmpname, tmpname_sz);
/*
 * .umount_begin handler: switch the umount style from failover to force.
 *
 * Two prototypes exist, selected by HAVE_UMOUNTBEGIN_VFSMOUNT: one takes
 * the vfsmount plus flags and derives the superblock from it, the other
 * receives the superblock directly. In the vfsmount variant the flags are
 * tested for MNT_FORCE first. The handler then clears
 * LSI_UMOUNT_FAILOVER and sets LSI_UMOUNT_FORCE in lsi_flags;
 * server_put_super() consults those bits later.
 */
1789 /** Called only for 'umount -f'
1791 #ifdef HAVE_UMOUNTBEGIN_VFSMOUNT
1792 static void server_umount_begin(struct vfsmount *vfsmnt, int flags)
1794 struct super_block *sb = vfsmnt->mnt_sb;
1796 static void server_umount_begin(struct super_block *sb)
1799 struct lustre_sb_info *lsi = s2lsi(sb);
1802 #ifdef HAVE_UMOUNTBEGIN_VFSMOUNT
1803 if (!(flags & MNT_FORCE)) {
1809 CDEBUG(D_MOUNT, "umount -f\n");
1810 /* umount = failover
1812 no third way to do non-force, non-failover */
1813 lsi->lsi_flags &= ~LSI_UMOUNT_FAILOVER;
1814 lsi->lsi_flags |= LSI_UMOUNT_FORCE;
/*
 * .statfs handler for a server mountpoint.
 *
 * When the backing mount (lsi_srv_mnt) exists and its filesystem provides
 * a statfs operation, the request is forwarded to it on the backing
 * mount's root and f_type is replaced by this superblock's magic.
 * Otherwise the reply is filled in locally from the superblock (magic,
 * block size) with NAME_MAX as the maximum name length.
 *
 * NOTE(review): whether f_type is overridden only on success of the
 * forwarded call should be confirmed against the full body.
 */
1818 static int server_statfs (struct dentry *dentry, cfs_kstatfs_t *buf)
1820 struct super_block *sb = dentry->d_sb;
1821 struct vfsmount *mnt = s2lsi(sb)->lsi_srv_mnt;
1824 if (mnt && mnt->mnt_sb && mnt->mnt_sb->s_op->statfs) {
1825 int rc = mnt->mnt_sb->s_op->statfs(mnt->mnt_root, buf);
1827 buf->f_type = sb->s_magic;
/* fallback: no usable backing statfs, report superblock values */
1833 buf->f_type = sb->s_magic;
1834 buf->f_bsize = sb->s_blocksize;
1840 buf->f_namelen = NAME_MAX;
/*
 * Superblock operation table installed on server mountpoints by
 * server_fill_super_common(): put_super, umount_begin and statfs.
 */
1844 /** The operations we support directly on the superblock:
1845 * mount, umount, and df.
1847 static struct super_operations server_ops =
1849 .put_super = server_put_super,
1850 .umount_begin = server_umount_begin, /* umount -f */
1851 .statfs = server_statfs,
/* Bit index helper built on cfs_ffz(); used below to derive
 * s_blocksize_bits from s_blocksize. Presumably equals log2(n) only when
 * n is a power of two - verify against cfs_ffz(). */
1854 #define log2(n) cfs_ffz(~(n))
/* Value stored in sb->s_magic for server superblocks. */
1855 #define LUSTRE_SUPER_MAGIC 0x0BD00BD1
/*
 * Give a server superblock the minimum the VFS needs to finish a mount.
 *
 * Sets a 4096-byte block size, the Lustre magic, a zero s_maxbytes (no
 * file I/O through the mountpoint), marks the sb read-only, installs
 * server_ops and creates a bare directory inode as the root dentry.
 * An error is logged when the root inode or the root dentry cannot be
 * created.
 */
1857 static int server_fill_super_common(struct super_block *sb)
1859 struct inode *root = 0;
1862 CDEBUG(D_MOUNT, "Server sb, dev=%d\n", (int)sb->s_dev);
1864 sb->s_blocksize = 4096;
1865 sb->s_blocksize_bits = log2(sb->s_blocksize);
1866 sb->s_magic = LUSTRE_SUPER_MAGIC;
1867 sb->s_maxbytes = 0; /* we don't allow file IO on server mountpoints */
1868 sb->s_flags |= MS_RDONLY;
1869 sb->s_op = &server_ops;
1871 root = new_inode(sb);
1873 CERROR("Can't make root inode\n");
1877 /* returns -EIO for every operation */
1878 /* make_bad_inode(root); -- badness - can't umount */
1879 /* apparently we need to be a directory for the mount to finish */
1880 root->i_mode = S_IFDIR;
1882 sb->s_root = d_alloc_root(root);
1884 CERROR("Can't make root dentry\n");
/*
 * Bring up a Lustre server on @sb.
 *
 * Sequence: mount the backing device (server_kernel_mount), refuse to
 * continue if an obd with the same service name already exists, start
 * the MGS when this disk hosts one and nomgs was not given, start the
 * MGC, start the MDT/OST targets unless nosvc was given, and finally
 * fill in the generic superblock fields (server_fill_super_common).
 *
 * Failures after the device is mounted are unwound through
 * server_put_super() rather than by simply dropping the mount.
 */
1892 /** Fill in the superblock info for a Lustre server.
1893 * Mount the device with the correct options.
1894 * Read the on-disk config file.
1895 * Start the services.
1897 static int server_fill_super(struct super_block *sb)
1899 struct lustre_sb_info *lsi = s2lsi(sb);
1900 struct vfsmount *mnt;
1904 /* the One True Mount */
1905 mnt = server_kernel_mount(sb);
1908 CERROR("Unable to mount device %s: %d\n",
1909 lsi->lsi_lmd->lmd_dev, rc);
1913 lsi->lsi_srv_mnt = mnt;
1915 LASSERT(lsi->lsi_ldd);
1916 CDEBUG(D_MOUNT, "Found service %s for fs '%s' on device %s\n",
1917 lsi->lsi_ldd->ldd_svname, lsi->lsi_ldd->ldd_fsname,
1918 lsi->lsi_lmd->lmd_dev);
/* an obd with this name already exists: the target is mounted twice */
1920 if (class_name2obd(lsi->lsi_ldd->ldd_svname)) {
1921 LCONSOLE_ERROR_MSG(0x161, "The target named %s is already "
1922 "running. Double-mount may have compromised"
1923 " the disk journal.\n",
1924 lsi->lsi_ldd->ldd_svname);
1930 /* Start MGS before MGC */
1931 if (IS_MGS(lsi->lsi_ldd) && !(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)){
1932 rc = server_start_mgs(sb);
1937 /* Start MGC before servers */
1938 rc = lustre_start_mgc(sb);
1942 /* Set up all obd devices for service */
1943 if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
1944 (IS_OST(lsi->lsi_ldd) || IS_MDT(lsi->lsi_ldd))) {
1945 rc = server_start_targets(sb, mnt);
1947 CERROR("Unable to start targets: %d\n", rc);
1950 /* FIXME overmount client here,
1951 or can we just start a client log and client_fill_super on this sb?
1952 We need to make sure server_put_super gets called too - ll_put_super
1953 calls lustre_common_put_super; check there for LSI_SERVER flag,
1955 Probably should start client from new thread so we can return.
1956 Client will not finish until all servers are connected.
1957 Note - MGS-only server does NOT get a client, since there is no
1958 lustre fs associated - the MGS is for all lustre fs's */
1961 rc = server_fill_super_common(sb);
1967 /* We jump here in case of failure while starting targets or MGS.
1968 * In this case we can't just put @mnt and have to do real cleanup
1969 * with stopping targets, etc. */
1970 server_put_super(sb);
1974 /* Get the index from the obd name.
1975 rc = server type, or
1977 if endptr isn't NULL it is set to end of name */
/*
 * Parsing rules visible below:
 *  - the component after the last '-' in @svname decides the type; a
 *    trailing LUSTRE_MDC_NAME component is skipped by walking back to the
 *    previous '-';
 *  - "MDT" yields LDD_F_SV_TYPE_MDT and "OST" yields LDD_F_SV_TYPE_OST;
 *  - a suffix of "all" returns the type ORed with LDD_F_SV_ALL before any
 *    number is parsed, so @endptr is not written on that path;
 *  - otherwise the text after the 3-letter type is parsed as a
 *    hexadecimal index and @endptr receives simple_strtoul()'s end
 *    pointer.
 */
1978 int server_name2index(char *svname, __u32 *idx, char **endptr)
1980 unsigned long index;
1982 char *dash = strrchr(svname, '-');
1986 /* interpret <fsname>-MDTXXXXX-mdc as mdt, the better way is to pass
1987 * in the fsname, then determine the server index */
1988 if (!strcmp(LUSTRE_MDC_NAME, dash + 1)) {
1990 for (; dash > svname && *dash != '-'; dash--);
1995 if (strncmp(dash + 1, "MDT", 3) == 0)
1996 rc = LDD_F_SV_TYPE_MDT;
1997 else if (strncmp(dash + 1, "OST", 3) == 0)
1998 rc = LDD_F_SV_TYPE_OST;
2001 if (strcmp(dash + 4, "all") == 0)
2002 return rc | LDD_F_SV_ALL;
2004 index = simple_strtoul(dash + 4, endptr, 16);
2008 EXPORT_SYMBOL(server_name2index);
/*
 * Derive the recovery window of target @obd from the mount options.
 *
 * The soft and hard limits come from lmd_recovery_time_soft/hard, with
 * OBD_RECOVERY_TIME_SOFT/HARD as the fallbacks. Imperative recovery (IR)
 * counts as available when the lsi is LSI_IR_CAPABLE and the "noir"
 * mount flag is absent; obd_no_ir records the opposite. With IR, both
 * limits are scaled by factor / OBD_IR_FACTOR_MAX, where factor is the
 * target's obd_recovery_ir_factor or OBD_IR_FACTOR_DEFAULT; the soft
 * limit is kept at or above OBD_RECOVERY_TIME_MIN and the hard limit at
 * or above the soft one.
 *
 * Results: obd_recovery_timeout is only ever raised (max of the current
 * value and soft), while obd_recovery_time_hard and
 * obd_recovery_ir_factor are overwritten.
 *
 * NOTE(review): the exact conditions guarding the fallbacks and the IR
 * scaling should be confirmed against the full body.
 */
2011 * Calculate timeout value for a target.
2013 void server_calc_timeout(struct lustre_sb_info *lsi, struct obd_device *obd)
2015 struct lustre_mount_data *lmd;
2019 bool has_ir = !!(lsi->lsi_flags & LSI_IR_CAPABLE);
2020 int min = OBD_RECOVERY_TIME_MIN;
2022 LASSERT(lsi->lsi_flags & LSI_SERVER);
2026 soft = lmd->lmd_recovery_time_soft;
2027 hard = lmd->lmd_recovery_time_hard;
2028 has_ir = has_ir && !(lmd->lmd_flags & LMD_FLG_NOIR);
2029 obd->obd_no_ir = !has_ir;
2033 soft = OBD_RECOVERY_TIME_SOFT;
2035 hard = OBD_RECOVERY_TIME_HARD;
2037 /* target may have ir_factor configured. */
2038 factor = OBD_IR_FACTOR_DEFAULT;
2039 if (obd->obd_recovery_ir_factor)
2040 factor = obd->obd_recovery_ir_factor;
2043 int new_soft = soft;
2044 int new_hard = hard;
2046 /* adjust timeout value by imperative recovery */
2048 new_soft = (soft * factor) / OBD_IR_FACTOR_MAX;
2049 new_hard = (hard * factor) / OBD_IR_FACTOR_MAX;
2051 /* make sure the timeout is not too short */
2052 new_soft = max(min, new_soft);
2053 new_hard = max(new_soft, new_hard);
2055 LCONSOLE_INFO("%s: Imperative Recovery enabled, recovery "
2056 "window shrunk from %d-%d down to %d-%d\n",
2057 obd->obd_name, soft, hard, new_soft, new_hard);
/* never shorten a recovery timeout that is already configured */
2064 obd->obd_recovery_timeout = max(obd->obd_recovery_timeout, soft);
2065 obd->obd_recovery_time_hard = hard;
2066 obd->obd_recovery_ir_factor = factor;
2068 EXPORT_SYMBOL(server_calc_timeout);
2070 /*************** mount common between server and client ***************/
/*
 * Teardown shared by client and server umount: drop this superblock's
 * reference on the MGC, then its reference on the mounted disk.
 *
 * A lustre_stop_mgc() result of 0 or -ENOENT is not treated as a
 * problem. Other results are either logged as an error or, per the
 * in-line comment, tolerated as "still in use by another obd".
 */
2073 int lustre_common_put_super(struct super_block *sb)
2078 CDEBUG(D_MOUNT, "dropping sb %p\n", sb);
2080 /* Drop a ref to the MGC */
2081 rc = lustre_stop_mgc(sb);
2082 if (rc && (rc != -ENOENT)) {
2084 CERROR("Can't stop MGC: %d\n", rc);
2087 /* BUSY just means that there's some other obd that
2088 needs the mgc. Let him clean it up. */
2089 CDEBUG(D_MOUNT, "MGC still in use\n");
2091 /* Drop a ref to the mounted disk */
2096 EXPORT_SYMBOL(lustre_common_put_super);
/*
 * Dump the parsed mount data @lmd to the debug log via PRINT_CMD.
 * The profile is printed for client mounts only; the recovery times are
 * printed only when non-zero; every excluded OST index is listed.
 */
2098 static void lmd_print(struct lustre_mount_data *lmd)
2102 PRINT_CMD(PRINT_MASK, " mount data:\n");
2103 if (lmd_is_client(lmd))
2104 PRINT_CMD(PRINT_MASK, "profile: %s\n", lmd->lmd_profile);
2105 PRINT_CMD(PRINT_MASK, "device: %s\n", lmd->lmd_dev);
2106 PRINT_CMD(PRINT_MASK, "flags: %x\n", lmd->lmd_flags);
2109 PRINT_CMD(PRINT_MASK, "options: %s\n", lmd->lmd_opts);
2111 if (lmd->lmd_recovery_time_soft)
2112 PRINT_CMD(PRINT_MASK, "recovery time soft: %d\n",
2113 lmd->lmd_recovery_time_soft);
2115 if (lmd->lmd_recovery_time_hard)
2116 PRINT_CMD(PRINT_MASK, "recovery time hard: %d\n",
2117 lmd->lmd_recovery_time_hard);
2119 for (i = 0; i < lmd->lmd_exclude_count; i++) {
2120 PRINT_CMD(PRINT_MASK, "exclude %d: OST%04x\n", i,
2121 lmd->lmd_exclude[i]);
2125 /* Is this server on the exclusion list */
/*
 * @svname is decoded with server_name2index(); anything that is not an
 * OST is never excluded. For an OST the index is compared against every
 * entry of the mount's lmd_exclude[] array and a warning is logged on a
 * match.
 */
2126 int lustre_check_exclusion(struct super_block *sb, char *svname)
2128 struct lustre_sb_info *lsi = s2lsi(sb);
2129 struct lustre_mount_data *lmd = lsi->lsi_lmd;
2134 rc = server_name2index(svname, &index, NULL);
2135 if (rc != LDD_F_SV_TYPE_OST)
2136 /* Only exclude OSTs */
2139 CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname,
2140 index, lmd->lmd_exclude_count, lmd->lmd_dev);
2142 for(i = 0; i < lmd->lmd_exclude_count; i++) {
2143 if (index == lmd->lmd_exclude[i]) {
2144 CWARN("Excluding %s (on exclusion list)\n", svname);
2151 /* mount -v -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */
/*
 * Parse the value of an "exclude=" mount option into lmd->lmd_exclude.
 *
 * @ptr points at the '=' of the option. Names are decoded one at a time
 * with server_name2index(); OST indices are collected in a temporary
 * array sized from the option length (at most one name per 8
 * characters), other server types are ignored. The collected indices are
 * then copied into a permanent array of exactly lmd_exclude_count
 * entries; if that allocation fails the count is reset to 0. The
 * temporary array is always freed.
 */
2152 static int lmd_make_exclusion(struct lustre_mount_data *lmd, char *ptr)
2154 char *s1 = ptr, *s2;
2155 __u32 index, *exclude_list;
2159 /* The shortest an ost name can be is 8 chars: -OST0000.
2160 We don't actually know the fsname at this time, so in fact
2161 a user could specify any fsname. */
2162 devmax = strlen(ptr) / 8 + 1;
2164 /* temp storage until we figure out how many we have */
2165 OBD_ALLOC(exclude_list, sizeof(index) * devmax);
2169 /* we enter this fn pointing at the '=' */
2170 while (*s1 && *s1 != ' ' && *s1 != ',') {
2172 rc = server_name2index(s1, &index, &s2);
2174 CERROR("Can't parse server name '%s'\n", s1);
2177 if (rc == LDD_F_SV_TYPE_OST)
2178 exclude_list[lmd->lmd_exclude_count++] = index;
2180 CDEBUG(D_MOUNT, "ignoring exclude %.7s\n", s1);
2182 /* now we are pointing at ':' (next exclude)
2183 or ',' (end of excludes) */
/* stop before the temporary array can overflow */
2184 if (lmd->lmd_exclude_count >= devmax)
2187 if (rc >= 0) /* non-err */
2190 if (lmd->lmd_exclude_count) {
2191 /* permanent, freed in lustre_free_lsi */
2192 OBD_ALLOC(lmd->lmd_exclude, sizeof(index) *
2193 lmd->lmd_exclude_count);
2194 if (lmd->lmd_exclude) {
2195 memcpy(lmd->lmd_exclude, exclude_list,
2196 sizeof(index) * lmd->lmd_exclude_count);
2199 lmd->lmd_exclude_count = 0;
2202 OBD_FREE(exclude_list, sizeof(index) * devmax);
/*
 * Store the value of a "mgssec=" option in lmd->lmd_mgssec.
 *
 * @ptr points at the first character of the value. Any previous value is
 * freed first. The value runs up to the next ',' or, if there is none, to
 * the end of the string, and is copied into a newly allocated,
 * NUL-terminated buffer of length + 1 bytes.
 */
2206 static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr)
2211 if (lmd->lmd_mgssec != NULL) {
2212 OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1);
2213 lmd->lmd_mgssec = NULL;
2216 tail = strchr(ptr, ',');
2218 length = strlen(ptr);
2220 length = tail - ptr;
2222 OBD_ALLOC(lmd->lmd_mgssec, length + 1);
2223 if (lmd->lmd_mgssec == NULL)
2226 memcpy(lmd->lmd_mgssec, ptr, length);
2227 lmd->lmd_mgssec[length] = '\0';
/*
 * Generic "key=value" helper: duplicate the value starting at @ptr into
 * *@handle.
 *
 * Both arguments are checked for NULL. A string already stored in
 * *@handle is freed with strlen() + 1 as its size. The value ends at the
 * next ',' or at the end of the string; it is copied into a new buffer of
 * length + 1 bytes and NUL-terminated.
 */
2231 static int lmd_parse_string(char **handle, char *ptr)
2236 if ((handle == NULL) || (ptr == NULL))
2239 if (*handle != NULL) {
2240 OBD_FREE(*handle, strlen(*handle) + 1);
2244 tail = strchr(ptr, ',');
2246 length = strlen(ptr);
2248 length = tail - ptr;
2250 OBD_ALLOC(*handle, length + 1);
2251 if (*handle == NULL)
2254 memcpy(*handle, ptr, length);
2255 (*handle)[length] = '\0';
2260 /* Collect multiple values for mgsnid specifiers */
/*
 * Append the NID list starting at *@ptr to lmd->lmd_mgs.
 *
 * The end of the list is found by calling class_parse_nid() repeatedly
 * until it stops returning 0. When a list is already stored, a new
 * buffer holding "<old>:<new>" is built: oldlen counts the old string
 * including its terminator, and that terminator slot is overwritten
 * with ':' before the new list is copied in behind it. The old buffer is
 * then freed and lmd_mgs points at the combined string.
 */
2261 static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr)
2269 /* Find end of nidlist */
2270 while (class_parse_nid(tail, &nid, &tail) == 0) {}
2271 length = tail - *ptr;
2273 LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", *ptr);
2277 if (lmd->lmd_mgs != NULL)
2278 oldlen = strlen(lmd->lmd_mgs) + 1;
2280 OBD_ALLOC(mgsnid, oldlen + length + 1);
2284 if (lmd->lmd_mgs != NULL) {
2285 /* Multiple mgsnid= are taken to mean failover locations */
2286 memcpy(mgsnid, lmd->lmd_mgs, oldlen);
2287 mgsnid[oldlen - 1] = ':';
2288 OBD_FREE(lmd->lmd_mgs, oldlen);
2290 memcpy(mgsnid + oldlen, *ptr, length);
2291 mgsnid[oldlen + length] = '\0';
2292 lmd->lmd_mgs = mgsnid;
/*
 * Turn the option string passed by mount.lustre into @lmd.
 *
 * A missing option string, or a buffer that carries the old binary LMD
 * magic, is reported on the console. The string is then scanned option
 * by option; Lustre's own options set lmd_flags bits or fill lmd fields
 * (recovery times are raised to at least OBD_RECOVERY_TIME_MIN), and
 * "device=" supplies the device name. A device name containing ":/"
 * marks a client mount, whose profile becomes "<fsname>-client". The
 * device name and whatever is left of the option string are duplicated
 * into lmd_dev and lmd_opts.
 *
 * NOTE(review): options are recognised with strncmp() on a prefix only,
 * so an option that merely starts with a known keyword (for example
 * "noir...") matches as well - confirm that this is acceptable.
 */
2298 /** Parse mount line options
2299 * e.g. mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre
2300 * dev is passed as device=uml1:/lustre by mount.lustre
2302 static int lmd_parse(char *options, struct lustre_mount_data *lmd)
2304 char *s1, *s2, *devname = NULL;
2305 struct lustre_mount_data *raw = (struct lustre_mount_data *)options;
2311 LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that "
2312 "/sbin/mount.lustre is installed.\n");
2316 /* Options should be a string - try to detect old lmd data */
2317 if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) {
2318 LCONSOLE_ERROR_MSG(0x163, "You're using an old version of "
2319 "/sbin/mount.lustre. Please install "
2320 "version %s\n", LUSTRE_VERSION_STRING);
2323 lmd->lmd_magic = LMD_MAGIC;
2325 /* Set default flags here */
2330 int time_min = OBD_RECOVERY_TIME_MIN;
2332 /* Skip whitespace and extra commas */
2333 while (*s1 == ' ' || *s1 == ',')
2336 /* Client options are parsed in ll_options: eg. flock,
2339 /* Parse non-ldiskfs options here. Rather than modifying
2340 ldiskfs, we just zero these out here */
2341 if (strncmp(s1, "abort_recov", 11) == 0) {
2342 lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
2344 } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) {
2345 lmd->lmd_recovery_time_soft = max_t(int,
2346 simple_strtoul(s1 + 19, NULL, 10), time_min);
2348 } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) {
2349 lmd->lmd_recovery_time_hard = max_t(int,
2350 simple_strtoul(s1 + 19, NULL, 10), time_min);
2352 } else if (strncmp(s1, "noir", 4) == 0) {
2353 lmd->lmd_flags |= LMD_FLG_NOIR; /* test purpose only. */
2355 } else if (strncmp(s1, "nosvc", 5) == 0) {
2356 lmd->lmd_flags |= LMD_FLG_NOSVC;
2358 } else if (strncmp(s1, "nomgs", 5) == 0) {
2359 lmd->lmd_flags |= LMD_FLG_NOMGS;
2361 } else if (strncmp(s1, "noscrub", 7) == 0) {
2362 lmd->lmd_flags |= LMD_FLG_NOSCRUB;
2364 } else if (strncmp(s1, PARAM_MGSNODE,
2365 sizeof(PARAM_MGSNODE) - 1) == 0) {
2366 s2 = s1 + sizeof(PARAM_MGSNODE) - 1;
2367 /* Assume the next mount opt is the first
2368 invalid nid we get to. */
2369 rc = lmd_parse_mgs(lmd, &s2);
2373 } else if (strncmp(s1, "writeconf", 9) == 0) {
2374 lmd->lmd_flags |= LMD_FLG_WRITECONF;
2376 } else if (strncmp(s1, "mgssec=", 7) == 0) {
2377 rc = lmd_parse_mgssec(lmd, s1 + 7);
2381 /* ost exclusion list */
2382 } else if (strncmp(s1, "exclude=", 8) == 0) {
/* s1 + 7 is the '=' itself: lmd_make_exclusion() expects to start there */
2383 rc = lmd_make_exclusion(lmd, s1 + 7);
2387 } else if (strncmp(s1, "svname=", 7) == 0) {
2388 rc = lmd_parse_string(&lmd->lmd_profile, s1 + 7);
2392 } else if (strncmp(s1, "osd=", 4) == 0) {
2393 rc = lmd_parse_string(&lmd->lmd_osd_type, s1 + 4);
2396 /* with ldiskfs we're still doing ldd parsing
2397 * in the kernel space */
2398 if (!strcmp(lmd->lmd_osd_type, "osd-ldiskfs")) {
2399 OBD_FREE(lmd->lmd_osd_type,
2400 strlen(lmd->lmd_osd_type) + 1);
2401 lmd->lmd_osd_type = NULL;
2405 /* Linux 2.4 doesn't pass the device, so we stuck it at the
2406 end of the options. */
2407 else if (strncmp(s1, "device=", 7) == 0) {
2409 /* terminate options right before device. device
2410 must be the last one. */
2416 s2 = strchr(s1, ',');
/* memmove because source and destination overlap within options */
2424 memmove(s1, s2, strlen(s2) + 1);
2430 LCONSOLE_ERROR_MSG(0x164, "Can't find the device name "
2431 "(need mount option 'device=...')\n");
2435 s1 = strstr(devname, ":/");
2438 lmd->lmd_flags |= LMD_FLG_CLIENT;
2439 /* Remove leading /s from fsname */
2440 while (*++s1 == '/') ;
2441 /* Freed in lustre_free_lsi */
/* + 8 leaves room for the "-client" suffix and the terminating NUL */
2442 OBD_ALLOC(lmd->lmd_profile, strlen(s1) + 8);
2443 if (!lmd->lmd_profile)
2445 sprintf(lmd->lmd_profile, "%s-client", s1);
2448 /* Freed in lustre_free_lsi */
2449 OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1);
2452 strcpy(lmd->lmd_dev, devname);
2454 /* Save mount options */
2455 s1 = options + strlen(options) - 1;
2456 while (s1 >= options && (*s1 == ',' || *s1 == ' '))
2458 if (*options != 0) {
2459 /* Freed in lustre_free_lsi */
2460 OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1);
2463 strcpy(lmd->lmd_opts, options);
2467 lmd->lmd_magic = LMD_MAGIC;
2472 CERROR("Bad mount options %s\n", options);
/*
 * Argument bundle handed to lustre_fill_super() through the VFS "data"
 * pointer: the raw mount option data (lmd2_data) plus the vfsmount of
 * the mount being set up (NULL when the ->mount entry point is used).
 */
2476 struct lustre_mount_data2 {
2478 struct vfsmount *lmd2_mnt;
/*
 * fill_super callback shared by client and server mounts.
 *
 * @data is a struct lustre_mount_data2. The function creates the lsi,
 * waits for obd zombies from a previous mount (obd_zombie_barrier),
 * parses the option string into the lmd and then branches:
 *  - client: requires a registered client_fill_super hook, starts the
 *    MGC and calls the hook with the vfsmount;
 *  - server: sets LSI_SERVER and calls server_fill_super().
 *
 * As the comments near the end of the function state, both fill_super
 * paths clean up after themselves on failure, so the lsi is not put
 * here. The final error message therefore checks s2lsi(sb) before it
 * touches lmd.
 */
2481 /** This is the entry point for the mount call into Lustre.
2482 * This is called when a server or client is mounted,
2483 * and this is where we start setting things up.
2484 * @param data Mount options (e.g. -o flock,abort_recov)
2486 int lustre_fill_super(struct super_block *sb, void *data, int silent)
2488 struct lustre_mount_data *lmd;
2489 struct lustre_mount_data2 *lmd2 = data;
2490 struct lustre_sb_info *lsi;
2494 CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb);
2496 lsi = lustre_init_lsi(sb);
2502 * Disable lockdep during mount, because mount locking patterns are
2508 * LU-639: the obd cleanup of last mount may not finish yet, wait here.
2510 obd_zombie_barrier();
2512 /* Figure out the lmd from the mount options */
2513 if (lmd_parse((char *)(lmd2->lmd2_data), lmd)) {
2515 GOTO(out, rc = -EINVAL);
2518 if (lmd_is_client(lmd)) {
2519 CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile);
2520 if (!client_fill_super) {
2521 LCONSOLE_ERROR_MSG(0x165, "Nothing registered for "
2522 "client mount! Is the 'lustre' "
2523 "module loaded?\n");
2527 rc = lustre_start_mgc(sb);
2532 /* Connect and start */
2533 /* (should always be ll_fill_super) */
2534 rc = (*client_fill_super)(sb, lmd2->lmd2_mnt);
2535 /* c_f_s will call lustre_common_put_super on failure */
2538 CDEBUG(D_MOUNT, "Mounting server from %s\n", lmd->lmd_dev);
2539 lsi->lsi_flags |= LSI_SERVER;
2540 rc = server_fill_super(sb);
2541 /* s_f_s calls lustre_start_mgc after the mount because we need
2542 the MGS nids which are stored on disk. Plus, we may
2543 need to start the MGS first. */
2544 /* s_f_s will call server_put_super on failure */
2547 /* If error happens in fill_super() call, @lsi will be killed there.
2548 * This is why we do not put it here. */
2552 CERROR("Unable to mount %s (%d)\n",
2553 s2lsi(sb) ? lmd->lmd_dev : "", rc);
2555 CDEBUG(D_SUPER, "Mount %s complete\n",
2563 /* We can't call ll_fill_super by name because it lives in a module that
2564 must be loaded after this one. */
/*
 * Record @cfs in the file-scope client_fill_super hook, which
 * lustre_fill_super() invokes for client mounts.
 */
2565 void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb,
2566 struct vfsmount *mnt))
2568 client_fill_super = cfs;
2570 EXPORT_SYMBOL(lustre_register_client_fill_super);
/*
 * Record @cfs in the file-scope kill_super_cb hook, which
 * lustre_kill_super() invokes for non-server superblocks.
 */
2572 void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb))
2574 kill_super_cb = cfs;
2576 EXPORT_SYMBOL(lustre_register_kill_super_cb);
2578 /***************** FS registration ******************/
/*
 * Mount entry points; which one is built depends on HAVE_FSTYPE_MOUNT.
 * Both wrap the raw mount data in a struct lustre_mount_data2 and let the
 * kernel's nodev helper call lustre_fill_super() with it.
 */
2579 #ifdef HAVE_FSTYPE_MOUNT
/* ->mount variant: no vfsmount is available, so lmd2_mnt is NULL */
2580 struct dentry *lustre_mount(struct file_system_type *fs_type, int flags,
2581 const char *devname, void *data)
2583 struct lustre_mount_data2 lmd2 = { data, NULL };
2585 return mount_nodev(fs_type, flags, &lmd2, lustre_fill_super);
/* ->get_sb variant: the caller's vfsmount is passed along in lmd2_mnt */
2588 int lustre_get_sb(struct file_system_type *fs_type, int flags,
2589 const char *devname, void * data, struct vfsmount *mnt)
2591 struct lustre_mount_data2 lmd2 = { data, mnt };
2593 return get_sb_nodev(fs_type, flags, &lmd2, lustre_fill_super, mnt);
/*
 * .kill_sb handler. When a callback is registered and the superblock
 * carries an lsi that is not marked LSI_SERVER, the callback runs first;
 * the superblock is then released with kill_anon_super().
 */
2597 void lustre_kill_super(struct super_block *sb)
2599 struct lustre_sb_info *lsi = s2lsi(sb);
2601 if (kill_super_cb && lsi && !(lsi->lsi_flags & LSI_SERVER))
2602 (*kill_super_cb)(sb);
2604 kill_anon_super(sb);
/*
 * Filesystem type descriptor handed to register_filesystem() and
 * unregister_filesystem(). The mount entry point is .mount or .get_sb
 * depending on HAVE_FSTYPE_MOUNT; teardown goes through
 * lustre_kill_super().
 */
2607 /** Register the "lustre" fs type
2609 struct file_system_type lustre_fs_type = {
2610 .owner = THIS_MODULE,
2612 #ifdef HAVE_FSTYPE_MOUNT
2613 .mount = lustre_mount,
2615 .get_sb = lustre_get_sb,
2617 .kill_sb = lustre_kill_super,
2618 .fs_flags = FS_BINARY_MOUNTDATA | FS_REQUIRES_DEV |
2619 #ifdef FS_HAS_FIEMAP
2622 LL_RENAME_DOES_D_MOVE,
/* Register lustre_fs_type with the VFS; returns register_filesystem()'s
 * result unchanged. */
2625 int lustre_register_fs(void)
2627 return register_filesystem(&lustre_fs_type);
2630 int lustre_unregister_fs(void)
2632 return unregister_filesystem(&lustre_fs_type);