1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * lustre/lvfs/lvfs_mount.c
5 * Client/server mount routines
7 * Copyright (c) 2005 Cluster File Systems, Inc.
8 * Author: Nathan Rutman <nathan@clusterfs.com>
10 * This file is part of Lustre, http://www.lustre.org/
12 * Lustre is free software; you can redistribute it and/or
13 * modify it under the terms of version 2 of the GNU General Public
14 * License as published by the Free Software Foundation.
16 * Lustre is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with Lustre; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 #define DEBUG_SUBSYSTEM S_MGMT
/* Debug mask used by the mount code in this file.  The expansion is
   parenthesised so that the mask stays a single operand wherever the
   macro is substituted into a larger expression. */
28 #define D_MOUNT (D_SUPER|D_CONFIG|D_ERROR)
30 #include <linux/obd.h>
31 #include <linux/lvfs.h>
32 #include <linux/lustre_fsfilt.h>
33 //#include <linux/lustre_mgs.h>
34 #include <linux/obd_class.h>
35 #include <lustre/lustre_user.h>
36 #include <linux/version.h>
37 #include <linux/lustre_log.h>
38 #include <linux/lustre_disk.h>
/* Client superblock setup hook.  NULL until a client module installs one
   through lustre_register_client_fill_super(); lustre_fill_super() checks
   it for NULL before calling through it. */
40 static int (*client_fill_super)(struct super_block *sb) = NULL;
42 /*********** mount lookup *********/
/* Registry of server mounts.  Entries are struct lustre_mount_info linked
   through lmi_list_chain; the register/deregister/get/put helpers in this
   file bracket their list accesses with down()/up() on
   lustre_mount_info_lock. */
43 DECLARE_MUTEX(lustre_mount_info_lock);
44 struct list_head lustre_mount_info_list = LIST_HEAD_INIT(lustre_mount_info_list);
/* Look up a registered mount by name: walks lustre_mount_info_list and
   compares name with each entry's lmi_name using strcmp().  No locking is
   done here; the register/deregister/get/put helpers in this file take
   lustre_mount_info_lock before calling.
   NOTE(review): the tail of this function is not visible in this listing;
   callers test the result as a pointer that is NULL/false when nothing
   matched. */
46 static struct lustre_mount_info *server_find_mount(char *name)
48 struct list_head *tmp;
49 struct lustre_mount_info *lmi;
51 list_for_each(tmp, &lustre_mount_info_list) {
52 lmi = list_entry(tmp, struct lustre_mount_info, lmi_list_chain);
53 if (strcmp(name, lmi->lmi_name) == 0) {
63 /* we must register an obd for a mount before we call the setup routine.
64 *_setup will call lustre_get_mount to get the mnt struct
65 by obd_name, since we can't pass the pointer to setup. */
/* Adds an entry for name to lustre_mount_info_list.  The entry and a
   private copy of name are allocated before the lock is taken; if the name
   turns out to be registered already, both are freed again and an error is
   logged.
   NOTE(review): the lines that store sb/mnt in the entry and the return
   statements are not visible in this listing. */
66 static int server_register_mount(char *name, struct super_block *sb,
69 struct lustre_mount_info *lmi;
74 OBD_ALLOC(lmi, sizeof(*lmi));
/* private copy of the name; the deregister paths free it with the same
   strlen() + 1 size */
77 OBD_ALLOC(name_cp, strlen(name) + 1);
/* presumably the failure path of the name_cp allocation - the guarding
   condition is not visible here */
79 OBD_FREE(lmi, sizeof(*lmi));
82 strcpy(name_cp, name);
84 down(&lustre_mount_info_lock);
/* duplicate registration: drop the lock and undo both allocations */
86 if (server_find_mount(name)) {
87 up(&lustre_mount_info_lock);
88 OBD_FREE(lmi, sizeof(*lmi));
89 OBD_FREE(name_cp, strlen(name) + 1);
90 CERROR("Already registered %s\n", name);
93 lmi->lmi_name = name_cp;
96 list_add(&lmi->lmi_list_chain, &lustre_mount_info_list);
98 up(&lustre_mount_info_lock);
102 /* when an obd no longer needs a mount */
/* Removes the entry registered under name.  With lustre_mount_info_lock
   held it looks the entry up; the error branch releases the lock and logs
   "not registered", the normal branch frees the stored name copy, unlinks
   the entry from the list, frees it and releases the lock.
   NOTE(review): the condition guarding the error branch and the return
   values are not visible in this listing. */
103 static int server_deregister_mount(char *name)
105 struct lustre_mount_info *lmi;
107 down(&lustre_mount_info_lock);
108 lmi = server_find_mount(name);
110 up(&lustre_mount_info_lock);
111 CERROR("%s not registered\n", name);
/* size mirrors the strlen(name) + 1 allocation made at registration */
114 OBD_FREE(lmi->lmi_name, strlen(lmi->lmi_name) + 1);
115 list_del(&lmi->lmi_list_chain);
116 OBD_FREE(lmi, sizeof(*lmi));
117 up(&lustre_mount_info_lock);
121 /* Deregister anyone referencing the mnt. Everyone should have
122 put_mount in *_cleanup, but this is a catch-all in case of err... */
/* Scans the whole registry under lustre_mount_info_lock and removes every
   entry whose lmi_mnt equals mnt, logging each one.  Only the pointer
   value of mnt is compared; mnt is not dereferenced in the visible lines. */
123 static void server_deregister_mount_all(struct vfsmount *mnt)
125 struct list_head *tmp, *n;
126 struct lustre_mount_info *lmi;
131 down(&lustre_mount_info_lock);
/* the _safe iterator is needed because entries are unlinked and freed
   inside the loop */
132 list_for_each_safe(tmp, n, &lustre_mount_info_list) {
133 lmi = list_entry(tmp, struct lustre_mount_info, lmi_list_chain);
134 if (lmi->lmi_mnt == mnt) {
135 CERROR("Deregister failsafe %s\n", lmi->lmi_name);
136 OBD_FREE(lmi->lmi_name, strlen(lmi->lmi_name) + 1);
137 list_del(&lmi->lmi_list_chain);
138 OBD_FREE(lmi, sizeof(*lmi));
141 up(&lustre_mount_info_lock);
144 /* obd's look up a registered mount using their name. This is just
145 for initial obd setup to find the mount struct. It should not be
146 called every time you want to mntget. */
/* Finds the registry entry for name under lustre_mount_info_lock.  The
   error branch logs "Can't find mount"; otherwise a vfsmount reference is
   taken with mntget() and the superblock's lsi_mounts counter is
   incremented before the lock is released.  server_put_mount() performs
   the matching put and decrement.
   NOTE(review): the return statements are not visible in this listing. */
147 struct lustre_mount_info *server_get_mount(char *name)
149 struct lustre_mount_info *lmi;
150 struct lustre_sb_info *lsi;
152 down(&lustre_mount_info_lock);
154 lmi = server_find_mount(name);
156 up(&lustre_mount_info_lock);
157 CERROR("Can't find mount for %s\n", name);
160 lsi = s2lsi(lmi->lmi_sb);
161 mntget(lmi->lmi_mnt);
162 atomic_inc(&lsi->lsi_mounts);
164 up(&lustre_mount_info_lock);
166 CDEBUG(D_MOUNT, "get_mnt %p from %s, vfscount=%d\n",
167 lmi->lmi_mnt, name, atomic_read(&lmi->lmi_mnt->mnt_count));
/* Helper used wherever this file drops a vfsmount reference.  Only the
   kernel_locked() test is visible in this listing.
   NOTE(review): presumably the big kernel lock is released around the
   mntput() when it is held - confirm against the full source. */
172 static void unlock_mntput(struct vfsmount *mnt)
174 if (kernel_locked()) {
183 /* to be called from obd_cleanup methods */
/* Releases the reference taken by server_get_mount() for name.  The entry
   is looked up under lustre_mount_info_lock, mnt is asserted to be the
   registered vfsmount, the reference is dropped through unlock_mntput()
   and lsi_mounts is decremented; when the counter reaches zero the current
   mnt_count is logged and a value above 2 is reported as "mount busy".
   After the lock is released the name is removed from the registry.
   NOTE(review): mnt_count is read through lmi->lmi_mnt after the put;
   this relies on some other reference keeping the vfsmount alive. */
184 int server_put_mount(char *name, struct vfsmount *mnt)
186 struct lustre_mount_info *lmi;
187 struct lustre_sb_info *lsi;
189 down(&lustre_mount_info_lock);
190 lmi = server_find_mount(name);
192 up(&lustre_mount_info_lock);
193 CERROR("Can't find mount for %s\n", name);
197 CDEBUG(D_MOUNT, "put_mnt %p from %s, vfscount=%d\n",
198 lmi->lmi_mnt, name, atomic_read(&lmi->lmi_mnt->mnt_count));
200 lsi = s2lsi(lmi->lmi_sb);
201 LASSERT(lmi->lmi_mnt == mnt);
202 unlock_mntput(lmi->lmi_mnt);
203 if (atomic_dec_and_test(&lsi->lsi_mounts)) {
204 CDEBUG(D_MOUNT, "Last put of mnt %p from %s, mount count %d\n",
206 atomic_read(&lmi->lmi_mnt->mnt_count));
207 /* 2 seems normal on mds, (may_umount() also expects 2
208 fwiw), but we only see 1 at this point in obdfilter. */
209 if (atomic_read(&lmi->lmi_mnt->mnt_count) > 2)
210 CERROR("%s: mount busy, mnt_count %d != 2\n", name,
211 atomic_read(&lmi->lmi_mnt->mnt_count));
213 up(&lustre_mount_info_lock);
215 /* this obd should never need the mount again */
/* server_deregister_mount() takes the registry lock itself, so it is
   called only after the up() above */
216 server_deregister_mount(name);
222 /******* mount helper utilities *********/
/* Debug helper: writes the fields of a lustre_disk_data to the D_MOUNT
   debug log - config version, fs and server names, flags, backing fs type
   (via MT_STR), mount options, and either "no MGS nids" or one line per
   entry of ldd_mgsnid[0 .. ldd_mgsnid_count-1]. */
224 static void ldd_print(struct lustre_disk_data *ldd)
228 CDEBUG(D_MOUNT, "disk data\n");
229 CDEBUG(D_MOUNT, "config: %d\n", ldd->ldd_config_ver);
230 CDEBUG(D_MOUNT, "fs: %s\n", ldd->ldd_fsname);
231 CDEBUG(D_MOUNT, "server: %s\n", ldd->ldd_svname);
232 CDEBUG(D_MOUNT, "flags: %#x\n", ldd->ldd_flags);
233 CDEBUG(D_MOUNT, "diskfs: %s\n", MT_STR(ldd));
234 CDEBUG(D_MOUNT, "options: %s\n", ldd->ldd_mount_opts);
235 if (!ldd->ldd_mgsnid_count)
236 CDEBUG(D_MOUNT, "no MGS nids\n");
237 else for (i = 0; i < ldd->ldd_mgsnid_count; i++) {
238 CDEBUG(D_MOUNT, "nid %d: %s\n", i,
239 libcfs_nid2str(ldd->ldd_mgsnid[i]));
/* Reads MOUNT_DATA_FILE into *ldd.  Runs inside mount_ctxt
   (push_ctxt/pop_ctxt), opens the file read-only, and rejects it with
   -EINVAL when the file size differs from sizeof(*ldd), when the read
   check fails, or when ldd_magic is not LDD_MAGIC.
   NOTE(review): the open-failure test, the read-length test, the close and
   the return statement are not visible in this listing. */
243 static int ldd_parse(struct lvfs_run_ctxt *mount_ctxt,
244 struct lustre_disk_data *ldd)
246 struct lvfs_run_ctxt saved;
252 push_ctxt(&saved, mount_ctxt, NULL);
254 file = filp_open(MOUNT_DATA_FILE, O_RDONLY, 0644);
257 CERROR("cannot open %s: err = %d\n", MOUNT_DATA_FILE, err);
/* the on-disk file must be exactly one struct lustre_disk_data */
261 len = file->f_dentry->d_inode->i_size;
262 CDEBUG(D_MOUNT, "Have %s, size %lu\n", MOUNT_DATA_FILE, len);
263 if (len != sizeof(*ldd)) {
264 CERROR("disk data size does not match: see %lu expect %u\n",
266 GOTO(out_close, err = -EINVAL);
269 err = lustre_fread(file, ldd, len, &off);
271 CERROR("error reading %s: read %d of %lu\n",
272 MOUNT_DATA_FILE, err, len);
273 GOTO(out_close, err = -EINVAL);
276 if (ldd->ldd_magic != LDD_MAGIC) {
277 CERROR("Bad magic in %s: %x!=%x\n", MOUNT_DATA_FILE,
278 ldd->ldd_magic, LDD_MAGIC);
279 GOTO(out_close, err = -EINVAL);
288 pop_ctxt(&saved, mount_ctxt, NULL);
292 static int ldd_write(struct lvfs_run_ctxt *mount_ctxt,
293 struct lustre_disk_data *ldd)
295 struct lvfs_run_ctxt saved;
298 unsigned long len = sizeof(struct lustre_disk_data);
301 LASSERT(ldd->ldd_magic != LDD_MAGIC);
303 push_ctxt(&saved, mount_ctxt, NULL);
305 file = filp_open(MOUNT_DATA_FILE, O_RDWR, 0644);
308 CERROR("cannot open %s: err = %d\n", MOUNT_DATA_FILE, err);
312 err = lustre_fwrite(file, ldd, len, &off);
314 CERROR("error writing %s: read %d of %lu\n",
315 MOUNT_DATA_FILE, err, len);
316 GOTO(out_close, err = -EINVAL);
325 pop_ctxt(&saved, mount_ctxt, NULL);
/* Reads the server data record at the start of the LAST_RCVD file of a
   mounted obd.  On the success path the stored uuid is copied into uuid
   and *first_mount is set to 1 when lsd_mount_count is 0, else 0.  The
   temporary lr_server_data buffer is freed before returning.
   NOTE(review): uuid is filled with an unbounded strcpy(); the caller must
   supply a buffer at least as large as lsd_uuid.  The error tests and the
   return statement are not visible in this listing. */
330 int parse_last_rcvd(struct obd_device *obd, char *uuid, int *first_mount)
332 struct lvfs_run_ctxt saved;
334 struct lr_server_data *lsd;
338 OBD_ALLOC_WAIT(lsd, sizeof(*lsd));
342 /* requires a mounted device */
345 push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
347 /* open and test the last rcvd file */
348 file = filp_open(LAST_RCVD, O_RDONLY, 0644);
351 CERROR("cannot open %s file: err = %d\n", LAST_RCVD, err);
355 CDEBUG(D_MOUNT, "Have last_rcvd, size %lu\n",
356 (unsigned long)file->f_dentry->d_inode->i_size);
357 err = fsfilt_read_record(obd, file, lsd, sizeof(*lsd), &off);
359 CERROR("error reading %s: err %d\n", LAST_RCVD, err);
363 strcpy(uuid, lsd->lsd_uuid);
/* a mount count of zero marks a target that has never been mounted */
364 *first_mount = (lsd->lsd_mount_count == 0);
365 CDEBUG(D_MOUNT, "UUID from %s: %s, init=%d\n",
366 LAST_RCVD, uuid, *first_mount);
371 pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
372 OBD_FREE(lsd, sizeof(*lsd));
377 /* Get a config log from the MGS and process it.
378 This func is called for both clients and servers. */
/* Uses the mgc stored in the superblock info (lsi_mgc).  Steps visible
   here: fetch the replicator and originator config llog contexts from the
   mgc, set "init_recov_bk" on the mgc's self export, connect to the mgc,
   dump and then parse the log named logname through the replicator
   context, and - on the failure path - parse the same log name through the
   local originator context instead.
   NOTE(review): the tests guarding each error message, the disconnect and
   the return statement are not visible in this listing. */
379 int lustre_get_process_log(struct super_block *sb, char *logname,
380 struct config_llog_instance *cfg)
382 struct lustre_sb_info *lsi = s2lsi(sb);
383 struct obd_device *mgc = lsi->lsi_mgc;
384 struct lustre_handle mgc_conn = {0, };
385 struct obd_export *exp = NULL;
386 struct llog_ctxt *rctxt, *lctxt;
391 CDEBUG(D_MOUNT, "parsing config log %s\n", logname);
/* rctxt: log as served by the MGS; lctxt: copy kept on the local disk */
393 rctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT);
394 lctxt = llog_get_context(mgc, LLOG_CONFIG_ORIG_CTXT);
395 if (!lctxt || !rctxt) {
396 CERROR("missing llog context\n");
400 /* FIXME set up local llog originator with mgc_fs_setup
401 could use ioctl (can't call directly because of layering). */
403 /* Don't retry if connect fails */
404 rc = obd_set_info(mgc->obd_self_export,
405 strlen("init_recov_bk"), "init_recov_bk",
406 sizeof(recov_bk), &recov_bk);
408 CERROR("can't set init_recov_bk %d\n", rc);
412 rc = obd_connect(&mgc_conn, mgc, &(mgc->obd_uuid), NULL);
414 CERROR("connect failed %d\n", rc);
417 exp = class_conn2export(&mgc_conn);
418 LASSERT(exp->exp_obd == mgc);
420 //FIXME Copy the mgs remote log to the local disk
423 /* For debugging, it's useful to just dump the log */
424 class_config_dump_llog(rctxt, logname, cfg);
426 rc = class_config_parse_llog(rctxt, logname, cfg);
429 LCONSOLE_ERROR("%s: The configuration '%s' could not be read "
430 "from the MGS (%d). Trying local log.\n",
431 mgc->obd_name, logname, rc);
432 /* If we couldn't connect to the MGS, try reading a copy
433 of the config log stored locally on disk */
434 rc = class_config_parse_llog(lctxt, logname, cfg);
436 LCONSOLE_ERROR("%s: Can't read the local config (%d)\n",
442 //FIXME cleanup local originator with mgc_fs_cleanup
/* Builds one lustre_cfg command and runs it.  cfgname goes into buffer 0
   (lustre_cfg_bufs_reset), s1..s4 into buffers 1..4, nid into lcfg_nid;
   the record is passed to class_process_config() and then freed.
   Callers in this file pass 0 for unused string arguments.
   NOTE(review): no check of the lustre_cfg_new() result is visible before
   lcfg->lcfg_nid is written - confirm whether it can return NULL.  The
   tests guarding each lustre_cfg_bufs_set_string() call are not visible in
   this listing. */
446 static int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
447 char *s1, char *s2, char *s3, char *s4)
449 struct lustre_cfg_bufs bufs;
450 struct lustre_cfg * lcfg = NULL;
453 CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname,
454 cmd, s1, s2, s3, s4);
456 lustre_cfg_bufs_reset(&bufs, cfgname);
458 lustre_cfg_bufs_set_string(&bufs, 1, s1);
460 lustre_cfg_bufs_set_string(&bufs, 2, s2);
462 lustre_cfg_bufs_set_string(&bufs, 3, s3);
464 lustre_cfg_bufs_set_string(&bufs, 4, s4);
466 lcfg = lustre_cfg_new(cmd, &bufs);
467 lcfg->lcfg_nid = nid;
468 err = class_process_config(lcfg);
469 lustre_cfg_free(lcfg);
/* Starts an obd in two config steps: LCFG_ATTACH with the given type
   (obdname doubles as the uuid), then LCFG_SETUP with s1 and s2 as setup
   arguments.  On the setup-error path the obd is detached again.
   NOTE(review): the tests guarding the error messages and the return
   statements are not visible in this listing. */
473 static int lustre_start_simple(char *obdname, char *type, char *s1, char *s2)
476 CDEBUG(D_MOUNT, "Starting obd %s\n", obdname);
478 err = do_lcfg(obdname, 0, LCFG_ATTACH, type, obdname/*uuid*/, 0, 0);
480 CERROR("%s attach error %d\n", obdname, err);
483 err = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, 0, 0);
485 CERROR("%s setup error %d\n", obdname, err);
/* undo the attach; the detach result is not examined */
486 do_lcfg(obdname, 0, LCFG_DETACH, 0, 0, 0, 0);
491 /* Set up a MGS to serve startup logs */
/* Starts the obd named "MGS" on this superblock's server mount.  A
   registry lookup for "MGS" comes first and the "already started" branch
   names the server that owns it.  Otherwise the mount is registered under
   "MGS" and the obd is started with lustre_start_simple(); the mount is
   deregistered again on the failure path and a console error is printed.
   NOTE(review): server_find_mount() is called here with no down() on
   lustre_mount_info_lock visible, unlike the other callers.  The
   conditions and return statements are not visible in this listing. */
492 static int server_start_mgs(struct super_block *sb)
494 struct lustre_sb_info *lsi = s2lsi(sb);
495 struct vfsmount *mnt = lsi->lsi_srv_mnt;
496 struct lustre_mount_info *lmi;
497 char mgsname[] = "MGS";
501 /* It is impossible to have more than 1 MGS per node, since
502 MGC wouldn't know which to connect to */
503 lmi = server_find_mount(mgsname);
/* lsi is re-pointed at the superblock that already owns the MGS so the
   message can name that server */
505 lsi = s2lsi(lmi->lmi_sb);
506 LCONSOLE_ERROR("The MGS service was already started from "
507 "server %s\n", lsi->lsi_ldd->ldd_svname);
511 CDEBUG(D_CONFIG, "Start MGS service %s\n", mgsname);
513 err = server_register_mount(mgsname, sb, mnt);
516 ((err = lustre_start_simple(mgsname, LUSTRE_MGS_NAME, 0, 0))))
517 server_deregister_mount(mgsname);
520 LCONSOLE_ERROR("Failed to start MGS %s (%d). Is the 'mgs' "
521 "module loaded?\n", mgsname, err);
/* Stops the obd named "MGS": looks it up by name, logs "not running" on
   the branch where the lookup fails, and otherwise hands it to
   class_manual_cleanup().  sb is not used in the visible lines. */
526 static void server_stop_mgs(struct super_block *sb)
528 struct obd_device *obd;
529 char mgsname[] = "MGS";
531 CDEBUG(D_MOUNT, "Stop MGS service %s\n", mgsname);
533 obd = class_name2obd(mgsname);
535 CDEBUG(D_CONFIG, "mgs %s not running\n", mgsname);
539 class_manual_cleanup(obd);
542 /* Set up a mgcobd to process startup logs */
/* Starts, or takes another reference on, the obd named "MGC".  If the obd
   already exists its cl_mgc_refcount is incremented.  Otherwise the mount
   data must carry at least one MGS nid: the first nid is added as a uuid,
   the MGC is started with "MGS" and that nid string as setup arguments,
   and each remaining nid is added as a uuid plus an extra connection.
   Finally the obd is looked up by name again.
   NOTE(review): atomic_set(&obd->u.cli.cl_mgc_refcount, 1) appears before
   that final lookup, and the lines between it and the first lookup are not
   visible - confirm obd is valid at that point.  The statement that stores
   the obd in the superblock info, the error tests and the return
   statements are also not visible in this listing. */
543 static int lustre_start_mgc(struct super_block *sb)
545 struct lustre_sb_info *lsi = s2lsi(sb);
546 struct obd_device *obd;
547 char mgcname[] = "MGC";
550 LASSERT(lsi->lsi_lmd);
552 obd = class_name2obd(mgcname);
554 atomic_inc(&obd->u.cli.cl_mgc_refcount);
555 /* FIXME But now do we add uuids or not? If there's truly
556 one MGC per site, should all be the same...
561 atomic_set(&obd->u.cli.cl_mgc_refcount, 1);
563 if (lsi->lsi_lmd->lmd_mgsnid_count == 0) {
564 LCONSOLE_ERROR("No NIDs for the MGS were given.\n");
568 CDEBUG(D_MOUNT, "Start MGC %s\n", mgcname);
570 /* Add the first uuid for the MGS */
571 nid = lsi->lsi_lmd->lmd_mgsnid[0];
572 err = do_lcfg(mgcname, nid, LCFG_ADD_UUID, libcfs_nid2str(nid), 0,0,0);
577 if ((err = lustre_start_simple(mgcname, LUSTRE_MGC_NAME, "MGS",
578 libcfs_nid2str(nid))))
581 /* Add the redundant MGS nids */
582 for (i = 1; i < lsi->lsi_lmd->lmd_mgsnid_count; i++) {
583 nid = lsi->lsi_lmd->lmd_mgsnid[i];
584 err = do_lcfg(mgcname, nid, LCFG_ADD_UUID, libcfs_nid2str(nid),
587 CERROR("Add uuid for %s failed %d\n",
588 libcfs_nid2str(nid), err);
591 err = do_lcfg(mgcname, 0, LCFG_ADD_CONN, libcfs_nid2str(nid),
594 CERROR("Add conn for %s failed %d\n",
595 libcfs_nid2str(nid), err);
598 /* Keep the mgc info in the sb */
599 obd = class_name2obd(mgcname);
601 CERROR("Can't find mgcobd %s\n", mgcname);
/* Drops one reference on the mgc.  While cl_mgc_refcount stays above zero
   only a debug line is emitted; on the last reference the obd is handed to
   class_manual_cleanup() and every MGS nid recorded in the mount data is
   removed again with LCFG_DEL_UUID.
   NOTE(review): obd->obd_name is read after class_manual_cleanup(obd) -
   confirm the obd structure is still valid there.  The failure message
   says "MDC" although this is the MGC path.  The line that fetches obd is
   not visible in this listing. */
610 static void lustre_stop_mgc(struct super_block *sb)
612 struct lustre_sb_info *lsi = s2lsi(sb);
613 struct obd_device *obd;
622 if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) {
623 CDEBUG(D_MOUNT, "mgc still has %d references.\n",
624 atomic_read(&obd->u.cli.cl_mgc_refcount));
628 class_manual_cleanup(obd);
630 /* class_add_uuid adds a nid even if the same uuid exists; we might
631 delete any copy here. So they all better match. */
632 for (i = 0; i < lsi->lsi_lmd->lmd_mgsnid_count; i++) {
633 nid = lsi->lsi_lmd->lmd_mgsnid[i];
634 err = do_lcfg(obd->obd_name, nid, LCFG_DEL_UUID,
635 libcfs_nid2str(nid), 0, 0, 0);
637 CERROR("del MDC UUID %s failed: rc = %d\n",
638 libcfs_nid2str(nid), err);
640 /* class_import_put will get rid of the additional connections */
643 /* Since there's only one mgc per node, we have to change its fs to get
644 access to the right disk. */
/* Issues the "set_fs" obd_set_info call on the mgc's self export and logs
   a failure.
   NOTE(review): the value arguments of the call (presumably derived from
   sb) and the return statement are not visible in this listing. */
645 static int server_mgc_set_fs(struct obd_device *mgc, struct super_block *sb)
649 /* cl_mgc_sem in mgc ensures we sleep if the mgc_fs is busy */
650 err = obd_set_info(mgc->obd_self_export,
651 strlen("set_fs"), "set_fs",
654 CERROR("can't set_fs %d\n", err);
/* Counterpart of server_mgc_set_fs(): issues the "clear_fs" obd_set_info
   call on the mgc's self export with an empty value (length 0, NULL). */
660 static int server_mgc_clear_fs(struct obd_device *mgc)
663 err = obd_set_info(mgc->obd_self_export,
664 strlen("clear_fs"), "clear_fs", 0, NULL);
668 /* Stop MDS/OSS if nobody is using them */
/* For an MDT disk: if an obd named "MDS" exists and the LUSTRE_MDS_NAME
   obd type is missing or has a zero typ_refcnt, the "MDS" obd is cleaned
   up.  The same test is made for an OST disk with the "OSS" obd and the
   LUSTRE_OST_NAME type.  The LSI_UMOUNT_FORCE / LSI_UMOUNT_FAILOVER flags
   are consulted just before each cleanup.
   NOTE(review): the statements executed under those two flag tests are not
   visible in this listing. */
669 static void server_stop_servers(struct super_block *sb)
671 struct lustre_sb_info *lsi = s2lsi(sb);
672 struct obd_device *obd;
674 /* if this was an MDT, and there are no more MDT's, clean up the MDS */
675 if ((lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_MDT) &&
676 (obd = class_name2obd("MDS"))) {
677 //FIXME pre-rename, should eventually be LUSTRE_MDT_NAME
678 struct obd_type *type = class_search_type(LUSTRE_MDS_NAME);
679 if (!type || !type->typ_refcnt) {
680 /* nobody is using the MDT type, clean the MDS */
681 if (lsi->lsi_flags & LSI_UMOUNT_FORCE)
683 if (lsi->lsi_flags & LSI_UMOUNT_FAILOVER)
685 class_manual_cleanup(obd);
689 /* if this was an OST, and there are no more OST's, clean up the OSS */
690 if ((lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_OST) &&
691 (obd = class_name2obd("OSS"))) {
692 struct obd_type *type = class_search_type(LUSTRE_OST_NAME);
693 if (!type || !type->typ_refcnt) {
694 /* nobody is using the OST type, clean the OSS */
695 if (lsi->lsi_flags & LSI_UMOUNT_FORCE)
697 if (lsi->lsi_flags & LSI_UMOUNT_FAILOVER)
699 class_manual_cleanup(obd);
/* First contact of a target with the MGS.  A mgmt_target_info record is
   filled from the on-disk data (server name, fs name, flags, index), a
   connection to the mgc is made, and the record is passed to the mgc with
   the "add_target" obd_set_info key.  If LDD_F_NEED_INDEX is still set in
   the record afterwards, the index in the record is adopted: the NEED_INDEX
   and FIRST_START flags are cleared, the server name is regenerated and the
   disk data is written back with ldd_write().  The record is freed at the
   end.
   NOTE(review): the ldd_write() result is not examined.  The error tests,
   the disconnect and the return statement are not visible in this listing. */
704 static int server_initial_connect(struct super_block *sb, struct vfsmount *mnt)
706 struct lustre_sb_info *lsi = s2lsi(sb);
707 struct obd_device *mgc = lsi->lsi_mgc;
708 struct lustre_disk_data *ldd = lsi->lsi_ldd;
709 struct lustre_handle mgc_conn = {0, };
710 struct obd_export *exp = NULL;
711 struct mgmt_target_info *mti = NULL;
715 /* send MGMT_TARGET_ADD rpc via MGC, MGS should reply with an
718 OBD_ALLOC(mti, sizeof(*mti));
722 strncpy(mti->mti_ostname, ldd->ldd_svname,
723 sizeof(mti->mti_ostname));
// bound the copy by the size of the field being written; it was bounded
// by the size of mti_fullfsname, a different member
724 strncpy(mti->mti_fsname, ldd->ldd_fsname,
725 sizeof(mti->mti_fsname));
726 mti->mti_flags = ldd->ldd_flags;
727 mti->mti_stripe_index = ldd->ldd_svindex;
729 rc = obd_connect(&mgc_conn, mgc, &(mgc->obd_uuid), NULL);
731 CERROR("connect failed %d\n", rc);
734 exp = class_conn2export(&mgc_conn);
735 LASSERT(exp->exp_obd == mgc);
737 /* FIXME use ioctl instead? eg
738 struct obd_ioctl_data ioc_data = { 0 };
739 ioc_data.ioc_inllen1 = strlen(ldd->ldd_svname) + 1;
740 ioc_data.ioc_inlbuf1 = ldd->ldd_svname;
742 err = obd_iocontrol(OBD_IOC_START, obd->obd_self_export,
743 sizeof ioc_data, &ioc_data, NULL);
745 rc = obd_set_info(exp,
746 strlen("add_target"), "add_target",
750 CERROR("add_target failed %d\n", rc);
754 /* If this flag is still set, it means we need to change our on-disk
755 index to what the mgs assigned us. */
756 if (mti->mti_flags & LDD_F_NEED_INDEX) {
757 CERROR("Must change on-disk index from %#x to %#x\n",
758 ldd->ldd_svindex, mti->mti_stripe_index);
759 ldd->ldd_flags &= ~(LDD_F_NEED_INDEX | LDD_F_FIRST_START);
760 ldd->ldd_config_ver = 666; // FIXME
761 ldd->ldd_svindex = mti->mti_stripe_index;
762 ldd_make_sv_name(ldd);
763 ldd_write(&mgc->obd_lvfs_ctxt, ldd);
764 /* FIXME write last_rcvd?, disk label? */
769 OBD_FREE(mti, sizeof(*mti));
/* Brings up the target stored on this disk.  Visible steps: make sure the
   shared "MDS" obd (for an MDT) and/or "OSS" obd (for an OST) has been
   started, point the mgc at this server's disk, run the initial MGS
   connect when the disk data asks for an index or registration, register
   the mount under the server name, process the config log named after the
   server, and check that an obd of that name now exists.  The mgc fs is
   released afterwards and server_stop_servers() is called on the way out.
   NOTE(review): the server_mgc_set_fs() result is not examined.  The tests
   guarding the error branches, the labels and the return statement are not
   visible in this listing. */
774 static int server_start_targets(struct super_block *sb, struct vfsmount *mnt)
776 struct obd_device *obd;
777 struct lustre_sb_info *lsi = s2lsi(sb);
778 struct config_llog_instance cfg;
781 CDEBUG(D_MOUNT, "starting target %s\n", lsi->lsi_ldd->ldd_svname);
783 /* If we're an MDT, make sure the global MDS is running */
784 if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_MDT) {
785 /* make sure (what will be called) the MDS is started */
786 obd = class_name2obd("MDS");
788 //FIXME pre-rename, should eventually be LUSTRE_MDS_NAME
789 err = lustre_start_simple("MDS", LUSTRE_MDT_NAME, 0, 0);
791 CERROR("failed to start MDS: %d\n", err);
797 /* If we're an OST, make sure the global OSS is running */
798 if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_OST) {
799 /* make sure OSS is started */
800 obd = class_name2obd("OSS");
802 err = lustre_start_simple("OSS", LUSTRE_OSS_NAME, 0, 0);
804 CERROR("failed to start OSS: %d\n", err);
810 /* Set the mgc fs to our server disk */
811 server_mgc_set_fs(lsi->lsi_mgc, sb);
813 /* Get a new index if needed */
814 if (lsi->lsi_ldd->ldd_flags & (LDD_F_NEED_INDEX | LDD_F_NEED_REGISTER)) {
815 /* FIXME Maybe need to change NEED_INDEX to NEVER_CONNECTED,
816 in case index number was given but llog still is needed.*/
817 CERROR("Need new target index from MGS!\n");
818 err = server_initial_connect(sb, mnt);
820 CERROR("Initial connect failed for %s: %d\n",
821 lsi->lsi_ldd->ldd_svname, err);
827 /* Register the mount for the target */
828 err = server_register_mount(lsi->lsi_ldd->ldd_svname, sb, mnt);
832 /* The MGC starts targets using the svname llog */
833 cfg.cfg_instance = NULL;
834 cfg.cfg_uuid = lsi->lsi_mgc->obd_uuid;
/* keep the result: the failure report just below prints err, which would
   otherwise still hold the value from server_register_mount() */
835 err = lustre_get_process_log(sb, lsi->lsi_ldd->ldd_svname, &cfg);
837 CERROR("failed to start server %s: %d\n",
838 lsi->lsi_ldd->ldd_svname, err);
839 server_deregister_mount(lsi->lsi_ldd->ldd_svname);
/* the log may be processed without creating the target obd; treat that as
   a failure too and drop the registration */
843 if (!class_name2obd(lsi->lsi_ldd->ldd_svname)) {
844 CERROR("no server named %s was started\n",
845 lsi->lsi_ldd->ldd_svname);
846 server_deregister_mount(lsi->lsi_ldd->ldd_svname);
851 /* Release the mgc fs for others to use */
852 server_mgc_clear_fs(lsi->lsi_mgc);
855 server_stop_servers(sb);
859 /***************** mount **************/
/* Allocates the per-superblock lustre_sb_info together with its
   lustre_mount_data, attaches the info to sb and starts lsi_mounts at 0.
   The OBD_FREE of lsi directly after the lmd allocation is the undo for a
   failed second allocation.
   NOTE(review): the allocation-failure tests and the return statements are
   not visible in this listing. */
861 struct lustre_sb_info *lustre_init_lsi(struct super_block *sb)
863 struct lustre_sb_info *lsi = NULL;
865 OBD_ALLOC(lsi, sizeof(*lsi));
868 OBD_ALLOC(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd));
870 OBD_FREE(lsi, sizeof(*lsi));
874 s2lsi_nocast(sb) = lsi;
875 atomic_set(&lsi->lsi_mounts, 0);
/* Releases everything hanging off the superblock's lustre_sb_info: the
   disk data, the mount data with its device and option strings, any
   registry entries that still reference the server mount, and finally the
   info structure itself; the superblock's pointer is then cleared.
   After this call any lsi pointer obtained earlier from s2lsi(sb) is
   dangling. */
879 void lustre_free_lsi(struct super_block *sb)
881 struct lustre_sb_info *lsi = s2lsi(sb);
885 if (lsi->lsi_ldd != NULL)
886 OBD_FREE(lsi->lsi_ldd, sizeof(*lsi->lsi_ldd));
888 if (lsi->lsi_lmd != NULL) {
/* both strings were allocated by lmd_parse() with strlen() + 1 */
889 if (lsi->lsi_lmd->lmd_dev != NULL)
890 OBD_FREE(lsi->lsi_lmd->lmd_dev,
891 strlen(lsi->lsi_lmd->lmd_dev) + 1);
892 if (lsi->lsi_lmd->lmd_opts != NULL)
893 OBD_FREE(lsi->lsi_lmd->lmd_opts,
894 strlen(lsi->lsi_lmd->lmd_opts) + 1);
895 OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd));
898 LASSERT(lsi->lsi_llsbi == NULL);
/* catch-all: only the pointer value of lsi_srv_mnt is compared there */
900 server_deregister_mount_all(lsi->lsi_srv_mnt);
902 OBD_FREE(lsi, sizeof(*lsi));
903 s2lsi_nocast(sb) = NULL;
910 /*************** server mount ******************/
912 /* Kernel mount using mount options in MOUNT_DATA_FILE */
/* Mounts the backing device in two passes.  Pass one mounts lmd_dev as
   "ext3" so that MOUNT_DATA_FILE can be read into a freshly allocated
   lustre_disk_data through a temporary run context rooted at that mount.
   Pass two builds an option string in a single page - the options stored
   on disk followed by any given on the mount command line - adds
   MS_NOATIME | MS_NODIRATIME, and mounts the device with the fs type named
   by MT_STR(ldd).  On success the disk data is stored in lsi_ldd; on the
   error path it is freed and ERR_PTR(err) is returned.
   NOTE(review): the option string is assembled with unbounded
   strcpy()/strcat() into the PAGE_SIZE buffer.  The error tests, the
   release of the pre-mount and of the page, and the success return are not
   visible in this listing. */
913 static struct vfsmount *server_kernel_mount(struct super_block *sb)
915 struct lvfs_run_ctxt mount_ctxt;
916 struct lustre_sb_info *lsi = s2lsi(sb);
917 struct lustre_disk_data *ldd;
918 struct lustre_mount_data *lmd = lsi->lsi_lmd;
919 struct vfsmount *mnt;
920 char *options = NULL;
921 unsigned long page, s_flags;
924 OBD_ALLOC(ldd, sizeof(*ldd));
926 return(ERR_PTR(-ENOMEM));
928 /* In the past, we have always used flags = 0.
929 Note ext3/ldiskfs can't be mounted ro. */
930 s_flags = sb->s_flags;
932 /* Pre-mount ext3 to read the MOUNT_DATA_FILE */
933 CDEBUG(D_MOUNT, "Pre-mount ext3 %s\n", lmd->lmd_dev);
934 mnt = do_kern_mount("ext3", s_flags, lmd->lmd_dev, 0);
937 CERROR("premount failed: err = %d\n", err);
/* temporary context so ldd_parse() resolves paths inside the pre-mount */
941 OBD_SET_CTXT_MAGIC(&mount_ctxt);
942 mount_ctxt.pwdmnt = mnt;
943 mount_ctxt.pwd = mnt->mnt_root;
944 mount_ctxt.fs = get_ds();
946 err = ldd_parse(&mount_ctxt, ldd);
950 CERROR("premount parse options failed: err = %d\n", err);
954 /* Done with our pre-mount, now do the real mount. */
956 /* Glom up mount options */
957 page = __get_free_page(GFP_KERNEL);
962 options = (char *)page;
963 memset(options, 0, PAGE_SIZE);
964 strcpy(options, ldd->ldd_mount_opts);
966 /* Add in any mount-line options */
967 if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0)) {
969 strcat(options, ",");
970 strcat(options, lmd->lmd_opts);
973 /* Special permanent mount flags */
975 s_flags |= MS_NOATIME | MS_NODIRATIME;
977 CDEBUG(D_MOUNT, "kern_mount: %s %s %s\n",
978 MT_STR(ldd), lmd->lmd_dev, options);
979 mnt = do_kern_mount(MT_STR(ldd), s_flags, lmd->lmd_dev,
984 CERROR("do_kern_mount failed: err = %d\n", err);
988 lsi->lsi_ldd = ldd; /* freed at lsi cleanup */
989 CDEBUG(D_SUPER, "%s: mnt = %p\n", lmd->lmd_dev, mnt);
993 OBD_FREE(ldd, sizeof(*ldd));
995 return(ERR_PTR(err));
998 static void server_put_super(struct super_block *sb)
1000 struct lustre_sb_info *lsi = s2lsi(sb);
1001 struct obd_device *obd;
1003 CDEBUG(D_MOUNT, "server put_super %s\n", lsi->lsi_ldd->ldd_svname);
1005 obd = class_name2obd(lsi->lsi_ldd->ldd_svname);
1007 CDEBUG(D_MOUNT, "stopping %s\n", obd->obd_name);
1008 if (lsi->lsi_flags & LSI_UMOUNT_FORCE)
1010 if (lsi->lsi_flags & LSI_UMOUNT_FAILOVER)
1012 class_manual_cleanup(obd);
1014 CERROR("no obd %s\n", lsi->lsi_ldd->ldd_svname);
1017 //class_del_profile(lsi->lsi_ldd->ldd_svname); /* if it exists */
1019 server_stop_servers(sb);
1021 /* If they wanted the mgs to stop separately from the mdt, they
1022 should have put it on a different device. */
1023 if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_MGMT)
1024 server_stop_mgs(sb);
1026 /* clean the mgc and sb */
1027 lustre_common_put_super(sb);
1029 /* drop the kernel mount from server_fill_super */
1030 unlock_mntput(lsi->lsi_srv_mnt);
/* umount_begin hook, installed in server_ops for "umount -f".  It records
   the request by setting LSI_UMOUNT_FAILOVER in lsi_flags; the flag is
   read later by server_put_super() and server_stop_servers().
   NOTE(review): the two text lines below are the body of a block comment
   whose opening line is missing from this listing. */
1033 static void server_umount_begin(struct super_block *sb)
1035 struct lustre_sb_info *lsi = s2lsi(sb);
1037 CDEBUG(D_MOUNT, "umount -f\n");
1039 umount -f = failover
1040 no third way to do LSI_UMOUNT_FORCE */
1041 lsi->lsi_flags |= LSI_UMOUNT_FAILOVER;
/* log2(n) is expressed as ffz(~(n)): the position of the first zero bit of
   the complement, i.e. of the lowest set bit of n.  It is used below on
   the block size 4096.  LUSTRE_SUPER_MAGIC is the s_magic value given to
   server superblocks. */
1044 #define log2(n) ffz(~(n))
1045 #define LUSTRE_SUPER_MAGIC 0x0BD00BD1
/* Superblock operations installed by server_fill_super_common().
   NOTE(review): only the put_super and umount_begin members are visible in
   this listing; other initialisers may exist on the missing lines. */
1047 static struct super_operations server_ops =
1050 .put_super = server_put_super,
1051 .umount_begin = server_umount_begin, /* umount -f */
/* Fills in the VFS-visible part of a server superblock: a fixed 4096-byte
   block size, LUSTRE_SUPER_MAGIC, s_maxbytes of 0, the MS_RDONLY flag and
   server_ops, then creates a root inode marked as a directory and a root
   dentry for it.
   NOTE(review): the failure tests after new_inode() and d_alloc_root(),
   any inode release on the second failure, and the return statements are
   not visible in this listing. */
1054 static int server_fill_super_common(struct super_block *sb)
1056 struct inode *root = 0;
1059 CDEBUG(D_MOUNT, "Server sb, dev=%d\n", (int)sb->s_dev);
1061 sb->s_blocksize = 4096;
1062 sb->s_blocksize_bits = log2(sb->s_blocksize);
1063 sb->s_magic = LUSTRE_SUPER_MAGIC;
1064 sb->s_maxbytes = 0; //PAGE_CACHE_MAXBYTES;
1065 sb->s_flags |= MS_RDONLY;
1066 sb->s_op = &server_ops;
1068 root = new_inode(sb);
1070 CERROR("Can't make root inode\n");
1074 /* returns -EIO for every operation */
1075 /* make_bad_inode(root); -- badness - can't umount */
1076 /* apparently we need to be a directory for the mount to finish */
1077 root->i_mode = S_IFDIR;
1079 sb->s_root = d_alloc_root(root);
1081 CERROR("Can't make root dentry\n");
/* Server side of lustre_fill_super().  Order of work visible here: mount
   the backing device (server_kernel_mount) and remember the vfsmount in
   lsi_srv_mnt; append the MGS nids stored on disk to those from the mount
   command, up to MAX_FAILOVER_NIDS; start the MGS if this disk carries one
   (a failure there is only logged); start the mgc; start the targets; and
   finish the superblock with server_fill_super_common().  The error path
   stops the MGS; per the comment there, the mgc is stopped by the caller.
   NOTE(review): the error tests, labels, the release of mnt on failure and
   the return statement are not visible in this listing. */
1089 static int server_fill_super(struct super_block *sb)
1091 struct lustre_sb_info *lsi = s2lsi(sb);
1092 struct vfsmount *mnt;
1093 int mgs_service = 0, i = 0, err;
1096 /* the One True Mount */
1097 mnt = server_kernel_mount(sb);
1100 CERROR("Unable to mount device %s: %d\n",
1101 lsi->lsi_lmd->lmd_dev, err);
1104 lsi->lsi_srv_mnt = mnt;
1106 LASSERT(lsi->lsi_ldd);
1107 CDEBUG(D_MOUNT, "Found service %s for fs '%s' on device %s\n",
1108 lsi->lsi_ldd->ldd_svname, lsi->lsi_ldd->ldd_fsname,
1109 lsi->lsi_lmd->lmd_dev);
1111 /* append ldd nids to lmd nids */
/* the second loop condition keeps lmd_mgsnid[] within MAX_FAILOVER_NIDS */
1112 for (i = 0; (i < lsi->lsi_ldd->ldd_mgsnid_count) &&
1113 (lsi->lsi_lmd->lmd_mgsnid_count < MAX_FAILOVER_NIDS); i++) {
1114 lsi->lsi_lmd->lmd_mgsnid[lsi->lsi_lmd->lmd_mgsnid_count++] =
1115 lsi->lsi_ldd->ldd_mgsnid[i];
1118 /* start MGS before MGC */
1119 if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_MGMT) {
1120 err = server_start_mgs(sb);
1122 CERROR("ignoring Failed MGS start!!\n");
1123 //GOTO(out_mnt, err);
1129 err = lustre_start_mgc(sb);
1133 /* Set up all obd devices for service */
1134 err = server_start_targets(sb, mnt);
1136 CERROR("Unable to start targets: %d\n", err);
1140 /* FIXME overmount client here,
1141 or can we just start a client log and client_fill_super on this sb?
1142 We need to make sure server_put_super gets called too - ll_put_super
1143 calls lustre_common_put_super; check there for LSI_SERVER flag,
1145 Probably should start client from new thread so we can return.
1146 Client will not finish until all servers are connected. */
1147 err = server_fill_super_common(sb);
1155 server_stop_mgs(sb);
1156 /* mgc is stopped in lustre_fill_super */
1159 //if (lsi->lsi_ldd) class_del_profile(lsi->lsi_ldd->ldd_svname);
1164 /*************** mount common between server and client ***************/
/* Teardown shared by server and client superblocks: drops this
   superblock's reference on the mgc, then frees the lustre_sb_info.
   Callers must not touch a previously fetched lsi pointer after this
   returns, because lustre_free_lsi() frees it. */
1167 void lustre_common_put_super(struct super_block *sb)
1169 CDEBUG(D_MOUNT, "dropping sb %p\n", sb);
1171 lustre_stop_mgc(sb);
1172 lustre_free_lsi(sb);
/* Debug helper: writes a lustre_mount_data to the D_MOUNT debug log - the
   MGS nids (or "no MGS nids"), lmd_dev labelled "fsname" for a client
   mount and "device" otherwise, the flags and the option string. */
1175 static void lmd_print(struct lustre_mount_data *lmd)
1179 CDEBUG(D_MOUNT, "mount data\n");
1180 if (!lmd->lmd_mgsnid_count)
1181 CDEBUG(D_MOUNT, "no MGS nids\n");
1182 else for (i = 0; i < lmd->lmd_mgsnid_count; i++) {
1183 CDEBUG(D_MOUNT, "nid %d: %s\n", i,
1184 libcfs_nid2str(lmd->lmd_mgsnid[i]));
1186 if (lmd_is_client(lmd))
1187 CDEBUG(D_MOUNT, "fsname: %s\n", lmd->lmd_dev);
1189 CDEBUG(D_MOUNT, "device: %s\n", lmd->lmd_dev);
1190 CDEBUG(D_MOUNT, "flags: %x\n", lmd->lmd_flags);
1192 CDEBUG(D_MOUNT, "options: %s\n", lmd->lmd_opts);
/* Parses the mount option string into *lmd.  Visible behaviour:
   - a buffer whose first word matches LMD_MAGIC in its upper three bytes
     is reported as coming from an old /sbin/mount.lustre;
   - MNTCNF and RECOVER are set as default flags, and the "recov" /
     "norecov" options set or clear RECOVER;
   - "device=" introduces the device name, which must be the last option
     and must not contain a comma;
   - every ':'-terminated prefix of the device name is parsed as an MGS nid
     (at most MAX_FAILOVER_NIDS) and marks the mount as a client mount;
   - the remaining name (leading '/' characters skipped for a client) is
     copied to lmd_dev and the option text to lmd_opts, both freshly
     allocated and released later by lustre_free_lsi().
   NOTE(review): inside the nid loop lmd_flags is assigned (=), not or-ed,
   so the default flags and the recov/norecov result are discarded for
   client mounts - confirm this is intended.  The loop headers, the error
   tests and the return statements are not visible in this listing. */
1195 static int lmd_parse(char *options, struct lustre_mount_data *lmd)
1197 char *s1, *s2, *devname = NULL;
1198 struct lustre_mount_data *raw = (struct lustre_mount_data *)options;
1203 LCONSOLE_ERROR("Missing mount data: check that "
1204 "/sbin/mount.lustre is installed.\n");
1208 /* Try to detect old lmd data in options */
1209 if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) {
1210 LCONSOLE_ERROR("You're using an old version of "
1211 "/sbin/mount.lustre. Please install version "
1212 "1.%d\n", LMD_MAGIC & 0xFF);
1215 lmd->lmd_magic = LMD_MAGIC;
1218 lmd->lmd_flags |= LMD_FLG_MNTCNF | LMD_FLG_RECOVER;
/* skip separators between options */
1222 while (*s1 == ' ' || *s1 == ',')
1224 if (strncmp(s1, "recov", 5) == 0)
1225 lmd->lmd_flags |= LMD_FLG_RECOVER;
1226 if (strncmp(s1, "norecov", 7) == 0)
1227 lmd->lmd_flags &= ~LMD_FLG_RECOVER;
1228 /* Linux 2.4 doesn't pass the device, so we stuck it at the
1229 end of the options. */
1230 if (strncmp(s1, "device=", 7) == 0) {
1232 /* terminate options right before device. device
1233 must be the last one. */
1236 s2 = strstr(s1, ",");
1243 LCONSOLE_ERROR("Can't find the device name "
1244 "(need mount option 'device=...')\n");
1248 if (strchr(devname, ',')) {
1249 LCONSOLE_ERROR("Device name must be the final option\n");
1254 /* Get MGS nids if client mount */
1255 while ((s2 = strchr(s1, ':'))) {
1258 lmd->lmd_flags = LMD_FLG_CLIENT;
1259 nid = libcfs_str2nid(s1);
1260 if (nid == LNET_NID_ANY) {
1261 LCONSOLE_ERROR("Can't parse NID '%s'\n", s1);
1264 if (lmd->lmd_mgsnid_count >= MAX_FAILOVER_NIDS) {
1265 LCONSOLE_ERROR("Too many NIDs: '%s'\n", s1);
1268 lmd->lmd_mgsnid[lmd->lmd_mgsnid_count++] = nid;
1272 if (lmd_is_client(lmd)) {
1273 /* Remove leading /s from fsname */
1274 while (*++s1 == '/')
1279 LCONSOLE_ERROR("No filesytem specified\n");
1283 /* freed in lustre_free_lsi */
1284 OBD_ALLOC(lmd->lmd_dev, strlen(s1) + 1);
1287 strcpy(lmd->lmd_dev, s1);
1289 /* save mount options */
/* walk back over trailing ',' and ' ' characters of the option text */
1290 s1 = options + strlen(options) - 1;
1291 while (s1 >= options && (*s1 == ',' || *s1 == ' '))
1293 if (*options != 0) {
1294 /* freed in lustre_free_lsi */
1295 OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1);
1298 strcpy(lmd->lmd_opts, options);
1301 lmd->lmd_magic = LMD_MAGIC;
1307 CERROR("Bad mount options %s\n", options);
/* fill_super entry point shared by client and server mounts.  Allocates
   the superblock info, parses the mount options into its mount data, and
   then branches: a client mount requires a registered client_fill_super
   hook, starts the mgc and calls the hook; a server mount calls
   server_fill_super(), which starts the mgc itself after the disk mount.
   On the failure path the mgc reference is dropped and the superblock info
   freed; a parse failure frees the superblock info directly.
   NOTE(review): the assignment of lmd, the error tests and the return
   statements are not visible in this listing. */
1313 int lustre_fill_super(struct super_block *sb, void *data, int silent)
1315 struct lustre_mount_data *lmd;
1316 struct lustre_sb_info *lsi;
1320 CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb);
1322 lsi = lustre_init_lsi(sb);
1327 /* Figure out the lmd from the mount options */
1328 if (lmd_parse((char *)data, lmd)) {
1329 lustre_free_lsi(sb);
1333 if (lmd_is_client(lmd)) {
1334 CDEBUG(D_MOUNT, "Mounting client for fs %s\n", lmd->lmd_dev);
1335 if (!client_fill_super) {
1336 LCONSOLE_ERROR("Nothing registered for client mount!"
1337 " Is llite module loaded?\n");
1340 err = lustre_start_mgc(sb);
1343 /* Connect and start */
1344 /* (should always be ll_fill_super) */
1345 err = (*client_fill_super)(sb);
1348 CDEBUG(D_MOUNT, "Mounting server\n");
1349 err = server_fill_super(sb);
1350 /* s_f_s calls lustre_start_mgc after the mount because we need
1351 the MGS nids which are stored on disk. Plus, we may
1352 need to start the MGS first. */
/* lmd_dev is printed before lustre_free_lsi() releases it */
1357 CERROR("Unable to mount %s\n", lmd->lmd_dev);
1358 lustre_stop_mgc(sb);
1359 lustre_free_lsi(sb);
1361 CDEBUG(D_MOUNT, "Successfully mounted %s\n", lmd->lmd_dev);
1367 /* We can't call ll_fill_super by name because it lives in a module that
1368 must be loaded after this one. */
/* Stores cfs in the file-local client_fill_super hook, which
   lustre_fill_super() calls for client mounts.  The store is a plain
   assignment; no locking is visible here. */
1369 void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb))
1371 client_fill_super = cfs;
1374 /***************** FS registration ******************/
/* Kernels 2.5.0 and later: get_sb hook.  Creates a device-less superblock
   through get_sb_nodev() with lustre_fill_super() as the fill callback;
   devname is not used here because the device is carried in the option
   data instead. */
1376 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
1378 struct super_block * lustre_get_sb(struct file_system_type *fs_type,
1379 int flags, const char *devname, void * data)
1381 /* calls back in fill super */
1382 /* we could append devname= onto options (*data) here,
1383 but 2.4 doesn't get devname. So we do it in mount_lustre.c */
1384 return get_sb_nodev(fs_type, flags, data, lustre_fill_super);
/* Filesystem type used on kernels 2.5.0 and later: superblocks come from
   lustre_get_sb() and are released with kill_anon_super().
   NOTE(review): the .name initialiser is not visible in this listing. */
1387 struct file_system_type lustre_fs_type = {
1388 .owner = THIS_MODULE,
1390 .get_sb = lustre_get_sb,
1391 .kill_sb = kill_anon_super,
1392 .fs_flags = FS_BINARY_MOUNTDATA,
/* Older-kernel branch of the version test: read_super hook that delegates
   to lustre_fill_super().
   NOTE(review): the mapping of err onto the returned super_block pointer
   is not visible in this listing. */
1397 static struct super_block *lustre_read_super(struct super_block *sb,
1398 void *data, int silent)
1403 err = lustre_fill_super(sb, data, silent);
/* Filesystem type for the older-kernel branch: superblocks are filled
   through lustre_read_super().
   NOTE(review): the .name initialiser is not visible in this listing. */
1409 static struct file_system_type lustre_fs_type = {
1410 .owner = THIS_MODULE,
1412 .fs_flags = FS_NFSEXP_FSID,
1413 .read_super = lustre_read_super,
/* Registers lustre_fs_type with the VFS and returns the result of
   register_filesystem() unchanged. */
1417 int lustre_register_fs(void)
1419 return register_filesystem(&lustre_fs_type);
/* Removes lustre_fs_type from the VFS and returns the result of
   unregister_filesystem() unchanged. */
1422 int lustre_unregister_fs(void)
1424 return unregister_filesystem(&lustre_fs_type);
/* Entry points made available to other modules: the client fill_super
   registration hook, the shared put_super teardown, config-log processing,
   and the get/put pair for registered server mounts. */
1427 EXPORT_SYMBOL(lustre_register_client_fill_super);
1428 EXPORT_SYMBOL(lustre_common_put_super);
1429 EXPORT_SYMBOL(lustre_get_process_log);
1430 EXPORT_SYMBOL(server_get_mount);
1431 EXPORT_SYMBOL(server_put_mount);