4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Whamcloud, Inc.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lustre/utils/mount_lustre.c
38 * Author: Robert Read <rread@clusterfs.com>
39 * Author: Nathan Rutman <nathan@clusterfs.com>
52 #include <lustre_ver.h>
56 #include "mount_utils.h"
/* Relative file names appended to a block device directory when
 * set_blockdev_tunables() builds the path of a tunable to read or write. */
58 #define MAX_HW_SECTORS_KB_PATH "queue/max_hw_sectors_kb"
59 #define MAX_SECTORS_KB_PATH "queue/max_sectors_kb"
60 #define STRIPE_CACHE_SIZE "md/stripe_cache_size"
/* Upper bound applied to the value of the retry= mount option. */
62 #define MAX_RETRIES 99
/* Stripe cache size to apply to md devices; replaced by the value of the
 * md_stripe_cache_size= mount option when parse_options() sees one. */
65 int md_stripe_cache_size = 16384;
/* Program name used as a prefix in messages; main() points it at the part of
 * argv[0] that follows the last slash. */
66 char *progname = NULL;
/* Body of the help routine (its signature is outside this excerpt): writes
 * the program name and version followed by the option summary to `out`, then
 * terminates the process with status 0 when `out` is stdout and EINVAL
 * otherwise. */
70 fprintf(out, "%s v"LUSTRE_VERSION_STRING"\n", progname);
71 fprintf(out, "\nThis mount helper should only be invoked via the "
72 "mount (8) command,\ne.g. mount -t lustre dev dir\n\n");
73 fprintf(out, "usage: %s [-fhnv] [-o <mntopt>] <device> <mountpt>\n",
76 "\t<device>: the disk device, or for a client:\n"
77 "\t\t<mgmtnid>[:<altmgtnid>...]:/<filesystem>-client\n"
78 "\t<filesystem>: name of the Lustre filesystem (e.g. lustre1)\n"
79 "\t<mountpt>: filesystem mountpoint (e.g. /mnt/lustre)\n"
80 "\t-f|--fake: fake mount (updates /etc/mtab)\n"
81 "\t-o force|--force: force mount even if already in /etc/mtab\n"
82 "\t-h|--help: print this usage message\n"
83 "\t-n|--nomtab: do not update /etc/mtab after mount\n"
84 "\t-v|--verbose: print verbose config settings\n"
85 "\t<mntopt>: one or more comma separated of:\n"
86 "\t\t(no)flock,(no)user_xattr,(no)acl\n"
87 "\t\tabort_recov: abort server recovery handling\n"
88 "\t\tnosvc: only start MGC/MGS obds\n"
89 "\t\tnomgs: only start target obds, using existing MGS\n"
90 "\t\texclude=<ostname>[:<ostname>] : colon-separated list of "
91 "inactive OSTs (e.g. lustre-OST0001)\n"
92 "\t\tretry=<num>: number of times mount is retried by client\n"
93 "\t\tmd_stripe_cache_size=<num>: set the raid stripe cache "
94 "size for the underlying raid if present\n"
96 exit((out != stdout) ? EINVAL : 0);
99 /* Get rid of symbolic hostnames for tcp, since kernel can't do lookups */
100 #define MAXNIDSTR 1024
/* Builds the converted string in a malloc()ed buffer of MAXNIDSTR bytes.
 * While the current position in s1 is not a slash, the text up to the next
 * comma or colon is parsed with libcfs_str2nid() and re-emitted through
 * libcfs_nid2str() followed by the same separator; what is left of s1 is then
 * copied with snprintf(). s1 is written to while parsing and the separator is
 * put back afterwards. A NID that parses to LNET_NID_ANY is reported with the
 * NID parse error message; the value returned in that case is not visible in
 * this excerpt, but the caller in parse_opts() treats a NULL result as an
 * error.
 * NOTE(review): snprintf() returns the length that would have been written,
 * so after a truncated write `left` can drop to zero or below before the
 * final snprintf(c, left, ...) -- confirm that the lines missing from this
 * excerpt guard against that. */
101 static char *convert_hostnames(char *s1)
103 char *converted, *s2 = 0, *c;
105 int left = MAXNIDSTR;
108 converted = malloc(left);
109 if (converted == NULL) {
110 fprintf(stderr, "out of memory: needed %d bytes\n",
115 while ((left > 0) && (*s1 != '/')) {
116 s2 = strpbrk(s1, ",:");
121 nid = libcfs_str2nid(s1);
122 *s2 = sep; /* back to original string */
123 if (nid == LNET_NID_ANY)
125 c += snprintf(c, left, "%s%c", libcfs_nid2str(nid), sep);
126 left = converted + MAXNIDSTR - c;
129 snprintf(c, left, "%s", s1);
132 fprintf(stderr, "%s: Can't parse NID '%s'\n", progname, s1);
137 /*****************************************************************************
139 * This part was cribbed from util-linux/mount/mount.c. There was no clear
140 * license information, but many other files in the package are identified as
141 * GNU GPL, so it's a pretty safe bet that was their intent.
143 ****************************************************************************/
/* Members of struct opt_map (the struct header line is not part of this
 * excerpt); one entry describes one generic mount option. */
145 const char *opt; /* option name */
146 int inv; /* true if flag value should be inverted */
147 int mask; /* flag mask value */
/* Generic mount options that are turned into MS_* flag bits instead of being
 * handed to Lustre. parse_one_option() walks this table until it reaches an
 * entry whose opt is NULL, so the table has to end with such a sentinel (not
 * visible in this excerpt). Entries with a mask of 0 carry no flag bits. */
150 static const struct opt_map opt_map[] = {
151 /*"optname", inv,ms_mask */
152 /* These flags are parsed by mount, not lustre */
153 { "defaults", 0, 0 }, /* default options */
154 { "remount", 0, MS_REMOUNT}, /* remount with different options */
155 { "rw", 1, MS_RDONLY }, /* read-write */
156 { "ro", 0, MS_RDONLY }, /* read-only */
157 { "exec", 1, MS_NOEXEC }, /* permit execution of binaries */
158 { "noexec", 0, MS_NOEXEC }, /* don't execute binaries */
159 { "suid", 1, MS_NOSUID }, /* honor suid executables */
160 { "nosuid", 0, MS_NOSUID }, /* don't honor suid executables */
161 { "dev", 1, MS_NODEV }, /* interpret device files */
162 { "nodev", 0, MS_NODEV }, /* don't interpret devices */
163 { "sync", 0, MS_SYNCHRONOUS}, /* synchronous I/O */
164 { "async", 1, MS_SYNCHRONOUS}, /* asynchronous I/O */
165 { "atime", 1, MS_NOATIME }, /* set file access time on read */
166 { "noatime", 0, MS_NOATIME }, /* do not set file access time on read */
168 { "diratime", 1, MS_NODIRATIME }, /* update directory access times */
169 { "nodiratime",0,MS_NODIRATIME }, /* do not update directory access times */
172 { "relatime", 0, MS_RELATIME }, /* update atime relative to mtime/ctime */
173 { "norelatime",1,MS_RELATIME }, /* do not use relative atime updates */
175 #ifdef MS_STRICTATIME
176 { "strictatime",0,MS_STRICTATIME }, /* update access time strictly */
178 { "auto", 0, 0 }, /* Can be mounted using -a */
179 { "noauto", 0, 0 }, /* Can only be mounted explicitly */
180 { "nousers", 1, 0 }, /* Forbid ordinary user to mount */
181 { "nouser", 1, 0 }, /* Forbid ordinary user to mount */
182 { "noowner", 1, 0 }, /* Device owner has no special privs */
183 { "_netdev", 0, 0 }, /* Device accessible only via network */
187 /****************************************************************************/
/* Looks `check` up in opt_map[] and adjusts the flag word behind flagp.
 * The comparison covers only strlen(opt->opt) characters, i.e. it is a prefix
 * match: any option string that merely begins with a table name matches that
 * entry. NOTE(review): for example a string starting with "dev" matches the
 * "dev" entry, and "sync" is listed ahead of longer names -- confirm that
 * this is intended.
 * The visible branch clears opt->mask from *flagp (presumably the inverted
 * case; the branch that sets the bits is not shown in this excerpt).
 * Return value convention: */
189 /* 1 = don't pass on to lustre
190 0 = pass on to lustre */
191 static int parse_one_option(const char *check, int *flagp)
193 const struct opt_map *opt;
195 for (opt = &opt_map[0]; opt->opt != NULL; opt++) {
196 if (strncmp(check, opt->opt, strlen(opt->opt)) == 0) {
199 *flagp &= ~(opt->mask);
206 /* Assume any unknown options are valid and pass them on. The mount
207 will fail if lmd_parse, ll_options or ldiskfs doesn't recognize it.*/
/* Appends `one` to the comma separated list held in `options`.
 * A comma is written before `one` (the condition guarding it, if any, is not
 * shown in this excerpt). Both writes use strcat(), so the caller has to
 * guarantee that `options` is NUL terminated and has room for the comma,
 * `one` and the terminating NUL. */
211 static void append_option(char *options, const char *one)
214 strcat(options, ",");
215 strcat(options, one);
218 /* Replace options with subset of Lustre-specific options, and
219 fill in mount flags */
/* orig_options is split at commas with strsep(), which overwrites the commas
 * in place. Each piece is then handled as follows:
 *   md_stripe_cache_size=<n>  stored in the global md_stripe_cache_size
 *   retry=<n>                 stored in mop->mo_retry, capped at MAX_RETRIES
 *   mgssec=...                appended to the output list unchanged
 *   force                     handled locally (prints mop->mo_force)
 *   anything else             given to parse_one_option(); appended to the
 *                             output list only when that returns 0
 * The output list is collected in a calloc()ed scratch buffer of
 * strlen(orig_options) + 1 bytes and finally copied back over orig_options
 * with strcpy(). When MS_STRICTATIME is defined it is OR-ed into *flagp
 * unless MS_NOATIME or MS_RELATIME is already set there.
 * The names are compared with strncmp(), so they are prefix matches, and the
 * numbers are converted with atoi(), which cannot report malformed input.
 * The return value and the assignment of `arg` are not visible in this
 * excerpt. */
220 int parse_options(struct mount_opts *mop, char *orig_options, int *flagp)
222 char *options, *opt, *nextopt, *arg, *val;
224 options = calloc(strlen(orig_options) + 1, 1);
226 nextopt = orig_options;
227 while ((opt = strsep(&nextopt, ","))) {
232 /* Handle retries in a slightly different
235 val = strchr(opt, '=');
236 /* please note that some ldiskfs mount options are also in the form
237 * of param=value. We should pay attention not to remove those
238 * mount options, see bug 22097. */
239 if (val && strncmp(arg, "md_stripe_cache_size", 20) == 0) {
240 md_stripe_cache_size = atoi(val + 1);
241 } else if (val && strncmp(arg, "retry", 5) == 0) {
242 mop->mo_retry = atoi(val + 1);
243 if (mop->mo_retry > MAX_RETRIES)
244 mop->mo_retry = MAX_RETRIES;
245 else if (mop->mo_retry < 0)
247 } else if (val && strncmp(arg, "mgssec", 6) == 0) {
248 append_option(options, opt);
249 } else if (strcmp(opt, "force") == 0) {
250 //XXX special check for 'force' option
252 printf("force: %d\n", mop->mo_force);
253 } else if (parse_one_option(opt, flagp) == 0) {
254 /* pass this on as an option */
255 append_option(options, opt);
258 #ifdef MS_STRICTATIME
259 /* set strictatime to default if NOATIME or RELATIME
260 not given explicit */
261 if (!(*flagp & (MS_NOATIME | MS_RELATIME)))
262 *flagp |= MS_STRICTATIME;
264 strcpy(orig_options, options);
/* Opens `path` for reading and reads one line of at most size - 1 characters
 * into `buf` with fgets(). A failed fgets() is reported on stderr together
 * with strerror(errno). The handling of a failed fopen() and the return
 * values are not visible in this excerpt. */
270 int read_file(char *path, char *buf, int size)
274 fd = fopen(path, "r");
278 /* should not ignore fgets(3)'s return value */
279 if (!fgets(buf, size, fd)) {
280 fprintf(stderr, "reading from %s: %s", path, strerror(errno));
/* Opens `path` for writing (mode "w"); presumably writes `buf` to it, but the
 * rest of the body and the return values are not visible in this excerpt. */
288 int write_file(char *path, char *buf)
292 fd = fopen(path, "w");
301 /* This is to tune the kernel for good SCSI performance.
302 * For that we set the value of /sys/block/{dev}/queue/max_sectors_kb
303 * to the value of /sys/block/{dev}/queue/max_hw_sectors_kb */
/* Visible steps:
 *  1. Resolve `source` with realpath(); a failure is reported as a warning.
 *     Paths starting with /dev/loop, and names that do not start with a
 *     slash but contain a comma or colon, are special-cased (the action
 *     taken is not visible in this excerpt).
 *  2. Find the directory under /sys/block for the device: first by name,
 *     using the resolved path with its first four characters skipped; if
 *     that is not accessible, trailing digits are chopped from the name
 *     (except under /dev/mapper), the device is stat()ed, and every
 *     /sys/block entry is scanned for a dev file whose major:minor pair
 *     matches.
 *  3. For a path starting with /dev/md, read a tunable and overwrite it with
 *     md_stripe_cache_size, unless the current value is already at least
 *     that large, then stop.
 *  4. Otherwise copy the content of queue/max_hw_sectors_kb into
 *     queue/max_sectors_kb.
 *  5. Glob the slaves directory of the device and call this function again
 *     for /dev/<slave> with fan_out 0.
 * fan_out: main() passes 1 and the recursive call passes 0; the test on it
 * is not visible in this excerpt. Return values are not visible either. */
304 int set_blockdev_tunables(char *source, int fan_out)
306 glob_t glob_info = { 0 };
307 struct stat stat_buf;
308 char *chk_major, *chk_minor;
309 char *savept = NULL, *dev;
311 char buf[PATH_MAX] = {'\0'}, path[PATH_MAX] = {'\0'};
312 char real_path[PATH_MAX] = {'\0'};
319 ret_path = realpath(source, real_path);
320 if (ret_path == NULL) {
322 fprintf(stderr, "warning: %s: cannot resolve: %s\n",
323 source, strerror(errno));
327 if (strncmp(real_path, "/dev/loop", 9) == 0)
330 if ((real_path[0] != '/') && (strpbrk(real_path, ",:") != NULL))
333 snprintf(path, sizeof(path), "/sys/block%s", real_path + 4);
334 if (access(path, X_OK) == 0)
337 /* The name of the device say 'X' specified in /dev/X may not
338 * match any entry under /sys/block/. In that case we need to
339 * match the major/minor number to find the entry under
340 * sys/block corresponding to /dev/X */
342 /* Don't chop tail digit on /dev/mapper/xxx, LU-478 */
343 if (strncmp(real_path, "/dev/mapper", 11) != 0) {
344 dev = real_path + strlen(real_path);
345 while (--dev > real_path && isdigit(*dev))
348 if (strncmp(real_path, "/dev/md_", 8) == 0)
352 rc = stat(real_path, &stat_buf);
355 fprintf(stderr, "warning: %s, device %s stat failed\n",
356 strerror(errno), real_path);
360 major = major(stat_buf.st_rdev);
361 minor = minor(stat_buf.st_rdev);
362 rc = glob("/sys/block/*", GLOB_NOSORT, NULL, &glob_info);
365 fprintf(stderr, "warning: failed to read entries under "
367 globfree(&glob_info);
371 for (i = 0; i < glob_info.gl_pathc; i++){
372 snprintf(path, sizeof(path), "%s/dev", glob_info.gl_pathv[i]);
374 rc = read_file(path, buf, sizeof(buf));
/* NOTE(review): strlen(buf) - 1 is computed in size_t, so an empty buf would
 * index far out of bounds here; this relies on read_file() having stored at
 * least one character. */
378 if (buf[strlen(buf) - 1] == '\n')
379 buf[strlen(buf) - 1] = '\0';
381 chk_major = strtok_r(buf, ":", &savept);
383 if (major == atoi(chk_major) &&minor == atoi(chk_minor))
387 if (i == glob_info.gl_pathc) {
389 fprintf(stderr,"warning: device %s does not match any "
390 "entry under /sys/block\n", real_path);
391 globfree(&glob_info);
395 /* Chop off "/dev" from path we found */
396 path[strlen(glob_info.gl_pathv[i])] = '\0';
397 globfree(&glob_info);
400 if (strncmp(real_path, "/dev/md", 7) == 0) {
401 snprintf(real_path, sizeof(real_path), "%s/%s", path,
404 rc = read_file(real_path, buf, sizeof(buf));
407 fprintf(stderr, "warning: opening %s: %s\n",
408 real_path, strerror(errno));
412 if (atoi(buf) >= md_stripe_cache_size)
/* NOTE(review): the subtraction is unsigned, so this test is true for every
 * length except exactly 1 (an empty string wraps around to a huge value). */
415 if (strlen(buf) - 1 > 0) {
416 snprintf(buf, sizeof(buf), "%d", md_stripe_cache_size);
417 rc = write_file(real_path, buf);
419 fprintf(stderr, "warning: opening %s: %s\n",
420 real_path, strerror(errno));
422 /* Return since raid and disk tunables are different */
426 snprintf(real_path, sizeof(real_path), "%s/%s", path,
427 MAX_HW_SECTORS_KB_PATH);
428 rc = read_file(real_path, buf, sizeof(buf));
431 fprintf(stderr, "warning: opening %s: %s\n",
432 real_path, strerror(errno));
433 /* No MAX_HW_SECTORS_KB_PATH isn't necessarily an
434 * error for some device. */
/* NOTE(review): same unsigned length test as in the md branch. */
438 if (strlen(buf) - 1 > 0) {
439 snprintf(real_path, sizeof(real_path), "%s/%s", path,
440 MAX_SECTORS_KB_PATH);
441 rc = write_file(real_path, buf);
444 fprintf(stderr, "warning: writing to %s: %s\n",
445 real_path, strerror(errno));
446 /* No MAX_SECTORS_KB_PATH isn't necessarily an
447 * error for some device. */
454 glob_info.gl_pathc = 0;
455 glob_info.gl_offs = 0;
456 /* if device is multipath device, tune its slave devices */
457 snprintf(real_path, sizeof(real_path), "%s/slaves/*", path);
458 rc = glob(real_path, GLOB_NOSORT, NULL, &glob_info);
/* The loop stops at the first slave for which the recursive call returns a
 * non-zero value, because rc is part of the loop condition. */
460 for (i = 0; rc == 0 && i < glob_info.gl_pathc; i++){
461 slave = basename(glob_info.gl_pathv[i]);
462 snprintf(real_path, sizeof(real_path), "/dev/%s", slave);
463 rc = set_blockdev_tunables(real_path, 0);
466 if (rc == GLOB_NOMATCH) {
467 /* no slave device is not an error */
469 } else if (rc && verbose) {
471 fprintf(stderr, "warning: %s, failed to read"
472 " entries under %s/slaves\n",
473 strerror(errno), path);
475 fprintf(stderr, "unable to set tunables for"
476 " slave device %s (slave would be"
477 " unable to handle IO request from"
482 globfree(&glob_info);
/* Asks osd_is_lustre() about `source`, passing the address of
 * mop->mo_ldd.ldd_mount_type for it to fill in. The message below tells the
 * user that the device was not formatted with mkfs.lustre or uses an
 * unsupported backend; the condition that selects it, the use of `options`
 * and the return values are not visible in this excerpt. */
488 static int parse_ldd(char *source, struct mount_opts *mop, char *options)
490 struct lustre_disk_data *ldd = &mop->mo_ldd;
493 rc = osd_is_lustre(source, &ldd->ldd_mount_type);
495 fprintf(stderr, "%s: %s has not been formatted with mkfs.lustre"
496 " or the backend filesystem type is not supported by "
497 "this tool\n", progname, source);
/* Zeroes *mop and then sets the defaults that matter explicitly: no source
 * strings, no MGS NID seen, a stripe cache size of 16384 and an empty
 * original option string (a string literal, so it must not be written to).
 * NOTE(review): parse_options() and set_blockdev_tunables() use the global
 * md_stripe_cache_size, not mop->mo_md_stripe_cache_size -- confirm which of
 * the two is meant to be authoritative. */
504 static void set_defaults(struct mount_opts *mop)
506 memset(mop, 0, sizeof(*mop));
507 mop->mo_usource = NULL;
508 mop->mo_source = NULL;
513 mop->mo_have_mgsnid = 0;
514 mop->mo_md_stripe_cache_size = 16384;
515 mop->mo_orig_options = "";
/* Parses the command line into *mop.
 * Options are read with getopt_long() using the short string "fhno:v" and
 * the long_opt table; the argument of -o is stored in mop->mo_orig_options
 * without being copied. Two positional arguments are required: the device
 * (argv[optind]) and the mount point (argv[optind + 1]).
 * The device is kept in mop->mo_usource; when realpath() can resolve it, a
 * strdup()ed copy of the resolved path is stored instead. If the last
 * component of the resolved path starts with dm- followed by a digit, the
 * device-mapper name is read from /sys/block/<component>/dm/name and a
 * /dev/mapper/<name> string is built in real_path (how that string is used
 * afterwards is not visible in this excerpt).
 * mop->mo_source is the result of convert_hostnames(mop->mo_usource), and the
 * mount point is resolved into mop->mo_target with realpath(); a failure to
 * resolve it produces a warning.
 * NOTE(review): the strdup() result is not checked in the visible lines.
 * Return values are not visible in this excerpt. */
518 static int parse_opts(int argc, char *const argv[], struct mount_opts *mop)
520 static struct option long_opt[] = {
524 {"nomtab", 0, 0, 'n'},
525 {"options", 1, 0, 'o'},
526 {"verbose", 0, 0, 'v'},
529 char real_path[PATH_MAX] = {'\0'};
531 char path[256], name[256];
536 while ((opt = getopt_long(argc, argv, "fhno:v",
537 long_opt, NULL)) != EOF){
541 printf("force: %d\n", mop->mo_force);
545 printf("fake: %d\n", mop->mo_fake);
552 printf("nomtab: %d\n", mop->mo_nomtab);
555 mop->mo_orig_options = optarg;
561 fprintf(stderr, "%s: unknown option '%c'\n",
568 if (optind + 2 > argc) {
569 fprintf(stderr, "%s: too few arguments\n", progname);
573 mop->mo_usource = argv[optind];
574 if (!mop->mo_usource) {
579 * Try to get the real path to the device, in case it is a
580 * symbolic link for instance
582 if (realpath(mop->mo_usource, real_path) != NULL) {
583 mop->mo_usource = strdup(real_path);
585 ptr = strrchr(real_path, '/');
586 if (ptr && strncmp(ptr, "/dm-", 4) == 0 && isdigit(*(ptr + 4))) {
587 snprintf(path, sizeof(path), "/sys/block/%s/dm/name", ptr+1);
588 if ((f = fopen(path, "r"))) {
589 /* read "<name>\n" from sysfs */
590 if (fgets(name, sizeof(name), f) && (sz = strlen(name)) > 1) {
592 snprintf(real_path, sizeof(real_path), "/dev/mapper/%s", name);
599 mop->mo_source = convert_hostnames(mop->mo_usource);
600 if (!mop->mo_source) {
604 if (realpath(argv[optind + 1], mop->mo_target) == NULL) {
606 fprintf(stderr, "warning: %s: cannot resolve: %s\n",
607 argv[optind + 1], strerror(errno));
/* Entry point of the mount helper. Visible flow:
 *  1. progname is set to the part of argv[0] after the last slash.
 *  2. parse_opts() fills mop; the arguments and parsed values are printed
 *     (the condition guarding the printing is not visible in this excerpt).
 *  3. mo_orig_options is copied into a malloc(MAXOPT) buffer and filtered by
 *     parse_options(), which also fills `flags`.
 *  4. check_mtab_entry() is consulted: an existing entry without MS_REMOUNT,
 *     or a missing entry with MS_REMOUNT, is reported as an error.
 *  5. The mount point must pass access(F_OK).
 *  6. When mo_usource does not contain a colon followed by a slash,
 *     parse_ldd() is run on the source.
 *  7. A device=<mo_source> item is appended to the option string.
 *  8. Under the same condition as step 6, set_blockdev_tunables() is called;
 *     a failure only produces a message.
 *  9. mount(2) is called with type "lustre" up to mo_retry + 1 times, with a
 *     sleep between attempts.
 * 10. After a failed mount, hints are printed depending on errno, and a
 *     source starting with /dev/loop is detached with /sbin/losetup -d run
 *     through system(). In the other branch the mtab is updated with
 *     update_mtab_entry() unless mo_nomtab is set.
 * NOTE(review): the retry delay is sleep(1 << max((i/2), 5)); max() makes the
 * shift count at least 5 (32 seconds) and lets it grow with i, which can
 * exceed the width of int -- a min() may have been intended.
 * NOTE(review): the strcpy() into `options`, the strcat() of mo_source and
 * the sprintf() into `cmd` are not length checked in the visible lines;
 * confirm the buffer sizes (MAXOPT and the size of cmd are defined outside
 * this excerpt). */
614 int main(int argc, char *const argv[])
616 struct mount_opts mop;
620 progname = strrchr(argv[0], '/');
621 progname = progname ? progname + 1 : argv[0];
629 rc = parse_opts(argc, argv, &mop);
634 for (i = 0; i < argc; i++)
635 printf("arg[%d] = %s\n", i, argv[i]);
636 printf("source = %s (%s), target = %s\n", mop.mo_usource,
637 mop.mo_source, mop.mo_target);
638 printf("options = %s\n", mop.mo_orig_options);
641 options = malloc(MAXOPT);
642 if (options == NULL) {
643 fprintf(stderr, "can't allocate memory for options\n");
646 strcpy(options, mop.mo_orig_options);
647 rc = parse_options(&mop, options, &flags);
649 fprintf(stderr, "%s: can't parse options: %s\n",
655 rc = check_mtab_entry(mop.mo_usource, mop.mo_source,
656 mop.mo_target, "lustre");
657 if (rc && !(flags & MS_REMOUNT)) {
658 fprintf(stderr, "%s: according to %s %s is "
659 "already mounted on %s\n", progname, MOUNTED,
660 mop.mo_usource, mop.mo_target);
663 if (!rc && (flags & MS_REMOUNT)) {
664 fprintf(stderr, "%s: according to %s %s is "
665 "not already mounted on %s\n", progname, MOUNTED,
666 mop.mo_usource, mop.mo_target);
670 if (flags & MS_REMOUNT)
673 rc = access(mop.mo_target, F_OK);
676 fprintf(stderr, "%s: %s inaccessible: %s\n", progname,
677 mop.mo_target, strerror(errno));
681 if (!strstr(mop.mo_usource, ":/")) {
682 rc = parse_ldd(mop.mo_source, &mop, options);
687 /* In Linux 2.4, the target device doesn't get passed to any of our
688 functions. So we'll stick it on the end of the options. */
689 append_option(options, "device=");
690 strcat(options, mop.mo_source);
693 printf("mounting device %s at %s, flags=%#x options=%s\n",
694 mop.mo_source, mop.mo_target, flags, options);
696 if (!strstr(mop.mo_usource, ":/") && set_blockdev_tunables(mop.mo_source, 1)) {
698 fprintf(stderr, "%s: unable to set tunables for %s"
699 " (may cause reduced IO performance)\n",
700 argv[0], mop.mo_source);
704 /* flags and target get to lustre_get_sb, but not
705 lustre_fill_super. Lustre ignores the flags, but mount
707 for (i = 0, rc = -EAGAIN; i <= mop.mo_retry && rc != 0; i++) {
708 rc = mount(mop.mo_source, mop.mo_target, "lustre",
709 flags, (void *)options);
712 fprintf(stderr, "%s: mount %s at %s "
713 "failed: %s retries left: "
714 "%d\n", basename(progname),
715 mop.mo_usource, mop.mo_target,
721 sleep(1 << max((i/2), 5));
735 cli = strrchr(mop.mo_usource, ':');
736 if (cli && (strlen(cli) > 2))
741 fprintf(stderr, "%s: mount %s at %s failed: %s\n", progname,
742 mop.mo_usource, mop.mo_target, strerror(errno));
744 fprintf(stderr, "Are the lustre modules loaded?\n"
745 "Check /etc/modprobe.conf and "
746 "/proc/filesystems\n");
747 if (errno == ENOTBLK)
748 fprintf(stderr, "Do you need -o loop?\n");
749 if (errno == ENOMEDIUM)
751 "This filesystem needs at least 1 OST\n");
752 if (errno == ENOENT) {
753 fprintf(stderr, "Is the MGS specification correct?\n");
754 fprintf(stderr, "Is the filesystem name correct?\n");
755 fprintf(stderr, "If upgrading, is the copied client log"
756 " valid? (see upgrade docs)\n");
758 if (errno == EALREADY)
759 fprintf(stderr, "The target service is already running."
760 " (%s)\n", mop.mo_usource);
762 fprintf(stderr, "The target service failed to start "
763 "(bad config log?) (%s). "
764 "See /var/log/messages.\n", mop.mo_usource);
766 fprintf(stderr, "Is the MGS running?\n");
767 if (errno == EADDRINUSE)
768 fprintf(stderr, "The target service's index is already "
769 "in use. (%s)\n", mop.mo_usource);
770 if (errno == EINVAL) {
771 fprintf(stderr, "This may have multiple causes.\n");
773 fprintf(stderr, "Is '%s' the correct filesystem"
775 fprintf(stderr, "Are the mount options correct?\n");
776 fprintf(stderr, "Check the syslog for more info.\n");
779 /* May as well try to clean up loop devs */
780 if (strncmp(mop.mo_usource, "/dev/loop", 9) == 0) {
783 sprintf(cmd, "/sbin/losetup -d %s", mop.mo_usource);
784 if ((ret = system(cmd)) < 0)
787 rc = WEXITSTATUS(ret);
790 } else if (!mop.mo_nomtab) {
791 rc = update_mtab_entry(mop.mo_usource, mop.mo_target, "lustre",
792 mop.mo_orig_options, 0,0,0);
796 /* mo_usource should be freed, but we can rely on the kernel */