X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Futils%2Fmount_lustre.c;h=f622728265f7a6464dc757e462760181f9c16f5f;hp=74d2910ef3df3e5f3312a068bfcfcdfd5b4368c3;hb=536981b0f297a2fa4ae53d4ab81b38e183ee43c0;hpb=fb1c60ab9ff510024275dababe55ddf325f7ca1f diff --git a/lustre/utils/mount_lustre.c b/lustre/utils/mount_lustre.c index 74d2910..f622728 100644 --- a/lustre/utils/mount_lustre.c +++ b/lustre/utils/mount_lustre.c @@ -1,29 +1,47 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2002 Cluster File Systems, Inc. - * Author: Robert Read - * Author: Nathan Rutman + * GPL HEADER START * - * This file is part of Lustre, http://www.lustre.org. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/utils/mount_lustre.c + * + * Author: Robert Read + * Author: Nathan Rutman */ - +#ifndef _GNU_SOURCE #define _GNU_SOURCE +#endif #include #include #include @@ -31,21 +49,28 @@ #include #include #include +#include #include #include -#include #include "obdctl.h" #include #include +#include +#include +#include "mount_utils.h" #define MAX_HW_SECTORS_KB_PATH "queue/max_hw_sectors_kb" #define MAX_SECTORS_KB_PATH "queue/max_sectors_kb" +#define STRIPE_CACHE_SIZE "md/stripe_cache_size" +#define MAX_RETRIES 99 int verbose = 0; int nomtab = 0; int fake = 0; int force = 0; -static char *progname = NULL; +int retry = 0; +int md_stripe_cache_size = 16384; +char *progname = NULL; void usage(FILE *out) { @@ -60,20 +85,25 @@ void usage(FILE *out) "\t: name of the Lustre filesystem (e.g. lustre1)\n" "\t: filesystem mountpoint (e.g. /mnt/lustre)\n" "\t-f|--fake: fake mount (updates /etc/mtab)\n" - "\t--force: force mount even if already in /etc/mtab\n" + "\t-o force|--force: force mount even if already in /etc/mtab\n" "\t-h|--help: print this usage message\n" "\t-n|--nomtab: do not update /etc/mtab after mount\n" "\t-v|--verbose: print verbose config settings\n" "\t: one or more comma separated of:\n" "\t\t(no)flock,(no)user_xattr,(no)acl\n" + "\t\tabort_recov: abort server recovery handling\n" "\t\tnosvc: only start MGC/MGS obds\n" + "\t\tnomgs: only start target obds, using existing MGS\n" "\t\texclude=[:] : colon-separated list of " "inactive OSTs (e.g. lustre-OST0001)\n" + "\t\tretry=: number of times mount is retried by client\n" + "\t\tmd_stripe_cache_size=: set the raid stripe cache " + "size for the underlying raid if present\n" ); exit((out != stdout) ? EINVAL : 0); } -static int check_mtab_entry(char *spec, char *mtpt, char *type) +static int check_mtab_entry(char *spec1, char *spec2, char *mtpt, char *type) { FILE *fp; struct mntent *mnt; @@ -83,7 +113,8 @@ static int check_mtab_entry(char *spec, char *mtpt, char *type) return(0); while ((mnt = getmntent(fp)) != NULL) { - if (strcmp(mnt->mnt_fsname, spec) == 0 && + if ((strcmp(mnt->mnt_fsname, spec1) == 0 || + strcmp(mnt->mnt_fsname, spec2) == 0) && strcmp(mnt->mnt_dir, mtpt) == 0 && strcmp(mnt->mnt_type, type) == 0) { endmntent(fp); @@ -137,6 +168,11 @@ static char *convert_hostnames(char *s1) lnet_nid_t nid; converted = malloc(left); + if (converted == NULL) { + fprintf(stderr, "out of memory: needed %d bytes\n", + MAXNIDSTR); + return NULL; + } c = converted; while ((left > 0) && (*s1 != '/')) { s2 = strpbrk(s1, ",:"); @@ -186,13 +222,28 @@ static const struct opt_map opt_map[] = { { "nosuid", 0, MS_NOSUID }, /* don't honor suid executables */ { "dev", 1, MS_NODEV }, /* interpret device files */ { "nodev", 0, MS_NODEV }, /* don't interpret devices */ + { "sync", 0, MS_SYNCHRONOUS}, /* synchronous I/O */ { "async", 1, MS_SYNCHRONOUS}, /* asynchronous I/O */ + { "atime", 1, MS_NOATIME }, /* set file access time on read */ + { "noatime", 0, MS_NOATIME }, /* do not set file access time on read */ +#ifdef MS_NODIRATIME + { "diratime", 1, MS_NODIRATIME }, /* set file access time on read */ + { "nodiratime",0,MS_NODIRATIME }, /* do not set file access time on read */ +#endif +#ifdef MS_RELATIME + { "relatime", 0, MS_RELATIME }, /* set file access time on read */ + { "norelatime",1,MS_RELATIME }, /* do not set file access time on read */ +#endif +#ifdef MS_STRICTATIME + { "strictatime",0,MS_STRICTATIME }, /* update access time strictly */ +#endif { "auto", 0, 0 }, /* Can be mounted using -a */ { "noauto", 0, 0 }, /* Can only be mounted explicitly */ { "nousers", 1, 0 }, /* Forbid ordinary user to mount */ { "nouser", 1, 0 }, /* Forbid ordinary user to mount */ { "noowner", 1, 0 }, /* Device owner has no special privs */ { "_netdev", 0, 0 }, /* Device accessible only via network */ + { "loop", 0, 0 }, { NULL, 0, 0 } }; /****************************************************************************/ @@ -219,11 +270,18 @@ static int parse_one_option(const char *check, int *flagp) return 0; } +static void append_option(char *options, const char *one) +{ + if (*options) + strcat(options, ","); + strcat(options, one); +} + /* Replace options with subset of Lustre-specific options, and fill in mount flags */ int parse_options(char *orig_options, int *flagp) { - char *options, *opt, *nextopt; + char *options, *opt, *nextopt, *arg, *val; options = calloc(strlen(orig_options) + 1, 1); *flagp = 0; @@ -232,13 +290,39 @@ int parse_options(char *orig_options, int *flagp) if (!*opt) /* empty option */ continue; - if (parse_one_option(opt, flagp) == 0) { + + /* Handle retries in a slightly different + * manner */ + arg = opt; + val = strchr(opt, '='); + /* please note that some ldiskfs mount options are also in the form + * of param=value. We should pay attention not to remove those + * mount options, see bug 22097. */ + if (val && strncmp(arg, "md_stripe_cache_size", 20) == 0) { + md_stripe_cache_size = atoi(val + 1); + } else if (val && strncmp(arg, "retry", 5) == 0) { + retry = atoi(val + 1); + if (retry > MAX_RETRIES) + retry = MAX_RETRIES; + else if (retry < 0) + retry = 0; + } else if (val && strncmp(arg, "mgssec", 6) == 0) { + append_option(options, opt); + } else if (strcmp(opt, "force") == 0) { + //XXX special check for 'force' option + ++force; + printf("force: %d\n", force); + } else if (parse_one_option(opt, flagp) == 0) { /* pass this on as an option */ - if (*options) - strcat(options, ","); - strcat(options, opt); + append_option(options, opt); } } +#ifdef MS_STRICTATIME + /* set strictatime to default if NOATIME or RELATIME + not given explicit */ + if (!(*flagp & (MS_NOATIME | MS_RELATIME))) + *flagp |= MS_STRICTATIME; +#endif strcpy(orig_options, options); free(options); return 0; @@ -253,7 +337,12 @@ int read_file(char *path, char *buf, int size) if (fd == NULL) return errno; - fgets(buf, size, fd); + /* should not ignore fgets(3)'s return value */ + if (!fgets(buf, size, fd)) { + fprintf(stderr, "reading from %s: %s", path, strerror(errno)); + fclose(fd); + return 1; + } fclose(fd); return 0; } @@ -274,62 +363,59 @@ int write_file(char *path, char *buf) /* This is to tune the kernel for good SCSI performance. * For that we set the value of /sys/block/{dev}/queue/max_sectors_kb * to the value of /sys/block/{dev}/queue/max_hw_sectors_kb */ -int set_tunables(char *source, int src_len) +int set_blockdev_tunables(char *source, int fan_out) { - glob_t glob_info; + glob_t glob_info = { 0 }; struct stat stat_buf; char *chk_major, *chk_minor; - char *savept, *dev, *s2 = 0; - char buf[PATH_MAX], path[PATH_MAX]; + char *savept, *dev; + char *ret_path; + char buf[PATH_MAX] = {'\0'}, path[PATH_MAX] = {'\0'}; + char real_path[PATH_MAX] = {'\0'}; int i, rc = 0; int major, minor; if (!source) return -EINVAL; - if (strncmp(source, "/dev/loop", 9) == 0) - return 0; + ret_path = realpath(source, real_path); + if (ret_path == NULL) { + if (verbose) + fprintf(stderr, "warning: %s: cannot resolve: %s\n", + source, strerror(errno)); + return -EINVAL; + } - if ((*source != '/') && ((s2 = strpbrk(source, ",:")) != NULL)) + if (strncmp(real_path, "/dev/loop", 9) == 0) return 0; - dev = source + src_len - 1; - while (dev > source && (*dev != '/')) { - if (isdigit(*dev)) - *dev = 0; - dev--; - } - snprintf(path, sizeof(path), "/sys/block%s/%s", dev, - MAX_HW_SECTORS_KB_PATH); - rc = read_file(path, buf, sizeof(buf)); - if (rc == 0 && (strlen(buf) - 1) > 0) { - snprintf(path, sizeof(path), "/sys/block%s/%s", dev, - MAX_SECTORS_KB_PATH); - rc = write_file(path, buf); - if (rc && verbose) - fprintf(stderr, "warning: opening %s: %s\n", - path, strerror(errno)); - return rc; - } + if ((real_path[0] != '/') && (strpbrk(real_path, ",:") != NULL)) + return 0; - if (rc != ENOENT) - return rc; + snprintf(path, sizeof(path), "/sys/block%s", real_path + 4); + if (access(path, X_OK) == 0) + goto set_params; /* The name of the device say 'X' specified in /dev/X may not * match any entry under /sys/block/. In that case we need to * match the major/minor number to find the entry under * sys/block corresponding to /dev/X */ - dev = source + src_len - 1; - while (dev > source) { - if (isdigit(*dev)) + + /* Don't chop tail digit on /dev/mapper/xxx, LU-478 */ + if (strncmp(real_path, "/dev/mapper", 11) != 0) { + dev = real_path + strlen(real_path); + while (--dev > real_path && isdigit(*dev)) + *dev = 0; + + if (strncmp(real_path, "/dev/md_", 8) == 0) *dev = 0; - dev--; } - rc = stat(dev, &stat_buf); + rc = stat(real_path, &stat_buf); if (rc) { - fprintf(stderr, "warning: %s, device %s stat failed\n", - strerror(errno), dev); + if (verbose) + fprintf(stderr, "warning: %s, device %s stat failed\n", + strerror(errno), real_path); return rc; } @@ -337,8 +423,10 @@ int set_tunables(char *source, int src_len) minor = minor(stat_buf.st_rdev); rc = glob("/sys/block/*", GLOB_NOSORT, NULL, &glob_info); if (rc) { - fprintf(stderr, "warning: failed to read entries under " - "/sys/block\n"); + if (verbose) + fprintf(stderr, "warning: failed to read entries under " + "/sys/block\n"); + globfree(&glob_info); return rc; } @@ -361,35 +449,113 @@ int set_tunables(char *source, int src_len) if (i == glob_info.gl_pathc) { if (verbose) fprintf(stderr,"warning: device %s does not match any " - "entry under /sys/block\n", source); + "entry under /sys/block\n", real_path); + globfree(&glob_info); return -EINVAL; } - snprintf(path, sizeof(path), "%s/%s", glob_info.gl_pathv[i], + /* Chop off "/dev" from path we found */ + path[strlen(glob_info.gl_pathv[i])] = '\0'; + globfree(&glob_info); + +set_params: + if (strncmp(real_path, "/dev/md", 7) == 0) { + snprintf(real_path, sizeof(real_path), "%s/%s", path, + STRIPE_CACHE_SIZE); + + rc = read_file(real_path, buf, sizeof(buf)); + if (rc) { + if (verbose) + fprintf(stderr, "warning: opening %s: %s\n", + real_path, strerror(errno)); + return 0; + } + + if (atoi(buf) >= md_stripe_cache_size) + return 0; + + if (strlen(buf) - 1 > 0) { + snprintf(buf, sizeof(buf), "%d", md_stripe_cache_size); + rc = write_file(real_path, buf); + if (rc && verbose) + fprintf(stderr, "warning: opening %s: %s\n", + real_path, strerror(errno)); + } + /* Return since raid and disk tunables are different */ + return rc; + } + + snprintf(real_path, sizeof(real_path), "%s/%s", path, MAX_HW_SECTORS_KB_PATH); - rc = read_file(path, buf, sizeof(buf)); + rc = read_file(real_path, buf, sizeof(buf)); if (rc) { if (verbose) fprintf(stderr, "warning: opening %s: %s\n", - path, strerror(errno)); - return rc; + real_path, strerror(errno)); + /* No MAX_HW_SECTORS_KB_PATH isn't necessary an + * error for some device. */ + rc = 0; } if (strlen(buf) - 1 > 0) { - snprintf(path, sizeof(path), "%s/%s", - glob_info.gl_pathv[i], MAX_SECTORS_KB_PATH); - rc = write_file(path, buf); - if (rc && verbose) - fprintf(stderr, "warning: writing to %s: %s\n", - path, strerror(errno)); + snprintf(real_path, sizeof(real_path), "%s/%s", path, + MAX_SECTORS_KB_PATH); + rc = write_file(real_path, buf); + if (rc) { + if (verbose) + fprintf(stderr, "warning: writing to %s: %s\n", + real_path, strerror(errno)); + /* No MAX_SECTORS_KB_PATH isn't necessary an + * error for some device. */ + rc = 0; + } } + + if (fan_out) { + char *slave = NULL; + glob_info.gl_pathc = 0; + glob_info.gl_offs = 0; + /* if device is multipath device, tune its slave devices */ + snprintf(real_path, sizeof(real_path), "%s/slaves/*", path); + rc = glob(real_path, GLOB_NOSORT, NULL, &glob_info); + + for (i = 0; rc == 0 && i < glob_info.gl_pathc; i++){ + slave = basename(glob_info.gl_pathv[i]); + snprintf(real_path, sizeof(real_path), "/dev/%s", slave); + rc = set_blockdev_tunables(real_path, 0); + } + + if (rc == GLOB_NOMATCH) { + /* no slave device is not an error */ + rc = 0; + } else if (rc && verbose) { + if (slave == NULL) { + fprintf(stderr, "warning: %s, failed to read" + " entries under %s/slaves\n", + strerror(errno), path); + } else { + fprintf(stderr, "unable to set tunables for" + " slave device %s (slave would be" + " unable to handle IO request from" + " master %s)\n", + real_path, source); + } + } + globfree(&glob_info); + } + return rc; } int main(int argc, char *const argv[]) { char default_options[] = ""; - char *usource, *source, *target, *ptr; + char *usource, *source, *ptr; + char target[PATH_MAX] = {'\0'}; + char real_path[PATH_MAX] = {'\0'}; + char path[256], name[256]; + FILE *f; + size_t sz; char *options, *optcopy, *orig_options = default_options; int i, nargs = 3, opt, rc, flags, optlen; static struct option long_opt[] = { @@ -448,26 +614,56 @@ int main(int argc, char *const argv[]) } usource = argv[optind]; - source = convert_hostnames(usource); - target = argv[optind + 1]; - ptr = target + strlen(target) - 1; - while ((ptr > target) && (*ptr == '/')) { - *ptr = 0; - ptr--; + if (!usource) { + usage(stderr); } - if (!usource || !source) { + /** + * Try to get the real path to the device, in case it is a + * symbolic link for instance + */ + if (realpath(usource, real_path) != NULL) { + usource = real_path; + + ptr = strrchr(real_path, '/'); + if (ptr && strncmp(ptr, "/dm-", 4) == 0 && isdigit(*(ptr + 4))) { + snprintf(path, sizeof(path), "/sys/block/%s/dm/name", ptr+1); + if ((f = fopen(path, "r"))) { + /* read "\n" from sysfs */ + if (fgets(name, sizeof(name), f) && (sz = strlen(name)) > 1) { + name[sz - 1] = '\0'; + snprintf(real_path, sizeof(real_path), "/dev/mapper/%s", name); + } + fclose(f); + } + } + } + + source = convert_hostnames(usource); + if (!source) { usage(stderr); } + if (realpath(argv[optind + 1], target) == NULL) { + rc = errno; + fprintf(stderr, "warning: %s: cannot resolve: %s\n", + argv[optind + 1], strerror(errno)); + return rc; + } + if (verbose) { for (i = 0; i < argc; i++) printf("arg[%d] = %s\n", i, argv[i]); - printf("source = %s (%s), target = %s\n", usource, source, target); + printf("source = %s (%s), target = %s\n", usource, source, + target); printf("options = %s\n", orig_options); } options = malloc(strlen(orig_options) + 1); + if (options == NULL) { + fprintf(stderr, "can't allocate memory for options\n"); + return -1; + } strcpy(options, orig_options); rc = parse_options(options, &flags); if (rc) { @@ -477,7 +673,7 @@ int main(int argc, char *const argv[]) } if (!force) { - rc = check_mtab_entry(usource, target, "lustre"); + rc = check_mtab_entry(usource, source, target, "lustre"); if (rc && !(flags & MS_REMOUNT)) { fprintf(stderr, "%s: according to %s %s is " "already mounted on %s\n", @@ -506,6 +702,10 @@ int main(int argc, char *const argv[]) functions. So we'll stick it on the end of the options. */ optlen = strlen(options) + strlen(",device=") + strlen(source) + 1; optcopy = malloc(optlen); + if (optcopy == NULL) { + fprintf(stderr, "can't allocate memory to optcopy\n"); + return -1; + } strcpy(optcopy, options); if (*optcopy) strcat(optcopy, ","); @@ -516,16 +716,40 @@ int main(int argc, char *const argv[]) printf("mounting device %s at %s, flags=%#x options=%s\n", source, target, flags, optcopy); - if (set_tunables(source, strlen(source)) && verbose) - fprintf(stderr, "%s: unable to set tunables for %s" - " (may cause reduced IO performance)", + if (!strstr(usource, ":/") && set_blockdev_tunables(source, 1)) { + if (verbose) + fprintf(stderr, "%s: unable to set tunables for %s" + " (may cause reduced IO performance)\n", argv[0], source); + } - if (!fake) + register_service_tags(usource, source, target); + + if (!fake) { /* flags and target get to lustre_get_sb, but not lustre_fill_super. Lustre ignores the flags, but mount does not. */ - rc = mount(source, target, "lustre", flags, (void *)optcopy); + for (i = 0, rc = -EAGAIN; i <= retry && rc != 0; i++) { + rc = mount(source, target, "lustre", flags, + (void *)optcopy); + if (rc) { + if (verbose) { + fprintf(stderr, "%s: mount %s at %s " + "failed: %s retries left: " + "%d\n", basename(progname), + usource, target, + strerror(errno), retry-i); + } + + if (retry) { + sleep(1 << max((i/2), 5)); + } + else { + rc = errno; + } + } + } + } if (rc) { char *cli; @@ -542,9 +766,8 @@ int main(int argc, char *const argv[]) usource, target, strerror(errno)); if (errno == ENODEV) fprintf(stderr, "Are the lustre modules loaded?\n" - "Check /etc/modprobe.conf and /proc/filesystems" - "\nNote 'alias lustre llite' should be removed" - " from modprobe.conf\n"); + "Check /etc/modprobe.conf and " + "/proc/filesystems\n"); if (errno == ENOTBLK) fprintf(stderr, "Do you need -o loop?\n"); if (errno == ENOMEDIUM) @@ -580,8 +803,12 @@ int main(int argc, char *const argv[]) /* May as well try to clean up loop devs */ if (strncmp(usource, "/dev/loop", 9) == 0) { char cmd[256]; + int ret; sprintf(cmd, "/sbin/losetup -d %s", usource); - system(cmd); + if ((ret = system(cmd)) < 0) + rc = errno; + else if (ret > 0) + rc = WEXITSTATUS(ret); } } else if (!nomtab) {