* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
* Use is subject to license terms.
*
- * Copyright (c) 2012, 2016, Intel Corporation.
+ * Copyright (c) 2012, 2017, Intel Corporation.
*/
/*
* This file is part of Lustre, http://www.lustre.org/
#include <string.h>
#include <inttypes.h>
#include <unistd.h>
+#include <dirent.h>
#include <fcntl.h>
#include <mntent.h>
-#include <glob.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mount.h>
#include <sys/utsname.h>
+#include <sys/sysmacros.h>
#include <string.h>
#include <getopt.h>
#ifndef BLKGETSIZE64
#include <linux/fs.h> /* for BLKGETSIZE64 */
#endif
+#include <linux/major.h>
#include <linux/types.h>
#include <linux/version.h>
#include <linux/lnet/lnetctl.h>
#include <linux/lustre/lustre_ver.h>
+#include <libcfs/util/string.h>
#ifdef HAVE_SELINUX
#include <selinux/selinux.h>
extern char *progname;
-#define L_BLOCK_SIZE 4096
/* keep it less than LL_FID_NAMELEN */
#define DUMMY_FILE_NAME_LEN 25
#define EXT3_DIRENT_SIZE DUMMY_FILE_NAME_LEN
}
#endif
-/* return canonicalized absolute pathname, even if the target file does not
- * exist, unlike realpath */
-static char *absolute_path(char *devname)
-{
- char buf[PATH_MAX + 1] = "";
- char *path;
- char *ptr;
- int len;
-
- path = malloc(sizeof(buf));
- if (path == NULL)
- return NULL;
-
- if (devname[0] != '/') {
- if (getcwd(buf, sizeof(buf) - 1) == NULL) {
- free(path);
- return NULL;
- }
- len = snprintf(path, sizeof(buf), "%s/%s", buf, devname);
- if (len >= sizeof(buf)) {
- free(path);
- return NULL;
- }
- } else {
- len = snprintf(path, sizeof(buf), "%s", devname);
- if (len >= sizeof(buf)) {
- free(path);
- return NULL;
- }
- }
-
- /* truncate filename before calling realpath */
- ptr = strrchr(path, '/');
- if (ptr == NULL) {
- free(path);
- return NULL;
- }
- *ptr = '\0';
- if (buf != realpath(path, buf)) {
- free(path);
- return NULL;
- }
- /* add the filename back */
- len = snprintf(path, PATH_MAX, "%s/%s", buf, ptr+1);
- if (len >= PATH_MAX) {
- free(path);
- return NULL;
- }
- return path;
-}
-
/* Determine if a device is a block device (as opposed to a file) */
static int is_block(char *devname)
{
int ret = 0;
char *devpath;
- devpath = absolute_path(devname);
- if (devpath == NULL) {
- fprintf(stderr, "%s: failed to resolve path to %s\n",
- progname, devname);
+ ret = cfs_abs_path(devname, &devpath);
+ if (ret != 0) {
+ fprintf(stderr, "%s: failed to resolve path '%s': %s\n",
+ progname, devname, strerror(-ret));
return -1;
}
int ldiskfs_write_ldd(struct mkfs_opts *mop)
{
char mntpt[] = "/tmp/mntXXXXXX";
- char filepnm[128];
+ char filepnm[192];
char *dev;
FILE *filep;
int ret = 0;
E2FSPROGS, feature);
#if !(HAVE_LDISKFSPROGS)
fprintf(stderr, "Please install the latest version of e2fsprogs from\n"
- "https://downloads.hpdd.intel.com/public/e2fsprogs/latest/\n"
+ "https://downloads.whamcloud.com/public/e2fsprogs/latest/\n"
"to enable this feature.\n");
#endif
if (make_backfs)
static int enable_default_ext4_features(struct mkfs_opts *mop, char *anchor,
size_t maxbuflen, int user_spec)
{
+ int enable_64bit = 0;
+
+ /* Enable large block addresses if the LUN is over 2^32 blocks. */
+ if (mop->mo_device_kb / mop->mo_blocksize_kb > 0xffffffffULL &&
+ is_e2fsprogs_feature_supp("-O 64bit") == 0)
+ enable_64bit = 1;
+
if (IS_OST(&mop->mo_ldd)) {
append_unique(anchor, user_spec ? "," : " -O ",
"extents", NULL, maxbuflen);
append_unique(anchor, user_spec ? "," : " -O ",
"dirdata", NULL, maxbuflen);
append_unique(anchor, ",", "uninit_bg", NULL, maxbuflen);
- append_unique(anchor, ",", "^extents", NULL, maxbuflen);
+ if (enable_64bit)
+ append_unique(anchor, ",", "extents", NULL, maxbuflen);
+ else
+ append_unique(anchor, ",", "^extents", NULL, maxbuflen);
} else {
append_unique(anchor, user_spec ? "," : " -O ",
"uninit_bg", NULL, maxbuflen);
if (is_e2fsprogs_feature_supp("-O huge_file") == 0)
append_unique(anchor, ",", "huge_file", NULL, maxbuflen);
- /* Enable large block addresses if the LUN is over 2^32 blocks. */
- if (mop->mo_device_kb / (L_BLOCK_SIZE >> 10) >= 0x100002000ULL &&
- is_e2fsprogs_feature_supp("-O 64bit") == 0)
+ if (enable_64bit)
append_unique(anchor, ",", "64bit", NULL, maxbuflen);
/* Cluster inode/block bitmaps and inode table for more efficient IO.
if (IS_OST(&mop->mo_ldd) &&
strstr(mop->mo_mkfsopts, "-G") == NULL) {
snprintf(tmp_buf, sizeof(tmp_buf), " -G %u",
- (1 << 20) / L_BLOCK_SIZE);
+ 1024 / mop->mo_blocksize_kb);
strscat(anchor, tmp_buf, maxbuflen);
}
}
/* Build fs according to type */
int ldiskfs_make_lustre(struct mkfs_opts *mop)
{
- __u64 device_kb = mop->mo_device_kb, block_count = 0;
char mkfs_cmd[PATH_MAX];
char buf[64];
char *start;
char *dev;
int ret = 0, ext_opts = 0;
+ bool have_64bit = false;
size_t maxbuflen;
+ mop->mo_blocksize_kb = 4;
+
+ start = strstr(mop->mo_mkfsopts, "-b");
+ if (start) {
+ char *end = NULL;
+ long blocksize;
+
+ blocksize = strtol(start + 2, &end, 0);
+ if (end && (*end == 'k' || *end == 'K'))
+ blocksize *= 1024;
+ /* EXT4_MIN_BLOCK_SIZE || EXT4_MAX_BLOCK_SIZE */
+ if (blocksize < 1024 || blocksize > 65536) {
+ fprintf(stderr,
+ "%s: blocksize %lu not in 1024-65536 bytes, normally 4096 bytes\n",
+ progname, blocksize);
+ return EINVAL;
+ }
+
+ if ((blocksize & (blocksize - 1)) != 0) {
+ fprintf(stderr,
+ "%s: blocksize %lu not a power-of-two value\n",
+ progname, blocksize);
+ return EINVAL;
+ }
+ mop->mo_blocksize_kb = blocksize >> 10;
+ }
+
if (!(mop->mo_flags & MO_IS_LOOP)) {
- mop->mo_device_kb = get_device_size(mop->mo_device);
+ __u64 device_kb = get_device_size(mop->mo_device);
- if (mop->mo_device_kb == 0)
+ if (device_kb == 0)
return ENODEV;
/* Compare to real size */
- if (device_kb == 0 || device_kb > mop->mo_device_kb)
- device_kb = mop->mo_device_kb;
- else
+ if (mop->mo_device_kb == 0 || device_kb < mop->mo_device_kb)
mop->mo_device_kb = device_kb;
}
if (mop->mo_device_kb != 0) {
+ __u64 block_count;
+
if (mop->mo_device_kb < 32384) {
fprintf(stderr, "%s: size of filesystem must be larger "
"than 32MB, but is set to %lldKB\n",
progname, (long long)mop->mo_device_kb);
return EINVAL;
}
- block_count = mop->mo_device_kb / (L_BLOCK_SIZE >> 10);
- /* If the LUN size is just over 2^32 blocks, limit the
- * filesystem size to 2^32-1 blocks to avoid problems with
- * ldiskfs/mkfs not handling this size. Bug 22906 */
- if (block_count > 0xffffffffULL && block_count < 0x100002000ULL)
- block_count = 0xffffffffULL;
+ block_count = mop->mo_device_kb / mop->mo_blocksize_kb;
+ if (block_count > 0xffffffffULL) {
+ /* If the LUN size is just over 2^32 blocks, limit the
+ * filesystem size to 2^32-1 blocks to avoid problems
+ * with ldiskfs/mkfs not handling this well. b=22906
+ */
+ if (block_count < 0x100002000ULL)
+ mop->mo_device_kb =
+ 0xffffffffULL * mop->mo_blocksize_kb;
+ else
+ have_64bit = true;
+ }
}
+
if ((mop->mo_ldd.ldd_mount_type == LDD_MT_EXT3) ||
(mop->mo_ldd.ldd_mount_type == LDD_MT_LDISKFS) ||
(mop->mo_ldd.ldd_mount_type == LDD_MT_LDISKFS2)) {
/* Journal size in MB */
if (strstr(mop->mo_mkfsopts, "-J") == NULL &&
- device_kb > 1024 * 1024) {
+ mop->mo_device_kb > 1024 * 1024) {
/* Choose our own default journal size */
long journal_mb = 0, max_mb;
max_mb = 0;
/* Use at most 4% of device for journal */
- journal_mb = device_kb * 4 / (1024 * 100);
+ journal_mb = mop->mo_device_kb * 4 / (1024 * 100);
if (journal_mb > max_mb)
journal_mb = max_mb;
* (assuming all files are in composite layout and has
* 3 components):
*
- * ldiskfs inode size: 156
- * extended attributes size, including:
+ * ldiskfs inode size: 160
+ * MDT extended attributes size, including:
* ext4_xattr_header: 32
* LOV EA size: 32(lov_comp_md_v1) +
* 3 * 40(lov_comp_md_entry_v1) +
* 3 * 32(lov_mds_md) +
* stripes * 24(lov_ost_data) +
- * 16(xattr_entry) + 3(lov)
+ * 16(xattr_entry) + 4("lov")
* LMA EA size: 24(lustre_mdt_attrs) +
- * 16(xattr_entry) + 3(lma)
+ * 16(xattr_entry) + 4("lma")
+ * SOM EA size: 24(lustre_som_attrs) +
+ * 16(xattr_entry) + 4("som")
* link EA size: 24(link_ea_header) + 18(link_ea_entry) +
- * (filename) + 16(xattr_entry) + 4(link)
+ * 16(filename) + 16(xattr_entry) + 4("link")
* and some margin for 4-byte alignment, ACLs and other EAs.
*
* If we say the average filename length is about 32 bytes,
* the calculation looks like:
- * 156 + 32 + (32+3*(40 + 32)+24*N+19) + (24+19) +
- * (24+18+~32+20) + other <= 512*2^m, {m=0,1,2,3}
+ * 160 + 32 + (32+3*(40+32)+24*stripes+20) + (24+20) + (24+20) +
+ * (24+20) + (~42+16+20) + other <= 512*2^m, {m=0,1,2,3}
*/
if (strstr(mop->mo_mkfsopts, "-I") == NULL) {
if (IS_MDT(&mop->mo_ldd)) {
inode_size = 1024;
} else if (IS_OST(&mop->mo_ldd)) {
/* We store MDS FID and necessary composite
- * layout information in the OST object EA. */
+ * layout information in the OST object EA:
+ * ldiskfs inode size: 160
+ * OST extended attributes size, including:
+ * ext4_xattr_header: 32
+ * LMA EA size: 24(lustre_mdt_attrs) +
+ * 16(xattr_entry) + 4("lma")
+ * FID EA size: 52(filter_fid) +
+ * 16(xattr_entry) + 4("fid")
+ * 160 + 32 + (24+20) + (52+20) = 308
+ */
inode_size = 512;
}
* this, but it is impossible to know in advance. */
if (IS_OST(&mop->mo_ldd)) {
/* OST > 16TB assume average file size 1MB */
- if (device_kb > (16ULL << 30))
+ if (mop->mo_device_kb > (16ULL << 30))
bytes_per_inode = 1024 * 1024;
/* OST > 4TB assume average file size 512kB */
- else if (device_kb > (4ULL << 30))
+ else if (mop->mo_device_kb > (4ULL << 30))
bytes_per_inode = 512 * 1024;
/* OST > 1TB assume average file size 256kB */
- else if (device_kb > (1ULL << 30))
+ else if (mop->mo_device_kb > (1ULL << 30))
bytes_per_inode = 256 * 1024;
/* OST > 10GB assume average file size 64kB,
* plus a bit so that inodes will fit into a
* 256x flex_bg without overflowing */
- else if (device_kb > (10ULL << 20))
+ else if (mop->mo_device_kb > (10ULL << 20))
bytes_per_inode = 69905;
}
sprintf(buf, " -i %ld", bytes_per_inode);
strscat(mop->mo_mkfsopts, buf,
sizeof(mop->mo_mkfsopts));
+ mop->mo_inode_size = bytes_per_inode;
}
}
start = moveopts_to_end(start);
maxbuflen = sizeof(mop->mo_mkfsopts) -
(start - mop->mo_mkfsopts) - strlen(start);
- ret = enable_default_ext4_features(mop, start, maxbuflen, 1);
+ ret = enable_default_ext4_features(mop, start,
+ maxbuflen, 1);
} else {
start = mop->mo_mkfsopts + strlen(mop->mo_mkfsopts),
maxbuflen = sizeof(mop->mo_mkfsopts) -
strlen(mop->mo_mkfsopts);
- ret = enable_default_ext4_features(mop, start, maxbuflen, 0);
+ ret = enable_default_ext4_features(mop, start,
+ maxbuflen, 0);
}
if (ret)
return ret;
* limitations. */
if (strstr(mop->mo_mkfsopts, "meta_bg") == NULL &&
IS_OST(&mop->mo_ldd) && mop->mo_device_kb > 100 * 1024 &&
- mop->mo_device_kb * 1024 / L_BLOCK_SIZE <= 0xffffffffULL) {
- unsigned group_blocks = L_BLOCK_SIZE * 8;
- unsigned desc_per_block = L_BLOCK_SIZE / 32;
- unsigned resize_blks;
+ !have_64bit) {
+ unsigned int group_blocks = mop->mo_blocksize_kb * 8192;
+ unsigned int desc_per_block =
+ mop->mo_blocksize_kb * 1024 / 32;
+ unsigned int resize_blks;
resize_blks = (1ULL<<32) - desc_per_block*group_blocks;
snprintf(buf, sizeof(buf), "%u", resize_blks);
strscat(mop->mo_mkfsopts, " -F", sizeof(mop->mo_mkfsopts));
snprintf(mkfs_cmd, sizeof(mkfs_cmd),
- "%s -j -b %d -L %s ", MKE2FS, L_BLOCK_SIZE,
- mop->mo_ldd.ldd_svname);
+ "%s -j -b %d -L %s ", MKE2FS,
+ mop->mo_blocksize_kb * 1024, mop->mo_ldd.ldd_svname);
} else {
fprintf(stderr,"%s: unsupported fs type: %d (%s)\n",
progname, mop->mo_ldd.ldd_mount_type,
vprint("formatting backing filesystem %s on %s\n",
MT_STR(&mop->mo_ldd), dev);
vprint("\ttarget name %s\n", mop->mo_ldd.ldd_svname);
- vprint("\t4k blocks %ju\n", (uintmax_t)block_count);
+ vprint("\tkilobytes %llu\n", mop->mo_device_kb);
vprint("\toptions %s\n", mop->mo_mkfsopts);
/* mkfs_cmd's trailing space is important! */
strscat(mkfs_cmd, mop->mo_mkfsopts, sizeof(mkfs_cmd));
strscat(mkfs_cmd, " ", sizeof(mkfs_cmd));
strscat(mkfs_cmd, dev, sizeof(mkfs_cmd));
- if (block_count != 0) {
- snprintf(buf, sizeof(buf), " %ju",
- (uintmax_t)block_count);
+ if (mop->mo_device_kb != 0) {
+ snprintf(buf, sizeof(buf), " %lluk",
+ (unsigned long long)mop->mo_device_kb);
strscat(mkfs_cmd, buf, sizeof(mkfs_cmd));
}
return rc < 0 ? errno : 0;
}
-static int set_blockdev_scheduler(const char *path, const char *scheduler)
+/*
+ * Raise the stripe cache size of the device whose sysfs directory is
+ * @sys_path to mop->mo_md_stripe_cache_size.
+ *
+ * The value stored in <sys_path>/STRIPE_CACHE_SIZE is only ever increased;
+ * nothing is written when no positive size was requested or when the
+ * current value is already at least as large as the requested one.
+ *
+ * Returns 0 on success or when no change is needed, the non-zero result of
+ * read_file()/write_file() when the attribute cannot be read or written,
+ * or EINVAL when the current value parses as 0 or overflows.
+ */
+static int tune_md_stripe_cache_size(const char *sys_path,
+ struct mount_opts *mop)
{
- char buf[PATH_MAX], *s, *e, orig_sched[50];
+ char path[PATH_MAX];
+ unsigned long old_stripe_cache_size;
+ unsigned long new_stripe_cache_size;
+ /* Three decimal digits per byte always hold an unsigned long; the
+ * two extra bytes are presumably for a trailing newline and NUL. */
+ char buf[3 * sizeof(old_stripe_cache_size) + 2];
int rc;
- /* Before setting the scheduler, we need to check to see if it's
- * already set to "noop". If it is, we don't want to override
- * that setting. If it's set to anything other than "noop", set
- * the scheduler to what has been passed in. */
+ /* No positive stripe cache size requested: leave the device alone. */
+ if (mop->mo_md_stripe_cache_size <= 0)
+ return 0;
+
+ new_stripe_cache_size = mop->mo_md_stripe_cache_size;
+ snprintf(path, sizeof(path), "%s/%s", sys_path, STRIPE_CACHE_SIZE);
rc = read_file(path, buf, sizeof(buf));
- if (rc) {
+ if (rc != 0) {
if (verbose)
- fprintf(stderr, "%s: cannot open '%s': %s\n",
- progname, path, strerror(errno));
+ fprintf(stderr, "warning: cannot read '%s': %s\n",
+ path, strerror(errno));
return rc;
}
- /* The expected format of buf: noop anticipatory deadline [cfq] */
- s = strchr(buf, '[');
- e = strchr(buf, ']');
-
- /* If the format is not what we expect. Play it safe and error out. */
- if (s == NULL || e == NULL) {
- if (verbose)
- fprintf(stderr, "%s: cannot parse scheduler "
- "options for '%s'\n", progname, path);
- return -EINVAL;
- }
-
- snprintf(orig_sched, e - s, "%s", s + 1);
+ /* strtoul() yields 0 when there are no digits and ULONG_MAX on
+ * overflow; neither is a usable current value. */
+ old_stripe_cache_size = strtoul(buf, NULL, 0);
+ if (old_stripe_cache_size == 0 || old_stripe_cache_size == ULONG_MAX)
+ return EINVAL;
- if (strcmp(orig_sched, "noop") == 0 ||
- strcmp(orig_sched, scheduler) == 0)
+ /* Never shrink the stripe cache. */
+ if (new_stripe_cache_size <= old_stripe_cache_size)
return 0;
- rc = write_file(path, scheduler);
- if (rc) {
+ snprintf(buf, sizeof(buf), "%lu", new_stripe_cache_size);
+ rc = write_file(path, buf);
+ if (rc != 0) {
if (verbose)
- fprintf(stderr, "%s: cannot set scheduler on "
- "'%s': %s\n", progname, path,
- strerror(errno));
+ fprintf(stderr, "warning: cannot write '%s': %s\n",
+ path, strerror(errno));
return rc;
- } else {
- fprintf(stderr, "%s: change scheduler of %s from %s to %s\n",
- progname, path, orig_sched, scheduler);
}
- return rc;
+ return 0;
}
-/* This is to tune the kernel for good SCSI performance.
- * For that we set the value of /sys/block/{dev}/queue/max_sectors_kb
- * to the value of /sys/block/{dev}/queue/max_hw_sectors_kb */
-static int set_blockdev_tunables(char *source, struct mount_opts *mop)
+/*
+ * Raise the block-layer request size limit of the device whose sysfs
+ * directory is @sys_path.
+ *
+ * The target value is mop->mo_max_sectors_kb when it is non-negative;
+ * otherwise it is the value read from <sys_path>/MAX_HW_SECTORS_KB_PATH,
+ * capped at 16MB. The value in <sys_path>/MAX_SECTORS_KB_PATH is only
+ * ever increased, never lowered.
+ *
+ * Missing or unparsable sysfs attributes are not treated as errors (0 is
+ * returned). The only failure reported is a failed write, in which case
+ * the non-zero result of write_file() is returned.
+ */
+static int tune_max_sectors_kb(const char *sys_path, struct mount_opts *mop)
{
- glob_t glob_info = { 0 };
- struct stat stat_buf;
- char *chk_major, *chk_minor;
- char *savept = NULL, *dev;
- char *ret_path;
- char buf[PATH_MAX] = {'\0'}, path[PATH_MAX] = {'\0'};
- char real_path[PATH_MAX] = {'\0'};
- int i, rc = 0;
- int major, minor;
- char *slave = NULL;
-
- if (!source)
- return -EINVAL;
-
- ret_path = realpath(source, real_path);
- if (ret_path == NULL) {
- if (verbose)
- fprintf(stderr, "warning: %s: cannot resolve: %s\n",
- source, strerror(errno));
- return -EINVAL;
+ char path[PATH_MAX];
+ unsigned long max_hw_sectors_kb;
+ unsigned long old_max_sectors_kb;
+ unsigned long new_max_sectors_kb;
+ /* Three decimal digits per byte always hold an unsigned long. */
+ char buf[3 * sizeof(old_max_sectors_kb) + 2];
+ int rc;
+
+ /* An explicitly configured value is used as-is: the hardware limit
+ * is not consulted and the 16MB cap below is skipped. */
+ if (mop->mo_max_sectors_kb >= 0) {
+ new_max_sectors_kb = mop->mo_max_sectors_kb;
+ goto have_new_max_sectors_kb;
}
- if (strncmp(real_path, "/dev/loop", 9) == 0)
+ snprintf(path, sizeof(path), "%s/%s", sys_path, MAX_HW_SECTORS_KB_PATH);
+ rc = read_file(path, buf, sizeof(buf));
+ if (rc != 0) {
+ /* No MAX_HW_SECTORS_KB_PATH isn't necessarily an
+ * error for some devices. */
return 0;
+ }
- if ((real_path[0] != '/') && (strpbrk(real_path, ",:") != NULL))
+ /* strtoul() returns an unsigned long, so its overflow sentinel is
+ * ULONG_MAX (ULLONG_MAX never matches where long is 32-bit). */
+ max_hw_sectors_kb = strtoul(buf, NULL, 0);
+ if (max_hw_sectors_kb == 0 || max_hw_sectors_kb == ULONG_MAX) {
+ /* No digits at all or something weird. */
return 0;
+ }
- snprintf(path, sizeof(path), "/sys/block%s", real_path + 4);
- if (access(path, X_OK) == 0)
- goto set_params;
+ new_max_sectors_kb = max_hw_sectors_kb;
- /* The name of the device say 'X' specified in /dev/X may not
- * match any entry under /sys/block/. In that case we need to
- * match the major/minor number to find the entry under
- * sys/block corresponding to /dev/X */
+ /* Don't increase IO request size limit past 16MB. It is
+ * about PTLRPC_MAX_BRW_SIZE, but that isn't in a public
+ * header. Note that even though the block layer allows
+ * larger values, setting max_sectors_kb = 32768 causes
+ * crashes (LU-6974). */
+ if (new_max_sectors_kb > 16 * 1024)
+ new_max_sectors_kb = 16 * 1024;
- /* Don't chop tail digit on /dev/mapper/xxx, LU-478 */
- if (strncmp(real_path, "/dev/mapper", 11) != 0) {
- dev = real_path + strlen(real_path);
- while (--dev > real_path && isdigit(*dev))
- *dev = 0;
+have_new_max_sectors_kb:
+ snprintf(path, sizeof(path), "%s/%s", sys_path, MAX_SECTORS_KB_PATH);
+ rc = read_file(path, buf, sizeof(buf));
+ if (rc != 0) {
+ /* No MAX_SECTORS_KB_PATH isn't necessarily an error for
+ * some devices. */
+ return 0;
+ }
- if (strncmp(real_path, "/dev/md", 7) == 0 && dev[0] == 'p')
- *dev = 0;
+ old_max_sectors_kb = strtoul(buf, NULL, 0);
+ if (old_max_sectors_kb == 0 || old_max_sectors_kb == ULONG_MAX) {
+ /* No digits at all or something weird. */
+ return 0;
}
- rc = stat(real_path, &stat_buf);
- if (rc) {
+ /* Don't shrink the current limit. */
+ if (new_max_sectors_kb <= old_max_sectors_kb)
+ return 0;
+
+ snprintf(buf, sizeof(buf), "%lu", new_max_sectors_kb);
+ rc = write_file(path, buf);
+ if (rc != 0) {
if (verbose)
- fprintf(stderr, "warning: %s, device %s stat failed\n",
- strerror(errno), real_path);
+ fprintf(stderr, "warning: cannot write '%s': %s\n",
+ path, strerror(errno));
return rc;
}
- major = major(stat_buf.st_rdev);
- minor = minor(stat_buf.st_rdev);
- rc = glob("/sys/block/*", GLOB_NOSORT, NULL, &glob_info);
- if (rc) {
+ fprintf(stderr, "%s: increased '%s' from %lu to %lu\n",
+ progname, path, old_max_sectors_kb, new_max_sectors_kb);
+
+ return 0;
+}
+
+/*
+ * Switch the IO scheduler of the device whose sysfs directory is @sys_path
+ * to @new_sched.
+ *
+ * The currently active scheduler is the bracketed name read from
+ * <sys_path>/SCHEDULER_PATH. Nothing is written when that name is "noop"
+ * or already equals @new_sched.
+ *
+ * Returns 0 on success or when no change is needed, the non-zero result of
+ * read_file()/write_file() when the attribute cannot be read or written,
+ * or EINVAL when no "[name]" entry can be found in the attribute.
+ */
+static int tune_block_dev_scheduler(const char *sys_path, const char *new_sched)
+{
+ char path[PATH_MAX];
+ char buf[PATH_MAX];
+ char *s, *e;
+ char *old_sched;
+ int rc;
+
+ /* Before setting the scheduler, check whether it is already
+ * set to "noop". If it is, leave that setting alone. If it is
+ * set to anything other than "noop", switch to the scheduler
+ * that has been passed in. */
+
+ snprintf(path, sizeof(path), "%s/%s", sys_path, SCHEDULER_PATH);
+ rc = read_file(path, buf, sizeof(buf));
+ if (rc != 0) {
if (verbose)
- fprintf(stderr, "warning: failed to read entries under "
- "/sys/block\n");
- globfree(&glob_info);
+ fprintf(stderr, "%s: cannot read '%s': %s\n",
+ progname, path, strerror(errno));
+
return rc;
}
- for (i = 0; i < glob_info.gl_pathc; i++){
- snprintf(path, sizeof(path), "%s/dev", glob_info.gl_pathv[i]);
-
- rc = read_file(path, buf, sizeof(buf));
- if (rc)
- continue;
+ /* The expected format of buf: noop anticipatory deadline [cfq] */
+ s = strchr(buf, '[');
+ e = strchr(buf, ']');
- if (buf[strlen(buf) - 1] == '\n')
- buf[strlen(buf) - 1] = '\0';
+ /* If the format is not what we expect (a bracket is missing, or
+ * ']' comes before '[') then be safe and error out. */
+ if (s == NULL || e == NULL || !(s < e)) {
+ if (verbose)
+ fprintf(stderr,
+ "%s: cannot parse scheduler options for '%s'\n",
+ progname, path);
- chk_major = strtok_r(buf, ":", &savept);
- chk_minor = savept;
- if (chk_major != NULL && major == atoi(chk_major) &&
- chk_minor != NULL && minor == atoi(chk_minor))
- break;
+ return EINVAL;
}
- if (i == glob_info.gl_pathc) {
+ /* Terminate the name in place at ']' so that old_sched points at
+ * the bare scheduler name inside buf. */
+ old_sched = s + 1;
+ *e = '\0';
+
+ if (strcmp(old_sched, "noop") == 0 ||
+ strcmp(old_sched, new_sched) == 0)
+ return 0;
+
+ rc = write_file(path, new_sched);
+ if (rc != 0) {
if (verbose)
- fprintf(stderr,"warning: device %s does not match any "
- "entry under /sys/block\n", real_path);
- globfree(&glob_info);
- return -EINVAL;
+ fprintf(stderr,
+ "%s: cannot set scheduler on '%s': %s\n",
+ progname, path, strerror(errno));
+ return rc;
}
- /* Chop off "/dev" from path we found */
- path[strlen(glob_info.gl_pathv[i])] = '\0';
- globfree(&glob_info);
+ fprintf(stderr, "%s: changed scheduler of '%s' from %s to %s\n",
+ progname, path, old_sched, new_sched);
-set_params:
- if (strncmp(real_path, "/dev/md", 7) == 0) {
- snprintf(real_path, sizeof(real_path), "%s/%s", path,
- STRIPE_CACHE_SIZE);
+ return 0;
+}
- rc = read_file(real_path, buf, sizeof(buf));
- if (rc) {
- if (verbose)
- fprintf(stderr, "warning: opening %s: %s\n",
- real_path, strerror(errno));
- return 0;
- }
+static int tune_block_dev(const char *src, struct mount_opts *mop);
- if (atoi(buf) >= mop->mo_md_stripe_cache_size)
+/*
+ * Tune every slave device of the device whose sysfs directory is
+ * @sys_path (e.g. the paths underneath a multipath device).
+ *
+ * Each symlink entry in <sys_path>/slaves names one slave. The slave is
+ * tuned through its device node /dev/<name>, because tune_block_dev()
+ * stat()s its argument and ignores anything that is not a block device;
+ * the sysfs entry itself resolves to a directory and would be skipped.
+ *
+ * Returns 0 when there is no slaves directory or every slave was tuned,
+ * errno when the directory cannot be opened for another reason, otherwise
+ * the last non-zero result of tune_block_dev().
+ */
+static int tune_block_dev_slaves(const char *sys_path, struct mount_opts *mop)
+{
+ char slaves_path[PATH_MAX];
+ DIR *slaves_dir;
+ struct dirent *d;
+ int rc = 0;
+
+ snprintf(slaves_path, sizeof(slaves_path), "%s/slaves", sys_path);
+ slaves_dir = opendir(slaves_path);
+ if (slaves_dir == NULL) {
+ /* Having no slave devices is not an error. */
+ if (errno == ENOENT)
return 0;
- if (strlen(buf) - 1 > 0) {
- snprintf(buf, sizeof(buf), "%d",
- mop->mo_md_stripe_cache_size);
- rc = write_file(real_path, buf);
- if (rc != 0 && verbose)
- fprintf(stderr, "warning: opening %s: %s\n",
- real_path, strerror(errno));
- }
- /* Return since raid and disk tunables are different */
- return rc;
+ return errno;
}
- if (mop->mo_max_sectors_kb >= 0) {
- snprintf(buf, sizeof(buf), "%d", mop->mo_max_sectors_kb);
- } else {
- snprintf(real_path, sizeof(real_path), "%s/%s", path,
- MAX_HW_SECTORS_KB_PATH);
- rc = read_file(real_path, buf, sizeof(buf));
- if (rc) {
- if (verbose)
- fprintf(stderr, "warning: opening %s: %s\n",
- real_path, strerror(errno));
- /* No MAX_HW_SECTORS_KB_PATH isn't necessary an
- * error for some device. */
- goto subdevs;
- }
+ while ((d = readdir(slaves_dir)) != NULL) {
+ char path[PATH_MAX];
+ int rc2;
+
+ /* Slaves are symlinks; this also skips "." and "..". */
+ if (d->d_type != DT_LNK)
+ continue;
+
+ /* Pass the device node, not the sysfs symlink: the symlink
+ * resolves to a sysfs directory, which tune_block_dev()
+ * would silently ignore as "not a block device". */
+ snprintf(path, sizeof(path), "/dev/%s", d->d_name);
+ rc2 = tune_block_dev(path, mop);
+ if (rc2 != 0)
+ rc = rc2;
}
- if (strlen(buf) - 1 > 0) {
- char oldbuf[32] = "", *end = NULL;
- unsigned long long oldval, newval;
-
- snprintf(real_path, sizeof(real_path), "%s/%s", path,
- MAX_SECTORS_KB_PATH);
- rc = read_file(real_path, oldbuf, sizeof(oldbuf));
- /* Only set new parameter if different from the old one. */
- if (rc != 0 || strcmp(oldbuf, buf) == 0) {
- /* No MAX_SECTORS_KB_PATH isn't necessary an
- * error for some device. */
- goto subdevs;
- }
+ closedir(slaves_dir);
- newval = strtoull(buf, &end, 0);
- if (newval == 0 || newval == ULLONG_MAX || end == buf)
- goto subdevs;
-
- /* Don't increase IO request size limit past 16MB. It is about
- * PTLRPC_MAX_BRW_SIZE, but that isn't in a public header.
- * Note that even though the block layer allows larger values,
- * setting max_sectors_kb = 32768 causes crashes (LU-6974). */
- if (mop->mo_max_sectors_kb < 0 && newval > 16 * 1024) {
- newval = 16 * 1024;
- snprintf(buf, sizeof(buf), "%llu", newval);
- }
+ return rc;
+}
- oldval = strtoull(oldbuf, &end, 0);
- /* Don't shrink the current limit. */
- if (mop->mo_max_sectors_kb < 0 && oldval != ULLONG_MAX &&
- newval <= oldval)
- goto subdevs;
-
- rc = write_file(real_path, buf);
- if (rc != 0) {
- if (verbose)
- fprintf(stderr, "warning: writing to %s: %s\n",
- real_path, strerror(errno));
- /* No MAX_SECTORS_KB_PATH isn't necessary an
- * error for some device. */
- goto subdevs;
- }
- fprintf(stderr, "%s: increased %s from %s to %s\n",
- progname, real_path, oldbuf, buf);
+/* This is to tune the kernel for good SCSI performance.
+ * For that we set the value of /sys/block/{dev}/queue/max_sectors_kb
+ * to the value of /sys/block/{dev}/queue/max_hw_sectors_kb
+ *
+ * @src is the path of the device to tune. Nothing is done (and 0 is
+ * returned) unless the target is an OST and @src is a block device other
+ * than a loop device. A partition is tuned through its whole-disk parent.
+ * An md device only gets its stripe cache size tuned; any other device
+ * gets max_sectors_kb and the IO scheduler tuned (errors ignored), and
+ * then its slave devices are tuned recursively.
+ *
+ * Returns 0 on success, EINVAL when @src is NULL, the errno of a failed
+ * stat()/access()/realpath(), or the result of
+ * tune_md_stripe_cache_size()/tune_block_dev_slaves(). */
+static int tune_block_dev(const char *src, struct mount_opts *mop)
+{
+ struct stat st;
+ char sys_path[PATH_MAX];
+ char partition_path[PATH_MAX];
+ char *real_sys_path = NULL;
+ int rc;
+
+ /*
+ * Don't apply block device tuning for MDT or MGT devices,
+ * since we don't need huge IO sizes to get good performance
+ */
+ if (!IS_OST(&mop->mo_ldd))
+ return 0;
+
+ if (src == NULL)
+ return EINVAL;
+
+ rc = stat(src, &st);
+ if (rc < 0) {
+ /* Save errno first: fprintf()/strerror() may change it. */
+ rc = errno;
+ if (verbose)
+ fprintf(stderr, "warning: cannot stat '%s': %s\n",
+ src, strerror(rc));
+ return rc;
+ }
+
+ if (!S_ISBLK(st.st_mode))
+ return 0;
+
+ if (major(st.st_rdev) == LOOP_MAJOR)
+ return 0;
+
+ snprintf(sys_path, sizeof(sys_path), "/sys/dev/block/%u:%u",
+ major(st.st_rdev), minor(st.st_rdev));
+
+ /* Only partitions have a "partition" attribute in sysfs. */
+ snprintf(partition_path, sizeof(partition_path), "%s/partition",
+ sys_path);
+
+ rc = access(partition_path, F_OK);
+ if (rc < 0) {
+ if (errno == ENOENT)
+ goto have_whole_dev;
+
+ /* Save errno first: fprintf()/strerror() may change it. */
+ rc = errno;
+ if (verbose)
+ fprintf(stderr, "warning: cannot access '%s': %s\n",
+ partition_path, strerror(rc));
+ goto out;
}
-subdevs:
- /* Purposely ignore errors reported from set_blockdev_scheduler.
- * The worst that will happen is a block device with an "incorrect"
- * scheduler. */
- snprintf(real_path, sizeof(real_path), "%s/%s", path, SCHEDULER_PATH);
- set_blockdev_scheduler(real_path, DEFAULT_SCHEDULER);
-
- /* if device is multipath device, tune its slave devices */
- glob_info.gl_pathc = 0;
- glob_info.gl_offs = 0;
- snprintf(real_path, sizeof(real_path), "%s/slaves/*", path);
- rc = glob(real_path, GLOB_NOSORT, NULL, &glob_info);
-
- for (i = 0; rc == 0 && i < glob_info.gl_pathc; i++) {
- slave = basename(glob_info.gl_pathv[i]);
- snprintf(real_path, sizeof(real_path), "/dev/%s", slave);
- rc = set_blockdev_tunables(real_path, mop);
+ /* @src is a partition: its parent directory in sysfs is the
+ * whole device, which is where the tunables live. */
+ snprintf(sys_path, sizeof(sys_path), "/sys/dev/block/%u:%u/..",
+ major(st.st_rdev), minor(st.st_rdev));
+
+have_whole_dev:
+ /* Since we recurse on slave devices we resolve the sys_path to
+ * avoid path buffer overflows. */
+ real_sys_path = realpath(sys_path, NULL);
+ if (real_sys_path == NULL) {
+ /* Save errno first: fprintf()/strerror() may change it. */
+ rc = errno;
+ if (verbose)
+ fprintf(stderr,
+ "warning: cannot resolve '%s': %s\n",
+ sys_path, strerror(rc));
+ goto out;
}
- if (rc == GLOB_NOMATCH) {
- /* no slave device is not an error */
- rc = 0;
- } else if (rc && verbose) {
- if (slave == NULL) {
- fprintf(stderr, "warning: %s, failed to read"
- " entries under %s/slaves\n",
- strerror(errno), path);
- } else {
- fprintf(stderr, "unable to set tunables for"
- " slave device %s (slave would be"
- " unable to handle IO request from"
- " master %s)\n",
- real_path, source);
- }
+ if (major(st.st_rdev) == MD_MAJOR) {
+ rc = tune_md_stripe_cache_size(real_sys_path, mop);
+ } else {
+ /* Ignore errors from tune_max_sectors_kb() and
+ * tune_block_dev_scheduler(). The worst that will happen
+ * is a block device with an "incorrect" scheduler. */
+ tune_max_sectors_kb(real_sys_path, mop);
+ tune_block_dev_scheduler(real_sys_path, DEFAULT_SCHEDULER);
+
+ /* If device is multipath device then tune its slave
+ * devices. */
+ rc = tune_block_dev_slaves(real_sys_path, mop);
}
- globfree(&glob_info);
+
+out:
+ /* free(NULL) is a no-op, so this is safe on every path. */
+ free(real_sys_path);
return rc;
}
+/* Apply block device tuning to the ldiskfs device @dev.
+ * Thin wrapper: the work and the return value are those of
+ * tune_block_dev(). */
int ldiskfs_tune_lustre(char *dev, struct mount_opts *mop)
{
- return set_blockdev_tunables(dev, mop);
+ return tune_block_dev(dev, mop);
}
int ldiskfs_label_lustre(struct mount_opts *mop)