Whamcloud - gitweb
LU-11736 utils: don't set max_sectors_kb on MDT/MGT
[fs/lustre-release.git] / lustre / utils / libmount_utils_ldiskfs.c
index d6b9ca6..c1f79ae 100644 (file)
@@ -23,7 +23,7 @@
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  *
- * Copyright (c) 2012, 2016, Intel Corporation.
+ * Copyright (c) 2012, 2017, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
 #include <string.h>
 #include <inttypes.h>
 #include <unistd.h>
+#include <dirent.h>
 #include <fcntl.h>
 #include <mntent.h>
-#include <glob.h>
 
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/mount.h>
 #include <sys/utsname.h>
+#include <sys/sysmacros.h>
 
 #include <string.h>
 #include <getopt.h>
 #ifndef BLKGETSIZE64
 #include <linux/fs.h> /* for BLKGETSIZE64 */
 #endif
+#include <linux/major.h>
 #include <linux/types.h>
 #include <linux/version.h>
-#include <lnet/lnetctl.h>
-#include <lustre_ver.h>
+#include <linux/lnet/lnetctl.h>
+#include <linux/lustre/lustre_ver.h>
+#include <libcfs/util/string.h>
 
 #ifdef HAVE_SELINUX
 #include <selinux/selinux.h>
@@ -120,57 +123,6 @@ static void append_context_for_mount(char *mntpt, struct mkfs_opts *mop)
 }
 #endif
 
-/* return canonicalized absolute pathname, even if the target file does not
- * exist, unlike realpath */
-static char *absolute_path(char *devname)
-{
-       char  buf[PATH_MAX + 1] = "";
-       char *path;
-       char *ptr;
-       int len;
-
-       path = malloc(sizeof(buf));
-       if (path == NULL)
-               return NULL;
-
-       if (devname[0] != '/') {
-               if (getcwd(buf, sizeof(buf) - 1) == NULL) {
-                       free(path);
-                       return NULL;
-               }
-               len = snprintf(path, sizeof(buf), "%s/%s", buf, devname);
-               if (len >= sizeof(buf)) {
-                       free(path);
-                       return NULL;
-               }
-       } else {
-               len = snprintf(path, sizeof(buf), "%s", devname);
-               if (len >= sizeof(buf)) {
-                       free(path);
-                       return NULL;
-               }
-       }
-
-       /* truncate filename before calling realpath */
-       ptr = strrchr(path, '/');
-       if (ptr == NULL) {
-               free(path);
-               return NULL;
-       }
-       *ptr = '\0';
-       if (buf != realpath(path, buf)) {
-               free(path);
-               return NULL;
-       }
-       /* add the filename back */
-       len = snprintf(path, PATH_MAX, "%s/%s", buf, ptr+1);
-       if (len >= PATH_MAX) {
-               free(path);
-               return NULL;
-       }
-       return path;
-}
-
 /* Determine if a device is a block device (as opposed to a file) */
 static int is_block(char *devname)
 {
@@ -178,10 +130,10 @@ static int is_block(char *devname)
        int     ret = 0;
        char    *devpath;
 
-       devpath = absolute_path(devname);
-       if (devpath == NULL) {
-               fprintf(stderr, "%s: failed to resolve path to %s\n",
-                       progname, devname);
+       ret = cfs_abs_path(devname, &devpath);
+       if (ret != 0) {
+               fprintf(stderr, "%s: failed to resolve path '%s': %s\n",
+                       progname, devname, strerror(-ret));
                return -1;
        }
 
@@ -240,7 +192,7 @@ static int is_feature_enabled(const char *feature, const char *devpath)
 int ldiskfs_write_ldd(struct mkfs_opts *mop)
 {
        char mntpt[] = "/tmp/mntXXXXXX";
-       char filepnm[128];
+       char filepnm[192];
        char *dev;
        FILE *filep;
        int ret = 0;
@@ -436,7 +388,7 @@ static void disp_old_e2fsprogs_msg(const char *feature, int make_backfs)
                E2FSPROGS, feature);
 #if !(HAVE_LDISKFSPROGS)
        fprintf(stderr, "Please install the latest version of e2fsprogs from\n"
-               "https://downloads.hpdd.intel.com/public/e2fsprogs/latest/\n"
+               "https://downloads.whamcloud.com/public/e2fsprogs/latest/\n"
                "to enable this feature.\n");
 #endif
        if (make_backfs)
@@ -590,6 +542,13 @@ static void append_unique(char *buf, char *prefix, char *key, char *val,
 static int enable_default_ext4_features(struct mkfs_opts *mop, char *anchor,
                                        size_t maxbuflen, int user_spec)
 {
+       int enable_64bit = 0;
+
+       /* Enable large block addresses if the LUN is over 2^32 blocks. */
+       if ((mop->mo_device_kb / (L_BLOCK_SIZE >> 10) > UINT32_MAX) &&
+            is_e2fsprogs_feature_supp("-O 64bit") == 0)
+               enable_64bit = 1;
+
        if (IS_OST(&mop->mo_ldd)) {
                append_unique(anchor, user_spec ? "," : " -O ",
                              "extents", NULL, maxbuflen);
@@ -598,7 +557,10 @@ static int enable_default_ext4_features(struct mkfs_opts *mop, char *anchor,
                append_unique(anchor, user_spec ? "," : " -O ",
                              "dirdata", NULL, maxbuflen);
                append_unique(anchor, ",", "uninit_bg", NULL, maxbuflen);
-               append_unique(anchor, ",", "^extents", NULL, maxbuflen);
+               if (enable_64bit)
+                       append_unique(anchor, ",", "extents", NULL, maxbuflen);
+               else
+                       append_unique(anchor, ",", "^extents", NULL, maxbuflen);
        } else {
                append_unique(anchor, user_spec ? "," : " -O ",
                              "uninit_bg", NULL, maxbuflen);
@@ -635,9 +597,7 @@ static int enable_default_ext4_features(struct mkfs_opts *mop, char *anchor,
        if (is_e2fsprogs_feature_supp("-O huge_file") == 0)
                append_unique(anchor, ",", "huge_file", NULL, maxbuflen);
 
-       /* Enable large block addresses if the LUN is over 2^32 blocks. */
-       if (mop->mo_device_kb / (L_BLOCK_SIZE >> 10) >= 0x100002000ULL &&
-           is_e2fsprogs_feature_supp("-O 64bit") == 0)
+       if (enable_64bit)
                append_unique(anchor, ",", "64bit", NULL, maxbuflen);
 
        /* Cluster inode/block bitmaps and inode table for more efficient IO.
@@ -779,24 +739,26 @@ int ldiskfs_make_lustre(struct mkfs_opts *mop)
                 * (assuming all files are in composite layout and has
                 * 3 components):
                 *
-                *   ldiskfs inode size: 156
-                *   extended attributes size, including:
+                *   ldiskfs inode size: 160
+                *   MDT extended attributes size, including:
                 *      ext4_xattr_header: 32
                 *      LOV EA size: 32(lov_comp_md_v1) +
                 *                   3 * 40(lov_comp_md_entry_v1) +
                 *                   3 * 32(lov_mds_md) +
                 *                   stripes * 24(lov_ost_data) +
-                *                   16(xattr_entry) + 3(lov)
+                *                   16(xattr_entry) + 4("lov")
                 *      LMA EA size: 24(lustre_mdt_attrs) +
-                *                   16(xattr_entry) + 3(lma)
+                *                   16(xattr_entry) + 4("lma")
+                *      SOM EA size: 24(lustre_som_attrs) +
+                *                   16(xattr_entry) + 4("som")
                 *      link EA size: 24(link_ea_header) + 18(link_ea_entry) +
-                *                    (filename) + 16(xattr_entry) + 4(link)
+                *                    16(filename) + 16(xattr_entry) + 4("link")
                 *   and some margin for 4-byte alignment, ACLs and other EAs.
                 *
                 * If we say the average filename length is about 32 bytes,
                 * the calculation looks like:
-                * 156 + 32 + (32+3*(40 + 32)+24*N+19) + (24+19) +
-                * (24+18+~32+20) + other <= 512*2^m, {m=0,1,2,3}
+                * 160 + 32 + (32+3*(40+32)+24*stripes+20) + (24+20) + (24+20) +
+                *  (~42+16+20) + other <= 512*2^m, {m=0,1,2,3}
                 */
                if (strstr(mop->mo_mkfsopts, "-I") == NULL) {
                        if (IS_MDT(&mop->mo_ldd)) {
@@ -809,7 +771,16 @@ int ldiskfs_make_lustre(struct mkfs_opts *mop)
                                        inode_size = 1024;
                        } else if (IS_OST(&mop->mo_ldd)) {
                                /* We store MDS FID and necessary composite
-                                * layout information in the OST object EA. */
+                                * layout information in the OST object EA:
+                                *   ldiskfs inode size: 160
+                                *   OST extended attributes size, including:
+                                *      ext4_xattr_header: 32
+                                *      LMA EA size: 24(lustre_mdt_attrs) +
+                                *                   16(xattr_entry) + 4("lma")
+                                *      FID EA size: 52(filter_fid) +
+                                *                   16(xattr_entry) + 4("fid")
+                                * 160 + 32 + (24+20) + (52+20) = 308
+                                */
                                inode_size = 512;
                        }
 
@@ -864,6 +835,7 @@ int ldiskfs_make_lustre(struct mkfs_opts *mop)
                                sprintf(buf, " -i %ld", bytes_per_inode);
                                strscat(mop->mo_mkfsopts, buf,
                                        sizeof(mop->mo_mkfsopts));
+                               mop->mo_inode_size = bytes_per_inode;
                        }
                }
 
@@ -1046,290 +1018,307 @@ static int write_file(const char *path, const char *buf)
        return rc < 0 ? errno : 0;
 }
 
-static int set_blockdev_scheduler(const char *path, const char *scheduler)
+static int tune_md_stripe_cache_size(const char *sys_path,
+                                    struct mount_opts *mop)
 {
-       char buf[PATH_MAX], *s, *e, orig_sched[50];
+       char path[PATH_MAX];
+       unsigned long old_stripe_cache_size;
+       unsigned long new_stripe_cache_size;
+       char buf[3 * sizeof(old_stripe_cache_size) + 2];
        int rc;
 
-       /* Before setting the scheduler, we need to check to see if it's
-        * already set to "noop". If it is, we don't want to override
-        * that setting. If it's set to anything other than "noop", set
-        * the scheduler to what has been passed in. */
+       if (mop->mo_md_stripe_cache_size <= 0)
+               return 0;
+
+       new_stripe_cache_size = mop->mo_md_stripe_cache_size;
 
+       snprintf(path, sizeof(path), "%s/%s", sys_path, STRIPE_CACHE_SIZE);
        rc = read_file(path, buf, sizeof(buf));
-       if (rc) {
+       if (rc != 0) {
                if (verbose)
-                       fprintf(stderr, "%s: cannot open '%s': %s\n",
-                               progname, path, strerror(errno));
+                       fprintf(stderr, "warning: cannot read '%s': %s\n",
+                               path, strerror(errno));
                return rc;
        }
 
-       /* The expected format of buf: noop anticipatory deadline [cfq] */
-       s = strchr(buf, '[');
-       e = strchr(buf, ']');
-
-       /* If the format is not what we expect. Play it safe and error out. */
-       if (s == NULL || e == NULL) {
-               if (verbose)
-                       fprintf(stderr, "%s: cannot parse scheduler "
-                                       "options for '%s'\n", progname, path);
-               return -EINVAL;
-       }
-
-       snprintf(orig_sched, e - s, "%s", s + 1);
+       old_stripe_cache_size = strtoul(buf, NULL, 0);
+       if (old_stripe_cache_size == 0 || old_stripe_cache_size == ULONG_MAX)
+               return EINVAL;
 
-       if (strcmp(orig_sched, "noop") == 0 ||
-           strcmp(orig_sched, scheduler) == 0)
+       if (new_stripe_cache_size <= old_stripe_cache_size)
                return 0;
 
-       rc = write_file(path, scheduler);
-       if (rc) {
+       snprintf(buf, sizeof(buf), "%lu", new_stripe_cache_size);
+       rc = write_file(path, buf);
+       if (rc != 0) {
                if (verbose)
-                       fprintf(stderr, "%s: cannot set scheduler on "
-                                       "'%s': %s\n", progname, path,
-                                       strerror(errno));
+                       fprintf(stderr, "warning: cannot write '%s': %s\n",
+                               path, strerror(errno));
                return rc;
-       } else {
-               fprintf(stderr, "%s: change scheduler of %s from %s to %s\n",
-                       progname, path, orig_sched, scheduler);
        }
 
-       return rc;
+       return 0;
 }
 
-/* This is to tune the kernel for good SCSI performance.
- * For that we set the value of /sys/block/{dev}/queue/max_sectors_kb
- * to the value of /sys/block/{dev}/queue/max_hw_sectors_kb */
-static int set_blockdev_tunables(char *source, struct mount_opts *mop)
+static int tune_max_sectors_kb(const char *sys_path, struct mount_opts *mop)
 {
-       glob_t glob_info = { 0 };
-       struct stat stat_buf;
-       char *chk_major, *chk_minor;
-       char *savept = NULL, *dev;
-       char *ret_path;
-       char buf[PATH_MAX] = {'\0'}, path[PATH_MAX] = {'\0'};
-       char real_path[PATH_MAX] = {'\0'};
-       int i, rc = 0;
-       int major, minor;
-       char *slave = NULL;
-
-       if (!source)
-               return -EINVAL;
-
-       ret_path = realpath(source, real_path);
-       if (ret_path == NULL) {
-               if (verbose)
-                       fprintf(stderr, "warning: %s: cannot resolve: %s\n",
-                               source, strerror(errno));
-               return -EINVAL;
+       char path[PATH_MAX];
+       unsigned long max_hw_sectors_kb;
+       unsigned long old_max_sectors_kb;
+       unsigned long new_max_sectors_kb;
+       char buf[3 * sizeof(old_max_sectors_kb) + 2];
+       int rc;
+
+       if (mop->mo_max_sectors_kb >= 0) {
+               new_max_sectors_kb = mop->mo_max_sectors_kb;
+               goto have_new_max_sectors_kb;
        }
 
-       if (strncmp(real_path, "/dev/loop", 9) == 0)
+       snprintf(path, sizeof(path), "%s/%s", sys_path, MAX_HW_SECTORS_KB_PATH);
+       rc = read_file(path, buf, sizeof(buf));
+       if (rc != 0) {
+               /* A missing MAX_HW_SECTORS_KB_PATH isn't necessarily
+                * an error for some devices. */
                return 0;
+       }
 
-       if ((real_path[0] != '/') && (strpbrk(real_path, ",:") != NULL))
+       max_hw_sectors_kb = strtoul(buf, NULL, 0);
+       if (max_hw_sectors_kb == 0 || max_hw_sectors_kb == ULLONG_MAX) {
+               /* No digits at all or something weird. */
                return 0;
+       }
 
-       snprintf(path, sizeof(path), "/sys/block%s", real_path + 4);
-       if (access(path, X_OK) == 0)
-               goto set_params;
+       new_max_sectors_kb = max_hw_sectors_kb;
 
-       /* The name of the device say 'X' specified in /dev/X may not
-        * match any entry under /sys/block/. In that case we need to
-        * match the major/minor number to find the entry under
-        * sys/block corresponding to /dev/X */
+       /* Don't increase IO request size limit past 16MB.  It is
+        * about PTLRPC_MAX_BRW_SIZE, but that isn't in a public
+        * header.  Note that even though the block layer allows
+        * larger values, setting max_sectors_kb = 32768 causes
+        * crashes (LU-6974). */
+       if (new_max_sectors_kb > 16 * 1024)
+               new_max_sectors_kb = 16 * 1024;
 
-       /* Don't chop tail digit on /dev/mapper/xxx, LU-478 */
-       if (strncmp(real_path, "/dev/mapper", 11) != 0) {
-               dev = real_path + strlen(real_path);
-               while (--dev > real_path && isdigit(*dev))
-                       *dev = 0;
+have_new_max_sectors_kb:
+       snprintf(path, sizeof(path), "%s/%s", sys_path, MAX_SECTORS_KB_PATH);
+       rc = read_file(path, buf, sizeof(buf));
+       if (rc != 0) {
+               /* A missing MAX_SECTORS_KB_PATH isn't necessarily an
+                * error for some devices. */
+               return 0;
+       }
 
-               if (strncmp(real_path, "/dev/md", 7) == 0 && dev[0] == 'p')
-                       *dev = 0;
+       old_max_sectors_kb = strtoul(buf, NULL, 0);
+       if (old_max_sectors_kb == 0 || old_max_sectors_kb == ULLONG_MAX) {
+               /* No digits at all or something weird. */
+               return 0;
        }
 
-       rc = stat(real_path, &stat_buf);
-       if (rc) {
+       if (new_max_sectors_kb <= old_max_sectors_kb)
+               return 0;
+
+       snprintf(buf, sizeof(buf), "%lu", new_max_sectors_kb);
+       rc = write_file(path, buf);
+       if (rc != 0) {
                if (verbose)
-                       fprintf(stderr, "warning: %s, device %s stat failed\n",
-                               strerror(errno), real_path);
+                       fprintf(stderr, "warning: cannot write '%s': %s\n",
+                               path, strerror(errno));
                return rc;
        }
 
-       major = major(stat_buf.st_rdev);
-       minor = minor(stat_buf.st_rdev);
-       rc = glob("/sys/block/*", GLOB_NOSORT, NULL, &glob_info);
-       if (rc) {
+       fprintf(stderr, "%s: increased '%s' from %lu to %lu\n",
+               progname, path, old_max_sectors_kb, new_max_sectors_kb);
+
+       return 0;
+}
+
+static int tune_block_dev_scheduler(const char *sys_path, const char *new_sched)
+{
+       char path[PATH_MAX];
+       char buf[PATH_MAX];
+       char *s, *e;
+       char *old_sched;
+       int rc;
+
+       /* Before setting the scheduler, we need to check to see if
+        * it's already set to "noop". If it is then we don't want to
+        * override that setting. If it's set to anything other than
+        * "noop" then set the scheduler to what has been passed
+        * in. */
+
+       snprintf(path, sizeof(path), "%s/%s", sys_path, SCHEDULER_PATH);
+       rc = read_file(path, buf, sizeof(buf));
+       if (rc != 0) {
                if (verbose)
-                       fprintf(stderr, "warning: failed to read entries under "
-                               "/sys/block\n");
-               globfree(&glob_info);
+                       fprintf(stderr, "%s: cannot read '%s': %s\n",
+                               progname, path, strerror(errno));
+
                return rc;
        }
 
-       for (i = 0; i < glob_info.gl_pathc; i++){
-               snprintf(path, sizeof(path), "%s/dev", glob_info.gl_pathv[i]);
-
-               rc = read_file(path, buf, sizeof(buf));
-               if (rc)
-                       continue;
+       /* The expected format of buf: noop anticipatory deadline [cfq] */
+       s = strchr(buf, '[');
+       e = strchr(buf, ']');
 
-               if (buf[strlen(buf) - 1] == '\n')
-                       buf[strlen(buf) - 1] = '\0';
+       /* If the format is not what we expect then be safe and error out. */
+       if (s == NULL || e == NULL || !(s < e)) {
+               if (verbose)
+                       fprintf(stderr,
+                               "%s: cannot parse scheduler options for '%s'\n",
+                               progname, path);
 
-               chk_major = strtok_r(buf, ":", &savept);
-               chk_minor = savept;
-               if (chk_major != NULL && major == atoi(chk_major) &&
-                   chk_minor != NULL && minor == atoi(chk_minor))
-                       break;
+               return EINVAL;
        }
 
-       if (i == glob_info.gl_pathc) {
+       old_sched = s + 1;
+       *e = '\0';
+
+       if (strcmp(old_sched, "noop") == 0 ||
+           strcmp(old_sched, new_sched) == 0)
+               return 0;
+
+       rc = write_file(path, new_sched);
+       if (rc != 0) {
                if (verbose)
-                       fprintf(stderr,"warning: device %s does not match any "
-                               "entry under /sys/block\n", real_path);
-               globfree(&glob_info);
-               return -EINVAL;
+                       fprintf(stderr,
+                               "%s: cannot set scheduler on '%s': %s\n",
+                               progname, path, strerror(errno));
+               return rc;
        }
 
-       /* Chop off "/dev" from path we found */
-       path[strlen(glob_info.gl_pathv[i])] = '\0';
-       globfree(&glob_info);
+       fprintf(stderr, "%s: changed scheduler of '%s' from %s to %s\n",
+               progname, path, old_sched, new_sched);
 
-set_params:
-       if (strncmp(real_path, "/dev/md", 7) == 0) {
-               snprintf(real_path, sizeof(real_path), "%s/%s", path,
-                        STRIPE_CACHE_SIZE);
+       return 0;
+}
 
-               rc = read_file(real_path, buf, sizeof(buf));
-               if (rc) {
-                       if (verbose)
-                               fprintf(stderr, "warning: opening %s: %s\n",
-                                       real_path, strerror(errno));
-                       return 0;
-               }
+static int tune_block_dev(const char *src, struct mount_opts *mop);
 
-               if (atoi(buf) >= mop->mo_md_stripe_cache_size)
+static int tune_block_dev_slaves(const char *sys_path, struct mount_opts *mop)
+{
+       char slaves_path[PATH_MAX];
+       DIR *slaves_dir;
+       struct dirent *d;
+       int rc = 0;
+
+       snprintf(slaves_path, sizeof(slaves_path), "%s/slaves", sys_path);
+       slaves_dir = opendir(slaves_path);
+       if (slaves_dir == NULL) {
+               if (errno == ENOENT)
                        return 0;
 
-               if (strlen(buf) - 1 > 0) {
-                       snprintf(buf, sizeof(buf), "%d",
-                                mop->mo_md_stripe_cache_size);
-                       rc = write_file(real_path, buf);
-                       if (rc != 0 && verbose)
-                               fprintf(stderr, "warning: opening %s: %s\n",
-                                       real_path, strerror(errno));
-               }
-               /* Return since raid and disk tunables are different */
-               return rc;
+               return errno;
        }
 
-       if (mop->mo_max_sectors_kb >= 0) {
-               snprintf(buf, sizeof(buf), "%d", mop->mo_max_sectors_kb);
-       } else {
-               snprintf(real_path, sizeof(real_path), "%s/%s", path,
-                        MAX_HW_SECTORS_KB_PATH);
-               rc = read_file(real_path, buf, sizeof(buf));
-               if (rc) {
-                       if (verbose)
-                               fprintf(stderr, "warning: opening %s: %s\n",
-                                       real_path, strerror(errno));
-                       /* No MAX_HW_SECTORS_KB_PATH isn't necessary an
-                        * error for some device. */
-                       goto subdevs;
-               }
+       while ((d = readdir(slaves_dir)) != NULL) {
+               char path[PATH_MAX];
+               int rc2;
+
+               if (d->d_type != DT_LNK)
+                       continue;
+
+               snprintf(path, sizeof(path), "%s/%s", slaves_path, d->d_name);
+               rc2 = tune_block_dev(path, mop);
+               if (rc2 != 0)
+                       rc = rc2;
        }
 
-       if (strlen(buf) - 1 > 0) {
-               char oldbuf[32] = "", *end = NULL;
-               unsigned long long oldval, newval;
-
-               snprintf(real_path, sizeof(real_path), "%s/%s", path,
-                        MAX_SECTORS_KB_PATH);
-               rc = read_file(real_path, oldbuf, sizeof(oldbuf));
-               /* Only set new parameter if different from the old one. */
-               if (rc != 0 || strcmp(oldbuf, buf) == 0) {
-                       /* No MAX_SECTORS_KB_PATH isn't necessary an
-                        * error for some device. */
-                       goto subdevs;
-               }
+       closedir(slaves_dir);
 
-               newval = strtoull(buf, &end, 0);
-               if (newval == 0 || newval == ULLONG_MAX || end == buf)
-                       goto subdevs;
-
-               /* Don't increase IO request size limit past 16MB.  It is about
-                * PTLRPC_MAX_BRW_SIZE, but that isn't in a public header.
-                * Note that even though the block layer allows larger values,
-                * setting max_sectors_kb = 32768 causes crashes (LU-6974). */
-               if (mop->mo_max_sectors_kb < 0 && newval > 16 * 1024) {
-                       newval = 16 * 1024;
-                       snprintf(buf, sizeof(buf), "%llu", newval);
-               }
+       return rc;
+}
 
-               oldval = strtoull(oldbuf, &end, 0);
-               /* Don't shrink the current limit. */
-               if (mop->mo_max_sectors_kb < 0 && oldval != ULLONG_MAX &&
-                   newval <= oldval)
-                       goto subdevs;
-
-               rc = write_file(real_path, buf);
-               if (rc != 0) {
-                       if (verbose)
-                               fprintf(stderr, "warning: writing to %s: %s\n",
-                                       real_path, strerror(errno));
-                       /* No MAX_SECTORS_KB_PATH isn't necessary an
-                        * error for some device. */
-                       goto subdevs;
-               }
-               fprintf(stderr, "%s: increased %s from %s to %s\n",
-                       progname, real_path, oldbuf, buf);
+/* This is to tune the kernel for good SCSI performance.
+ * For that we raise /sys/block/{dev}/queue/max_sectors_kb towards the
+ * value of /sys/block/{dev}/queue/max_hw_sectors_kb (16MB cap by default). */
+static int tune_block_dev(const char *src, struct mount_opts *mop)
+{
+       struct stat st;
+       char sys_path[PATH_MAX];
+       char partition_path[PATH_MAX];
+       char *real_sys_path = NULL;
+       int rc;
+
+       /*
+        * Don't apply block device tuning for MDT or MGT devices,
+        * since we don't need huge IO sizes to get good performance
+        */
+       if (!IS_OST(&mop->mo_ldd))
+               return 0;
+
+       if (src == NULL)
+               return EINVAL;
+
+       rc = stat(src, &st);
+       if (rc < 0) {
+               if (verbose)
+                       fprintf(stderr, "warning: cannot stat '%s': %s\n",
+                               src, strerror(errno));
+               return errno;
        }
 
-subdevs:
-       /* Purposely ignore errors reported from set_blockdev_scheduler.
-        * The worst that will happen is a block device with an "incorrect"
-        * scheduler. */
-       snprintf(real_path, sizeof(real_path), "%s/%s", path, SCHEDULER_PATH);
-       set_blockdev_scheduler(real_path, DEFAULT_SCHEDULER);
-
-       /* if device is multipath device, tune its slave devices */
-       glob_info.gl_pathc = 0;
-       glob_info.gl_offs = 0;
-       snprintf(real_path, sizeof(real_path), "%s/slaves/*", path);
-       rc = glob(real_path, GLOB_NOSORT, NULL, &glob_info);
-
-       for (i = 0; rc == 0 && i < glob_info.gl_pathc; i++) {
-               slave = basename(glob_info.gl_pathv[i]);
-               snprintf(real_path, sizeof(real_path), "/dev/%s", slave);
-               rc = set_blockdev_tunables(real_path, mop);
+       if (!S_ISBLK(st.st_mode))
+               return 0;
+
+       if (major(st.st_rdev) == LOOP_MAJOR)
+               return 0;
+
+       snprintf(sys_path, sizeof(sys_path), "/sys/dev/block/%u:%u",
+                major(st.st_rdev), minor(st.st_rdev));
+
+       snprintf(partition_path, sizeof(partition_path), "%s/partition",
+                sys_path);
+
+       rc = access(partition_path, F_OK);
+       if (rc < 0) {
+               if (errno == ENOENT)
+                       goto have_whole_dev;
+
+               if (verbose)
+                       fprintf(stderr, "warning: cannot access '%s': %s\n",
+                               partition_path, strerror(errno));
+               rc = errno;
+               goto out;
        }
 
-       if (rc == GLOB_NOMATCH) {
-               /* no slave device is not an error */
-               rc = 0;
-       } else if (rc && verbose) {
-               if (slave == NULL) {
-                       fprintf(stderr, "warning: %s, failed to read"
-                               " entries under %s/slaves\n",
-                               strerror(errno), path);
-               } else {
-                       fprintf(stderr, "unable to set tunables for"
-                               " slave device %s (slave would be"
-                               " unable to handle IO request from"
-                               " master %s)\n",
-                               real_path, source);
-               }
+       snprintf(sys_path, sizeof(sys_path), "/sys/dev/block/%u:%u/..",
+                major(st.st_rdev), minor(st.st_rdev));
+
+have_whole_dev:
+       /* Since we recurse on slave devices we resolve the sys_path to
+        * avoid path buffer overflows. */
+       real_sys_path = realpath(sys_path, NULL);
+       if (real_sys_path == NULL) {
+               if (verbose)
+                       fprintf(stderr,
+                               "warning: cannot resolve '%s': %s\n",
+                               sys_path, strerror(errno));
+               rc = errno;
+               goto out;
        }
-       globfree(&glob_info);
+
+       if (major(st.st_rdev) == MD_MAJOR) {
+               rc = tune_md_stripe_cache_size(real_sys_path, mop);
+       } else {
+               /* Ignore errors from tune_max_sectors_kb() and
+                * tune_block_dev_scheduler(). The worst that will happen
+                * is a block device with an "incorrect" scheduler. */
+               tune_max_sectors_kb(real_sys_path, mop);
+               tune_block_dev_scheduler(real_sys_path, DEFAULT_SCHEDULER);
+
+               /* If device is multipath device then tune its slave
+                * devices. */
+               rc = tune_block_dev_slaves(real_sys_path, mop);
+       }
+
+out:
+       free(real_sys_path);
 
        return rc;
 }
 
 int ldiskfs_tune_lustre(char *dev, struct mount_opts *mop)
 {
-       return set_blockdev_tunables(dev, mop);
+       return tune_block_dev(dev, mop);
 }
 
 int ldiskfs_label_lustre(struct mount_opts *mop)