X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Futils%2Flibmount_utils_ldiskfs.c;h=c1f79ae2df1fdbeb6a207007e05419ec6caeca33;hb=2f8d7b4679de3fa467040aa61733f262714e39c9;hp=c6ece221ec748691e7df2259c55edb6522b1753c;hpb=6712478e79588e73e28c7ccac3afc7ac2368a4f3;p=fs%2Flustre-release.git

diff --git a/lustre/utils/libmount_utils_ldiskfs.c b/lustre/utils/libmount_utils_ldiskfs.c
index c6ece22..c1f79ae 100644
--- a/lustre/utils/libmount_utils_ldiskfs.c
+++ b/lustre/utils/libmount_utils_ldiskfs.c
@@ -23,7 +23,7 @@
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  *
- * Copyright (c) 2012, 2016, Intel Corporation.
+ * Copyright (c) 2012, 2017, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
@@ -49,14 +49,15 @@
 #include <string.h>
 #include <inttypes.h>
 #include <unistd.h>
+#include <dirent.h>
 #include <fcntl.h>
 #include <mntent.h>
-#include <glob.h>
 
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/mount.h>
 #include <sys/utsname.h>
+#include <sys/sysmacros.h>
 
 #include <string.h>
 #include <getopt.h>
@@ -66,10 +67,12 @@
 #ifndef BLKGETSIZE64
 #include <linux/fs.h> /* for BLKGETSIZE64 */
 #endif
+#include <linux/major.h>
 #include <linux/types.h>
 #include <linux/version.h>
 #include <linux/lnet/lnetctl.h>
 #include <linux/lustre/lustre_ver.h>
+#include <libcfs/util/string.h>
 
 #ifdef HAVE_SELINUX
 #include <selinux/selinux.h>
@@ -120,57 +123,6 @@ static void append_context_for_mount(char *mntpt, struct mkfs_opts *mop)
 }
 #endif
 
-/* return canonicalized absolute pathname, even if the target file does not
- * exist, unlike realpath */
-static char *absolute_path(char *devname)
-{
-	char  buf[PATH_MAX + 1] = "";
-	char *path;
-	char *ptr;
-	int len;
-
-	path = malloc(sizeof(buf));
-	if (path == NULL)
-		return NULL;
-
-	if (devname[0] != '/') {
-		if (getcwd(buf, sizeof(buf) - 1) == NULL) {
-			free(path);
-			return NULL;
-		}
-		len = snprintf(path, sizeof(buf), "%s/%s", buf, devname);
-		if (len >= sizeof(buf)) {
-			free(path);
-			return NULL;
-		}
-	} else {
-		len = snprintf(path, sizeof(buf), "%s", devname);
-		if (len >= sizeof(buf)) {
-			free(path);
-			return NULL;
-		}
-	}
-
-	/* truncate filename before calling realpath */
-	ptr = strrchr(path, '/');
-	if (ptr == NULL) {
-		free(path);
-		return NULL;
-	}
-	*ptr = '\0';
-	if (buf != realpath(path, buf)) {
-		free(path);
-		return NULL;
-	}
-	/* add the filename back */
-	len = snprintf(path, PATH_MAX, "%s/%s", buf, ptr+1);
-	if (len >= PATH_MAX) {
-		free(path);
-		return NULL;
-	}
-	return path;
-}
-
 /* Determine if a device is a block device (as opposed to a file) */
 static int is_block(char *devname)
 {
@@ -178,10 +130,10 @@ static int is_block(char *devname)
 	int	ret = 0;
 	char	*devpath;
 
-	devpath = absolute_path(devname);
-	if (devpath == NULL) {
-		fprintf(stderr, "%s: failed to resolve path to %s\n",
-			progname, devname);
+	ret = cfs_abs_path(devname, &devpath);
+	if (ret != 0) {
+		fprintf(stderr, "%s: failed to resolve path '%s': %s\n",
+			progname, devname, strerror(-ret));
 		return -1;
 	}
 
@@ -240,7 +192,7 @@ static int is_feature_enabled(const char *feature, const char *devpath)
 int ldiskfs_write_ldd(struct mkfs_opts *mop)
 {
 	char mntpt[] = "/tmp/mntXXXXXX";
-	char filepnm[128];
+	char filepnm[192];
 	char *dev;
 	FILE *filep;
 	int ret = 0;
@@ -436,7 +388,7 @@ static void disp_old_e2fsprogs_msg(const char *feature, int make_backfs)
 		E2FSPROGS, feature);
 #if !(HAVE_LDISKFSPROGS)
 	fprintf(stderr, "Please install the latest version of e2fsprogs from\n"
-		"https://downloads.hpdd.intel.com/public/e2fsprogs/latest/\n"
+		"https://downloads.whamcloud.com/public/e2fsprogs/latest/\n"
 		"to enable this feature.\n");
 #endif
 	if (make_backfs)
@@ -590,6 +542,13 @@ static void append_unique(char *buf, char *prefix, char *key, char *val,
 static int enable_default_ext4_features(struct mkfs_opts *mop, char *anchor,
 					size_t maxbuflen, int user_spec)
 {
+	int enable_64bit = 0;
+
+	/* Enable large block addresses if the LUN is over 2^32 blocks. */
+	if ((mop->mo_device_kb / (L_BLOCK_SIZE >> 10) > UINT32_MAX) &&
+	     is_e2fsprogs_feature_supp("-O 64bit") == 0)
+		enable_64bit = 1;
+
 	if (IS_OST(&mop->mo_ldd)) {
 		append_unique(anchor, user_spec ? "," : " -O ",
 			      "extents", NULL, maxbuflen);
@@ -598,7 +557,10 @@ static int enable_default_ext4_features(struct mkfs_opts *mop, char *anchor,
 		append_unique(anchor, user_spec ? "," : " -O ",
 			      "dirdata", NULL, maxbuflen);
 		append_unique(anchor, ",", "uninit_bg", NULL, maxbuflen);
-		append_unique(anchor, ",", "^extents", NULL, maxbuflen);
+		if (enable_64bit)
+			append_unique(anchor, ",", "extents", NULL, maxbuflen);
+		else
+			append_unique(anchor, ",", "^extents", NULL, maxbuflen);
 	} else {
 		append_unique(anchor, user_spec ? "," : " -O ",
 			      "uninit_bg", NULL, maxbuflen);
@@ -635,9 +597,7 @@ static int enable_default_ext4_features(struct mkfs_opts *mop, char *anchor,
 	if (is_e2fsprogs_feature_supp("-O huge_file") == 0)
 		append_unique(anchor, ",", "huge_file", NULL, maxbuflen);
 
-	/* Enable large block addresses if the LUN is over 2^32 blocks. */
-	if (mop->mo_device_kb / (L_BLOCK_SIZE >> 10) >= 0x100002000ULL &&
-	    is_e2fsprogs_feature_supp("-O 64bit") == 0)
+	if (enable_64bit)
 		append_unique(anchor, ",", "64bit", NULL, maxbuflen);
 
 	/* Cluster inode/block bitmaps and inode table for more efficient IO.
@@ -779,24 +739,26 @@ int ldiskfs_make_lustre(struct mkfs_opts *mop)
 		 * (assuming all files are in composite layout and has
 		 * 3 components):
 		 *
-		 *   ldiskfs inode size: 156
-		 *   extended attributes size, including:
+		 *   ldiskfs inode size: 160
+		 *   MDT extended attributes size, including:
 		 *	ext4_xattr_header: 32
 		 *	LOV EA size: 32(lov_comp_md_v1) +
 		 *		     3 * 40(lov_comp_md_entry_v1) +
 		 *		     3 * 32(lov_mds_md) +
 		 *		     stripes * 24(lov_ost_data) +
-		 *		     16(xattr_entry) + 3(lov)
+		 *		     16(xattr_entry) + 4("lov")
 		 *	LMA EA size: 24(lustre_mdt_attrs) +
-		 *		     16(xattr_entry) + 3(lma)
+		 *		     16(xattr_entry) + 4("lma")
+		 *	SOM EA size: 24(lustre_som_attrs) +
+		 *		     16(xattr_entry) + 4("som")
 		 *	link EA size: 24(link_ea_header) + 18(link_ea_entry) +
-		 *		      (filename) + 16(xattr_entry) + 4(link)
+		 *		      16(filename) + 16(xattr_entry) + 4("link")
 		 *   and some margin for 4-byte alignment, ACLs and other EAs.
 		 *
 		 * If we say the average filename length is about 32 bytes,
 		 * the calculation looks like:
-		 * 156 + 32 + (32+3*(40 + 32)+24*N+19) + (24+19) +
-		 * (24+18+~32+20) + other <= 512*2^m, {m=0,1,2,3}
+		 * 160 + 32 + (32+3*(40+32)+24*stripes+20) + (24+20) + (24+20) +
+		 *  (24+20) + (~42+16+20) + other <= 512*2^m, {m=0,1,2,3}
 		 */
 		if (strstr(mop->mo_mkfsopts, "-I") == NULL) {
 			if (IS_MDT(&mop->mo_ldd)) {
@@ -809,7 +771,16 @@ int ldiskfs_make_lustre(struct mkfs_opts *mop)
 					inode_size = 1024;
 			} else if (IS_OST(&mop->mo_ldd)) {
 				/* We store MDS FID and necessary composite
-				 * layout information in the OST object EA. */
+				 * layout information in the OST object EA:
+				 *   ldiskfs inode size: 160
+				 *   OST extended attributes size, including:
+				 *	ext4_xattr_header: 32
+				 *	LMA EA size: 24(lustre_mdt_attrs) +
+				 *		     16(xattr_entry) + 4("lma")
+				 *	FID EA size: 52(filter_fid) +
+				 *		     16(xattr_entry) + 4("fid")
+				 * 160 + 32 + (24+20) + (52+20) = 308
+				 */
 				inode_size = 512;
 			}
 
@@ -864,6 +835,7 @@ int ldiskfs_make_lustre(struct mkfs_opts *mop)
 				sprintf(buf, " -i %ld", bytes_per_inode);
 				strscat(mop->mo_mkfsopts, buf,
 					sizeof(mop->mo_mkfsopts));
+				mop->mo_inode_size = bytes_per_inode;
 			}
 		}
 
@@ -1046,290 +1018,307 @@ static int write_file(const char *path, const char *buf)
 	return rc < 0 ? errno : 0;
 }
 
-static int set_blockdev_scheduler(const char *path, const char *scheduler)
+static int tune_md_stripe_cache_size(const char *sys_path,
+				     struct mount_opts *mop)
 {
-	char buf[PATH_MAX], *s, *e, orig_sched[50];
+	char path[PATH_MAX];
+	unsigned long old_stripe_cache_size;
+	unsigned long new_stripe_cache_size;
+	char buf[3 * sizeof(old_stripe_cache_size) + 2];
 	int rc;
 
-	/* Before setting the scheduler, we need to check to see if it's
-	 * already set to "noop". If it is, we don't want to override
-	 * that setting. If it's set to anything other than "noop", set
-	 * the scheduler to what has been passed in. */
+	if (mop->mo_md_stripe_cache_size <= 0)
+		return 0;
+
+	new_stripe_cache_size = mop->mo_md_stripe_cache_size;
 
+	snprintf(path, sizeof(path), "%s/%s", sys_path, STRIPE_CACHE_SIZE);
 	rc = read_file(path, buf, sizeof(buf));
-	if (rc) {
+	if (rc != 0) {
 		if (verbose)
-			fprintf(stderr, "%s: cannot open '%s': %s\n",
-				progname, path, strerror(errno));
+			fprintf(stderr, "warning: cannot read '%s': %s\n",
+				path, strerror(errno));
 		return rc;
 	}
 
-	/* The expected format of buf: noop anticipatory deadline [cfq] */
-	s = strchr(buf, '[');
-	e = strchr(buf, ']');
-
-	/* If the format is not what we expect. Play it safe and error out. */
-	if (s == NULL || e == NULL) {
-		if (verbose)
-			fprintf(stderr, "%s: cannot parse scheduler "
-					"options for '%s'\n", progname, path);
-		return -EINVAL;
-	}
-
-	snprintf(orig_sched, e - s, "%s", s + 1);
+	old_stripe_cache_size = strtoul(buf, NULL, 0);
+	if (old_stripe_cache_size == 0 || old_stripe_cache_size == ULONG_MAX)
+		return EINVAL;
 
-	if (strcmp(orig_sched, "noop") == 0 ||
-	    strcmp(orig_sched, scheduler) == 0)
+	if (new_stripe_cache_size <= old_stripe_cache_size)
 		return 0;
 
-	rc = write_file(path, scheduler);
-	if (rc) {
+	snprintf(buf, sizeof(buf), "%lu", new_stripe_cache_size);
+	rc = write_file(path, buf);
+	if (rc != 0) {
 		if (verbose)
-			fprintf(stderr, "%s: cannot set scheduler on "
-					"'%s': %s\n", progname, path,
-					strerror(errno));
+			fprintf(stderr, "warning: cannot write '%s': %s\n",
+				path, strerror(errno));
 		return rc;
-	} else {
-		fprintf(stderr, "%s: change scheduler of %s from %s to %s\n",
-			progname, path, orig_sched, scheduler);
 	}
 
-	return rc;
+	return 0;
 }
 
-/* This is to tune the kernel for good SCSI performance.
- * For that we set the value of /sys/block/{dev}/queue/max_sectors_kb
- * to the value of /sys/block/{dev}/queue/max_hw_sectors_kb */
-static int set_blockdev_tunables(char *source, struct mount_opts *mop)
+static int tune_max_sectors_kb(const char *sys_path, struct mount_opts *mop)
 {
-	glob_t glob_info = { 0 };
-	struct stat stat_buf;
-	char *chk_major, *chk_minor;
-	char *savept = NULL, *dev;
-	char *ret_path;
-	char buf[PATH_MAX] = {'\0'}, path[PATH_MAX] = {'\0'};
-	char real_path[PATH_MAX] = {'\0'};
-	int i, rc = 0;
-	int major, minor;
-	char *slave = NULL;
-
-	if (!source)
-		return -EINVAL;
-
-	ret_path = realpath(source, real_path);
-	if (ret_path == NULL) {
-		if (verbose)
-			fprintf(stderr, "warning: %s: cannot resolve: %s\n",
-				source, strerror(errno));
-		return -EINVAL;
+	char path[PATH_MAX];
+	unsigned long max_hw_sectors_kb;
+	unsigned long old_max_sectors_kb;
+	unsigned long new_max_sectors_kb;
+	char buf[3 * sizeof(old_max_sectors_kb) + 2];
+	int rc;
+
+	if (mop->mo_max_sectors_kb >= 0) {
+		new_max_sectors_kb = mop->mo_max_sectors_kb;
+		goto have_new_max_sectors_kb;
 	}
 
-	if (strncmp(real_path, "/dev/loop", 9) == 0)
+	snprintf(path, sizeof(path), "%s/%s", sys_path, MAX_HW_SECTORS_KB_PATH);
+	rc = read_file(path, buf, sizeof(buf));
+	if (rc != 0) {
+		/* No MAX_HW_SECTORS_KB_PATH isn't necessary an
+		 * error for some devices. */
 		return 0;
+	}
 
-	if ((real_path[0] != '/') && (strpbrk(real_path, ",:") != NULL))
+	max_hw_sectors_kb = strtoul(buf, NULL, 0);
+	if (max_hw_sectors_kb == 0 || max_hw_sectors_kb == ULLONG_MAX) {
+		/* No digits at all or something weird. */
 		return 0;
+	}
 
-	snprintf(path, sizeof(path), "/sys/block%s", real_path + 4);
-	if (access(path, X_OK) == 0)
-		goto set_params;
+	new_max_sectors_kb = max_hw_sectors_kb;
 
-	/* The name of the device say 'X' specified in /dev/X may not
-	 * match any entry under /sys/block/. In that case we need to
-	 * match the major/minor number to find the entry under
-	 * sys/block corresponding to /dev/X */
+	/* Don't increase IO request size limit past 16MB.  It is
+	 * about PTLRPC_MAX_BRW_SIZE, but that isn't in a public
+	 * header.  Note that even though the block layer allows
+	 * larger values, setting max_sectors_kb = 32768 causes
+	 * crashes (LU-6974). */
+	if (new_max_sectors_kb > 16 * 1024)
+		new_max_sectors_kb = 16 * 1024;
 
-	/* Don't chop tail digit on /dev/mapper/xxx, LU-478 */
-	if (strncmp(real_path, "/dev/mapper", 11) != 0) {
-		dev = real_path + strlen(real_path);
-		while (--dev > real_path && isdigit(*dev))
-			*dev = 0;
+have_new_max_sectors_kb:
+	snprintf(path, sizeof(path), "%s/%s", sys_path, MAX_SECTORS_KB_PATH);
+	rc = read_file(path, buf, sizeof(buf));
+	if (rc != 0) {
+		/* No MAX_SECTORS_KB_PATH isn't necessary an error for
+		 * some devices. */
+		return 0;
+	}
 
-		if (strncmp(real_path, "/dev/md", 7) == 0 && dev[0] == 'p')
-			*dev = 0;
+	old_max_sectors_kb = strtoul(buf, NULL, 0);
+	if (old_max_sectors_kb == 0 || old_max_sectors_kb == ULLONG_MAX) {
+		/* No digits at all or something weird. */
+		return 0;
 	}
 
-	rc = stat(real_path, &stat_buf);
-	if (rc) {
+	if (new_max_sectors_kb <= old_max_sectors_kb)
+		return 0;
+
+	snprintf(buf, sizeof(buf), "%lu", new_max_sectors_kb);
+	rc = write_file(path, buf);
+	if (rc != 0) {
 		if (verbose)
-			fprintf(stderr, "warning: %s, device %s stat failed\n",
-				strerror(errno), real_path);
+			fprintf(stderr, "warning: cannot write '%s': %s\n",
+				path, strerror(errno));
 		return rc;
 	}
 
-	major = major(stat_buf.st_rdev);
-	minor = minor(stat_buf.st_rdev);
-	rc = glob("/sys/block/*", GLOB_NOSORT, NULL, &glob_info);
-	if (rc) {
+	fprintf(stderr, "%s: increased '%s' from %lu to %lu\n",
+		progname, path, old_max_sectors_kb, new_max_sectors_kb);
+
+	return 0;
+}
+
+static int tune_block_dev_scheduler(const char *sys_path, const char *new_sched)
+{
+	char path[PATH_MAX];
+	char buf[PATH_MAX];
+	char *s, *e;
+	char *old_sched;
+	int rc;
+
+	/* Before setting the scheduler, we need to check to see if
+	 * it's already set to "noop". If it is then we don't want to
+	 * override that setting. If it's set to anything other than
+	 * "noop" then set the scheduler to what has been passed
+	 * in. */
+
+	snprintf(path, sizeof(path), "%s/%s", sys_path, SCHEDULER_PATH);
+	rc = read_file(path, buf, sizeof(buf));
+	if (rc != 0) {
 		if (verbose)
-			fprintf(stderr, "warning: failed to read entries under "
-				"/sys/block\n");
-		globfree(&glob_info);
+			fprintf(stderr, "%s: cannot read '%s': %s\n",
+				progname, path, strerror(errno));
+
 		return rc;
 	}
 
-	for (i = 0; i < glob_info.gl_pathc; i++){
-		snprintf(path, sizeof(path), "%s/dev", glob_info.gl_pathv[i]);
-
-		rc = read_file(path, buf, sizeof(buf));
-		if (rc)
-			continue;
+	/* The expected format of buf: noop anticipatory deadline [cfq] */
+	s = strchr(buf, '[');
+	e = strchr(buf, ']');
 
-		if (buf[strlen(buf) - 1] == '\n')
-			buf[strlen(buf) - 1] = '\0';
+	/* If the format is not what we expect then be safe and error out. */
+	if (s == NULL || e == NULL || !(s < e)) {
+		if (verbose)
+			fprintf(stderr,
+				"%s: cannot parse scheduler options for '%s'\n",
+				progname, path);
 
-		chk_major = strtok_r(buf, ":", &savept);
-		chk_minor = savept;
-		if (chk_major != NULL && major == atoi(chk_major) &&
-		    chk_minor != NULL && minor == atoi(chk_minor))
-			break;
+		return EINVAL;
 	}
 
-	if (i == glob_info.gl_pathc) {
+	old_sched = s + 1;
+	*e = '\0';
+
+	if (strcmp(old_sched, "noop") == 0 ||
+	    strcmp(old_sched, new_sched) == 0)
+		return 0;
+
+	rc = write_file(path, new_sched);
+	if (rc != 0) {
 		if (verbose)
-			fprintf(stderr,"warning: device %s does not match any "
-				"entry under /sys/block\n", real_path);
-		globfree(&glob_info);
-		return -EINVAL;
+			fprintf(stderr,
+				"%s: cannot set scheduler on '%s': %s\n",
+				progname, path, strerror(errno));
+		return rc;
 	}
 
-	/* Chop off "/dev" from path we found */
-	path[strlen(glob_info.gl_pathv[i])] = '\0';
-	globfree(&glob_info);
+	fprintf(stderr, "%s: changed scheduler of '%s' from %s to %s\n",
+		progname, path, old_sched, new_sched);
 
-set_params:
-	if (strncmp(real_path, "/dev/md", 7) == 0) {
-		snprintf(real_path, sizeof(real_path), "%s/%s", path,
-			 STRIPE_CACHE_SIZE);
+	return 0;
+}
 
-		rc = read_file(real_path, buf, sizeof(buf));
-		if (rc) {
-			if (verbose)
-				fprintf(stderr, "warning: opening %s: %s\n",
-					real_path, strerror(errno));
-			return 0;
-		}
+static int tune_block_dev(const char *src, struct mount_opts *mop);
 
-		if (atoi(buf) >= mop->mo_md_stripe_cache_size)
+static int tune_block_dev_slaves(const char *sys_path, struct mount_opts *mop)
+{
+	char slaves_path[PATH_MAX];
+	DIR *slaves_dir;
+	struct dirent *d;
+	int rc = 0;
+
+	snprintf(slaves_path, sizeof(slaves_path), "%s/slaves", sys_path);
+	slaves_dir = opendir(slaves_path);
+	if (slaves_dir == NULL) {
+		if (errno == ENOENT)
 			return 0;
 
-		if (strlen(buf) - 1 > 0) {
-			snprintf(buf, sizeof(buf), "%d",
-				 mop->mo_md_stripe_cache_size);
-			rc = write_file(real_path, buf);
-			if (rc != 0 && verbose)
-				fprintf(stderr, "warning: opening %s: %s\n",
-					real_path, strerror(errno));
-		}
-		/* Return since raid and disk tunables are different */
-		return rc;
+		return errno;
 	}
 
-	if (mop->mo_max_sectors_kb >= 0) {
-		snprintf(buf, sizeof(buf), "%d", mop->mo_max_sectors_kb);
-	} else {
-		snprintf(real_path, sizeof(real_path), "%s/%s", path,
-			 MAX_HW_SECTORS_KB_PATH);
-		rc = read_file(real_path, buf, sizeof(buf));
-		if (rc) {
-			if (verbose)
-				fprintf(stderr, "warning: opening %s: %s\n",
-					real_path, strerror(errno));
-			/* No MAX_HW_SECTORS_KB_PATH isn't necessary an
-			 * error for some device. */
-			goto subdevs;
-		}
+	while ((d = readdir(slaves_dir)) != NULL) {
+		char path[PATH_MAX];
+		int rc2;
+
+		if (d->d_type != DT_LNK)
+			continue;
+
+		snprintf(path, sizeof(path), "%s/%s", slaves_path, d->d_name);
+		rc2 = tune_block_dev(path, mop);
+		if (rc2 != 0)
+			rc = rc2;
 	}
 
-	if (strlen(buf) - 1 > 0) {
-		char oldbuf[32] = "", *end = NULL;
-		unsigned long long oldval, newval;
-
-		snprintf(real_path, sizeof(real_path), "%s/%s", path,
-			 MAX_SECTORS_KB_PATH);
-		rc = read_file(real_path, oldbuf, sizeof(oldbuf));
-		/* Only set new parameter if different from the old one. */
-		if (rc != 0 || strcmp(oldbuf, buf) == 0) {
-			/* No MAX_SECTORS_KB_PATH isn't necessary an
-			 * error for some device. */
-			goto subdevs;
-		}
+	closedir(slaves_dir);
 
-		newval = strtoull(buf, &end, 0);
-		if (newval == 0 || newval == ULLONG_MAX || end == buf)
-			goto subdevs;
-
-		/* Don't increase IO request size limit past 16MB.  It is about
-		 * PTLRPC_MAX_BRW_SIZE, but that isn't in a public header.
-		 * Note that even though the block layer allows larger values,
-		 * setting max_sectors_kb = 32768 causes crashes (LU-6974). */
-		if (mop->mo_max_sectors_kb < 0 && newval > 16 * 1024) {
-			newval = 16 * 1024;
-			snprintf(buf, sizeof(buf), "%llu", newval);
-		}
+	return rc;
+}
 
-		oldval = strtoull(oldbuf, &end, 0);
-		/* Don't shrink the current limit. */
-		if (mop->mo_max_sectors_kb < 0 && oldval != ULLONG_MAX &&
-		    newval <= oldval)
-			goto subdevs;
-
-		rc = write_file(real_path, buf);
-		if (rc != 0) {
-			if (verbose)
-				fprintf(stderr, "warning: writing to %s: %s\n",
-					real_path, strerror(errno));
-			/* No MAX_SECTORS_KB_PATH isn't necessary an
-			 * error for some device. */
-			goto subdevs;
-		}
-		fprintf(stderr, "%s: increased %s from %s to %s\n",
-			progname, real_path, oldbuf, buf);
+/* This is to tune the kernel for good SCSI performance.
+ * For that we set the value of /sys/block/{dev}/queue/max_sectors_kb
+ * to the value of /sys/block/{dev}/queue/max_hw_sectors_kb */
+static int tune_block_dev(const char *src, struct mount_opts *mop)
+{
+	struct stat st;
+	char sys_path[PATH_MAX];
+	char partition_path[PATH_MAX];
+	char *real_sys_path = NULL;
+	int rc;
+
+	/*
+	 * Don't apply block device tuning for MDT or MGT devices,
+	 * since we don't need huge IO sizes to get good performance
+	 */
+	if (!IS_OST(&mop->mo_ldd))
+		return 0;
+
+	if (src == NULL)
+		return EINVAL;
+
+	rc = stat(src, &st);
+	if (rc < 0) {
+		if (verbose)
+			fprintf(stderr, "warning: cannot stat '%s': %s\n",
+				src, strerror(errno));
+		return errno;
 	}
 
-subdevs:
-	/* Purposely ignore errors reported from set_blockdev_scheduler.
-	 * The worst that will happen is a block device with an "incorrect"
-	 * scheduler. */
-	snprintf(real_path, sizeof(real_path), "%s/%s", path, SCHEDULER_PATH);
-	set_blockdev_scheduler(real_path, DEFAULT_SCHEDULER);
-
-	/* if device is multipath device, tune its slave devices */
-	glob_info.gl_pathc = 0;
-	glob_info.gl_offs = 0;
-	snprintf(real_path, sizeof(real_path), "%s/slaves/*", path);
-	rc = glob(real_path, GLOB_NOSORT, NULL, &glob_info);
-
-	for (i = 0; rc == 0 && i < glob_info.gl_pathc; i++) {
-		slave = basename(glob_info.gl_pathv[i]);
-		snprintf(real_path, sizeof(real_path), "/dev/%s", slave);
-		rc = set_blockdev_tunables(real_path, mop);
+	if (!S_ISBLK(st.st_mode))
+		return 0;
+
+	if (major(st.st_rdev) == LOOP_MAJOR)
+		return 0;
+
+	snprintf(sys_path, sizeof(sys_path), "/sys/dev/block/%u:%u",
+		 major(st.st_rdev), minor(st.st_rdev));
+
+	snprintf(partition_path, sizeof(partition_path), "%s/partition",
+		 sys_path);
+
+	rc = access(partition_path, F_OK);
+	if (rc < 0) {
+		if (errno == ENOENT)
+			goto have_whole_dev;
+
+		if (verbose)
+			fprintf(stderr, "warning: cannot access '%s': %s\n",
+				partition_path, strerror(errno));
+		rc = errno;
+		goto out;
 	}
 
-	if (rc == GLOB_NOMATCH) {
-		/* no slave device is not an error */
-		rc = 0;
-	} else if (rc && verbose) {
-		if (slave == NULL) {
-			fprintf(stderr, "warning: %s, failed to read"
-				" entries under %s/slaves\n",
-				strerror(errno), path);
-		} else {
-			fprintf(stderr, "unable to set tunables for"
-				" slave device %s (slave would be"
-				" unable to handle IO request from"
-				" master %s)\n",
-				real_path, source);
-		}
+	snprintf(sys_path, sizeof(sys_path), "/sys/dev/block/%u:%u/..",
+		 major(st.st_rdev), minor(st.st_rdev));
+
+have_whole_dev:
+	/* Since we recurse on slave devices we resolve the sys_path to
+	 * avoid path buffer overflows. */
+	real_sys_path = realpath(sys_path, NULL);
+	if (real_sys_path == NULL) {
+		if (verbose)
+			fprintf(stderr,
+				"warning: cannot resolve '%s': %s\n",
+				sys_path, strerror(errno));
+		rc = errno;
+		goto out;
 	}
-	globfree(&glob_info);
+
+	if (major(st.st_rdev) == MD_MAJOR) {
+		rc = tune_md_stripe_cache_size(real_sys_path, mop);
+	} else {
+		/* Ignore errors from tune_max_sectors_kb() and
+		 * tune_scheduler(). The worst that will happen is a block
+		 * device with an "incorrect" scheduler. */
+		tune_max_sectors_kb(real_sys_path, mop);
+		tune_block_dev_scheduler(real_sys_path, DEFAULT_SCHEDULER);
+
+		/* If device is multipath device then tune its slave
+		 * devices. */
+		rc = tune_block_dev_slaves(real_sys_path, mop);
+	}
+
+out:
+	free(real_sys_path);
 
 	return rc;
 }
 
 int ldiskfs_tune_lustre(char *dev, struct mount_opts *mop)
 {
-	return set_blockdev_tunables(dev, mop);
+	return tune_block_dev(dev, mop);
 }
 
 int ldiskfs_label_lustre(struct mount_opts *mop)