From eb012d4a10208b26c2d3e795a90f1bb07dde6d91 Mon Sep 17 00:00:00 2001
From: Andreas Dilger <adilger@whamcloud.com>
Date: Fri, 29 Apr 2011 15:59:52 -0600
Subject: [PATCH] LU-255: enable ext4 features by default

Enable the flex_bg, huge_file, and dir_nlink features from ext4 by
default for ext4-based ldiskfs filesystems.  The flex_bg feature
can significantly reduce e2fsck time as well as time-to-first-write
on newly mounted OSTs.

Reduce the number of inodes created on larger OST filesystems, since
there are far too many inodes created by default, which wastes space
and slows down e2fsck.

Increase the number of inodes on MDT filesystems, subject to
constraints from the default number of stripes being stored for
each inode, which may increase the inode size being used.  Remove
changes to the statfs() results on the MDT to limit inode count to
the returned block count since increased MDT inodes will always
exceed free blocks, and it was confusing.

Skip zeroing the journal at format time.  It will be overwritten
at first use, and avoids writing 400MB of zeroes to the filesystem
needlessly.  We can't skip zering the inode table by default, but
newer mke2fs will check if the kernel handles this itself, and
request it internally.

Cleanups to remove support for 2.4 kernels and reiserfs filesystem.

Change-Id: I519bffaad5e97ee68c189efb00e07ebc0b1600f5
Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-on: http://review.whamcloud.com/480
Reviewed-by: Yu Jian <yujian@whamcloud.com>
Tested-by: Hudson
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Johann Lombardi <johann@whamcloud.com>
---
 lustre/lvfs/fsfilt_ext3.c  |  12 ---
 lustre/utils/mkfs_lustre.c | 264 ++++++++++++++++++++++++++-------------------
 2 files changed, 152 insertions(+), 124 deletions(-)
diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c
index a3c10f9..a3b3380 100644
--- a/lustre/lvfs/fsfilt_ext3.c
+++ b/lustre/lvfs/fsfilt_ext3.c
@@ -813,13 +813,6 @@ static int fsfilt_ext3_add_journal_cb(struct obd_device *obd, __u64 last_rcvd,
         return 0;
 }
 
-/*
- * We need to hack the return value for the free inode counts because
- * the current EA code requires one filesystem block per inode with EAs,
- * so it is possible to run out of blocks before we run out of inodes.
- *
- * This can be removed when the ext3 EA code is fixed.
- */
 static int fsfilt_ext3_statfs(struct super_block *sb, struct obd_statfs *osfs)
 {
         struct kstatfs sfs;
@@ -827,11 +820,6 @@ static int fsfilt_ext3_statfs(struct super_block *sb, struct obd_statfs *osfs)
 
         memset(&sfs, 0, sizeof(sfs));
         rc = ll_do_statfs(sb, &sfs);
-        if (!rc && sfs.f_bfree < sfs.f_ffree) {
-                sfs.f_files = (sfs.f_files - sfs.f_ffree) + sfs.f_bfree;
-                sfs.f_ffree = sfs.f_bfree;
-        }
-
         statfs_pack(osfs, &sfs);
         return rc;
 }
diff --git a/lustre/utils/mkfs_lustre.c b/lustre/utils/mkfs_lustre.c
index 0f74cc0..4a8b906 100644
--- a/lustre/utils/mkfs_lustre.c
+++ b/lustre/utils/mkfs_lustre.c
@@ -65,8 +65,8 @@
 #include <ctype.h>
 
 #ifdef __linux__
-/* libcfs.h is not really needed here, but on SLES10/PPC, fs.h includes idr.h which
- * requires BITS_PER_LONG to be defined */
+/* libcfs.h is not really needed here, but on SLES10/PPC, fs.h includes idr.h
+ * which requires BITS_PER_LONG to be defined */
 #include <libcfs/libcfs.h>
 #ifndef BLKGETSIZE64
 #include <linux/fs.h> /* for BLKGETSIZE64 */
@@ -93,8 +93,8 @@
 struct mkfs_opts {
         struct lustre_disk_data mo_ldd; /* to be written in MOUNT_DATA_FILE */
         char  mo_device[128];           /* disk device name */
-        char  mo_mkfsopts[128];         /* options to the backing-store mkfs */
         char  mo_loopdev[128];          /* in case a loop dev is needed */
+        char  mo_mkfsopts[512];         /* options to the backing-store mkfs */
         __u64 mo_device_sz;             /* in KB */
         int   mo_stripe_count;
         int   mo_flags;
@@ -125,21 +125,17 @@ void usage(FILE *out)
                 "\t\t--param <key>=<value> : set a permanent parameter\n"
                 "\t\t\te.g. --param sys.timeout=40\n"
                 "\t\t\t     --param lov.stripesize=2M\n"
-                "\t\t--index=#N : target index (i.e. ost index within the lov)\n"
-                /* FIXME implement 1.6.x
-                "\t\t--configdev=<altdevice|file>: store configuration info\n"
-                "\t\t\tfor this device on an alternate device\n"
-                */
-                "\t\t--comment=<user comment>: arbitrary user string (%d bytes)\n"
+                "\t\t--index=#N : target index (i.e. ost index within lov)\n"
+                "\t\t--comment=<user comment>: arbitrary string (%d bytes)\n"
                 "\t\t--mountfsoptions=<opts> : permanent mount options\n"
-                "\t\t--network=<net>[,<...>] : network(s) to restrict this ost/mdt to\n"
+                "\t\t--network=<net>[,<...>] : restrict OST/MDT to network(s)\n"
 #ifndef TUNEFS
                 "\t\t--backfstype=<fstype> : backing fs type (ext3, ldiskfs)\n"
                 "\t\t--device-size=#N(KB) : device size for loop devices\n"
                 "\t\t--mkfsoptions=<opts> : format options\n"
                 "\t\t--reformat: overwrite an existing disk\n"
-                "\t\t--stripe-count-hint=#N : used for optimizing MDT inode size\n"
-                "\t\t--iam-dir: make use of IAM directory format on backfs, incompatible with ext3.\n"
+                "\t\t--stripe-count-hint=#N : for optimizing MDT inode size\n"
+                "\t\t--iam-dir: use IAM directory format, not ext3 compatible\n"
 #else
                 "\t\t--erase-params : erase all old parameter settings\n"
                 "\t\t--nomgs: turn off MGS service on this MDT\n"
@@ -195,13 +191,13 @@ int get_os_version()
                 char release[4] = "";
 
                 fd = open("/proc/sys/kernel/osrelease", O_RDONLY);
-                if (fd < 0)
+                if (fd < 0) {
                         fprintf(stderr, "%s: Warning: Can't resolve kernel "
                                 "version, assuming 2.6\n", progname);
-                else {
+                } else {
                         if (read(fd, release, 4) < 0) {
                                 fprintf(stderr, "reading from /proc/sys/kernel"
-                                                "/osrelease: %s\n", strerror(errno));
+                                        "/osrelease: %s\n", strerror(errno));
                                 close(fd);
                                 exit(-1);
                         }
@@ -481,82 +477,99 @@ static int is_lustre_target(struct mkfs_opts *mop)
  * mke2fs to check for its support. */
 static int is_e2fsprogs_feature_supp(const char *feature)
 {
+        static char supp_features[4096] = "";
         FILE *fp;
         char cmd[PATH_MAX];
         char imgname[] = "/tmp/test-img-XXXXXX";
         int fd = -1;
-        int ret = 0;
+        int ret = 1;
 
-        snprintf(cmd, sizeof(cmd), "%s -c -R \"supported_features %s\" 2>&1",
-                 DEBUGFS, feature);
+        if (supp_features[0] == '\0') {
+                snprintf(cmd, sizeof(cmd), "%s -c -R supported_features 2>&1",
+                         DEBUGFS);
 
-        /* Using popen() instead of run_command() since debugfs does not return
-         * proper error code if command is not supported */
-        fp = popen(cmd, "r");
-        if (!fp) {
-                fprintf(stderr, "%s: %s\n", progname, strerror(errno));
-                return 0;
-        }
-        ret = fread(cmd, 1, sizeof(cmd), fp);
-        if (ret > 0) {
-                if (strstr(cmd, feature) && !(strstr(cmd, "Unknown")))
+                /* Using popen() instead of run_command() since debugfs does
+                 * not return proper error code if command is not supported */
+                fp = popen(cmd, "r");
+                if (!fp) {
+                        fprintf(stderr, "%s: %s\n", progname, strerror(errno));
                         return 0;
+                }
+                ret = fread(supp_features, 1, sizeof(supp_features), fp);
+                fclose(fp);
         }
+        if (ret > 0 && strstr(supp_features,
+                              strncmp(feature, "-O ", 3) ? feature : feature+3))
+                return 0;
 
         if ((fd = mkstemp(imgname)) < 0)
                 return -1;
+        else
+                close(fd);
 
-        snprintf(cmd, sizeof(cmd), "%s -F -O %s %s 100 >/dev/null 2>&1",
+        snprintf(cmd, sizeof(cmd), "%s -F %s %s 100 >/dev/null 2>&1",
                  MKE2FS, feature, imgname);
         /* run_command() displays the output of mke2fs when it fails for
          * some feature, so use system() directly */
         ret = system(cmd);
-        if (fd >= 0)
-                remove(imgname);
+        unlink(imgname);
 
         return ret;
 }
 
-static void enable_default_backfs_features(struct mkfs_opts *mop)
+static void enable_default_ext4_features(struct mkfs_opts *mop)
 {
-        struct utsname uts;
-        int ret;
-
         if (IS_OST(&mop->mo_ldd))
-                strscat(mop->mo_mkfsopts, " -O dir_index,extents",
+                strscat(mop->mo_mkfsopts, " -O extents,uninit_bg",
                         sizeof(mop->mo_mkfsopts));
         else if (IS_MDT(&mop->mo_ldd))
-                strscat(mop->mo_mkfsopts, " -O dir_index,dirdata",
+                strscat(mop->mo_mkfsopts, " -O dirdata,uninit_bg",
                         sizeof(mop->mo_mkfsopts));
         else
-                strscat(mop->mo_mkfsopts, " -O dir_index",
-                        sizeof(mop->mo_mkfsopts));
-
-        /* Upstream e2fsprogs called our uninit_groups feature uninit_bg,
-         * check for both of them when testing e2fsprogs features. */
-        if (is_e2fsprogs_feature_supp("uninit_bg") == 0)
-                strscat(mop->mo_mkfsopts, ",uninit_bg",
-                        sizeof(mop->mo_mkfsopts));
-        else if (is_e2fsprogs_feature_supp("uninit_groups") == 0)
-                strscat(mop->mo_mkfsopts, ",uninit_groups",
+                strscat(mop->mo_mkfsopts, " -O uninit_bg",
                         sizeof(mop->mo_mkfsopts));
-        else
-                disp_old_e2fsprogs_msg("uninit_bg", 1);
 
-        ret = uname(&uts);
-        if (ret)
-                return;
-
-        /* Multiple mount protection is enabled only if failover node is
-         * specified and if kernel version is higher than 2.6.9 */
+        /* Multiple mount protection enabled only if failover node specified */
         if (failover) {
-                if (is_e2fsprogs_feature_supp("mmp") == 0)
+                if (is_e2fsprogs_feature_supp("-O mmp") == 0)
                         strscat(mop->mo_mkfsopts, ",mmp",
                                 sizeof(mop->mo_mkfsopts));
                 else
                         disp_old_e2fsprogs_msg("mmp", 1);
         }
+
+        /* Allow more than 65000 subdirectories */
+        if (is_e2fsprogs_feature_supp("-O dir_nlink") == 0)
+                strscat(mop->mo_mkfsopts,",dir_nlink",sizeof(mop->mo_mkfsopts));
+
+#ifdef HAVE_EXT4_LDISKFS
+        /* The following options are only valid for ext4-based ldiskfs.
+         * If --backfstype=ext3 is specified, do not enable them. */
+        if (mop->mo_ldd.ldd_mount_type == LDD_MT_EXT3)
+                return;
+
+        /* Allow files larger than 2TB.  Also needs LU-16, but not harmful. */
+        if (is_e2fsprogs_feature_supp("-O huge_file") == 0)
+                strscat(mop->mo_mkfsopts,",huge_file",sizeof(mop->mo_mkfsopts));
+
+        /* Cluster inode/block bitmaps and inode table for more efficient IO.
+         * Align the flex groups on a 1MB boundary for better performance.
+         * This -O feature needs to go last, since it adds an extra option. */
+        if (is_e2fsprogs_feature_supp("-O flex_bg") == 0) {
+                char tmp_buf[64];
+
+                strscat(mop->mo_mkfsopts, ",flex_bg", sizeof(mop->mo_mkfsopts));
+
+                if (IS_OST(&mop->mo_ldd)) {
+                        snprintf(tmp_buf, sizeof(tmp_buf), " -G %u",
+                                 (1 << 20) / L_BLOCK_SIZE);
+                        strscat(mop->mo_mkfsopts, tmp_buf,
+                                sizeof(mop->mo_mkfsopts));
+                }
+        }
+#endif
 }
+
 /* Build fs according to type */
 int make_lustre_backfs(struct mkfs_opts *mop)
 {
@@ -597,6 +610,8 @@ int make_lustre_backfs(struct mkfs_opts *mop)
         if ((mop->mo_ldd.ldd_mount_type == LDD_MT_EXT3) ||
             (mop->mo_ldd.ldd_mount_type == LDD_MT_LDISKFS) ||
             (mop->mo_ldd.ldd_mount_type == LDD_MT_LDISKFS2)) {
+                long inode_size = 0;
+
                 /* Journal size in MB */
                 if (strstr(mop->mo_mkfsopts, "-J") == NULL) {
                         /* Choose our own default journal size */
@@ -617,36 +632,14 @@ int make_lustre_backfs(struct mkfs_opts *mop)
                         }
                 }
 
-                /* Bytes_per_inode: disk size / num inodes */
-                if (strstr(mop->mo_mkfsopts, "-i") == NULL) {
-                        long bytes_per_inode = 0;
-
-                        if (IS_MDT(&mop->mo_ldd))
-                                bytes_per_inode = 4096;
-
-                        /* Allocate fewer inodes on large OST devices.  Most
-                           filesystems can be much more aggressive than even
-                           this. */
-                        if ((IS_OST(&mop->mo_ldd) && (device_sz > 100000000)))
-                                bytes_per_inode = 16384;  /* > 100 Gb device */
-
-
-                        if (bytes_per_inode > 0) {
-                                sprintf(buf, " -i %ld", bytes_per_inode);
-                                strscat(mop->mo_mkfsopts, buf,
-                                        sizeof(mop->mo_mkfsopts));
-                        }
-                }
-
                 /* Inode size (for extended attributes).  The LOV EA size is
                  * 32 (EA hdr) + 32 (lov_mds_md) + stripes * 24 (lov_ost_data),
                  * and we want some margin above that for ACLs, other EAs... */
                 if (strstr(mop->mo_mkfsopts, "-I") == NULL) {
-                        long inode_size = 0;
                         if (IS_MDT(&mop->mo_ldd)) {
                                 if (mop->mo_stripe_count > 72)
                                         inode_size = 512; /* bz 7241 */
-                                /* cray stripes across all osts (>60) */
+                                        /* see also "-i" below for EA blocks */
                                 else if (mop->mo_stripe_count > 32)
                                         inode_size = 2048;
                                 else if (mop->mo_stripe_count > 10)
@@ -654,8 +647,8 @@ int make_lustre_backfs(struct mkfs_opts *mop)
                                 else
                                         inode_size = 512;
                         } else if (IS_OST(&mop->mo_ldd)) {
-                                /* now as we store fids in EA on OST we need
-                                   to make inode bigger */
+                                /* We store MDS FID and OST objid in EA on OST
+                                 * we need to make inode bigger as well. */
                                 inode_size = 256;
                         }
 
@@ -666,13 +659,82 @@ int make_lustre_backfs(struct mkfs_opts *mop)
                         }
                 }
 
+                /* Bytes_per_inode: disk size / num inodes */
+                if (strstr(mop->mo_mkfsopts, "-i") == NULL &&
+                    strstr(mop->mo_mkfsopts, "-N") == NULL) {
+                        long bytes_per_inode = 0;
+
+                        /* Allocate more inodes on MDT devices.  There is
+                         * no data stored on the MDT, and very little extra
+                         * metadata beyond the inode.  It could go down as
+                         * low as 1024 bytes, but this is conservative.
+                         * Account for external EA blocks for wide striping. */
+                        if (IS_MDT(&mop->mo_ldd)) {
+                                bytes_per_inode = inode_size + 1536;
+
+                                if (mop->mo_stripe_count > 72) {
+                                        int extra = mop->mo_stripe_count * 24;
+                                        extra = ((extra - 1) | 4095) + 1;
+                                        bytes_per_inode += extra;
+                                }
+                        }
+
+                        /* Allocate fewer inodes on large OST devices.  Most
+                         * filesystems can be much more aggressive than even
+                         * this, but it is impossible to know in advance. */
+                        if (IS_OST(&mop->mo_ldd)) {
+                                /* OST > 8TB assume average file size 1MB */
+                                if (device_sz >= (8ULL << 30))
+                                        bytes_per_inode = 1024 * 1024;
+                                /* OST > 1TB assume average file size 256kB */
+                                else if (device_sz >= (1ULL << 30))
+                                        bytes_per_inode = 256 * 1024;
+                                /* OST > 100GB assume average file size 64kB,
+                                 * plus a bit so that inodes will fit into a
+                                 * 256x flex_bg without overflowing */
+                                else if (device_sz >= (10ULL << 20))
+                                        bytes_per_inode = 69905;
+                        }
+
+
+                        if (bytes_per_inode > 0) {
+                                sprintf(buf, " -i %ld", bytes_per_inode);
+                                strscat(mop->mo_mkfsopts, buf,
+                                        sizeof(mop->mo_mkfsopts));
+                        }
+                }
+
                 if (verbose < 2) {
                         strscat(mop->mo_mkfsopts, " -q",
                                 sizeof(mop->mo_mkfsopts));
                 }
 
                 if (strstr(mop->mo_mkfsopts, "-O") == NULL)
-                        enable_default_backfs_features(mop);
+                        enable_default_ext4_features(mop);
+
+                /* In order to align the filesystem metadata on 1MB boundaries,
+                 * give a resize value that will reserve a power-of-two group
+                 * descriptor blocks, but leave one block for the superblock.
+                 * Only useful for filesystems with < 2^32 blocks due to resize
+                 * limitations. */
+                if (IS_OST(&mop->mo_ldd) && mop->mo_device_sz > 100 * 1024 &&
+                    mop->mo_device_sz / L_BLOCK_SIZE <= 0xffffffff) {
+                        unsigned group_blocks = L_BLOCK_SIZE * 8;
+                        unsigned desc_per_block = L_BLOCK_SIZE / 32;
+                        unsigned resize_blks;
+
+                        resize_blks = (1ULL<<32) - desc_per_block*group_blocks;
+                        snprintf(buf, sizeof(buf)," -E resize=%u,",resize_blks);
+                } else {
+                        strncpy(buf, " -E ", sizeof(buf));
+                }
+
+                /* Avoid zeroing out the full journal - speeds up mkfs */
+                if (is_e2fsprogs_feature_supp("-E lazy_journal_init") == 0)
+                        strscat(buf, "lazy_journal_init,", sizeof(buf));
+
+                if (strlen(buf) > strlen(" -E "))
+                        strscat(mop->mo_mkfsopts, buf,sizeof(mop->mo_mkfsopts));
 
                 /* Allow reformat of full devices (as opposed to
                    partitions.)  We already checked for mounted dev. */
@@ -681,14 +743,6 @@ int make_lustre_backfs(struct mkfs_opts *mop)
                 snprintf(mkfs_cmd, sizeof(mkfs_cmd),
                          "%s -j -b %d -L %s ", MKE2FS, L_BLOCK_SIZE,
                          mop->mo_ldd.ldd_svname);
-        } else if (mop->mo_ldd.ldd_mount_type == LDD_MT_REISERFS) {
-                long journal_sz = 0; /* FIXME default journal size */
-                if (journal_sz > 0) {
-                        sprintf(buf, " --journal_size %ld", journal_sz);
-                        strscat(mop->mo_mkfsopts, buf,
-                                sizeof(mop->mo_mkfsopts));
-                }
-                snprintf(mkfs_cmd, sizeof(mkfs_cmd), "mkreiserfs -ff ");
         } else {
                 fprintf(stderr,"%s: unsupported fs type: %d (%s)\n",
                         progname, mop->mo_ldd.ldd_mount_type,
@@ -987,7 +1041,7 @@ int read_local_files(struct mkfs_opts *mop)
                 vprint("Reading %s\n", MOUNT_DATA_FILE);
                 num_read = fread(&mop->mo_ldd, sizeof(mop->mo_ldd), 1, filep);
                 if (num_read < 1 && ferror(filep)) {
-                        fprintf(stderr, "%s: Unable to read from file (%s): %s\n",
+                        fprintf(stderr, "%s: Unable to read from file %s: %s\n",
                                 progname, filepnm, strerror(errno));
                         goto out_close;
                 }
@@ -1106,10 +1160,7 @@ void set_defaults(struct mkfs_opts *mop)
         mop->mo_ldd.ldd_flags = LDD_F_NEED_INDEX | LDD_F_UPDATE | LDD_F_VIRGIN;
         mop->mo_mgs_failnodes = 0;
         strcpy(mop->mo_ldd.ldd_fsname, "lustre");
-        if (get_os_version() == 24)
-                mop->mo_ldd.ldd_mount_type = LDD_MT_EXT3;
-        else
-                mop->mo_ldd.ldd_mount_type = LDD_MT_LDISKFS;
+        mop->mo_ldd.ldd_mount_type = LDD_MT_LDISKFS;
 
         mop->mo_ldd.ldd_svindex = INDEX_UNASSIGNED;
         mop->mo_stripe_count = 1;
@@ -1165,7 +1216,8 @@ static char *convert_hostnames(char *s1)
                 nid = libcfs_str2nid(s1);
 
                 if (nid == LNET_NID_ANY) {
-                        fprintf(stderr, "%s: Can't parse NID '%s'\n", progname, s1);
+                        fprintf(stderr, "%s: Can't parse NID '%s'\n",
+                                progname, s1);
                         free(converted);
                         return NULL;
                 }
@@ -1610,18 +1662,14 @@ int main(int argc, char *const argv[])
         switch (ldd->ldd_mount_type) {
         case LDD_MT_EXT3:
         case LDD_MT_LDISKFS:
-        case LDD_MT_LDISKFS2: {
+        case LDD_MT_LDISKFS2:
                 strscat(default_mountopts, ",errors=remount-ro",
                         sizeof(default_mountopts));
                 if (IS_MDT(ldd) || IS_MGS(ldd))
                         strscat(always_mountopts, ",user_xattr",
                                 sizeof(always_mountopts));
-                if ((get_os_version() == 24) && IS_OST(ldd))
-                        strscat(always_mountopts, ",asyncdel",
-                                sizeof(always_mountopts));
-                /* NB: Files created while extents are enabled cannot be read
-                   if mounted with a kernel that doesn't include the Lustre ldiskfs
-                   patches! */
+                /* NB: Files created while extents are enabled can only be read
+                 * if mounted using the ext4 or ldiskfs filesystem type. */
                 if (IS_OST(ldd) &&
                     (ldd->ldd_mount_type == LDD_MT_LDISKFS ||
                      ldd->ldd_mount_type == LDD_MT_LDISKFS2)) {
@@ -1629,14 +1677,7 @@ int main(int argc, char *const argv[])
                                 sizeof(default_mountopts));
                 }
                 break;
-        }
-        case LDD_MT_SMFS: {
-                mop.mo_flags |= MO_IS_LOOP;
-                sprintf(always_mountopts, ",type=ext3,dev=%s",
-                        mop.mo_device);
-                break;
-        }
-        default: {
+        default:
                 fatal();
                 fprintf(stderr, "unknown fs type %d '%s'\n",
                         ldd->ldd_mount_type,
@@ -1644,7 +1685,6 @@ int main(int argc, char *const argv[])
                 ret = EINVAL;
                 goto out;
         }
-        }
 
         if (mountopts) {
                 trim_mountfsoptions(mountopts);
-- 
1.8.3.1