From eb012d4a10208b26c2d3e795a90f1bb07dde6d91 Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Fri, 29 Apr 2011 15:59:52 -0600 Subject: [PATCH] LU-255: enable ext4 features by default Enable the flex_bg, huge_file, and dir_nlink features from ext4 by default for ext4-based ldiskfs filesystems. The flex_bg feature can significantly reduce e2fsck time as well as time-to-first-write on newly mounted OSTs. Reduce the number of inodes created on larger OST filesystems, since there are far too many inodes created by default, which wastes space and slows down e2fsck. Increase the number of inodes on MDT filesystems, subject to constraints from the default number of stripes being stored for each inode, which may increase the inode size being used. Remove changes to the statfs() results on the MDT to limit inode count to the returned block count since increased MDT inodes will always exceed free blocks, and it was confusing. Skip zeroing the journal at format time. The journal will be overwritten at first use, and skipping this avoids writing 400MB of zeroes to the filesystem needlessly. We can't skip zeroing the inode table by default, but newer mke2fs will check if the kernel handles this itself, and request it internally. Cleanups to remove support for 2.4 kernels and the reiserfs filesystem. 
Change-Id: I519bffaad5e97ee68c189efb00e07ebc0b1600f5 Signed-off-by: Andreas Dilger Reviewed-on: http://review.whamcloud.com/480 Reviewed-by: Yu Jian Tested-by: Hudson Tested-by: Maloo Reviewed-by: Johann Lombardi --- lustre/lvfs/fsfilt_ext3.c | 12 --- lustre/utils/mkfs_lustre.c | 264 ++++++++++++++++++++++++++------------------- 2 files changed, 152 insertions(+), 124 deletions(-) diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c index a3c10f9..a3b3380 100644 --- a/lustre/lvfs/fsfilt_ext3.c +++ b/lustre/lvfs/fsfilt_ext3.c @@ -813,13 +813,6 @@ static int fsfilt_ext3_add_journal_cb(struct obd_device *obd, __u64 last_rcvd, return 0; } -/* - * We need to hack the return value for the free inode counts because - * the current EA code requires one filesystem block per inode with EAs, - * so it is possible to run out of blocks before we run out of inodes. - * - * This can be removed when the ext3 EA code is fixed. - */ static int fsfilt_ext3_statfs(struct super_block *sb, struct obd_statfs *osfs) { struct kstatfs sfs; @@ -827,11 +820,6 @@ static int fsfilt_ext3_statfs(struct super_block *sb, struct obd_statfs *osfs) memset(&sfs, 0, sizeof(sfs)); rc = ll_do_statfs(sb, &sfs); - if (!rc && sfs.f_bfree < sfs.f_ffree) { - sfs.f_files = (sfs.f_files - sfs.f_ffree) + sfs.f_bfree; - sfs.f_ffree = sfs.f_bfree; - } - statfs_pack(osfs, &sfs); return rc; } diff --git a/lustre/utils/mkfs_lustre.c b/lustre/utils/mkfs_lustre.c index 0f74cc0..4a8b906 100644 --- a/lustre/utils/mkfs_lustre.c +++ b/lustre/utils/mkfs_lustre.c @@ -65,8 +65,8 @@ #include #ifdef __linux__ -/* libcfs.h is not really needed here, but on SLES10/PPC, fs.h includes idr.h which - * requires BITS_PER_LONG to be defined */ +/* libcfs.h is not really needed here, but on SLES10/PPC, fs.h includes idr.h + * which requires BITS_PER_LONG to be defined */ #include #ifndef BLKGETSIZE64 #include /* for BLKGETSIZE64 */ @@ -93,8 +93,8 @@ struct mkfs_opts { struct lustre_disk_data mo_ldd; /* to be written in 
MOUNT_DATA_FILE */ char mo_device[128]; /* disk device name */ - char mo_mkfsopts[128]; /* options to the backing-store mkfs */ char mo_loopdev[128]; /* in case a loop dev is needed */ + char mo_mkfsopts[512]; /* options to the backing-store mkfs */ __u64 mo_device_sz; /* in KB */ int mo_stripe_count; int mo_flags; @@ -125,21 +125,17 @@ void usage(FILE *out) "\t\t--param = : set a permanent parameter\n" "\t\t\te.g. --param sys.timeout=40\n" "\t\t\t --param lov.stripesize=2M\n" - "\t\t--index=#N : target index (i.e. ost index within the lov)\n" - /* FIXME implement 1.6.x - "\t\t--configdev=: store configuration info\n" - "\t\t\tfor this device on an alternate device\n" - */ - "\t\t--comment=: arbitrary user string (%d bytes)\n" + "\t\t--index=#N : target index (i.e. ost index within lov)\n" + "\t\t--comment=: arbitrary string (%d bytes)\n" "\t\t--mountfsoptions= : permanent mount options\n" - "\t\t--network=[,<...>] : network(s) to restrict this ost/mdt to\n" + "\t\t--network=[,<...>] : restrict OST/MDT to network(s)\n" #ifndef TUNEFS "\t\t--backfstype= : backing fs type (ext3, ldiskfs)\n" "\t\t--device-size=#N(KB) : device size for loop devices\n" "\t\t--mkfsoptions= : format options\n" "\t\t--reformat: overwrite an existing disk\n" - "\t\t--stripe-count-hint=#N : used for optimizing MDT inode size\n" - "\t\t--iam-dir: make use of IAM directory format on backfs, incompatible with ext3.\n" + "\t\t--stripe-count-hint=#N : for optimizing MDT inode size\n" + "\t\t--iam-dir: use IAM directory format, not ext3 compatible\n" #else "\t\t--erase-params : erase all old parameter settings\n" "\t\t--nomgs: turn off MGS service on this MDT\n" @@ -195,13 +191,13 @@ int get_os_version() char release[4] = ""; fd = open("/proc/sys/kernel/osrelease", O_RDONLY); - if (fd < 0) + if (fd < 0) { fprintf(stderr, "%s: Warning: Can't resolve kernel " "version, assuming 2.6\n", progname); - else { + } else { if (read(fd, release, 4) < 0) { fprintf(stderr, "reading from /proc/sys/kernel" - 
"/osrelease: %s\n", strerror(errno)); + "/osrelease: %s\n", strerror(errno)); close(fd); exit(-1); } @@ -481,82 +477,99 @@ static int is_lustre_target(struct mkfs_opts *mop) * mke2fs to check for its support. */ static int is_e2fsprogs_feature_supp(const char *feature) { + static char supp_features[4096] = ""; FILE *fp; char cmd[PATH_MAX]; char imgname[] = "/tmp/test-img-XXXXXX"; int fd = -1; - int ret = 0; + int ret = 1; - snprintf(cmd, sizeof(cmd), "%s -c -R \"supported_features %s\" 2>&1", - DEBUGFS, feature); + if (supp_features[0] == '\0') { + snprintf(cmd, sizeof(cmd), "%s -c -R supported_features 2>&1", + DEBUGFS); - /* Using popen() instead of run_command() since debugfs does not return - * proper error code if command is not supported */ - fp = popen(cmd, "r"); - if (!fp) { - fprintf(stderr, "%s: %s\n", progname, strerror(errno)); - return 0; - } - ret = fread(cmd, 1, sizeof(cmd), fp); - if (ret > 0) { - if (strstr(cmd, feature) && !(strstr(cmd, "Unknown"))) + /* Using popen() instead of run_command() since debugfs does + * not return proper error code if command is not supported */ + fp = popen(cmd, "r"); + if (!fp) { + fprintf(stderr, "%s: %s\n", progname, strerror(errno)); return 0; + } + ret = fread(supp_features, 1, sizeof(supp_features), fp); + fclose(fp); } + if (ret > 0 && strstr(supp_features, + strncmp(feature, "-O ", 3) ? 
feature : feature+3)) + return 0; if ((fd = mkstemp(imgname)) < 0) return -1; + else + close(fd); - snprintf(cmd, sizeof(cmd), "%s -F -O %s %s 100 >/dev/null 2>&1", + snprintf(cmd, sizeof(cmd), "%s -F %s %s 100 >/dev/null 2>&1", MKE2FS, feature, imgname); /* run_command() displays the output of mke2fs when it fails for * some feature, so use system() directly */ ret = system(cmd); - if (fd >= 0) - remove(imgname); + unlink(imgname); return ret; } -static void enable_default_backfs_features(struct mkfs_opts *mop) +static void enable_default_ext4_features(struct mkfs_opts *mop) { - struct utsname uts; - int ret; - if (IS_OST(&mop->mo_ldd)) - strscat(mop->mo_mkfsopts, " -O dir_index,extents", + strscat(mop->mo_mkfsopts, " -O extents,uninit_bg", sizeof(mop->mo_mkfsopts)); else if (IS_MDT(&mop->mo_ldd)) - strscat(mop->mo_mkfsopts, " -O dir_index,dirdata", + strscat(mop->mo_mkfsopts, " -O dirdata,uninit_bg", sizeof(mop->mo_mkfsopts)); else - strscat(mop->mo_mkfsopts, " -O dir_index", - sizeof(mop->mo_mkfsopts)); - - /* Upstream e2fsprogs called our uninit_groups feature uninit_bg, - * check for both of them when testing e2fsprogs features. 
*/ - if (is_e2fsprogs_feature_supp("uninit_bg") == 0) - strscat(mop->mo_mkfsopts, ",uninit_bg", - sizeof(mop->mo_mkfsopts)); - else if (is_e2fsprogs_feature_supp("uninit_groups") == 0) - strscat(mop->mo_mkfsopts, ",uninit_groups", + strscat(mop->mo_mkfsopts, " -O uninit_bg", sizeof(mop->mo_mkfsopts)); - else - disp_old_e2fsprogs_msg("uninit_bg", 1); - ret = uname(&uts); - if (ret) - return; - - /* Multiple mount protection is enabled only if failover node is - * specified and if kernel version is higher than 2.6.9 */ + /* Multiple mount protection enabled only if failover node specified */ if (failover) { - if (is_e2fsprogs_feature_supp("mmp") == 0) + if (is_e2fsprogs_feature_supp("-O mmp") == 0) strscat(mop->mo_mkfsopts, ",mmp", sizeof(mop->mo_mkfsopts)); else disp_old_e2fsprogs_msg("mmp", 1); } + + /* Allow more than 65000 subdirectories */ + if (is_e2fsprogs_feature_supp("-O dir_nlink") == 0) + strscat(mop->mo_mkfsopts,",dir_nlink",sizeof(mop->mo_mkfsopts)); + +#ifdef HAVE_EXT4_LDISKFS + /* The following options are only valid for ext4-based ldiskfs. + * If --backfstype=ext3 is specified, do not enable them. */ + if (mop->mo_ldd.ldd_mount_type == LDD_MT_EXT3) + return; + + /* Allow files larger than 2TB. Also needs LU-16, but not harmful. */ + if (is_e2fsprogs_feature_supp("-O huge_file") == 0) + strscat(mop->mo_mkfsopts,",huge_file",sizeof(mop->mo_mkfsopts)); + + /* Cluster inode/block bitmaps and inode table for more efficient IO. + * Align the flex groups on a 1MB boundary for better performance. + * This -O feature needs to go last, since it adds an extra option. 
*/ + if (is_e2fsprogs_feature_supp("-O flex_bg") == 0) { + char tmp_buf[64]; + + strscat(mop->mo_mkfsopts, ",flex_bg", sizeof(mop->mo_mkfsopts)); + + if (IS_OST(&mop->mo_ldd)) { + snprintf(tmp_buf, sizeof(tmp_buf), " -G %u", + (1 << 20) / L_BLOCK_SIZE); + strscat(mop->mo_mkfsopts, tmp_buf, + sizeof(mop->mo_mkfsopts)); + } + } +#endif } + /* Build fs according to type */ int make_lustre_backfs(struct mkfs_opts *mop) { @@ -597,6 +610,8 @@ int make_lustre_backfs(struct mkfs_opts *mop) if ((mop->mo_ldd.ldd_mount_type == LDD_MT_EXT3) || (mop->mo_ldd.ldd_mount_type == LDD_MT_LDISKFS) || (mop->mo_ldd.ldd_mount_type == LDD_MT_LDISKFS2)) { + long inode_size = 0; + /* Journal size in MB */ if (strstr(mop->mo_mkfsopts, "-J") == NULL) { /* Choose our own default journal size */ @@ -617,36 +632,14 @@ int make_lustre_backfs(struct mkfs_opts *mop) } } - /* Bytes_per_inode: disk size / num inodes */ - if (strstr(mop->mo_mkfsopts, "-i") == NULL) { - long bytes_per_inode = 0; - - if (IS_MDT(&mop->mo_ldd)) - bytes_per_inode = 4096; - - /* Allocate fewer inodes on large OST devices. Most - filesystems can be much more aggressive than even - this. */ - if ((IS_OST(&mop->mo_ldd) && (device_sz > 100000000))) - bytes_per_inode = 16384; /* > 100 Gb device */ - - - if (bytes_per_inode > 0) { - sprintf(buf, " -i %ld", bytes_per_inode); - strscat(mop->mo_mkfsopts, buf, - sizeof(mop->mo_mkfsopts)); - } - } - /* Inode size (for extended attributes). The LOV EA size is * 32 (EA hdr) + 32 (lov_mds_md) + stripes * 24 (lov_ost_data), * and we want some margin above that for ACLs, other EAs... 
*/ if (strstr(mop->mo_mkfsopts, "-I") == NULL) { - long inode_size = 0; if (IS_MDT(&mop->mo_ldd)) { if (mop->mo_stripe_count > 72) inode_size = 512; /* bz 7241 */ - /* cray stripes across all osts (>60) */ + /* see also "-i" below for EA blocks */ else if (mop->mo_stripe_count > 32) inode_size = 2048; else if (mop->mo_stripe_count > 10) @@ -654,8 +647,8 @@ int make_lustre_backfs(struct mkfs_opts *mop) else inode_size = 512; } else if (IS_OST(&mop->mo_ldd)) { - /* now as we store fids in EA on OST we need - to make inode bigger */ + /* We store MDS FID and OST objid in EA on OST + * we need to make inode bigger as well. */ inode_size = 256; } @@ -666,13 +659,82 @@ int make_lustre_backfs(struct mkfs_opts *mop) } } + /* Bytes_per_inode: disk size / num inodes */ + if (strstr(mop->mo_mkfsopts, "-i") == NULL && + strstr(mop->mo_mkfsopts, "-N") == NULL) { + long bytes_per_inode = 0; + + /* Allocate more inodes on MDT devices. There is + * no data stored on the MDT, and very little extra + * metadata beyond the inode. It could go down as + * low as 1024 bytes, but this is conservative. + * Account for external EA blocks for wide striping. */ + if (IS_MDT(&mop->mo_ldd)) { + bytes_per_inode = inode_size + 1536; + + if (mop->mo_stripe_count > 72) { + int extra = mop->mo_stripe_count * 24; + extra = ((extra - 1) | 4095) + 1; + bytes_per_inode += extra; + } + } + + /* Allocate fewer inodes on large OST devices. Most + * filesystems can be much more aggressive than even + * this, but it is impossible to know in advance. 
*/ + if (IS_OST(&mop->mo_ldd)) { + /* OST > 8TB assume average file size 1MB */ + if (device_sz >= (8ULL << 30)) + bytes_per_inode = 1024 * 1024; + /* OST > 1TB assume average file size 256kB */ + else if (device_sz >= (1ULL << 30)) + bytes_per_inode = 256 * 1024; + /* OST > 100GB assume average file size 64kB, + * plus a bit so that inodes will fit into a + * 256x flex_bg without overflowing */ + else if (device_sz >= (10ULL << 20)) + bytes_per_inode = 69905; + } + + + if (bytes_per_inode > 0) { + sprintf(buf, " -i %ld", bytes_per_inode); + strscat(mop->mo_mkfsopts, buf, + sizeof(mop->mo_mkfsopts)); + } + } + if (verbose < 2) { strscat(mop->mo_mkfsopts, " -q", sizeof(mop->mo_mkfsopts)); } if (strstr(mop->mo_mkfsopts, "-O") == NULL) - enable_default_backfs_features(mop); + enable_default_ext4_features(mop); + + /* In order to align the filesystem metadata on 1MB boundaries, + * give a resize value that will reserve a power-of-two group + * descriptor blocks, but leave one block for the superblock. + * Only useful for filesystems with < 2^32 blocks due to resize + * limitations. */ + if (IS_OST(&mop->mo_ldd) && mop->mo_device_sz > 100 * 1024 && + mop->mo_device_sz / L_BLOCK_SIZE <= 0xffffffff) { + unsigned group_blocks = L_BLOCK_SIZE * 8; + unsigned desc_per_block = L_BLOCK_SIZE / 32; + unsigned resize_blks; + + resize_blks = (1ULL<<32) - desc_per_block*group_blocks; + snprintf(buf, sizeof(buf)," -E resize=%u,",resize_blks); + } else { + strncpy(buf, " -E ", sizeof(buf)); + } + + /* Avoid zeroing out the full journal - speeds up mkfs */ + if (is_e2fsprogs_feature_supp("-E lazy_journal_init") == 0) + strscat(buf, "lazy_journal_init,", sizeof(buf)); + + if (strlen(buf) > strlen(" -E ")) + strscat(mop->mo_mkfsopts, buf,sizeof(mop->mo_mkfsopts)); /* Allow reformat of full devices (as opposed to partitions.) We already checked for mounted dev. 
*/ @@ -681,14 +743,6 @@ int make_lustre_backfs(struct mkfs_opts *mop) snprintf(mkfs_cmd, sizeof(mkfs_cmd), "%s -j -b %d -L %s ", MKE2FS, L_BLOCK_SIZE, mop->mo_ldd.ldd_svname); - } else if (mop->mo_ldd.ldd_mount_type == LDD_MT_REISERFS) { - long journal_sz = 0; /* FIXME default journal size */ - if (journal_sz > 0) { - sprintf(buf, " --journal_size %ld", journal_sz); - strscat(mop->mo_mkfsopts, buf, - sizeof(mop->mo_mkfsopts)); - } - snprintf(mkfs_cmd, sizeof(mkfs_cmd), "mkreiserfs -ff "); } else { fprintf(stderr,"%s: unsupported fs type: %d (%s)\n", progname, mop->mo_ldd.ldd_mount_type, @@ -987,7 +1041,7 @@ int read_local_files(struct mkfs_opts *mop) vprint("Reading %s\n", MOUNT_DATA_FILE); num_read = fread(&mop->mo_ldd, sizeof(mop->mo_ldd), 1, filep); if (num_read < 1 && ferror(filep)) { - fprintf(stderr, "%s: Unable to read from file (%s): %s\n", + fprintf(stderr, "%s: Unable to read from file %s: %s\n", progname, filepnm, strerror(errno)); goto out_close; } @@ -1106,10 +1160,7 @@ void set_defaults(struct mkfs_opts *mop) mop->mo_ldd.ldd_flags = LDD_F_NEED_INDEX | LDD_F_UPDATE | LDD_F_VIRGIN; mop->mo_mgs_failnodes = 0; strcpy(mop->mo_ldd.ldd_fsname, "lustre"); - if (get_os_version() == 24) - mop->mo_ldd.ldd_mount_type = LDD_MT_EXT3; - else - mop->mo_ldd.ldd_mount_type = LDD_MT_LDISKFS; + mop->mo_ldd.ldd_mount_type = LDD_MT_LDISKFS; mop->mo_ldd.ldd_svindex = INDEX_UNASSIGNED; mop->mo_stripe_count = 1; @@ -1165,7 +1216,8 @@ static char *convert_hostnames(char *s1) nid = libcfs_str2nid(s1); if (nid == LNET_NID_ANY) { - fprintf(stderr, "%s: Can't parse NID '%s'\n", progname, s1); + fprintf(stderr, "%s: Can't parse NID '%s'\n", + progname, s1); free(converted); return NULL; } @@ -1610,18 +1662,14 @@ int main(int argc, char *const argv[]) switch (ldd->ldd_mount_type) { case LDD_MT_EXT3: case LDD_MT_LDISKFS: - case LDD_MT_LDISKFS2: { + case LDD_MT_LDISKFS2: strscat(default_mountopts, ",errors=remount-ro", sizeof(default_mountopts)); if (IS_MDT(ldd) || IS_MGS(ldd)) 
strscat(always_mountopts, ",user_xattr", sizeof(always_mountopts)); - if ((get_os_version() == 24) && IS_OST(ldd)) - strscat(always_mountopts, ",asyncdel", - sizeof(always_mountopts)); - /* NB: Files created while extents are enabled cannot be read - if mounted with a kernel that doesn't include the Lustre ldiskfs - patches! */ + /* NB: Files created while extents are enabled can only be read + * if mounted using the ext4 or ldiskfs filesystem type. */ if (IS_OST(ldd) && (ldd->ldd_mount_type == LDD_MT_LDISKFS || ldd->ldd_mount_type == LDD_MT_LDISKFS2)) { @@ -1629,14 +1677,7 @@ int main(int argc, char *const argv[]) sizeof(default_mountopts)); } break; - } - case LDD_MT_SMFS: { - mop.mo_flags |= MO_IS_LOOP; - sprintf(always_mountopts, ",type=ext3,dev=%s", - mop.mo_device); - break; - } - default: { + default: fatal(); fprintf(stderr, "unknown fs type %d '%s'\n", ldd->ldd_mount_type, @@ -1644,7 +1685,6 @@ int main(int argc, char *const argv[]) ret = EINVAL; goto out; } - } if (mountopts) { trim_mountfsoptions(mountopts); -- 1.8.3.1