From 49b06fba39e7fec26a0250ed37f04a620e349b5f Mon Sep 17 00:00:00 2001 From: "Christopher J. Morrone" Date: Sat, 16 Mar 2013 02:13:48 -0700 Subject: [PATCH] LU-2473 ldiskfs: Add ldiskfs support for RHEL 6.4 Add an ldiskfs kernel patch series to support the RHEL 6.4 kernel. The ldiskfs series selection macro (LB_LDISKFS_SERIES) is fixed up to use the AS_VERSION_COMPARE, which allows us to check if the kernel version is greater than or equal to a specific number, rather than just a simple pattern match. Change-Id: I894ace2d98e3d5c7481230794e9edf984bce7aee Signed-off-by: Christopher J. Morrone Reviewed-on: http://review.whamcloud.com/4804 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Bob Glossman Reviewed-by: James Simmons Reviewed-by: Oleg Drokin --- ldiskfs/config/ldiskfs-build.m4 | 46 +- .../patches/rhel6.4/ext4-fix-mbgroups-access.patch | 16 + .../rhel6.4/ext4-mballoc-pa_free-mismatch.patch | 110 ++++ .../kernel_patches/patches/rhel6.4/ext4-misc.patch | 252 +++++++++ .../kernel_patches/patches/rhel6.4/ext4-mmp.patch | 575 +++++++++++++++++++++ .../patches/rhel6.4/ext4-prealloc.patch | 412 +++++++++++++++ .../patches/rhel6.4/ext4-vmalloc.patch | 181 +++++++ .../series/ldiskfs-2.6-rhel6.4.series | 43 ++ 8 files changed, 1610 insertions(+), 25 deletions(-) create mode 100644 ldiskfs/kernel_patches/patches/rhel6.4/ext4-fix-mbgroups-access.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel6.4/ext4-mballoc-pa_free-mismatch.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel6.4/ext4-misc.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel6.4/ext4-mmp.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel6.4/ext4-prealloc.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel6.4/ext4-vmalloc.patch create mode 100644 ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.4.series diff --git a/ldiskfs/config/ldiskfs-build.m4 b/ldiskfs/config/ldiskfs-build.m4 index fc2d772..2feec80 100644 --- a/ldiskfs/config/ldiskfs-build.m4 +++ b/ldiskfs/config/ldiskfs-build.m4 @@ -676,33 +676,29 @@ AC_MSG_RESULT([$LDISKFS_RELEASE]) AC_SUBST(LDISKFS_RELEASE) ]) -AC_DEFUN([LB_LDISKFS_SERIES], -[ -if $1; then +AC_DEFUN([LB_LDISKFS_SERIES], [ +LDISKFS_SERIES= +AS_IF([$1], [ AC_MSG_CHECKING([which ldiskfs series to use]) - case $LINUXRELEASE in - 2.6.32*) - if test x$RHEL_KERNEL = xyes; then - LDISKFS_SERIES="2.6-rhel6.series" - fi - if test x$SUSE_KERNEL = xyes; then - LDISKFS_SERIES="2.6-sles11.series" - fi - ;; - 3.0.*) - if test x$SUSE_KERNEL = xyes; then - LDISKFS_SERIES="3.0-sles11.series" - fi - ;; - *) - AC_MSG_WARN([Unknown kernel version $LINUXRELEASE]) - LDISKFS_SERIES= - ;; - esac + + SER= + AS_IF([test x$RHEL_KERNEL = xyes], [ + AS_VERSION_COMPARE([$LINUXRELEASE],[2.6.32-343],[ + AS_VERSION_COMPARE([$LINUXRELEASE],[2.6.32],[], + [SER="2.6-rhel6.series"],[SER="2.6-rhel6.series"])], + [SER="2.6-rhel6.4.series"],[SER="2.6-rhel6.4.series"]) + ], [test x$SUSE_KERNEL = xyes], [ + AS_VERSION_COMPARE([$LINUXRELEASE],[3.0.0],[ + AS_VERSION_COMPARE([$LINUXRELEASE],[2.6.32],[], + [SER="2.6-sles11.series"],[SER="2.6-sles11.series"])], + [SER="3.0-sles11.series"],[SER="3.0-sles11.series"]) + ]) + LDISKFS_SERIES=$SER + + AS_IF([test -z "$LDISKFS_SERIES"], + [AC_MSG_WARN([Unknown kernel version $LINUXRELEASE])]) AC_MSG_RESULT([$LDISKFS_SERIES]) -else - LDISKFS_SERIES= -fi +]) AC_SUBST(LDISKFS_SERIES) ]) diff --git a/ldiskfs/kernel_patches/patches/rhel6.4/ext4-fix-mbgroups-access.patch b/ldiskfs/kernel_patches/patches/rhel6.4/ext4-fix-mbgroups-access.patch new file mode 100644 index 0000000..9c6bf5a --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel6.4/ext4-fix-mbgroups-access.patch @@ -0,0 +1,16 @@ +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -4825,6 +4825,11 @@ do_more: + * be used until this transaction is committed + */ + new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); ++ if (!new_entry) { ++ ext4_mb_release_desc(&e4b); ++ err = -ENOMEM; ++ goto error_return; ++ } + new_entry->efd_start_blk = bit; + new_entry->efd_group = block_group; + new_entry->efd_count = count; diff --git a/ldiskfs/kernel_patches/patches/rhel6.4/ext4-mballoc-pa_free-mismatch.patch b/ldiskfs/kernel_patches/patches/rhel6.4/ext4-mballoc-pa_free-mismatch.patch new file mode 100644 index 0000000..d557c4b --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel6.4/ext4-mballoc-pa_free-mismatch.patch @@ -0,0 +1,110 @@ +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -3585,6 +3585,7 @@ ext4_mb_new_inode_pa(struct ext4_allocat + INIT_LIST_HEAD(&pa->pa_group_list); + pa->pa_deleted = 0; + pa->pa_type = MB_INODE_PA; ++ pa->pa_error = 0; + + mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, + pa->pa_pstart, pa->pa_len, pa->pa_lstart); +@@ -3646,6 +3647,7 @@ ext4_mb_new_group_pa(struct ext4_allocat + INIT_LIST_HEAD(&pa->pa_group_list); + pa->pa_deleted = 0; + pa->pa_type = MB_GROUP_PA; ++ pa->pa_error = 0; + + mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, + pa->pa_pstart, pa->pa_len, pa->pa_lstart); +@@ -3708,7 +3710,9 @@ ext4_mb_release_inode_pa(struct ext4_bud + int err = 0; + int free = 0; + ++ assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); + BUG_ON(pa->pa_deleted == 0); ++ BUG_ON(pa->pa_inode == NULL); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); + grp_blk_start = pa->pa_pstart - bit; + BUG_ON(group != e4b->bd_group && pa->pa_len != 0); +@@ -3744,19 +3748,27 @@ ext4_mb_release_inode_pa(struct ext4_bud + mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); + bit = next + 1; + } +- if (free != pa->pa_free) { +- printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", +- pa, (unsigned long) pa->pa_lstart, +- (unsigned long) pa->pa_pstart, +- (unsigned long) pa->pa_len); ++ ++ /* "free < pa->pa_free" means we maybe double alloc the same blocks, ++ * otherwise maybe leave some free blocks unavailable, no need to BUG.*/ ++ if ((free > pa->pa_free && !pa->pa_error) || (free < pa->pa_free)) { ++ ext4_error(sb, "pa free mismatch: [pa %p] " ++ "[phy %lu] [logic %lu] [len %u] [free %u] " ++ "[error %u] [inode %lu] [freed %u]", pa, ++ (unsigned long)pa->pa_pstart, ++ (unsigned long)pa->pa_lstart, ++ (unsigned)pa->pa_len, (unsigned)pa->pa_free, ++ (unsigned)pa->pa_error, pa->pa_inode->i_ino, ++ free); + ext4_grp_locked_error(sb, group, +- __func__, "free %u, pa_free %u", +- free, pa->pa_free); ++ __func__, "free %u, pa_free %u", ++ free, pa->pa_free); + /* + * pa is already deleted so we use the value obtained + * from the bitmap and continue. + */ + } ++ BUG_ON(pa->pa_free != free); + atomic_add(free, &sbi->s_mb_discarded); + + return err; +@@ -4541,6 +4553,24 @@ repeat: + ac->ac_b_ex.fe_len = 0; + ar->len = 0; + ext4_mb_show_ac(ac); ++ if (ac->ac_pa) { ++ struct ext4_prealloc_space *pa = ac->ac_pa; ++ /* We can not make sure whether the bitmap has ++ * been updated or not when fail case. So can ++ * not revert pa_free back, just mark pa_error*/ ++ pa->pa_error++; ++ ext4_error(sb, ++ "Updating bitmap error: [err %d] " ++ "[pa %p] [phy %lu] [logic %lu] " ++ "[len %u] [free %u] [error %u] " ++ "[inode %lu]", *errp, pa, ++ (unsigned long)pa->pa_pstart, ++ (unsigned long)pa->pa_lstart, ++ (unsigned)pa->pa_len, ++ (unsigned)pa->pa_free, ++ (unsigned)pa->pa_error, ++ pa->pa_inode ? pa->pa_inode->i_ino : 0); ++ } + } + ext4_mb_release_context(ac); + out: +Index: linux-stage/fs/ext4/mballoc.h +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.h ++++ linux-stage/fs/ext4/mballoc.h +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + #include "ext4_jbd2.h" + #include "ext4.h" + +@@ -130,6 +131,7 @@ struct ext4_prealloc_space { + ext4_grpblk_t pa_free; /* how many blocks are free */ + unsigned short pa_type; /* pa type. inode or group */ + spinlock_t *pa_obj_lock; ++ unsigned short pa_error; + struct inode *pa_inode; /* hack, for history only */ + }; + diff --git a/ldiskfs/kernel_patches/patches/rhel6.4/ext4-misc.patch b/ldiskfs/kernel_patches/patches/rhel6.4/ext4-misc.patch new file mode 100644 index 0000000..8c0f482 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel6.4/ext4-misc.patch @@ -0,0 +1,252 @@ +Index: linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h ++++ linux-stage/fs/ext4/ext4.h +@@ -1256,6 +1256,9 @@ EXT4_INODE_BIT_FNS(state, state_flags) + + #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + ++/* Has been moved to linux/magic.h but we need it for Lustre */ ++#define EXT4_SUPER_MAGIC 0xEF53 ++ + /* + * Codes for operating systems + */ +@@ -1757,6 +1760,9 @@ extern void ext4_add_groupblocks(handle_ + ext4_fsblk_t block, unsigned long count); + extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); + ++extern void ext4_mb_discard_inode_preallocations(struct inode *); ++ ++ + /* inode.c */ + int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr); +Index: linux-stage/fs/ext4/ext4_extents.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4_extents.h ++++ linux-stage/fs/ext4/ext4_extents.h +@@ -58,6 +58,12 @@ + */ + #define EXT_STATS_ + ++/* ++ * define EXT4_ALLOC_NEEDED to 0 since block bitmap, group desc. and sb ++ * are now accounted in ext4_ext_calc_credits_for_insert() ++ */ ++#define EXT4_ALLOC_NEEDED 0 ++#define HAVE_EXT_PREPARE_CB_EXTENT + + /* + * ext4_inode has i_block array (60 bytes total). +@@ -291,6 +297,8 @@ extern int ext4_extent_tree_init(handle_ + extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); ++extern int ext4_ext_calc_credits_for_insert(struct inode *, ++ struct ext4_ext_path *); + extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); +Index: linux-stage/fs/ext4/ext4_jbd2.c +=================================================================== +--- linux-stage.orig/fs/ext4/ext4_jbd2.c ++++ linux-stage/fs/ext4/ext4_jbd2.c +@@ -31,6 +31,7 @@ int __ext4_journal_get_write_access(cons + } + return err; + } ++EXPORT_SYMBOL(__ext4_journal_get_write_access); + + int __ext4_journal_forget(const char *where, handle_t *handle, + struct buffer_head *bh) +@@ -107,3 +108,4 @@ int __ext4_handle_dirty_metadata(const c + } + return err; + } ++EXPORT_SYMBOL(__ext4_handle_dirty_metadata); +Index: linux-stage/fs/ext4/ext4_jbd2.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4_jbd2.h ++++ linux-stage/fs/ext4/ext4_jbd2.h +@@ -35,6 +35,8 @@ + (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ + ? 27U : 8U) + ++#define ext4_journal_dirty_metadata(handle, bh) \ ++ ext4_handle_dirty_metadata(handle, NULL, bh) + /* Extended attribute operations touch at most two data buffers, + * two bitmap buffers, and two group summaries, in addition to the inode + * and the superblock, which are already accounted for. */ +Index: linux-stage/fs/ext4/extents.c +=================================================================== +--- linux-stage.orig/fs/ext4/extents.c ++++ linux-stage/fs/ext4/extents.c +@@ -2200,6 +2200,55 @@ int ext4_ext_calc_credits_for_single_ext + } + + /* ++ * This routine returns max. credits extent tree can consume. ++ * It should be OK for low-performance paths like ->writepage() ++ * To allow many writing process to fit a single transaction, ++ * caller should calculate credits under truncate_mutex and ++ * pass actual path. ++ */ ++int ext4_ext_calc_credits_for_insert(struct inode *inode, ++ struct ext4_ext_path *path) ++{ ++ int depth, needed; ++ ++ if (path) { ++ /* probably there is space in leaf? */ ++ depth = path->p_depth; ++ if (le16_to_cpu(path[depth].p_hdr->eh_entries) ++ < le16_to_cpu(path[depth].p_hdr->eh_max)) ++ return 1; ++ } ++ ++ /* ++ * given 32bit logical block (4294967296 blocks), max. tree ++ * can be 4 levels in depth -- 4 * 340^4 == 53453440000. ++ * let's also add one more level for imbalance. ++ */ ++ depth = 5; ++ ++ /* allocation of new data block(s) */ ++ needed = 2; ++ ++ /* ++ * tree can be full, so it'd need to grow in depth: ++ * we need one credit to modify old root, credits for ++ * new root will be added in split accounting ++ */ ++ needed += 1; ++ /* ++ * Index split can happen, we'd need: ++ * allocate intermediate indexes (bitmap + group) ++ * + change two blocks at each level, but root (already included) ++ */ ++ needed += (depth * 2) + (depth * 2); ++ ++ /* any allocation modifies superblock */ ++ needed += 1; ++ ++ return needed; ++} ++ ++/* + * How many index/leaf blocks need to change/allocate to modify nrblocks? + * + * if nrblocks are fit in a single extent (chunk flag is 1), then +@@ -4488,3 +4537,14 @@ int ext4_fiemap(struct inode *inode, str + return error; + } + ++EXPORT_SYMBOL(ext4_ext_store_pblock); ++EXPORT_SYMBOL(ext4_ext_search_right); ++EXPORT_SYMBOL(ext4_ext_search_left); ++EXPORT_SYMBOL(ext4_ext_pblock); ++EXPORT_SYMBOL(ext4_ext_insert_extent); ++EXPORT_SYMBOL(ext4_mb_new_blocks); ++EXPORT_SYMBOL(ext4_ext_calc_credits_for_insert); ++EXPORT_SYMBOL(ext4_mark_inode_dirty); ++EXPORT_SYMBOL(ext4_ext_walk_space); ++EXPORT_SYMBOL(ext4_ext_find_extent); ++EXPORT_SYMBOL(ext4_ext_drop_refs); +Index: linux-stage/fs/ext4/inode.c +=================================================================== +--- linux-stage.orig/fs/ext4/inode.c ++++ linux-stage/fs/ext4/inode.c +@@ -5549,6 +5549,7 @@ bad_inode: + iget_failed(inode); + return ERR_PTR(ret); + } ++EXPORT_SYMBOL(ext4_iget); + + static int ext4_inode_blocks_set(handle_t *handle, + struct ext4_inode *raw_inode, +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -4031,6 +4031,7 @@ repeat: + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); + } ++EXPORT_SYMBOL(ext4_discard_preallocations); + + /* + * finds all preallocated spaces and return blocks being freed to them +@@ -5189,3 +5190,6 @@ out: + range->len = trimmed * sb->s_blocksize; + return ret; + } ++ ++EXPORT_SYMBOL(ext4_free_blocks); ++ +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c ++++ linux-stage/fs/ext4/super.c +@@ -137,6 +137,7 @@ __u32 ext4_itable_unused_count(struct su + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); + } ++EXPORT_SYMBOL(ext4_itable_unused_count); + + void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk) +@@ -1176,11 +1177,14 @@ enum { + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, + Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, ++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, + Opt_block_validity, Opt_noblock_validity, + Opt_inode_readahead_blks, Opt_journal_ioprio, +- Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, ++ Opt_mballoc, ++ Opt_discard, Opt_nodiscard, ++ Opt_init_itable, Opt_noinit_itable, + }; + + static const match_table_t tokens = { +@@ -1232,6 +1236,9 @@ static const match_table_t tokens = { + {Opt_noquota, "noquota"}, + {Opt_quota, "quota"}, + {Opt_usrquota, "usrquota"}, ++ {Opt_iopen, "iopen"}, ++ {Opt_noiopen, "noiopen"}, ++ {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_barrier, "barrier=%u"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, +@@ -1247,6 +1254,7 @@ static const match_table_t tokens = { + {Opt_auto_da_alloc, "auto_da_alloc=%u"}, + {Opt_auto_da_alloc, "auto_da_alloc"}, + {Opt_noauto_da_alloc, "noauto_da_alloc"}, ++ {Opt_mballoc, "mballoc"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_init_itable, "init_itable=%u"}, +@@ -1607,6 +1615,10 @@ set_qf_format: + else + clear_opt(sbi->s_mount_opt, BARRIER); + break; ++ case Opt_iopen: ++ case Opt_noiopen: ++ case Opt_iopen_nopriv: ++ break; + case Opt_ignore: + break; + case Opt_resize: +@@ -1704,6 +1716,8 @@ set_qf_format: + case Opt_noinit_itable: + clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE); + break; ++ case Opt_mballoc: ++ break; + default: + ext4_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" " diff --git a/ldiskfs/kernel_patches/patches/rhel6.4/ext4-mmp.patch b/ldiskfs/kernel_patches/patches/rhel6.4/ext4-mmp.patch new file mode 100644 index 0000000..9fcf40f --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel6.4/ext4-mmp.patch @@ -0,0 +1,575 @@ +Prevent an ext4 filesystem from being mounted multiple times. +A sequence number is stored on disk and is periodically updated (every 5 +seconds by default) by a mounted filesystem. +At mount time, we now wait for s_mmp_update_interval seconds to make sure +that the MMP sequence does not change. +In case of failure, the nodename, bdevname and the time at which the MMP +block was last updated is displayed. +Move all mmp code to a dedicated file (mmp.c). + +Signed-off-by: Andreas Dilger whamcloud.com> +Signed-off-by: Johann Lombardi whamcloud.com> +--- + fs/ext4/Makefile | 3 +- + fs/ext4/ext4.h | 76 ++++++++++++- + fs/ext4/mmp.c | 351 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + fs/ext4/super.c | 18 +++- + 4 files changed, 444 insertions(+), 4 deletions(-) + create mode 100644 fs/ext4/mmp.c + +Index: linux-stage/fs/ext4/Makefile +=================================================================== +--- linux-stage.orig/fs/ext4/Makefile ++++ linux-stage/fs/ext4/Makefile +@@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o + + ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ +- ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o ++ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ ++ mmp.o + + ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o +Index: linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h ++++ linux-stage/fs/ext4/ext4.h +@@ -1009,7 +1009,7 @@ struct ext4_super_block { + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ +- __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ ++ __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ +@@ -1177,6 +1177,9 @@ struct ext4_sb_info { + /* workqueue for dio unwritten */ + struct workqueue_struct *dio_unwritten_wq; + ++ /* Kernel thread for multiple mount protection */ ++ struct task_struct *s_mmp_tsk; ++ + /* Lazy inode table initialization info */ + struct ext4_li_request *s_li_request; + /* Wait multiplier for lazy initialization thread */ +@@ -1322,7 +1325,8 @@ EXT4_INODE_BIT_FNS(state, state_flags) + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ +- EXT4_FEATURE_INCOMPAT_FLEX_BG) ++ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ ++ EXT4_FEATURE_INCOMPAT_MMP) + #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ +@@ -1576,6 +1580,67 @@ struct ext4_features { + }; + + /* ++ * This structure will be used for multiple mount protection. It will be ++ * written into the block number saved in the s_mmp_block field in the ++ * superblock. Programs that check MMP should assume that if ++ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe ++ * to use the filesystem, regardless of how old the timestamp is. ++ */ ++#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ ++#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ ++#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ ++#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ ++ ++struct mmp_struct { ++ __le32 mmp_magic; /* Magic number for MMP */ ++ __le32 mmp_seq; /* Sequence no. updated periodically */ ++ ++ /* ++ * mmp_time, mmp_nodename & mmp_bdevname are only used for information ++ * purposes and do not affect the correctness of the algorithm ++ */ ++ __le64 mmp_time; /* Time last updated */ ++ char mmp_nodename[64]; /* Node which last updated MMP block */ ++ char mmp_bdevname[32]; /* Bdev which last updated MMP block */ ++ ++ /* ++ * mmp_check_interval is used to verify if the MMP block has been ++ * updated on the block device. The value is updated based on the ++ * maximum time to write the MMP block during an update cycle. ++ */ ++ __le16 mmp_check_interval; ++ ++ __le16 mmp_pad1; ++ __le32 mmp_pad2[227]; ++}; ++ ++/* arguments passed to the mmp thread */ ++struct mmpd_data { ++ struct buffer_head *bh; /* bh from initial read_mmp_block() */ ++ struct super_block *sb; /* super block of the fs */ ++}; ++ ++/* ++ * Check interval multiplier ++ * The MMP block is written every update interval and initially checked every ++ * update interval x the multiplier (the value is then adapted based on the ++ * write latency). The reason is that writes can be delayed under load and we ++ * don't want readers to incorrectly assume that the filesystem is no longer ++ * in use. ++ */ ++#define EXT4_MMP_CHECK_MULT 2UL ++ ++/* ++ * Minimum interval for MMP checking in seconds. ++ */ ++#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL ++ ++/* ++ * Maximum interval for MMP checking in seconds. ++ */ ++#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL ++ ++/* + * Function prototypes + */ + +@@ -1757,6 +1822,10 @@ extern void __ext4_warning(struct super_ + #define ext4_warning(sb, message...) __ext4_warning(sb, __func__, ## message) + extern void ext4_msg(struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); ++extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, ++ const char *, const char *); ++#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ ++ msg) + extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, + const char *, const char *, ...) + __attribute__ ((format (printf, 4, 5))); +@@ -2050,6 +2119,8 @@ extern int ext4_move_extents(struct file + __u64 start_orig, __u64 start_donor, + __u64 len, __u64 *moved_len); + ++/* mmp.c */ ++extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); + + /* + * Add new method to test wether block and inode bitmaps are properly +Index: linux-stage/fs/ext4/mmp.c +=================================================================== +--- /dev/null ++++ linux-stage/fs/ext4/mmp.c +@@ -0,0 +1,351 @@ ++#include ++#include ++#include ++#include ++#include ++ ++#include "ext4.h" ++ ++/* ++ * Write the MMP block using WRITE_SYNC to try to get the block on-disk ++ * faster. ++ */ ++static int write_mmp_block(struct buffer_head *bh) ++{ ++ mark_buffer_dirty(bh); ++ lock_buffer(bh); ++ bh->b_end_io = end_buffer_write_sync; ++ get_bh(bh); ++ submit_bh(WRITE_SYNC, bh); ++ wait_on_buffer(bh); ++ if (unlikely(!buffer_uptodate(bh))) ++ return 1; ++ ++ return 0; ++} ++ ++/* ++ * Read the MMP block. It _must_ be read from disk and hence we clear the ++ * uptodate flag on the buffer. ++ */ ++static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, ++ ext4_fsblk_t mmp_block) ++{ ++ struct mmp_struct *mmp; ++ ++ if (*bh) ++ clear_buffer_uptodate(*bh); ++ ++ /* This would be sb_bread(sb, mmp_block), except we need to be sure ++ * that the MD RAID device cache has been bypassed, and that the read ++ * is not blocked in the elevator. */ ++ if (!*bh) ++ *bh = sb_getblk(sb, mmp_block); ++ if (*bh) { ++ get_bh(*bh); ++ lock_buffer(*bh); ++ (*bh)->b_end_io = end_buffer_read_sync; ++ submit_bh(READ_SYNC, *bh); ++ wait_on_buffer(*bh); ++ if (!buffer_uptodate(*bh)) { ++ brelse(*bh); ++ *bh = NULL; ++ } ++ } ++ if (!*bh) { ++ ext4_warning(sb, "Error while reading MMP block %llu", ++ mmp_block); ++ return -EIO; ++ } ++ ++ mmp = (struct mmp_struct *)((*bh)->b_data); ++ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++/* ++ * Dump as much information as possible to help the admin. ++ */ ++void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, ++ const char *function, const char *msg) ++{ ++ __ext4_warning(sb, function, msg); ++ __ext4_warning(sb, function, ++ "MMP failure info: last update time: %llu, last update " ++ "node: %s, last update device: %s\n", ++ (long long unsigned int) le64_to_cpu(mmp->mmp_time), ++ mmp->mmp_nodename, mmp->mmp_bdevname); ++} ++ ++/* ++ * kmmpd will update the MMP sequence every s_mmp_update_interval seconds ++ */ ++static int kmmpd(void *data) ++{ ++ struct super_block *sb = ((struct mmpd_data *) data)->sb; ++ struct buffer_head *bh = ((struct mmpd_data *) data)->bh; ++ struct ext4_super_block *es = EXT4_SB(sb)->s_es; ++ struct mmp_struct *mmp; ++ ext4_fsblk_t mmp_block; ++ u32 seq = 0; ++ unsigned long failed_writes = 0; ++ int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); ++ unsigned mmp_check_interval; ++ unsigned long last_update_time; ++ unsigned long diff; ++ int retval; ++ ++ mmp_block = le64_to_cpu(es->s_mmp_block); ++ mmp = (struct mmp_struct *)(bh->b_data); ++ mmp->mmp_time = cpu_to_le64(get_seconds()); ++ /* ++ * Start with the higher mmp_check_interval and reduce it if ++ * the MMP block is being updated on time. ++ */ ++ mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, ++ EXT4_MMP_MIN_CHECK_INTERVAL); ++ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); ++ bdevname(bh->b_bdev, mmp->mmp_bdevname); ++ ++ memcpy(mmp->mmp_nodename, init_utsname()->nodename, ++ sizeof(mmp->mmp_nodename)); ++ ++ while (!kthread_should_stop()) { ++ if (++seq > EXT4_MMP_SEQ_MAX) ++ seq = 1; ++ ++ mmp->mmp_seq = cpu_to_le32(seq); ++ mmp->mmp_time = cpu_to_le64(get_seconds()); ++ last_update_time = jiffies; ++ ++ retval = write_mmp_block(bh); ++ /* ++ * Don't spew too many error messages. Print one every ++ * (s_mmp_update_interval * 60) seconds. ++ */ ++ if (retval) { ++ if ((failed_writes % 60) == 0) ++ ext4_error(sb, "Error writing to MMP block"); ++ failed_writes++; ++ } ++ ++ if (!(le32_to_cpu(es->s_feature_incompat) & ++ EXT4_FEATURE_INCOMPAT_MMP)) { ++ ext4_warning(sb, "kmmpd being stopped since MMP feature" ++ " has been disabled."); ++ EXT4_SB(sb)->s_mmp_tsk = NULL; ++ goto failed; ++ } ++ ++ if (sb->s_flags & MS_RDONLY) { ++ ext4_warning(sb, "kmmpd being stopped since filesystem " ++ "has been remounted as readonly."); ++ EXT4_SB(sb)->s_mmp_tsk = NULL; ++ goto failed; ++ } ++ ++ diff = jiffies - last_update_time; ++ if (diff < mmp_update_interval * HZ) ++ schedule_timeout_interruptible(mmp_update_interval * ++ HZ - diff); ++ ++ /* ++ * We need to make sure that more than mmp_check_interval ++ * seconds have not passed since writing. If that has happened ++ * we need to check if the MMP block is as we left it. ++ */ ++ diff = jiffies - last_update_time; ++ if (diff > mmp_check_interval * HZ) { ++ struct buffer_head *bh_check = NULL; ++ struct mmp_struct *mmp_check; ++ ++ retval = read_mmp_block(sb, &bh_check, mmp_block); ++ if (retval) { ++ ext4_error(sb, "error reading MMP data: %d", ++ retval); ++ ++ EXT4_SB(sb)->s_mmp_tsk = NULL; ++ goto failed; ++ } ++ ++ mmp_check = (struct mmp_struct *)(bh_check->b_data); ++ if (mmp->mmp_seq != mmp_check->mmp_seq || ++ memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, ++ sizeof(mmp->mmp_nodename))) { ++ dump_mmp_msg(sb, mmp_check, ++ "Error while updating MMP info. " ++ "The filesystem seems to have been" ++ " multiply mounted."); ++ ext4_error(sb, "abort"); ++ goto failed; ++ } ++ put_bh(bh_check); ++ } ++ ++ /* ++ * Adjust the mmp_check_interval depending on how much time ++ * it took for the MMP block to be written. ++ */ ++ mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ, ++ EXT4_MMP_MAX_CHECK_INTERVAL), ++ EXT4_MMP_MIN_CHECK_INTERVAL); ++ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); ++ } ++ ++ /* ++ * Unmount seems to be clean. ++ */ ++ mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); ++ mmp->mmp_time = cpu_to_le64(get_seconds()); ++ ++ retval = write_mmp_block(bh); ++ ++failed: ++ kfree(data); ++ brelse(bh); ++ return retval; ++} ++ ++/* ++ * Get a random new sequence number but make sure it is not greater than ++ * EXT4_MMP_SEQ_MAX. ++ */ ++static unsigned int mmp_new_seq(void) ++{ ++ u32 new_seq; ++ ++ do { ++ get_random_bytes(&new_seq, sizeof(u32)); ++ } while (new_seq > EXT4_MMP_SEQ_MAX); ++ ++ return new_seq; ++} ++ ++/* ++ * Protect the filesystem from being mounted more than once. ++ */ ++int ext4_multi_mount_protect(struct super_block *sb, ++ ext4_fsblk_t mmp_block) ++{ ++ struct ext4_super_block *es = EXT4_SB(sb)->s_es; ++ struct buffer_head *bh = NULL; ++ struct mmp_struct *mmp = NULL; ++ struct mmpd_data *mmpd_data; ++ u32 seq; ++ unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); ++ unsigned int wait_time = 0; ++ int retval; ++ ++ if (mmp_block < le32_to_cpu(es->s_first_data_block) || ++ mmp_block >= ext4_blocks_count(es)) { ++ ext4_warning(sb, "Invalid MMP block in superblock"); ++ goto failed; ++ } ++ ++ retval = read_mmp_block(sb, &bh, mmp_block); ++ if (retval) ++ goto failed; ++ ++ mmp = (struct mmp_struct *)(bh->b_data); ++ ++ if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) ++ mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; ++ ++ /* ++ * If check_interval in MMP block is larger, use that instead of ++ * update_interval from the superblock. ++ */ ++ if (mmp->mmp_check_interval > mmp_check_interval) ++ mmp_check_interval = mmp->mmp_check_interval; ++ ++ seq = le32_to_cpu(mmp->mmp_seq); ++ if (seq == EXT4_MMP_SEQ_CLEAN) ++ goto skip; ++ ++ if (seq == EXT4_MMP_SEQ_FSCK) { ++ dump_mmp_msg(sb, mmp, "fsck is running on the filesystem"); ++ goto failed; ++ } ++ ++ wait_time = min(mmp_check_interval * 2 + 1, ++ mmp_check_interval + 60); ++ ++ /* Print MMP interval if more than 20 secs. */ ++ if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) ++ ext4_warning(sb, "MMP interval %u higher than expected, please" ++ " wait.\n", wait_time * 2); ++ ++ if (schedule_timeout_interruptible(HZ * wait_time) != 0) { ++ ext4_warning(sb, "MMP startup interrupted, failing mount\n"); ++ goto failed; ++ } ++ ++ retval = read_mmp_block(sb, &bh, mmp_block); ++ if (retval) ++ goto failed; ++ mmp = (struct mmp_struct *)(bh->b_data); ++ if (seq != le32_to_cpu(mmp->mmp_seq)) { ++ dump_mmp_msg(sb, mmp, ++ "Device is already active on another node."); ++ goto failed; ++ } ++ ++skip: ++ /* ++ * write a new random sequence number. ++ */ ++ mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); ++ ++ retval = write_mmp_block(bh); ++ if (retval) ++ goto failed; ++ ++ /* ++ * wait for MMP interval and check mmp_seq. ++ */ ++ if (schedule_timeout_interruptible(HZ * wait_time) != 0) { ++ ext4_warning(sb, "MMP startup interrupted, failing mount\n"); ++ goto failed; ++ } ++ ++ retval = read_mmp_block(sb, &bh, mmp_block); ++ if (retval) ++ goto failed; ++ mmp = (struct mmp_struct *)(bh->b_data); ++ if (seq != le32_to_cpu(mmp->mmp_seq)) { ++ dump_mmp_msg(sb, mmp, ++ "Device is already active on another node."); ++ goto failed; ++ } ++ ++ mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); ++ if (!mmpd_data) { ++ ext4_warning(sb, "not enough memory for mmpd_data"); ++ goto failed; ++ } ++ mmpd_data->sb = sb; ++ mmpd_data->bh = bh; ++ ++ /* ++ * Start a kernel thread to update the MMP block periodically. ++ */ ++ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s", ++ bdevname(bh->b_bdev, ++ mmp->mmp_bdevname)); ++ if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { ++ EXT4_SB(sb)->s_mmp_tsk = NULL; ++ kfree(mmpd_data); ++ ext4_warning(sb, "Unable to create kmmpd thread for %s.", ++ sb->s_id); ++ goto failed; ++ } ++ ++ return 0; ++ ++failed: ++ brelse(bh); ++ return 1; ++} ++ +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c ++++ linux-stage/fs/ext4/super.c +@@ -40,6 +40,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -716,6 +718,8 @@ static void ext4_put_super(struct super_ + invalidate_bdev(sbi->journal_bdev); + ext4_blkdev_remove(sbi); + } ++ if (sbi->s_mmp_tsk) ++ kthread_stop(sbi->s_mmp_tsk); + sb->s_fs_info = NULL; + /* + * Now that we are completely done shutting down the +@@ -3241,6 +3245,10 @@ static int ext4_fill_super(struct super_ + needs_recovery = (es->s_last_orphan != 0 || + EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_RECOVER)); ++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && ++ !(sb->s_flags & MS_RDONLY)) ++ if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) ++ goto failed_mount3; + + /* + * The first inode we look at is the journal inode. Don't try +@@ -3491,6 +3499,8 @@ failed_mount3: + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); ++ if (sbi->s_mmp_tsk) ++ kthread_stop(sbi->s_mmp_tsk); + failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); +@@ -4001,7 +4011,7 @@ static int ext4_remount(struct super_blo + int enable_quota = 0; + ext4_group_t g; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; +- int err; ++ int err = 0; + #ifdef CONFIG_QUOTA + int i; + #endif +@@ -4129,6 +4139,13 @@ static int ext4_remount(struct super_blo + goto restore_opts; + if (!ext4_setup_super(sb, es, 0)) + sb->s_flags &= ~MS_RDONLY; ++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ++ EXT4_FEATURE_INCOMPAT_MMP)) ++ if (ext4_multi_mount_protect(sb, ++ le64_to_cpu(es->s_mmp_block))) { ++ err = -EROFS; ++ goto restore_opts; ++ } + enable_quota = 1; + } + } diff --git a/ldiskfs/kernel_patches/patches/rhel6.4/ext4-prealloc.patch b/ldiskfs/kernel_patches/patches/rhel6.4/ext4-prealloc.patch new file mode 100644 index 0000000..9a53745 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel6.4/ext4-prealloc.patch @@ -0,0 +1,412 @@ +Index: linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h ++++ linux-stage/fs/ext4/ext4.h +@@ -1136,11 +1136,14 @@ struct ext4_sb_info { + + /* tunables */ + unsigned long s_stripe; +- unsigned int s_mb_stream_request; ++ unsigned long s_mb_small_req; ++ unsigned long s_mb_large_req; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; ++ unsigned long *s_mb_prealloc_table; ++ unsigned long s_mb_prealloc_table_size; + unsigned int s_mb_group_prealloc; + unsigned int s_max_writeback_mb_bump; + /* where last allocation was done - for stream allocation */ +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -1838,6 +1838,25 @@ void ext4_mb_complex_scan_group(struct e + ext4_mb_check_limits(ac, e4b, 1); + } + ++static void ext4_mb_prealloc_table_add(struct ext4_sb_info *sbi, int value) ++{ ++ int i; ++ ++ if (value > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group)) ++ return; ++ ++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) { ++ if (sbi->s_mb_prealloc_table[i] == 0) { ++ sbi->s_mb_prealloc_table[i] = value; ++ return; ++ } ++ ++ /* they should add values in order */ ++ if (value <= sbi->s_mb_prealloc_table[i]) ++ return; ++ } ++} ++ + /* + * This is a special case for storages like raid5 + * we try to find stripe-aligned chunks for stripe-size requests +@@ -2155,6 +2174,80 @@ static const struct seq_operations ext4_ + .show = ext4_mb_seq_groups_show, + }; + ++#define EXT4_MB_PREALLOC_TABLE "prealloc_table" ++ ++static int ext4_mb_prealloc_table_proc_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ struct ext4_sb_info *sbi = data; ++ int len = 0; ++ int i; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) ++ len += sprintf(page + len, "%ld ", ++ sbi->s_mb_prealloc_table[i]); ++ len += sprintf(page + len, "\n"); ++ ++ *start = page; ++ return len; ++} ++ ++static int ext4_mb_prealloc_table_proc_write(struct file *file, ++ const char __user *buf, ++ unsigned long cnt, void *data) ++{ ++ struct ext4_sb_info *sbi = data; ++ unsigned long value; ++ unsigned long prev = 0; ++ char str[128]; ++ char *cur; ++ char *end; ++ unsigned long *new_table; ++ int num = 0; ++ int i = 0; ++ ++ if (cnt >= sizeof(str)) ++ return -EINVAL; ++ if (copy_from_user(str, buf, cnt)) ++ return -EFAULT; ++ ++ num = 0; ++ cur = str; ++ end = str + cnt; ++ while (cur < end) { ++ while ((cur < end) && (*cur == ' ')) cur++; ++ value = simple_strtol(cur, &cur, 0); ++ if (value == 0) ++ break; ++ if (value <= prev) ++ return -EINVAL; ++ prev = value; ++ num++; ++ } ++ ++ new_table = kmalloc(num * sizeof(*new_table), GFP_KERNEL); ++ if (new_table == NULL) ++ return -ENOMEM; ++ kfree(sbi->s_mb_prealloc_table); ++ memset(new_table, 0, num * sizeof(*new_table)); ++ sbi->s_mb_prealloc_table = new_table; ++ sbi->s_mb_prealloc_table_size = num; ++ cur = str; ++ end = str + cnt; ++ while (cur < end && i < num) { ++ while ((cur < end) && (*cur == ' ')) cur++; ++ value = simple_strtol(cur, &cur, 0); ++ ext4_mb_prealloc_table_add(sbi, value); ++ i++; ++ } ++ ++ return cnt; ++} ++ + static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) + { + struct super_block *sb = PDE(inode)->data; +@@ -2380,26 +2473,62 @@ int ext4_mb_init(struct super_block *sb, + i++; + } while (i <= sb->s_blocksize_bits + 1); + +- /* init file for buddy data */ +- ret = ext4_mb_init_backend(sb); +- if (ret != 0) { +- kfree(sbi->s_mb_offsets); +- kfree(sbi->s_mb_maxs); +- return ret; +- } +- + spin_lock_init(&sbi->s_md_lock); + spin_lock_init(&sbi->s_bal_lock); + + sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; + sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; + sbi->s_mb_stats = MB_DEFAULT_STATS; +- sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; + sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; +- sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; ++ ++ if (sbi->s_stripe == 0) { ++ sbi->s_mb_prealloc_table_size = 10; ++ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long); ++ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS); ++ if (sbi->s_mb_prealloc_table == NULL) { ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return -ENOMEM; ++ } ++ memset(sbi->s_mb_prealloc_table, 0, i); ++ ++ ext4_mb_prealloc_table_add(sbi, 4); ++ ext4_mb_prealloc_table_add(sbi, 8); ++ ext4_mb_prealloc_table_add(sbi, 16); ++ ext4_mb_prealloc_table_add(sbi, 32); ++ ext4_mb_prealloc_table_add(sbi, 64); ++ ext4_mb_prealloc_table_add(sbi, 128); ++ ext4_mb_prealloc_table_add(sbi, 256); ++ ext4_mb_prealloc_table_add(sbi, 512); ++ ext4_mb_prealloc_table_add(sbi, 1024); ++ ext4_mb_prealloc_table_add(sbi, 2048); ++ ++ sbi->s_mb_small_req = 256; ++ sbi->s_mb_large_req = 1024; ++ sbi->s_mb_group_prealloc = 512; ++ } else { ++ sbi->s_mb_prealloc_table_size = 3; ++ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long); ++ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS); ++ if (sbi->s_mb_prealloc_table == NULL) { ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return -ENOMEM; ++ } ++ memset(sbi->s_mb_prealloc_table, 0, i); ++ ++ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe); ++ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe * 2); ++ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe * 4); ++ ++ sbi->s_mb_small_req = sbi->s_stripe; ++ sbi->s_mb_large_req = sbi->s_stripe * 8; ++ sbi->s_mb_group_prealloc = sbi->s_stripe * 4; ++ } + + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); + if (sbi->s_locality_groups == NULL) { ++ kfree(sbi->s_mb_prealloc_table); + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); + return -ENOMEM; +@@ -2413,9 +2542,27 @@ int ext4_mb_init(struct super_block *sb, + spin_lock_init(&lg->lg_prealloc_lock); + } + +- if (sbi->s_proc) ++ /* init file for buddy data */ ++ ret = ext4_mb_init_backend(sb); ++ if (ret != 0) { ++ kfree(sbi->s_mb_prealloc_table); ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return ret; ++ } ++ ++ if (sbi->s_proc) { ++ struct proc_dir_entry *p; + proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_fops, sb); ++ p = create_proc_entry(EXT4_MB_PREALLOC_TABLE, S_IFREG | ++ S_IRUGO | S_IWUSR, sbi->s_proc); ++ if (p) { ++ p->data = sbi; ++ p->read_proc = ext4_mb_prealloc_table_proc_read; ++ p->write_proc = ext4_mb_prealloc_table_proc_write; ++ } ++ } + + if (sbi->s_journal) + sbi->s_journal->j_commit_callback = release_blocks_on_commit; +@@ -2448,8 +2595,10 @@ int ext4_mb_release(struct super_block * + struct ext4_group_info *grinfo; + struct ext4_sb_info *sbi = EXT4_SB(sb); + +- if (sbi->s_proc) ++ if (sbi->s_proc) { + remove_proc_entry("mb_groups", sbi->s_proc); ++ remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_proc); ++ } + + if (sbi->s_group_info) { + for (i = 0; i < ngroups; i++) { +@@ -2469,6 +2618,7 @@ int ext4_mb_release(struct super_block * + kfree(sbi->s_group_info[i]); + kfree(sbi->s_group_info); + } ++ kfree(sbi->s_mb_prealloc_table); + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); + if (sbi->s_buddy_cache) +@@ -2798,11 +2948,12 @@ static noinline_for_stack void + ext4_mb_normalize_request(struct ext4_allocation_context *ac, + struct ext4_allocation_request *ar) + { +- int bsbits, max; ++ int bsbits, i, wind; + ext4_lblk_t end; +- loff_t size, orig_size, start_off; ++ loff_t size, orig_size; + ext4_lblk_t start, orig_start; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); ++ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_prealloc_space *pa; + + /* do normalize only data requests, metadata requests +@@ -2832,49 +2983,35 @@ ext4_mb_normalize_request(struct ext4_al + size = size << bsbits; + if (size < i_size_read(ac->ac_inode)) + size = i_size_read(ac->ac_inode); ++ size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits; + +- /* max size of free chunks */ +- max = 2 << bsbits; ++ start = wind = 0; + +-#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ +- (req <= (size) || max <= (chunk_size)) ++ /* let's choose preallocation window depending on file size */ ++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) { ++ if (size <= sbi->s_mb_prealloc_table[i]) { ++ wind = sbi->s_mb_prealloc_table[i]; ++ break; ++ } ++ } ++ size = wind; + +- /* first, try to predict filesize */ +- /* XXX: should this table be tunable? */ +- start_off = 0; +- if (size <= 16 * 1024) { +- size = 16 * 1024; +- } else if (size <= 32 * 1024) { +- size = 32 * 1024; +- } else if (size <= 64 * 1024) { +- size = 64 * 1024; +- } else if (size <= 128 * 1024) { +- size = 128 * 1024; +- } else if (size <= 256 * 1024) { +- size = 256 * 1024; +- } else if (size <= 512 * 1024) { +- size = 512 * 1024; +- } else if (size <= 1024 * 1024) { +- size = 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (21 - bsbits)) << 21; +- size = 2 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (22 - bsbits)) << 22; +- size = 4 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, +- (8<<20)>>bsbits, max, 8 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (23 - bsbits)) << 23; +- size = 8 * 1024 * 1024; +- } else { +- start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; +- size = ac->ac_o_ex.fe_len << bsbits; ++ if (wind == 0) { ++ __u64 tstart, tend; ++ /* file is quite large, we now preallocate with ++ * the biggest configured window with regart to ++ * logical offset */ ++ wind = sbi->s_mb_prealloc_table[i - 1]; ++ tstart = ac->ac_o_ex.fe_logical; ++ do_div(tstart, wind); ++ start = tstart * wind; ++ tend = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len - 1; ++ do_div(tend, wind); ++ tend = tend * wind + wind; ++ size = tend - start; + } +- orig_size = size = size >> bsbits; +- orig_start = start = start_off >> bsbits; ++ orig_size = size; ++ orig_start = start; + + /* don't cover already allocated blocks in selected range */ + if (ar->pleft && start <= ar->lleft) { +@@ -2946,7 +3083,6 @@ ext4_mb_normalize_request(struct ext4_al + } + BUG_ON(start + size <= ac->ac_o_ex.fe_logical && + start > ac->ac_o_ex.fe_logical); +- BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + + /* now prepare goal request */ + +@@ -3930,11 +4066,19 @@ static void ext4_mb_group_or_file(struct + + /* don't use group allocation for large files */ + size = max(size, isize); +- if (size > sbi->s_mb_stream_request) { ++ if ((ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) || ++ (size >= sbi->s_mb_large_req)) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; + } + ++ /* ++ * request is so large that we don't care about ++ * streaming - it overweights any possible seek ++ */ ++ if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req) ++ return; ++ + BUG_ON(ac->ac_lg != NULL); + /* + * locality group prealloc space are per cpu. The reason for having +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c ++++ linux-stage/fs/ext4/super.c +@@ -2357,7 +2357,8 @@ EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats + EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); +-EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); ++EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_small_req); ++EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req); + EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); + EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); + +@@ -2372,7 +2373,8 @@ static struct attribute *ext4_attrs[] = + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), + ATTR_LIST(mb_order2_req), +- ATTR_LIST(mb_stream_req), ++ ATTR_LIST(mb_small_req), ++ ATTR_LIST(mb_large_req), + ATTR_LIST(mb_group_prealloc), + ATTR_LIST(max_writeback_mb_bump), + NULL, +Index: linux-stage/fs/ext4/inode.c +=================================================================== +--- linux-stage.orig/fs/ext4/inode.c ++++ linux-stage/fs/ext4/inode.c +@@ -3070,6 +3070,11 @@ static int ext4_da_writepages(struct add + if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) + return -EROFS; + ++ if (wbc->nr_to_write < sbi->s_mb_small_req) { ++ nr_to_writebump = sbi->s_mb_small_req - wbc->nr_to_write; ++ wbc->nr_to_write = sbi->s_mb_small_req; ++ } ++ + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + diff --git a/ldiskfs/kernel_patches/patches/rhel6.4/ext4-vmalloc.patch b/ldiskfs/kernel_patches/patches/rhel6.4/ext4-vmalloc.patch new file mode 100644 index 0000000..bd80814 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel6.4/ext4-vmalloc.patch @@ -0,0 +1,181 @@ +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c ++++ linux-stage/fs/ext4/super.c +@@ -690,7 +690,12 @@ static void ext4_put_super(struct super_ + + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(sbi->s_group_desc[i]); +- kfree(sbi->s_group_desc); ++ ++ if (is_vmalloc_addr(sbi->s_group_desc)) ++ vfree(sbi->s_group_desc); ++ else ++ kfree(sbi->s_group_desc); ++ + if (is_vmalloc_addr(sbi->s_flex_groups)) + vfree(sbi->s_flex_groups); + else +@@ -2938,12 +2943,13 @@ static int ext4_fill_super(struct super_ + unsigned long offset = 0; + unsigned long journal_devnum = 0; + unsigned long def_mount_opts; +- struct inode *root; ++ struct inode *root = NULL; + char *cp; + const char *descr; + int ret = -EINVAL; + int blocksize; + unsigned int db_count; ++ size_t size; + unsigned int i; + int needs_recovery, has_huge_files; + __u64 blocks_count; +@@ -3286,11 +3292,18 @@ static int ext4_fill_super(struct super_ + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); + db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / + EXT4_DESC_PER_BLOCK(sb); +- sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), +- GFP_KERNEL); ++ size = (size_t)db_count * sizeof(struct buffer_head *); ++ sbi->s_group_desc = kzalloc(size, GFP_KERNEL); + if (sbi->s_group_desc == NULL) { +- ext4_msg(sb, KERN_ERR, "not enough memory"); +- goto failed_mount; ++ sbi->s_group_desc = vmalloc(size); ++ if (sbi->s_group_desc != NULL) { ++ memset(sbi->s_group_desc, 0, size); ++ } else { ++ ext4_msg(sb, KERN_ERR, "no memory for %u groups (%u)\n", ++ sbi->s_groups_count, (unsigned int)size); ++ ret = -ENOMEM; ++ goto failed_mount; ++ } + } + + #ifdef __BIG_ENDIAN +@@ -3505,12 +3518,10 @@ no_journal: + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); +- iput(root); + goto failed_mount4; + } + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { +- iput(root); + ext4_msg(sb, KERN_ERR, "get root dentry failed"); + ret = -ENOMEM; + goto failed_mount4; +@@ -3562,6 +3573,7 @@ no_journal: + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)", + err); ++ ret = err; + goto failed_mount5; + } + +@@ -3616,6 +3628,8 @@ failed_mount4a: + dput(sb->s_root); + sb->s_root = NULL; + failed_mount4: ++ iput(root); ++ sb->s_root = NULL; + ext4_msg(sb, KERN_ERR, "mount failed"); + destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); + failed_mount_wq: +@@ -3639,7 +3653,11 @@ failed_mount3: + failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); +- kfree(sbi->s_group_desc); ++ ++ if (is_vmalloc_addr(sbi->s_group_desc)) ++ vfree(sbi->s_group_desc); ++ else ++ kfree(sbi->s_group_desc); + failed_mount: + if (sbi->s_proc) { + remove_proc_entry(sb->s_id, ext4_proc_root); +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -23,6 +23,7 @@ + + #include "mballoc.h" + #include ++#include + #include + + /* +@@ -2408,24 +2409,37 @@ static int ext4_mb_init_backend(struct s + while (array_size < sizeof(*sbi->s_group_info) * + num_meta_group_infos_max) + array_size = array_size << 1; +- /* An 8TB filesystem with 64-bit pointers requires a 4096 byte +- * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. +- * So a two level scheme suffices for now. */ +- sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); ++ /* A 16TB filesystem with 64-bit pointers requires an 8192 byte ++ * kmalloc(). Filesystems larger than 2^32 blocks (16TB normally) ++ * have group descriptors at least twice as large (64 bytes or ++ * more vs. 32 bytes for traditional ext3 filesystems), so a 128TB ++ * filesystem needs a 128kB allocation, which may need vmalloc(). */ ++ sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); + if (sbi->s_group_info == NULL) { +- printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); +- return -ENOMEM; ++ sbi->s_group_info = vmalloc(array_size); ++ if (sbi->s_group_info != NULL) { ++ memset(sbi->s_group_info, 0, array_size); ++ } else { ++ ext4_msg(sb, KERN_ERR, "no memory for groupinfo (%u)\n", ++ array_size); ++ return -ENOMEM; ++ } + } + sbi->s_buddy_cache = new_inode(sb); + if (sbi->s_buddy_cache == NULL) { +- printk(KERN_ERR "EXT4-fs: can't get new inode\n"); ++ ext4_msg(sb, KERN_ERR, "can't get new inode\n"); + goto err_freesgi; + } ++ /* To avoid potentially colliding with an valid on-disk inode number, ++ * use EXT4_BAD_INO for the buddy cache inode number. This inode is ++ * not in the inode hash, so it should never be found by iget(), but ++ * this will avoid confusion if it ever shows up during debugging. */ ++ sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; + EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; + for (i = 0; i < ngroups; i++) { + desc = ext4_get_group_desc(sb, i, NULL); + if (desc == NULL) { +- printk(KERN_ERR ++ ext4_msg(sb, KERN_ERR, + "EXT4-fs: can't read descriptor %u\n", i); + goto err_freebuddy; + } +@@ -2443,7 +2457,10 @@ err_freebuddy: + kfree(sbi->s_group_info[i]); + iput(sbi->s_buddy_cache); + err_freesgi: +- kfree(sbi->s_group_info); ++ if (is_vmalloc_addr(sbi->s_group_info)) ++ vfree(sbi->s_group_info); ++ else ++ kfree(sbi->s_group_info); + return -ENOMEM; + } + +@@ -2627,7 +2644,10 @@ int ext4_mb_release(struct super_block * + EXT4_DESC_PER_BLOCK_BITS(sb); + for (i = 0; i < num_meta_group_infos; i++) + kfree(sbi->s_group_info[i]); +- kfree(sbi->s_group_info); ++ if (is_vmalloc_addr(sbi->s_group_info)) ++ vfree(sbi->s_group_info); ++ else ++ kfree(sbi->s_group_info); + } + kfree(sbi->s_mb_prealloc_table); + kfree(sbi->s_mb_offsets); diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.4.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.4.series new file mode 100644 index 0000000..409e8cd --- /dev/null +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.4.series @@ -0,0 +1,43 @@ +rhel6.3/ext4-wantedi-2.6.patch +rhel6.3/ext4-map_inode_page-2.6.18.patch +rhel6.3/export-ext4-2.6.patch +rhel6.3/ext4-remove-cond_resched-calls.patch +rhel6.3/ext4-nlink-2.6.patch +sles11sp1/ext4-ext_generation.patch +rhel6.3/ext4-inode-version.patch +rhel6.4/ext4-mmp.patch +rhel6.3/ext4-lookup-dotdot.patch +rhel6.3/ext4-max-dir-size.patch +rhel6.3/ext4-print-inum-in-htree-warning.patch +rhel6.3/ext4-xattr-no-update-ctime.patch +rhel6.4/ext4-prealloc.patch +rhel6.3/ext4-mballoc-extra-checks.patch +rhel6.4/ext4-misc.patch +rhel6.3/ext4-big-endian-check-2.6.patch +rhel6.3/ext4-alloc-policy-2.6.patch +rhel6.3/ext4-force_over_128tb.patch +rhel6.3/ext4-pdir-fix.patch +rhel6.3/ext4-add-more-error-checks-to-ext4_mkdir.patch +rhel6.3/ext4-osd-iop-common.patch +rhel6.3/ext4-osd-iam-exports.patch +rhel6.3/ext4-dynlocks-common.patch +rhel6.3/ext4-hash-indexed-dir-dotdot-update.patch +rhel6.3/ext4-kill-dx_root.patch +rhel6.3/ext4-extents-mount-option.patch +rhel6.3/ext4-fiemap-2.6.patch +rhel6.4/ext4-mballoc-pa_free-mismatch.patch +rhel6.3/ext4_data_in_dirent.patch +rhel6.3/ext4-large-eas.patch +rhel6.3/ext4-disable-mb-cache.patch +rhel6.3/ext4-back-dquot-to.patch +rhel6.3/ext4-nocmtime-2.6.patch +rhel6.4/ext4-vmalloc.patch +rhel6.3/ext4-journal-callback.patch +rhel6.3/ext4-store-tree-generation-at-find.patch +rhel6.3/ext4_pdirop.patch +rhel6.3/ext4-quota-force-block-alloc-quotaoff.patch +rhel6.3/ext4-quota-dont-update-cmtime.patch +rhel6.3/ext4-quota-first-class.patch +rhel6.3/ext4-inode_info_reorganize.patch +rhel6.4/ext4-fix-mbgroups-access.patch +rhel6.3/ext4-fix-ext4_mb_add_n_trim.patch -- 1.8.3.1