From 854e3969e168c05e166eb1b9fb1f62dfc69c5f0b Mon Sep 17 00:00:00 2001 From: yangsheng Date: Fri, 21 Dec 2012 12:46:29 +0800 Subject: [PATCH] LU-992 kernel: deprecate RHEL5 server support for master Remove patches relate to RHEL5 server support. Signed-off-by: yang sheng Change-Id: I694c9bbe0b6713119501392540c9cf5c6f8e53f3 Reviewed-on: http://review.whamcloud.com/4865 Tested-by: Hudson Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: James Simmons --- .../patches/export-ext4-2.6-rhel5.patch | 81 --- .../patches/ext4-back-dquot-to-rhel54.patch | 53 -- .../patches/ext4-big-endian-check-2.6-rhel5.patch | 56 -- .../patches/ext4-disable-delalloc-rhel5.patch | 19 - .../patches/ext4-disable-mb-cache-rhel5.patch | 154 ----- .../patches/ext4-dynlocks-2.6-rhel5.patch | 33 - .../patches/ext4-dynlocks-common.patch | 278 -------- .../patches/ext4-failed-mount-b23368.patch | 12 - .../patches/ext4-fiemap-2.6-rhel5.patch | 96 --- .../patches/ext4-force_over_128tb-rhel5.patch | 56 -- .../patches/ext4-inode-version-rhel5.patch | 73 -- .../patches/ext4-journal-callback-rhel5.patch | 448 ------------- .../kernel_patches/patches/ext4-kill-dx_root.patch | 245 ------- .../patches/ext4-max-dir-size-rhel5.patch | 49 -- .../patches/ext4-mballoc-extra-checks-rhel5.patch | 352 ---------- .../patches/ext4-mballoc-group_check-rhel5.patch | 320 --------- .../patches/ext4-mballoc-pa_free-mismatch.patch | 108 --- .../kernel_patches/patches/ext4-misc-rhel5.patch | 330 --------- .../kernel_patches/patches/ext4-mmp-rhel5.patch | 578 ---------------- .../patches/ext4-osd-iam-exports.patch | 64 -- .../patches/ext4-osd-iop-common.patch | 224 ------- ldiskfs/kernel_patches/patches/ext4-pdir-fix.patch | 59 -- .../patches/ext4-prealloc-rhel5.patch | 378 ----------- .../ext4-print-inum-in-htree-warning-rhel5.patch | 16 - .../patches/ext4-quota-minimal-rhel5.patch | 20 - .../patches/ext4-version-2.6-rhel5.patch | 18 - .../patches/ext4-vmalloc-rhel5.patch | 198 ------ .../patches/ext4-wantedi-2.6-rhel5.patch | 84 --- .../patches/ext4_data_in_dirent.patch | 546 --------------- .../series/ldiskfs-2.6-rhel5-ext4.series | 41 -- .../patches/blkdev_tunables-2.6-rhel5.patch | 44 -- .../patches/dev_read_only-2.6.18-vanilla.patch | 166 ----- .../patches/export-2.6.18-vanilla.patch | 17 - .../patches/export_symbol_numa-2.6-fc5.patch | 12 - .../patches/export_symbols-2.6.12.patch | 15 - .../fix-forever-in-do_get_write_access.patch | 41 -- .../patches/jbd-jcberr-2.6.18-vanilla.patch | 238 ------- .../jbd-journal-chksum-2.6.18-vanilla.patch | 637 ------------------ .../patches/jbd-stats-2.6-rhel5.patch | 743 --------------------- .../patches/jbd2-jcberr-2.6-rhel5.patch | 224 ------- .../patches/jbd2_stats_proc_init-wrong-place.patch | 53 -- .../lustre_iser_max_sectors_tuning_lustre2.0.patch | 78 --- .../md-avoid-bug_on-when-bmc-overflow.patch | 64 -- .../kernel_patches/patches/md-rebuild-policy.patch | 140 ---- .../patches/mpt-fusion-max-sge.patch | 31 - .../patches/prune-icache-use-trylock-rhel5.patch | 13 - .../patches/quota-large-limits-rhel5.patch | 622 ----------------- .../raid5-configurable-cachesize-rhel5.patch | 31 - .../patches/raid5-large-io-rhel5.patch | 15 - .../patches/raid5-maxsectors-rhel5.patch | 23 - .../patches/raid5-merge-ios-rhel5.patch | 185 ----- .../patches/raid5-mmp-unplug-dev.patch | 22 - .../patches/raid5-rebuild-corrupt-bug.patch | 26 - .../kernel_patches/patches/raid5-stats-rhel5.patch | 256 ------- .../raid5-stripe-by-stripe-handling-rhel5.patch | 284 -------- .../patches/raid5-zerocopy-rhel5.patch | 489 -------------- .../patches/sd_iostats-2.6-rhel5.patch | 581 ---------------- .../patches/small-fixes-about-jbd.patch | 13 - lustre/kernel_patches/series/2.6-rhel5.series | 30 - lustre/kernel_patches/which_patch | 1 - 60 files changed, 10083 deletions(-) delete mode 100644 ldiskfs/kernel_patches/patches/export-ext4-2.6-rhel5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-back-dquot-to-rhel54.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-big-endian-check-2.6-rhel5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-disable-delalloc-rhel5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-disable-mb-cache-rhel5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-dynlocks-2.6-rhel5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-dynlocks-common.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-failed-mount-b23368.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-fiemap-2.6-rhel5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-force_over_128tb-rhel5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-inode-version-rhel5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-journal-callback-rhel5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-kill-dx_root.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-max-dir-size-rhel5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-mballoc-extra-checks-rhel5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-mballoc-group_check-rhel5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-mballoc-pa_free-mismatch.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-misc-rhel5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-mmp-rhel5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-osd-iam-exports.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-osd-iop-common.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-pdir-fix.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-prealloc-rhel5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-print-inum-in-htree-warning-rhel5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-quota-minimal-rhel5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-version-2.6-rhel5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-vmalloc-rhel5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4-wantedi-2.6-rhel5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext4_data_in_dirent.patch delete mode 100644 ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series delete mode 100644 lustre/kernel_patches/patches/blkdev_tunables-2.6-rhel5.patch delete mode 100644 lustre/kernel_patches/patches/dev_read_only-2.6.18-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/export-2.6.18-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/export_symbol_numa-2.6-fc5.patch delete mode 100644 lustre/kernel_patches/patches/export_symbols-2.6.12.patch delete mode 100644 lustre/kernel_patches/patches/fix-forever-in-do_get_write_access.patch delete mode 100644 lustre/kernel_patches/patches/jbd-jcberr-2.6.18-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/jbd-journal-chksum-2.6.18-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/jbd-stats-2.6-rhel5.patch delete mode 100644 lustre/kernel_patches/patches/jbd2-jcberr-2.6-rhel5.patch delete mode 100644 lustre/kernel_patches/patches/jbd2_stats_proc_init-wrong-place.patch delete mode 100644 lustre/kernel_patches/patches/lustre_iser_max_sectors_tuning_lustre2.0.patch delete mode 100644 lustre/kernel_patches/patches/md-avoid-bug_on-when-bmc-overflow.patch delete mode 100644 lustre/kernel_patches/patches/md-rebuild-policy.patch delete mode 100644 lustre/kernel_patches/patches/mpt-fusion-max-sge.patch delete mode 100644 lustre/kernel_patches/patches/prune-icache-use-trylock-rhel5.patch delete mode 100644 lustre/kernel_patches/patches/quota-large-limits-rhel5.patch delete mode 100644 lustre/kernel_patches/patches/raid5-configurable-cachesize-rhel5.patch delete mode 100644 lustre/kernel_patches/patches/raid5-large-io-rhel5.patch delete mode 100644 lustre/kernel_patches/patches/raid5-maxsectors-rhel5.patch delete mode 100644 lustre/kernel_patches/patches/raid5-merge-ios-rhel5.patch delete mode 100644 lustre/kernel_patches/patches/raid5-mmp-unplug-dev.patch delete mode 100644 lustre/kernel_patches/patches/raid5-rebuild-corrupt-bug.patch delete mode 100644 lustre/kernel_patches/patches/raid5-stats-rhel5.patch delete mode 100644 lustre/kernel_patches/patches/raid5-stripe-by-stripe-handling-rhel5.patch delete mode 100644 lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch delete mode 100644 lustre/kernel_patches/patches/sd_iostats-2.6-rhel5.patch delete mode 100644 lustre/kernel_patches/patches/small-fixes-about-jbd.patch delete mode 100644 lustre/kernel_patches/series/2.6-rhel5.series diff --git a/ldiskfs/kernel_patches/patches/export-ext4-2.6-rhel5.patch b/ldiskfs/kernel_patches/patches/export-ext4-2.6-rhel5.patch deleted file mode 100644 index a89a0aa..0000000 --- a/ldiskfs/kernel_patches/patches/export-ext4-2.6-rhel5.patch +++ /dev/null @@ -1,81 +0,0 @@ -Index: linux-stage/fs/ext4/super.c -=================================================================== ---- linux-stage.orig/fs/ext4/super.c -+++ linux-stage/fs/ext4/super.c -@@ -185,6 +185,8 @@ void ext4_journal_abort_handle(const cha - jbd2_journal_abort_handle(handle); - } - -+EXPORT_SYMBOL(ext4_journal_abort_handle); -+ - /* Deal with the reporting of failure conditions on a filesystem such as - * inconsistencies detected or read IO failures. - * -@@ -2459,6 +2461,8 @@ out_fail: - return ret; - } - -+EXPORT_SYMBOL(ext4_force_commit); -+ - /* - * Setup any per-fs journal parameters now. We'll do this both on - * initial mount, once the journal has been initialised but before we've -@@ -3504,6 +3508,12 @@ int ext4_map_inode_page(struct inode *in - unsigned long *blocks, int *created, int create); - EXPORT_SYMBOL(ext4_map_inode_page); - -+EXPORT_SYMBOL(ext4_xattr_get); -+EXPORT_SYMBOL(ext4_xattr_set_handle); -+EXPORT_SYMBOL(ext4_bread); -+EXPORT_SYMBOL(ext4_journal_start_sb); -+EXPORT_SYMBOL(__ext4_journal_stop); -+ - MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); - MODULE_DESCRIPTION("Fourth Extended Filesystem with extents"); - MODULE_LICENSE("GPL"); -Index: linux-stage/fs/ext4/ext4.h -=================================================================== ---- linux-stage.orig/fs/ext4/ext4.h -+++ linux-stage/fs/ext4/ext4.h -@@ -1024,6 +1024,8 @@ extern unsigned long ext4_count_free_ino - ext4_group_t group, - struct ext4_group_desc *desc); - extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); -+extern struct buffer_head *ext4_read_inode_bitmap(struct super_block *sb, -+ ext4_group_t block_group); - - /* mballoc.c */ - extern long ext4_mb_stats; -Index: linux-stage/fs/ext4/ialloc.c -=================================================================== ---- linux-stage.orig/fs/ext4/ialloc.c -+++ linux-stage/fs/ext4/ialloc.c -@@ -96,7 +96,7 @@ unsigned ext4_init_inode_bitmap(struct s - * - * Return buffer_head of bitmap on success or NULL. - */ --static struct buffer_head * -+struct buffer_head * - ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) - { - struct ext4_group_desc *desc; -@@ -137,6 +137,7 @@ ext4_read_inode_bitmap(struct super_bloc - } - return bh; - } -+EXPORT_SYMBOL(ext4_read_inode_bitmap); - - /* - * NOTE! When we get the inode, we're the only people -Index: linux-stage/fs/ext4/balloc.c -=================================================================== ---- linux-stage.orig/fs/ext4/balloc.c -+++ linux-stage/fs/ext4/balloc.c -@@ -236,6 +236,7 @@ struct ext4_group_desc * ext4_get_group_ - *bh = sbi->s_group_desc[group_desc]; - return desc; - } -+EXPORT_SYMBOL(ext4_get_group_desc); - - static int ext4_valid_block_bitmap(struct super_block *sb, - struct ext4_group_desc *desc, diff --git a/ldiskfs/kernel_patches/patches/ext4-back-dquot-to-rhel54.patch b/ldiskfs/kernel_patches/patches/ext4-back-dquot-to-rhel54.patch deleted file mode 100644 index c3b0ef8..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-back-dquot-to-rhel54.patch +++ /dev/null @@ -1,53 +0,0 @@ -diff -up a/fs/ext4/super.c b/s/ext4/super.c ---- a/fs/ext4/super.c -+++ b/fs/ext4/super.c -@@ -706,9 +767,47 @@ static inline struct inode *dquot_to_ino - static ssize_t ext4_quota_write(struct super_block *sb, int type, - const char *data, size_t len, loff_t off); - -+static int ext4_dquot_initialize(struct inode *inode, int type) -+{ -+ handle_t *handle; -+ int ret, err; -+ -+ /* We may create quota structure so we need to reserve enough blocks */ -+ handle = ext4_journal_start(inode, 2*EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ ret = dquot_initialize(inode, type); -+ err = ext4_journal_stop(handle); -+ if (!ret) -+ ret = err; -+ return ret; -+} -+ -+static int ext4_dquot_drop(struct inode *inode) -+{ -+ handle_t *handle; -+ int ret, err; -+ -+ /* We may delete quota structure so we need to reserve enough blocks */ -+ handle = ext4_journal_start(inode, 2*EXT4_QUOTA_DEL_BLOCKS(inode->i_sb)); -+ if (IS_ERR(handle)) { -+ /* -+ * We call dquot_drop() anyway to at least release references -+ * to quota structures so that umount does not hang. -+ */ -+ dquot_drop(inode); -+ return PTR_ERR(handle); -+ } -+ ret = dquot_drop(inode); -+ err = ext4_journal_stop(handle); -+ if (!ret) -+ ret = err; -+ return ret; -+} -+ - static struct dquot_operations ext4_quota_operations = { -- .initialize = dquot_initialize, -- .drop = dquot_drop, -+ .initialize = ext4_dquot_initialize, -+ .drop = ext4_dquot_drop, - .alloc_space = dquot_alloc_space, - .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, diff --git a/ldiskfs/kernel_patches/patches/ext4-big-endian-check-2.6-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-big-endian-check-2.6-rhel5.patch deleted file mode 100644 index 6775a31..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-big-endian-check-2.6-rhel5.patch +++ /dev/null @@ -1,56 +0,0 @@ -Index: linux-2.6.18-128.1.6/fs/ext4/super.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/ext4/super.c -+++ linux-2.6.18-128.1.6/fs/ext4/super.c -@@ -70,6 +70,8 @@ struct page *ext4_zero_page; - - struct proc_dir_entry *proc_root_ext4; - -+static int bigendian_extents; -+ - ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, - struct ext4_group_desc *bg) - { -@@ -1222,7 +1224,7 @@ enum { - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, - Opt_inode_readahead_blks, Opt_journal_ioprio, -- Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, Opt_bigendian_extents, - }; - - static match_table_t tokens = { -@@ -1284,6 +1286,7 @@ static match_table_t tokens = { - {Opt_auto_da_alloc, "auto_da_alloc=%u"}, - {Opt_auto_da_alloc, "auto_da_alloc"}, - {Opt_noauto_da_alloc, "noauto_da_alloc"}, -+ {Opt_bigendian_extents, "bigendian_extents"}, - {Opt_err, NULL}, - }; - -@@ -1682,6 +1685,9 @@ clear_qf_name: - return 0; - sbi->s_stripe = option; - break; -+ case Opt_bigendian_extents: -+ bigendian_extents = 1; -+ break; - default: - printk(KERN_ERR - "EXT4-fs: Unrecognized mount option \"%s\" " -@@ -2561,6 +2567,15 @@ static int ext4_fill_super(struct super_ - goto failed_mount; - } - -+#ifdef __BIG_ENDIAN -+ if (bigendian_extents == 0) { -+ printk(KERN_ERR "EXT4-fs: extents feature is not guaranteed to " -+ "work on big-endian systems. Use \"bigendian_extents\" " -+ "mount option to override.\n"); -+ goto failed_mount; -+ } -+#endif -+ - bgl_lock_init(sbi->s_blockgroup_lock); - - sbi->s_last_alloc_group = -1; diff --git a/ldiskfs/kernel_patches/patches/ext4-disable-delalloc-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-disable-delalloc-rhel5.patch deleted file mode 100644 index 912b52c..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-disable-delalloc-rhel5.patch +++ /dev/null @@ -1,19 +0,0 @@ -Disable the DELALLOC feature because it is not robust in ext4 versions < 2.6.31. - --- -diff -rupN linux-2.6.18-164.6.1_1/fs/ext4/super.c linux-2.6.18-164.6.1_2/fs/ext4/super.c ---- linux-2.6.18-164.6.1_1/fs/ext4/super.c 2010-08-05 13:44:07.000000000 +0530 -+++ linux-2.6.18-164.6.1_2/fs/ext4/super.c 2010-08-05 13:46:29.000000000 +0530 -@@ -2091,12 +2091,6 @@ static int ext4_fill_super(struct super_ - - set_opt(sbi->s_mount_opt, BARRIER); - -- /* -- * enable delayed allocation by default -- * Use -o nodelalloc to turn it off -- */ -- set_opt(sbi->s_mount_opt, DELALLOC); -- - if (!parse_options((char *) data, sb, &journal_devnum, - &journal_ioprio, NULL, 0)) - goto failed_mount; diff --git a/ldiskfs/kernel_patches/patches/ext4-disable-mb-cache-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-disable-mb-cache-rhel5.patch deleted file mode 100644 index 9a0d61a..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-disable-mb-cache-rhel5.patch +++ /dev/null @@ -1,154 +0,0 @@ -Index: linux-stage/fs/ext4/xattr.c -=================================================================== ---- linux-stage.orig/fs/ext4/xattr.c -+++ linux-stage/fs/ext4/xattr.c -@@ -86,7 +86,8 @@ - # define ea_bdebug(f...) - #endif - --static void ext4_xattr_cache_insert(struct buffer_head *); -+static void ext4_xattr_cache_insert(struct super_block *, -+ struct buffer_head *); - static struct buffer_head *ext4_xattr_cache_find(struct inode *, - struct ext4_xattr_header *, - struct mb_cache_entry **); -@@ -233,7 +234,7 @@ bad_block: ext4_error(inode->i_sb, __fun - error = -EIO; - goto cleanup; - } -- ext4_xattr_cache_insert(bh); -+ ext4_xattr_cache_insert(inode->i_sb, bh); - entry = BFIRST(bh); - error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); - if (error == -EIO) -@@ -375,7 +376,7 @@ ext4_xattr_block_list(struct inode *inod - error = -EIO; - goto cleanup; - } -- ext4_xattr_cache_insert(bh); -+ ext4_xattr_cache_insert(inode->i_sb, bh); - error = ext4_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size); - - cleanup: -@@ -472,7 +473,9 @@ ext4_xattr_release_block(handle_t *handl - struct mb_cache_entry *ce = NULL; - int error = 0; - -- ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr); -+ if (!test_opt(inode->i_sb, NO_MBCACHE)) -+ ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, -+ bh->b_blocknr); - error = ext4_journal_get_write_access(handle, bh); - if (error) - goto out; -@@ -700,8 +703,10 @@ ext4_xattr_block_set(handle_t *handle, s - if (i->value && i->value_len > sb->s_blocksize) - return -ENOSPC; - if (s->base) { -- ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, -- bs->bh->b_blocknr); -+ if (!test_opt(inode->i_sb, NO_MBCACHE)) -+ ce = mb_cache_entry_get(ext4_xattr_cache, -+ bs->bh->b_bdev, -+ bs->bh->b_blocknr); - error = ext4_journal_get_write_access(handle, bs->bh); - if (error) - goto cleanup; -@@ -718,7 +723,7 @@ ext4_xattr_block_set(handle_t *handle, s - if (!IS_LAST_ENTRY(s->first)) - ext4_xattr_rehash(header(s->base), - s->here); -- ext4_xattr_cache_insert(bs->bh); -+ ext4_xattr_cache_insert(sb, bs->bh); - } - unlock_buffer(bs->bh); - if (error == -EIO) -@@ -801,7 +806,8 @@ inserted: - if (error) - goto cleanup_dquot; - } -- mb_cache_entry_release(ce); -+ if (ce) -+ mb_cache_entry_release(ce); - ce = NULL; - } else if (bs->bh && s->base == bs->bh->b_data) { - /* We were modifying this block in-place. */ -@@ -845,7 +851,7 @@ getblk_failed: - memcpy(new_bh->b_data, s->base, new_bh->b_size); - set_buffer_uptodate(new_bh); - unlock_buffer(new_bh); -- ext4_xattr_cache_insert(new_bh); -+ ext4_xattr_cache_insert(sb, new_bh); - error = ext4_handle_dirty_metadata(handle, - inode, new_bh); - if (error) -@@ -1404,12 +1410,15 @@ ext4_xattr_put_super(struct super_block - * Returns 0, or a negative error number on failure. - */ - static void --ext4_xattr_cache_insert(struct buffer_head *bh) -+ext4_xattr_cache_insert(struct super_block *sb, struct buffer_head *bh) - { - __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); - struct mb_cache_entry *ce; - int error; - -+ if (test_opt(sb, NO_MBCACHE)) -+ return; -+ - ce = mb_cache_entry_alloc(ext4_xattr_cache); - if (!ce) { - ea_bdebug(bh, "out of memory"); -@@ -1483,6 +1492,8 @@ ext4_xattr_cache_find(struct inode *inod - __u32 hash = le32_to_cpu(header->h_hash); - struct mb_cache_entry *ce; - -+ if (test_opt(inode->i_sb, NO_MBCACHE)) -+ return NULL; - if (!header->h_hash) - return NULL; /* never share */ - ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); -Index: linux-stage/fs/ext4/super.c -=================================================================== ---- linux-stage.orig/fs/ext4/super.c -+++ linux-stage/fs/ext4/super.c -@@ -1481,6 +1481,7 @@ enum { - - Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, Opt_bigendian_extents, - Opt_force_over_128tb, -+ Opt_no_mbcache, - }; - - static match_table_t tokens = { -@@ -1554,6 +1555,7 @@ static match_table_t tokens = { - {Opt_noauto_da_alloc, "noauto_da_alloc"}, - {Opt_bigendian_extents, "bigendian_extents"}, - {Opt_force_over_128tb, "force_over_128tb"}, -+ {Opt_no_mbcache, "no_mbcache"}, - {Opt_err, NULL}, - }; - -@@ -2030,6 +2032,9 @@ set_qf_format: - } - clear_opt(sbi->s_mount_opt, EXTENTS); - break; -+ case Opt_no_mbcache: -+ set_opt(sbi->s_mount_opt, NO_MBCACHE); -+ break; - default: - ext4_msg(sb, KERN_ERR, - "Unrecognized mount option \"%s\" " -Index: linux-stage/fs/ext4/ext4.h -=================================================================== ---- linux-stage.orig/fs/ext4/ext4.h -+++ linux-stage/fs/ext4/ext4.h -@@ -715,7 +715,8 @@ struct ext4_inode_info { - /* - * Mount flags - */ --#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ -+#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Disable mbcache */ -+#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ - #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ - #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ - #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ diff --git a/ldiskfs/kernel_patches/patches/ext4-dynlocks-2.6-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-dynlocks-2.6-rhel5.patch deleted file mode 100644 index cecbbb1..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-dynlocks-2.6-rhel5.patch +++ /dev/null @@ -1,33 +0,0 @@ -Index: linux-stage/fs/ext4/Makefile -=================================================================== ---- linux-stage.orig/fs/ext4/Makefile -+++ linux-stage/fs/ext4/Makefile -@@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o - ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ - ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ -- mmp.o -+ mmp.o dynlocks.o - - ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o -Index: linux-stage/fs/ext4/super.c -=================================================================== ---- linux-stage.orig/fs/ext4/super.c -+++ linux-stage/fs/ext4/super.c -@@ -4159,6 +4159,7 @@ static int __init init_ext4_fs(void) - err = init_inodecache(); - if (err) - goto out1; -+ dynlock_cache_init(); - err = register_filesystem(&ext4_fs_type); - if (err) - goto out; -@@ -4195,6 +4196,7 @@ static void __exit exit_ext4_fs(void) - unregister_filesystem(&ext4dev_fs_type); - #endif - destroy_inodecache(); -+ dynlock_cache_exit(); - exit_ext4_xattr(); - exit_ext4_mballoc(); - __free_page(ext4_zero_page); diff --git a/ldiskfs/kernel_patches/patches/ext4-dynlocks-common.patch b/ldiskfs/kernel_patches/patches/ext4-dynlocks-common.patch deleted file mode 100644 index b9dcbd9..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-dynlocks-common.patch +++ /dev/null @@ -1,278 +0,0 @@ -diff -rupN linux-2.6.18-128.1.6_1/fs/ext4/dynlocks.c linux-2.6.18-128.1.6_2/fs/ext4/dynlocks.c ---- linux-2.6.18-128.1.6_1/fs/ext4/dynlocks.c 1970-01-01 05:30:00.000000000 +0530 -+++ linux-2.6.18-128.1.6_2/fs/ext4/dynlocks.c 2009-08-13 20:42:59.000000000 +0530 -@@ -0,0 +1,236 @@ -+/* -+ * Dynamic Locks -+ * -+ * struct dynlock is lockspace -+ * one may request lock (exclusive or shared) for some value -+ * in that lockspace -+ * -+ */ -+ -+#include -+#include -+#include -+#include -+ -+#define DYNLOCK_HANDLE_MAGIC 0xd19a10c -+#define DYNLOCK_HANDLE_DEAD 0xd1956ee -+#define DYNLOCK_LIST_MAGIC 0x11ee91e6 -+ -+static kmem_cache_t * dynlock_cachep = NULL; -+ -+struct dynlock_handle { -+ unsigned dh_magic; -+ struct list_head dh_list; -+ unsigned long dh_value; /* lock value */ -+ int dh_refcount; /* number of users */ -+ int dh_readers; -+ int dh_writers; -+ int dh_pid; /* holder of the lock */ -+ wait_queue_head_t dh_wait; -+}; -+ -+int __init dynlock_cache_init(void) -+{ -+ int rc = 0; -+ -+ /* printk(KERN_INFO "init dynlocks cache\n"); */ -+ dynlock_cachep = kmem_cache_create("dynlock_cache", -+ sizeof(struct dynlock_handle), -+ 0, -+ SLAB_HWCACHE_ALIGN, -+ NULL, NULL); -+ if (dynlock_cachep == NULL) { -+ printk(KERN_ERR "Not able to create dynlock cache"); -+ rc = -ENOMEM; -+ } -+ return rc; -+} -+ -+void dynlock_cache_exit(void) -+{ -+ /* printk(KERN_INFO "exit dynlocks cache\n"); */ -+ kmem_cache_destroy(dynlock_cachep); -+} -+ -+/* -+ * dynlock_init -+ * -+ * initialize lockspace -+ * -+ */ -+void dynlock_init(struct dynlock *dl) -+{ -+ spin_lock_init(&dl->dl_list_lock); -+ INIT_LIST_HEAD(&dl->dl_list); -+ dl->dl_magic = DYNLOCK_LIST_MAGIC; -+} -+EXPORT_SYMBOL(dynlock_init); -+ -+/* -+ * dynlock_lock -+ * -+ * acquires lock (exclusive or shared) in specified lockspace -+ * each lock in lockspace is allocated separately, so user have -+ * to specify GFP flags. -+ * routine returns pointer to lock. this pointer is intended to -+ * be passed to dynlock_unlock -+ * -+ */ -+struct dynlock_handle *dynlock_lock(struct dynlock *dl, unsigned long value, -+ enum dynlock_type lt, gfp_t gfp) -+{ -+ struct dynlock_handle *nhl = NULL; -+ struct dynlock_handle *hl; -+ -+ BUG_ON(dl == NULL); -+ BUG_ON(dl->dl_magic != DYNLOCK_LIST_MAGIC); -+ -+repeat: -+ /* find requested lock in lockspace */ -+ spin_lock(&dl->dl_list_lock); -+ BUG_ON(dl->dl_list.next == NULL); -+ BUG_ON(dl->dl_list.prev == NULL); -+ list_for_each_entry(hl, &dl->dl_list, dh_list) { -+ BUG_ON(hl->dh_list.next == NULL); -+ BUG_ON(hl->dh_list.prev == NULL); -+ BUG_ON(hl->dh_magic != DYNLOCK_HANDLE_MAGIC); -+ if (hl->dh_value == value) { -+ /* lock is found */ -+ if (nhl) { -+ /* someone else just allocated -+ * lock we didn't find and just created -+ * so, we drop our lock -+ */ -+ kmem_cache_free(dynlock_cachep, nhl); -+ nhl = NULL; -+ } -+ hl->dh_refcount++; -+ goto found; -+ } -+ } -+ /* lock not found */ -+ if (nhl) { -+ /* we already have allocated lock. use it */ -+ hl = nhl; -+ nhl = NULL; -+ list_add(&hl->dh_list, &dl->dl_list); -+ goto found; -+ } -+ spin_unlock(&dl->dl_list_lock); -+ -+ /* lock not found and we haven't allocated lock yet. allocate it */ -+ nhl = kmem_cache_alloc(dynlock_cachep, gfp); -+ if (nhl == NULL) -+ return NULL; -+ nhl->dh_refcount = 1; -+ nhl->dh_value = value; -+ nhl->dh_readers = 0; -+ nhl->dh_writers = 0; -+ nhl->dh_magic = DYNLOCK_HANDLE_MAGIC; -+ init_waitqueue_head(&nhl->dh_wait); -+ -+ /* while lock is being allocated, someone else may allocate it -+ * and put onto to list. check this situation -+ */ -+ goto repeat; -+ -+found: -+ if (lt == DLT_WRITE) { -+ /* exclusive lock: user don't want to share lock at all -+ * NOTE: one process may take the same lock several times -+ * this functionaly is useful for rename operations */ -+ while ((hl->dh_writers && hl->dh_pid != current->pid) || -+ hl->dh_readers) { -+ spin_unlock(&dl->dl_list_lock); -+ wait_event(hl->dh_wait, -+ hl->dh_writers == 0 && hl->dh_readers == 0); -+ spin_lock(&dl->dl_list_lock); -+ } -+ hl->dh_writers++; -+ } else { -+ /* shared lock: user do not want to share lock with writer */ -+ while (hl->dh_writers) { -+ spin_unlock(&dl->dl_list_lock); -+ wait_event(hl->dh_wait, hl->dh_writers == 0); -+ spin_lock(&dl->dl_list_lock); -+ } -+ hl->dh_readers++; -+ } -+ hl->dh_pid = current->pid; -+ spin_unlock(&dl->dl_list_lock); -+ -+ return hl; -+} -+EXPORT_SYMBOL(dynlock_lock); -+ -+ -+/* -+ * dynlock_unlock -+ * -+ * user have to specify lockspace (dl) and pointer to lock structure -+ * returned by dynlock_lock() -+ * -+ */ -+void dynlock_unlock(struct dynlock *dl, struct dynlock_handle *hl) -+{ -+ int wakeup = 0; -+ -+ BUG_ON(dl == NULL); -+ BUG_ON(hl == NULL); -+ BUG_ON(dl->dl_magic != DYNLOCK_LIST_MAGIC); -+ -+ if (hl->dh_magic != DYNLOCK_HANDLE_MAGIC) -+ printk(KERN_EMERG "wrong lock magic: %#x\n", hl->dh_magic); -+ -+ BUG_ON(hl->dh_magic != DYNLOCK_HANDLE_MAGIC); -+ BUG_ON(hl->dh_writers != 0 && current->pid != hl->dh_pid); -+ -+ spin_lock(&dl->dl_list_lock); -+ if (hl->dh_writers) { -+ BUG_ON(hl->dh_readers != 0); -+ hl->dh_writers--; -+ if (hl->dh_writers == 0) -+ wakeup = 1; -+ } else if (hl->dh_readers) { -+ hl->dh_readers--; -+ if (hl->dh_readers == 0) -+ wakeup = 1; -+ } else { -+ BUG(); -+ } -+ if (wakeup) { -+ hl->dh_pid = 0; -+ wake_up(&hl->dh_wait); -+ } -+ if (--(hl->dh_refcount) == 0) { -+ hl->dh_magic = DYNLOCK_HANDLE_DEAD; -+ list_del(&hl->dh_list); -+ kmem_cache_free(dynlock_cachep, hl); -+ } -+ spin_unlock(&dl->dl_list_lock); -+} -+EXPORT_SYMBOL(dynlock_unlock); -+ -+int dynlock_is_locked(struct dynlock *dl, unsigned long value) -+{ -+ struct dynlock_handle *hl; -+ int result = 0; -+ -+ /* find requested lock in lockspace */ -+ spin_lock(&dl->dl_list_lock); -+ BUG_ON(dl->dl_list.next == NULL); -+ BUG_ON(dl->dl_list.prev == NULL); -+ list_for_each_entry(hl, &dl->dl_list, dh_list) { -+ BUG_ON(hl->dh_list.next == NULL); -+ BUG_ON(hl->dh_list.prev == NULL); -+ BUG_ON(hl->dh_magic != DYNLOCK_HANDLE_MAGIC); -+ if (hl->dh_value == value && hl->dh_pid == current->pid) { -+ /* lock is found */ -+ result = 1; -+ break; -+ } -+ } -+ spin_unlock(&dl->dl_list_lock); -+ return result; -+} -+EXPORT_SYMBOL(dynlock_is_locked); -diff -rupN linux-2.6.18-128.1.6_1/include/linux/dynlocks.h linux-2.6.18-128.1.6_2/include/linux/dynlocks.h ---- linux-2.6.18-128.1.6_1/include/linux/dynlocks.h 1970-01-01 05:30:00.000000000 +0530 -+++ linux-2.6.18-128.1.6_2/include/linux/dynlocks.h 2009-08-13 20:43:18.000000000 +0530 -@@ -0,0 +1,34 @@ -+#ifndef _LINUX_DYNLOCKS_H -+#define _LINUX_DYNLOCKS_H -+ -+#include -+#include -+ -+struct dynlock_handle; -+ -+/* -+ * lock's namespace: -+ * - list of locks -+ * - lock to protect this list -+ */ -+struct dynlock { -+ unsigned dl_magic; -+ struct list_head dl_list; -+ spinlock_t dl_list_lock; -+}; -+ -+enum dynlock_type { -+ DLT_WRITE, -+ DLT_READ -+}; -+ -+int dynlock_cache_init(void); -+void dynlock_cache_exit(void); -+void dynlock_init(struct dynlock *dl); -+struct dynlock_handle *dynlock_lock(struct dynlock *dl, unsigned long value, -+ enum dynlock_type lt, gfp_t gfp); -+void dynlock_unlock(struct dynlock *dl, struct dynlock_handle *lock); -+int dynlock_is_locked(struct dynlock *dl, unsigned long value); -+ -+#endif -+ diff --git a/ldiskfs/kernel_patches/patches/ext4-failed-mount-b23368.patch b/ldiskfs/kernel_patches/patches/ext4-failed-mount-b23368.patch deleted file mode 100644 index e38f7c7..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-failed-mount-b23368.patch +++ /dev/null @@ -1,12 +0,0 @@ -Index: linux-stage/fs/ext4/super.c -=================================================================== ---- linux-stage.orig/fs/ext4/super.c -+++ linux-stage/fs/ext4/super.c -@@ -3427,7 +3427,6 @@ failed_mount: - brelse(bh); - out_fail: - sb->s_fs_info = NULL; -- kfree(sbi->s_blockgroup_lock); - kfree(sbi); - lock_kernel(); - return ret; diff --git a/ldiskfs/kernel_patches/patches/ext4-fiemap-2.6-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-fiemap-2.6-rhel5.patch deleted file mode 100644 index 47269d8..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-fiemap-2.6-rhel5.patch +++ /dev/null @@ -1,96 +0,0 @@ -diff -rupN linux-2.6.18-164.6.1_1/fs/ext4/ext4.h linux-2.6.18-164.6.1_2/fs/ext4/ext4.h ---- linux-2.6.18-164.6.1_1/fs/ext4/ext4.h 2009-12-22 13:07:27.000000000 +0530 -+++ linux-2.6.18-164.6.1_2/fs/ext4/ext4.h 2009-12-22 13:10:18.000000000 +0530 -@@ -305,6 +305,7 @@ struct ext4_new_group_data { - #define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) - #define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) - #define EXT4_IOC_MIGRATE _IO('f', 9) -+#define EXT4_IOC_FIEMAP _IOWR('f', 11, struct fiemap) - /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ - /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ - -diff -rupN linux-2.6.18-164.6.1_1/fs/ext4/ioctl.c linux-2.6.18-164.6.1_2/fs/ext4/ioctl.c ---- linux-2.6.18-164.6.1_1/fs/ext4/ioctl.c 2009-12-22 13:06:51.000000000 +0530 -+++ linux-2.6.18-164.6.1_2/fs/ext4/ioctl.c 2009-12-22 13:09:45.000000000 +0530 -@@ -17,6 +17,71 @@ - #include "ext4_jbd2.h" - #include "ext4.h" - -+/* So that the fiemap access checks can't overflow on 32 bit machines. */ -+#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) -+ -+static int fiemap_check_ranges(struct super_block *sb, -+ u64 start, u64 len, u64 *new_len) -+{ -+ *new_len = len; -+ -+ if (len == 0) -+ return -EINVAL; -+ -+ if (start > sb->s_maxbytes) -+ return -EFBIG; -+ -+ /* -+ * Shrink request scope to what the fs can actually handle. -+ */ -+ if ((len > sb->s_maxbytes) || -+ (sb->s_maxbytes - len) < start) -+ *new_len = sb->s_maxbytes - start; -+ -+ return 0; -+} -+ -+int ioctl_fiemap(struct inode *inode, struct file *filp, unsigned long arg) -+{ -+ struct fiemap fiemap; -+ u64 len; -+ struct fiemap_extent_info fieinfo = {0, }; -+ struct super_block *sb = inode->i_sb; -+ int error = 0; -+ -+ if (copy_from_user(&fiemap, (struct fiemap __user *) arg, -+ sizeof(struct fiemap))) -+ return -EFAULT; -+ -+ if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) -+ return -EINVAL; -+ -+ error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length, -+ &len); -+ if (error) -+ return error; -+ -+ fieinfo.fi_flags = fiemap.fm_flags; -+ fieinfo.fi_extents_max = fiemap.fm_extent_count; -+ fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap)); -+ -+ if (fiemap.fm_extent_count != 0 && -+ !access_ok(VERIFY_WRITE, (void *)arg, -+ offsetof(typeof(fiemap), fm_extents[fiemap.fm_extent_count]))) -+ return -EFAULT; -+ -+ if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) -+ filemap_write_and_wait(inode->i_mapping); -+ -+ error = ext4_fiemap(inode, &fieinfo, fiemap.fm_start, len); -+ fiemap.fm_flags = fieinfo.fi_flags; -+ fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; -+ if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap))) -+ error = -EFAULT; -+ -+ return error; -+} -+ - long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) - { - struct inode *inode = filp->f_dentry->d_inode; -@@ -249,6 +314,9 @@ flags_out: - mutex_unlock(&(inode->i_mutex)); - return err; - } -+ case EXT4_IOC_FIEMAP: { -+ return ioctl_fiemap(inode, filp, arg); -+ } - - default: - return -ENOTTY; diff --git a/ldiskfs/kernel_patches/patches/ext4-force_over_128tb-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-force_over_128tb-rhel5.patch deleted file mode 100644 index 487b2cc..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-force_over_128tb-rhel5.patch +++ /dev/null @@ -1,56 +0,0 @@ -Index: linux-2.6.18-164.6.1/fs/ext4/super.c -=================================================================== ---- linux-2.6.18-164.6.1.orig/fs/ext4/super.c -+++ linux-2.6.18-164.6.1/fs/ext4/super.c -@@ -51,6 +51,8 @@ - - struct proc_dir_entry *ext4_proc_root; - -+static int force_over_128tb; -+ - static int ext4_load_journal(struct super_block *, struct ext4_super_block *, - unsigned long journal_devnum); - static int ext4_commit_super(struct super_block *sb, -@@ -1343,6 +1345,7 @@ enum { - Opt_stripe, Opt_delalloc, Opt_nodelalloc, - Opt_inode_readahead_blks, Opt_journal_ioprio, - Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, Opt_bigendian_extents, -+ Opt_force_over_128tb, - }; - - static match_table_t tokens = { -@@ -1410,6 +1413,7 @@ static match_table_t tokens = { - {Opt_auto_da_alloc, "auto_da_alloc"}, - {Opt_noauto_da_alloc, "noauto_da_alloc"}, - {Opt_bigendian_extents, "bigendian_extents"}, -+ {Opt_force_over_128tb, "force_over_128tb"}, - {Opt_err, NULL}, - }; - -@@ -1837,6 +1841,9 @@ set_qf_format: - break; - case Opt_mballoc: - break; -+ case Opt_force_over_128tb: -+ force_over_128tb = 1; -+ break; - default: - printk(KERN_ERR - "EXT4-fs: Unrecognized mount option \"%s\" " -@@ -2692,6 +2699,16 @@ static int ext4_fill_super(struct super_ - goto failed_mount; - } - -+ if (ext4_blocks_count(es) > (8ULL << 32)) { -+ if (force_over_128tb == 0) { -+ printk(KERN_ERR "EXT4-fs does not support filesystems " -+ "greater than 128TB and can cause data corruption." -+ "Use \"force_over_128tb\" mount option to override." -+ "\n"); -+ goto failed_mount; -+ } -+ } -+ - if (EXT4_BLOCKS_PER_GROUP(sb) == 0) - goto cantfind_ext4; - diff --git a/ldiskfs/kernel_patches/patches/ext4-inode-version-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-inode-version-rhel5.patch deleted file mode 100644 index bc583d2..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-inode-version-rhel5.patch +++ /dev/null @@ -1,73 +0,0 @@ -Index: linux-2.6.18-128.1.6/fs/ext4/inode.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/ext4/inode.c -+++ linux-2.6.18-128.1.6/fs/ext4/inode.c -@@ -2850,11 +2850,11 @@ struct inode *ext4_iget(struct super_blo - EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); - EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); - -- inode->i_version = le32_to_cpu(raw_inode->i_disk_version); -+ ei->i_fs_version = le32_to_cpu(raw_inode->i_disk_version); - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) -- inode->i_version |= -- (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; -+ ei->i_fs_version |= (__u64)(le32_to_cpu(raw_inode->i_version_hi)) -+ << 32; - } - - if (S_ISREG(inode->i_mode)) { -@@ -3043,16 +3043,11 @@ static int ext4_do_update_inode(handle_t - } else for (block = 0; block < EXT4_N_BLOCKS; block++) - raw_inode->i_block[block] = ei->i_data[block]; - -- raw_inode->i_disk_version = cpu_to_le32(inode->i_version); -+ raw_inode->i_disk_version = cpu_to_le32(ei->i_fs_version); - if (ei->i_extra_isize) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) -- /* in RHEL5 i_version is an unsigned long */ --#if BITS_PER_LONG == 64 -- raw_inode->i_version_hi = -- cpu_to_le32(inode->i_version >> 32); --#else -- raw_inode->i_version_hi = 0; --#endif -+ raw_inode->i_version_hi = cpu_to_le32(ei->i_fs_version -+ >> 32); - raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); - } - -Index: linux-2.6.18-128.1.6/fs/ext4/ext4.h -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/ext4/ext4.h -+++ linux-2.6.18-128.1.6/fs/ext4/ext4.h -@@ -21,6 +21,8 @@ - #include - #include - -+#define HAVE_DISK_INODE_VERSION -+ - /* data type for block offset of block group */ - typedef int ext4_grpblk_t; - -@@ -164,6 +166,8 @@ struct ext4_inode_info { - */ - tid_t i_sync_tid; - tid_t i_datasync_tid; -+ -+ __u64 i_fs_version; - }; - - /* -Index: linux-2.6.18-128.1.6/fs/ext4/ialloc.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/ext4/ialloc.c -+++ linux-2.6.18-128.1.6/fs/ext4/ialloc.c -@@ -878,6 +878,7 @@ got: - ei->i_block_alloc_info = NULL; - ei->i_block_group = group; - ei->i_last_alloc_group = ~0; -+ ei->i_fs_version = 0; - - ext4_set_inode_flags(inode); - if (IS_DIRSYNC(inode)) diff --git a/ldiskfs/kernel_patches/patches/ext4-journal-callback-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-journal-callback-rhel5.patch deleted file mode 100644 index 4c08c9e..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-journal-callback-rhel5.patch +++ /dev/null @@ -1,448 +0,0 @@ -Index: linux-stage/fs/ext4/ext4_jbd2.h -=================================================================== ---- linux-stage.orig/fs/ext4/ext4_jbd2.h -+++ linux-stage/fs/ext4/ext4_jbd2.h -@@ -106,6 +106,80 @@ - #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) - #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) - -+/** -+ * struct ext4_journal_cb_entry - Base structure for callback information. -+ * -+ * This struct is a 'seed' structure for a using with your own callback -+ * structs. If you are using callbacks you must allocate one of these -+ * or another struct of your own definition which has this struct -+ * as it's first element and pass it to ext4_journal_callback_add(). -+ */ -+struct ext4_journal_cb_entry { -+ /* list information for other callbacks attached to the same handle */ -+ struct list_head jce_list; -+ -+ /* Function to call with this callback structure */ -+ void (*jce_func)(struct super_block *sb, -+ struct ext4_journal_cb_entry *jce, int error); -+ -+ /* user data goes here */ -+}; -+ -+/** -+ * ext4_journal_callback_add: add a function to call after transaction commit -+ * @handle: active journal transaction handle to register callback on -+ * @func: callback function to call after the transaction has committed: -+ * @sb: superblock of current filesystem for transaction -+ * @jce: returned journal callback data -+ * @rc: journal state at commit (0 = transaction committed properly) -+ * @jce: journal callback data (internal and function private data struct) -+ * -+ * The registered function will be called in the context of the journal thread -+ * after the transaction for which the handle was created has completed. -+ * -+ * No locks are held when the callback function is called, so it is safe to -+ * call blocking functions from within the callback, but the callback should -+ * not block or run for too long, or the filesystem will be blocked waiting for -+ * the next transaction to commit. No journaling functions can be used, or -+ * there is a risk of deadlock. -+ * -+ * There is no guaranteed calling order of multiple registered callbacks on -+ * the same transaction. -+ */ -+static inline void ext4_journal_callback_add(handle_t *handle, -+ void (*func)(struct super_block *sb, -+ struct ext4_journal_cb_entry *jce, -+ int rc), -+ struct ext4_journal_cb_entry *jce) -+{ -+ struct ext4_sb_info *sbi = -+ EXT4_SB(handle->h_transaction->t_journal->j_private); -+ -+ /* Add the jce to transaction's private list */ -+ jce->jce_func = func; -+ spin_lock(&sbi->s_md_lock); -+ list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); -+ spin_unlock(&sbi->s_md_lock); -+} -+ -+/** -+ * ext4_journal_callback_del: delete a registered callback -+ * @handle: active journal transaction handle on which callback was registered -+ * @jce: registered journal callback entry to unregister -+ */ -+static inline void ext4_journal_callback_del(handle_t *handle, -+ struct ext4_journal_cb_entry *jce) -+{ -+ struct ext4_sb_info *sbi = -+ EXT4_SB(handle->h_transaction->t_journal->j_private); -+ -+ spin_lock(&sbi->s_md_lock); -+ list_del_init(&jce->jce_list); -+ spin_unlock(&sbi->s_md_lock); -+} -+ -+#define HAVE_EXT4_JOURNAL_CALLBACK_ADD -+ - int - ext4_mark_iloc_dirty(handle_t *handle, - struct inode *inode, -Index: linux-stage/fs/ext4/mballoc.c -=================================================================== ---- linux-stage.orig/fs/ext4/mballoc.c -+++ linux-stage/fs/ext4/mballoc.c -@@ -21,6 +21,7 @@ - * mballoc.c contains the multiblocks allocation routines - */ - -+#include "ext4_jbd2.h" - #include "mballoc.h" - #include - -@@ -335,14 +336,12 @@ - */ - static struct kmem_cache *ext4_pspace_cachep; - static struct kmem_cache *ext4_ac_cachep; --static struct kmem_cache *ext4_free_ext_cachep; -+static struct kmem_cache *ext4_free_data_cachep; - static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, - ext4_group_t group); - static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, - ext4_group_t group); --static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); -- -- -+static void ext4_free_data_callback(struct super_block *sb, struct ext4_journal_cb_entry *jce, int error); - - static inline void *mb_correct_addr_and_bit(int *bit, void *addr) - { -@@ -2942,8 +2941,6 @@ int ext4_mb_init(struct super_block *sb, - - ext4_mb_history_init(sb); - -- if (sbi->s_journal) -- sbi->s_journal->j_commit_callback = release_blocks_on_commit; - return 0; - } - -@@ -3032,46 +3029,42 @@ int ext4_mb_release(struct super_block * - * This function is called by the jbd2 layer once the commit has finished, - * so we know we can free the blocks that were released with that commit. - */ --static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) -+static void ext4_free_data_callback(struct super_block *sb, -+ struct ext4_journal_cb_entry *jce, -+ int rc) - { -- struct super_block *sb = journal->j_private; -+ struct ext4_free_data *entry = (struct ext4_free_data *)jce; - struct ext4_buddy e4b; - struct ext4_group_info *db; - int err, count = 0, count2 = 0; -- struct ext4_free_data *entry; -- struct list_head *l, *ltmp; - -- list_for_each_safe(l, ltmp, &txn->t_private_list) { -- entry = list_entry(l, struct ext4_free_data, list); -- -- mb_debug(1, "gonna free %u blocks in group %u (0x%p):", -- entry->count, entry->group, entry); -- -- err = ext4_mb_load_buddy(sb, entry->group, &e4b); -- /* we expect to find existing buddy because it's pinned */ -- BUG_ON(err != 0); -- -- db = e4b.bd_info; -- /* there are blocks to put in buddy to make them really free */ -- count += entry->count; -- count2++; -- ext4_lock_group(sb, entry->group); -- /* Take it out of per group rb tree */ -- rb_erase(&entry->node, &(db->bb_free_root)); -- mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); -- -- if (!db->bb_free_root.rb_node) { -- /* No more items in the per group rb tree -- * balance refcounts from ext4_mb_free_metadata() -- */ -- page_cache_release(e4b.bd_buddy_page); -- page_cache_release(e4b.bd_bitmap_page); -- } -- ext4_unlock_group(sb, entry->group); -+ mb_debug(1, "gonna free %u blocks in group %u (0x%p):", -+ entry->efd_count, entry->efd_group, entry); - -- kmem_cache_free(ext4_free_ext_cachep, entry); -- ext4_mb_unload_buddy(&e4b); -+ err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); -+ /* we expect to find existing buddy because it's pinned */ -+ BUG_ON(err != 0); -+ -+ db = e4b.bd_info; -+ /* there are blocks to put in buddy to make them really free */ -+ count += entry->efd_count; -+ count2++; -+ ext4_lock_group(sb, entry->efd_group); -+ /* Take it out of per group rb tree */ -+ rb_erase(&entry->efd_node, &(db->bb_free_root)); -+ mb_free_blocks(NULL, &e4b, entry->efd_start_blk, entry->efd_count); -+ -+ if (!db->bb_free_root.rb_node) { -+ /* No more items in the per group rb tree -+ * balance refcounts from ext4_mb_free_metadata() -+ */ -+ page_cache_release(e4b.bd_buddy_page); -+ page_cache_release(e4b.bd_bitmap_page); - } -+ ext4_unlock_group(sb, entry->efd_group); -+ -+ kmem_cache_free(ext4_free_data_cachep, entry); -+ ext4_mb_unload_buddy(&e4b); - - mb_debug(1, "freed %u blocks in %u structures\n", count, count2); - } -@@ -3123,22 +3116,24 @@ int __init init_ext4_mballoc(void) - kmem_cache_create("ext4_alloc_context", - sizeof(struct ext4_allocation_context), - 0, SLAB_RECLAIM_ACCOUNT, NULL, NULL); -- if (ext4_ac_cachep == NULL) { -- kmem_cache_destroy(ext4_pspace_cachep); -- return -ENOMEM; -- } -+ if (ext4_ac_cachep == NULL) -+ goto out_err; - -- ext4_free_ext_cachep = -- kmem_cache_create("ext4_free_block_extents", -+ ext4_free_data_cachep = -+ kmem_cache_create("ext4_free_data", - sizeof(struct ext4_free_data), - 0, SLAB_RECLAIM_ACCOUNT, NULL, NULL); -- if (ext4_free_ext_cachep == NULL) { -- kmem_cache_destroy(ext4_pspace_cachep); -- kmem_cache_destroy(ext4_ac_cachep); -- return -ENOMEM; -- } -+ if (ext4_free_data_cachep == NULL) -+ goto out1_err; -+ - ext4_create_debugfs_entry(); - return 0; -+ -+out1_err: -+ kmem_cache_destroy(ext4_ac_cachep); -+out_err: -+ kmem_cache_destroy(ext4_pspace_cachep); -+ return -ENOMEM; - } - - void exit_ext4_mballoc(void) -@@ -3150,7 +3145,7 @@ void exit_ext4_mballoc(void) - rcu_barrier(); - kmem_cache_destroy(ext4_pspace_cachep); - kmem_cache_destroy(ext4_ac_cachep); -- kmem_cache_destroy(ext4_free_ext_cachep); -+ kmem_cache_destroy(ext4_free_data_cachep); - ext4_remove_debugfs_entry(); - } - -@@ -3688,8 +3683,8 @@ static void ext4_mb_generate_from_freeli - n = rb_first(&(grp->bb_free_root)); - - while (n) { -- entry = rb_entry(n, struct ext4_free_data, node); -- mb_set_bits(bitmap, entry->start_blk, entry->count); -+ entry = rb_entry(n, struct ext4_free_data, efd_node); -+ mb_set_bits(bitmap, entry->efd_start_blk, entry->efd_count); - n = rb_next(n); - } - return; -@@ -4974,11 +4969,11 @@ out3: - * AND the blocks are associated with the same group. - */ - static int can_merge(struct ext4_free_data *entry1, -- struct ext4_free_data *entry2) -+ struct ext4_free_data *entry2) - { -- if ((entry1->t_tid == entry2->t_tid) && -- (entry1->group == entry2->group) && -- ((entry1->start_blk + entry1->count) == entry2->start_blk)) -+ if ((entry1->efd_tid == entry2->efd_tid) && -+ (entry1->efd_group == entry2->efd_group) && -+ ((entry1->efd_start_blk + entry1->efd_count) == entry2->efd_start_blk)) - return 1; - return 0; - } -@@ -4991,7 +4986,6 @@ ext4_mb_free_metadata(handle_t *handle, - struct ext4_free_data *entry; - struct ext4_group_info *db = e4b->bd_info; - struct super_block *sb = e4b->bd_sb; -- struct ext4_sb_info *sbi = EXT4_SB(sb); - struct rb_node **n = &db->bb_free_root.rb_node, *node; - struct rb_node *parent = NULL, *new_node; - -@@ -4999,8 +4993,8 @@ ext4_mb_free_metadata(handle_t *handle, - BUG_ON(e4b->bd_bitmap_page == NULL); - BUG_ON(e4b->bd_buddy_page == NULL); - -- new_node = &new_entry->node; -- block = new_entry->start_blk; -+ new_node = &new_entry->efd_node; -+ block = new_entry->efd_start_blk; - - if (!*n) { - /* first free block exent. We need to -@@ -5013,15 +5007,15 @@ ext4_mb_free_metadata(handle_t *handle, - } - while (*n) { - parent = *n; -- entry = rb_entry(parent, struct ext4_free_data, node); -- if (block < entry->start_blk) -+ entry = rb_entry(parent, struct ext4_free_data, efd_node); -+ if (block < entry->efd_start_blk) - n = &(*n)->rb_left; -- else if (block >= (entry->start_blk + entry->count)) -+ else if (block >= (entry->efd_start_blk + entry->efd_count)) - n = &(*n)->rb_right; - else { - ext4_grp_locked_error(sb, e4b->bd_group, __func__, - "Double free of blocks %d (%d %d)", -- block, entry->start_blk, entry->count); -+ block, entry->efd_start_blk, entry->efd_count); - return 0; - } - } -@@ -5032,34 +5026,29 @@ ext4_mb_free_metadata(handle_t *handle, - /* Now try to see the extent can be merged to left and right */ - node = rb_prev(new_node); - if (node) { -- entry = rb_entry(node, struct ext4_free_data, node); -+ entry = rb_entry(node, struct ext4_free_data, efd_node); - if (can_merge(entry, new_entry)) { -- new_entry->start_blk = entry->start_blk; -- new_entry->count += entry->count; -+ new_entry->efd_start_blk = entry->efd_start_blk; -+ new_entry->efd_count += entry->efd_count; - rb_erase(node, &(db->bb_free_root)); -- spin_lock(&sbi->s_md_lock); -- list_del(&entry->list); -- spin_unlock(&sbi->s_md_lock); -- kmem_cache_free(ext4_free_ext_cachep, entry); -+ ext4_journal_callback_del(handle, &entry->efd_jce); -+ kmem_cache_free(ext4_free_data_cachep, entry); - } - } - - node = rb_next(new_node); - if (node) { -- entry = rb_entry(node, struct ext4_free_data, node); -+ entry = rb_entry(node, struct ext4_free_data, efd_node); - if (can_merge(new_entry, entry)) { -- new_entry->count += entry->count; -+ new_entry->efd_count += entry->efd_count; - rb_erase(node, &(db->bb_free_root)); -- spin_lock(&sbi->s_md_lock); -- list_del(&entry->list); -- spin_unlock(&sbi->s_md_lock); -- kmem_cache_free(ext4_free_ext_cachep, entry); -+ ext4_journal_callback_del(handle, &entry->efd_jce); -+ kmem_cache_free(ext4_free_data_cachep, entry); - } - } - /* Add the extent to transaction's private list */ -- spin_lock(&sbi->s_md_lock); -- list_add(&new_entry->list, &handle->h_transaction->t_private_list); -- spin_unlock(&sbi->s_md_lock); -+ ext4_journal_callback_add(handle, ext4_free_data_callback, -+ &new_entry->efd_jce); - return 0; - } - -@@ -5191,11 +5180,11 @@ do_more: - * blocks being freed are metadata. these blocks shouldn't - * be used until this transaction is committed - */ -- new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); -- new_entry->start_blk = bit; -- new_entry->group = block_group; -- new_entry->count = count; -- new_entry->t_tid = handle->h_transaction->t_tid; -+ new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); -+ new_entry->efd_start_blk = bit; -+ new_entry->efd_group = block_group; -+ new_entry->efd_count = count; -+ new_entry->efd_tid = handle->h_transaction->t_tid; - - ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count); -Index: linux-stage/fs/ext4/mballoc.h -=================================================================== ---- linux-stage.orig/fs/ext4/mballoc.h -+++ linux-stage/fs/ext4/mballoc.h -@@ -107,23 +107,24 @@ extern u8 mb_enable_debug; - */ - #define MB_DEFAULT_GROUP_PREALLOC 512 - -- - struct ext4_free_data { -- /* this links the free block information from group_info */ -- struct rb_node node; -+ /* MUST be the first member */ -+ struct ext4_journal_cb_entry efd_jce; - -- /* this links the free block information from ext4_sb_info */ -- struct list_head list; -+ /* ext4_free_data private data starts from here */ -+ -+ /* this links the free block information from group_info */ -+ struct rb_node efd_node; - - /* group which free block extent belongs */ -- ext4_group_t group; -+ ext4_group_t efd_group; - - /* free block extent */ -- ext4_grpblk_t start_blk; -- ext4_grpblk_t count; -+ ext4_grpblk_t efd_start_blk; -+ ext4_grpblk_t efd_count; - - /* transaction which freed this extent */ -- tid_t t_tid; -+ tid_t efd_tid; - }; - - struct ext4_prealloc_space { -Index: linux-stage/fs/ext4/super.c -=================================================================== ---- linux-stage.orig/fs/ext4/super.c -+++ linux-stage/fs/ext4/super.c -@@ -304,6 +304,23 @@ void ext4_journal_abort_handle(const cha - - EXPORT_SYMBOL(ext4_journal_abort_handle); - -+static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) -+{ -+ struct super_block *sb = journal->j_private; -+ struct ext4_sb_info *sbi = EXT4_SB(sb); -+ int error = is_journal_aborted(journal); -+ struct ext4_journal_cb_entry *jce, *tmp; -+ -+ spin_lock(&sbi->s_md_lock); -+ list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) { -+ list_del_init(&jce->jce_list); -+ spin_unlock(&sbi->s_md_lock); -+ jce->jce_func(sb, jce, error); -+ spin_lock(&sbi->s_md_lock); -+ } -+ spin_unlock(&sbi->s_md_lock); -+} -+ - /* Deal with the reporting of failure conditions on a filesystem such as - * inconsistencies detected or read IO failures. - * -@@ -2997,6 +3014,8 @@ static int ext4_fill_super(struct super_ - } - set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); - -+ sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; -+ - no_journal: - - if (test_opt(sb, NOBH)) { diff --git a/ldiskfs/kernel_patches/patches/ext4-kill-dx_root.patch b/ldiskfs/kernel_patches/patches/ext4-kill-dx_root.patch deleted file mode 100644 index c8f2d1a..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-kill-dx_root.patch +++ /dev/null @@ -1,245 +0,0 @@ -removes static definition of dx_root struct. so that "." and ".." dirent can -have extra data. This patch does not change any functionality but is required for -ext4_data_in_dirent patch. - -Index: b/fs/ext4/namei.c -=================================================================== ---- a/fs/ext4/namei.c -+++ b/fs/ext4/namei.c -@@ -121,22 +121,13 @@ struct dx_entry - * hash version mod 4 should never be 0. Sincerely, the paranoia department. - */ - --struct dx_root -+struct dx_root_info - { -- struct fake_dirent dot; -- char dot_name[4]; -- struct fake_dirent dotdot; -- char dotdot_name[4]; -- struct dx_root_info -- { -- __le32 reserved_zero; -- u8 hash_version; -- u8 info_length; /* 8 */ -- u8 indirect_levels; -- u8 unused_flags; -- } -- info; -- struct dx_entry entries[0]; -+ __le32 reserved_zero; -+ u8 hash_version; -+ u8 info_length; /* 8 */ -+ u8 indirect_levels; -+ u8 unused_flags; - }; - - struct dx_node -@@ -225,6 +216,16 @@ ext4_next_entry(struct ext4_dir_entry_2 - * Future: use high four bits of block for coalesce-on-delete flags - * Mask them off for now. - */ -+struct dx_root_info * dx_get_dx_info(struct ext4_dir_entry_2 *de) -+{ -+ /* get dotdot first */ -+ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(1)); -+ -+ /* dx root info is after dotdot entry */ -+ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(2)); -+ -+ return (struct dx_root_info *) de; -+} - - static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) - { -@@ -378,7 +379,7 @@ dx_probe(struct dentry *dentry, struct i - { - unsigned count, indirect; - struct dx_entry *at, *entries, *p, *q, *m; -- struct dx_root *root; -+ struct dx_root_info * info; - struct buffer_head *bh; - struct dx_frame *frame = frame_in; - u32 hash; -@@ -388,18 +389,19 @@ dx_probe(struct dentry *dentry, struct i - dir = dentry->d_parent->d_inode; - if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) - goto fail; -- root = (struct dx_root *) bh->b_data; -- if (root->info.hash_version != DX_HASH_TEA && -- root->info.hash_version != DX_HASH_HALF_MD4 && -- root->info.hash_version != DX_HASH_LEGACY) { -+ -+ info = dx_get_dx_info((struct ext4_dir_entry_2*)bh->b_data); -+ if (info->hash_version != DX_HASH_TEA && -+ info->hash_version != DX_HASH_HALF_MD4 && -+ info->hash_version != DX_HASH_LEGACY) { - ext4_warning(dir->i_sb, "Unrecognised inode hash code %d" - "for directory #%lu", -- root->info.hash_version, dir->i_ino); -+ info->hash_version, dir->i_ino); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail; - } -- hinfo->hash_version = root->info.hash_version; -+ hinfo->hash_version = info->hash_version; - if (hinfo->hash_version <= DX_HASH_TEA) - hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; - hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; -@@ -398,27 +399,26 @@ dx_probe(struct dentry *dentry, struct i - ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); - hash = hinfo->hash; - -- if (root->info.unused_flags & 1) { -+ if (info->unused_flags & 1) { - ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", -- root->info.unused_flags); -+ info->unused_flags); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail; - } - -- if ((indirect = root->info.indirect_levels) > 1) { -+ if ((indirect = info->indirect_levels) > 1) { - ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", -- root->info.indirect_levels); -+ info->indirect_levels); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail; - } - -- entries = (struct dx_entry *) (((char *)&root->info) + -- root->info.info_length); -+ entries = (struct dx_entry *) (((char *)info) + info->info_length); - - if (dx_get_limit(entries) != dx_root_limit(dir, -- root->info.info_length)) { -+ info->info_length)) { - ext4_warning(dir->i_sb, "dx entry: limit != root limit"); - brelse(bh); - -@@ -509,10 +510,12 @@ fail: - - static void dx_release (struct dx_frame *frames) - { -+ struct dx_root_info *info; - if (frames[0].bh == NULL) - return; - -- if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) -+ info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data); -+ if (info->indirect_levels) - brelse(frames[1].bh); - brelse(frames[0].bh); - } -@@ -1430,17 +1433,16 @@ static int make_indexed_dir(handle_t *ha - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; - struct buffer_head *bh2; -- struct dx_root *root; - struct dx_frame frames[2], *frame; - struct dx_entry *entries; -- struct ext4_dir_entry_2 *de, *de2; -+ struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de; - char *data1, *top; - unsigned len; - int retval; - unsigned blocksize; - struct dx_hash_info hinfo; - ext4_lblk_t block; -- struct fake_dirent *fde; -+ struct dx_root_info *dx_info; - - blocksize = dir->i_sb->s_blocksize; - dxtrace(printk("Creating index\n")); -@@ -1450,7 +1452,6 @@ static int make_indexed_dir(handle_t *ha - brelse(bh); - return retval; - } -- root = (struct dx_root *) bh->b_data; - - bh2 = ext4_append (handle, dir, &block, &retval); - if (!(bh2)) { -@@ -1460,18 +1461,20 @@ static int make_indexed_dir(handle_t *ha - } - root = (struct dx_root *) bh->b_data; - -+ dot_de = (struct ext4_dir_entry_2 *) bh->b_data; -+ dotdot_de = ext4_next_entry(dot_de, blocksize); -+ - /* The 0th block becomes the root, move the dirents out */ -- fde = &root->dotdot; -- de = (struct ext4_dir_entry_2 *)((char *)fde + -- ext4_rec_len_from_disk(fde->rec_len, blocksize)); -+ de = (struct ext4_dir_entry_2 *)((char *)dotdot_de + -+ ext4_rec_len_from_disk(dotdot_de->rec_len, blocksize)); -- if ((char *) de >= (((char *) root) + blocksize)) { -+ if ((char *) de >= (((char *) dot_de) + blocksize)) { - ext4_error(dir->i_sb, - "invalid rec_len for '..' in inode %lu", - dir->i_ino); - brelse(bh); - return -EIO; - } -- len = ((char *) root) + blocksize - (char *) de; -+ len = ((char *) dot_de) + blocksize - (char *) de; - - /* Allocate new block for the 0th block's dirents */ - bh2 = ext4_append(handle, dir, &block, &retval); -@@ -1472,19 +1475,23 @@ static int make_indexed_dir(handle_t *ha - de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); - blocksize); - /* Initialize the root; the dot dirents already exist */ -- de = (struct ext4_dir_entry_2 *) (&root->dotdot); -- de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), -- blocksize); -- memset (&root->info, 0, sizeof(root->info)); -- root->info.info_length = sizeof(root->info); -- root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; -- entries = root->entries; -- dx_set_block(entries, 1); -- dx_set_count(entries, 1); -- dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); -+ dotdot_de->rec_len = ext4_rec_len_to_disk(blocksize - -+ le16_to_cpu(dot_de->rec_len), blocksize); -+ -+ /* initialize hashing info */ -+ dx_info = dx_get_dx_info(dot_de); -+ memset (dx_info, 0, sizeof(*dx_info)); -+ dx_info->info_length = sizeof(*dx_info); -+ dx_info->hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; -+ -+ entries = (void *)dx_info + sizeof(*dx_info); -+ -+ dx_set_block(entries, 1); -+ dx_set_count(entries, 1); -+ dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info))); - - /* Initialize as for dx_probe */ -- hinfo.hash_version = root->info.hash_version; -+ hinfo.hash_version = dx_info->hash_version; - if (hinfo.hash_version <= DX_HASH_TEA) - hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; - hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; -@@ -1724,6 +1733,7 @@ static int ext4_dx_add_entry(handle_t *h - goto journal_error; - brelse (bh2); - } else { -+ struct dx_root_info * info; - dxtrace(printk(KERN_DEBUG - "Creating second level index...\n")); - memcpy((char *) entries2, (char *) entries, -@@ -1732,7 +1742,9 @@ static int ext4_dx_add_entry(handle_t *h - /* Set up root */ - dx_set_count(entries, 1); - dx_set_block(entries + 0, newblock); -- ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; -+ info = dx_get_dx_info((struct ext4_dir_entry_2*) -+ frames[0].bh->b_data); -+ info->indirect_levels = 1; - - /* Add new access path frame */ - frame = frames + 1; diff --git a/ldiskfs/kernel_patches/patches/ext4-max-dir-size-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-max-dir-size-rhel5.patch deleted file mode 100644 index 7c3933c..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-max-dir-size-rhel5.patch +++ /dev/null @@ -1,49 +0,0 @@ -diff -rupN linux-2.6.18-164.6.1_1//fs/ext4/ialloc.c linux-2.6.18-164.6.1_2//fs/ext4/ialloc.c ---- linux-2.6.18-164.6.1_1//fs/ext4/ialloc.c 2010-03-31 17:42:50.000000000 +0530 -+++ linux-2.6.18-164.6.1_2//fs/ext4/ialloc.c 2010-03-31 17:43:22.000000000 +0530 -@@ -622,11 +622,14 @@ struct inode *ext4_new_inode_goal(handle - sb = dir->i_sb; - trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id, - dir->i_ino, mode); -+ sbi = EXT4_SB(sb); -+ if (sbi->s_max_dir_size > 0 && i_size_read(dir) >= sbi->s_max_dir_size) -+ return ERR_PTR(-EFBIG); -+ - inode = new_inode(sb); - if (!inode) - return ERR_PTR(-ENOMEM); - ei = EXT4_I(inode); -- sbi = EXT4_SB(sb); - - if (goal) - goal = sbi->s_inode_goal; -diff -rupN linux-2.6.18-164.6.1_1//fs/ext4/super.c linux-2.6.18-164.6.1_2//fs/ext4/super.c ---- linux-2.6.18-164.6.1_1//fs/ext4/super.c 2010-03-31 17:42:50.000000000 +0530 -+++ linux-2.6.18-164.6.1_2//fs/ext4/super.c 2010-03-31 17:45:32.000000000 +0530 -@@ -40,6 +40,7 @@ - EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); - EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); - EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); -+EXT4_RW_ATTR_SBI_UI(max_dir_size, s_max_dir_size); - - static struct attribute *ext4_attrs[] = { - ATTR_LIST(delayed_allocation_blocks), -@@ -48,6 +48,7 @@ - ATTR_LIST(mb_order2_req), - ATTR_LIST(mb_stream_req), - ATTR_LIST(mb_group_prealloc), -+ ATTR_LIST(max_dir_size), - NULL, - }; - -diff -rupN linux-2.6.18-164.6.1_1//fs/ext4/ext4_sb.h linux-2.6.18-164.6.1_2//fs/ext4/ext4_sb.h ---- linux-2.6.18-164.6.1_1//fs/ext4/ext4.h 2010-03-31 17:42:50.000000000 +0530 -+++ linux-2.6.18-164.6.1_2//fs/ext4/ext4.h 2010-03-31 17:43:22.000000000 +0530 -@@ -119,6 +119,7 @@ struct ext4_sb_info { - /* where last allocation was done - for stream allocation */ - unsigned long s_mb_last_group; - unsigned long s_mb_last_start; -+ unsigned long s_max_dir_size; - - /* history to debug policy */ - struct ext4_mb_history *s_mb_history; diff --git a/ldiskfs/kernel_patches/patches/ext4-mballoc-extra-checks-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-mballoc-extra-checks-rhel5.patch deleted file mode 100644 index 9a9466f..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-mballoc-extra-checks-rhel5.patch +++ /dev/null @@ -1,352 +0,0 @@ -diff -rupN linux-2.6.18-128.1.6_1//fs/ext4/mballoc.c linux-2.6.18-128.1.6_2//fs/ext4/mballoc.c ---- linux-2.6.18-128.1.6_1//fs/ext4/mballoc.c -+++ linux-2.6.18-128.1.6_2//fs/ext4/mballoc.c -@@ -360,8 +360,8 @@ static void ext4_mb_mark_free_simple(str - static struct kmem_cache *ext4_pspace_cachep; - static struct kmem_cache *ext4_ac_cachep; - static struct kmem_cache *ext4_free_ext_cachep; --static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, -- ext4_group_t group); -+static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, -+ ext4_group_t group); - static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, - ext4_group_t group); - -@@ -660,7 +660,7 @@ static void ext4_mb_mark_free_simple(str - } - - static noinline_for_stack --void ext4_mb_generate_buddy(struct super_block *sb, -+int ext4_mb_generate_buddy(struct super_block *sb, - void *buddy, void *bitmap, ext4_group_t group) - { - struct ext4_group_info *grp = ext4_get_group_info(sb, group); -@@ -692,14 +692,14 @@ static void ext4_mb_generate_buddy(struc - grp->bb_fragments = fragments; - - if (free != grp->bb_free) { -- ext4_grp_locked_error(sb, group, __func__, -- "EXT4-fs: group %u: %u blocks in bitmap, %u in gd", -- group, free, grp->bb_free); -- /* -- * If we intent to continue, we consider group descritor -- * corrupt and update bb_free using bitmap value -- */ -- grp->bb_free = free; -+ struct ext4_group_desc *gdp; -+ gdp = ext4_get_group_desc (sb, group, NULL); -+ ext4_error(sb, -+ "group %lu: %u blocks in bitmap, %u in bb, " -+ "%u in gd, %lu pa's\n", (long unsigned int)group, -+ free, grp->bb_free, ext4_free_blks_count(sb, gdp), -+ grp->bb_prealloc_nr); -+ return -EIO; - } - - clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); -@@ -709,6 +709,8 @@ static void ext4_mb_generate_buddy(struc - EXT4_SB(sb)->s_mb_buddies_generated++; - EXT4_SB(sb)->s_mb_generation_time += period; - spin_unlock(&EXT4_SB(sb)->s_bal_lock); -+ -+ return 0; - } - - /* The buddy information is attached the buddy cache inode -@@ -814,7 +816,7 @@ static int ext4_mb_init_cache(struct pag - first_block = page->index * blocks_per_page; - /* init the page */ - memset(page_address(page), 0xff, PAGE_CACHE_SIZE); -- for (i = 0; i < blocks_per_page; i++) { -+ for (i = 0; i < blocks_per_page && err == 0; i++) { - int group; - struct ext4_group_info *grinfo; - -@@ -848,7 +850,7 @@ static int ext4_mb_init_cache(struct pag - * incore got set to the group block bitmap below - */ - ext4_lock_group(sb, group); -- ext4_mb_generate_buddy(sb, data, incore, group); -+ err = ext4_mb_generate_buddy(sb, data, incore, group); - ext4_unlock_group(sb, group); - incore = NULL; - } else { -@@ -861,7 +863,7 @@ static int ext4_mb_init_cache(struct pag - memcpy(data, bitmap, blocksize); - - /* mark all preallocated blks used in in-core bitmap */ -- ext4_mb_generate_from_pa(sb, data, group); -+ err = ext4_mb_generate_from_pa(sb, data, group); - ext4_mb_generate_from_freelist(sb, data, group); - ext4_unlock_group(sb, group); - -@@ -870,6 +872,7 @@ static int ext4_mb_init_cache(struct pag - incore = data; - } - } -+ if (likely(err == 0)) - SetPageUptodate(page); - - out: -@@ -1964,7 +1967,10 @@ static int ext4_mb_seq_history_show(stru - hs->result.fe_start, hs->result.fe_len); - seq_printf(seq, "%-5u %-8u %-23s free\n", - hs->pid, hs->ino, buf2); -+ } else { -+ seq_printf(seq, "unknown op %d\n", hs->op); - } -+ - return 0; - } - -@@ -2092,9 +2098,11 @@ static void *ext4_mb_seq_groups_next(str - static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) - { - struct super_block *sb = seq->private; -+ struct ext4_group_desc *gdp; - ext4_group_t group = (ext4_group_t) ((unsigned long) v); - int i; - int err; -+ int free = 0; - struct ext4_buddy e4b; - struct sg { - struct ext4_group_info info; -@@ -2103,10 +2111,10 @@ static int ext4_mb_seq_groups_show(struc - - group--; - if (group == 0) -- seq_printf(seq, "#%-5s: %-5s %-5s %-5s " -+ seq_printf(seq, "#%-5s: %-5s %-5s %-5s %-5s %-5s" - "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " - "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", -- "group", "free", "frags", "first", -+ "group", "free", "frags", "first", "first", "pa", - "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", - "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); - -@@ -2117,13 +2125,20 @@ static int ext4_mb_seq_groups_show(struc - seq_printf(seq, "#%-5lu: I/O error\n", group); - return 0; - } -+ -+ gdp = ext4_get_group_desc(sb, group, NULL); -+ if (gdp != NULL) -+ free = ext4_free_blks_count(sb, gdp); -+ - ext4_lock_group(sb, group); - memcpy(&sg, ext4_get_group_info(sb, group), i); - ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); - -- seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, -- sg.info.bb_fragments, sg.info.bb_first_free); -+ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u %-5u %-5lu [", -+ (long unsigned int)group, sg.info.bb_free, free, -+ sg.info.bb_fragments, sg.info.bb_first_free, -+ sg.info.bb_prealloc_nr); - for (i = 0; i <= 13; i++) - seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? - sg.info.bb_counters[i] : 0); -@@ -2226,6 +2241,7 @@ ext4_mb_store_history(struct ext4_alloca - h.tail = ac->ac_tail; - h.buddy = ac->ac_buddy; - h.merged = 0; -+ h.cr = ac->ac_criteria; - if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) { - if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && - ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) -@@ -3539,23 +3555,68 @@ ext4_mb_use_preallocated(struct ext4_all - } - - /* -+ * check free blocks in bitmap match free block in group descriptor -+ * do this before taking preallocated blocks into account to be able -+ * to detect on-disk corruptions. The group lock should be hold by the -+ * caller. -+ */ -+int ext4_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap, -+ struct ext4_group_desc *gdp, int group) -+{ -+ unsigned short max = EXT4_BLOCKS_PER_GROUP(sb); -+ unsigned short i, first, free = 0; -+ -+ i = mb_find_next_zero_bit(bitmap, max, 0); -+ -+ while (i < max) { -+ first = i; -+ i = mb_find_next_bit(bitmap, max, i); -+ if (i > max) -+ i = max; -+ free += i - first; -+ if (i < max) -+ i = mb_find_next_zero_bit(bitmap, max, i); -+ } -+ -+ if (free != ext4_free_blks_count(sb, gdp)) { -+ ext4_error(sb, "on-disk bitmap for group %d" -+ "corrupted: %u blocks free in bitmap, %u - in gd\n", -+ group, free, ext4_free_blks_count(sb, gdp)); -+ return -EIO; -+ } -+ return 0; -+} -+ -+/* - * the function goes through all preallocation in this group and marks them - * used in in-core bitmap. buddy must be generated from this bitmap - * Need to be called with ext4 group lock held - */ - static noinline_for_stack --void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, -+int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, - ext4_group_t group) - { - struct ext4_group_info *grp = ext4_get_group_info(sb, group); - struct ext4_prealloc_space *pa; -+ struct ext4_group_desc *gdp; - struct list_head *cur; - ext4_group_t groupnr; - ext4_grpblk_t start; - int preallocated = 0; - int count = 0; -+ int skip = 0; -+ int err; - int len; - -+ gdp = ext4_get_group_desc (sb, group, NULL); -+ if (gdp == NULL) -+ return -EIO; -+ -+ /* before applying preallocations, check bitmap consistency */ -+ err = ext4_mb_check_ondisk_bitmap(sb, bitmap, gdp, group); -+ if (err) -+ return err; -+ - /* all form of preallocation discards first load group, - * so the only competing code is preallocation use. - * we don't need any locking here -@@ -3570,14 +3631,23 @@ static void ext4_mb_generate_from_pa(str - &groupnr, &start); - len = pa->pa_len; - spin_unlock(&pa->pa_lock); -- if (unlikely(len == 0)) -+ if (unlikely(len == 0)) { -+ skip++; - continue; -+ } - BUG_ON(groupnr != group); - mb_set_bits(bitmap, start, len); - preallocated += len; - count++; - } -+ if (count + skip != grp->bb_prealloc_nr) { -+ ext4_error(sb, "lost preallocations: " -+ "count %d, bb_prealloc_nr %lu, skip %d\n", -+ count, grp->bb_prealloc_nr, skip); -+ return -EIO; -+ } - mb_debug(1, "prellocated %u for group %u\n", preallocated, group); -+ return 0; - } - - static void ext4_mb_pa_callback(struct rcu_head *head) -@@ -3629,6 +3699,7 @@ static void ext4_mb_put_pa(struct ext4_a - */ - ext4_lock_group(sb, grp); - list_del(&pa->pa_group_list); -+ ext4_get_group_info(sb, grp)->bb_prealloc_nr--; - ext4_unlock_group(sb, grp); - - spin_lock(pa->pa_obj_lock); -@@ -3717,6 +3788,7 @@ ext4_mb_new_inode_pa(struct ext4_allocat - - ext4_lock_group(sb, ac->ac_b_ex.fe_group); - list_add(&pa->pa_group_list, &grp->bb_prealloc_list); -+ grp->bb_prealloc_nr++; - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); - - spin_lock(pa->pa_obj_lock); -@@ -3776,6 +3848,7 @@ ext4_mb_new_group_pa(struct ext4_allocat - - ext4_lock_group(sb, ac->ac_b_ex.fe_group); - list_add(&pa->pa_group_list, &grp->bb_prealloc_list); -+ grp->bb_prealloc_nr++; - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); - - /* -@@ -3828,6 +3901,7 @@ ext4_mb_release_inode_pa(struct ext4_bud - ac->ac_sb = sb; - ac->ac_inode = pa->pa_inode; - ac->ac_op = EXT4_MB_HISTORY_DISCARD; -+ ac->ac_o_ex.fe_len = 1; - } - - while (bit < end) { -@@ -3972,6 +4046,8 @@ repeat: - - spin_unlock(&pa->pa_lock); - -+ BUG_ON(grp->bb_prealloc_nr == 0); -+ grp->bb_prealloc_nr--; - list_del(&pa->pa_group_list); - list_add(&pa->u.pa_tmp_list, &list); - } -@@ -4107,7 +4183,7 @@ repeat: - if (err) { - ext4_error(sb, "Error loading buddy information for %u", - group); -- continue; -+ return; - } - - bitmap_bh = ext4_read_block_bitmap(sb, group); -@@ -4119,6 +4195,8 @@ repeat: - } - - ext4_lock_group(sb, group); -+ BUG_ON(e4b.bd_info->bb_prealloc_nr == 0); -+ e4b.bd_info->bb_prealloc_nr--; - list_del(&pa->pa_group_list); - ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); - ext4_unlock_group(sb, group); -@@ -4394,6 +4472,7 @@ ext4_mb_discard_lg_preallocations(struct - } - ext4_lock_group(sb, group); - list_del(&pa->pa_group_list); -+ ext4_get_group_info(sb, group)->bb_prealloc_nr--; - ext4_mb_release_group_pa(&e4b, pa, ac); - ext4_unlock_group(sb, group); - -diff -rupN linux-2.6.18-128.1.6/fs/ext4/ext4.h ---- linux-2.6.18-128.1.6.orig/fs/ext4/ext4.h -+++ linux-2.6.18-128.1.6/fs/ext4/ext4.h -@@ -119,6 +119,7 @@ struct ext4_group_info { - unsigned short bb_free; - unsigned short bb_fragments; - struct list_head bb_prealloc_list; -+ unsigned long bb_prealloc_nr; - #ifdef DOUBLE_CHECK - void *bb_bitmap; - #endif -Index: linux-2.6.18-128.1.6/fs/ext4/mballoc.h -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/ext4/mballoc.h -+++ linux-2.6.18-128.1.6/fs/ext4/mballoc.h -@@ -92,7 +92,7 @@ - /* - * for which requests use 2^N search using buddies - */ --#define MB_DEFAULT_ORDER2_REQS 2 -+#define MB_DEFAULT_ORDER2_REQS 8 - - /* - * default group prealloc size 512 blocks -@@ -228,7 +229,7 @@ struct ext4_mb_history { - __u16 tail; /* what tail broke some buddy */ - __u16 buddy; /* buddy the tail ^^^ broke */ - __u16 flags; -- __u8 cr:3; /* which phase the result extent was found at */ -+ __u8 cr:8; /* which phase the result extent was found at */ - __u8 op:4; - __u8 merged:1; - }; diff --git a/ldiskfs/kernel_patches/patches/ext4-mballoc-group_check-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-mballoc-group_check-rhel5.patch deleted file mode 100644 index 3b9de5c..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-mballoc-group_check-rhel5.patch +++ /dev/null @@ -1,320 +0,0 @@ -commit 8a57d9d61a6e361c7bb159dda797672c1df1a691 -Author: Curt Wohlgemuth -Date: Sun May 16 15:00:00 2010 -0400 - - ext4: check for a good block group before loading buddy pages - - This adds a new field in ext4_group_info to cache the largest available - block range in a block group; and don't load the buddy pages until *after* - we've done a sanity check on the block group. - - With large allocation requests (e.g., fallocate(), 8MiB) and relatively full - partitions, it's easy to have no block groups with a block extent large - enough to satisfy the input request length. This currently causes the loop - during cr == 0 in ext4_mb_regular_allocator() to load the buddy bitmap pages - for EVERY block group. That can be a lot of pages. The patch below allows - us to call ext4_mb_good_group() BEFORE we load the buddy pages (although we - have check again after we lock the block group). - - Addresses-Google-Bug: #2578108 - Addresses-Google-Bug: #2704453 - - Signed-off-by: Curt Wohlgemuth - Signed-off-by: "Theodore Ts'o" - -Index: linux-2.6.32/fs/ext4/ext4.h -=================================================================== ---- linux-2.6.32.orig/fs/ext4/ext4.h 2009-12-02 20:51:21.000000000 -0700 -+++ linux-2.6.32/fs/ext4/ext4.h 2011-02-17 23:54:52.708097710 -0700 -@@ -1625,6 +1625,7 @@ struct ext4_group_info { - ext4_grpblk_t bb_first_free; /* first free block */ - ext4_grpblk_t bb_free; /* total free blocks */ - ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ -+ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ - struct list_head bb_prealloc_list; - #ifdef DOUBLE_CHECK - void *bb_bitmap; -Index: linux-2.6.32/fs/ext4/mballoc.c -=================================================================== ---- linux-2.6.32.orig/fs/ext4/mballoc.c 2009-12-02 20:51:21.000000000 -0700 -+++ linux-2.6.32/fs/ext4/mballoc.c 2011-02-18 00:41:06.872097644 -0700 -@@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(str - } - } - -+/* -+ * Cache the order of the largest free extent we have available in this block -+ * group. -+ */ -+static void -+mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) -+{ -+ int i; -+ int bits; -+ -+ grp->bb_largest_free_order = -1; /* uninit */ -+ -+ bits = sb->s_blocksize_bits + 1; -+ for (i = bits; i >= 0; i--) { -+ if (grp->bb_counters[i] > 0) { -+ grp->bb_largest_free_order = i; -+ break; -+ } -+ } -+} -+ - static noinline_for_stack - void ext4_mb_generate_buddy(struct super_block *sb, - void *buddy, void *bitmap, ext4_group_t group) -@@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super - */ - grp->bb_free = free; - } -+ mb_set_largest_free_order(sb, grp); - - clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); - -@@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super - * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. - * So it can have information regarding groups_per_page which - * is blocks_per_page/2 -+ * -+ * Locking note: This routine takes the block group lock of all groups -+ * for this page; do not hold this lock when calling this routine! - */ - - static int ext4_mb_init_cache(struct page *page, char *incore) -@@ -910,6 +935,11 @@ out: - return err; - } - -+/* -+ * Locking note: This routine calls ext4_mb_init_cache(), which takes the -+ * block group lock of all groups for this page; do not hold the BG lock when -+ * calling this routine! -+ */ - static noinline_for_stack - int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) - { -@@ -1004,6 +1034,11 @@ err: - return ret; - } - -+/* -+ * Locking note: This routine calls ext4_mb_init_cache(), which takes the -+ * block group lock of all groups for this page; do not hold the BG lock when -+ * calling this routine! -+ */ - static noinline_for_stack int - ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, - struct ext4_buddy *e4b) -@@ -1150,7 +1185,7 @@ err: - return ret; - } - --static void ext4_mb_release_desc(struct ext4_buddy *e4b) -+static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) - { - if (e4b->bd_bitmap_page) - page_cache_release(e4b->bd_bitmap_page); -@@ -1300,6 +1335,7 @@ static void mb_free_blocks(struct inode - buddy = buddy2; - } while (1); - } -+ mb_set_largest_free_order(sb, e4b->bd_info); - mb_check_buddy(e4b); - } - -@@ -1428,6 +1464,7 @@ static int mb_mark_used(struct ext4_budd - e4b->bd_info->bb_counters[ord]++; - e4b->bd_info->bb_counters[ord]++; - } -+ mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); - - mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); - mb_check_buddy(e4b); -@@ -1618,7 +1655,7 @@ int ext4_mb_try_best_found(struct ext4_a - } - - ext4_unlock_group(ac->ac_sb, group); -- ext4_mb_release_desc(e4b); -+ ext4_mb_unload_buddy(e4b); - - return 0; - } -@@ -1674,7 +1711,7 @@ int ext4_mb_find_by_goal(struct ext4_all - ext4_mb_use_best_found(ac, e4b); - } - ext4_unlock_group(ac->ac_sb, group); -- ext4_mb_release_desc(e4b); -+ ext4_mb_unload_buddy(e4b); - - return 0; - } -@@ -1823,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_al - } - } - -+/* This is now called BEFORE we load the buddy bitmap. */ - static int ext4_mb_good_group(struct ext4_allocation_context *ac, - ext4_group_t group, int cr) - { - unsigned free, fragments; -- unsigned i, bits; - int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); - struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); - - BUG_ON(cr < 0 || cr >= 4); -- BUG_ON(EXT4_MB_GRP_NEED_INIT(grp)); -+ -+ /* We only do this if the grp has never been initialized */ -+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { -+ int ret = ext4_mb_init_group(ac->ac_sb, group); -+ if (ret) -+ return 0; -+ } - - free = grp->bb_free; - fragments = grp->bb_fragments; -@@ -1845,17 +1888,16 @@ static int ext4_mb_good_group(struct ext - case 0: - BUG_ON(ac->ac_2order == 0); - -+ if (grp->bb_largest_free_order < ac->ac_2order) -+ return 0; -+ - /* Avoid using the first bg of a flexgroup for data files */ - if ((ac->ac_flags & EXT4_MB_HINT_DATA) && - (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && - ((group % flex_size) == 0)) - return 0; - -- bits = ac->ac_sb->s_blocksize_bits + 1; -- for (i = ac->ac_2order; i <= bits; i++) -- if (grp->bb_counters[i] > 0) -- return 1; -- break; -+ return 1; - case 1: - if ((free / fragments) >= ac->ac_g_ex.fe_len) - return 1; -@@ -2026,15 +2068,11 @@ repeat: - group = ac->ac_g_ex.fe_group; - - for (i = 0; i < ngroups; group++, i++) { -- struct ext4_group_info *grp; -- struct ext4_group_desc *desc; -- - if (group == ngroups) - group = 0; - -- /* quick check to skip empty groups */ -- grp = ext4_get_group_info(sb, group); -- if (grp->bb_free == 0) -+ /* This now checks without needing the buddy page */ -+ if (!ext4_mb_good_group(ac, group, cr)) - continue; - - err = ext4_mb_load_buddy(sb, group, &e4b); -@@ -2042,15 +2080,18 @@ repeat: - goto out; - - ext4_lock_group(sb, group); -+ -+ /* -+ * We need to check again after locking the -+ * block group -+ */ - if (!ext4_mb_good_group(ac, group, cr)) { -- /* someone did allocation from this group */ - ext4_unlock_group(sb, group); -- ext4_mb_release_desc(&e4b); -+ ext4_mb_unload_buddy(&e4b); - continue; - } - - ac->ac_groups_scanned++; -- desc = ext4_get_group_desc(sb, group, NULL); - if (cr == 0) - ext4_mb_simple_scan_group(ac, &e4b); - else if (cr == 1 && -@@ -2060,7 +2101,7 @@ repeat: - ext4_mb_complex_scan_group(ac, &e4b); - - ext4_unlock_group(sb, group); -- ext4_mb_release_desc(&e4b); -+ ext4_mb_unload_buddy(&e4b); - - if (ac->ac_status != AC_STATUS_CONTINUE) - break; -@@ -2150,7 +2191,7 @@ static int ext4_mb_seq_groups_show(struc - ext4_lock_group(sb, group); - memcpy(&sg, ext4_get_group_info(sb, group), i); - ext4_unlock_group(sb, group); -- ext4_mb_release_desc(&e4b); -+ ext4_mb_unload_buddy(&e4b); - - seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, - sg.info.bb_fragments, sg.info.bb_first_free); -@@ -2257,6 +2298,7 @@ int ext4_mb_add_groupinfo(struct super_b - INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); - init_rwsem(&meta_group_info[i]->alloc_sem); - meta_group_info[i]->bb_free_root = RB_ROOT; -+ meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ - - #ifdef DOUBLE_CHECK - { -@@ -2567,7 +2609,7 @@ static void release_blocks_on_commit(jou - sb_issue_discard(sb, discard_block, entry->count); - - kmem_cache_free(ext4_free_ext_cachep, entry); -- ext4_mb_release_desc(&e4b); -+ ext4_mb_unload_buddy(&e4b); - } - - mb_debug(1, "freed %u blocks in %u structures\n", count, count2); -@@ -3692,7 +3734,7 @@ out: - ext4_unlock_group(sb, group); - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); -- ext4_mb_release_desc(&e4b); -+ ext4_mb_unload_buddy(&e4b); - put_bh(bitmap_bh); - return free; - } -@@ -3796,7 +3838,7 @@ repeat: - if (bitmap_bh == NULL) { - ext4_error(sb, "Error reading block bitmap for %u", - group); -- ext4_mb_release_desc(&e4b); -+ ext4_mb_unload_buddy(&e4b); - continue; - } - -@@ -3805,7 +3847,7 @@ repeat: - ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); - ext4_unlock_group(sb, group); - -- ext4_mb_release_desc(&e4b); -+ ext4_mb_unload_buddy(&e4b); - put_bh(bitmap_bh); - - list_del(&pa->u.pa_tmp_list); -@@ -4069,7 +4111,7 @@ ext4_mb_discard_lg_preallocations(struct - ext4_mb_release_group_pa(&e4b, pa, ac); - ext4_unlock_group(sb, group); - -- ext4_mb_release_desc(&e4b); -+ ext4_mb_unload_buddy(&e4b); - list_del(&pa->u.pa_tmp_list); - call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); - } -@@ -4570,7 +4612,7 @@ do_more: - atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); - } - -- ext4_mb_release_desc(&e4b); -+ ext4_mb_unload_buddy(&e4b); - - *freed += count; - diff --git a/ldiskfs/kernel_patches/patches/ext4-mballoc-pa_free-mismatch.patch b/ldiskfs/kernel_patches/patches/ext4-mballoc-pa_free-mismatch.patch deleted file mode 100644 index ff6ef5f..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-mballoc-pa_free-mismatch.patch +++ /dev/null @@ -1,108 +0,0 @@ -Index: linux-stage/fs/ext4/mballoc.c -=================================================================== ---- linux-stage.orig/fs/ext4/mballoc.c 2010-01-26 22:50:37.000000000 +0800 -+++ linux-stage/fs/ext4/mballoc.c 2010-01-26 22:57:24.000000000 +0800 -@@ -3892,6 +3892,7 @@ - INIT_LIST_HEAD(&pa->pa_group_list); - pa->pa_deleted = 0; - pa->pa_type = MB_INODE_PA; -+ pa->pa_error = 0; - - mb_debug("new inode pa %p: %llu/%u for %u\n", pa, - pa->pa_pstart, pa->pa_len, pa->pa_lstart); -@@ -3956,6 +3957,7 @@ - INIT_LIST_HEAD(&pa->pa_group_list); - pa->pa_deleted = 0; - pa->pa_type = MB_GROUP_PA; -+ pa->pa_error = 0; - - mb_debug("new group pa %p: %llu/%u for %u\n", pa, - pa->pa_pstart, pa->pa_len, pa->pa_lstart); -@@ -4019,7 +4021,9 @@ - int err = 0; - int free = 0; - -+ assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); - BUG_ON(pa->pa_deleted == 0); -+ BUG_ON(pa->pa_inode == NULL); - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); - grp_blk_start = pa->pa_pstart - bit; - BUG_ON(group != e4b->bd_group && pa->pa_len != 0); -@@ -4059,11 +4064,18 @@ - mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); - bit = next + 1; - } -- if (free != pa->pa_free) { -- printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", -- pa, (unsigned long) pa->pa_lstart, -- (unsigned long) pa->pa_pstart, -- (unsigned long) pa->pa_len); -+ -+ /* "free < pa->pa_free" means we maybe double alloc the same blocks, -+ * otherwise maybe leave some free blocks unavailable, no need to BUG.*/ -+ if ((free > pa->pa_free && !pa->pa_error) || (free < pa->pa_free)) { -+ ext4_error(sb,"pa free mismatch: [pa %p] " -+ "[phy %lu] [logic %lu] [len %u] [free %u] " -+ "[error %u] [inode %lu] [freed %u]", pa, -+ (unsigned long)pa->pa_pstart, -+ (unsigned long)pa->pa_lstart, -+ (unsigned)pa->pa_len, (unsigned)pa->pa_free, -+ (unsigned)pa->pa_error, pa->pa_inode->i_ino, -+ free); - ext4_grp_locked_error(sb, group, - __func__, "free %u, pa_free %u", - free, pa->pa_free); -@@ -4072,6 +4084,7 @@ - * from the bitmap and continue. - */ - } -+ BUG_ON(pa->pa_free != free); - atomic_add(free, &sbi->s_mb_discarded); - - return err; -@@ -4832,6 +4863,25 @@ - ac->ac_b_ex.fe_len = 0; - ar->len = 0; - ext4_mb_show_ac(ac); -+ if (ac->ac_pa) { -+ struct ext4_prealloc_space *pa = ac->ac_pa; -+ -+ /* We can not make sure whether the bitmap has -+ * been updated or not when fail case. So can -+ * not revert pa_free back, just mark pa_error*/ -+ pa->pa_error++; -+ ext4_error(sb, -+ "Updating bitmap error: [err %d] " -+ "[pa %p] [phy %lu] [logic %lu] " -+ "[len %u] [free %u] [error %u] " -+ "[inode %lu]", *errp, pa, -+ (unsigned long)pa->pa_pstart, -+ (unsigned long)pa->pa_lstart, -+ (unsigned)pa->pa_len, -+ (unsigned)pa->pa_free, -+ (unsigned)pa->pa_error, -+ pa->pa_inode ? pa->pa_inode->i_ino : 0); -+ } - } else { - block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); - ar->len = ac->ac_b_ex.fe_len; -Index: linux-stage/fs/ext4/mballoc.h -=================================================================== ---- linux-stage.orig/fs/ext4/mballoc.h 2010-01-26 22:50:36.000000000 +0800 -+++ linux-stage/fs/ext4/mballoc.h 2010-01-26 22:52:58.000000000 +0800 -@@ -21,6 +21,7 @@ - #include - #include - #include -+#include - #include "ext4_jbd2.h" - #include "ext4.h" - #include "group.h" -@@ -134,6 +135,7 @@ - ext4_grpblk_t pa_len; /* len of preallocated chunk */ - ext4_grpblk_t pa_free; /* how many blocks are free */ - unsigned short pa_type; /* pa type. inode or group */ -+ unsigned short pa_error; - spinlock_t *pa_obj_lock; - struct inode *pa_inode; /* hack, for history only */ - }; diff --git a/ldiskfs/kernel_patches/patches/ext4-misc-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-misc-rhel5.patch deleted file mode 100644 index e77314e..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-misc-rhel5.patch +++ /dev/null @@ -1,330 +0,0 @@ -Index: linux-stage/fs/ext4/ext4_jbd2.h -=================================================================== ---- linux-stage.orig/fs/ext4/ext4_jbd2.h -+++ linux-stage/fs/ext4/ext4_jbd2.h -@@ -35,6 +35,8 @@ - (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ - ? 27U : 8U) - -+#define ext4_journal_dirty_metadata(handle, bh) \ -+ ext4_handle_dirty_metadata(handle, NULL, bh) - /* Extended attribute operations touch at most two data buffers, - * two bitmap buffers, and two group summaries, in addition to the inode - * and the superblock, which are already accounted for. */ -Index: linux-stage/fs/ext4/extents.c -=================================================================== ---- linux-stage.orig/fs/ext4/extents.c -+++ linux-stage/fs/ext4/extents.c -@@ -59,6 +59,17 @@ ext4_fsblk_t ext_pblock(struct ext4_exte - } - - /* -+ * ext4_ext_store_pblock: -+ * stores a large physical block number into an extent struct, -+ * breaking it into parts -+ */ -+void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb) -+{ -+ ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); -+ ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); -+} -+ -+/* - * idx_pblock: - * combine low and high parts of a leaf physical block number into ext4_fsblk_t - */ -@@ -72,17 +83,6 @@ ext4_fsblk_t idx_pblock(struct ext4_exte - } - - /* -- * ext4_ext_store_pblock: -- * stores a large physical block number into an extent struct, -- * breaking it into parts -- */ --void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb) --{ -- ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); -- ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); --} -- --/* - * ext4_idx_store_pblock: - * stores a large physical block number into an index struct, - * breaking it into parts -@@ -2097,6 +2097,56 @@ static int ext4_ext_rm_idx(handle_t *han - } - - /* -+ * This routine returns max. credits extent tree can consume. -+ * It should be OK for low-performance paths like ->writepage() -+ * To allow many writing process to fit a single transaction, -+ * caller should calculate credits under truncate_mutex and -+ * pass actual path. -+ */ -+int ext4_ext_calc_credits_for_insert(struct inode *inode, -+ struct ext4_ext_path *path) -+{ -+ int depth, needed; -+ -+ if (path) { -+ /* probably there is space in leaf? */ -+ depth = ext_depth(inode); -+ if (le16_to_cpu(path[depth].p_hdr->eh_entries) -+ < le16_to_cpu(path[depth].p_hdr->eh_max)) -+ return 1; -+ } -+ -+ /* -+ * given 32bit logical block (4294967296 blocks), max. tree -+ * can be 4 levels in depth -- 4 * 340^4 == 53453440000. -+ * let's also add one more level for imbalance. -+ */ -+ depth = 5; -+ -+ /* allocation of new data block(s) */ -+ needed = 2; -+ -+ /* -+ * tree can be full, so it'd need to grow in depth: -+ * we need one credit to modify old root, credits for -+ * new root will be added in split accounting -+ */ -+ needed += 1; -+ -+ /* -+ * Index split can happen, we'd need: -+ * allocate intermediate indexes (bitmap + group) -+ * + change two blocks at each level, but root (already included) -+ */ -+ needed += (depth * 2) + (depth * 2); -+ -+ /* any allocation modifies superblock */ -+ needed += 1; -+ -+ return needed; -+} -+ -+/* - * ext4_ext_calc_credits_for_single_extent: - * This routine returns max. credits that needed to insert an extent - * to the extent tree. -@@ -3941,3 +3991,15 @@ int ext4_fiemap(struct inode *inode, str - return error; - } - -+EXPORT_SYMBOL(ext4_ext_store_pblock); -+EXPORT_SYMBOL(ext4_ext_search_right); -+EXPORT_SYMBOL(ext4_ext_search_left); -+EXPORT_SYMBOL(ext_pblock); -+EXPORT_SYMBOL(ext4_ext_insert_extent); -+EXPORT_SYMBOL(ext4_mb_new_blocks); -+EXPORT_SYMBOL(ext4_ext_walk_space); -+EXPORT_SYMBOL(ext4_ext_calc_credits_for_insert); -+EXPORT_SYMBOL(ext4_mark_inode_dirty); -+EXPORT_SYMBOL(ext4_ext_find_extent); -+EXPORT_SYMBOL(ext4_ext_drop_refs); -+ -Index: linux-stage/fs/ext4/ext4_extents.h -=================================================================== ---- linux-stage.orig/fs/ext4/ext4_extents.h -+++ linux-stage/fs/ext4/ext4_extents.h -@@ -58,6 +58,12 @@ - */ - #define EXT_STATS_ - -+/* -+ * define EXT4_ALLOC_NEEDED to 0 since block bitmap, group desc. and sb -+ * are now accounted in ext4_ext_calc_credits_for_insert() -+ */ -+#define EXT4_ALLOC_NEEDED 0 -+#define HAVE_EXT_PREPARE_CB_EXTENT - - /* - * ext4_inode has i_block array (60 bytes total). -@@ -231,6 +237,8 @@ extern ext4_fsblk_t ext_pblock(struct ex - extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); - extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); - extern int ext4_extent_tree_init(handle_t *, struct inode *); -+extern int ext4_ext_calc_credits_for_insert(struct inode *, -+ struct ext4_ext_path *); - extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, - int num, - struct ext4_ext_path *path); -Index: linux-stage/fs/ext4/mballoc.c -=================================================================== ---- linux-stage.orig/fs/ext4/mballoc.c -+++ linux-stage/fs/ext4/mballoc.c -@@ -4313,6 +4313,7 @@ repeat: - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); - } -+EXPORT_SYMBOL(ext4_discard_preallocations); - - /* - * finds all preallocated spaces and return blocks being freed to them -@@ -5127,3 +5128,6 @@ error_return: - kmem_cache_free(ext4_ac_cachep, ac); - return; - } -+ -+EXPORT_SYMBOL(ext4_free_blocks); -+ -Index: linux-stage/fs/ext4/ext4_jbd2.c -=================================================================== ---- linux-stage.orig/fs/ext4/ext4_jbd2.c -+++ linux-stage/fs/ext4/ext4_jbd2.c -@@ -31,6 +31,7 @@ int __ext4_journal_get_write_access(cons - } - return err; - } -+EXPORT_SYMBOL(__ext4_journal_get_write_access); - - int __ext4_journal_forget(const char *where, handle_t *handle, - struct buffer_head *bh) -@@ -107,3 +108,4 @@ int __ext4_handle_dirty_metadata(const c - } - return err; - } -+EXPORT_SYMBOL(__ext4_handle_dirty_metadata); -Index: linux-stage/fs/ext4/ext4.h -=================================================================== ---- linux-stage.orig/fs/ext4/ext4.h -+++ linux-stage/fs/ext4/ext4.h -@@ -1528,6 +1528,8 @@ extern int ext4_mb_add_groupinfo(struct - extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); - extern void ext4_mb_put_buddy_cache_lock(struct super_block *, - ext4_group_t, int); -+extern void ext4_mb_discard_inode_preallocations(struct inode *); -+ - /* inode.c */ - int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, - struct buffer_head *bh, ext4_fsblk_t blocknr); -Index: linux-stage/fs/ext4/inode.c -=================================================================== ---- linux-stage.orig/fs/ext4/inode.c -+++ linux-stage/fs/ext4/inode.c -@@ -5078,6 +5078,7 @@ bad_inode: - iget_failed(inode); - return ERR_PTR(ret); - } -+EXPORT_SYMBOL(ext4_iget); - - static int ext4_inode_blocks_set(handle_t *handle, - struct ext4_inode *raw_inode, -Index: linux-stage/fs/ext4/super.c -=================================================================== ---- linux-stage.orig/fs/ext4/super.c -+++ linux-stage/fs/ext4/super.c -@@ -90,6 +90,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct su - (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); - } -+EXPORT_SYMBOL(ext4_inode_bitmap); - - ext4_fsblk_t ext4_inode_table(struct super_block *sb, - struct ext4_group_desc *bg) -@@ -114,6 +115,7 @@ __u32 ext4_free_inodes_count(struct supe - (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); - } -+EXPORT_SYMBOL(ext4_itable_unused_count); - - __u32 ext4_used_dirs_count(struct super_block *sb, - struct ext4_group_desc *bg) -@@ -1489,9 +1491,11 @@ enum { - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, - Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, - Opt_usrquota, Opt_grpquota, Opt_i_version, -+ Opt_mballoc, Opt_extents, - Opt_stripe, Opt_delalloc, Opt_nodelalloc, - Opt_block_validity, Opt_noblock_validity, -- Opt_inode_readahead_blks, Opt_journal_ioprio -+ Opt_inode_readahead_blks, Opt_journal_ioprio, -+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, - }; - - static match_table_t tokens = { -@@ -1547,6 +1551,11 @@ static match_table_t tokens = { - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_i_version, "i_version"}, -+ {Opt_mballoc, "mballoc"}, -+ {Opt_extents, "extents"}, -+ {Opt_iopen, "iopen"}, -+ {Opt_noiopen, "noiopen"}, -+ {Opt_iopen_nopriv, "iopen_nopriv"}, - {Opt_stripe, "stripe=%u"}, - {Opt_resize, "resize"}, - {Opt_delalloc, "delalloc"}, -@@ -1993,6 +2002,12 @@ set_qf_format: - else - set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); - break; -+ case Opt_mballoc: -+ case Opt_extents: -+ case Opt_iopen: -+ case Opt_noiopen: -+ case Opt_iopen_nopriv: -+ break; - default: - ext4_msg(sb, KERN_ERR, - "Unrecognized mount option \"%s\" " -@@ -2543,7 +2558,7 @@ static ssize_t delayed_allocation_blocks - char *buf) - { - return snprintf(buf, PAGE_SIZE, "%llu\n", -- (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); -+ (unsigned long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); - } - - static ssize_t session_write_kbytes_show(struct ext4_attr *a, -@@ -2564,11 +2579,11 @@ static ssize_t lifetime_write_kbytes_sho - struct super_block *sb = sbi->s_buddy_cache->i_sb; - - return snprintf(buf, PAGE_SIZE, "%llu\n", -- sbi->s_kbytes_written + -+ (unsigned long long)(sbi->s_kbytes_written + - (sb->s_bdev->bd_part ? - (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - - EXT4_SB(sb)->s_sectors_written_start) >> 1 -- : 0)); -+ : 0))); - } - - static ssize_t inode_readahead_blks_store(struct ext4_attr *a, -@@ -3042,7 +3057,7 @@ static int ext4_fill_super(struct super_ - if (blocks_count && ext4_blocks_count(es) > blocks_count) { - ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " - "exceeds size of device (%llu blocks)", -- ext4_blocks_count(es), blocks_count); -+ ext4_blocks_count(es), (unsigned long long)blocks_count); - goto failed_mount; - } - -Index: linux-stage/fs/ext4/fsync.c -=================================================================== ---- linux-stage.orig/fs/ext4/fsync.c -+++ linux-stage/fs/ext4/fsync.c -@@ -61,7 +61,7 @@ int ext4_sync_file(struct file *file, st - - trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld", - inode->i_sb->s_id, datasync, inode->i_ino, -- dentry->d_parent->d_inode->i_ino); -+ 0L); - - ret = flush_aio_dio_completed_IO(inode); - if (ret < 0) -Index: linux-stage/fs/ext4/move_extent.c -=================================================================== ---- linux-stage.orig/fs/ext4/move_extent.c -+++ linux-stage/fs/ext4/move_extent.c -@@ -1358,7 +1358,8 @@ ext4_move_extents(struct file *o_filp, s - ext4_error(orig_inode->i_sb, - "We replaced blocks too much! " - "sum of replaced: %llu requested: %llu", -- *moved_len, len); -+ (unsigned long long)(*moved_len), -+ (unsigned long long)(len)); - ret1 = -EIO; - break; - } diff --git a/ldiskfs/kernel_patches/patches/ext4-mmp-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-mmp-rhel5.patch deleted file mode 100644 index d028930..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-mmp-rhel5.patch +++ /dev/null @@ -1,578 +0,0 @@ -Prevent an ext4 filesystem from being mounted multiple times. -A sequence number is stored on disk and is periodically updated (every 5 -seconds by default) by a mounted filesystem. -At mount time, we now wait for s_mmp_update_interval seconds to make sure -that the MMP sequence does not change. -In case of failure, the nodename, bdevname and the time at which the MMP -block was last updated is displayed. -Move all mmp code to a dedicated file (mmp.c). - -Signed-off-by: Andreas Dilger whamcloud.com> -Signed-off-by: Johann Lombardi whamcloud.com> ---- - fs/ext4/Makefile | 3 +- - fs/ext4/ext4.h | 76 ++++++++++++- - fs/ext4/mmp.c | 351 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ - fs/ext4/super.c | 18 +++- - 4 files changed, 444 insertions(+), 4 deletions(-) - create mode 100644 fs/ext4/mmp.c - -Index: linux-stage/fs/ext4/Makefile -=================================================================== ---- linux-stage.orig/fs/ext4/Makefile -+++ linux-stage/fs/ext4/Makefile -@@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o - - ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ -- ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o -+ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ -+ mmp.o - - ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o -Index: linux-stage/fs/ext4/ext4.h -=================================================================== ---- linux-stage.orig/fs/ext4/ext4.h -+++ linux-stage/fs/ext4/ext4.h -@@ -878,7 +878,7 @@ struct ext4_super_block { - __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ - __le32 s_flags; /* Miscellaneous flags */ - __le16 s_raid_stride; /* RAID stride */ -- __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ -+ __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ - __le64 s_mmp_block; /* Block for multi-mount protection */ - __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ - __u8 s_log_groups_per_flex; /* FLEX_BG group size */ -@@ -1032,6 +1032,9 @@ struct ext4_sb_info { - - /* workqueue for dio unwritten */ - struct workqueue_struct *dio_unwritten_wq; -+ -+ /* Kernel thread for multiple mount protection */ -+ struct task_struct *s_mmp_tsk; - }; - - static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) -@@ -1169,7 +1172,8 @@ static inline void ext4_clear_inode_stat - EXT4_FEATURE_INCOMPAT_META_BG| \ - EXT4_FEATURE_INCOMPAT_EXTENTS| \ - EXT4_FEATURE_INCOMPAT_64BIT| \ -- EXT4_FEATURE_INCOMPAT_FLEX_BG) -+ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ -+ EXT4_FEATURE_INCOMPAT_MMP) - #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ -@@ -1376,6 +1380,67 @@ void ext4_get_group_no_and_offset(struct - extern struct proc_dir_entry *ext4_proc_root; - - /* -+ * This structure will be used for multiple mount protection. It will be -+ * written into the block number saved in the s_mmp_block field in the -+ * superblock. Programs that check MMP should assume that if -+ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe -+ * to use the filesystem, regardless of how old the timestamp is. -+ */ -+#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ -+#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ -+#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ -+#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ -+ -+struct mmp_struct { -+ __le32 mmp_magic; /* Magic number for MMP */ -+ __le32 mmp_seq; /* Sequence no. updated periodically */ -+ -+ /* -+ * mmp_time, mmp_nodename & mmp_bdevname are only used for information -+ * purposes and do not affect the correctness of the algorithm -+ */ -+ __le64 mmp_time; /* Time last updated */ -+ char mmp_nodename[64]; /* Node which last updated MMP block */ -+ char mmp_bdevname[32]; /* Bdev which last updated MMP block */ -+ -+ /* -+ * mmp_check_interval is used to verify if the MMP block has been -+ * updated on the block device. The value is updated based on the -+ * maximum time to write the MMP block during an update cycle. -+ */ -+ __le16 mmp_check_interval; -+ -+ __le16 mmp_pad1; -+ __le32 mmp_pad2[227]; -+}; -+ -+/* arguments passed to the mmp thread */ -+struct mmpd_data { -+ struct buffer_head *bh; /* bh from initial read_mmp_block() */ -+ struct super_block *sb; /* super block of the fs */ -+}; -+ -+/* -+ * Check interval multiplier -+ * The MMP block is written every update interval and initially checked every -+ * update interval x the multiplier (the value is then adapted based on the -+ * write latency). The reason is that writes can be delayed under load and we -+ * don't want readers to incorrectly assume that the filesystem is no longer -+ * in use. -+ */ -+#define EXT4_MMP_CHECK_MULT 2UL -+ -+/* -+ * Minimum interval for MMP checking in seconds. -+ */ -+#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL -+ -+/* -+ * Maximum interval for MMP checking in seconds. -+ */ -+#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL -+ -+/* - * Function prototypes - */ - -@@ -1547,6 +1612,10 @@ extern void __ext4_warning(struct super_ - #define ext4_warning(sb, message...) __ext4_warning(sb, __func__, ## message) - extern void ext4_msg(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -+extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, -+ const char *, const char *); -+#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, msg) -+ - extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, - const char *, const char *, ...) - __attribute__ ((format (printf, 4, 5))); -@@ -1784,6 +1853,9 @@ static inline void ext4_unlock_group(str - spin_unlock(ext4_group_lock_ptr(sb, group)); - } - -+/* mmp.c */ -+extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); -+ - /* - * Inodes and files operations - */ -Index: linux-stage/fs/ext4/mmp.c -=================================================================== ---- /dev/null -+++ linux-stage/fs/ext4/mmp.c -@@ -0,0 +1,351 @@ -+#include -+#include -+#include -+#include -+#include -+ -+#include "ext4.h" -+ -+/* -+ * Write the MMP block using WRITE_SYNC to try to get the block on-disk -+ * faster. -+ */ -+static int write_mmp_block(struct buffer_head *bh) -+{ -+ mark_buffer_dirty(bh); -+ lock_buffer(bh); -+ bh->b_end_io = end_buffer_write_sync; -+ get_bh(bh); -+ submit_bh(WRITE_SYNC, bh); -+ wait_on_buffer(bh); -+ if (unlikely(!buffer_uptodate(bh))) -+ return 1; -+ -+ return 0; -+} -+ -+/* -+ * Read the MMP block. It _must_ be read from disk and hence we clear the -+ * uptodate flag on the buffer. -+ */ -+static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, -+ ext4_fsblk_t mmp_block) -+{ -+ struct mmp_struct *mmp; -+ -+ if (*bh) -+ clear_buffer_uptodate(*bh); -+ -+ /* This would be sb_bread(sb, mmp_block), except we need to be sure -+ * that the MD RAID device cache has been bypassed, and that the read -+ * is not blocked in the elevator. */ -+ if (!*bh) -+ *bh = sb_getblk(sb, mmp_block); -+ if (*bh) { -+ get_bh(*bh); -+ lock_buffer(*bh); -+ (*bh)->b_end_io = end_buffer_read_sync; -+ submit_bh(READ_SYNC, *bh); -+ wait_on_buffer(*bh); -+ if (!buffer_uptodate(*bh)) { -+ brelse(*bh); -+ *bh = NULL; -+ } -+ } -+ if (!*bh) { -+ ext4_warning(sb, "Error while reading MMP block %llu", -+ mmp_block); -+ return -EIO; -+ } -+ -+ mmp = (struct mmp_struct *)((*bh)->b_data); -+ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) -+ return -EINVAL; -+ -+ return 0; -+} -+ -+/* -+ * Dump as much information as possible to help the admin. -+ */ -+void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, -+ const char *function, const char *msg) -+{ -+ __ext4_warning(sb, function, "%s", msg); -+ __ext4_warning(sb, function, -+ "MMP failure info: last update time: %llu, last update " -+ "node: %s, last update device: %s\n", -+ (long long unsigned int) le64_to_cpu(mmp->mmp_time), -+ mmp->mmp_nodename, mmp->mmp_bdevname); -+} -+ -+/* -+ * kmmpd will update the MMP sequence every s_mmp_update_interval seconds -+ */ -+static int kmmpd(void *data) -+{ -+ struct super_block *sb = ((struct mmpd_data *) data)->sb; -+ struct buffer_head *bh = ((struct mmpd_data *) data)->bh; -+ struct ext4_super_block *es = EXT4_SB(sb)->s_es; -+ struct mmp_struct *mmp; -+ ext4_fsblk_t mmp_block; -+ u32 seq = 0; -+ unsigned long failed_writes = 0; -+ int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); -+ unsigned mmp_check_interval; -+ unsigned long last_update_time; -+ unsigned long diff; -+ int retval; -+ -+ mmp_block = le64_to_cpu(es->s_mmp_block); -+ mmp = (struct mmp_struct *)(bh->b_data); -+ mmp->mmp_time = cpu_to_le64(get_seconds()); -+ /* -+ * Start with the higher mmp_check_interval and reduce it if -+ * the MMP block is being updated on time. -+ */ -+ mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, -+ EXT4_MMP_MIN_CHECK_INTERVAL); -+ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); -+ bdevname(bh->b_bdev, mmp->mmp_bdevname); -+ -+ memcpy(mmp->mmp_nodename, init_utsname()->sysname, -+ sizeof(mmp->mmp_nodename)); -+ -+ while (!kthread_should_stop()) { -+ if (++seq > EXT4_MMP_SEQ_MAX) -+ seq = 1; -+ -+ mmp->mmp_seq = cpu_to_le32(seq); -+ mmp->mmp_time = cpu_to_le64(get_seconds()); -+ last_update_time = jiffies; -+ -+ retval = write_mmp_block(bh); -+ /* -+ * Don't spew too many error messages. Print one every -+ * (s_mmp_update_interval * 60) seconds. -+ */ -+ if (retval) { -+ if ((failed_writes % 60) == 0) -+ ext4_error(sb, "Error writing to MMP block"); -+ failed_writes++; -+ } -+ -+ if (!(le32_to_cpu(es->s_feature_incompat) & -+ EXT4_FEATURE_INCOMPAT_MMP)) { -+ ext4_warning(sb, "kmmpd being stopped since MMP feature" -+ " has been disabled."); -+ EXT4_SB(sb)->s_mmp_tsk = NULL; -+ goto failed; -+ } -+ -+ if (sb->s_flags & MS_RDONLY) { -+ ext4_warning(sb, "kmmpd being stopped since filesystem " -+ "has been remounted as readonly."); -+ EXT4_SB(sb)->s_mmp_tsk = NULL; -+ goto failed; -+ } -+ -+ diff = jiffies - last_update_time; -+ if (diff < mmp_update_interval * HZ) -+ schedule_timeout_interruptible(mmp_update_interval * -+ HZ - diff); -+ -+ /* -+ * We need to make sure that more than mmp_check_interval -+ * seconds have not passed since writing. If that has happened -+ * we need to check if the MMP block is as we left it. -+ */ -+ diff = jiffies - last_update_time; -+ if (diff > mmp_check_interval * HZ) { -+ struct buffer_head *bh_check = NULL; -+ struct mmp_struct *mmp_check; -+ -+ retval = read_mmp_block(sb, &bh_check, mmp_block); -+ if (retval) { -+ ext4_error(sb, "error reading MMP data: %d", -+ retval); -+ -+ EXT4_SB(sb)->s_mmp_tsk = NULL; -+ goto failed; -+ } -+ -+ mmp_check = (struct mmp_struct *)(bh_check->b_data); -+ if (mmp->mmp_seq != mmp_check->mmp_seq || -+ memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, -+ sizeof(mmp->mmp_nodename))) { -+ dump_mmp_msg(sb, mmp_check, -+ "Error while updating MMP info. " -+ "The filesystem seems to have been" -+ " multiply mounted."); -+ ext4_error(sb, "abort"); -+ goto failed; -+ } -+ put_bh(bh_check); -+ } -+ -+ /* -+ * Adjust the mmp_check_interval depending on how much time -+ * it took for the MMP block to be written. -+ */ -+ mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ, -+ EXT4_MMP_MAX_CHECK_INTERVAL), -+ EXT4_MMP_MIN_CHECK_INTERVAL); -+ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); -+ } -+ -+ /* -+ * Unmount seems to be clean. -+ */ -+ mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); -+ mmp->mmp_time = cpu_to_le64(get_seconds()); -+ -+ retval = write_mmp_block(bh); -+ -+failed: -+ kfree(data); -+ brelse(bh); -+ return retval; -+} -+ -+/* -+ * Get a random new sequence number but make sure it is not greater than -+ * EXT4_MMP_SEQ_MAX. -+ */ -+static unsigned int mmp_new_seq(void) -+{ -+ u32 new_seq; -+ -+ do { -+ get_random_bytes(&new_seq, sizeof(u32)); -+ } while (new_seq > EXT4_MMP_SEQ_MAX); -+ -+ return new_seq; -+} -+ -+/* -+ * Protect the filesystem from being mounted more than once. -+ */ -+int ext4_multi_mount_protect(struct super_block *sb, -+ ext4_fsblk_t mmp_block) -+{ -+ struct ext4_super_block *es = EXT4_SB(sb)->s_es; -+ struct buffer_head *bh = NULL; -+ struct mmp_struct *mmp = NULL; -+ struct mmpd_data *mmpd_data; -+ u32 seq; -+ unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); -+ unsigned int wait_time = 0; -+ int retval; -+ -+ if (mmp_block < le32_to_cpu(es->s_first_data_block) || -+ mmp_block >= ext4_blocks_count(es)) { -+ ext4_warning(sb, "Invalid MMP block in superblock"); -+ goto failed; -+ } -+ -+ retval = read_mmp_block(sb, &bh, mmp_block); -+ if (retval) -+ goto failed; -+ -+ mmp = (struct mmp_struct *)(bh->b_data); -+ -+ if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) -+ mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; -+ -+ /* -+ * If check_interval in MMP block is larger, use that instead of -+ * update_interval from the superblock. -+ */ -+ if (mmp->mmp_check_interval > mmp_check_interval) -+ mmp_check_interval = mmp->mmp_check_interval; -+ -+ seq = le32_to_cpu(mmp->mmp_seq); -+ if (seq == EXT4_MMP_SEQ_CLEAN) -+ goto skip; -+ -+ if (seq == EXT4_MMP_SEQ_FSCK) { -+ dump_mmp_msg(sb, mmp, "fsck is running on the filesystem"); -+ goto failed; -+ } -+ -+ wait_time = min(mmp_check_interval * 2 + 1, -+ mmp_check_interval + 60); -+ -+ /* Print MMP interval if more than 20 secs. */ -+ if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) -+ ext4_warning(sb, "MMP interval %u higher than expected, please" -+ " wait.\n", wait_time * 2); -+ -+ if (schedule_timeout_interruptible(HZ * wait_time) != 0) { -+ ext4_warning(sb, "MMP startup interrupted, failing mount\n"); -+ goto failed; -+ } -+ -+ retval = read_mmp_block(sb, &bh, mmp_block); -+ if (retval) -+ goto failed; -+ mmp = (struct mmp_struct *)(bh->b_data); -+ if (seq != le32_to_cpu(mmp->mmp_seq)) { -+ dump_mmp_msg(sb, mmp, -+ "Device is already active on another node."); -+ goto failed; -+ } -+ -+skip: -+ /* -+ * write a new random sequence number. -+ */ -+ mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); -+ -+ retval = write_mmp_block(bh); -+ if (retval) -+ goto failed; -+ -+ /* -+ * wait for MMP interval and check mmp_seq. -+ */ -+ if (schedule_timeout_interruptible(HZ * wait_time) != 0) { -+ ext4_warning(sb, "MMP startup interrupted, failing mount\n"); -+ goto failed; -+ } -+ -+ retval = read_mmp_block(sb, &bh, mmp_block); -+ if (retval) -+ goto failed; -+ mmp = (struct mmp_struct *)(bh->b_data); -+ if (seq != le32_to_cpu(mmp->mmp_seq)) { -+ dump_mmp_msg(sb, mmp, -+ "Device is already active on another node."); -+ goto failed; -+ } -+ -+ mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); -+ if (!mmpd_data) { -+ ext4_warning(sb, "not enough memory for mmpd_data"); -+ goto failed; -+ } -+ mmpd_data->sb = sb; -+ mmpd_data->bh = bh; -+ -+ /* -+ * Start a kernel thread to update the MMP block periodically. -+ */ -+ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s", -+ bdevname(bh->b_bdev, -+ mmp->mmp_bdevname)); -+ if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { -+ EXT4_SB(sb)->s_mmp_tsk = NULL; -+ kfree(mmpd_data); -+ ext4_warning(sb, "Unable to create kmmpd thread for %s.", -+ sb->s_id); -+ goto failed; -+ } -+ -+ return 0; -+ -+failed: -+ brelse(bh); -+ return 1; -+} -+ -+ -Index: linux-stage/fs/ext4/super.c -=================================================================== ---- linux-stage.orig/fs/ext4/super.c -+++ linux-stage/fs/ext4/super.c -@@ -40,6 +40,8 @@ - #include - #include - #include -+#include -+#include - - #include "ext4.h" - #include "ext4_jbd2.h" -@@ -698,6 +700,8 @@ static void ext4_put_super(struct super_ - invalidate_bdev(sbi->journal_bdev, 0); - ext4_blkdev_remove(sbi); - } -+ if (sbi->s_mmp_tsk) -+ kthread_stop(sbi->s_mmp_tsk); - sb->s_fs_info = NULL; - /* - * Now that we are completely done shutting down the -@@ -2810,6 +2814,11 @@ static int ext4_fill_super(struct super_ - EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_RECOVER)); - -+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && -+ !(sb->s_flags & MS_RDONLY)) -+ if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) -+ goto failed_mount3; -+ - /* - * The first inode we look at is the journal inode. Don't try - * root first: it may be modified in the journal! -@@ -3048,6 +3057,8 @@ failed_mount3: - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyblocks_counter); -+ if (sbi->s_mmp_tsk) -+ kthread_stop(sbi->s_mmp_tsk); - failed_mount2: - for (i = 0; i < db_count; i++) - brelse(sbi->s_group_desc[i]); -@@ -3557,7 +3568,7 @@ static int ext4_remount(struct super_blo - struct ext4_mount_options old_opts; - ext4_group_t g; - unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; -- int err; -+ int err = 0; - #ifdef CONFIG_QUOTA - int i; - #endif -@@ -3676,6 +3687,13 @@ static int ext4_remount(struct super_blo - goto restore_opts; - if (!ext4_setup_super(sb, es, 0)) - sb->s_flags &= ~MS_RDONLY; -+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, -+ EXT4_FEATURE_INCOMPAT_MMP)) -+ if (ext4_multi_mount_protect(sb, -+ le64_to_cpu(es->s_mmp_block))) { -+ err = -EROFS; -+ goto restore_opts; -+ } - } - } - ext4_setup_system_zone(sb); diff --git a/ldiskfs/kernel_patches/patches/ext4-osd-iam-exports.patch b/ldiskfs/kernel_patches/patches/ext4-osd-iam-exports.patch deleted file mode 100644 index 6b65eb0..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-osd-iam-exports.patch +++ /dev/null @@ -1,64 +0,0 @@ -diff -rupN 2.6.27.21_2/fs/ext4/ext4.h 2.6.27.21_3/fs/ext4/ext4.h ---- 2.6.27.21_2/fs/ext4/ext4.h 2009-07-17 12:19:59.000000000 +0530 -+++ 2.6.27.21_3/fs/ext4/ext4.h 2009-07-17 12:38:59.000000000 +0530 -@@ -1181,6 +1181,9 @@ extern int ext4_orphan_add(handle_t *, s - extern int ext4_orphan_del(handle_t *, struct inode *); - extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, - __u32 start_minor_hash, __u32 *next_hash); -+extern struct buffer_head *ext4_append(handle_t *handle, -+ struct inode *inode, -+ ext4_lblk_t *block, int *err); - - /* resize.c */ - extern int ext4_group_add(struct super_block *sb, -diff -rupN 2.6.27.21_2/fs/ext4/hash.c 2.6.27.21_3/fs/ext4/hash.c ---- 2.6.27.21_2/fs/ext4/hash.c 2009-07-17 12:12:56.000000000 +0530 -+++ 2.6.27.21_3/fs/ext4/hash.c 2009-07-17 12:40:22.000000000 +0530 -@@ -9,6 +9,7 @@ - * License. - */ - -+#include - #include - #include - #include -@@ -206,3 +207,4 @@ int ext4fs_dirhash(const char *name, int - hinfo->minor_hash = minor_hash; - return 0; - } -+EXPORT_SYMBOL(ext4fs_dirhash); -diff -rupN 2.6.27.21_2/fs/ext4/namei.c 2.6.27.21_3/fs/ext4/namei.c ---- 2.6.27.21_2/fs/ext4/namei.c 2009-07-17 12:23:51.000000000 +0530 -+++ 2.6.27.21_3/fs/ext4/namei.c 2009-07-17 12:37:59.000000000 +0530 -@@ -51,9 +51,9 @@ - #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) - #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) - --static struct buffer_head *ext4_append(handle_t *handle, -- struct inode *inode, -- ext4_lblk_t *block, int *err) -+struct buffer_head *ext4_append(handle_t *handle, -+ struct inode *inode, -+ ext4_lblk_t *block, int *err) - { - struct buffer_head *bh; - struct ext4_inode_info *ei = EXT4_I(inode); -@@ -72,6 +72,7 @@ static struct buffer_head *ext4_append(h - up(&ei->i_append_sem); - return bh; - } -+EXPORT_SYMBOL(ext4_append); - - #ifndef assert - #define assert(test) J_ASSERT(test) -diff -rupN 2.6.27.21_2/fs/ext4/super.c 2.6.27.21_3/fs/ext4/super.c ---- 2.6.27.21_2/fs/ext4/super.c 2009-07-17 12:12:57.000000000 +0530 -+++ 2.6.27.21_3/fs/ext4/super.c 2009-07-17 12:40:52.000000000 +0530 -@@ -377,6 +377,7 @@ void __ext4_std_error(struct super_block - - ext4_handle_error(sb); - } -+EXPORT_SYMBOL(__ext4_std_error); - - /* - * ext4_abort is a much stronger failure handler than ext4_error. The diff --git a/ldiskfs/kernel_patches/patches/ext4-osd-iop-common.patch b/ldiskfs/kernel_patches/patches/ext4-osd-iop-common.patch deleted file mode 100644 index bc2a345..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-osd-iop-common.patch +++ /dev/null @@ -1,224 +0,0 @@ -diff -rupN linux-2.6.27.21-0.1_1//fs/ext4/ext4.h linux-2.6.27.21-0.1_2//fs/ext4/ext4.h ---- linux-2.6.27.21-0.1_1//fs/ext4/ext4.h 2009-08-24 15:32:00.000000000 +0530 -+++ linux-2.6.27.21-0.1_2//fs/ext4/ext4.h 2009-08-24 15:32:55.000000000 +0530 -@@ -1171,6 +1171,19 @@ extern int ext4_fiemap(struct inode *, s - /* migrate.c */ - extern int ext4_ext_migrate(struct inode *); - /* namei.c */ -+extern struct inode *ext4_create_inode(handle_t *handle, -+ struct inode * dir, int mode); -+extern int ext4_add_entry(handle_t *handle, struct dentry *dentry, -+ struct inode *inode); -+extern int ext4_delete_entry(handle_t *handle, struct inode * dir, -+ struct ext4_dir_entry_2 * de_del, -+ struct buffer_head * bh); -+extern struct buffer_head * ext4_find_entry(struct inode *dir, -+ const struct qstr *d_name, -+ struct ext4_dir_entry_2 ** res_dir); -+#define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir) -+extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, -+ struct inode *inode); - extern int ext4_orphan_add(handle_t *, struct inode *); - extern int ext4_orphan_del(handle_t *, struct inode *); - extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, -diff -rupN linux-2.6.27.21-0.1_1//fs/ext4/namei.c linux-2.6.27.21-0.1_2//fs/ext4/namei.c ---- linux-2.6.27.21-0.1_1//fs/ext4/namei.c 2009-08-24 15:32:00.000000000 +0530 -+++ linux-2.6.27.21-0.1_2//fs/ext4/namei.c 2009-08-24 15:43:56.000000000 +0530 -@@ -24,6 +24,7 @@ - * Theodore Ts'o, 2002 - */ - -+#include - #include - #include - #include -@@ -882,9 +883,9 @@ static inline int search_dirblock(struct - * The returned buffer_head has ->b_count elevated. The caller is expected - * to brelse() it when appropriate. - */ --static struct buffer_head * ext4_find_entry (struct inode *dir, -- const struct qstr *d_name, -- struct ext4_dir_entry_2 ** res_dir) -+struct buffer_head * ext4_find_entry(struct inode *dir, -+ const struct qstr *d_name, -+ struct ext4_dir_entry_2 ** res_dir) - { - struct super_block *sb; - struct buffer_head *bh_use[NAMEI_RA_SIZE]; -@@ -991,6 +992,7 @@ cleanup_and_exit: - brelse(bh_use[ra_ptr]); - return ret; - } -+EXPORT_SYMBOL(ext4_find_entry); - - static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, int *err) -@@ -1511,8 +1513,8 @@ static int make_indexed_dir(handle_t *ha - * may not sleep between calling this and putting something into - * the entry, as someone else might have used it while you slept. - */ --static int ext4_add_entry(handle_t *handle, struct dentry *dentry, -- struct inode *inode) -+int ext4_add_entry(handle_t *handle, struct dentry *dentry, -+ struct inode *inode) - { - struct inode *dir = dentry->d_parent->d_inode; - struct buffer_head *bh; -@@ -1557,6 +1559,7 @@ static int ext4_add_entry(handle_t *hand - de->rec_len = ext4_rec_len_to_disk(blocksize); - return add_dirent_to_buf(handle, dentry, inode, de, bh); - } -+EXPORT_SYMBOL(ext4_add_entry); - - /* - * Returns 0 for success, or a negative error value -@@ -1699,10 +1702,10 @@ cleanup: - * ext4_delete_entry deletes a directory entry by merging it with the - * previous entry - */ --static int ext4_delete_entry(handle_t *handle, -- struct inode *dir, -- struct ext4_dir_entry_2 *de_del, -- struct buffer_head *bh) -+int ext4_delete_entry(handle_t *handle, -+ struct inode *dir, -+ struct ext4_dir_entry_2 *de_del, -+ struct buffer_head *bh) - { - struct ext4_dir_entry_2 *de, *pde; - int i; -@@ -1733,7 +1736,7 @@ static int ext4_delete_entry(handle_t *h - } - return -ENOENT; - } -- -+EXPORT_SYMBOL(ext4_delete_entry); - /* - * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, - * since this indicates that nlinks count was previously 1. -@@ -1796,6 +1799,26 @@ static unsigned ext4_dentry_goal(struct - return inum; - } - -+struct inode * ext4_create_inode(handle_t *handle, struct inode * dir, int mode) -+{ -+ struct inode *inode; -+ -+ inode = ext4_new_inode(handle, dir, mode, NULL, 0); -+ if (!IS_ERR(inode)) { -+ if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode)) { -+#ifdef CONFIG_LDISKFS_FS_XATTR -+ inode->i_op = &ext4_special_inode_operations; -+#endif -+ } else { -+ inode->i_op = &ext4_file_inode_operations; -+ inode->i_fop = &ext4_file_operations; -+ ext4_set_aops(inode); -+ } -+ } -+ return inode; -+} -+EXPORT_SYMBOL(ext4_create_inode); -+ - /* - * By the time this is called, we already have created - * the directory cache entry for the new file, but it -@@ -1872,40 +1895,32 @@ retry: - return err; - } - --static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) -+/* Initialize @inode as a subdirectory of @dir, and add the -+ * "." and ".." entries into the first directory block. */ -+int ext4_add_dot_dotdot(handle_t *handle, struct inode * dir, -+ struct inode *inode) - { -- handle_t *handle; -- struct inode *inode; -- struct buffer_head *dir_block; -- struct ext4_dir_entry_2 *de; - unsigned int blocksize = dir->i_sb->s_blocksize; -- int err, retries = 0; -- -- if (EXT4_DIR_LINK_MAX(dir)) -- return -EMLINK; -+ struct buffer_head * dir_block; -+ struct ext4_dir_entry_2 * de; -+ int err = 0; - --retry: -- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + -- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + -- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - -- inode = ext4_new_inode(handle, dir, S_IFDIR | mode, &dentry->d_name, -- ext4_dentry_goal(dir->i_sb, dentry)); -- err = PTR_ERR(inode); -- if (IS_ERR(inode)) -- goto out_stop; -- - inode->i_op = &ext4_dir_inode_operations; - inode->i_fop = &ext4_dir_operations; - inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; - dir_block = ext4_bread(handle, inode, 0, 1, &err); -- if (!dir_block) -- goto out_clear_inode; -+ if (!dir_block) { -+ clear_nlink(inode); -+ ext4_mark_inode_dirty(handle, inode); -+ iput (inode); -+ goto get_out; -+ } - BUFFER_TRACE(dir_block, "get_write_access"); - ext4_journal_get_write_access(handle, dir_block); - de = (struct ext4_dir_entry_2 *) dir_block->b_data; -@@ -1925,9 +1940,43 @@ retry: - ext4_journal_dirty_metadata(handle, dir_block); - brelse(dir_block); - ext4_mark_inode_dirty(handle, inode); -+get_out: -+ return err; -+} -+EXPORT_SYMBOL(ext4_add_dot_dotdot); -+ -+ -+static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) -+{ -+ handle_t *handle; -+ struct inode *inode; -+ int err, retries = 0; -+ -+ if (EXT4_DIR_LINK_MAX(dir)) -+ return -EMLINK; -+ -+retry: -+ handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + -+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + -+ 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if (IS_DIRSYNC(dir)) -+ handle->h_sync = 1; -+ -+ inode = ext4_new_inode(handle, dir, S_IFDIR | mode, &dentry->d_name, -+ ext4_dentry_goal(dir->i_sb, dentry)); -+ err = PTR_ERR(inode); -+ if (IS_ERR(inode)) -+ goto out_stop; -+ -+ err = ext4_add_dot_dotdot(handle, dir, inode); -+ if (err) -+ goto out_stop; -+ - err = ext4_add_entry(handle, dentry, inode); - if (err) { --out_clear_inode: - clear_nlink(inode); - ext4_mark_inode_dirty(handle, inode); - iput(inode); diff --git a/ldiskfs/kernel_patches/patches/ext4-pdir-fix.patch b/ldiskfs/kernel_patches/patches/ext4-pdir-fix.patch deleted file mode 100644 index 32218cf..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-pdir-fix.patch +++ /dev/null @@ -1,59 +0,0 @@ -diff -rupN linux-2.6.27.21-0.1_1//fs/ext4/ext4_i.h linux-2.6.27.21-0.1_2//fs/ext4/ext4_i.h ---- linux-2.6.27.21-0.1_1//fs/ext4/ext4.h 2009-08-24 13:00:59.000000000 +0530 -+++ linux-2.6.27.21-0.1_2//fs/ext4/ext4.h 2009-08-24 13:01:25.000000000 +0530 -@@ -16,6 +16,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -56,7 +57,9 @@ struct ext4_inode_info { - __u32 i_flags; - ext4_fsblk_t i_file_acl; - __u32 i_dtime; -- -+ /* following fields for parallel directory operations -bzzz */ -+ struct dynlock i_htree_lock; -+ struct semaphore i_append_sem; - /* - * i_block_group is the number of the block group which contains - * this file's inode. Constant across the lifetime of the inode, -diff -rupN linux-2.6.27.21-0.1_1//fs/ext4/namei.c linux-2.6.27.21-0.1_2//fs/ext4/namei.c ---- linux-2.6.27.21-0.1_1//fs/ext4/namei.c 2009-08-24 13:00:59.000000000 +0530 -+++ linux-2.6.27.21-0.1_2//fs/ext4/namei.c 2009-08-24 13:03:45.000000000 +0530 -@@ -55,6 +55,11 @@ static struct buffer_head *ext4_append(h - ext4_lblk_t *block, int *err) - { - struct buffer_head *bh; -+ struct ext4_inode_info *ei = EXT4_I(inode); -+ -+ /* with parallel dir operations all appends -+ * have to be serialized -bzzz */ -+ down(&ei->i_append_sem); - - *block = inode->i_size >> inode->i_sb->s_blocksize_bits; - -@@ -67,7 +72,9 @@ static struct buffer_head *ext4_append(h - brelse(bh); - bh = NULL; - } -+ ei->i_disksize = inode->i_size; - } -+ up(&ei->i_append_sem); - return bh; - } - -diff -rupN linux-2.6.27.21-0.1_1//fs/ext4/super.c linux-2.6.27.21-0.1_2//fs/ext4/super.c ---- linux-2.6.27.21-0.1_1//fs/ext4/super.c 2009-08-24 13:00:59.000000000 +0530 -+++ linux-2.6.27.21-0.1_2//fs/ext4/super.c 2009-08-24 13:01:25.000000000 +0530 -@@ -635,6 +635,8 @@ static struct inode *ext4_alloc_inode(st - #endif - ei->vfs_inode.i_version = 1; - ei->vfs_inode.i_data.writeback_index = 0; -+ dynlock_init(&ei->i_htree_lock); -+ sema_init(&ei->i_append_sem, 1); - memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); - INIT_LIST_HEAD(&ei->i_prealloc_list); - spin_lock_init(&ei->i_prealloc_lock); diff --git a/ldiskfs/kernel_patches/patches/ext4-prealloc-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-prealloc-rhel5.patch deleted file mode 100644 index d7485f5..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-prealloc-rhel5.patch +++ /dev/null @@ -1,378 +0,0 @@ -Index: linux-2.6.18-128.1.6/fs/ext4/super.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/ext4/super.c -+++ linux-2.6.18-128.1.6/fs/ext4/super.c -@@ -108,7 +108,8 @@ - EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); - EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); - EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); --EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); -+EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_small_req); -+EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req); - EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); - EXT4_RW_ATTR_SBI_UI(max_dir_size, s_max_dir_size); - -@@ -108,7 +108,8 @@ - ATTR_LIST(mb_max_to_scan), - ATTR_LIST(mb_min_to_scan), - ATTR_LIST(mb_order2_req), -- ATTR_LIST(mb_stream_req), -+ ATTR_LIST(mb_small_req), -+ ATTR_LIST(mb_large_req), - ATTR_LIST(mb_group_prealloc), - ATTR_LIST(max_dir_size), - NULL, -Index: linux-2.6.18-128.1.6/fs/ext4/ext4.h -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/ext4/ext4.h 2009-05-28 17:16:51.000000000 +0530 -+++ linux-2.6.18-128.1.6/fs/ext4/ext4.h 2009-05-28 17:16:52.000000000 +0530 -@@ -108,11 +108,14 @@ - - /* tunables */ - unsigned long s_stripe; -- unsigned int s_mb_stream_request; -+ unsigned long s_mb_small_req; -+ unsigned long s_mb_large_req; - unsigned int s_mb_max_to_scan; - unsigned int s_mb_min_to_scan; - unsigned int s_mb_stats; - unsigned int s_mb_order2_reqs; -+ unsigned long *s_mb_prealloc_table; -+ unsigned long s_mb_prealloc_table_size; - unsigned int s_mb_group_prealloc; - /* where last allocation was done - for stream allocation */ - unsigned long s_mb_last_group; -Index: linux-2.6.18-128.1.6/fs/ext4/mballoc.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/ext4/mballoc.c 2009-05-28 17:16:51.000000000 +0530 -+++ linux-2.6.18-128.1.6/fs/ext4/mballoc.c 2009-05-28 17:19:57.000000000 +0530 -@@ -2284,6 +2284,26 @@ - } - } - -+static void ext4_mb_prealloc_table_add(struct ext4_sb_info *sbi, int value) -+{ -+ int i; -+ -+ if (value > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group)) -+ return; -+ -+ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) { -+ if (sbi->s_mb_prealloc_table[i] == 0) { -+ sbi->s_mb_prealloc_table[i] = value; -+ return; -+ } -+ -+ /* they should add values in order */ -+ if (value <= sbi->s_mb_prealloc_table[i]) -+ return; -+ } -+} -+ -+ - static int ext4_mb_good_group(struct ext4_allocation_context *ac, - ext4_group_t group, int cr) - { -@@ -2325,6 +2389,80 @@ - .llseek = seq_lseek, - .release = seq_release, - }; -+ -+#define EXT4_MB_PREALLOC_TABLE "prealloc_table" -+ -+static int ext4_mb_prealloc_table_proc_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ struct ext4_sb_info *sbi = data; -+ int len = 0; -+ int i; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) -+ len += sprintf(page + len, "%ld ", -+ sbi->s_mb_prealloc_table[i]); -+ len += sprintf(page + len, "\n"); -+ -+ *start = page; -+ return len; -+} -+ -+static int ext4_mb_prealloc_table_proc_write(struct file *file, -+ const char __user *buf, -+ unsigned long cnt, void *data) -+{ -+ struct ext4_sb_info *sbi = data; -+ unsigned long value; -+ unsigned long prev = 0; -+ char str[128]; -+ char *cur; -+ char *end; -+ unsigned long *new_table; -+ int num = 0; -+ int i = 0; -+ -+ if (cnt >= sizeof(str)) -+ return -EINVAL; -+ if (copy_from_user(str, buf, cnt)) -+ return -EFAULT; -+ -+ num = 0; -+ cur = str; -+ end = str + cnt; -+ while (cur < end) { -+ while ((cur < end) && (*cur == ' ')) cur++; -+ value = simple_strtol(cur, &cur, 0); -+ if (value == 0) -+ break; -+ if (value <= prev) -+ return -EINVAL; -+ prev = value; -+ num++; -+ } -+ -+ new_table = kmalloc(num * sizeof(*new_table), GFP_KERNEL); -+ if (new_table == NULL) -+ return -ENOMEM; -+ kfree(sbi->s_mb_prealloc_table); -+ memset(new_table, 0, num * sizeof(*new_table)); -+ sbi->s_mb_prealloc_table = new_table; -+ sbi->s_mb_prealloc_table_size = num; -+ cur = str; -+ end = str + cnt; -+ while (cur < end && i < num) { -+ while ((cur < end) && (*cur == ' ')) cur++; -+ value = simple_strtol(cur, &cur, 0); -+ ext4_mb_prealloc_table_add(sbi, value); -+ i++; -+ } -+ -+ return cnt; -+} - - static void ext4_mb_history_release(struct super_block *sb) - { -@@ -2400,6 +2400,7 @@ - remove_proc_entry("mb_groups", sbi->s_proc); - if (sbi->s_mb_history_max) - remove_proc_entry("mb_history", sbi->s_proc); -+ remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_proc); - } - kfree(sbi->s_mb_history); - } -@@ -2408,6 +2446,13 @@ - p->proc_fops = &ext4_mb_seq_groups_fops; - p->data = sb; - } -+ p = create_proc_entry(EXT4_MB_PREALLOC_TABLE, S_IFREG | -+ S_IRUGO | S_IWUSR, sbi->s_proc); -+ if (p) { -+ p->data = sbi; -+ p->read_proc = ext4_mb_prealloc_table_proc_read; -+ p->write_proc = ext4_mb_prealloc_table_proc_write; -+ } - } - - sbi->s_mb_history_cur = 0; -@@ -2542,13 +2562,57 @@ - sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; - sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; - sbi->s_mb_stats = MB_DEFAULT_STATS; -- sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; - sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; - sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; -- sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; -+ -+ if (sbi->s_stripe == 0) { -+ sbi->s_mb_prealloc_table_size = 10; -+ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long); -+ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS); -+ if (sbi->s_mb_prealloc_table == NULL) { -+ kfree(sbi->s_mb_offsets); -+ kfree(sbi->s_mb_maxs); -+ return -ENOMEM; -+ } -+ memset(sbi->s_mb_prealloc_table, 0, i); -+ -+ ext4_mb_prealloc_table_add(sbi, 4); -+ ext4_mb_prealloc_table_add(sbi, 8); -+ ext4_mb_prealloc_table_add(sbi, 16); -+ ext4_mb_prealloc_table_add(sbi, 32); -+ ext4_mb_prealloc_table_add(sbi, 64); -+ ext4_mb_prealloc_table_add(sbi, 128); -+ ext4_mb_prealloc_table_add(sbi, 256); -+ ext4_mb_prealloc_table_add(sbi, 512); -+ ext4_mb_prealloc_table_add(sbi, 1024); -+ ext4_mb_prealloc_table_add(sbi, 2048); -+ -+ sbi->s_mb_small_req = 256; -+ sbi->s_mb_large_req = 1024; -+ sbi->s_mb_group_prealloc = 512; -+ } else { -+ sbi->s_mb_prealloc_table_size = 3; -+ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long); -+ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS); -+ if (sbi->s_mb_prealloc_table == NULL) { -+ kfree(sbi->s_mb_offsets); -+ kfree(sbi->s_mb_maxs); -+ return -ENOMEM; -+ } -+ memset(sbi->s_mb_prealloc_table, 0, i); -+ -+ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe); -+ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe * 2); -+ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe * 4); -+ -+ sbi->s_mb_small_req = sbi->s_stripe; -+ sbi->s_mb_large_req = sbi->s_stripe * 8; -+ sbi->s_mb_group_prealloc = sbi->s_stripe * 4; -+ } - - sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); - if (sbi->s_locality_groups == NULL) { -+ kfree(sbi->s_mb_prealloc_table); - kfree(sbi->s_mb_offsets); - kfree(sbi->s_mb_maxs); - return -ENOMEM; -@@ -3032,11 +3186,12 @@ - ext4_mb_normalize_request(struct ext4_allocation_context *ac, - struct ext4_allocation_request *ar) - { -- int bsbits, max; -+ int bsbits, i, wind; - ext4_lblk_t end; -- loff_t size, orig_size, start_off; -+ loff_t size, orig_size; - ext4_lblk_t start, orig_start; - struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); -+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_prealloc_space *pa; - - /* do normalize only data requests, metadata requests -@@ -3066,49 +3221,35 @@ - size = size << bsbits; - if (size < i_size_read(ac->ac_inode)) - size = i_size_read(ac->ac_inode); -+ size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits; - -- /* max size of free chunks */ -- max = 2 << bsbits; -+ start = wind = 0; - --#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ -- (req <= (size) || max <= (chunk_size)) -+ /* let's choose preallocation window depending on file size */ -+ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) { -+ if (size <= sbi->s_mb_prealloc_table[i]) { -+ wind = sbi->s_mb_prealloc_table[i]; -+ break; -+ } -+ } -+ size = wind; - -- /* first, try to predict filesize */ -- /* XXX: should this table be tunable? */ -- start_off = 0; -- if (size <= 16 * 1024) { -- size = 16 * 1024; -- } else if (size <= 32 * 1024) { -- size = 32 * 1024; -- } else if (size <= 64 * 1024) { -- size = 64 * 1024; -- } else if (size <= 128 * 1024) { -- size = 128 * 1024; -- } else if (size <= 256 * 1024) { -- size = 256 * 1024; -- } else if (size <= 512 * 1024) { -- size = 512 * 1024; -- } else if (size <= 1024 * 1024) { -- size = 1024 * 1024; -- } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { -- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> -- (21 - bsbits)) << 21; -- size = 2 * 1024 * 1024; -- } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { -- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> -- (22 - bsbits)) << 22; -- size = 4 * 1024 * 1024; -- } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, -- (8<<20)>>bsbits, max, 8 * 1024)) { -- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> -- (23 - bsbits)) << 23; -- size = 8 * 1024 * 1024; -- } else { -- start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; -- size = ac->ac_o_ex.fe_len << bsbits; -+ if (wind == 0) { -+ __u64 tstart, tend; -+ /* file is quite large, we now preallocate with -+ * the biggest configured window with regart to -+ * logical offset */ -+ wind = sbi->s_mb_prealloc_table[i - 1]; -+ tstart = ac->ac_o_ex.fe_logical; -+ do_div(tstart, wind); -+ start = tstart * wind; -+ tend = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len - 1; -+ do_div(tend, wind); -+ tend = tend * wind + wind; -+ size = tend - start; - } -- orig_size = size = size >> bsbits; -- orig_start = start = start_off >> bsbits; -+ orig_size = size; -+ orig_start = start; - - /* don't cover already allocated blocks in selected range */ - if (ar->pleft && start <= ar->lleft) { -@@ -3185,7 +3326,6 @@ - } - BUG_ON(start + size <= ac->ac_o_ex.fe_logical && - start > ac->ac_o_ex.fe_logical); -- BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); - - /* now prepare goal request */ - -@@ -4077,11 +4217,17 @@ - - /* don't use group allocation for large files */ - size = max(size, isize); -- if (size > sbi->s_mb_stream_request) { -+ if ((ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) || -+ (size >= sbi->s_mb_large_req)) { - ac->ac_flags |= EXT4_MB_STREAM_ALLOC; - return; - } - -+ /* request is so large that we don't care about -+ * streaming - it overweights any possible seek */ -+ if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req) -+ return; -+ - BUG_ON(ac->ac_lg != NULL); - /* - * locality group prealloc space are per cpu. The reason for having -Index: linux-2.6.27.21-0.1/fs/ext4/inode.c -=================================================================== ---- linux-2.6.27.21-0.1.orig/fs/ext4/inode.c 2009-05-28 11:12:42.000000000 +0530 -+++ linux-2.6.27.21-0.1/fs/ext4/inode.c 2009-05-28 11:16:48.000000000 +0530 -@@ -2442,14 +2442,14 @@ - return -EROFS; - - /* -- * Make sure nr_to_write is >= sbi->s_mb_stream_request -+ * Make sure nr_to_write is >= sbi->s_mb_small_req - * This make sure small files blocks are allocated in - * single attempt. This ensure that small files - * get less fragmented. - */ -- if (wbc->nr_to_write < sbi->s_mb_stream_request) { -- nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; -- wbc->nr_to_write = sbi->s_mb_stream_request; -+ if (wbc->nr_to_write < sbi->s_mb_small_req) { -+ nr_to_writebump = sbi->s_mb_small_req - wbc->nr_to_write; -+ wbc->nr_to_write = sbi->s_mb_small_req; - } - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = 1; diff --git a/ldiskfs/kernel_patches/patches/ext4-print-inum-in-htree-warning-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-print-inum-in-htree-warning-rhel5.patch deleted file mode 100644 index 66ffd44..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-print-inum-in-htree-warning-rhel5.patch +++ /dev/null @@ -1,16 +0,0 @@ -Index: linux-2.6.18.i386/fs/ext4/namei.c -=================================================================== ---- linux-2.6.18.i386.orig/fs/ext4/namei.c -+++ linux-2.6.18.i386/fs/ext4/namei.c -@@ -374,8 +374,9 @@ dx_probe(struct dentry *dentry, struct i - if (root->info.hash_version != DX_HASH_TEA && - root->info.hash_version != DX_HASH_HALF_MD4 && - root->info.hash_version != DX_HASH_LEGACY) { -- ext4_warning(dir->i_sb, "Unrecognised inode hash code %d", -- root->info.hash_version); -+ ext4_warning(dir->i_sb, "Unrecognised inode hash code %d" -+ "for directory #%lu", -+ root->info.hash_version, dir->i_ino); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail; diff --git a/ldiskfs/kernel_patches/patches/ext4-quota-minimal-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-quota-minimal-rhel5.patch deleted file mode 100644 index 1e98c8f..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-quota-minimal-rhel5.patch +++ /dev/null @@ -1,20 +0,0 @@ -Index: linux-2.6.18-238.12.1/fs/ext4/ext4.h -=================================================================== ---- linux-2.6.18-238.12.1.orig/fs/ext4/ext4.h 2011-09-21 17:55:44.627741549 +0200 -+++ linux-2.6.18-238.12.1/fs/ext4/ext4.h 2011-09-21 18:05:20.974106450 +0200 -@@ -971,6 +971,7 @@ - #ifdef CONFIG_QUOTA - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ - int s_jquota_fmt; /* Format of quota to use */ -+ unsigned long s_qf_inums[MAXQUOTAS]; /* Quota file inodes */ - #endif - unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ - struct rb_root system_blks; -@@ -1171,6 +1172,7 @@ - #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 - #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 - #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 -+#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 - - #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 - #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 diff --git a/ldiskfs/kernel_patches/patches/ext4-version-2.6-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-version-2.6-rhel5.patch deleted file mode 100644 index fe9cfeb..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-version-2.6-rhel5.patch +++ /dev/null @@ -1,18 +0,0 @@ -Index: linux-2.6.18-128.1.6/fs/ext4/super.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/ext4/super.c 2009-07-24 01:33:54.000000000 -0400 -+++ linux-2.6.18-128.1.6/fs/ext4/super.c 2009-07-24 01:35:28.000000000 -0400 -@@ -3461,6 +3461,8 @@ static int __init init_ext4_fs(void) - goto out; - } - #endif -+ -+ printk(KERN_INFO "ldiskfs created from ""ext""4-2.6-rhel5\n"); - return 0; - out: - destroy_inodecache(); ---- /dev/null 2009-09-21 17:11:24.467285554 +0800 -+++ linux-2.6.27.21-0.1/fs/ext4/fiemap.h -@@ -0,0 +1,2 @@ -+ -+#include_next diff --git a/ldiskfs/kernel_patches/patches/ext4-vmalloc-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-vmalloc-rhel5.patch deleted file mode 100644 index e1fa436..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-vmalloc-rhel5.patch +++ /dev/null @@ -1,198 +0,0 @@ -Index: linux-stage/fs/ext4/super.c -=================================================================== ---- linux-stage.orig/fs/ext4/super.c -+++ linux-stage/fs/ext4/super.c -@@ -662,7 +662,12 @@ static void ext4_put_super(struct super_ - - for (i = 0; i < sbi->s_gdb_count; i++) - brelse(sbi->s_group_desc[i]); -- kfree(sbi->s_group_desc); -+ -+ if (is_vmalloc_addr(sbi->s_group_desc)) -+ vfree(sbi->s_group_desc); -+ else -+ kfree(sbi->s_group_desc); -+ - if (is_vmalloc_addr(sbi->s_flex_groups)) - vfree(sbi->s_flex_groups); - else -@@ -2402,12 +2407,13 @@ static int ext4_fill_super(struct super_ - unsigned long offset = 0; - unsigned long journal_devnum = 0; - unsigned long def_mount_opts; -- struct inode *root; -+ struct inode *root = NULL; - char *cp; - const char *descr; - int ret = -EINVAL; - int blocksize; - unsigned int db_count; -+ size_t size; - unsigned int i; - int needs_recovery, has_huge_files; - __u64 blocks_count; -@@ -2718,10 +2724,16 @@ static int ext4_fill_super(struct super_ - (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); - db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / - EXT4_DESC_PER_BLOCK(sb); -- sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), -- GFP_KERNEL); -+ size = (size_t) db_count * sizeof(struct buffer_head *); -+ sbi->s_group_desc = kzalloc(size, GFP_KERNEL); -+ if (sbi->s_group_desc == NULL) { -+ sbi->s_group_desc = vmalloc(size); -+ if (sbi->s_group_desc != NULL) -+ memset(sbi->s_group_desc, 0, size); -+ } - if (sbi->s_group_desc == NULL) { -- ext4_msg(sb, KERN_ERR, "not enough memory"); -+ ext4_msg(sb, KERN_ERR, "not enough memory for %u groups (%u)\n", -+ sbi->s_groups_count, (unsigned int) size); - goto failed_mount; - } - -@@ -2907,17 +2919,16 @@ no_journal: - if (IS_ERR(root)) { - ext4_msg(sb, KERN_ERR, "get root inode failed"); - ret = PTR_ERR(root); -+ root = NULL; - goto failed_mount4; - } - if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { -- iput(root); - ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); - goto failed_mount4; - } - sb->s_root = d_alloc_root(root); - if (!sb->s_root) { - ext4_msg(sb, KERN_ERR, "get root dentry failed"); -- iput(root); - ret = -ENOMEM; - goto failed_mount4; - } -@@ -2968,6 +2979,7 @@ no_journal: - if (err) { - ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)", - err); -+ ret = err; - goto failed_mount4; - } - -@@ -3011,6 +3023,8 @@ cantfind_ext4: - goto failed_mount; - - failed_mount4: -+ iput(root); -+ sb->s_root = NULL; - ext4_msg(sb, KERN_ERR, "mount failed"); - destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); - failed_mount_wq: -@@ -3033,7 +3047,11 @@ failed_mount3: - failed_mount2: - for (i = 0; i < db_count; i++) - brelse(sbi->s_group_desc[i]); -- kfree(sbi->s_group_desc); -+ -+ if (is_vmalloc_addr(sbi->s_group_desc)) -+ vfree(sbi->s_group_desc); -+ else -+ kfree(sbi->s_group_desc); - failed_mount: - if (sbi->s_proc) { - remove_proc_entry(sb->s_id, ext4_proc_root); -Index: linux-stage/fs/ext4/mballoc.c -=================================================================== ---- linux-stage.orig/fs/ext4/mballoc.c -+++ linux-stage/fs/ext4/mballoc.c -@@ -2607,10 +2607,21 @@ static int ext4_mb_init_backend(struct s - while (array_size < sizeof(*sbi->s_group_info) * - num_meta_group_infos_max) - array_size = array_size << 1; -- /* An 8TB filesystem with 64-bit pointers requires a 4096 byte -- * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. -- * So a two level scheme suffices for now. */ -- sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); -+ -+ /* -+ * A 16TB filesystem with 64-bit pointers requires an 8192 byte -+ * kmalloc(). Filesystems larger than 2^32 blocks (16TB normally) -+ * have group descriptors at least twice as large (64 bytes or -+ * more vs. 32 bytes for traditional ext3 filesystems, so a 128TB -+ * filesystem needs a 128kB allocation, which may need vmalloc(). -+ */ -+ sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); -+ if (sbi->s_group_info == NULL) { -+ sbi->s_group_info = vmalloc(array_size); -+ if (sbi->s_group_info != NULL) -+ memset(sbi->s_group_info, 0, array_size); -+ } -+ - if (sbi->s_group_info == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); - return -ENOMEM; -@@ -2620,6 +2631,11 @@ static int ext4_mb_init_backend(struct s - printk(KERN_ERR "EXT4-fs: can't get new inode\n"); - goto err_freesgi; - } -+ /* -+ * To avoid colliding with an valid on-disk inode number, -+ * EXT4_BAD_INO is used here as the number of the buddy cache inode. -+ */ -+ sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; - EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; - for (i = 0; i < ngroups; i++) { - desc = ext4_get_group_desc(sb, i, NULL); -@@ -2642,7 +2658,10 @@ err_freebuddy: - kfree(sbi->s_group_info[i]); - iput(sbi->s_buddy_cache); - err_freesgi: -- kfree(sbi->s_group_info); -+ if (is_vmalloc_addr(sbi->s_group_info)) -+ vfree(sbi->s_group_info); -+ else -+ kfree(sbi->s_group_info); - return -ENOMEM; - } - -@@ -2683,14 +2702,6 @@ int ext4_mb_init(struct super_block *sb, - i++; - } while (i <= sb->s_blocksize_bits + 1); - -- /* init file for buddy data */ -- ret = ext4_mb_init_backend(sb); -- if (ret != 0) { -- kfree(sbi->s_mb_offsets); -- kfree(sbi->s_mb_maxs); -- return ret; -- } -- - spin_lock_init(&sbi->s_md_lock); - spin_lock_init(&sbi->s_bal_lock); - -@@ -2717,6 +2728,14 @@ int ext4_mb_init(struct super_block *sb, - spin_lock_init(&lg->lg_prealloc_lock); - } - -+ /* init file for buddy data */ -+ ret = ext4_mb_init_backend(sb); -+ if (ret != 0) { -+ kfree(sbi->s_mb_offsets); -+ kfree(sbi->s_mb_maxs); -+ return ret; -+ } -+ - ext4_mb_history_init(sb); - - if (sbi->s_journal) -@@ -2766,7 +2785,10 @@ int ext4_mb_release(struct super_block * - EXT4_DESC_PER_BLOCK_BITS(sb); - for (i = 0; i < num_meta_group_infos; i++) - kfree(sbi->s_group_info[i]); -- kfree(sbi->s_group_info); -+ if (is_vmalloc_addr(sbi->s_group_info)) -+ vfree(sbi->s_group_info); -+ else -+ kfree(sbi->s_group_info); - } - kfree(sbi->s_mb_offsets); - kfree(sbi->s_mb_maxs); diff --git a/ldiskfs/kernel_patches/patches/ext4-wantedi-2.6-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-wantedi-2.6-rhel5.patch deleted file mode 100644 index 20c2c38..0000000 --- a/ldiskfs/kernel_patches/patches/ext4-wantedi-2.6-rhel5.patch +++ /dev/null @@ -1,84 +0,0 @@ -Index: linux-2.6.18-194.3.1/fs/ext4/namei.c -=================================================================== ---- linux-2.6.18-194.3.1.orig/fs/ext4/namei.c -+++ linux-2.6.18-194.3.1/fs/ext4/namei.c -@@ -148,6 +148,17 @@ struct dx_map_entry - u16 size; - }; - -+/* -+ * dentry_param used by ext4_new_inode_wantedi() -+ */ -+#define LVFS_DENTRY_PARAM_MAGIC 20070216UL -+struct lvfs_dentry_params -+{ -+ unsigned long ldp_inum; -+ unsigned long ldp_flags; -+ u32 ldp_magic; -+}; -+ - static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); - static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); - static inline unsigned dx_get_hash(struct dx_entry *entry); -@@ -1761,6 +1772,19 @@ static int ext4_add_nondir(handle_t *han - return err; - } - -+static unsigned ext4_dentry_goal(struct super_block *sb, struct dentry *dentry) -+{ -+ unsigned inum = EXT4_SB(sb)->s_inode_goal; -+ -+ if (dentry->d_fsdata != NULL) { -+ struct lvfs_dentry_params *param = dentry->d_fsdata; -+ -+ if (param->ldp_magic == LVFS_DENTRY_PARAM_MAGIC) -+ inum = param->ldp_inum; -+ } -+ return inum; -+} -+ - /* - * By the time this is called, we already have created - * the directory cache entry for the new file, but it -@@ -1786,7 +1810,8 @@ retry: - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - -- inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); -+ inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, -+ ext4_dentry_goal(dir->i_sb, dentry)); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &ext4_file_inode_operations; -@@ -1820,7 +1845,8 @@ retry: - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - -- inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); -+ inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, -+ ext4_dentry_goal(dir->i_sb, dentry)); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - init_special_inode(inode, inode->i_mode, rdev); -@@ -1857,8 +1883,8 @@ retry: - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - -- inode = ext4_new_inode(handle, dir, S_IFDIR | mode, -- &dentry->d_name, 0); -+ inode = ext4_new_inode(handle, dir, S_IFDIR | mode, &dentry->d_name, -+ ext4_dentry_goal(dir->i_sb, dentry)); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; -@@ -2270,8 +2296,8 @@ retry: - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - -- inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, -- &dentry->d_name, 0); -+ inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name, -+ ext4_dentry_goal(dir->i_sb, dentry)); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; diff --git a/ldiskfs/kernel_patches/patches/ext4_data_in_dirent.patch b/ldiskfs/kernel_patches/patches/ext4_data_in_dirent.patch deleted file mode 100644 index 42568cd..0000000 --- a/ldiskfs/kernel_patches/patches/ext4_data_in_dirent.patch +++ /dev/null @@ -1,546 +0,0 @@ -this patch implements feature which allows ext4 fs users (e.g. Lustre) -to store data in ext4 dirent. -data is stored in ext4 dirent after file-name, this space is accounted -in de->rec_len. flag EXT4_DIRENT_LUFID added to d_type if extra data -is present. - -make use of dentry->d_fsdata to pass fid to ext4. so no -changes in ext4_add_entry() interface required. - -Index: linux-stage/fs/ext4/dir.c -=================================================================== ---- linux-stage.orig/fs/ext4/dir.c -+++ linux-stage/fs/ext4/dir.c -@@ -53,11 +53,18 @@ const struct file_operations ext4_dir_op - - static unsigned char get_dtype(struct super_block *sb, int filetype) - { -+ int fl_index = filetype & EXT4_FT_MASK; -+ - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || -- (filetype >= EXT4_FT_MAX)) -+ (fl_index >= EXT4_FT_MAX)) - return DT_UNKNOWN; - -- return (ext4_filetype_table[filetype]); -+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_DIRDATA)) -+ return (ext4_filetype_table[fl_index]); -+ -+ return (ext4_filetype_table[fl_index]) | -+ (filetype & EXT4_DIRENT_LUFID); -+ - } - - -@@ -70,11 +77,11 @@ int ext4_check_dir_entry(const char *fun - const int rlen = ext4_rec_len_from_disk(de->rec_len, - dir->i_sb->s_blocksize); - -- if (rlen < EXT4_DIR_REC_LEN(1)) -+ if (rlen < __EXT4_DIR_REC_LEN(1)) - error_msg = "rec_len is smaller than minimal"; - else if (rlen % 4 != 0) - error_msg = "rec_len % 4 != 0"; -- else if (rlen < EXT4_DIR_REC_LEN(de->name_len)) -+ else if (rlen < EXT4_DIR_REC_LEN(de)) - error_msg = "rec_len is too small for name_len"; - else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) - error_msg = "directory entry across blocks"; -@@ -179,7 +186,7 @@ revalidate: - * failure will be detected in the - * dirent test below. */ - if (ext4_rec_len_from_disk(de->rec_len, -- sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) -+ sb->s_blocksize) < __EXT4_DIR_REC_LEN(1)) - break; - i += ext4_rec_len_from_disk(de->rec_len, - sb->s_blocksize); -@@ -342,12 +349,17 @@ int ext4_htree_store_dirent(struct file - struct fname *fname, *new_fn; - struct dir_private_info *info; - int len; -+ int extra_data = 1; - - info = (struct dir_private_info *) dir_file->private_data; - p = &info->root.rb_node; - - /* Create and allocate the fname structure */ -- len = sizeof(struct fname) + dirent->name_len + 1; -+ if (dirent->file_type & EXT4_DIRENT_LUFID) -+ extra_data = ext4_get_dirent_data_len(dirent); -+ -+ len = sizeof(struct fname) + dirent->name_len + extra_data; -+ - new_fn = kzalloc(len, GFP_KERNEL); - if (!new_fn) - return -ENOMEM; -@@ -356,7 +368,7 @@ int ext4_htree_store_dirent(struct file - new_fn->inode = le32_to_cpu(dirent->inode); - new_fn->name_len = dirent->name_len; - new_fn->file_type = dirent->file_type; -- memcpy(new_fn->name, dirent->name, dirent->name_len); -+ memcpy(new_fn->name, dirent->name, dirent->name_len + extra_data); - new_fn->name[dirent->name_len] = 0; - - while (*p) { -Index: linux-stage/fs/ext4/ext4.h -=================================================================== ---- linux-stage.orig/fs/ext4/ext4.h -+++ linux-stage/fs/ext4/ext4.h -@@ -1172,6 +1172,7 @@ static inline void ext4_clear_inode_stat - #define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 - #define EXT4_FEATURE_INCOMPAT_MMP 0x0100 - #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 -+#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 - - #define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR - #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ -@@ -1180,7 +1181,9 @@ static inline void ext4_clear_inode_stat - EXT4_FEATURE_INCOMPAT_EXTENTS| \ - EXT4_FEATURE_INCOMPAT_64BIT| \ - EXT4_FEATURE_INCOMPAT_FLEX_BG| \ -- EXT4_FEATURE_INCOMPAT_MMP) -+ EXT4_FEATURE_INCOMPAT_MMP| \ -+ EXT4_FEATURE_INCOMPAT_DIRDATA) -+ - #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ -@@ -1262,6 +1265,43 @@ struct ext4_dir_entry_2 { - #define EXT4_FT_SYMLINK 7 - - #define EXT4_FT_MAX 8 -+#define EXT4_FT_MASK 0xf -+ -+#if EXT4_FT_MAX > EXT4_FT_MASK -+#error "conflicting EXT4_FT_MAX and EXT4_FT_MASK" -+#endif -+ -+/* -+ * d_type has 4 unused bits, so it can hold four types data. these different -+ * type of data (e.g. lustre data, high 32 bits of 64-bit inode number) can be -+ * stored, in flag order, after file-name in ext4 dirent. -+*/ -+/* -+ * this flag is added to d_type if ext4 dirent has extra data after -+ * filename. this data length is variable and length is stored in first byte -+ * of data. data start after filename NUL byte. -+ * This is used by Lustre FS. -+ */ -+#define EXT4_DIRENT_LUFID 0x10 -+ -+#define EXT4_LUFID_MAGIC 0xAD200907UL -+struct ext4_dentry_param { -+ __u32 edp_magic; /* EXT4_LUFID_MAGIC */ -+ char edp_len; /* size of edp_data in bytes */ -+ char edp_data[0]; /* packed array of data */ -+} __attribute__((packed)); -+ -+static inline unsigned char *ext4_dentry_get_data(struct super_block *sb, -+ struct ext4_dentry_param* p) -+ -+{ -+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_DIRDATA)) -+ return NULL; -+ if (p && p->edp_magic == EXT4_LUFID_MAGIC) -+ return &p->edp_len; -+ else -+ return NULL; -+} - - /* - * EXT4_DIR_PAD defines the directory entries boundaries -@@ -1270,8 +1310,11 @@ struct ext4_dir_entry_2 { - */ - #define EXT4_DIR_PAD 4 - #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) --#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ -+#define __EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ - ~EXT4_DIR_ROUND) -+#define EXT4_DIR_REC_LEN(de) (__EXT4_DIR_REC_LEN(de->name_len +\ -+ ext4_get_dirent_data_len(de))) -+ - #define EXT4_MAX_REC_LEN ((1<<16)-1) - - static inline unsigned int -@@ -1611,7 +1654,7 @@ extern struct buffer_head * ext4_find_en - struct ext4_dir_entry_2 ** res_dir); - #define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir) - extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, -- struct inode *inode); -+ struct inode *inode, const void *, const void *); - extern int ext4_orphan_add(handle_t *, struct inode *); - extern int ext4_orphan_del(handle_t *, struct inode *); - extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, -@@ -1809,6 +1852,28 @@ static inline void ext4_update_i_disksiz - up_write(&EXT4_I(inode)->i_data_sem); - return ; - } -+/* -+ * Compute the total directory entry data length. -+ * This includes the filename and an implicit NUL terminator (always present), -+ * and optional extensions. Each extension has a bit set in the high 4 bits of -+ * de->file_type, and the extension length is the first byte in each entry. -+ */ -+ -+static inline int ext4_get_dirent_data_len(struct ext4_dir_entry_2 *de) -+{ -+ char *len = de->name + de->name_len + 1 /* NUL terminator */; -+ int dlen = 0; -+ __u8 extra_data_flags = (de->file_type & ~EXT4_FT_MASK) >> 4; -+ -+ while (extra_data_flags) { -+ if (extra_data_flags & 1) { -+ dlen += *len + (dlen == 0); -+ len += *len; -+ } -+ extra_data_flags >>= 1; -+ } -+ return dlen; -+} - - struct ext4_group_info { - unsigned long bb_state; -Index: linux-stage/fs/ext4/namei.c -=================================================================== ---- linux-stage.orig/fs/ext4/namei.c -+++ linux-stage/fs/ext4/namei.c -@@ -173,7 +173,8 @@ static unsigned dx_get_count(struct dx_e - static unsigned dx_get_limit(struct dx_entry *entries); - static void dx_set_count(struct dx_entry *entries, unsigned value); - static void dx_set_limit(struct dx_entry *entries, unsigned value); --static unsigned dx_root_limit(struct inode *dir, unsigned infosize); -+static inline unsigned dx_root_limit(__u32 blocksize, -+ struct ext4_dir_entry_2 *dot_de, unsigned infosize); - static unsigned dx_node_limit(struct inode *dir); - static struct dx_frame *dx_probe(const struct qstr *d_name, - struct inode *dir, -@@ -216,11 +217,12 @@ ext4_next_entry(struct ext4_dir_entry_2 - */ - struct dx_root_info * dx_get_dx_info(struct ext4_dir_entry_2 *de) - { -- /* get dotdot first */ -- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(1)); -+ BUG_ON(de->name_len != 1); -+ /* get dotdot first */ -+ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(de)); - -- /* dx root info is after dotdot entry */ -- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(2)); -+ /* dx root info is after dotdot entry */ -+ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(de)); - - return (struct dx_root_info *) de; - } -@@ -265,16 +267,23 @@ static inline void dx_set_limit(struct d - ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); - } - --static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) -+static inline unsigned dx_root_limit(__u32 blocksize, -+ struct ext4_dir_entry_2 *dot_de, unsigned infosize) - { -- unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - -- EXT4_DIR_REC_LEN(2) - infosize; -+ struct ext4_dir_entry_2 *dotdot_de; -+ unsigned entry_space; -+ -+ BUG_ON(dot_de->name_len != 1); -+ dotdot_de = ext4_next_entry(dot_de, blocksize); -+ entry_space = blocksize - EXT4_DIR_REC_LEN(dot_de) - -+ EXT4_DIR_REC_LEN(dotdot_de) - infosize; -+ - return entry_space / sizeof(struct dx_entry); - } - - static inline unsigned dx_node_limit(struct inode *dir) - { -- unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); -+ unsigned entry_space = dir->i_sb->s_blocksize - __EXT4_DIR_REC_LEN(0); - return entry_space / sizeof(struct dx_entry); - } - -@@ -321,7 +330,7 @@ static struct stats dx_show_leaf(struct - printk(":%x.%u ", h.hash, - ((char *) de - base)); - } -- space += EXT4_DIR_REC_LEN(de->name_len); -+ space += EXT4_DIR_REC_LEN(de); - names++; - } - de = ext4_next_entry(de, size); -@@ -424,7 +433,8 @@ dx_probe(const struct qstr *d_name, stru - - entries = (struct dx_entry *) (((char *)info) + info->info_length); - -- if (dx_get_limit(entries) != dx_root_limit(dir, -+ if (dx_get_limit(entries) != dx_root_limit(dir->i_sb->s_blocksize, -+ (struct ext4_dir_entry_2*)bh->b_data, - info->info_length)) { - ext4_warning(dir->i_sb, "dx entry: limit != root limit"); - brelse(bh); -@@ -480,14 +490,17 @@ dx_probe(const struct qstr *d_name, stru - if (!indirect--) return frame; - if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err))) - goto fail2; -- at = entries = ((struct dx_node *) bh->b_data)->entries; -+ entries = ((struct dx_node *) bh->b_data)->entries; - if (dx_get_limit(entries) != dx_node_limit (dir)) { - ext4_warning(dir->i_sb, -- "dx entry: limit != node limit"); -+ "block %u(%lu): limit %u != node limit %u", -+ dx_get_block(at), (long)bh->b_blocknr, -+ dx_get_limit(entries), dx_node_limit(dir)); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail2; - } -+ at = entries; - frame++; - frame->bh = NULL; - } -@@ -613,7 +626,7 @@ static int htree_dirblock_to_tree(struct - de = (struct ext4_dir_entry_2 *) bh->b_data; - top = (struct ext4_dir_entry_2 *) ((char *) de + - dir->i_sb->s_blocksize - -- EXT4_DIR_REC_LEN(0)); -+ __EXT4_DIR_REC_LEN(0)); - for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { - if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, - (block<i_sb)) -@@ -1025,7 +1038,7 @@ static struct buffer_head * ext4_dx_find - goto errout; - de = (struct ext4_dir_entry_2 *) bh->b_data; - top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - -- EXT4_DIR_REC_LEN(0)); -+ __EXT4_DIR_REC_LEN(0)); - for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) { - int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) - + ((char *) de - bh->b_data); -@@ -1186,7 +1199,7 @@ dx_move_dirents(char *from, char *to, st - while (count--) { - struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) - (from + (map->offs<<2)); -- rec_len = EXT4_DIR_REC_LEN(de->name_len); -+ rec_len = EXT4_DIR_REC_LEN(de); - memcpy (to, de, rec_len); - ((struct ext4_dir_entry_2 *) to)->rec_len = - ext4_rec_len_to_disk(rec_len, blocksize); -@@ -1210,7 +1223,7 @@ static struct ext4_dir_entry_2* dx_pack_ - while ((char*)de < base + blocksize) { - next = ext4_next_entry(de, blocksize); - if (de->inode && de->name_len) { -- rec_len = EXT4_DIR_REC_LEN(de->name_len); -+ rec_len = EXT4_DIR_REC_LEN(de); - if (de > to) - memmove(to, de, rec_len); - to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); -@@ -1340,10 +1353,16 @@ static int add_dirent_to_buf(handle_t *h - unsigned int offset = 0; - unsigned int blocksize = dir->i_sb->s_blocksize; - unsigned short reclen; -- int nlen, rlen, err; -+ int nlen, rlen, err, dlen = 0; -+ unsigned char *data; - char *top; - -- reclen = EXT4_DIR_REC_LEN(namelen); -+ data = ext4_dentry_get_data(inode->i_sb, (struct ext4_dentry_param *) -+ dentry->d_fsdata); -+ if (data) -+ dlen = (*data) + 1; -+ -+ reclen = __EXT4_DIR_REC_LEN(namelen + dlen); - if (!de) { - de = (struct ext4_dir_entry_2 *)bh->b_data; - top = bh->b_data + blocksize - reclen; -@@ -1353,7 +1372,7 @@ static int add_dirent_to_buf(handle_t *h - return -EIO; - if (ext4_match(namelen, name, de)) - return -EEXIST; -- nlen = EXT4_DIR_REC_LEN(de->name_len); -+ nlen = EXT4_DIR_REC_LEN(de); - rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); - if ((de->inode? rlen - nlen: rlen) >= reclen) - break; -@@ -1371,7 +1390,7 @@ static int add_dirent_to_buf(handle_t *h - } - - /* By now the buffer is marked for journaling */ -- nlen = EXT4_DIR_REC_LEN(de->name_len); -+ nlen = EXT4_DIR_REC_LEN(de); - rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); - if (de->inode) { - struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); -@@ -1387,6 +1406,12 @@ static int add_dirent_to_buf(handle_t *h - de->inode = 0; - de->name_len = namelen; - memcpy(de->name, name, namelen); -+ if (data) { -+ de->name[namelen] = 0; -+ memcpy(&de->name[namelen + 1], data, *(char *) data); -+ de->file_type |= EXT4_DIRENT_LUFID; -+ } -+ - /* - * XXX shouldn't update any times until successful - * completion of syscall, but too many callers depend -@@ -1485,7 +1510,8 @@ static int make_indexed_dir(handle_t *ha - - dx_set_block(entries, 1); - dx_set_count(entries, 1); -- dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info))); -+ dx_set_limit(entries, dx_root_limit(dir->i_sb->s_blocksize, -+ dot_de, sizeof(*dx_info))); - - /* Initialize as for dx_probe */ - hinfo.hash_version = dx_info->hash_version; -@@ -1516,6 +1542,8 @@ static int ext4_update_dotdot(handle_t * - struct buffer_head * dir_block; - struct ext4_dir_entry_2 * de; - int len, journal = 0, err = 0; -+ int dlen = 0; -+ char *data; - - if (IS_ERR(handle)) - return PTR_ERR(handle); -@@ -1531,19 +1559,24 @@ static int ext4_update_dotdot(handle_t * - /* the first item must be "." */ - assert(de->name_len == 1 && de->name[0] == '.'); - len = le16_to_cpu(de->rec_len); -- assert(len >= EXT4_DIR_REC_LEN(1)); -- if (len > EXT4_DIR_REC_LEN(1)) { -+ assert(len >= __EXT4_DIR_REC_LEN(1)); -+ if (len > __EXT4_DIR_REC_LEN(1)) { - BUFFER_TRACE(dir_block, "get_write_access"); - err = ext4_journal_get_write_access(handle, dir_block); - if (err) - goto out_journal; - - journal = 1; -- de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(1)); -+ de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de)); - } - -- len -= EXT4_DIR_REC_LEN(1); -- assert(len == 0 || len >= EXT4_DIR_REC_LEN(2)); -+ len -= EXT4_DIR_REC_LEN(de); -+ data = ext4_dentry_get_data(dir->i_sb, -+ (struct ext4_dentry_param *) dentry->d_fsdata); -+ if (data) -+ dlen = *data + 1; -+ assert(len == 0 || len >= __EXT4_DIR_REC_LEN(2 + dlen)); -+ - de = (struct ext4_dir_entry_2 *) - ((char *) de + le16_to_cpu(de->rec_len)); - if (!journal) { -@@ -1557,10 +1590,15 @@ static int ext4_update_dotdot(handle_t * - if (len > 0) - de->rec_len = cpu_to_le16(len); - else -- assert(le16_to_cpu(de->rec_len) >= EXT4_DIR_REC_LEN(2)); -+ assert(le16_to_cpu(de->rec_len) >= __EXT4_DIR_REC_LEN(2)); - de->name_len = 2; - strcpy (de->name, ".."); - ext4_set_de_type(dir->i_sb, de, S_IFDIR); -+ if (data) { -+ de->name[2] = 0; -+ memcpy(&de->name[2 + 1], data, dlen); -+ de->file_type |= EXT4_DIRENT_LUFID; -+ } - - out_journal: - if (journal) { -@@ -1982,12 +2020,13 @@ retry: - /* Initialize @inode as a subdirectory of @dir, and add the - * "." and ".." entries into the first directory block. */ - int ext4_add_dot_dotdot(handle_t *handle, struct inode * dir, -- struct inode *inode) -+ struct inode *inode, -+ const void *data1, const void *data2) - { - unsigned int blocksize = dir->i_sb->s_blocksize; - struct buffer_head * dir_block; - struct ext4_dir_entry_2 * de; -- int err = 0; -+ int err = 0, dot_reclen; - - if (IS_ERR(handle)) - return PTR_ERR(handle); -@@ -1999,28 +2038,42 @@ int ext4_add_dot_dotdot(handle_t *handle - inode->i_fop = &ext4_dir_operations; - inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; - dir_block = ext4_bread(handle, inode, 0, 1, &err); -- if (!dir_block) { -- clear_nlink(inode); -- ext4_mark_inode_dirty(handle, inode); -- iput (inode); -+ if (!dir_block) - goto get_out; -- } -+ - BUFFER_TRACE(dir_block, "get_write_access"); - ext4_journal_get_write_access(handle, dir_block); - de = (struct ext4_dir_entry_2 *) dir_block->b_data; - de->inode = cpu_to_le32(inode->i_ino); - de->name_len = 1; -- de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), -- blocksize); - strcpy(de->name, "."); - ext4_set_de_type(dir->i_sb, de, S_IFDIR); -+ /* get packed fid data*/ -+ data1 = ext4_dentry_get_data(dir->i_sb, -+ (struct ext4_dentry_param *) data1); -+ if (data1) { -+ de->name[1] = 0; -+ memcpy(&de->name[2], data1, *(char *) data1); -+ de->file_type |= EXT4_DIRENT_LUFID; -+ } -+ de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de)); -+ dot_reclen = cpu_to_le16(de->rec_len); -+ - de = ext4_next_entry(de, blocksize); - de->inode = cpu_to_le32(dir->i_ino); -- de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1), -+ de->rec_len = ext4_rec_len_to_disk(blocksize - dot_reclen, - blocksize); - de->name_len = 2; - strcpy(de->name, ".."); - ext4_set_de_type(dir->i_sb, de, S_IFDIR); -+ data2 = ext4_dentry_get_data(dir->i_sb, -+ (struct ext4_dentry_param *) data2); -+ if (data2) { -+ de->name[2] = 0; -+ memcpy(&de->name[3], data2, *(char *) data2); -+ de->file_type |= EXT4_DIRENT_LUFID; -+ } -+ - inode->i_nlink = 2; - BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, dir, dir_block); -@@ -2057,9 +2110,14 @@ retry: - if (IS_ERR(inode)) - goto out_stop; - -- err = ext4_add_dot_dotdot(handle, dir, inode); -- if (err) -+ err = ext4_add_dot_dotdot(handle, dir, inode, NULL, NULL); -+ if (err) { -+ clear_nlink(inode); -+ unlock_new_inode(inode); -+ ext4_mark_inode_dirty(handle, inode); -+ iput (inode); - goto out_stop; -+ } - - err = ext4_add_entry(handle, dentry, inode); - if (err) { -@@ -2093,7 +2151,7 @@ static int empty_dir(struct inode *inode - int err = 0; - - sb = inode->i_sb; -- if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || -+ if (inode->i_size < __EXT4_DIR_REC_LEN(1) + __EXT4_DIR_REC_LEN(2) || - !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { - if (err) - ext4_error(inode->i_sb, diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series deleted file mode 100644 index abf7009..0000000 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series +++ /dev/null @@ -1,41 +0,0 @@ -ext4-version-2.6-rhel5.patch -ext4-wantedi-2.6-rhel5.patch -ext4-map_inode_page-2.6.18-rhel5.patch -export-ext4-2.6-rhel5.patch -ext4-remove-cond_resched-calls-rhel5.patch -ext4-nlink-2.6-rhel5.patch -ext4-inode-version-rhel5.patch -ext4-mmp-rhel5.patch -ext4-lookup-dotdot-rhel5.patch -ext4-max-dir-size-rhel5.patch -ext4-print-inum-in-htree-warning-rhel5.patch -ext4-xattr-no-update-ctime-rhel5.patch -ext4-prealloc-rhel5.patch -ext4-mballoc-extra-checks-rhel5.patch -ext4-misc-rhel5.patch -ext4-big-endian-check-2.6-rhel5.patch -ext4-alloc-policy-2.6-rhel5.patch -ext4-force_over_128tb-rhel5.patch -ext4-pdir-fix.patch -ext4-osd-iop-common.patch -ext4-osd-iam-exports.patch -ext4-dynlocks-common.patch -ext4-dynlocks-2.6-rhel5.patch -ext4-hash-indexed-dir-dotdot-update-rhel5.patch -ext4-ext_generation-sles11.patch -ext4-kill-dx_root.patch -ext4-fiemap-2.6-rhel5.patch -ext4-mballoc-pa_free-mismatch.patch -ext4_data_in_dirent.patch -ext4-large-eas.patch -ext4-disable-mb-cache-rhel5.patch -ext4-disable-delalloc-rhel5.patch -ext4-back-dquot-to-rhel54.patch -ext4-nocmtime-2.6-rhel5.patch -ext4-failed-mount-b23368.patch -ext4-export-64bit-name-hash.patch -ext4-vmalloc-rhel5.patch -ext4-mballoc-group_check-rhel5.patch -ext4-journal-callback-rhel5.patch -ext4-store-tree-generation-at-find.patch -ext4-quota-minimal-rhel5.patch diff --git a/lustre/kernel_patches/patches/blkdev_tunables-2.6-rhel5.patch b/lustre/kernel_patches/patches/blkdev_tunables-2.6-rhel5.patch deleted file mode 100644 index 3874794..0000000 --- a/lustre/kernel_patches/patches/blkdev_tunables-2.6-rhel5.patch +++ /dev/null @@ -1,44 +0,0 @@ -Index: linux-2.6.18-164.11.1/include/linux/blkdev.h -=================================================================== ---- linux-2.6.18-164.11.1.orig/include/linux/blkdev.h -+++ linux-2.6.18-164.11.1/include/linux/blkdev.h -@@ -788,10 +788,10 @@ extern void blk_free_tags(struct blk_que - extern void blk_rq_bio_prep(request_queue_t *, struct request *, struct bio *); - extern int blkdev_issue_flush(struct block_device *, sector_t *); - --#define MAX_PHYS_SEGMENTS 128 --#define MAX_HW_SEGMENTS 128 -+#define MAX_PHYS_SEGMENTS 256 -+#define MAX_HW_SEGMENTS 256 - #define SAFE_MAX_SECTORS 255 --#define BLK_DEF_MAX_SECTORS 1024 -+#define BLK_DEF_MAX_SECTORS 2048 - - #define MAX_SEGMENT_SIZE 65536 - -Index: linux-2.6.18-164.11.1/include/scsi/scsi_host.h -=================================================================== ---- linux-2.6.18-164.11.1.orig/include/scsi/scsi_host.h -+++ linux-2.6.18-164.11.1/include/scsi/scsi_host.h -@@ -30,7 +30,7 @@ struct blk_queue_tags; - * used in one scatter-gather request. - */ - #define SG_NONE 0 --#define SG_ALL 0xff -+#define SG_ALL 256 - - - #define DISABLE_CLUSTERING 0 -Index: linux-2.6.18-164.11.1/drivers/scsi/lpfc/lpfc.h -=================================================================== ---- linux-2.6.18-164.11.1.orig/drivers/scsi/lpfc/lpfc.h -+++ linux-2.6.18-164.11.1/drivers/scsi/lpfc/lpfc.h -@@ -38,7 +38,7 @@ - #define LPFC_MAX_NS_RETRY 3 /* Number of retry attempts to contact - the NameServer before giving up. */ - #define LPFC_CMD_PER_LUN 3 /* max outstanding cmds per lun */ --#define LPFC_DEFAULT_SG_SEG_CNT 64 /* sg element count per scsi cmnd */ -+#define LPFC_DEFAULT_SG_SEG_CNT 256 /* sg element count per scsi cmnd */ - #define LPFC_MAX_SG_SEG_CNT 256 /* sg element count per scsi cmnd */ - #define LPFC_IOCB_LIST_CNT 2250 /* list of IOCBs for fast-path usage. */ - #define LPFC_Q_RAMP_UP_INTERVAL 120 /* lun q_depth ramp up interval */ diff --git a/lustre/kernel_patches/patches/dev_read_only-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/dev_read_only-2.6.18-vanilla.patch deleted file mode 100644 index a12fb3f..0000000 --- a/lustre/kernel_patches/patches/dev_read_only-2.6.18-vanilla.patch +++ /dev/null @@ -1,166 +0,0 @@ -This patch is no longer needed for Lustre. It is only included -for testing and ease of using the same kernel with older Lustre -versions. This testing functionality was replaced in Linux 3.0 -by the dm-flakey driver. - -This functionality is mainly used during testing, in order to -simulate a server crash for ldiskfs by discarding all of the -writes to the filesystem. For recovery testing we could simulate -this by using a special loopback or DM device that also discards -writes to the device. - -This functionality is also used by target "failback" in order -to speed up service shutdown and takeover by the other node -during controlled operation. However, it would also be possible -to do this by simply allowing all of the in-flight requests to -complete and then waiting for the service to stop. This will -also be needed by the DMU-OSD, because discarding of writes on -a DMU-based target is not safe as it could trigger a storage -failure if the data is ever read from disk again and the -checksum does not match that expected by the block pointer. - -Index: linux-2.6.18.1/block/ll_rw_blk.c -=================================================================== ---- linux-2.6.18.1.orig/block/ll_rw_blk.c -+++ linux-2.6.18.1/block/ll_rw_blk.c -@@ -3067,6 +3067,8 @@ static void handle_bad_sector(struct bio - set_bit(BIO_EOF, &bio->bi_flags); - } - -+int dev_check_rdonly(struct block_device *bdev); -+ - /** - * generic_make_request: hand a buffer to its device driver for I/O - * @bio: The bio describing the location in memory and on the device. -@@ -3151,6 +3153,12 @@ end_io: - - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) - goto end_io; - -+ /* this is cfs's dev_rdonly check */ -+ if (bio_rw(bio) == WRITE && dev_check_rdonly(bio->bi_bdev)) { -+ bio_endio(bio, bio->bi_size, 0); -+ break; -+ } -+ - /* - * If this device has partitions, remap block n -@@ -3765,6 +3773,91 @@ void swap_io_context(struct io_context * - *ioc2 = temp; - } - EXPORT_SYMBOL(swap_io_context); -+ /* -+ * Debug code for turning block devices "read-only" (will discard writes -+ * silently). This is for filesystem crash/recovery testing. -+ */ -+struct deventry { -+ dev_t dev; -+ struct deventry *next; -+}; -+ -+static struct deventry *devlist = NULL; -+static spinlock_t devlock = SPIN_LOCK_UNLOCKED; -+ -+int dev_check_rdonly(struct block_device *bdev) -+{ -+ struct deventry *cur; -+ if (!bdev) return 0; -+ spin_lock(&devlock); -+ cur = devlist; -+ while(cur) { -+ if (bdev->bd_dev == cur->dev) { -+ spin_unlock(&devlock); -+ return 1; -+ } -+ cur = cur->next; -+ } -+ spin_unlock(&devlock); -+ return 0; -+} -+ -+void dev_set_rdonly(struct block_device *bdev) -+{ -+ struct deventry *newdev, *cur; -+ -+ if (!bdev) -+ return; -+ newdev = kmalloc(sizeof(struct deventry), GFP_KERNEL); -+ if (!newdev) -+ return; -+ -+ spin_lock(&devlock); -+ cur = devlist; -+ while(cur) { -+ if (bdev->bd_dev == cur->dev) { -+ spin_unlock(&devlock); -+ kfree(newdev); -+ return; -+ } -+ cur = cur->next; -+ } -+ newdev->dev = bdev->bd_dev; -+ newdev->next = devlist; -+ devlist = newdev; -+ spin_unlock(&devlock); -+ printk(KERN_WARNING "Turning device %s (%#x) read-only\n", -+ bdev->bd_disk ? bdev->bd_disk->disk_name : "", bdev->bd_dev); -+} -+ -+void dev_clear_rdonly(struct block_device *bdev) -+{ -+ struct deventry *cur, *last = NULL; -+ if (!bdev) return; -+ spin_lock(&devlock); -+ cur = devlist; -+ while(cur) { -+ if (bdev->bd_dev == cur->dev) { -+ if (last) -+ last->next = cur->next; -+ else -+ devlist = cur->next; -+ spin_unlock(&devlock); -+ kfree(cur); -+ printk(KERN_WARNING "Removing read-only on %s (%#x)\n", -+ bdev->bd_disk ? bdev->bd_disk->disk_name : -+ "unknown block", bdev->bd_dev); -+ return; -+ } -+ last = cur; -+ cur = cur->next; -+ } -+ spin_unlock(&devlock); -+} -+ -+EXPORT_SYMBOL(dev_set_rdonly); -+EXPORT_SYMBOL(dev_clear_rdonly); -+EXPORT_SYMBOL(dev_check_rdonly); - - /* - * sysfs parts below -Index: linux-2.6.18.1/fs/block_dev.c -=================================================================== ---- linux-2.6.18.1.orig/fs/block_dev.c -+++ linux-2.6.18.1/fs/block_dev.c -@@ -1059,6 +1059,7 @@ static int __blkdev_put(struct block_dev - if (bdev != bdev->bd_contains) - victim = bdev->bd_contains; - bdev->bd_contains = NULL; -+ dev_clear_rdonly(bdev); - } - unlock_kernel(); - mutex_unlock(&bdev->bd_mutex); -Index: linux-2.6.18.1/include/linux/fs.h -=================================================================== ---- linux-2.6.18.1.orig/include/linux/fs.h -+++ linux-2.6.18.1/include/linux/fs.h -@@ -1685,6 +1685,10 @@ extern void file_kill(struct file *f); - struct bio; - extern void submit_bio(int, struct bio *); - extern int bdev_read_only(struct block_device *); -+#define HAVE_CLEAR_RDONLY_ON_PUT -+void dev_set_rdonly(struct block_device *bdev); -+int dev_check_rdonly(struct block_device *bdev); -+void dev_clear_rdonly(struct block_device *bdev); - extern int set_blocksize(struct block_device *, int); - extern int sb_set_blocksize(struct super_block *, int); - extern int sb_min_blocksize(struct super_block *, int); diff --git a/lustre/kernel_patches/patches/export-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/export-2.6.18-vanilla.patch deleted file mode 100644 index 9727ea4..0000000 --- a/lustre/kernel_patches/patches/export-2.6.18-vanilla.patch +++ /dev/null @@ -1,17 +0,0 @@ -Allow starting the commit of a journal transaction, without waiting for -it to complete. This is a performance enhancement for OST IO so that -the journal commit can run concurrently with the file IO. It isn't -necessary if the client can handle bulk IO recovery (bug 16919). - -Index: linux-2.6/fs/jbd/journal.c -=================================================================== ---- linux-2.6.orig/fs/jbd/journal.c 2006-07-15 16:13:50.000000000 +0800 -+++ linux-2.6/fs/jbd/journal.c 2006-07-15 16:22:04.000000000 +0800 -@@ -74,6 +74,7 @@ EXPORT_SYMBOL(journal_abort); - EXPORT_SYMBOL(journal_errno); - EXPORT_SYMBOL(journal_ack_err); - EXPORT_SYMBOL(journal_clear_err); -+EXPORT_SYMBOL(log_start_commit); - EXPORT_SYMBOL(log_wait_commit); - EXPORT_SYMBOL(journal_start_commit); - EXPORT_SYMBOL(journal_force_commit_nested); diff --git a/lustre/kernel_patches/patches/export_symbol_numa-2.6-fc5.patch b/lustre/kernel_patches/patches/export_symbol_numa-2.6-fc5.patch deleted file mode 100644 index 095c1de..0000000 --- a/lustre/kernel_patches/patches/export_symbol_numa-2.6-fc5.patch +++ /dev/null @@ -1,12 +0,0 @@ -Index: linux-2.6.16.i686/arch/i386/kernel/smpboot.c -=================================================================== ---- linux-2.6.16.i686.orig/arch/i386/kernel/smpboot.c 2006-05-30 15:47:03.000000000 +0800 -+++ linux-2.6.16.i686/arch/i386/kernel/smpboot.c 2006-05-30 21:22:02.000000000 +0800 -@@ -579,6 +579,7 @@ - /* which logical CPUs are on which nodes */ - cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly = - { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE }; -+EXPORT_SYMBOL(node_2_cpu_mask); - /* which node each logical CPU is on */ - int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; - EXPORT_SYMBOL(cpu_2_node); diff --git a/lustre/kernel_patches/patches/export_symbols-2.6.12.patch b/lustre/kernel_patches/patches/export_symbols-2.6.12.patch deleted file mode 100644 index 26f02c2..0000000 --- a/lustre/kernel_patches/patches/export_symbols-2.6.12.patch +++ /dev/null @@ -1,15 +0,0 @@ -This patch is not needed for 2.x, but is kept to allow the same kernel -to be used between 1.8.x and 2.0.x for ease of upgrade. - -Index: linux-2.6.12-rc6/fs/dcache.c -=================================================================== ---- linux-2.6.12-rc6.orig/fs/dcache.c 2005-06-14 15:53:19.812195198 +0200 -+++ linux-2.6.12-rc6/fs/dcache.c 2005-06-14 15:53:58.385436913 +0200 -@@ -1581,6 +1581,7 @@ - - return result; - } -+EXPORT_SYMBOL(is_subdir); - - void d_genocide(struct dentry *root) - { diff --git a/lustre/kernel_patches/patches/fix-forever-in-do_get_write_access.patch b/lustre/kernel_patches/patches/fix-forever-in-do_get_write_access.patch deleted file mode 100644 index 57d12ee..0000000 --- a/lustre/kernel_patches/patches/fix-forever-in-do_get_write_access.patch +++ /dev/null @@ -1,41 +0,0 @@ -commit 229309caebe4508d650bb6d8f7d51f2b116f5bbd -Author: Jan Kara -Date: Sun May 8 19:09:53 2011 -0400 - -jbd2: Fix forever sleeping process in do_get_write_access() - -In do_get_write_access() we wait on BH_Unshadow bit for buffer to get -from shadow state. The waking code in journal_commit_transaction() has -a bug because it does not issue a memory barrier after the buffer is -moved from the shadow state and before wake_up_bit() is called. Thus a -waitqueue check can happen before the buffer is actually moved from -the shadow state and waiting process may never be woken. Fix the -problem by issuing proper barrier. - -Reported-by: Tao Ma -Signed-off-by: Jan Kara -Signed-off-by: "Theodore Ts'o" ---- - fs/jbd2/commit.c | 9 +++++++-- - 1 files changed, 7 insertions(+), 2 deletions(-) - -Index: linux-2.6.18.4/fs/jbd2/commit.c -=================================================================== ---- linux-2.6.18.4.orig/fs/jbd2/commit.c -+++ linux-2.6.18.4/fs/jbd2/commit.c -@@ -788,8 +788,13 @@ wait_for_iobuf: - required. */ - JBUFFER_TRACE(jh, "file as BJ_Forget"); - jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); -- /* Wake up any transactions which were waiting for this -- IO to complete */ -+ /* -+ * Wake up any transactions which were waiting for this IO to -+ * complete. The barrier must be here so that changes by -+ * jbd2_journal_file_buffer() take effect before wake_up_bit() -+ * does the waitqueue check. -+ */ -+ smp_mb(); - wake_up_bit(&bh->b_state, BH_Unshadow); - JBUFFER_TRACE(jh, "brelse shadowed buffer"); - __brelse(bh); diff --git a/lustre/kernel_patches/patches/jbd-jcberr-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/jbd-jcberr-2.6.18-vanilla.patch deleted file mode 100644 index 0276d59..0000000 --- a/lustre/kernel_patches/patches/jbd-jcberr-2.6.18-vanilla.patch +++ /dev/null @@ -1,238 +0,0 @@ -Implement a JBD per-transaction commit callback. Users can attach arbitrary -callbacks to a journal handle, which are propagated to the transaction at -journal handle stop time. The commit callbacks are run when the transaction -is finished commit, and will be passed a non-zero error code if there was -a commit error. - -Signed-off-by: Andreas Dilger - - -Index: linux-2.6/include/linux/jbd.h -=================================================================== ---- linux-2.6.orig/include/linux/jbd.h 2006-07-15 16:08:35.000000000 +0800 -+++ linux-2.6/include/linux/jbd.h 2006-07-15 16:13:01.000000000 +0800 -@@ -356,6 +356,27 @@ static inline void jbd_unlock_bh_journal - bit_spin_unlock(BH_JournalHead, &bh->b_state); - } - -+#define HAVE_JOURNAL_CALLBACK_STATUS -+/** -+ * struct journal_callback - Base structure for callback information -+ * @jcb_list: list information for other callbacks attached to the same handle -+ * @jcb_func: Function to call with this callback structure -+ * -+ * This struct is a 'seed' structure for a using with your own callback -+ * structs. If you are using callbacks you must allocate one of these -+ * or another struct of your own definition which has this struct -+ * as it's first element and pass it to journal_callback_set(). -+ * -+ * This is used internally by jbd to maintain callback information. -+ * -+ * See journal_callback_set for more information. -+ **/ -+struct journal_callback { -+ struct list_head jcb_list; /* t_jcb_lock */ -+ void (*jcb_func)(struct journal_callback *jcb, int error); -+ /* caller data goes here */ -+}; -+ - struct jbd_revoke_table_s; - - /** -@@ -364,6 +385,7 @@ struct jbd_revoke_table_s; - * @h_transaction: Which compound transaction is this update a part of? - * @h_buffer_credits: Number of remaining buffers we are allowed to dirty. - * @h_ref: Reference count on this handle -+ * @h_jcb: List of application registered callbacks for this handle. - * @h_err: Field for caller's use to track errors through large fs operations - * @h_sync: flag for sync-on-close - * @h_jdata: flag to force data journaling -@@ -389,6 +411,13 @@ struct handle_s - /* operations */ - int h_err; - -+ /* -+ * List of application registered callbacks for this handle. The -+ * function(s) will be called after the transaction that this handle is -+ * part of has been committed to disk. [t_jcb_lock] -+ */ -+ struct list_head h_jcb; -+ - /* Flags [no locking] */ - unsigned int h_sync: 1; /* sync-on-close */ - unsigned int h_jdata: 1; /* force data journaling */ -@@ -430,6 +459,8 @@ struct handle_s - * j_state_lock - * ->j_list_lock (journal_unmap_buffer) - * -+ * t_handle_lock -+ * ->t_jcb_lock - */ - - struct transaction_s -@@ -559,6 +590,15 @@ struct transaction_s - */ - int t_handle_count; - -+ /* -+ * Protects the callback list -+ */ -+ spinlock_t t_jcb_lock; -+ /* -+ * List of registered callback functions for this transaction. -+ * Called when the transaction is committed. [t_jcb_lock] -+ */ -+ struct list_head t_jcb; - }; - - /** -@@ -906,6 +946,10 @@ extern void journal_invalidatepage(jour - extern int journal_try_to_free_buffers(journal_t *, struct page *, gfp_t); - extern int journal_stop(handle_t *); - extern int journal_flush (journal_t *); -+extern void journal_callback_set(handle_t *handle, -+ void (*fn)(struct journal_callback *,int), -+ struct journal_callback *jcb); -+ - extern void journal_lock_updates (journal_t *); - extern void journal_unlock_updates (journal_t *); - -Index: linux-2.6/fs/jbd/checkpoint.c -=================================================================== ---- linux-2.6.orig/fs/jbd/checkpoint.c 2006-07-15 16:08:36.000000000 +0800 -+++ linux-2.6/fs/jbd/checkpoint.c 2006-07-15 16:13:01.000000000 +0800 -@@ -688,6 +688,7 @@ void __journal_drop_transaction(journal_ - J_ASSERT(transaction->t_checkpoint_list == NULL); - J_ASSERT(transaction->t_checkpoint_io_list == NULL); - J_ASSERT(transaction->t_updates == 0); -+ J_ASSERT(list_empty(&transaction->t_jcb)); - J_ASSERT(journal->j_committing_transaction != transaction); - J_ASSERT(journal->j_running_transaction != transaction); - -Index: linux-2.6/fs/jbd/commit.c -=================================================================== ---- linux-2.6.orig/fs/jbd/commit.c 2006-07-15 16:08:36.000000000 +0800 -+++ linux-2.6/fs/jbd/commit.c 2006-07-15 16:13:01.000000000 +0800 -@@ -708,6 +708,32 @@ wait_for_iobuf: - transaction can be removed from any checkpoint list it was on - before. */ - -+ /* -+ * Call any callbacks that had been registered for handles in this -+ * transaction. It is up to the callback to free any allocated -+ * memory. -+ * -+ * Locking not strictly required, since this is the only process -+ * touching this transaction anymore, but is done to keep code -+ * checkers happy and has no contention in any case. -+ */ -+ spin_lock(&commit_transaction->t_jcb_lock); -+ if (!list_empty(&commit_transaction->t_jcb)) { -+ struct list_head *p, *n; -+ int error = is_journal_aborted(journal); -+ -+ list_for_each_safe(p, n, &commit_transaction->t_jcb) { -+ struct journal_callback *jcb; -+ -+ jcb = list_entry(p, struct journal_callback, jcb_list); -+ list_del_init(p); -+ spin_unlock(&commit_transaction->t_jcb_lock); -+ jcb->jcb_func(jcb, error); -+ spin_lock(&commit_transaction->t_jcb_lock); -+ } -+ } -+ spin_unlock(&commit_transaction->t_jcb_lock); -+ - jbd_debug(3, "JBD: commit phase 7\n"); - - J_ASSERT(commit_transaction->t_sync_datalist == NULL); -Index: linux-2.6/fs/jbd/journal.c -=================================================================== ---- linux-2.6.orig/fs/jbd/journal.c 2006-07-15 16:08:36.000000000 +0800 -+++ linux-2.6/fs/jbd/journal.c 2006-07-15 16:13:01.000000000 +0800 -@@ -58,6 +58,7 @@ EXPORT_SYMBOL(journal_sync_buffer); - #endif - EXPORT_SYMBOL(journal_flush); - EXPORT_SYMBOL(journal_revoke); -+EXPORT_SYMBOL(journal_callback_set); - - EXPORT_SYMBOL(journal_init_dev); - EXPORT_SYMBOL(journal_init_inode); -@@ -80,6 +81,7 @@ EXPORT_SYMBOL(journal_wipe); - EXPORT_SYMBOL(journal_blocks_per_page); - EXPORT_SYMBOL(journal_invalidatepage); - EXPORT_SYMBOL(journal_try_to_free_buffers); -+EXPORT_SYMBOL(journal_bmap); - EXPORT_SYMBOL(journal_force_commit); - - static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); -Index: linux-2.6/fs/jbd/transaction.c -=================================================================== ---- linux-2.6.orig/fs/jbd/transaction.c 2006-07-15 16:08:35.000000000 +0800 -+++ linux-2.6/fs/jbd/transaction.c 2006-07-15 16:13:01.000000000 +0800 -@@ -50,7 +50,9 @@ get_transaction(journal_t *journal, tran - transaction->t_state = T_RUNNING; - transaction->t_tid = journal->j_transaction_sequence++; - transaction->t_expires = jiffies + journal->j_commit_interval; -+ INIT_LIST_HEAD(&transaction->t_jcb); - spin_lock_init(&transaction->t_handle_lock); -+ spin_lock_init(&transaction->t_jcb_lock); - - /* Set up the commit timer for the new transaction. */ - journal->j_commit_timer.expires = transaction->t_expires; -@@ -241,6 +243,7 @@ static handle_t *new_handle(int nblocks) - memset(handle, 0, sizeof(*handle)); - handle->h_buffer_credits = nblocks; - handle->h_ref = 1; -+ INIT_LIST_HEAD(&handle->h_jcb); - - return handle; - } -@@ -1291,6 +1294,35 @@ drop: - } - - /** -+ * void journal_callback_set() - Register a callback function for this handle. -+ * @handle: handle to attach the callback to. -+ * @func: function to callback. -+ * @jcb: structure with additional information required by func() , and -+ * some space for jbd internal information. -+ * -+ * The function will be called when the transaction that this handle is -+ * part of has been committed to disk with the original callback data -+ * struct and the error status of the journal as parameters. There is no -+ * guarantee of ordering between handles within a single transaction, nor -+ * between callbacks registered on the same handle. -+ * -+ * The caller is responsible for allocating the journal_callback struct. -+ * This is to allow the caller to add as much extra data to the callback -+ * as needed, but reduce the overhead of multiple allocations. The caller -+ * allocated struct must start with a struct journal_callback at offset 0, -+ * and has the caller-specific data afterwards. -+ */ -+void journal_callback_set(handle_t *handle, -+ void (*func)(struct journal_callback *jcb, int error), -+ struct journal_callback *jcb) -+{ -+ jcb->jcb_func = func; -+ spin_lock(&handle->h_transaction->t_jcb_lock); -+ list_add_tail(&jcb->jcb_list, &handle->h_jcb); -+ spin_unlock(&handle->h_transaction->t_jcb_lock); -+} -+ -+/** - * int journal_stop() - complete a transaction - * @handle: tranaction to complete. - * -@@ -1363,6 +1396,11 @@ int journal_stop(handle_t *handle) - wake_up(&journal->j_wait_transaction_locked); - } - -+ /* Move callbacks from the handle to the transaction. */ -+ spin_lock(&transaction->t_jcb_lock); -+ list_splice(&handle->h_jcb, &transaction->t_jcb); -+ spin_unlock(&transaction->t_jcb_lock); -+ - /* - * If the handle is marked SYNC, we need to set another commit - * going! We also want to force a commit if the current diff --git a/lustre/kernel_patches/patches/jbd-journal-chksum-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/jbd-journal-chksum-2.6.18-vanilla.patch deleted file mode 100644 index 5f26c0b..0000000 --- a/lustre/kernel_patches/patches/jbd-journal-chksum-2.6.18-vanilla.patch +++ /dev/null @@ -1,637 +0,0 @@ -Index: linux-2.6.18-128.1.6/fs/jbd/commit.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/jbd/commit.c 2009-06-02 23:24:00.000000000 -0600 -+++ linux-2.6.18-128.1.6/fs/jbd/commit.c 2009-06-02 23:26:07.000000000 -0600 -@@ -22,6 +22,7 @@ - #include - #include - #include -+#include - - - /* -@@ -95,19 +96,23 @@ - return 1; - } - --/* Done it all: now write the commit record. We should have -+/* -+ * Done it all: now submit the commit record. We should have - * cleaned up our previous buffers by now, so if we are in abort - * mode we can now just skip the rest of the journal write - * entirely. - * - * Returns 1 if the journal needs to be aborted or 0 on success - */ --static int journal_write_commit_record(journal_t *journal, -- transaction_t *commit_transaction) -+static int journal_submit_commit_record(journal_t *journal, -+ transaction_t *commit_transaction, -+ struct buffer_head **cbh, -+ __u32 crc32_sum) - { - struct journal_head *descriptor; -+ struct commit_header *tmp; - struct buffer_head *bh; -- int i, ret; -+ int ret; - int barrier_done = 0; - - if (is_journal_aborted(journal)) -@@ -119,21 +124,34 @@ - - bh = jh2bh(descriptor); - -- /* AKPM: buglet - add `i' to tmp! */ -- for (i = 0; i < bh->b_size; i += 512) { -- journal_header_t *tmp = (journal_header_t*)bh->b_data; -- tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); -- tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); -- tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); -+ tmp = (struct commit_header *)bh->b_data; -+ tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); -+ tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); -+ tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); -+ -+ if (JFS_HAS_COMPAT_FEATURE(journal, -+ JFS_FEATURE_COMPAT_CHECKSUM)) { -+ tmp->h_chksum_type = JFS_CRC32_CHKSUM; -+ tmp->h_chksum_size = JFS_CRC32_CHKSUM_SIZE; -+ tmp->h_chksum[0] = cpu_to_be32(crc32_sum); - } - -- JBUFFER_TRACE(descriptor, "write commit block"); -+ JBUFFER_TRACE(descriptor, "submit commit block"); -+ lock_buffer(bh); -+ - set_buffer_dirty(bh); -- if (journal->j_flags & JFS_BARRIER) { -+ set_buffer_uptodate(bh); -+ bh->b_end_io = journal_end_buffer_io_sync; -+ -+ if (journal->j_flags & JFS_BARRIER && -+ !JFS_HAS_INCOMPAT_FEATURE(journal, -+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) { -+ - set_buffer_ordered(bh); - barrier_done = 1; - } -- ret = sync_dirty_buffer(bh); -+ ret = submit_bh(WRITE, bh); -+ - /* is it possible for another commit to fail at roughly - * the same time as this one? If so, we don't want to - * trust the barrier flag in the super, but instead want -@@ -154,12 +172,70 @@ - clear_buffer_ordered(bh); - set_buffer_uptodate(bh); - set_buffer_dirty(bh); -- ret = sync_dirty_buffer(bh); -+ ret = submit_bh(WRITE, bh); - } -- put_bh(bh); /* One for getblk() */ -- journal_put_journal_head(descriptor); -+ *cbh = bh; -+ return ret; -+} -+ -+/* -+ * This function along with journal_submit_commit_record -+ * allows to write the commit record asynchronously. -+ */ -+static int journal_wait_on_commit_record(struct buffer_head *bh) -+{ -+ int ret = 0; -+ -+ clear_buffer_dirty(bh); -+ wait_on_buffer(bh); -+ -+ if (unlikely(!buffer_uptodate(bh))) -+ ret = -EIO; -+ put_bh(bh); /* One for getblk() */ -+ journal_put_journal_head(bh2jh(bh)); -+ -+ return ret; -+} -+ -+/* -+ * Wait for all submitted IO to complete. -+ */ -+static int journal_wait_on_locked_list(journal_t *journal, -+ transaction_t *commit_transaction) -+{ -+ int ret = 0; -+ struct journal_head *jh; - -- return (ret == -EIO); -+ while (commit_transaction->t_locked_list) { -+ struct buffer_head *bh; -+ -+ jh = commit_transaction->t_locked_list->b_tprev; -+ bh = jh2bh(jh); -+ get_bh(bh); -+ if (buffer_locked(bh)) { -+ spin_unlock(&journal->j_list_lock); -+ wait_on_buffer(bh); -+ if (unlikely(!buffer_uptodate(bh))) -+ ret = -EIO; -+ spin_lock(&journal->j_list_lock); -+ } -+ if (!inverted_lock(journal, bh)) { -+ put_bh(bh); -+ spin_lock(&journal->j_list_lock); -+ continue; -+ } -+ if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { -+ __journal_unfile_buffer(jh); -+ jbd_unlock_bh_state(bh); -+ journal_remove_journal_head(bh); -+ put_bh(bh); -+ } else { -+ jbd_unlock_bh_state(bh); -+ } -+ put_bh(bh); -+ cond_resched_lock(&journal->j_list_lock); -+ } -+ return ret; - } - - void journal_do_submit_data(struct buffer_head **wbuf, int bufs) -@@ -282,6 +358,20 @@ - return err; - } - -+static inline __u32 jbd_checksum_data(__u32 crc32_sum, struct buffer_head *bh) -+{ -+ struct page *page = bh->b_page; -+ char *addr; -+ __u32 checksum; -+ -+ addr = kmap_atomic(page, KM_USER0); -+ checksum = crc32_be(crc32_sum, -+ (void *)(addr + offset_in_page(bh->b_data)), -+ bh->b_size); -+ kunmap_atomic(addr, KM_USER0); -+ return checksum; -+} -+ - /* - * journal_commit_transaction - * -@@ -305,6 +395,8 @@ - int first_tag = 0; - int tag_flag; - int i; -+ struct buffer_head *cbh = NULL; /* For transactional checksums */ -+ __u32 crc32_sum = ~0; - - /* - * First job: lock down the current transaction and wait for -@@ -431,39 +523,14 @@ - err = journal_submit_data_buffers(journal, commit_transaction); - - /* -- * Wait for all previously submitted IO to complete. -+ * Wait for all previously submitted IO to complete if commit -+ * record is to be written synchronously. - */ - spin_lock(&journal->j_list_lock); -- while (commit_transaction->t_locked_list) { -- struct buffer_head *bh; -- -- jh = commit_transaction->t_locked_list->b_tprev; -- bh = jh2bh(jh); -- get_bh(bh); -- if (buffer_locked(bh)) { -- spin_unlock(&journal->j_list_lock); -- wait_on_buffer(bh); -- spin_lock(&journal->j_list_lock); -- } -- if (unlikely(!buffer_uptodate(bh))) -- err = -EIO; -- -- if (!inverted_lock(journal, bh)) { -- put_bh(bh); -- spin_lock(&journal->j_list_lock); -- continue; -- } -- if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { -- __journal_unfile_buffer(jh); -- jbd_unlock_bh_state(bh); -- journal_remove_journal_head(bh); -- put_bh(bh); -- } else { -- jbd_unlock_bh_state(bh); -- } -- release_data_buffer(bh); -- cond_resched_lock(&journal->j_list_lock); -- } -+ if (!JFS_HAS_INCOMPAT_FEATURE(journal, -+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) -+ err = journal_wait_on_locked_list(journal, -+ commit_transaction); - spin_unlock(&journal->j_list_lock); - - if (err) -@@ -642,6 +709,16 @@ - start_journal_io: - for (i = 0; i < bufs; i++) { - struct buffer_head *bh = wbuf[i]; -+ /* -+ * Compute checksum. -+ */ -+ if (JFS_HAS_COMPAT_FEATURE(journal, -+ JFS_FEATURE_COMPAT_CHECKSUM)) { -+ crc32_sum = -+ jbd_checksum_data(crc32_sum, -+ bh); -+ } -+ - lock_buffer(bh); - clear_buffer_dirty(bh); - set_buffer_uptodate(bh); -@@ -658,6 +735,23 @@ - } - } - -+ /* Done it all: now write the commit record asynchronously. */ -+ -+ if (JFS_HAS_INCOMPAT_FEATURE(journal, -+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) { -+ err = journal_submit_commit_record(journal, commit_transaction, -+ &cbh, crc32_sum); -+ if (err) -+ __journal_abort_hard(journal); -+ -+ spin_lock(&journal->j_list_lock); -+ err = journal_wait_on_locked_list(journal, -+ commit_transaction); -+ spin_unlock(&journal->j_list_lock); -+ if (err) -+ __journal_abort_hard(journal); -+ } -+ - /* Lo and behold: we have just managed to send a transaction to - the log. Before we can commit it, wait for the IO so far to - complete. Control buffers being written are on the -@@ -759,9 +853,15 @@ - journal_abort(journal, err); - - jbd_debug(3, "JBD: commit phase 6\n"); -- -- if (journal_write_commit_record(journal, commit_transaction)) -- err = -EIO; -+ -+ if (!JFS_HAS_INCOMPAT_FEATURE(journal, -+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) { -+ err = journal_submit_commit_record(journal, commit_transaction, -+ &cbh, crc32_sum); -+ if (err) -+ __journal_abort_hard(journal); -+ } -+ err = journal_wait_on_commit_record(cbh); - - if (err) - journal_abort(journal, err); -Index: linux-2.6.18-128.1.6/fs/jbd/recovery.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/jbd/recovery.c 2009-04-14 21:05:39.000000000 -0600 -+++ linux-2.6.18-128.1.6/fs/jbd/recovery.c 2009-06-02 23:26:07.000000000 -0600 -@@ -21,6 +21,7 @@ - #include - #include - #include -+#include - #endif - - /* -@@ -310,6 +311,38 @@ - return err; - } - -+/* -+ * calc_chksums calculates the checksums for the blocks described in the -+ * descriptor block. -+ */ -+static int calc_chksums(journal_t *journal, struct buffer_head *bh, -+ unsigned long *next_log_block, __u32 *crc32_sum) -+{ -+ int i, num_blks, err; -+ unsigned long io_block; -+ struct buffer_head *obh; -+ -+ num_blks = count_tags(bh, journal->j_blocksize); -+ /* Calculate checksum of the descriptor block. */ -+ *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size); -+ -+ for (i = 0; i < num_blks; i++) { -+ io_block = (*next_log_block)++; -+ wrap(journal, *next_log_block); -+ err = jread(&obh, journal, io_block); -+ if (err) { -+ printk(KERN_ERR "JBD: IO error %d recovering block " -+ "%lu in log\n", err, io_block); -+ return 1; -+ } else { -+ *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data, -+ obh->b_size); -+ } -+ put_bh(obh); -+ } -+ return 0; -+} -+ - static int do_one_pass(journal_t *journal, - struct recovery_info *info, enum passtype pass) - { -@@ -321,6 +354,7 @@ - struct buffer_head * bh; - unsigned int sequence; - int blocktype; -+ __u32 crc32_sum = ~0; /* Transactional Checksums */ - - /* Precompute the maximum metadata descriptors in a descriptor block */ - int MAX_BLOCKS_PER_DESC; -@@ -412,9 +446,24 @@ - switch(blocktype) { - case JFS_DESCRIPTOR_BLOCK: - /* If it is a valid descriptor block, replay it -- * in pass REPLAY; otherwise, just skip over the -- * blocks it describes. */ -+ * in pass REPLAY; if journal_checksums enabled, then -+ * calculate checksums in PASS_SCAN, otherwise, -+ * just skip over the blocks it describes. */ - if (pass != PASS_REPLAY) { -+ if (pass == PASS_SCAN && -+ JFS_HAS_COMPAT_FEATURE(journal, -+ JFS_FEATURE_COMPAT_CHECKSUM) && -+ !info->end_transaction) { -+ if (calc_chksums(journal, bh, -+ &next_log_block, -+ &crc32_sum)) { -+ put_bh(bh); -+ break; -+ } -+ put_bh(bh); -+ continue; -+ } -+ - next_log_block += - count_tags(bh, journal->j_blocksize); - wrap(journal, next_log_block); -@@ -509,9 +558,97 @@ - continue; - - case JFS_COMMIT_BLOCK: -- /* Found an expected commit block: not much to -- * do other than move on to the next sequence -+ /* How to differentiate between interrupted commit -+ * and journal corruption ? -+ * -+ * {nth transaction} -+ * Checksum Verification Failed -+ * | -+ * ____________________ -+ * | | -+ * async_commit sync_commit -+ * | | -+ * | GO TO NEXT "Journal Corruption" -+ * | TRANSACTION -+ * | -+ * {(n+1)th transanction} -+ * | -+ * _______|______________ -+ * | | -+ * Commit block found Commit block not found -+ * | | -+ * "Journal Corruption" | -+ * _____________|__________ -+ * | | -+ * nth trans corrupt OR nth trans -+ * and (n+1)th interrupted interrupted -+ * before commit block -+ * could reach the disk. -+ * (Cannot find the difference in above -+ * mentioned conditions. Hence assume -+ * "Interrupted Commit".) -+ */ -+ -+ /* Found an expected commit block: if checksums -+ * are present verify them in PASS_SCAN; else not -+ * much to do other than move on to the next sequence - * number. */ -+ if (pass == PASS_SCAN && -+ JFS_HAS_COMPAT_FEATURE(journal, -+ JFS_FEATURE_COMPAT_CHECKSUM)) { -+ int chksum_err, chksum_seen; -+ struct commit_header *cbh = -+ (struct commit_header *)bh->b_data; -+ unsigned found_chksum = -+ be32_to_cpu(cbh->h_chksum[0]); -+ -+ chksum_err = chksum_seen = 0; -+ -+ if (info->end_transaction) { -+ printk(KERN_ERR "JBD: Transaction %u " -+ "found to be corrupt.\n", -+ next_commit_ID - 1); -+ brelse(bh); -+ break; -+ } -+ -+ if (crc32_sum == found_chksum && -+ cbh->h_chksum_type == JFS_CRC32_CHKSUM && -+ cbh->h_chksum_size == -+ JFS_CRC32_CHKSUM_SIZE) { -+ chksum_seen = 1; -+ } else if (!(cbh->h_chksum_type == 0 && -+ cbh->h_chksum_size == 0 && -+ found_chksum == 0 && -+ !chksum_seen)) { -+ /* -+ * If fs is mounted using an old kernel and then -+ * kernel with journal_chksum is used then we -+ * get a situation where the journal flag has -+ * checksum flag set but checksums are not -+ * present i.e chksum = 0, in the individual -+ * commit blocks. -+ * Hence to avoid checksum failures, in this -+ * situation, this extra check is added. -+ */ -+ chksum_err = 1; -+ } -+ -+ if (chksum_err) { -+ info->end_transaction = next_commit_ID; -+ -+ if (!JFS_HAS_INCOMPAT_FEATURE(journal, -+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)){ -+ printk(KERN_ERR -+ "JBD: Transaction %u " -+ "found to be corrupt.\n", -+ next_commit_ID); -+ brelse(bh); -+ break; -+ } -+ } -+ crc32_sum = ~0; -+ } - brelse(bh); - next_commit_ID++; - continue; -@@ -547,9 +684,10 @@ - * transaction marks the end of the valid log. - */ - -- if (pass == PASS_SCAN) -- info->end_transaction = next_commit_ID; -- else { -+ if (pass == PASS_SCAN) { -+ if (!info->end_transaction) -+ info->end_transaction = next_commit_ID; -+ } else { - /* It's really bad news if different passes end up at - * different places (but possible due to IO errors). */ - if (info->end_transaction != next_commit_ID) { -Index: linux-2.6.18-128.1.6/fs/jbd/journal.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/jbd/journal.c 2009-06-02 23:24:00.000000000 -0600 -+++ linux-2.6.18-128.1.6/fs/jbd/journal.c 2009-06-02 23:26:07.000000000 -0600 -@@ -67,6 +67,7 @@ - EXPORT_SYMBOL(journal_check_used_features); - EXPORT_SYMBOL(journal_check_available_features); - EXPORT_SYMBOL(journal_set_features); -+EXPORT_SYMBOL(journal_clear_features); - EXPORT_SYMBOL(journal_create); - EXPORT_SYMBOL(journal_load); - EXPORT_SYMBOL(journal_destroy); -@@ -1583,6 +1584,33 @@ - return 1; - } - -+/** -+ * int journal_clear_features () - Clear a given journal feature in the superblock -+ * @journal: Journal to act on. -+ * @compat: bitmask of compatible features -+ * @ro: bitmask of features that force read-only mount -+ * @incompat: bitmask of incompatible features -+ * -+ * Clear a given journal feature as present on the -+ * superblock. Returns true if the requested features could be reset. -+ * -+ */ -+int journal_clear_features (journal_t *journal, unsigned long compat, -+ unsigned long ro, unsigned long incompat) -+{ -+ journal_superblock_t *sb; -+ -+ jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n", -+ compat, ro, incompat); -+ -+ sb = journal->j_superblock; -+ -+ sb->s_feature_compat &= ~cpu_to_be32(compat); -+ sb->s_feature_ro_compat &= ~cpu_to_be32(ro); -+ sb->s_feature_incompat &= ~cpu_to_be32(incompat); -+ -+ return 1; -+} - - /** - * int journal_update_format () - Update on-disk journal structure. -Index: linux-2.6.18-128.1.6/fs/Kconfig -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/Kconfig 2009-04-14 21:05:39.000000000 -0600 -+++ linux-2.6.18-128.1.6/fs/Kconfig 2009-06-02 23:26:07.000000000 -0600 -@@ -206,6 +206,7 @@ - - config JBD - tristate -+ select CRC32 - help - This is a generic journaling layer for block devices. It is - currently used by the ext3 and OCFS2 file systems, but it could -Index: linux-2.6.18-128.1.6/include/linux/jbd.h -=================================================================== ---- linux-2.6.18-128.1.6.orig/include/linux/jbd.h 2009-06-02 23:24:00.000000000 -0600 -+++ linux-2.6.18-128.1.6/include/linux/jbd.h 2009-06-02 23:26:07.000000000 -0600 -@@ -148,6 +148,29 @@ - __be32 h_sequence; - } journal_header_t; - -+/* -+ * Checksum types. -+ */ -+#define JFS_CRC32_CHKSUM 1 -+#define JFS_MD5_CHKSUM 2 -+#define JFS_SHA1_CHKSUM 3 -+ -+#define JFS_CRC32_CHKSUM_SIZE 4 -+ -+#define JFS_CHECKSUM_BYTES (32 / sizeof(u32)) -+/* -+ * Commit block header for storing transactional checksums: -+ */ -+struct commit_header -+{ -+ __be32 h_magic; -+ __be32 h_blocktype; -+ __be32 h_sequence; -+ unsigned char h_chksum_type; -+ unsigned char h_chksum_size; -+ unsigned char h_padding[2]; -+ __be32 h_chksum[JFS_CHECKSUM_BYTES]; -+}; - - /* - * The block tag: used to describe a single buffer in the journal -@@ -234,12 +257,16 @@ - ((j)->j_format_version >= 2 && \ - ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask)))) - --#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001 -+#define JFS_FEATURE_COMPAT_CHECKSUM 0x00000001 -+ -+#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001 -+#define JFS_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004 - - /* Features known to this kernel version: */ --#define JFS_KNOWN_COMPAT_FEATURES 0 -+#define JFS_KNOWN_COMPAT_FEATURES JFS_FEATURE_COMPAT_CHECKSUM - #define JFS_KNOWN_ROCOMPAT_FEATURES 0 --#define JFS_KNOWN_INCOMPAT_FEATURES JFS_FEATURE_INCOMPAT_REVOKE -+#define JFS_KNOWN_INCOMPAT_FEATURES (JFS_FEATURE_INCOMPAT_REVOKE | \ -+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT) - - #ifdef __KERNEL__ - -@@ -1053,6 +1080,8 @@ - (journal_t *, unsigned long, unsigned long, unsigned long); - extern int journal_set_features - (journal_t *, unsigned long, unsigned long, unsigned long); -+extern int journal_clear_features -+ (journal_t *, unsigned long, unsigned long, unsigned long); - extern int journal_create (journal_t *); - extern int journal_load (journal_t *journal); - #ifndef __GENKSYMS__ -Index: linux-2.6.18-128.1.6/Documentation/filesystems/ext3.txt -=================================================================== ---- linux-2.6.18-128.1.6.orig/Documentation/filesystems/ext3.txt 2006-09-19 21:42:06.000000000 -0600 -+++ linux-2.6.18-128.1.6/Documentation/filesystems/ext3.txt 2009-06-02 23:26:07.000000000 -0600 -@@ -14,6 +14,16 @@ - When mounting an ext3 filesystem, the following option are accepted: - (*) == default - -+journal_checksum Enable checksumming of the journal transactions. -+ This will allow the recovery code in e2fsck and the -+ kernel to detect corruption in the kernel. It is a -+ compatible change and will be ignored by older kernels. -+ -+journal_async_commit Commit block can be written to disk without waiting -+ for descriptor blocks. If enabled older kernels cannot -+ mount the device. This will enable 'journal_checksum' -+ internally. -+ - journal=update Update the ext3 file system's journal to the current - format. - diff --git a/lustre/kernel_patches/patches/jbd-stats-2.6-rhel5.patch b/lustre/kernel_patches/patches/jbd-stats-2.6-rhel5.patch deleted file mode 100644 index 67832a6..0000000 --- a/lustre/kernel_patches/patches/jbd-stats-2.6-rhel5.patch +++ /dev/null @@ -1,743 +0,0 @@ -Index: linux-2.6.18-128.1.6/include/linux/jbd.h -=================================================================== ---- linux-2.6.18-128.1.6.orig/include/linux/jbd.h 2009-06-02 23:22:50.000000000 -0600 -+++ linux-2.6.18-128.1.6/include/linux/jbd.h 2009-06-02 23:24:00.000000000 -0600 -@@ -428,6 +428,16 @@ - }; - - -+/* -+ * Some stats for checkpoint phase -+ */ -+struct transaction_chp_stats_s { -+ unsigned long cs_chp_time; -+ unsigned long cs_forced_to_close; -+ unsigned long cs_written; -+ unsigned long cs_dropped; -+}; -+ - /* The transaction_t type is the guts of the journaling mechanism. It - * tracks a compound transaction through its various states: - * -@@ -565,6 +575,21 @@ - spinlock_t t_handle_lock; - - /* -+ * Longest time some handle had to wait for running transaction -+ */ -+ unsigned long t_max_wait; -+ -+ /* -+ * When transaction started -+ */ -+ unsigned long t_start; -+ -+ /* -+ * Checkpointing stats [j_checkpoint_sem] -+ */ -+ struct transaction_chp_stats_s t_chp_stats; -+ -+ /* - * Number of outstanding updates running on this transaction - * [t_handle_lock] - */ -@@ -604,6 +629,57 @@ - struct list_head t_jcb; - }; - -+struct transaction_run_stats_s { -+ unsigned long rs_wait; -+ unsigned long rs_running; -+ unsigned long rs_locked; -+ unsigned long rs_flushing; -+ unsigned long rs_logging; -+ -+ unsigned long rs_handle_count; -+ unsigned long rs_blocks; -+ unsigned long rs_blocks_logged; -+}; -+ -+struct transaction_stats_s -+{ -+ int ts_type; -+ unsigned long ts_tid; -+ union { -+ struct transaction_run_stats_s run; -+ struct transaction_chp_stats_s chp; -+ } u; -+}; -+ -+#define JBD_STATS_RUN 1 -+#define JBD_STATS_CHECKPOINT 2 -+ -+#define ts_wait u.run.rs_wait -+#define ts_running u.run.rs_running -+#define ts_locked u.run.rs_locked -+#define ts_flushing u.run.rs_flushing -+#define ts_logging u.run.rs_logging -+#define ts_handle_count u.run.rs_handle_count -+#define ts_blocks u.run.rs_blocks -+#define ts_blocks_logged u.run.rs_blocks_logged -+ -+#define ts_chp_time u.chp.cs_chp_time -+#define ts_forced_to_close u.chp.cs_forced_to_close -+#define ts_written u.chp.cs_written -+#define ts_dropped u.chp.cs_dropped -+ -+#define CURRENT_MSECS (jiffies_to_msecs(jiffies)) -+ -+static inline unsigned int -+jbd_time_diff(unsigned int start, unsigned int end) -+{ -+ if (unlikely(start > end)) -+ end = end + (~0UL - start); -+ else -+ end -= start; -+ return end; -+} -+ - /** - * struct journal_s - The journal_s type is the concrete type associated with - * journal_t. -@@ -857,6 +933,16 @@ - pid_t j_last_sync_writer; - - /* -+ * -+ */ -+ struct transaction_stats_s *j_history; -+ int j_history_max; -+ int j_history_cur; -+ spinlock_t j_history_lock; -+ struct proc_dir_entry *j_proc_entry; -+ struct transaction_stats_s j_stats; -+ -+ /* - * An opaque pointer to fs-private information. ext3 puts its - * superblock pointer here - */ -Index: linux-2.6.18-128.1.6/fs/jbd/transaction.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/jbd/transaction.c 2009-06-02 23:22:50.000000000 -0600 -+++ linux-2.6.18-128.1.6/fs/jbd/transaction.c 2009-06-02 23:24:00.000000000 -0600 -@@ -60,6 +60,8 @@ - - J_ASSERT(journal->j_running_transaction == NULL); - journal->j_running_transaction = transaction; -+ transaction->t_max_wait = 0; -+ transaction->t_start = CURRENT_MSECS; - - return transaction; - } -@@ -86,6 +88,7 @@ - int nblocks = handle->h_buffer_credits; - transaction_t *new_transaction = NULL; - int ret = 0; -+ unsigned long ts = CURRENT_MSECS; - - if (nblocks > journal->j_max_transaction_buffers) { - printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", -@@ -219,6 +222,12 @@ - /* OK, account for the buffers that this operation expects to - * use and add the handle to the running transaction. */ - -+ if (time_after(transaction->t_start, ts)) { -+ ts = jbd_time_diff(ts, transaction->t_start); -+ if (ts > transaction->t_max_wait) -+ transaction->t_max_wait= ts; -+ } -+ - handle->h_transaction = transaction; - transaction->t_outstanding_credits += nblocks; - transaction->t_updates++; -Index: linux-2.6.18-128.1.6/fs/jbd/journal.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/jbd/journal.c 2009-06-02 23:23:03.000000000 -0600 -+++ linux-2.6.18-128.1.6/fs/jbd/journal.c 2009-06-02 23:24:00.000000000 -0600 -@@ -36,6 +36,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -638,6 +639,300 @@ - return journal_add_journal_head(bh); - } - -+struct jbd_stats_proc_session { -+ journal_t *journal; -+ struct transaction_stats_s *stats; -+ int start; -+ int max; -+}; -+ -+static void *jbd_history_skip_empty(struct jbd_stats_proc_session *s, -+ struct transaction_stats_s *ts, -+ int first) -+{ -+ if (ts == s->stats + s->max) -+ ts = s->stats; -+ if (!first && ts == s->stats + s->start) -+ return NULL; -+ while (ts->ts_type == 0) { -+ ts++; -+ if (ts == s->stats + s->max) -+ ts = s->stats; -+ if (ts == s->stats + s->start) -+ return NULL; -+ } -+ return ts; -+ -+} -+ -+static void *jbd_seq_history_start(struct seq_file *seq, loff_t *pos) -+{ -+ struct jbd_stats_proc_session *s = seq->private; -+ struct transaction_stats_s *ts; -+ int l = *pos; -+ -+ if (l == 0) -+ return SEQ_START_TOKEN; -+ ts = jbd_history_skip_empty(s, s->stats + s->start, 1); -+ if (!ts) -+ return NULL; -+ while (--l && (ts = jbd_history_skip_empty(s, ++ts, 0)) != NULL); -+ return ts; -+} -+ -+static void *jbd_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) -+{ -+ struct jbd_stats_proc_session *s = seq->private; -+ struct transaction_stats_s *ts = v; -+ -+ ++*pos; -+ if (v == SEQ_START_TOKEN) -+ return jbd_history_skip_empty(s, s->stats + s->start, 1); -+ else -+ return jbd_history_skip_empty(s, ++ts, 0); -+} -+ -+static int jbd_seq_history_show(struct seq_file *seq, void *v) -+{ -+ struct transaction_stats_s *ts = v; -+ if (v == SEQ_START_TOKEN) { -+ seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s " -+ "%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid", -+ "wait", "run", "lock", "flush", "log", "hndls", -+ "block", "inlog", "ctime", "write", "drop", -+ "close"); -+ return 0; -+ } -+ if (ts->ts_type == JBD_STATS_RUN) -+ seq_printf(seq, "%-4s %-5lu %-5lu %-5lu %-5lu %-5lu %-5lu " -+ "%-6lu %-5lu %-5lu\n", "R", ts->ts_tid, -+ ts->ts_wait, ts->ts_running, ts->ts_locked, -+ ts->ts_flushing, ts->ts_logging, -+ ts->ts_handle_count, ts->ts_blocks, -+ ts->ts_blocks_logged); -+ else if (ts->ts_type == JBD_STATS_CHECKPOINT) -+ seq_printf(seq, "%-4s %-5lu %48s %-5lu %-5lu %-5lu %-5lu\n", -+ "C", ts->ts_tid, " ", ts->ts_chp_time, -+ ts->ts_written, ts->ts_dropped, -+ ts->ts_forced_to_close); -+ else -+ J_ASSERT(0); -+ return 0; -+} -+ -+static void jbd_seq_history_stop(struct seq_file *seq, void *v) -+{ -+} -+ -+static struct seq_operations jbd_seq_history_ops = { -+ .start = jbd_seq_history_start, -+ .next = jbd_seq_history_next, -+ .stop = jbd_seq_history_stop, -+ .show = jbd_seq_history_show, -+}; -+ -+static int jbd_seq_history_open(struct inode *inode, struct file *file) -+{ -+ journal_t *journal = PDE(inode)->data; -+ struct jbd_stats_proc_session *s; -+ int rc, size; -+ -+ s = kmalloc(sizeof(*s), GFP_KERNEL); -+ if (s == NULL) -+ return -EIO; -+ size = sizeof(struct transaction_stats_s) * journal->j_history_max; -+ s->stats = kmalloc(size, GFP_KERNEL); -+ if (s->stats == NULL) { -+ kfree(s); -+ return -EIO; -+ } -+ spin_lock(&journal->j_history_lock); -+ memcpy(s->stats, journal->j_history, size); -+ s->max = journal->j_history_max; -+ s->start = journal->j_history_cur % s->max; -+ spin_unlock(&journal->j_history_lock); -+ -+ rc = seq_open(file, &jbd_seq_history_ops); -+ if (rc == 0) { -+ struct seq_file *m = (struct seq_file *)file->private_data; -+ m->private = s; -+ } else { -+ kfree(s->stats); -+ kfree(s); -+ } -+ return rc; -+ -+} -+ -+static int jbd_seq_history_release(struct inode *inode, struct file *file) -+{ -+ struct seq_file *seq = (struct seq_file *)file->private_data; -+ struct jbd_stats_proc_session *s = seq->private; -+ kfree(s->stats); -+ kfree(s); -+ return seq_release(inode, file); -+} -+ -+static struct file_operations jbd_seq_history_fops = { -+ .owner = THIS_MODULE, -+ .open = jbd_seq_history_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = jbd_seq_history_release, -+}; -+ -+static void *jbd_seq_info_start(struct seq_file *seq, loff_t *pos) -+{ -+ return *pos ? NULL : SEQ_START_TOKEN; -+} -+ -+static void *jbd_seq_info_next(struct seq_file *seq, void *v, loff_t *pos) -+{ -+ return NULL; -+} -+ -+static int jbd_seq_info_show(struct seq_file *seq, void *v) -+{ -+ struct jbd_stats_proc_session *s = seq->private; -+ if (v != SEQ_START_TOKEN) -+ return 0; -+ seq_printf(seq, "%lu transaction, each upto %u blocks\n", -+ s->stats->ts_tid, -+ s->journal->j_max_transaction_buffers); -+ if (s->stats->ts_tid == 0) -+ return 0; -+ seq_printf(seq, "average: \n %lums waiting for transaction\n", -+ s->stats->ts_wait / s->stats->ts_tid); -+ seq_printf(seq, " %lums running transaction\n", -+ s->stats->ts_running / s->stats->ts_tid); -+ seq_printf(seq, " %lums transaction was being locked\n", -+ s->stats->ts_locked / s->stats->ts_tid); -+ seq_printf(seq, " %lums flushing data (in ordered mode)\n", -+ s->stats->ts_flushing / s->stats->ts_tid); -+ seq_printf(seq, " %lums logging transaction\n", -+ s->stats->ts_logging / s->stats->ts_tid); -+ seq_printf(seq, " %lu handles per transaction\n", -+ s->stats->ts_handle_count / s->stats->ts_tid); -+ seq_printf(seq, " %lu blocks per transaction\n", -+ s->stats->ts_blocks / s->stats->ts_tid); -+ seq_printf(seq, " %lu logged blocks per transaction\n", -+ s->stats->ts_blocks_logged / s->stats->ts_tid); -+ return 0; -+} -+ -+static void jbd_seq_info_stop(struct seq_file *seq, void *v) -+{ -+} -+ -+static struct seq_operations jbd_seq_info_ops = { -+ .start = jbd_seq_info_start, -+ .next = jbd_seq_info_next, -+ .stop = jbd_seq_info_stop, -+ .show = jbd_seq_info_show, -+}; -+ -+static int jbd_seq_info_open(struct inode *inode, struct file *file) -+{ -+ journal_t *journal = PDE(inode)->data; -+ struct jbd_stats_proc_session *s; -+ int rc, size; -+ -+ s = kmalloc(sizeof(*s), GFP_KERNEL); -+ if (s == NULL) -+ return -EIO; -+ size = sizeof(struct transaction_stats_s); -+ s->stats = kmalloc(size, GFP_KERNEL); -+ if (s->stats == NULL) { -+ kfree(s); -+ return -EIO; -+ } -+ spin_lock(&journal->j_history_lock); -+ memcpy(s->stats, &journal->j_stats, size); -+ s->journal = journal; -+ spin_unlock(&journal->j_history_lock); -+ -+ rc = seq_open(file, &jbd_seq_info_ops); -+ if (rc == 0) { -+ struct seq_file *m = (struct seq_file *)file->private_data; -+ m->private = s; -+ } else { -+ kfree(s->stats); -+ kfree(s); -+ } -+ return rc; -+ -+} -+ -+static int jbd_seq_info_release(struct inode *inode, struct file *file) -+{ -+ struct seq_file *seq = (struct seq_file *)file->private_data; -+ struct jbd_stats_proc_session *s = seq->private; -+ kfree(s->stats); -+ kfree(s); -+ return seq_release(inode, file); -+} -+ -+static struct file_operations jbd_seq_info_fops = { -+ .owner = THIS_MODULE, -+ .open = jbd_seq_info_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = jbd_seq_info_release, -+}; -+ -+static struct proc_dir_entry *proc_jbd_stats = NULL; -+ -+static void jbd_stats_proc_init(journal_t *journal) -+{ -+ char name[64]; -+ -+ snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name)); -+ journal->j_proc_entry = proc_mkdir(name, proc_jbd_stats); -+ if (journal->j_proc_entry) { -+ struct proc_dir_entry *p; -+ p = create_proc_entry("history", S_IRUGO, -+ journal->j_proc_entry); -+ if (p) { -+ p->proc_fops = &jbd_seq_history_fops; -+ p->data = journal; -+ p = create_proc_entry("info", S_IRUGO, -+ journal->j_proc_entry); -+ if (p) { -+ p->proc_fops = &jbd_seq_info_fops; -+ p->data = journal; -+ } -+ } -+ } -+} -+ -+static void jbd_stats_proc_exit(journal_t *journal) -+{ -+ char name[64]; -+ -+ snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name)); -+ remove_proc_entry("info", journal->j_proc_entry); -+ remove_proc_entry("history", journal->j_proc_entry); -+ remove_proc_entry(name, proc_jbd_stats); -+} -+ -+static void journal_init_stats(journal_t *journal) -+{ -+ int size; -+ -+ if (proc_jbd_stats == NULL) -+ return; -+ -+ journal->j_history_max = 100; -+ size = sizeof(struct transaction_stats_s) * journal->j_history_max; -+ journal->j_history = kmalloc(size, GFP_KERNEL); -+ if (journal->j_history == NULL) { -+ journal->j_history_max = 0; -+ return; -+ } -+ memset(journal->j_history, 0, size); -+ spin_lock_init(&journal->j_history_lock); -+} -+ - /* - * Management for journal control blocks: functions to create and - * destroy journal_t structures, and to initialise and read existing -@@ -680,6 +975,9 @@ - kfree(journal); - goto fail; - } -+ -+ journal_init_stats(journal); -+ - return journal; - fail: - return NULL; -@@ -723,6 +1021,7 @@ - journal->j_blk_offset = start; - journal->j_maxlen = len; - journal->j_blocksize = blocksize; -+ jbd_stats_proc_init(journal); - - bh = __getblk(journal->j_dev, start, journal->j_blocksize); - J_ASSERT(bh != NULL); -@@ -772,6 +1071,7 @@ - - journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; - journal->j_blocksize = inode->i_sb->s_blocksize; -+ jbd_stats_proc_init(journal); - - /* journal descriptor can store up to n blocks -bzzz */ - n = journal->j_blocksize / sizeof(journal_block_tag_t); -@@ -1168,6 +1468,8 @@ - brelse(journal->j_sb_buffer); - } - -+ if (journal->j_proc_entry) -+ jbd_stats_proc_exit(journal); - if (journal->j_inode) - iput(journal->j_inode); - if (journal->j_revoke) -@@ -2015,6 +2317,28 @@ - - #endif - -+#if defined(CONFIG_PROC_FS) -+ -+#define JBD_STATS_PROC_NAME "fs/jbd" -+ -+static void __init create_jbd_stats_proc_entry(void) -+{ -+ proc_jbd_stats = proc_mkdir(JBD_STATS_PROC_NAME, NULL); -+} -+ -+static void __exit remove_jbd_stats_proc_entry(void) -+{ -+ if (proc_jbd_stats) -+ remove_proc_entry(JBD_STATS_PROC_NAME, NULL); -+} -+ -+#else -+ -+#define create_jbd_stats_proc_entry() do {} while (0) -+#define remove_jbd_stats_proc_entry() do {} while (0) -+ -+#endif -+ - kmem_cache_t *jbd_handle_cache; - - static int __init journal_init_handle_cache(void) -@@ -2078,6 +2402,7 @@ - if (ret != 0) - journal_destroy_caches(); - create_jbd_proc_entry(); -+ create_jbd_stats_proc_entry(); - return ret; - } - -@@ -2089,6 +2414,7 @@ - printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); - #endif - remove_jbd_proc_entry(); -+ remove_jbd_stats_proc_entry(); - journal_destroy_caches(); - } - -Index: linux-2.6.18-128.1.6/fs/jbd/checkpoint.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/jbd/checkpoint.c 2009-06-02 23:22:50.000000000 -0600 -+++ linux-2.6.18-128.1.6/fs/jbd/checkpoint.c 2009-06-02 23:24:00.000000000 -0600 -@@ -242,7 +242,7 @@ - * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it - */ - static int __process_buffer(journal_t *journal, struct journal_head *jh, -- struct buffer_head **bhs, int *batch_count) -+ struct buffer_head **bhs, int *batch_count, transaction_t *transaction) - { - struct buffer_head *bh = jh2bh(jh); - int ret = 0; -@@ -260,6 +260,7 @@ - transaction_t *t = jh->b_transaction; - tid_t tid = t->t_tid; - -+ transaction->t_chp_stats.cs_forced_to_close++; - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - log_start_commit(journal, tid); -@@ -291,6 +292,7 @@ - bhs[*batch_count] = bh; - __buffer_relink_io(jh); - jbd_unlock_bh_state(bh); -+ transaction->t_chp_stats.cs_written++; - (*batch_count)++; - if (*batch_count == NR_BATCH) { - spin_unlock(&journal->j_list_lock); -@@ -336,6 +338,8 @@ - if (!journal->j_checkpoint_transactions) - goto out; - transaction = journal->j_checkpoint_transactions; -+ if (transaction->t_chp_stats.cs_chp_time == 0) -+ transaction->t_chp_stats.cs_chp_time = CURRENT_MSECS; - this_tid = transaction->t_tid; - restart: - /* -@@ -360,7 +364,8 @@ - retry = 1; - break; - } -- retry = __process_buffer(journal, jh, bhs,&batch_count); -+ retry = __process_buffer(journal, jh, bhs,&batch_count, -+ transaction); - if (retry < 0 && !result) - result = retry; - if (!retry && lock_need_resched(&journal->j_list_lock)){ -@@ -692,6 +697,8 @@ - - void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) - { -+ struct transaction_stats_s stats; -+ - assert_spin_locked(&journal->j_list_lock); - if (transaction->t_cpnext) { - transaction->t_cpnext->t_cpprev = transaction->t_cpprev; -@@ -718,5 +725,25 @@ - J_ASSERT(journal->j_running_transaction != transaction); - - jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); -+ -+ /* -+ * File the transaction for history -+ */ -+ if (transaction->t_chp_stats.cs_written != 0 || -+ transaction->t_chp_stats.cs_chp_time != 0) { -+ stats.ts_type = JBD_STATS_CHECKPOINT; -+ stats.ts_tid = transaction->t_tid; -+ stats.u.chp = transaction->t_chp_stats; -+ if (stats.ts_chp_time) -+ stats.ts_chp_time = -+ jbd_time_diff(stats.ts_chp_time, CURRENT_MSECS); -+ spin_lock(&journal->j_history_lock); -+ memcpy(journal->j_history + journal->j_history_cur, &stats, -+ sizeof(stats)); -+ if (++journal->j_history_cur == journal->j_history_max) -+ journal->j_history_cur = 0; -+ spin_unlock(&journal->j_history_lock); -+ } -+ - kfree(transaction); - } -Index: linux-2.6.18-128.1.6/fs/jbd/commit.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/jbd/commit.c 2009-06-02 23:22:50.000000000 -0600 -+++ linux-2.6.18-128.1.6/fs/jbd/commit.c 2009-06-02 23:24:00.000000000 -0600 -@@ -13,6 +13,7 @@ - * part of the ext2fs journaling system. - */ - -+#include - #include - #include - #include -@@ -22,6 +23,7 @@ - #include - #include - -+ - /* - * Default IO end handler for temporary BJ_IO buffer_heads. - */ -@@ -288,6 +290,7 @@ - */ - void journal_commit_transaction(journal_t *journal) - { -+ struct transaction_stats_s stats; - transaction_t *commit_transaction; - struct journal_head *jh, *new_jh, *descriptor; - struct buffer_head **wbuf = journal->j_wbuf; -@@ -334,6 +337,11 @@ - spin_lock(&journal->j_state_lock); - commit_transaction->t_state = T_LOCKED; - -+ stats.ts_wait = commit_transaction->t_max_wait; -+ stats.ts_locked = CURRENT_MSECS; -+ stats.ts_running = jbd_time_diff(commit_transaction->t_start, -+ stats.ts_locked); -+ - spin_lock(&commit_transaction->t_handle_lock); - while (commit_transaction->t_updates) { - DEFINE_WAIT(wait); -@@ -404,6 +412,9 @@ - */ - journal_switch_revoke_table(journal); - -+ stats.ts_flushing = CURRENT_MSECS; -+ stats.ts_locked = jbd_time_diff(stats.ts_locked, stats.ts_flushing); -+ - commit_transaction->t_state = T_FLUSH; - journal->j_committing_transaction = commit_transaction; - journal->j_running_transaction = NULL; -@@ -484,6 +495,11 @@ - J_ASSERT(commit_transaction->t_nr_buffers <= - commit_transaction->t_outstanding_credits); - -+ stats.ts_logging = CURRENT_MSECS; -+ stats.ts_flushing = jbd_time_diff(stats.ts_flushing, stats.ts_logging); -+ stats.ts_blocks = commit_transaction->t_outstanding_credits; -+ stats.ts_blocks_logged = 0; -+ - descriptor = NULL; - bufs = 0; - while (commit_transaction->t_buffers) { -@@ -633,6 +649,7 @@ - submit_bh(WRITE, bh); - } - cond_resched(); -+ stats.ts_blocks_logged += bufs; - - /* Force a new descriptor to be generated next - time round the loop. */ -@@ -832,6 +849,7 @@ - cp_transaction = jh->b_cp_transaction; - if (cp_transaction) { - JBUFFER_TRACE(jh, "remove from old cp transaction"); -+ cp_transaction->t_chp_stats.cs_dropped++; - __journal_remove_checkpoint(jh); - } - -@@ -908,6 +926,36 @@ - - J_ASSERT(commit_transaction->t_state == T_COMMIT); - -+ commit_transaction->t_start = CURRENT_MSECS; -+ stats.ts_logging = jbd_time_diff(stats.ts_logging, -+ commit_transaction->t_start); -+ -+ /* -+ * File the transaction for history -+ */ -+ stats.ts_type = JBD_STATS_RUN; -+ stats.ts_tid = commit_transaction->t_tid; -+ stats.ts_handle_count = commit_transaction->t_handle_count; -+ spin_lock(&journal->j_history_lock); -+ memcpy(journal->j_history + journal->j_history_cur, &stats, -+ sizeof(stats)); -+ if (++journal->j_history_cur == journal->j_history_max) -+ journal->j_history_cur = 0; -+ -+ /* -+ * Calculate overall stats -+ */ -+ journal->j_stats.ts_tid++; -+ journal->j_stats.ts_wait += stats.ts_wait; -+ journal->j_stats.ts_running += stats.ts_running; -+ journal->j_stats.ts_locked += stats.ts_locked; -+ journal->j_stats.ts_flushing += stats.ts_flushing; -+ journal->j_stats.ts_logging += stats.ts_logging; -+ journal->j_stats.ts_handle_count += stats.ts_handle_count; -+ journal->j_stats.ts_blocks += stats.ts_blocks; -+ journal->j_stats.ts_blocks_logged += stats.ts_blocks_logged; -+ spin_unlock(&journal->j_history_lock); -+ - commit_transaction->t_state = T_FINISHED; - J_ASSERT(commit_transaction == journal->j_committing_transaction); - journal->j_commit_sequence = commit_transaction->t_tid; diff --git a/lustre/kernel_patches/patches/jbd2-jcberr-2.6-rhel5.patch b/lustre/kernel_patches/patches/jbd2-jcberr-2.6-rhel5.patch deleted file mode 100644 index 4a57ec3..0000000 --- a/lustre/kernel_patches/patches/jbd2-jcberr-2.6-rhel5.patch +++ /dev/null @@ -1,224 +0,0 @@ -This patch is no longer needed for Lustre, since Lustre 2.2. It is kept -in the kernel patch series for compatibility with older Lustre releases -to simplify the upgrade process so that both the kernel and Lustre do -not need to be upgraded at the same time. See Jira issue LU-433. - -Index: linux-2.6.18-128.1.6/include/linux/jbd2.h -=================================================================== ---- linux-2.6.18-128.1.6.orig/include/linux/jbd2.h 2009-04-15 08:35:28.000000000 +0530 -+++ linux-2.6.18-128.1.6/include/linux/jbd2.h 2009-05-28 15:10:18.000000000 +0530 -@@ -381,6 +381,27 @@ - bit_spin_unlock(BH_JournalHead, &bh->b_state); - } - -+#define HAVE_JOURNAL_CALLBACK_STATUS -+/** -+ * struct journal_callback - Base structure for callback information. -+ * @jcb_list: list information for other callbacks attached to the same handle. -+ * @jcb_func: Function to call with this callback structure. -+ * -+ * This struct is a 'seed' structure for a using with your own callback -+ * structs. If you are using callbacks you must allocate one of these -+ * or another struct of your own definition which has this struct -+ * as it's first element and pass it to journal_callback_set(). -+ * -+ * This is used internally by jbd2 to maintain callback information. -+ * -+ * See journal_callback_set for more information. -+ **/ -+struct journal_callback { -+ struct list_head jcb_list; /* t_jcb_lock */ -+ void (*jcb_func)(struct journal_callback *jcb, int error); -+ /* user data goes here */ -+}; -+ - struct jbd2_revoke_table_s; - - /** -@@ -389,6 +410,7 @@ - * @h_transaction: Which compound transaction is this update a part of? - * @h_buffer_credits: Number of remaining buffers we are allowed to dirty. - * @h_ref: Reference count on this handle -+ * @h_jcb: List of application registered callbacks for this handle. - * @h_err: Field for caller's use to track errors through large fs operations - * @h_sync: flag for sync-on-close - * @h_jdata: flag to force data journaling -@@ -414,6 +436,13 @@ - /* operations */ - int h_err; - -+ /* -+ * List of application registered callbacks for this handle. The -+ * function(s) will be called after the transaction that this handle is -+ * part of has been committed to disk. [t_jcb_lock] -+ */ -+ struct list_head h_jcb; -+ - /* Flags [no locking] */ - unsigned int h_sync: 1; /* sync-on-close */ - unsigned int h_jdata: 1; /* force data journaling */ -@@ -469,6 +498,8 @@ - * j_state_lock - * ->j_list_lock (journal_unmap_buffer) - * -+ * t_handle_lock -+ * ->t_jcb_lock - */ - - struct transaction_s -@@ -615,6 +646,15 @@ - */ - int t_handle_count; - -+ /* -+ * Protects the callback list -+ */ -+ spinlock_t t_jcb_lock; -+ /* -+ * List of registered callback functions for this transaction. -+ * Called when the transaction is committed. [t_jcb_lock] -+ */ -+ struct list_head t_jcb; - /* - * For use by the filesystem to store fs-specific data - * structures associated with the transaction -@@ -1018,6 +1058,9 @@ - extern int jbd2_journal_flush (journal_t *); - extern void jbd2_journal_lock_updates (journal_t *); - extern void jbd2_journal_unlock_updates (journal_t *); -+extern void jbd2_journal_callback_set(handle_t *handle, -+ void (*fn)(struct journal_callback *,int), -+ struct journal_callback *jcb); - - extern journal_t * jbd2_journal_init_dev(struct block_device *bdev, - struct block_device *fs_dev, -Index: linux-2.6.18-128.1.6/fs/jbd2/checkpoint.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/jbd2/checkpoint.c 2009-04-15 08:35:28.000000000 +0530 -+++ linux-2.6.18-128.1.6/fs/jbd2/checkpoint.c 2009-05-28 15:10:18.000000000 +0530 -@@ -695,6 +695,7 @@ - J_ASSERT(transaction->t_checkpoint_list == NULL); - J_ASSERT(transaction->t_checkpoint_io_list == NULL); - J_ASSERT(transaction->t_updates == 0); -+ J_ASSERT(list_empty(&transaction->t_jcb)); - J_ASSERT(journal->j_committing_transaction != transaction); - J_ASSERT(journal->j_running_transaction != transaction); - -Index: linux-2.6.18-128.1.6/fs/jbd2/commit.c -=================================================================== ---- linux-2.6.18-164.6.1/fs/jbd2/commit.c 2010-01-21 11:24:52.000000000 +0530 -+++ linux-2.6.18-164.6.1_new/fs/jbd2/commit.c 2010-01-21 11:26:36.000000000 +0530 -@@ -832,6 +832,29 @@ wait_for_iobuf: - processing: any buffers committed as a result of this - transaction can be removed from any checkpoint list it was on - before. */ -+ /* -+ * Call any callbacks that had been registered for handles in this -+ * transaction. It is up to the callback to free any allocated -+ * memory. -+ * -+ * The spinlocking (t_jcb_lock) here is surely unnecessary... -+ */ -+ spin_lock(&commit_transaction->t_jcb_lock); -+ if (!list_empty(&commit_transaction->t_jcb)) { -+ struct list_head *p, *n; -+ int error = is_journal_aborted(journal); -+ -+ list_for_each_safe(p, n, &commit_transaction->t_jcb) { -+ struct journal_callback *jcb; -+ -+ jcb = list_entry(p, struct journal_callback, jcb_list); -+ list_del(p); -+ spin_unlock(&commit_transaction->t_jcb_lock); -+ jcb->jcb_func(jcb, error); -+ spin_lock(&commit_transaction->t_jcb_lock); -+ } -+ } -+ spin_unlock(&commit_transaction->t_jcb_lock); - - jbd_debug(3, "JBD: commit phase 6\n"); - -Index: linux-2.6.18-128.1.6/fs/jbd2/journal.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/jbd2/journal.c 2009-04-15 08:35:28.000000000 +0530 -+++ linux-2.6.18-128.1.6/fs/jbd2/journal.c 2009-05-28 17:13:35.000000000 +0530 -@@ -80,6 +80,8 @@ - EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); - EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); - EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); -+EXPORT_SYMBOL(jbd2_journal_callback_set); -+EXPORT_SYMBOL(jbd2_journal_bmap); - - static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); - static void __journal_abort_soft (journal_t *journal, int errno); -Index: linux-2.6.18-128.1.6/fs/jbd2/transaction.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/jbd2/transaction.c 2009-04-15 08:35:28.000000000 +0530 -+++ linux-2.6.18-128.1.6/fs/jbd2/transaction.c 2009-05-28 15:11:28.000000000 +0530 -@@ -51,6 +51,9 @@ - spin_lock_init(&transaction->t_handle_lock); - INIT_LIST_HEAD(&transaction->t_inode_list); - INIT_LIST_HEAD(&transaction->t_private_list); -+ INIT_LIST_HEAD(&transaction->t_jcb); -+ spin_lock_init(&transaction->t_jcb_lock); -+ - - /* Set up the commit timer for the new transaction. */ - journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); -@@ -251,6 +254,7 @@ - memset(handle, 0, sizeof(*handle)); - handle->h_buffer_credits = nblocks; - handle->h_ref = 1; -+ INIT_LIST_HEAD(&handle->h_jcb); - - lockdep_init_map(&handle->h_lockdep_map, "jbd2_handle", - &jbd2_handle_key, 0); -@@ -1349,6 +1353,36 @@ - } - - /** -+ * void jbd2_journal_callback_set() - Register a callback function for this handle. -+ * @handle: handle to attach the callback to. -+ * @func: function to callback. -+ * @jcb: structure with additional information required by func() , and -+ * some space for jbd2 internal information. -+ * -+ * The function will be -+ * called when the transaction that this handle is part of has been -+ * committed to disk with the original callback data struct and the -+ * error status of the journal as parameters. There is no guarantee of -+ * ordering between handles within a single transaction, nor between -+ * callbacks registered on the same handle. -+ * -+ * The caller is responsible for allocating the journal_callback struct. -+ * This is to allow the caller to add as much extra data to the callback -+ * as needed, but reduce the overhead of multiple allocations. The caller -+ * allocated struct must start with a struct journal_callback at offset 0, -+ * and has the caller-specific data afterwards. -+ */ -+void jbd2_journal_callback_set(handle_t *handle, -+ void (*func)(struct journal_callback *jcb, int error), -+ struct journal_callback *jcb) -+{ -+ spin_lock(&handle->h_transaction->t_jcb_lock); -+ list_add_tail(&jcb->jcb_list, &handle->h_jcb); -+ spin_unlock(&handle->h_transaction->t_jcb_lock); -+ jcb->jcb_func = func; -+} -+ -+/** - * int jbd2_journal_stop() - complete a transaction - * @handle: tranaction to complete. - * -@@ -1422,6 +1456,11 @@ - wake_up(&journal->j_wait_transaction_locked); - } - -+ /* Move callbacks from the handle to the transaction. */ -+ spin_lock(&transaction->t_jcb_lock); -+ list_splice(&handle->h_jcb, &transaction->t_jcb); -+ spin_unlock(&transaction->t_jcb_lock); -+ - /* - * If the handle is marked SYNC, we need to set another commit - * going! We also want to force a commit if the current diff --git a/lustre/kernel_patches/patches/jbd2_stats_proc_init-wrong-place.patch b/lustre/kernel_patches/patches/jbd2_stats_proc_init-wrong-place.patch deleted file mode 100644 index a37c894..0000000 --- a/lustre/kernel_patches/patches/jbd2_stats_proc_init-wrong-place.patch +++ /dev/null @@ -1,53 +0,0 @@ -From 42e140bf105aea1c9679b1cd128aebc35196e6fc Mon Sep 17 00:00:00 2001 -From: yangsheng -Date: Mon, 15 Nov 2010 21:26:35 +0800 -Subject: [PATCH] jbd2_stats_proc_init wrong place. - - The jbd2_stats_proc_init() was placed on wrong location in - jbd2_journal_init_dev(). This may cause /proc/fs/jdb2//* - cannot be created when using external journal device. - - Reviewed-by: Andreas Dilger - ---- ---- - fs/jbd2/journal.c | 16 ++++++++-------- - 1 files changed, 8 insertions(+), 8 deletions(-) - -diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c -index c590d15..f837ba9 100644 ---- a/fs/jbd2/journal.c -+++ b/fs/jbd2/journal.c -@@ -899,6 +899,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, - - /* journal descriptor can store up to n blocks -bzzz */ - journal->j_blocksize = blocksize; -+ journal->j_dev = bdev; -+ journal->j_fs_dev = fs_dev; -+ journal->j_blk_offset = start; -+ journal->j_maxlen = len; -+ bdevname(journal->j_dev, journal->j_devname); -+ p = journal->j_devname; -+ while ((p = strchr(p, '/'))) -+ *p = '!'; - jbd2_stats_proc_init(journal); - n = journal->j_blocksize / sizeof(journal_block_tag_t); - journal->j_wbufsize = n; -@@ -908,14 +916,6 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, - __func__); - goto out_err; - } -- journal->j_dev = bdev; -- journal->j_fs_dev = fs_dev; -- journal->j_blk_offset = start; -- journal->j_maxlen = len; -- bdevname(journal->j_dev, journal->j_devname); -- p = journal->j_devname; -- while ((p = strchr(p, '/'))) -- *p = '!'; - - bh = __getblk(journal->j_dev, start, journal->j_blocksize); - if (!bh) { --- -1.7.2.3 - diff --git a/lustre/kernel_patches/patches/lustre_iser_max_sectors_tuning_lustre2.0.patch b/lustre/kernel_patches/patches/lustre_iser_max_sectors_tuning_lustre2.0.patch deleted file mode 100644 index 9f3c500..0000000 --- a/lustre/kernel_patches/patches/lustre_iser_max_sectors_tuning_lustre2.0.patch +++ /dev/null @@ -1,78 +0,0 @@ -diff -Naur base.linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iscsi_iser.c linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iscsi_iser.c ---- base.linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iscsi_iser.c 2010-09-09 16:57:15.000000000 -0400 -+++ linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iscsi_iser.c 2010-09-09 17:02:17.000000000 -0400 -@@ -586,18 +586,25 @@ - iser_conn_terminate(ib_conn); - } - -+static int iscsi_iser_slave_configure(struct scsi_device *sdev) -+{ -+ blk_queue_dma_alignment(sdev->request_queue, 0); -+ return 0; -+} -+ - static struct scsi_host_template iscsi_iser_sht = { - .module = THIS_MODULE, - .name = "iSCSI Initiator over iSER, v." DRV_VER, - .queuecommand = iscsi2_queuecommand, - .change_queue_depth = iscsi2_change_queue_depth, - .sg_tablesize = ISCSI_ISER_SG_TABLESIZE, -- .max_sectors = 1024, -+ .max_sectors = 0xffff, - .cmd_per_lun = ISER_DEF_CMD_PER_LUN, - .eh_abort_handler = iscsi2_eh_abort, - .eh_device_reset_handler= iscsi2_eh_device_reset, - .eh_host_reset_handler= iscsi2_eh_target_reset, - .use_clustering = DISABLE_CLUSTERING, -+ .slave_configure = iscsi_iser_slave_configure, - .proc_name = "iscsi_iser", - .this_id = -1, - }; -diff -Naur base.linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iscsi_iser.h linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iscsi_iser.h ---- base.linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iscsi_iser.h 2010-09-09 16:57:15.000000000 -0400 -+++ linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iscsi_iser.h 2010-09-09 17:03:17.000000000 -0400 -@@ -92,7 +92,8 @@ - #define MASK_4K (~(SIZE_4K-1)) - - /* support upto 512KB in one RDMA */ --#define ISCSI_ISER_SG_TABLESIZE (0x80000 >> SHIFT_4K) -+/* FMR space for 1 MB of 4k-page transfers, plus 1 if not page aligned */ -+#define ISCSI_ISER_SG_TABLESIZE (((1<<20) >> SHIFT_4K) + 1) - #define ISER_DEF_CMD_PER_LUN 128 - - /* QP settings */ -diff -Naur base.linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iser_verbs.c linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iser_verbs.c ---- base.linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iser_verbs.c 2010-09-09 16:57:15.000000000 -0400 -+++ linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iser_verbs.c 2010-09-09 17:04:44.000000000 -0400 -@@ -137,7 +137,7 @@ - device = ib_conn->device; - - ib_conn->page_vec = kmalloc(sizeof(struct iser_page_vec) + -- (sizeof(u64) * (ISCSI_ISER_SG_TABLESIZE +1)), -+ sizeof(u64) * ISCSI_ISER_SG_TABLESIZE, - GFP_KERNEL); - if (!ib_conn->page_vec) { - ret = -ENOMEM; -@@ -146,9 +146,7 @@ - ib_conn->page_vec->pages = (u64 *) (ib_conn->page_vec + 1); - - params.page_shift = SHIFT_4K; -- /* when the first/last SG element are not start/end * -- * page aligned, the map whould be of N+1 pages */ -- params.max_pages_per_fmr = ISCSI_ISER_SG_TABLESIZE + 1; -+ params.max_pages_per_fmr = ISCSI_ISER_SG_TABLESIZE; - /* make the pool size twice the max number of SCSI commands * - * the ML is expected to queue, watermark for unmap at 50% */ - params.pool_size = ISCSI_DEF_XMIT_CMDS_MAX * 2; -diff -Naur base.linux-2.6.18.x86_64/include/scsi/libiscsi2.h linux-2.6.18.x86_64/include/scsi/libiscsi2.h ---- base.linux-2.6.18.x86_64/include/scsi/libiscsi2.h 2010-09-09 16:57:35.000000000 -0400 -+++ linux-2.6.18.x86_64/include/scsi/libiscsi2.h 2010-09-09 17:05:34.000000000 -0400 -@@ -43,7 +43,7 @@ - struct iscsi_nopin; - struct device; - --#define ISCSI_DEF_XMIT_CMDS_MAX 128 /* must be power of 2 */ -+#define ISCSI_DEF_XMIT_CMDS_MAX 256 /* must be power of 2 */ - #define ISCSI_MGMT_CMDS_MAX 15 - - #define ISCSI_DEF_CMD_PER_LUN 32 diff --git a/lustre/kernel_patches/patches/md-avoid-bug_on-when-bmc-overflow.patch b/lustre/kernel_patches/patches/md-avoid-bug_on-when-bmc-overflow.patch deleted file mode 100644 index f64557e..0000000 --- a/lustre/kernel_patches/patches/md-avoid-bug_on-when-bmc-overflow.patch +++ /dev/null @@ -1,64 +0,0 @@ -diff .prev/drivers/md/bitmap.c ./drivers/md/bitmap.c ---- .prev/drivers/md/bitmap.c 2007-02-07 13:03:56.000000000 +1100 -+++ ./drivers/md/bitmap.c 2007-02-07 21:34:47.000000000 +1100 -@@ -1160,6 +1160,22 @@ int bitmap_startwrite(struct bitmap *bit - return 0; - } - -+ if (unlikely((*bmc & COUNTER_MAX) == COUNTER_MAX)) { -+ DEFINE_WAIT(__wait); -+ /* note that it is safe to do the prepare_to_wait -+ * after the test as long as we do it before dropping -+ * the spinlock. -+ */ -+ prepare_to_wait(&bitmap->overflow_wait, &__wait, -+ TASK_UNINTERRUPTIBLE); -+ spin_unlock_irq(&bitmap->lock); -+ bitmap->mddev->queue -+ ->unplug_fn(bitmap->mddev->queue); -+ schedule(); -+ finish_wait(&bitmap->overflow_wait, &__wait); -+ continue; -+ } -+ - switch(*bmc) { - case 0: - bitmap_file_set_bit(bitmap, offset); -@@ -1169,7 +1185,7 @@ int bitmap_startwrite(struct bitmap *bit - case 1: - *bmc = 2; - } -- BUG_ON((*bmc & COUNTER_MAX) == COUNTER_MAX); -+ - (*bmc)++; - - spin_unlock_irq(&bitmap->lock); -@@ -1207,6 +1223,9 @@ void bitmap_endwrite(struct bitmap *bitm - if (!success && ! (*bmc & NEEDED_MASK)) - *bmc |= NEEDED_MASK; - -+ if ((*bmc & COUNTER_MAX) == COUNTER_MAX) -+ wake_up(&bitmap->overflow_wait); -+ - (*bmc)--; - if (*bmc <= 2) { - set_page_attr(bitmap, -@@ -1431,6 +1450,7 @@ int bitmap_create(mddev_t *mddev) - spin_lock_init(&bitmap->lock); - atomic_set(&bitmap->pending_writes, 0); - init_waitqueue_head(&bitmap->write_wait); -+ init_waitqueue_head(&bitmap->overflow_wait); - - bitmap->mddev = mddev; - -diff .prev/include/linux/raid/bitmap.h ./include/linux/raid/bitmap.h ---- .prev/include/linux/raid/bitmap.h 2007-02-07 13:03:56.000000000 +1100 -+++ ./include/linux/raid/bitmap.h 2007-02-07 20:57:57.000000000 +1100 -@@ -247,6 +247,7 @@ struct bitmap { - - atomic_t pending_writes; /* pending writes to the bitmap file */ - wait_queue_head_t write_wait; -+ wait_queue_head_t overflow_wait; - - }; - diff --git a/lustre/kernel_patches/patches/md-rebuild-policy.patch b/lustre/kernel_patches/patches/md-rebuild-policy.patch deleted file mode 100644 index d42dae4..0000000 --- a/lustre/kernel_patches/patches/md-rebuild-policy.patch +++ /dev/null @@ -1,140 +0,0 @@ -Index: linux-2.6.18-128.1.6/drivers/md/md.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/drivers/md/md.c 2009-04-14 21:05:26.000000000 -0600 -+++ linux-2.6.18-128.1.6/drivers/md/md.c 2009-06-02 23:25:31.000000000 -0600 -@@ -90,6 +90,8 @@ - - static int sysctl_speed_limit_min = 1000; - static int sysctl_speed_limit_max = 200000; -+static int sysctl_rebuild_window_size = 256; -+static int sysctl_disk_idle_size = 4096; - static inline int speed_min(mddev_t *mddev) - { - return mddev->sync_speed_min ? -@@ -121,6 +123,22 @@ - .mode = S_IRUGO|S_IWUSR, - .proc_handler = &proc_dointvec, - }, -+ { -+ .ctl_name = DEV_RAID_REBUILD_WINDOW, -+ .procname = "rebuild_window_size", -+ .data = &sysctl_rebuild_window_size, -+ .maxlen = sizeof(int), -+ .mode = S_IRUGO|S_IWUSR, -+ .proc_handler = &proc_dointvec, -+ }, -+ { -+ .ctl_name = DEV_RAID_DISK_IDLE_SIZE, -+ .procname = "disk_idle_size", -+ .data = &sysctl_disk_idle_size, -+ .maxlen = sizeof(int), -+ .mode = S_IRUGO|S_IWUSR, -+ .proc_handler = &proc_dointvec, -+ }, - { .ctl_name = 0 } - }; - -@@ -5009,15 +5027,16 @@ - { - mdk_rdev_t * rdev; - int idle; -- unsigned long curr_events; -+ unsigned long rw, sync; - - idle = 1; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) { - struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; -- curr_events = disk_stat_read(disk, sectors[0]) + -- disk_stat_read(disk, sectors[1]) - -- atomic_read(&disk->sync_io); -+ -+ rw = disk_stat_read(disk, sectors[READ])+disk_stat_read(disk, sectors[WRITE]); -+ sync = atomic_read(&disk->sync_io); -+ - /* The difference between curr_events and last_events - * will be affected by any new non-sync IO (making - * curr_events bigger) and any difference in the amount of -@@ -5031,9 +5050,9 @@ - * - * Note: the following is an unsigned comparison. - */ -- if ((curr_events - rdev->last_events + 4096) > 8192) { -- rdev->last_events = curr_events; -+ if (rw - rdev->last_events > sync + sysctl_disk_idle_size) { - idle = 0; -+ rdev->last_events = rw - sync; - } - } - rcu_read_unlock(); -@@ -5100,8 +5119,7 @@ - void md_do_sync(mddev_t *mddev) - { - mddev_t *mddev2; -- unsigned int currspeed = 0, -- window; -+ unsigned int currspeed = 0; - sector_t max_sectors,j, io_sectors; - unsigned long mark[SYNC_MARKS]; - sector_t mark_cnt[SYNC_MARKS]; -@@ -5221,9 +5239,8 @@ - /* - * Tune reconstruction: - */ -- window = 32*(PAGE_SIZE/512); - printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", -- window/2,(unsigned long long) max_sectors/2); -+ sysctl_rebuild_window_size/2,(unsigned long long) max_sectors/2); - - atomic_set(&mddev->recovery_active, 0); - init_waitqueue_head(&mddev->recovery_wait); -@@ -5261,7 +5278,7 @@ - */ - md_new_event(mddev); - -- if (last_check + window > io_sectors || j == max_sectors) -+ if (last_check + sysctl_rebuild_window_size > io_sectors || j == max_sectors) - continue; - - last_check = io_sectors; -@@ -5282,7 +5299,6 @@ - last_mark = next; - } - -- - if (kthread_should_stop()) { - /* - * got a signal, exit. -@@ -5306,10 +5322,16 @@ - - currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 - /((jiffies-mddev->resync_mark)/HZ +1) +1; -- - if (currspeed > speed_min(mddev)) { - if ((currspeed > speed_max(mddev)) || - !is_mddev_idle(mddev)) { -+ static unsigned long next_report; -+ if (time_after(jiffies, next_report)) { -+ printk(KERN_INFO "md: rebuild %s throttled due to IO\n", -+ mdname(mddev)); -+ /* once per 10 minutes */ -+ next_report = jiffies + 600 * HZ; -+ } - msleep(500); - goto repeat; - } -Index: linux-2.6.18-128.1.6/include/linux/sysctl.h -=================================================================== ---- linux-2.6.18-128.1.6.orig/include/linux/sysctl.h 2009-04-14 21:05:41.000000000 -0600 -+++ linux-2.6.18-128.1.6/include/linux/sysctl.h 2009-06-02 23:25:31.000000000 -0600 -@@ -928,7 +928,9 @@ - /* /proc/sys/dev/raid */ - enum { - DEV_RAID_SPEED_LIMIT_MIN=1, -- DEV_RAID_SPEED_LIMIT_MAX=2 -+ DEV_RAID_SPEED_LIMIT_MAX=2, -+ DEV_RAID_REBUILD_WINDOW=3, -+ DEV_RAID_DISK_IDLE_SIZE=4 - }; - - /* /proc/sys/dev/parport/default */ diff --git a/lustre/kernel_patches/patches/mpt-fusion-max-sge.patch b/lustre/kernel_patches/patches/mpt-fusion-max-sge.patch deleted file mode 100644 index 3fa6c48..0000000 --- a/lustre/kernel_patches/patches/mpt-fusion-max-sge.patch +++ /dev/null @@ -1,31 +0,0 @@ -diff -Nrup linux-2.6.18-92.1.10.orig/drivers/message/fusion/Kconfig linux-2.6.18-92.1.10/drivers/message/fusion/Kconfig ---- linux-2.6.18-92.1.10.orig/drivers/message/fusion/Kconfig 2008-12-11 10:27:02.000000000 +1100 -+++ linux-2.6.18-92.1.10/drivers/message/fusion/Kconfig 2008-12-11 10:28:42.000000000 +1100 -@@ -59,10 +59,10 @@ config FUSION_SAS - LSISAS1078 - - config FUSION_MAX_SGE -- int "Maximum number of scatter gather entries (16 - 128)" -+ int "Maximum number of scatter gather entries (16 - 256)" - depends on FUSION -- default "128" -- range 16 128 -+ default "256" -+ range 16 256 - help - This option allows you to specify the maximum number of scatter- - gather entries per I/O. The driver default is 128, which matches -diff -Nrup linux-2.6.18-92.1.10.orig/drivers/message/fusion/mptbase.h linux-2.6.18-92.1.10/drivers/message/fusion/mptbase.h ---- linux-2.6.18-92.1.10.orig/drivers/message/fusion/mptbase.h 2008-12-11 10:27:03.000000000 +1100 -+++ linux-2.6.18-92.1.10/drivers/message/fusion/mptbase.h 2008-12-11 10:30:55.000000000 +1100 -@@ -166,8 +166,8 @@ - #ifdef CONFIG_FUSION_MAX_SGE - #if CONFIG_FUSION_MAX_SGE < 16 - #define MPT_SCSI_SG_DEPTH 16 --#elif CONFIG_FUSION_MAX_SGE > 128 --#define MPT_SCSI_SG_DEPTH 128 -+#elif CONFIG_FUSION_MAX_SGE > 256 -+#define MPT_SCSI_SG_DEPTH 256 - #else - #define MPT_SCSI_SG_DEPTH CONFIG_FUSION_MAX_SGE - #endif diff --git a/lustre/kernel_patches/patches/prune-icache-use-trylock-rhel5.patch b/lustre/kernel_patches/patches/prune-icache-use-trylock-rhel5.patch deleted file mode 100644 index beadec2..0000000 --- a/lustre/kernel_patches/patches/prune-icache-use-trylock-rhel5.patch +++ /dev/null @@ -1,13 +0,0 @@ ---- linux/fs/inode.c.orig 2009-01-24 03:28:57.000000000 +0800 -+++ linux/fs/inode.c 2009-01-24 03:30:18.000000000 +0800 -@@ -418,7 +418,9 @@ static void prune_icache(int nr_to_scan) - int nr_scanned; - unsigned long reap = 0; - -- mutex_lock(&iprune_mutex); -+ if (!mutex_trylock(&iprune_mutex)) -+ return; -+ - spin_lock(&inode_lock); - for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { - struct inode *inode; diff --git a/lustre/kernel_patches/patches/quota-large-limits-rhel5.patch b/lustre/kernel_patches/patches/quota-large-limits-rhel5.patch deleted file mode 100644 index e53d871..0000000 --- a/lustre/kernel_patches/patches/quota-large-limits-rhel5.patch +++ /dev/null @@ -1,622 +0,0 @@ -Index: linux-2.6.18-128.1.6/fs/dquot.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/dquot.c 2009-04-14 21:04:50.000000000 -0600 -+++ linux-2.6.18-128.1.6/fs/dquot.c 2009-06-02 23:26:36.000000000 -0600 -@@ -1592,10 +1592,19 @@ - } - - /* Generic routine for setting common part of quota structure */ --static void do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) -+static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) - { - struct mem_dqblk *dm = &dquot->dq_dqb; - int check_blim = 0, check_ilim = 0; -+ struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; -+ -+ if ((di->dqb_valid & QIF_BLIMITS && -+ (di->dqb_bhardlimit > dqi->dqi_maxblimit || -+ di->dqb_bsoftlimit > dqi->dqi_maxblimit)) || -+ (di->dqb_valid & QIF_ILIMITS && -+ (di->dqb_ihardlimit > dqi->dqi_maxilimit || -+ di->dqb_isoftlimit > dqi->dqi_maxilimit))) -+ return -ERANGE; - - spin_lock(&dq_data_lock); - if (di->dqb_valid & QIF_SPACE) { -@@ -1627,7 +1636,7 @@ - clear_bit(DQ_BLKS_B, &dquot->dq_flags); - } - else if (!(di->dqb_valid & QIF_BTIME)) /* Set grace only if user hasn't provided his own... */ -- dm->dqb_btime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_bgrace; -+ dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; - } - if (check_ilim) { - if (!dm->dqb_isoftlimit || dm->dqb_curinodes < dm->dqb_isoftlimit) { -@@ -1635,7 +1644,7 @@ - clear_bit(DQ_INODES_B, &dquot->dq_flags); - } - else if (!(di->dqb_valid & QIF_ITIME)) /* Set grace only if user hasn't provided his own... */ -- dm->dqb_itime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; -+ dm->dqb_itime = get_seconds() + dqi->dqi_igrace; - } - if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit || dm->dqb_isoftlimit) - clear_bit(DQ_FAKE_B, &dquot->dq_flags); -@@ -1643,21 +1652,24 @@ - set_bit(DQ_FAKE_B, &dquot->dq_flags); - spin_unlock(&dq_data_lock); - mark_dquot_dirty(dquot); -+ -+ return 0; - } - - int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di) - { - struct dquot *dquot; -+ int rc; - - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); - if (!(dquot = dqget(sb, id, type))) { - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); - return -ESRCH; - } -- do_set_dqblk(dquot, di); -+ rc = do_set_dqblk(dquot, di); - dqput(dquot); - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); -- return 0; -+ return rc; - } - - /* Generic routine for getting common part of quota file information */ -Index: linux-2.6.18-128.1.6/fs/quota_v1.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/quota_v1.c 2006-09-19 21:42:06.000000000 -0600 -+++ linux-2.6.18-128.1.6/fs/quota_v1.c 2009-06-02 23:26:36.000000000 -0600 -@@ -139,6 +139,9 @@ - goto out; - } - ret = 0; -+ /* limits are stored as unsigned 32-bit data */ -+ dqopt->info[type].dqi_maxblimit = 0xffffffff; -+ dqopt->info[type].dqi_maxilimit = 0xffffffff; - dqopt->info[type].dqi_igrace = dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME; - dqopt->info[type].dqi_bgrace = dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME; - out: -Index: linux-2.6.18-128.1.6/fs/quota_v2.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/fs/quota_v2.c 2006-09-19 21:42:06.000000000 -0600 -+++ linux-2.6.18-128.1.6/fs/quota_v2.c 2009-06-02 23:26:36.000000000 -0600 -@@ -23,26 +23,64 @@ - typedef char *dqbuf_t; - - #define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff) --#define GETENTRIES(buf) ((struct v2_disk_dqblk *)(((char *)buf)+sizeof(struct v2_disk_dqdbheader))) -+#define GETENTRIES(buf) ((union v2_disk_dqblk *)(((char *)buf) + \ -+ sizeof(struct v2_disk_dqdbheader))) -+#define REV_ASSERT(r) BUG_ON((rev) != 0 && (rev) != 1) -+ -+static const union v2_disk_dqblk emptydquot; -+static const union v2_disk_dqblk fakedquot[2] = { -+ {.r0 = {.dqb_itime = __constant_cpu_to_le64(1LLU)} }, -+ {.r1 = {.dqb_itime = __constant_cpu_to_le64(1LLU)} } -+}; - --/* Check whether given file is really vfsv0 quotafile */ --static int v2_check_quota_file(struct super_block *sb, int type) -+static inline uint v2_dqblksz(uint rev) -+{ -+ uint sz; -+ -+ REV_ASSERT(rev); -+ -+ if (rev == 0) -+ sz = sizeof(struct v2_disk_dqblk_r0); -+ else -+ sz = sizeof(struct v2_disk_dqblk_r1); -+ -+ return sz; -+} -+ -+/* Number of quota entries in a block */ -+static inline int v2_dqstrinblk(uint rev) -+{ -+ return (V2_DQBLKSIZE-sizeof(struct v2_disk_dqdbheader))/v2_dqblksz(rev); -+} -+ -+/* Get revision of a quota file, -1 if it does not look a quota file */ -+static int v2_quota_file_revision(struct super_block *sb, int type) - { - struct v2_disk_dqheader dqhead; - ssize_t size; - static const uint quota_magics[] = V2_INITQMAGICS; -- static const uint quota_versions[] = V2_INITQVERSIONS; -+ static const uint quota_versions_r0[] = V2_INITQVERSIONS_R0; -+ static const uint quota_versions_r1[] = V2_INITQVERSIONS_R1; - - size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0); - if (size != sizeof(struct v2_disk_dqheader)) { - printk("quota_v2: failed read expected=%zd got=%zd\n", - sizeof(struct v2_disk_dqheader), size); -- return 0; -+ return -1; - } -- if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] || -- le32_to_cpu(dqhead.dqh_version) != quota_versions[type]) -- return 0; -- return 1; -+ if (le32_to_cpu(dqhead.dqh_magic) == quota_magics[type]) { -+ if (le32_to_cpu(dqhead.dqh_version) == quota_versions_r0[type]) -+ return 0; -+ if (le32_to_cpu(dqhead.dqh_version) == quota_versions_r1[type]) -+ return 1; -+ } -+ return -1; -+} -+ -+/* Check whether given file is really vfsv0 quotafile */ -+static inline int v2_check_quota_file(struct super_block *sb, int type) -+{ -+ return v2_quota_file_revision(sb, type) != -1; - } - - /* Read information header from quota file */ -@@ -51,6 +89,13 @@ - struct v2_disk_dqinfo dinfo; - struct mem_dqinfo *info = sb_dqopt(sb)->info+type; - ssize_t size; -+ int rev; -+ -+ rev = v2_quota_file_revision(sb, type); -+ if (rev < 0) { -+ printk(KERN_WARNING "Second quota file check failed.\n"); -+ return -1; -+ } - - size = sb->s_op->quota_read(sb, type, (char *)&dinfo, - sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); -@@ -65,6 +110,16 @@ - info->u.v2_i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); - info->u.v2_i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); - info->u.v2_i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); -+ -+ info->u.v2_i.dqi_revision = rev; -+ if (rev == 0) { -+ info->dqi_maxblimit = 0xffffffffULL; -+ info->dqi_maxilimit = 0xffffffffULL; -+ } else { -+ info->dqi_maxblimit = 0xffffffffffffffffULL; -+ info->dqi_maxilimit = 0xffffffffffffffffULL; -+ } -+ - return 0; - } - -@@ -94,29 +149,61 @@ - return 0; - } - --static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d) -+static void disk2memdqb(struct mem_dqblk *m, union v2_disk_dqblk *d, uint rev) - { -- m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit); -- m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit); -- m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes); -- m->dqb_itime = le64_to_cpu(d->dqb_itime); -- m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit); -- m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit); -- m->dqb_curspace = le64_to_cpu(d->dqb_curspace); -- m->dqb_btime = le64_to_cpu(d->dqb_btime); --} -- --static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id) --{ -- d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit); -- d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit); -- d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes); -- d->dqb_itime = cpu_to_le64(m->dqb_itime); -- d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit); -- d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit); -- d->dqb_curspace = cpu_to_le64(m->dqb_curspace); -- d->dqb_btime = cpu_to_le64(m->dqb_btime); -- d->dqb_id = cpu_to_le32(id); -+ REV_ASSERT(rev); -+ -+ if (rev == 0) { -+ struct v2_disk_dqblk_r0 *ddqblk = &d->r0; -+ m->dqb_ihardlimit = le32_to_cpu(ddqblk->dqb_ihardlimit); -+ m->dqb_isoftlimit = le32_to_cpu(ddqblk->dqb_isoftlimit); -+ m->dqb_curinodes = le32_to_cpu(ddqblk->dqb_curinodes); -+ m->dqb_itime = le64_to_cpu(ddqblk->dqb_itime); -+ m->dqb_bhardlimit = le32_to_cpu(ddqblk->dqb_bhardlimit); -+ m->dqb_bsoftlimit = le32_to_cpu(ddqblk->dqb_bsoftlimit); -+ m->dqb_curspace = le64_to_cpu(ddqblk->dqb_curspace); -+ m->dqb_btime = le64_to_cpu(ddqblk->dqb_btime); -+ } else { -+ struct v2_disk_dqblk_r1 *ddqblk = &d->r1; -+ m->dqb_ihardlimit = le64_to_cpu(ddqblk->dqb_ihardlimit); -+ m->dqb_isoftlimit = le64_to_cpu(ddqblk->dqb_isoftlimit); -+ m->dqb_curinodes = le64_to_cpu(ddqblk->dqb_curinodes); -+ m->dqb_itime = le64_to_cpu(ddqblk->dqb_itime); -+ m->dqb_bhardlimit = le64_to_cpu(ddqblk->dqb_bhardlimit); -+ m->dqb_bsoftlimit = le64_to_cpu(ddqblk->dqb_bsoftlimit); -+ m->dqb_curspace = le64_to_cpu(ddqblk->dqb_curspace); -+ m->dqb_btime = le64_to_cpu(ddqblk->dqb_btime); -+ } -+} -+ -+static void mem2diskdqb(union v2_disk_dqblk *d, struct mem_dqblk *m, -+ qid_t id, uint rev) -+{ -+ REV_ASSERT(rev); -+ -+ if (rev == 0) { -+ struct v2_disk_dqblk_r0 *ddqblk = &d->r0; -+ ddqblk->dqb_id = cpu_to_le32(id); -+ ddqblk->dqb_ihardlimit = cpu_to_le32((__u32)m->dqb_ihardlimit); -+ ddqblk->dqb_isoftlimit = cpu_to_le32((__u32)m->dqb_isoftlimit); -+ ddqblk->dqb_curinodes = cpu_to_le32((__u32)m->dqb_curinodes); -+ ddqblk->dqb_itime = cpu_to_le64(m->dqb_itime); -+ ddqblk->dqb_bhardlimit = cpu_to_le32((__u32)m->dqb_bhardlimit); -+ ddqblk->dqb_bsoftlimit = cpu_to_le32((__u32)m->dqb_bsoftlimit); -+ ddqblk->dqb_curspace = cpu_to_le64(m->dqb_curspace); -+ ddqblk->dqb_btime = cpu_to_le64(ddqblk->dqb_btime); -+ } else { -+ struct v2_disk_dqblk_r1 *ddqblk = &d->r1; -+ ddqblk->dqb_id = cpu_to_le32(id); -+ ddqblk->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit); -+ ddqblk->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit); -+ ddqblk->dqb_curinodes = cpu_to_le64(m->dqb_curinodes); -+ ddqblk->dqb_itime = cpu_to_le64(m->dqb_itime); -+ ddqblk->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit); -+ ddqblk->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit); -+ ddqblk->dqb_curspace = cpu_to_le64(m->dqb_curspace); -+ ddqblk->dqb_btime = cpu_to_le64(ddqblk->dqb_btime); -+ } - } - - static dqbuf_t getdqbuf(void) -@@ -268,10 +355,10 @@ - { - struct super_block *sb = dquot->dq_sb; - struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type; -- uint blk, i; -+ uint blk, i, rev = info->u.v2_i.dqi_revision; -+ uint dqblksz = v2_dqblksz(rev), dqstrinblk = v2_dqstrinblk(rev); - struct v2_disk_dqdbheader *dh; -- struct v2_disk_dqblk *ddquot; -- struct v2_disk_dqblk fakedquot; -+ union v2_disk_dqblk *ddquot; - dqbuf_t buf; - - *err = 0; -@@ -298,17 +385,18 @@ - info->u.v2_i.dqi_free_entry = blk; - mark_info_dirty(sb, dquot->dq_type); - } -- if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK) /* Block will be full? */ -+ /* Block will be full? */ -+ if (le16_to_cpu(dh->dqdh_entries)+1 >= dqstrinblk) - if ((*err = remove_free_dqentry(sb, dquot->dq_type, buf, blk)) < 0) { - printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk); - goto out_buf; - } - dh->dqdh_entries = cpu_to_le16(le16_to_cpu(dh->dqdh_entries)+1); -- memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk)); - /* Find free structure in block */ -- for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++); -+ for (i = 0; i < dqstrinblk && memcmp(&emptydquot, ddquot, dqblksz); -+ i++, ddquot = (char *)ddquot + dqblksz); - #ifdef __QUOTA_V2_PARANOIA -- if (i == V2_DQSTRINBLK) { -+ if (i == dqstrinblk) { - printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n"); - *err = -EIO; - goto out_buf; -@@ -318,7 +406,8 @@ - printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk); - goto out_buf; - } -- dquot->dq_off = (blk<dq_off = (blk<dq_type; - ssize_t ret; -- struct v2_disk_dqblk ddquot, empty; -+ union v2_disk_dqblk ddquot; -+ uint rev = sb_dqopt(dquot->dq_sb)->info[type].u.v2_i.dqi_revision; -+ uint dqblksz = v2_dqblksz(rev); - - /* dq_off is guarded by dqio_mutex */ - if (!dquot->dq_off) -@@ -401,18 +492,22 @@ - return ret; - } - spin_lock(&dq_data_lock); -- mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id); -+ mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id, rev); - /* Argh... We may need to write structure full of zeroes but that would be - * treated as an empty place by the rest of the code. Format change would - * be definitely cleaner but the problems probably are not worth it */ -- memset(&empty, 0, sizeof(struct v2_disk_dqblk)); -- if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk))) -- ddquot.dqb_itime = cpu_to_le64(1); -+ if (!memcmp(&emptydquot, &ddquot, dqblksz)) { -+ if (rev == 0) -+ ddquot.r0.dqb_itime = cpu_to_le64(1); -+ else -+ ddquot.r1.dqb_itime = cpu_to_le64(1); -+ } - spin_unlock(&dq_data_lock); - ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type, -- (char *)&ddquot, sizeof(struct v2_disk_dqblk), dquot->dq_off); -- if (ret != sizeof(struct v2_disk_dqblk)) { -- printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", dquot->dq_sb->s_id); -+ (char *)&ddquot, dqblksz, dquot->dq_off); -+ if (ret != dqblksz) { -+ printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", -+ dquot->dq_sb->s_id); - if (ret >= 0) - ret = -ENOSPC; - } -@@ -431,6 +526,7 @@ - struct v2_disk_dqdbheader *dh; - dqbuf_t buf = getdqbuf(); - int ret = 0; -+ uint rev = sb_dqopt(sb)->info[type].u.v2_i.dqi_revision; - - if (!buf) - return -ENOMEM; -@@ -456,8 +552,8 @@ - } - else { - memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0, -- sizeof(struct v2_disk_dqblk)); -- if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) { -+ v2_dqblksz(rev)); -+ if (le16_to_cpu(dh->dqdh_entries) == v2_dqstrinblk(rev)-1) { - /* Insert will write block itself */ - if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) { - printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk); -@@ -529,41 +625,56 @@ - return remove_tree(dquot, &tmp, 0); - } - -+static inline __u32 dqid(union v2_disk_dqblk *ddquot, uint rev) -+{ -+ __u32 dq_id; -+ -+ REV_ASSERT(rev); -+ -+ if (rev == 0) -+ dq_id = le32_to_cpu(ddquot->r0.dqb_id); -+ else -+ dq_id = le32_to_cpu(ddquot->r1.dqb_id); -+ -+ return dq_id; -+} -+ - /* Find entry in block */ - static loff_t find_block_dqentry(struct dquot *dquot, uint blk) - { - dqbuf_t buf = getdqbuf(); - loff_t ret = 0; - int i; -- struct v2_disk_dqblk *ddquot = GETENTRIES(buf); -+ union v2_disk_dqblk *ddquot = GETENTRIES(buf); -+ int type = dquot->dq_type; -+ uint rev = sb_dqopt(dquot->dq_sb)->info[type].u.v2_i.dqi_revision; -+ uint dqblksz = v2_dqblksz(rev), dqstrinblk = v2_dqstrinblk(rev); - - if (!buf) - return -ENOMEM; -- if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) { -+ -+ ret = read_blk(dquot->dq_sb, type, blk, buf); -+ if (ret < 0) { - printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); - goto out_buf; - } - if (dquot->dq_id) -- for (i = 0; i < V2_DQSTRINBLK && -- le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++); -+ for (i = 0; i < dqstrinblk && dqid(ddquot, rev) != dquot->dq_id; -+ i++, ddquot = (char *)ddquot + dqblksz); - else { /* ID 0 as a bit more complicated searching... */ -- struct v2_disk_dqblk fakedquot; -- -- memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk)); -- for (i = 0; i < V2_DQSTRINBLK; i++) -- if (!le32_to_cpu(ddquot[i].dqb_id) && -- memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk))) -+ for (i = 0; i < dqstrinblk; i++, ddquot = (char *)ddquot+dqblksz) -+ if (!dqid(ddquot, rev) && -+ memcmp(&emptydquot, ddquot, dqblksz)) - break; - } -- if (i == V2_DQSTRINBLK) { -+ if (i == dqstrinblk) { - printk(KERN_ERR "VFS: Quota for id %u referenced " - "but not present.\n", dquot->dq_id); - ret = -EIO; - goto out_buf; - } - else -- ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct -- v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk); -+ ret = (blk << V2_DQBLKSIZE_BITS)+((char *)ddquot-(char *)buf); - out_buf: - freedqbuf(buf); - return ret; -@@ -605,7 +716,7 @@ - { - int type = dquot->dq_type; - loff_t offset; -- struct v2_disk_dqblk ddquot, empty; -+ union v2_disk_dqblk ddquot; - int ret = 0; - - #ifdef __QUOTA_V2_PARANOIA -@@ -626,25 +737,30 @@ - ret = offset; - } - else { -+ uint rev = sb_dqopt(dquot->dq_sb)->info[type].u.v2_i. -+ dqi_revision; -+ uint dqblksz = v2_dqblksz(rev); - dquot->dq_off = offset; -- if ((ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, -- (char *)&ddquot, sizeof(struct v2_disk_dqblk), offset)) -- != sizeof(struct v2_disk_dqblk)) { -+ ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, -+ (char *)&ddquot, dqblksz, offset); -+ if (ret != dqblksz) { - if (ret >= 0) - ret = -EIO; - printk(KERN_ERR "VFS: Error while reading quota " - "structure for id %u.\n", dquot->dq_id); -- memset(&ddquot, 0, sizeof(struct v2_disk_dqblk)); -+ memset(&ddquot, 0, dqblksz); - } - else { - ret = 0; - /* We need to escape back all-zero structure */ -- memset(&empty, 0, sizeof(struct v2_disk_dqblk)); -- empty.dqb_itime = cpu_to_le64(1); -- if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk))) -- ddquot.dqb_itime = 0; -+ if (!memcmp(&fakedquot[rev], &ddquot, dqblksz)) { -+ if (rev == 0) -+ ddquot.r0.dqb_itime = cpu_to_le64(0); -+ else -+ ddquot.r1.dqb_itime = cpu_to_le64(0); -+ } - } -- disk2memdqb(&dquot->dq_dqb, &ddquot); -+ disk2memdqb(&dquot->dq_dqb, &ddquot, rev); - if (!dquot->dq_dqb.dqb_bhardlimit && - !dquot->dq_dqb.dqb_bsoftlimit && - !dquot->dq_dqb.dqb_ihardlimit && -Index: linux-2.6.18-128.1.6/include/linux/dqblk_v2.h -=================================================================== ---- linux-2.6.18-128.1.6.orig/include/linux/dqblk_v2.h 2006-09-19 21:42:06.000000000 -0600 -+++ linux-2.6.18-128.1.6/include/linux/dqblk_v2.h 2009-06-02 23:26:36.000000000 -0600 -@@ -21,6 +21,7 @@ - unsigned int dqi_blocks; - unsigned int dqi_free_blk; - unsigned int dqi_free_entry; -+ unsigned int dqi_revision; - }; - - #endif /* _LINUX_DQBLK_V2_H */ -Index: linux-2.6.18-128.1.6/include/linux/quota.h -=================================================================== ---- linux-2.6.18-128.1.6.orig/include/linux/quota.h 2006-09-19 21:42:06.000000000 -0600 -+++ linux-2.6.18-128.1.6/include/linux/quota.h 2009-06-02 23:26:36.000000000 -0600 -@@ -149,12 +149,12 @@ - * Data for one user/group kept in memory - */ - struct mem_dqblk { -- __u32 dqb_bhardlimit; /* absolute limit on disk blks alloc */ -- __u32 dqb_bsoftlimit; /* preferred limit on disk blks */ -+ qsize_t dqb_bhardlimit; /* absolute limit on disk blks alloc */ -+ qsize_t dqb_bsoftlimit; /* preferred limit on disk blks */ - qsize_t dqb_curspace; /* current used space */ -- __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */ -- __u32 dqb_isoftlimit; /* preferred inode limit */ -- __u32 dqb_curinodes; /* current # allocated inodes */ -+ qsize_t dqb_ihardlimit; /* absolute limit on allocated inodes */ -+ qsize_t dqb_isoftlimit; /* preferred inode limit */ -+ qsize_t dqb_curinodes; /* current # allocated inodes */ - time_t dqb_btime; /* time limit for excessive disk use */ - time_t dqb_itime; /* time limit for excessive inode use */ - }; -@@ -170,6 +170,8 @@ - unsigned long dqi_flags; - unsigned int dqi_bgrace; - unsigned int dqi_igrace; -+ qsize_t dqi_maxblimit; -+ qsize_t dqi_maxilimit; - union { - struct v1_mem_dqinfo v1_i; - struct v2_mem_dqinfo v2_i; -Index: linux-2.6.18-128.1.6/include/linux/quotaio_v2.h -=================================================================== ---- linux-2.6.18-128.1.6.orig/include/linux/quotaio_v2.h 2006-09-19 21:42:06.000000000 -0600 -+++ linux-2.6.18-128.1.6/include/linux/quotaio_v2.h 2009-06-02 23:26:36.000000000 -0600 -@@ -16,28 +16,51 @@ - 0xd9c01927 /* GRPQUOTA */\ - } - --#define V2_INITQVERSIONS {\ -+#define V2_INITQVERSIONS_R0 {\ - 0, /* USRQUOTA */\ - 0 /* GRPQUOTA */\ - } - -+#define V2_INITQVERSIONS_R1 {\ -+ 1, /* USRQUOTA */\ -+ 1 /* GRPQUOTA */\ -+} -+ - /* - * The following structure defines the format of the disk quota file - * (as it appears on disk) - the file is a radix tree whose leaves point - * to blocks of these structures. - */ --struct v2_disk_dqblk { -+struct v2_disk_dqblk_r0 { - __le32 dqb_id; /* id this quota applies to */ - __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */ - __le32 dqb_isoftlimit; /* preferred inode limit */ - __le32 dqb_curinodes; /* current # allocated inodes */ -- __le32 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */ -- __le32 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */ -+ __le32 dqb_bhardlimit; /* absolute limit on disk space */ -+ __le32 dqb_bsoftlimit; /* preferred limit on disk space */ -+ __le64 dqb_curspace; /* current space occupied (in bytes) */ -+ __le64 dqb_btime; /* time limit for excessive disk use */ -+ __le64 dqb_itime; /* time limit for excessive inode use */ -+}; -+ -+struct v2_disk_dqblk_r1 { -+ __le32 dqb_id; /* id this quota applies to */ -+ __le32 dqb_padding; /* padding field */ -+ __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */ -+ __le64 dqb_isoftlimit; /* preferred inode limit */ -+ __le64 dqb_curinodes; /* current # allocated inodes */ -+ __le64 dqb_bhardlimit; /* absolute limit on disk space */ -+ __le64 dqb_bsoftlimit; /* preferred limit on disk space */ - __le64 dqb_curspace; /* current space occupied (in bytes) */ - __le64 dqb_btime; /* time limit for excessive disk use */ - __le64 dqb_itime; /* time limit for excessive inode use */ - }; - -+union v2_disk_dqblk { -+ struct v2_disk_dqblk_r0 r0; -+ struct v2_disk_dqblk_r1 r1; -+}; -+ - /* - * Here are header structures as written on disk and their in-memory copies - */ -@@ -59,7 +82,7 @@ - - /* - * Structure of header of block with quota structures. It is padded to 16 bytes so -- * there will be space for exactly 21 quota-entries in a block -+ * there will be space for exactly 21 (r0) or 14 (r1) quota-entries in a block - */ - struct v2_disk_dqdbheader { - __le32 dqdh_next_free; /* Number of next block with free entry */ -@@ -74,6 +97,5 @@ - #define V2_DQBLKSIZE (1 << V2_DQBLKSIZE_BITS) /* Size of block with quota structures */ - #define V2_DQTREEOFF 1 /* Offset of tree in file in blocks */ - #define V2_DQTREEDEPTH 4 /* Depth of quota tree */ --#define V2_DQSTRINBLK ((V2_DQBLKSIZE - sizeof(struct v2_disk_dqdbheader)) / sizeof(struct v2_disk_dqblk)) /* Number of entries in one blocks */ - - #endif /* _LINUX_QUOTAIO_V2_H */ diff --git a/lustre/kernel_patches/patches/raid5-configurable-cachesize-rhel5.patch b/lustre/kernel_patches/patches/raid5-configurable-cachesize-rhel5.patch deleted file mode 100644 index be8f6c2..0000000 --- a/lustre/kernel_patches/patches/raid5-configurable-cachesize-rhel5.patch +++ /dev/null @@ -1,31 +0,0 @@ -diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c ---- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-06 17:23:39.000000000 +0800 -+++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-06 17:24:14.000000000 +0800 -@@ -57,7 +57,7 @@ - * Stripe cache - */ - --#define NR_STRIPES 256 -+static int raid5_nr_stripes = 256 * 8; - #define STRIPE_SIZE PAGE_SIZE - #define STRIPE_SHIFT (PAGE_SHIFT - 9) - #define STRIPE_SECTORS (STRIPE_SIZE>>9) -@@ -3230,7 +3230,7 @@ static int run(mddev_t *mddev) - else - conf->max_degraded = 1; - conf->algorithm = mddev->layout; -- conf->max_nr_stripes = NR_STRIPES; -+ conf->max_nr_stripes = raid5_nr_stripes; - conf->expand_progress = mddev->reshape_position; - - /* device size must be a multiple of chunk size */ -@@ -3821,6 +3821,7 @@ static void raid5_exit(void) - - module_init(raid5_init); - module_exit(raid5_exit); -+module_param(raid5_nr_stripes, int, 0644); - MODULE_LICENSE("GPL"); - MODULE_ALIAS("md-personality-4"); /* RAID5 */ - MODULE_ALIAS("md-raid5"); -Only in linux-2.6.18-53/drivers/md: raid5.c.orig -Only in linux-2.6.18-53.orig/include/linux/raid: .raid5.h.swp diff --git a/lustre/kernel_patches/patches/raid5-large-io-rhel5.patch b/lustre/kernel_patches/patches/raid5-large-io-rhel5.patch deleted file mode 100644 index 6a712a9..0000000 --- a/lustre/kernel_patches/patches/raid5-large-io-rhel5.patch +++ /dev/null @@ -1,15 +0,0 @@ -diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c ---- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-06 17:26:27.000000000 +0800 -+++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-06 17:26:55.000000000 +0800 -@@ -3340,6 +3340,11 @@ static int run(mddev_t *mddev) - mddev->array_size = mddev->size * (conf->previous_raid_disks - - conf->max_degraded); - -+ /* in order to support large I/Os */ -+ blk_queue_max_sectors(mddev->queue, conf->chunk_size * conf->previous_raid_disks >> 9); -+ mddev->queue->max_phys_segments = conf->chunk_size * (conf->previous_raid_disks - conf->max_degraded) >> PAGE_SHIFT; -+ mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;; -+ - return 0; - abort: - if (conf) { diff --git a/lustre/kernel_patches/patches/raid5-maxsectors-rhel5.patch b/lustre/kernel_patches/patches/raid5-maxsectors-rhel5.patch deleted file mode 100644 index 090d703..0000000 --- a/lustre/kernel_patches/patches/raid5-maxsectors-rhel5.patch +++ /dev/null @@ -1,23 +0,0 @@ -diff -ru linux-orig/drivers/md/raid5.c linux-new/drivers/md/raid5.c ---- linux-orig/drivers/md/raid5.c 2009-04-14 08:11:38.000000000 +1000 -+++ linux-new/drivers/md/raid5.c 2009-09-20 05:02:02.000000000 +1000 -@@ -3595,10 +3595,16 @@ - mddev->array_size = mddev->size * (conf->previous_raid_disks - - conf->max_degraded); - -+ int stripe_size = conf->chunk_size * (conf->previous_raid_disks - conf->max_degraded); -+ - /* in order to support large I/Os */ -- blk_queue_max_sectors(mddev->queue, conf->chunk_size * conf->previous_raid_disks >> 9); -- mddev->queue->max_phys_segments = conf->chunk_size * (conf->previous_raid_disks - conf->max_degraded) >> PAGE_SHIFT; -- mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;; -+ blk_queue_max_sectors(mddev->queue, stripe_size >> 9); -+ /* KTVM: set default max_sectors the same as the max_hw_sectors set above */ -+ mddev->queue->max_sectors = mddev->queue->max_hw_sectors; -+ printk("%s: setting max_sectors = %d, max_hw_sectors = %d\n", mdname(mddev), mddev->queue->max_sectors, mddev->queue->max_hw_sectors); -+ -+ mddev->queue->max_phys_segments = stripe_size >> PAGE_SHIFT; -+ mddev->queue->max_hw_segments = stripe_size >> PAGE_SHIFT;; - - /* raid5 device is able to do zcopy right now. */ - mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONSTANT_WRITE; diff --git a/lustre/kernel_patches/patches/raid5-merge-ios-rhel5.patch b/lustre/kernel_patches/patches/raid5-merge-ios-rhel5.patch deleted file mode 100644 index 52da835e..0000000 --- a/lustre/kernel_patches/patches/raid5-merge-ios-rhel5.patch +++ /dev/null @@ -1,185 +0,0 @@ -diff -pur b/drivers/md/raid5.c a/drivers/md/raid5.c ---- b/drivers/md/raid5.c 2009-02-20 15:56:36.000000000 +0800 -+++ a/drivers/md/raid5.c 2009-02-20 15:57:49.000000000 +0800 -@@ -1277,7 +1277,26 @@ static void compute_block_2(struct strip - } - } - -+/* -+ * The whole idea is to collect all bio's and then issue them -+ * disk by disk to assist merging a bit -bzzz -+ */ -+static void raid5_flush_bios(raid5_conf_t *conf, struct bio *bios[], int raid_disks) -+{ -+ struct bio *bio, *nbio; -+ int i; - -+ for (i = 0; i < raid_disks; i++) { -+ bio = bios[i]; -+ while (bio) { -+ nbio = bio->bi_next; -+ bio->bi_next = NULL; -+ generic_make_request(bio); -+ bio = nbio; -+ } -+ bios[i] = NULL; -+ } -+} - - /* - * Each stripe/dev can have one or more bion attached. -@@ -1392,7 +1411,7 @@ static int stripe_to_pdidx(sector_t stri - * - */ - --static void handle_stripe5(struct stripe_head *sh) -+static void handle_stripe5(struct stripe_head *sh, struct bio *bios[]) - { - raid5_conf_t *conf = sh->raid_conf; - int disks = sh->disks; -@@ -1939,7 +1958,11 @@ static void handle_stripe5(struct stripe - test_bit(R5_ReWrite, &sh->dev[i].flags)) - atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); - atomic_inc(&conf->out_reqs_in_queue); -- generic_make_request(bi); -+ if (bios) { -+ bi->bi_next = bios[i]; -+ bios[i] = bi; -+ } else -+ generic_make_request(bi); - } else { - if (rw == 1) - set_bit(STRIPE_DEGRADED, &sh->state); -@@ -1951,7 +1974,7 @@ static void handle_stripe5(struct stripe - } - } - --static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) -+static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page, struct bio *bios[]) - { - raid6_conf_t *conf = sh->raid_conf; - int disks = conf->raid_disks; -@@ -2499,7 +2522,11 @@ static void handle_stripe6(struct stripe - if (rw == WRITE && - test_bit(R5_ReWrite, &sh->dev[i].flags)) - atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); -- generic_make_request(bi); -+ if (bios) { -+ bi->bi_next = bios[i]; -+ bios[i] = bi; -+ } else -+ generic_make_request(bi); - atomic_inc(&conf->out_reqs_in_queue); - } else { - if (rw == 1) -@@ -2512,12 +2539,12 @@ static void handle_stripe6(struct stripe - } - } - --static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) -+static void handle_stripe(struct stripe_head *sh, struct page *tmp_page, struct bio *bios[]) - { - if (sh->raid_conf->level == 6) -- handle_stripe6(sh, tmp_page); -+ handle_stripe6(sh, tmp_page, bios); - else -- handle_stripe5(sh); -+ handle_stripe5(sh, bios); - } - - -@@ -2670,6 +2697,7 @@ static int make_request(request_queue_t - int stripes_per_chunk, sectors_per_block; - int sectors_per_stripe; - int i, j; -+ struct bio *bios[MD_SB_DISKS]; - - DEFINE_WAIT(w); - int disks, data_disks; -@@ -2698,6 +2726,7 @@ static int make_request(request_queue_t - sectors = bi->bi_size >> 9; - stripes_per_chunk = conf->chunk_size / STRIPE_SIZE; - -+ memset(&bios, 0, sizeof(bios)); - redo_bio: - /* stripe by stripe handle needs a stable raid layout, so if this - * reuqest covers the expanding region, wait it over. -@@ -2756,8 +2785,10 @@ retry: - * the raid layout has been changed, we have to redo the - * whole bio because we don't which sectors in it has been - * done, and which is not done. -jay */ -- if (raid5_redo_bio(conf, bi, disks, logical_sector)) -+ if (raid5_redo_bio(conf, bi, disks, logical_sector)) { -+ raid5_flush_bios(conf, bios, disks); - goto redo_bio; -+ } - - if (test_bit(STRIPE_EXPANDING, &sh->state)) { - /* Stripe is busy expanding or -@@ -2766,6 +2797,7 @@ retry: - */ - release_stripe(sh); - sh = NULL; -+ raid5_flush_bios(conf, bios, disks); - raid5_unplug_device(mddev->queue); - schedule(); - goto retry; -@@ -2784,17 +2816,19 @@ retry: - */ - if (r_sector >= mddev->suspend_lo && - r_sector < mddev->suspend_hi) { -- handle_stripe(sh, NULL); -+ handle_stripe(sh, NULL, NULL); - release_stripe(sh); - sh = NULL; -+ raid5_flush_bios(conf, bios, disks); - schedule(); - goto retry; - } - - if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { -- handle_stripe(sh, NULL); -+ handle_stripe(sh, NULL, NULL); - release_stripe(sh); - sh = NULL; -+ raid5_flush_bios(conf, bios, disks); - raid5_unplug_device(mddev->queue); - schedule(); - goto retry; -@@ -2810,7 +2844,7 @@ retry: - r_sector += sectors_per_chunk; - } - if (sh) { -- handle_stripe(sh, NULL); -+ handle_stripe(sh, NULL, bios); - release_stripe(sh); - sh = NULL; - } -@@ -2820,6 +2854,9 @@ retry: - if (sectors > 0) - goto repeat; - -+ /* flush all of the bios */ -+ raid5_flush_bios(conf, bios, disks); -+ - spin_lock_irq(&conf->device_lock); - remaining = --bi->bi_phys_segments; - spin_unlock_irq(&conf->device_lock); -@@ -3035,7 +3072,7 @@ static inline sector_t sync_request(mdde - clear_bit(STRIPE_INSYNC, &sh->state); - spin_unlock(&sh->lock); - -- handle_stripe(sh, NULL); -+ handle_stripe(sh, NULL, NULL); - release_stripe(sh); - - return STRIPE_SECTORS; -@@ -3091,7 +3128,7 @@ static void raid5d (mddev_t *mddev) - - handled++; - atomic_inc(&conf->handled_in_raid5d); -- handle_stripe(sh, conf->spare_page); -+ handle_stripe(sh, conf->spare_page, NULL); - release_stripe(sh); - - cond_resched(); diff --git a/lustre/kernel_patches/patches/raid5-mmp-unplug-dev.patch b/lustre/kernel_patches/patches/raid5-mmp-unplug-dev.patch deleted file mode 100644 index cc25153..0000000 --- a/lustre/kernel_patches/patches/raid5-mmp-unplug-dev.patch +++ /dev/null @@ -1,22 +0,0 @@ -Index: linux-2.6.18-128.1.6/drivers/md/raid5.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/drivers/md/raid5.c 2009-06-02 23:24:55.000000000 -0600 -+++ linux-2.6.18-128.1.6/drivers/md/raid5.c 2009-06-02 23:27:21.000000000 -0600 -@@ -1456,6 +1456,8 @@ - bi->bi_next = *bip; - *bip = bi; - bi->bi_phys_segments ++; -+ if (bio_sync(bi) && !forwrite) -+ clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); /* force to read from disk. */ - spin_unlock_irq(&conf->device_lock); - spin_unlock(&sh->lock); - -@@ -3012,6 +3014,8 @@ - bi->bi_size = 0; - bi->bi_end_io(bi, bytes, 0); - } -+ if (bio_sync(bi)) -+ raid5_unplug_device(q); - return 0; - } - diff --git a/lustre/kernel_patches/patches/raid5-rebuild-corrupt-bug.patch b/lustre/kernel_patches/patches/raid5-rebuild-corrupt-bug.patch deleted file mode 100644 index c434498..0000000 --- a/lustre/kernel_patches/patches/raid5-rebuild-corrupt-bug.patch +++ /dev/null @@ -1,26 +0,0 @@ -While the stripe in-memory must be in-sync, the stripe on disk might not be -because if we computed a block rather than reading it from an in-sync disk, -the in-memory stripe can be different from the on-disk stripe. - -If this bug were still in mainline I would probably want a bigger patch which -would leave this code but also set R5_LOCKED on all blocks that have been -computed. But as it is a stablisation patch, the above is simple and more -clearly correct. - -Thanks for you patience - I look forward to your success/failure report. - -NeilBrown - -diff -up /drivers/md/raid5.c -=========================================== ---- a/drivers/md/raid5.c -+++ b/drivers/md/raid5.c -@@ -2466,8 +2466,6 @@ - locked++; - set_bit(R5_Wantwrite, &sh->dev[i].flags); - } -- /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ -- set_bit(STRIPE_INSYNC, &sh->state); - - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - atomic_dec(&conf->preread_active_stripes); diff --git a/lustre/kernel_patches/patches/raid5-stats-rhel5.patch b/lustre/kernel_patches/patches/raid5-stats-rhel5.patch deleted file mode 100644 index b119334..0000000 --- a/lustre/kernel_patches/patches/raid5-stats-rhel5.patch +++ /dev/null @@ -1,256 +0,0 @@ -diff -pru linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c ---- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-06 17:15:22.000000000 +0800 -+++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-06 17:17:30.000000000 +0800 -@@ -115,10 +115,12 @@ static void __release_stripe(raid5_conf_ - if (test_bit(STRIPE_DELAYED, &sh->state)) { - list_add_tail(&sh->lru, &conf->delayed_list); - blk_plug_device(conf->mddev->queue); -+ atomic_inc(&conf->delayed); - } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && - sh->bm_seq - conf->seq_write > 0) { - list_add_tail(&sh->lru, &conf->bitmap_list); - blk_plug_device(conf->mddev->queue); -+ atomic_inc(&conf->bit_delayed); - } else { - clear_bit(STRIPE_BIT_DELAY, &sh->state); - list_add_tail(&sh->lru, &conf->handle_list); -@@ -289,6 +291,7 @@ static struct stripe_head *get_active_st - if (noblock && sh == NULL) - break; - if (!sh) { -+ atomic_inc(&conf->out_of_stripes); - conf->inactive_blocked = 1; - wait_event_lock_irq(conf->wait_for_stripe, - !list_empty(&conf->inactive_list) && -@@ -311,6 +314,10 @@ static struct stripe_head *get_active_st - !test_bit(STRIPE_EXPANDING, &sh->state)) - BUG(); - list_del_init(&sh->lru); -+ if (test_bit(STRIPE_DELAYED, &sh->state)) -+ atomic_dec(&conf->delayed); -+ if (test_bit(STRIPE_BIT_DELAY, &sh->state)) -+ atomic_dec(&conf->bit_delayed); - } - } - } while (sh == NULL); -@@ -529,6 +536,8 @@ static int raid5_end_read_request(struct - if (bi->bi_size) - return 1; - -+ atomic_dec(&conf->out_reqs_in_queue); -+ - for (i=0 ; idev[i].req) - break; -@@ -642,6 +651,8 @@ static int raid5_end_write_request (stru - if (bi->bi_size) - return 1; - -+ atomic_dec(&conf->out_reqs_in_queue); -+ - for (i=0 ; idev[i].req) - break; -@@ -1402,6 +1413,8 @@ static void handle_stripe5(struct stripe - clear_bit(STRIPE_HANDLE, &sh->state); - clear_bit(STRIPE_DELAYED, &sh->state); - -+ atomic_inc(&conf->handle_called); -+ - syncing = test_bit(STRIPE_SYNCING, &sh->state); - expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); - expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); -@@ -1684,6 +1697,7 @@ static void handle_stripe5(struct stripe - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - locked++; -+ atomic_inc(&conf->reads_for_rmw); - } else { - set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); -@@ -1703,6 +1717,7 @@ static void handle_stripe5(struct stripe - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - locked++; -+ atomic_inc(&conf->reads_for_rcw); - } else { - set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); -@@ -1870,6 +1885,7 @@ static void handle_stripe5(struct stripe - bi->bi_end_io(bi, bytes, - test_bit(BIO_UPTODATE, &bi->bi_flags) - ? 0 : -EIO); -+ atomic_dec(&conf->in_reqs_in_queue); - } - for (i=disks; i-- ;) { - int rw; -@@ -1885,10 +1901,13 @@ static void handle_stripe5(struct stripe - bi = &sh->dev[i].req; - - bi->bi_rw = rw; -- if (rw) -+ if (rw) { -+ atomic_inc(&conf->writes_out); - bi->bi_end_io = raid5_end_write_request; -- else -+ } else { -+ atomic_inc(&conf->reads_out); - bi->bi_end_io = raid5_end_read_request; -+ } - - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[i].rdev); -@@ -1919,6 +1938,7 @@ static void handle_stripe5(struct stripe - if (rw == WRITE && - test_bit(R5_ReWrite, &sh->dev[i].flags)) - atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); -+ atomic_inc(&conf->out_reqs_in_queue); - generic_make_request(bi); - } else { - if (rw == 1) -@@ -1955,6 +1975,8 @@ static void handle_stripe6(struct stripe - clear_bit(STRIPE_HANDLE, &sh->state); - clear_bit(STRIPE_DELAYED, &sh->state); - -+ atomic_inc(&conf->handle_called); -+ - syncing = test_bit(STRIPE_SYNCING, &sh->state); - /* Now to look around and see what can be done */ - -@@ -2255,6 +2277,7 @@ static void handle_stripe6(struct stripe - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - locked++; -+ atomic_inc(&conf->reads_for_rcw); - } else { - PRINTK("Request delayed stripe %llu block %d for Reconstruct\n", - (unsigned long long)sh->sector, i); -@@ -2423,6 +2446,7 @@ static void handle_stripe6(struct stripe - bi->bi_end_io(bi, bytes, - test_bit(BIO_UPTODATE, &bi->bi_flags) - ? 0 : -EIO); -+ atomic_dec(&conf->in_reqs_in_queue); - } - for (i=disks; i-- ;) { - int rw; -@@ -2438,10 +2462,13 @@ static void handle_stripe6(struct stripe - bi = &sh->dev[i].req; - - bi->bi_rw = rw; -- if (rw) -+ if (rw) { -+ atomic_inc(&conf->writes_out); - bi->bi_end_io = raid5_end_write_request; -- else -+ } else { -+ atomic_inc(&conf->reads_out); - bi->bi_end_io = raid5_end_read_request; -+ } - - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[i].rdev); -@@ -2473,6 +2500,7 @@ static void handle_stripe6(struct stripe - test_bit(R5_ReWrite, &sh->dev[i].flags)) - atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); - generic_make_request(bi); -+ atomic_inc(&conf->out_reqs_in_queue); - } else { - if (rw == 1) - set_bit(STRIPE_DEGRADED, &sh->state); -@@ -2506,6 +2534,7 @@ static void raid5_activate_delayed(raid5 - if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - atomic_inc(&conf->preread_active_stripes); - list_add_tail(&sh->lru, &conf->handle_list); -+ atomic_dec(&conf->delayed); - } - } - } -@@ -2608,6 +2637,8 @@ static int make_request(request_queue_t - const int rw = bio_data_dir(bi); - int remaining; - -+ atomic_inc(&conf->in_reqs_in_queue); -+ - if (unlikely(bio_barrier(bi))) { - bio_endio(bi, bi->bi_size, -EOPNOTSUPP); - return 0; -@@ -2617,6 +2648,11 @@ static int make_request(request_queue_t - - disk_stat_inc(mddev->gendisk, ios[rw]); - disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi)); -+ if (rw == WRITE) -+ atomic_inc(&conf->writes_in); -+ else -+ atomic_inc(&conf->reads_in); -+ - - logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); - last_sector = bi->bi_sector + (bi->bi_size>>9); -@@ -2724,6 +2760,7 @@ static int make_request(request_queue_t - - if ( rw == WRITE ) - md_write_end(mddev); -+ atomic_dec(&conf->in_reqs_in_queue); - bi->bi_size = 0; - bi->bi_end_io(bi, bytes, 0); - } -@@ -2985,6 +3022,7 @@ static void raid5d (mddev_t *mddev) - spin_unlock_irq(&conf->device_lock); - - handled++; -+ atomic_inc(&conf->handled_in_raid5d); - handle_stripe(sh, conf->spare_page); - release_stripe(sh); - -@@ -3381,6 +3419,21 @@ static void status (struct seq_file *seq - conf->disks[i].rdev && - test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); - seq_printf (seq, "]"); -+ seq_printf (seq, "\n\t\tin: %u reads, %u writes; out: %u reads, %u writes", -+ atomic_read(&conf->reads_in), atomic_read(&conf->writes_in), -+ atomic_read(&conf->reads_out), atomic_read(&conf->writes_out)); -+ seq_printf (seq, "\n\t\t%u in raid5d, %u out of stripes, %u handle called", -+ atomic_read(&conf->handled_in_raid5d), -+ atomic_read(&conf->out_of_stripes), -+ atomic_read(&conf->handle_called)); -+ seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw", -+ atomic_read(&conf->reads_for_rmw), -+ atomic_read(&conf->reads_for_rcw)); -+ seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n", -+ atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed), -+ atomic_read(&conf->active_stripes), -+ atomic_read(&conf->in_reqs_in_queue), -+ atomic_read(&conf->out_reqs_in_queue)); - #if RAID5_DEBUG - seq_printf (seq, "\n"); - printall(seq, conf); -diff -pru linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h ---- linux-2.6.18-53.orig/include/linux/raid/raid5.h 2007-12-06 17:15:22.000000000 +0800 -+++ linux-2.6.18-53/include/linux/raid/raid5.h 2007-12-06 17:15:32.000000000 +0800 -@@ -259,6 +259,25 @@ struct raid5_private_data { - int pool_size; /* number of disks in stripeheads in pool */ - spinlock_t device_lock; - struct disk_info *disks; -+ -+ /* -+ * Stats -+ */ -+ atomic_t reads_in; -+ atomic_t writes_in; -+ atomic_t reads_out; -+ atomic_t writes_out; -+ atomic_t handled_in_raid5d; -+ atomic_t out_of_stripes; -+ atomic_t reads_for_rmw; -+ atomic_t reads_for_rcw; -+ atomic_t writes_zcopy; -+ atomic_t writes_copied; -+ atomic_t handle_called; -+ atomic_t delayed; -+ atomic_t bit_delayed; -+ atomic_t in_reqs_in_queue; -+ atomic_t out_reqs_in_queue; - }; - - typedef struct raid5_private_data raid5_conf_t; -Only in linux-2.6.18-53.orig/include/linux/raid: .raid5.h.swp diff --git a/lustre/kernel_patches/patches/raid5-stripe-by-stripe-handling-rhel5.patch b/lustre/kernel_patches/patches/raid5-stripe-by-stripe-handling-rhel5.patch deleted file mode 100644 index 4b72d95..0000000 --- a/lustre/kernel_patches/patches/raid5-stripe-by-stripe-handling-rhel5.patch +++ /dev/null @@ -1,284 +0,0 @@ -diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c ---- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-28 14:55:08.000000000 +0800 -+++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-28 18:52:08.000000000 +0800 -@@ -2626,6 +2626,35 @@ static int raid5_issue_flush(request_que - return ret; - } - -+static inline int raid5_expanding_overlap(raid5_conf_t *conf, struct bio *bi) -+{ -+ sector_t first_sector, last_sector; -+ -+ if (likely(conf->expand_progress == MaxSector)) -+ return 0; -+ -+ first_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); -+ last_sector = bi->bi_sector + (bi->bi_size>>9); -+ -+ return (first_sector < conf->expand_progress && -+ last_sector >= conf->expand_lo); -+} -+ -+static inline int raid5_redo_bio(raid5_conf_t *conf, struct bio *bi, int disks, sector_t sector) -+{ -+ int redo = 0; -+ -+ if (likely(conf->expand_progress == MaxSector)) -+ return 0; -+ -+ spin_lock_irq(&conf->device_lock); -+ redo = (raid5_expanding_overlap(conf, bi) || -+ (unlikely(sector < conf->expand_progress) && -+ disks == conf->previous_raid_disks)); -+ spin_unlock_irq(&conf->device_lock); -+ return redo; -+} -+ - static int make_request(request_queue_t *q, struct bio * bi) - { - mddev_t *mddev = q->queuedata; -@@ -2636,6 +2665,14 @@ static int make_request(request_queue_t - struct stripe_head *sh; - const int rw = bio_data_dir(bi); - int remaining; -+ sector_t stripe, sectors, block, r_sector, b_sector; -+ int sectors_per_chunk = conf->chunk_size >> 9; -+ int stripes_per_chunk, sectors_per_block; -+ int sectors_per_stripe; -+ int i, j; -+ -+ DEFINE_WAIT(w); -+ int disks, data_disks; - - atomic_inc(&conf->in_reqs_in_queue); - -@@ -2653,105 +2690,136 @@ static int make_request(request_queue_t - else - atomic_inc(&conf->reads_in); - -- - logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); - last_sector = bi->bi_sector + (bi->bi_size>>9); - bi->bi_next = NULL; - bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ - -- for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { -- DEFINE_WAIT(w); -- int disks, data_disks; -- -- retry: -- prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); -- if (likely(conf->expand_progress == MaxSector)) -- disks = conf->raid_disks; -- else { -- /* spinlock is needed as expand_progress may be -- * 64bit on a 32bit platform, and so it might be -- * possible to see a half-updated value -- * Ofcourse expand_progress could change after -- * the lock is dropped, so once we get a reference -- * to the stripe that we think it is, we will have -- * to check again. -- */ -- spin_lock_irq(&conf->device_lock); -- disks = conf->raid_disks; -- if (logical_sector >= conf->expand_progress) -- disks = conf->previous_raid_disks; -- else { -- if (logical_sector >= conf->expand_lo) { -- spin_unlock_irq(&conf->device_lock); -- schedule(); -- goto retry; -- } -- } -- spin_unlock_irq(&conf->device_lock); -- } -- data_disks = disks - conf->max_degraded; -+ sectors = bi->bi_size >> 9; -+ stripes_per_chunk = conf->chunk_size / STRIPE_SIZE; - -- new_sector = raid5_compute_sector(logical_sector, disks, data_disks, -- &dd_idx, &pd_idx, conf); -- PRINTK("raid5: make_request, sector %llu logical %llu\n", -- (unsigned long long)new_sector, -- (unsigned long long)logical_sector); -+redo_bio: -+ /* stripe by stripe handle needs a stable raid layout, so if this -+ * reuqest covers the expanding region, wait it over. -+ * Furthermore, we may get here with partial request handled, so -+ * wait for the bi_phys_segment to be 1 also. -jay */ -+ spin_lock_irq(&conf->device_lock); -+ wait_event_lock_irq(conf->wait_for_overlap, -+ (bi->bi_phys_segments == 1) && -+ !raid5_expanding_overlap(conf, bi), -+ conf->device_lock, -+ (unplug_slaves(conf->mddev), atomic_inc(&conf->expanding_overlap))); -+ -+ disks = conf->raid_disks; -+ if (unlikely(logical_sector >= conf->expand_progress)) -+ disks = conf->previous_raid_disks; -+ data_disks = disks - conf->max_degraded; -+ spin_unlock_irq(&conf->device_lock); - -- sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK)); -- if (sh) { -- if (unlikely(conf->expand_progress != MaxSector)) { -- /* expansion might have moved on while waiting for a -- * stripe, so we must do the range check again. -- * Expansion could still move past after this -- * test, but as we are holding a reference to -- * 'sh', we know that if that happens, -- * STRIPE_EXPANDING will get set and the expansion -- * won't proceed until we finish with the stripe. -- */ -- int must_retry = 0; -- spin_lock_irq(&conf->device_lock); -- if (logical_sector < conf->expand_progress && -- disks == conf->previous_raid_disks) -- /* mismatch, need to try again */ -- must_retry = 1; -- spin_unlock_irq(&conf->device_lock); -- if (must_retry) { -- release_stripe(sh); -- goto retry; -+ /* compute the block # */ -+ sectors_per_stripe = STRIPE_SECTORS * data_disks; -+ sectors_per_block = stripes_per_chunk * sectors_per_stripe; -+ -+ block = logical_sector & ~((sector_t)sectors_per_block - 1); -+ sector_div(block, sectors_per_block); -+ -+repeat: -+ stripe = block * (sectors_per_block / data_disks); -+ b_sector = stripe * data_disks; -+ /* iterate through all stripes in this block, -+ * where block is a set of internal stripes -+ * which covers chunk */ -+ -+ for (i = 0; i < stripes_per_chunk && sectors > 0; i++) { -+ r_sector = b_sector + (i * STRIPE_SECTORS); -+ sh = NULL; -+ /* iterrate through all pages in the stripe */ -+ for (j = 0; j < data_disks && sectors > 0; j++) { -+ DEFINE_WAIT(w); -+ -+ if (r_sector + STRIPE_SECTORS <= bi->bi_sector || -+ r_sector >= last_sector) { -+ r_sector += sectors_per_chunk; -+ continue; -+ } -+ -+retry: -+ prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); -+ new_sector = raid5_compute_sector(r_sector, disks, -+ data_disks, &dd_idx, -+ &pd_idx, conf); -+ if (sh == NULL) { -+ sh = get_active_stripe(conf, new_sector, disks, pd_idx, -+ (bi->bi_rw&RWA_MASK)); -+ if (sh) { -+ /* we're handling the bio stripe by stripe, so when we found -+ * the raid layout has been changed, we have to redo the -+ * whole bio because we don't which sectors in it has been -+ * done, and which is not done. -jay */ -+ if (raid5_redo_bio(conf, bi, disks, logical_sector)) -+ goto redo_bio; -+ -+ if (test_bit(STRIPE_EXPANDING, &sh->state)) { -+ /* Stripe is busy expanding or -+ * add failed due to overlap. Flush everything -+ * and wait a while -+ */ -+ release_stripe(sh); -+ sh = NULL; -+ raid5_unplug_device(mddev->queue); -+ schedule(); -+ goto retry; -+ } -+ } else { -+ /* cannot get stripe for read-ahead, just give-up */ -+ finish_wait(&conf->wait_for_overlap, &w); -+ clear_bit(BIO_UPTODATE, &bi->bi_flags); -+ sectors = 0; -+ break; - } - } -+ - /* FIXME what if we get a false positive because these - * are being updated. - */ -- if (logical_sector >= mddev->suspend_lo && -- logical_sector < mddev->suspend_hi) { -+ if (r_sector >= mddev->suspend_lo && -+ r_sector < mddev->suspend_hi) { -+ handle_stripe(sh, NULL); - release_stripe(sh); -+ sh = NULL; - schedule(); - goto retry; - } - -- if (test_bit(STRIPE_EXPANDING, &sh->state) || -- !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { -- /* Stripe is busy expanding or -- * add failed due to overlap. Flush everything -- * and wait a while -- */ -- raid5_unplug_device(mddev->queue); -+ if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { -+ handle_stripe(sh, NULL); - release_stripe(sh); -+ sh = NULL; -+ raid5_unplug_device(mddev->queue); - schedule(); - goto retry; - } - finish_wait(&conf->wait_for_overlap, &w); -+ -+ BUG_ON (new_sector != stripe); -+ sectors -= STRIPE_SECTORS; -+ if (bi->bi_sector > r_sector) -+ sectors += bi->bi_sector - r_sector; -+ if (r_sector + STRIPE_SECTORS > last_sector) -+ sectors += r_sector + STRIPE_SECTORS - last_sector; -+ r_sector += sectors_per_chunk; -+ } -+ if (sh) { - handle_stripe(sh, NULL); - release_stripe(sh); -- } else { -- /* cannot get stripe for read-ahead, just give-up */ -- clear_bit(BIO_UPTODATE, &bi->bi_flags); -- finish_wait(&conf->wait_for_overlap, &w); -- break; -+ sh = NULL; - } -- -+ stripe += STRIPE_SECTORS; - } -+ block++; -+ if (sectors > 0) -+ goto repeat; -+ - spin_lock_irq(&conf->device_lock); - remaining = --bi->bi_phys_segments; - spin_unlock_irq(&conf->device_lock); -@@ -3439,6 +3507,8 @@ static void status (struct seq_file *seq - atomic_read(&conf->active_stripes), - atomic_read(&conf->in_reqs_in_queue), - atomic_read(&conf->out_reqs_in_queue)); -+ seq_printf (seq, "\t\t%u expanding overlap\n", -+ atomic_read(&conf->expanding_overlap)); - #if RAID5_DEBUG - seq_printf (seq, "\n"); - printall(seq, conf); -diff -pur linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h ---- linux-2.6.18-53.orig/include/linux/raid/raid5.h 2007-12-28 14:55:08.000000000 +0800 -+++ linux-2.6.18-53/include/linux/raid/raid5.h 2007-12-28 18:09:37.000000000 +0800 -@@ -278,6 +278,7 @@ struct raid5_private_data { - atomic_t bit_delayed; - atomic_t in_reqs_in_queue; - atomic_t out_reqs_in_queue; -+ atomic_t expanding_overlap; - }; - - typedef struct raid5_private_data raid5_conf_t; diff --git a/lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch b/lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch deleted file mode 100644 index 06db94d..0000000 --- a/lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch +++ /dev/null @@ -1,489 +0,0 @@ -Index: linux-2.6.18-128.1.6/drivers/md/raid5.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/drivers/md/raid5.c 2009-06-02 23:24:52.000000000 -0600 -+++ linux-2.6.18-128.1.6/drivers/md/raid5.c 2009-06-02 23:24:55.000000000 -0600 -@@ -633,6 +633,9 @@ - clear_buffer_uptodate(bh); - } - #endif -+ /* Read on a Directing write is allowable */ -+ /* BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)) */ -+ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page != sh->dev[i].page); - clear_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); -@@ -669,6 +672,10 @@ - - rdev_dec_pending(conf->disks[i].rdev, conf->mddev); - -+ if (test_bit(R5_Direct, &sh->dev[i].flags)) { -+ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page); -+ sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page; -+ } - clear_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); -@@ -910,7 +917,27 @@ - return r_sector; - } - -+static struct page *zero_copy_data(struct bio *bio, sector_t sector) -+{ -+ sector_t bi_sector = bio->bi_sector; -+ struct page *page = NULL; -+ struct bio_vec *bvl; -+ int i; - -+ bio_for_each_segment(bvl, bio, i) { -+ if (sector == bi_sector) -+ page = bio_iovec_idx(bio, i)->bv_page; -+ bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9; -+ if (bi_sector >= sector + STRIPE_SECTORS) { -+ /* check if the stripe is covered by one page */ -+ if (page == bio_iovec_idx(bio, i)->bv_page && -+ PageConstant(page)) -+ return page; -+ return NULL; -+ } -+ } -+ return NULL; -+} - - /* - * Copy data between a page in the stripe cache, and one or more bion -@@ -1002,8 +1029,9 @@ - { - raid5_conf_t *conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, disks = sh->disks, count; -- void *ptr[MAX_XOR_BLOCKS]; -+ void *ptr[MAX_XOR_BLOCKS], *h_ptr[2]; - struct bio *chosen; -+ struct page *page; - - PRINTK("compute_parity5, stripe %llu, method %d\n", - (unsigned long long)sh->sector, method); -@@ -1053,34 +1081,92 @@ - count = 1; - } - -- for (i = disks; i--;) -- if (sh->dev[i].written) { -- sector_t sector = sh->dev[i].sector; -- struct bio *wbi = sh->dev[i].written; -- while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { -- copy_data(1, wbi, sh->dev[i].page, sector); -- wbi = r5_next_bio(wbi, sector); -+ for (i = disks; i--;) { -+ struct r5dev *dev = &sh->dev[i]; -+ struct bio *wbi = dev->written; -+ sector_t sector; -+ -+ if (!wbi) -+ continue; -+ -+ sector = dev->sector; -+ set_bit(R5_LOCKED, &sh->dev[i].flags); -+ BUG_ON(test_bit(R5_Direct, &dev->flags)); -+ -+ /* check if it's covered by a single page -+ and whole stripe is written at once. -+ * in this case we can avoid memcpy() */ -+ if (!wbi->bi_next && test_bit(R5_OVERWRITE, &dev->flags) && -+ test_bit(R5_Insync, &dev->flags)) { -+ page = zero_copy_data(wbi, sector); -+ if (page) { -+ atomic_inc(&conf->writes_zcopy); -+ /* The pointer must be restored whenever the LOCKED -+ * gets cleared. */ -+ dev->req.bi_io_vec[0].bv_page = page; -+ set_bit(R5_Direct, &dev->flags); -+ clear_bit(R5_UPTODATE, &sh->dev[i].flags); -+ clear_bit(R5_OVERWRITE, &sh->dev[i].flags); -+ continue; - } -+ } - -- set_bit(R5_LOCKED, &sh->dev[i].flags); -- set_bit(R5_UPTODATE, &sh->dev[i].flags); -+ /* do copy write */ -+ atomic_inc(&conf->writes_copied); -+ clear_bit(R5_OVERWRITE, &sh->dev[i].flags); -+ set_bit(R5_UPTODATE, &sh->dev[i].flags); -+ while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { -+ copy_data(1, wbi, sh->dev[i].page, sector); -+ wbi = r5_next_bio(wbi, sector); - } -+ } - -+ h_ptr[0] = ptr[0]; - switch(method) { - case RECONSTRUCT_WRITE: - case CHECK_PARITY: -- for (i=disks; i--;) -- if (i != pd_idx) { -- ptr[count++] = page_address(sh->dev[i].page); -- check_xor(); -+ for (i=disks; i--;) { -+ if (i == pd_idx) -+ continue; -+ if (test_bit(R5_Direct, &sh->dev[i].flags)) -+ page = sh->dev[i].req.bi_io_vec[0].bv_page; -+ else -+ page = sh->dev[i].page; -+ -+ /* have to compute the parity immediately for -+ * a highmem page. it would happen for zerocopy. -jay -+ */ -+ if (PageHighMem(page)) { -+ h_ptr[1] = kmap_atomic(page, KM_USER0); -+ xor_block(2, STRIPE_SIZE, h_ptr); -+ kunmap_atomic(page, KM_USER0); -+ } else { -+ ptr[count++] = page_address(page); - } -+ check_xor(); -+ } - break; - case READ_MODIFY_WRITE: -- for (i = disks; i--;) -- if (sh->dev[i].written) { -- ptr[count++] = page_address(sh->dev[i].page); -- check_xor(); -+ for (i = disks; i--;) { -+ if (!sh->dev[i].written) -+ continue; -+ if (test_bit(R5_Direct, &sh->dev[i].flags)) -+ page = sh->dev[i].req.bi_io_vec[0].bv_page; -+ else -+ page = sh->dev[i].page; -+ -+ /* have to compute the parity immediately for -+ * a highmem page. it would happen for zerocopy. -jay -+ */ -+ if (PageHighMem(page)) { -+ h_ptr[1] = kmap_atomic(page, KM_USER0); -+ xor_block(2, STRIPE_SIZE, h_ptr); -+ kunmap_atomic(page, KM_USER0); -+ } else { -+ ptr[count++] = page_address(page); - } -+ check_xor(); -+ } - } - if (count != 1) - xor_block(count, STRIPE_SIZE, ptr); -@@ -1097,6 +1183,7 @@ - raid6_conf_t *conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count; - struct bio *chosen; -+ struct page *page; - /**** FIX THIS: This could be very bad if disks is close to 256 ****/ - void *ptrs[disks]; - -@@ -1126,18 +1213,49 @@ - BUG(); /* Not implemented yet */ - } - -- for (i = disks; i--;) -- if (sh->dev[i].written) { -- sector_t sector = sh->dev[i].sector; -- struct bio *wbi = sh->dev[i].written; -- while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { -- copy_data(1, wbi, sh->dev[i].page, sector); -- wbi = r5_next_bio(wbi, sector); -+ for (i = disks; i--;) { -+ struct r5dev *dev = &sh->dev[i]; -+ struct bio *wbi = dev->written; -+ sector_t sector; -+ -+ if (!wbi) -+ continue; -+ -+ sector = sh->dev[i].sector; -+ set_bit(R5_LOCKED, &sh->dev[i].flags); -+ BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)); -+ -+ /* check if it's covered by a single page -+ * and whole stripe is written at once. -+ * in this case we can avoid memcpy() */ -+ if (!wbi->bi_next && test_bit(R5_Insync, &sh->dev[i].flags) && -+ test_bit(R5_OVERWRITE, &sh->dev[i].flags)) { -+ page = zero_copy_data(wbi, sector); -+ /* we don't do zerocopy on a HighMem page. Raid6 tend -+ * to prepare all of the pages' content to be accessed -+ * before computing PQ parity. If we need to support HighMem -+ * page also, we have to modify the gen_syndrome() -+ * algorithm. -jay */ -+ if (page && !PageHighMem(page)) { -+ atomic_inc(&conf->writes_zcopy); -+ /* The pointer must be restored whenever the LOCKED -+ * gets cleared. */ -+ sh->dev[i].req.bi_io_vec[0].bv_page = page; -+ set_bit(R5_Direct, &sh->dev[i].flags); -+ clear_bit(R5_UPTODATE, &sh->dev[i].flags); -+ clear_bit(R5_OVERWRITE, &sh->dev[i].flags); -+ continue; - } -+ } - -- set_bit(R5_LOCKED, &sh->dev[i].flags); -- set_bit(R5_UPTODATE, &sh->dev[i].flags); -+ atomic_inc(&conf->writes_copied); -+ clear_bit(R5_OVERWRITE, &sh->dev[i].flags); -+ set_bit(R5_UPTODATE, &sh->dev[i].flags); -+ while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { -+ copy_data(1, wbi, sh->dev[i].page, sector); -+ wbi = r5_next_bio(wbi, sector); - } -+ } - - // switch(method) { - // case RECONSTRUCT_WRITE: -@@ -1148,8 +1266,12 @@ - count = 0; - i = d0_idx; - do { -- ptrs[count++] = page_address(sh->dev[i].page); -- if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags)) -+ if (test_bit(R5_Direct, &sh->dev[i].flags)) -+ ptrs[count++] = page_address(sh->dev[i].req.bi_io_vec[0].bv_page); -+ else -+ ptrs[count++] = page_address(sh->dev[i].page); -+ if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags) && -+ !test_bit(R5_Direct, &sh->dev[i].flags)) - printk("block %d/%d not uptodate on parity calc\n", i,count); - i = raid6_next_disk(i, disks); - } while ( i != d0_idx ); -@@ -1596,7 +1718,8 @@ - if (sh->dev[i].written) { - dev = &sh->dev[i]; - if (!test_bit(R5_LOCKED, &dev->flags) && -- test_bit(R5_UPTODATE, &dev->flags) ) { -+ (test_bit(R5_UPTODATE, &dev->flags) || -+ test_bit(R5_Direct, &dev->flags)) ) { - /* We can return any write requests */ - struct bio *wbi, *wbi2; - int bitmap_end = 0; -@@ -1604,6 +1727,7 @@ - spin_lock_irq(&conf->device_lock); - wbi = dev->written; - dev->written = NULL; -+ clear_bit(R5_Direct, &dev->flags); - while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { - wbi2 = r5_next_bio(wbi, dev->sector); - if (--wbi->bi_phys_segments == 0) { -@@ -1967,6 +2091,15 @@ - set_bit(STRIPE_DEGRADED, &sh->state); - PRINTK("skip op %ld on disc %d for sector %llu\n", - bi->bi_rw, i, (unsigned long long)sh->sector); -+ -+ if (test_bit(R5_Direct, &sh->dev[i].flags)) { -+ /* restore the page pointer of req, otherwise, -+ * no any read is permitted on this stripe, this is -+ * not what we want. -jay */ -+ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page); -+ sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page; -+ } -+ - clear_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(STRIPE_HANDLE, &sh->state); - } -@@ -2172,7 +2305,8 @@ - if (sh->dev[i].written) { - dev = &sh->dev[i]; - if (!test_bit(R5_LOCKED, &dev->flags) && -- test_bit(R5_UPTODATE, &dev->flags) ) { -+ (test_bit(R5_UPTODATE, &dev->flags) || -+ test_bit(R5_Direct, &dev->flags)) ) { - /* We can return any write requests */ - int bitmap_end = 0; - struct bio *wbi, *wbi2; -@@ -2181,6 +2315,7 @@ - spin_lock_irq(&conf->device_lock); - wbi = dev->written; - dev->written = NULL; -+ clear_bit(R5_Direct, &dev->flags); - while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { - wbi2 = r5_next_bio(wbi, dev->sector); - if (--wbi->bi_phys_segments == 0) { -@@ -2532,6 +2667,15 @@ - set_bit(STRIPE_DEGRADED, &sh->state); - PRINTK("skip op %ld on disc %d for sector %llu\n", - bi->bi_rw, i, (unsigned long long)sh->sector); -+ -+ if (test_bit(R5_Direct, &sh->dev[i].flags)) { -+ /* restore the page pointer of req, otherwise, -+ * no any read is permitted on this stripe, this is -+ * not what we want. -jay */ -+ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page); -+ sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page; -+ } -+ - clear_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(STRIPE_HANDLE, &sh->state); - } -@@ -3451,6 +3595,9 @@ - mddev->queue->max_phys_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT; - mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;; - -+ /* raid5 device is able to do zcopy right now. */ -+ mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONSTANT_WRITE; -+ - return 0; - abort: - if (conf) { -@@ -3537,9 +3684,11 @@ - atomic_read(&conf->handled_in_raid5d), - atomic_read(&conf->out_of_stripes), - atomic_read(&conf->handle_called)); -- seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw", -+ seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u", - atomic_read(&conf->reads_for_rmw), -- atomic_read(&conf->reads_for_rcw)); -+ atomic_read(&conf->reads_for_rcw), -+ atomic_read(&conf->writes_zcopy), -+ atomic_read(&conf->writes_copied)); - seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n", - atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed), - atomic_read(&conf->active_stripes), -Index: linux-2.6.18-128.1.6/include/linux/backing-dev.h -=================================================================== ---- linux-2.6.18-128.1.6.orig/include/linux/backing-dev.h 2006-09-19 21:42:06.000000000 -0600 -+++ linux-2.6.18-128.1.6/include/linux/backing-dev.h 2009-06-02 23:24:55.000000000 -0600 -@@ -48,6 +48,7 @@ - #define BDI_CAP_READ_MAP 0x00000010 /* Can be mapped for reading */ - #define BDI_CAP_WRITE_MAP 0x00000020 /* Can be mapped for writing */ - #define BDI_CAP_EXEC_MAP 0x00000040 /* Can be mapped for execution */ -+#define BDI_CAP_PAGE_CONSTANT_WRITE 0x00000080 /* Zcopy write - for raid5 */ - #define BDI_CAP_VMFLAGS \ - (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) - -@@ -94,11 +95,18 @@ - #define bdi_cap_account_dirty(bdi) \ - (!((bdi)->capabilities & BDI_CAP_NO_ACCT_DIRTY)) - -+#define bdi_cap_page_constant_write(bdi) \ -+ ((bdi)->capabilities & BDI_CAP_PAGE_CONSTANT_WRITE) -+ - #define mapping_cap_writeback_dirty(mapping) \ - bdi_cap_writeback_dirty((mapping)->backing_dev_info) - - #define mapping_cap_account_dirty(mapping) \ - bdi_cap_account_dirty((mapping)->backing_dev_info) - -+#define mapping_cap_page_constant_write(mapping) \ -+ bdi_cap_page_constant_write((mapping)->backing_dev_info) -+ -+ - - #endif /* _LINUX_BACKING_DEV_H */ -Index: linux-2.6.18-128.1.6/include/linux/page-flags.h -=================================================================== ---- linux-2.6.18-128.1.6.orig/include/linux/page-flags.h 2009-04-14 21:05:24.000000000 -0600 -+++ linux-2.6.18-128.1.6/include/linux/page-flags.h 2009-06-02 23:24:55.000000000 -0600 -@@ -86,6 +86,7 @@ - #define PG_reclaim 17 /* To be reclaimed asap */ - #define PG_nosave_free 18 /* Free, should not be written */ - #define PG_buddy 19 /* Page is free, on buddy lists */ -+#define PG_constant 21 /* To mark if the page is constant */ - #define PG_xpmem 27 /* Testing for xpmem. */ - - /* PG_owner_priv_1 users should have descriptive aliases */ -@@ -283,6 +284,14 @@ - - struct page; /* forward declaration */ - -+#define PageConstant(page) test_bit(PG_constant, &(page)->flags) -+#define SetPageConstant(page) set_bit(PG_constant, &(page)->flags) -+#define ClearPageConstant(page) clear_bit(PG_constant, &(page->flags)) -+#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags) -+ -+extern int set_page_constant(struct page *page); -+extern void clear_page_constant(struct page *); -+ - int test_clear_page_dirty(struct page *page); - int test_clear_page_writeback(struct page *page); - int test_set_page_writeback(struct page *page); -Index: linux-2.6.18-128.1.6/include/linux/raid/raid5.h -=================================================================== ---- linux-2.6.18-128.1.6.orig/include/linux/raid/raid5.h 2009-06-02 23:24:50.000000000 -0600 -+++ linux-2.6.18-128.1.6/include/linux/raid/raid5.h 2009-06-02 23:24:55.000000000 -0600 -@@ -156,8 +156,9 @@ - #define R5_Overlap 7 /* There is a pending overlapping request on this block */ - #define R5_ReadError 8 /* seen a read error here recently */ - #define R5_ReWrite 9 /* have tried to over-write the readerror */ -- - #define R5_Expanded 10 /* This block now has post-expand data */ -+#define R5_Direct 11 /* Use the pages in bio to do the write directly. */ -+ - /* - * Write method - */ -Index: linux-2.6.18-128.1.6/mm/filemap.c -=================================================================== ---- linux-2.6.18-128.1.6.orig/mm/filemap.c 2009-04-14 21:05:46.000000000 -0600 -+++ linux-2.6.18-128.1.6/mm/filemap.c 2009-06-02 23:24:55.000000000 -0600 -@@ -30,6 +30,7 @@ - #include - #include - #include -+#include - #include /* for BUG_ON(!in_atomic()) only */ - #include - #include "internal.h" -@@ -567,11 +568,55 @@ - if (!test_clear_page_writeback(page)) - BUG(); - } -+ clear_page_constant(page); - smp_mb__after_clear_bit(); - wake_up_page(page, PG_writeback); - } - EXPORT_SYMBOL(end_page_writeback); - -+/* Make a page to be constant, `constant' means any write to this page will -+ * be blocked until clear_page_constant is called. -+ * The page lock must be held. -+ */ -+int set_page_constant(struct page *page) -+{ -+ BUG_ON(!PageLocked(page)); -+ -+ /* If it's an anonymous page and haven't been added to swap cache, -+ * return directly because we have no way to swap this page. -+ */ -+ if (page_mapping(page) == NULL) -+ return SWAP_FAIL; -+ -+ BUG_ON(!PageUptodate(page)); -+ -+ /* I have to clear page uptodate before trying to remove -+ * it from user's page table because otherwise, the page may be -+ * reinstalled by a page access which happens between try_to_unmap() -+ * and ClearPageUptodate(). -jay -+ */ -+ ClearPageUptodate(page); -+ if (page_mapped(page) && try_to_unmap(page, 0) != SWAP_SUCCESS) { -+ SetPageUptodate(page); -+ return SWAP_FAIL; -+ } -+ SetPageConstant(page); -+ return SWAP_SUCCESS; -+} -+ -+void clear_page_constant(struct page *page) -+{ -+ if (PageConstant(page)) { -+ BUG_ON(!PageLocked(page)); -+ BUG_ON(PageUptodate(page)); -+ ClearPageConstant(page); -+ SetPageUptodate(page); -+ unlock_page(page); -+ } -+} -+EXPORT_SYMBOL(set_page_constant); -+EXPORT_SYMBOL(clear_page_constant); -+ - /** - * __lock_page - get a lock on the page, assuming we need to sleep to get it - * @page: the page to lock diff --git a/lustre/kernel_patches/patches/sd_iostats-2.6-rhel5.patch b/lustre/kernel_patches/patches/sd_iostats-2.6-rhel5.patch deleted file mode 100644 index 2297f8c..0000000 --- a/lustre/kernel_patches/patches/sd_iostats-2.6-rhel5.patch +++ /dev/null @@ -1,581 +0,0 @@ -Index: linux-2.6.16.60-0.37/drivers/scsi/Kconfig -=================================================================== ---- linux-2.6.16.60-0.37.orig/drivers/scsi/Kconfig 2009-03-24 05:46:32.000000000 -0700 -+++ linux-2.6.16.60-0.37/drivers/scsi/Kconfig 2009-06-02 23:33:14.000000000 -0600 -@@ -78,6 +78,14 @@ - To compile this driver as a module, choose M here and read - . The module will be called st. - -+config SD_IOSTATS -+ bool "Enable SCSI disk I/O stats" -+ depends on BLK_DEV_SD -+ default y -+ ---help--- -+ This enables SCSI disk I/O stats collection. You must also enable -+ /proc file system support if you want this feature. -+ - config CHR_DEV_OSST - tristate "SCSI OnStream SC-x0 tape support" - depends on SCSI -Index: linux-2.6.16.60-0.37/drivers/scsi/scsi_proc.c -=================================================================== ---- linux-2.6.16.60-0.37.orig/drivers/scsi/scsi_proc.c 2009-03-24 05:46:25.000000000 -0700 -+++ linux-2.6.16.60-0.37/drivers/scsi/scsi_proc.c 2009-06-02 23:33:14.000000000 -0600 -@@ -40,7 +40,8 @@ - /* 4K page size, but our output routines, use some slack for overruns */ - #define PROC_BLOCK_SIZE (3*1024) - --static struct proc_dir_entry *proc_scsi; -+struct proc_dir_entry *proc_scsi; -+EXPORT_SYMBOL(proc_scsi); - - /* Protect sht->present and sht->proc_dir */ - static DEFINE_MUTEX(global_host_template_mutex); -Index: linux-2.6.16.60-0.37/drivers/scsi/sd.c -=================================================================== ---- linux-2.6.16.60-0.37.orig/drivers/scsi/sd.c 2009-03-24 05:46:25.000000000 -0700 -+++ linux-2.6.16.60-0.37/drivers/scsi/sd.c 2009-06-02 23:33:14.000000000 -0600 -@@ -63,6 +63,63 @@ - - #include "scsi_logging.h" - -+#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS)) -+# include -+# include -+ -+typedef struct { -+ unsigned long long iostat_size; -+ unsigned long long iostat_count; -+} iostat_counter_t; -+ -+#define IOSTAT_NCOUNTERS 16 -+typedef struct { -+ iostat_counter_t iostat_read_histogram[IOSTAT_NCOUNTERS]; -+ iostat_counter_t iostat_write_histogram[IOSTAT_NCOUNTERS]; -+ struct timeval iostat_timeval; -+ -+ /* queue depth: how well the pipe is filled up */ -+ unsigned long long iostat_queue_ticks[IOSTAT_NCOUNTERS]; -+ unsigned long long iostat_queue_ticks_sum; -+ unsigned long iostat_queue_depth; -+ unsigned long iostat_queue_stamp; -+ -+ /* seeks: how linear the traffic is */ -+ unsigned long long iostat_next_sector; -+ unsigned long long iostat_seek_sectors; -+ unsigned long long iostat_seeks; -+ unsigned long long iostat_sectors; -+ unsigned long long iostat_reqs; -+ unsigned long iostat_read_reqs; -+ unsigned long iostat_write_reqs; -+ -+ /* process time: how long it takes to process requests */ -+ unsigned long iostat_rtime[IOSTAT_NCOUNTERS]; -+ unsigned long iostat_wtime[IOSTAT_NCOUNTERS]; -+ -+ /* queue time: how long process spent in elevator's queue */ -+ unsigned long iostat_rtime_in_queue[IOSTAT_NCOUNTERS]; -+ unsigned long iostat_wtime_in_queue[IOSTAT_NCOUNTERS]; -+ -+ /* must be the last field, as it's used to know size to be memset'ed */ -+ spinlock_t iostat_lock; -+} ____cacheline_aligned_in_smp iostat_stats_t; -+ -+struct proc_dir_entry *sd_iostats_procdir = NULL; -+char sd_iostats_procdir_name[] = "sd_iostats"; -+static struct file_operations sd_iostats_proc_fops; -+ -+extern void sd_iostats_init(void); -+extern void sd_iostats_fini(void); -+void sd_iostats_start_req(struct scsi_cmnd *SCpnt); -+void sd_iostats_finish_req(struct scsi_cmnd *SCpnt); -+#else -+static inline void sd_iostats_init(void) {} -+static inline void sd_iostats_fini(void) {} -+static inline void sd_iostats_start_req(struct scsi_cmnd *SCpnt) {} -+static inline void sd_iostats_finish_req(struct scsi_cmnd *SCpnt) {} -+#endif -+ - /* - * More than enough for everybody ;) The huge number of majors - * is a leftover from 16bit dev_t days, we don't really need that -@@ -127,6 +184,9 @@ - unsigned WCE : 1; /* state of disk WCE bit */ - unsigned RCD : 1; /* state of disk RCD bit, unused */ - unsigned DPOFUA : 1; /* state of disk DPOFUA bit */ -+#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS)) -+ iostat_stats_t *stats; /* scsi disk statistics */ -+#endif - }; - #define to_scsi_disk(obj) container_of(obj,struct scsi_disk,cdev) - -@@ -520,6 +580,8 @@ - */ - SCpnt->done = sd_rw_intr; - -+ sd_iostats_start_req(SCpnt); -+ - /* - * This indicates that the command is ready from our end to be - * queued. -@@ -1014,6 +1076,7 @@ - break; - } - out: -+ sd_iostats_finish_req(SCpnt); - scsi_io_completion(SCpnt, good_bytes); - } - -@@ -1713,6 +1776,36 @@ - if (sdp->removable) - gd->flags |= GENHD_FL_REMOVABLE; - -+#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS)) -+ sdkp->stats = kzalloc(sizeof(iostat_stats_t), GFP_KERNEL); -+ if (!sdkp->stats) { -+ printk(KERN_WARNING "cannot allocate iostat structure for" -+ "%s\n", gd->disk_name); -+ } else { -+ do_gettimeofday(&sdkp->stats->iostat_timeval); -+ sdkp->stats->iostat_queue_stamp = jiffies; -+ spin_lock_init(&sdkp->stats->iostat_lock); -+ if (sd_iostats_procdir) { -+ struct proc_dir_entry *pde; -+ pde = create_proc_entry(gd->disk_name, S_IRUGO | S_IWUSR, -+ sd_iostats_procdir); -+ if (!pde) { -+ printk(KERN_WARNING "Can't create /proc/scsi/" -+ "%s/%s\n", -+ sd_iostats_procdir_name, -+ gd->disk_name); -+ kfree(sdkp->stats); -+ sdkp->stats = NULL; -+ } else { -+ pde->proc_fops = &sd_iostats_proc_fops; -+ pde->data = gd; -+ } -+ } else { -+ kfree(sdkp->stats); -+ sdkp->stats = NULL; -+ } -+ } -+#endif - dev_set_drvdata(dev, sdkp); - add_disk(gd); - -@@ -1756,6 +1849,366 @@ - return 0; - } - -+#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS)) -+static int -+sd_iostats_seq_show(struct seq_file *seq, void *v) -+{ -+ struct timeval now; -+ struct gendisk *disk = seq->private; -+ iostat_stats_t *stats; -+ unsigned long long read_len; -+ unsigned long long read_len_tot; -+ unsigned long read_num; -+ unsigned long read_num_tot; -+ unsigned long long write_len; -+ unsigned long long write_len_tot; -+ unsigned long write_num; -+ unsigned long write_num_tot; -+ int i; -+ int maxi; -+ -+ stats = scsi_disk(disk)->stats; -+ if (stats == NULL) { -+ printk(KERN_ERR "sd_iostats_seq_show: NULL stats entry\n"); -+ BUG(); -+ } -+ -+ do_gettimeofday(&now); -+ now.tv_sec -= stats->iostat_timeval.tv_sec; -+ now.tv_usec -= stats->iostat_timeval.tv_usec; -+ if (now.tv_usec < 0) { -+ now.tv_usec += 1000000; -+ now.tv_sec--; -+ } -+ -+ /* this sampling races with updates */ -+ seq_printf(seq, "index: %lu snapshot_time: %lu.%06lu\n", -+ (unsigned long) scsi_disk(disk)->index, -+ now.tv_sec, now.tv_usec); -+ -+ for (i = IOSTAT_NCOUNTERS - 1; i > 0; i--) -+ if (stats->iostat_read_histogram[i].iostat_count != 0 || -+ stats->iostat_write_histogram[i].iostat_count != 0) -+ break; -+ maxi = i; -+ -+ seq_printf(seq, "%8s %8s %12s %8s %12s\n", "size", -+ "reads", "total", "writes", "total"); -+ -+ read_len_tot = write_len_tot = 0; -+ read_num_tot = write_num_tot = 0; -+ for (i = 0; i <= maxi; i++) { -+ read_len = stats->iostat_read_histogram[i].iostat_size; -+ read_len_tot += read_len; -+ read_num = stats->iostat_read_histogram[i].iostat_count; -+ read_num_tot += read_num; -+ -+ write_len = stats->iostat_write_histogram[i].iostat_size; -+ write_len_tot += write_len; -+ write_num = stats->iostat_write_histogram[i].iostat_count; -+ write_num_tot += write_num; -+ -+ seq_printf (seq, "%8d %8lu %12llu %8lu %12llu\n", -+ 512<iostat_queue_ticks[i]; -+ if (ticks == 0) -+ continue; -+ percent = stats->iostat_queue_ticks[i] * 100; -+ do_div(percent, stats->iostat_queue_ticks_sum); -+ seq_printf(seq, "%8d %8llu %8llu\n", i, ticks, percent); -+ } -+ -+ if (stats->iostat_reqs != 0) { -+ unsigned long long aveseek = 0, percent = 0; -+ -+ if (stats->iostat_seeks) { -+ aveseek = stats->iostat_seek_sectors; -+ do_div(aveseek, stats->iostat_seeks); -+ percent = stats->iostat_seeks * 100; -+ do_div(percent, stats->iostat_reqs); -+ } -+ -+ seq_printf(seq, "\n%llu sectors in %llu reqs: %llu seek(s) over " -+ "%llu sectors in ave, %llu%% of all reqs\n", -+ stats->iostat_sectors, stats->iostat_reqs, -+ stats->iostat_seeks, aveseek, percent); -+ } -+ -+ seq_printf(seq, "\n%16s %8s %8s %8s %8s\n", "process time", "reads", -+ "%%", "writes", "%%"); -+ for (i = 0; i < IOSTAT_NCOUNTERS; i++) { -+ unsigned long read_percent = 0, write_percent = 0; -+ if (stats->iostat_wtime[i] == 0 && -+ stats->iostat_rtime[i] == 0) -+ continue; -+ if (stats->iostat_read_reqs) -+ read_percent = stats->iostat_rtime[i] * 100 / -+ stats->iostat_read_reqs; -+ if (stats->iostat_write_reqs) -+ write_percent = stats->iostat_wtime[i] * 100 / -+ stats->iostat_write_reqs; -+ seq_printf(seq, "%16u %8lu %8lu %8lu %8lu\n", -+ jiffies_to_msecs(((1UL << i) >> 1) << 1), -+ stats->iostat_rtime[i], read_percent, -+ stats->iostat_wtime[i], write_percent); -+ } -+ -+ seq_printf(seq, "\n%16s %8s %8s %8s %8s\n", "time in queue", "reads", -+ "%%", "writes", "%%"); -+ for (i = 0; i < IOSTAT_NCOUNTERS; i++) { -+ unsigned long read_percent = 0, write_percent = 0; -+ if (stats->iostat_wtime_in_queue[i] == 0 && -+ stats->iostat_rtime_in_queue[i] == 0) -+ continue; -+ if (stats->iostat_read_reqs) -+ read_percent = stats->iostat_rtime_in_queue[i] * 100 / -+ stats->iostat_read_reqs; -+ if (stats->iostat_write_reqs) -+ write_percent = stats->iostat_wtime_in_queue[i] * 100 / -+ stats->iostat_write_reqs; -+ seq_printf(seq, "%16u %8lu %8lu %8lu %8lu\n", -+ jiffies_to_msecs(((1UL << i) >> 1) << 1), -+ stats->iostat_rtime_in_queue[i], -+ read_percent, -+ stats->iostat_wtime_in_queue[i], -+ write_percent); -+ } -+ -+ return 0; -+} -+ -+static void * -+sd_iostats_seq_start(struct seq_file *p, loff_t *pos) -+{ -+ return (*pos == 0) ? (void *)1 : NULL; -+} -+ -+static void * -+sd_iostats_seq_next(struct seq_file *p, void *v, loff_t *pos) -+{ -+ ++*pos; -+ return NULL; -+} -+ -+static void -+sd_iostats_seq_stop(struct seq_file *p, void *v) -+{ -+} -+ -+static struct seq_operations sd_iostats_seqops = { -+ .start = sd_iostats_seq_start, -+ .stop = sd_iostats_seq_stop, -+ .next = sd_iostats_seq_next, -+ .show = sd_iostats_seq_show, -+}; -+ -+static int -+sd_iostats_seq_open (struct inode *inode, struct file *file) -+{ -+ int rc; -+ -+ rc = seq_open(file, &sd_iostats_seqops); -+ if (rc != 0) -+ return rc; -+ -+ ((struct seq_file *)file->private_data)->private = PDE(inode)->data; -+ return 0; -+} -+ -+static ssize_t -+sd_iostats_seq_write(struct file *file, const char *buffer, -+ size_t len, loff_t *off) -+{ -+ struct seq_file *seq = file->private_data; -+ struct gendisk *disk = seq->private; -+ iostat_stats_t *stats = scsi_disk(disk)->stats; -+ unsigned long flags; -+ unsigned long qdepth; -+ -+ -+ spin_lock_irqsave (&stats->iostat_lock, flags); -+ qdepth = stats->iostat_queue_depth; -+ memset (stats, 0, offsetof(iostat_stats_t, iostat_lock)); -+ do_gettimeofday(&stats->iostat_timeval); -+ stats->iostat_queue_stamp = jiffies; -+ stats->iostat_queue_depth = qdepth; -+ spin_unlock_irqrestore (&stats->iostat_lock, flags); -+ -+ return len; -+} -+ -+static struct file_operations sd_iostats_proc_fops = { -+ .owner = THIS_MODULE, -+ .open = sd_iostats_seq_open, -+ .read = seq_read, -+ .write = sd_iostats_seq_write, -+ .llseek = seq_lseek, -+ .release = seq_release, -+}; -+ -+extern struct proc_dir_entry *proc_scsi; -+ -+void -+sd_iostats_init(void) -+{ -+ if (proc_scsi == NULL) { -+ printk(KERN_WARNING "No access to sd iostats: " -+ "proc_scsi is NULL\n"); -+ return; -+ } -+ -+ sd_iostats_procdir = create_proc_entry(sd_iostats_procdir_name, -+ S_IFDIR | S_IRUGO | S_IXUGO, -+ proc_scsi); -+ if (sd_iostats_procdir == NULL) { -+ printk(KERN_WARNING "No access to sd iostats: " -+ "can't create /proc/scsi/%s\n", sd_iostats_procdir_name); -+ return; -+ } -+} -+ -+void sd_iostats_fini(void) -+{ -+ if (proc_scsi != NULL && sd_iostats_procdir != NULL) -+ remove_proc_entry(sd_iostats_procdir_name, proc_scsi); -+ -+ sd_iostats_procdir = NULL; -+} -+ -+void sd_iostats_finish_req(struct scsi_cmnd *SCpnt) -+{ -+ struct request *rq = SCpnt->request; -+ iostat_stats_t *stats; -+ unsigned long *tcounter; -+ int tbucket; -+ int tmp; -+ unsigned long irqflags; -+ unsigned long i; -+ -+ stats = scsi_disk(rq->rq_disk)->stats; -+ if (stats == NULL) -+ return; -+ -+ tmp = jiffies - rq->start_time; -+ for (tbucket = 0; tmp > 1; tbucket++) -+ tmp >>= 1; -+ if (tbucket >= IOSTAT_NCOUNTERS) -+ tbucket = IOSTAT_NCOUNTERS - 1; -+ //printk("%u ticks in D to %u\n", jiffies - rq->start_time, tbucket); -+ -+ tcounter = rq_data_dir(rq) == WRITE ? -+ &stats->iostat_wtime[tbucket] : &stats->iostat_rtime[tbucket]; -+ -+ spin_lock_irqsave(&stats->iostat_lock, irqflags); -+ -+ /* update delay stats */ -+ (*tcounter)++; -+ -+ /* update queue depth stats */ -+ i = stats->iostat_queue_depth; -+ if (i >= IOSTAT_NCOUNTERS) -+ i = IOSTAT_NCOUNTERS - 1; -+ stats->iostat_queue_ticks[i] += jiffies - stats->iostat_queue_stamp; -+ stats->iostat_queue_ticks_sum += jiffies - stats->iostat_queue_stamp; -+ BUG_ON(stats->iostat_queue_depth == 0); -+ stats->iostat_queue_depth--; -+ -+ /* update seek stats. XXX: not sure about nr_sectors */ -+ stats->iostat_sectors += rq->nr_sectors; -+ stats->iostat_reqs++; -+ if (rq->sector != stats->iostat_next_sector) { -+ stats->iostat_seek_sectors += -+ rq->sector > stats->iostat_next_sector ? -+ rq->sector - stats->iostat_next_sector : -+ stats->iostat_next_sector - rq->sector; -+ stats->iostat_seeks++; -+ } -+ stats->iostat_next_sector = rq->sector + rq->nr_sectors; -+ -+ stats->iostat_queue_stamp = jiffies; -+ -+ spin_unlock_irqrestore(&stats->iostat_lock, irqflags); -+} -+ -+void sd_iostats_start_req(struct scsi_cmnd *SCpnt) -+{ -+ struct request *rq = SCpnt->request; -+ iostat_stats_t *stats; -+ iostat_counter_t *counter; -+ int bucket; -+ int tbucket; -+ int tmp; -+ unsigned long irqflags; -+ unsigned long i; -+ int nsect; -+ -+ stats = scsi_disk(rq->rq_disk)->stats; -+ if (stats == NULL) -+ return; -+ -+ nsect = SCpnt->request_bufflen >> 9; -+ for (bucket = 0, tmp = nsect; tmp > 1; bucket++) -+ tmp >>= 1; -+ -+ if (bucket >= IOSTAT_NCOUNTERS) { -+ printk (KERN_ERR "sd_iostats_bump: nsect %d too big\n", nsect); -+ BUG(); -+ } -+ -+ counter = rq_data_dir(rq) == WRITE ? -+ &stats->iostat_write_histogram[bucket] : -+ &stats->iostat_read_histogram[bucket]; -+ -+ tmp = jiffies - rq->start_time; -+ for (tbucket = 0; tmp > 1; tbucket++) -+ tmp >>= 1; -+ if (tbucket >= IOSTAT_NCOUNTERS) -+ tbucket = IOSTAT_NCOUNTERS - 1; -+ //printk("%u ticks in Q to %u\n", jiffies - rq->start_time, tbucket); -+ -+ /* an ugly hack to know exact processing time. the right -+ * solution is to add one more field to struct request -+ * hopefully it will break nothing ... */ -+ rq->start_time = jiffies; -+ -+ spin_lock_irqsave(&stats->iostat_lock, irqflags); -+ -+ /* update queue depth stats */ -+ i = stats->iostat_queue_depth; -+ if (i >= IOSTAT_NCOUNTERS) -+ i = IOSTAT_NCOUNTERS - 1; -+ stats->iostat_queue_ticks[i] += jiffies - stats->iostat_queue_stamp; -+ stats->iostat_queue_ticks_sum += jiffies - stats->iostat_queue_stamp; -+ stats->iostat_queue_depth++; -+ -+ /* update delay stats */ -+ if (rq_data_dir(rq) == WRITE) { -+ stats->iostat_wtime_in_queue[tbucket]++; -+ stats->iostat_write_reqs++; -+ } else { -+ stats->iostat_rtime_in_queue[tbucket]++; -+ stats->iostat_read_reqs++; -+ } -+ -+ /* update size stats */ -+ counter->iostat_size += nsect; -+ counter->iostat_count++; -+ -+ stats->iostat_queue_stamp = jiffies; -+ -+ spin_unlock_irqrestore(&stats->iostat_lock, irqflags); -+} -+#endif -+ - /** - * scsi_disk_release - Called to free the scsi_disk structure - * @cdev: pointer to embedded class device -@@ -1774,10 +2227,16 @@ - idr_remove(&sd_index_idr, sdkp->index); - spin_unlock(&sd_index_lock); - -+#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS)) -+ if (sdkp->stats) { -+ remove_proc_entry(disk->disk_name, sd_iostats_procdir); -+ kfree(sdkp->stats); -+ sdkp->stats = NULL; -+ } -+#endif - disk->private_data = NULL; - put_disk(disk); - put_device(&sdkp->device->sdev_gendev); -- - kfree(sdkp); - } - -@@ -1844,6 +2303,7 @@ - static int __init init_sd(void) - { - int majors = 0, i; -+ int rc = 0; - - SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n")); - -@@ -1854,9 +2314,13 @@ - if (!majors) - return -ENODEV; - -+ sd_iostats_init(); - class_register(&sd_disk_class); - -- return scsi_register_driver(&sd_template.gendrv); -+ rc = scsi_register_driver(&sd_template.gendrv); -+ if (rc) -+ sd_iostats_fini(); -+ return rc; - } - - /** -@@ -1875,6 +2339,7 @@ - unregister_blkdev(sd_major(i), "sd"); - - class_unregister(&sd_disk_class); -+ sd_iostats_fini(); - } - - module_init(init_sd); diff --git a/lustre/kernel_patches/patches/small-fixes-about-jbd.patch b/lustre/kernel_patches/patches/small-fixes-about-jbd.patch deleted file mode 100644 index d39a174..0000000 --- a/lustre/kernel_patches/patches/small-fixes-about-jbd.patch +++ /dev/null @@ -1,13 +0,0 @@ -diff -pur linux-2.6.18-128.orig/fs/jbd/commit.c linux-2.6.18-128/fs/jbd/commit.c ---- linux-2.6.18-128.orig/fs/jbd/commit.c 2009-04-10 16:31:40.000000000 +0800 -+++ linux-2.6.18-128/fs/jbd/commit.c 2009-04-10 16:33:14.000000000 +0800 -@@ -862,7 +862,8 @@ wait_for_iobuf: - if (err) - __journal_abort_hard(journal); - } -- err = journal_wait_on_commit_record(cbh); -+ if (!err && !is_journal_aborted(journal)) -+ err = journal_wait_on_commit_record(cbh); - - if (err) - journal_abort(journal, err); diff --git a/lustre/kernel_patches/series/2.6-rhel5.series b/lustre/kernel_patches/series/2.6-rhel5.series deleted file mode 100644 index e3803f7..0000000 --- a/lustre/kernel_patches/series/2.6-rhel5.series +++ /dev/null @@ -1,30 +0,0 @@ -lustre_version.patch -jbd-jcberr-2.6.18-vanilla.patch -export_symbols-2.6.12.patch -dev_read_only-2.6.18-vanilla.patch -export-2.6.18-vanilla.patch -sd_iostats-2.6-rhel5.patch -export_symbol_numa-2.6-fc5.patch -blkdev_tunables-2.6-rhel5.patch -jbd-stats-2.6-rhel5.patch -raid5-stats-rhel5.patch -raid5-configurable-cachesize-rhel5.patch -raid5-large-io-rhel5.patch -raid5-stripe-by-stripe-handling-rhel5.patch -raid5-merge-ios-rhel5.patch -raid5-zerocopy-rhel5.patch -raid5-maxsectors-rhel5.patch -raid5-rebuild-corrupt-bug.patch -md-rebuild-policy.patch -jbd-journal-chksum-2.6.18-vanilla.patch -quota-large-limits-rhel5.patch -raid5-mmp-unplug-dev.patch -small-fixes-about-jbd.patch -mpt-fusion-max-sge.patch -prune-icache-use-trylock-rhel5.patch -jbd2-jcberr-2.6-rhel5.patch -jbd2-commit-timer-no-jiffies-rounding.diff -md-avoid-bug_on-when-bmc-overflow.patch -jbd2_stats_proc_init-wrong-place.patch -lustre_iser_max_sectors_tuning_lustre2.0.patch -fix-forever-in-do_get_write_access.patch diff --git a/lustre/kernel_patches/which_patch b/lustre/kernel_patches/which_patch index e5534ea..c1bcb70 100644 --- a/lustre/kernel_patches/which_patch +++ b/lustre/kernel_patches/which_patch @@ -1,7 +1,6 @@ SERIES VERSION COMMENT SUPPORTED KERNELS: -2.6-rhel5 RHEL5: 2.6.18-238.19.1.el5 2.6-rhel6 RHEL6: 2.6.32-279.14.1.el6 CLIENT SUPPORT FOR UNPATCHED KERNELS: -- 1.8.3.1