Remove patches related to RHEL5 server support.
Signed-off-by: yang sheng <ys@whamcloud.com>
Change-Id: I694c9bbe0b6713119501392540c9cf5c6f8e53f3
Reviewed-on: http://review.whamcloud.com/4865
Tested-by: Hudson
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: James Simmons <uja.ornl@gmail.com>
+++ /dev/null
-Index: linux-stage/fs/ext4/super.c
-===================================================================
---- linux-stage.orig/fs/ext4/super.c
-+++ linux-stage/fs/ext4/super.c
-@@ -185,6 +185,8 @@ void ext4_journal_abort_handle(const cha
- jbd2_journal_abort_handle(handle);
- }
-
-+EXPORT_SYMBOL(ext4_journal_abort_handle);
-+
- /* Deal with the reporting of failure conditions on a filesystem such as
- * inconsistencies detected or read IO failures.
- *
-@@ -2459,6 +2461,8 @@ out_fail:
- return ret;
- }
-
-+EXPORT_SYMBOL(ext4_force_commit);
-+
- /*
- * Setup any per-fs journal parameters now. We'll do this both on
- * initial mount, once the journal has been initialised but before we've
-@@ -3504,6 +3508,12 @@ int ext4_map_inode_page(struct inode *in
- unsigned long *blocks, int *created, int create);
- EXPORT_SYMBOL(ext4_map_inode_page);
-
-+EXPORT_SYMBOL(ext4_xattr_get);
-+EXPORT_SYMBOL(ext4_xattr_set_handle);
-+EXPORT_SYMBOL(ext4_bread);
-+EXPORT_SYMBOL(ext4_journal_start_sb);
-+EXPORT_SYMBOL(__ext4_journal_stop);
-+
- MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
- MODULE_DESCRIPTION("Fourth Extended Filesystem with extents");
- MODULE_LICENSE("GPL");
-Index: linux-stage/fs/ext4/ext4.h
-===================================================================
---- linux-stage.orig/fs/ext4/ext4.h
-+++ linux-stage/fs/ext4/ext4.h
-@@ -1024,6 +1024,8 @@ extern unsigned long ext4_count_free_ino
- ext4_group_t group,
- struct ext4_group_desc *desc);
- extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
-+extern struct buffer_head *ext4_read_inode_bitmap(struct super_block *sb,
-+ ext4_group_t block_group);
-
- /* mballoc.c */
- extern long ext4_mb_stats;
-Index: linux-stage/fs/ext4/ialloc.c
-===================================================================
---- linux-stage.orig/fs/ext4/ialloc.c
-+++ linux-stage/fs/ext4/ialloc.c
-@@ -96,7 +96,7 @@ unsigned ext4_init_inode_bitmap(struct s
- *
- * Return buffer_head of bitmap on success or NULL.
- */
--static struct buffer_head *
-+struct buffer_head *
- ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
- {
- struct ext4_group_desc *desc;
-@@ -137,6 +137,7 @@ ext4_read_inode_bitmap(struct super_bloc
- }
- return bh;
- }
-+EXPORT_SYMBOL(ext4_read_inode_bitmap);
-
- /*
- * NOTE! When we get the inode, we're the only people
-Index: linux-stage/fs/ext4/balloc.c
-===================================================================
---- linux-stage.orig/fs/ext4/balloc.c
-+++ linux-stage/fs/ext4/balloc.c
-@@ -236,6 +236,7 @@ struct ext4_group_desc * ext4_get_group_
- *bh = sbi->s_group_desc[group_desc];
- return desc;
- }
-+EXPORT_SYMBOL(ext4_get_group_desc);
-
- static int ext4_valid_block_bitmap(struct super_block *sb,
- struct ext4_group_desc *desc,
+++ /dev/null
-diff -up a/fs/ext4/super.c b/s/ext4/super.c
---- a/fs/ext4/super.c
-+++ b/fs/ext4/super.c
-@@ -706,9 +767,47 @@ static inline struct inode *dquot_to_ino
- static ssize_t ext4_quota_write(struct super_block *sb, int type,
- const char *data, size_t len, loff_t off);
-
-+static int ext4_dquot_initialize(struct inode *inode, int type)
-+{
-+ handle_t *handle;
-+ int ret, err;
-+
-+ /* We may create quota structure so we need to reserve enough blocks */
-+ handle = ext4_journal_start(inode, 2*EXT4_QUOTA_INIT_BLOCKS(inode->i_sb));
-+ if (IS_ERR(handle))
-+ return PTR_ERR(handle);
-+ ret = dquot_initialize(inode, type);
-+ err = ext4_journal_stop(handle);
-+ if (!ret)
-+ ret = err;
-+ return ret;
-+}
-+
-+static int ext4_dquot_drop(struct inode *inode)
-+{
-+ handle_t *handle;
-+ int ret, err;
-+
-+ /* We may delete quota structure so we need to reserve enough blocks */
-+ handle = ext4_journal_start(inode, 2*EXT4_QUOTA_DEL_BLOCKS(inode->i_sb));
-+ if (IS_ERR(handle)) {
-+ /*
-+ * We call dquot_drop() anyway to at least release references
-+ * to quota structures so that umount does not hang.
-+ */
-+ dquot_drop(inode);
-+ return PTR_ERR(handle);
-+ }
-+ ret = dquot_drop(inode);
-+ err = ext4_journal_stop(handle);
-+ if (!ret)
-+ ret = err;
-+ return ret;
-+}
-+
- static struct dquot_operations ext4_quota_operations = {
-- .initialize = dquot_initialize,
-- .drop = dquot_drop,
-+ .initialize = ext4_dquot_initialize,
-+ .drop = ext4_dquot_drop,
- .alloc_space = dquot_alloc_space,
- .alloc_inode = dquot_alloc_inode,
- .free_space = dquot_free_space,
+++ /dev/null
-Index: linux-2.6.18-128.1.6/fs/ext4/super.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/ext4/super.c
-+++ linux-2.6.18-128.1.6/fs/ext4/super.c
-@@ -70,6 +70,8 @@ struct page *ext4_zero_page;
-
- struct proc_dir_entry *proc_root_ext4;
-
-+static int bigendian_extents;
-+
- ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
- struct ext4_group_desc *bg)
- {
-@@ -1222,7 +1224,7 @@ enum {
- Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
- Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
- Opt_inode_readahead_blks, Opt_journal_ioprio,
-- Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
-+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, Opt_bigendian_extents,
- };
-
- static match_table_t tokens = {
-@@ -1284,6 +1286,7 @@ static match_table_t tokens = {
- {Opt_auto_da_alloc, "auto_da_alloc=%u"},
- {Opt_auto_da_alloc, "auto_da_alloc"},
- {Opt_noauto_da_alloc, "noauto_da_alloc"},
-+ {Opt_bigendian_extents, "bigendian_extents"},
- {Opt_err, NULL},
- };
-
-@@ -1682,6 +1685,9 @@ clear_qf_name:
- return 0;
- sbi->s_stripe = option;
- break;
-+ case Opt_bigendian_extents:
-+ bigendian_extents = 1;
-+ break;
- default:
- printk(KERN_ERR
- "EXT4-fs: Unrecognized mount option \"%s\" "
-@@ -2561,6 +2567,15 @@ static int ext4_fill_super(struct super_
- goto failed_mount;
- }
-
-+#ifdef __BIG_ENDIAN
-+ if (bigendian_extents == 0) {
-+ printk(KERN_ERR "EXT4-fs: extents feature is not guaranteed to "
-+ "work on big-endian systems. Use \"bigendian_extents\" "
-+ "mount option to override.\n");
-+ goto failed_mount;
-+ }
-+#endif
-+
- bgl_lock_init(sbi->s_blockgroup_lock);
-
- sbi->s_last_alloc_group = -1;
+++ /dev/null
-Disable the DELALLOC feature because it is not robust in ext4 versions < 2.6.31.
-
---
-diff -rupN linux-2.6.18-164.6.1_1/fs/ext4/super.c linux-2.6.18-164.6.1_2/fs/ext4/super.c
---- linux-2.6.18-164.6.1_1/fs/ext4/super.c 2010-08-05 13:44:07.000000000 +0530
-+++ linux-2.6.18-164.6.1_2/fs/ext4/super.c 2010-08-05 13:46:29.000000000 +0530
-@@ -2091,12 +2091,6 @@ static int ext4_fill_super(struct super_
-
- set_opt(sbi->s_mount_opt, BARRIER);
-
-- /*
-- * enable delayed allocation by default
-- * Use -o nodelalloc to turn it off
-- */
-- set_opt(sbi->s_mount_opt, DELALLOC);
--
- if (!parse_options((char *) data, sb, &journal_devnum,
- &journal_ioprio, NULL, 0))
- goto failed_mount;
+++ /dev/null
-Index: linux-stage/fs/ext4/xattr.c
-===================================================================
---- linux-stage.orig/fs/ext4/xattr.c
-+++ linux-stage/fs/ext4/xattr.c
-@@ -86,7 +86,8 @@
- # define ea_bdebug(f...)
- #endif
-
--static void ext4_xattr_cache_insert(struct buffer_head *);
-+static void ext4_xattr_cache_insert(struct super_block *,
-+ struct buffer_head *);
- static struct buffer_head *ext4_xattr_cache_find(struct inode *,
- struct ext4_xattr_header *,
- struct mb_cache_entry **);
-@@ -233,7 +234,7 @@ bad_block: ext4_error(inode->i_sb, __fun
- error = -EIO;
- goto cleanup;
- }
-- ext4_xattr_cache_insert(bh);
-+ ext4_xattr_cache_insert(inode->i_sb, bh);
- entry = BFIRST(bh);
- error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
- if (error == -EIO)
-@@ -375,7 +376,7 @@ ext4_xattr_block_list(struct inode *inod
- error = -EIO;
- goto cleanup;
- }
-- ext4_xattr_cache_insert(bh);
-+ ext4_xattr_cache_insert(inode->i_sb, bh);
- error = ext4_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size);
-
- cleanup:
-@@ -472,7 +473,9 @@ ext4_xattr_release_block(handle_t *handl
- struct mb_cache_entry *ce = NULL;
- int error = 0;
-
-- ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
-+ if (!test_opt(inode->i_sb, NO_MBCACHE))
-+ ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev,
-+ bh->b_blocknr);
- error = ext4_journal_get_write_access(handle, bh);
- if (error)
- goto out;
-@@ -700,8 +703,10 @@ ext4_xattr_block_set(handle_t *handle, s
- if (i->value && i->value_len > sb->s_blocksize)
- return -ENOSPC;
- if (s->base) {
-- ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev,
-- bs->bh->b_blocknr);
-+ if (!test_opt(inode->i_sb, NO_MBCACHE))
-+ ce = mb_cache_entry_get(ext4_xattr_cache,
-+ bs->bh->b_bdev,
-+ bs->bh->b_blocknr);
- error = ext4_journal_get_write_access(handle, bs->bh);
- if (error)
- goto cleanup;
-@@ -718,7 +723,7 @@ ext4_xattr_block_set(handle_t *handle, s
- if (!IS_LAST_ENTRY(s->first))
- ext4_xattr_rehash(header(s->base),
- s->here);
-- ext4_xattr_cache_insert(bs->bh);
-+ ext4_xattr_cache_insert(sb, bs->bh);
- }
- unlock_buffer(bs->bh);
- if (error == -EIO)
-@@ -801,7 +806,8 @@ inserted:
- if (error)
- goto cleanup_dquot;
- }
-- mb_cache_entry_release(ce);
-+ if (ce)
-+ mb_cache_entry_release(ce);
- ce = NULL;
- } else if (bs->bh && s->base == bs->bh->b_data) {
- /* We were modifying this block in-place. */
-@@ -845,7 +851,7 @@ getblk_failed:
- memcpy(new_bh->b_data, s->base, new_bh->b_size);
- set_buffer_uptodate(new_bh);
- unlock_buffer(new_bh);
-- ext4_xattr_cache_insert(new_bh);
-+ ext4_xattr_cache_insert(sb, new_bh);
- error = ext4_handle_dirty_metadata(handle,
- inode, new_bh);
- if (error)
-@@ -1404,12 +1410,15 @@ ext4_xattr_put_super(struct super_block
- * Returns 0, or a negative error number on failure.
- */
- static void
--ext4_xattr_cache_insert(struct buffer_head *bh)
-+ext4_xattr_cache_insert(struct super_block *sb, struct buffer_head *bh)
- {
- __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
- struct mb_cache_entry *ce;
- int error;
-
-+ if (test_opt(sb, NO_MBCACHE))
-+ return;
-+
- ce = mb_cache_entry_alloc(ext4_xattr_cache);
- if (!ce) {
- ea_bdebug(bh, "out of memory");
-@@ -1483,6 +1492,8 @@ ext4_xattr_cache_find(struct inode *inod
- __u32 hash = le32_to_cpu(header->h_hash);
- struct mb_cache_entry *ce;
-
-+ if (test_opt(inode->i_sb, NO_MBCACHE))
-+ return NULL;
- if (!header->h_hash)
- return NULL; /* never share */
- ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
-Index: linux-stage/fs/ext4/super.c
-===================================================================
---- linux-stage.orig/fs/ext4/super.c
-+++ linux-stage/fs/ext4/super.c
-@@ -1481,6 +1481,7 @@ enum {
-
- Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, Opt_bigendian_extents,
- Opt_force_over_128tb,
-+ Opt_no_mbcache,
- };
-
- static match_table_t tokens = {
-@@ -1554,6 +1555,7 @@ static match_table_t tokens = {
- {Opt_noauto_da_alloc, "noauto_da_alloc"},
- {Opt_bigendian_extents, "bigendian_extents"},
- {Opt_force_over_128tb, "force_over_128tb"},
-+ {Opt_no_mbcache, "no_mbcache"},
- {Opt_err, NULL},
- };
-
-@@ -2030,6 +2032,9 @@ set_qf_format:
- }
- clear_opt(sbi->s_mount_opt, EXTENTS);
- break;
-+ case Opt_no_mbcache:
-+ set_opt(sbi->s_mount_opt, NO_MBCACHE);
-+ break;
- default:
- ext4_msg(sb, KERN_ERR,
- "Unrecognized mount option \"%s\" "
-Index: linux-stage/fs/ext4/ext4.h
-===================================================================
---- linux-stage.orig/fs/ext4/ext4.h
-+++ linux-stage/fs/ext4/ext4.h
-@@ -715,7 +715,8 @@ struct ext4_inode_info {
- /*
- * Mount flags
- */
--#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */
-+#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Disable mbcache */
-+#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */
- #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
- #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
- #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
+++ /dev/null
-Index: linux-stage/fs/ext4/Makefile
-===================================================================
---- linux-stage.orig/fs/ext4/Makefile
-+++ linux-stage/fs/ext4/Makefile
-@@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
- ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
- ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
- ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
-- mmp.o
-+ mmp.o dynlocks.o
-
- ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
- ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
-Index: linux-stage/fs/ext4/super.c
-===================================================================
---- linux-stage.orig/fs/ext4/super.c
-+++ linux-stage/fs/ext4/super.c
-@@ -4159,6 +4159,7 @@ static int __init init_ext4_fs(void)
- err = init_inodecache();
- if (err)
- goto out1;
-+ dynlock_cache_init();
- err = register_filesystem(&ext4_fs_type);
- if (err)
- goto out;
-@@ -4195,6 +4196,7 @@ static void __exit exit_ext4_fs(void)
- unregister_filesystem(&ext4dev_fs_type);
- #endif
- destroy_inodecache();
-+ dynlock_cache_exit();
- exit_ext4_xattr();
- exit_ext4_mballoc();
- __free_page(ext4_zero_page);
+++ /dev/null
-diff -rupN linux-2.6.18-128.1.6_1/fs/ext4/dynlocks.c linux-2.6.18-128.1.6_2/fs/ext4/dynlocks.c
---- linux-2.6.18-128.1.6_1/fs/ext4/dynlocks.c 1970-01-01 05:30:00.000000000 +0530
-+++ linux-2.6.18-128.1.6_2/fs/ext4/dynlocks.c 2009-08-13 20:42:59.000000000 +0530
-@@ -0,0 +1,236 @@
-+/*
-+ * Dynamic Locks
-+ *
-+ * struct dynlock is lockspace
-+ * one may request lock (exclusive or shared) for some value
-+ * in that lockspace
-+ *
-+ */
-+
-+#include <linux/dynlocks.h>
-+#include <linux/module.h>
-+#include <linux/slab.h>
-+#include <linux/sched.h>
-+
-+#define DYNLOCK_HANDLE_MAGIC 0xd19a10c
-+#define DYNLOCK_HANDLE_DEAD 0xd1956ee
-+#define DYNLOCK_LIST_MAGIC 0x11ee91e6
-+
-+static kmem_cache_t * dynlock_cachep = NULL;
-+
-+struct dynlock_handle {
-+ unsigned dh_magic;
-+ struct list_head dh_list;
-+ unsigned long dh_value; /* lock value */
-+ int dh_refcount; /* number of users */
-+ int dh_readers;
-+ int dh_writers;
-+ int dh_pid; /* holder of the lock */
-+ wait_queue_head_t dh_wait;
-+};
-+
-+int __init dynlock_cache_init(void)
-+{
-+ int rc = 0;
-+
-+ /* printk(KERN_INFO "init dynlocks cache\n"); */
-+ dynlock_cachep = kmem_cache_create("dynlock_cache",
-+ sizeof(struct dynlock_handle),
-+ 0,
-+ SLAB_HWCACHE_ALIGN,
-+ NULL, NULL);
-+ if (dynlock_cachep == NULL) {
-+ printk(KERN_ERR "Not able to create dynlock cache");
-+ rc = -ENOMEM;
-+ }
-+ return rc;
-+}
-+
-+void dynlock_cache_exit(void)
-+{
-+ /* printk(KERN_INFO "exit dynlocks cache\n"); */
-+ kmem_cache_destroy(dynlock_cachep);
-+}
-+
-+/*
-+ * dynlock_init
-+ *
-+ * initialize lockspace
-+ *
-+ */
-+void dynlock_init(struct dynlock *dl)
-+{
-+ spin_lock_init(&dl->dl_list_lock);
-+ INIT_LIST_HEAD(&dl->dl_list);
-+ dl->dl_magic = DYNLOCK_LIST_MAGIC;
-+}
-+EXPORT_SYMBOL(dynlock_init);
-+
-+/*
-+ * dynlock_lock
-+ *
-+ * acquires lock (exclusive or shared) in specified lockspace
-+ * each lock in lockspace is allocated separately, so user have
-+ * to specify GFP flags.
-+ * routine returns pointer to lock. this pointer is intended to
-+ * be passed to dynlock_unlock
-+ *
-+ */
-+struct dynlock_handle *dynlock_lock(struct dynlock *dl, unsigned long value,
-+ enum dynlock_type lt, gfp_t gfp)
-+{
-+ struct dynlock_handle *nhl = NULL;
-+ struct dynlock_handle *hl;
-+
-+ BUG_ON(dl == NULL);
-+ BUG_ON(dl->dl_magic != DYNLOCK_LIST_MAGIC);
-+
-+repeat:
-+ /* find requested lock in lockspace */
-+ spin_lock(&dl->dl_list_lock);
-+ BUG_ON(dl->dl_list.next == NULL);
-+ BUG_ON(dl->dl_list.prev == NULL);
-+ list_for_each_entry(hl, &dl->dl_list, dh_list) {
-+ BUG_ON(hl->dh_list.next == NULL);
-+ BUG_ON(hl->dh_list.prev == NULL);
-+ BUG_ON(hl->dh_magic != DYNLOCK_HANDLE_MAGIC);
-+ if (hl->dh_value == value) {
-+ /* lock is found */
-+ if (nhl) {
-+ /* someone else just allocated
-+ * lock we didn't find and just created
-+ * so, we drop our lock
-+ */
-+ kmem_cache_free(dynlock_cachep, nhl);
-+ nhl = NULL;
-+ }
-+ hl->dh_refcount++;
-+ goto found;
-+ }
-+ }
-+ /* lock not found */
-+ if (nhl) {
-+ /* we already have allocated lock. use it */
-+ hl = nhl;
-+ nhl = NULL;
-+ list_add(&hl->dh_list, &dl->dl_list);
-+ goto found;
-+ }
-+ spin_unlock(&dl->dl_list_lock);
-+
-+ /* lock not found and we haven't allocated lock yet. allocate it */
-+ nhl = kmem_cache_alloc(dynlock_cachep, gfp);
-+ if (nhl == NULL)
-+ return NULL;
-+ nhl->dh_refcount = 1;
-+ nhl->dh_value = value;
-+ nhl->dh_readers = 0;
-+ nhl->dh_writers = 0;
-+ nhl->dh_magic = DYNLOCK_HANDLE_MAGIC;
-+ init_waitqueue_head(&nhl->dh_wait);
-+
-+ /* while lock is being allocated, someone else may allocate it
-+ * and put onto to list. check this situation
-+ */
-+ goto repeat;
-+
-+found:
-+ if (lt == DLT_WRITE) {
-+ /* exclusive lock: user don't want to share lock at all
-+ * NOTE: one process may take the same lock several times
-+ * this functionaly is useful for rename operations */
-+ while ((hl->dh_writers && hl->dh_pid != current->pid) ||
-+ hl->dh_readers) {
-+ spin_unlock(&dl->dl_list_lock);
-+ wait_event(hl->dh_wait,
-+ hl->dh_writers == 0 && hl->dh_readers == 0);
-+ spin_lock(&dl->dl_list_lock);
-+ }
-+ hl->dh_writers++;
-+ } else {
-+ /* shared lock: user do not want to share lock with writer */
-+ while (hl->dh_writers) {
-+ spin_unlock(&dl->dl_list_lock);
-+ wait_event(hl->dh_wait, hl->dh_writers == 0);
-+ spin_lock(&dl->dl_list_lock);
-+ }
-+ hl->dh_readers++;
-+ }
-+ hl->dh_pid = current->pid;
-+ spin_unlock(&dl->dl_list_lock);
-+
-+ return hl;
-+}
-+EXPORT_SYMBOL(dynlock_lock);
-+
-+
-+/*
-+ * dynlock_unlock
-+ *
-+ * user have to specify lockspace (dl) and pointer to lock structure
-+ * returned by dynlock_lock()
-+ *
-+ */
-+void dynlock_unlock(struct dynlock *dl, struct dynlock_handle *hl)
-+{
-+ int wakeup = 0;
-+
-+ BUG_ON(dl == NULL);
-+ BUG_ON(hl == NULL);
-+ BUG_ON(dl->dl_magic != DYNLOCK_LIST_MAGIC);
-+
-+ if (hl->dh_magic != DYNLOCK_HANDLE_MAGIC)
-+ printk(KERN_EMERG "wrong lock magic: %#x\n", hl->dh_magic);
-+
-+ BUG_ON(hl->dh_magic != DYNLOCK_HANDLE_MAGIC);
-+ BUG_ON(hl->dh_writers != 0 && current->pid != hl->dh_pid);
-+
-+ spin_lock(&dl->dl_list_lock);
-+ if (hl->dh_writers) {
-+ BUG_ON(hl->dh_readers != 0);
-+ hl->dh_writers--;
-+ if (hl->dh_writers == 0)
-+ wakeup = 1;
-+ } else if (hl->dh_readers) {
-+ hl->dh_readers--;
-+ if (hl->dh_readers == 0)
-+ wakeup = 1;
-+ } else {
-+ BUG();
-+ }
-+ if (wakeup) {
-+ hl->dh_pid = 0;
-+ wake_up(&hl->dh_wait);
-+ }
-+ if (--(hl->dh_refcount) == 0) {
-+ hl->dh_magic = DYNLOCK_HANDLE_DEAD;
-+ list_del(&hl->dh_list);
-+ kmem_cache_free(dynlock_cachep, hl);
-+ }
-+ spin_unlock(&dl->dl_list_lock);
-+}
-+EXPORT_SYMBOL(dynlock_unlock);
-+
-+int dynlock_is_locked(struct dynlock *dl, unsigned long value)
-+{
-+ struct dynlock_handle *hl;
-+ int result = 0;
-+
-+ /* find requested lock in lockspace */
-+ spin_lock(&dl->dl_list_lock);
-+ BUG_ON(dl->dl_list.next == NULL);
-+ BUG_ON(dl->dl_list.prev == NULL);
-+ list_for_each_entry(hl, &dl->dl_list, dh_list) {
-+ BUG_ON(hl->dh_list.next == NULL);
-+ BUG_ON(hl->dh_list.prev == NULL);
-+ BUG_ON(hl->dh_magic != DYNLOCK_HANDLE_MAGIC);
-+ if (hl->dh_value == value && hl->dh_pid == current->pid) {
-+ /* lock is found */
-+ result = 1;
-+ break;
-+ }
-+ }
-+ spin_unlock(&dl->dl_list_lock);
-+ return result;
-+}
-+EXPORT_SYMBOL(dynlock_is_locked);
-diff -rupN linux-2.6.18-128.1.6_1/include/linux/dynlocks.h linux-2.6.18-128.1.6_2/include/linux/dynlocks.h
---- linux-2.6.18-128.1.6_1/include/linux/dynlocks.h 1970-01-01 05:30:00.000000000 +0530
-+++ linux-2.6.18-128.1.6_2/include/linux/dynlocks.h 2009-08-13 20:43:18.000000000 +0530
-@@ -0,0 +1,34 @@
-+#ifndef _LINUX_DYNLOCKS_H
-+#define _LINUX_DYNLOCKS_H
-+
-+#include <linux/list.h>
-+#include <linux/wait.h>
-+
-+struct dynlock_handle;
-+
-+/*
-+ * lock's namespace:
-+ * - list of locks
-+ * - lock to protect this list
-+ */
-+struct dynlock {
-+ unsigned dl_magic;
-+ struct list_head dl_list;
-+ spinlock_t dl_list_lock;
-+};
-+
-+enum dynlock_type {
-+ DLT_WRITE,
-+ DLT_READ
-+};
-+
-+int dynlock_cache_init(void);
-+void dynlock_cache_exit(void);
-+void dynlock_init(struct dynlock *dl);
-+struct dynlock_handle *dynlock_lock(struct dynlock *dl, unsigned long value,
-+ enum dynlock_type lt, gfp_t gfp);
-+void dynlock_unlock(struct dynlock *dl, struct dynlock_handle *lock);
-+int dynlock_is_locked(struct dynlock *dl, unsigned long value);
-+
-+#endif
-+
+++ /dev/null
-Index: linux-stage/fs/ext4/super.c
-===================================================================
---- linux-stage.orig/fs/ext4/super.c
-+++ linux-stage/fs/ext4/super.c
-@@ -3427,7 +3427,6 @@ failed_mount:
- brelse(bh);
- out_fail:
- sb->s_fs_info = NULL;
-- kfree(sbi->s_blockgroup_lock);
- kfree(sbi);
- lock_kernel();
- return ret;
+++ /dev/null
-diff -rupN linux-2.6.18-164.6.1_1/fs/ext4/ext4.h linux-2.6.18-164.6.1_2/fs/ext4/ext4.h
---- linux-2.6.18-164.6.1_1/fs/ext4/ext4.h 2009-12-22 13:07:27.000000000 +0530
-+++ linux-2.6.18-164.6.1_2/fs/ext4/ext4.h 2009-12-22 13:10:18.000000000 +0530
-@@ -305,6 +305,7 @@ struct ext4_new_group_data {
- #define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
- #define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
- #define EXT4_IOC_MIGRATE _IO('f', 9)
-+#define EXT4_IOC_FIEMAP _IOWR('f', 11, struct fiemap)
- /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
- /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
-
-diff -rupN linux-2.6.18-164.6.1_1/fs/ext4/ioctl.c linux-2.6.18-164.6.1_2/fs/ext4/ioctl.c
---- linux-2.6.18-164.6.1_1/fs/ext4/ioctl.c 2009-12-22 13:06:51.000000000 +0530
-+++ linux-2.6.18-164.6.1_2/fs/ext4/ioctl.c 2009-12-22 13:09:45.000000000 +0530
-@@ -17,6 +17,71 @@
- #include "ext4_jbd2.h"
- #include "ext4.h"
-
-+/* So that the fiemap access checks can't overflow on 32 bit machines. */
-+#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent))
-+
-+static int fiemap_check_ranges(struct super_block *sb,
-+ u64 start, u64 len, u64 *new_len)
-+{
-+ *new_len = len;
-+
-+ if (len == 0)
-+ return -EINVAL;
-+
-+ if (start > sb->s_maxbytes)
-+ return -EFBIG;
-+
-+ /*
-+ * Shrink request scope to what the fs can actually handle.
-+ */
-+ if ((len > sb->s_maxbytes) ||
-+ (sb->s_maxbytes - len) < start)
-+ *new_len = sb->s_maxbytes - start;
-+
-+ return 0;
-+}
-+
-+int ioctl_fiemap(struct inode *inode, struct file *filp, unsigned long arg)
-+{
-+ struct fiemap fiemap;
-+ u64 len;
-+ struct fiemap_extent_info fieinfo = {0, };
-+ struct super_block *sb = inode->i_sb;
-+ int error = 0;
-+
-+ if (copy_from_user(&fiemap, (struct fiemap __user *) arg,
-+ sizeof(struct fiemap)))
-+ return -EFAULT;
-+
-+ if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
-+ return -EINVAL;
-+
-+ error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
-+ &len);
-+ if (error)
-+ return error;
-+
-+ fieinfo.fi_flags = fiemap.fm_flags;
-+ fieinfo.fi_extents_max = fiemap.fm_extent_count;
-+ fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap));
-+
-+ if (fiemap.fm_extent_count != 0 &&
-+ !access_ok(VERIFY_WRITE, (void *)arg,
-+ offsetof(typeof(fiemap), fm_extents[fiemap.fm_extent_count])))
-+ return -EFAULT;
-+
-+ if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
-+ filemap_write_and_wait(inode->i_mapping);
-+
-+ error = ext4_fiemap(inode, &fieinfo, fiemap.fm_start, len);
-+ fiemap.fm_flags = fieinfo.fi_flags;
-+ fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
-+ if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap)))
-+ error = -EFAULT;
-+
-+ return error;
-+}
-+
- long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
- {
- struct inode *inode = filp->f_dentry->d_inode;
-@@ -249,6 +314,9 @@ flags_out:
- mutex_unlock(&(inode->i_mutex));
- return err;
- }
-+ case EXT4_IOC_FIEMAP: {
-+ return ioctl_fiemap(inode, filp, arg);
-+ }
-
- default:
- return -ENOTTY;
+++ /dev/null
-Index: linux-2.6.18-164.6.1/fs/ext4/super.c
-===================================================================
---- linux-2.6.18-164.6.1.orig/fs/ext4/super.c
-+++ linux-2.6.18-164.6.1/fs/ext4/super.c
-@@ -51,6 +51,8 @@
-
- struct proc_dir_entry *ext4_proc_root;
-
-+static int force_over_128tb;
-+
- static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
- unsigned long journal_devnum);
- static int ext4_commit_super(struct super_block *sb,
-@@ -1343,6 +1345,7 @@ enum {
- Opt_stripe, Opt_delalloc, Opt_nodelalloc,
- Opt_inode_readahead_blks, Opt_journal_ioprio,
- Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, Opt_bigendian_extents,
-+ Opt_force_over_128tb,
- };
-
- static match_table_t tokens = {
-@@ -1410,6 +1413,7 @@ static match_table_t tokens = {
- {Opt_auto_da_alloc, "auto_da_alloc"},
- {Opt_noauto_da_alloc, "noauto_da_alloc"},
- {Opt_bigendian_extents, "bigendian_extents"},
-+ {Opt_force_over_128tb, "force_over_128tb"},
- {Opt_err, NULL},
- };
-
-@@ -1837,6 +1841,9 @@ set_qf_format:
- break;
- case Opt_mballoc:
- break;
-+ case Opt_force_over_128tb:
-+ force_over_128tb = 1;
-+ break;
- default:
- printk(KERN_ERR
- "EXT4-fs: Unrecognized mount option \"%s\" "
-@@ -2692,6 +2699,16 @@ static int ext4_fill_super(struct super_
- goto failed_mount;
- }
-
-+ if (ext4_blocks_count(es) > (8ULL << 32)) {
-+ if (force_over_128tb == 0) {
-+ printk(KERN_ERR "EXT4-fs does not support filesystems "
-+ "greater than 128TB and can cause data corruption."
-+ "Use \"force_over_128tb\" mount option to override."
-+ "\n");
-+ goto failed_mount;
-+ }
-+ }
-+
- if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
- goto cantfind_ext4;
-
+++ /dev/null
-Index: linux-2.6.18-128.1.6/fs/ext4/inode.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/ext4/inode.c
-+++ linux-2.6.18-128.1.6/fs/ext4/inode.c
-@@ -2850,11 +2850,11 @@ struct inode *ext4_iget(struct super_blo
- EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
- EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
-
-- inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
-+ ei->i_fs_version = le32_to_cpu(raw_inode->i_disk_version);
- if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
-- inode->i_version |=
-- (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
-+ ei->i_fs_version |= (__u64)(le32_to_cpu(raw_inode->i_version_hi))
-+ << 32;
- }
-
- if (S_ISREG(inode->i_mode)) {
-@@ -3043,16 +3043,11 @@ static int ext4_do_update_inode(handle_t
- } else for (block = 0; block < EXT4_N_BLOCKS; block++)
- raw_inode->i_block[block] = ei->i_data[block];
-
-- raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
-+ raw_inode->i_disk_version = cpu_to_le32(ei->i_fs_version);
- if (ei->i_extra_isize) {
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
-- /* in RHEL5 i_version is an unsigned long */
--#if BITS_PER_LONG == 64
-- raw_inode->i_version_hi =
-- cpu_to_le32(inode->i_version >> 32);
--#else
-- raw_inode->i_version_hi = 0;
--#endif
-+ raw_inode->i_version_hi = cpu_to_le32(ei->i_fs_version
-+ >> 32);
- raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
- }
-
-Index: linux-2.6.18-128.1.6/fs/ext4/ext4.h
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/ext4/ext4.h
-+++ linux-2.6.18-128.1.6/fs/ext4/ext4.h
-@@ -21,6 +21,8 @@
- #include <linux/seqlock.h>
- #include <linux/mutex.h>
-
-+#define HAVE_DISK_INODE_VERSION
-+
- /* data type for block offset of block group */
- typedef int ext4_grpblk_t;
-
-@@ -164,6 +166,8 @@ struct ext4_inode_info {
- */
- tid_t i_sync_tid;
- tid_t i_datasync_tid;
-+
-+ __u64 i_fs_version;
- };
-
- /*
-Index: linux-2.6.18-128.1.6/fs/ext4/ialloc.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/ext4/ialloc.c
-+++ linux-2.6.18-128.1.6/fs/ext4/ialloc.c
-@@ -878,6 +878,7 @@ got:
- ei->i_block_alloc_info = NULL;
- ei->i_block_group = group;
- ei->i_last_alloc_group = ~0;
-+ ei->i_fs_version = 0;
-
- ext4_set_inode_flags(inode);
- if (IS_DIRSYNC(inode))
+++ /dev/null
-Index: linux-stage/fs/ext4/ext4_jbd2.h
-===================================================================
---- linux-stage.orig/fs/ext4/ext4_jbd2.h
-+++ linux-stage/fs/ext4/ext4_jbd2.h
-@@ -106,6 +106,80 @@
- #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
- #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
-
-+/**
-+ * struct ext4_journal_cb_entry - Base structure for callback information.
-+ *
-+ * This struct is a 'seed' structure for a using with your own callback
-+ * structs. If you are using callbacks you must allocate one of these
-+ * or another struct of your own definition which has this struct
-+ * as it's first element and pass it to ext4_journal_callback_add().
-+ */
-+struct ext4_journal_cb_entry {
-+ /* list information for other callbacks attached to the same handle */
-+ struct list_head jce_list;
-+
-+ /* Function to call with this callback structure */
-+ void (*jce_func)(struct super_block *sb,
-+ struct ext4_journal_cb_entry *jce, int error);
-+
-+ /* user data goes here */
-+};
-+
-+/**
-+ * ext4_journal_callback_add: add a function to call after transaction commit
-+ * @handle: active journal transaction handle to register callback on
-+ * @func: callback function to call after the transaction has committed:
-+ * @sb: superblock of current filesystem for transaction
-+ * @jce: returned journal callback data
-+ * @rc: journal state at commit (0 = transaction committed properly)
-+ * @jce: journal callback data (internal and function private data struct)
-+ *
-+ * The registered function will be called in the context of the journal thread
-+ * after the transaction for which the handle was created has completed.
-+ *
-+ * No locks are held when the callback function is called, so it is safe to
-+ * call blocking functions from within the callback, but the callback should
-+ * not block or run for too long, or the filesystem will be blocked waiting for
-+ * the next transaction to commit. No journaling functions can be used, or
-+ * there is a risk of deadlock.
-+ *
-+ * There is no guaranteed calling order of multiple registered callbacks on
-+ * the same transaction.
-+ */
-+static inline void ext4_journal_callback_add(handle_t *handle,
-+ void (*func)(struct super_block *sb,
-+ struct ext4_journal_cb_entry *jce,
-+ int rc),
-+ struct ext4_journal_cb_entry *jce)
-+{
-+ struct ext4_sb_info *sbi =
-+ EXT4_SB(handle->h_transaction->t_journal->j_private);
-+
-+ /* Add the jce to transaction's private list */
-+ jce->jce_func = func;
-+ spin_lock(&sbi->s_md_lock);
-+ list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
-+ spin_unlock(&sbi->s_md_lock);
-+}
-+
-+/**
-+ * ext4_journal_callback_del: delete a registered callback
-+ * @handle: active journal transaction handle on which callback was registered
-+ * @jce: registered journal callback entry to unregister
-+ */
-+static inline void ext4_journal_callback_del(handle_t *handle,
-+ struct ext4_journal_cb_entry *jce)
-+{
-+ struct ext4_sb_info *sbi =
-+ EXT4_SB(handle->h_transaction->t_journal->j_private);
-+
-+ spin_lock(&sbi->s_md_lock);
-+ list_del_init(&jce->jce_list);
-+ spin_unlock(&sbi->s_md_lock);
-+}
-+
-+#define HAVE_EXT4_JOURNAL_CALLBACK_ADD
-+
- int
- ext4_mark_iloc_dirty(handle_t *handle,
- struct inode *inode,
-Index: linux-stage/fs/ext4/mballoc.c
-===================================================================
---- linux-stage.orig/fs/ext4/mballoc.c
-+++ linux-stage/fs/ext4/mballoc.c
-@@ -21,6 +21,7 @@
- * mballoc.c contains the multiblocks allocation routines
- */
-
-+#include "ext4_jbd2.h"
- #include "mballoc.h"
- #include <linux/debugfs.h>
-
-@@ -335,14 +336,12 @@
- */
- static struct kmem_cache *ext4_pspace_cachep;
- static struct kmem_cache *ext4_ac_cachep;
--static struct kmem_cache *ext4_free_ext_cachep;
-+static struct kmem_cache *ext4_free_data_cachep;
- static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
- ext4_group_t group);
- static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
- ext4_group_t group);
--static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
--
--
-+static void ext4_free_data_callback(struct super_block *sb, struct ext4_journal_cb_entry *jce, int error);
-
- static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
- {
-@@ -2942,8 +2941,6 @@ int ext4_mb_init(struct super_block *sb,
-
- ext4_mb_history_init(sb);
-
-- if (sbi->s_journal)
-- sbi->s_journal->j_commit_callback = release_blocks_on_commit;
- return 0;
- }
-
-@@ -3032,46 +3029,42 @@ int ext4_mb_release(struct super_block *
- * This function is called by the jbd2 layer once the commit has finished,
- * so we know we can free the blocks that were released with that commit.
- */
--static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
-+static void ext4_free_data_callback(struct super_block *sb,
-+ struct ext4_journal_cb_entry *jce,
-+ int rc)
- {
-- struct super_block *sb = journal->j_private;
-+ struct ext4_free_data *entry = (struct ext4_free_data *)jce;
- struct ext4_buddy e4b;
- struct ext4_group_info *db;
- int err, count = 0, count2 = 0;
-- struct ext4_free_data *entry;
-- struct list_head *l, *ltmp;
-
-- list_for_each_safe(l, ltmp, &txn->t_private_list) {
-- entry = list_entry(l, struct ext4_free_data, list);
--
-- mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
-- entry->count, entry->group, entry);
--
-- err = ext4_mb_load_buddy(sb, entry->group, &e4b);
-- /* we expect to find existing buddy because it's pinned */
-- BUG_ON(err != 0);
--
-- db = e4b.bd_info;
-- /* there are blocks to put in buddy to make them really free */
-- count += entry->count;
-- count2++;
-- ext4_lock_group(sb, entry->group);
-- /* Take it out of per group rb tree */
-- rb_erase(&entry->node, &(db->bb_free_root));
-- mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
--
-- if (!db->bb_free_root.rb_node) {
-- /* No more items in the per group rb tree
-- * balance refcounts from ext4_mb_free_metadata()
-- */
-- page_cache_release(e4b.bd_buddy_page);
-- page_cache_release(e4b.bd_bitmap_page);
-- }
-- ext4_unlock_group(sb, entry->group);
-+ mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
-+ entry->efd_count, entry->efd_group, entry);
-
-- kmem_cache_free(ext4_free_ext_cachep, entry);
-- ext4_mb_unload_buddy(&e4b);
-+ err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
-+ /* we expect to find existing buddy because it's pinned */
-+ BUG_ON(err != 0);
-+
-+ db = e4b.bd_info;
-+ /* there are blocks to put in buddy to make them really free */
-+ count += entry->efd_count;
-+ count2++;
-+ ext4_lock_group(sb, entry->efd_group);
-+ /* Take it out of per group rb tree */
-+ rb_erase(&entry->efd_node, &(db->bb_free_root));
-+ mb_free_blocks(NULL, &e4b, entry->efd_start_blk, entry->efd_count);
-+
-+ if (!db->bb_free_root.rb_node) {
-+ /* No more items in the per group rb tree
-+ * balance refcounts from ext4_mb_free_metadata()
-+ */
-+ page_cache_release(e4b.bd_buddy_page);
-+ page_cache_release(e4b.bd_bitmap_page);
- }
-+ ext4_unlock_group(sb, entry->efd_group);
-+
-+ kmem_cache_free(ext4_free_data_cachep, entry);
-+ ext4_mb_unload_buddy(&e4b);
-
- mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
- }
-@@ -3123,22 +3116,24 @@ int __init init_ext4_mballoc(void)
- kmem_cache_create("ext4_alloc_context",
- sizeof(struct ext4_allocation_context),
- 0, SLAB_RECLAIM_ACCOUNT, NULL, NULL);
-- if (ext4_ac_cachep == NULL) {
-- kmem_cache_destroy(ext4_pspace_cachep);
-- return -ENOMEM;
-- }
-+ if (ext4_ac_cachep == NULL)
-+ goto out_err;
-
-- ext4_free_ext_cachep =
-- kmem_cache_create("ext4_free_block_extents",
-+ ext4_free_data_cachep =
-+ kmem_cache_create("ext4_free_data",
- sizeof(struct ext4_free_data),
- 0, SLAB_RECLAIM_ACCOUNT, NULL, NULL);
-- if (ext4_free_ext_cachep == NULL) {
-- kmem_cache_destroy(ext4_pspace_cachep);
-- kmem_cache_destroy(ext4_ac_cachep);
-- return -ENOMEM;
-- }
-+ if (ext4_free_data_cachep == NULL)
-+ goto out1_err;
-+
- ext4_create_debugfs_entry();
- return 0;
-+
-+out1_err:
-+ kmem_cache_destroy(ext4_ac_cachep);
-+out_err:
-+ kmem_cache_destroy(ext4_pspace_cachep);
-+ return -ENOMEM;
- }
-
- void exit_ext4_mballoc(void)
-@@ -3150,7 +3145,7 @@ void exit_ext4_mballoc(void)
- rcu_barrier();
- kmem_cache_destroy(ext4_pspace_cachep);
- kmem_cache_destroy(ext4_ac_cachep);
-- kmem_cache_destroy(ext4_free_ext_cachep);
-+ kmem_cache_destroy(ext4_free_data_cachep);
- ext4_remove_debugfs_entry();
- }
-
-@@ -3688,8 +3683,8 @@ static void ext4_mb_generate_from_freeli
- n = rb_first(&(grp->bb_free_root));
-
- while (n) {
-- entry = rb_entry(n, struct ext4_free_data, node);
-- mb_set_bits(bitmap, entry->start_blk, entry->count);
-+ entry = rb_entry(n, struct ext4_free_data, efd_node);
-+ mb_set_bits(bitmap, entry->efd_start_blk, entry->efd_count);
- n = rb_next(n);
- }
- return;
-@@ -4974,11 +4969,11 @@ out3:
- * AND the blocks are associated with the same group.
- */
- static int can_merge(struct ext4_free_data *entry1,
-- struct ext4_free_data *entry2)
-+ struct ext4_free_data *entry2)
- {
-- if ((entry1->t_tid == entry2->t_tid) &&
-- (entry1->group == entry2->group) &&
-- ((entry1->start_blk + entry1->count) == entry2->start_blk))
-+ if ((entry1->efd_tid == entry2->efd_tid) &&
-+ (entry1->efd_group == entry2->efd_group) &&
-+ ((entry1->efd_start_blk + entry1->efd_count) == entry2->efd_start_blk))
- return 1;
- return 0;
- }
-@@ -4991,7 +4986,6 @@ ext4_mb_free_metadata(handle_t *handle,
- struct ext4_free_data *entry;
- struct ext4_group_info *db = e4b->bd_info;
- struct super_block *sb = e4b->bd_sb;
-- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct rb_node **n = &db->bb_free_root.rb_node, *node;
- struct rb_node *parent = NULL, *new_node;
-
-@@ -4999,8 +4993,8 @@ ext4_mb_free_metadata(handle_t *handle,
- BUG_ON(e4b->bd_bitmap_page == NULL);
- BUG_ON(e4b->bd_buddy_page == NULL);
-
-- new_node = &new_entry->node;
-- block = new_entry->start_blk;
-+ new_node = &new_entry->efd_node;
-+ block = new_entry->efd_start_blk;
-
- if (!*n) {
- /* first free block exent. We need to
-@@ -5013,15 +5007,15 @@ ext4_mb_free_metadata(handle_t *handle,
- }
- while (*n) {
- parent = *n;
-- entry = rb_entry(parent, struct ext4_free_data, node);
-- if (block < entry->start_blk)
-+ entry = rb_entry(parent, struct ext4_free_data, efd_node);
-+ if (block < entry->efd_start_blk)
- n = &(*n)->rb_left;
-- else if (block >= (entry->start_blk + entry->count))
-+ else if (block >= (entry->efd_start_blk + entry->efd_count))
- n = &(*n)->rb_right;
- else {
- ext4_grp_locked_error(sb, e4b->bd_group, __func__,
- "Double free of blocks %d (%d %d)",
-- block, entry->start_blk, entry->count);
-+ block, entry->efd_start_blk, entry->efd_count);
- return 0;
- }
- }
-@@ -5032,34 +5026,29 @@ ext4_mb_free_metadata(handle_t *handle,
- /* Now try to see the extent can be merged to left and right */
- node = rb_prev(new_node);
- if (node) {
-- entry = rb_entry(node, struct ext4_free_data, node);
-+ entry = rb_entry(node, struct ext4_free_data, efd_node);
- if (can_merge(entry, new_entry)) {
-- new_entry->start_blk = entry->start_blk;
-- new_entry->count += entry->count;
-+ new_entry->efd_start_blk = entry->efd_start_blk;
-+ new_entry->efd_count += entry->efd_count;
- rb_erase(node, &(db->bb_free_root));
-- spin_lock(&sbi->s_md_lock);
-- list_del(&entry->list);
-- spin_unlock(&sbi->s_md_lock);
-- kmem_cache_free(ext4_free_ext_cachep, entry);
-+ ext4_journal_callback_del(handle, &entry->efd_jce);
-+ kmem_cache_free(ext4_free_data_cachep, entry);
- }
- }
-
- node = rb_next(new_node);
- if (node) {
-- entry = rb_entry(node, struct ext4_free_data, node);
-+ entry = rb_entry(node, struct ext4_free_data, efd_node);
- if (can_merge(new_entry, entry)) {
-- new_entry->count += entry->count;
-+ new_entry->efd_count += entry->efd_count;
- rb_erase(node, &(db->bb_free_root));
-- spin_lock(&sbi->s_md_lock);
-- list_del(&entry->list);
-- spin_unlock(&sbi->s_md_lock);
-- kmem_cache_free(ext4_free_ext_cachep, entry);
-+ ext4_journal_callback_del(handle, &entry->efd_jce);
-+ kmem_cache_free(ext4_free_data_cachep, entry);
- }
- }
- /* Add the extent to transaction's private list */
-- spin_lock(&sbi->s_md_lock);
-- list_add(&new_entry->list, &handle->h_transaction->t_private_list);
-- spin_unlock(&sbi->s_md_lock);
-+ ext4_journal_callback_add(handle, ext4_free_data_callback,
-+ &new_entry->efd_jce);
- return 0;
- }
-
-@@ -5191,11 +5180,11 @@ do_more:
- * blocks being freed are metadata. these blocks shouldn't
- * be used until this transaction is committed
- */
-- new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
-- new_entry->start_blk = bit;
-- new_entry->group = block_group;
-- new_entry->count = count;
-- new_entry->t_tid = handle->h_transaction->t_tid;
-+ new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
-+ new_entry->efd_start_blk = bit;
-+ new_entry->efd_group = block_group;
-+ new_entry->efd_count = count;
-+ new_entry->efd_tid = handle->h_transaction->t_tid;
-
- ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, count);
-Index: linux-stage/fs/ext4/mballoc.h
-===================================================================
---- linux-stage.orig/fs/ext4/mballoc.h
-+++ linux-stage/fs/ext4/mballoc.h
-@@ -107,23 +107,24 @@ extern u8 mb_enable_debug;
- */
- #define MB_DEFAULT_GROUP_PREALLOC 512
-
--
- struct ext4_free_data {
-- /* this links the free block information from group_info */
-- struct rb_node node;
-+ /* MUST be the first member */
-+ struct ext4_journal_cb_entry efd_jce;
-
-- /* this links the free block information from ext4_sb_info */
-- struct list_head list;
-+ /* ext4_free_data private data starts from here */
-+
-+ /* this links the free block information from group_info */
-+ struct rb_node efd_node;
-
- /* group which free block extent belongs */
-- ext4_group_t group;
-+ ext4_group_t efd_group;
-
- /* free block extent */
-- ext4_grpblk_t start_blk;
-- ext4_grpblk_t count;
-+ ext4_grpblk_t efd_start_blk;
-+ ext4_grpblk_t efd_count;
-
- /* transaction which freed this extent */
-- tid_t t_tid;
-+ tid_t efd_tid;
- };
-
- struct ext4_prealloc_space {
-Index: linux-stage/fs/ext4/super.c
-===================================================================
---- linux-stage.orig/fs/ext4/super.c
-+++ linux-stage/fs/ext4/super.c
-@@ -304,6 +304,23 @@ void ext4_journal_abort_handle(const cha
-
- EXPORT_SYMBOL(ext4_journal_abort_handle);
-
-+static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
-+{
-+ struct super_block *sb = journal->j_private;
-+ struct ext4_sb_info *sbi = EXT4_SB(sb);
-+ int error = is_journal_aborted(journal);
-+ struct ext4_journal_cb_entry *jce, *tmp;
-+
-+ spin_lock(&sbi->s_md_lock);
-+ list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
-+ list_del_init(&jce->jce_list);
-+ spin_unlock(&sbi->s_md_lock);
-+ jce->jce_func(sb, jce, error);
-+ spin_lock(&sbi->s_md_lock);
-+ }
-+ spin_unlock(&sbi->s_md_lock);
-+}
-+
- /* Deal with the reporting of failure conditions on a filesystem such as
- * inconsistencies detected or read IO failures.
- *
-@@ -2997,6 +3014,8 @@ static int ext4_fill_super(struct super_
- }
- set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
-
-+ sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
-+
- no_journal:
-
- if (test_opt(sb, NOBH)) {
+++ /dev/null
-removes static definition of dx_root struct. so that "." and ".." dirent can
-have extra data. This patch does not change any functionality but is required for
-ext4_data_in_dirent patch.
-
-Index: b/fs/ext4/namei.c
-===================================================================
---- a/fs/ext4/namei.c
-+++ b/fs/ext4/namei.c
-@@ -121,22 +121,13 @@ struct dx_entry
- * hash version mod 4 should never be 0. Sincerely, the paranoia department.
- */
-
--struct dx_root
-+struct dx_root_info
- {
-- struct fake_dirent dot;
-- char dot_name[4];
-- struct fake_dirent dotdot;
-- char dotdot_name[4];
-- struct dx_root_info
-- {
-- __le32 reserved_zero;
-- u8 hash_version;
-- u8 info_length; /* 8 */
-- u8 indirect_levels;
-- u8 unused_flags;
-- }
-- info;
-- struct dx_entry entries[0];
-+ __le32 reserved_zero;
-+ u8 hash_version;
-+ u8 info_length; /* 8 */
-+ u8 indirect_levels;
-+ u8 unused_flags;
- };
-
- struct dx_node
-@@ -225,6 +216,16 @@ ext4_next_entry(struct ext4_dir_entry_2
- * Future: use high four bits of block for coalesce-on-delete flags
- * Mask them off for now.
- */
-+struct dx_root_info * dx_get_dx_info(struct ext4_dir_entry_2 *de)
-+{
-+ /* get dotdot first */
-+ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(1));
-+
-+ /* dx root info is after dotdot entry */
-+ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(2));
-+
-+ return (struct dx_root_info *) de;
-+}
-
- static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
- {
-@@ -378,7 +379,7 @@ dx_probe(struct dentry *dentry, struct i
- {
- unsigned count, indirect;
- struct dx_entry *at, *entries, *p, *q, *m;
-- struct dx_root *root;
-+ struct dx_root_info * info;
- struct buffer_head *bh;
- struct dx_frame *frame = frame_in;
- u32 hash;
-@@ -388,18 +389,19 @@ dx_probe(struct dentry *dentry, struct i
- dir = dentry->d_parent->d_inode;
- if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
- goto fail;
-- root = (struct dx_root *) bh->b_data;
-- if (root->info.hash_version != DX_HASH_TEA &&
-- root->info.hash_version != DX_HASH_HALF_MD4 &&
-- root->info.hash_version != DX_HASH_LEGACY) {
-+
-+ info = dx_get_dx_info((struct ext4_dir_entry_2*)bh->b_data);
-+ if (info->hash_version != DX_HASH_TEA &&
-+ info->hash_version != DX_HASH_HALF_MD4 &&
-+ info->hash_version != DX_HASH_LEGACY) {
- ext4_warning(dir->i_sb, "Unrecognised inode hash code %d"
- "for directory #%lu",
-- root->info.hash_version, dir->i_ino);
-+ info->hash_version, dir->i_ino);
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
- goto fail;
- }
-- hinfo->hash_version = root->info.hash_version;
-+ hinfo->hash_version = info->hash_version;
- if (hinfo->hash_version <= DX_HASH_TEA)
- hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
- hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
-@@ -398,27 +399,26 @@ dx_probe(struct dentry *dentry, struct i
- ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
- hash = hinfo->hash;
-
-- if (root->info.unused_flags & 1) {
-+ if (info->unused_flags & 1) {
- ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
-- root->info.unused_flags);
-+ info->unused_flags);
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
- goto fail;
- }
-
-- if ((indirect = root->info.indirect_levels) > 1) {
-+ if ((indirect = info->indirect_levels) > 1) {
- ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
-- root->info.indirect_levels);
-+ info->indirect_levels);
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
- goto fail;
- }
-
-- entries = (struct dx_entry *) (((char *)&root->info) +
-- root->info.info_length);
-+ entries = (struct dx_entry *) (((char *)info) + info->info_length);
-
- if (dx_get_limit(entries) != dx_root_limit(dir,
-- root->info.info_length)) {
-+ info->info_length)) {
- ext4_warning(dir->i_sb, "dx entry: limit != root limit");
- brelse(bh);
-
-@@ -509,10 +510,12 @@ fail:
-
- static void dx_release (struct dx_frame *frames)
- {
-+ struct dx_root_info *info;
- if (frames[0].bh == NULL)
- return;
-
-- if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
-+ info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data);
-+ if (info->indirect_levels)
- brelse(frames[1].bh);
- brelse(frames[0].bh);
- }
-@@ -1430,17 +1433,16 @@ static int make_indexed_dir(handle_t *ha
- const char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
- struct buffer_head *bh2;
-- struct dx_root *root;
- struct dx_frame frames[2], *frame;
- struct dx_entry *entries;
-- struct ext4_dir_entry_2 *de, *de2;
-+ struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
- char *data1, *top;
- unsigned len;
- int retval;
- unsigned blocksize;
- struct dx_hash_info hinfo;
- ext4_lblk_t block;
-- struct fake_dirent *fde;
-+ struct dx_root_info *dx_info;
-
- blocksize = dir->i_sb->s_blocksize;
- dxtrace(printk("Creating index\n"));
-@@ -1450,7 +1452,6 @@ static int make_indexed_dir(handle_t *ha
- brelse(bh);
- return retval;
- }
-- root = (struct dx_root *) bh->b_data;
-
- bh2 = ext4_append (handle, dir, &block, &retval);
- if (!(bh2)) {
-@@ -1460,18 +1461,20 @@ static int make_indexed_dir(handle_t *ha
- }
- root = (struct dx_root *) bh->b_data;
-
-+ dot_de = (struct ext4_dir_entry_2 *) bh->b_data;
-+ dotdot_de = ext4_next_entry(dot_de, blocksize);
-+
- /* The 0th block becomes the root, move the dirents out */
-- fde = &root->dotdot;
-- de = (struct ext4_dir_entry_2 *)((char *)fde +
-- ext4_rec_len_from_disk(fde->rec_len, blocksize));
-+ de = (struct ext4_dir_entry_2 *)((char *)dotdot_de +
-+ ext4_rec_len_from_disk(dotdot_de->rec_len, blocksize));
-- if ((char *) de >= (((char *) root) + blocksize)) {
-+ if ((char *) de >= (((char *) dot_de) + blocksize)) {
- ext4_error(dir->i_sb,
- "invalid rec_len for '..' in inode %lu",
- dir->i_ino);
- brelse(bh);
- return -EIO;
- }
-- len = ((char *) root) + blocksize - (char *) de;
-+ len = ((char *) dot_de) + blocksize - (char *) de;
-
- /* Allocate new block for the 0th block's dirents */
- bh2 = ext4_append(handle, dir, &block, &retval);
-@@ -1472,19 +1475,23 @@ static int make_indexed_dir(handle_t *ha
- de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
- blocksize);
- /* Initialize the root; the dot dirents already exist */
-- de = (struct ext4_dir_entry_2 *) (&root->dotdot);
-- de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
-- blocksize);
-- memset (&root->info, 0, sizeof(root->info));
-- root->info.info_length = sizeof(root->info);
-- root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
-- entries = root->entries;
-- dx_set_block(entries, 1);
-- dx_set_count(entries, 1);
-- dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
-+ dotdot_de->rec_len = ext4_rec_len_to_disk(blocksize -
-+ le16_to_cpu(dot_de->rec_len), blocksize);
-+
-+ /* initialize hashing info */
-+ dx_info = dx_get_dx_info(dot_de);
-+ memset (dx_info, 0, sizeof(*dx_info));
-+ dx_info->info_length = sizeof(*dx_info);
-+ dx_info->hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
-+
-+ entries = (void *)dx_info + sizeof(*dx_info);
-+
-+ dx_set_block(entries, 1);
-+ dx_set_count(entries, 1);
-+ dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info)));
-
- /* Initialize as for dx_probe */
-- hinfo.hash_version = root->info.hash_version;
-+ hinfo.hash_version = dx_info->hash_version;
- if (hinfo.hash_version <= DX_HASH_TEA)
- hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
- hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
-@@ -1724,6 +1733,7 @@ static int ext4_dx_add_entry(handle_t *h
- goto journal_error;
- brelse (bh2);
- } else {
-+ struct dx_root_info * info;
- dxtrace(printk(KERN_DEBUG
- "Creating second level index...\n"));
- memcpy((char *) entries2, (char *) entries,
-@@ -1732,7 +1742,9 @@ static int ext4_dx_add_entry(handle_t *h
- /* Set up root */
- dx_set_count(entries, 1);
- dx_set_block(entries + 0, newblock);
-- ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
-+ info = dx_get_dx_info((struct ext4_dir_entry_2*)
-+ frames[0].bh->b_data);
-+ info->indirect_levels = 1;
-
- /* Add new access path frame */
- frame = frames + 1;
+++ /dev/null
-diff -rupN linux-2.6.18-164.6.1_1//fs/ext4/ialloc.c linux-2.6.18-164.6.1_2//fs/ext4/ialloc.c
---- linux-2.6.18-164.6.1_1//fs/ext4/ialloc.c 2010-03-31 17:42:50.000000000 +0530
-+++ linux-2.6.18-164.6.1_2//fs/ext4/ialloc.c 2010-03-31 17:43:22.000000000 +0530
-@@ -622,11 +622,14 @@ struct inode *ext4_new_inode_goal(handle
- sb = dir->i_sb;
- trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id,
- dir->i_ino, mode);
-+ sbi = EXT4_SB(sb);
-+ if (sbi->s_max_dir_size > 0 && i_size_read(dir) >= sbi->s_max_dir_size)
-+ return ERR_PTR(-EFBIG);
-+
- inode = new_inode(sb);
- if (!inode)
- return ERR_PTR(-ENOMEM);
- ei = EXT4_I(inode);
-- sbi = EXT4_SB(sb);
-
- if (goal)
- goal = sbi->s_inode_goal;
-diff -rupN linux-2.6.18-164.6.1_1//fs/ext4/super.c linux-2.6.18-164.6.1_2//fs/ext4/super.c
---- linux-2.6.18-164.6.1_1//fs/ext4/super.c 2010-03-31 17:42:50.000000000 +0530
-+++ linux-2.6.18-164.6.1_2//fs/ext4/super.c 2010-03-31 17:45:32.000000000 +0530
-@@ -40,6 +40,7 @@
- EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
- EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
- EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
-+EXT4_RW_ATTR_SBI_UI(max_dir_size, s_max_dir_size);
-
- static struct attribute *ext4_attrs[] = {
- ATTR_LIST(delayed_allocation_blocks),
-@@ -48,6 +48,7 @@
- ATTR_LIST(mb_order2_req),
- ATTR_LIST(mb_stream_req),
- ATTR_LIST(mb_group_prealloc),
-+ ATTR_LIST(max_dir_size),
- NULL,
- };
-
-diff -rupN linux-2.6.18-164.6.1_1//fs/ext4/ext4_sb.h linux-2.6.18-164.6.1_2//fs/ext4/ext4_sb.h
---- linux-2.6.18-164.6.1_1//fs/ext4/ext4.h 2010-03-31 17:42:50.000000000 +0530
-+++ linux-2.6.18-164.6.1_2//fs/ext4/ext4.h 2010-03-31 17:43:22.000000000 +0530
-@@ -119,6 +119,7 @@ struct ext4_sb_info {
- /* where last allocation was done - for stream allocation */
- unsigned long s_mb_last_group;
- unsigned long s_mb_last_start;
-+ unsigned long s_max_dir_size;
-
- /* history to debug policy */
- struct ext4_mb_history *s_mb_history;
+++ /dev/null
-diff -rupN linux-2.6.18-128.1.6_1//fs/ext4/mballoc.c linux-2.6.18-128.1.6_2//fs/ext4/mballoc.c
---- linux-2.6.18-128.1.6_1//fs/ext4/mballoc.c
-+++ linux-2.6.18-128.1.6_2//fs/ext4/mballoc.c
-@@ -360,8 +360,8 @@ static void ext4_mb_mark_free_simple(str
- static struct kmem_cache *ext4_pspace_cachep;
- static struct kmem_cache *ext4_ac_cachep;
- static struct kmem_cache *ext4_free_ext_cachep;
--static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
-- ext4_group_t group);
-+static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
-+ ext4_group_t group);
- static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
- ext4_group_t group);
-
-@@ -660,7 +660,7 @@ static void ext4_mb_mark_free_simple(str
- }
-
- static noinline_for_stack
--void ext4_mb_generate_buddy(struct super_block *sb,
-+int ext4_mb_generate_buddy(struct super_block *sb,
- void *buddy, void *bitmap, ext4_group_t group)
- {
- struct ext4_group_info *grp = ext4_get_group_info(sb, group);
-@@ -692,14 +692,14 @@ static void ext4_mb_generate_buddy(struc
- grp->bb_fragments = fragments;
-
- if (free != grp->bb_free) {
-- ext4_grp_locked_error(sb, group, __func__,
-- "EXT4-fs: group %u: %u blocks in bitmap, %u in gd",
-- group, free, grp->bb_free);
-- /*
-- * If we intent to continue, we consider group descritor
-- * corrupt and update bb_free using bitmap value
-- */
-- grp->bb_free = free;
-+ struct ext4_group_desc *gdp;
-+ gdp = ext4_get_group_desc (sb, group, NULL);
-+ ext4_error(sb,
-+ "group %lu: %u blocks in bitmap, %u in bb, "
-+ "%u in gd, %lu pa's\n", (long unsigned int)group,
-+ free, grp->bb_free, ext4_free_blks_count(sb, gdp),
-+ grp->bb_prealloc_nr);
-+ return -EIO;
- }
-
- clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
-@@ -709,6 +709,8 @@ static void ext4_mb_generate_buddy(struc
- EXT4_SB(sb)->s_mb_buddies_generated++;
- EXT4_SB(sb)->s_mb_generation_time += period;
- spin_unlock(&EXT4_SB(sb)->s_bal_lock);
-+
-+ return 0;
- }
-
- /* The buddy information is attached the buddy cache inode
-@@ -814,7 +816,7 @@ static int ext4_mb_init_cache(struct pag
- first_block = page->index * blocks_per_page;
- /* init the page */
- memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
-- for (i = 0; i < blocks_per_page; i++) {
-+ for (i = 0; i < blocks_per_page && err == 0; i++) {
- int group;
- struct ext4_group_info *grinfo;
-
-@@ -848,7 +850,7 @@ static int ext4_mb_init_cache(struct pag
- * incore got set to the group block bitmap below
- */
- ext4_lock_group(sb, group);
-- ext4_mb_generate_buddy(sb, data, incore, group);
-+ err = ext4_mb_generate_buddy(sb, data, incore, group);
- ext4_unlock_group(sb, group);
- incore = NULL;
- } else {
-@@ -861,7 +863,7 @@ static int ext4_mb_init_cache(struct pag
- memcpy(data, bitmap, blocksize);
-
- /* mark all preallocated blks used in in-core bitmap */
-- ext4_mb_generate_from_pa(sb, data, group);
-+ err = ext4_mb_generate_from_pa(sb, data, group);
- ext4_mb_generate_from_freelist(sb, data, group);
- ext4_unlock_group(sb, group);
-
-@@ -870,6 +872,7 @@ static int ext4_mb_init_cache(struct pag
- incore = data;
- }
- }
-+ if (likely(err == 0))
- SetPageUptodate(page);
-
- out:
-@@ -1964,7 +1967,10 @@ static int ext4_mb_seq_history_show(stru
- hs->result.fe_start, hs->result.fe_len);
- seq_printf(seq, "%-5u %-8u %-23s free\n",
- hs->pid, hs->ino, buf2);
-+ } else {
-+ seq_printf(seq, "unknown op %d\n", hs->op);
- }
-+
- return 0;
- }
-
-@@ -2092,9 +2098,11 @@ static void *ext4_mb_seq_groups_next(str
- static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
- {
- struct super_block *sb = seq->private;
-+ struct ext4_group_desc *gdp;
- ext4_group_t group = (ext4_group_t) ((unsigned long) v);
- int i;
- int err;
-+ int free = 0;
- struct ext4_buddy e4b;
- struct sg {
- struct ext4_group_info info;
-@@ -2103,10 +2111,10 @@ static int ext4_mb_seq_groups_show(struc
-
- group--;
- if (group == 0)
-- seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
-+ seq_printf(seq, "#%-5s: %-5s %-5s %-5s %-5s %-5s"
- "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
- "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
-- "group", "free", "frags", "first",
-+ "group", "free", "frags", "first", "first", "pa",
- "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
- "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
-
-@@ -2117,13 +2125,20 @@ static int ext4_mb_seq_groups_show(struc
- seq_printf(seq, "#%-5lu: I/O error\n", group);
- return 0;
- }
-+
-+ gdp = ext4_get_group_desc(sb, group, NULL);
-+ if (gdp != NULL)
-+ free = ext4_free_blks_count(sb, gdp);
-+
- ext4_lock_group(sb, group);
- memcpy(&sg, ext4_get_group_info(sb, group), i);
- ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
-
-- seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
-- sg.info.bb_fragments, sg.info.bb_first_free);
-+ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u %-5u %-5lu [",
-+ (long unsigned int)group, sg.info.bb_free, free,
-+ sg.info.bb_fragments, sg.info.bb_first_free,
-+ sg.info.bb_prealloc_nr);
- for (i = 0; i <= 13; i++)
- seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
- sg.info.bb_counters[i] : 0);
-@@ -2226,6 +2241,7 @@ ext4_mb_store_history(struct ext4_alloca
- h.tail = ac->ac_tail;
- h.buddy = ac->ac_buddy;
- h.merged = 0;
-+ h.cr = ac->ac_criteria;
- if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
- if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
- ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
-@@ -3539,23 +3555,68 @@ ext4_mb_use_preallocated(struct ext4_all
- }
-
- /*
-+ * check free blocks in bitmap match free block in group descriptor
-+ * do this before taking preallocated blocks into account to be able
-+ * to detect on-disk corruptions. The group lock should be hold by the
-+ * caller.
-+ */
-+int ext4_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap,
-+ struct ext4_group_desc *gdp, int group)
-+{
-+ unsigned short max = EXT4_BLOCKS_PER_GROUP(sb);
-+ unsigned short i, first, free = 0;
-+
-+ i = mb_find_next_zero_bit(bitmap, max, 0);
-+
-+ while (i < max) {
-+ first = i;
-+ i = mb_find_next_bit(bitmap, max, i);
-+ if (i > max)
-+ i = max;
-+ free += i - first;
-+ if (i < max)
-+ i = mb_find_next_zero_bit(bitmap, max, i);
-+ }
-+
-+ if (free != ext4_free_blks_count(sb, gdp)) {
-+ ext4_error(sb, "on-disk bitmap for group %d"
-+ "corrupted: %u blocks free in bitmap, %u - in gd\n",
-+ group, free, ext4_free_blks_count(sb, gdp));
-+ return -EIO;
-+ }
-+ return 0;
-+}
-+
-+/*
- * the function goes through all preallocation in this group and marks them
- * used in in-core bitmap. buddy must be generated from this bitmap
- * Need to be called with ext4 group lock held
- */
- static noinline_for_stack
--void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
-+int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
- ext4_group_t group)
- {
- struct ext4_group_info *grp = ext4_get_group_info(sb, group);
- struct ext4_prealloc_space *pa;
-+ struct ext4_group_desc *gdp;
- struct list_head *cur;
- ext4_group_t groupnr;
- ext4_grpblk_t start;
- int preallocated = 0;
- int count = 0;
-+ int skip = 0;
-+ int err;
- int len;
-
-+ gdp = ext4_get_group_desc (sb, group, NULL);
-+ if (gdp == NULL)
-+ return -EIO;
-+
-+ /* before applying preallocations, check bitmap consistency */
-+ err = ext4_mb_check_ondisk_bitmap(sb, bitmap, gdp, group);
-+ if (err)
-+ return err;
-+
- /* all form of preallocation discards first load group,
- * so the only competing code is preallocation use.
- * we don't need any locking here
-@@ -3570,14 +3631,23 @@ static void ext4_mb_generate_from_pa(str
- &groupnr, &start);
- len = pa->pa_len;
- spin_unlock(&pa->pa_lock);
-- if (unlikely(len == 0))
-+ if (unlikely(len == 0)) {
-+ skip++;
- continue;
-+ }
- BUG_ON(groupnr != group);
- mb_set_bits(bitmap, start, len);
- preallocated += len;
- count++;
- }
-+ if (count + skip != grp->bb_prealloc_nr) {
-+ ext4_error(sb, "lost preallocations: "
-+ "count %d, bb_prealloc_nr %lu, skip %d\n",
-+ count, grp->bb_prealloc_nr, skip);
-+ return -EIO;
-+ }
- mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
-+ return 0;
- }
-
- static void ext4_mb_pa_callback(struct rcu_head *head)
-@@ -3629,6 +3699,7 @@ static void ext4_mb_put_pa(struct ext4_a
- */
- ext4_lock_group(sb, grp);
- list_del(&pa->pa_group_list);
-+ ext4_get_group_info(sb, grp)->bb_prealloc_nr--;
- ext4_unlock_group(sb, grp);
-
- spin_lock(pa->pa_obj_lock);
-@@ -3717,6 +3788,7 @@ ext4_mb_new_inode_pa(struct ext4_allocat
-
- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
- list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
-+ grp->bb_prealloc_nr++;
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
-
- spin_lock(pa->pa_obj_lock);
-@@ -3776,6 +3848,7 @@ ext4_mb_new_group_pa(struct ext4_allocat
-
- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
- list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
-+ grp->bb_prealloc_nr++;
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
-
- /*
-@@ -3828,6 +3901,7 @@ ext4_mb_release_inode_pa(struct ext4_bud
- ac->ac_sb = sb;
- ac->ac_inode = pa->pa_inode;
- ac->ac_op = EXT4_MB_HISTORY_DISCARD;
-+ ac->ac_o_ex.fe_len = 1;
- }
-
- while (bit < end) {
-@@ -3972,6 +4046,8 @@ repeat:
-
- spin_unlock(&pa->pa_lock);
-
-+ BUG_ON(grp->bb_prealloc_nr == 0);
-+ grp->bb_prealloc_nr--;
- list_del(&pa->pa_group_list);
- list_add(&pa->u.pa_tmp_list, &list);
- }
-@@ -4107,7 +4183,7 @@ repeat:
- if (err) {
- ext4_error(sb, "Error loading buddy information for %u",
- group);
-- continue;
-+ return;
- }
-
- bitmap_bh = ext4_read_block_bitmap(sb, group);
-@@ -4119,6 +4195,8 @@ repeat:
- }
-
- ext4_lock_group(sb, group);
-+ BUG_ON(e4b.bd_info->bb_prealloc_nr == 0);
-+ e4b.bd_info->bb_prealloc_nr--;
- list_del(&pa->pa_group_list);
- ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
- ext4_unlock_group(sb, group);
-@@ -4394,6 +4472,7 @@ ext4_mb_discard_lg_preallocations(struct
- }
- ext4_lock_group(sb, group);
- list_del(&pa->pa_group_list);
-+ ext4_get_group_info(sb, group)->bb_prealloc_nr--;
- ext4_mb_release_group_pa(&e4b, pa, ac);
- ext4_unlock_group(sb, group);
-
-diff -rupN linux-2.6.18-128.1.6/fs/ext4/ext4.h
---- linux-2.6.18-128.1.6.orig/fs/ext4/ext4.h
-+++ linux-2.6.18-128.1.6/fs/ext4/ext4.h
-@@ -119,6 +119,7 @@ struct ext4_group_info {
- unsigned short bb_free;
- unsigned short bb_fragments;
- struct list_head bb_prealloc_list;
-+ unsigned long bb_prealloc_nr;
- #ifdef DOUBLE_CHECK
- void *bb_bitmap;
- #endif
-Index: linux-2.6.18-128.1.6/fs/ext4/mballoc.h
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/ext4/mballoc.h
-+++ linux-2.6.18-128.1.6/fs/ext4/mballoc.h
-@@ -92,7 +92,7 @@
- /*
- * for which requests use 2^N search using buddies
- */
--#define MB_DEFAULT_ORDER2_REQS 2
-+#define MB_DEFAULT_ORDER2_REQS 8
-
- /*
- * default group prealloc size 512 blocks
-@@ -228,7 +229,7 @@ struct ext4_mb_history {
- __u16 tail; /* what tail broke some buddy */
- __u16 buddy; /* buddy the tail ^^^ broke */
- __u16 flags;
-- __u8 cr:3; /* which phase the result extent was found at */
-+ __u8 cr:8; /* which phase the result extent was found at */
- __u8 op:4;
- __u8 merged:1;
- };
+++ /dev/null
-commit 8a57d9d61a6e361c7bb159dda797672c1df1a691
-Author: Curt Wohlgemuth <curtw@google.com>
-Date: Sun May 16 15:00:00 2010 -0400
-
- ext4: check for a good block group before loading buddy pages
-
- This adds a new field in ext4_group_info to cache the largest available
- block range in a block group; and don't load the buddy pages until *after*
- we've done a sanity check on the block group.
-
- With large allocation requests (e.g., fallocate(), 8MiB) and relatively full
- partitions, it's easy to have no block groups with a block extent large
- enough to satisfy the input request length. This currently causes the loop
- during cr == 0 in ext4_mb_regular_allocator() to load the buddy bitmap pages
- for EVERY block group. That can be a lot of pages. The patch below allows
- us to call ext4_mb_good_group() BEFORE we load the buddy pages (although we
- have check again after we lock the block group).
-
- Addresses-Google-Bug: #2578108
- Addresses-Google-Bug: #2704453
-
- Signed-off-by: Curt Wohlgemuth <curtw@google.com>
- Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-
-Index: linux-2.6.32/fs/ext4/ext4.h
-===================================================================
---- linux-2.6.32.orig/fs/ext4/ext4.h 2009-12-02 20:51:21.000000000 -0700
-+++ linux-2.6.32/fs/ext4/ext4.h 2011-02-17 23:54:52.708097710 -0700
-@@ -1625,6 +1625,7 @@ struct ext4_group_info {
- ext4_grpblk_t bb_first_free; /* first free block */
- ext4_grpblk_t bb_free; /* total free blocks */
- ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
-+ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
- struct list_head bb_prealloc_list;
- #ifdef DOUBLE_CHECK
- void *bb_bitmap;
-Index: linux-2.6.32/fs/ext4/mballoc.c
-===================================================================
---- linux-2.6.32.orig/fs/ext4/mballoc.c 2009-12-02 20:51:21.000000000 -0700
-+++ linux-2.6.32/fs/ext4/mballoc.c 2011-02-18 00:41:06.872097644 -0700
-@@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(str
- }
- }
-
-+/*
-+ * Cache the order of the largest free extent we have available in this block
-+ * group.
-+ */
-+static void
-+mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
-+{
-+ int i;
-+ int bits;
-+
-+ grp->bb_largest_free_order = -1; /* uninit */
-+
-+ bits = sb->s_blocksize_bits + 1;
-+ for (i = bits; i >= 0; i--) {
-+ if (grp->bb_counters[i] > 0) {
-+ grp->bb_largest_free_order = i;
-+ break;
-+ }
-+ }
-+}
-+
- static noinline_for_stack
- void ext4_mb_generate_buddy(struct super_block *sb,
- void *buddy, void *bitmap, ext4_group_t group)
-@@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super
- */
- grp->bb_free = free;
- }
-+ mb_set_largest_free_order(sb, grp);
-
- clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
-
-@@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super
- * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
- * So it can have information regarding groups_per_page which
- * is blocks_per_page/2
-+ *
-+ * Locking note: This routine takes the block group lock of all groups
-+ * for this page; do not hold this lock when calling this routine!
- */
-
- static int ext4_mb_init_cache(struct page *page, char *incore)
-@@ -910,6 +935,11 @@ out:
- return err;
- }
-
-+/*
-+ * Locking note: This routine calls ext4_mb_init_cache(), which takes the
-+ * block group lock of all groups for this page; do not hold the BG lock when
-+ * calling this routine!
-+ */
- static noinline_for_stack
- int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
- {
-@@ -1004,6 +1034,11 @@ err:
- return ret;
- }
-
-+/*
-+ * Locking note: This routine calls ext4_mb_init_cache(), which takes the
-+ * block group lock of all groups for this page; do not hold the BG lock when
-+ * calling this routine!
-+ */
- static noinline_for_stack int
- ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
- struct ext4_buddy *e4b)
-@@ -1150,7 +1185,7 @@ err:
- return ret;
- }
-
--static void ext4_mb_release_desc(struct ext4_buddy *e4b)
-+static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
- {
- if (e4b->bd_bitmap_page)
- page_cache_release(e4b->bd_bitmap_page);
-@@ -1300,6 +1335,7 @@ static void mb_free_blocks(struct inode
- buddy = buddy2;
- } while (1);
- }
-+ mb_set_largest_free_order(sb, e4b->bd_info);
- mb_check_buddy(e4b);
- }
-
-@@ -1428,6 +1464,7 @@ static int mb_mark_used(struct ext4_budd
- e4b->bd_info->bb_counters[ord]++;
- e4b->bd_info->bb_counters[ord]++;
- }
-+ mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
-
- mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
- mb_check_buddy(e4b);
-@@ -1618,7 +1655,7 @@ int ext4_mb_try_best_found(struct ext4_a
- }
-
- ext4_unlock_group(ac->ac_sb, group);
-- ext4_mb_release_desc(e4b);
-+ ext4_mb_unload_buddy(e4b);
-
- return 0;
- }
-@@ -1674,7 +1711,7 @@ int ext4_mb_find_by_goal(struct ext4_all
- ext4_mb_use_best_found(ac, e4b);
- }
- ext4_unlock_group(ac->ac_sb, group);
-- ext4_mb_release_desc(e4b);
-+ ext4_mb_unload_buddy(e4b);
-
- return 0;
- }
-@@ -1823,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_al
- }
- }
-
-+/* This is now called BEFORE we load the buddy bitmap. */
- static int ext4_mb_good_group(struct ext4_allocation_context *ac,
- ext4_group_t group, int cr)
- {
- unsigned free, fragments;
-- unsigned i, bits;
- int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
- struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
-
- BUG_ON(cr < 0 || cr >= 4);
-- BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
-+
-+ /* We only do this if the grp has never been initialized */
-+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
-+ int ret = ext4_mb_init_group(ac->ac_sb, group);
-+ if (ret)
-+ return 0;
-+ }
-
- free = grp->bb_free;
- fragments = grp->bb_fragments;
-@@ -1845,17 +1888,16 @@ static int ext4_mb_good_group(struct ext
- case 0:
- BUG_ON(ac->ac_2order == 0);
-
-+ if (grp->bb_largest_free_order < ac->ac_2order)
-+ return 0;
-+
- /* Avoid using the first bg of a flexgroup for data files */
- if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
- (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
- ((group % flex_size) == 0))
- return 0;
-
-- bits = ac->ac_sb->s_blocksize_bits + 1;
-- for (i = ac->ac_2order; i <= bits; i++)
-- if (grp->bb_counters[i] > 0)
-- return 1;
-- break;
-+ return 1;
- case 1:
- if ((free / fragments) >= ac->ac_g_ex.fe_len)
- return 1;
-@@ -2026,15 +2068,11 @@ repeat:
- group = ac->ac_g_ex.fe_group;
-
- for (i = 0; i < ngroups; group++, i++) {
-- struct ext4_group_info *grp;
-- struct ext4_group_desc *desc;
--
- if (group == ngroups)
- group = 0;
-
-- /* quick check to skip empty groups */
-- grp = ext4_get_group_info(sb, group);
-- if (grp->bb_free == 0)
-+ /* This now checks without needing the buddy page */
-+ if (!ext4_mb_good_group(ac, group, cr))
- continue;
-
- err = ext4_mb_load_buddy(sb, group, &e4b);
-@@ -2042,15 +2080,18 @@ repeat:
- goto out;
-
- ext4_lock_group(sb, group);
-+
-+ /*
-+ * We need to check again after locking the
-+ * block group
-+ */
- if (!ext4_mb_good_group(ac, group, cr)) {
-- /* someone did allocation from this group */
- ext4_unlock_group(sb, group);
-- ext4_mb_release_desc(&e4b);
-+ ext4_mb_unload_buddy(&e4b);
- continue;
- }
-
- ac->ac_groups_scanned++;
-- desc = ext4_get_group_desc(sb, group, NULL);
- if (cr == 0)
- ext4_mb_simple_scan_group(ac, &e4b);
- else if (cr == 1 &&
-@@ -2060,7 +2101,7 @@ repeat:
- ext4_mb_complex_scan_group(ac, &e4b);
-
- ext4_unlock_group(sb, group);
-- ext4_mb_release_desc(&e4b);
-+ ext4_mb_unload_buddy(&e4b);
-
- if (ac->ac_status != AC_STATUS_CONTINUE)
- break;
-@@ -2150,7 +2191,7 @@ static int ext4_mb_seq_groups_show(struc
- ext4_lock_group(sb, group);
- memcpy(&sg, ext4_get_group_info(sb, group), i);
- ext4_unlock_group(sb, group);
-- ext4_mb_release_desc(&e4b);
-+ ext4_mb_unload_buddy(&e4b);
-
- seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
- sg.info.bb_fragments, sg.info.bb_first_free);
-@@ -2257,6 +2298,7 @@ int ext4_mb_add_groupinfo(struct super_b
- INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
- init_rwsem(&meta_group_info[i]->alloc_sem);
- meta_group_info[i]->bb_free_root = RB_ROOT;
-+ meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
-
- #ifdef DOUBLE_CHECK
- {
-@@ -2567,7 +2609,7 @@ static void release_blocks_on_commit(jou
- sb_issue_discard(sb, discard_block, entry->count);
-
- kmem_cache_free(ext4_free_ext_cachep, entry);
-- ext4_mb_release_desc(&e4b);
-+ ext4_mb_unload_buddy(&e4b);
- }
-
- mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
-@@ -3692,7 +3734,7 @@ out:
- ext4_unlock_group(sb, group);
- if (ac)
- kmem_cache_free(ext4_ac_cachep, ac);
-- ext4_mb_release_desc(&e4b);
-+ ext4_mb_unload_buddy(&e4b);
- put_bh(bitmap_bh);
- return free;
- }
-@@ -3796,7 +3838,7 @@ repeat:
- if (bitmap_bh == NULL) {
- ext4_error(sb, "Error reading block bitmap for %u",
- group);
-- ext4_mb_release_desc(&e4b);
-+ ext4_mb_unload_buddy(&e4b);
- continue;
- }
-
-@@ -3805,7 +3847,7 @@ repeat:
- ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
- ext4_unlock_group(sb, group);
-
-- ext4_mb_release_desc(&e4b);
-+ ext4_mb_unload_buddy(&e4b);
- put_bh(bitmap_bh);
-
- list_del(&pa->u.pa_tmp_list);
-@@ -4069,7 +4111,7 @@ ext4_mb_discard_lg_preallocations(struct
- ext4_mb_release_group_pa(&e4b, pa, ac);
- ext4_unlock_group(sb, group);
-
-- ext4_mb_release_desc(&e4b);
-+ ext4_mb_unload_buddy(&e4b);
- list_del(&pa->u.pa_tmp_list);
- call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
- }
-@@ -4570,7 +4612,7 @@ do_more:
- atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
- }
-
-- ext4_mb_release_desc(&e4b);
-+ ext4_mb_unload_buddy(&e4b);
-
- *freed += count;
-
+++ /dev/null
-Index: linux-stage/fs/ext4/mballoc.c
-===================================================================
---- linux-stage.orig/fs/ext4/mballoc.c 2010-01-26 22:50:37.000000000 +0800
-+++ linux-stage/fs/ext4/mballoc.c 2010-01-26 22:57:24.000000000 +0800
-@@ -3892,6 +3892,7 @@
- INIT_LIST_HEAD(&pa->pa_group_list);
- pa->pa_deleted = 0;
- pa->pa_type = MB_INODE_PA;
-+ pa->pa_error = 0;
-
- mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
- pa->pa_pstart, pa->pa_len, pa->pa_lstart);
-@@ -3956,6 +3957,7 @@
- INIT_LIST_HEAD(&pa->pa_group_list);
- pa->pa_deleted = 0;
- pa->pa_type = MB_GROUP_PA;
-+ pa->pa_error = 0;
-
- mb_debug("new group pa %p: %llu/%u for %u\n", pa,
- pa->pa_pstart, pa->pa_len, pa->pa_lstart);
-@@ -4019,7 +4021,9 @@
- int err = 0;
- int free = 0;
-
-+ assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
- BUG_ON(pa->pa_deleted == 0);
-+ BUG_ON(pa->pa_inode == NULL);
- ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
- grp_blk_start = pa->pa_pstart - bit;
- BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
-@@ -4059,11 +4064,18 @@
- mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
- bit = next + 1;
- }
-- if (free != pa->pa_free) {
-- printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n",
-- pa, (unsigned long) pa->pa_lstart,
-- (unsigned long) pa->pa_pstart,
-- (unsigned long) pa->pa_len);
-+
-+ /* "free < pa->pa_free" means we maybe double alloc the same blocks,
-+ * otherwise maybe leave some free blocks unavailable, no need to BUG.*/
-+ if ((free > pa->pa_free && !pa->pa_error) || (free < pa->pa_free)) {
-+ ext4_error(sb,"pa free mismatch: [pa %p] "
-+ "[phy %lu] [logic %lu] [len %u] [free %u] "
-+ "[error %u] [inode %lu] [freed %u]", pa,
-+ (unsigned long)pa->pa_pstart,
-+ (unsigned long)pa->pa_lstart,
-+ (unsigned)pa->pa_len, (unsigned)pa->pa_free,
-+ (unsigned)pa->pa_error, pa->pa_inode->i_ino,
-+ free);
- ext4_grp_locked_error(sb, group,
- __func__, "free %u, pa_free %u",
- free, pa->pa_free);
-@@ -4072,6 +4084,7 @@
- * from the bitmap and continue.
- */
- }
-+ BUG_ON(pa->pa_free != free);
- atomic_add(free, &sbi->s_mb_discarded);
-
- return err;
-@@ -4832,6 +4863,25 @@
- ac->ac_b_ex.fe_len = 0;
- ar->len = 0;
- ext4_mb_show_ac(ac);
-+ if (ac->ac_pa) {
-+ struct ext4_prealloc_space *pa = ac->ac_pa;
-+
-+ /* We can not make sure whether the bitmap has
-+ * been updated or not when fail case. So can
-+ * not revert pa_free back, just mark pa_error*/
-+ pa->pa_error++;
-+ ext4_error(sb,
-+ "Updating bitmap error: [err %d] "
-+ "[pa %p] [phy %lu] [logic %lu] "
-+ "[len %u] [free %u] [error %u] "
-+ "[inode %lu]", *errp, pa,
-+ (unsigned long)pa->pa_pstart,
-+ (unsigned long)pa->pa_lstart,
-+ (unsigned)pa->pa_len,
-+ (unsigned)pa->pa_free,
-+ (unsigned)pa->pa_error,
-+ pa->pa_inode ? pa->pa_inode->i_ino : 0);
-+ }
- } else {
- block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
- ar->len = ac->ac_b_ex.fe_len;
-Index: linux-stage/fs/ext4/mballoc.h
-===================================================================
---- linux-stage.orig/fs/ext4/mballoc.h 2010-01-26 22:50:36.000000000 +0800
-+++ linux-stage/fs/ext4/mballoc.h 2010-01-26 22:52:58.000000000 +0800
-@@ -21,6 +21,7 @@
- #include <linux/blkdev.h>
- #include <linux/marker.h>
- #include <linux/mutex.h>
-+#include <linux/genhd.h>
- #include "ext4_jbd2.h"
- #include "ext4.h"
- #include "group.h"
-@@ -134,6 +135,7 @@
- ext4_grpblk_t pa_len; /* len of preallocated chunk */
- ext4_grpblk_t pa_free; /* how many blocks are free */
- unsigned short pa_type; /* pa type. inode or group */
-+ unsigned short pa_error;
- spinlock_t *pa_obj_lock;
- struct inode *pa_inode; /* hack, for history only */
- };
+++ /dev/null
-Index: linux-stage/fs/ext4/ext4_jbd2.h
-===================================================================
---- linux-stage.orig/fs/ext4/ext4_jbd2.h
-+++ linux-stage/fs/ext4/ext4_jbd2.h
-@@ -35,6 +35,8 @@
- (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
- ? 27U : 8U)
-
-+#define ext4_journal_dirty_metadata(handle, bh) \
-+ ext4_handle_dirty_metadata(handle, NULL, bh)
- /* Extended attribute operations touch at most two data buffers,
- * two bitmap buffers, and two group summaries, in addition to the inode
- * and the superblock, which are already accounted for. */
-Index: linux-stage/fs/ext4/extents.c
-===================================================================
---- linux-stage.orig/fs/ext4/extents.c
-+++ linux-stage/fs/ext4/extents.c
-@@ -59,6 +59,17 @@ ext4_fsblk_t ext_pblock(struct ext4_exte
- }
-
- /*
-+ * ext4_ext_store_pblock:
-+ * stores a large physical block number into an extent struct,
-+ * breaking it into parts
-+ */
-+void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
-+{
-+ ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
-+ ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
-+}
-+
-+/*
- * idx_pblock:
- * combine low and high parts of a leaf physical block number into ext4_fsblk_t
- */
-@@ -72,17 +83,6 @@ ext4_fsblk_t idx_pblock(struct ext4_exte
- }
-
- /*
-- * ext4_ext_store_pblock:
-- * stores a large physical block number into an extent struct,
-- * breaking it into parts
-- */
--void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
--{
-- ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
-- ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
--}
--
--/*
- * ext4_idx_store_pblock:
- * stores a large physical block number into an index struct,
- * breaking it into parts
-@@ -2097,6 +2097,56 @@ static int ext4_ext_rm_idx(handle_t *han
- }
-
- /*
-+ * This routine returns max. credits extent tree can consume.
-+ * It should be OK for low-performance paths like ->writepage()
-+ * To allow many writing process to fit a single transaction,
-+ * caller should calculate credits under truncate_mutex and
-+ * pass actual path.
-+ */
-+int ext4_ext_calc_credits_for_insert(struct inode *inode,
-+ struct ext4_ext_path *path)
-+{
-+ int depth, needed;
-+
-+ if (path) {
-+ /* probably there is space in leaf? */
-+ depth = ext_depth(inode);
-+ if (le16_to_cpu(path[depth].p_hdr->eh_entries)
-+ < le16_to_cpu(path[depth].p_hdr->eh_max))
-+ return 1;
-+ }
-+
-+ /*
-+ * given 32bit logical block (4294967296 blocks), max. tree
-+ * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
-+ * let's also add one more level for imbalance.
-+ */
-+ depth = 5;
-+
-+ /* allocation of new data block(s) */
-+ needed = 2;
-+
-+ /*
-+ * tree can be full, so it'd need to grow in depth:
-+ * we need one credit to modify old root, credits for
-+ * new root will be added in split accounting
-+ */
-+ needed += 1;
-+
-+ /*
-+ * Index split can happen, we'd need:
-+ * allocate intermediate indexes (bitmap + group)
-+ * + change two blocks at each level, but root (already included)
-+ */
-+ needed += (depth * 2) + (depth * 2);
-+
-+ /* any allocation modifies superblock */
-+ needed += 1;
-+
-+ return needed;
-+}
-+
-+/*
- * ext4_ext_calc_credits_for_single_extent:
- * This routine returns max. credits that needed to insert an extent
- * to the extent tree.
-@@ -3941,3 +3991,15 @@ int ext4_fiemap(struct inode *inode, str
- return error;
- }
-
-+EXPORT_SYMBOL(ext4_ext_store_pblock);
-+EXPORT_SYMBOL(ext4_ext_search_right);
-+EXPORT_SYMBOL(ext4_ext_search_left);
-+EXPORT_SYMBOL(ext_pblock);
-+EXPORT_SYMBOL(ext4_ext_insert_extent);
-+EXPORT_SYMBOL(ext4_mb_new_blocks);
-+EXPORT_SYMBOL(ext4_ext_walk_space);
-+EXPORT_SYMBOL(ext4_ext_calc_credits_for_insert);
-+EXPORT_SYMBOL(ext4_mark_inode_dirty);
-+EXPORT_SYMBOL(ext4_ext_find_extent);
-+EXPORT_SYMBOL(ext4_ext_drop_refs);
-+
-Index: linux-stage/fs/ext4/ext4_extents.h
-===================================================================
---- linux-stage.orig/fs/ext4/ext4_extents.h
-+++ linux-stage/fs/ext4/ext4_extents.h
-@@ -58,6 +58,12 @@
- */
- #define EXT_STATS_
-
-+/*
-+ * define EXT4_ALLOC_NEEDED to 0 since block bitmap, group desc. and sb
-+ * are now accounted in ext4_ext_calc_credits_for_insert()
-+ */
-+#define EXT4_ALLOC_NEEDED 0
-+#define HAVE_EXT_PREPARE_CB_EXTENT
-
- /*
- * ext4_inode has i_block array (60 bytes total).
-@@ -231,6 +237,8 @@ extern ext4_fsblk_t ext_pblock(struct ex
- extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
- extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
- extern int ext4_extent_tree_init(handle_t *, struct inode *);
-+extern int ext4_ext_calc_credits_for_insert(struct inode *,
-+ struct ext4_ext_path *);
- extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
- int num,
- struct ext4_ext_path *path);
-Index: linux-stage/fs/ext4/mballoc.c
-===================================================================
---- linux-stage.orig/fs/ext4/mballoc.c
-+++ linux-stage/fs/ext4/mballoc.c
-@@ -4313,6 +4313,7 @@ repeat:
- if (ac)
- kmem_cache_free(ext4_ac_cachep, ac);
- }
-+EXPORT_SYMBOL(ext4_discard_preallocations);
-
- /*
- * finds all preallocated spaces and return blocks being freed to them
-@@ -5127,3 +5128,6 @@ error_return:
- kmem_cache_free(ext4_ac_cachep, ac);
- return;
- }
-+
-+EXPORT_SYMBOL(ext4_free_blocks);
-+
-Index: linux-stage/fs/ext4/ext4_jbd2.c
-===================================================================
---- linux-stage.orig/fs/ext4/ext4_jbd2.c
-+++ linux-stage/fs/ext4/ext4_jbd2.c
-@@ -31,6 +31,7 @@ int __ext4_journal_get_write_access(cons
- }
- return err;
- }
-+EXPORT_SYMBOL(__ext4_journal_get_write_access);
-
- int __ext4_journal_forget(const char *where, handle_t *handle,
- struct buffer_head *bh)
-@@ -107,3 +108,4 @@ int __ext4_handle_dirty_metadata(const c
- }
- return err;
- }
-+EXPORT_SYMBOL(__ext4_handle_dirty_metadata);
-Index: linux-stage/fs/ext4/ext4.h
-===================================================================
---- linux-stage.orig/fs/ext4/ext4.h
-+++ linux-stage/fs/ext4/ext4.h
-@@ -1528,6 +1528,8 @@ extern int ext4_mb_add_groupinfo(struct
- extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
- extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
- ext4_group_t, int);
-+extern void ext4_mb_discard_inode_preallocations(struct inode *);
-+
- /* inode.c */
- int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
- struct buffer_head *bh, ext4_fsblk_t blocknr);
-Index: linux-stage/fs/ext4/inode.c
-===================================================================
---- linux-stage.orig/fs/ext4/inode.c
-+++ linux-stage/fs/ext4/inode.c
-@@ -5078,6 +5078,7 @@ bad_inode:
- iget_failed(inode);
- return ERR_PTR(ret);
- }
-+EXPORT_SYMBOL(ext4_iget);
-
- static int ext4_inode_blocks_set(handle_t *handle,
- struct ext4_inode *raw_inode,
-Index: linux-stage/fs/ext4/super.c
-===================================================================
---- linux-stage.orig/fs/ext4/super.c
-+++ linux-stage/fs/ext4/super.c
-@@ -90,6 +90,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct su
- (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
- (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
- }
-+EXPORT_SYMBOL(ext4_inode_bitmap);
-
- ext4_fsblk_t ext4_inode_table(struct super_block *sb,
- struct ext4_group_desc *bg)
-@@ -114,6 +115,7 @@ __u32 ext4_free_inodes_count(struct supe
- (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
- (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
- }
-+EXPORT_SYMBOL(ext4_itable_unused_count);
-
- __u32 ext4_used_dirs_count(struct super_block *sb,
- struct ext4_group_desc *bg)
-@@ -1489,9 +1491,11 @@ enum {
- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
- Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
- Opt_usrquota, Opt_grpquota, Opt_i_version,
-+ Opt_mballoc, Opt_extents,
- Opt_stripe, Opt_delalloc, Opt_nodelalloc,
- Opt_block_validity, Opt_noblock_validity,
-- Opt_inode_readahead_blks, Opt_journal_ioprio
-+ Opt_inode_readahead_blks, Opt_journal_ioprio,
-+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
- };
-
- static match_table_t tokens = {
-@@ -1547,6 +1551,11 @@ static match_table_t tokens = {
- {Opt_barrier, "barrier"},
- {Opt_nobarrier, "nobarrier"},
- {Opt_i_version, "i_version"},
-+ {Opt_mballoc, "mballoc"},
-+ {Opt_extents, "extents"},
-+ {Opt_iopen, "iopen"},
-+ {Opt_noiopen, "noiopen"},
-+ {Opt_iopen_nopriv, "iopen_nopriv"},
- {Opt_stripe, "stripe=%u"},
- {Opt_resize, "resize"},
- {Opt_delalloc, "delalloc"},
-@@ -1993,6 +2002,12 @@ set_qf_format:
- else
- set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
- break;
-+ case Opt_mballoc:
-+ case Opt_extents:
-+ case Opt_iopen:
-+ case Opt_noiopen:
-+ case Opt_iopen_nopriv:
-+ break;
- default:
- ext4_msg(sb, KERN_ERR,
- "Unrecognized mount option \"%s\" "
-@@ -2543,7 +2558,7 @@ static ssize_t delayed_allocation_blocks
- char *buf)
- {
- return snprintf(buf, PAGE_SIZE, "%llu\n",
-- (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
-+ (unsigned long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
- }
-
- static ssize_t session_write_kbytes_show(struct ext4_attr *a,
-@@ -2564,11 +2579,11 @@ static ssize_t lifetime_write_kbytes_sho
- struct super_block *sb = sbi->s_buddy_cache->i_sb;
-
- return snprintf(buf, PAGE_SIZE, "%llu\n",
-- sbi->s_kbytes_written +
-+ (unsigned long long)(sbi->s_kbytes_written +
- (sb->s_bdev->bd_part ?
- (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
- EXT4_SB(sb)->s_sectors_written_start) >> 1
-- : 0));
-+ : 0)));
- }
-
- static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
-@@ -3042,7 +3057,7 @@ static int ext4_fill_super(struct super_
- if (blocks_count && ext4_blocks_count(es) > blocks_count) {
- ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
- "exceeds size of device (%llu blocks)",
-- ext4_blocks_count(es), blocks_count);
-+ ext4_blocks_count(es), (unsigned long long)blocks_count);
- goto failed_mount;
- }
-
-Index: linux-stage/fs/ext4/fsync.c
-===================================================================
---- linux-stage.orig/fs/ext4/fsync.c
-+++ linux-stage/fs/ext4/fsync.c
-@@ -61,7 +61,7 @@ int ext4_sync_file(struct file *file, st
-
- trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld",
- inode->i_sb->s_id, datasync, inode->i_ino,
-- dentry->d_parent->d_inode->i_ino);
-+ 0L);
-
- ret = flush_aio_dio_completed_IO(inode);
- if (ret < 0)
-Index: linux-stage/fs/ext4/move_extent.c
-===================================================================
---- linux-stage.orig/fs/ext4/move_extent.c
-+++ linux-stage/fs/ext4/move_extent.c
-@@ -1358,7 +1358,8 @@ ext4_move_extents(struct file *o_filp, s
- ext4_error(orig_inode->i_sb,
- "We replaced blocks too much! "
- "sum of replaced: %llu requested: %llu",
-- *moved_len, len);
-+ (unsigned long long)(*moved_len),
-+ (unsigned long long)(len));
- ret1 = -EIO;
- break;
- }
+++ /dev/null
-Prevent an ext4 filesystem from being mounted multiple times.
-A sequence number is stored on disk and is periodically updated (every 5
-seconds by default) by a mounted filesystem.
-At mount time, we now wait for s_mmp_update_interval seconds to make sure
-that the MMP sequence does not change.
-In case of failure, the nodename, bdevname and the time at which the MMP
-block was last updated is displayed.
-Move all mmp code to a dedicated file (mmp.c).
-
-Signed-off-by: Andreas Dilger <adilger <at> whamcloud.com>
-Signed-off-by: Johann Lombardi <johann <at> whamcloud.com>
----
- fs/ext4/Makefile | 3 +-
- fs/ext4/ext4.h | 76 ++++++++++++-
- fs/ext4/mmp.c | 351 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
- fs/ext4/super.c | 18 +++-
- 4 files changed, 444 insertions(+), 4 deletions(-)
- create mode 100644 fs/ext4/mmp.c
-
-Index: linux-stage/fs/ext4/Makefile
-===================================================================
---- linux-stage.orig/fs/ext4/Makefile
-+++ linux-stage/fs/ext4/Makefile
-@@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
-
- ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
- ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-- ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
-+ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
-+ mmp.o
-
- ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
- ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
-Index: linux-stage/fs/ext4/ext4.h
-===================================================================
---- linux-stage.orig/fs/ext4/ext4.h
-+++ linux-stage/fs/ext4/ext4.h
-@@ -878,7 +878,7 @@ struct ext4_super_block {
- __le16 s_want_extra_isize; /* New inodes should reserve # bytes */
- __le32 s_flags; /* Miscellaneous flags */
- __le16 s_raid_stride; /* RAID stride */
-- __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
-+ __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */
- __le64 s_mmp_block; /* Block for multi-mount protection */
- __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
- __u8 s_log_groups_per_flex; /* FLEX_BG group size */
-@@ -1032,6 +1032,9 @@ struct ext4_sb_info {
-
- /* workqueue for dio unwritten */
- struct workqueue_struct *dio_unwritten_wq;
-+
-+ /* Kernel thread for multiple mount protection */
-+ struct task_struct *s_mmp_tsk;
- };
-
- static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
-@@ -1169,7 +1172,8 @@ static inline void ext4_clear_inode_stat
- EXT4_FEATURE_INCOMPAT_META_BG| \
- EXT4_FEATURE_INCOMPAT_EXTENTS| \
- EXT4_FEATURE_INCOMPAT_64BIT| \
-- EXT4_FEATURE_INCOMPAT_FLEX_BG)
-+ EXT4_FEATURE_INCOMPAT_FLEX_BG| \
-+ EXT4_FEATURE_INCOMPAT_MMP)
- #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
- EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
-@@ -1376,6 +1380,67 @@ void ext4_get_group_no_and_offset(struct
- extern struct proc_dir_entry *ext4_proc_root;
-
- /*
-+ * This structure will be used for multiple mount protection. It will be
-+ * written into the block number saved in the s_mmp_block field in the
-+ * superblock. Programs that check MMP should assume that if
-+ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
-+ * to use the filesystem, regardless of how old the timestamp is.
-+ */
-+#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */
-+#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
-+#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */
-+#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */
-+
-+struct mmp_struct {
-+ __le32 mmp_magic; /* Magic number for MMP */
-+ __le32 mmp_seq; /* Sequence no. updated periodically */
-+
-+ /*
-+ * mmp_time, mmp_nodename & mmp_bdevname are only used for information
-+ * purposes and do not affect the correctness of the algorithm
-+ */
-+ __le64 mmp_time; /* Time last updated */
-+ char mmp_nodename[64]; /* Node which last updated MMP block */
-+ char mmp_bdevname[32]; /* Bdev which last updated MMP block */
-+
-+ /*
-+ * mmp_check_interval is used to verify if the MMP block has been
-+ * updated on the block device. The value is updated based on the
-+ * maximum time to write the MMP block during an update cycle.
-+ */
-+ __le16 mmp_check_interval;
-+
-+ __le16 mmp_pad1;
-+ __le32 mmp_pad2[227];
-+};
-+
-+/* arguments passed to the mmp thread */
-+struct mmpd_data {
-+ struct buffer_head *bh; /* bh from initial read_mmp_block() */
-+ struct super_block *sb; /* super block of the fs */
-+};
-+
-+/*
-+ * Check interval multiplier
-+ * The MMP block is written every update interval and initially checked every
-+ * update interval x the multiplier (the value is then adapted based on the
-+ * write latency). The reason is that writes can be delayed under load and we
-+ * don't want readers to incorrectly assume that the filesystem is no longer
-+ * in use.
-+ */
-+#define EXT4_MMP_CHECK_MULT 2UL
-+
-+/*
-+ * Minimum interval for MMP checking in seconds.
-+ */
-+#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL
-+
-+/*
-+ * Maximum interval for MMP checking in seconds.
-+ */
-+#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL
-+
-+/*
- * Function prototypes
- */
-
-@@ -1547,6 +1612,10 @@ extern void __ext4_warning(struct super_
- #define ext4_warning(sb, message...) __ext4_warning(sb, __func__, ## message)
- extern void ext4_msg(struct super_block *, const char *, const char *, ...)
- __attribute__ ((format (printf, 3, 4)));
-+extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
-+ const char *, const char *);
-+#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, msg)
-+
- extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
- const char *, const char *, ...)
- __attribute__ ((format (printf, 4, 5)));
-@@ -1784,6 +1853,9 @@ static inline void ext4_unlock_group(str
- spin_unlock(ext4_group_lock_ptr(sb, group));
- }
-
-+/* mmp.c */
-+extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
-+
- /*
- * Inodes and files operations
- */
-Index: linux-stage/fs/ext4/mmp.c
-===================================================================
---- /dev/null
-+++ linux-stage/fs/ext4/mmp.c
-@@ -0,0 +1,351 @@
-+#include <linux/fs.h>
-+#include <linux/random.h>
-+#include <linux/buffer_head.h>
-+#include <linux/utsname.h>
-+#include <linux/kthread.h>
-+
-+#include "ext4.h"
-+
-+/*
-+ * Write the MMP block using WRITE_SYNC to try to get the block on-disk
-+ * faster.
-+ */
-+static int write_mmp_block(struct buffer_head *bh)
-+{
-+ mark_buffer_dirty(bh);
-+ lock_buffer(bh);
-+ bh->b_end_io = end_buffer_write_sync;
-+ get_bh(bh);
-+ submit_bh(WRITE_SYNC, bh);
-+ wait_on_buffer(bh);
-+ if (unlikely(!buffer_uptodate(bh)))
-+ return 1;
-+
-+ return 0;
-+}
-+
-+/*
-+ * Read the MMP block. It _must_ be read from disk and hence we clear the
-+ * uptodate flag on the buffer.
-+ */
-+static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
-+ ext4_fsblk_t mmp_block)
-+{
-+ struct mmp_struct *mmp;
-+
-+ if (*bh)
-+ clear_buffer_uptodate(*bh);
-+
-+ /* This would be sb_bread(sb, mmp_block), except we need to be sure
-+ * that the MD RAID device cache has been bypassed, and that the read
-+ * is not blocked in the elevator. */
-+ if (!*bh)
-+ *bh = sb_getblk(sb, mmp_block);
-+ if (*bh) {
-+ get_bh(*bh);
-+ lock_buffer(*bh);
-+ (*bh)->b_end_io = end_buffer_read_sync;
-+ submit_bh(READ_SYNC, *bh);
-+ wait_on_buffer(*bh);
-+ if (!buffer_uptodate(*bh)) {
-+ brelse(*bh);
-+ *bh = NULL;
-+ }
-+ }
-+ if (!*bh) {
-+ ext4_warning(sb, "Error while reading MMP block %llu",
-+ mmp_block);
-+ return -EIO;
-+ }
-+
-+ mmp = (struct mmp_struct *)((*bh)->b_data);
-+ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
-+ return -EINVAL;
-+
-+ return 0;
-+}
-+
-+/*
-+ * Dump as much information as possible to help the admin.
-+ */
-+void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
-+ const char *function, const char *msg)
-+{
-+ __ext4_warning(sb, function, "%s", msg);
-+ __ext4_warning(sb, function,
-+ "MMP failure info: last update time: %llu, last update "
-+ "node: %s, last update device: %s\n",
-+ (long long unsigned int) le64_to_cpu(mmp->mmp_time),
-+ mmp->mmp_nodename, mmp->mmp_bdevname);
-+}
-+
-+/*
-+ * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
-+ */
-+static int kmmpd(void *data)
-+{
-+ struct super_block *sb = ((struct mmpd_data *) data)->sb;
-+ struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
-+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
-+ struct mmp_struct *mmp;
-+ ext4_fsblk_t mmp_block;
-+ u32 seq = 0;
-+ unsigned long failed_writes = 0;
-+ int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
-+ unsigned mmp_check_interval;
-+ unsigned long last_update_time;
-+ unsigned long diff;
-+ int retval;
-+
-+ mmp_block = le64_to_cpu(es->s_mmp_block);
-+ mmp = (struct mmp_struct *)(bh->b_data);
-+ mmp->mmp_time = cpu_to_le64(get_seconds());
-+ /*
-+ * Start with the higher mmp_check_interval and reduce it if
-+ * the MMP block is being updated on time.
-+ */
-+ mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
-+ EXT4_MMP_MIN_CHECK_INTERVAL);
-+ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
-+ bdevname(bh->b_bdev, mmp->mmp_bdevname);
-+
-+ memcpy(mmp->mmp_nodename, init_utsname()->sysname,
-+ sizeof(mmp->mmp_nodename));
-+
-+ while (!kthread_should_stop()) {
-+ if (++seq > EXT4_MMP_SEQ_MAX)
-+ seq = 1;
-+
-+ mmp->mmp_seq = cpu_to_le32(seq);
-+ mmp->mmp_time = cpu_to_le64(get_seconds());
-+ last_update_time = jiffies;
-+
-+ retval = write_mmp_block(bh);
-+ /*
-+ * Don't spew too many error messages. Print one every
-+ * (s_mmp_update_interval * 60) seconds.
-+ */
-+ if (retval) {
-+ if ((failed_writes % 60) == 0)
-+ ext4_error(sb, "Error writing to MMP block");
-+ failed_writes++;
-+ }
-+
-+ if (!(le32_to_cpu(es->s_feature_incompat) &
-+ EXT4_FEATURE_INCOMPAT_MMP)) {
-+ ext4_warning(sb, "kmmpd being stopped since MMP feature"
-+ " has been disabled.");
-+ EXT4_SB(sb)->s_mmp_tsk = NULL;
-+ goto failed;
-+ }
-+
-+ if (sb->s_flags & MS_RDONLY) {
-+ ext4_warning(sb, "kmmpd being stopped since filesystem "
-+ "has been remounted as readonly.");
-+ EXT4_SB(sb)->s_mmp_tsk = NULL;
-+ goto failed;
-+ }
-+
-+ diff = jiffies - last_update_time;
-+ if (diff < mmp_update_interval * HZ)
-+ schedule_timeout_interruptible(mmp_update_interval *
-+ HZ - diff);
-+
-+ /*
-+ * We need to make sure that more than mmp_check_interval
-+ * seconds have not passed since writing. If that has happened
-+ * we need to check if the MMP block is as we left it.
-+ */
-+ diff = jiffies - last_update_time;
-+ if (diff > mmp_check_interval * HZ) {
-+ struct buffer_head *bh_check = NULL;
-+ struct mmp_struct *mmp_check;
-+
-+ retval = read_mmp_block(sb, &bh_check, mmp_block);
-+ if (retval) {
-+ ext4_error(sb, "error reading MMP data: %d",
-+ retval);
-+
-+ EXT4_SB(sb)->s_mmp_tsk = NULL;
-+ goto failed;
-+ }
-+
-+ mmp_check = (struct mmp_struct *)(bh_check->b_data);
-+ if (mmp->mmp_seq != mmp_check->mmp_seq ||
-+ memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
-+ sizeof(mmp->mmp_nodename))) {
-+ dump_mmp_msg(sb, mmp_check,
-+ "Error while updating MMP info. "
-+ "The filesystem seems to have been"
-+ " multiply mounted.");
-+ ext4_error(sb, "abort");
-+ goto failed;
-+ }
-+ put_bh(bh_check);
-+ }
-+
-+ /*
-+ * Adjust the mmp_check_interval depending on how much time
-+ * it took for the MMP block to be written.
-+ */
-+ mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
-+ EXT4_MMP_MAX_CHECK_INTERVAL),
-+ EXT4_MMP_MIN_CHECK_INTERVAL);
-+ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
-+ }
-+
-+ /*
-+ * Unmount seems to be clean.
-+ */
-+ mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
-+ mmp->mmp_time = cpu_to_le64(get_seconds());
-+
-+ retval = write_mmp_block(bh);
-+
-+failed:
-+ kfree(data);
-+ brelse(bh);
-+ return retval;
-+}
-+
-+/*
-+ * Get a random new sequence number but make sure it is not greater than
-+ * EXT4_MMP_SEQ_MAX.
-+ */
-+static unsigned int mmp_new_seq(void)
-+{
-+ u32 new_seq;
-+
-+ do {
-+ get_random_bytes(&new_seq, sizeof(u32));
-+ } while (new_seq > EXT4_MMP_SEQ_MAX);
-+
-+ return new_seq;
-+}
-+
-+/*
-+ * Protect the filesystem from being mounted more than once.
-+ */
-+int ext4_multi_mount_protect(struct super_block *sb,
-+ ext4_fsblk_t mmp_block)
-+{
-+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
-+ struct buffer_head *bh = NULL;
-+ struct mmp_struct *mmp = NULL;
-+ struct mmpd_data *mmpd_data;
-+ u32 seq;
-+ unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
-+ unsigned int wait_time = 0;
-+ int retval;
-+
-+ if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
-+ mmp_block >= ext4_blocks_count(es)) {
-+ ext4_warning(sb, "Invalid MMP block in superblock");
-+ goto failed;
-+ }
-+
-+ retval = read_mmp_block(sb, &bh, mmp_block);
-+ if (retval)
-+ goto failed;
-+
-+ mmp = (struct mmp_struct *)(bh->b_data);
-+
-+ if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
-+ mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
-+
-+ /*
-+ * If check_interval in MMP block is larger, use that instead of
-+ * update_interval from the superblock.
-+ */
-+ if (mmp->mmp_check_interval > mmp_check_interval)
-+ mmp_check_interval = mmp->mmp_check_interval;
-+
-+ seq = le32_to_cpu(mmp->mmp_seq);
-+ if (seq == EXT4_MMP_SEQ_CLEAN)
-+ goto skip;
-+
-+ if (seq == EXT4_MMP_SEQ_FSCK) {
-+ dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
-+ goto failed;
-+ }
-+
-+ wait_time = min(mmp_check_interval * 2 + 1,
-+ mmp_check_interval + 60);
-+
-+ /* Print MMP interval if more than 20 secs. */
-+ if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
-+ ext4_warning(sb, "MMP interval %u higher than expected, please"
-+ " wait.\n", wait_time * 2);
-+
-+ if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
-+ ext4_warning(sb, "MMP startup interrupted, failing mount\n");
-+ goto failed;
-+ }
-+
-+ retval = read_mmp_block(sb, &bh, mmp_block);
-+ if (retval)
-+ goto failed;
-+ mmp = (struct mmp_struct *)(bh->b_data);
-+ if (seq != le32_to_cpu(mmp->mmp_seq)) {
-+ dump_mmp_msg(sb, mmp,
-+ "Device is already active on another node.");
-+ goto failed;
-+ }
-+
-+skip:
-+ /*
-+ * write a new random sequence number.
-+ */
-+ mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq());
-+
-+ retval = write_mmp_block(bh);
-+ if (retval)
-+ goto failed;
-+
-+ /*
-+ * wait for MMP interval and check mmp_seq.
-+ */
-+ if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
-+ ext4_warning(sb, "MMP startup interrupted, failing mount\n");
-+ goto failed;
-+ }
-+
-+ retval = read_mmp_block(sb, &bh, mmp_block);
-+ if (retval)
-+ goto failed;
-+ mmp = (struct mmp_struct *)(bh->b_data);
-+ if (seq != le32_to_cpu(mmp->mmp_seq)) {
-+ dump_mmp_msg(sb, mmp,
-+ "Device is already active on another node.");
-+ goto failed;
-+ }
-+
-+ mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
-+ if (!mmpd_data) {
-+ ext4_warning(sb, "not enough memory for mmpd_data");
-+ goto failed;
-+ }
-+ mmpd_data->sb = sb;
-+ mmpd_data->bh = bh;
-+
-+ /*
-+ * Start a kernel thread to update the MMP block periodically.
-+ */
-+ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
-+ bdevname(bh->b_bdev,
-+ mmp->mmp_bdevname));
-+ if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
-+ EXT4_SB(sb)->s_mmp_tsk = NULL;
-+ kfree(mmpd_data);
-+ ext4_warning(sb, "Unable to create kmmpd thread for %s.",
-+ sb->s_id);
-+ goto failed;
-+ }
-+
-+ return 0;
-+
-+failed:
-+ brelse(bh);
-+ return 1;
-+}
-+
-+
-Index: linux-stage/fs/ext4/super.c
-===================================================================
---- linux-stage.orig/fs/ext4/super.c
-+++ linux-stage/fs/ext4/super.c
-@@ -40,6 +40,8 @@
- #include <linux/log2.h>
- #include <linux/crc16.h>
- #include <asm/uaccess.h>
-+#include <linux/kthread.h>
-+#include <linux/utsname.h>
-
- #include "ext4.h"
- #include "ext4_jbd2.h"
-@@ -698,6 +700,8 @@ static void ext4_put_super(struct super_
- invalidate_bdev(sbi->journal_bdev, 0);
- ext4_blkdev_remove(sbi);
- }
-+ if (sbi->s_mmp_tsk)
-+ kthread_stop(sbi->s_mmp_tsk);
- sb->s_fs_info = NULL;
- /*
- * Now that we are completely done shutting down the
-@@ -2810,6 +2814,11 @@ static int ext4_fill_super(struct super_
- EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_RECOVER));
-
-+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
-+ !(sb->s_flags & MS_RDONLY))
-+ if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
-+ goto failed_mount3;
-+
- /*
- * The first inode we look at is the journal inode. Don't try
- * root first: it may be modified in the journal!
-@@ -3048,6 +3057,8 @@ failed_mount3:
- percpu_counter_destroy(&sbi->s_freeinodes_counter);
- percpu_counter_destroy(&sbi->s_dirs_counter);
- percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
-+ if (sbi->s_mmp_tsk)
-+ kthread_stop(sbi->s_mmp_tsk);
- failed_mount2:
- for (i = 0; i < db_count; i++)
- brelse(sbi->s_group_desc[i]);
-@@ -3557,7 +3568,7 @@ static int ext4_remount(struct super_blo
- struct ext4_mount_options old_opts;
- ext4_group_t g;
- unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
-- int err;
-+ int err = 0;
- #ifdef CONFIG_QUOTA
- int i;
- #endif
-@@ -3676,6 +3687,13 @@ static int ext4_remount(struct super_blo
- goto restore_opts;
- if (!ext4_setup_super(sb, es, 0))
- sb->s_flags &= ~MS_RDONLY;
-+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
-+ EXT4_FEATURE_INCOMPAT_MMP))
-+ if (ext4_multi_mount_protect(sb,
-+ le64_to_cpu(es->s_mmp_block))) {
-+ err = -EROFS;
-+ goto restore_opts;
-+ }
- }
- }
- ext4_setup_system_zone(sb);
+++ /dev/null
-diff -rupN 2.6.27.21_2/fs/ext4/ext4.h 2.6.27.21_3/fs/ext4/ext4.h
---- 2.6.27.21_2/fs/ext4/ext4.h 2009-07-17 12:19:59.000000000 +0530
-+++ 2.6.27.21_3/fs/ext4/ext4.h 2009-07-17 12:38:59.000000000 +0530
-@@ -1181,6 +1181,9 @@ extern int ext4_orphan_add(handle_t *, s
- extern int ext4_orphan_del(handle_t *, struct inode *);
- extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
- __u32 start_minor_hash, __u32 *next_hash);
-+extern struct buffer_head *ext4_append(handle_t *handle,
-+ struct inode *inode,
-+ ext4_lblk_t *block, int *err);
-
- /* resize.c */
- extern int ext4_group_add(struct super_block *sb,
-diff -rupN 2.6.27.21_2/fs/ext4/hash.c 2.6.27.21_3/fs/ext4/hash.c
---- 2.6.27.21_2/fs/ext4/hash.c 2009-07-17 12:12:56.000000000 +0530
-+++ 2.6.27.21_3/fs/ext4/hash.c 2009-07-17 12:40:22.000000000 +0530
-@@ -9,6 +9,7 @@
- * License.
- */
-
-+#include <linux/module.h>
- #include <linux/fs.h>
- #include <linux/jbd2.h>
- #include <linux/cryptohash.h>
-@@ -206,3 +207,4 @@ int ext4fs_dirhash(const char *name, int
- hinfo->minor_hash = minor_hash;
- return 0;
- }
-+EXPORT_SYMBOL(ext4fs_dirhash);
-diff -rupN 2.6.27.21_2/fs/ext4/namei.c 2.6.27.21_3/fs/ext4/namei.c
---- 2.6.27.21_2/fs/ext4/namei.c 2009-07-17 12:23:51.000000000 +0530
-+++ 2.6.27.21_3/fs/ext4/namei.c 2009-07-17 12:37:59.000000000 +0530
-@@ -51,9 +51,9 @@
- #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
- #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
-
--static struct buffer_head *ext4_append(handle_t *handle,
-- struct inode *inode,
-- ext4_lblk_t *block, int *err)
-+struct buffer_head *ext4_append(handle_t *handle,
-+ struct inode *inode,
-+ ext4_lblk_t *block, int *err)
- {
- struct buffer_head *bh;
- struct ext4_inode_info *ei = EXT4_I(inode);
-@@ -72,6 +72,7 @@ static struct buffer_head *ext4_append(h
- up(&ei->i_append_sem);
- return bh;
- }
-+EXPORT_SYMBOL(ext4_append);
-
- #ifndef assert
- #define assert(test) J_ASSERT(test)
-diff -rupN 2.6.27.21_2/fs/ext4/super.c 2.6.27.21_3/fs/ext4/super.c
---- 2.6.27.21_2/fs/ext4/super.c 2009-07-17 12:12:57.000000000 +0530
-+++ 2.6.27.21_3/fs/ext4/super.c 2009-07-17 12:40:52.000000000 +0530
-@@ -377,6 +377,7 @@ void __ext4_std_error(struct super_block
-
- ext4_handle_error(sb);
- }
-+EXPORT_SYMBOL(__ext4_std_error);
-
- /*
- * ext4_abort is a much stronger failure handler than ext4_error. The
+++ /dev/null
-diff -rupN linux-2.6.27.21-0.1_1//fs/ext4/ext4.h linux-2.6.27.21-0.1_2//fs/ext4/ext4.h
---- linux-2.6.27.21-0.1_1//fs/ext4/ext4.h 2009-08-24 15:32:00.000000000 +0530
-+++ linux-2.6.27.21-0.1_2//fs/ext4/ext4.h 2009-08-24 15:32:55.000000000 +0530
-@@ -1171,6 +1171,19 @@ extern int ext4_fiemap(struct inode *, s
- /* migrate.c */
- extern int ext4_ext_migrate(struct inode *);
- /* namei.c */
-+extern struct inode *ext4_create_inode(handle_t *handle,
-+ struct inode * dir, int mode);
-+extern int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-+ struct inode *inode);
-+extern int ext4_delete_entry(handle_t *handle, struct inode * dir,
-+ struct ext4_dir_entry_2 * de_del,
-+ struct buffer_head * bh);
-+extern struct buffer_head * ext4_find_entry(struct inode *dir,
-+ const struct qstr *d_name,
-+ struct ext4_dir_entry_2 ** res_dir);
-+#define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir)
-+extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
-+ struct inode *inode);
- extern int ext4_orphan_add(handle_t *, struct inode *);
- extern int ext4_orphan_del(handle_t *, struct inode *);
- extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
-diff -rupN linux-2.6.27.21-0.1_1//fs/ext4/namei.c linux-2.6.27.21-0.1_2//fs/ext4/namei.c
---- linux-2.6.27.21-0.1_1//fs/ext4/namei.c 2009-08-24 15:32:00.000000000 +0530
-+++ linux-2.6.27.21-0.1_2//fs/ext4/namei.c 2009-08-24 15:43:56.000000000 +0530
-@@ -24,6 +24,7 @@
- * Theodore Ts'o, 2002
- */
-
-+#include <linux/module.h>
- #include <linux/fs.h>
- #include <linux/pagemap.h>
- #include <linux/jbd2.h>
-@@ -882,9 +883,9 @@ static inline int search_dirblock(struct
- * The returned buffer_head has ->b_count elevated. The caller is expected
- * to brelse() it when appropriate.
- */
--static struct buffer_head * ext4_find_entry (struct inode *dir,
-- const struct qstr *d_name,
-- struct ext4_dir_entry_2 ** res_dir)
-+struct buffer_head * ext4_find_entry(struct inode *dir,
-+ const struct qstr *d_name,
-+ struct ext4_dir_entry_2 ** res_dir)
- {
- struct super_block *sb;
- struct buffer_head *bh_use[NAMEI_RA_SIZE];
-@@ -991,6 +992,7 @@ cleanup_and_exit:
- brelse(bh_use[ra_ptr]);
- return ret;
- }
-+EXPORT_SYMBOL(ext4_find_entry);
-
- static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
- struct ext4_dir_entry_2 **res_dir, int *err)
-@@ -1511,8 +1513,8 @@ static int make_indexed_dir(handle_t *ha
- * may not sleep between calling this and putting something into
- * the entry, as someone else might have used it while you slept.
- */
--static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-- struct inode *inode)
-+int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-+ struct inode *inode)
- {
- struct inode *dir = dentry->d_parent->d_inode;
- struct buffer_head *bh;
-@@ -1557,6 +1559,7 @@ static int ext4_add_entry(handle_t *hand
- de->rec_len = ext4_rec_len_to_disk(blocksize);
- return add_dirent_to_buf(handle, dentry, inode, de, bh);
- }
-+EXPORT_SYMBOL(ext4_add_entry);
-
- /*
- * Returns 0 for success, or a negative error value
-@@ -1699,10 +1702,10 @@ cleanup:
- * ext4_delete_entry deletes a directory entry by merging it with the
- * previous entry
- */
--static int ext4_delete_entry(handle_t *handle,
-- struct inode *dir,
-- struct ext4_dir_entry_2 *de_del,
-- struct buffer_head *bh)
-+int ext4_delete_entry(handle_t *handle,
-+ struct inode *dir,
-+ struct ext4_dir_entry_2 *de_del,
-+ struct buffer_head *bh)
- {
- struct ext4_dir_entry_2 *de, *pde;
- int i;
-@@ -1733,7 +1736,7 @@ static int ext4_delete_entry(handle_t *h
- }
- return -ENOENT;
- }
--
-+EXPORT_SYMBOL(ext4_delete_entry);
- /*
- * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2,
- * since this indicates that nlinks count was previously 1.
-@@ -1796,6 +1799,26 @@ static unsigned ext4_dentry_goal(struct
- return inum;
- }
-
-+struct inode * ext4_create_inode(handle_t *handle, struct inode * dir, int mode)
-+{
-+ struct inode *inode;
-+
-+ inode = ext4_new_inode(handle, dir, mode, NULL, 0);
-+ if (!IS_ERR(inode)) {
-+ if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode)) {
-+#ifdef CONFIG_LDISKFS_FS_XATTR
-+ inode->i_op = &ext4_special_inode_operations;
-+#endif
-+ } else {
-+ inode->i_op = &ext4_file_inode_operations;
-+ inode->i_fop = &ext4_file_operations;
-+ ext4_set_aops(inode);
-+ }
-+ }
-+ return inode;
-+}
-+EXPORT_SYMBOL(ext4_create_inode);
-+
- /*
- * By the time this is called, we already have created
- * the directory cache entry for the new file, but it
-@@ -1872,40 +1895,32 @@ retry:
- return err;
- }
-
--static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-+/* Initialize @inode as a subdirectory of @dir, and add the
-+ * "." and ".." entries into the first directory block. */
-+int ext4_add_dot_dotdot(handle_t *handle, struct inode * dir,
-+ struct inode *inode)
- {
-- handle_t *handle;
-- struct inode *inode;
-- struct buffer_head *dir_block;
-- struct ext4_dir_entry_2 *de;
- unsigned int blocksize = dir->i_sb->s_blocksize;
-- int err, retries = 0;
--
-- if (EXT4_DIR_LINK_MAX(dir))
-- return -EMLINK;
-+ struct buffer_head * dir_block;
-+ struct ext4_dir_entry_2 * de;
-+ int err = 0;
-
--retry:
-- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
-- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
-- inode = ext4_new_inode(handle, dir, S_IFDIR | mode, &dentry->d_name,
-- ext4_dentry_goal(dir->i_sb, dentry));
-- err = PTR_ERR(inode);
-- if (IS_ERR(inode))
-- goto out_stop;
--
- inode->i_op = &ext4_dir_inode_operations;
- inode->i_fop = &ext4_dir_operations;
- inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
- dir_block = ext4_bread(handle, inode, 0, 1, &err);
-- if (!dir_block)
-- goto out_clear_inode;
-+ if (!dir_block) {
-+ clear_nlink(inode);
-+ ext4_mark_inode_dirty(handle, inode);
-+ iput (inode);
-+ goto get_out;
-+ }
- BUFFER_TRACE(dir_block, "get_write_access");
- ext4_journal_get_write_access(handle, dir_block);
- de = (struct ext4_dir_entry_2 *) dir_block->b_data;
-@@ -1925,9 +1940,43 @@ retry:
- ext4_journal_dirty_metadata(handle, dir_block);
- brelse(dir_block);
- ext4_mark_inode_dirty(handle, inode);
-+get_out:
-+ return err;
-+}
-+EXPORT_SYMBOL(ext4_add_dot_dotdot);
-+
-+
-+static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-+{
-+ handle_t *handle;
-+ struct inode *inode;
-+ int err, retries = 0;
-+
-+ if (EXT4_DIR_LINK_MAX(dir))
-+ return -EMLINK;
-+
-+retry:
-+ handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
-+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-+ 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
-+ if (IS_ERR(handle))
-+ return PTR_ERR(handle);
-+
-+ if (IS_DIRSYNC(dir))
-+ handle->h_sync = 1;
-+
-+ inode = ext4_new_inode(handle, dir, S_IFDIR | mode, &dentry->d_name,
-+ ext4_dentry_goal(dir->i_sb, dentry));
-+ err = PTR_ERR(inode);
-+ if (IS_ERR(inode))
-+ goto out_stop;
-+
-+ err = ext4_add_dot_dotdot(handle, dir, inode);
-+ if (err)
-+ goto out_stop;
-+
- err = ext4_add_entry(handle, dentry, inode);
- if (err) {
--out_clear_inode:
- clear_nlink(inode);
- ext4_mark_inode_dirty(handle, inode);
- iput(inode);
+++ /dev/null
-diff -rupN linux-2.6.27.21-0.1_1//fs/ext4/ext4_i.h linux-2.6.27.21-0.1_2//fs/ext4/ext4_i.h
---- linux-2.6.27.21-0.1_1//fs/ext4/ext4.h 2009-08-24 13:00:59.000000000 +0530
-+++ linux-2.6.27.21-0.1_2//fs/ext4/ext4.h 2009-08-24 13:01:25.000000000 +0530
-@@ -16,6 +16,7 @@
- #include <linux/blkdev.h>
- #include <linux/magic.h>
- #include <linux/jbd2.h>
-+#include <linux/dynlocks.h>
- #include <linux/quota.h>
- #include <linux/rwsem.h>
- #include <linux/rbtree.h>
-@@ -56,7 +57,9 @@ struct ext4_inode_info {
- __u32 i_flags;
- ext4_fsblk_t i_file_acl;
- __u32 i_dtime;
--
-+ /* following fields for parallel directory operations -bzzz */
-+ struct dynlock i_htree_lock;
-+ struct semaphore i_append_sem;
- /*
- * i_block_group is the number of the block group which contains
- * this file's inode. Constant across the lifetime of the inode,
-diff -rupN linux-2.6.27.21-0.1_1//fs/ext4/namei.c linux-2.6.27.21-0.1_2//fs/ext4/namei.c
---- linux-2.6.27.21-0.1_1//fs/ext4/namei.c 2009-08-24 13:00:59.000000000 +0530
-+++ linux-2.6.27.21-0.1_2//fs/ext4/namei.c 2009-08-24 13:03:45.000000000 +0530
-@@ -55,6 +55,11 @@ static struct buffer_head *ext4_append(h
- ext4_lblk_t *block, int *err)
- {
- struct buffer_head *bh;
-+ struct ext4_inode_info *ei = EXT4_I(inode);
-+
-+ /* with parallel dir operations all appends
-+ * have to be serialized -bzzz */
-+ down(&ei->i_append_sem);
-
- *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
-
-@@ -67,7 +72,9 @@ static struct buffer_head *ext4_append(h
- brelse(bh);
- bh = NULL;
- }
-+ ei->i_disksize = inode->i_size;
- }
-+ up(&ei->i_append_sem);
- return bh;
- }
-
-diff -rupN linux-2.6.27.21-0.1_1//fs/ext4/super.c linux-2.6.27.21-0.1_2//fs/ext4/super.c
---- linux-2.6.27.21-0.1_1//fs/ext4/super.c 2009-08-24 13:00:59.000000000 +0530
-+++ linux-2.6.27.21-0.1_2//fs/ext4/super.c 2009-08-24 13:01:25.000000000 +0530
-@@ -635,6 +635,8 @@ static struct inode *ext4_alloc_inode(st
- #endif
- ei->vfs_inode.i_version = 1;
- ei->vfs_inode.i_data.writeback_index = 0;
-+ dynlock_init(&ei->i_htree_lock);
-+ sema_init(&ei->i_append_sem, 1);
- memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
- INIT_LIST_HEAD(&ei->i_prealloc_list);
- spin_lock_init(&ei->i_prealloc_lock);
+++ /dev/null
-Index: linux-2.6.18-128.1.6/fs/ext4/super.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/ext4/super.c
-+++ linux-2.6.18-128.1.6/fs/ext4/super.c
-@@ -108,7 +108,8 @@
- EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
- EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
- EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
--EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
-+EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_small_req);
-+EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req);
- EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
- EXT4_RW_ATTR_SBI_UI(max_dir_size, s_max_dir_size);
-
-@@ -108,7 +108,8 @@
- ATTR_LIST(mb_max_to_scan),
- ATTR_LIST(mb_min_to_scan),
- ATTR_LIST(mb_order2_req),
-- ATTR_LIST(mb_stream_req),
-+ ATTR_LIST(mb_small_req),
-+ ATTR_LIST(mb_large_req),
- ATTR_LIST(mb_group_prealloc),
- ATTR_LIST(max_dir_size),
- NULL,
-Index: linux-2.6.18-128.1.6/fs/ext4/ext4.h
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/ext4/ext4.h 2009-05-28 17:16:51.000000000 +0530
-+++ linux-2.6.18-128.1.6/fs/ext4/ext4.h 2009-05-28 17:16:52.000000000 +0530
-@@ -108,11 +108,14 @@
-
- /* tunables */
- unsigned long s_stripe;
-- unsigned int s_mb_stream_request;
-+ unsigned long s_mb_small_req;
-+ unsigned long s_mb_large_req;
- unsigned int s_mb_max_to_scan;
- unsigned int s_mb_min_to_scan;
- unsigned int s_mb_stats;
- unsigned int s_mb_order2_reqs;
-+ unsigned long *s_mb_prealloc_table;
-+ unsigned long s_mb_prealloc_table_size;
- unsigned int s_mb_group_prealloc;
- /* where last allocation was done - for stream allocation */
- unsigned long s_mb_last_group;
-Index: linux-2.6.18-128.1.6/fs/ext4/mballoc.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/ext4/mballoc.c 2009-05-28 17:16:51.000000000 +0530
-+++ linux-2.6.18-128.1.6/fs/ext4/mballoc.c 2009-05-28 17:19:57.000000000 +0530
-@@ -2284,6 +2284,26 @@
- }
- }
-
-+static void ext4_mb_prealloc_table_add(struct ext4_sb_info *sbi, int value)
-+{
-+ int i;
-+
-+ if (value > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group))
-+ return;
-+
-+ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) {
-+ if (sbi->s_mb_prealloc_table[i] == 0) {
-+ sbi->s_mb_prealloc_table[i] = value;
-+ return;
-+ }
-+
-+ /* they should add values in order */
-+ if (value <= sbi->s_mb_prealloc_table[i])
-+ return;
-+ }
-+}
-+
-+
- static int ext4_mb_good_group(struct ext4_allocation_context *ac,
- ext4_group_t group, int cr)
- {
-@@ -2325,6 +2389,80 @@
- .llseek = seq_lseek,
- .release = seq_release,
- };
-+
-+#define EXT4_MB_PREALLOC_TABLE "prealloc_table"
-+
-+static int ext4_mb_prealloc_table_proc_read(char *page, char **start, off_t off,
-+ int count, int *eof, void *data)
-+{
-+ struct ext4_sb_info *sbi = data;
-+ int len = 0;
-+ int i;
-+
-+ *eof = 1;
-+ if (off != 0)
-+ return 0;
-+
-+ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++)
-+ len += sprintf(page + len, "%ld ",
-+ sbi->s_mb_prealloc_table[i]);
-+ len += sprintf(page + len, "\n");
-+
-+ *start = page;
-+ return len;
-+}
-+
-+static int ext4_mb_prealloc_table_proc_write(struct file *file,
-+ const char __user *buf,
-+ unsigned long cnt, void *data)
-+{
-+ struct ext4_sb_info *sbi = data;
-+ unsigned long value;
-+ unsigned long prev = 0;
-+ char str[128];
-+ char *cur;
-+ char *end;
-+ unsigned long *new_table;
-+ int num = 0;
-+ int i = 0;
-+
-+ if (cnt >= sizeof(str))
-+ return -EINVAL;
-+ if (copy_from_user(str, buf, cnt))
-+ return -EFAULT;
-+
-+ num = 0;
-+ cur = str;
-+ end = str + cnt;
-+ while (cur < end) {
-+ while ((cur < end) && (*cur == ' ')) cur++;
-+ value = simple_strtol(cur, &cur, 0);
-+ if (value == 0)
-+ break;
-+ if (value <= prev)
-+ return -EINVAL;
-+ prev = value;
-+ num++;
-+ }
-+
-+ new_table = kmalloc(num * sizeof(*new_table), GFP_KERNEL);
-+ if (new_table == NULL)
-+ return -ENOMEM;
-+ kfree(sbi->s_mb_prealloc_table);
-+ memset(new_table, 0, num * sizeof(*new_table));
-+ sbi->s_mb_prealloc_table = new_table;
-+ sbi->s_mb_prealloc_table_size = num;
-+ cur = str;
-+ end = str + cnt;
-+ while (cur < end && i < num) {
-+ while ((cur < end) && (*cur == ' ')) cur++;
-+ value = simple_strtol(cur, &cur, 0);
-+ ext4_mb_prealloc_table_add(sbi, value);
-+ i++;
-+ }
-+
-+ return cnt;
-+}
-
- static void ext4_mb_history_release(struct super_block *sb)
- {
-@@ -2400,6 +2400,7 @@
- remove_proc_entry("mb_groups", sbi->s_proc);
- if (sbi->s_mb_history_max)
- remove_proc_entry("mb_history", sbi->s_proc);
-+ remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_proc);
- }
- kfree(sbi->s_mb_history);
- }
-@@ -2408,6 +2446,13 @@
- p->proc_fops = &ext4_mb_seq_groups_fops;
- p->data = sb;
- }
-+ p = create_proc_entry(EXT4_MB_PREALLOC_TABLE, S_IFREG |
-+ S_IRUGO | S_IWUSR, sbi->s_proc);
-+ if (p) {
-+ p->data = sbi;
-+ p->read_proc = ext4_mb_prealloc_table_proc_read;
-+ p->write_proc = ext4_mb_prealloc_table_proc_write;
-+ }
- }
-
- sbi->s_mb_history_cur = 0;
-@@ -2542,13 +2562,57 @@
- sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
- sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
- sbi->s_mb_stats = MB_DEFAULT_STATS;
-- sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
- sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
- sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
-- sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
-+
-+ if (sbi->s_stripe == 0) {
-+ sbi->s_mb_prealloc_table_size = 10;
-+ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long);
-+ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS);
-+ if (sbi->s_mb_prealloc_table == NULL) {
-+ kfree(sbi->s_mb_offsets);
-+ kfree(sbi->s_mb_maxs);
-+ return -ENOMEM;
-+ }
-+ memset(sbi->s_mb_prealloc_table, 0, i);
-+
-+ ext4_mb_prealloc_table_add(sbi, 4);
-+ ext4_mb_prealloc_table_add(sbi, 8);
-+ ext4_mb_prealloc_table_add(sbi, 16);
-+ ext4_mb_prealloc_table_add(sbi, 32);
-+ ext4_mb_prealloc_table_add(sbi, 64);
-+ ext4_mb_prealloc_table_add(sbi, 128);
-+ ext4_mb_prealloc_table_add(sbi, 256);
-+ ext4_mb_prealloc_table_add(sbi, 512);
-+ ext4_mb_prealloc_table_add(sbi, 1024);
-+ ext4_mb_prealloc_table_add(sbi, 2048);
-+
-+ sbi->s_mb_small_req = 256;
-+ sbi->s_mb_large_req = 1024;
-+ sbi->s_mb_group_prealloc = 512;
-+ } else {
-+ sbi->s_mb_prealloc_table_size = 3;
-+ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long);
-+ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS);
-+ if (sbi->s_mb_prealloc_table == NULL) {
-+ kfree(sbi->s_mb_offsets);
-+ kfree(sbi->s_mb_maxs);
-+ return -ENOMEM;
-+ }
-+ memset(sbi->s_mb_prealloc_table, 0, i);
-+
-+ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe);
-+ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe * 2);
-+ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe * 4);
-+
-+ sbi->s_mb_small_req = sbi->s_stripe;
-+ sbi->s_mb_large_req = sbi->s_stripe * 8;
-+ sbi->s_mb_group_prealloc = sbi->s_stripe * 4;
-+ }
-
- sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
- if (sbi->s_locality_groups == NULL) {
-+ kfree(sbi->s_mb_prealloc_table);
- kfree(sbi->s_mb_offsets);
- kfree(sbi->s_mb_maxs);
- return -ENOMEM;
-@@ -3032,11 +3186,12 @@
- ext4_mb_normalize_request(struct ext4_allocation_context *ac,
- struct ext4_allocation_request *ar)
- {
-- int bsbits, max;
-+ int bsbits, i, wind;
- ext4_lblk_t end;
-- loff_t size, orig_size, start_off;
-+ loff_t size, orig_size;
- ext4_lblk_t start, orig_start;
- struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
-+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct ext4_prealloc_space *pa;
-
- /* do normalize only data requests, metadata requests
-@@ -3066,49 +3221,35 @@
- size = size << bsbits;
- if (size < i_size_read(ac->ac_inode))
- size = i_size_read(ac->ac_inode);
-+ size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits;
-
-- /* max size of free chunks */
-- max = 2 << bsbits;
-+ start = wind = 0;
-
--#define NRL_CHECK_SIZE(req, size, max, chunk_size) \
-- (req <= (size) || max <= (chunk_size))
-+ /* let's choose preallocation window depending on file size */
-+ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) {
-+ if (size <= sbi->s_mb_prealloc_table[i]) {
-+ wind = sbi->s_mb_prealloc_table[i];
-+ break;
-+ }
-+ }
-+ size = wind;
-
-- /* first, try to predict filesize */
-- /* XXX: should this table be tunable? */
-- start_off = 0;
-- if (size <= 16 * 1024) {
-- size = 16 * 1024;
-- } else if (size <= 32 * 1024) {
-- size = 32 * 1024;
-- } else if (size <= 64 * 1024) {
-- size = 64 * 1024;
-- } else if (size <= 128 * 1024) {
-- size = 128 * 1024;
-- } else if (size <= 256 * 1024) {
-- size = 256 * 1024;
-- } else if (size <= 512 * 1024) {
-- size = 512 * 1024;
-- } else if (size <= 1024 * 1024) {
-- size = 1024 * 1024;
-- } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
-- start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
-- (21 - bsbits)) << 21;
-- size = 2 * 1024 * 1024;
-- } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
-- start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
-- (22 - bsbits)) << 22;
-- size = 4 * 1024 * 1024;
-- } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
-- (8<<20)>>bsbits, max, 8 * 1024)) {
-- start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
-- (23 - bsbits)) << 23;
-- size = 8 * 1024 * 1024;
-- } else {
-- start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
-- size = ac->ac_o_ex.fe_len << bsbits;
-+ if (wind == 0) {
-+ __u64 tstart, tend;
-+ /* file is quite large, we now preallocate with
-+ * the biggest configured window with regart to
-+ * logical offset */
-+ wind = sbi->s_mb_prealloc_table[i - 1];
-+ tstart = ac->ac_o_ex.fe_logical;
-+ do_div(tstart, wind);
-+ start = tstart * wind;
-+ tend = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len - 1;
-+ do_div(tend, wind);
-+ tend = tend * wind + wind;
-+ size = tend - start;
- }
-- orig_size = size = size >> bsbits;
-- orig_start = start = start_off >> bsbits;
-+ orig_size = size;
-+ orig_start = start;
-
- /* don't cover already allocated blocks in selected range */
- if (ar->pleft && start <= ar->lleft) {
-@@ -3185,7 +3326,6 @@
- }
- BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
- start > ac->ac_o_ex.fe_logical);
-- BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
-
- /* now prepare goal request */
-
-@@ -4077,11 +4217,17 @@
-
- /* don't use group allocation for large files */
- size = max(size, isize);
-- if (size > sbi->s_mb_stream_request) {
-+ if ((ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) ||
-+ (size >= sbi->s_mb_large_req)) {
- ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
- return;
- }
-
-+ /* request is so large that we don't care about
-+ * streaming - it overweights any possible seek */
-+ if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req)
-+ return;
-+
- BUG_ON(ac->ac_lg != NULL);
- /*
- * locality group prealloc space are per cpu. The reason for having
-Index: linux-2.6.27.21-0.1/fs/ext4/inode.c
-===================================================================
---- linux-2.6.27.21-0.1.orig/fs/ext4/inode.c 2009-05-28 11:12:42.000000000 +0530
-+++ linux-2.6.27.21-0.1/fs/ext4/inode.c 2009-05-28 11:16:48.000000000 +0530
-@@ -2442,14 +2442,14 @@
- return -EROFS;
-
- /*
-- * Make sure nr_to_write is >= sbi->s_mb_stream_request
-+ * Make sure nr_to_write is >= sbi->s_mb_small_req
- * This make sure small files blocks are allocated in
- * single attempt. This ensure that small files
- * get less fragmented.
- */
-- if (wbc->nr_to_write < sbi->s_mb_stream_request) {
-- nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
-- wbc->nr_to_write = sbi->s_mb_stream_request;
-+ if (wbc->nr_to_write < sbi->s_mb_small_req) {
-+ nr_to_writebump = sbi->s_mb_small_req - wbc->nr_to_write;
-+ wbc->nr_to_write = sbi->s_mb_small_req;
- }
- if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
- range_whole = 1;
+++ /dev/null
-Index: linux-2.6.18.i386/fs/ext4/namei.c
-===================================================================
---- linux-2.6.18.i386.orig/fs/ext4/namei.c
-+++ linux-2.6.18.i386/fs/ext4/namei.c
-@@ -374,8 +374,9 @@ dx_probe(struct dentry *dentry, struct i
- if (root->info.hash_version != DX_HASH_TEA &&
- root->info.hash_version != DX_HASH_HALF_MD4 &&
- root->info.hash_version != DX_HASH_LEGACY) {
-- ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
-- root->info.hash_version);
-+ ext4_warning(dir->i_sb, "Unrecognised inode hash code %d"
-+ "for directory #%lu",
-+ root->info.hash_version, dir->i_ino);
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
- goto fail;
+++ /dev/null
-Index: linux-2.6.18-238.12.1/fs/ext4/ext4.h
-===================================================================
---- linux-2.6.18-238.12.1.orig/fs/ext4/ext4.h 2011-09-21 17:55:44.627741549 +0200
-+++ linux-2.6.18-238.12.1/fs/ext4/ext4.h 2011-09-21 18:05:20.974106450 +0200
-@@ -971,6 +971,7 @@
- #ifdef CONFIG_QUOTA
- char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
- int s_jquota_fmt; /* Format of quota to use */
-+ unsigned long s_qf_inums[MAXQUOTAS]; /* Quota file inodes */
- #endif
- unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
- struct rb_root system_blks;
-@@ -1171,6 +1172,7 @@
- #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010
- #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020
- #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
-+#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100
-
- #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
- #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
+++ /dev/null
-Index: linux-2.6.18-128.1.6/fs/ext4/super.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/ext4/super.c 2009-07-24 01:33:54.000000000 -0400
-+++ linux-2.6.18-128.1.6/fs/ext4/super.c 2009-07-24 01:35:28.000000000 -0400
-@@ -3461,6 +3461,8 @@ static int __init init_ext4_fs(void)
- goto out;
- }
- #endif
-+
-+ printk(KERN_INFO "ldiskfs created from ""ext""4-2.6-rhel5\n");
- return 0;
- out:
- destroy_inodecache();
---- /dev/null 2009-09-21 17:11:24.467285554 +0800
-+++ linux-2.6.27.21-0.1/fs/ext4/fiemap.h
-@@ -0,0 +1,2 @@
-+
-+#include_next <fiemap.h>
+++ /dev/null
-Index: linux-stage/fs/ext4/super.c
-===================================================================
---- linux-stage.orig/fs/ext4/super.c
-+++ linux-stage/fs/ext4/super.c
-@@ -662,7 +662,12 @@ static void ext4_put_super(struct super_
-
- for (i = 0; i < sbi->s_gdb_count; i++)
- brelse(sbi->s_group_desc[i]);
-- kfree(sbi->s_group_desc);
-+
-+ if (is_vmalloc_addr(sbi->s_group_desc))
-+ vfree(sbi->s_group_desc);
-+ else
-+ kfree(sbi->s_group_desc);
-+
- if (is_vmalloc_addr(sbi->s_flex_groups))
- vfree(sbi->s_flex_groups);
- else
-@@ -2402,12 +2407,13 @@ static int ext4_fill_super(struct super_
- unsigned long offset = 0;
- unsigned long journal_devnum = 0;
- unsigned long def_mount_opts;
-- struct inode *root;
-+ struct inode *root = NULL;
- char *cp;
- const char *descr;
- int ret = -EINVAL;
- int blocksize;
- unsigned int db_count;
-+ size_t size;
- unsigned int i;
- int needs_recovery, has_huge_files;
- __u64 blocks_count;
-@@ -2718,10 +2724,16 @@ static int ext4_fill_super(struct super_
- (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
- db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
- EXT4_DESC_PER_BLOCK(sb);
-- sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
-- GFP_KERNEL);
-+ size = (size_t) db_count * sizeof(struct buffer_head *);
-+ sbi->s_group_desc = kzalloc(size, GFP_KERNEL);
-+ if (sbi->s_group_desc == NULL) {
-+ sbi->s_group_desc = vmalloc(size);
-+ if (sbi->s_group_desc != NULL)
-+ memset(sbi->s_group_desc, 0, size);
-+ }
- if (sbi->s_group_desc == NULL) {
-- ext4_msg(sb, KERN_ERR, "not enough memory");
-+ ext4_msg(sb, KERN_ERR, "not enough memory for %u groups (%u)\n",
-+ sbi->s_groups_count, (unsigned int) size);
- goto failed_mount;
- }
-
-@@ -2907,17 +2919,16 @@ no_journal:
- if (IS_ERR(root)) {
- ext4_msg(sb, KERN_ERR, "get root inode failed");
- ret = PTR_ERR(root);
-+ root = NULL;
- goto failed_mount4;
- }
- if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
-- iput(root);
- ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
- goto failed_mount4;
- }
- sb->s_root = d_alloc_root(root);
- if (!sb->s_root) {
- ext4_msg(sb, KERN_ERR, "get root dentry failed");
-- iput(root);
- ret = -ENOMEM;
- goto failed_mount4;
- }
-@@ -2968,6 +2979,7 @@ no_journal:
- if (err) {
- ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)",
- err);
-+ ret = err;
- goto failed_mount4;
- }
-
-@@ -3011,6 +3023,8 @@ cantfind_ext4:
- goto failed_mount;
-
- failed_mount4:
-+ iput(root);
-+ sb->s_root = NULL;
- ext4_msg(sb, KERN_ERR, "mount failed");
- destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
- failed_mount_wq:
-@@ -3033,7 +3047,11 @@ failed_mount3:
- failed_mount2:
- for (i = 0; i < db_count; i++)
- brelse(sbi->s_group_desc[i]);
-- kfree(sbi->s_group_desc);
-+
-+ if (is_vmalloc_addr(sbi->s_group_desc))
-+ vfree(sbi->s_group_desc);
-+ else
-+ kfree(sbi->s_group_desc);
- failed_mount:
- if (sbi->s_proc) {
- remove_proc_entry(sb->s_id, ext4_proc_root);
-Index: linux-stage/fs/ext4/mballoc.c
-===================================================================
---- linux-stage.orig/fs/ext4/mballoc.c
-+++ linux-stage/fs/ext4/mballoc.c
-@@ -2607,10 +2607,21 @@ static int ext4_mb_init_backend(struct s
- while (array_size < sizeof(*sbi->s_group_info) *
- num_meta_group_infos_max)
- array_size = array_size << 1;
-- /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
-- * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
-- * So a two level scheme suffices for now. */
-- sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
-+
-+ /*
-+ * A 16TB filesystem with 64-bit pointers requires an 8192 byte
-+ * kmalloc(). Filesystems larger than 2^32 blocks (16TB normally)
-+ * have group descriptors at least twice as large (64 bytes or
-+ * more vs. 32 bytes for traditional ext3 filesystems, so a 128TB
-+ * filesystem needs a 128kB allocation, which may need vmalloc().
-+ */
-+ sbi->s_group_info = kzalloc(array_size, GFP_KERNEL);
-+ if (sbi->s_group_info == NULL) {
-+ sbi->s_group_info = vmalloc(array_size);
-+ if (sbi->s_group_info != NULL)
-+ memset(sbi->s_group_info, 0, array_size);
-+ }
-+
- if (sbi->s_group_info == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
- return -ENOMEM;
-@@ -2620,6 +2631,11 @@ static int ext4_mb_init_backend(struct s
- printk(KERN_ERR "EXT4-fs: can't get new inode\n");
- goto err_freesgi;
- }
-+ /*
-+ * To avoid colliding with an valid on-disk inode number,
-+ * EXT4_BAD_INO is used here as the number of the buddy cache inode.
-+ */
-+ sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
- EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
- for (i = 0; i < ngroups; i++) {
- desc = ext4_get_group_desc(sb, i, NULL);
-@@ -2642,7 +2658,10 @@ err_freebuddy:
- kfree(sbi->s_group_info[i]);
- iput(sbi->s_buddy_cache);
- err_freesgi:
-- kfree(sbi->s_group_info);
-+ if (is_vmalloc_addr(sbi->s_group_info))
-+ vfree(sbi->s_group_info);
-+ else
-+ kfree(sbi->s_group_info);
- return -ENOMEM;
- }
-
-@@ -2683,14 +2702,6 @@ int ext4_mb_init(struct super_block *sb,
- i++;
- } while (i <= sb->s_blocksize_bits + 1);
-
-- /* init file for buddy data */
-- ret = ext4_mb_init_backend(sb);
-- if (ret != 0) {
-- kfree(sbi->s_mb_offsets);
-- kfree(sbi->s_mb_maxs);
-- return ret;
-- }
--
- spin_lock_init(&sbi->s_md_lock);
- spin_lock_init(&sbi->s_bal_lock);
-
-@@ -2717,6 +2728,14 @@ int ext4_mb_init(struct super_block *sb,
- spin_lock_init(&lg->lg_prealloc_lock);
- }
-
-+ /* init file for buddy data */
-+ ret = ext4_mb_init_backend(sb);
-+ if (ret != 0) {
-+ kfree(sbi->s_mb_offsets);
-+ kfree(sbi->s_mb_maxs);
-+ return ret;
-+ }
-+
- ext4_mb_history_init(sb);
-
- if (sbi->s_journal)
-@@ -2766,7 +2785,10 @@ int ext4_mb_release(struct super_block *
- EXT4_DESC_PER_BLOCK_BITS(sb);
- for (i = 0; i < num_meta_group_infos; i++)
- kfree(sbi->s_group_info[i]);
-- kfree(sbi->s_group_info);
-+ if (is_vmalloc_addr(sbi->s_group_info))
-+ vfree(sbi->s_group_info);
-+ else
-+ kfree(sbi->s_group_info);
- }
- kfree(sbi->s_mb_offsets);
- kfree(sbi->s_mb_maxs);
+++ /dev/null
-Index: linux-2.6.18-194.3.1/fs/ext4/namei.c
-===================================================================
---- linux-2.6.18-194.3.1.orig/fs/ext4/namei.c
-+++ linux-2.6.18-194.3.1/fs/ext4/namei.c
-@@ -148,6 +148,17 @@ struct dx_map_entry
- u16 size;
- };
-
-+/*
-+ * dentry_param used by ext4_new_inode_wantedi()
-+ */
-+#define LVFS_DENTRY_PARAM_MAGIC 20070216UL
-+struct lvfs_dentry_params
-+{
-+ unsigned long ldp_inum;
-+ unsigned long ldp_flags;
-+ u32 ldp_magic;
-+};
-+
- static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
- static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
- static inline unsigned dx_get_hash(struct dx_entry *entry);
-@@ -1761,6 +1772,19 @@ static int ext4_add_nondir(handle_t *han
- return err;
- }
-
-+static unsigned ext4_dentry_goal(struct super_block *sb, struct dentry *dentry)
-+{
-+ unsigned inum = EXT4_SB(sb)->s_inode_goal;
-+
-+ if (dentry->d_fsdata != NULL) {
-+ struct lvfs_dentry_params *param = dentry->d_fsdata;
-+
-+ if (param->ldp_magic == LVFS_DENTRY_PARAM_MAGIC)
-+ inum = param->ldp_inum;
-+ }
-+ return inum;
-+}
-+
- /*
- * By the time this is called, we already have created
- * the directory cache entry for the new file, but it
-@@ -1786,7 +1810,8 @@ retry:
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
-- inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0);
-+ inode = ext4_new_inode(handle, dir, mode, &dentry->d_name,
-+ ext4_dentry_goal(dir->i_sb, dentry));
- err = PTR_ERR(inode);
- if (!IS_ERR(inode)) {
- inode->i_op = &ext4_file_inode_operations;
-@@ -1820,7 +1845,8 @@ retry:
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
-- inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0);
-+ inode = ext4_new_inode(handle, dir, mode, &dentry->d_name,
-+ ext4_dentry_goal(dir->i_sb, dentry));
- err = PTR_ERR(inode);
- if (!IS_ERR(inode)) {
- init_special_inode(inode, inode->i_mode, rdev);
-@@ -1857,8 +1883,8 @@ retry:
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
-- inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
-- &dentry->d_name, 0);
-+ inode = ext4_new_inode(handle, dir, S_IFDIR | mode, &dentry->d_name,
-+ ext4_dentry_goal(dir->i_sb, dentry));
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out_stop;
-@@ -2270,8 +2296,8 @@ retry:
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
-- inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
-- &dentry->d_name, 0);
-+ inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name,
-+ ext4_dentry_goal(dir->i_sb, dentry));
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out_stop;
+++ /dev/null
-this patch implements feature which allows ext4 fs users (e.g. Lustre)
-to store data in ext4 dirent.
-data is stored in ext4 dirent after file-name, this space is accounted
-in de->rec_len. flag EXT4_DIRENT_LUFID added to d_type if extra data
-is present.
-
-make use of dentry->d_fsdata to pass fid to ext4. so no
-changes in ext4_add_entry() interface required.
-
-Index: linux-stage/fs/ext4/dir.c
-===================================================================
---- linux-stage.orig/fs/ext4/dir.c
-+++ linux-stage/fs/ext4/dir.c
-@@ -53,11 +53,18 @@ const struct file_operations ext4_dir_op
-
- static unsigned char get_dtype(struct super_block *sb, int filetype)
- {
-+ int fl_index = filetype & EXT4_FT_MASK;
-+
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
-- (filetype >= EXT4_FT_MAX))
-+ (fl_index >= EXT4_FT_MAX))
- return DT_UNKNOWN;
-
-- return (ext4_filetype_table[filetype]);
-+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_DIRDATA))
-+ return (ext4_filetype_table[fl_index]);
-+
-+ return (ext4_filetype_table[fl_index]) |
-+ (filetype & EXT4_DIRENT_LUFID);
-+
- }
-
-
-@@ -70,11 +77,11 @@ int ext4_check_dir_entry(const char *fun
- const int rlen = ext4_rec_len_from_disk(de->rec_len,
- dir->i_sb->s_blocksize);
-
-- if (rlen < EXT4_DIR_REC_LEN(1))
-+ if (rlen < __EXT4_DIR_REC_LEN(1))
- error_msg = "rec_len is smaller than minimal";
- else if (rlen % 4 != 0)
- error_msg = "rec_len % 4 != 0";
-- else if (rlen < EXT4_DIR_REC_LEN(de->name_len))
-+ else if (rlen < EXT4_DIR_REC_LEN(de))
- error_msg = "rec_len is too small for name_len";
- else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
- error_msg = "directory entry across blocks";
-@@ -179,7 +186,7 @@ revalidate:
- * failure will be detected in the
- * dirent test below. */
- if (ext4_rec_len_from_disk(de->rec_len,
-- sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
-+ sb->s_blocksize) < __EXT4_DIR_REC_LEN(1))
- break;
- i += ext4_rec_len_from_disk(de->rec_len,
- sb->s_blocksize);
-@@ -342,12 +349,17 @@ int ext4_htree_store_dirent(struct file
- struct fname *fname, *new_fn;
- struct dir_private_info *info;
- int len;
-+ int extra_data = 1;
-
- info = (struct dir_private_info *) dir_file->private_data;
- p = &info->root.rb_node;
-
- /* Create and allocate the fname structure */
-- len = sizeof(struct fname) + dirent->name_len + 1;
-+ if (dirent->file_type & EXT4_DIRENT_LUFID)
-+ extra_data = ext4_get_dirent_data_len(dirent);
-+
-+ len = sizeof(struct fname) + dirent->name_len + extra_data;
-+
- new_fn = kzalloc(len, GFP_KERNEL);
- if (!new_fn)
- return -ENOMEM;
-@@ -356,7 +368,7 @@ int ext4_htree_store_dirent(struct file
- new_fn->inode = le32_to_cpu(dirent->inode);
- new_fn->name_len = dirent->name_len;
- new_fn->file_type = dirent->file_type;
-- memcpy(new_fn->name, dirent->name, dirent->name_len);
-+ memcpy(new_fn->name, dirent->name, dirent->name_len + extra_data);
- new_fn->name[dirent->name_len] = 0;
-
- while (*p) {
-Index: linux-stage/fs/ext4/ext4.h
-===================================================================
---- linux-stage.orig/fs/ext4/ext4.h
-+++ linux-stage/fs/ext4/ext4.h
-@@ -1172,6 +1172,7 @@ static inline void ext4_clear_inode_stat
- #define EXT4_FEATURE_INCOMPAT_64BIT 0x0080
- #define EXT4_FEATURE_INCOMPAT_MMP 0x0100
- #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200
-+#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000
-
- #define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
- #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
-@@ -1180,7 +1181,9 @@ static inline void ext4_clear_inode_stat
- EXT4_FEATURE_INCOMPAT_EXTENTS| \
- EXT4_FEATURE_INCOMPAT_64BIT| \
- EXT4_FEATURE_INCOMPAT_FLEX_BG| \
-- EXT4_FEATURE_INCOMPAT_MMP)
-+ EXT4_FEATURE_INCOMPAT_MMP| \
-+ EXT4_FEATURE_INCOMPAT_DIRDATA)
-+
- #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
- EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
-@@ -1262,6 +1265,43 @@ struct ext4_dir_entry_2 {
- #define EXT4_FT_SYMLINK 7
-
- #define EXT4_FT_MAX 8
-+#define EXT4_FT_MASK 0xf
-+
-+#if EXT4_FT_MAX > EXT4_FT_MASK
-+#error "conflicting EXT4_FT_MAX and EXT4_FT_MASK"
-+#endif
-+
-+/*
-+ * d_type has 4 unused bits, so it can hold four types data. these different
-+ * type of data (e.g. lustre data, high 32 bits of 64-bit inode number) can be
-+ * stored, in flag order, after file-name in ext4 dirent.
-+*/
-+/*
-+ * this flag is added to d_type if ext4 dirent has extra data after
-+ * filename. this data length is variable and length is stored in first byte
-+ * of data. data start after filename NUL byte.
-+ * This is used by Lustre FS.
-+ */
-+#define EXT4_DIRENT_LUFID 0x10
-+
-+#define EXT4_LUFID_MAGIC 0xAD200907UL
-+struct ext4_dentry_param {
-+ __u32 edp_magic; /* EXT4_LUFID_MAGIC */
-+ char edp_len; /* size of edp_data in bytes */
-+ char edp_data[0]; /* packed array of data */
-+} __attribute__((packed));
-+
-+static inline unsigned char *ext4_dentry_get_data(struct super_block *sb,
-+ struct ext4_dentry_param* p)
-+
-+{
-+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_DIRDATA))
-+ return NULL;
-+ if (p && p->edp_magic == EXT4_LUFID_MAGIC)
-+ return &p->edp_len;
-+ else
-+ return NULL;
-+}
-
- /*
- * EXT4_DIR_PAD defines the directory entries boundaries
-@@ -1270,8 +1310,11 @@ struct ext4_dir_entry_2 {
- */
- #define EXT4_DIR_PAD 4
- #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1)
--#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \
-+#define __EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \
- ~EXT4_DIR_ROUND)
-+#define EXT4_DIR_REC_LEN(de) (__EXT4_DIR_REC_LEN(de->name_len +\
-+ ext4_get_dirent_data_len(de)))
-+
- #define EXT4_MAX_REC_LEN ((1<<16)-1)
-
- static inline unsigned int
-@@ -1611,7 +1654,7 @@ extern struct buffer_head * ext4_find_en
- struct ext4_dir_entry_2 ** res_dir);
- #define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir)
- extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
-- struct inode *inode);
-+ struct inode *inode, const void *, const void *);
- extern int ext4_orphan_add(handle_t *, struct inode *);
- extern int ext4_orphan_del(handle_t *, struct inode *);
- extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
-@@ -1809,6 +1852,28 @@ static inline void ext4_update_i_disksiz
- up_write(&EXT4_I(inode)->i_data_sem);
- return ;
- }
-+/*
-+ * Compute the total directory entry data length.
-+ * This includes the filename and an implicit NUL terminator (always present),
-+ * and optional extensions. Each extension has a bit set in the high 4 bits of
-+ * de->file_type, and the extension length is the first byte in each entry.
-+ */
-+
-+static inline int ext4_get_dirent_data_len(struct ext4_dir_entry_2 *de)
-+{
-+ char *len = de->name + de->name_len + 1 /* NUL terminator */;
-+ int dlen = 0;
-+ __u8 extra_data_flags = (de->file_type & ~EXT4_FT_MASK) >> 4;
-+
-+ while (extra_data_flags) {
-+ if (extra_data_flags & 1) {
-+ dlen += *len + (dlen == 0);
-+ len += *len;
-+ }
-+ extra_data_flags >>= 1;
-+ }
-+ return dlen;
-+}
-
- struct ext4_group_info {
- unsigned long bb_state;
-Index: linux-stage/fs/ext4/namei.c
-===================================================================
---- linux-stage.orig/fs/ext4/namei.c
-+++ linux-stage/fs/ext4/namei.c
-@@ -173,7 +173,8 @@ static unsigned dx_get_count(struct dx_e
- static unsigned dx_get_limit(struct dx_entry *entries);
- static void dx_set_count(struct dx_entry *entries, unsigned value);
- static void dx_set_limit(struct dx_entry *entries, unsigned value);
--static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
-+static inline unsigned dx_root_limit(__u32 blocksize,
-+ struct ext4_dir_entry_2 *dot_de, unsigned infosize);
- static unsigned dx_node_limit(struct inode *dir);
- static struct dx_frame *dx_probe(const struct qstr *d_name,
- struct inode *dir,
-@@ -216,11 +217,12 @@ ext4_next_entry(struct ext4_dir_entry_2
- */
- struct dx_root_info * dx_get_dx_info(struct ext4_dir_entry_2 *de)
- {
-- /* get dotdot first */
-- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(1));
-+ BUG_ON(de->name_len != 1);
-+ /* get dotdot first */
-+ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(de));
-
-- /* dx root info is after dotdot entry */
-- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(2));
-+ /* dx root info is after dotdot entry */
-+ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(de));
-
- return (struct dx_root_info *) de;
- }
-@@ -265,16 +267,23 @@ static inline void dx_set_limit(struct d
- ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
- }
-
--static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
-+static inline unsigned dx_root_limit(__u32 blocksize,
-+ struct ext4_dir_entry_2 *dot_de, unsigned infosize)
- {
-- unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
-- EXT4_DIR_REC_LEN(2) - infosize;
-+ struct ext4_dir_entry_2 *dotdot_de;
-+ unsigned entry_space;
-+
-+ BUG_ON(dot_de->name_len != 1);
-+ dotdot_de = ext4_next_entry(dot_de, blocksize);
-+ entry_space = blocksize - EXT4_DIR_REC_LEN(dot_de) -
-+ EXT4_DIR_REC_LEN(dotdot_de) - infosize;
-+
- return entry_space / sizeof(struct dx_entry);
- }
-
- static inline unsigned dx_node_limit(struct inode *dir)
- {
-- unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
-+ unsigned entry_space = dir->i_sb->s_blocksize - __EXT4_DIR_REC_LEN(0);
- return entry_space / sizeof(struct dx_entry);
- }
-
-@@ -321,7 +330,7 @@ static struct stats dx_show_leaf(struct
- printk(":%x.%u ", h.hash,
- ((char *) de - base));
- }
-- space += EXT4_DIR_REC_LEN(de->name_len);
-+ space += EXT4_DIR_REC_LEN(de);
- names++;
- }
- de = ext4_next_entry(de, size);
-@@ -424,7 +433,8 @@ dx_probe(const struct qstr *d_name, stru
-
- entries = (struct dx_entry *) (((char *)info) + info->info_length);
-
-- if (dx_get_limit(entries) != dx_root_limit(dir,
-+ if (dx_get_limit(entries) != dx_root_limit(dir->i_sb->s_blocksize,
-+ (struct ext4_dir_entry_2*)bh->b_data,
- info->info_length)) {
- ext4_warning(dir->i_sb, "dx entry: limit != root limit");
- brelse(bh);
-@@ -480,14 +490,17 @@ dx_probe(const struct qstr *d_name, stru
- if (!indirect--) return frame;
- if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
- goto fail2;
-- at = entries = ((struct dx_node *) bh->b_data)->entries;
-+ entries = ((struct dx_node *) bh->b_data)->entries;
- if (dx_get_limit(entries) != dx_node_limit (dir)) {
- ext4_warning(dir->i_sb,
-- "dx entry: limit != node limit");
-+ "block %u(%lu): limit %u != node limit %u",
-+ dx_get_block(at), (long)bh->b_blocknr,
-+ dx_get_limit(entries), dx_node_limit(dir));
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
- goto fail2;
- }
-+ at = entries;
- frame++;
- frame->bh = NULL;
- }
-@@ -613,7 +626,7 @@ static int htree_dirblock_to_tree(struct
- de = (struct ext4_dir_entry_2 *) bh->b_data;
- top = (struct ext4_dir_entry_2 *) ((char *) de +
- dir->i_sb->s_blocksize -
-- EXT4_DIR_REC_LEN(0));
-+ __EXT4_DIR_REC_LEN(0));
- for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
- if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
- (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
-@@ -1025,7 +1038,7 @@ static struct buffer_head * ext4_dx_find
- goto errout;
- de = (struct ext4_dir_entry_2 *) bh->b_data;
- top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
-- EXT4_DIR_REC_LEN(0));
-+ __EXT4_DIR_REC_LEN(0));
- for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
- int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
- + ((char *) de - bh->b_data);
-@@ -1186,7 +1199,7 @@ dx_move_dirents(char *from, char *to, st
- while (count--) {
- struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
- (from + (map->offs<<2));
-- rec_len = EXT4_DIR_REC_LEN(de->name_len);
-+ rec_len = EXT4_DIR_REC_LEN(de);
- memcpy (to, de, rec_len);
- ((struct ext4_dir_entry_2 *) to)->rec_len =
- ext4_rec_len_to_disk(rec_len, blocksize);
-@@ -1210,7 +1223,7 @@ static struct ext4_dir_entry_2* dx_pack_
- while ((char*)de < base + blocksize) {
- next = ext4_next_entry(de, blocksize);
- if (de->inode && de->name_len) {
-- rec_len = EXT4_DIR_REC_LEN(de->name_len);
-+ rec_len = EXT4_DIR_REC_LEN(de);
- if (de > to)
- memmove(to, de, rec_len);
- to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
-@@ -1340,10 +1353,16 @@ static int add_dirent_to_buf(handle_t *h
- unsigned int offset = 0;
- unsigned int blocksize = dir->i_sb->s_blocksize;
- unsigned short reclen;
-- int nlen, rlen, err;
-+ int nlen, rlen, err, dlen = 0;
-+ unsigned char *data;
- char *top;
-
-- reclen = EXT4_DIR_REC_LEN(namelen);
-+ data = ext4_dentry_get_data(inode->i_sb, (struct ext4_dentry_param *)
-+ dentry->d_fsdata);
-+ if (data)
-+ dlen = (*data) + 1;
-+
-+ reclen = __EXT4_DIR_REC_LEN(namelen + dlen);
- if (!de) {
- de = (struct ext4_dir_entry_2 *)bh->b_data;
- top = bh->b_data + blocksize - reclen;
-@@ -1353,7 +1372,7 @@ static int add_dirent_to_buf(handle_t *h
- return -EIO;
- if (ext4_match(namelen, name, de))
- return -EEXIST;
-- nlen = EXT4_DIR_REC_LEN(de->name_len);
-+ nlen = EXT4_DIR_REC_LEN(de);
- rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
- if ((de->inode? rlen - nlen: rlen) >= reclen)
- break;
-@@ -1371,7 +1390,7 @@ static int add_dirent_to_buf(handle_t *h
- }
-
- /* By now the buffer is marked for journaling */
-- nlen = EXT4_DIR_REC_LEN(de->name_len);
-+ nlen = EXT4_DIR_REC_LEN(de);
- rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
- if (de->inode) {
- struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
-@@ -1387,6 +1406,12 @@ static int add_dirent_to_buf(handle_t *h
- de->inode = 0;
- de->name_len = namelen;
- memcpy(de->name, name, namelen);
-+ if (data) {
-+ de->name[namelen] = 0;
-+ memcpy(&de->name[namelen + 1], data, *(char *) data);
-+ de->file_type |= EXT4_DIRENT_LUFID;
-+ }
-+
- /*
- * XXX shouldn't update any times until successful
- * completion of syscall, but too many callers depend
-@@ -1485,7 +1510,8 @@ static int make_indexed_dir(handle_t *ha
-
- dx_set_block(entries, 1);
- dx_set_count(entries, 1);
-- dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info)));
-+ dx_set_limit(entries, dx_root_limit(dir->i_sb->s_blocksize,
-+ dot_de, sizeof(*dx_info)));
-
- /* Initialize as for dx_probe */
- hinfo.hash_version = dx_info->hash_version;
-@@ -1516,6 +1542,8 @@ static int ext4_update_dotdot(handle_t *
- struct buffer_head * dir_block;
- struct ext4_dir_entry_2 * de;
- int len, journal = 0, err = 0;
-+ int dlen = 0;
-+ char *data;
-
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-@@ -1531,19 +1559,24 @@ static int ext4_update_dotdot(handle_t *
- /* the first item must be "." */
- assert(de->name_len == 1 && de->name[0] == '.');
- len = le16_to_cpu(de->rec_len);
-- assert(len >= EXT4_DIR_REC_LEN(1));
-- if (len > EXT4_DIR_REC_LEN(1)) {
-+ assert(len >= __EXT4_DIR_REC_LEN(1));
-+ if (len > __EXT4_DIR_REC_LEN(1)) {
- BUFFER_TRACE(dir_block, "get_write_access");
- err = ext4_journal_get_write_access(handle, dir_block);
- if (err)
- goto out_journal;
-
- journal = 1;
-- de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(1));
-+ de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de));
- }
-
-- len -= EXT4_DIR_REC_LEN(1);
-- assert(len == 0 || len >= EXT4_DIR_REC_LEN(2));
-+ len -= EXT4_DIR_REC_LEN(de);
-+ data = ext4_dentry_get_data(dir->i_sb,
-+ (struct ext4_dentry_param *) dentry->d_fsdata);
-+ if (data)
-+ dlen = *data + 1;
-+ assert(len == 0 || len >= __EXT4_DIR_REC_LEN(2 + dlen));
-+
- de = (struct ext4_dir_entry_2 *)
- ((char *) de + le16_to_cpu(de->rec_len));
- if (!journal) {
-@@ -1557,10 +1590,15 @@ static int ext4_update_dotdot(handle_t *
- if (len > 0)
- de->rec_len = cpu_to_le16(len);
- else
-- assert(le16_to_cpu(de->rec_len) >= EXT4_DIR_REC_LEN(2));
-+ assert(le16_to_cpu(de->rec_len) >= __EXT4_DIR_REC_LEN(2));
- de->name_len = 2;
- strcpy (de->name, "..");
- ext4_set_de_type(dir->i_sb, de, S_IFDIR);
-+ if (data) {
-+ de->name[2] = 0;
-+ memcpy(&de->name[2 + 1], data, dlen);
-+ de->file_type |= EXT4_DIRENT_LUFID;
-+ }
-
- out_journal:
- if (journal) {
-@@ -1982,12 +2020,13 @@ retry:
- /* Initialize @inode as a subdirectory of @dir, and add the
- * "." and ".." entries into the first directory block. */
- int ext4_add_dot_dotdot(handle_t *handle, struct inode * dir,
-- struct inode *inode)
-+ struct inode *inode,
-+ const void *data1, const void *data2)
- {
- unsigned int blocksize = dir->i_sb->s_blocksize;
- struct buffer_head * dir_block;
- struct ext4_dir_entry_2 * de;
-- int err = 0;
-+ int err = 0, dot_reclen;
-
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-@@ -1999,28 +2038,42 @@ int ext4_add_dot_dotdot(handle_t *handle
- inode->i_fop = &ext4_dir_operations;
- inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
- dir_block = ext4_bread(handle, inode, 0, 1, &err);
-- if (!dir_block) {
-- clear_nlink(inode);
-- ext4_mark_inode_dirty(handle, inode);
-- iput (inode);
-+ if (!dir_block)
- goto get_out;
-- }
-+
- BUFFER_TRACE(dir_block, "get_write_access");
- ext4_journal_get_write_access(handle, dir_block);
- de = (struct ext4_dir_entry_2 *) dir_block->b_data;
- de->inode = cpu_to_le32(inode->i_ino);
- de->name_len = 1;
-- de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
-- blocksize);
- strcpy(de->name, ".");
- ext4_set_de_type(dir->i_sb, de, S_IFDIR);
-+ /* get packed fid data*/
-+ data1 = ext4_dentry_get_data(dir->i_sb,
-+ (struct ext4_dentry_param *) data1);
-+ if (data1) {
-+ de->name[1] = 0;
-+ memcpy(&de->name[2], data1, *(char *) data1);
-+ de->file_type |= EXT4_DIRENT_LUFID;
-+ }
-+ de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de));
-+ dot_reclen = cpu_to_le16(de->rec_len);
-+
- de = ext4_next_entry(de, blocksize);
- de->inode = cpu_to_le32(dir->i_ino);
-- de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1),
-+ de->rec_len = ext4_rec_len_to_disk(blocksize - dot_reclen,
- blocksize);
- de->name_len = 2;
- strcpy(de->name, "..");
- ext4_set_de_type(dir->i_sb, de, S_IFDIR);
-+ data2 = ext4_dentry_get_data(dir->i_sb,
-+ (struct ext4_dentry_param *) data2);
-+ if (data2) {
-+ de->name[2] = 0;
-+ memcpy(&de->name[3], data2, *(char *) data2);
-+ de->file_type |= EXT4_DIRENT_LUFID;
-+ }
-+
- inode->i_nlink = 2;
- BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
- ext4_handle_dirty_metadata(handle, dir, dir_block);
-@@ -2057,9 +2110,14 @@ retry:
- if (IS_ERR(inode))
- goto out_stop;
-
-- err = ext4_add_dot_dotdot(handle, dir, inode);
-- if (err)
-+ err = ext4_add_dot_dotdot(handle, dir, inode, NULL, NULL);
-+ if (err) {
-+ clear_nlink(inode);
-+ unlock_new_inode(inode);
-+ ext4_mark_inode_dirty(handle, inode);
-+ iput (inode);
- goto out_stop;
-+ }
-
- err = ext4_add_entry(handle, dentry, inode);
- if (err) {
-@@ -2093,7 +2151,7 @@ static int empty_dir(struct inode *inode
- int err = 0;
-
- sb = inode->i_sb;
-- if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
-+ if (inode->i_size < __EXT4_DIR_REC_LEN(1) + __EXT4_DIR_REC_LEN(2) ||
- !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
- if (err)
- ext4_error(inode->i_sb,
+++ /dev/null
-ext4-version-2.6-rhel5.patch
-ext4-wantedi-2.6-rhel5.patch
-ext4-map_inode_page-2.6.18-rhel5.patch
-export-ext4-2.6-rhel5.patch
-ext4-remove-cond_resched-calls-rhel5.patch
-ext4-nlink-2.6-rhel5.patch
-ext4-inode-version-rhel5.patch
-ext4-mmp-rhel5.patch
-ext4-lookup-dotdot-rhel5.patch
-ext4-max-dir-size-rhel5.patch
-ext4-print-inum-in-htree-warning-rhel5.patch
-ext4-xattr-no-update-ctime-rhel5.patch
-ext4-prealloc-rhel5.patch
-ext4-mballoc-extra-checks-rhel5.patch
-ext4-misc-rhel5.patch
-ext4-big-endian-check-2.6-rhel5.patch
-ext4-alloc-policy-2.6-rhel5.patch
-ext4-force_over_128tb-rhel5.patch
-ext4-pdir-fix.patch
-ext4-osd-iop-common.patch
-ext4-osd-iam-exports.patch
-ext4-dynlocks-common.patch
-ext4-dynlocks-2.6-rhel5.patch
-ext4-hash-indexed-dir-dotdot-update-rhel5.patch
-ext4-ext_generation-sles11.patch
-ext4-kill-dx_root.patch
-ext4-fiemap-2.6-rhel5.patch
-ext4-mballoc-pa_free-mismatch.patch
-ext4_data_in_dirent.patch
-ext4-large-eas.patch
-ext4-disable-mb-cache-rhel5.patch
-ext4-disable-delalloc-rhel5.patch
-ext4-back-dquot-to-rhel54.patch
-ext4-nocmtime-2.6-rhel5.patch
-ext4-failed-mount-b23368.patch
-ext4-export-64bit-name-hash.patch
-ext4-vmalloc-rhel5.patch
-ext4-mballoc-group_check-rhel5.patch
-ext4-journal-callback-rhel5.patch
-ext4-store-tree-generation-at-find.patch
-ext4-quota-minimal-rhel5.patch
+++ /dev/null
-Index: linux-2.6.18-164.11.1/include/linux/blkdev.h
-===================================================================
---- linux-2.6.18-164.11.1.orig/include/linux/blkdev.h
-+++ linux-2.6.18-164.11.1/include/linux/blkdev.h
-@@ -788,10 +788,10 @@ extern void blk_free_tags(struct blk_que
- extern void blk_rq_bio_prep(request_queue_t *, struct request *, struct bio *);
- extern int blkdev_issue_flush(struct block_device *, sector_t *);
-
--#define MAX_PHYS_SEGMENTS 128
--#define MAX_HW_SEGMENTS 128
-+#define MAX_PHYS_SEGMENTS 256
-+#define MAX_HW_SEGMENTS 256
- #define SAFE_MAX_SECTORS 255
--#define BLK_DEF_MAX_SECTORS 1024
-+#define BLK_DEF_MAX_SECTORS 2048
-
- #define MAX_SEGMENT_SIZE 65536
-
-Index: linux-2.6.18-164.11.1/include/scsi/scsi_host.h
-===================================================================
---- linux-2.6.18-164.11.1.orig/include/scsi/scsi_host.h
-+++ linux-2.6.18-164.11.1/include/scsi/scsi_host.h
-@@ -30,7 +30,7 @@ struct blk_queue_tags;
- * used in one scatter-gather request.
- */
- #define SG_NONE 0
--#define SG_ALL 0xff
-+#define SG_ALL 256
-
-
- #define DISABLE_CLUSTERING 0
-Index: linux-2.6.18-164.11.1/drivers/scsi/lpfc/lpfc.h
-===================================================================
---- linux-2.6.18-164.11.1.orig/drivers/scsi/lpfc/lpfc.h
-+++ linux-2.6.18-164.11.1/drivers/scsi/lpfc/lpfc.h
-@@ -38,7 +38,7 @@
- #define LPFC_MAX_NS_RETRY 3 /* Number of retry attempts to contact
- the NameServer before giving up. */
- #define LPFC_CMD_PER_LUN 3 /* max outstanding cmds per lun */
--#define LPFC_DEFAULT_SG_SEG_CNT 64 /* sg element count per scsi cmnd */
-+#define LPFC_DEFAULT_SG_SEG_CNT 256 /* sg element count per scsi cmnd */
- #define LPFC_MAX_SG_SEG_CNT 256 /* sg element count per scsi cmnd */
- #define LPFC_IOCB_LIST_CNT 2250 /* list of IOCBs for fast-path usage. */
- #define LPFC_Q_RAMP_UP_INTERVAL 120 /* lun q_depth ramp up interval */
+++ /dev/null
-This patch is no longer needed for Lustre. It is only included
-for testing and ease of using the same kernel with older Lustre
-versions. This testing functionality was replaced in Linux 3.0
-by the dm-flakey driver.
-
-This functionality is mainly used during testing, in order to
-simulate a server crash for ldiskfs by discarding all of the
-writes to the filesystem. For recovery testing we could simulate
-this by using a special loopback or DM device that also discards
-writes to the device.
-
-This functionality is also used by target "failback" in order
-to speed up service shutdown and takeover by the other node
-during controlled operation. However, it would also be possible
-to do this by simply allowing all of the in-flight requests to
-complete and then waiting for the service to stop. This will
-also be needed by the DMU-OSD, because discarding of writes on
-a DMU-based target is not safe as it could trigger a storage
-failure if the data is ever read from disk again and the
-checksum does not match that expected by the block pointer.
-
-Index: linux-2.6.18.1/block/ll_rw_blk.c
-===================================================================
---- linux-2.6.18.1.orig/block/ll_rw_blk.c
-+++ linux-2.6.18.1/block/ll_rw_blk.c
-@@ -3067,6 +3067,8 @@ static void handle_bad_sector(struct bio
- set_bit(BIO_EOF, &bio->bi_flags);
- }
-
-+int dev_check_rdonly(struct block_device *bdev);
-+
- /**
- * generic_make_request: hand a buffer to its device driver for I/O
- * @bio: The bio describing the location in memory and on the device.
-@@ -3151,6 +3153,12 @@ end_io:
-
- if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
- goto end_io;
-
-+ /* this is cfs's dev_rdonly check */
-+ if (bio_rw(bio) == WRITE && dev_check_rdonly(bio->bi_bdev)) {
-+ bio_endio(bio, bio->bi_size, 0);
-+ break;
-+ }
-+
- /*
- * If this device has partitions, remap block n
-@@ -3765,6 +3773,91 @@ void swap_io_context(struct io_context *
- *ioc2 = temp;
- }
- EXPORT_SYMBOL(swap_io_context);
-+ /*
-+ * Debug code for turning block devices "read-only" (will discard writes
-+ * silently). This is for filesystem crash/recovery testing.
-+ */
-+struct deventry {
-+ dev_t dev;
-+ struct deventry *next;
-+};
-+
-+static struct deventry *devlist = NULL;
-+static spinlock_t devlock = SPIN_LOCK_UNLOCKED;
-+
-+int dev_check_rdonly(struct block_device *bdev)
-+{
-+ struct deventry *cur;
-+ if (!bdev) return 0;
-+ spin_lock(&devlock);
-+ cur = devlist;
-+ while(cur) {
-+ if (bdev->bd_dev == cur->dev) {
-+ spin_unlock(&devlock);
-+ return 1;
-+ }
-+ cur = cur->next;
-+ }
-+ spin_unlock(&devlock);
-+ return 0;
-+}
-+
-+void dev_set_rdonly(struct block_device *bdev)
-+{
-+ struct deventry *newdev, *cur;
-+
-+ if (!bdev)
-+ return;
-+ newdev = kmalloc(sizeof(struct deventry), GFP_KERNEL);
-+ if (!newdev)
-+ return;
-+
-+ spin_lock(&devlock);
-+ cur = devlist;
-+ while(cur) {
-+ if (bdev->bd_dev == cur->dev) {
-+ spin_unlock(&devlock);
-+ kfree(newdev);
-+ return;
-+ }
-+ cur = cur->next;
-+ }
-+ newdev->dev = bdev->bd_dev;
-+ newdev->next = devlist;
-+ devlist = newdev;
-+ spin_unlock(&devlock);
-+ printk(KERN_WARNING "Turning device %s (%#x) read-only\n",
-+ bdev->bd_disk ? bdev->bd_disk->disk_name : "", bdev->bd_dev);
-+}
-+
-+void dev_clear_rdonly(struct block_device *bdev)
-+{
-+ struct deventry *cur, *last = NULL;
-+ if (!bdev) return;
-+ spin_lock(&devlock);
-+ cur = devlist;
-+ while(cur) {
-+ if (bdev->bd_dev == cur->dev) {
-+ if (last)
-+ last->next = cur->next;
-+ else
-+ devlist = cur->next;
-+ spin_unlock(&devlock);
-+ kfree(cur);
-+ printk(KERN_WARNING "Removing read-only on %s (%#x)\n",
-+ bdev->bd_disk ? bdev->bd_disk->disk_name :
-+ "unknown block", bdev->bd_dev);
-+ return;
-+ }
-+ last = cur;
-+ cur = cur->next;
-+ }
-+ spin_unlock(&devlock);
-+}
-+
-+EXPORT_SYMBOL(dev_set_rdonly);
-+EXPORT_SYMBOL(dev_clear_rdonly);
-+EXPORT_SYMBOL(dev_check_rdonly);
-
- /*
- * sysfs parts below
-Index: linux-2.6.18.1/fs/block_dev.c
-===================================================================
---- linux-2.6.18.1.orig/fs/block_dev.c
-+++ linux-2.6.18.1/fs/block_dev.c
-@@ -1059,6 +1059,7 @@ static int __blkdev_put(struct block_dev
- if (bdev != bdev->bd_contains)
- victim = bdev->bd_contains;
- bdev->bd_contains = NULL;
-+ dev_clear_rdonly(bdev);
- }
- unlock_kernel();
- mutex_unlock(&bdev->bd_mutex);
-Index: linux-2.6.18.1/include/linux/fs.h
-===================================================================
---- linux-2.6.18.1.orig/include/linux/fs.h
-+++ linux-2.6.18.1/include/linux/fs.h
-@@ -1685,6 +1685,10 @@ extern void file_kill(struct file *f);
- struct bio;
- extern void submit_bio(int, struct bio *);
- extern int bdev_read_only(struct block_device *);
-+#define HAVE_CLEAR_RDONLY_ON_PUT
-+void dev_set_rdonly(struct block_device *bdev);
-+int dev_check_rdonly(struct block_device *bdev);
-+void dev_clear_rdonly(struct block_device *bdev);
- extern int set_blocksize(struct block_device *, int);
- extern int sb_set_blocksize(struct super_block *, int);
- extern int sb_min_blocksize(struct super_block *, int);
+++ /dev/null
-Allow starting the commit of a journal transaction, without waiting for
-it to complete. This is a performance enhancement for OST IO so that
-the journal commit can run concurrently with the file IO. It isn't
-necessary if the client can handle bulk IO recovery (bug 16919).
-
-Index: linux-2.6/fs/jbd/journal.c
-===================================================================
---- linux-2.6.orig/fs/jbd/journal.c 2006-07-15 16:13:50.000000000 +0800
-+++ linux-2.6/fs/jbd/journal.c 2006-07-15 16:22:04.000000000 +0800
-@@ -74,6 +74,7 @@ EXPORT_SYMBOL(journal_abort);
- EXPORT_SYMBOL(journal_errno);
- EXPORT_SYMBOL(journal_ack_err);
- EXPORT_SYMBOL(journal_clear_err);
-+EXPORT_SYMBOL(log_start_commit);
- EXPORT_SYMBOL(log_wait_commit);
- EXPORT_SYMBOL(journal_start_commit);
- EXPORT_SYMBOL(journal_force_commit_nested);
+++ /dev/null
-Index: linux-2.6.16.i686/arch/i386/kernel/smpboot.c
-===================================================================
---- linux-2.6.16.i686.orig/arch/i386/kernel/smpboot.c 2006-05-30 15:47:03.000000000 +0800
-+++ linux-2.6.16.i686/arch/i386/kernel/smpboot.c 2006-05-30 21:22:02.000000000 +0800
-@@ -579,6 +579,7 @@
- /* which logical CPUs are on which nodes */
- cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly =
- { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
-+EXPORT_SYMBOL(node_2_cpu_mask);
- /* which node each logical CPU is on */
- int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
- EXPORT_SYMBOL(cpu_2_node);
+++ /dev/null
-This patch is not needed for 2.x, but is kept to allow the same kernel
-to be used between 1.8.x and 2.0.x for ease of upgrade.
-
-Index: linux-2.6.12-rc6/fs/dcache.c
-===================================================================
---- linux-2.6.12-rc6.orig/fs/dcache.c 2005-06-14 15:53:19.812195198 +0200
-+++ linux-2.6.12-rc6/fs/dcache.c 2005-06-14 15:53:58.385436913 +0200
-@@ -1581,6 +1581,7 @@
-
- return result;
- }
-+EXPORT_SYMBOL(is_subdir);
-
- void d_genocide(struct dentry *root)
- {
+++ /dev/null
-commit 229309caebe4508d650bb6d8f7d51f2b116f5bbd
-Author: Jan Kara <jack@suse.cz>
-Date: Sun May 8 19:09:53 2011 -0400
-
-jbd2: Fix forever sleeping process in do_get_write_access()
-
-In do_get_write_access() we wait on BH_Unshadow bit for buffer to get
-from shadow state. The waking code in journal_commit_transaction() has
-a bug because it does not issue a memory barrier after the buffer is
-moved from the shadow state and before wake_up_bit() is called. Thus a
-waitqueue check can happen before the buffer is actually moved from
-the shadow state and waiting process may never be woken. Fix the
-problem by issuing proper barrier.
-
-Reported-by: Tao Ma <boyu.mt@taobao.com>
-Signed-off-by: Jan Kara <jack@suse.cz>
-Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
----
- fs/jbd2/commit.c | 9 +++++++--
- 1 files changed, 7 insertions(+), 2 deletions(-)
-
-Index: linux-2.6.18.4/fs/jbd2/commit.c
-===================================================================
---- linux-2.6.18.4.orig/fs/jbd2/commit.c
-+++ linux-2.6.18.4/fs/jbd2/commit.c
-@@ -788,8 +788,13 @@ wait_for_iobuf:
- required. */
- JBUFFER_TRACE(jh, "file as BJ_Forget");
- jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
-- /* Wake up any transactions which were waiting for this
-- IO to complete */
-+ /*
-+ * Wake up any transactions which were waiting for this IO to
-+ * complete. The barrier must be here so that changes by
-+ * jbd2_journal_file_buffer() take effect before wake_up_bit()
-+ * does the waitqueue check.
-+ */
-+ smp_mb();
- wake_up_bit(&bh->b_state, BH_Unshadow);
- JBUFFER_TRACE(jh, "brelse shadowed buffer");
- __brelse(bh);
+++ /dev/null
-Implement a JBD per-transaction commit callback. Users can attach arbitrary
-callbacks to a journal handle, which are propagated to the transaction at
-journal handle stop time. The commit callbacks are run when the transaction
-is finished commit, and will be passed a non-zero error code if there was
-a commit error.
-
-Signed-off-by: Andreas Dilger <adilger@sun.com>
-
-
-Index: linux-2.6/include/linux/jbd.h
-===================================================================
---- linux-2.6.orig/include/linux/jbd.h 2006-07-15 16:08:35.000000000 +0800
-+++ linux-2.6/include/linux/jbd.h 2006-07-15 16:13:01.000000000 +0800
-@@ -356,6 +356,27 @@ static inline void jbd_unlock_bh_journal
- bit_spin_unlock(BH_JournalHead, &bh->b_state);
- }
-
-+#define HAVE_JOURNAL_CALLBACK_STATUS
-+/**
-+ * struct journal_callback - Base structure for callback information
-+ * @jcb_list: list information for other callbacks attached to the same handle
-+ * @jcb_func: Function to call with this callback structure
-+ *
-+ * This struct is a 'seed' structure for a using with your own callback
-+ * structs. If you are using callbacks you must allocate one of these
-+ * or another struct of your own definition which has this struct
-+ * as it's first element and pass it to journal_callback_set().
-+ *
-+ * This is used internally by jbd to maintain callback information.
-+ *
-+ * See journal_callback_set for more information.
-+ **/
-+struct journal_callback {
-+ struct list_head jcb_list; /* t_jcb_lock */
-+ void (*jcb_func)(struct journal_callback *jcb, int error);
-+ /* caller data goes here */
-+};
-+
- struct jbd_revoke_table_s;
-
- /**
-@@ -364,6 +385,7 @@ struct jbd_revoke_table_s;
- * @h_transaction: Which compound transaction is this update a part of?
- * @h_buffer_credits: Number of remaining buffers we are allowed to dirty.
- * @h_ref: Reference count on this handle
-+ * @h_jcb: List of application registered callbacks for this handle.
- * @h_err: Field for caller's use to track errors through large fs operations
- * @h_sync: flag for sync-on-close
- * @h_jdata: flag to force data journaling
-@@ -389,6 +411,13 @@ struct handle_s
- /* operations */
- int h_err;
-
-+ /*
-+ * List of application registered callbacks for this handle. The
-+ * function(s) will be called after the transaction that this handle is
-+ * part of has been committed to disk. [t_jcb_lock]
-+ */
-+ struct list_head h_jcb;
-+
- /* Flags [no locking] */
- unsigned int h_sync: 1; /* sync-on-close */
- unsigned int h_jdata: 1; /* force data journaling */
-@@ -430,6 +459,8 @@ struct handle_s
- * j_state_lock
- * ->j_list_lock (journal_unmap_buffer)
- *
-+ * t_handle_lock
-+ * ->t_jcb_lock
- */
-
- struct transaction_s
-@@ -559,6 +590,15 @@ struct transaction_s
- */
- int t_handle_count;
-
-+ /*
-+ * Protects the callback list
-+ */
-+ spinlock_t t_jcb_lock;
-+ /*
-+ * List of registered callback functions for this transaction.
-+ * Called when the transaction is committed. [t_jcb_lock]
-+ */
-+ struct list_head t_jcb;
- };
-
- /**
-@@ -906,6 +946,10 @@ extern void journal_invalidatepage(jour
- extern int journal_try_to_free_buffers(journal_t *, struct page *, gfp_t);
- extern int journal_stop(handle_t *);
- extern int journal_flush (journal_t *);
-+extern void journal_callback_set(handle_t *handle,
-+ void (*fn)(struct journal_callback *,int),
-+ struct journal_callback *jcb);
-+
- extern void journal_lock_updates (journal_t *);
- extern void journal_unlock_updates (journal_t *);
-
-Index: linux-2.6/fs/jbd/checkpoint.c
-===================================================================
---- linux-2.6.orig/fs/jbd/checkpoint.c 2006-07-15 16:08:36.000000000 +0800
-+++ linux-2.6/fs/jbd/checkpoint.c 2006-07-15 16:13:01.000000000 +0800
-@@ -688,6 +688,7 @@ void __journal_drop_transaction(journal_
- J_ASSERT(transaction->t_checkpoint_list == NULL);
- J_ASSERT(transaction->t_checkpoint_io_list == NULL);
- J_ASSERT(transaction->t_updates == 0);
-+ J_ASSERT(list_empty(&transaction->t_jcb));
- J_ASSERT(journal->j_committing_transaction != transaction);
- J_ASSERT(journal->j_running_transaction != transaction);
-
-Index: linux-2.6/fs/jbd/commit.c
-===================================================================
---- linux-2.6.orig/fs/jbd/commit.c 2006-07-15 16:08:36.000000000 +0800
-+++ linux-2.6/fs/jbd/commit.c 2006-07-15 16:13:01.000000000 +0800
-@@ -708,6 +708,32 @@ wait_for_iobuf:
- transaction can be removed from any checkpoint list it was on
- before. */
-
-+ /*
-+ * Call any callbacks that had been registered for handles in this
-+ * transaction. It is up to the callback to free any allocated
-+ * memory.
-+ *
-+ * Locking not strictly required, since this is the only process
-+ * touching this transaction anymore, but is done to keep code
-+ * checkers happy and has no contention in any case.
-+ */
-+ spin_lock(&commit_transaction->t_jcb_lock);
-+ if (!list_empty(&commit_transaction->t_jcb)) {
-+ struct list_head *p, *n;
-+ int error = is_journal_aborted(journal);
-+
-+ list_for_each_safe(p, n, &commit_transaction->t_jcb) {
-+ struct journal_callback *jcb;
-+
-+ jcb = list_entry(p, struct journal_callback, jcb_list);
-+ list_del_init(p);
-+ spin_unlock(&commit_transaction->t_jcb_lock);
-+ jcb->jcb_func(jcb, error);
-+ spin_lock(&commit_transaction->t_jcb_lock);
-+ }
-+ }
-+ spin_unlock(&commit_transaction->t_jcb_lock);
-+
- jbd_debug(3, "JBD: commit phase 7\n");
-
- J_ASSERT(commit_transaction->t_sync_datalist == NULL);
-Index: linux-2.6/fs/jbd/journal.c
-===================================================================
---- linux-2.6.orig/fs/jbd/journal.c 2006-07-15 16:08:36.000000000 +0800
-+++ linux-2.6/fs/jbd/journal.c 2006-07-15 16:13:01.000000000 +0800
-@@ -58,6 +58,7 @@ EXPORT_SYMBOL(journal_sync_buffer);
- #endif
- EXPORT_SYMBOL(journal_flush);
- EXPORT_SYMBOL(journal_revoke);
-+EXPORT_SYMBOL(journal_callback_set);
-
- EXPORT_SYMBOL(journal_init_dev);
- EXPORT_SYMBOL(journal_init_inode);
-@@ -80,6 +81,7 @@ EXPORT_SYMBOL(journal_wipe);
- EXPORT_SYMBOL(journal_blocks_per_page);
- EXPORT_SYMBOL(journal_invalidatepage);
- EXPORT_SYMBOL(journal_try_to_free_buffers);
-+EXPORT_SYMBOL(journal_bmap);
- EXPORT_SYMBOL(journal_force_commit);
-
- static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
-Index: linux-2.6/fs/jbd/transaction.c
-===================================================================
---- linux-2.6.orig/fs/jbd/transaction.c 2006-07-15 16:08:35.000000000 +0800
-+++ linux-2.6/fs/jbd/transaction.c 2006-07-15 16:13:01.000000000 +0800
-@@ -50,7 +50,9 @@ get_transaction(journal_t *journal, tran
- transaction->t_state = T_RUNNING;
- transaction->t_tid = journal->j_transaction_sequence++;
- transaction->t_expires = jiffies + journal->j_commit_interval;
-+ INIT_LIST_HEAD(&transaction->t_jcb);
- spin_lock_init(&transaction->t_handle_lock);
-+ spin_lock_init(&transaction->t_jcb_lock);
-
- /* Set up the commit timer for the new transaction. */
- journal->j_commit_timer.expires = transaction->t_expires;
-@@ -241,6 +243,7 @@ static handle_t *new_handle(int nblocks)
- memset(handle, 0, sizeof(*handle));
- handle->h_buffer_credits = nblocks;
- handle->h_ref = 1;
-+ INIT_LIST_HEAD(&handle->h_jcb);
-
- return handle;
- }
-@@ -1291,6 +1294,35 @@ drop:
- }
-
- /**
-+ * void journal_callback_set() - Register a callback function for this handle.
-+ * @handle: handle to attach the callback to.
-+ * @func: function to callback.
-+ * @jcb: structure with additional information required by func() , and
-+ * some space for jbd internal information.
-+ *
-+ * The function will be called when the transaction that this handle is
-+ * part of has been committed to disk with the original callback data
-+ * struct and the error status of the journal as parameters. There is no
-+ * guarantee of ordering between handles within a single transaction, nor
-+ * between callbacks registered on the same handle.
-+ *
-+ * The caller is responsible for allocating the journal_callback struct.
-+ * This is to allow the caller to add as much extra data to the callback
-+ * as needed, but reduce the overhead of multiple allocations. The caller
-+ * allocated struct must start with a struct journal_callback at offset 0,
-+ * and has the caller-specific data afterwards.
-+ */
-+void journal_callback_set(handle_t *handle,
-+ void (*func)(struct journal_callback *jcb, int error),
-+ struct journal_callback *jcb)
-+{
-+ jcb->jcb_func = func;
-+ spin_lock(&handle->h_transaction->t_jcb_lock);
-+ list_add_tail(&jcb->jcb_list, &handle->h_jcb);
-+ spin_unlock(&handle->h_transaction->t_jcb_lock);
-+}
-+
-+/**
- * int journal_stop() - complete a transaction
- * @handle: tranaction to complete.
- *
-@@ -1363,6 +1396,11 @@ int journal_stop(handle_t *handle)
- wake_up(&journal->j_wait_transaction_locked);
- }
-
-+ /* Move callbacks from the handle to the transaction. */
-+ spin_lock(&transaction->t_jcb_lock);
-+ list_splice(&handle->h_jcb, &transaction->t_jcb);
-+ spin_unlock(&transaction->t_jcb_lock);
-+
- /*
- * If the handle is marked SYNC, we need to set another commit
- * going! We also want to force a commit if the current
+++ /dev/null
-Index: linux-2.6.18-128.1.6/fs/jbd/commit.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/jbd/commit.c 2009-06-02 23:24:00.000000000 -0600
-+++ linux-2.6.18-128.1.6/fs/jbd/commit.c 2009-06-02 23:26:07.000000000 -0600
-@@ -22,6 +22,7 @@
- #include <linux/mm.h>
- #include <linux/pagemap.h>
- #include <linux/smp_lock.h>
-+#include <linux/crc32.h>
-
-
- /*
-@@ -95,19 +96,23 @@
- return 1;
- }
-
--/* Done it all: now write the commit record. We should have
-+/*
-+ * Done it all: now submit the commit record. We should have
- * cleaned up our previous buffers by now, so if we are in abort
- * mode we can now just skip the rest of the journal write
- * entirely.
- *
- * Returns 1 if the journal needs to be aborted or 0 on success
- */
--static int journal_write_commit_record(journal_t *journal,
-- transaction_t *commit_transaction)
-+static int journal_submit_commit_record(journal_t *journal,
-+ transaction_t *commit_transaction,
-+ struct buffer_head **cbh,
-+ __u32 crc32_sum)
- {
- struct journal_head *descriptor;
-+ struct commit_header *tmp;
- struct buffer_head *bh;
-- int i, ret;
-+ int ret;
- int barrier_done = 0;
-
- if (is_journal_aborted(journal))
-@@ -119,21 +124,34 @@
-
- bh = jh2bh(descriptor);
-
-- /* AKPM: buglet - add `i' to tmp! */
-- for (i = 0; i < bh->b_size; i += 512) {
-- journal_header_t *tmp = (journal_header_t*)bh->b_data;
-- tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
-- tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
-- tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
-+ tmp = (struct commit_header *)bh->b_data;
-+ tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
-+ tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
-+ tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
-+
-+ if (JFS_HAS_COMPAT_FEATURE(journal,
-+ JFS_FEATURE_COMPAT_CHECKSUM)) {
-+ tmp->h_chksum_type = JFS_CRC32_CHKSUM;
-+ tmp->h_chksum_size = JFS_CRC32_CHKSUM_SIZE;
-+ tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
- }
-
-- JBUFFER_TRACE(descriptor, "write commit block");
-+ JBUFFER_TRACE(descriptor, "submit commit block");
-+ lock_buffer(bh);
-+
- set_buffer_dirty(bh);
-- if (journal->j_flags & JFS_BARRIER) {
-+ set_buffer_uptodate(bh);
-+ bh->b_end_io = journal_end_buffer_io_sync;
-+
-+ if (journal->j_flags & JFS_BARRIER &&
-+ !JFS_HAS_INCOMPAT_FEATURE(journal,
-+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
-+
- set_buffer_ordered(bh);
- barrier_done = 1;
- }
-- ret = sync_dirty_buffer(bh);
-+ ret = submit_bh(WRITE, bh);
-+
- /* is it possible for another commit to fail at roughly
- * the same time as this one? If so, we don't want to
- * trust the barrier flag in the super, but instead want
-@@ -154,12 +172,70 @@
- clear_buffer_ordered(bh);
- set_buffer_uptodate(bh);
- set_buffer_dirty(bh);
-- ret = sync_dirty_buffer(bh);
-+ ret = submit_bh(WRITE, bh);
- }
-- put_bh(bh); /* One for getblk() */
-- journal_put_journal_head(descriptor);
-+ *cbh = bh;
-+ return ret;
-+}
-+
-+/*
-+ * This function along with journal_submit_commit_record
-+ * allows to write the commit record asynchronously.
-+ */
-+static int journal_wait_on_commit_record(struct buffer_head *bh)
-+{
-+ int ret = 0;
-+
-+ clear_buffer_dirty(bh);
-+ wait_on_buffer(bh);
-+
-+ if (unlikely(!buffer_uptodate(bh)))
-+ ret = -EIO;
-+ put_bh(bh); /* One for getblk() */
-+ journal_put_journal_head(bh2jh(bh));
-+
-+ return ret;
-+}
-+
-+/*
-+ * Wait for all submitted IO to complete.
-+ */
-+static int journal_wait_on_locked_list(journal_t *journal,
-+ transaction_t *commit_transaction)
-+{
-+ int ret = 0;
-+ struct journal_head *jh;
-
-- return (ret == -EIO);
-+ while (commit_transaction->t_locked_list) {
-+ struct buffer_head *bh;
-+
-+ jh = commit_transaction->t_locked_list->b_tprev;
-+ bh = jh2bh(jh);
-+ get_bh(bh);
-+ if (buffer_locked(bh)) {
-+ spin_unlock(&journal->j_list_lock);
-+ wait_on_buffer(bh);
-+ if (unlikely(!buffer_uptodate(bh)))
-+ ret = -EIO;
-+ spin_lock(&journal->j_list_lock);
-+ }
-+ if (!inverted_lock(journal, bh)) {
-+ put_bh(bh);
-+ spin_lock(&journal->j_list_lock);
-+ continue;
-+ }
-+ if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
-+ __journal_unfile_buffer(jh);
-+ jbd_unlock_bh_state(bh);
-+ journal_remove_journal_head(bh);
-+ put_bh(bh);
-+ } else {
-+ jbd_unlock_bh_state(bh);
-+ }
-+ put_bh(bh);
-+ cond_resched_lock(&journal->j_list_lock);
-+ }
-+ return ret;
- }
-
- void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
-@@ -282,6 +358,20 @@
- return err;
- }
-
-+static inline __u32 jbd_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
-+{
-+ struct page *page = bh->b_page;
-+ char *addr;
-+ __u32 checksum;
-+
-+ addr = kmap_atomic(page, KM_USER0);
-+ checksum = crc32_be(crc32_sum,
-+ (void *)(addr + offset_in_page(bh->b_data)),
-+ bh->b_size);
-+ kunmap_atomic(addr, KM_USER0);
-+ return checksum;
-+}
-+
- /*
- * journal_commit_transaction
- *
-@@ -305,6 +395,8 @@
- int first_tag = 0;
- int tag_flag;
- int i;
-+ struct buffer_head *cbh = NULL; /* For transactional checksums */
-+ __u32 crc32_sum = ~0;
-
- /*
- * First job: lock down the current transaction and wait for
-@@ -431,39 +523,14 @@
- err = journal_submit_data_buffers(journal, commit_transaction);
-
- /*
-- * Wait for all previously submitted IO to complete.
-+ * Wait for all previously submitted IO to complete if commit
-+ * record is to be written synchronously.
- */
- spin_lock(&journal->j_list_lock);
-- while (commit_transaction->t_locked_list) {
-- struct buffer_head *bh;
--
-- jh = commit_transaction->t_locked_list->b_tprev;
-- bh = jh2bh(jh);
-- get_bh(bh);
-- if (buffer_locked(bh)) {
-- spin_unlock(&journal->j_list_lock);
-- wait_on_buffer(bh);
-- spin_lock(&journal->j_list_lock);
-- }
-- if (unlikely(!buffer_uptodate(bh)))
-- err = -EIO;
--
-- if (!inverted_lock(journal, bh)) {
-- put_bh(bh);
-- spin_lock(&journal->j_list_lock);
-- continue;
-- }
-- if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
-- __journal_unfile_buffer(jh);
-- jbd_unlock_bh_state(bh);
-- journal_remove_journal_head(bh);
-- put_bh(bh);
-- } else {
-- jbd_unlock_bh_state(bh);
-- }
-- release_data_buffer(bh);
-- cond_resched_lock(&journal->j_list_lock);
-- }
-+ if (!JFS_HAS_INCOMPAT_FEATURE(journal,
-+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT))
-+ err = journal_wait_on_locked_list(journal,
-+ commit_transaction);
- spin_unlock(&journal->j_list_lock);
-
- if (err)
-@@ -642,6 +709,16 @@
- start_journal_io:
- for (i = 0; i < bufs; i++) {
- struct buffer_head *bh = wbuf[i];
-+ /*
-+ * Compute checksum.
-+ */
-+ if (JFS_HAS_COMPAT_FEATURE(journal,
-+ JFS_FEATURE_COMPAT_CHECKSUM)) {
-+ crc32_sum =
-+ jbd_checksum_data(crc32_sum,
-+ bh);
-+ }
-+
- lock_buffer(bh);
- clear_buffer_dirty(bh);
- set_buffer_uptodate(bh);
-@@ -658,6 +735,23 @@
- }
- }
-
-+ /* Done it all: now write the commit record asynchronously. */
-+
-+ if (JFS_HAS_INCOMPAT_FEATURE(journal,
-+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
-+ err = journal_submit_commit_record(journal, commit_transaction,
-+ &cbh, crc32_sum);
-+ if (err)
-+ __journal_abort_hard(journal);
-+
-+ spin_lock(&journal->j_list_lock);
-+ err = journal_wait_on_locked_list(journal,
-+ commit_transaction);
-+ spin_unlock(&journal->j_list_lock);
-+ if (err)
-+ __journal_abort_hard(journal);
-+ }
-+
- /* Lo and behold: we have just managed to send a transaction to
- the log. Before we can commit it, wait for the IO so far to
- complete. Control buffers being written are on the
-@@ -759,9 +853,15 @@
- journal_abort(journal, err);
-
- jbd_debug(3, "JBD: commit phase 6\n");
--
-- if (journal_write_commit_record(journal, commit_transaction))
-- err = -EIO;
-+
-+ if (!JFS_HAS_INCOMPAT_FEATURE(journal,
-+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
-+ err = journal_submit_commit_record(journal, commit_transaction,
-+ &cbh, crc32_sum);
-+ if (err)
-+ __journal_abort_hard(journal);
-+ }
-+ err = journal_wait_on_commit_record(cbh);
-
- if (err)
- journal_abort(journal, err);
-Index: linux-2.6.18-128.1.6/fs/jbd/recovery.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/jbd/recovery.c 2009-04-14 21:05:39.000000000 -0600
-+++ linux-2.6.18-128.1.6/fs/jbd/recovery.c 2009-06-02 23:26:07.000000000 -0600
-@@ -21,6 +21,7 @@
- #include <linux/jbd.h>
- #include <linux/errno.h>
- #include <linux/slab.h>
-+#include <linux/crc32.h>
- #endif
-
- /*
-@@ -310,6 +311,38 @@
- return err;
- }
-
-+/*
-+ * calc_chksums calculates the checksums for the blocks described in the
-+ * descriptor block.
-+ */
-+static int calc_chksums(journal_t *journal, struct buffer_head *bh,
-+ unsigned long *next_log_block, __u32 *crc32_sum)
-+{
-+ int i, num_blks, err;
-+ unsigned long io_block;
-+ struct buffer_head *obh;
-+
-+ num_blks = count_tags(bh, journal->j_blocksize);
-+ /* Calculate checksum of the descriptor block. */
-+ *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size);
-+
-+ for (i = 0; i < num_blks; i++) {
-+ io_block = (*next_log_block)++;
-+ wrap(journal, *next_log_block);
-+ err = jread(&obh, journal, io_block);
-+ if (err) {
-+ printk(KERN_ERR "JBD: IO error %d recovering block "
-+ "%lu in log\n", err, io_block);
-+ return 1;
-+ } else {
-+ *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
-+ obh->b_size);
-+ }
-+ put_bh(obh);
-+ }
-+ return 0;
-+}
-+
- static int do_one_pass(journal_t *journal,
- struct recovery_info *info, enum passtype pass)
- {
-@@ -321,6 +354,7 @@
- struct buffer_head * bh;
- unsigned int sequence;
- int blocktype;
-+ __u32 crc32_sum = ~0; /* Transactional Checksums */
-
- /* Precompute the maximum metadata descriptors in a descriptor block */
- int MAX_BLOCKS_PER_DESC;
-@@ -412,9 +446,24 @@
- switch(blocktype) {
- case JFS_DESCRIPTOR_BLOCK:
- /* If it is a valid descriptor block, replay it
-- * in pass REPLAY; otherwise, just skip over the
-- * blocks it describes. */
-+ * in pass REPLAY; if journal_checksums enabled, then
-+ * calculate checksums in PASS_SCAN, otherwise,
-+ * just skip over the blocks it describes. */
- if (pass != PASS_REPLAY) {
-+ if (pass == PASS_SCAN &&
-+ JFS_HAS_COMPAT_FEATURE(journal,
-+ JFS_FEATURE_COMPAT_CHECKSUM) &&
-+ !info->end_transaction) {
-+ if (calc_chksums(journal, bh,
-+ &next_log_block,
-+ &crc32_sum)) {
-+ put_bh(bh);
-+ break;
-+ }
-+ put_bh(bh);
-+ continue;
-+ }
-+
- next_log_block +=
- count_tags(bh, journal->j_blocksize);
- wrap(journal, next_log_block);
-@@ -509,9 +558,97 @@
- continue;
-
- case JFS_COMMIT_BLOCK:
-- /* Found an expected commit block: not much to
-- * do other than move on to the next sequence
-+ /* How to differentiate between interrupted commit
-+ * and journal corruption ?
-+ *
-+ * {nth transaction}
-+ * Checksum Verification Failed
-+ * |
-+ * ____________________
-+ * | |
-+ * async_commit sync_commit
-+ * | |
-+ * | GO TO NEXT "Journal Corruption"
-+ * | TRANSACTION
-+ * |
-+ * {(n+1)th transanction}
-+ * |
-+ * _______|______________
-+ * | |
-+ * Commit block found Commit block not found
-+ * | |
-+ * "Journal Corruption" |
-+ * _____________|__________
-+ * | |
-+ * nth trans corrupt OR nth trans
-+ * and (n+1)th interrupted interrupted
-+ * before commit block
-+ * could reach the disk.
-+ * (Cannot find the difference in above
-+ * mentioned conditions. Hence assume
-+ * "Interrupted Commit".)
-+ */
-+
-+ /* Found an expected commit block: if checksums
-+ * are present verify them in PASS_SCAN; else not
-+ * much to do other than move on to the next sequence
- * number. */
-+ if (pass == PASS_SCAN &&
-+ JFS_HAS_COMPAT_FEATURE(journal,
-+ JFS_FEATURE_COMPAT_CHECKSUM)) {
-+ int chksum_err, chksum_seen;
-+ struct commit_header *cbh =
-+ (struct commit_header *)bh->b_data;
-+ unsigned found_chksum =
-+ be32_to_cpu(cbh->h_chksum[0]);
-+
-+ chksum_err = chksum_seen = 0;
-+
-+ if (info->end_transaction) {
-+ printk(KERN_ERR "JBD: Transaction %u "
-+ "found to be corrupt.\n",
-+ next_commit_ID - 1);
-+ brelse(bh);
-+ break;
-+ }
-+
-+ if (crc32_sum == found_chksum &&
-+ cbh->h_chksum_type == JFS_CRC32_CHKSUM &&
-+ cbh->h_chksum_size ==
-+ JFS_CRC32_CHKSUM_SIZE) {
-+ chksum_seen = 1;
-+ } else if (!(cbh->h_chksum_type == 0 &&
-+ cbh->h_chksum_size == 0 &&
-+ found_chksum == 0 &&
-+ !chksum_seen)) {
-+ /*
-+ * If fs is mounted using an old kernel and then
-+ * kernel with journal_chksum is used then we
-+ * get a situation where the journal flag has
-+ * checksum flag set but checksums are not
-+ * present i.e chksum = 0, in the individual
-+ * commit blocks.
-+ * Hence to avoid checksum failures, in this
-+ * situation, this extra check is added.
-+ */
-+ chksum_err = 1;
-+ }
-+
-+ if (chksum_err) {
-+ info->end_transaction = next_commit_ID;
-+
-+ if (!JFS_HAS_INCOMPAT_FEATURE(journal,
-+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)){
-+ printk(KERN_ERR
-+ "JBD: Transaction %u "
-+ "found to be corrupt.\n",
-+ next_commit_ID);
-+ brelse(bh);
-+ break;
-+ }
-+ }
-+ crc32_sum = ~0;
-+ }
- brelse(bh);
- next_commit_ID++;
- continue;
-@@ -547,9 +684,10 @@
- * transaction marks the end of the valid log.
- */
-
-- if (pass == PASS_SCAN)
-- info->end_transaction = next_commit_ID;
-- else {
-+ if (pass == PASS_SCAN) {
-+ if (!info->end_transaction)
-+ info->end_transaction = next_commit_ID;
-+ } else {
- /* It's really bad news if different passes end up at
- * different places (but possible due to IO errors). */
- if (info->end_transaction != next_commit_ID) {
-Index: linux-2.6.18-128.1.6/fs/jbd/journal.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/jbd/journal.c 2009-06-02 23:24:00.000000000 -0600
-+++ linux-2.6.18-128.1.6/fs/jbd/journal.c 2009-06-02 23:26:07.000000000 -0600
-@@ -67,6 +67,7 @@
- EXPORT_SYMBOL(journal_check_used_features);
- EXPORT_SYMBOL(journal_check_available_features);
- EXPORT_SYMBOL(journal_set_features);
-+EXPORT_SYMBOL(journal_clear_features);
- EXPORT_SYMBOL(journal_create);
- EXPORT_SYMBOL(journal_load);
- EXPORT_SYMBOL(journal_destroy);
-@@ -1583,6 +1584,33 @@
- return 1;
- }
-
-+/**
-+ * int journal_clear_features () - Clear a given journal feature in the superblock
-+ * @journal: Journal to act on.
-+ * @compat: bitmask of compatible features
-+ * @ro: bitmask of features that force read-only mount
-+ * @incompat: bitmask of incompatible features
-+ *
-+ * Clear a given journal feature as present on the
-+ * superblock. Returns true if the requested features could be reset.
-+ *
-+ */
-+int journal_clear_features (journal_t *journal, unsigned long compat,
-+ unsigned long ro, unsigned long incompat)
-+{
-+ journal_superblock_t *sb;
-+
-+ jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
-+ compat, ro, incompat);
-+
-+ sb = journal->j_superblock;
-+
-+ sb->s_feature_compat &= ~cpu_to_be32(compat);
-+ sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
-+ sb->s_feature_incompat &= ~cpu_to_be32(incompat);
-+
-+ return 1;
-+}
-
- /**
- * int journal_update_format () - Update on-disk journal structure.
-Index: linux-2.6.18-128.1.6/fs/Kconfig
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/Kconfig 2009-04-14 21:05:39.000000000 -0600
-+++ linux-2.6.18-128.1.6/fs/Kconfig 2009-06-02 23:26:07.000000000 -0600
-@@ -206,6 +206,7 @@
-
- config JBD
- tristate
-+ select CRC32
- help
- This is a generic journaling layer for block devices. It is
- currently used by the ext3 and OCFS2 file systems, but it could
-Index: linux-2.6.18-128.1.6/include/linux/jbd.h
-===================================================================
---- linux-2.6.18-128.1.6.orig/include/linux/jbd.h 2009-06-02 23:24:00.000000000 -0600
-+++ linux-2.6.18-128.1.6/include/linux/jbd.h 2009-06-02 23:26:07.000000000 -0600
-@@ -148,6 +148,29 @@
- __be32 h_sequence;
- } journal_header_t;
-
-+/*
-+ * Checksum types.
-+ */
-+#define JFS_CRC32_CHKSUM 1
-+#define JFS_MD5_CHKSUM 2
-+#define JFS_SHA1_CHKSUM 3
-+
-+#define JFS_CRC32_CHKSUM_SIZE 4
-+
-+#define JFS_CHECKSUM_BYTES (32 / sizeof(u32))
-+/*
-+ * Commit block header for storing transactional checksums:
-+ */
-+struct commit_header
-+{
-+ __be32 h_magic;
-+ __be32 h_blocktype;
-+ __be32 h_sequence;
-+ unsigned char h_chksum_type;
-+ unsigned char h_chksum_size;
-+ unsigned char h_padding[2];
-+ __be32 h_chksum[JFS_CHECKSUM_BYTES];
-+};
-
- /*
- * The block tag: used to describe a single buffer in the journal
-@@ -234,12 +257,16 @@
- ((j)->j_format_version >= 2 && \
- ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask))))
-
--#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001
-+#define JFS_FEATURE_COMPAT_CHECKSUM 0x00000001
-+
-+#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001
-+#define JFS_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004
-
- /* Features known to this kernel version: */
--#define JFS_KNOWN_COMPAT_FEATURES 0
-+#define JFS_KNOWN_COMPAT_FEATURES JFS_FEATURE_COMPAT_CHECKSUM
- #define JFS_KNOWN_ROCOMPAT_FEATURES 0
--#define JFS_KNOWN_INCOMPAT_FEATURES JFS_FEATURE_INCOMPAT_REVOKE
-+#define JFS_KNOWN_INCOMPAT_FEATURES (JFS_FEATURE_INCOMPAT_REVOKE | \
-+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)
-
- #ifdef __KERNEL__
-
-@@ -1053,6 +1080,8 @@
- (journal_t *, unsigned long, unsigned long, unsigned long);
- extern int journal_set_features
- (journal_t *, unsigned long, unsigned long, unsigned long);
-+extern int journal_clear_features
-+ (journal_t *, unsigned long, unsigned long, unsigned long);
- extern int journal_create (journal_t *);
- extern int journal_load (journal_t *journal);
- #ifndef __GENKSYMS__
-Index: linux-2.6.18-128.1.6/Documentation/filesystems/ext3.txt
-===================================================================
---- linux-2.6.18-128.1.6.orig/Documentation/filesystems/ext3.txt 2006-09-19 21:42:06.000000000 -0600
-+++ linux-2.6.18-128.1.6/Documentation/filesystems/ext3.txt 2009-06-02 23:26:07.000000000 -0600
-@@ -14,6 +14,16 @@
- When mounting an ext3 filesystem, the following option are accepted:
- (*) == default
-
-+journal_checksum Enable checksumming of the journal transactions.
-+ This will allow the recovery code in e2fsck and the
-+ kernel to detect corruption in the kernel. It is a
-+ compatible change and will be ignored by older kernels.
-+
-+journal_async_commit Commit block can be written to disk without waiting
-+ for descriptor blocks. If enabled older kernels cannot
-+ mount the device. This will enable 'journal_checksum'
-+ internally.
-+
- journal=update Update the ext3 file system's journal to the current
- format.
-
+++ /dev/null
-Index: linux-2.6.18-128.1.6/include/linux/jbd.h
-===================================================================
---- linux-2.6.18-128.1.6.orig/include/linux/jbd.h 2009-06-02 23:22:50.000000000 -0600
-+++ linux-2.6.18-128.1.6/include/linux/jbd.h 2009-06-02 23:24:00.000000000 -0600
-@@ -428,6 +428,16 @@
- };
-
-
-+/*
-+ * Some stats for checkpoint phase
-+ */
-+struct transaction_chp_stats_s {
-+ unsigned long cs_chp_time;
-+ unsigned long cs_forced_to_close;
-+ unsigned long cs_written;
-+ unsigned long cs_dropped;
-+};
-+
- /* The transaction_t type is the guts of the journaling mechanism. It
- * tracks a compound transaction through its various states:
- *
-@@ -565,6 +575,21 @@
- spinlock_t t_handle_lock;
-
- /*
-+ * Longest time some handle had to wait for running transaction
-+ */
-+ unsigned long t_max_wait;
-+
-+ /*
-+ * When transaction started
-+ */
-+ unsigned long t_start;
-+
-+ /*
-+ * Checkpointing stats [j_checkpoint_sem]
-+ */
-+ struct transaction_chp_stats_s t_chp_stats;
-+
-+ /*
- * Number of outstanding updates running on this transaction
- * [t_handle_lock]
- */
-@@ -604,6 +629,57 @@
- struct list_head t_jcb;
- };
-
-+struct transaction_run_stats_s {
-+ unsigned long rs_wait;
-+ unsigned long rs_running;
-+ unsigned long rs_locked;
-+ unsigned long rs_flushing;
-+ unsigned long rs_logging;
-+
-+ unsigned long rs_handle_count;
-+ unsigned long rs_blocks;
-+ unsigned long rs_blocks_logged;
-+};
-+
-+struct transaction_stats_s
-+{
-+ int ts_type;
-+ unsigned long ts_tid;
-+ union {
-+ struct transaction_run_stats_s run;
-+ struct transaction_chp_stats_s chp;
-+ } u;
-+};
-+
-+#define JBD_STATS_RUN 1
-+#define JBD_STATS_CHECKPOINT 2
-+
-+#define ts_wait u.run.rs_wait
-+#define ts_running u.run.rs_running
-+#define ts_locked u.run.rs_locked
-+#define ts_flushing u.run.rs_flushing
-+#define ts_logging u.run.rs_logging
-+#define ts_handle_count u.run.rs_handle_count
-+#define ts_blocks u.run.rs_blocks
-+#define ts_blocks_logged u.run.rs_blocks_logged
-+
-+#define ts_chp_time u.chp.cs_chp_time
-+#define ts_forced_to_close u.chp.cs_forced_to_close
-+#define ts_written u.chp.cs_written
-+#define ts_dropped u.chp.cs_dropped
-+
-+#define CURRENT_MSECS (jiffies_to_msecs(jiffies))
-+
-+static inline unsigned int
-+jbd_time_diff(unsigned int start, unsigned int end)
-+{
-+ if (unlikely(start > end))
-+ end = end + (~0UL - start);
-+ else
-+ end -= start;
-+ return end;
-+}
-+
- /**
- * struct journal_s - The journal_s type is the concrete type associated with
- * journal_t.
-@@ -857,6 +933,16 @@
- pid_t j_last_sync_writer;
-
- /*
-+ *
-+ */
-+ struct transaction_stats_s *j_history;
-+ int j_history_max;
-+ int j_history_cur;
-+ spinlock_t j_history_lock;
-+ struct proc_dir_entry *j_proc_entry;
-+ struct transaction_stats_s j_stats;
-+
-+ /*
- * An opaque pointer to fs-private information. ext3 puts its
- * superblock pointer here
- */
-Index: linux-2.6.18-128.1.6/fs/jbd/transaction.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/jbd/transaction.c 2009-06-02 23:22:50.000000000 -0600
-+++ linux-2.6.18-128.1.6/fs/jbd/transaction.c 2009-06-02 23:24:00.000000000 -0600
-@@ -60,6 +60,8 @@
-
- J_ASSERT(journal->j_running_transaction == NULL);
- journal->j_running_transaction = transaction;
-+ transaction->t_max_wait = 0;
-+ transaction->t_start = CURRENT_MSECS;
-
- return transaction;
- }
-@@ -86,6 +88,7 @@
- int nblocks = handle->h_buffer_credits;
- transaction_t *new_transaction = NULL;
- int ret = 0;
-+ unsigned long ts = CURRENT_MSECS;
-
- if (nblocks > journal->j_max_transaction_buffers) {
- printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
-@@ -219,6 +222,12 @@
- /* OK, account for the buffers that this operation expects to
- * use and add the handle to the running transaction. */
-
-+ if (time_after(transaction->t_start, ts)) {
-+ ts = jbd_time_diff(ts, transaction->t_start);
-+ if (ts > transaction->t_max_wait)
-+ transaction->t_max_wait= ts;
-+ }
-+
- handle->h_transaction = transaction;
- transaction->t_outstanding_credits += nblocks;
- transaction->t_updates++;
-Index: linux-2.6.18-128.1.6/fs/jbd/journal.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/jbd/journal.c 2009-06-02 23:23:03.000000000 -0600
-+++ linux-2.6.18-128.1.6/fs/jbd/journal.c 2009-06-02 23:24:00.000000000 -0600
-@@ -36,6 +36,7 @@
- #include <linux/kthread.h>
- #include <linux/poison.h>
- #include <linux/proc_fs.h>
-+#include <linux/seq_file.h>
-
- #include <asm/uaccess.h>
- #include <asm/page.h>
-@@ -638,6 +639,300 @@
- return journal_add_journal_head(bh);
- }
-
-+struct jbd_stats_proc_session {
-+ journal_t *journal;
-+ struct transaction_stats_s *stats;
-+ int start;
-+ int max;
-+};
-+
-+static void *jbd_history_skip_empty(struct jbd_stats_proc_session *s,
-+ struct transaction_stats_s *ts,
-+ int first)
-+{
-+ if (ts == s->stats + s->max)
-+ ts = s->stats;
-+ if (!first && ts == s->stats + s->start)
-+ return NULL;
-+ while (ts->ts_type == 0) {
-+ ts++;
-+ if (ts == s->stats + s->max)
-+ ts = s->stats;
-+ if (ts == s->stats + s->start)
-+ return NULL;
-+ }
-+ return ts;
-+
-+}
-+
-+static void *jbd_seq_history_start(struct seq_file *seq, loff_t *pos)
-+{
-+ struct jbd_stats_proc_session *s = seq->private;
-+ struct transaction_stats_s *ts;
-+ int l = *pos;
-+
-+ if (l == 0)
-+ return SEQ_START_TOKEN;
-+ ts = jbd_history_skip_empty(s, s->stats + s->start, 1);
-+ if (!ts)
-+ return NULL;
-+ while (--l && (ts = jbd_history_skip_empty(s, ++ts, 0)) != NULL);
-+ return ts;
-+}
-+
-+static void *jbd_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
-+{
-+ struct jbd_stats_proc_session *s = seq->private;
-+ struct transaction_stats_s *ts = v;
-+
-+ ++*pos;
-+ if (v == SEQ_START_TOKEN)
-+ return jbd_history_skip_empty(s, s->stats + s->start, 1);
-+ else
-+ return jbd_history_skip_empty(s, ++ts, 0);
-+}
-+
-+static int jbd_seq_history_show(struct seq_file *seq, void *v)
-+{
-+ struct transaction_stats_s *ts = v;
-+ if (v == SEQ_START_TOKEN) {
-+ seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s "
-+ "%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid",
-+ "wait", "run", "lock", "flush", "log", "hndls",
-+ "block", "inlog", "ctime", "write", "drop",
-+ "close");
-+ return 0;
-+ }
-+ if (ts->ts_type == JBD_STATS_RUN)
-+ seq_printf(seq, "%-4s %-5lu %-5lu %-5lu %-5lu %-5lu %-5lu "
-+ "%-6lu %-5lu %-5lu\n", "R", ts->ts_tid,
-+ ts->ts_wait, ts->ts_running, ts->ts_locked,
-+ ts->ts_flushing, ts->ts_logging,
-+ ts->ts_handle_count, ts->ts_blocks,
-+ ts->ts_blocks_logged);
-+ else if (ts->ts_type == JBD_STATS_CHECKPOINT)
-+ seq_printf(seq, "%-4s %-5lu %48s %-5lu %-5lu %-5lu %-5lu\n",
-+ "C", ts->ts_tid, " ", ts->ts_chp_time,
-+ ts->ts_written, ts->ts_dropped,
-+ ts->ts_forced_to_close);
-+ else
-+ J_ASSERT(0);
-+ return 0;
-+}
-+
-+static void jbd_seq_history_stop(struct seq_file *seq, void *v)
-+{
-+}
-+
-+static struct seq_operations jbd_seq_history_ops = {
-+ .start = jbd_seq_history_start,
-+ .next = jbd_seq_history_next,
-+ .stop = jbd_seq_history_stop,
-+ .show = jbd_seq_history_show,
-+};
-+
-+static int jbd_seq_history_open(struct inode *inode, struct file *file)
-+{
-+ journal_t *journal = PDE(inode)->data;
-+ struct jbd_stats_proc_session *s;
-+ int rc, size;
-+
-+ s = kmalloc(sizeof(*s), GFP_KERNEL);
-+ if (s == NULL)
-+ return -EIO;
-+ size = sizeof(struct transaction_stats_s) * journal->j_history_max;
-+ s->stats = kmalloc(size, GFP_KERNEL);
-+ if (s->stats == NULL) {
-+ kfree(s);
-+ return -EIO;
-+ }
-+ spin_lock(&journal->j_history_lock);
-+ memcpy(s->stats, journal->j_history, size);
-+ s->max = journal->j_history_max;
-+ s->start = journal->j_history_cur % s->max;
-+ spin_unlock(&journal->j_history_lock);
-+
-+ rc = seq_open(file, &jbd_seq_history_ops);
-+ if (rc == 0) {
-+ struct seq_file *m = (struct seq_file *)file->private_data;
-+ m->private = s;
-+ } else {
-+ kfree(s->stats);
-+ kfree(s);
-+ }
-+ return rc;
-+
-+}
-+
-+static int jbd_seq_history_release(struct inode *inode, struct file *file)
-+{
-+ struct seq_file *seq = (struct seq_file *)file->private_data;
-+ struct jbd_stats_proc_session *s = seq->private;
-+ kfree(s->stats);
-+ kfree(s);
-+ return seq_release(inode, file);
-+}
-+
-+static struct file_operations jbd_seq_history_fops = {
-+ .owner = THIS_MODULE,
-+ .open = jbd_seq_history_open,
-+ .read = seq_read,
-+ .llseek = seq_lseek,
-+ .release = jbd_seq_history_release,
-+};
-+
-+static void *jbd_seq_info_start(struct seq_file *seq, loff_t *pos)
-+{
-+ return *pos ? NULL : SEQ_START_TOKEN;
-+}
-+
-+static void *jbd_seq_info_next(struct seq_file *seq, void *v, loff_t *pos)
-+{
-+ return NULL;
-+}
-+
-+static int jbd_seq_info_show(struct seq_file *seq, void *v)
-+{
-+ struct jbd_stats_proc_session *s = seq->private;
-+ if (v != SEQ_START_TOKEN)
-+ return 0;
-+ seq_printf(seq, "%lu transaction, each upto %u blocks\n",
-+ s->stats->ts_tid,
-+ s->journal->j_max_transaction_buffers);
-+ if (s->stats->ts_tid == 0)
-+ return 0;
-+ seq_printf(seq, "average: \n %lums waiting for transaction\n",
-+ s->stats->ts_wait / s->stats->ts_tid);
-+ seq_printf(seq, " %lums running transaction\n",
-+ s->stats->ts_running / s->stats->ts_tid);
-+ seq_printf(seq, " %lums transaction was being locked\n",
-+ s->stats->ts_locked / s->stats->ts_tid);
-+ seq_printf(seq, " %lums flushing data (in ordered mode)\n",
-+ s->stats->ts_flushing / s->stats->ts_tid);
-+ seq_printf(seq, " %lums logging transaction\n",
-+ s->stats->ts_logging / s->stats->ts_tid);
-+ seq_printf(seq, " %lu handles per transaction\n",
-+ s->stats->ts_handle_count / s->stats->ts_tid);
-+ seq_printf(seq, " %lu blocks per transaction\n",
-+ s->stats->ts_blocks / s->stats->ts_tid);
-+ seq_printf(seq, " %lu logged blocks per transaction\n",
-+ s->stats->ts_blocks_logged / s->stats->ts_tid);
-+ return 0;
-+}
-+
-+static void jbd_seq_info_stop(struct seq_file *seq, void *v)
-+{
-+}
-+
-+static struct seq_operations jbd_seq_info_ops = {
-+ .start = jbd_seq_info_start,
-+ .next = jbd_seq_info_next,
-+ .stop = jbd_seq_info_stop,
-+ .show = jbd_seq_info_show,
-+};
-+
-+static int jbd_seq_info_open(struct inode *inode, struct file *file)
-+{
-+ journal_t *journal = PDE(inode)->data;
-+ struct jbd_stats_proc_session *s;
-+ int rc, size;
-+
-+ s = kmalloc(sizeof(*s), GFP_KERNEL);
-+ if (s == NULL)
-+ return -EIO;
-+ size = sizeof(struct transaction_stats_s);
-+ s->stats = kmalloc(size, GFP_KERNEL);
-+ if (s->stats == NULL) {
-+ kfree(s);
-+ return -EIO;
-+ }
-+ spin_lock(&journal->j_history_lock);
-+ memcpy(s->stats, &journal->j_stats, size);
-+ s->journal = journal;
-+ spin_unlock(&journal->j_history_lock);
-+
-+ rc = seq_open(file, &jbd_seq_info_ops);
-+ if (rc == 0) {
-+ struct seq_file *m = (struct seq_file *)file->private_data;
-+ m->private = s;
-+ } else {
-+ kfree(s->stats);
-+ kfree(s);
-+ }
-+ return rc;
-+
-+}
-+
-+static int jbd_seq_info_release(struct inode *inode, struct file *file)
-+{
-+ struct seq_file *seq = (struct seq_file *)file->private_data;
-+ struct jbd_stats_proc_session *s = seq->private;
-+ kfree(s->stats);
-+ kfree(s);
-+ return seq_release(inode, file);
-+}
-+
-+static struct file_operations jbd_seq_info_fops = {
-+ .owner = THIS_MODULE,
-+ .open = jbd_seq_info_open,
-+ .read = seq_read,
-+ .llseek = seq_lseek,
-+ .release = jbd_seq_info_release,
-+};
-+
-+static struct proc_dir_entry *proc_jbd_stats = NULL;
-+
-+static void jbd_stats_proc_init(journal_t *journal)
-+{
-+ char name[64];
-+
-+ snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name));
-+ journal->j_proc_entry = proc_mkdir(name, proc_jbd_stats);
-+ if (journal->j_proc_entry) {
-+ struct proc_dir_entry *p;
-+ p = create_proc_entry("history", S_IRUGO,
-+ journal->j_proc_entry);
-+ if (p) {
-+ p->proc_fops = &jbd_seq_history_fops;
-+ p->data = journal;
-+ p = create_proc_entry("info", S_IRUGO,
-+ journal->j_proc_entry);
-+ if (p) {
-+ p->proc_fops = &jbd_seq_info_fops;
-+ p->data = journal;
-+ }
-+ }
-+ }
-+}
-+
-+static void jbd_stats_proc_exit(journal_t *journal)
-+{
-+ char name[64];
-+
-+ snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name));
-+ remove_proc_entry("info", journal->j_proc_entry);
-+ remove_proc_entry("history", journal->j_proc_entry);
-+ remove_proc_entry(name, proc_jbd_stats);
-+}
-+
-+static void journal_init_stats(journal_t *journal)
-+{
-+ int size;
-+
-+ if (proc_jbd_stats == NULL)
-+ return;
-+
-+ journal->j_history_max = 100;
-+ size = sizeof(struct transaction_stats_s) * journal->j_history_max;
-+ journal->j_history = kmalloc(size, GFP_KERNEL);
-+ if (journal->j_history == NULL) {
-+ journal->j_history_max = 0;
-+ return;
-+ }
-+ memset(journal->j_history, 0, size);
-+ spin_lock_init(&journal->j_history_lock);
-+}
-+
- /*
- * Management for journal control blocks: functions to create and
- * destroy journal_t structures, and to initialise and read existing
-@@ -680,6 +975,9 @@
- kfree(journal);
- goto fail;
- }
-+
-+ journal_init_stats(journal);
-+
- return journal;
- fail:
- return NULL;
-@@ -723,6 +1021,7 @@
- journal->j_blk_offset = start;
- journal->j_maxlen = len;
- journal->j_blocksize = blocksize;
-+ jbd_stats_proc_init(journal);
-
- bh = __getblk(journal->j_dev, start, journal->j_blocksize);
- J_ASSERT(bh != NULL);
-@@ -772,6 +1071,7 @@
-
- journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
- journal->j_blocksize = inode->i_sb->s_blocksize;
-+ jbd_stats_proc_init(journal);
-
- /* journal descriptor can store up to n blocks -bzzz */
- n = journal->j_blocksize / sizeof(journal_block_tag_t);
-@@ -1168,6 +1468,8 @@
- brelse(journal->j_sb_buffer);
- }
-
-+ if (journal->j_proc_entry)
-+ jbd_stats_proc_exit(journal);
- if (journal->j_inode)
- iput(journal->j_inode);
- if (journal->j_revoke)
-@@ -2015,6 +2317,28 @@
-
- #endif
-
-+#if defined(CONFIG_PROC_FS)
-+
-+#define JBD_STATS_PROC_NAME "fs/jbd"
-+
-+static void __init create_jbd_stats_proc_entry(void)
-+{
-+ proc_jbd_stats = proc_mkdir(JBD_STATS_PROC_NAME, NULL);
-+}
-+
-+static void __exit remove_jbd_stats_proc_entry(void)
-+{
-+ if (proc_jbd_stats)
-+ remove_proc_entry(JBD_STATS_PROC_NAME, NULL);
-+}
-+
-+#else
-+
-+#define create_jbd_stats_proc_entry() do {} while (0)
-+#define remove_jbd_stats_proc_entry() do {} while (0)
-+
-+#endif
-+
- kmem_cache_t *jbd_handle_cache;
-
- static int __init journal_init_handle_cache(void)
-@@ -2078,6 +2402,7 @@
- if (ret != 0)
- journal_destroy_caches();
- create_jbd_proc_entry();
-+ create_jbd_stats_proc_entry();
- return ret;
- }
-
-@@ -2089,6 +2414,7 @@
- printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
- #endif
- remove_jbd_proc_entry();
-+ remove_jbd_stats_proc_entry();
- journal_destroy_caches();
- }
-
-Index: linux-2.6.18-128.1.6/fs/jbd/checkpoint.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/jbd/checkpoint.c 2009-06-02 23:22:50.000000000 -0600
-+++ linux-2.6.18-128.1.6/fs/jbd/checkpoint.c 2009-06-02 23:24:00.000000000 -0600
-@@ -242,7 +242,7 @@
- * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
- */
- static int __process_buffer(journal_t *journal, struct journal_head *jh,
-- struct buffer_head **bhs, int *batch_count)
-+ struct buffer_head **bhs, int *batch_count, transaction_t *transaction)
- {
- struct buffer_head *bh = jh2bh(jh);
- int ret = 0;
-@@ -260,6 +260,7 @@
- transaction_t *t = jh->b_transaction;
- tid_t tid = t->t_tid;
-
-+ transaction->t_chp_stats.cs_forced_to_close++;
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- log_start_commit(journal, tid);
-@@ -291,6 +292,7 @@
- bhs[*batch_count] = bh;
- __buffer_relink_io(jh);
- jbd_unlock_bh_state(bh);
-+ transaction->t_chp_stats.cs_written++;
- (*batch_count)++;
- if (*batch_count == NR_BATCH) {
- spin_unlock(&journal->j_list_lock);
-@@ -336,6 +338,8 @@
- if (!journal->j_checkpoint_transactions)
- goto out;
- transaction = journal->j_checkpoint_transactions;
-+ if (transaction->t_chp_stats.cs_chp_time == 0)
-+ transaction->t_chp_stats.cs_chp_time = CURRENT_MSECS;
- this_tid = transaction->t_tid;
- restart:
- /*
-@@ -360,7 +364,8 @@
- retry = 1;
- break;
- }
-- retry = __process_buffer(journal, jh, bhs,&batch_count);
-+ retry = __process_buffer(journal, jh, bhs,&batch_count,
-+ transaction);
- if (retry < 0 && !result)
- result = retry;
- if (!retry && lock_need_resched(&journal->j_list_lock)){
-@@ -692,6 +697,8 @@
-
- void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
- {
-+ struct transaction_stats_s stats;
-+
- assert_spin_locked(&journal->j_list_lock);
- if (transaction->t_cpnext) {
- transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
-@@ -718,5 +725,25 @@
- J_ASSERT(journal->j_running_transaction != transaction);
-
- jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
-+
-+ /*
-+ * File the transaction for history
-+ */
-+ if (transaction->t_chp_stats.cs_written != 0 ||
-+ transaction->t_chp_stats.cs_chp_time != 0) {
-+ stats.ts_type = JBD_STATS_CHECKPOINT;
-+ stats.ts_tid = transaction->t_tid;
-+ stats.u.chp = transaction->t_chp_stats;
-+ if (stats.ts_chp_time)
-+ stats.ts_chp_time =
-+ jbd_time_diff(stats.ts_chp_time, CURRENT_MSECS);
-+ spin_lock(&journal->j_history_lock);
-+ memcpy(journal->j_history + journal->j_history_cur, &stats,
-+ sizeof(stats));
-+ if (++journal->j_history_cur == journal->j_history_max)
-+ journal->j_history_cur = 0;
-+ spin_unlock(&journal->j_history_lock);
-+ }
-+
- kfree(transaction);
- }
-Index: linux-2.6.18-128.1.6/fs/jbd/commit.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/jbd/commit.c 2009-06-02 23:22:50.000000000 -0600
-+++ linux-2.6.18-128.1.6/fs/jbd/commit.c 2009-06-02 23:24:00.000000000 -0600
-@@ -13,6 +13,7 @@
- * part of the ext2fs journaling system.
- */
-
-+#include <linux/jiffies.h>
- #include <linux/time.h>
- #include <linux/fs.h>
- #include <linux/jbd.h>
-@@ -22,6 +23,7 @@
- #include <linux/pagemap.h>
- #include <linux/smp_lock.h>
-
-+
- /*
- * Default IO end handler for temporary BJ_IO buffer_heads.
- */
-@@ -288,6 +290,7 @@
- */
- void journal_commit_transaction(journal_t *journal)
- {
-+ struct transaction_stats_s stats;
- transaction_t *commit_transaction;
- struct journal_head *jh, *new_jh, *descriptor;
- struct buffer_head **wbuf = journal->j_wbuf;
-@@ -334,6 +337,11 @@
- spin_lock(&journal->j_state_lock);
- commit_transaction->t_state = T_LOCKED;
-
-+ stats.ts_wait = commit_transaction->t_max_wait;
-+ stats.ts_locked = CURRENT_MSECS;
-+ stats.ts_running = jbd_time_diff(commit_transaction->t_start,
-+ stats.ts_locked);
-+
- spin_lock(&commit_transaction->t_handle_lock);
- while (commit_transaction->t_updates) {
- DEFINE_WAIT(wait);
-@@ -404,6 +412,9 @@
- */
- journal_switch_revoke_table(journal);
-
-+ stats.ts_flushing = CURRENT_MSECS;
-+ stats.ts_locked = jbd_time_diff(stats.ts_locked, stats.ts_flushing);
-+
- commit_transaction->t_state = T_FLUSH;
- journal->j_committing_transaction = commit_transaction;
- journal->j_running_transaction = NULL;
-@@ -484,6 +495,11 @@
- J_ASSERT(commit_transaction->t_nr_buffers <=
- commit_transaction->t_outstanding_credits);
-
-+ stats.ts_logging = CURRENT_MSECS;
-+ stats.ts_flushing = jbd_time_diff(stats.ts_flushing, stats.ts_logging);
-+ stats.ts_blocks = commit_transaction->t_outstanding_credits;
-+ stats.ts_blocks_logged = 0;
-+
- descriptor = NULL;
- bufs = 0;
- while (commit_transaction->t_buffers) {
-@@ -633,6 +649,7 @@
- submit_bh(WRITE, bh);
- }
- cond_resched();
-+ stats.ts_blocks_logged += bufs;
-
- /* Force a new descriptor to be generated next
- time round the loop. */
-@@ -832,6 +849,7 @@
- cp_transaction = jh->b_cp_transaction;
- if (cp_transaction) {
- JBUFFER_TRACE(jh, "remove from old cp transaction");
-+ cp_transaction->t_chp_stats.cs_dropped++;
- __journal_remove_checkpoint(jh);
- }
-
-@@ -908,6 +926,36 @@
-
- J_ASSERT(commit_transaction->t_state == T_COMMIT);
-
-+ commit_transaction->t_start = CURRENT_MSECS;
-+ stats.ts_logging = jbd_time_diff(stats.ts_logging,
-+ commit_transaction->t_start);
-+
-+ /*
-+ * File the transaction for history
-+ */
-+ stats.ts_type = JBD_STATS_RUN;
-+ stats.ts_tid = commit_transaction->t_tid;
-+ stats.ts_handle_count = commit_transaction->t_handle_count;
-+ spin_lock(&journal->j_history_lock);
-+ memcpy(journal->j_history + journal->j_history_cur, &stats,
-+ sizeof(stats));
-+ if (++journal->j_history_cur == journal->j_history_max)
-+ journal->j_history_cur = 0;
-+
-+ /*
-+ * Calculate overall stats
-+ */
-+ journal->j_stats.ts_tid++;
-+ journal->j_stats.ts_wait += stats.ts_wait;
-+ journal->j_stats.ts_running += stats.ts_running;
-+ journal->j_stats.ts_locked += stats.ts_locked;
-+ journal->j_stats.ts_flushing += stats.ts_flushing;
-+ journal->j_stats.ts_logging += stats.ts_logging;
-+ journal->j_stats.ts_handle_count += stats.ts_handle_count;
-+ journal->j_stats.ts_blocks += stats.ts_blocks;
-+ journal->j_stats.ts_blocks_logged += stats.ts_blocks_logged;
-+ spin_unlock(&journal->j_history_lock);
-+
- commit_transaction->t_state = T_FINISHED;
- J_ASSERT(commit_transaction == journal->j_committing_transaction);
- journal->j_commit_sequence = commit_transaction->t_tid;
+++ /dev/null
-This patch is no longer needed for Lustre, since Lustre 2.2. It is kept
-in the kernel patch series for compatibility with older Lustre releases
-to simplify the upgrade process so that both the kernel and Lustre do
-not need to be upgraded at the same time. See Jira issue LU-433.
-
-Index: linux-2.6.18-128.1.6/include/linux/jbd2.h
-===================================================================
---- linux-2.6.18-128.1.6.orig/include/linux/jbd2.h 2009-04-15 08:35:28.000000000 +0530
-+++ linux-2.6.18-128.1.6/include/linux/jbd2.h 2009-05-28 15:10:18.000000000 +0530
-@@ -381,6 +381,27 @@
- bit_spin_unlock(BH_JournalHead, &bh->b_state);
- }
-
-+#define HAVE_JOURNAL_CALLBACK_STATUS
-+/**
-+ * struct journal_callback - Base structure for callback information.
-+ * @jcb_list: list information for other callbacks attached to the same handle.
-+ * @jcb_func: Function to call with this callback structure.
-+ *
-+ * This struct is a 'seed' structure for a using with your own callback
-+ * structs. If you are using callbacks you must allocate one of these
-+ * or another struct of your own definition which has this struct
-+ * as it's first element and pass it to journal_callback_set().
-+ *
-+ * This is used internally by jbd2 to maintain callback information.
-+ *
-+ * See journal_callback_set for more information.
-+ **/
-+struct journal_callback {
-+ struct list_head jcb_list; /* t_jcb_lock */
-+ void (*jcb_func)(struct journal_callback *jcb, int error);
-+ /* user data goes here */
-+};
-+
- struct jbd2_revoke_table_s;
-
- /**
-@@ -389,6 +410,7 @@
- * @h_transaction: Which compound transaction is this update a part of?
- * @h_buffer_credits: Number of remaining buffers we are allowed to dirty.
- * @h_ref: Reference count on this handle
-+ * @h_jcb: List of application registered callbacks for this handle.
- * @h_err: Field for caller's use to track errors through large fs operations
- * @h_sync: flag for sync-on-close
- * @h_jdata: flag to force data journaling
-@@ -414,6 +436,13 @@
- /* operations */
- int h_err;
-
-+ /*
-+ * List of application registered callbacks for this handle. The
-+ * function(s) will be called after the transaction that this handle is
-+ * part of has been committed to disk. [t_jcb_lock]
-+ */
-+ struct list_head h_jcb;
-+
- /* Flags [no locking] */
- unsigned int h_sync: 1; /* sync-on-close */
- unsigned int h_jdata: 1; /* force data journaling */
-@@ -469,6 +498,8 @@
- * j_state_lock
- * ->j_list_lock (journal_unmap_buffer)
- *
-+ * t_handle_lock
-+ * ->t_jcb_lock
- */
-
- struct transaction_s
-@@ -615,6 +646,15 @@
- */
- int t_handle_count;
-
-+ /*
-+ * Protects the callback list
-+ */
-+ spinlock_t t_jcb_lock;
-+ /*
-+ * List of registered callback functions for this transaction.
-+ * Called when the transaction is committed. [t_jcb_lock]
-+ */
-+ struct list_head t_jcb;
- /*
- * For use by the filesystem to store fs-specific data
- * structures associated with the transaction
-@@ -1018,6 +1058,9 @@
- extern int jbd2_journal_flush (journal_t *);
- extern void jbd2_journal_lock_updates (journal_t *);
- extern void jbd2_journal_unlock_updates (journal_t *);
-+extern void jbd2_journal_callback_set(handle_t *handle,
-+ void (*fn)(struct journal_callback *,int),
-+ struct journal_callback *jcb);
-
- extern journal_t * jbd2_journal_init_dev(struct block_device *bdev,
- struct block_device *fs_dev,
-Index: linux-2.6.18-128.1.6/fs/jbd2/checkpoint.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/jbd2/checkpoint.c 2009-04-15 08:35:28.000000000 +0530
-+++ linux-2.6.18-128.1.6/fs/jbd2/checkpoint.c 2009-05-28 15:10:18.000000000 +0530
-@@ -695,6 +695,7 @@
- J_ASSERT(transaction->t_checkpoint_list == NULL);
- J_ASSERT(transaction->t_checkpoint_io_list == NULL);
- J_ASSERT(transaction->t_updates == 0);
-+ J_ASSERT(list_empty(&transaction->t_jcb));
- J_ASSERT(journal->j_committing_transaction != transaction);
- J_ASSERT(journal->j_running_transaction != transaction);
-
-Index: linux-2.6.18-128.1.6/fs/jbd2/commit.c
-===================================================================
---- linux-2.6.18-164.6.1/fs/jbd2/commit.c 2010-01-21 11:24:52.000000000 +0530
-+++ linux-2.6.18-164.6.1_new/fs/jbd2/commit.c 2010-01-21 11:26:36.000000000 +0530
-@@ -832,6 +832,29 @@ wait_for_iobuf:
- processing: any buffers committed as a result of this
- transaction can be removed from any checkpoint list it was on
- before. */
-+ /*
-+ * Call any callbacks that had been registered for handles in this
-+ * transaction. It is up to the callback to free any allocated
-+ * memory.
-+ *
-+ * The spinlocking (t_jcb_lock) here is surely unnecessary...
-+ */
-+ spin_lock(&commit_transaction->t_jcb_lock);
-+ if (!list_empty(&commit_transaction->t_jcb)) {
-+ struct list_head *p, *n;
-+ int error = is_journal_aborted(journal);
-+
-+ list_for_each_safe(p, n, &commit_transaction->t_jcb) {
-+ struct journal_callback *jcb;
-+
-+ jcb = list_entry(p, struct journal_callback, jcb_list);
-+ list_del(p);
-+ spin_unlock(&commit_transaction->t_jcb_lock);
-+ jcb->jcb_func(jcb, error);
-+ spin_lock(&commit_transaction->t_jcb_lock);
-+ }
-+ }
-+ spin_unlock(&commit_transaction->t_jcb_lock);
-
- jbd_debug(3, "JBD: commit phase 6\n");
-
-Index: linux-2.6.18-128.1.6/fs/jbd2/journal.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/jbd2/journal.c 2009-04-15 08:35:28.000000000 +0530
-+++ linux-2.6.18-128.1.6/fs/jbd2/journal.c 2009-05-28 17:13:35.000000000 +0530
-@@ -80,6 +80,8 @@
- EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
- EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
- EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
-+EXPORT_SYMBOL(jbd2_journal_callback_set);
-+EXPORT_SYMBOL(jbd2_journal_bmap);
-
- static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
- static void __journal_abort_soft (journal_t *journal, int errno);
-Index: linux-2.6.18-128.1.6/fs/jbd2/transaction.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/jbd2/transaction.c 2009-04-15 08:35:28.000000000 +0530
-+++ linux-2.6.18-128.1.6/fs/jbd2/transaction.c 2009-05-28 15:11:28.000000000 +0530
-@@ -51,6 +51,9 @@
- spin_lock_init(&transaction->t_handle_lock);
- INIT_LIST_HEAD(&transaction->t_inode_list);
- INIT_LIST_HEAD(&transaction->t_private_list);
-+ INIT_LIST_HEAD(&transaction->t_jcb);
-+ spin_lock_init(&transaction->t_jcb_lock);
-+
-
- /* Set up the commit timer for the new transaction. */
- journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
-@@ -251,6 +254,7 @@
- memset(handle, 0, sizeof(*handle));
- handle->h_buffer_credits = nblocks;
- handle->h_ref = 1;
-+ INIT_LIST_HEAD(&handle->h_jcb);
-
- lockdep_init_map(&handle->h_lockdep_map, "jbd2_handle",
- &jbd2_handle_key, 0);
-@@ -1349,6 +1353,36 @@
- }
-
- /**
-+ * void jbd2_journal_callback_set() - Register a callback function for this handle.
-+ * @handle: handle to attach the callback to.
-+ * @func: function to callback.
-+ * @jcb: structure with additional information required by func() , and
-+ * some space for jbd2 internal information.
-+ *
-+ * The function will be
-+ * called when the transaction that this handle is part of has been
-+ * committed to disk with the original callback data struct and the
-+ * error status of the journal as parameters. There is no guarantee of
-+ * ordering between handles within a single transaction, nor between
-+ * callbacks registered on the same handle.
-+ *
-+ * The caller is responsible for allocating the journal_callback struct.
-+ * This is to allow the caller to add as much extra data to the callback
-+ * as needed, but reduce the overhead of multiple allocations. The caller
-+ * allocated struct must start with a struct journal_callback at offset 0,
-+ * and has the caller-specific data afterwards.
-+ */
-+void jbd2_journal_callback_set(handle_t *handle,
-+ void (*func)(struct journal_callback *jcb, int error),
-+ struct journal_callback *jcb)
-+{
-+ spin_lock(&handle->h_transaction->t_jcb_lock);
-+ list_add_tail(&jcb->jcb_list, &handle->h_jcb);
-+ spin_unlock(&handle->h_transaction->t_jcb_lock);
-+ jcb->jcb_func = func;
-+}
-+
-+/**
- * int jbd2_journal_stop() - complete a transaction
- * @handle: tranaction to complete.
- *
-@@ -1422,6 +1456,11 @@
- wake_up(&journal->j_wait_transaction_locked);
- }
-
-+ /* Move callbacks from the handle to the transaction. */
-+ spin_lock(&transaction->t_jcb_lock);
-+ list_splice(&handle->h_jcb, &transaction->t_jcb);
-+ spin_unlock(&transaction->t_jcb_lock);
-+
- /*
- * If the handle is marked SYNC, we need to set another commit
- * going! We also want to force a commit if the current
+++ /dev/null
-From 42e140bf105aea1c9679b1cd128aebc35196e6fc Mon Sep 17 00:00:00 2001
-From: yangsheng <sheng.yang@oracle.com>
-Date: Mon, 15 Nov 2010 21:26:35 +0800
-Subject: [PATCH] jbd2_stats_proc_init wrong place.
-
- The jbd2_stats_proc_init() was placed on wrong location in
- jbd2_journal_init_dev(). This may cause /proc/fs/jdb2/<dev>/*
- cannot be created when using external journal device.
-
- Reviewed-by: Andreas Dilger <andreas.dilger@oracle.com>
-
----
----
- fs/jbd2/journal.c | 16 ++++++++--------
- 1 files changed, 8 insertions(+), 8 deletions(-)
-
-diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
-index c590d15..f837ba9 100644
---- a/fs/jbd2/journal.c
-+++ b/fs/jbd2/journal.c
-@@ -899,6 +899,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
-
- /* journal descriptor can store up to n blocks -bzzz */
- journal->j_blocksize = blocksize;
-+ journal->j_dev = bdev;
-+ journal->j_fs_dev = fs_dev;
-+ journal->j_blk_offset = start;
-+ journal->j_maxlen = len;
-+ bdevname(journal->j_dev, journal->j_devname);
-+ p = journal->j_devname;
-+ while ((p = strchr(p, '/')))
-+ *p = '!';
- jbd2_stats_proc_init(journal);
- n = journal->j_blocksize / sizeof(journal_block_tag_t);
- journal->j_wbufsize = n;
-@@ -908,14 +916,6 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
- __func__);
- goto out_err;
- }
-- journal->j_dev = bdev;
-- journal->j_fs_dev = fs_dev;
-- journal->j_blk_offset = start;
-- journal->j_maxlen = len;
-- bdevname(journal->j_dev, journal->j_devname);
-- p = journal->j_devname;
-- while ((p = strchr(p, '/')))
-- *p = '!';
-
- bh = __getblk(journal->j_dev, start, journal->j_blocksize);
- if (!bh) {
---
-1.7.2.3
-
+++ /dev/null
-diff -Naur base.linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iscsi_iser.c linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iscsi_iser.c
---- base.linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iscsi_iser.c 2010-09-09 16:57:15.000000000 -0400
-+++ linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iscsi_iser.c 2010-09-09 17:02:17.000000000 -0400
-@@ -586,18 +586,25 @@
- iser_conn_terminate(ib_conn);
- }
-
-+static int iscsi_iser_slave_configure(struct scsi_device *sdev)
-+{
-+ blk_queue_dma_alignment(sdev->request_queue, 0);
-+ return 0;
-+}
-+
- static struct scsi_host_template iscsi_iser_sht = {
- .module = THIS_MODULE,
- .name = "iSCSI Initiator over iSER, v." DRV_VER,
- .queuecommand = iscsi2_queuecommand,
- .change_queue_depth = iscsi2_change_queue_depth,
- .sg_tablesize = ISCSI_ISER_SG_TABLESIZE,
-- .max_sectors = 1024,
-+ .max_sectors = 0xffff,
- .cmd_per_lun = ISER_DEF_CMD_PER_LUN,
- .eh_abort_handler = iscsi2_eh_abort,
- .eh_device_reset_handler= iscsi2_eh_device_reset,
- .eh_host_reset_handler= iscsi2_eh_target_reset,
- .use_clustering = DISABLE_CLUSTERING,
-+ .slave_configure = iscsi_iser_slave_configure,
- .proc_name = "iscsi_iser",
- .this_id = -1,
- };
-diff -Naur base.linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iscsi_iser.h linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iscsi_iser.h
---- base.linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iscsi_iser.h 2010-09-09 16:57:15.000000000 -0400
-+++ linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iscsi_iser.h 2010-09-09 17:03:17.000000000 -0400
-@@ -92,7 +92,8 @@
- #define MASK_4K (~(SIZE_4K-1))
-
- /* support upto 512KB in one RDMA */
--#define ISCSI_ISER_SG_TABLESIZE (0x80000 >> SHIFT_4K)
-+/* FMR space for 1 MB of 4k-page transfers, plus 1 if not page aligned */
-+#define ISCSI_ISER_SG_TABLESIZE (((1<<20) >> SHIFT_4K) + 1)
- #define ISER_DEF_CMD_PER_LUN 128
-
- /* QP settings */
-diff -Naur base.linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iser_verbs.c linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iser_verbs.c
---- base.linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iser_verbs.c 2010-09-09 16:57:15.000000000 -0400
-+++ linux-2.6.18.x86_64/drivers/infiniband/ulp/iser/iser_verbs.c 2010-09-09 17:04:44.000000000 -0400
-@@ -137,7 +137,7 @@
- device = ib_conn->device;
-
- ib_conn->page_vec = kmalloc(sizeof(struct iser_page_vec) +
-- (sizeof(u64) * (ISCSI_ISER_SG_TABLESIZE +1)),
-+ sizeof(u64) * ISCSI_ISER_SG_TABLESIZE,
- GFP_KERNEL);
- if (!ib_conn->page_vec) {
- ret = -ENOMEM;
-@@ -146,9 +146,7 @@
- ib_conn->page_vec->pages = (u64 *) (ib_conn->page_vec + 1);
-
- params.page_shift = SHIFT_4K;
-- /* when the first/last SG element are not start/end *
-- * page aligned, the map whould be of N+1 pages */
-- params.max_pages_per_fmr = ISCSI_ISER_SG_TABLESIZE + 1;
-+ params.max_pages_per_fmr = ISCSI_ISER_SG_TABLESIZE;
- /* make the pool size twice the max number of SCSI commands *
- * the ML is expected to queue, watermark for unmap at 50% */
- params.pool_size = ISCSI_DEF_XMIT_CMDS_MAX * 2;
-diff -Naur base.linux-2.6.18.x86_64/include/scsi/libiscsi2.h linux-2.6.18.x86_64/include/scsi/libiscsi2.h
---- base.linux-2.6.18.x86_64/include/scsi/libiscsi2.h 2010-09-09 16:57:35.000000000 -0400
-+++ linux-2.6.18.x86_64/include/scsi/libiscsi2.h 2010-09-09 17:05:34.000000000 -0400
-@@ -43,7 +43,7 @@
- struct iscsi_nopin;
- struct device;
-
--#define ISCSI_DEF_XMIT_CMDS_MAX 128 /* must be power of 2 */
-+#define ISCSI_DEF_XMIT_CMDS_MAX 256 /* must be power of 2 */
- #define ISCSI_MGMT_CMDS_MAX 15
-
- #define ISCSI_DEF_CMD_PER_LUN 32
+++ /dev/null
-diff .prev/drivers/md/bitmap.c ./drivers/md/bitmap.c
---- .prev/drivers/md/bitmap.c 2007-02-07 13:03:56.000000000 +1100
-+++ ./drivers/md/bitmap.c 2007-02-07 21:34:47.000000000 +1100
-@@ -1160,6 +1160,22 @@ int bitmap_startwrite(struct bitmap *bit
- return 0;
- }
-
-+ if (unlikely((*bmc & COUNTER_MAX) == COUNTER_MAX)) {
-+ DEFINE_WAIT(__wait);
-+ /* note that it is safe to do the prepare_to_wait
-+ * after the test as long as we do it before dropping
-+ * the spinlock.
-+ */
-+ prepare_to_wait(&bitmap->overflow_wait, &__wait,
-+ TASK_UNINTERRUPTIBLE);
-+ spin_unlock_irq(&bitmap->lock);
-+ bitmap->mddev->queue
-+ ->unplug_fn(bitmap->mddev->queue);
-+ schedule();
-+ finish_wait(&bitmap->overflow_wait, &__wait);
-+ continue;
-+ }
-+
- switch(*bmc) {
- case 0:
- bitmap_file_set_bit(bitmap, offset);
-@@ -1169,7 +1185,7 @@ int bitmap_startwrite(struct bitmap *bit
- case 1:
- *bmc = 2;
- }
-- BUG_ON((*bmc & COUNTER_MAX) == COUNTER_MAX);
-+
- (*bmc)++;
-
- spin_unlock_irq(&bitmap->lock);
-@@ -1207,6 +1223,9 @@ void bitmap_endwrite(struct bitmap *bitm
- if (!success && ! (*bmc & NEEDED_MASK))
- *bmc |= NEEDED_MASK;
-
-+ if ((*bmc & COUNTER_MAX) == COUNTER_MAX)
-+ wake_up(&bitmap->overflow_wait);
-+
- (*bmc)--;
- if (*bmc <= 2) {
- set_page_attr(bitmap,
-@@ -1431,6 +1450,7 @@ int bitmap_create(mddev_t *mddev)
- spin_lock_init(&bitmap->lock);
- atomic_set(&bitmap->pending_writes, 0);
- init_waitqueue_head(&bitmap->write_wait);
-+ init_waitqueue_head(&bitmap->overflow_wait);
-
- bitmap->mddev = mddev;
-
-diff .prev/include/linux/raid/bitmap.h ./include/linux/raid/bitmap.h
---- .prev/include/linux/raid/bitmap.h 2007-02-07 13:03:56.000000000 +1100
-+++ ./include/linux/raid/bitmap.h 2007-02-07 20:57:57.000000000 +1100
-@@ -247,6 +247,7 @@ struct bitmap {
-
- atomic_t pending_writes; /* pending writes to the bitmap file */
- wait_queue_head_t write_wait;
-+ wait_queue_head_t overflow_wait;
-
- };
-
+++ /dev/null
-Index: linux-2.6.18-128.1.6/drivers/md/md.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/drivers/md/md.c 2009-04-14 21:05:26.000000000 -0600
-+++ linux-2.6.18-128.1.6/drivers/md/md.c 2009-06-02 23:25:31.000000000 -0600
-@@ -90,6 +90,8 @@
-
- static int sysctl_speed_limit_min = 1000;
- static int sysctl_speed_limit_max = 200000;
-+static int sysctl_rebuild_window_size = 256;
-+static int sysctl_disk_idle_size = 4096;
- static inline int speed_min(mddev_t *mddev)
- {
- return mddev->sync_speed_min ?
-@@ -121,6 +123,22 @@
- .mode = S_IRUGO|S_IWUSR,
- .proc_handler = &proc_dointvec,
- },
-+ {
-+ .ctl_name = DEV_RAID_REBUILD_WINDOW,
-+ .procname = "rebuild_window_size",
-+ .data = &sysctl_rebuild_window_size,
-+ .maxlen = sizeof(int),
-+ .mode = S_IRUGO|S_IWUSR,
-+ .proc_handler = &proc_dointvec,
-+ },
-+ {
-+ .ctl_name = DEV_RAID_DISK_IDLE_SIZE,
-+ .procname = "disk_idle_size",
-+ .data = &sysctl_disk_idle_size,
-+ .maxlen = sizeof(int),
-+ .mode = S_IRUGO|S_IWUSR,
-+ .proc_handler = &proc_dointvec,
-+ },
- { .ctl_name = 0 }
- };
-
-@@ -5009,15 +5027,16 @@
- {
- mdk_rdev_t * rdev;
- int idle;
-- unsigned long curr_events;
-+ unsigned long rw, sync;
-
- idle = 1;
- rcu_read_lock();
- rdev_for_each_rcu(rdev, mddev) {
- struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
-- curr_events = disk_stat_read(disk, sectors[0]) +
-- disk_stat_read(disk, sectors[1]) -
-- atomic_read(&disk->sync_io);
-+
-+ rw = disk_stat_read(disk, sectors[READ])+disk_stat_read(disk, sectors[WRITE]);
-+ sync = atomic_read(&disk->sync_io);
-+
- /* The difference between curr_events and last_events
- * will be affected by any new non-sync IO (making
- * curr_events bigger) and any difference in the amount of
-@@ -5031,9 +5050,9 @@
- *
- * Note: the following is an unsigned comparison.
- */
-- if ((curr_events - rdev->last_events + 4096) > 8192) {
-- rdev->last_events = curr_events;
-+ if (rw - rdev->last_events > sync + sysctl_disk_idle_size) {
- idle = 0;
-+ rdev->last_events = rw - sync;
- }
- }
- rcu_read_unlock();
-@@ -5100,8 +5119,7 @@
- void md_do_sync(mddev_t *mddev)
- {
- mddev_t *mddev2;
-- unsigned int currspeed = 0,
-- window;
-+ unsigned int currspeed = 0;
- sector_t max_sectors,j, io_sectors;
- unsigned long mark[SYNC_MARKS];
- sector_t mark_cnt[SYNC_MARKS];
-@@ -5221,9 +5239,8 @@
- /*
- * Tune reconstruction:
- */
-- window = 32*(PAGE_SIZE/512);
- printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
-- window/2,(unsigned long long) max_sectors/2);
-+ sysctl_rebuild_window_size/2,(unsigned long long) max_sectors/2);
-
- atomic_set(&mddev->recovery_active, 0);
- init_waitqueue_head(&mddev->recovery_wait);
-@@ -5261,7 +5278,7 @@
- */
- md_new_event(mddev);
-
-- if (last_check + window > io_sectors || j == max_sectors)
-+ if (last_check + sysctl_rebuild_window_size > io_sectors || j == max_sectors)
- continue;
-
- last_check = io_sectors;
-@@ -5282,7 +5299,6 @@
- last_mark = next;
- }
-
--
- if (kthread_should_stop()) {
- /*
- * got a signal, exit.
-@@ -5306,10 +5322,16 @@
-
- currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
- /((jiffies-mddev->resync_mark)/HZ +1) +1;
--
- if (currspeed > speed_min(mddev)) {
- if ((currspeed > speed_max(mddev)) ||
- !is_mddev_idle(mddev)) {
-+ static unsigned long next_report;
-+ if (time_after(jiffies, next_report)) {
-+ printk(KERN_INFO "md: rebuild %s throttled due to IO\n",
-+ mdname(mddev));
-+ /* once per 10 minutes */
-+ next_report = jiffies + 600 * HZ;
-+ }
- msleep(500);
- goto repeat;
- }
-Index: linux-2.6.18-128.1.6/include/linux/sysctl.h
-===================================================================
---- linux-2.6.18-128.1.6.orig/include/linux/sysctl.h 2009-04-14 21:05:41.000000000 -0600
-+++ linux-2.6.18-128.1.6/include/linux/sysctl.h 2009-06-02 23:25:31.000000000 -0600
-@@ -928,7 +928,9 @@
- /* /proc/sys/dev/raid */
- enum {
- DEV_RAID_SPEED_LIMIT_MIN=1,
-- DEV_RAID_SPEED_LIMIT_MAX=2
-+ DEV_RAID_SPEED_LIMIT_MAX=2,
-+ DEV_RAID_REBUILD_WINDOW=3,
-+ DEV_RAID_DISK_IDLE_SIZE=4
- };
-
- /* /proc/sys/dev/parport/default */
+++ /dev/null
-diff -Nrup linux-2.6.18-92.1.10.orig/drivers/message/fusion/Kconfig linux-2.6.18-92.1.10/drivers/message/fusion/Kconfig
---- linux-2.6.18-92.1.10.orig/drivers/message/fusion/Kconfig 2008-12-11 10:27:02.000000000 +1100
-+++ linux-2.6.18-92.1.10/drivers/message/fusion/Kconfig 2008-12-11 10:28:42.000000000 +1100
-@@ -59,10 +59,10 @@ config FUSION_SAS
- LSISAS1078
-
- config FUSION_MAX_SGE
-- int "Maximum number of scatter gather entries (16 - 128)"
-+ int "Maximum number of scatter gather entries (16 - 256)"
- depends on FUSION
-- default "128"
-- range 16 128
-+ default "256"
-+ range 16 256
- help
- This option allows you to specify the maximum number of scatter-
- gather entries per I/O. The driver default is 128, which matches
-diff -Nrup linux-2.6.18-92.1.10.orig/drivers/message/fusion/mptbase.h linux-2.6.18-92.1.10/drivers/message/fusion/mptbase.h
---- linux-2.6.18-92.1.10.orig/drivers/message/fusion/mptbase.h 2008-12-11 10:27:03.000000000 +1100
-+++ linux-2.6.18-92.1.10/drivers/message/fusion/mptbase.h 2008-12-11 10:30:55.000000000 +1100
-@@ -166,8 +166,8 @@
- #ifdef CONFIG_FUSION_MAX_SGE
- #if CONFIG_FUSION_MAX_SGE < 16
- #define MPT_SCSI_SG_DEPTH 16
--#elif CONFIG_FUSION_MAX_SGE > 128
--#define MPT_SCSI_SG_DEPTH 128
-+#elif CONFIG_FUSION_MAX_SGE > 256
-+#define MPT_SCSI_SG_DEPTH 256
- #else
- #define MPT_SCSI_SG_DEPTH CONFIG_FUSION_MAX_SGE
- #endif
+++ /dev/null
---- linux/fs/inode.c.orig 2009-01-24 03:28:57.000000000 +0800
-+++ linux/fs/inode.c 2009-01-24 03:30:18.000000000 +0800
-@@ -418,7 +418,9 @@ static void prune_icache(int nr_to_scan)
- int nr_scanned;
- unsigned long reap = 0;
-
-- mutex_lock(&iprune_mutex);
-+ if (!mutex_trylock(&iprune_mutex))
-+ return;
-+
- spin_lock(&inode_lock);
- for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
- struct inode *inode;
+++ /dev/null
-Index: linux-2.6.18-128.1.6/fs/dquot.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/dquot.c 2009-04-14 21:04:50.000000000 -0600
-+++ linux-2.6.18-128.1.6/fs/dquot.c 2009-06-02 23:26:36.000000000 -0600
-@@ -1592,10 +1592,19 @@
- }
-
- /* Generic routine for setting common part of quota structure */
--static void do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
-+static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
- {
- struct mem_dqblk *dm = &dquot->dq_dqb;
- int check_blim = 0, check_ilim = 0;
-+ struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type];
-+
-+ if ((di->dqb_valid & QIF_BLIMITS &&
-+ (di->dqb_bhardlimit > dqi->dqi_maxblimit ||
-+ di->dqb_bsoftlimit > dqi->dqi_maxblimit)) ||
-+ (di->dqb_valid & QIF_ILIMITS &&
-+ (di->dqb_ihardlimit > dqi->dqi_maxilimit ||
-+ di->dqb_isoftlimit > dqi->dqi_maxilimit)))
-+ return -ERANGE;
-
- spin_lock(&dq_data_lock);
- if (di->dqb_valid & QIF_SPACE) {
-@@ -1627,7 +1636,7 @@
- clear_bit(DQ_BLKS_B, &dquot->dq_flags);
- }
- else if (!(di->dqb_valid & QIF_BTIME)) /* Set grace only if user hasn't provided his own... */
-- dm->dqb_btime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_bgrace;
-+ dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
- }
- if (check_ilim) {
- if (!dm->dqb_isoftlimit || dm->dqb_curinodes < dm->dqb_isoftlimit) {
-@@ -1635,7 +1644,7 @@
- clear_bit(DQ_INODES_B, &dquot->dq_flags);
- }
- else if (!(di->dqb_valid & QIF_ITIME)) /* Set grace only if user hasn't provided his own... */
-- dm->dqb_itime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace;
-+ dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
- }
- if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit || dm->dqb_isoftlimit)
- clear_bit(DQ_FAKE_B, &dquot->dq_flags);
-@@ -1643,21 +1652,24 @@
- set_bit(DQ_FAKE_B, &dquot->dq_flags);
- spin_unlock(&dq_data_lock);
- mark_dquot_dirty(dquot);
-+
-+ return 0;
- }
-
- int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di)
- {
- struct dquot *dquot;
-+ int rc;
-
- mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
- if (!(dquot = dqget(sb, id, type))) {
- mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
- return -ESRCH;
- }
-- do_set_dqblk(dquot, di);
-+ rc = do_set_dqblk(dquot, di);
- dqput(dquot);
- mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
-- return 0;
-+ return rc;
- }
-
- /* Generic routine for getting common part of quota file information */
-Index: linux-2.6.18-128.1.6/fs/quota_v1.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/quota_v1.c 2006-09-19 21:42:06.000000000 -0600
-+++ linux-2.6.18-128.1.6/fs/quota_v1.c 2009-06-02 23:26:36.000000000 -0600
-@@ -139,6 +139,9 @@
- goto out;
- }
- ret = 0;
-+ /* limits are stored as unsigned 32-bit data */
-+ dqopt->info[type].dqi_maxblimit = 0xffffffff;
-+ dqopt->info[type].dqi_maxilimit = 0xffffffff;
- dqopt->info[type].dqi_igrace = dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME;
- dqopt->info[type].dqi_bgrace = dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME;
- out:
-Index: linux-2.6.18-128.1.6/fs/quota_v2.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/fs/quota_v2.c 2006-09-19 21:42:06.000000000 -0600
-+++ linux-2.6.18-128.1.6/fs/quota_v2.c 2009-06-02 23:26:36.000000000 -0600
-@@ -23,26 +23,64 @@
- typedef char *dqbuf_t;
-
- #define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff)
--#define GETENTRIES(buf) ((struct v2_disk_dqblk *)(((char *)buf)+sizeof(struct v2_disk_dqdbheader)))
-+#define GETENTRIES(buf) ((union v2_disk_dqblk *)(((char *)buf) + \
-+ sizeof(struct v2_disk_dqdbheader)))
-+#define REV_ASSERT(r) BUG_ON((rev) != 0 && (rev) != 1)
-+
-+static const union v2_disk_dqblk emptydquot;
-+static const union v2_disk_dqblk fakedquot[2] = {
-+ {.r0 = {.dqb_itime = __constant_cpu_to_le64(1LLU)} },
-+ {.r1 = {.dqb_itime = __constant_cpu_to_le64(1LLU)} }
-+};
-
--/* Check whether given file is really vfsv0 quotafile */
--static int v2_check_quota_file(struct super_block *sb, int type)
-+static inline uint v2_dqblksz(uint rev)
-+{
-+ uint sz;
-+
-+ REV_ASSERT(rev);
-+
-+ if (rev == 0)
-+ sz = sizeof(struct v2_disk_dqblk_r0);
-+ else
-+ sz = sizeof(struct v2_disk_dqblk_r1);
-+
-+ return sz;
-+}
-+
-+/* Number of quota entries in a block */
-+static inline int v2_dqstrinblk(uint rev)
-+{
-+ return (V2_DQBLKSIZE-sizeof(struct v2_disk_dqdbheader))/v2_dqblksz(rev);
-+}
-+
-+/* Get revision of a quota file, -1 if it does not look a quota file */
-+static int v2_quota_file_revision(struct super_block *sb, int type)
- {
- struct v2_disk_dqheader dqhead;
- ssize_t size;
- static const uint quota_magics[] = V2_INITQMAGICS;
-- static const uint quota_versions[] = V2_INITQVERSIONS;
-+ static const uint quota_versions_r0[] = V2_INITQVERSIONS_R0;
-+ static const uint quota_versions_r1[] = V2_INITQVERSIONS_R1;
-
- size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0);
- if (size != sizeof(struct v2_disk_dqheader)) {
- printk("quota_v2: failed read expected=%zd got=%zd\n",
- sizeof(struct v2_disk_dqheader), size);
-- return 0;
-+ return -1;
- }
-- if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] ||
-- le32_to_cpu(dqhead.dqh_version) != quota_versions[type])
-- return 0;
-- return 1;
-+ if (le32_to_cpu(dqhead.dqh_magic) == quota_magics[type]) {
-+ if (le32_to_cpu(dqhead.dqh_version) == quota_versions_r0[type])
-+ return 0;
-+ if (le32_to_cpu(dqhead.dqh_version) == quota_versions_r1[type])
-+ return 1;
-+ }
-+ return -1;
-+}
-+
-+/* Check whether given file is really vfsv0 quotafile */
-+static inline int v2_check_quota_file(struct super_block *sb, int type)
-+{
-+ return v2_quota_file_revision(sb, type) != -1;
- }
-
- /* Read information header from quota file */
-@@ -51,6 +89,13 @@
- struct v2_disk_dqinfo dinfo;
- struct mem_dqinfo *info = sb_dqopt(sb)->info+type;
- ssize_t size;
-+ int rev;
-+
-+ rev = v2_quota_file_revision(sb, type);
-+ if (rev < 0) {
-+ printk(KERN_WARNING "Second quota file check failed.\n");
-+ return -1;
-+ }
-
- size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
- sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
-@@ -65,6 +110,16 @@
- info->u.v2_i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
- info->u.v2_i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
- info->u.v2_i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
-+
-+ info->u.v2_i.dqi_revision = rev;
-+ if (rev == 0) {
-+ info->dqi_maxblimit = 0xffffffffULL;
-+ info->dqi_maxilimit = 0xffffffffULL;
-+ } else {
-+ info->dqi_maxblimit = 0xffffffffffffffffULL;
-+ info->dqi_maxilimit = 0xffffffffffffffffULL;
-+ }
-+
- return 0;
- }
-
-@@ -94,29 +149,61 @@
- return 0;
- }
-
--static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d)
-+static void disk2memdqb(struct mem_dqblk *m, union v2_disk_dqblk *d, uint rev)
- {
-- m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit);
-- m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit);
-- m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes);
-- m->dqb_itime = le64_to_cpu(d->dqb_itime);
-- m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit);
-- m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit);
-- m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
-- m->dqb_btime = le64_to_cpu(d->dqb_btime);
--}
--
--static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id)
--{
-- d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit);
-- d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit);
-- d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes);
-- d->dqb_itime = cpu_to_le64(m->dqb_itime);
-- d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit);
-- d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit);
-- d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
-- d->dqb_btime = cpu_to_le64(m->dqb_btime);
-- d->dqb_id = cpu_to_le32(id);
-+ REV_ASSERT(rev);
-+
-+ if (rev == 0) {
-+ struct v2_disk_dqblk_r0 *ddqblk = &d->r0;
-+ m->dqb_ihardlimit = le32_to_cpu(ddqblk->dqb_ihardlimit);
-+ m->dqb_isoftlimit = le32_to_cpu(ddqblk->dqb_isoftlimit);
-+ m->dqb_curinodes = le32_to_cpu(ddqblk->dqb_curinodes);
-+ m->dqb_itime = le64_to_cpu(ddqblk->dqb_itime);
-+ m->dqb_bhardlimit = le32_to_cpu(ddqblk->dqb_bhardlimit);
-+ m->dqb_bsoftlimit = le32_to_cpu(ddqblk->dqb_bsoftlimit);
-+ m->dqb_curspace = le64_to_cpu(ddqblk->dqb_curspace);
-+ m->dqb_btime = le64_to_cpu(ddqblk->dqb_btime);
-+ } else {
-+ struct v2_disk_dqblk_r1 *ddqblk = &d->r1;
-+ m->dqb_ihardlimit = le64_to_cpu(ddqblk->dqb_ihardlimit);
-+ m->dqb_isoftlimit = le64_to_cpu(ddqblk->dqb_isoftlimit);
-+ m->dqb_curinodes = le64_to_cpu(ddqblk->dqb_curinodes);
-+ m->dqb_itime = le64_to_cpu(ddqblk->dqb_itime);
-+ m->dqb_bhardlimit = le64_to_cpu(ddqblk->dqb_bhardlimit);
-+ m->dqb_bsoftlimit = le64_to_cpu(ddqblk->dqb_bsoftlimit);
-+ m->dqb_curspace = le64_to_cpu(ddqblk->dqb_curspace);
-+ m->dqb_btime = le64_to_cpu(ddqblk->dqb_btime);
-+ }
-+}
-+
-+static void mem2diskdqb(union v2_disk_dqblk *d, struct mem_dqblk *m,
-+ qid_t id, uint rev)
-+{
-+ REV_ASSERT(rev);
-+
-+ if (rev == 0) {
-+ struct v2_disk_dqblk_r0 *ddqblk = &d->r0;
-+ ddqblk->dqb_id = cpu_to_le32(id);
-+ ddqblk->dqb_ihardlimit = cpu_to_le32((__u32)m->dqb_ihardlimit);
-+ ddqblk->dqb_isoftlimit = cpu_to_le32((__u32)m->dqb_isoftlimit);
-+ ddqblk->dqb_curinodes = cpu_to_le32((__u32)m->dqb_curinodes);
-+ ddqblk->dqb_itime = cpu_to_le64(m->dqb_itime);
-+ ddqblk->dqb_bhardlimit = cpu_to_le32((__u32)m->dqb_bhardlimit);
-+ ddqblk->dqb_bsoftlimit = cpu_to_le32((__u32)m->dqb_bsoftlimit);
-+ ddqblk->dqb_curspace = cpu_to_le64(m->dqb_curspace);
-+ ddqblk->dqb_btime = cpu_to_le64(ddqblk->dqb_btime);
-+ } else {
-+ struct v2_disk_dqblk_r1 *ddqblk = &d->r1;
-+ ddqblk->dqb_id = cpu_to_le32(id);
-+ ddqblk->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
-+ ddqblk->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
-+ ddqblk->dqb_curinodes = cpu_to_le64(m->dqb_curinodes);
-+ ddqblk->dqb_itime = cpu_to_le64(m->dqb_itime);
-+ ddqblk->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit);
-+ ddqblk->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit);
-+ ddqblk->dqb_curspace = cpu_to_le64(m->dqb_curspace);
-+ ddqblk->dqb_btime = cpu_to_le64(ddqblk->dqb_btime);
-+ }
- }
-
- static dqbuf_t getdqbuf(void)
-@@ -268,10 +355,10 @@
- {
- struct super_block *sb = dquot->dq_sb;
- struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type;
-- uint blk, i;
-+ uint blk, i, rev = info->u.v2_i.dqi_revision;
-+ uint dqblksz = v2_dqblksz(rev), dqstrinblk = v2_dqstrinblk(rev);
- struct v2_disk_dqdbheader *dh;
-- struct v2_disk_dqblk *ddquot;
-- struct v2_disk_dqblk fakedquot;
-+ union v2_disk_dqblk *ddquot;
- dqbuf_t buf;
-
- *err = 0;
-@@ -298,17 +385,18 @@
- info->u.v2_i.dqi_free_entry = blk;
- mark_info_dirty(sb, dquot->dq_type);
- }
-- if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK) /* Block will be full? */
-+ /* Block will be full? */
-+ if (le16_to_cpu(dh->dqdh_entries)+1 >= dqstrinblk)
- if ((*err = remove_free_dqentry(sb, dquot->dq_type, buf, blk)) < 0) {
- printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk);
- goto out_buf;
- }
- dh->dqdh_entries = cpu_to_le16(le16_to_cpu(dh->dqdh_entries)+1);
-- memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
- /* Find free structure in block */
-- for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++);
-+ for (i = 0; i < dqstrinblk && memcmp(&emptydquot, ddquot, dqblksz);
-+ i++, ddquot = (char *)ddquot + dqblksz);
- #ifdef __QUOTA_V2_PARANOIA
-- if (i == V2_DQSTRINBLK) {
-+ if (i == dqstrinblk) {
- printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n");
- *err = -EIO;
- goto out_buf;
-@@ -318,7 +406,8 @@
- printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk);
- goto out_buf;
- }
-- dquot->dq_off = (blk<<V2_DQBLKSIZE_BITS)+sizeof(struct v2_disk_dqdbheader)+i*sizeof(struct v2_disk_dqblk);
-+ dquot->dq_off = (blk<<V2_DQBLKSIZE_BITS)+
-+ ((char *)ddquot - (char *)buf);
- freedqbuf(buf);
- return blk;
- out_buf:
-@@ -392,7 +481,9 @@
- {
- int type = dquot->dq_type;
- ssize_t ret;
-- struct v2_disk_dqblk ddquot, empty;
-+ union v2_disk_dqblk ddquot;
-+ uint rev = sb_dqopt(dquot->dq_sb)->info[type].u.v2_i.dqi_revision;
-+ uint dqblksz = v2_dqblksz(rev);
-
- /* dq_off is guarded by dqio_mutex */
- if (!dquot->dq_off)
-@@ -401,18 +492,22 @@
- return ret;
- }
- spin_lock(&dq_data_lock);
-- mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id);
-+ mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id, rev);
- /* Argh... We may need to write structure full of zeroes but that would be
- * treated as an empty place by the rest of the code. Format change would
- * be definitely cleaner but the problems probably are not worth it */
-- memset(&empty, 0, sizeof(struct v2_disk_dqblk));
-- if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
-- ddquot.dqb_itime = cpu_to_le64(1);
-+ if (!memcmp(&emptydquot, &ddquot, dqblksz)) {
-+ if (rev == 0)
-+ ddquot.r0.dqb_itime = cpu_to_le64(1);
-+ else
-+ ddquot.r1.dqb_itime = cpu_to_le64(1);
-+ }
- spin_unlock(&dq_data_lock);
- ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type,
-- (char *)&ddquot, sizeof(struct v2_disk_dqblk), dquot->dq_off);
-- if (ret != sizeof(struct v2_disk_dqblk)) {
-- printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", dquot->dq_sb->s_id);
-+ (char *)&ddquot, dqblksz, dquot->dq_off);
-+ if (ret != dqblksz) {
-+ printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
-+ dquot->dq_sb->s_id);
- if (ret >= 0)
- ret = -ENOSPC;
- }
-@@ -431,6 +526,7 @@
- struct v2_disk_dqdbheader *dh;
- dqbuf_t buf = getdqbuf();
- int ret = 0;
-+ uint rev = sb_dqopt(sb)->info[type].u.v2_i.dqi_revision;
-
- if (!buf)
- return -ENOMEM;
-@@ -456,8 +552,8 @@
- }
- else {
- memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0,
-- sizeof(struct v2_disk_dqblk));
-- if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) {
-+ v2_dqblksz(rev));
-+ if (le16_to_cpu(dh->dqdh_entries) == v2_dqstrinblk(rev)-1) {
- /* Insert will write block itself */
- if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) {
- printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk);
-@@ -529,41 +625,56 @@
- return remove_tree(dquot, &tmp, 0);
- }
-
-+static inline __u32 dqid(union v2_disk_dqblk *ddquot, uint rev)
-+{
-+ __u32 dq_id;
-+
-+ REV_ASSERT(rev);
-+
-+ if (rev == 0)
-+ dq_id = le32_to_cpu(ddquot->r0.dqb_id);
-+ else
-+ dq_id = le32_to_cpu(ddquot->r1.dqb_id);
-+
-+ return dq_id;
-+}
-+
- /* Find entry in block */
- static loff_t find_block_dqentry(struct dquot *dquot, uint blk)
- {
- dqbuf_t buf = getdqbuf();
- loff_t ret = 0;
- int i;
-- struct v2_disk_dqblk *ddquot = GETENTRIES(buf);
-+ union v2_disk_dqblk *ddquot = GETENTRIES(buf);
-+ int type = dquot->dq_type;
-+ uint rev = sb_dqopt(dquot->dq_sb)->info[type].u.v2_i.dqi_revision;
-+ uint dqblksz = v2_dqblksz(rev), dqstrinblk = v2_dqstrinblk(rev);
-
- if (!buf)
- return -ENOMEM;
-- if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
-+
-+ ret = read_blk(dquot->dq_sb, type, blk, buf);
-+ if (ret < 0) {
- printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
- goto out_buf;
- }
- if (dquot->dq_id)
-- for (i = 0; i < V2_DQSTRINBLK &&
-- le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++);
-+ for (i = 0; i < dqstrinblk && dqid(ddquot, rev) != dquot->dq_id;
-+ i++, ddquot = (char *)ddquot + dqblksz);
- else { /* ID 0 as a bit more complicated searching... */
-- struct v2_disk_dqblk fakedquot;
--
-- memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
-- for (i = 0; i < V2_DQSTRINBLK; i++)
-- if (!le32_to_cpu(ddquot[i].dqb_id) &&
-- memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)))
-+ for (i = 0; i < dqstrinblk; i++, ddquot = (char *)ddquot+dqblksz)
-+ if (!dqid(ddquot, rev) &&
-+ memcmp(&emptydquot, ddquot, dqblksz))
- break;
- }
-- if (i == V2_DQSTRINBLK) {
-+ if (i == dqstrinblk) {
- printk(KERN_ERR "VFS: Quota for id %u referenced "
- "but not present.\n", dquot->dq_id);
- ret = -EIO;
- goto out_buf;
- }
- else
-- ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct
-- v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk);
-+ ret = (blk << V2_DQBLKSIZE_BITS)+((char *)ddquot-(char *)buf);
- out_buf:
- freedqbuf(buf);
- return ret;
-@@ -605,7 +716,7 @@
- {
- int type = dquot->dq_type;
- loff_t offset;
-- struct v2_disk_dqblk ddquot, empty;
-+ union v2_disk_dqblk ddquot;
- int ret = 0;
-
- #ifdef __QUOTA_V2_PARANOIA
-@@ -626,25 +737,30 @@
- ret = offset;
- }
- else {
-+ uint rev = sb_dqopt(dquot->dq_sb)->info[type].u.v2_i.
-+ dqi_revision;
-+ uint dqblksz = v2_dqblksz(rev);
- dquot->dq_off = offset;
-- if ((ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type,
-- (char *)&ddquot, sizeof(struct v2_disk_dqblk), offset))
-- != sizeof(struct v2_disk_dqblk)) {
-+ ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type,
-+ (char *)&ddquot, dqblksz, offset);
-+ if (ret != dqblksz) {
- if (ret >= 0)
- ret = -EIO;
- printk(KERN_ERR "VFS: Error while reading quota "
- "structure for id %u.\n", dquot->dq_id);
-- memset(&ddquot, 0, sizeof(struct v2_disk_dqblk));
-+ memset(&ddquot, 0, dqblksz);
- }
- else {
- ret = 0;
- /* We need to escape back all-zero structure */
-- memset(&empty, 0, sizeof(struct v2_disk_dqblk));
-- empty.dqb_itime = cpu_to_le64(1);
-- if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
-- ddquot.dqb_itime = 0;
-+ if (!memcmp(&fakedquot[rev], &ddquot, dqblksz)) {
-+ if (rev == 0)
-+ ddquot.r0.dqb_itime = cpu_to_le64(0);
-+ else
-+ ddquot.r1.dqb_itime = cpu_to_le64(0);
-+ }
- }
-- disk2memdqb(&dquot->dq_dqb, &ddquot);
-+ disk2memdqb(&dquot->dq_dqb, &ddquot, rev);
- if (!dquot->dq_dqb.dqb_bhardlimit &&
- !dquot->dq_dqb.dqb_bsoftlimit &&
- !dquot->dq_dqb.dqb_ihardlimit &&
-Index: linux-2.6.18-128.1.6/include/linux/dqblk_v2.h
-===================================================================
---- linux-2.6.18-128.1.6.orig/include/linux/dqblk_v2.h 2006-09-19 21:42:06.000000000 -0600
-+++ linux-2.6.18-128.1.6/include/linux/dqblk_v2.h 2009-06-02 23:26:36.000000000 -0600
-@@ -21,6 +21,7 @@
- unsigned int dqi_blocks;
- unsigned int dqi_free_blk;
- unsigned int dqi_free_entry;
-+ unsigned int dqi_revision;
- };
-
- #endif /* _LINUX_DQBLK_V2_H */
-Index: linux-2.6.18-128.1.6/include/linux/quota.h
-===================================================================
---- linux-2.6.18-128.1.6.orig/include/linux/quota.h 2006-09-19 21:42:06.000000000 -0600
-+++ linux-2.6.18-128.1.6/include/linux/quota.h 2009-06-02 23:26:36.000000000 -0600
-@@ -149,12 +149,12 @@
- * Data for one user/group kept in memory
- */
- struct mem_dqblk {
-- __u32 dqb_bhardlimit; /* absolute limit on disk blks alloc */
-- __u32 dqb_bsoftlimit; /* preferred limit on disk blks */
-+ qsize_t dqb_bhardlimit; /* absolute limit on disk blks alloc */
-+ qsize_t dqb_bsoftlimit; /* preferred limit on disk blks */
- qsize_t dqb_curspace; /* current used space */
-- __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */
-- __u32 dqb_isoftlimit; /* preferred inode limit */
-- __u32 dqb_curinodes; /* current # allocated inodes */
-+ qsize_t dqb_ihardlimit; /* absolute limit on allocated inodes */
-+ qsize_t dqb_isoftlimit; /* preferred inode limit */
-+ qsize_t dqb_curinodes; /* current # allocated inodes */
- time_t dqb_btime; /* time limit for excessive disk use */
- time_t dqb_itime; /* time limit for excessive inode use */
- };
-@@ -170,6 +170,8 @@
- unsigned long dqi_flags;
- unsigned int dqi_bgrace;
- unsigned int dqi_igrace;
-+ qsize_t dqi_maxblimit;
-+ qsize_t dqi_maxilimit;
- union {
- struct v1_mem_dqinfo v1_i;
- struct v2_mem_dqinfo v2_i;
-Index: linux-2.6.18-128.1.6/include/linux/quotaio_v2.h
-===================================================================
---- linux-2.6.18-128.1.6.orig/include/linux/quotaio_v2.h 2006-09-19 21:42:06.000000000 -0600
-+++ linux-2.6.18-128.1.6/include/linux/quotaio_v2.h 2009-06-02 23:26:36.000000000 -0600
-@@ -16,28 +16,51 @@
- 0xd9c01927 /* GRPQUOTA */\
- }
-
--#define V2_INITQVERSIONS {\
-+#define V2_INITQVERSIONS_R0 {\
- 0, /* USRQUOTA */\
- 0 /* GRPQUOTA */\
- }
-
-+#define V2_INITQVERSIONS_R1 {\
-+ 1, /* USRQUOTA */\
-+ 1 /* GRPQUOTA */\
-+}
-+
- /*
- * The following structure defines the format of the disk quota file
- * (as it appears on disk) - the file is a radix tree whose leaves point
- * to blocks of these structures.
- */
--struct v2_disk_dqblk {
-+struct v2_disk_dqblk_r0 {
- __le32 dqb_id; /* id this quota applies to */
- __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */
- __le32 dqb_isoftlimit; /* preferred inode limit */
- __le32 dqb_curinodes; /* current # allocated inodes */
-- __le32 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */
-- __le32 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */
-+ __le32 dqb_bhardlimit; /* absolute limit on disk space */
-+ __le32 dqb_bsoftlimit; /* preferred limit on disk space */
-+ __le64 dqb_curspace; /* current space occupied (in bytes) */
-+ __le64 dqb_btime; /* time limit for excessive disk use */
-+ __le64 dqb_itime; /* time limit for excessive inode use */
-+};
-+
-+struct v2_disk_dqblk_r1 {
-+ __le32 dqb_id; /* id this quota applies to */
-+ __le32 dqb_padding; /* padding field */
-+ __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */
-+ __le64 dqb_isoftlimit; /* preferred inode limit */
-+ __le64 dqb_curinodes; /* current # allocated inodes */
-+ __le64 dqb_bhardlimit; /* absolute limit on disk space */
-+ __le64 dqb_bsoftlimit; /* preferred limit on disk space */
- __le64 dqb_curspace; /* current space occupied (in bytes) */
- __le64 dqb_btime; /* time limit for excessive disk use */
- __le64 dqb_itime; /* time limit for excessive inode use */
- };
-
-+union v2_disk_dqblk {
-+ struct v2_disk_dqblk_r0 r0;
-+ struct v2_disk_dqblk_r1 r1;
-+};
-+
- /*
- * Here are header structures as written on disk and their in-memory copies
- */
-@@ -59,7 +82,7 @@
-
- /*
- * Structure of header of block with quota structures. It is padded to 16 bytes so
-- * there will be space for exactly 21 quota-entries in a block
-+ * there will be space for exactly 21 (r0) or 14 (r1) quota-entries in a block
- */
- struct v2_disk_dqdbheader {
- __le32 dqdh_next_free; /* Number of next block with free entry */
-@@ -74,6 +97,5 @@
- #define V2_DQBLKSIZE (1 << V2_DQBLKSIZE_BITS) /* Size of block with quota structures */
- #define V2_DQTREEOFF 1 /* Offset of tree in file in blocks */
- #define V2_DQTREEDEPTH 4 /* Depth of quota tree */
--#define V2_DQSTRINBLK ((V2_DQBLKSIZE - sizeof(struct v2_disk_dqdbheader)) / sizeof(struct v2_disk_dqblk)) /* Number of entries in one blocks */
-
- #endif /* _LINUX_QUOTAIO_V2_H */
+++ /dev/null
-diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
---- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-06 17:23:39.000000000 +0800
-+++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-06 17:24:14.000000000 +0800
-@@ -57,7 +57,7 @@
- * Stripe cache
- */
-
--#define NR_STRIPES 256
-+static int raid5_nr_stripes = 256 * 8;
- #define STRIPE_SIZE PAGE_SIZE
- #define STRIPE_SHIFT (PAGE_SHIFT - 9)
- #define STRIPE_SECTORS (STRIPE_SIZE>>9)
-@@ -3230,7 +3230,7 @@ static int run(mddev_t *mddev)
- else
- conf->max_degraded = 1;
- conf->algorithm = mddev->layout;
-- conf->max_nr_stripes = NR_STRIPES;
-+ conf->max_nr_stripes = raid5_nr_stripes;
- conf->expand_progress = mddev->reshape_position;
-
- /* device size must be a multiple of chunk size */
-@@ -3821,6 +3821,7 @@ static void raid5_exit(void)
-
- module_init(raid5_init);
- module_exit(raid5_exit);
-+module_param(raid5_nr_stripes, int, 0644);
- MODULE_LICENSE("GPL");
- MODULE_ALIAS("md-personality-4"); /* RAID5 */
- MODULE_ALIAS("md-raid5");
-Only in linux-2.6.18-53/drivers/md: raid5.c.orig
-Only in linux-2.6.18-53.orig/include/linux/raid: .raid5.h.swp
+++ /dev/null
-diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
---- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-06 17:26:27.000000000 +0800
-+++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-06 17:26:55.000000000 +0800
-@@ -3340,6 +3340,11 @@ static int run(mddev_t *mddev)
- mddev->array_size = mddev->size * (conf->previous_raid_disks -
- conf->max_degraded);
-
-+ /* in order to support large I/Os */
-+ blk_queue_max_sectors(mddev->queue, conf->chunk_size * conf->previous_raid_disks >> 9);
-+ mddev->queue->max_phys_segments = conf->chunk_size * (conf->previous_raid_disks - conf->max_degraded) >> PAGE_SHIFT;
-+ mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;;
-+
- return 0;
- abort:
- if (conf) {
+++ /dev/null
-diff -ru linux-orig/drivers/md/raid5.c linux-new/drivers/md/raid5.c
---- linux-orig/drivers/md/raid5.c 2009-04-14 08:11:38.000000000 +1000
-+++ linux-new/drivers/md/raid5.c 2009-09-20 05:02:02.000000000 +1000
-@@ -3595,10 +3595,16 @@
- mddev->array_size = mddev->size * (conf->previous_raid_disks -
- conf->max_degraded);
-
-+ int stripe_size = conf->chunk_size * (conf->previous_raid_disks - conf->max_degraded);
-+
- /* in order to support large I/Os */
-- blk_queue_max_sectors(mddev->queue, conf->chunk_size * conf->previous_raid_disks >> 9);
-- mddev->queue->max_phys_segments = conf->chunk_size * (conf->previous_raid_disks - conf->max_degraded) >> PAGE_SHIFT;
-- mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;;
-+ blk_queue_max_sectors(mddev->queue, stripe_size >> 9);
-+ /* KTVM: set default max_sectors the same as the max_hw_sectors set above */
-+ mddev->queue->max_sectors = mddev->queue->max_hw_sectors;
-+ printk("%s: setting max_sectors = %d, max_hw_sectors = %d\n", mdname(mddev), mddev->queue->max_sectors, mddev->queue->max_hw_sectors);
-+
-+ mddev->queue->max_phys_segments = stripe_size >> PAGE_SHIFT;
-+ mddev->queue->max_hw_segments = stripe_size >> PAGE_SHIFT;;
-
- /* raid5 device is able to do zcopy right now. */
- mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONSTANT_WRITE;
+++ /dev/null
-diff -pur b/drivers/md/raid5.c a/drivers/md/raid5.c
---- b/drivers/md/raid5.c 2009-02-20 15:56:36.000000000 +0800
-+++ a/drivers/md/raid5.c 2009-02-20 15:57:49.000000000 +0800
-@@ -1277,7 +1277,26 @@ static void compute_block_2(struct strip
- }
- }
-
-+/*
-+ * The whole idea is to collect all bio's and then issue them
-+ * disk by disk to assist merging a bit -bzzz
-+ */
-+static void raid5_flush_bios(raid5_conf_t *conf, struct bio *bios[], int raid_disks)
-+{
-+ struct bio *bio, *nbio;
-+ int i;
-
-+ for (i = 0; i < raid_disks; i++) {
-+ bio = bios[i];
-+ while (bio) {
-+ nbio = bio->bi_next;
-+ bio->bi_next = NULL;
-+ generic_make_request(bio);
-+ bio = nbio;
-+ }
-+ bios[i] = NULL;
-+ }
-+}
-
- /*
- * Each stripe/dev can have one or more bion attached.
-@@ -1392,7 +1411,7 @@ static int stripe_to_pdidx(sector_t stri
- *
- */
-
--static void handle_stripe5(struct stripe_head *sh)
-+static void handle_stripe5(struct stripe_head *sh, struct bio *bios[])
- {
- raid5_conf_t *conf = sh->raid_conf;
- int disks = sh->disks;
-@@ -1939,7 +1958,11 @@ static void handle_stripe5(struct stripe
- test_bit(R5_ReWrite, &sh->dev[i].flags))
- atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
- atomic_inc(&conf->out_reqs_in_queue);
-- generic_make_request(bi);
-+ if (bios) {
-+ bi->bi_next = bios[i];
-+ bios[i] = bi;
-+ } else
-+ generic_make_request(bi);
- } else {
- if (rw == 1)
- set_bit(STRIPE_DEGRADED, &sh->state);
-@@ -1951,7 +1974,7 @@ static void handle_stripe5(struct stripe
- }
- }
-
--static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
-+static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page, struct bio *bios[])
- {
- raid6_conf_t *conf = sh->raid_conf;
- int disks = conf->raid_disks;
-@@ -2499,7 +2522,11 @@ static void handle_stripe6(struct stripe
- if (rw == WRITE &&
- test_bit(R5_ReWrite, &sh->dev[i].flags))
- atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
-- generic_make_request(bi);
-+ if (bios) {
-+ bi->bi_next = bios[i];
-+ bios[i] = bi;
-+ } else
-+ generic_make_request(bi);
- atomic_inc(&conf->out_reqs_in_queue);
- } else {
- if (rw == 1)
-@@ -2512,12 +2539,12 @@ static void handle_stripe6(struct stripe
- }
- }
-
--static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
-+static void handle_stripe(struct stripe_head *sh, struct page *tmp_page, struct bio *bios[])
- {
- if (sh->raid_conf->level == 6)
-- handle_stripe6(sh, tmp_page);
-+ handle_stripe6(sh, tmp_page, bios);
- else
-- handle_stripe5(sh);
-+ handle_stripe5(sh, bios);
- }
-
-
-@@ -2670,6 +2697,7 @@ static int make_request(request_queue_t
- int stripes_per_chunk, sectors_per_block;
- int sectors_per_stripe;
- int i, j;
-+ struct bio *bios[MD_SB_DISKS];
-
- DEFINE_WAIT(w);
- int disks, data_disks;
-@@ -2698,6 +2726,7 @@ static int make_request(request_queue_t
- sectors = bi->bi_size >> 9;
- stripes_per_chunk = conf->chunk_size / STRIPE_SIZE;
-
-+ memset(&bios, 0, sizeof(bios));
- redo_bio:
- /* stripe by stripe handle needs a stable raid layout, so if this
- * reuqest covers the expanding region, wait it over.
-@@ -2756,8 +2785,10 @@ retry:
- * the raid layout has been changed, we have to redo the
- * whole bio because we don't which sectors in it has been
- * done, and which is not done. -jay */
-- if (raid5_redo_bio(conf, bi, disks, logical_sector))
-+ if (raid5_redo_bio(conf, bi, disks, logical_sector)) {
-+ raid5_flush_bios(conf, bios, disks);
- goto redo_bio;
-+ }
-
- if (test_bit(STRIPE_EXPANDING, &sh->state)) {
- /* Stripe is busy expanding or
-@@ -2766,6 +2797,7 @@ retry:
- */
- release_stripe(sh);
- sh = NULL;
-+ raid5_flush_bios(conf, bios, disks);
- raid5_unplug_device(mddev->queue);
- schedule();
- goto retry;
-@@ -2784,17 +2816,19 @@ retry:
- */
- if (r_sector >= mddev->suspend_lo &&
- r_sector < mddev->suspend_hi) {
-- handle_stripe(sh, NULL);
-+ handle_stripe(sh, NULL, NULL);
- release_stripe(sh);
- sh = NULL;
-+ raid5_flush_bios(conf, bios, disks);
- schedule();
- goto retry;
- }
-
- if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
-- handle_stripe(sh, NULL);
-+ handle_stripe(sh, NULL, NULL);
- release_stripe(sh);
- sh = NULL;
-+ raid5_flush_bios(conf, bios, disks);
- raid5_unplug_device(mddev->queue);
- schedule();
- goto retry;
-@@ -2810,7 +2844,7 @@ retry:
- r_sector += sectors_per_chunk;
- }
- if (sh) {
-- handle_stripe(sh, NULL);
-+ handle_stripe(sh, NULL, bios);
- release_stripe(sh);
- sh = NULL;
- }
-@@ -2820,6 +2854,9 @@ retry:
- if (sectors > 0)
- goto repeat;
-
-+ /* flush all of the bios */
-+ raid5_flush_bios(conf, bios, disks);
-+
- spin_lock_irq(&conf->device_lock);
- remaining = --bi->bi_phys_segments;
- spin_unlock_irq(&conf->device_lock);
-@@ -3035,7 +3072,7 @@ static inline sector_t sync_request(mdde
- clear_bit(STRIPE_INSYNC, &sh->state);
- spin_unlock(&sh->lock);
-
-- handle_stripe(sh, NULL);
-+ handle_stripe(sh, NULL, NULL);
- release_stripe(sh);
-
- return STRIPE_SECTORS;
-@@ -3091,7 +3128,7 @@ static void raid5d (mddev_t *mddev)
-
- handled++;
- atomic_inc(&conf->handled_in_raid5d);
-- handle_stripe(sh, conf->spare_page);
-+ handle_stripe(sh, conf->spare_page, NULL);
- release_stripe(sh);
-
- cond_resched();
+++ /dev/null
-Index: linux-2.6.18-128.1.6/drivers/md/raid5.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/drivers/md/raid5.c 2009-06-02 23:24:55.000000000 -0600
-+++ linux-2.6.18-128.1.6/drivers/md/raid5.c 2009-06-02 23:27:21.000000000 -0600
-@@ -1456,6 +1456,8 @@
- bi->bi_next = *bip;
- *bip = bi;
- bi->bi_phys_segments ++;
-+ if (bio_sync(bi) && !forwrite)
-+ clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); /* force to read from disk. */
- spin_unlock_irq(&conf->device_lock);
- spin_unlock(&sh->lock);
-
-@@ -3012,6 +3014,8 @@
- bi->bi_size = 0;
- bi->bi_end_io(bi, bytes, 0);
- }
-+ if (bio_sync(bi))
-+ raid5_unplug_device(q);
- return 0;
- }
-
+++ /dev/null
-While the stripe in-memory must be in-sync, the stripe on disk might not be
-because if we computed a block rather than reading it from an in-sync disk,
-the in-memory stripe can be different from the on-disk stripe.
-
-If this bug were still in mainline I would probably want a bigger patch which
-would leave this code but also set R5_LOCKED on all blocks that have been
-computed. But as it is a stablisation patch, the above is simple and more
-clearly correct.
-
-Thanks for you patience - I look forward to your success/failure report.
-
-NeilBrown
-
-diff -up /drivers/md/raid5.c
-===========================================
---- a/drivers/md/raid5.c
-+++ b/drivers/md/raid5.c
-@@ -2466,8 +2466,6 @@
- locked++;
- set_bit(R5_Wantwrite, &sh->dev[i].flags);
- }
-- /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
-- set_bit(STRIPE_INSYNC, &sh->state);
-
- if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
- atomic_dec(&conf->preread_active_stripes);
+++ /dev/null
-diff -pru linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
---- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-06 17:15:22.000000000 +0800
-+++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-06 17:17:30.000000000 +0800
-@@ -115,10 +115,12 @@ static void __release_stripe(raid5_conf_
- if (test_bit(STRIPE_DELAYED, &sh->state)) {
- list_add_tail(&sh->lru, &conf->delayed_list);
- blk_plug_device(conf->mddev->queue);
-+ atomic_inc(&conf->delayed);
- } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
- sh->bm_seq - conf->seq_write > 0) {
- list_add_tail(&sh->lru, &conf->bitmap_list);
- blk_plug_device(conf->mddev->queue);
-+ atomic_inc(&conf->bit_delayed);
- } else {
- clear_bit(STRIPE_BIT_DELAY, &sh->state);
- list_add_tail(&sh->lru, &conf->handle_list);
-@@ -289,6 +291,7 @@ static struct stripe_head *get_active_st
- if (noblock && sh == NULL)
- break;
- if (!sh) {
-+ atomic_inc(&conf->out_of_stripes);
- conf->inactive_blocked = 1;
- wait_event_lock_irq(conf->wait_for_stripe,
- !list_empty(&conf->inactive_list) &&
-@@ -311,6 +314,10 @@ static struct stripe_head *get_active_st
- !test_bit(STRIPE_EXPANDING, &sh->state))
- BUG();
- list_del_init(&sh->lru);
-+ if (test_bit(STRIPE_DELAYED, &sh->state))
-+ atomic_dec(&conf->delayed);
-+ if (test_bit(STRIPE_BIT_DELAY, &sh->state))
-+ atomic_dec(&conf->bit_delayed);
- }
- }
- } while (sh == NULL);
-@@ -529,6 +536,8 @@ static int raid5_end_read_request(struct
- if (bi->bi_size)
- return 1;
-
-+ atomic_dec(&conf->out_reqs_in_queue);
-+
- for (i=0 ; i<disks; i++)
- if (bi == &sh->dev[i].req)
- break;
-@@ -642,6 +651,8 @@ static int raid5_end_write_request (stru
- if (bi->bi_size)
- return 1;
-
-+ atomic_dec(&conf->out_reqs_in_queue);
-+
- for (i=0 ; i<disks; i++)
- if (bi == &sh->dev[i].req)
- break;
-@@ -1402,6 +1413,8 @@ static void handle_stripe5(struct stripe
- clear_bit(STRIPE_HANDLE, &sh->state);
- clear_bit(STRIPE_DELAYED, &sh->state);
-
-+ atomic_inc(&conf->handle_called);
-+
- syncing = test_bit(STRIPE_SYNCING, &sh->state);
- expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
- expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
-@@ -1684,6 +1697,7 @@ static void handle_stripe5(struct stripe
- set_bit(R5_LOCKED, &dev->flags);
- set_bit(R5_Wantread, &dev->flags);
- locked++;
-+ atomic_inc(&conf->reads_for_rmw);
- } else {
- set_bit(STRIPE_DELAYED, &sh->state);
- set_bit(STRIPE_HANDLE, &sh->state);
-@@ -1703,6 +1717,7 @@ static void handle_stripe5(struct stripe
- set_bit(R5_LOCKED, &dev->flags);
- set_bit(R5_Wantread, &dev->flags);
- locked++;
-+ atomic_inc(&conf->reads_for_rcw);
- } else {
- set_bit(STRIPE_DELAYED, &sh->state);
- set_bit(STRIPE_HANDLE, &sh->state);
-@@ -1870,6 +1885,7 @@ static void handle_stripe5(struct stripe
- bi->bi_end_io(bi, bytes,
- test_bit(BIO_UPTODATE, &bi->bi_flags)
- ? 0 : -EIO);
-+ atomic_dec(&conf->in_reqs_in_queue);
- }
- for (i=disks; i-- ;) {
- int rw;
-@@ -1885,10 +1901,13 @@ static void handle_stripe5(struct stripe
- bi = &sh->dev[i].req;
-
- bi->bi_rw = rw;
-- if (rw)
-+ if (rw) {
-+ atomic_inc(&conf->writes_out);
- bi->bi_end_io = raid5_end_write_request;
-- else
-+ } else {
-+ atomic_inc(&conf->reads_out);
- bi->bi_end_io = raid5_end_read_request;
-+ }
-
- rcu_read_lock();
- rdev = rcu_dereference(conf->disks[i].rdev);
-@@ -1919,6 +1938,7 @@ static void handle_stripe5(struct stripe
- if (rw == WRITE &&
- test_bit(R5_ReWrite, &sh->dev[i].flags))
- atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
-+ atomic_inc(&conf->out_reqs_in_queue);
- generic_make_request(bi);
- } else {
- if (rw == 1)
-@@ -1955,6 +1975,8 @@ static void handle_stripe6(struct stripe
- clear_bit(STRIPE_HANDLE, &sh->state);
- clear_bit(STRIPE_DELAYED, &sh->state);
-
-+ atomic_inc(&conf->handle_called);
-+
- syncing = test_bit(STRIPE_SYNCING, &sh->state);
- /* Now to look around and see what can be done */
-
-@@ -2255,6 +2277,7 @@ static void handle_stripe6(struct stripe
- set_bit(R5_LOCKED, &dev->flags);
- set_bit(R5_Wantread, &dev->flags);
- locked++;
-+ atomic_inc(&conf->reads_for_rcw);
- } else {
- PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
- (unsigned long long)sh->sector, i);
-@@ -2423,6 +2446,7 @@ static void handle_stripe6(struct stripe
- bi->bi_end_io(bi, bytes,
- test_bit(BIO_UPTODATE, &bi->bi_flags)
- ? 0 : -EIO);
-+ atomic_dec(&conf->in_reqs_in_queue);
- }
- for (i=disks; i-- ;) {
- int rw;
-@@ -2438,10 +2462,13 @@ static void handle_stripe6(struct stripe
- bi = &sh->dev[i].req;
-
- bi->bi_rw = rw;
-- if (rw)
-+ if (rw) {
-+ atomic_inc(&conf->writes_out);
- bi->bi_end_io = raid5_end_write_request;
-- else
-+ } else {
-+ atomic_inc(&conf->reads_out);
- bi->bi_end_io = raid5_end_read_request;
-+ }
-
- rcu_read_lock();
- rdev = rcu_dereference(conf->disks[i].rdev);
-@@ -2473,6 +2500,7 @@ static void handle_stripe6(struct stripe
- test_bit(R5_ReWrite, &sh->dev[i].flags))
- atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
- generic_make_request(bi);
-+ atomic_inc(&conf->out_reqs_in_queue);
- } else {
- if (rw == 1)
- set_bit(STRIPE_DEGRADED, &sh->state);
-@@ -2506,6 +2534,7 @@ static void raid5_activate_delayed(raid5
- if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
- atomic_inc(&conf->preread_active_stripes);
- list_add_tail(&sh->lru, &conf->handle_list);
-+ atomic_dec(&conf->delayed);
- }
- }
- }
-@@ -2608,6 +2637,8 @@ static int make_request(request_queue_t
- const int rw = bio_data_dir(bi);
- int remaining;
-
-+ atomic_inc(&conf->in_reqs_in_queue);
-+
- if (unlikely(bio_barrier(bi))) {
- bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
- return 0;
-@@ -2617,6 +2648,11 @@ static int make_request(request_queue_t
-
- disk_stat_inc(mddev->gendisk, ios[rw]);
- disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi));
-+ if (rw == WRITE)
-+ atomic_inc(&conf->writes_in);
-+ else
-+ atomic_inc(&conf->reads_in);
-+
-
- logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
- last_sector = bi->bi_sector + (bi->bi_size>>9);
-@@ -2724,6 +2760,7 @@ static int make_request(request_queue_t
-
- if ( rw == WRITE )
- md_write_end(mddev);
-+ atomic_dec(&conf->in_reqs_in_queue);
- bi->bi_size = 0;
- bi->bi_end_io(bi, bytes, 0);
- }
-@@ -2985,6 +3022,7 @@ static void raid5d (mddev_t *mddev)
- spin_unlock_irq(&conf->device_lock);
-
- handled++;
-+ atomic_inc(&conf->handled_in_raid5d);
- handle_stripe(sh, conf->spare_page);
- release_stripe(sh);
-
-@@ -3381,6 +3419,21 @@ static void status (struct seq_file *seq
- conf->disks[i].rdev &&
- test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
- seq_printf (seq, "]");
-+ seq_printf (seq, "\n\t\tin: %u reads, %u writes; out: %u reads, %u writes",
-+ atomic_read(&conf->reads_in), atomic_read(&conf->writes_in),
-+ atomic_read(&conf->reads_out), atomic_read(&conf->writes_out));
-+ seq_printf (seq, "\n\t\t%u in raid5d, %u out of stripes, %u handle called",
-+ atomic_read(&conf->handled_in_raid5d),
-+ atomic_read(&conf->out_of_stripes),
-+ atomic_read(&conf->handle_called));
-+ seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
-+ atomic_read(&conf->reads_for_rmw),
-+ atomic_read(&conf->reads_for_rcw));
-+ seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n",
-+ atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed),
-+ atomic_read(&conf->active_stripes),
-+ atomic_read(&conf->in_reqs_in_queue),
-+ atomic_read(&conf->out_reqs_in_queue));
- #if RAID5_DEBUG
- seq_printf (seq, "\n");
- printall(seq, conf);
-diff -pru linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h
---- linux-2.6.18-53.orig/include/linux/raid/raid5.h 2007-12-06 17:15:22.000000000 +0800
-+++ linux-2.6.18-53/include/linux/raid/raid5.h 2007-12-06 17:15:32.000000000 +0800
-@@ -259,6 +259,25 @@ struct raid5_private_data {
- int pool_size; /* number of disks in stripeheads in pool */
- spinlock_t device_lock;
- struct disk_info *disks;
-+
-+ /*
-+ * Stats
-+ */
-+ atomic_t reads_in;
-+ atomic_t writes_in;
-+ atomic_t reads_out;
-+ atomic_t writes_out;
-+ atomic_t handled_in_raid5d;
-+ atomic_t out_of_stripes;
-+ atomic_t reads_for_rmw;
-+ atomic_t reads_for_rcw;
-+ atomic_t writes_zcopy;
-+ atomic_t writes_copied;
-+ atomic_t handle_called;
-+ atomic_t delayed;
-+ atomic_t bit_delayed;
-+ atomic_t in_reqs_in_queue;
-+ atomic_t out_reqs_in_queue;
- };
-
- typedef struct raid5_private_data raid5_conf_t;
-Only in linux-2.6.18-53.orig/include/linux/raid: .raid5.h.swp
+++ /dev/null
-diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
---- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-28 14:55:08.000000000 +0800
-+++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-28 18:52:08.000000000 +0800
-@@ -2626,6 +2626,35 @@ static int raid5_issue_flush(request_que
- return ret;
- }
-
-+static inline int raid5_expanding_overlap(raid5_conf_t *conf, struct bio *bi)
-+{
-+ sector_t first_sector, last_sector;
-+
-+ if (likely(conf->expand_progress == MaxSector))
-+ return 0;
-+
-+ first_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
-+ last_sector = bi->bi_sector + (bi->bi_size>>9);
-+
-+ return (first_sector < conf->expand_progress &&
-+ last_sector >= conf->expand_lo);
-+}
-+
-+static inline int raid5_redo_bio(raid5_conf_t *conf, struct bio *bi, int disks, sector_t sector)
-+{
-+ int redo = 0;
-+
-+ if (likely(conf->expand_progress == MaxSector))
-+ return 0;
-+
-+ spin_lock_irq(&conf->device_lock);
-+ redo = (raid5_expanding_overlap(conf, bi) ||
-+ (unlikely(sector < conf->expand_progress) &&
-+ disks == conf->previous_raid_disks));
-+ spin_unlock_irq(&conf->device_lock);
-+ return redo;
-+}
-+
- static int make_request(request_queue_t *q, struct bio * bi)
- {
- mddev_t *mddev = q->queuedata;
-@@ -2636,6 +2665,14 @@ static int make_request(request_queue_t
- struct stripe_head *sh;
- const int rw = bio_data_dir(bi);
- int remaining;
-+ sector_t stripe, sectors, block, r_sector, b_sector;
-+ int sectors_per_chunk = conf->chunk_size >> 9;
-+ int stripes_per_chunk, sectors_per_block;
-+ int sectors_per_stripe;
-+ int i, j;
-+
-+ DEFINE_WAIT(w);
-+ int disks, data_disks;
-
- atomic_inc(&conf->in_reqs_in_queue);
-
-@@ -2653,105 +2690,136 @@ static int make_request(request_queue_t
- else
- atomic_inc(&conf->reads_in);
-
--
- logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
- last_sector = bi->bi_sector + (bi->bi_size>>9);
- bi->bi_next = NULL;
- bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
-
-- for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
-- DEFINE_WAIT(w);
-- int disks, data_disks;
--
-- retry:
-- prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
-- if (likely(conf->expand_progress == MaxSector))
-- disks = conf->raid_disks;
-- else {
-- /* spinlock is needed as expand_progress may be
-- * 64bit on a 32bit platform, and so it might be
-- * possible to see a half-updated value
-- * Ofcourse expand_progress could change after
-- * the lock is dropped, so once we get a reference
-- * to the stripe that we think it is, we will have
-- * to check again.
-- */
-- spin_lock_irq(&conf->device_lock);
-- disks = conf->raid_disks;
-- if (logical_sector >= conf->expand_progress)
-- disks = conf->previous_raid_disks;
-- else {
-- if (logical_sector >= conf->expand_lo) {
-- spin_unlock_irq(&conf->device_lock);
-- schedule();
-- goto retry;
-- }
-- }
-- spin_unlock_irq(&conf->device_lock);
-- }
-- data_disks = disks - conf->max_degraded;
-+ sectors = bi->bi_size >> 9;
-+ stripes_per_chunk = conf->chunk_size / STRIPE_SIZE;
-
-- new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
-- &dd_idx, &pd_idx, conf);
-- PRINTK("raid5: make_request, sector %llu logical %llu\n",
-- (unsigned long long)new_sector,
-- (unsigned long long)logical_sector);
-+redo_bio:
-+ /* stripe by stripe handle needs a stable raid layout, so if this
-+ * reuqest covers the expanding region, wait it over.
-+ * Furthermore, we may get here with partial request handled, so
-+ * wait for the bi_phys_segment to be 1 also. -jay */
-+ spin_lock_irq(&conf->device_lock);
-+ wait_event_lock_irq(conf->wait_for_overlap,
-+ (bi->bi_phys_segments == 1) &&
-+ !raid5_expanding_overlap(conf, bi),
-+ conf->device_lock,
-+ (unplug_slaves(conf->mddev), atomic_inc(&conf->expanding_overlap)));
-+
-+ disks = conf->raid_disks;
-+ if (unlikely(logical_sector >= conf->expand_progress))
-+ disks = conf->previous_raid_disks;
-+ data_disks = disks - conf->max_degraded;
-+ spin_unlock_irq(&conf->device_lock);
-
-- sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
-- if (sh) {
-- if (unlikely(conf->expand_progress != MaxSector)) {
-- /* expansion might have moved on while waiting for a
-- * stripe, so we must do the range check again.
-- * Expansion could still move past after this
-- * test, but as we are holding a reference to
-- * 'sh', we know that if that happens,
-- * STRIPE_EXPANDING will get set and the expansion
-- * won't proceed until we finish with the stripe.
-- */
-- int must_retry = 0;
-- spin_lock_irq(&conf->device_lock);
-- if (logical_sector < conf->expand_progress &&
-- disks == conf->previous_raid_disks)
-- /* mismatch, need to try again */
-- must_retry = 1;
-- spin_unlock_irq(&conf->device_lock);
-- if (must_retry) {
-- release_stripe(sh);
-- goto retry;
-+ /* compute the block # */
-+ sectors_per_stripe = STRIPE_SECTORS * data_disks;
-+ sectors_per_block = stripes_per_chunk * sectors_per_stripe;
-+
-+ block = logical_sector & ~((sector_t)sectors_per_block - 1);
-+ sector_div(block, sectors_per_block);
-+
-+repeat:
-+ stripe = block * (sectors_per_block / data_disks);
-+ b_sector = stripe * data_disks;
-+ /* iterate through all stripes in this block,
-+ * where block is a set of internal stripes
-+ * which covers chunk */
-+
-+ for (i = 0; i < stripes_per_chunk && sectors > 0; i++) {
-+ r_sector = b_sector + (i * STRIPE_SECTORS);
-+ sh = NULL;
-+ /* iterrate through all pages in the stripe */
-+ for (j = 0; j < data_disks && sectors > 0; j++) {
-+ DEFINE_WAIT(w);
-+
-+ if (r_sector + STRIPE_SECTORS <= bi->bi_sector ||
-+ r_sector >= last_sector) {
-+ r_sector += sectors_per_chunk;
-+ continue;
-+ }
-+
-+retry:
-+ prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
-+ new_sector = raid5_compute_sector(r_sector, disks,
-+ data_disks, &dd_idx,
-+ &pd_idx, conf);
-+ if (sh == NULL) {
-+ sh = get_active_stripe(conf, new_sector, disks, pd_idx,
-+ (bi->bi_rw&RWA_MASK));
-+ if (sh) {
-+ /* we're handling the bio stripe by stripe, so when we found
-+ * the raid layout has been changed, we have to redo the
-+ * whole bio because we don't which sectors in it has been
-+ * done, and which is not done. -jay */
-+ if (raid5_redo_bio(conf, bi, disks, logical_sector))
-+ goto redo_bio;
-+
-+ if (test_bit(STRIPE_EXPANDING, &sh->state)) {
-+ /* Stripe is busy expanding or
-+ * add failed due to overlap. Flush everything
-+ * and wait a while
-+ */
-+ release_stripe(sh);
-+ sh = NULL;
-+ raid5_unplug_device(mddev->queue);
-+ schedule();
-+ goto retry;
-+ }
-+ } else {
-+ /* cannot get stripe for read-ahead, just give-up */
-+ finish_wait(&conf->wait_for_overlap, &w);
-+ clear_bit(BIO_UPTODATE, &bi->bi_flags);
-+ sectors = 0;
-+ break;
- }
- }
-+
- /* FIXME what if we get a false positive because these
- * are being updated.
- */
-- if (logical_sector >= mddev->suspend_lo &&
-- logical_sector < mddev->suspend_hi) {
-+ if (r_sector >= mddev->suspend_lo &&
-+ r_sector < mddev->suspend_hi) {
-+ handle_stripe(sh, NULL);
- release_stripe(sh);
-+ sh = NULL;
- schedule();
- goto retry;
- }
-
-- if (test_bit(STRIPE_EXPANDING, &sh->state) ||
-- !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
-- /* Stripe is busy expanding or
-- * add failed due to overlap. Flush everything
-- * and wait a while
-- */
-- raid5_unplug_device(mddev->queue);
-+ if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
-+ handle_stripe(sh, NULL);
- release_stripe(sh);
-+ sh = NULL;
-+ raid5_unplug_device(mddev->queue);
- schedule();
- goto retry;
- }
- finish_wait(&conf->wait_for_overlap, &w);
-+
-+ BUG_ON (new_sector != stripe);
-+ sectors -= STRIPE_SECTORS;
-+ if (bi->bi_sector > r_sector)
-+ sectors += bi->bi_sector - r_sector;
-+ if (r_sector + STRIPE_SECTORS > last_sector)
-+ sectors += r_sector + STRIPE_SECTORS - last_sector;
-+ r_sector += sectors_per_chunk;
-+ }
-+ if (sh) {
- handle_stripe(sh, NULL);
- release_stripe(sh);
-- } else {
-- /* cannot get stripe for read-ahead, just give-up */
-- clear_bit(BIO_UPTODATE, &bi->bi_flags);
-- finish_wait(&conf->wait_for_overlap, &w);
-- break;
-+ sh = NULL;
- }
--
-+ stripe += STRIPE_SECTORS;
- }
-+ block++;
-+ if (sectors > 0)
-+ goto repeat;
-+
- spin_lock_irq(&conf->device_lock);
- remaining = --bi->bi_phys_segments;
- spin_unlock_irq(&conf->device_lock);
-@@ -3439,6 +3507,8 @@ static void status (struct seq_file *seq
- atomic_read(&conf->active_stripes),
- atomic_read(&conf->in_reqs_in_queue),
- atomic_read(&conf->out_reqs_in_queue));
-+ seq_printf (seq, "\t\t%u expanding overlap\n",
-+ atomic_read(&conf->expanding_overlap));
- #if RAID5_DEBUG
- seq_printf (seq, "\n");
- printall(seq, conf);
-diff -pur linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h
---- linux-2.6.18-53.orig/include/linux/raid/raid5.h 2007-12-28 14:55:08.000000000 +0800
-+++ linux-2.6.18-53/include/linux/raid/raid5.h 2007-12-28 18:09:37.000000000 +0800
-@@ -278,6 +278,7 @@ struct raid5_private_data {
- atomic_t bit_delayed;
- atomic_t in_reqs_in_queue;
- atomic_t out_reqs_in_queue;
-+ atomic_t expanding_overlap;
- };
-
- typedef struct raid5_private_data raid5_conf_t;
+++ /dev/null
-Index: linux-2.6.18-128.1.6/drivers/md/raid5.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/drivers/md/raid5.c 2009-06-02 23:24:52.000000000 -0600
-+++ linux-2.6.18-128.1.6/drivers/md/raid5.c 2009-06-02 23:24:55.000000000 -0600
-@@ -633,6 +633,9 @@
- clear_buffer_uptodate(bh);
- }
- #endif
-+ /* Read on a Directing write is allowable */
-+ /* BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)) */
-+ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page != sh->dev[i].page);
- clear_bit(R5_LOCKED, &sh->dev[i].flags);
- set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
-@@ -669,6 +672,10 @@
-
- rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
-
-+ if (test_bit(R5_Direct, &sh->dev[i].flags)) {
-+ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
-+ sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
-+ }
- clear_bit(R5_LOCKED, &sh->dev[i].flags);
- set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
-@@ -910,7 +917,27 @@
- return r_sector;
- }
-
-+static struct page *zero_copy_data(struct bio *bio, sector_t sector)
-+{
-+ sector_t bi_sector = bio->bi_sector;
-+ struct page *page = NULL;
-+ struct bio_vec *bvl;
-+ int i;
-
-+ bio_for_each_segment(bvl, bio, i) {
-+ if (sector == bi_sector)
-+ page = bio_iovec_idx(bio, i)->bv_page;
-+ bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
-+ if (bi_sector >= sector + STRIPE_SECTORS) {
-+ /* check if the stripe is covered by one page */
-+ if (page == bio_iovec_idx(bio, i)->bv_page &&
-+ PageConstant(page))
-+ return page;
-+ return NULL;
-+ }
-+ }
-+ return NULL;
-+}
-
- /*
- * Copy data between a page in the stripe cache, and one or more bion
-@@ -1002,8 +1029,9 @@
- {
- raid5_conf_t *conf = sh->raid_conf;
- int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
-- void *ptr[MAX_XOR_BLOCKS];
-+ void *ptr[MAX_XOR_BLOCKS], *h_ptr[2];
- struct bio *chosen;
-+ struct page *page;
-
- PRINTK("compute_parity5, stripe %llu, method %d\n",
- (unsigned long long)sh->sector, method);
-@@ -1053,34 +1081,92 @@
- count = 1;
- }
-
-- for (i = disks; i--;)
-- if (sh->dev[i].written) {
-- sector_t sector = sh->dev[i].sector;
-- struct bio *wbi = sh->dev[i].written;
-- while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-- copy_data(1, wbi, sh->dev[i].page, sector);
-- wbi = r5_next_bio(wbi, sector);
-+ for (i = disks; i--;) {
-+ struct r5dev *dev = &sh->dev[i];
-+ struct bio *wbi = dev->written;
-+ sector_t sector;
-+
-+ if (!wbi)
-+ continue;
-+
-+ sector = dev->sector;
-+ set_bit(R5_LOCKED, &sh->dev[i].flags);
-+ BUG_ON(test_bit(R5_Direct, &dev->flags));
-+
-+ /* check if it's covered by a single page
-+ and whole stripe is written at once.
-+ * in this case we can avoid memcpy() */
-+ if (!wbi->bi_next && test_bit(R5_OVERWRITE, &dev->flags) &&
-+ test_bit(R5_Insync, &dev->flags)) {
-+ page = zero_copy_data(wbi, sector);
-+ if (page) {
-+ atomic_inc(&conf->writes_zcopy);
-+ /* The pointer must be restored whenever the LOCKED
-+ * gets cleared. */
-+ dev->req.bi_io_vec[0].bv_page = page;
-+ set_bit(R5_Direct, &dev->flags);
-+ clear_bit(R5_UPTODATE, &sh->dev[i].flags);
-+ clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
-+ continue;
- }
-+ }
-
-- set_bit(R5_LOCKED, &sh->dev[i].flags);
-- set_bit(R5_UPTODATE, &sh->dev[i].flags);
-+ /* do copy write */
-+ atomic_inc(&conf->writes_copied);
-+ clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
-+ set_bit(R5_UPTODATE, &sh->dev[i].flags);
-+ while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-+ copy_data(1, wbi, sh->dev[i].page, sector);
-+ wbi = r5_next_bio(wbi, sector);
- }
-+ }
-
-+ h_ptr[0] = ptr[0];
- switch(method) {
- case RECONSTRUCT_WRITE:
- case CHECK_PARITY:
-- for (i=disks; i--;)
-- if (i != pd_idx) {
-- ptr[count++] = page_address(sh->dev[i].page);
-- check_xor();
-+ for (i=disks; i--;) {
-+ if (i == pd_idx)
-+ continue;
-+ if (test_bit(R5_Direct, &sh->dev[i].flags))
-+ page = sh->dev[i].req.bi_io_vec[0].bv_page;
-+ else
-+ page = sh->dev[i].page;
-+
-+ /* have to compute the parity immediately for
-+ * a highmem page. it would happen for zerocopy. -jay
-+ */
-+ if (PageHighMem(page)) {
-+ h_ptr[1] = kmap_atomic(page, KM_USER0);
-+ xor_block(2, STRIPE_SIZE, h_ptr);
-+ kunmap_atomic(page, KM_USER0);
-+ } else {
-+ ptr[count++] = page_address(page);
- }
-+ check_xor();
-+ }
- break;
- case READ_MODIFY_WRITE:
-- for (i = disks; i--;)
-- if (sh->dev[i].written) {
-- ptr[count++] = page_address(sh->dev[i].page);
-- check_xor();
-+ for (i = disks; i--;) {
-+ if (!sh->dev[i].written)
-+ continue;
-+ if (test_bit(R5_Direct, &sh->dev[i].flags))
-+ page = sh->dev[i].req.bi_io_vec[0].bv_page;
-+ else
-+ page = sh->dev[i].page;
-+
-+ /* have to compute the parity immediately for
-+ * a highmem page. it would happen for zerocopy. -jay
-+ */
-+ if (PageHighMem(page)) {
-+ h_ptr[1] = kmap_atomic(page, KM_USER0);
-+ xor_block(2, STRIPE_SIZE, h_ptr);
-+ kunmap_atomic(page, KM_USER0);
-+ } else {
-+ ptr[count++] = page_address(page);
- }
-+ check_xor();
-+ }
- }
- if (count != 1)
- xor_block(count, STRIPE_SIZE, ptr);
-@@ -1097,6 +1183,7 @@
- raid6_conf_t *conf = sh->raid_conf;
- int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
- struct bio *chosen;
-+ struct page *page;
- /**** FIX THIS: This could be very bad if disks is close to 256 ****/
- void *ptrs[disks];
-
-@@ -1126,18 +1213,49 @@
- BUG(); /* Not implemented yet */
- }
-
-- for (i = disks; i--;)
-- if (sh->dev[i].written) {
-- sector_t sector = sh->dev[i].sector;
-- struct bio *wbi = sh->dev[i].written;
-- while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-- copy_data(1, wbi, sh->dev[i].page, sector);
-- wbi = r5_next_bio(wbi, sector);
-+ for (i = disks; i--;) {
-+ struct r5dev *dev = &sh->dev[i];
-+ struct bio *wbi = dev->written;
-+ sector_t sector;
-+
-+ if (!wbi)
-+ continue;
-+
-+ sector = sh->dev[i].sector;
-+ set_bit(R5_LOCKED, &sh->dev[i].flags);
-+ BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
-+
-+ /* check if it's covered by a single page
-+ * and whole stripe is written at once.
-+ * in this case we can avoid memcpy() */
-+ if (!wbi->bi_next && test_bit(R5_Insync, &sh->dev[i].flags) &&
-+ test_bit(R5_OVERWRITE, &sh->dev[i].flags)) {
-+ page = zero_copy_data(wbi, sector);
-+ /* we don't do zerocopy on a HighMem page. Raid6 tend
-+ * to prepare all of the pages' content to be accessed
-+ * before computing PQ parity. If we need to support HighMem
-+ * page also, we have to modify the gen_syndrome()
-+ * algorithm. -jay */
-+ if (page && !PageHighMem(page)) {
-+ atomic_inc(&conf->writes_zcopy);
-+ /* The pointer must be restored whenever the LOCKED
-+ * gets cleared. */
-+ sh->dev[i].req.bi_io_vec[0].bv_page = page;
-+ set_bit(R5_Direct, &sh->dev[i].flags);
-+ clear_bit(R5_UPTODATE, &sh->dev[i].flags);
-+ clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
-+ continue;
- }
-+ }
-
-- set_bit(R5_LOCKED, &sh->dev[i].flags);
-- set_bit(R5_UPTODATE, &sh->dev[i].flags);
-+ atomic_inc(&conf->writes_copied);
-+ clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
-+ set_bit(R5_UPTODATE, &sh->dev[i].flags);
-+ while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-+ copy_data(1, wbi, sh->dev[i].page, sector);
-+ wbi = r5_next_bio(wbi, sector);
- }
-+ }
-
- // switch(method) {
- // case RECONSTRUCT_WRITE:
-@@ -1148,8 +1266,12 @@
- count = 0;
- i = d0_idx;
- do {
-- ptrs[count++] = page_address(sh->dev[i].page);
-- if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
-+ if (test_bit(R5_Direct, &sh->dev[i].flags))
-+ ptrs[count++] = page_address(sh->dev[i].req.bi_io_vec[0].bv_page);
-+ else
-+ ptrs[count++] = page_address(sh->dev[i].page);
-+ if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
-+ !test_bit(R5_Direct, &sh->dev[i].flags))
- printk("block %d/%d not uptodate on parity calc\n", i,count);
- i = raid6_next_disk(i, disks);
- } while ( i != d0_idx );
-@@ -1596,7 +1718,8 @@
- if (sh->dev[i].written) {
- dev = &sh->dev[i];
- if (!test_bit(R5_LOCKED, &dev->flags) &&
-- test_bit(R5_UPTODATE, &dev->flags) ) {
-+ (test_bit(R5_UPTODATE, &dev->flags) ||
-+ test_bit(R5_Direct, &dev->flags)) ) {
- /* We can return any write requests */
- struct bio *wbi, *wbi2;
- int bitmap_end = 0;
-@@ -1604,6 +1727,7 @@
- spin_lock_irq(&conf->device_lock);
- wbi = dev->written;
- dev->written = NULL;
-+ clear_bit(R5_Direct, &dev->flags);
- while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
- wbi2 = r5_next_bio(wbi, dev->sector);
- if (--wbi->bi_phys_segments == 0) {
-@@ -1967,6 +2091,15 @@
- set_bit(STRIPE_DEGRADED, &sh->state);
- PRINTK("skip op %ld on disc %d for sector %llu\n",
- bi->bi_rw, i, (unsigned long long)sh->sector);
-+
-+ if (test_bit(R5_Direct, &sh->dev[i].flags)) {
-+ /* restore the page pointer of req, otherwise,
-+ * no any read is permitted on this stripe, this is
-+ * not what we want. -jay */
-+ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
-+ sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
-+ }
-+
- clear_bit(R5_LOCKED, &sh->dev[i].flags);
- set_bit(STRIPE_HANDLE, &sh->state);
- }
-@@ -2172,7 +2305,8 @@
- if (sh->dev[i].written) {
- dev = &sh->dev[i];
- if (!test_bit(R5_LOCKED, &dev->flags) &&
-- test_bit(R5_UPTODATE, &dev->flags) ) {
-+ (test_bit(R5_UPTODATE, &dev->flags) ||
-+ test_bit(R5_Direct, &dev->flags)) ) {
- /* We can return any write requests */
- int bitmap_end = 0;
- struct bio *wbi, *wbi2;
-@@ -2181,6 +2315,7 @@
- spin_lock_irq(&conf->device_lock);
- wbi = dev->written;
- dev->written = NULL;
-+ clear_bit(R5_Direct, &dev->flags);
- while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
- wbi2 = r5_next_bio(wbi, dev->sector);
- if (--wbi->bi_phys_segments == 0) {
-@@ -2532,6 +2667,15 @@
- set_bit(STRIPE_DEGRADED, &sh->state);
- PRINTK("skip op %ld on disc %d for sector %llu\n",
- bi->bi_rw, i, (unsigned long long)sh->sector);
-+
-+ if (test_bit(R5_Direct, &sh->dev[i].flags)) {
-+ /* restore the page pointer of req, otherwise,
-+ * no any read is permitted on this stripe, this is
-+ * not what we want. -jay */
-+ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
-+ sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
-+ }
-+
- clear_bit(R5_LOCKED, &sh->dev[i].flags);
- set_bit(STRIPE_HANDLE, &sh->state);
- }
-@@ -3451,6 +3595,9 @@
- mddev->queue->max_phys_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;
- mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;;
-
-+ /* raid5 device is able to do zcopy right now. */
-+ mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONSTANT_WRITE;
-+
- return 0;
- abort:
- if (conf) {
-@@ -3537,9 +3684,11 @@
- atomic_read(&conf->handled_in_raid5d),
- atomic_read(&conf->out_of_stripes),
- atomic_read(&conf->handle_called));
-- seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
-+ seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u",
- atomic_read(&conf->reads_for_rmw),
-- atomic_read(&conf->reads_for_rcw));
-+ atomic_read(&conf->reads_for_rcw),
-+ atomic_read(&conf->writes_zcopy),
-+ atomic_read(&conf->writes_copied));
- seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n",
- atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed),
- atomic_read(&conf->active_stripes),
-Index: linux-2.6.18-128.1.6/include/linux/backing-dev.h
-===================================================================
---- linux-2.6.18-128.1.6.orig/include/linux/backing-dev.h 2006-09-19 21:42:06.000000000 -0600
-+++ linux-2.6.18-128.1.6/include/linux/backing-dev.h 2009-06-02 23:24:55.000000000 -0600
-@@ -48,6 +48,7 @@
- #define BDI_CAP_READ_MAP 0x00000010 /* Can be mapped for reading */
- #define BDI_CAP_WRITE_MAP 0x00000020 /* Can be mapped for writing */
- #define BDI_CAP_EXEC_MAP 0x00000040 /* Can be mapped for execution */
-+#define BDI_CAP_PAGE_CONSTANT_WRITE 0x00000080 /* Zcopy write - for raid5 */
- #define BDI_CAP_VMFLAGS \
- (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
-
-@@ -94,11 +95,18 @@
- #define bdi_cap_account_dirty(bdi) \
- (!((bdi)->capabilities & BDI_CAP_NO_ACCT_DIRTY))
-
-+#define bdi_cap_page_constant_write(bdi) \
-+ ((bdi)->capabilities & BDI_CAP_PAGE_CONSTANT_WRITE)
-+
- #define mapping_cap_writeback_dirty(mapping) \
- bdi_cap_writeback_dirty((mapping)->backing_dev_info)
-
- #define mapping_cap_account_dirty(mapping) \
- bdi_cap_account_dirty((mapping)->backing_dev_info)
-
-+#define mapping_cap_page_constant_write(mapping) \
-+ bdi_cap_page_constant_write((mapping)->backing_dev_info)
-+
-+
-
- #endif /* _LINUX_BACKING_DEV_H */
-Index: linux-2.6.18-128.1.6/include/linux/page-flags.h
-===================================================================
---- linux-2.6.18-128.1.6.orig/include/linux/page-flags.h 2009-04-14 21:05:24.000000000 -0600
-+++ linux-2.6.18-128.1.6/include/linux/page-flags.h 2009-06-02 23:24:55.000000000 -0600
-@@ -86,6 +86,7 @@
- #define PG_reclaim 17 /* To be reclaimed asap */
- #define PG_nosave_free 18 /* Free, should not be written */
- #define PG_buddy 19 /* Page is free, on buddy lists */
-+#define PG_constant 21 /* To mark if the page is constant */
- #define PG_xpmem 27 /* Testing for xpmem. */
-
- /* PG_owner_priv_1 users should have descriptive aliases */
-@@ -283,6 +284,14 @@
-
- struct page; /* forward declaration */
-
-+#define PageConstant(page) test_bit(PG_constant, &(page)->flags)
-+#define SetPageConstant(page) set_bit(PG_constant, &(page)->flags)
-+#define ClearPageConstant(page) clear_bit(PG_constant, &(page->flags))
-+#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
-+
-+extern int set_page_constant(struct page *page);
-+extern void clear_page_constant(struct page *);
-+
- int test_clear_page_dirty(struct page *page);
- int test_clear_page_writeback(struct page *page);
- int test_set_page_writeback(struct page *page);
-Index: linux-2.6.18-128.1.6/include/linux/raid/raid5.h
-===================================================================
---- linux-2.6.18-128.1.6.orig/include/linux/raid/raid5.h 2009-06-02 23:24:50.000000000 -0600
-+++ linux-2.6.18-128.1.6/include/linux/raid/raid5.h 2009-06-02 23:24:55.000000000 -0600
-@@ -156,8 +156,9 @@
- #define R5_Overlap 7 /* There is a pending overlapping request on this block */
- #define R5_ReadError 8 /* seen a read error here recently */
- #define R5_ReWrite 9 /* have tried to over-write the readerror */
--
- #define R5_Expanded 10 /* This block now has post-expand data */
-+#define R5_Direct 11 /* Use the pages in bio to do the write directly. */
-+
- /*
- * Write method
- */
-Index: linux-2.6.18-128.1.6/mm/filemap.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/mm/filemap.c 2009-04-14 21:05:46.000000000 -0600
-+++ linux-2.6.18-128.1.6/mm/filemap.c 2009-06-02 23:24:55.000000000 -0600
-@@ -30,6 +30,7 @@
- #include <linux/security.h>
- #include <linux/syscalls.h>
- #include <linux/cpuset.h>
-+#include <linux/rmap.h>
- #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
- #include <trace/mm.h>
- #include "internal.h"
-@@ -567,11 +568,55 @@
- if (!test_clear_page_writeback(page))
- BUG();
- }
-+ clear_page_constant(page);
- smp_mb__after_clear_bit();
- wake_up_page(page, PG_writeback);
- }
- EXPORT_SYMBOL(end_page_writeback);
-
-+/* Make a page to be constant, `constant' means any write to this page will
-+ * be blocked until clear_page_constant is called.
-+ * The page lock must be held.
-+ */
-+int set_page_constant(struct page *page)
-+{
-+ BUG_ON(!PageLocked(page));
-+
-+ /* If it's an anonymous page and haven't been added to swap cache,
-+ * return directly because we have no way to swap this page.
-+ */
-+ if (page_mapping(page) == NULL)
-+ return SWAP_FAIL;
-+
-+ BUG_ON(!PageUptodate(page));
-+
-+ /* I have to clear page uptodate before trying to remove
-+ * it from user's page table because otherwise, the page may be
-+ * reinstalled by a page access which happens between try_to_unmap()
-+ * and ClearPageUptodate(). -jay
-+ */
-+ ClearPageUptodate(page);
-+ if (page_mapped(page) && try_to_unmap(page, 0) != SWAP_SUCCESS) {
-+ SetPageUptodate(page);
-+ return SWAP_FAIL;
-+ }
-+ SetPageConstant(page);
-+ return SWAP_SUCCESS;
-+}
-+
-+void clear_page_constant(struct page *page)
-+{
-+ if (PageConstant(page)) {
-+ BUG_ON(!PageLocked(page));
-+ BUG_ON(PageUptodate(page));
-+ ClearPageConstant(page);
-+ SetPageUptodate(page);
-+ unlock_page(page);
-+ }
-+}
-+EXPORT_SYMBOL(set_page_constant);
-+EXPORT_SYMBOL(clear_page_constant);
-+
- /**
- * __lock_page - get a lock on the page, assuming we need to sleep to get it
- * @page: the page to lock
+++ /dev/null
-Index: linux-2.6.16.60-0.37/drivers/scsi/Kconfig
-===================================================================
---- linux-2.6.16.60-0.37.orig/drivers/scsi/Kconfig 2009-03-24 05:46:32.000000000 -0700
-+++ linux-2.6.16.60-0.37/drivers/scsi/Kconfig 2009-06-02 23:33:14.000000000 -0600
-@@ -78,6 +78,14 @@
- To compile this driver as a module, choose M here and read
- <file:Documentation/scsi/scsi.txt>. The module will be called st.
-
-+config SD_IOSTATS
-+ bool "Enable SCSI disk I/O stats"
-+ depends on BLK_DEV_SD
-+ default y
-+ ---help---
-+ This enables SCSI disk I/O stats collection. You must also enable
-+ /proc file system support if you want this feature.
-+
- config CHR_DEV_OSST
- tristate "SCSI OnStream SC-x0 tape support"
- depends on SCSI
-Index: linux-2.6.16.60-0.37/drivers/scsi/scsi_proc.c
-===================================================================
---- linux-2.6.16.60-0.37.orig/drivers/scsi/scsi_proc.c 2009-03-24 05:46:25.000000000 -0700
-+++ linux-2.6.16.60-0.37/drivers/scsi/scsi_proc.c 2009-06-02 23:33:14.000000000 -0600
-@@ -40,7 +40,8 @@
- /* 4K page size, but our output routines, use some slack for overruns */
- #define PROC_BLOCK_SIZE (3*1024)
-
--static struct proc_dir_entry *proc_scsi;
-+struct proc_dir_entry *proc_scsi;
-+EXPORT_SYMBOL(proc_scsi);
-
- /* Protect sht->present and sht->proc_dir */
- static DEFINE_MUTEX(global_host_template_mutex);
-Index: linux-2.6.16.60-0.37/drivers/scsi/sd.c
-===================================================================
---- linux-2.6.16.60-0.37.orig/drivers/scsi/sd.c 2009-03-24 05:46:25.000000000 -0700
-+++ linux-2.6.16.60-0.37/drivers/scsi/sd.c 2009-06-02 23:33:14.000000000 -0600
-@@ -63,6 +63,63 @@
-
- #include "scsi_logging.h"
-
-+#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
-+# include <linux/proc_fs.h>
-+# include <linux/seq_file.h>
-+
-+typedef struct {
-+ unsigned long long iostat_size;
-+ unsigned long long iostat_count;
-+} iostat_counter_t;
-+
-+#define IOSTAT_NCOUNTERS 16
-+typedef struct {
-+ iostat_counter_t iostat_read_histogram[IOSTAT_NCOUNTERS];
-+ iostat_counter_t iostat_write_histogram[IOSTAT_NCOUNTERS];
-+ struct timeval iostat_timeval;
-+
-+ /* queue depth: how well the pipe is filled up */
-+ unsigned long long iostat_queue_ticks[IOSTAT_NCOUNTERS];
-+ unsigned long long iostat_queue_ticks_sum;
-+ unsigned long iostat_queue_depth;
-+ unsigned long iostat_queue_stamp;
-+
-+ /* seeks: how linear the traffic is */
-+ unsigned long long iostat_next_sector;
-+ unsigned long long iostat_seek_sectors;
-+ unsigned long long iostat_seeks;
-+ unsigned long long iostat_sectors;
-+ unsigned long long iostat_reqs;
-+ unsigned long iostat_read_reqs;
-+ unsigned long iostat_write_reqs;
-+
-+ /* process time: how long it takes to process requests */
-+ unsigned long iostat_rtime[IOSTAT_NCOUNTERS];
-+ unsigned long iostat_wtime[IOSTAT_NCOUNTERS];
-+
-+ /* queue time: how long process spent in elevator's queue */
-+ unsigned long iostat_rtime_in_queue[IOSTAT_NCOUNTERS];
-+ unsigned long iostat_wtime_in_queue[IOSTAT_NCOUNTERS];
-+
-+ /* must be the last field, as it's used to know size to be memset'ed */
-+ spinlock_t iostat_lock;
-+} ____cacheline_aligned_in_smp iostat_stats_t;
-+
-+struct proc_dir_entry *sd_iostats_procdir = NULL;
-+char sd_iostats_procdir_name[] = "sd_iostats";
-+static struct file_operations sd_iostats_proc_fops;
-+
-+extern void sd_iostats_init(void);
-+extern void sd_iostats_fini(void);
-+void sd_iostats_start_req(struct scsi_cmnd *SCpnt);
-+void sd_iostats_finish_req(struct scsi_cmnd *SCpnt);
-+#else
-+static inline void sd_iostats_init(void) {}
-+static inline void sd_iostats_fini(void) {}
-+static inline void sd_iostats_start_req(struct scsi_cmnd *SCpnt) {}
-+static inline void sd_iostats_finish_req(struct scsi_cmnd *SCpnt) {}
-+#endif
-+
- /*
- * More than enough for everybody ;) The huge number of majors
- * is a leftover from 16bit dev_t days, we don't really need that
-@@ -127,6 +184,9 @@
- unsigned WCE : 1; /* state of disk WCE bit */
- unsigned RCD : 1; /* state of disk RCD bit, unused */
- unsigned DPOFUA : 1; /* state of disk DPOFUA bit */
-+#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
-+ iostat_stats_t *stats; /* scsi disk statistics */
-+#endif
- };
- #define to_scsi_disk(obj) container_of(obj,struct scsi_disk,cdev)
-
-@@ -520,6 +580,8 @@
- */
- SCpnt->done = sd_rw_intr;
-
-+ sd_iostats_start_req(SCpnt);
-+
- /*
- * This indicates that the command is ready from our end to be
- * queued.
-@@ -1014,6 +1076,7 @@
- break;
- }
- out:
-+ sd_iostats_finish_req(SCpnt);
- scsi_io_completion(SCpnt, good_bytes);
- }
-
-@@ -1713,6 +1776,36 @@
- if (sdp->removable)
- gd->flags |= GENHD_FL_REMOVABLE;
-
-+#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
-+ sdkp->stats = kzalloc(sizeof(iostat_stats_t), GFP_KERNEL);
-+ if (!sdkp->stats) {
-+ printk(KERN_WARNING "cannot allocate iostat structure for"
-+ "%s\n", gd->disk_name);
-+ } else {
-+ do_gettimeofday(&sdkp->stats->iostat_timeval);
-+ sdkp->stats->iostat_queue_stamp = jiffies;
-+ spin_lock_init(&sdkp->stats->iostat_lock);
-+ if (sd_iostats_procdir) {
-+ struct proc_dir_entry *pde;
-+ pde = create_proc_entry(gd->disk_name, S_IRUGO | S_IWUSR,
-+ sd_iostats_procdir);
-+ if (!pde) {
-+ printk(KERN_WARNING "Can't create /proc/scsi/"
-+ "%s/%s\n",
-+ sd_iostats_procdir_name,
-+ gd->disk_name);
-+ kfree(sdkp->stats);
-+ sdkp->stats = NULL;
-+ } else {
-+ pde->proc_fops = &sd_iostats_proc_fops;
-+ pde->data = gd;
-+ }
-+ } else {
-+ kfree(sdkp->stats);
-+ sdkp->stats = NULL;
-+ }
-+ }
-+#endif
- dev_set_drvdata(dev, sdkp);
- add_disk(gd);
-
-@@ -1756,6 +1849,366 @@
- return 0;
- }
-
-+#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
-+static int
-+sd_iostats_seq_show(struct seq_file *seq, void *v)
-+{
-+ struct timeval now;
-+ struct gendisk *disk = seq->private;
-+ iostat_stats_t *stats;
-+ unsigned long long read_len;
-+ unsigned long long read_len_tot;
-+ unsigned long read_num;
-+ unsigned long read_num_tot;
-+ unsigned long long write_len;
-+ unsigned long long write_len_tot;
-+ unsigned long write_num;
-+ unsigned long write_num_tot;
-+ int i;
-+ int maxi;
-+
-+ stats = scsi_disk(disk)->stats;
-+ if (stats == NULL) {
-+ printk(KERN_ERR "sd_iostats_seq_show: NULL stats entry\n");
-+ BUG();
-+ }
-+
-+ do_gettimeofday(&now);
-+ now.tv_sec -= stats->iostat_timeval.tv_sec;
-+ now.tv_usec -= stats->iostat_timeval.tv_usec;
-+ if (now.tv_usec < 0) {
-+ now.tv_usec += 1000000;
-+ now.tv_sec--;
-+ }
-+
-+ /* this sampling races with updates */
-+ seq_printf(seq, "index: %lu snapshot_time: %lu.%06lu\n",
-+ (unsigned long) scsi_disk(disk)->index,
-+ now.tv_sec, now.tv_usec);
-+
-+ for (i = IOSTAT_NCOUNTERS - 1; i > 0; i--)
-+ if (stats->iostat_read_histogram[i].iostat_count != 0 ||
-+ stats->iostat_write_histogram[i].iostat_count != 0)
-+ break;
-+ maxi = i;
-+
-+ seq_printf(seq, "%8s %8s %12s %8s %12s\n", "size",
-+ "reads", "total", "writes", "total");
-+
-+ read_len_tot = write_len_tot = 0;
-+ read_num_tot = write_num_tot = 0;
-+ for (i = 0; i <= maxi; i++) {
-+ read_len = stats->iostat_read_histogram[i].iostat_size;
-+ read_len_tot += read_len;
-+ read_num = stats->iostat_read_histogram[i].iostat_count;
-+ read_num_tot += read_num;
-+
-+ write_len = stats->iostat_write_histogram[i].iostat_size;
-+ write_len_tot += write_len;
-+ write_num = stats->iostat_write_histogram[i].iostat_count;
-+ write_num_tot += write_num;
-+
-+ seq_printf (seq, "%8d %8lu %12llu %8lu %12llu\n",
-+ 512<<i, read_num, read_len, write_num, write_len);
-+ }
-+
-+ seq_printf(seq, "%8s %8lu %12llu %8lu %12llu\n\n", "total",
-+ read_num_tot, read_len_tot,
-+ write_num_tot, write_len_tot);
-+
-+ seq_printf(seq, "%8s %8s %8s\n", "qdepth", "ticks", "%");
-+ for (i = 0; i < IOSTAT_NCOUNTERS; i++) {
-+ unsigned long long ticks, percent;
-+ ticks = stats->iostat_queue_ticks[i];
-+ if (ticks == 0)
-+ continue;
-+ percent = stats->iostat_queue_ticks[i] * 100;
-+ do_div(percent, stats->iostat_queue_ticks_sum);
-+ seq_printf(seq, "%8d %8llu %8llu\n", i, ticks, percent);
-+ }
-+
-+ if (stats->iostat_reqs != 0) {
-+ unsigned long long aveseek = 0, percent = 0;
-+
-+ if (stats->iostat_seeks) {
-+ aveseek = stats->iostat_seek_sectors;
-+ do_div(aveseek, stats->iostat_seeks);
-+ percent = stats->iostat_seeks * 100;
-+ do_div(percent, stats->iostat_reqs);
-+ }
-+
-+ seq_printf(seq, "\n%llu sectors in %llu reqs: %llu seek(s) over "
-+ "%llu sectors in ave, %llu%% of all reqs\n",
-+ stats->iostat_sectors, stats->iostat_reqs,
-+ stats->iostat_seeks, aveseek, percent);
-+ }
-+
-+ seq_printf(seq, "\n%16s %8s %8s %8s %8s\n", "process time", "reads",
-+ "%%", "writes", "%%");
-+ for (i = 0; i < IOSTAT_NCOUNTERS; i++) {
-+ unsigned long read_percent = 0, write_percent = 0;
-+ if (stats->iostat_wtime[i] == 0 &&
-+ stats->iostat_rtime[i] == 0)
-+ continue;
-+ if (stats->iostat_read_reqs)
-+ read_percent = stats->iostat_rtime[i] * 100 /
-+ stats->iostat_read_reqs;
-+ if (stats->iostat_write_reqs)
-+ write_percent = stats->iostat_wtime[i] * 100 /
-+ stats->iostat_write_reqs;
-+ seq_printf(seq, "%16u %8lu %8lu %8lu %8lu\n",
-+ jiffies_to_msecs(((1UL << i) >> 1) << 1),
-+ stats->iostat_rtime[i], read_percent,
-+ stats->iostat_wtime[i], write_percent);
-+ }
-+
-+ seq_printf(seq, "\n%16s %8s %8s %8s %8s\n", "time in queue", "reads",
-+ "%%", "writes", "%%");
-+ for (i = 0; i < IOSTAT_NCOUNTERS; i++) {
-+ unsigned long read_percent = 0, write_percent = 0;
-+ if (stats->iostat_wtime_in_queue[i] == 0 &&
-+ stats->iostat_rtime_in_queue[i] == 0)
-+ continue;
-+ if (stats->iostat_read_reqs)
-+ read_percent = stats->iostat_rtime_in_queue[i] * 100 /
-+ stats->iostat_read_reqs;
-+ if (stats->iostat_write_reqs)
-+ write_percent = stats->iostat_wtime_in_queue[i] * 100 /
-+ stats->iostat_write_reqs;
-+ seq_printf(seq, "%16u %8lu %8lu %8lu %8lu\n",
-+ jiffies_to_msecs(((1UL << i) >> 1) << 1),
-+ stats->iostat_rtime_in_queue[i],
-+ read_percent,
-+ stats->iostat_wtime_in_queue[i],
-+ write_percent);
-+ }
-+
-+ return 0;
-+}
-+
-+static void *
-+sd_iostats_seq_start(struct seq_file *p, loff_t *pos)
-+{
-+ return (*pos == 0) ? (void *)1 : NULL;
-+}
-+
-+static void *
-+sd_iostats_seq_next(struct seq_file *p, void *v, loff_t *pos)
-+{
-+ ++*pos;
-+ return NULL;
-+}
-+
-+static void
-+sd_iostats_seq_stop(struct seq_file *p, void *v)
-+{
-+}
-+
-+static struct seq_operations sd_iostats_seqops = {
-+ .start = sd_iostats_seq_start,
-+ .stop = sd_iostats_seq_stop,
-+ .next = sd_iostats_seq_next,
-+ .show = sd_iostats_seq_show,
-+};
-+
-+static int
-+sd_iostats_seq_open (struct inode *inode, struct file *file)
-+{
-+ int rc;
-+
-+ rc = seq_open(file, &sd_iostats_seqops);
-+ if (rc != 0)
-+ return rc;
-+
-+ ((struct seq_file *)file->private_data)->private = PDE(inode)->data;
-+ return 0;
-+}
-+
-+static ssize_t
-+sd_iostats_seq_write(struct file *file, const char *buffer,
-+ size_t len, loff_t *off)
-+{
-+ struct seq_file *seq = file->private_data;
-+ struct gendisk *disk = seq->private;
-+ iostat_stats_t *stats = scsi_disk(disk)->stats;
-+ unsigned long flags;
-+ unsigned long qdepth;
-+
-+
-+ spin_lock_irqsave (&stats->iostat_lock, flags);
-+ qdepth = stats->iostat_queue_depth;
-+ memset (stats, 0, offsetof(iostat_stats_t, iostat_lock));
-+ do_gettimeofday(&stats->iostat_timeval);
-+ stats->iostat_queue_stamp = jiffies;
-+ stats->iostat_queue_depth = qdepth;
-+ spin_unlock_irqrestore (&stats->iostat_lock, flags);
-+
-+ return len;
-+}
-+
-+static struct file_operations sd_iostats_proc_fops = {
-+ .owner = THIS_MODULE,
-+ .open = sd_iostats_seq_open,
-+ .read = seq_read,
-+ .write = sd_iostats_seq_write,
-+ .llseek = seq_lseek,
-+ .release = seq_release,
-+};
-+
-+extern struct proc_dir_entry *proc_scsi;
-+
-+void
-+sd_iostats_init(void)
-+{
-+ if (proc_scsi == NULL) {
-+ printk(KERN_WARNING "No access to sd iostats: "
-+ "proc_scsi is NULL\n");
-+ return;
-+ }
-+
-+ sd_iostats_procdir = create_proc_entry(sd_iostats_procdir_name,
-+ S_IFDIR | S_IRUGO | S_IXUGO,
-+ proc_scsi);
-+ if (sd_iostats_procdir == NULL) {
-+ printk(KERN_WARNING "No access to sd iostats: "
-+ "can't create /proc/scsi/%s\n", sd_iostats_procdir_name);
-+ return;
-+ }
-+}
-+
-+void sd_iostats_fini(void)
-+{
-+ if (proc_scsi != NULL && sd_iostats_procdir != NULL)
-+ remove_proc_entry(sd_iostats_procdir_name, proc_scsi);
-+
-+ sd_iostats_procdir = NULL;
-+}
-+
-+void sd_iostats_finish_req(struct scsi_cmnd *SCpnt)
-+{
-+ struct request *rq = SCpnt->request;
-+ iostat_stats_t *stats;
-+ unsigned long *tcounter;
-+ int tbucket;
-+ int tmp;
-+ unsigned long irqflags;
-+ unsigned long i;
-+
-+ stats = scsi_disk(rq->rq_disk)->stats;
-+ if (stats == NULL)
-+ return;
-+
-+ tmp = jiffies - rq->start_time;
-+ for (tbucket = 0; tmp > 1; tbucket++)
-+ tmp >>= 1;
-+ if (tbucket >= IOSTAT_NCOUNTERS)
-+ tbucket = IOSTAT_NCOUNTERS - 1;
-+ //printk("%u ticks in D to %u\n", jiffies - rq->start_time, tbucket);
-+
-+ tcounter = rq_data_dir(rq) == WRITE ?
-+ &stats->iostat_wtime[tbucket] : &stats->iostat_rtime[tbucket];
-+
-+ spin_lock_irqsave(&stats->iostat_lock, irqflags);
-+
-+ /* update delay stats */
-+ (*tcounter)++;
-+
-+ /* update queue depth stats */
-+ i = stats->iostat_queue_depth;
-+ if (i >= IOSTAT_NCOUNTERS)
-+ i = IOSTAT_NCOUNTERS - 1;
-+ stats->iostat_queue_ticks[i] += jiffies - stats->iostat_queue_stamp;
-+ stats->iostat_queue_ticks_sum += jiffies - stats->iostat_queue_stamp;
-+ BUG_ON(stats->iostat_queue_depth == 0);
-+ stats->iostat_queue_depth--;
-+
-+ /* update seek stats. XXX: not sure about nr_sectors */
-+ stats->iostat_sectors += rq->nr_sectors;
-+ stats->iostat_reqs++;
-+ if (rq->sector != stats->iostat_next_sector) {
-+ stats->iostat_seek_sectors +=
-+ rq->sector > stats->iostat_next_sector ?
-+ rq->sector - stats->iostat_next_sector :
-+ stats->iostat_next_sector - rq->sector;
-+ stats->iostat_seeks++;
-+ }
-+ stats->iostat_next_sector = rq->sector + rq->nr_sectors;
-+
-+ stats->iostat_queue_stamp = jiffies;
-+
-+ spin_unlock_irqrestore(&stats->iostat_lock, irqflags);
-+}
-+
-+void sd_iostats_start_req(struct scsi_cmnd *SCpnt)
-+{
-+ struct request *rq = SCpnt->request;
-+ iostat_stats_t *stats;
-+ iostat_counter_t *counter;
-+ int bucket;
-+ int tbucket;
-+ int tmp;
-+ unsigned long irqflags;
-+ unsigned long i;
-+ int nsect;
-+
-+ stats = scsi_disk(rq->rq_disk)->stats;
-+ if (stats == NULL)
-+ return;
-+
-+ nsect = SCpnt->request_bufflen >> 9;
-+ for (bucket = 0, tmp = nsect; tmp > 1; bucket++)
-+ tmp >>= 1;
-+
-+ if (bucket >= IOSTAT_NCOUNTERS) {
-+ printk (KERN_ERR "sd_iostats_bump: nsect %d too big\n", nsect);
-+ BUG();
-+ }
-+
-+ counter = rq_data_dir(rq) == WRITE ?
-+ &stats->iostat_write_histogram[bucket] :
-+ &stats->iostat_read_histogram[bucket];
-+
-+ tmp = jiffies - rq->start_time;
-+ for (tbucket = 0; tmp > 1; tbucket++)
-+ tmp >>= 1;
-+ if (tbucket >= IOSTAT_NCOUNTERS)
-+ tbucket = IOSTAT_NCOUNTERS - 1;
-+ //printk("%u ticks in Q to %u\n", jiffies - rq->start_time, tbucket);
-+
-+ /* an ugly hack to know exact processing time. the right
-+ * solution is to add one more field to struct request
-+ * hopefully it will break nothing ... */
-+ rq->start_time = jiffies;
-+
-+ spin_lock_irqsave(&stats->iostat_lock, irqflags);
-+
-+ /* update queue depth stats */
-+ i = stats->iostat_queue_depth;
-+ if (i >= IOSTAT_NCOUNTERS)
-+ i = IOSTAT_NCOUNTERS - 1;
-+ stats->iostat_queue_ticks[i] += jiffies - stats->iostat_queue_stamp;
-+ stats->iostat_queue_ticks_sum += jiffies - stats->iostat_queue_stamp;
-+ stats->iostat_queue_depth++;
-+
-+ /* update delay stats */
-+ if (rq_data_dir(rq) == WRITE) {
-+ stats->iostat_wtime_in_queue[tbucket]++;
-+ stats->iostat_write_reqs++;
-+ } else {
-+ stats->iostat_rtime_in_queue[tbucket]++;
-+ stats->iostat_read_reqs++;
-+ }
-+
-+ /* update size stats */
-+ counter->iostat_size += nsect;
-+ counter->iostat_count++;
-+
-+ stats->iostat_queue_stamp = jiffies;
-+
-+ spin_unlock_irqrestore(&stats->iostat_lock, irqflags);
-+}
-+#endif
-+
- /**
- * scsi_disk_release - Called to free the scsi_disk structure
- * @cdev: pointer to embedded class device
-@@ -1774,10 +2227,16 @@
- idr_remove(&sd_index_idr, sdkp->index);
- spin_unlock(&sd_index_lock);
-
-+#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
-+ if (sdkp->stats) {
-+ remove_proc_entry(disk->disk_name, sd_iostats_procdir);
-+ kfree(sdkp->stats);
-+ sdkp->stats = NULL;
-+ }
-+#endif
- disk->private_data = NULL;
- put_disk(disk);
- put_device(&sdkp->device->sdev_gendev);
--
- kfree(sdkp);
- }
-
-@@ -1844,6 +2303,7 @@
- static int __init init_sd(void)
- {
- int majors = 0, i;
-+ int rc = 0;
-
- SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n"));
-
-@@ -1854,9 +2314,13 @@
- if (!majors)
- return -ENODEV;
-
-+ sd_iostats_init();
- class_register(&sd_disk_class);
-
-- return scsi_register_driver(&sd_template.gendrv);
-+ rc = scsi_register_driver(&sd_template.gendrv);
-+ if (rc)
-+ sd_iostats_fini();
-+ return rc;
- }
-
- /**
-@@ -1875,6 +2339,7 @@
- unregister_blkdev(sd_major(i), "sd");
-
- class_unregister(&sd_disk_class);
-+ sd_iostats_fini();
- }
-
- module_init(init_sd);
+++ /dev/null
-diff -pur linux-2.6.18-128.orig/fs/jbd/commit.c linux-2.6.18-128/fs/jbd/commit.c
---- linux-2.6.18-128.orig/fs/jbd/commit.c 2009-04-10 16:31:40.000000000 +0800
-+++ linux-2.6.18-128/fs/jbd/commit.c 2009-04-10 16:33:14.000000000 +0800
-@@ -862,7 +862,8 @@ wait_for_iobuf:
- if (err)
- __journal_abort_hard(journal);
- }
-- err = journal_wait_on_commit_record(cbh);
-+ if (!err && !is_journal_aborted(journal))
-+ err = journal_wait_on_commit_record(cbh);
-
- if (err)
- journal_abort(journal, err);
+++ /dev/null
-lustre_version.patch
-jbd-jcberr-2.6.18-vanilla.patch
-export_symbols-2.6.12.patch
-dev_read_only-2.6.18-vanilla.patch
-export-2.6.18-vanilla.patch
-sd_iostats-2.6-rhel5.patch
-export_symbol_numa-2.6-fc5.patch
-blkdev_tunables-2.6-rhel5.patch
-jbd-stats-2.6-rhel5.patch
-raid5-stats-rhel5.patch
-raid5-configurable-cachesize-rhel5.patch
-raid5-large-io-rhel5.patch
-raid5-stripe-by-stripe-handling-rhel5.patch
-raid5-merge-ios-rhel5.patch
-raid5-zerocopy-rhel5.patch
-raid5-maxsectors-rhel5.patch
-raid5-rebuild-corrupt-bug.patch
-md-rebuild-policy.patch
-jbd-journal-chksum-2.6.18-vanilla.patch
-quota-large-limits-rhel5.patch
-raid5-mmp-unplug-dev.patch
-small-fixes-about-jbd.patch
-mpt-fusion-max-sge.patch
-prune-icache-use-trylock-rhel5.patch
-jbd2-jcberr-2.6-rhel5.patch
-jbd2-commit-timer-no-jiffies-rounding.diff
-md-avoid-bug_on-when-bmc-overflow.patch
-jbd2_stats_proc_init-wrong-place.patch
-lustre_iser_max_sectors_tuning_lustre2.0.patch
-fix-forever-in-do_get_write_access.patch
SERIES VERSION COMMENT
SUPPORTED KERNELS:
-2.6-rhel5 RHEL5: 2.6.18-238.19.1.el5
2.6-rhel6 RHEL6: 2.6.32-279.14.1.el6
CLIENT SUPPORT FOR UNPATCHED KERNELS: