Fix several omissions in the 2.6.18 ext3 mballoc and extents patches.
Reorder patch hunks to match other patches to facilitate comparisons.
b=10090 ldiskfs corruption under memory pressure, coverity fixes
b=6191 mballoc fails on x86_64 > 2TB
b=10634 mballoc using wrong find_next_bit() on big endian systems
b=6449 don't print "too long searching" message to console
Compile tested.
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
-+ Opt_extents, Opt_extdebug,
++ Opt_extents, Opt_noextents, Opt_extdebug,
Opt_grpquota
};
-@@ -690,6 +694,8 @@ static match_table_t tokens = {
+@@ -690,6 +694,9 @@ static match_table_t tokens = {
{Opt_iopen, "iopen"},
{Opt_noiopen, "noiopen"},
{Opt_iopen_nopriv, "iopen_nopriv"},
+ {Opt_extents, "extents"},
++ {Opt_noextents, "noextents"},
+ {Opt_extdebug, "extdebug"},
{Opt_barrier, "barrier=%u"},
{Opt_err, NULL},
{Opt_resize, "resize"},
-@@ -1035,6 +1041,12 @@ clear_qf_name:
+@@ -1035,6 +1041,15 @@ clear_qf_name:
case Opt_bh:
clear_opt(sbi->s_mount_opt, NOBH);
break;
+ case Opt_extents:
+ set_opt (sbi->s_mount_opt, EXTENTS);
+ break;
++ case Opt_noextents:
++ clear_opt (sbi->s_mount_opt, EXTENTS);
++ break;
+ case Opt_extdebug:
+ set_opt (sbi->s_mount_opt, EXTDEBUG);
+ break;
+Index: linux-stage/include/linux/ext3_fs.h
+===================================================================
+--- linux-stage.orig/include/linux/ext3_fs.h 2006-07-16 02:29:43.000000000 +0800
++++ linux-stage/include/linux/ext3_fs.h 2006-07-16 02:29:49.000000000 +0800
+@@ -53,6 +53,14 @@
+ #define ext3_debug(f, a...) do {} while (0)
+ #endif
+
++#define EXT3_MULTIBLOCK_ALLOCATOR 1
++
++#define EXT3_MB_HINT_MERGE 1
++#define EXT3_MB_HINT_RESERVED 2
++#define EXT3_MB_HINT_METADATA 4
++#define EXT3_MB_HINT_FIRST 8
++#define EXT3_MB_HINT_BEST 16
++
+ /*
+ * Special inodes numbers
+ */
+@@ -379,6 +387,7 @@ struct ext3_inode {
+ #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */
+ #define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */
+ #define EXT3_MOUNT_EXTDEBUG 0x2000000/* Extents debug */
++#define EXT3_MOUNT_MBALLOC 0x4000000/* Buddy allocation support */
+
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef clear_opt
+@@ -405,6 +413,14 @@
+ #define ext3_find_first_zero_bit ext2_find_first_zero_bit
+ #define ext3_find_next_zero_bit ext2_find_next_zero_bit
+
++#ifndef ext2_find_next_le_bit
++#ifdef __LITTLE_ENDIAN
++#define ext2_find_next_le_bit(addr, size, off) find_next_bit((addr), (size), (off))
++#else
++#error "mballoc needs a patch for big-endian systems - CFS bug 10634"
++#endif /* __LITTLE_ENDIAN */
++#endif /* !ext2_find_next_le_bit */
++
+ /*
+ * Maximal mount counts between two filesystem checks
+ */
+@@ -749,12 +758,12 @@ ext3_group_first_block_no(struct super_b
+ /* balloc.c */
+ extern int ext3_bg_has_super(struct super_block *sb, int group);
+ extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
+-extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode,
++extern ext3_fsblk_t ext3_new_block_old(handle_t *handle, struct inode *inode,
+ ext3_fsblk_t goal, int *errp);
+ extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode,
+ ext3_fsblk_t goal, unsigned long *count, int *errp);
+ extern void ext3_free_blocks (handle_t *handle, struct inode *inode,
+- ext3_fsblk_t block, unsigned long count);
++ ext3_fsblk_t block, unsigned long count, int metadata);
+ extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb,
+ ext3_fsblk_t block, unsigned long count,
+ unsigned long *pdquot_freed_blocks);
+@@ -881,6 +890,21 @@ extern void ext3_extents_initialize_bloc
+ extern int ext3_ext_ioctl(struct inode *inode, struct file *filp,
+ unsigned int cmd, unsigned long arg);
+
++/* mballoc.c */
++extern long ext3_mb_stats;
++extern long ext3_mb_max_to_scan;
++extern int ext3_mb_init(struct super_block *sb, int needs_recovery);
++extern int ext3_mb_release(struct super_block *sb);
++extern ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
++ ext3_fsblk_t goal, int *errp);
++extern ext3_fsblk_t ext3_mb_new_blocks(handle_t *handle, struct inode *inode,
++ ext3_fsblk_t goal, int *len, int flags,
++ int *errp);
++extern int ext3_mb_reserve_blocks(struct super_block *sb, int);
++extern void ext3_mb_release_blocks(struct super_block *sb, int);
++int __init init_ext3_proc(void);
++void exit_ext3_proc(void);
++
+ #endif /* __KERNEL__ */
+
+ /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */
+Index: linux-stage/include/linux/ext3_fs_sb.h
+===================================================================
+--- linux-stage.orig/include/linux/ext3_fs_sb.h 2006-07-16 02:29:43.000000000 +0800
++++ linux-stage/include/linux/ext3_fs_sb.h 2006-07-16 02:29:49.000000000 +0800
+@@ -21,8 +21,14 @@
+ #include <linux/wait.h>
+ #include <linux/blockgroup_lock.h>
+ #include <linux/percpu_counter.h>
++#include <linux/list.h>
+ #endif
+ #include <linux/rbtree.h>
++#include <linux/proc_fs.h>
++
++struct ext3_buddy_group_blocks;
++struct ext3_mb_history;
++#define EXT3_BB_MAX_BLOCKS
+
+ /*
+ * third extended-fs super-block data in memory
+@@ -78,6 +84,43 @@ struct ext3_sb_info {
+ char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
+ int s_jquota_fmt; /* Format of quota to use */
+ #endif
++
++ /* for buddy allocator */
++ struct ext3_group_info ***s_group_info;
++ struct inode *s_buddy_cache;
++ long s_blocks_reserved;
++ spinlock_t s_reserve_lock;
++ struct list_head s_active_transaction;
++ struct list_head s_closed_transaction;
++ struct list_head s_committed_transaction;
++ spinlock_t s_md_lock;
++ tid_t s_last_transaction;
++ int s_mb_factor;
++ unsigned short *s_mb_offsets, *s_mb_maxs;
++ unsigned long s_stripe;
++
++ /* history to debug policy */
++ struct ext3_mb_history *s_mb_history;
++ int s_mb_history_cur;
++ int s_mb_history_max;
++ struct proc_dir_entry *s_mb_proc;
++ spinlock_t s_mb_history_lock;
++
++ /* stats for buddy allocator */
++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */
++ atomic_t s_bal_success; /* we found long enough chunks */
++ atomic_t s_bal_allocated; /* in blocks */
++ atomic_t s_bal_ex_scanned; /* total extents scanned */
++ atomic_t s_bal_goals; /* goal hits */
++ atomic_t s_bal_breaks; /* too long searches */
++ atomic_t s_bal_2orders; /* 2^order hits */
++ spinlock_t s_bal_lock;
++ unsigned long s_mb_buddies_generated;
++ unsigned long long s_mb_generation_time;
+ };
++
++#define EXT3_GROUP_INFO(sb, group) \
++ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \
++ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)]
+
+ #endif /* _LINUX_EXT3_FS_SB */
+Index: linux-stage/fs/ext3/super.c
+===================================================================
+--- linux-stage.orig/fs/ext3/super.c 2006-07-16 02:29:43.000000000 +0800
++++ linux-stage/fs/ext3/super.c 2006-07-16 02:29:49.000000000 +0800
+@@ -391,6 +391,7 @@ static void ext3_put_super (struct super
+ struct ext3_super_block *es = sbi->s_es;
+ int i;
+
++ ext3_mb_release(sb);
+ ext3_ext_release(sb);
+ ext3_xattr_put_super(sb);
+ journal_destroy(sbi->s_journal);
+@@ -642,6 +643,7 @@ enum {
+ Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+ Opt_extents, Opt_noextents, Opt_extdebug,
++ Opt_mballoc, Opt_nomballoc, Opt_stripe,
+ Opt_grpquota
+ };
+
+@@ -696,6 +697,9 @@ static match_table_t tokens = {
+ {Opt_extents, "extents"},
+ {Opt_noextents, "noextents"},
+ {Opt_extdebug, "extdebug"},
++ {Opt_mballoc, "mballoc"},
++ {Opt_nomballoc, "nomballoc"},
++ {Opt_stripe, "stripe=%u"},
+ {Opt_barrier, "barrier=%u"},
+ {Opt_err, NULL},
+ {Opt_resize, "resize"},
+@@ -1047,6 +1049,19 @@ clear_qf_name:
+ case Opt_extdebug:
+ set_opt (sbi->s_mount_opt, EXTDEBUG);
+ break;
++ case Opt_mballoc:
++ set_opt (sbi->s_mount_opt, MBALLOC);
++ break;
++ case Opt_nomballoc:
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ break;
++ case Opt_stripe:
++ if (match_int(&args[0], &option))
++ return 0;
++ if (option < 0)
++ return 0;
++ sbi->s_stripe = option;
++ break;
+ default:
+ printk (KERN_ERR
+ "EXT3-fs: Unrecognized mount option \"%s\" "
+@@ -1773,6 +1778,7 @@ static int ext3_fill_super (struct super
+ "writeback");
+
+ ext3_ext_init(sb);
++ ext3_mb_init(sb, needs_recovery);
+ lock_kernel();
+ return 0;
+
+@@ -2712,7 +2718,13 @@ static struct file_system_type ext3_fs_t
+
+ static int __init init_ext3_fs(void)
+ {
+- int err = init_ext3_xattr();
++ int err;
++
++ err = init_ext3_proc();
++ if (err)
++ return err;
++
++ err = init_ext3_xattr();
+ if (err)
+ return err;
+ err = init_inodecache();
+@@ -2734,6 +2746,7 @@ static void __exit exit_ext3_fs(void)
+ unregister_filesystem(&ext3_fs_type);
+ destroy_inodecache();
+ exit_ext3_xattr();
++ exit_ext3_proc();
+ }
+
+ int ext3_prep_san_write(struct inode *inode, long *blocks,
+Index: linux-stage/fs/ext3/extents.c
+===================================================================
+--- linux-stage.orig/fs/ext3/extents.c 2006-07-16 02:29:43.000000000 +0800
++++ linux-stage/fs/ext3/extents.c 2006-07-16 02:29:49.000000000 +0800
+@@ -771,7 +771,7 @@ cleanup:
+ for (i = 0; i < depth; i++) {
+ if (!ablocks[i])
+ continue;
+- ext3_free_blocks(handle, tree->inode, ablocks[i], 1);
++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1);
+ }
+ }
+ kfree(ablocks);
+@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st
+ path->p_idx->ei_leaf);
+ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
+ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
+- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1);
++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1);
+ return err;
+ }
+
+@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t
+ int needed = ext3_remove_blocks_credits(tree, ex, from, to);
+ handle_t *handle = ext3_journal_start(tree->inode, needed);
+ struct buffer_head *bh;
+- int i;
++ int i, metadata = 0;
+
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
++ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode))
++ metadata = 1;
+ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
+ /* tail removal */
+ unsigned long num, start;
+@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t
+ bh = sb_find_get_block(tree->inode->i_sb, start + i);
+ ext3_forget(handle, 0, tree->inode, bh, start + i);
+ }
+- ext3_free_blocks(handle, tree->inode, start, num);
++ ext3_free_blocks(handle, tree->inode, start, num, metadata);
+ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
+ printk("strange request: removal %lu-%lu from %u:%u\n",
+ from, to, ex->ee_block, ex->ee_len);
+Index: linux-stage/fs/ext3/inode.c
+===================================================================
+--- linux-stage.orig/fs/ext3/inode.c 2006-07-16 02:29:43.000000000 +0800
++++ linux-stage/fs/ext3/inode.c 2006-07-16 02:29:49.000000000 +0800
+@@ -562,7 +562,7 @@ static int ext3_alloc_blocks(handle_t *h
+ return ret;
+ failed_out:
+ for (i = 0; i <index; i++)
+- ext3_free_blocks(handle, inode, new_blocks[i], 1);
++ ext3_free_blocks(handle, inode, new_blocks[i], 1, 1);
+ return ret;
+ }
+
+@@ -661,9 +661,9 @@ failed:
+ ext3_journal_forget(handle, branch[i].bh);
+ }
+ for (i = 0; i <indirect_blks; i++)
+- ext3_free_blocks(handle, inode, new_blocks[i], 1);
++ ext3_free_blocks(handle, inode, new_blocks[i], 1, 1);
+
+- ext3_free_blocks(handle, inode, new_blocks[i], num);
++ ext3_free_blocks(handle, inode, new_blocks[i], num, 1);
+
+ return err;
+ }
+@@ -760,9 +760,9 @@ err_out:
+ for (i = 1; i <= num; i++) {
+ BUFFER_TRACE(where[i].bh, "call journal_forget");
+ ext3_journal_forget(handle, where[i].bh);
+- ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1);
++ ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1,1);
+ }
+- ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
++ ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 1);
+
+ return err;
+ }
+@@ -2007,7 +2007,7 @@ static void ext3_clear_blocks(handle_t *
+ }
+ }
+
+- ext3_free_blocks(handle, inode, block_to_free, count);
++ ext3_free_blocks(handle, inode, block_to_free, count, 1);
+ }
+
+ /**
+@@ -2180,7 +2180,7 @@ static void ext3_free_branches(handle_t
+ ext3_journal_test_restart(handle, inode);
+ }
+
+- ext3_free_blocks(handle, inode, nr, 1);
++ ext3_free_blocks(handle, inode, nr, 1, 1);
+
+ if (parent_bh) {
+ /*
+Index: linux-stage/fs/ext3/balloc.c
+===================================================================
+--- linux-stage.orig/fs/ext3/balloc.c 2006-07-16 02:29:43.000000000 +0800
++++ linux-stage/fs/ext3/balloc.c 2006-07-16 02:33:13.000000000 +0800
+@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_
+ *
+ * Return buffer_head on success or NULL in case of failure.
+ */
+-static struct buffer_head *
++struct buffer_head *
+ read_block_bitmap(struct super_block *sb, unsigned int block_group)
+ {
+ struct ext3_group_desc * desc;
+@@ -490,24 +490,6 @@ error_return:
+ return;
+ }
+
+-/* Free given blocks, update quota and i_blocks field */
+-void ext3_free_blocks(handle_t *handle, struct inode *inode,
+- ext3_fsblk_t block, unsigned long count)
+-{
+- struct super_block * sb;
+- unsigned long dquot_freed_blocks;
+-
+- sb = inode->i_sb;
+- if (!sb) {
+- printk ("ext3_free_blocks: nonexistent device");
+- return;
+- }
+- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
+- if (dquot_freed_blocks)
+- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+- return;
+-}
+-
+ /*
+ * For ext3 allocations, we must not reuse any blocks which are
+ * allocated in the bitmap buffer's "last committed data" copy. This
+@@ -1463,7 +1445,7 @@ out:
+ return 0;
+ }
+
+-ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
++ext3_fsblk_t ext3_new_block_old(handle_t *handle, struct inode *inode,
+ ext3_fsblk_t goal, int *errp)
+ {
+ unsigned long count = 1;
+Index: linux-stage/fs/ext3/xattr.c
+===================================================================
+--- linux-stage.orig/fs/ext3/xattr.c 2006-07-16 02:29:43.000000000 +0800
++++ linux-stage/fs/ext3/xattr.c 2006-07-16 02:29:49.000000000 +0800
+@@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl
+ ea_bdebug(bh, "refcount now=0; freeing");
+ if (ce)
+ mb_cache_entry_free(ce);
+- ext3_free_blocks(handle, inode, bh->b_blocknr, 1);
++ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
+ get_bh(bh);
+ ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
+ } else {
+@@ -805,7 +805,7 @@ inserted:
+ new_bh = sb_getblk(sb, block);
+ if (!new_bh) {
+ getblk_failed:
+- ext3_free_blocks(handle, inode, block, 1);
++ ext3_free_blocks(handle, inode, block, 1, 1);
+ error = -EIO;
+ goto cleanup;
+ }
Index: linux-stage/fs/ext3/mballoc.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-stage/fs/ext3/mballoc.c 2006-07-16 02:29:49.000000000 +0800
-@@ -0,0 +1,2434 @@
+@@ -0,0 +1,2727 @@
+/*
+ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+
+long ext3_mb_stats = 1;
+
++/*
++ * ffs(len) threshold from which power-of-two requests use 2^N buddy search
++ */
++long ext3_mb_order2_reqs = 8;
++
+#ifdef EXT3_BB_MAX_BLOCKS
+#undef EXT3_BB_MAX_BLOCKS
+#endif
+ __u8 ac_repeats;
+ __u8 ac_2order; /* if request is to allocate 2^N blocks and
+ * N > 0, the field stores N, otherwise 0 */
++
++ struct page *ac_buddy_page;
++ struct page *ac_bitmap_page;
+};
+
+#define AC_STATUS_CONTINUE 1
+struct ext3_mb_history {
+ struct ext3_free_extent goal; /* goal allocation */
+ struct ext3_free_extent result; /* result allocation */
++ unsigned pid;
++ unsigned ino;
+ __u16 found; /* how many extents have been found */
+ __u16 groups; /* how many groups have been scanned */
+ __u16 tail; /* what tail broke some buddy */
+#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy)
+
+#ifndef EXT3_MB_HISTORY
-+#define ext3_mb_store_history(sb,ac)
++#define ext3_mb_store_history(sb,ino,ac)
+#else
-+static void ext3_mb_store_history(struct super_block *,
++static void ext3_mb_store_history(struct super_block *, unsigned ino,
+ struct ext3_allocation_context *ac);
+#endif
+
+
+int ext3_create (struct inode *, struct dentry *, int, struct nameidata *);
+struct buffer_head * read_block_bitmap(struct super_block *, unsigned int);
-+int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *);
+int ext3_mb_reserve_blocks(struct super_block *, int);
+void ext3_mb_release_blocks(struct super_block *, int);
+void ext3_mb_poll_new_transaction(struct super_block *, handle_t *);
+
+static void
+ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap,
-+ struct ext3_group_info *grp)
++ int group)
+{
++ struct ext3_group_info *grp = EXT3_GROUP_INFO(sb, group);
+ unsigned short max = EXT3_BLOCKS_PER_GROUP(sb);
+ unsigned short i = 0, first, len;
+ unsigned free = 0, fragments = 0;
+ while (i < max) {
+ fragments++;
+ first = i;
-+ i = find_next_bit(bitmap, max, i);
++ i = ext2_find_next_le_bit(bitmap, max, i);
+ len = i - first;
+ free += len;
+ if (len > 1)
+ * others waits for init completion on page lock */
+ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state);
+ if (free != grp->bb_free) {
-+ printk("EXT3-fs: %u blocks in bitmap, %u in group descriptor\n",
-+ free, grp->bb_free);
++ printk("EXT3-fs: group %u: %u blocks in bitmap, %u in gd\n",
++ group, free, grp->bb_free);
+ grp->bb_free = free;
+ }
+
+ for (i = 0; i < groups_per_page && bh[i]; i++)
+ wait_on_buffer(bh[i]);
+
-+ /* XXX: I/O error handling here */
++ err = -EIO;
++ for (i = 0; i < groups_per_page && bh[i]; i++)
++ if (!buffer_uptodate(bh[i]))
++ goto out;
+
+ first_block = page->index * blocks_per_page;
+ for (i = 0; i < blocks_per_page; i++) {
+ mb_debug("put buddy for group %u in page %lu/%x\n",
+ group, page->index, i * blocksize);
+ memset(data, 0xff, blocksize);
-+ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0;
-+ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0,
++ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0;
++ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0,
+ sizeof(unsigned short)*(sb->s_blocksize_bits+2));
-+ ext3_mb_generate_buddy(sb, data, bitmap,
-+ EXT3_SB(sb)->s_group_info[group]);
++ ext3_mb_generate_buddy(sb, data, bitmap, group);
+ } else {
+ /* this is block of bitmap */
+ mb_debug("put bitmap for group %u in page %lu/%x\n",
+ SetPageUptodate(page);
+
+out:
-+ for (i = 0; i < groups_per_page && bh[i]; i++)
-+ brelse(bh[i]);
-+ if (bh && bh != &bhs)
-+ kfree(bh);
++ if (bh) {
++ for (i = 0; i < groups_per_page && bh[i]; i++)
++ brelse(bh[i]);
++ if (bh != &bhs)
++ kfree(bh);
++ }
+ return err;
+}
+
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+
+ e3b->bd_blkbits = sb->s_blocksize_bits;
-+ e3b->bd_info = sbi->s_group_info[group];
++ e3b->bd_info = EXT3_GROUP_INFO(sb, group);
+ e3b->bd_sb = sb;
+ e3b->bd_group = group;
+ e3b->bd_buddy_page = NULL;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+
++ /* we could use find_or_create_page(), but it locks the page,
++ * which we'd like to avoid in the fast path ... */
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page == NULL || !PageUptodate(page)) {
+ if (page)
+ page_cache_release(page);
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page) {
++ BUG_ON(page->mapping != inode->i_mapping);
+ if (!PageUptodate(page))
+ ext3_mb_init_cache(page);
+ unlock_page(page);
+ page_cache_release(page);
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page) {
++ BUG_ON(page->mapping != inode->i_mapping);
+ if (!PageUptodate(page))
+ ext3_mb_init_cache(page);
+ unlock_page(page);
+ext3_lock_group(struct super_block *sb, int group)
+{
+ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT,
-+ &EXT3_SB(sb)->s_group_info[group]->bb_state);
++ &EXT3_GROUP_INFO(sb, group)->bb_state);
+}
+
+static inline void
+ext3_unlock_group(struct super_block *sb, int group)
+{
+ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT,
-+ &EXT3_SB(sb)->s_group_info[group]->bb_state);
++ &EXT3_GROUP_INFO(sb, group)->bb_state);
+}
+
+static int mb_find_order_for_block(struct ext3_buddy *e3b, int block)
+static int mb_find_extent(struct ext3_buddy *e3b, int order, int block,
+ int needed, struct ext3_free_extent *ex)
+{
-+ int next, max, ord;
++ int next = block, max, ord;
+ void *buddy;
+
+ J_ASSERT(ex != NULL);
+ ex->fe_start = block << order;
+ ex->fe_group = e3b->bd_group;
+
++ /* calc difference from given start */
++ next = next - ex->fe_start;
++ ex->fe_len -= next;
++ ex->fe_start += next;
++
+ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) {
+
+ if (block + 1 >= max)
+ ac->ac_status = AC_STATUS_FOUND;
+ ac->ac_tail = ret & 0xffff;
+ ac->ac_buddy = ret >> 16;
++
++ /* hold in-core structures until allocated
++ * blocks are marked non-free in on-disk bitmap */
++ ac->ac_buddy_page = e3b->bd_buddy_page;
++ page_cache_get(e3b->bd_buddy_page);
++ ac->ac_bitmap_page = e3b->bd_bitmap_page;
++ page_cache_get(e3b->bd_bitmap_page);
+}
+
+/*
+ }
+
+ /*
-+ * Let's check whether the chuck is good enough
++ * Let's check whether the chunk is good enough
+ */
+ if (ex->fe_len == gex->fe_len) {
+ *bex = *ex;
+ struct ext3_buddy *e3b)
+{
+ int group = ac->ac_g_ex.fe_group, max, err;
++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb);
++ struct ext3_super_block *es = sbi->s_es;
+ struct ext3_free_extent ex;
+
+ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b);
+ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start,
+ ac->ac_g_ex.fe_len, &ex);
+
-+ if (max > 0) {
++ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
++ unsigned long start;
++ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) +
++ ex.fe_start + le32_to_cpu(es->s_first_data_block));
++ if (start % sbi->s_stripe == 0) {
++ ac->ac_found++;
++ ac->ac_b_ex = ex;
++ ext3_mb_use_best_found(ac, e3b);
++ }
++ } else if (max >= ac->ac_g_ex.fe_len) {
++ J_ASSERT(ex.fe_len > 0);
++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group);
++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start);
++ ac->ac_found++;
++ ac->ac_b_ex = ex;
++ ext3_mb_use_best_found(ac, e3b);
++ } else if (max > 0 && (ac->ac_flags & EXT3_MB_HINT_MERGE)) {
++ /* Sometimes, caller may want to merge even small
++ * number of blocks to an existing extent */
+ J_ASSERT(ex.fe_len > 0);
+ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group);
+ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start);
+ int i, k, max;
+
+ J_ASSERT(ac->ac_2order > 0);
-+ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) {
++ for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
+ if (grp->bb_counters[i] == 0)
+ continue;
+
+ }
+}
+
++/*
++ * This is a special case for storage such as RAID5:
++ * we try to find stripe-aligned chunks for stripe-size requests
++ */
++static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
++{
++ struct super_block *sb = ac->ac_sb;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ void *bitmap = EXT3_MB_BITMAP(e3b);
++ struct ext3_free_extent ex;
++ unsigned long i, max;
++
++ J_ASSERT(sbi->s_stripe != 0); /* s_stripe is used as a divisor below */
++
++ /* find first stripe-aligned block */
++ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb)
++ + le32_to_cpu(sbi->s_es->s_first_data_block); /* first block number of this group */
++ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe; /* round up to a stripe multiple */
++ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block))
++ % EXT3_BLOCKS_PER_GROUP(sb); /* convert back to an offset inside the group */
++
++ while (i < sb->s_blocksize * 8) { /* s_blocksize * 8 == number of bits in one bitmap block */
++ if (!mb_test_bit(i, bitmap)) {
++ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex);
++ if (max >= sbi->s_stripe) { /* a whole stripe fits at this aligned offset */
++ ac->ac_found++;
++ ac->ac_b_ex = ex;
++ ext3_mb_use_best_found(ac, e3b);
++ break; /* stop at the first aligned extent that is large enough */
++ }
++ }
++ i += sbi->s_stripe; /* step to the next stripe-aligned offset */
++ }
++}
++
+static int ext3_mb_good_group(struct ext3_allocation_context *ac,
+ int group, int cr)
+{
-+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb);
-+ struct ext3_group_info *grp = sbi->s_group_info[group];
++ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group);
+ unsigned free, fragments, i, bits;
+
+ J_ASSERT(cr >= 0 && cr < 4);
+ case 0:
+ J_ASSERT(ac->ac_2order != 0);
+ bits = ac->ac_sb->s_blocksize_bits + 1;
-+ for (i = ac->ac_2order; i < bits; i++)
++ for (i = ac->ac_2order; i <= bits; i++)
+ if (grp->bb_counters[i] > 0)
+ return 1;
++ break;
+ case 1:
+ if ((free / fragments) >= ac->ac_g_ex.fe_len)
+ return 1;
++ break;
+ case 2:
+ if (free >= ac->ac_g_ex.fe_len)
+ return 1;
++ break;
+ case 3:
+ return 1;
+ default:
+ }
+ }
+
++ ac.ac_buddy_page = NULL;
++ ac.ac_bitmap_page = NULL;
++
+ /*
+ * Check quota for allocation of this blocks.
+ */
+ ac.ac_2order = 0;
+ ac.ac_criteria = 0;
+
++ if (*len == 1 && sbi->s_stripe) {
++ /* looks like metadata; use a dirty hack for raid5:
++ * move all metadata to the first groups in the hope of hitting
++ * cached sectors and thus avoiding read-modify cycles in raid5 */
++ ac.ac_g_ex.fe_group = group = 0;
++ }
++
+ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */
+ i = ffs(*len);
-+ if (i >= 8) {
++ if (i >= ext3_mb_order2_reqs) {
+ i--;
+ if ((*len & (~(1 << i))) == 0)
+ ac.ac_2order = i;
+ }
+
-+ /* Sometimes, caller may want to merge even small
-+ * number of blocks to an existing extent */
-+ if (ac.ac_flags & EXT3_MB_HINT_MERGE) {
-+ err = ext3_mb_find_by_goal(&ac, &e3b);
-+ if (err)
-+ goto out_err;
-+ if (ac.ac_status == AC_STATUS_FOUND)
-+ goto found;
-+ }
++ /* first, try the goal */
++ err = ext3_mb_find_by_goal(&ac, &e3b);
++ if (err)
++ goto out_err;
++ if (ac.ac_status == AC_STATUS_FOUND)
++ goto found;
+
+ /* Let's just scan groups to find more-less suitable blocks */
+ cr = ac.ac_2order ? 0 : 1;
+ if (group == EXT3_SB(sb)->s_groups_count)
+ group = 0;
+
-+ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) {
++ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) {
+ /* we need full data about the group
+ * to make a good selection */
+ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
+ ac.ac_groups_scanned++;
+ if (cr == 0)
+ ext3_mb_simple_scan_group(&ac, &e3b);
++ else if (cr == 1 && *len == sbi->s_stripe)
++ ext3_mb_scan_aligned(&ac, &e3b);
+ else
+ ext3_mb_complex_scan_group(&ac, &e3b);
+
+
+ ext3_mb_release_desc(&e3b);
+
-+ if (err)
-+ goto out_err;
+ if (ac.ac_status != AC_STATUS_CONTINUE)
+ break;
+ }
+ */
+
+ /*if (ac.ac_found > ext3_mb_max_to_scan)
-+ printk(KERN_ERR "EXT3-fs: too long searching at "
++ printk(KERN_DEBUG "EXT3-fs: too long searching at "
+ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len,
+ ac.ac_g_ex.fe_len);*/
+ ext3_mb_try_best_found(&ac, &e3b);
+ * Someone more lucky has already allocated it.
+ * The only thing we can do is just take first
+ * found block(s)
-+ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n");
++ printk(KERN_DEBUG "EXT3-fs: someone won our chunk\n");
+ */
+ ac.ac_b_ex.fe_group = 0;
+ ac.ac_b_ex.fe_start = 0;
+ *errp = -ENOSPC;
+ block = 0;
+#if 1
-+ printk(KERN_ERR "EXT3-fs: cant allocate: status %d, flags %d\n",
++ printk(KERN_ERR "EXT3-fs: can't allocate: status %d flags %d\n",
+ ac.ac_status, ac.ac_flags);
-+ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n",
++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d cr %d\n",
+ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group,
+ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr);
+ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n",
+ sbi->s_blocks_reserved, ac.ac_found);
+ printk("EXT3-fs: groups: ");
+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
-+ printk("%d: %d ", i,
-+ sbi->s_group_info[i]->bb_free);
++ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free);
+ printk("\n");
+#endif
+ goto out;
+ *errp = err;
+ block = 0;
+out:
++ if (ac.ac_buddy_page)
++ page_cache_release(ac.ac_buddy_page);
++ if (ac.ac_bitmap_page)
++ page_cache_release(ac.ac_bitmap_page);
++
+ if (!(flags & EXT3_MB_HINT_RESERVED)) {
+ /* block wasn't reserved before and we reserved it
+ * at the beginning of allocation. it doesn't matter
+ atomic_inc(&sbi->s_bal_breaks);
+ }
+
-+ ext3_mb_store_history(sb, &ac);
++ ext3_mb_store_history(sb, inode->i_ino, &ac);
+
+ return block;
+}
+ char buf[20], buf2[20];
+
+ if (v == SEQ_START_TOKEN) {
-+ seq_printf(seq, "%-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n",
-+ "goal", "result", "found", "grps", "cr", "merge",
-+ "tail", "broken");
++ seq_printf(seq, "%-5s %-8s %-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n",
++ "pid", "inode", "goal", "result", "found", "grps", "cr",
++ "merge", "tail", "broken");
+ return 0;
+ }
+
+ hs->goal.fe_start, hs->goal.fe_len);
+ sprintf(buf2, "%u/%u/%u", hs->result.fe_group,
+ hs->result.fe_start, hs->result.fe_len);
-+ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf,
-+ buf2, hs->found, hs->groups, hs->cr,
-+ hs->merged ? "M" : "", hs->tail,
++ seq_printf(seq, "%-5u %-8u %-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n",
++ hs->pid, hs->ino, buf, buf2, hs->found, hs->groups,
++ hs->cr, hs->merged ? "M" : "", hs->tail,
+ hs->buddy ? 1 << hs->buddy : 0);
+ return 0;
+}
+ .release = ext3_mb_seq_history_release,
+};
+
++static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
++{ /* seq_file .start: turn *pos into a cookie identifying one block group */
++ struct super_block *sb = seq->private;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ long group;
++
++ if (*pos < 0 || *pos >= sbi->s_groups_count)
++ return NULL; /* position is outside the range of existing groups */
++
++ group = *pos + 1; /* offset by one so that group 0 is not encoded as a NULL pointer */
++ return (void *) group; /* ext3_mb_seq_groups_show() undoes the offset */
++}
++
++static void *ext3_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
++{ /* seq_file .next: advance *pos and return the cookie for that group */
++ struct super_block *sb = seq->private;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ long group;
++
++ ++*pos;
++ if (*pos < 0 || *pos >= sbi->s_groups_count)
++ return NULL; /* walked past the last group */
++ group = *pos + 1; /* same +1 encoding as ext3_mb_seq_groups_start() */
++ return (void *) group;
++}
++
++static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v)
++{ /* seq_file .show: print one line of buddy statistics for the group encoded in v */
++ struct super_block *sb = seq->private;
++ long group = (long) v, i;
++ struct sg {
++ struct ext3_group_info info;
++ unsigned short counters[16]; /* NOTE(review): presumably backs the trailing info.bb_counters[] - confirm */
++ } sg;
++
++ group--; /* undo the +1 applied by the start/next callbacks */
++ if (group == 0) /* column header is emitted once, before the first group */
++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s [ %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
++ "group", "free", "frags", "first", "2^0", "2^1", "2^2",
++ "2^3", "2^4", "2^5", "2^6", "2^7", "2^8", "2^9", "2^10",
++ "2^11", "2^12", "2^13");
++
++ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
++ sizeof(struct ext3_group_info); /* bytes to copy: the struct plus s_blocksize_bits + 2 counters */
++ ext3_lock_group(sb, group);
++ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i); /* take the snapshot while the group lock is held */
++ ext3_unlock_group(sb, group);
++
++ if (EXT3_MB_GRP_NEED_INIT(&sg.info))
++ return 0; /* nothing is printed for a group still flagged NEED_INIT */
++
++ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free,
++ sg.info.bb_fragments, sg.info.bb_first_free);
++ for (i = 0; i <= 13; i++) /* 14 columns, matching the 2^0..2^13 header */
++ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
++ sg.info.bb_counters[i] : 0); /* orders above s_blocksize_bits + 1 are printed as 0 */
++ seq_printf(seq, " ]\n");
++
++ return 0;
++}
++
++static void ext3_mb_seq_groups_stop(struct seq_file *seq, void *v)
++{ /* seq_file .stop: intentionally empty, the start/next callbacks above acquire nothing */
++}
++
++static struct seq_operations ext3_mb_seq_groups_ops = { /* iterator callbacks passed to seq_open() for "mb_groups" */
++ .start = ext3_mb_seq_groups_start,
++ .next = ext3_mb_seq_groups_next,
++ .stop = ext3_mb_seq_groups_stop,
++ .show = ext3_mb_seq_groups_show,
++};
++
++static int ext3_mb_seq_groups_open(struct inode *inode, struct file *file)
++{ /* open handler for "mb_groups": set up the seq_file and attach the superblock to it */
++ struct super_block *sb = PDE(inode)->data; /* ->data is set to sb when the proc entry is created */
++ int rc;
++
++ rc = seq_open(file, &ext3_mb_seq_groups_ops);
++ if (rc == 0) {
++ struct seq_file *m = (struct seq_file *)file->private_data;
++ m->private = sb; /* read back as seq->private by the iterator callbacks */
++ }
++ return rc; /* result of seq_open() is returned unchanged */
++
++}
++
++static struct file_operations ext3_mb_seq_groups_fops = { /* installed as proc_fops of the "mb_groups" entry */
++ .owner = THIS_MODULE,
++ .open = ext3_mb_seq_groups_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = seq_release,
++};
++
+static void ext3_mb_history_release(struct super_block *sb)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+ char name[64];
+
+ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name));
++ remove_proc_entry("mb_groups", sbi->s_mb_proc);
+ remove_proc_entry("mb_history", sbi->s_mb_proc);
+ remove_proc_entry(name, proc_root_ext3);
+
+ p->proc_fops = &ext3_mb_seq_history_fops;
+ p->data = sb;
+ }
++ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc);
++ if (p) {
++ p->proc_fops = &ext3_mb_seq_groups_fops;
++ p->data = sb;
++ }
+ }
+
+ sbi->s_mb_history_max = 1000;
+}
+
+static void
-+ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac)
++ext3_mb_store_history(struct super_block *sb, unsigned ino,
++ struct ext3_allocation_context *ac)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+ struct ext3_mb_history h;
+ if (likely(sbi->s_mb_history == NULL))
+ return;
+
++ h.pid = current->pid;
++ h.ino = ino;
+ h.goal = ac->ac_g_ex;
+ h.result = ac->ac_b_ex;
+ h.found = ac->ac_found;
+int ext3_mb_init_backend(struct super_block *sb)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int i, len;
-+
-+ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count;
-+ sbi->s_group_info = kmalloc(len, GFP_KERNEL);
++ int i, j, len, metalen;
++ int num_meta_group_infos =
++ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >>
++ EXT3_DESC_PER_BLOCK_BITS(sb);
++ struct ext3_group_info **meta_group_info;
++
++ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
++ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
++ * So a two level scheme suffices for now. */
++ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
++ num_meta_group_infos, GFP_KERNEL);
+ if (sbi->s_group_info == NULL) {
-+ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n");
++ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n");
+ return -ENOMEM;
+ }
-+ memset(sbi->s_group_info, 0, len);
-+
+ sbi->s_buddy_cache = new_inode(sb);
+ if (sbi->s_buddy_cache == NULL) {
+ printk(KERN_ERR "EXT3-fs: can't get new inode\n");
-+ kfree(sbi->s_group_info);
-+ return -ENOMEM;
++ goto err_freesgi;
++ }
++
++ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb);
++ for (i = 0; i < num_meta_group_infos; i++) {
++ if ((i + 1) == num_meta_group_infos)
++ metalen = sizeof(*meta_group_info) *
++ (sbi->s_groups_count -
++ (i << EXT3_DESC_PER_BLOCK_BITS(sb)));
++ meta_group_info = kmalloc(metalen, GFP_KERNEL);
++ if (meta_group_info == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't allocate mem for a "
++ "buddy group\n");
++ goto err_freemeta;
++ }
++ sbi->s_group_info[i] = meta_group_info;
+ }
+
+ /*
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ struct ext3_group_desc * desc;
+
-+ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL);
-+ if (sbi->s_group_info[i] == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n");
-+ goto err_out;
++ meta_group_info =
++ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)];
++ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1);
++
++ meta_group_info[j] = kmalloc(len, GFP_KERNEL);
++ if (meta_group_info[j] == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n");
++ i--;
++ goto err_freebuddy;
+ }
+ desc = ext3_get_group_desc(sb, i, NULL);
+ if (desc == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant read descriptor %u\n", i);
-+ goto err_out;
++ printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i);
++ goto err_freebuddy;
+ }
-+ memset(sbi->s_group_info[i], 0, len);
++ memset(meta_group_info[j], 0, len);
+ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT,
-+ &sbi->s_group_info[i]->bb_state);
-+ sbi->s_group_info[i]->bb_free =
++ &meta_group_info[j]->bb_state);
++ meta_group_info[j]->bb_free =
+ le16_to_cpu(desc->bg_free_blocks_count);
+ }
+
+ return 0;
+
-+err_out:
++err_freebuddy:
++ while (i >= 0) {
++ kfree(EXT3_GROUP_INFO(sb, i));
++ i--;
++ }
++ i = num_meta_group_infos;
++err_freemeta:
+ while (--i >= 0)
+ kfree(sbi->s_group_info[i]);
+ iput(sbi->s_buddy_cache);
-+
++err_freesgi:
++ kfree(sbi->s_group_info);
+ return -ENOMEM;
+}
+
+ max = max >> 1;
+ i++;
+ } while (i <= sb->s_blocksize_bits + 1);
-+
+
+ /* init file for buddy data */
+ if ((i = ext3_mb_init_backend(sb))) {
+int ext3_mb_release(struct super_block *sb)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int i;
++ int i, num_meta_group_infos;
+
+ if (!test_opt(sb, MBALLOC))
+ return 0;
+ ext3_mb_free_committed_blocks(sb);
+
+ if (sbi->s_group_info) {
-+ for (i = 0; i < sbi->s_groups_count; i++) {
-+ if (sbi->s_group_info[i] == NULL)
-+ continue;
++ for (i = 0; i < sbi->s_groups_count; i++)
++ kfree(EXT3_GROUP_INFO(sb, i));
++ num_meta_group_infos = (sbi->s_groups_count +
++ EXT3_DESC_PER_BLOCK(sb) - 1) >>
++ EXT3_DESC_PER_BLOCK_BITS(sb);
++ for (i = 0; i < num_meta_group_infos; i++)
+ kfree(sbi->s_group_info[i]);
-+ }
+ kfree(sbi->s_group_info);
+ }
+ if (sbi->s_mb_offsets)
+ md->num, md->group, md);
+
+ err = ext3_mb_load_buddy(sb, md->group, &e3b);
++ /* we expect to find existing buddy because it's pinned */
+ BUG_ON(err != 0);
+
+ /* there are blocks to put in buddy to make them really free */
+ return ret;
+}
+
-+
+void ext3_free_blocks(handle_t *handle, struct inode * inode,
+ unsigned long block, unsigned long count, int metadata)
+{
+ int freed;
+
+ sb = inode->i_sb;
-+ if (!test_opt(sb, MBALLOC))
++ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info)
+ ext3_free_blocks_sb(handle, sb, block, count, &freed);
+ else
+ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
+#define EXT3_MB_STATS_NAME "mb_stats"
+#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan"
+#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan"
++#define EXT3_MB_ORDER2_REQ "mb_order2_req"
+
+static int ext3_mb_stats_read(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+ char str[32];
+
+ if (count >= sizeof(str)) {
-+ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n",
+ EXT3_MB_STATS_NAME, (int)sizeof(str));
+ return -EOVERFLOW;
+ }
+ long value;
+
+ if (count >= sizeof(str)) {
-+ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n",
+ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str));
+ return -EOVERFLOW;
+ }
+ long value;
+
+ if (count >= sizeof(str)) {
-+ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n",
+ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str));
+ return -EOVERFLOW;
+ }
+ return count;
+}
+
-+int __init init_ext3_proc(void)
++static int ext3_mb_order2_req_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
+{
-+ struct proc_dir_entry *proc_ext3_mb_stats;
-+ struct proc_dir_entry *proc_ext3_mb_max_to_scan;
-+ struct proc_dir_entry *proc_ext3_mb_min_to_scan;
-+
-+ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs);
-+ if (proc_root_ext3 == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT);
-+ return -EIO;
-+ }
-+
-+ /* Initialize EXT3_MB_STATS_NAME */
-+ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME,
-+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
-+ if (proc_ext3_mb_stats == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n",
-+ EXT3_MB_STATS_NAME);
-+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
-+ return -EIO;
-+ }
++ int len;
+
-+ proc_ext3_mb_stats->data = NULL;
-+ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read;
-+ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write;
++ *eof = 1;
++ if (off != 0)
++ return 0;
+
-+ /* Initialize EXT3_MAX_TO_SCAN_NAME */
-+ proc_ext3_mb_max_to_scan = create_proc_entry(
-+ EXT3_MB_MAX_TO_SCAN_NAME,
-+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
-+ if (proc_ext3_mb_max_to_scan == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n",
-+ EXT3_MB_MAX_TO_SCAN_NAME);
-+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
-+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
-+ return -EIO;
-+ }
++ len = sprintf(page, "%ld\n", ext3_mb_order2_reqs);
++ *start = page;
++ return len;
++}
+
-+ proc_ext3_mb_max_to_scan->data = NULL;
-+ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read;
-+ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write;
++static int ext3_mb_order2_req_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
++{
++ char str[32] = { 0 }; /* zeroed so the copied text is always NUL-terminated for simple_strtol() */
++ long value;
+
-+ /* Initialize EXT3_MIN_TO_SCAN_NAME */
-+ proc_ext3_mb_min_to_scan = create_proc_entry(
-+ EXT3_MB_MIN_TO_SCAN_NAME,
-+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
-+ if (proc_ext3_mb_min_to_scan == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n",
-+ EXT3_MB_MIN_TO_SCAN_NAME);
-+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
-+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
-+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
-+ return -EIO;
++ if (count >= sizeof(str)) { /* keep at least one byte free for the terminating NUL */
++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n",
++ EXT3_MB_ORDER2_REQ, (int)sizeof(str));
++ return -EOVERFLOW;
+ }
+
-+ proc_ext3_mb_min_to_scan->data = NULL;
-+ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read;
-+ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write;
-+
-+ return 0;
-+}
-+
-+void exit_ext3_proc(void)
-+{
-+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
-+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
-+ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3);
-+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
-+}
++ if (copy_from_user(str, buffer, count))
++ return -EFAULT;
+
-Index: linux-stage/fs/ext3/extents.c
-===================================================================
---- linux-stage.orig/fs/ext3/extents.c 2006-07-16 02:29:43.000000000 +0800
-+++ linux-stage/fs/ext3/extents.c 2006-07-16 02:29:49.000000000 +0800
-@@ -771,7 +771,7 @@ cleanup:
- for (i = 0; i < depth; i++) {
- if (!ablocks[i])
- continue;
-- ext3_free_blocks(handle, tree->inode, ablocks[i], 1);
-+ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1);
- }
- }
- kfree(ablocks);
-@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st
- path->p_idx->ei_leaf);
- bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
- ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
-- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1);
-+ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1);
- return err;
- }
-
-@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t
- int needed = ext3_remove_blocks_credits(tree, ex, from, to);
- handle_t *handle = ext3_journal_start(tree->inode, needed);
- struct buffer_head *bh;
-- int i;
-+ int i, metadata = 0;
-
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-+ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode))
-+ metadata = 1;
- if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
- /* tail removal */
- unsigned long num, start;
-@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t
- bh = sb_find_get_block(tree->inode->i_sb, start + i);
- ext3_forget(handle, 0, tree->inode, bh, start + i);
- }
-- ext3_free_blocks(handle, tree->inode, start, num);
-+ ext3_free_blocks(handle, tree->inode, start, num, metadata);
- } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
- printk("strange request: removal %lu-%lu from %u:%u\n",
- from, to, ex->ee_block, ex->ee_len);
-Index: linux-stage/fs/ext3/xattr.c
-===================================================================
---- linux-stage.orig/fs/ext3/xattr.c 2006-07-16 02:29:43.000000000 +0800
-+++ linux-stage/fs/ext3/xattr.c 2006-07-16 02:29:49.000000000 +0800
-@@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl
- ea_bdebug(bh, "refcount now=0; freeing");
- if (ce)
- mb_cache_entry_free(ce);
-- ext3_free_blocks(handle, inode, bh->b_blocknr, 1);
-+ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
- get_bh(bh);
- ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
- } else {
-@@ -805,7 +805,7 @@ inserted:
- new_bh = sb_getblk(sb, block);
- if (!new_bh) {
- getblk_failed:
-- ext3_free_blocks(handle, inode, block, 1);
-+ ext3_free_blocks(handle, inode, block, 1, 1);
- error = -EIO;
- goto cleanup;
- }
-Index: linux-stage/fs/ext3/balloc.c
-===================================================================
---- linux-stage.orig/fs/ext3/balloc.c 2006-07-16 02:29:43.000000000 +0800
-+++ linux-stage/fs/ext3/balloc.c 2006-07-16 02:33:13.000000000 +0800
-@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_
- *
- * Return buffer_head on success or NULL in case of failure.
- */
--static struct buffer_head *
-+struct buffer_head *
- read_block_bitmap(struct super_block *sb, unsigned int block_group)
- {
- struct ext3_group_desc * desc;
-@@ -490,24 +490,6 @@ error_return:
- return;
- }
-
--/* Free given blocks, update quota and i_blocks field */
--void ext3_free_blocks(handle_t *handle, struct inode *inode,
-- ext3_fsblk_t block, unsigned long count)
--{
-- struct super_block * sb;
-- unsigned long dquot_freed_blocks;
--
-- sb = inode->i_sb;
-- if (!sb) {
-- printk ("ext3_free_blocks: nonexistent device");
-- return;
-- }
-- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
-- if (dquot_freed_blocks)
-- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
-- return;
--}
--
- /*
- * For ext3 allocations, we must not reuse any blocks which are
- * allocated in the bitmap buffer's "last committed data" copy. This
-@@ -1463,7 +1445,7 @@ out:
- return 0;
- }
-
--ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
-+ext3_fsblk_t ext3_new_block_old(handle_t *handle, struct inode *inode,
- ext3_fsblk_t goal, int *errp)
- {
- unsigned long count = 1;
-Index: linux-stage/fs/ext3/super.c
-===================================================================
---- linux-stage.orig/fs/ext3/super.c 2006-07-16 02:29:43.000000000 +0800
-+++ linux-stage/fs/ext3/super.c 2006-07-16 02:29:49.000000000 +0800
-@@ -391,6 +391,7 @@ static void ext3_put_super (struct super
- struct ext3_super_block *es = sbi->s_es;
- int i;
-
-+ ext3_mb_release(sb);
- ext3_ext_release(sb);
- ext3_xattr_put_super(sb);
- journal_destroy(sbi->s_journal);
-@@ -641,7 +642,7 @@ enum {
- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
- Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
- Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
-- Opt_extents, Opt_extdebug,
-+ Opt_extents, Opt_extdebug, Opt_mballoc,
- Opt_grpquota
- };
-
-@@ -696,6 +697,7 @@ static match_table_t tokens = {
- {Opt_iopen_nopriv, "iopen_nopriv"},
- {Opt_extents, "extents"},
- {Opt_extdebug, "extdebug"},
-+ {Opt_mballoc, "mballoc"},
- {Opt_barrier, "barrier=%u"},
- {Opt_err, NULL},
- {Opt_resize, "resize"},
-@@ -1047,6 +1049,9 @@ clear_qf_name:
- case Opt_extdebug:
- set_opt (sbi->s_mount_opt, EXTDEBUG);
- break;
-+ case Opt_mballoc:
-+ set_opt (sbi->s_mount_opt, MBALLOC);
-+ break;
- default:
- printk (KERN_ERR
- "EXT3-fs: Unrecognized mount option \"%s\" "
-@@ -1773,6 +1778,7 @@ static int ext3_fill_super (struct super
- "writeback");
-
- ext3_ext_init(sb);
-+ ext3_mb_init(sb, needs_recovery);
- lock_kernel();
- return 0;
-
-@@ -2712,7 +2718,13 @@ static struct file_system_type ext3_fs_t
-
- static int __init init_ext3_fs(void)
- {
-- int err = init_ext3_xattr();
-+ int err;
++ /* Zero and negative values are rejected; only a positive value is stored */
++ value = simple_strtol(str, NULL, 0);
++ if (value <= 0)
++ return -ERANGE;
+
-+ err = init_ext3_proc();
-+ if (err)
-+ return err;
++ ext3_mb_order2_reqs = value;
+
-+ err = init_ext3_xattr();
- if (err)
- return err;
- err = init_inodecache();
-@@ -2734,6 +2746,7 @@ static void __exit exit_ext3_fs(void)
- unregister_filesystem(&ext3_fs_type);
- destroy_inodecache();
- exit_ext3_xattr();
-+ exit_ext3_proc();
- }
-
- int ext3_prep_san_write(struct inode *inode, long *blocks,
++ return count;
++}
++
++int __init init_ext3_proc(void)
++{
++ struct proc_dir_entry *proc_ext3_mb_stats;
++ struct proc_dir_entry *proc_ext3_mb_max_to_scan;
++ struct proc_dir_entry *proc_ext3_mb_min_to_scan;
++ struct proc_dir_entry *proc_ext3_mb_order2_req;
++
++ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); /* parent directory for all entries below */
++ if (proc_root_ext3 == NULL) {
++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", EXT3_ROOT);
++ return -EIO;
++ }
++
++ /* Create the EXT3_MB_STATS_NAME entry and hook up its handlers */
++ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ if (proc_ext3_mb_stats == NULL) {
++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n",
++ EXT3_MB_STATS_NAME);
++ remove_proc_entry(EXT3_ROOT, proc_root_fs); /* undo proc_mkdir() */
++ return -EIO;
++ }
++
++ proc_ext3_mb_stats->data = NULL;
++ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read;
++ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write;
++
++ /* Create the EXT3_MB_MAX_TO_SCAN_NAME entry and hook up its handlers */
++ proc_ext3_mb_max_to_scan = create_proc_entry(
++ EXT3_MB_MAX_TO_SCAN_NAME,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ if (proc_ext3_mb_max_to_scan == NULL) {
++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n",
++ EXT3_MB_MAX_TO_SCAN_NAME);
++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); /* unwind in reverse creation order */
++ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++ return -EIO;
++ }
++
++ proc_ext3_mb_max_to_scan->data = NULL;
++ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read;
++ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write;
++
++ /* Create the EXT3_MB_MIN_TO_SCAN_NAME entry and hook up its handlers */
++ proc_ext3_mb_min_to_scan = create_proc_entry(
++ EXT3_MB_MIN_TO_SCAN_NAME,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ if (proc_ext3_mb_min_to_scan == NULL) {
++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n",
++ EXT3_MB_MIN_TO_SCAN_NAME);
++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); /* unwind in reverse creation order */
++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++ return -EIO;
++ }
++
++ proc_ext3_mb_min_to_scan->data = NULL;
++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read;
++ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write;
++
++ /* Create the EXT3_MB_ORDER2_REQ entry and hook up its handlers */
++ proc_ext3_mb_order2_req = create_proc_entry(
++ EXT3_MB_ORDER2_REQ,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ if (proc_ext3_mb_order2_req == NULL) {
++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n",
++ EXT3_MB_ORDER2_REQ);
++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); /* unwind in reverse creation order */
++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++ return -EIO;
++ }
++
++ proc_ext3_mb_order2_req->data = NULL;
++ proc_ext3_mb_order2_req->read_proc = ext3_mb_order2_req_read;
++ proc_ext3_mb_order2_req->write_proc = ext3_mb_order2_req_write;
++
++ return 0; /* directory and all four entries created */
++}
++
++void exit_ext3_proc(void)
++{
++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); /* entries first ... */
++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3);
++ remove_proc_entry(EXT3_ROOT, proc_root_fs); /* ... then the EXT3_ROOT directory that held them */
++}
Index: linux-stage/fs/ext3/Makefile
===================================================================
--- linux-stage.orig/fs/ext3/Makefile 2006-07-16 02:29:43.000000000 +0800
ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
-Index: linux-stage/include/linux/ext3_fs.h
-===================================================================
---- linux-stage.orig/include/linux/ext3_fs.h 2006-07-16 02:29:43.000000000 +0800
-+++ linux-stage/include/linux/ext3_fs.h 2006-07-16 02:29:49.000000000 +0800
-@@ -53,6 +53,14 @@
- #define ext3_debug(f, a...) do {} while (0)
- #endif
-
-+#define EXT3_MULTIBLOCK_ALLOCATOR 1
-+
-+#define EXT3_MB_HINT_MERGE 1
-+#define EXT3_MB_HINT_RESERVED 2
-+#define EXT3_MB_HINT_METADATA 4
-+#define EXT3_MB_HINT_FIRST 8
-+#define EXT3_MB_HINT_BEST 16
-+
- /*
- * Special inodes numbers
- */
-@@ -379,6 +387,7 @@ struct ext3_inode {
- #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */
- #define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */
- #define EXT3_MOUNT_EXTDEBUG 0x2000000/* Extents debug */
-+#define EXT3_MOUNT_MBALLOC 0x4000000/* Buddy allocation support */
-
- /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
- #ifndef clear_opt
-@@ -749,12 +758,12 @@ ext3_group_first_block_no(struct super_b
- /* balloc.c */
- extern int ext3_bg_has_super(struct super_block *sb, int group);
- extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
--extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode,
-- ext3_fsblk_t goal, int *errp);
-+//extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode,
-+// ext3_fsblk_t goal, int *errp);
- extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode,
- ext3_fsblk_t goal, unsigned long *count, int *errp);
- extern void ext3_free_blocks (handle_t *handle, struct inode *inode,
-- ext3_fsblk_t block, unsigned long count);
-+ ext3_fsblk_t block, unsigned long count, int metadata);
- extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb,
- ext3_fsblk_t block, unsigned long count,
- unsigned long *pdquot_freed_blocks);
-@@ -881,6 +890,17 @@ extern void ext3_extents_initialize_bloc
- extern int ext3_ext_ioctl(struct inode *inode, struct file *filp,
- unsigned int cmd, unsigned long arg);
-
-+/* mballoc.c */
-+extern long ext3_mb_stats;
-+extern long ext3_mb_max_to_scan;
-+extern int ext3_mb_init(struct super_block *, int);
-+extern int ext3_mb_release(struct super_block *);
-+extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *);
-+extern int ext3_mb_reserve_blocks(struct super_block *, int);
-+extern void ext3_mb_release_blocks(struct super_block *, int);
-+int __init init_ext3_proc(void);
-+void exit_ext3_proc(void);
-+
- #endif /* __KERNEL__ */
-
- /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */
-Index: linux-stage/include/linux/ext3_fs_sb.h
-===================================================================
---- linux-stage.orig/include/linux/ext3_fs_sb.h 2006-07-16 02:29:43.000000000 +0800
-+++ linux-stage/include/linux/ext3_fs_sb.h 2006-07-16 02:29:49.000000000 +0800
-@@ -21,8 +21,14 @@
- #include <linux/wait.h>
- #include <linux/blockgroup_lock.h>
- #include <linux/percpu_counter.h>
-+#include <linux/list.h>
- #endif
- #include <linux/rbtree.h>
-+#include <linux/proc_fs.h>
-+
-+struct ext3_buddy_group_blocks;
-+struct ext3_mb_history;
-+#define EXT3_BB_MAX_BLOCKS
-
- /*
- * third extended-fs super-block data in memory
-@@ -78,6 +84,38 @@ struct ext3_sb_info {
- char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
- int s_jquota_fmt; /* Format of quota to use */
- #endif
-+
-+ /* for buddy allocator */
-+ struct ext3_group_info **s_group_info;
-+ struct inode *s_buddy_cache;
-+ long s_blocks_reserved;
-+ spinlock_t s_reserve_lock;
-+ struct list_head s_active_transaction;
-+ struct list_head s_closed_transaction;
-+ struct list_head s_committed_transaction;
-+ spinlock_t s_md_lock;
-+ tid_t s_last_transaction;
-+ int s_mb_factor;
-+ unsigned short *s_mb_offsets, *s_mb_maxs;
-+
-+ /* history to debug policy */
-+ struct ext3_mb_history *s_mb_history;
-+ int s_mb_history_cur;
-+ int s_mb_history_max;
-+ struct proc_dir_entry *s_mb_proc;
-+ spinlock_t s_mb_history_lock;
-+
-+ /* stats for buddy allocator */
-+ atomic_t s_bal_reqs; /* number of reqs with len > 1 */
-+ atomic_t s_bal_success; /* we found long enough chunks */
-+ atomic_t s_bal_allocated; /* in blocks */
-+ atomic_t s_bal_ex_scanned; /* total extents scanned */
-+ atomic_t s_bal_goals; /* goal hits */
-+ atomic_t s_bal_breaks; /* too long searches */
-+ atomic_t s_bal_2orders; /* 2^order hits */
-+ spinlock_t s_bal_lock;
-+ unsigned long s_mb_buddies_generated;
-+ unsigned long long s_mb_generation_time;
- };
-
- #endif /* _LINUX_EXT3_FS_SB */
-Index: linux-stage/fs/ext3/inode.c
-===================================================================
---- linux-stage.orig/fs/ext3/inode.c 2006-07-16 02:29:43.000000000 +0800
-+++ linux-stage/fs/ext3/inode.c 2006-07-16 02:29:49.000000000 +0800
-@@ -562,7 +562,7 @@ static int ext3_alloc_blocks(handle_t *h
- return ret;
- failed_out:
- for (i = 0; i <index; i++)
-- ext3_free_blocks(handle, inode, new_blocks[i], 1);
-+ ext3_free_blocks(handle, inode, new_blocks[i], 1, 1);
- return ret;
- }
-
-@@ -661,9 +661,9 @@ failed:
- ext3_journal_forget(handle, branch[i].bh);
- }
- for (i = 0; i <indirect_blks; i++)
-- ext3_free_blocks(handle, inode, new_blocks[i], 1);
-+ ext3_free_blocks(handle, inode, new_blocks[i], 1, 1);
-
-- ext3_free_blocks(handle, inode, new_blocks[i], num);
-+ ext3_free_blocks(handle, inode, new_blocks[i], num, 1);
-
- return err;
- }
-@@ -760,9 +760,9 @@ err_out:
- for (i = 1; i <= num; i++) {
- BUFFER_TRACE(where[i].bh, "call journal_forget");
- ext3_journal_forget(handle, where[i].bh);
-- ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1);
-+ ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1,1);
- }
-- ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
-+ ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 1);
-
- return err;
- }
-@@ -2007,7 +2007,7 @@ static void ext3_clear_blocks(handle_t *
- }
- }
-
-- ext3_free_blocks(handle, inode, block_to_free, count);
-+ ext3_free_blocks(handle, inode, block_to_free, count, 1);
- }
-
- /**
-@@ -2180,7 +2180,7 @@ static void ext3_free_branches(handle_t
- ext3_journal_test_restart(handle, inode);
- }
-
-- ext3_free_blocks(handle, inode, nr, 1);
-+ ext3_free_blocks(handle, inode, nr, 1, 1);
-
- if (parent_bh) {
- /*
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
-+ Opt_extents, Opt_extdebug,
++ Opt_extents, Opt_noextents, Opt_extdebug,
Opt_grpquota
};
-@@ -690,6 +694,8 @@ static match_table_t tokens = {
+@@ -690,6 +694,9 @@ static match_table_t tokens = {
{Opt_iopen, "iopen"},
{Opt_noiopen, "noiopen"},
{Opt_iopen_nopriv, "iopen_nopriv"},
+ {Opt_extents, "extents"},
++ {Opt_noextents, "noextents"},
+ {Opt_extdebug, "extdebug"},
{Opt_barrier, "barrier=%u"},
{Opt_err, NULL},
{Opt_resize, "resize"},
-@@ -1035,6 +1041,12 @@ clear_qf_name:
+@@ -1035,6 +1041,15 @@ clear_qf_name:
case Opt_bh:
clear_opt(sbi->s_mount_opt, NOBH);
break;
+ case Opt_extents:
+ set_opt (sbi->s_mount_opt, EXTENTS);
+ break;
++ case Opt_noextents:
++ clear_opt (sbi->s_mount_opt, EXTENTS);
++ break;
+ case Opt_extdebug:
+ set_opt (sbi->s_mount_opt, EXTDEBUG);
+ break;
+Index: linux-stage/include/linux/ext3_fs.h
+===================================================================
+--- linux-stage.orig/include/linux/ext3_fs.h 2006-07-16 02:29:43.000000000 +0800
++++ linux-stage/include/linux/ext3_fs.h 2006-07-16 02:29:49.000000000 +0800
+@@ -53,6 +53,14 @@
+ #define ext3_debug(f, a...) do {} while (0)
+ #endif
+
++#define EXT3_MULTIBLOCK_ALLOCATOR 1
++
++#define EXT3_MB_HINT_MERGE 1
++#define EXT3_MB_HINT_RESERVED 2
++#define EXT3_MB_HINT_METADATA 4
++#define EXT3_MB_HINT_FIRST 8
++#define EXT3_MB_HINT_BEST 16
++
+ /*
+ * Special inodes numbers
+ */
+@@ -379,6 +387,7 @@ struct ext3_inode {
+ #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */
+ #define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */
+ #define EXT3_MOUNT_EXTDEBUG 0x2000000/* Extents debug */
++#define EXT3_MOUNT_MBALLOC 0x4000000/* Buddy allocation support */
+
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef clear_opt
+@@ -405,6 +413,14 @@
+ #define ext3_find_first_zero_bit ext2_find_first_zero_bit
+ #define ext3_find_next_zero_bit ext2_find_next_zero_bit
+
++#ifndef ext2_find_next_le_bit
++#ifdef __LITTLE_ENDIAN
++#define ext2_find_next_le_bit(addr, size, off) find_next_bit((addr), (size), (off))
++#else
++#error "mballoc needs a patch for big-endian systems - CFS bug 10634"
++#endif /* __LITTLE_ENDIAN */
++#endif /* !ext2_find_next_le_bit */
++
+ /*
+ * Maximal mount counts between two filesystem checks
+ */
+@@ -749,12 +758,12 @@ ext3_group_first_block_no(struct super_b
+ /* balloc.c */
+ extern int ext3_bg_has_super(struct super_block *sb, int group);
+ extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
+-extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode,
++extern ext3_fsblk_t ext3_new_block_old(handle_t *handle, struct inode *inode,
+ ext3_fsblk_t goal, int *errp);
+ extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode,
+ ext3_fsblk_t goal, unsigned long *count, int *errp);
+ extern void ext3_free_blocks (handle_t *handle, struct inode *inode,
+- ext3_fsblk_t block, unsigned long count);
++ ext3_fsblk_t block, unsigned long count, int metadata);
+ extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb,
+ ext3_fsblk_t block, unsigned long count,
+ unsigned long *pdquot_freed_blocks);
+@@ -881,6 +890,21 @@ extern void ext3_extents_initialize_bloc
+ extern int ext3_ext_ioctl(struct inode *inode, struct file *filp,
+ unsigned int cmd, unsigned long arg);
+
++/* mballoc.c */
++extern long ext3_mb_stats;
++extern long ext3_mb_max_to_scan;
++extern int ext3_mb_init(struct super_block *sb, int needs_recovery);
++extern int ext3_mb_release(struct super_block *sb);
++extern ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
++ ext3_fsblk_t goal, int *errp);
++extern ext3_fsblk_t ext3_mb_new_blocks(handle_t *handle, struct inode *inode,
++ ext3_fsblk_t goal, int *len, int flags,
++ int *errp);
++extern int ext3_mb_reserve_blocks(struct super_block *sb, int);
++extern void ext3_mb_release_blocks(struct super_block *sb, int);
++int __init init_ext3_proc(void);
++void exit_ext3_proc(void);
++
+ #endif /* __KERNEL__ */
+
+ /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */
+Index: linux-stage/include/linux/ext3_fs_sb.h
+===================================================================
+--- linux-stage.orig/include/linux/ext3_fs_sb.h 2006-07-16 02:29:43.000000000 +0800
++++ linux-stage/include/linux/ext3_fs_sb.h 2006-07-16 02:29:49.000000000 +0800
+@@ -21,8 +21,14 @@
+ #include <linux/wait.h>
+ #include <linux/blockgroup_lock.h>
+ #include <linux/percpu_counter.h>
++#include <linux/list.h>
+ #endif
+ #include <linux/rbtree.h>
++#include <linux/proc_fs.h>
++
++struct ext3_buddy_group_blocks;
++struct ext3_mb_history;
++#define EXT3_BB_MAX_BLOCKS
+
+ /*
+ * third extended-fs super-block data in memory
+@@ -78,6 +84,43 @@ struct ext3_sb_info {
+ char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
+ int s_jquota_fmt; /* Format of quota to use */
+ #endif
++
++ /* for buddy allocator */
++ struct ext3_group_info ***s_group_info;
++ struct inode *s_buddy_cache;
++ long s_blocks_reserved;
++ spinlock_t s_reserve_lock;
++ struct list_head s_active_transaction;
++ struct list_head s_closed_transaction;
++ struct list_head s_committed_transaction;
++ spinlock_t s_md_lock;
++ tid_t s_last_transaction;
++ int s_mb_factor;
++ unsigned short *s_mb_offsets, *s_mb_maxs;
++ unsigned long s_stripe;
++
++ /* history to debug policy */
++ struct ext3_mb_history *s_mb_history;
++ int s_mb_history_cur;
++ int s_mb_history_max;
++ struct proc_dir_entry *s_mb_proc;
++ spinlock_t s_mb_history_lock;
++
++ /* stats for buddy allocator */
++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */
++ atomic_t s_bal_success; /* we found long enough chunks */
++ atomic_t s_bal_allocated; /* in blocks */
++ atomic_t s_bal_ex_scanned; /* total extents scanned */
++ atomic_t s_bal_goals; /* goal hits */
++ atomic_t s_bal_breaks; /* too long searches */
++ atomic_t s_bal_2orders; /* 2^order hits */
++ spinlock_t s_bal_lock;
++ unsigned long s_mb_buddies_generated;
++ unsigned long long s_mb_generation_time;
+ };
++
++#define EXT3_GROUP_INFO(sb, group) \
++ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \
++ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)]
+
+ #endif /* _LINUX_EXT3_FS_SB */
+Index: linux-stage/fs/ext3/super.c
+===================================================================
+--- linux-stage.orig/fs/ext3/super.c 2006-07-16 02:29:43.000000000 +0800
++++ linux-stage/fs/ext3/super.c 2006-07-16 02:29:49.000000000 +0800
+@@ -391,6 +391,7 @@ static void ext3_put_super (struct super
+ struct ext3_super_block *es = sbi->s_es;
+ int i;
+
++ ext3_mb_release(sb);
+ ext3_ext_release(sb);
+ ext3_xattr_put_super(sb);
+ journal_destroy(sbi->s_journal);
+@@ -642,6 +643,7 @@ enum {
+ Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+ Opt_extents, Opt_noextents, Opt_extdebug,
++ Opt_mballoc, Opt_nomballoc, Opt_stripe,
+ Opt_grpquota
+ };
+
+@@ -696,6 +697,9 @@ static match_table_t tokens = {
+ {Opt_extents, "extents"},
+ {Opt_noextents, "noextents"},
+ {Opt_extdebug, "extdebug"},
++ {Opt_mballoc, "mballoc"},
++ {Opt_nomballoc, "nomballoc"},
++ {Opt_stripe, "stripe=%u"},
+ {Opt_barrier, "barrier=%u"},
+ {Opt_err, NULL},
+ {Opt_resize, "resize"},
+@@ -1047,6 +1049,19 @@ clear_qf_name:
+ case Opt_extdebug:
+ set_opt (sbi->s_mount_opt, EXTDEBUG);
+ break;
++ case Opt_mballoc:
++ set_opt (sbi->s_mount_opt, MBALLOC);
++ break;
++ case Opt_nomballoc:
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ break;
++ case Opt_stripe:
++ if (match_int(&args[0], &option))
++ return 0;
++ if (option < 0)
++ return 0;
++ sbi->s_stripe = option;
++ break;
+ default:
+ printk (KERN_ERR
+ "EXT3-fs: Unrecognized mount option \"%s\" "
+@@ -1773,6 +1778,7 @@ static int ext3_fill_super (struct super
+ "writeback");
+
+ ext3_ext_init(sb);
++ ext3_mb_init(sb, needs_recovery);
+ lock_kernel();
+ return 0;
+
+@@ -2712,7 +2718,13 @@ static struct file_system_type ext3_fs_t
+
+ static int __init init_ext3_fs(void)
+ {
+- int err = init_ext3_xattr();
++ int err;
++
++ err = init_ext3_proc();
++ if (err)
++ return err;
++
++ err = init_ext3_xattr();
+ if (err)
+ return err;
+ err = init_inodecache();
+@@ -2734,6 +2746,7 @@ static void __exit exit_ext3_fs(void)
+ unregister_filesystem(&ext3_fs_type);
+ destroy_inodecache();
+ exit_ext3_xattr();
++ exit_ext3_proc();
+ }
+
+ int ext3_prep_san_write(struct inode *inode, long *blocks,
+Index: linux-stage/fs/ext3/extents.c
+===================================================================
+--- linux-stage.orig/fs/ext3/extents.c 2006-07-16 02:29:43.000000000 +0800
++++ linux-stage/fs/ext3/extents.c 2006-07-16 02:29:49.000000000 +0800
+@@ -771,7 +771,7 @@ cleanup:
+ for (i = 0; i < depth; i++) {
+ if (!ablocks[i])
+ continue;
+- ext3_free_blocks(handle, tree->inode, ablocks[i], 1);
++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1);
+ }
+ }
+ kfree(ablocks);
+@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st
+ path->p_idx->ei_leaf);
+ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
+ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
+- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1);
++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1);
+ return err;
+ }
+
+@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t
+ int needed = ext3_remove_blocks_credits(tree, ex, from, to);
+ handle_t *handle = ext3_journal_start(tree->inode, needed);
+ struct buffer_head *bh;
+- int i;
++ int i, metadata = 0;
+
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
++ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode))
++ metadata = 1;
+ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
+ /* tail removal */
+ unsigned long num, start;
+@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t
+ bh = sb_find_get_block(tree->inode->i_sb, start + i);
+ ext3_forget(handle, 0, tree->inode, bh, start + i);
+ }
+- ext3_free_blocks(handle, tree->inode, start, num);
++ ext3_free_blocks(handle, tree->inode, start, num, metadata);
+ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
+ printk("strange request: removal %lu-%lu from %u:%u\n",
+ from, to, ex->ee_block, ex->ee_len);
+Index: linux-stage/fs/ext3/inode.c
+===================================================================
+--- linux-stage.orig/fs/ext3/inode.c 2006-07-16 02:29:43.000000000 +0800
++++ linux-stage/fs/ext3/inode.c 2006-07-16 02:29:49.000000000 +0800
+@@ -562,7 +562,7 @@ static int ext3_alloc_blocks(handle_t *h
+ return ret;
+ failed_out:
+ for (i = 0; i <index; i++)
+- ext3_free_blocks(handle, inode, new_blocks[i], 1);
++ ext3_free_blocks(handle, inode, new_blocks[i], 1, 1);
+ return ret;
+ }
+
+@@ -661,9 +661,9 @@ failed:
+ ext3_journal_forget(handle, branch[i].bh);
+ }
+ for (i = 0; i <indirect_blks; i++)
+- ext3_free_blocks(handle, inode, new_blocks[i], 1);
++ ext3_free_blocks(handle, inode, new_blocks[i], 1, 1);
+
+- ext3_free_blocks(handle, inode, new_blocks[i], num);
++ ext3_free_blocks(handle, inode, new_blocks[i], num, 1);
+
+ return err;
+ }
+@@ -760,9 +760,9 @@ err_out:
+ for (i = 1; i <= num; i++) {
+ BUFFER_TRACE(where[i].bh, "call journal_forget");
+ ext3_journal_forget(handle, where[i].bh);
+- ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1);
++ ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1,1);
+ }
+- ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
++ ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 1);
+
+ return err;
+ }
+@@ -2007,7 +2007,7 @@ static void ext3_clear_blocks(handle_t *
+ }
+ }
+
+- ext3_free_blocks(handle, inode, block_to_free, count);
++ ext3_free_blocks(handle, inode, block_to_free, count, 1);
+ }
+
+ /**
+@@ -2180,7 +2180,7 @@ static void ext3_free_branches(handle_t
+ ext3_journal_test_restart(handle, inode);
+ }
+
+- ext3_free_blocks(handle, inode, nr, 1);
++ ext3_free_blocks(handle, inode, nr, 1, 1);
+
+ if (parent_bh) {
+ /*
+Index: linux-stage/fs/ext3/balloc.c
+===================================================================
+--- linux-stage.orig/fs/ext3/balloc.c 2006-07-16 02:29:43.000000000 +0800
++++ linux-stage/fs/ext3/balloc.c 2006-07-16 02:33:13.000000000 +0800
+@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_
+ *
+ * Return buffer_head on success or NULL in case of failure.
+ */
+-static struct buffer_head *
++struct buffer_head *
+ read_block_bitmap(struct super_block *sb, unsigned int block_group)
+ {
+ struct ext3_group_desc * desc;
+@@ -490,24 +490,6 @@ error_return:
+ return;
+ }
+
+-/* Free given blocks, update quota and i_blocks field */
+-void ext3_free_blocks(handle_t *handle, struct inode *inode,
+- ext3_fsblk_t block, unsigned long count)
+-{
+- struct super_block * sb;
+- unsigned long dquot_freed_blocks;
+-
+- sb = inode->i_sb;
+- if (!sb) {
+- printk ("ext3_free_blocks: nonexistent device");
+- return;
+- }
+- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
+- if (dquot_freed_blocks)
+- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+- return;
+-}
+-
+ /*
+ * For ext3 allocations, we must not reuse any blocks which are
+ * allocated in the bitmap buffer's "last committed data" copy. This
+@@ -1463,7 +1445,7 @@ out:
+ return 0;
+ }
+
+-ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
++ext3_fsblk_t ext3_new_block_old(handle_t *handle, struct inode *inode,
+ ext3_fsblk_t goal, int *errp)
+ {
+ unsigned long count = 1;
+Index: linux-stage/fs/ext3/xattr.c
+===================================================================
+--- linux-stage.orig/fs/ext3/xattr.c 2006-07-16 02:29:43.000000000 +0800
++++ linux-stage/fs/ext3/xattr.c 2006-07-16 02:29:49.000000000 +0800
+@@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl
+ ea_bdebug(bh, "refcount now=0; freeing");
+ if (ce)
+ mb_cache_entry_free(ce);
+- ext3_free_blocks(handle, inode, bh->b_blocknr, 1);
++ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
+ get_bh(bh);
+ ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
+ } else {
+@@ -805,7 +805,7 @@ inserted:
+ new_bh = sb_getblk(sb, block);
+ if (!new_bh) {
+ getblk_failed:
+- ext3_free_blocks(handle, inode, block, 1);
++ ext3_free_blocks(handle, inode, block, 1, 1);
+ error = -EIO;
+ goto cleanup;
+ }
Index: linux-stage/fs/ext3/mballoc.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-stage/fs/ext3/mballoc.c 2006-07-16 02:29:49.000000000 +0800
-@@ -0,0 +1,2434 @@
+@@ -0,0 +1,2727 @@
+/*
+ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+
+long ext3_mb_stats = 1;
+
++/*
++ * Threshold for the 2^N buddy search: a power-of-two sized request takes
++ * that path only when ffs(len) is at least this value
++ */
++long ext3_mb_order2_reqs = 8;
++
+#ifdef EXT3_BB_MAX_BLOCKS
+#undef EXT3_BB_MAX_BLOCKS
+#endif
+ __u8 ac_repeats;
+ __u8 ac_2order; /* if request is to allocate 2^N blocks and
+ * N > 0, the field stores N, otherwise 0 */
++
++ struct page *ac_buddy_page;
++ struct page *ac_bitmap_page;
+};
+
+#define AC_STATUS_CONTINUE 1
+struct ext3_mb_history {
+ struct ext3_free_extent goal; /* goal allocation */
+ struct ext3_free_extent result; /* result allocation */
++ unsigned pid;
++ unsigned ino;
+ __u16 found; /* how many extents have been found */
+ __u16 groups; /* how many groups have been scanned */
+ __u16 tail; /* what tail broke some buddy */
+#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy)
+
+#ifndef EXT3_MB_HISTORY
-+#define ext3_mb_store_history(sb,ac)
++#define ext3_mb_store_history(sb,ino,ac)
+#else
-+static void ext3_mb_store_history(struct super_block *,
++static void ext3_mb_store_history(struct super_block *, unsigned ino,
+ struct ext3_allocation_context *ac);
+#endif
+
+
+int ext3_create (struct inode *, struct dentry *, int, struct nameidata *);
+struct buffer_head * read_block_bitmap(struct super_block *, unsigned int);
-+int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *);
+int ext3_mb_reserve_blocks(struct super_block *, int);
+void ext3_mb_release_blocks(struct super_block *, int);
+void ext3_mb_poll_new_transaction(struct super_block *, handle_t *);
+
+static void
+ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap,
-+ struct ext3_group_info *grp)
++ int group)
+{
++ struct ext3_group_info *grp = EXT3_GROUP_INFO(sb, group);
+ unsigned short max = EXT3_BLOCKS_PER_GROUP(sb);
+ unsigned short i = 0, first, len;
+ unsigned free = 0, fragments = 0;
+ while (i < max) {
+ fragments++;
+ first = i;
-+ i = find_next_bit(bitmap, max, i);
++ i = ext2_find_next_le_bit(bitmap, max, i);
+ len = i - first;
+ free += len;
+ if (len > 1)
+ * others waits for init completion on page lock */
+ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state);
+ if (free != grp->bb_free) {
-+ printk("EXT3-fs: %u blocks in bitmap, %u in group descriptor\n",
-+ free, grp->bb_free);
++ printk("EXT3-fs: group %u: %u blocks in bitmap, %u in gd\n",
++ group, free, grp->bb_free);
+ grp->bb_free = free;
+ }
+
+ for (i = 0; i < groups_per_page && bh[i]; i++)
+ wait_on_buffer(bh[i]);
+
-+ /* XXX: I/O error handling here */
++ err = -EIO;
++ for (i = 0; i < groups_per_page && bh[i]; i++)
++ if (!buffer_uptodate(bh[i]))
++ goto out;
+
+ first_block = page->index * blocks_per_page;
+ for (i = 0; i < blocks_per_page; i++) {
+ mb_debug("put buddy for group %u in page %lu/%x\n",
+ group, page->index, i * blocksize);
+ memset(data, 0xff, blocksize);
-+ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0;
-+ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0,
++ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0;
++ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0,
+ sizeof(unsigned short)*(sb->s_blocksize_bits+2));
-+ ext3_mb_generate_buddy(sb, data, bitmap,
-+ EXT3_SB(sb)->s_group_info[group]);
++ ext3_mb_generate_buddy(sb, data, bitmap, group);
+ } else {
+ /* this is block of bitmap */
+ mb_debug("put bitmap for group %u in page %lu/%x\n",
+ SetPageUptodate(page);
+
+out:
-+ for (i = 0; i < groups_per_page && bh[i]; i++)
-+ brelse(bh[i]);
-+ if (bh && bh != &bhs)
-+ kfree(bh);
++ if (bh) {
++ for (i = 0; i < groups_per_page && bh[i]; i++)
++ brelse(bh[i]);
++ if (bh != &bhs)
++ kfree(bh);
++ }
+ return err;
+}
+
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+
+ e3b->bd_blkbits = sb->s_blocksize_bits;
-+ e3b->bd_info = sbi->s_group_info[group];
++ e3b->bd_info = EXT3_GROUP_INFO(sb, group);
+ e3b->bd_sb = sb;
+ e3b->bd_group = group;
+ e3b->bd_buddy_page = NULL;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+
++ /* we could use find_or_create_page(), but it locks page
++ * what we'd like to avoid in fast path ... */
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page == NULL || !PageUptodate(page)) {
+ if (page)
+ page_cache_release(page);
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page) {
++ BUG_ON(page->mapping != inode->i_mapping);
+ if (!PageUptodate(page))
+ ext3_mb_init_cache(page);
+ unlock_page(page);
+ page_cache_release(page);
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page) {
++ BUG_ON(page->mapping != inode->i_mapping);
+ if (!PageUptodate(page))
+ ext3_mb_init_cache(page);
+ unlock_page(page);
+ext3_lock_group(struct super_block *sb, int group)
+{
+ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT,
-+ &EXT3_SB(sb)->s_group_info[group]->bb_state);
++ &EXT3_GROUP_INFO(sb, group)->bb_state);
+}
+
+static inline void
+ext3_unlock_group(struct super_block *sb, int group)
+{
+ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT,
-+ &EXT3_SB(sb)->s_group_info[group]->bb_state);
++ &EXT3_GROUP_INFO(sb, group)->bb_state);
+}
+
+static int mb_find_order_for_block(struct ext3_buddy *e3b, int block)
+static int mb_find_extent(struct ext3_buddy *e3b, int order, int block,
+ int needed, struct ext3_free_extent *ex)
+{
-+ int next, max, ord;
++ int next = block, max, ord;
+ void *buddy;
+
+ J_ASSERT(ex != NULL);
+ ex->fe_start = block << order;
+ ex->fe_group = e3b->bd_group;
+
++ /* calc difference from given start */
++ next = next - ex->fe_start;
++ ex->fe_len -= next;
++ ex->fe_start += next;
++
+ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) {
+
+ if (block + 1 >= max)
+ ac->ac_status = AC_STATUS_FOUND;
+ ac->ac_tail = ret & 0xffff;
+ ac->ac_buddy = ret >> 16;
++
++ /* hold in-core structures until allocated
++ * blocks are marked non-free in on-disk bitmap */
++ ac->ac_buddy_page = e3b->bd_buddy_page;
++ page_cache_get(e3b->bd_buddy_page);
++ ac->ac_bitmap_page = e3b->bd_bitmap_page;
++ page_cache_get(e3b->bd_bitmap_page);
+}
+
+/*
+ }
+
+ /*
-+ * Let's check whether the chuck is good enough
++ * Let's check whether the chunk is good enough
+ */
+ if (ex->fe_len == gex->fe_len) {
+ *bex = *ex;
+ struct ext3_buddy *e3b)
+{
+ int group = ac->ac_g_ex.fe_group, max, err;
++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb);
++ struct ext3_super_block *es = sbi->s_es;
+ struct ext3_free_extent ex;
+
+ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b);
+ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start,
+ ac->ac_g_ex.fe_len, &ex);
+
-+ if (max > 0) {
++ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
++ unsigned long start;
++ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) +
++ ex.fe_start + le32_to_cpu(es->s_first_data_block));
++ if (start % sbi->s_stripe == 0) {
++ ac->ac_found++;
++ ac->ac_b_ex = ex;
++ ext3_mb_use_best_found(ac, e3b);
++ }
++ } else if (max >= ac->ac_g_ex.fe_len) {
++ J_ASSERT(ex.fe_len > 0);
++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group);
++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start);
++ ac->ac_found++;
++ ac->ac_b_ex = ex;
++ ext3_mb_use_best_found(ac, e3b);
++ } else if (max > 0 && (ac->ac_flags & EXT3_MB_HINT_MERGE)) {
++ /* Sometimes, caller may want to merge even small
++ * number of blocks to an existing extent */
+ J_ASSERT(ex.fe_len > 0);
+ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group);
+ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start);
+ int i, k, max;
+
+ J_ASSERT(ac->ac_2order > 0);
-+ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) {
++ for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
+ if (grp->bb_counters[i] == 0)
+ continue;
+
+ }
+}
+
++/*
++ * Special case for storage such as raid5: step through the group bitmap
++ * one stripe at a time, starting at the first stripe-aligned block, and
++ * take the first extent that covers a whole stripe.
++ */
++static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac,
++				 struct ext3_buddy *e3b)
++{
++	struct super_block *sb = ac->ac_sb;
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	void *bitmap = EXT3_MB_BITMAP(e3b);
++	struct ext3_free_extent ex;
++	unsigned long pos, got;
++
++	J_ASSERT(sbi->s_stripe != 0);
++
++	/* absolute number of the group's first block ... */
++	pos = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb)
++		+ le32_to_cpu(sbi->s_es->s_first_data_block);
++	/* ... rounded up to the next stripe boundary ... */
++	pos = ((pos + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe;
++	/* ... and converted back to an offset inside the group */
++	pos = (pos - le32_to_cpu(sbi->s_es->s_first_data_block))
++		% EXT3_BLOCKS_PER_GROUP(sb);
++
++	for (; pos < sb->s_blocksize * 8; pos += sbi->s_stripe) {
++		/* bit set at the aligned position: try the next stripe */
++		if (mb_test_bit(pos, bitmap))
++			continue;
++
++		got = mb_find_extent(e3b, 0, pos, sbi->s_stripe, &ex);
++		/* extent found here is shorter than a full stripe */
++		if (got < sbi->s_stripe)
++			continue;
++
++		ac->ac_found++;
++		ac->ac_b_ex = ex;
++		ext3_mb_use_best_found(ac, e3b);
++		break;
++	}
++}
++
+static int ext3_mb_good_group(struct ext3_allocation_context *ac,
+ int group, int cr)
+{
-+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb);
-+ struct ext3_group_info *grp = sbi->s_group_info[group];
++ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group);
+ unsigned free, fragments, i, bits;
+
+ J_ASSERT(cr >= 0 && cr < 4);
+ case 0:
+ J_ASSERT(ac->ac_2order != 0);
+ bits = ac->ac_sb->s_blocksize_bits + 1;
-+ for (i = ac->ac_2order; i < bits; i++)
++ for (i = ac->ac_2order; i <= bits; i++)
+ if (grp->bb_counters[i] > 0)
+ return 1;
++ break;
+ case 1:
+ if ((free / fragments) >= ac->ac_g_ex.fe_len)
+ return 1;
++ break;
+ case 2:
+ if (free >= ac->ac_g_ex.fe_len)
+ return 1;
++ break;
+ case 3:
+ return 1;
+ default:
+ }
+ }
+
++ ac.ac_buddy_page = NULL;
++ ac.ac_bitmap_page = NULL;
++
+ /*
+ * Check quota for allocation of this blocks.
+ */
+ ac.ac_2order = 0;
+ ac.ac_criteria = 0;
+
++ if (*len == 1 && sbi->s_stripe) {
++ /* looks like a metadata, let's use a dirty hack for raid5
++ * move all metadata in first groups in hope to hit cached
++ * sectors and thus avoid read-modify cycles in raid5 */
++ ac.ac_g_ex.fe_group = group = 0;
++ }
++
+ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */
+ i = ffs(*len);
-+ if (i >= 8) {
++ if (i >= ext3_mb_order2_reqs) {
+ i--;
+ if ((*len & (~(1 << i))) == 0)
+ ac.ac_2order = i;
+ }
+
-+ /* Sometimes, caller may want to merge even small
-+ * number of blocks to an existing extent */
-+ if (ac.ac_flags & EXT3_MB_HINT_MERGE) {
-+ err = ext3_mb_find_by_goal(&ac, &e3b);
-+ if (err)
-+ goto out_err;
-+ if (ac.ac_status == AC_STATUS_FOUND)
-+ goto found;
-+ }
++ /* first, try the goal */
++ err = ext3_mb_find_by_goal(&ac, &e3b);
++ if (err)
++ goto out_err;
++ if (ac.ac_status == AC_STATUS_FOUND)
++ goto found;
+
+ /* Let's just scan groups to find more-less suitable blocks */
+ cr = ac.ac_2order ? 0 : 1;
+ if (group == EXT3_SB(sb)->s_groups_count)
+ group = 0;
+
-+ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) {
++ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) {
+ /* we need full data about the group
+ * to make a good selection */
+ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
+ ac.ac_groups_scanned++;
+ if (cr == 0)
+ ext3_mb_simple_scan_group(&ac, &e3b);
++ else if (cr == 1 && *len == sbi->s_stripe)
++ ext3_mb_scan_aligned(&ac, &e3b);
+ else
+ ext3_mb_complex_scan_group(&ac, &e3b);
+
+
+ ext3_mb_release_desc(&e3b);
+
-+ if (err)
-+ goto out_err;
+ if (ac.ac_status != AC_STATUS_CONTINUE)
+ break;
+ }
+ */
+
+ /*if (ac.ac_found > ext3_mb_max_to_scan)
-+ printk(KERN_ERR "EXT3-fs: too long searching at "
++ printk(KERN_DEBUG "EXT3-fs: too long searching at "
+ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len,
+ ac.ac_g_ex.fe_len);*/
+ ext3_mb_try_best_found(&ac, &e3b);
+ * Someone more lucky has already allocated it.
+ * The only thing we can do is just take first
+ * found block(s)
-+ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n");
++ printk(KERN_DEBUG "EXT3-fs: someone won our chunk\n");
+ */
+ ac.ac_b_ex.fe_group = 0;
+ ac.ac_b_ex.fe_start = 0;
+ *errp = -ENOSPC;
+ block = 0;
+#if 1
-+ printk(KERN_ERR "EXT3-fs: cant allocate: status %d, flags %d\n",
++ printk(KERN_ERR "EXT3-fs: can't allocate: status %d flags %d\n",
+ ac.ac_status, ac.ac_flags);
-+ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n",
++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d cr %d\n",
+ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group,
+ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr);
+ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n",
+ sbi->s_blocks_reserved, ac.ac_found);
+ printk("EXT3-fs: groups: ");
+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
-+ printk("%d: %d ", i,
-+ sbi->s_group_info[i]->bb_free);
++ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free);
+ printk("\n");
+#endif
+ goto out;
+ *errp = err;
+ block = 0;
+out:
++ if (ac.ac_buddy_page)
++ page_cache_release(ac.ac_buddy_page);
++ if (ac.ac_bitmap_page)
++ page_cache_release(ac.ac_bitmap_page);
++
+ if (!(flags & EXT3_MB_HINT_RESERVED)) {
+ /* block wasn't reserved before and we reserved it
+ * at the beginning of allocation. it doesn't matter
+ atomic_inc(&sbi->s_bal_breaks);
+ }
+
-+ ext3_mb_store_history(sb, &ac);
++ ext3_mb_store_history(sb, inode->i_ino, &ac);
+
+ return block;
+}
+ char buf[20], buf2[20];
+
+ if (v == SEQ_START_TOKEN) {
-+ seq_printf(seq, "%-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n",
-+ "goal", "result", "found", "grps", "cr", "merge",
-+ "tail", "broken");
++ seq_printf(seq, "%-5s %-8s %-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n",
++ "pid", "inode", "goal", "result", "found", "grps", "cr",
++ "merge", "tail", "broken");
+ return 0;
+ }
+
+ hs->goal.fe_start, hs->goal.fe_len);
+ sprintf(buf2, "%u/%u/%u", hs->result.fe_group,
+ hs->result.fe_start, hs->result.fe_len);
-+ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf,
-+ buf2, hs->found, hs->groups, hs->cr,
-+ hs->merged ? "M" : "", hs->tail,
++ seq_printf(seq, "%-5u %-8u %-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n",
++ hs->pid, hs->ino, buf, buf2, hs->found, hs->groups,
++ hs->cr, hs->merged ? "M" : "", hs->tail,
+ hs->buddy ? 1 << hs->buddy : 0);
+ return 0;
+}
+ .release = ext3_mb_seq_history_release,
+};
+
++/*
++ * seq_file ->start: turn *pos into an iterator token.  The token is the
++ * group number plus one (show() subtracts it again); NULL is returned
++ * when *pos is outside [0, s_groups_count).
++ */
++static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
++{
++	struct super_block *sb = seq->private;
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	long token;
++
++	if (!(*pos < 0 || *pos >= sbi->s_groups_count)) {
++		token = *pos + 1;
++		return (void *) token;
++	}
++
++	return NULL;
++}
++
++/*
++ * seq_file ->next: advance *pos and return the token for the new
++ * position (group number + 1), or NULL once *pos is outside
++ * [0, s_groups_count).  @v, the previous token, is not used.
++ */
++static void *ext3_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
++{
++	struct super_block *sb = seq->private;
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	long group;
++
++	++*pos;
++	if (*pos < 0 || *pos >= sbi->s_groups_count)
++		return NULL;
++	/* same +1 encoding as ->start, so group 0 is not a NULL token */
++	group = *pos + 1;
++	return (void *) group;
++}
++
++/*
++ * seq_file ->show: print one line of buddy statistics for a group.
++ * @v is the iterator token (group number + 1).  Groups whose in-core
++ * info still has the NEED_INIT state are skipped without output.
++ * Always returns 0.
++ */
++static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v)
++{
++ struct super_block *sb = seq->private;
++ long group = (long) v, i;
++ /* local snapshot buffer: the group info followed by room for 16
++  * counters.
++  * NOTE(review): the memcpy below presumes bb_counters is the trailing
++  * array of struct ext3_group_info, overlaid by counters[], and that
++  * s_blocksize_bits + 2 <= 16 - confirm against the struct definition */
++ struct sg {
++ struct ext3_group_info info;
++ unsigned short counters[16];
++ } sg;
++
++ /* undo the +1 applied by ->start/->next */
++ group--;
++ /* column header is emitted together with the first group */
++ if (group == 0)
++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s [ %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
++ "group", "free", "frags", "first", "2^0", "2^1", "2^2",
++ "2^3", "2^4", "2^5", "2^6", "2^7", "2^8", "2^9", "2^10",
++ "2^11", "2^12", "2^13");
++
++ /* bytes to copy: the fixed part plus s_blocksize_bits + 2 counters */
++ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
++ sizeof(struct ext3_group_info);
++ /* copy while holding the group lock, print from the copy afterwards */
++ ext3_lock_group(sb, group);
++ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i);
++ ext3_unlock_group(sb, group);
++
++ if (EXT3_MB_GRP_NEED_INIT(&sg.info))
++ return 0;
++
++ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free,
++ sg.info.bb_fragments, sg.info.bb_first_free);
++ /* always 14 columns (2^0..2^13); orders above s_blocksize_bits + 1
++  * are printed as 0 */
++ for (i = 0; i <= 13; i++)
++ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
++ sg.info.bb_counters[i] : 0);
++ seq_printf(seq, " ]\n");
++
++ return 0;
++}
++
++/* seq_file ->stop: empty, ->start and ->next above acquire nothing */
++static void ext3_mb_seq_groups_stop(struct seq_file *seq, void *v)
++{
++}
++
++/* seq_file iterator over block groups; passed to seq_open() by
++ * ext3_mb_seq_groups_open() */
++static struct seq_operations ext3_mb_seq_groups_ops = {
++ .start = ext3_mb_seq_groups_start,
++ .next = ext3_mb_seq_groups_next,
++ .stop = ext3_mb_seq_groups_stop,
++ .show = ext3_mb_seq_groups_show,
++};
++
++/*
++ * open() handler: start a seq_file session with the group iterator and
++ * store the super block from the proc entry's data pointer in it.
++ * Returns 0 on success or the error reported by seq_open().
++ */
++static int ext3_mb_seq_groups_open(struct inode *inode, struct file *file)
++{
++	struct super_block *sb = PDE(inode)->data;
++	struct seq_file *m;
++	int rc;
++
++	rc = seq_open(file, &ext3_mb_seq_groups_ops);
++	if (rc != 0)
++		return rc;
++
++	/* seq_open() left its seq_file in file->private_data */
++	m = (struct seq_file *)file->private_data;
++	m->private = sb;
++	return rc;
++}
++
++/* file operations for the group listing: custom open, everything else is
++ * the stock seq_file implementation */
++static struct file_operations ext3_mb_seq_groups_fops = {
++ .owner = THIS_MODULE,
++ .open = ext3_mb_seq_groups_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = seq_release,
++};
++
+static void ext3_mb_history_release(struct super_block *sb)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+ char name[64];
+
+ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name));
++ remove_proc_entry("mb_groups", sbi->s_mb_proc);
+ remove_proc_entry("mb_history", sbi->s_mb_proc);
+ remove_proc_entry(name, proc_root_ext3);
+
+ p->proc_fops = &ext3_mb_seq_history_fops;
+ p->data = sb;
+ }
++ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc);
++ if (p) {
++ p->proc_fops = &ext3_mb_seq_groups_fops;
++ p->data = sb;
++ }
+ }
+
+ sbi->s_mb_history_max = 1000;
+}
+
+static void
-+ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac)
++ext3_mb_store_history(struct super_block *sb, unsigned ino,
++ struct ext3_allocation_context *ac)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+ struct ext3_mb_history h;
+ if (likely(sbi->s_mb_history == NULL))
+ return;
+
++ h.pid = current->pid;
++ h.ino = ino;
+ h.goal = ac->ac_g_ex;
+ h.result = ac->ac_b_ex;
+ h.found = ac->ac_found;
+int ext3_mb_init_backend(struct super_block *sb)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int i, len;
-+
-+ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count;
-+ sbi->s_group_info = kmalloc(len, GFP_KERNEL);
++ int i, j, len, metalen;
++ int num_meta_group_infos =
++ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >>
++ EXT3_DESC_PER_BLOCK_BITS(sb);
++ struct ext3_group_info **meta_group_info;
++
++ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
++ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
++ * So a two level scheme suffices for now. */
++ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
++ num_meta_group_infos, GFP_KERNEL);
+ if (sbi->s_group_info == NULL) {
-+ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n");
++ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n");
+ return -ENOMEM;
+ }
-+ memset(sbi->s_group_info, 0, len);
-+
+ sbi->s_buddy_cache = new_inode(sb);
+ if (sbi->s_buddy_cache == NULL) {
+ printk(KERN_ERR "EXT3-fs: can't get new inode\n");
-+ kfree(sbi->s_group_info);
-+ return -ENOMEM;
++ goto err_freesgi;
++ }
++
++ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb);
++ for (i = 0; i < num_meta_group_infos; i++) {
++ if ((i + 1) == num_meta_group_infos)
++ metalen = sizeof(*meta_group_info) *
++ (sbi->s_groups_count -
++ (i << EXT3_DESC_PER_BLOCK_BITS(sb)));
++ meta_group_info = kmalloc(metalen, GFP_KERNEL);
++ if (meta_group_info == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't allocate mem for a "
++ "buddy group\n");
++ goto err_freemeta;
++ }
++ sbi->s_group_info[i] = meta_group_info;
+ }
+
+ /*
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ struct ext3_group_desc * desc;
+
-+ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL);
-+ if (sbi->s_group_info[i] == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n");
-+ goto err_out;
++ meta_group_info =
++ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)];
++ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1);
++
++ meta_group_info[j] = kmalloc(len, GFP_KERNEL);
++ if (meta_group_info[j] == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n");
++ i--;
++ goto err_freebuddy;
+ }
+ desc = ext3_get_group_desc(sb, i, NULL);
+ if (desc == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant read descriptor %u\n", i);
-+ goto err_out;
++ printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i);
++ goto err_freebuddy;
+ }
-+ memset(sbi->s_group_info[i], 0, len);
++ memset(meta_group_info[j], 0, len);
+ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT,
-+ &sbi->s_group_info[i]->bb_state);
-+ sbi->s_group_info[i]->bb_free =
++ &meta_group_info[j]->bb_state);
++ meta_group_info[j]->bb_free =
+ le16_to_cpu(desc->bg_free_blocks_count);
+ }
+
+ return 0;
+
-+err_out:
++err_freebuddy:
++ while (i >= 0) {
++ kfree(EXT3_GROUP_INFO(sb, i));
++ i--;
++ }
++ i = num_meta_group_infos;
++err_freemeta:
+ while (--i >= 0)
+ kfree(sbi->s_group_info[i]);
+ iput(sbi->s_buddy_cache);
-+
++err_freesgi:
++ kfree(sbi->s_group_info);
+ return -ENOMEM;
+}
+
+ max = max >> 1;
+ i++;
+ } while (i <= sb->s_blocksize_bits + 1);
-+
+
+ /* init file for buddy data */
+ if ((i = ext3_mb_init_backend(sb))) {
+int ext3_mb_release(struct super_block *sb)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int i;
++ int i, num_meta_group_infos;
+
+ if (!test_opt(sb, MBALLOC))
+ return 0;
+ ext3_mb_free_committed_blocks(sb);
+
+ if (sbi->s_group_info) {
-+ for (i = 0; i < sbi->s_groups_count; i++) {
-+ if (sbi->s_group_info[i] == NULL)
-+ continue;
++ for (i = 0; i < sbi->s_groups_count; i++)
++ kfree(EXT3_GROUP_INFO(sb, i));
++ num_meta_group_infos = (sbi->s_groups_count +
++ EXT3_DESC_PER_BLOCK(sb) - 1) >>
++ EXT3_DESC_PER_BLOCK_BITS(sb);
++ for (i = 0; i < num_meta_group_infos; i++)
+ kfree(sbi->s_group_info[i]);
-+ }
+ kfree(sbi->s_group_info);
+ }
+ if (sbi->s_mb_offsets)
+ md->num, md->group, md);
+
+ err = ext3_mb_load_buddy(sb, md->group, &e3b);
++ /* we expect to find existing buddy because it's pinned */
+ BUG_ON(err != 0);
+
+ /* there are blocks to put in buddy to make them really free */
+ return ret;
+}
+
-+
+void ext3_free_blocks(handle_t *handle, struct inode * inode,
+ unsigned long block, unsigned long count, int metadata)
+{
+ int freed;
+
+ sb = inode->i_sb;
-+ if (!test_opt(sb, MBALLOC))
++ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info)
+ ext3_free_blocks_sb(handle, sb, block, count, &freed);
+ else
+ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
+#define EXT3_MB_STATS_NAME "mb_stats"
+#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan"
+#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan"
++#define EXT3_MB_ORDER2_REQ "mb_order2_req"
+
+static int ext3_mb_stats_read(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+ char str[32];
+
+ if (count >= sizeof(str)) {
-+ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n",
+ EXT3_MB_STATS_NAME, (int)sizeof(str));
+ return -EOVERFLOW;
+ }
+ long value;
+
+ if (count >= sizeof(str)) {
-+ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n",
+ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str));
+ return -EOVERFLOW;
+ }
+ long value;
+
+ if (count >= sizeof(str)) {
-+ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n",
+ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str));
+ return -EOVERFLOW;
+ }
+ return count;
+}
+
-+int __init init_ext3_proc(void)
++static int ext3_mb_order2_req_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
+{
-+ struct proc_dir_entry *proc_ext3_mb_stats;
-+ struct proc_dir_entry *proc_ext3_mb_max_to_scan;
-+ struct proc_dir_entry *proc_ext3_mb_min_to_scan;
-+
-+ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs);
-+ if (proc_root_ext3 == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT);
-+ return -EIO;
-+ }
-+
-+ /* Initialize EXT3_MB_STATS_NAME */
-+ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME,
-+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
-+ if (proc_ext3_mb_stats == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n",
-+ EXT3_MB_STATS_NAME);
-+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
-+ return -EIO;
-+ }
++ int len;
+
-+ proc_ext3_mb_stats->data = NULL;
-+ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read;
-+ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write;
++ *eof = 1;
++ if (off != 0)
++ return 0;
+
-+ /* Initialize EXT3_MAX_TO_SCAN_NAME */
-+ proc_ext3_mb_max_to_scan = create_proc_entry(
-+ EXT3_MB_MAX_TO_SCAN_NAME,
-+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
-+ if (proc_ext3_mb_max_to_scan == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n",
-+ EXT3_MB_MAX_TO_SCAN_NAME);
-+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
-+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
-+ return -EIO;
-+ }
++ len = sprintf(page, "%ld\n", ext3_mb_order2_reqs);
++ *start = page;
++ return len;
++}
+
-+ proc_ext3_mb_max_to_scan->data = NULL;
-+ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read;
-+ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write;
++static int ext3_mb_order2_req_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
++{
++ char str[32];
++ long value;
+
-+ /* Initialize EXT3_MIN_TO_SCAN_NAME */
-+ proc_ext3_mb_min_to_scan = create_proc_entry(
-+ EXT3_MB_MIN_TO_SCAN_NAME,
-+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
-+ if (proc_ext3_mb_min_to_scan == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n",
-+ EXT3_MB_MIN_TO_SCAN_NAME);
-+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
-+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
-+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
-+ return -EIO;
++ if (count >= sizeof(str)) {
++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n",
++ EXT3_MB_ORDER2_REQ, (int)sizeof(str));
++ return -EOVERFLOW;
+ }
+
-+ proc_ext3_mb_min_to_scan->data = NULL;
-+ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read;
-+ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write;
-+
-+ return 0;
-+}
-+
-+void exit_ext3_proc(void)
-+{
-+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
-+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
-+ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3);
-+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
-+}
++ if (copy_from_user(str, buffer, count))
++ return -EFAULT;
+
-Index: linux-stage/fs/ext3/extents.c
-===================================================================
---- linux-stage.orig/fs/ext3/extents.c 2006-07-16 02:29:43.000000000 +0800
-+++ linux-stage/fs/ext3/extents.c 2006-07-16 02:29:49.000000000 +0800
-@@ -771,7 +771,7 @@ cleanup:
- for (i = 0; i < depth; i++) {
- if (!ablocks[i])
- continue;
-- ext3_free_blocks(handle, tree->inode, ablocks[i], 1);
-+ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1);
- }
- }
- kfree(ablocks);
-@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st
- path->p_idx->ei_leaf);
- bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
- ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
-- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1);
-+ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1);
- return err;
- }
-
-@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t
- int needed = ext3_remove_blocks_credits(tree, ex, from, to);
- handle_t *handle = ext3_journal_start(tree->inode, needed);
- struct buffer_head *bh;
-- int i;
-+ int i, metadata = 0;
-
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-+ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode))
-+ metadata = 1;
- if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
- /* tail removal */
- unsigned long num, start;
-@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t
- bh = sb_find_get_block(tree->inode->i_sb, start + i);
- ext3_forget(handle, 0, tree->inode, bh, start + i);
- }
-- ext3_free_blocks(handle, tree->inode, start, num);
-+ ext3_free_blocks(handle, tree->inode, start, num, metadata);
- } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
- printk("strange request: removal %lu-%lu from %u:%u\n",
- from, to, ex->ee_block, ex->ee_len);
-Index: linux-stage/fs/ext3/xattr.c
-===================================================================
---- linux-stage.orig/fs/ext3/xattr.c 2006-07-16 02:29:43.000000000 +0800
-+++ linux-stage/fs/ext3/xattr.c 2006-07-16 02:29:49.000000000 +0800
-@@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl
- ea_bdebug(bh, "refcount now=0; freeing");
- if (ce)
- mb_cache_entry_free(ce);
-- ext3_free_blocks(handle, inode, bh->b_blocknr, 1);
-+ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
- get_bh(bh);
- ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
- } else {
-@@ -805,7 +805,7 @@ inserted:
- new_bh = sb_getblk(sb, block);
- if (!new_bh) {
- getblk_failed:
-- ext3_free_blocks(handle, inode, block, 1);
-+ ext3_free_blocks(handle, inode, block, 1, 1);
- error = -EIO;
- goto cleanup;
- }
-Index: linux-stage/fs/ext3/balloc.c
-===================================================================
---- linux-stage.orig/fs/ext3/balloc.c 2006-07-16 02:29:43.000000000 +0800
-+++ linux-stage/fs/ext3/balloc.c 2006-07-16 02:33:13.000000000 +0800
-@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_
- *
- * Return buffer_head on success or NULL in case of failure.
- */
--static struct buffer_head *
-+struct buffer_head *
- read_block_bitmap(struct super_block *sb, unsigned int block_group)
- {
- struct ext3_group_desc * desc;
-@@ -490,24 +490,6 @@ error_return:
- return;
- }
-
--/* Free given blocks, update quota and i_blocks field */
--void ext3_free_blocks(handle_t *handle, struct inode *inode,
-- ext3_fsblk_t block, unsigned long count)
--{
-- struct super_block * sb;
-- unsigned long dquot_freed_blocks;
--
-- sb = inode->i_sb;
-- if (!sb) {
-- printk ("ext3_free_blocks: nonexistent device");
-- return;
-- }
-- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
-- if (dquot_freed_blocks)
-- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
-- return;
--}
--
- /*
- * For ext3 allocations, we must not reuse any blocks which are
- * allocated in the bitmap buffer's "last committed data" copy. This
-@@ -1463,7 +1445,7 @@ out:
- return 0;
- }
-
--ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
-+ext3_fsblk_t ext3_new_block_old(handle_t *handle, struct inode *inode,
- ext3_fsblk_t goal, int *errp)
- {
- unsigned long count = 1;
-Index: linux-stage/fs/ext3/super.c
-===================================================================
---- linux-stage.orig/fs/ext3/super.c 2006-07-16 02:29:43.000000000 +0800
-+++ linux-stage/fs/ext3/super.c 2006-07-16 02:29:49.000000000 +0800
-@@ -391,6 +391,7 @@ static void ext3_put_super (struct super
- struct ext3_super_block *es = sbi->s_es;
- int i;
-
-+ ext3_mb_release(sb);
- ext3_ext_release(sb);
- ext3_xattr_put_super(sb);
- journal_destroy(sbi->s_journal);
-@@ -641,7 +642,7 @@ enum {
- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
- Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
- Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
-- Opt_extents, Opt_extdebug,
-+ Opt_extents, Opt_extdebug, Opt_mballoc,
- Opt_grpquota
- };
-
-@@ -696,6 +697,7 @@ static match_table_t tokens = {
- {Opt_iopen_nopriv, "iopen_nopriv"},
- {Opt_extents, "extents"},
- {Opt_extdebug, "extdebug"},
-+ {Opt_mballoc, "mballoc"},
- {Opt_barrier, "barrier=%u"},
- {Opt_err, NULL},
- {Opt_resize, "resize"},
-@@ -1047,6 +1049,9 @@ clear_qf_name:
- case Opt_extdebug:
- set_opt (sbi->s_mount_opt, EXTDEBUG);
- break;
-+ case Opt_mballoc:
-+ set_opt (sbi->s_mount_opt, MBALLOC);
-+ break;
- default:
- printk (KERN_ERR
- "EXT3-fs: Unrecognized mount option \"%s\" "
-@@ -1773,6 +1778,7 @@ static int ext3_fill_super (struct super
- "writeback");
-
- ext3_ext_init(sb);
-+ ext3_mb_init(sb, needs_recovery);
- lock_kernel();
- return 0;
-
-@@ -2712,7 +2718,13 @@ static struct file_system_type ext3_fs_t
-
- static int __init init_ext3_fs(void)
- {
-- int err = init_ext3_xattr();
-+ int err;
++ /* Must be positive: zero and negative values are rejected with -ERANGE */
++ value = simple_strtol(str, NULL, 0);
++ if (value <= 0)
++ return -ERANGE;
+
-+ err = init_ext3_proc();
-+ if (err)
-+ return err;
++ ext3_mb_order2_reqs = value;
+
-+ err = init_ext3_xattr();
- if (err)
- return err;
- err = init_inodecache();
-@@ -2734,6 +2746,7 @@ static void __exit exit_ext3_fs(void)
- unregister_filesystem(&ext3_fs_type);
- destroy_inodecache();
- exit_ext3_xattr();
-+ exit_ext3_proc();
- }
-
- int ext3_prep_san_write(struct inode *inode, long *blocks,
++ return count;
++}
++
++int __init init_ext3_proc(void)
++{
++ struct proc_dir_entry *proc_ext3_mb_stats;
++ struct proc_dir_entry *proc_ext3_mb_max_to_scan;
++ struct proc_dir_entry *proc_ext3_mb_min_to_scan;
++ struct proc_dir_entry *proc_ext3_mb_order2_req;
++
++ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); /* parent directory for every entry below */
++ if (proc_root_ext3 == NULL) {
++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", EXT3_ROOT);
++ return -EIO;
++ }
++
++ /* Create EXT3_MB_STATS_NAME; on failure only the directory needs removing */
++ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ if (proc_ext3_mb_stats == NULL) {
++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n",
++ EXT3_MB_STATS_NAME);
++ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++ return -EIO;
++ }
++
++ proc_ext3_mb_stats->data = NULL;
++ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read;
++ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write;
++
++ /* Create EXT3_MB_MAX_TO_SCAN_NAME; on failure undo the stats entry too */
++ proc_ext3_mb_max_to_scan = create_proc_entry(
++ EXT3_MB_MAX_TO_SCAN_NAME,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ if (proc_ext3_mb_max_to_scan == NULL) {
++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n",
++ EXT3_MB_MAX_TO_SCAN_NAME);
++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++ return -EIO;
++ }
++
++ proc_ext3_mb_max_to_scan->data = NULL;
++ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read;
++ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write;
++
++ /* Create EXT3_MB_MIN_TO_SCAN_NAME; on failure undo both earlier entries */
++ proc_ext3_mb_min_to_scan = create_proc_entry(
++ EXT3_MB_MIN_TO_SCAN_NAME,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ if (proc_ext3_mb_min_to_scan == NULL) {
++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n",
++ EXT3_MB_MIN_TO_SCAN_NAME);
++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++ return -EIO;
++ }
++
++ proc_ext3_mb_min_to_scan->data = NULL;
++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read;
++ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write;
++
++ /* Create EXT3_MB_ORDER2_REQ; on failure undo all three earlier entries */
++ proc_ext3_mb_order2_req = create_proc_entry(
++ EXT3_MB_ORDER2_REQ,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ if (proc_ext3_mb_order2_req == NULL) {
++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n",
++ EXT3_MB_ORDER2_REQ);
++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++ return -EIO;
++ }
++
++ proc_ext3_mb_order2_req->data = NULL;
++ proc_ext3_mb_order2_req->read_proc = ext3_mb_order2_req_read;
++ proc_ext3_mb_order2_req->write_proc = ext3_mb_order2_req_write;
++
++ return 0; /* all four entries exist; every failure path above returns -EIO */
++}
++
++void exit_ext3_proc(void)
++{ /* Removes the four entries init_ext3_proc() creates, then EXT3_ROOT itself */
++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3);
++ remove_proc_entry(EXT3_ROOT, proc_root_fs); /* the directory goes last, after its entries */
++}
Index: linux-stage/fs/ext3/Makefile
===================================================================
--- linux-stage.orig/fs/ext3/Makefile 2006-07-16 02:29:43.000000000 +0800
ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
-Index: linux-stage/include/linux/ext3_fs.h
-===================================================================
---- linux-stage.orig/include/linux/ext3_fs.h 2006-07-16 02:29:43.000000000 +0800
-+++ linux-stage/include/linux/ext3_fs.h 2006-07-16 02:29:49.000000000 +0800
-@@ -53,6 +53,14 @@
- #define ext3_debug(f, a...) do {} while (0)
- #endif
-
-+#define EXT3_MULTIBLOCK_ALLOCATOR 1
-+
-+#define EXT3_MB_HINT_MERGE 1
-+#define EXT3_MB_HINT_RESERVED 2
-+#define EXT3_MB_HINT_METADATA 4
-+#define EXT3_MB_HINT_FIRST 8
-+#define EXT3_MB_HINT_BEST 16
-+
- /*
- * Special inodes numbers
- */
-@@ -379,6 +387,7 @@ struct ext3_inode {
- #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */
- #define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */
- #define EXT3_MOUNT_EXTDEBUG 0x2000000/* Extents debug */
-+#define EXT3_MOUNT_MBALLOC 0x4000000/* Buddy allocation support */
-
- /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
- #ifndef clear_opt
-@@ -749,12 +758,12 @@ ext3_group_first_block_no(struct super_b
- /* balloc.c */
- extern int ext3_bg_has_super(struct super_block *sb, int group);
- extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
--extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode,
-- ext3_fsblk_t goal, int *errp);
-+//extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode,
-+// ext3_fsblk_t goal, int *errp);
- extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode,
- ext3_fsblk_t goal, unsigned long *count, int *errp);
- extern void ext3_free_blocks (handle_t *handle, struct inode *inode,
-- ext3_fsblk_t block, unsigned long count);
-+ ext3_fsblk_t block, unsigned long count, int metadata);
- extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb,
- ext3_fsblk_t block, unsigned long count,
- unsigned long *pdquot_freed_blocks);
-@@ -881,6 +890,17 @@ extern void ext3_extents_initialize_bloc
- extern int ext3_ext_ioctl(struct inode *inode, struct file *filp,
- unsigned int cmd, unsigned long arg);
-
-+/* mballoc.c */
-+extern long ext3_mb_stats;
-+extern long ext3_mb_max_to_scan;
-+extern int ext3_mb_init(struct super_block *, int);
-+extern int ext3_mb_release(struct super_block *);
-+extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *);
-+extern int ext3_mb_reserve_blocks(struct super_block *, int);
-+extern void ext3_mb_release_blocks(struct super_block *, int);
-+int __init init_ext3_proc(void);
-+void exit_ext3_proc(void);
-+
- #endif /* __KERNEL__ */
-
- /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */
-Index: linux-stage/include/linux/ext3_fs_sb.h
-===================================================================
---- linux-stage.orig/include/linux/ext3_fs_sb.h 2006-07-16 02:29:43.000000000 +0800
-+++ linux-stage/include/linux/ext3_fs_sb.h 2006-07-16 02:29:49.000000000 +0800
-@@ -21,8 +21,14 @@
- #include <linux/wait.h>
- #include <linux/blockgroup_lock.h>
- #include <linux/percpu_counter.h>
-+#include <linux/list.h>
- #endif
- #include <linux/rbtree.h>
-+#include <linux/proc_fs.h>
-+
-+struct ext3_buddy_group_blocks;
-+struct ext3_mb_history;
-+#define EXT3_BB_MAX_BLOCKS
-
- /*
- * third extended-fs super-block data in memory
-@@ -78,6 +84,38 @@ struct ext3_sb_info {
- char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
- int s_jquota_fmt; /* Format of quota to use */
- #endif
-+
-+ /* for buddy allocator */
-+ struct ext3_group_info **s_group_info;
-+ struct inode *s_buddy_cache;
-+ long s_blocks_reserved;
-+ spinlock_t s_reserve_lock;
-+ struct list_head s_active_transaction;
-+ struct list_head s_closed_transaction;
-+ struct list_head s_committed_transaction;
-+ spinlock_t s_md_lock;
-+ tid_t s_last_transaction;
-+ int s_mb_factor;
-+ unsigned short *s_mb_offsets, *s_mb_maxs;
-+
-+ /* history to debug policy */
-+ struct ext3_mb_history *s_mb_history;
-+ int s_mb_history_cur;
-+ int s_mb_history_max;
-+ struct proc_dir_entry *s_mb_proc;
-+ spinlock_t s_mb_history_lock;
-+
-+ /* stats for buddy allocator */
-+ atomic_t s_bal_reqs; /* number of reqs with len > 1 */
-+ atomic_t s_bal_success; /* we found long enough chunks */
-+ atomic_t s_bal_allocated; /* in blocks */
-+ atomic_t s_bal_ex_scanned; /* total extents scanned */
-+ atomic_t s_bal_goals; /* goal hits */
-+ atomic_t s_bal_breaks; /* too long searches */
-+ atomic_t s_bal_2orders; /* 2^order hits */
-+ spinlock_t s_bal_lock;
-+ unsigned long s_mb_buddies_generated;
-+ unsigned long long s_mb_generation_time;
- };
-
- #endif /* _LINUX_EXT3_FS_SB */
-Index: linux-stage/fs/ext3/inode.c
-===================================================================
---- linux-stage.orig/fs/ext3/inode.c 2006-07-16 02:29:43.000000000 +0800
-+++ linux-stage/fs/ext3/inode.c 2006-07-16 02:29:49.000000000 +0800
-@@ -562,7 +562,7 @@ static int ext3_alloc_blocks(handle_t *h
- return ret;
- failed_out:
- for (i = 0; i <index; i++)
-- ext3_free_blocks(handle, inode, new_blocks[i], 1);
-+ ext3_free_blocks(handle, inode, new_blocks[i], 1, 1);
- return ret;
- }
-
-@@ -661,9 +661,9 @@ failed:
- ext3_journal_forget(handle, branch[i].bh);
- }
- for (i = 0; i <indirect_blks; i++)
-- ext3_free_blocks(handle, inode, new_blocks[i], 1);
-+ ext3_free_blocks(handle, inode, new_blocks[i], 1, 1);
-
-- ext3_free_blocks(handle, inode, new_blocks[i], num);
-+ ext3_free_blocks(handle, inode, new_blocks[i], num, 1);
-
- return err;
- }
-@@ -760,9 +760,9 @@ err_out:
- for (i = 1; i <= num; i++) {
- BUFFER_TRACE(where[i].bh, "call journal_forget");
- ext3_journal_forget(handle, where[i].bh);
-- ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1);
-+ ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1,1);
- }
-- ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
-+ ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 1);
-
- return err;
- }
-@@ -2007,7 +2007,7 @@ static void ext3_clear_blocks(handle_t *
- }
- }
-
-- ext3_free_blocks(handle, inode, block_to_free, count);
-+ ext3_free_blocks(handle, inode, block_to_free, count, 1);
- }
-
- /**
-@@ -2180,7 +2180,7 @@ static void ext3_free_branches(handle_t
- ext3_journal_test_restart(handle, inode);
- }
-
-- ext3_free_blocks(handle, inode, nr, 1);
-+ ext3_free_blocks(handle, inode, nr, 1, 1);
-
- if (parent_bh) {
- /*