From f49753ed4ed0b16d7da336204bc3eadb278d78c1 Mon Sep 17 00:00:00 2001 From: adilger Date: Wed, 14 Feb 2007 04:56:39 +0000 Subject: [PATCH] Branch HEAD Fix 2.6.18 ext3 mballoc, extents patches for several omissions. Reorder patch hunks to match other patches to facilitate comparisons. b=10090 ldiskfs corruption under memory pressure, coverity fixes b=6191 mballoc fails on x86_64 > 2TB b=10634 mballoc using wrong find_next_bit() on big endian systems b=6449 don't print "too long searching" message to console Compile tested. --- .../patches/ext3-extents-2.6.18-vanilla.patch | 10 +- .../patches/ext3-mballoc2-2.6.18-vanilla.patch | 1333 ++++++++++++-------- .../patches/ext3-extents-2.6.18-vanilla.patch | 10 +- .../patches/ext3-mballoc2-2.6.18-vanilla.patch | 1333 ++++++++++++-------- 4 files changed, 1674 insertions(+), 1012 deletions(-) diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch index f2988a2..8bd7acb 100644 --- a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch +++ b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch @@ -2538,26 +2538,30 @@ Index: linux-stage/fs/ext3/super.c Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -+ Opt_extents, Opt_extdebug, ++ Opt_extents, Opt_noextents, Opt_extdebug, Opt_grpquota }; -@@ -690,6 +694,8 @@ static match_table_t tokens = { +@@ -690,6 +694,9 @@ static match_table_t tokens = { {Opt_iopen, "iopen"}, {Opt_noiopen, "noiopen"}, {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, ++ {Opt_noextents, "noextents"}, + {Opt_extdebug, "extdebug"}, {Opt_barrier, "barrier=%u"}, {Opt_err, NULL}, {Opt_resize, "resize"}, -@@ -1035,6 +1041,12 @@ clear_qf_name: +@@ -1035,6 +1041,15 @@ clear_qf_name: case Opt_bh: clear_opt(sbi->s_mount_opt, NOBH); break; + case Opt_extents: + set_opt (sbi->s_mount_opt, EXTENTS); + break; ++ case Opt_noextents: ++ clear_opt (sbi->s_mount_opt, EXTENTS); ++ break; + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch index 20fa78a..d83625a 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch @@ -1,8 +1,400 @@ +Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/include/linux/ext3_fs.h 2006-07-16 02:29:49.000000000 +0800 +@@ -53,6 +53,14 @@ + #define ext3_debug(f, a...) do {} while (0) + #endif + ++#define EXT3_MULTIBLOCK_ALLOCATOR 1 ++ ++#define EXT3_MB_HINT_MERGE 1 ++#define EXT3_MB_HINT_RESERVED 2 ++#define EXT3_MB_HINT_METADATA 4 ++#define EXT3_MB_HINT_FIRST 8 ++#define EXT3_MB_HINT_BEST 16 ++ + /* + * Special inodes numbers + */ +@@ -379,6 +387,7 @@ struct ext3_inode { + #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x2000000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x4000000/* Buddy allocation support */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -405,6 +413,14 @@ + #define ext3_find_first_zero_bit ext2_find_first_zero_bit + #define ext3_find_next_zero_bit ext2_find_next_zero_bit + ++#ifndef ext2_find_next_le_bit ++#ifdef __LITTLE_ENDIAN ++#define ext2_find_next_le_bit(addr, size, off) find_next_bit((addr), (size), (off)) ++#else ++#error "mballoc needs a patch for big-endian systems - CFS bug 10634" ++#endif /* __LITTLE_ENDIAN */ ++#endif /* !ext2_find_next_le_bit */ ++ + /* + * Maximal mount counts between two filesystem checks + */ +@@ -749,12 +758,12 @@ ext3_group_first_block_no(struct super_b + /* balloc.c */ + extern int ext3_bg_has_super(struct super_block *sb, int group); + extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); +-extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode, ++extern ext3_fsblk_t ext3_new_block_old(handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, int *errp); + extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, unsigned long *count, int *errp); + extern void ext3_free_blocks (handle_t *handle, struct inode *inode, +- ext3_fsblk_t block, unsigned long count); ++ ext3_fsblk_t block, unsigned long count, int metadata); + extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb, + ext3_fsblk_t block, unsigned long count, + unsigned long *pdquot_freed_blocks); +@@ -881,6 +890,21 @@ extern void ext3_extents_initialize_bloc + extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); + ++/* mballoc.c */ ++extern long ext3_mb_stats; ++extern long ext3_mb_max_to_scan; ++extern int ext3_mb_init(struct super_block *sb, int needs_recovery); ++extern int ext3_mb_release(struct super_block *sb); ++extern ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode, ++ ext3_fsblk_t goal, int *errp); ++extern ext3_fsblk_t ext3_mb_new_blocks(handle_t *handle, struct inode *inode, ++ ext3_fsblk_t goal, int *len, int flags, ++ int *errp); ++extern int ext3_mb_reserve_blocks(struct super_block *sb, int); ++extern void ext3_mb_release_blocks(struct super_block *sb, int); ++int __init init_ext3_proc(void); ++void exit_ext3_proc(void); ++ + #endif /* __KERNEL__ */ + + /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ +Index: linux-stage/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_sb.h 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/include/linux/ext3_fs_sb.h 2006-07-16 02:29:49.000000000 +0800 +@@ -21,8 +21,14 @@ + #include + #include + #include ++#include + #endif + #include ++#include ++ ++struct ext3_buddy_group_blocks; ++struct ext3_mb_history; ++#define EXT3_BB_MAX_BLOCKS + + /* + * third extended-fs super-block data in memory +@@ -78,6 +84,43 @@ struct ext3_sb_info { + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ + #endif ++ ++ /* for buddy allocator */ ++ struct ext3_group_info ***s_group_info; ++ struct inode *s_buddy_cache; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct list_head s_committed_transaction; ++ spinlock_t s_md_lock; ++ tid_t s_last_transaction; ++ int s_mb_factor; ++ unsigned short *s_mb_offsets, *s_mb_maxs; ++ unsigned long s_stripe; ++ ++ /* history to debug policy */ ++ struct ext3_mb_history *s_mb_history; ++ int s_mb_history_cur; ++ int s_mb_history_max; ++ struct proc_dir_entry *s_mb_proc; ++ spinlock_t s_mb_history_lock; ++ ++ /* stats for buddy allocator */ ++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ ++ atomic_t s_bal_success; /* we found long enough chunks */ ++ atomic_t s_bal_allocated; /* in blocks */ ++ atomic_t s_bal_ex_scanned; /* total extents scanned */ ++ atomic_t s_bal_goals; /* goal hits */ ++ atomic_t s_bal_breaks; /* too long searches */ ++ atomic_t s_bal_2orders; /* 2^order hits */ ++ spinlock_t s_bal_lock; ++ unsigned long s_mb_buddies_generated; ++ unsigned long long s_mb_generation_time; + }; ++ ++#define EXT3_GROUP_INFO(sb, group) \ ++ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \ ++ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)] + + #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/super.c 2006-07-16 02:29:49.000000000 +0800 +@@ -391,6 +391,7 @@ static void ext3_put_super (struct super + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_mb_release(sb); + ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -642,6 +643,7 @@ enum { + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_extents, Opt_noextents, Opt_extdebug, ++ Opt_mballoc, Opt_nomballoc, Opt_stripe, + Opt_grpquota + }; + +@@ -696,6 +697,9 @@ static match_table_t tokens = { + {Opt_extents, "extents"}, + {Opt_noextents, "noextents"}, + {Opt_extdebug, "extdebug"}, ++ {Opt_mballoc, "mballoc"}, ++ {Opt_nomballoc, "nomballoc"}, ++ {Opt_stripe, "stripe=%u"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -1047,6 +1049,19 @@ clear_qf_name: + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; ++ case Opt_mballoc: ++ set_opt (sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_nomballoc: ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_stripe: ++ if (match_int(&args[0], &option)) ++ return 0; ++ if (option < 0) ++ return 0; ++ sbi->s_stripe = option; ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1773,6 +1778,7 @@ static int ext3_fill_super (struct super + "writeback"); + + ext3_ext_init(sb); ++ ext3_mb_init(sb, needs_recovery); + lock_kernel(); + return 0; + +@@ -2712,7 +2718,13 @@ static struct file_system_type ext3_fs_t + + static int __init init_ext3_fs(void) + { +- int err = init_ext3_xattr(); ++ int err; ++ ++ err = init_ext3_proc(); ++ if (err) ++ return err; ++ ++ err = init_ext3_xattr(); + if (err) + return err; + err = init_inodecache(); +@@ -2734,6 +2746,7 @@ static void __exit exit_ext3_fs(void) + unregister_filesystem(&ext3_fs_type); + destroy_inodecache(); + exit_ext3_xattr(); ++ exit_ext3_proc(); + } + + int ext3_prep_san_write(struct inode *inode, long *blocks, +Index: linux-stage/fs/ext3/extents.c +=================================================================== +--- linux-stage.orig/fs/ext3/extents.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/extents.c 2006-07-16 02:29:49.000000000 +0800 +@@ -771,7 +771,7 @@ cleanup: + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; +- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); + } + } + kfree(ablocks); +@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st + path->p_idx->ei_leaf); + bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); + ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); +- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); + return err; + } + +@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t + int needed = ext3_remove_blocks_credits(tree, ex, from, to); + handle_t *handle = ext3_journal_start(tree->inode, needed); + struct buffer_head *bh; +- int i; ++ int i, metadata = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); ++ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) ++ metadata = 1; + if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { + /* tail removal */ + unsigned long num, start; +@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t + bh = sb_find_get_block(tree->inode->i_sb, start + i); + ext3_forget(handle, 0, tree->inode, bh, start + i); + } +- ext3_free_blocks(handle, tree->inode, start, num); ++ ext3_free_blocks(handle, tree->inode, start, num, metadata); + } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { + printk("strange request: removal %lu-%lu from %u:%u\n", + from, to, ex->ee_block, ex->ee_len); +Index: linux-stage/fs/ext3/inode.c +=================================================================== +--- linux-stage.orig/fs/ext3/inode.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/inode.c 2006-07-16 02:29:49.000000000 +0800 +@@ -562,7 +562,7 @@ static int ext3_alloc_blocks(handle_t *h + return ret; + failed_out: + for (i = 0; i i_sb; +- if (!sb) { +- printk ("ext3_free_blocks: nonexistent device"); +- return; +- } +- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); +- if (dquot_freed_blocks) +- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); +- return; +-} +- + /* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This +@@ -1463,7 +1445,7 @@ out: + return 0; + } + +-ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode, ++ext3_fsblk_t ext3_new_block_old(handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, int *errp) + { + unsigned long count = 1; +Index: linux-stage/fs/ext3/xattr.c +=================================================================== +--- linux-stage.orig/fs/ext3/xattr.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/xattr.c 2006-07-16 02:29:49.000000000 +0800 +@@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl + ea_bdebug(bh, "refcount now=0; freeing"); + if (ce) + mb_cache_entry_free(ce); +- ext3_free_blocks(handle, inode, bh->b_blocknr, 1); ++ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1); + get_bh(bh); + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + } else { +@@ -805,7 +805,7 @@ inserted: + new_bh = sb_getblk(sb, block); + if (!new_bh) { + getblk_failed: +- ext3_free_blocks(handle, inode, block, 1); ++ ext3_free_blocks(handle, inode, block, 1, 1); + error = -EIO; + goto cleanup; + } Index: linux-stage/fs/ext3/mballoc.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-stage/fs/ext3/mballoc.c 2006-07-16 02:29:49.000000000 +0800 -@@ -0,0 +1,2434 @@ +@@ -0,0 +1,2727 @@ +/* + * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -91,6 +483,11 @@ Index: linux-stage/fs/ext3/mballoc.c + +long ext3_mb_stats = 1; + ++/* ++ * for which requests use 2^N search using buddies ++ */ ++long ext3_mb_order2_reqs = 8; ++ +#ifdef EXT3_BB_MAX_BLOCKS +#undef EXT3_BB_MAX_BLOCKS +#endif @@ -147,6 +544,9 @@ Index: linux-stage/fs/ext3/mballoc.c + __u8 ac_repeats; + __u8 ac_2order; /* if request is to allocate 2^N blocks and + * N > 0, the field stores N, otherwise 0 */ ++ ++ struct page *ac_buddy_page; ++ struct page *ac_bitmap_page; +}; + +#define AC_STATUS_CONTINUE 1 @@ -156,6 +556,8 @@ Index: linux-stage/fs/ext3/mballoc.c +struct ext3_mb_history { + struct ext3_free_extent goal; /* goal allocation */ + struct ext3_free_extent result; /* result allocation */ ++ unsigned pid; ++ unsigned ino; + __u16 found; /* how many extents have been found */ + __u16 groups; /* how many groups have been scanned */ + __u16 tail; /* what tail broke some buddy */ @@ -178,9 +580,9 @@ Index: linux-stage/fs/ext3/mballoc.c +#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) + +#ifndef EXT3_MB_HISTORY -+#define ext3_mb_store_history(sb,ac) ++#define ext3_mb_store_history(sb,ino,ac) +#else -+static void ext3_mb_store_history(struct super_block *, ++static void ext3_mb_store_history(struct super_block *, unsigned ino, + struct ext3_allocation_context *ac); +#endif + @@ -190,7 +592,6 @@ Index: linux-stage/fs/ext3/mballoc.c + +int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); +struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); -+int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); +int ext3_mb_reserve_blocks(struct super_block *, int); +void ext3_mb_release_blocks(struct super_block *, int); +void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); @@ -424,8 +825,9 @@ Index: linux-stage/fs/ext3/mballoc.c + +static void +ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, -+ struct ext3_group_info *grp) ++ int group) +{ ++ struct ext3_group_info *grp = EXT3_GROUP_INFO(sb, group); + unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); + unsigned short i = 0, first, len; + unsigned free = 0, fragments = 0; @@ -436,7 +838,7 @@ Index: linux-stage/fs/ext3/mballoc.c + while (i < max) { + fragments++; + first = i; -+ i = find_next_bit(bitmap, max, i); ++ i = ext2_find_next_le_bit(bitmap, max, i); + len = i - first; + free += len; + if (len > 1) @@ -452,8 +854,8 @@ Index: linux-stage/fs/ext3/mballoc.c + * others waits for init completion on page lock */ + clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); + if (free != grp->bb_free) { -+ printk("EXT3-fs: %u blocks in bitmap, %u in group descriptor\n", -+ free, grp->bb_free); ++ printk("EXT3-fs: group %u: %u blocks in bitmap, %u in gd\n", ++ group, free, grp->bb_free); + grp->bb_free = free; + } + @@ -534,7 +936,10 @@ Index: linux-stage/fs/ext3/mballoc.c + for (i = 0; i < groups_per_page && bh[i]; i++) + wait_on_buffer(bh[i]); + -+ /* XXX: I/O error handling here */ ++ err = -EIO; ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ if (!buffer_uptodate(bh[i])) ++ goto out; + + first_block = page->index * blocks_per_page; + for (i = 0; i < blocks_per_page; i++) { @@ -552,11 +957,10 @@ Index: linux-stage/fs/ext3/mballoc.c + mb_debug("put buddy for group %u in page %lu/%x\n", + group, page->index, i * blocksize); + memset(data, 0xff, blocksize); -+ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0; -+ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0, ++ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0; ++ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0, + sizeof(unsigned short)*(sb->s_blocksize_bits+2)); -+ ext3_mb_generate_buddy(sb, data, bitmap, -+ EXT3_SB(sb)->s_group_info[group]); ++ ext3_mb_generate_buddy(sb, data, bitmap, group); + } else { + /* this is block of bitmap */ + mb_debug("put bitmap for group %u in page %lu/%x\n", @@ -567,10 +971,12 @@ Index: linux-stage/fs/ext3/mballoc.c + SetPageUptodate(page); + +out: -+ for (i = 0; i < groups_per_page && bh[i]; i++) -+ brelse(bh[i]); -+ if (bh && bh != &bhs) -+ kfree(bh); ++ if (bh) { ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ brelse(bh[i]); ++ if (bh != &bhs) ++ kfree(bh); ++ } + return err; +} + @@ -587,7 +993,7 @@ Index: linux-stage/fs/ext3/mballoc.c + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + + e3b->bd_blkbits = sb->s_blocksize_bits; -+ e3b->bd_info = sbi->s_group_info[group]; ++ e3b->bd_info = EXT3_GROUP_INFO(sb, group); + e3b->bd_sb = sb; + e3b->bd_group = group; + e3b->bd_buddy_page = NULL; @@ -597,12 +1003,15 @@ Index: linux-stage/fs/ext3/mballoc.c + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + ++ /* we could use find_or_create_page(), but it locks page ++ * what we'd like to avoid in fast path ... */ + page = find_get_page(inode->i_mapping, pnum); + if (page == NULL || !PageUptodate(page)) { + if (page) + page_cache_release(page); + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { ++ BUG_ON(page->mapping != inode->i_mapping); + if (!PageUptodate(page)) + ext3_mb_init_cache(page); + unlock_page(page); @@ -624,6 +1033,7 @@ Index: linux-stage/fs/ext3/mballoc.c + page_cache_release(page); + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { ++ BUG_ON(page->mapping != inode->i_mapping); + if (!PageUptodate(page)) + ext3_mb_init_cache(page); + unlock_page(page); @@ -663,14 +1073,14 @@ Index: linux-stage/fs/ext3/mballoc.c +ext3_lock_group(struct super_block *sb, int group) +{ + bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++ &EXT3_GROUP_INFO(sb, group)->bb_state); +} + +static inline void +ext3_unlock_group(struct super_block *sb, int group) +{ + bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++ &EXT3_GROUP_INFO(sb, group)->bb_state); +} + +static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) @@ -800,7 +1210,7 @@ Index: linux-stage/fs/ext3/mballoc.c +static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, + int needed, struct ext3_free_extent *ex) +{ -+ int next, max, ord; ++ int next = block, max, ord; + void *buddy; + + J_ASSERT(ex != NULL); @@ -825,6 +1235,11 @@ Index: linux-stage/fs/ext3/mballoc.c + ex->fe_start = block << order; + ex->fe_group = e3b->bd_group; + ++ /* calc difference from given start */ ++ next = next - ex->fe_start; ++ ex->fe_len -= next; ++ ex->fe_start += next; ++ + while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { + + if (block + 1 >= max) @@ -928,6 +1343,13 @@ Index: linux-stage/fs/ext3/mballoc.c + ac->ac_status = AC_STATUS_FOUND; + ac->ac_tail = ret & 0xffff; + ac->ac_buddy = ret >> 16; ++ ++ /* hold in-core structures until allocated ++ * blocks are marked non-free in on-disk bitmap */ ++ ac->ac_buddy_page = e3b->bd_buddy_page; ++ page_cache_get(e3b->bd_buddy_page); ++ ac->ac_bitmap_page = e3b->bd_bitmap_page; ++ page_cache_get(e3b->bd_bitmap_page); +} + +/* @@ -963,7 +1385,7 @@ Index: linux-stage/fs/ext3/mballoc.c + } + + /* -+ * Let's check whether the chuck is good enough ++ * Let's check whether the chunk is good enough + */ + if (ex->fe_len == gex->fe_len) { + *bex = *ex; @@ -1037,6 +1459,8 @@ Index: linux-stage/fs/ext3/mballoc.c + struct ext3_buddy *e3b) +{ + int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); ++ struct ext3_super_block *es = sbi->s_es; + struct ext3_free_extent ex; + + err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); @@ -1047,7 +1471,25 @@ Index: linux-stage/fs/ext3/mballoc.c + max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, + ac->ac_g_ex.fe_len, &ex); + -+ if (max > 0) { ++ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ++ unsigned long start; ++ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) + ++ ex.fe_start + le32_to_cpu(es->s_first_data_block)); ++ if (start % sbi->s_stripe == 0) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ } else if (max >= ac->ac_g_ex.fe_len) { ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } else if (max > 0 && (ac->ac_flags & EXT3_MB_HINT_MERGE)) { ++ /* Sometimes, caller may want to merge even small ++ * number of blocks to an existing extent */ + J_ASSERT(ex.fe_len > 0); + J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); + J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); @@ -1075,7 +1517,7 @@ Index: linux-stage/fs/ext3/mballoc.c + int i, k, max; + + J_ASSERT(ac->ac_2order > 0); -+ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) { ++ for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { + if (grp->bb_counters[i] == 0) + continue; + @@ -1140,11 +1582,46 @@ Index: linux-stage/fs/ext3/mballoc.c + } +} + ++/* ++ * This is a special case for storages like raid5 ++ * we try to find stripe-aligned chunks for stripe-size requests ++ */ ++static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ unsigned long i, max; ++ ++ J_ASSERT(sbi->s_stripe != 0); ++ ++ /* find first stripe-aligned block */ ++ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + le32_to_cpu(sbi->s_es->s_first_data_block); ++ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe; ++ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block)) ++ % EXT3_BLOCKS_PER_GROUP(sb); ++ ++ while (i < sb->s_blocksize * 8) { ++ if (!mb_test_bit(i, bitmap)) { ++ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex); ++ if (max >= sbi->s_stripe) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ break; ++ } ++ } ++ i += sbi->s_stripe; ++ } ++} ++ +static int ext3_mb_good_group(struct ext3_allocation_context *ac, + int group, int cr) +{ -+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); -+ struct ext3_group_info *grp = sbi->s_group_info[group]; ++ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group); + unsigned free, fragments, i, bits; + + J_ASSERT(cr >= 0 && cr < 4); @@ -1161,15 +1638,18 @@ Index: linux-stage/fs/ext3/mballoc.c + case 0: + J_ASSERT(ac->ac_2order != 0); + bits = ac->ac_sb->s_blocksize_bits + 1; -+ for (i = ac->ac_2order; i < bits; i++) ++ for (i = ac->ac_2order; i <= bits; i++) + if (grp->bb_counters[i] > 0) + return 1; ++ break; + case 1: + if ((free / fragments) >= ac->ac_g_ex.fe_len) + return 1; ++ break; + case 2: + if (free >= ac->ac_g_ex.fe_len) + return 1; ++ break; + case 3: + return 1; + default: @@ -1234,6 +1714,9 @@ Index: linux-stage/fs/ext3/mballoc.c + } + } + ++ ac.ac_buddy_page = NULL; ++ ac.ac_bitmap_page = NULL; ++ + /* + * Check quota for allocation of this blocks. + */ @@ -1270,23 +1753,27 @@ Index: linux-stage/fs/ext3/mballoc.c + ac.ac_2order = 0; + ac.ac_criteria = 0; + ++ if (*len == 1 && sbi->s_stripe) { ++ /* looks like a metadata, let's use a dirty hack for raid5 ++ * move all metadata in first groups in hope to hit cached ++ * sectors and thus avoid read-modify cycles in raid5 */ ++ ac.ac_g_ex.fe_group = group = 0; ++ } ++ + /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */ + i = ffs(*len); -+ if (i >= 8) { ++ if (i >= ext3_mb_order2_reqs) { + i--; + if ((*len & (~(1 << i))) == 0) + ac.ac_2order = i; + } + -+ /* Sometimes, caller may want to merge even small -+ * number of blocks to an existing extent */ -+ if (ac.ac_flags & EXT3_MB_HINT_MERGE) { -+ err = ext3_mb_find_by_goal(&ac, &e3b); -+ if (err) -+ goto out_err; -+ if (ac.ac_status == AC_STATUS_FOUND) -+ goto found; -+ } ++ /* first, try the goal */ ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; + + /* Let's just scan groups to find more-less suitable blocks */ + cr = ac.ac_2order ? 0 : 1; @@ -1297,7 +1784,7 @@ Index: linux-stage/fs/ext3/mballoc.c + if (group == EXT3_SB(sb)->s_groups_count) + group = 0; + -+ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) { ++ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) { + /* we need full data about the group + * to make a good selection */ + err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); @@ -1325,6 +1812,8 @@ Index: linux-stage/fs/ext3/mballoc.c + ac.ac_groups_scanned++; + if (cr == 0) + ext3_mb_simple_scan_group(&ac, &e3b); ++ else if (cr == 1 && *len == sbi->s_stripe) ++ ext3_mb_scan_aligned(&ac, &e3b); + else + ext3_mb_complex_scan_group(&ac, &e3b); + @@ -1332,8 +1821,6 @@ Index: linux-stage/fs/ext3/mballoc.c + + ext3_mb_release_desc(&e3b); + -+ if (err) -+ goto out_err; + if (ac.ac_status != AC_STATUS_CONTINUE) + break; + } @@ -1347,7 +1834,7 @@ Index: linux-stage/fs/ext3/mballoc.c + */ + + /*if (ac.ac_found > ext3_mb_max_to_scan) -+ printk(KERN_ERR "EXT3-fs: too long searching at " ++ printk(KERN_DEBUG "EXT3-fs: too long searching at " + "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len, + ac.ac_g_ex.fe_len);*/ + ext3_mb_try_best_found(&ac, &e3b); @@ -1356,7 +1843,7 @@ Index: linux-stage/fs/ext3/mballoc.c + * Someone more lucky has already allocated it. + * The only thing we can do is just take first + * found block(s) -+ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n"); ++ printk(KERN_DEBUG "EXT3-fs: someone won our chunk\n"); + */ + ac.ac_b_ex.fe_group = 0; + ac.ac_b_ex.fe_start = 0; @@ -1376,17 +1863,16 @@ Index: linux-stage/fs/ext3/mballoc.c + *errp = -ENOSPC; + block = 0; +#if 1 -+ printk(KERN_ERR "EXT3-fs: cant allocate: status %d, flags %d\n", ++ printk(KERN_ERR "EXT3-fs: can't allocate: status %d flags %d\n", + ac.ac_status, ac.ac_flags); -+ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n", ++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d cr %d\n", + ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, + ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); + printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", + sbi->s_blocks_reserved, ac.ac_found); + printk("EXT3-fs: groups: "); + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) -+ printk("%d: %d ", i, -+ sbi->s_group_info[i]->bb_free); ++ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free); + printk("\n"); +#endif + goto out; @@ -1484,6 +1970,11 @@ Index: linux-stage/fs/ext3/mballoc.c + *errp = err; + block = 0; +out: ++ if (ac.ac_buddy_page) ++ page_cache_release(ac.ac_buddy_page); ++ if (ac.ac_bitmap_page) ++ page_cache_release(ac.ac_bitmap_page); ++ + if (!(flags & EXT3_MB_HINT_RESERVED)) { + /* block wasn't reserved before and we reserved it + * at the beginning of allocation. it doesn't matter @@ -1507,7 +1998,7 @@ Index: linux-stage/fs/ext3/mballoc.c + atomic_inc(&sbi->s_bal_breaks); + } + -+ ext3_mb_store_history(sb, &ac); ++ ext3_mb_store_history(sb, inode->i_ino, &ac); + + return block; +} @@ -1572,9 +2063,9 @@ Index: linux-stage/fs/ext3/mballoc.c + char buf[20], buf2[20]; + + if (v == SEQ_START_TOKEN) { -+ seq_printf(seq, "%-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", -+ "goal", "result", "found", "grps", "cr", "merge", -+ "tail", "broken"); ++ seq_printf(seq, "%-5s %-8s %-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", ++ "pid", "inode", "goal", "result", "found", "grps", "cr", ++ "merge", "tail", "broken"); + return 0; + } + @@ -1582,9 +2073,9 @@ Index: linux-stage/fs/ext3/mballoc.c + hs->goal.fe_start, hs->goal.fe_len); + sprintf(buf2, "%u/%u/%u", hs->result.fe_group, + hs->result.fe_start, hs->result.fe_len); -+ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf, -+ buf2, hs->found, hs->groups, hs->cr, -+ hs->merged ? "M" : "", hs->tail, ++ seq_printf(seq, "%-5u %-8u %-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", ++ hs->pid, hs->ino, buf, buf2, hs->found, hs->groups, ++ hs->cr, hs->merged ? "M" : "", hs->tail, + hs->buddy ? 1 << hs->buddy : 0); + return 0; +} @@ -1652,12 +2143,107 @@ Index: linux-stage/fs/ext3/mballoc.c + .release = ext3_mb_seq_history_release, +}; + ++static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ long group; ++ ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ ++ group = *pos + 1; ++ return (void *) group; ++} ++ ++static void *ext3_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ long group; ++ ++ ++*pos; ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ group = *pos + 1; ++ return (void *) group;; ++} ++ ++static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v) ++{ ++ struct super_block *sb = seq->private; ++ long group = (long) v, i; ++ struct sg { ++ struct ext3_group_info info; ++ unsigned short counters[16]; ++ } sg; ++ ++ group--; ++ if (group == 0) ++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s [ %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", ++ "group", "free", "frags", "first", "2^0", "2^1", "2^2", ++ "2^3", "2^4", "2^5", "2^6", "2^7", "2^8", "2^9", "2^10", ++ "2^11", "2^12", "2^13"); ++ ++ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + ++ sizeof(struct ext3_group_info); ++ ext3_lock_group(sb, group); ++ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i); ++ ext3_unlock_group(sb, group); ++ ++ if (EXT3_MB_GRP_NEED_INIT(&sg.info)) ++ return 0; ++ ++ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, ++ sg.info.bb_fragments, sg.info.bb_first_free); ++ for (i = 0; i <= 13; i++) ++ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? ++ sg.info.bb_counters[i] : 0); ++ seq_printf(seq, " ]\n"); ++ ++ return 0; ++} ++ ++static void ext3_mb_seq_groups_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations ext3_mb_seq_groups_ops = { ++ .start = ext3_mb_seq_groups_start, ++ .next = ext3_mb_seq_groups_next, ++ .stop = ext3_mb_seq_groups_stop, ++ .show = ext3_mb_seq_groups_show, ++}; ++ ++static int ext3_mb_seq_groups_open(struct inode *inode, struct file *file) ++{ ++ struct super_block *sb = PDE(inode)->data; ++ int rc; ++ ++ rc = seq_open(file, &ext3_mb_seq_groups_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = sb; ++ } ++ return rc; ++ ++} ++ ++static struct file_operations ext3_mb_seq_groups_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_groups_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ +static void ext3_mb_history_release(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char name[64]; + + snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ remove_proc_entry("mb_groups", sbi->s_mb_proc); + remove_proc_entry("mb_history", sbi->s_mb_proc); + remove_proc_entry(name, proc_root_ext3); + @@ -1680,6 +2266,11 @@ Index: linux-stage/fs/ext3/mballoc.c + p->proc_fops = &ext3_mb_seq_history_fops; + p->data = sb; + } ++ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_groups_fops; ++ p->data = sb; ++ } + } + + sbi->s_mb_history_max = 1000; @@ -1692,7 +2283,8 @@ Index: linux-stage/fs/ext3/mballoc.c +} + +static void -+ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac) ++ext3_mb_store_history(struct super_block *sb, unsigned ino, ++ struct ext3_allocation_context *ac) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_mb_history h; @@ -1700,6 +2292,8 @@ Index: linux-stage/fs/ext3/mballoc.c + if (likely(sbi->s_mb_history == NULL)) + return; + ++ h.pid = current->pid; ++ h.ino = ino; + h.goal = ac->ac_g_ex; + h.result = ac->ac_b_ex; + h.found = ac->ac_found; @@ -1727,21 +2321,40 @@ Index: linux-stage/fs/ext3/mballoc.c +int ext3_mb_init_backend(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i, len; -+ -+ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count; -+ sbi->s_group_info = kmalloc(len, GFP_KERNEL); ++ int i, j, len, metalen; ++ int num_meta_group_infos = ++ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ struct ext3_group_info **meta_group_info; ++ ++ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte ++ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. ++ * So a two level scheme suffices for now. */ ++ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * ++ num_meta_group_infos, GFP_KERNEL); + if (sbi->s_group_info == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n"); ++ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n"); + return -ENOMEM; + } -+ memset(sbi->s_group_info, 0, len); -+ + sbi->s_buddy_cache = new_inode(sb); + if (sbi->s_buddy_cache == NULL) { + printk(KERN_ERR "EXT3-fs: can't get new inode\n"); -+ kfree(sbi->s_group_info); -+ return -ENOMEM; ++ goto err_freesgi; ++ } ++ ++ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) { ++ if ((i + 1) == num_meta_group_infos) ++ metalen = sizeof(*meta_group_info) * ++ (sbi->s_groups_count - ++ (i << EXT3_DESC_PER_BLOCK_BITS(sb))); ++ meta_group_info = kmalloc(metalen, GFP_KERNEL); ++ if (meta_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate mem for a " ++ "buddy group\n"); ++ goto err_freemeta; ++ } ++ sbi->s_group_info[i] = meta_group_info; + } + + /* @@ -1753,30 +2366,42 @@ Index: linux-stage/fs/ext3/mballoc.c + for (i = 0; i < sbi->s_groups_count; i++) { + struct ext3_group_desc * desc; + -+ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL); -+ if (sbi->s_group_info[i] == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); -+ goto err_out; ++ meta_group_info = ++ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)]; ++ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1); ++ ++ meta_group_info[j] = kmalloc(len, GFP_KERNEL); ++ if (meta_group_info[j] == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n"); ++ i--; ++ goto err_freebuddy; + } + desc = ext3_get_group_desc(sb, i, NULL); + if (desc == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant read descriptor %u\n", i); -+ goto err_out; ++ printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i); ++ goto err_freebuddy; + } -+ memset(sbi->s_group_info[i], 0, len); ++ memset(meta_group_info[j], 0, len); + set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, -+ &sbi->s_group_info[i]->bb_state); -+ sbi->s_group_info[i]->bb_free = ++ &meta_group_info[j]->bb_state); ++ meta_group_info[j]->bb_free = + le16_to_cpu(desc->bg_free_blocks_count); + } + + return 0; + -+err_out: ++err_freebuddy: ++ while (i >= 0) { ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ i--; ++ } ++ i = num_meta_group_infos; ++err_freemeta: + while (--i >= 0) + kfree(sbi->s_group_info[i]); + iput(sbi->s_buddy_cache); -+ ++err_freesgi: ++ kfree(sbi->s_group_info); + return -ENOMEM; +} + @@ -1818,7 +2443,6 @@ Index: linux-stage/fs/ext3/mballoc.c + max = max >> 1; + i++; + } while (i <= sb->s_blocksize_bits + 1); -+ + + /* init file for buddy data */ + if ((i = ext3_mb_init_backend(sb))) { @@ -1855,7 +2479,7 @@ Index: linux-stage/fs/ext3/mballoc.c +int ext3_mb_release(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i; ++ int i, num_meta_group_infos; + + if (!test_opt(sb, MBALLOC)) + return 0; @@ -1870,11 +2494,13 @@ Index: linux-stage/fs/ext3/mballoc.c + ext3_mb_free_committed_blocks(sb); + + if (sbi->s_group_info) { -+ for (i = 0; i < sbi->s_groups_count; i++) { -+ if (sbi->s_group_info[i] == NULL) -+ continue; ++ for (i = 0; i < sbi->s_groups_count; i++) ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ num_meta_group_infos = (sbi->s_groups_count + ++ EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) + kfree(sbi->s_group_info[i]); -+ } + kfree(sbi->s_group_info); + } + if (sbi->s_mb_offsets) @@ -1936,6 +2562,7 @@ Index: linux-stage/fs/ext3/mballoc.c + md->num, md->group, md); + + err = ext3_mb_load_buddy(sb, md->group, &e3b); ++ /* we expect to find existing buddy because it's pinned */ + BUG_ON(err != 0); + + /* there are blocks to put in buddy to make them really free */ @@ -2234,7 +2861,6 @@ Index: linux-stage/fs/ext3/mballoc.c + return ret; +} + -+ +void ext3_free_blocks(handle_t *handle, struct inode * inode, + unsigned long block, unsigned long count, int metadata) +{ @@ -2242,7 +2868,7 @@ Index: linux-stage/fs/ext3/mballoc.c + int freed; + + sb = inode->i_sb; -+ if (!test_opt(sb, MBALLOC)) ++ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info) + ext3_free_blocks_sb(handle, sb, block, count, &freed); + else + ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); @@ -2255,6 +2881,7 @@ Index: linux-stage/fs/ext3/mballoc.c +#define EXT3_MB_STATS_NAME "mb_stats" +#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" +#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" ++#define EXT3_MB_ORDER2_REQ "mb_order2_req" + +static int ext3_mb_stats_read(char *page, char **start, off_t off, + int count, int *eof, void *data) @@ -2276,7 +2903,7 @@ Index: linux-stage/fs/ext3/mballoc.c + char str[32]; + + if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", + EXT3_MB_STATS_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } @@ -2310,7 +2937,7 @@ Index: linux-stage/fs/ext3/mballoc.c + long value; + + if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", + EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } @@ -2349,7 +2976,7 @@ Index: linux-stage/fs/ext3/mballoc.c + long value; + + if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", + EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } @@ -2367,260 +2994,134 @@ Index: linux-stage/fs/ext3/mballoc.c + return count; +} + -+int __init init_ext3_proc(void) ++static int ext3_mb_order2_req_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) +{ -+ struct proc_dir_entry *proc_ext3_mb_stats; -+ struct proc_dir_entry *proc_ext3_mb_max_to_scan; -+ struct proc_dir_entry *proc_ext3_mb_min_to_scan; -+ -+ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); -+ if (proc_root_ext3 == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT); -+ return -EIO; -+ } -+ -+ /* Initialize EXT3_MB_STATS_NAME */ -+ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_stats == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", -+ EXT3_MB_STATS_NAME); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } ++ int len; + -+ proc_ext3_mb_stats->data = NULL; -+ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read; -+ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write; ++ *eof = 1; ++ if (off != 0) ++ return 0; + -+ /* Initialize EXT3_MAX_TO_SCAN_NAME */ -+ proc_ext3_mb_max_to_scan = create_proc_entry( -+ EXT3_MB_MAX_TO_SCAN_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_max_to_scan == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", -+ EXT3_MB_MAX_TO_SCAN_NAME); -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } ++ len = sprintf(page, "%ld\n", ext3_mb_order2_reqs); ++ *start = page; ++ return len; ++} + -+ proc_ext3_mb_max_to_scan->data = NULL; -+ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; -+ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; ++static int ext3_mb_order2_req_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; + -+ /* Initialize EXT3_MIN_TO_SCAN_NAME */ -+ proc_ext3_mb_min_to_scan = create_proc_entry( -+ EXT3_MB_MIN_TO_SCAN_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_min_to_scan == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", -+ EXT3_MB_MIN_TO_SCAN_NAME); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; + } + -+ proc_ext3_mb_min_to_scan->data = NULL; -+ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; -+ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; -+ -+ return 0; -+} -+ -+void exit_ext3_proc(void) -+{ -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+} ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; + -Index: linux-stage/fs/ext3/extents.c -=================================================================== ---- linux-stage.orig/fs/ext3/extents.c 2006-07-16 02:29:43.000000000 +0800 -+++ linux-stage/fs/ext3/extents.c 2006-07-16 02:29:49.000000000 +0800 -@@ -771,7 +771,7 @@ cleanup: - for (i = 0; i < depth; i++) { - if (!ablocks[i]) - continue; -- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); -+ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); - } - } - kfree(ablocks); -@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st - path->p_idx->ei_leaf); - bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); - ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); -- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); -+ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); - return err; - } - -@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t - int needed = ext3_remove_blocks_credits(tree, ex, from, to); - handle_t *handle = ext3_journal_start(tree->inode, needed); - struct buffer_head *bh; -- int i; -+ int i, metadata = 0; - - if (IS_ERR(handle)) - return PTR_ERR(handle); -+ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) -+ metadata = 1; - if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { - /* tail removal */ - unsigned long num, start; -@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t - bh = sb_find_get_block(tree->inode->i_sb, start + i); - ext3_forget(handle, 0, tree->inode, bh, start + i); - } -- ext3_free_blocks(handle, tree->inode, start, num); -+ ext3_free_blocks(handle, tree->inode, start, num, metadata); - } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { - printk("strange request: removal %lu-%lu from %u:%u\n", - from, to, ex->ee_block, ex->ee_len); -Index: linux-stage/fs/ext3/xattr.c -=================================================================== ---- linux-stage.orig/fs/ext3/xattr.c 2006-07-16 02:29:43.000000000 +0800 -+++ linux-stage/fs/ext3/xattr.c 2006-07-16 02:29:49.000000000 +0800 -@@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl - ea_bdebug(bh, "refcount now=0; freeing"); - if (ce) - mb_cache_entry_free(ce); -- ext3_free_blocks(handle, inode, bh->b_blocknr, 1); -+ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1); - get_bh(bh); - ext3_forget(handle, 1, inode, bh, bh->b_blocknr); - } else { -@@ -805,7 +805,7 @@ inserted: - new_bh = sb_getblk(sb, block); - if (!new_bh) { - getblk_failed: -- ext3_free_blocks(handle, inode, block, 1); -+ ext3_free_blocks(handle, inode, block, 1, 1); - error = -EIO; - goto cleanup; - } -Index: linux-stage/fs/ext3/balloc.c -=================================================================== ---- linux-stage.orig/fs/ext3/balloc.c 2006-07-16 02:29:43.000000000 +0800 -+++ linux-stage/fs/ext3/balloc.c 2006-07-16 02:33:13.000000000 +0800 -@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ - * - * Return buffer_head on success or NULL in case of failure. - */ --static struct buffer_head * -+struct buffer_head * - read_block_bitmap(struct super_block *sb, unsigned int block_group) - { - struct ext3_group_desc * desc; -@@ -490,24 +490,6 @@ error_return: - return; - } - --/* Free given blocks, update quota and i_blocks field */ --void ext3_free_blocks(handle_t *handle, struct inode *inode, -- ext3_fsblk_t block, unsigned long count) --{ -- struct super_block * sb; -- unsigned long dquot_freed_blocks; -- -- sb = inode->i_sb; -- if (!sb) { -- printk ("ext3_free_blocks: nonexistent device"); -- return; -- } -- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); -- if (dquot_freed_blocks) -- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); -- return; --} -- - /* - * For ext3 allocations, we must not reuse any blocks which are - * allocated in the bitmap buffer's "last committed data" copy. This -@@ -1463,7 +1445,7 @@ out: - return 0; - } - --ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode, -+ext3_fsblk_t ext3_new_block_old(handle_t *handle, struct inode *inode, - ext3_fsblk_t goal, int *errp) - { - unsigned long count = 1; -Index: linux-stage/fs/ext3/super.c -=================================================================== ---- linux-stage.orig/fs/ext3/super.c 2006-07-16 02:29:43.000000000 +0800 -+++ linux-stage/fs/ext3/super.c 2006-07-16 02:29:49.000000000 +0800 -@@ -391,6 +391,7 @@ static void ext3_put_super (struct super - struct ext3_super_block *es = sbi->s_es; - int i; - -+ ext3_mb_release(sb); - ext3_ext_release(sb); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); -@@ -641,7 +642,7 @@ enum { - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -- Opt_extents, Opt_extdebug, -+ Opt_extents, Opt_extdebug, Opt_mballoc, - Opt_grpquota - }; - -@@ -696,6 +697,7 @@ static match_table_t tokens = { - {Opt_iopen_nopriv, "iopen_nopriv"}, - {Opt_extents, "extents"}, - {Opt_extdebug, "extdebug"}, -+ {Opt_mballoc, "mballoc"}, - {Opt_barrier, "barrier=%u"}, - {Opt_err, NULL}, - {Opt_resize, "resize"}, -@@ -1047,6 +1049,9 @@ clear_qf_name: - case Opt_extdebug: - set_opt (sbi->s_mount_opt, EXTDEBUG); - break; -+ case Opt_mballoc: -+ set_opt (sbi->s_mount_opt, MBALLOC); -+ break; - default: - printk (KERN_ERR - "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1773,6 +1778,7 @@ static int ext3_fill_super (struct super - "writeback"); - - ext3_ext_init(sb); -+ ext3_mb_init(sb, needs_recovery); - lock_kernel(); - return 0; - -@@ -2712,7 +2718,13 @@ static struct file_system_type ext3_fs_t - - static int __init init_ext3_fs(void) - { -- int err = init_ext3_xattr(); -+ int err; ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; + -+ err = init_ext3_proc(); -+ if (err) -+ return err; ++ ext3_mb_order2_reqs = value; + -+ err = init_ext3_xattr(); - if (err) - return err; - err = init_inodecache(); -@@ -2734,6 +2746,7 @@ static void __exit exit_ext3_fs(void) - unregister_filesystem(&ext3_fs_type); - destroy_inodecache(); - exit_ext3_xattr(); -+ exit_ext3_proc(); - } - - int ext3_prep_san_write(struct inode *inode, long *blocks, ++ return count; ++} ++ ++int __init init_ext3_proc(void) ++{ ++ struct proc_dir_entry *proc_ext3_mb_stats; ++ struct proc_dir_entry *proc_ext3_mb_max_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_min_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_order2_req; ++ ++ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); ++ if (proc_root_ext3 == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", EXT3_ROOT); ++ return -EIO; ++ } ++ ++ /* Initialize EXT3_MB_STATS_NAME */ ++ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_stats == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_STATS_NAME); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_stats->data = NULL; ++ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read; ++ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write; ++ ++ /* Initialize EXT3_MAX_TO_SCAN_NAME */ ++ proc_ext3_mb_max_to_scan = create_proc_entry( ++ EXT3_MB_MAX_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_max_to_scan == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_MAX_TO_SCAN_NAME); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_max_to_scan->data = NULL; ++ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; ++ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; ++ ++ /* Initialize EXT3_MIN_TO_SCAN_NAME */ ++ proc_ext3_mb_min_to_scan = create_proc_entry( ++ EXT3_MB_MIN_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_min_to_scan == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_MIN_TO_SCAN_NAME); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_min_to_scan->data = NULL; ++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; ++ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; ++ ++ /* Initialize EXT3_ORDER2_REQ */ ++ proc_ext3_mb_order2_req = create_proc_entry( ++ EXT3_MB_ORDER2_REQ, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_order2_req == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_ORDER2_REQ); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_order2_req->data = NULL; ++ proc_ext3_mb_order2_req->read_proc = ext3_mb_order2_req_read; ++ proc_ext3_mb_order2_req->write_proc = ext3_mb_order2_req_write; ++ ++ return 0; ++} ++ ++void exit_ext3_proc(void) ++{ ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++} Index: linux-stage/fs/ext3/Makefile =================================================================== --- linux-stage.orig/fs/ext3/Makefile 2006-07-16 02:29:43.000000000 +0800 @@ -2634,177 +3135,3 @@ Index: linux-stage/fs/ext3/Makefile ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o -Index: linux-stage/include/linux/ext3_fs.h -=================================================================== ---- linux-stage.orig/include/linux/ext3_fs.h 2006-07-16 02:29:43.000000000 +0800 -+++ linux-stage/include/linux/ext3_fs.h 2006-07-16 02:29:49.000000000 +0800 -@@ -53,6 +53,14 @@ - #define ext3_debug(f, a...) do {} while (0) - #endif - -+#define EXT3_MULTIBLOCK_ALLOCATOR 1 -+ -+#define EXT3_MB_HINT_MERGE 1 -+#define EXT3_MB_HINT_RESERVED 2 -+#define EXT3_MB_HINT_METADATA 4 -+#define EXT3_MB_HINT_FIRST 8 -+#define EXT3_MB_HINT_BEST 16 -+ - /* - * Special inodes numbers - */ -@@ -379,6 +387,7 @@ struct ext3_inode { - #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ - #define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */ - #define EXT3_MOUNT_EXTDEBUG 0x2000000/* Extents debug */ -+#define EXT3_MOUNT_MBALLOC 0x4000000/* Buddy allocation support */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef clear_opt -@@ -749,12 +758,12 @@ ext3_group_first_block_no(struct super_b - /* balloc.c */ - extern int ext3_bg_has_super(struct super_block *sb, int group); - extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); --extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode, -- ext3_fsblk_t goal, int *errp); -+//extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode, -+// ext3_fsblk_t goal, int *errp); - extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode, - ext3_fsblk_t goal, unsigned long *count, int *errp); - extern void ext3_free_blocks (handle_t *handle, struct inode *inode, -- ext3_fsblk_t block, unsigned long count); -+ ext3_fsblk_t block, unsigned long count, int metadata); - extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb, - ext3_fsblk_t block, unsigned long count, - unsigned long *pdquot_freed_blocks); -@@ -881,6 +890,17 @@ extern void ext3_extents_initialize_bloc - extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg); - -+/* mballoc.c */ -+extern long ext3_mb_stats; -+extern long ext3_mb_max_to_scan; -+extern int ext3_mb_init(struct super_block *, int); -+extern int ext3_mb_release(struct super_block *); -+extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); -+extern int ext3_mb_reserve_blocks(struct super_block *, int); -+extern void ext3_mb_release_blocks(struct super_block *, int); -+int __init init_ext3_proc(void); -+void exit_ext3_proc(void); -+ - #endif /* __KERNEL__ */ - - /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ -Index: linux-stage/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-stage.orig/include/linux/ext3_fs_sb.h 2006-07-16 02:29:43.000000000 +0800 -+++ linux-stage/include/linux/ext3_fs_sb.h 2006-07-16 02:29:49.000000000 +0800 -@@ -21,8 +21,14 @@ - #include - #include - #include -+#include - #endif - #include -+#include -+ -+struct ext3_buddy_group_blocks; -+struct ext3_mb_history; -+#define EXT3_BB_MAX_BLOCKS - - /* - * third extended-fs super-block data in memory -@@ -78,6 +84,38 @@ struct ext3_sb_info { - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ - int s_jquota_fmt; /* Format of quota to use */ - #endif -+ -+ /* for buddy allocator */ -+ struct ext3_group_info **s_group_info; -+ struct inode *s_buddy_cache; -+ long s_blocks_reserved; -+ spinlock_t s_reserve_lock; -+ struct list_head s_active_transaction; -+ struct list_head s_closed_transaction; -+ struct list_head s_committed_transaction; -+ spinlock_t s_md_lock; -+ tid_t s_last_transaction; -+ int s_mb_factor; -+ unsigned short *s_mb_offsets, *s_mb_maxs; -+ -+ /* history to debug policy */ -+ struct ext3_mb_history *s_mb_history; -+ int s_mb_history_cur; -+ int s_mb_history_max; -+ struct proc_dir_entry *s_mb_proc; -+ spinlock_t s_mb_history_lock; -+ -+ /* stats for buddy allocator */ -+ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ -+ atomic_t s_bal_success; /* we found long enough chunks */ -+ atomic_t s_bal_allocated; /* in blocks */ -+ atomic_t s_bal_ex_scanned; /* total extents scanned */ -+ atomic_t s_bal_goals; /* goal hits */ -+ atomic_t s_bal_breaks; /* too long searches */ -+ atomic_t s_bal_2orders; /* 2^order hits */ -+ spinlock_t s_bal_lock; -+ unsigned long s_mb_buddies_generated; -+ unsigned long long s_mb_generation_time; - }; - - #endif /* _LINUX_EXT3_FS_SB */ -Index: linux-stage/fs/ext3/inode.c -=================================================================== ---- linux-stage.orig/fs/ext3/inode.c 2006-07-16 02:29:43.000000000 +0800 -+++ linux-stage/fs/ext3/inode.c 2006-07-16 02:29:49.000000000 +0800 -@@ -562,7 +562,7 @@ static int ext3_alloc_blocks(handle_t *h - return ret; - failed_out: - for (i = 0; i s_mount_opt, NOBH); break; + case Opt_extents: + set_opt (sbi->s_mount_opt, EXTENTS); + break; ++ case Opt_noextents: ++ clear_opt (sbi->s_mount_opt, EXTENTS); ++ break; + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch index 20fa78a..d83625a 100644 --- a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch @@ -1,8 +1,400 @@ +Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/include/linux/ext3_fs.h 2006-07-16 02:29:49.000000000 +0800 +@@ -53,6 +53,14 @@ + #define ext3_debug(f, a...) do {} while (0) + #endif + ++#define EXT3_MULTIBLOCK_ALLOCATOR 1 ++ ++#define EXT3_MB_HINT_MERGE 1 ++#define EXT3_MB_HINT_RESERVED 2 ++#define EXT3_MB_HINT_METADATA 4 ++#define EXT3_MB_HINT_FIRST 8 ++#define EXT3_MB_HINT_BEST 16 ++ + /* + * Special inodes numbers + */ +@@ -379,6 +387,7 @@ struct ext3_inode { + #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x2000000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x4000000/* Buddy allocation support */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -405,6 +413,14 @@ + #define ext3_find_first_zero_bit ext2_find_first_zero_bit + #define ext3_find_next_zero_bit ext2_find_next_zero_bit + ++#ifndef ext2_find_next_le_bit ++#ifdef __LITTLE_ENDIAN ++#define ext2_find_next_le_bit(addr, size, off) find_next_bit((addr), (size), (off)) ++#else ++#error "mballoc needs a patch for big-endian systems - CFS bug 10634" ++#endif /* __LITTLE_ENDIAN */ ++#endif /* !ext2_find_next_le_bit */ ++ + /* + * Maximal mount counts between two filesystem checks + */ +@@ -749,12 +758,12 @@ ext3_group_first_block_no(struct super_b + /* balloc.c */ + extern int ext3_bg_has_super(struct super_block *sb, int group); + extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); +-extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode, ++extern ext3_fsblk_t ext3_new_block_old(handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, int *errp); + extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, unsigned long *count, int *errp); + extern void ext3_free_blocks (handle_t *handle, struct inode *inode, +- ext3_fsblk_t block, unsigned long count); ++ ext3_fsblk_t block, unsigned long count, int metadata); + extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb, + ext3_fsblk_t block, unsigned long count, + unsigned long *pdquot_freed_blocks); +@@ -881,6 +890,21 @@ extern void ext3_extents_initialize_bloc + extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); + ++/* mballoc.c */ ++extern long ext3_mb_stats; ++extern long ext3_mb_max_to_scan; ++extern int ext3_mb_init(struct super_block *sb, int needs_recovery); ++extern int ext3_mb_release(struct super_block *sb); ++extern ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode, ++ ext3_fsblk_t goal, int *errp); ++extern ext3_fsblk_t ext3_mb_new_blocks(handle_t *handle, struct inode *inode, ++ ext3_fsblk_t goal, int *len, int flags, ++ int *errp); ++extern int ext3_mb_reserve_blocks(struct super_block *sb, int); ++extern void ext3_mb_release_blocks(struct super_block *sb, int); ++int __init init_ext3_proc(void); ++void exit_ext3_proc(void); ++ + #endif /* __KERNEL__ */ + + /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ +Index: linux-stage/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_sb.h 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/include/linux/ext3_fs_sb.h 2006-07-16 02:29:49.000000000 +0800 +@@ -21,8 +21,14 @@ + #include + #include + #include ++#include + #endif + #include ++#include ++ ++struct ext3_buddy_group_blocks; ++struct ext3_mb_history; ++#define EXT3_BB_MAX_BLOCKS + + /* + * third extended-fs super-block data in memory +@@ -78,6 +84,43 @@ struct ext3_sb_info { + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ + #endif ++ ++ /* for buddy allocator */ ++ struct ext3_group_info ***s_group_info; ++ struct inode *s_buddy_cache; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct list_head s_committed_transaction; ++ spinlock_t s_md_lock; ++ tid_t s_last_transaction; ++ int s_mb_factor; ++ unsigned short *s_mb_offsets, *s_mb_maxs; ++ unsigned long s_stripe; ++ ++ /* history to debug policy */ ++ struct ext3_mb_history *s_mb_history; ++ int s_mb_history_cur; ++ int s_mb_history_max; ++ struct proc_dir_entry *s_mb_proc; ++ spinlock_t s_mb_history_lock; ++ ++ /* stats for buddy allocator */ ++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ ++ atomic_t s_bal_success; /* we found long enough chunks */ ++ atomic_t s_bal_allocated; /* in blocks */ ++ atomic_t s_bal_ex_scanned; /* total extents scanned */ ++ atomic_t s_bal_goals; /* goal hits */ ++ atomic_t s_bal_breaks; /* too long searches */ ++ atomic_t s_bal_2orders; /* 2^order hits */ ++ spinlock_t s_bal_lock; ++ unsigned long s_mb_buddies_generated; ++ unsigned long long s_mb_generation_time; + }; ++ ++#define EXT3_GROUP_INFO(sb, group) \ ++ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \ ++ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)] + + #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/super.c 2006-07-16 02:29:49.000000000 +0800 +@@ -391,6 +391,7 @@ static void ext3_put_super (struct super + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_mb_release(sb); + ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -642,6 +643,7 @@ enum { + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_extents, Opt_noextents, Opt_extdebug, ++ Opt_mballoc, Opt_nomballoc, Opt_stripe, + Opt_grpquota + }; + +@@ -696,6 +697,9 @@ static match_table_t tokens = { + {Opt_extents, "extents"}, + {Opt_noextents, "noextents"}, + {Opt_extdebug, "extdebug"}, ++ {Opt_mballoc, "mballoc"}, ++ {Opt_nomballoc, "nomballoc"}, ++ {Opt_stripe, "stripe=%u"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -1047,6 +1049,19 @@ clear_qf_name: + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; ++ case Opt_mballoc: ++ set_opt (sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_nomballoc: ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_stripe: ++ if (match_int(&args[0], &option)) ++ return 0; ++ if (option < 0) ++ return 0; ++ sbi->s_stripe = option; ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1773,6 +1778,7 @@ static int ext3_fill_super (struct super + "writeback"); + + ext3_ext_init(sb); ++ ext3_mb_init(sb, needs_recovery); + lock_kernel(); + return 0; + +@@ -2712,7 +2718,13 @@ static struct file_system_type ext3_fs_t + + static int __init init_ext3_fs(void) + { +- int err = init_ext3_xattr(); ++ int err; ++ ++ err = init_ext3_proc(); ++ if (err) ++ return err; ++ ++ err = init_ext3_xattr(); + if (err) + return err; + err = init_inodecache(); +@@ -2734,6 +2746,7 @@ static void __exit exit_ext3_fs(void) + unregister_filesystem(&ext3_fs_type); + destroy_inodecache(); + exit_ext3_xattr(); ++ exit_ext3_proc(); + } + + int ext3_prep_san_write(struct inode *inode, long *blocks, +Index: linux-stage/fs/ext3/extents.c +=================================================================== +--- linux-stage.orig/fs/ext3/extents.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/extents.c 2006-07-16 02:29:49.000000000 +0800 +@@ -771,7 +771,7 @@ cleanup: + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; +- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); + } + } + kfree(ablocks); +@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st + path->p_idx->ei_leaf); + bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); + ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); +- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); + return err; + } + +@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t + int needed = ext3_remove_blocks_credits(tree, ex, from, to); + handle_t *handle = ext3_journal_start(tree->inode, needed); + struct buffer_head *bh; +- int i; ++ int i, metadata = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); ++ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) ++ metadata = 1; + if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { + /* tail removal */ + unsigned long num, start; +@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t + bh = sb_find_get_block(tree->inode->i_sb, start + i); + ext3_forget(handle, 0, tree->inode, bh, start + i); + } +- ext3_free_blocks(handle, tree->inode, start, num); ++ ext3_free_blocks(handle, tree->inode, start, num, metadata); + } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { + printk("strange request: removal %lu-%lu from %u:%u\n", + from, to, ex->ee_block, ex->ee_len); +Index: linux-stage/fs/ext3/inode.c +=================================================================== +--- linux-stage.orig/fs/ext3/inode.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/inode.c 2006-07-16 02:29:49.000000000 +0800 +@@ -562,7 +562,7 @@ static int ext3_alloc_blocks(handle_t *h + return ret; + failed_out: + for (i = 0; i i_sb; +- if (!sb) { +- printk ("ext3_free_blocks: nonexistent device"); +- return; +- } +- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); +- if (dquot_freed_blocks) +- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); +- return; +-} +- + /* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This +@@ -1463,7 +1445,7 @@ out: + return 0; + } + +-ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode, ++ext3_fsblk_t ext3_new_block_old(handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, int *errp) + { + unsigned long count = 1; +Index: linux-stage/fs/ext3/xattr.c +=================================================================== +--- linux-stage.orig/fs/ext3/xattr.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/xattr.c 2006-07-16 02:29:49.000000000 +0800 +@@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl + ea_bdebug(bh, "refcount now=0; freeing"); + if (ce) + mb_cache_entry_free(ce); +- ext3_free_blocks(handle, inode, bh->b_blocknr, 1); ++ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1); + get_bh(bh); + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + } else { +@@ -805,7 +805,7 @@ inserted: + new_bh = sb_getblk(sb, block); + if (!new_bh) { + getblk_failed: +- ext3_free_blocks(handle, inode, block, 1); ++ ext3_free_blocks(handle, inode, block, 1, 1); + error = -EIO; + goto cleanup; + } Index: linux-stage/fs/ext3/mballoc.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-stage/fs/ext3/mballoc.c 2006-07-16 02:29:49.000000000 +0800 -@@ -0,0 +1,2434 @@ +@@ -0,0 +1,2727 @@ +/* + * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -91,6 +483,11 @@ Index: linux-stage/fs/ext3/mballoc.c + +long ext3_mb_stats = 1; + ++/* ++ * for which requests use 2^N search using buddies ++ */ ++long ext3_mb_order2_reqs = 8; ++ +#ifdef EXT3_BB_MAX_BLOCKS +#undef EXT3_BB_MAX_BLOCKS +#endif @@ -147,6 +544,9 @@ Index: linux-stage/fs/ext3/mballoc.c + __u8 ac_repeats; + __u8 ac_2order; /* if request is to allocate 2^N blocks and + * N > 0, the field stores N, otherwise 0 */ ++ ++ struct page *ac_buddy_page; ++ struct page *ac_bitmap_page; +}; + +#define AC_STATUS_CONTINUE 1 @@ -156,6 +556,8 @@ Index: linux-stage/fs/ext3/mballoc.c +struct ext3_mb_history { + struct ext3_free_extent goal; /* goal allocation */ + struct ext3_free_extent result; /* result allocation */ ++ unsigned pid; ++ unsigned ino; + __u16 found; /* how many extents have been found */ + __u16 groups; /* how many groups have been scanned */ + __u16 tail; /* what tail broke some buddy */ @@ -178,9 +580,9 @@ Index: linux-stage/fs/ext3/mballoc.c +#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) + +#ifndef EXT3_MB_HISTORY -+#define ext3_mb_store_history(sb,ac) ++#define ext3_mb_store_history(sb,ino,ac) +#else -+static void ext3_mb_store_history(struct super_block *, ++static void ext3_mb_store_history(struct super_block *, unsigned ino, + struct ext3_allocation_context *ac); +#endif + @@ -190,7 +592,6 @@ Index: linux-stage/fs/ext3/mballoc.c + +int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); +struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); -+int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); +int ext3_mb_reserve_blocks(struct super_block *, int); +void ext3_mb_release_blocks(struct super_block *, int); +void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); @@ -424,8 +825,9 @@ Index: linux-stage/fs/ext3/mballoc.c + +static void +ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, -+ struct ext3_group_info *grp) ++ int group) +{ ++ struct ext3_group_info *grp = EXT3_GROUP_INFO(sb, group); + unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); + unsigned short i = 0, first, len; + unsigned free = 0, fragments = 0; @@ -436,7 +838,7 @@ Index: linux-stage/fs/ext3/mballoc.c + while (i < max) { + fragments++; + first = i; -+ i = find_next_bit(bitmap, max, i); ++ i = ext2_find_next_le_bit(bitmap, max, i); + len = i - first; + free += len; + if (len > 1) @@ -452,8 +854,8 @@ Index: linux-stage/fs/ext3/mballoc.c + * others waits for init completion on page lock */ + clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); + if (free != grp->bb_free) { -+ printk("EXT3-fs: %u blocks in bitmap, %u in group descriptor\n", -+ free, grp->bb_free); ++ printk("EXT3-fs: group %u: %u blocks in bitmap, %u in gd\n", ++ group, free, grp->bb_free); + grp->bb_free = free; + } + @@ -534,7 +936,10 @@ Index: linux-stage/fs/ext3/mballoc.c + for (i = 0; i < groups_per_page && bh[i]; i++) + wait_on_buffer(bh[i]); + -+ /* XXX: I/O error handling here */ ++ err = -EIO; ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ if (!buffer_uptodate(bh[i])) ++ goto out; + + first_block = page->index * blocks_per_page; + for (i = 0; i < blocks_per_page; i++) { @@ -552,11 +957,10 @@ Index: linux-stage/fs/ext3/mballoc.c + mb_debug("put buddy for group %u in page %lu/%x\n", + group, page->index, i * blocksize); + memset(data, 0xff, blocksize); -+ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0; -+ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0, ++ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0; ++ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0, + sizeof(unsigned short)*(sb->s_blocksize_bits+2)); -+ ext3_mb_generate_buddy(sb, data, bitmap, -+ EXT3_SB(sb)->s_group_info[group]); ++ ext3_mb_generate_buddy(sb, data, bitmap, group); + } else { + /* this is block of bitmap */ + mb_debug("put bitmap for group %u in page %lu/%x\n", @@ -567,10 +971,12 @@ Index: linux-stage/fs/ext3/mballoc.c + SetPageUptodate(page); + +out: -+ for (i = 0; i < groups_per_page && bh[i]; i++) -+ brelse(bh[i]); -+ if (bh && bh != &bhs) -+ kfree(bh); ++ if (bh) { ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ brelse(bh[i]); ++ if (bh != &bhs) ++ kfree(bh); ++ } + return err; +} + @@ -587,7 +993,7 @@ Index: linux-stage/fs/ext3/mballoc.c + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + + e3b->bd_blkbits = sb->s_blocksize_bits; -+ e3b->bd_info = sbi->s_group_info[group]; ++ e3b->bd_info = EXT3_GROUP_INFO(sb, group); + e3b->bd_sb = sb; + e3b->bd_group = group; + e3b->bd_buddy_page = NULL; @@ -597,12 +1003,15 @@ Index: linux-stage/fs/ext3/mballoc.c + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + ++ /* we could use find_or_create_page(), but it locks page ++ * what we'd like to avoid in fast path ... */ + page = find_get_page(inode->i_mapping, pnum); + if (page == NULL || !PageUptodate(page)) { + if (page) + page_cache_release(page); + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { ++ BUG_ON(page->mapping != inode->i_mapping); + if (!PageUptodate(page)) + ext3_mb_init_cache(page); + unlock_page(page); @@ -624,6 +1033,7 @@ Index: linux-stage/fs/ext3/mballoc.c + page_cache_release(page); + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { ++ BUG_ON(page->mapping != inode->i_mapping); + if (!PageUptodate(page)) + ext3_mb_init_cache(page); + unlock_page(page); @@ -663,14 +1073,14 @@ Index: linux-stage/fs/ext3/mballoc.c +ext3_lock_group(struct super_block *sb, int group) +{ + bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++ &EXT3_GROUP_INFO(sb, group)->bb_state); +} + +static inline void +ext3_unlock_group(struct super_block *sb, int group) +{ + bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++ &EXT3_GROUP_INFO(sb, group)->bb_state); +} + +static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) @@ -800,7 +1210,7 @@ Index: linux-stage/fs/ext3/mballoc.c +static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, + int needed, struct ext3_free_extent *ex) +{ -+ int next, max, ord; ++ int next = block, max, ord; + void *buddy; + + J_ASSERT(ex != NULL); @@ -825,6 +1235,11 @@ Index: linux-stage/fs/ext3/mballoc.c + ex->fe_start = block << order; + ex->fe_group = e3b->bd_group; + ++ /* calc difference from given start */ ++ next = next - ex->fe_start; ++ ex->fe_len -= next; ++ ex->fe_start += next; ++ + while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { + + if (block + 1 >= max) @@ -928,6 +1343,13 @@ Index: linux-stage/fs/ext3/mballoc.c + ac->ac_status = AC_STATUS_FOUND; + ac->ac_tail = ret & 0xffff; + ac->ac_buddy = ret >> 16; ++ ++ /* hold in-core structures until allocated ++ * blocks are marked non-free in on-disk bitmap */ ++ ac->ac_buddy_page = e3b->bd_buddy_page; ++ page_cache_get(e3b->bd_buddy_page); ++ ac->ac_bitmap_page = e3b->bd_bitmap_page; ++ page_cache_get(e3b->bd_bitmap_page); +} + +/* @@ -963,7 +1385,7 @@ Index: linux-stage/fs/ext3/mballoc.c + } + + /* -+ * Let's check whether the chuck is good enough ++ * Let's check whether the chunk is good enough + */ + if (ex->fe_len == gex->fe_len) { + *bex = *ex; @@ -1037,6 +1459,8 @@ Index: linux-stage/fs/ext3/mballoc.c + struct ext3_buddy *e3b) +{ + int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); ++ struct ext3_super_block *es = sbi->s_es; + struct ext3_free_extent ex; + + err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); @@ -1047,7 +1471,25 @@ Index: linux-stage/fs/ext3/mballoc.c + max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, + ac->ac_g_ex.fe_len, &ex); + -+ if (max > 0) { ++ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ++ unsigned long start; ++ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) + ++ ex.fe_start + le32_to_cpu(es->s_first_data_block)); ++ if (start % sbi->s_stripe == 0) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ } else if (max >= ac->ac_g_ex.fe_len) { ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } else if (max > 0 && (ac->ac_flags & EXT3_MB_HINT_MERGE)) { ++ /* Sometimes, caller may want to merge even small ++ * number of blocks to an existing extent */ + J_ASSERT(ex.fe_len > 0); + J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); + J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); @@ -1075,7 +1517,7 @@ Index: linux-stage/fs/ext3/mballoc.c + int i, k, max; + + J_ASSERT(ac->ac_2order > 0); -+ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) { ++ for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { + if (grp->bb_counters[i] == 0) + continue; + @@ -1140,11 +1582,46 @@ Index: linux-stage/fs/ext3/mballoc.c + } +} + ++/* ++ * This is a special case for storages like raid5 ++ * we try to find stripe-aligned chunks for stripe-size requests ++ */ ++static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ unsigned long i, max; ++ ++ J_ASSERT(sbi->s_stripe != 0); ++ ++ /* find first stripe-aligned block */ ++ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + le32_to_cpu(sbi->s_es->s_first_data_block); ++ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe; ++ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block)) ++ % EXT3_BLOCKS_PER_GROUP(sb); ++ ++ while (i < sb->s_blocksize * 8) { ++ if (!mb_test_bit(i, bitmap)) { ++ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex); ++ if (max >= sbi->s_stripe) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ break; ++ } ++ } ++ i += sbi->s_stripe; ++ } ++} ++ +static int ext3_mb_good_group(struct ext3_allocation_context *ac, + int group, int cr) +{ -+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); -+ struct ext3_group_info *grp = sbi->s_group_info[group]; ++ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group); + unsigned free, fragments, i, bits; + + J_ASSERT(cr >= 0 && cr < 4); @@ -1161,15 +1638,18 @@ Index: linux-stage/fs/ext3/mballoc.c + case 0: + J_ASSERT(ac->ac_2order != 0); + bits = ac->ac_sb->s_blocksize_bits + 1; -+ for (i = ac->ac_2order; i < bits; i++) ++ for (i = ac->ac_2order; i <= bits; i++) + if (grp->bb_counters[i] > 0) + return 1; ++ break; + case 1: + if ((free / fragments) >= ac->ac_g_ex.fe_len) + return 1; ++ break; + case 2: + if (free >= ac->ac_g_ex.fe_len) + return 1; ++ break; + case 3: + return 1; + default: @@ -1234,6 +1714,9 @@ Index: linux-stage/fs/ext3/mballoc.c + } + } + ++ ac.ac_buddy_page = NULL; ++ ac.ac_bitmap_page = NULL; ++ + /* + * Check quota for allocation of this blocks. + */ @@ -1270,23 +1753,27 @@ Index: linux-stage/fs/ext3/mballoc.c + ac.ac_2order = 0; + ac.ac_criteria = 0; + ++ if (*len == 1 && sbi->s_stripe) { ++ /* looks like a metadata, let's use a dirty hack for raid5 ++ * move all metadata in first groups in hope to hit cached ++ * sectors and thus avoid read-modify cycles in raid5 */ ++ ac.ac_g_ex.fe_group = group = 0; ++ } ++ + /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */ + i = ffs(*len); -+ if (i >= 8) { ++ if (i >= ext3_mb_order2_reqs) { + i--; + if ((*len & (~(1 << i))) == 0) + ac.ac_2order = i; + } + -+ /* Sometimes, caller may want to merge even small -+ * number of blocks to an existing extent */ -+ if (ac.ac_flags & EXT3_MB_HINT_MERGE) { -+ err = ext3_mb_find_by_goal(&ac, &e3b); -+ if (err) -+ goto out_err; -+ if (ac.ac_status == AC_STATUS_FOUND) -+ goto found; -+ } ++ /* first, try the goal */ ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; + + /* Let's just scan groups to find more-less suitable blocks */ + cr = ac.ac_2order ? 0 : 1; @@ -1297,7 +1784,7 @@ Index: linux-stage/fs/ext3/mballoc.c + if (group == EXT3_SB(sb)->s_groups_count) + group = 0; + -+ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) { ++ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) { + /* we need full data about the group + * to make a good selection */ + err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); @@ -1325,6 +1812,8 @@ Index: linux-stage/fs/ext3/mballoc.c + ac.ac_groups_scanned++; + if (cr == 0) + ext3_mb_simple_scan_group(&ac, &e3b); ++ else if (cr == 1 && *len == sbi->s_stripe) ++ ext3_mb_scan_aligned(&ac, &e3b); + else + ext3_mb_complex_scan_group(&ac, &e3b); + @@ -1332,8 +1821,6 @@ Index: linux-stage/fs/ext3/mballoc.c + + ext3_mb_release_desc(&e3b); + -+ if (err) -+ goto out_err; + if (ac.ac_status != AC_STATUS_CONTINUE) + break; + } @@ -1347,7 +1834,7 @@ Index: linux-stage/fs/ext3/mballoc.c + */ + + /*if (ac.ac_found > ext3_mb_max_to_scan) -+ printk(KERN_ERR "EXT3-fs: too long searching at " ++ printk(KERN_DEBUG "EXT3-fs: too long searching at " + "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len, + ac.ac_g_ex.fe_len);*/ + ext3_mb_try_best_found(&ac, &e3b); @@ -1356,7 +1843,7 @@ Index: linux-stage/fs/ext3/mballoc.c + * Someone more lucky has already allocated it. + * The only thing we can do is just take first + * found block(s) -+ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n"); ++ printk(KERN_DEBUG "EXT3-fs: someone won our chunk\n"); + */ + ac.ac_b_ex.fe_group = 0; + ac.ac_b_ex.fe_start = 0; @@ -1376,17 +1863,16 @@ Index: linux-stage/fs/ext3/mballoc.c + *errp = -ENOSPC; + block = 0; +#if 1 -+ printk(KERN_ERR "EXT3-fs: cant allocate: status %d, flags %d\n", ++ printk(KERN_ERR "EXT3-fs: can't allocate: status %d flags %d\n", + ac.ac_status, ac.ac_flags); -+ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n", ++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d cr %d\n", + ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, + ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); + printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", + sbi->s_blocks_reserved, ac.ac_found); + printk("EXT3-fs: groups: "); + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) -+ printk("%d: %d ", i, -+ sbi->s_group_info[i]->bb_free); ++ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free); + printk("\n"); +#endif + goto out; @@ -1484,6 +1970,11 @@ Index: linux-stage/fs/ext3/mballoc.c + *errp = err; + block = 0; +out: ++ if (ac.ac_buddy_page) ++ page_cache_release(ac.ac_buddy_page); ++ if (ac.ac_bitmap_page) ++ page_cache_release(ac.ac_bitmap_page); ++ + if (!(flags & EXT3_MB_HINT_RESERVED)) { + /* block wasn't reserved before and we reserved it + * at the beginning of allocation. it doesn't matter @@ -1507,7 +1998,7 @@ Index: linux-stage/fs/ext3/mballoc.c + atomic_inc(&sbi->s_bal_breaks); + } + -+ ext3_mb_store_history(sb, &ac); ++ ext3_mb_store_history(sb, inode->i_ino, &ac); + + return block; +} @@ -1572,9 +2063,9 @@ Index: linux-stage/fs/ext3/mballoc.c + char buf[20], buf2[20]; + + if (v == SEQ_START_TOKEN) { -+ seq_printf(seq, "%-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", -+ "goal", "result", "found", "grps", "cr", "merge", -+ "tail", "broken"); ++ seq_printf(seq, "%-5s %-8s %-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", ++ "pid", "inode", "goal", "result", "found", "grps", "cr", ++ "merge", "tail", "broken"); + return 0; + } + @@ -1582,9 +2073,9 @@ Index: linux-stage/fs/ext3/mballoc.c + hs->goal.fe_start, hs->goal.fe_len); + sprintf(buf2, "%u/%u/%u", hs->result.fe_group, + hs->result.fe_start, hs->result.fe_len); -+ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf, -+ buf2, hs->found, hs->groups, hs->cr, -+ hs->merged ? "M" : "", hs->tail, ++ seq_printf(seq, "%-5u %-8u %-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", ++ hs->pid, hs->ino, buf, buf2, hs->found, hs->groups, ++ hs->cr, hs->merged ? "M" : "", hs->tail, + hs->buddy ? 1 << hs->buddy : 0); + return 0; +} @@ -1652,12 +2143,107 @@ Index: linux-stage/fs/ext3/mballoc.c + .release = ext3_mb_seq_history_release, +}; + ++static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ long group; ++ ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ ++ group = *pos + 1; ++ return (void *) group; ++} ++ ++static void *ext3_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ long group; ++ ++ ++*pos; ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ group = *pos + 1; ++ return (void *) group;; ++} ++ ++static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v) ++{ ++ struct super_block *sb = seq->private; ++ long group = (long) v, i; ++ struct sg { ++ struct ext3_group_info info; ++ unsigned short counters[16]; ++ } sg; ++ ++ group--; ++ if (group == 0) ++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s [ %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", ++ "group", "free", "frags", "first", "2^0", "2^1", "2^2", ++ "2^3", "2^4", "2^5", "2^6", "2^7", "2^8", "2^9", "2^10", ++ "2^11", "2^12", "2^13"); ++ ++ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + ++ sizeof(struct ext3_group_info); ++ ext3_lock_group(sb, group); ++ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i); ++ ext3_unlock_group(sb, group); ++ ++ if (EXT3_MB_GRP_NEED_INIT(&sg.info)) ++ return 0; ++ ++ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, ++ sg.info.bb_fragments, sg.info.bb_first_free); ++ for (i = 0; i <= 13; i++) ++ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? ++ sg.info.bb_counters[i] : 0); ++ seq_printf(seq, " ]\n"); ++ ++ return 0; ++} ++ ++static void ext3_mb_seq_groups_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations ext3_mb_seq_groups_ops = { ++ .start = ext3_mb_seq_groups_start, ++ .next = ext3_mb_seq_groups_next, ++ .stop = ext3_mb_seq_groups_stop, ++ .show = ext3_mb_seq_groups_show, ++}; ++ ++static int ext3_mb_seq_groups_open(struct inode *inode, struct file *file) ++{ ++ struct super_block *sb = PDE(inode)->data; ++ int rc; ++ ++ rc = seq_open(file, &ext3_mb_seq_groups_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = sb; ++ } ++ return rc; ++ ++} ++ ++static struct file_operations ext3_mb_seq_groups_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_groups_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ +static void ext3_mb_history_release(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char name[64]; + + snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ remove_proc_entry("mb_groups", sbi->s_mb_proc); + remove_proc_entry("mb_history", sbi->s_mb_proc); + remove_proc_entry(name, proc_root_ext3); + @@ -1680,6 +2266,11 @@ Index: linux-stage/fs/ext3/mballoc.c + p->proc_fops = &ext3_mb_seq_history_fops; + p->data = sb; + } ++ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_groups_fops; ++ p->data = sb; ++ } + } + + sbi->s_mb_history_max = 1000; @@ -1692,7 +2283,8 @@ Index: linux-stage/fs/ext3/mballoc.c +} + +static void -+ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac) ++ext3_mb_store_history(struct super_block *sb, unsigned ino, ++ struct ext3_allocation_context *ac) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_mb_history h; @@ -1700,6 +2292,8 @@ Index: linux-stage/fs/ext3/mballoc.c + if (likely(sbi->s_mb_history == NULL)) + return; + ++ h.pid = current->pid; ++ h.ino = ino; + h.goal = ac->ac_g_ex; + h.result = ac->ac_b_ex; + h.found = ac->ac_found; @@ -1727,21 +2321,40 @@ Index: linux-stage/fs/ext3/mballoc.c +int ext3_mb_init_backend(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i, len; -+ -+ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count; -+ sbi->s_group_info = kmalloc(len, GFP_KERNEL); ++ int i, j, len, metalen; ++ int num_meta_group_infos = ++ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ struct ext3_group_info **meta_group_info; ++ ++ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte ++ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. ++ * So a two level scheme suffices for now. */ ++ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * ++ num_meta_group_infos, GFP_KERNEL); + if (sbi->s_group_info == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n"); ++ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n"); + return -ENOMEM; + } -+ memset(sbi->s_group_info, 0, len); -+ + sbi->s_buddy_cache = new_inode(sb); + if (sbi->s_buddy_cache == NULL) { + printk(KERN_ERR "EXT3-fs: can't get new inode\n"); -+ kfree(sbi->s_group_info); -+ return -ENOMEM; ++ goto err_freesgi; ++ } ++ ++ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) { ++ if ((i + 1) == num_meta_group_infos) ++ metalen = sizeof(*meta_group_info) * ++ (sbi->s_groups_count - ++ (i << EXT3_DESC_PER_BLOCK_BITS(sb))); ++ meta_group_info = kmalloc(metalen, GFP_KERNEL); ++ if (meta_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate mem for a " ++ "buddy group\n"); ++ goto err_freemeta; ++ } ++ sbi->s_group_info[i] = meta_group_info; + } + + /* @@ -1753,30 +2366,42 @@ Index: linux-stage/fs/ext3/mballoc.c + for (i = 0; i < sbi->s_groups_count; i++) { + struct ext3_group_desc * desc; + -+ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL); -+ if (sbi->s_group_info[i] == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); -+ goto err_out; ++ meta_group_info = ++ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)]; ++ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1); ++ ++ meta_group_info[j] = kmalloc(len, GFP_KERNEL); ++ if (meta_group_info[j] == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n"); ++ i--; ++ goto err_freebuddy; + } + desc = ext3_get_group_desc(sb, i, NULL); + if (desc == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant read descriptor %u\n", i); -+ goto err_out; ++ printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i); ++ goto err_freebuddy; + } -+ memset(sbi->s_group_info[i], 0, len); ++ memset(meta_group_info[j], 0, len); + set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, -+ &sbi->s_group_info[i]->bb_state); -+ sbi->s_group_info[i]->bb_free = ++ &meta_group_info[j]->bb_state); ++ meta_group_info[j]->bb_free = + le16_to_cpu(desc->bg_free_blocks_count); + } + + return 0; + -+err_out: ++err_freebuddy: ++ while (i >= 0) { ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ i--; ++ } ++ i = num_meta_group_infos; ++err_freemeta: + while (--i >= 0) + kfree(sbi->s_group_info[i]); + iput(sbi->s_buddy_cache); -+ ++err_freesgi: ++ kfree(sbi->s_group_info); + return -ENOMEM; +} + @@ -1818,7 +2443,6 @@ Index: linux-stage/fs/ext3/mballoc.c + max = max >> 1; + i++; + } while (i <= sb->s_blocksize_bits + 1); -+ + + /* init file for buddy data */ + if ((i = ext3_mb_init_backend(sb))) { @@ -1855,7 +2479,7 @@ Index: linux-stage/fs/ext3/mballoc.c +int ext3_mb_release(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i; ++ int i, num_meta_group_infos; + + if (!test_opt(sb, MBALLOC)) + return 0; @@ -1870,11 +2494,13 @@ Index: linux-stage/fs/ext3/mballoc.c + ext3_mb_free_committed_blocks(sb); + + if (sbi->s_group_info) { -+ for (i = 0; i < sbi->s_groups_count; i++) { -+ if (sbi->s_group_info[i] == NULL) -+ continue; ++ for (i = 0; i < sbi->s_groups_count; i++) ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ num_meta_group_infos = (sbi->s_groups_count + ++ EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) + kfree(sbi->s_group_info[i]); -+ } + kfree(sbi->s_group_info); + } + if (sbi->s_mb_offsets) @@ -1936,6 +2562,7 @@ Index: linux-stage/fs/ext3/mballoc.c + md->num, md->group, md); + + err = ext3_mb_load_buddy(sb, md->group, &e3b); ++ /* we expect to find existing buddy because it's pinned */ + BUG_ON(err != 0); + + /* there are blocks to put in buddy to make them really free */ @@ -2234,7 +2861,6 @@ Index: linux-stage/fs/ext3/mballoc.c + return ret; +} + -+ +void ext3_free_blocks(handle_t *handle, struct inode * inode, + unsigned long block, unsigned long count, int metadata) +{ @@ -2242,7 +2868,7 @@ Index: linux-stage/fs/ext3/mballoc.c + int freed; + + sb = inode->i_sb; -+ if (!test_opt(sb, MBALLOC)) ++ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info) + ext3_free_blocks_sb(handle, sb, block, count, &freed); + else + ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); @@ -2255,6 +2881,7 @@ Index: linux-stage/fs/ext3/mballoc.c +#define EXT3_MB_STATS_NAME "mb_stats" +#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" +#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" ++#define EXT3_MB_ORDER2_REQ "mb_order2_req" + +static int ext3_mb_stats_read(char *page, char **start, off_t off, + int count, int *eof, void *data) @@ -2276,7 +2903,7 @@ Index: linux-stage/fs/ext3/mballoc.c + char str[32]; + + if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", + EXT3_MB_STATS_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } @@ -2310,7 +2937,7 @@ Index: linux-stage/fs/ext3/mballoc.c + long value; + + if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", + EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } @@ -2349,7 +2976,7 @@ Index: linux-stage/fs/ext3/mballoc.c + long value; + + if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", + EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } @@ -2367,260 +2994,134 @@ Index: linux-stage/fs/ext3/mballoc.c + return count; +} + -+int __init init_ext3_proc(void) ++static int ext3_mb_order2_req_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) +{ -+ struct proc_dir_entry *proc_ext3_mb_stats; -+ struct proc_dir_entry *proc_ext3_mb_max_to_scan; -+ struct proc_dir_entry *proc_ext3_mb_min_to_scan; -+ -+ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); -+ if (proc_root_ext3 == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT); -+ return -EIO; -+ } -+ -+ /* Initialize EXT3_MB_STATS_NAME */ -+ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_stats == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", -+ EXT3_MB_STATS_NAME); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } ++ int len; + -+ proc_ext3_mb_stats->data = NULL; -+ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read; -+ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write; ++ *eof = 1; ++ if (off != 0) ++ return 0; + -+ /* Initialize EXT3_MAX_TO_SCAN_NAME */ -+ proc_ext3_mb_max_to_scan = create_proc_entry( -+ EXT3_MB_MAX_TO_SCAN_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_max_to_scan == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", -+ EXT3_MB_MAX_TO_SCAN_NAME); -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } ++ len = sprintf(page, "%ld\n", ext3_mb_order2_reqs); ++ *start = page; ++ return len; ++} + -+ proc_ext3_mb_max_to_scan->data = NULL; -+ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; -+ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; ++static int ext3_mb_order2_req_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; + -+ /* Initialize EXT3_MIN_TO_SCAN_NAME */ -+ proc_ext3_mb_min_to_scan = create_proc_entry( -+ EXT3_MB_MIN_TO_SCAN_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_min_to_scan == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", -+ EXT3_MB_MIN_TO_SCAN_NAME); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; + } + -+ proc_ext3_mb_min_to_scan->data = NULL; -+ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; -+ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; -+ -+ return 0; -+} -+ -+void exit_ext3_proc(void) -+{ -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+} ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; + -Index: linux-stage/fs/ext3/extents.c -=================================================================== ---- linux-stage.orig/fs/ext3/extents.c 2006-07-16 02:29:43.000000000 +0800 -+++ linux-stage/fs/ext3/extents.c 2006-07-16 02:29:49.000000000 +0800 -@@ -771,7 +771,7 @@ cleanup: - for (i = 0; i < depth; i++) { - if (!ablocks[i]) - continue; -- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); -+ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); - } - } - kfree(ablocks); -@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st - path->p_idx->ei_leaf); - bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); - ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); -- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); -+ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); - return err; - } - -@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t - int needed = ext3_remove_blocks_credits(tree, ex, from, to); - handle_t *handle = ext3_journal_start(tree->inode, needed); - struct buffer_head *bh; -- int i; -+ int i, metadata = 0; - - if (IS_ERR(handle)) - return PTR_ERR(handle); -+ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) -+ metadata = 1; - if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { - /* tail removal */ - unsigned long num, start; -@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t - bh = sb_find_get_block(tree->inode->i_sb, start + i); - ext3_forget(handle, 0, tree->inode, bh, start + i); - } -- ext3_free_blocks(handle, tree->inode, start, num); -+ ext3_free_blocks(handle, tree->inode, start, num, metadata); - } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { - printk("strange request: removal %lu-%lu from %u:%u\n", - from, to, ex->ee_block, ex->ee_len); -Index: linux-stage/fs/ext3/xattr.c -=================================================================== ---- linux-stage.orig/fs/ext3/xattr.c 2006-07-16 02:29:43.000000000 +0800 -+++ linux-stage/fs/ext3/xattr.c 2006-07-16 02:29:49.000000000 +0800 -@@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl - ea_bdebug(bh, "refcount now=0; freeing"); - if (ce) - mb_cache_entry_free(ce); -- ext3_free_blocks(handle, inode, bh->b_blocknr, 1); -+ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1); - get_bh(bh); - ext3_forget(handle, 1, inode, bh, bh->b_blocknr); - } else { -@@ -805,7 +805,7 @@ inserted: - new_bh = sb_getblk(sb, block); - if (!new_bh) { - getblk_failed: -- ext3_free_blocks(handle, inode, block, 1); -+ ext3_free_blocks(handle, inode, block, 1, 1); - error = -EIO; - goto cleanup; - } -Index: linux-stage/fs/ext3/balloc.c -=================================================================== ---- linux-stage.orig/fs/ext3/balloc.c 2006-07-16 02:29:43.000000000 +0800 -+++ linux-stage/fs/ext3/balloc.c 2006-07-16 02:33:13.000000000 +0800 -@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ - * - * Return buffer_head on success or NULL in case of failure. - */ --static struct buffer_head * -+struct buffer_head * - read_block_bitmap(struct super_block *sb, unsigned int block_group) - { - struct ext3_group_desc * desc; -@@ -490,24 +490,6 @@ error_return: - return; - } - --/* Free given blocks, update quota and i_blocks field */ --void ext3_free_blocks(handle_t *handle, struct inode *inode, -- ext3_fsblk_t block, unsigned long count) --{ -- struct super_block * sb; -- unsigned long dquot_freed_blocks; -- -- sb = inode->i_sb; -- if (!sb) { -- printk ("ext3_free_blocks: nonexistent device"); -- return; -- } -- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); -- if (dquot_freed_blocks) -- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); -- return; --} -- - /* - * For ext3 allocations, we must not reuse any blocks which are - * allocated in the bitmap buffer's "last committed data" copy. This -@@ -1463,7 +1445,7 @@ out: - return 0; - } - --ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode, -+ext3_fsblk_t ext3_new_block_old(handle_t *handle, struct inode *inode, - ext3_fsblk_t goal, int *errp) - { - unsigned long count = 1; -Index: linux-stage/fs/ext3/super.c -=================================================================== ---- linux-stage.orig/fs/ext3/super.c 2006-07-16 02:29:43.000000000 +0800 -+++ linux-stage/fs/ext3/super.c 2006-07-16 02:29:49.000000000 +0800 -@@ -391,6 +391,7 @@ static void ext3_put_super (struct super - struct ext3_super_block *es = sbi->s_es; - int i; - -+ ext3_mb_release(sb); - ext3_ext_release(sb); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); -@@ -641,7 +642,7 @@ enum { - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -- Opt_extents, Opt_extdebug, -+ Opt_extents, Opt_extdebug, Opt_mballoc, - Opt_grpquota - }; - -@@ -696,6 +697,7 @@ static match_table_t tokens = { - {Opt_iopen_nopriv, "iopen_nopriv"}, - {Opt_extents, "extents"}, - {Opt_extdebug, "extdebug"}, -+ {Opt_mballoc, "mballoc"}, - {Opt_barrier, "barrier=%u"}, - {Opt_err, NULL}, - {Opt_resize, "resize"}, -@@ -1047,6 +1049,9 @@ clear_qf_name: - case Opt_extdebug: - set_opt (sbi->s_mount_opt, EXTDEBUG); - break; -+ case Opt_mballoc: -+ set_opt (sbi->s_mount_opt, MBALLOC); -+ break; - default: - printk (KERN_ERR - "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1773,6 +1778,7 @@ static int ext3_fill_super (struct super - "writeback"); - - ext3_ext_init(sb); -+ ext3_mb_init(sb, needs_recovery); - lock_kernel(); - return 0; - -@@ -2712,7 +2718,13 @@ static struct file_system_type ext3_fs_t - - static int __init init_ext3_fs(void) - { -- int err = init_ext3_xattr(); -+ int err; ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; + -+ err = init_ext3_proc(); -+ if (err) -+ return err; ++ ext3_mb_order2_reqs = value; + -+ err = init_ext3_xattr(); - if (err) - return err; - err = init_inodecache(); -@@ -2734,6 +2746,7 @@ static void __exit exit_ext3_fs(void) - unregister_filesystem(&ext3_fs_type); - destroy_inodecache(); - exit_ext3_xattr(); -+ exit_ext3_proc(); - } - - int ext3_prep_san_write(struct inode *inode, long *blocks, ++ return count; ++} ++ ++int __init init_ext3_proc(void) ++{ ++ struct proc_dir_entry *proc_ext3_mb_stats; ++ struct proc_dir_entry *proc_ext3_mb_max_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_min_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_order2_req; ++ ++ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); ++ if (proc_root_ext3 == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", EXT3_ROOT); ++ return -EIO; ++ } ++ ++ /* Initialize EXT3_MB_STATS_NAME */ ++ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_stats == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_STATS_NAME); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_stats->data = NULL; ++ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read; ++ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write; ++ ++ /* Initialize EXT3_MAX_TO_SCAN_NAME */ ++ proc_ext3_mb_max_to_scan = create_proc_entry( ++ EXT3_MB_MAX_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_max_to_scan == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_MAX_TO_SCAN_NAME); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_max_to_scan->data = NULL; ++ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; ++ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; ++ ++ /* Initialize EXT3_MIN_TO_SCAN_NAME */ ++ proc_ext3_mb_min_to_scan = create_proc_entry( ++ EXT3_MB_MIN_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_min_to_scan == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_MIN_TO_SCAN_NAME); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_min_to_scan->data = NULL; ++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; ++ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; ++ ++ /* Initialize EXT3_ORDER2_REQ */ ++ proc_ext3_mb_order2_req = create_proc_entry( ++ EXT3_MB_ORDER2_REQ, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_order2_req == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_ORDER2_REQ); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_order2_req->data = NULL; ++ proc_ext3_mb_order2_req->read_proc = ext3_mb_order2_req_read; ++ proc_ext3_mb_order2_req->write_proc = ext3_mb_order2_req_write; ++ ++ return 0; ++} ++ ++void exit_ext3_proc(void) ++{ ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++} Index: linux-stage/fs/ext3/Makefile =================================================================== --- linux-stage.orig/fs/ext3/Makefile 2006-07-16 02:29:43.000000000 +0800 @@ -2634,177 +3135,3 @@ Index: linux-stage/fs/ext3/Makefile ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o -Index: linux-stage/include/linux/ext3_fs.h -=================================================================== ---- linux-stage.orig/include/linux/ext3_fs.h 2006-07-16 02:29:43.000000000 +0800 -+++ linux-stage/include/linux/ext3_fs.h 2006-07-16 02:29:49.000000000 +0800 -@@ -53,6 +53,14 @@ - #define ext3_debug(f, a...) do {} while (0) - #endif - -+#define EXT3_MULTIBLOCK_ALLOCATOR 1 -+ -+#define EXT3_MB_HINT_MERGE 1 -+#define EXT3_MB_HINT_RESERVED 2 -+#define EXT3_MB_HINT_METADATA 4 -+#define EXT3_MB_HINT_FIRST 8 -+#define EXT3_MB_HINT_BEST 16 -+ - /* - * Special inodes numbers - */ -@@ -379,6 +387,7 @@ struct ext3_inode { - #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ - #define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */ - #define EXT3_MOUNT_EXTDEBUG 0x2000000/* Extents debug */ -+#define EXT3_MOUNT_MBALLOC 0x4000000/* Buddy allocation support */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef clear_opt -@@ -749,12 +758,12 @@ ext3_group_first_block_no(struct super_b - /* balloc.c */ - extern int ext3_bg_has_super(struct super_block *sb, int group); - extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); --extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode, -- ext3_fsblk_t goal, int *errp); -+//extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode, -+// ext3_fsblk_t goal, int *errp); - extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode, - ext3_fsblk_t goal, unsigned long *count, int *errp); - extern void ext3_free_blocks (handle_t *handle, struct inode *inode, -- ext3_fsblk_t block, unsigned long count); -+ ext3_fsblk_t block, unsigned long count, int metadata); - extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb, - ext3_fsblk_t block, unsigned long count, - unsigned long *pdquot_freed_blocks); -@@ -881,6 +890,17 @@ extern void ext3_extents_initialize_bloc - extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg); - -+/* mballoc.c */ -+extern long ext3_mb_stats; -+extern long ext3_mb_max_to_scan; -+extern int ext3_mb_init(struct super_block *, int); -+extern int ext3_mb_release(struct super_block *); -+extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); -+extern int ext3_mb_reserve_blocks(struct super_block *, int); -+extern void ext3_mb_release_blocks(struct super_block *, int); -+int __init init_ext3_proc(void); -+void exit_ext3_proc(void); -+ - #endif /* __KERNEL__ */ - - /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ -Index: linux-stage/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-stage.orig/include/linux/ext3_fs_sb.h 2006-07-16 02:29:43.000000000 +0800 -+++ linux-stage/include/linux/ext3_fs_sb.h 2006-07-16 02:29:49.000000000 +0800 -@@ -21,8 +21,14 @@ - #include - #include - #include -+#include - #endif - #include -+#include -+ -+struct ext3_buddy_group_blocks; -+struct ext3_mb_history; -+#define EXT3_BB_MAX_BLOCKS - - /* - * third extended-fs super-block data in memory -@@ -78,6 +84,38 @@ struct ext3_sb_info { - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ - int s_jquota_fmt; /* Format of quota to use */ - #endif -+ -+ /* for buddy allocator */ -+ struct ext3_group_info **s_group_info; -+ struct inode *s_buddy_cache; -+ long s_blocks_reserved; -+ spinlock_t s_reserve_lock; -+ struct list_head s_active_transaction; -+ struct list_head s_closed_transaction; -+ struct list_head s_committed_transaction; -+ spinlock_t s_md_lock; -+ tid_t s_last_transaction; -+ int s_mb_factor; -+ unsigned short *s_mb_offsets, *s_mb_maxs; -+ -+ /* history to debug policy */ -+ struct ext3_mb_history *s_mb_history; -+ int s_mb_history_cur; -+ int s_mb_history_max; -+ struct proc_dir_entry *s_mb_proc; -+ spinlock_t s_mb_history_lock; -+ -+ /* stats for buddy allocator */ -+ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ -+ atomic_t s_bal_success; /* we found long enough chunks */ -+ atomic_t s_bal_allocated; /* in blocks */ -+ atomic_t s_bal_ex_scanned; /* total extents scanned */ -+ atomic_t s_bal_goals; /* goal hits */ -+ atomic_t s_bal_breaks; /* too long searches */ -+ atomic_t s_bal_2orders; /* 2^order hits */ -+ spinlock_t s_bal_lock; -+ unsigned long s_mb_buddies_generated; -+ unsigned long long s_mb_generation_time; - }; - - #endif /* _LINUX_EXT3_FS_SB */ -Index: linux-stage/fs/ext3/inode.c -=================================================================== ---- linux-stage.orig/fs/ext3/inode.c 2006-07-16 02:29:43.000000000 +0800 -+++ linux-stage/fs/ext3/inode.c 2006-07-16 02:29:49.000000000 +0800 -@@ -562,7 +562,7 @@ static int ext3_alloc_blocks(handle_t *h - return ret; - failed_out: - for (i = 0; i