Index: linux-2.6.16.i686/include/linux/ext3_fs.h =================================================================== --- linux-2.6.16.i686.orig/include/linux/ext3_fs.h 2006-05-30 22:55:32.000000000 +0800 +++ linux-2.6.16.i686/include/linux/ext3_fs.h 2006-05-30 23:02:59.000000000 +0800 @@ -57,6 +57,14 @@ #define ext3_debug(f, a...) do {} while (0) #endif +#define EXT3_MULTIBLOCK_ALLOCATOR 1 + +#define EXT3_MB_HINT_MERGE 1 +#define EXT3_MB_HINT_RESERVED 2 +#define EXT3_MB_HINT_METADATA 4 +#define EXT3_MB_HINT_FIRST 8 +#define EXT3_MB_HINT_BEST 16 + /* * Special inodes numbers */ @@ -383,6 +391,7 @@ struct ext3_inode { #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ #define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */ #define EXT3_MOUNT_EXTDEBUG 0x2000000/* Extents debug */ +#define EXT3_MOUNT_MBALLOC 0x4000000/* Buddy allocation support */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef clear_opt @@ -404,6 +413,14 @@ #define ext3_find_first_zero_bit ext2_find_first_zero_bit #define ext3_find_next_zero_bit ext2_find_next_zero_bit +#ifndef ext2_find_next_le_bit +#ifdef __LITTLE_ENDIAN +#define ext2_find_next_le_bit(addr, size, off) find_next_bit((addr), (size), (off)) +#else +#error "mballoc needs a patch for big-endian systems - CFS bug 10634" +#endif /* __LITTLE_ENDIAN */ +#endif /* !ext2_find_next_le_bit */ + /* * Maximal mount counts between two filesystem checks */ @@ -744,7 +753,7 @@ extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, - unsigned long); + unsigned long, int); extern void ext3_free_blocks_sb (handle_t *, struct super_block *, unsigned long, unsigned long, int *); extern unsigned long ext3_count_free_blocks (struct super_block *); @@ -865,6 +874,17 @@ extern int ext3_ext_ioctl(struct inode *inode, struct file 
*filp, unsigned int cmd, unsigned long arg); +/* mballoc.c */ +extern long ext3_mb_stats; +extern long ext3_mb_max_to_scan; +extern int ext3_mb_init(struct super_block *, int); +extern int ext3_mb_release(struct super_block *); +extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); +extern int ext3_mb_reserve_blocks(struct super_block *, int); +extern void ext3_mb_release_blocks(struct super_block *, int); +int __init init_ext3_proc(void); +void exit_ext3_proc(void); + #endif /* __KERNEL__ */ /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ Index: linux-2.6.16.i686/include/linux/ext3_fs_sb.h =================================================================== --- linux-2.6.16.i686.orig/include/linux/ext3_fs_sb.h 2006-03-20 13:53:29.000000000 +0800 +++ linux-2.6.16.i686/include/linux/ext3_fs_sb.h 2006-05-30 23:02:59.000000000 +0800 @@ -21,8 +21,14 @@ #include #include #include +#include #endif #include +#include + +struct ext3_buddy_group_blocks; +struct ext3_mb_history; +#define EXT3_BB_MAX_BLOCKS /* * third extended-fs super-block data in memory @@ -78,6 +84,43 @@ struct ext3_sb_info { char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ #endif + + /* for buddy allocator */ + struct ext3_group_info ***s_group_info; + struct inode *s_buddy_cache; + long s_blocks_reserved; + spinlock_t s_reserve_lock; + struct list_head s_active_transaction; + struct list_head s_closed_transaction; + struct list_head s_committed_transaction; + spinlock_t s_md_lock; + tid_t s_last_transaction; + int s_mb_factor; + unsigned short *s_mb_offsets, *s_mb_maxs; + unsigned long s_stripe; + + /* history to debug policy */ + struct ext3_mb_history *s_mb_history; + int s_mb_history_cur; + int s_mb_history_max; + struct proc_dir_entry *s_mb_proc; + spinlock_t s_mb_history_lock; + + /* stats for buddy allocator */ + atomic_t s_bal_reqs; /* number of reqs with 
len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + spinlock_t s_bal_lock; + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; }; + +#define EXT3_GROUP_INFO(sb, group) \ + EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \ + [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)] #endif /* _LINUX_EXT3_FS_SB */ Index: linux-2.6.16.i686/fs/ext3/super.c =================================================================== --- linux-2.6.16.i686.orig/fs/ext3/super.c 2006-05-30 22:55:32.000000000 +0800 +++ linux-2.6.16.i686/fs/ext3/super.c 2006-05-30 23:02:59.000000000 +0800 @@ -392,6 +392,7 @@ static void ext3_put_super (struct super struct ext3_super_block *es = sbi->s_es; int i; + ext3_mb_release(sb); ext3_ext_release(sb); ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); @@ -640,6 +641,7 @@ enum { Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, Opt_extents, Opt_noextents, Opt_extdebug, + Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_grpquota }; @@ -694,6 +695,9 @@ static match_table_t tokens = { {Opt_extents, "extents"}, {Opt_noextents, "noextents"}, {Opt_extdebug, "extdebug"}, + {Opt_mballoc, "mballoc"}, + {Opt_nomballoc, "nomballoc"}, + {Opt_stripe, "stripe=%u"}, {Opt_barrier, "barrier=%u"}, {Opt_err, NULL}, {Opt_resize, "resize"}, @@ -1041,6 +1043,19 @@ clear_qf_name: case Opt_extdebug: set_opt (sbi->s_mount_opt, EXTDEBUG); break; + case Opt_mballoc: + set_opt(sbi->s_mount_opt, MBALLOC); + break; + case Opt_nomballoc: + clear_opt(sbi->s_mount_opt, MBALLOC); + break; + case Opt_stripe: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + sbi->s_stripe = option; + break; default: printk (KERN_ERR 
"EXT3-fs: Unrecognized mount option \"%s\" " @@ -1766,6 +1771,7 @@ static int ext3_fill_super (struct super ext3_count_dirs(sb)); ext3_ext_init(sb); + ext3_mb_init(sb, needs_recovery); lock_kernel(); return 0; @@ -2699,7 +2705,13 @@ static struct file_system_type ext3_fs_t static int __init init_ext3_fs(void) { - int err = init_ext3_xattr(); + int err; + + err = init_ext3_proc(); + if (err) + return err; + + err = init_ext3_xattr(); if (err) return err; err = init_inodecache(); @@ -2721,6 +2733,7 @@ static void __exit exit_ext3_fs(void) unregister_filesystem(&ext3_fs_type); destroy_inodecache(); exit_ext3_xattr(); + exit_ext3_proc(); } int ext3_prep_san_write(struct inode *inode, long *blocks, Index: linux-2.6.16.i686/fs/ext3/extents.c =================================================================== --- linux-2.6.16.i686.orig/fs/ext3/extents.c 2006-05-30 22:55:32.000000000 +0800 +++ linux-2.6.16.i686/fs/ext3/extents.c 2006-05-30 23:02:59.000000000 +0800 @@ -771,7 +771,7 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; - ext3_free_blocks(handle, tree->inode, ablocks[i], 1); + ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); } } kfree(ablocks); @@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st path->p_idx->ei_leaf); bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); - ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); + ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); return err; } @@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t int needed = ext3_remove_blocks_credits(tree, ex, from, to); handle_t *handle = ext3_journal_start(tree->inode, needed); struct buffer_head *bh; - int i; + int i, metadata = 0; if (IS_ERR(handle)) return PTR_ERR(handle); + if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) + metadata = 1; if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { /* tail 
removal */ unsigned long num, start; @@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t bh = sb_find_get_block(tree->inode->i_sb, start + i); ext3_forget(handle, 0, tree->inode, bh, start + i); } - ext3_free_blocks(handle, tree->inode, start, num); + ext3_free_blocks(handle, tree->inode, start, num, metadata); } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", from, to, ex->ee_block, ex->ee_len); Index: linux-2.6.16.i686/fs/ext3/inode.c =================================================================== --- linux-2.6.16.i686.orig/fs/ext3/inode.c 2006-05-30 22:55:32.000000000 +0800 +++ linux-2.6.16.i686/fs/ext3/inode.c 2006-05-30 23:02:59.000000000 +0800 @@ -568,7 +568,7 @@ failed: ext3_journal_forget(handle, branch[i].bh); } for (i = 0; i < keys; i++) - ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); + ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); return err; } @@ -1862,7 +1862,7 @@ static void ext3_clear_blocks(handle_t * } } - ext3_free_blocks(handle, inode, block_to_free, count); + ext3_free_blocks(handle, inode, block_to_free, count, 1); } /** @@ -2035,7 +2035,7 @@ static void ext3_free_branches(handle_t ext3_journal_test_restart(handle, inode); } - ext3_free_blocks(handle, inode, nr, 1); + ext3_free_blocks(handle, inode, nr, 1, 1); if (parent_bh) { /* Index: linux-2.6.16.i686/fs/ext3/balloc.c =================================================================== --- linux-2.6.16.i686.orig/fs/ext3/balloc.c 2006-03-20 13:53:29.000000000 +0800 +++ linux-2.6.16.i686/fs/ext3/balloc.c 2006-05-30 23:02:59.000000000 +0800 @@ -80,7 +80,7 @@ struct ext3_group_desc * ext3_get_group_ * * Return buffer_head on success or NULL in case of failure. 
*/ -static struct buffer_head * +struct buffer_head * read_block_bitmap(struct super_block *sb, unsigned int block_group) { struct ext3_group_desc * desc; @@ -491,24 +491,6 @@ error_return: return; } -/* Free given blocks, update quota and i_blocks field */ -void ext3_free_blocks(handle_t *handle, struct inode *inode, - unsigned long block, unsigned long count) -{ - struct super_block * sb; - int dquot_freed_blocks; - - sb = inode->i_sb; - if (!sb) { - printk ("ext3_free_blocks: nonexistent device"); - return; - } - ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); - if (dquot_freed_blocks) - DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); - return; -} - /* * For ext3 allocations, we must not reuse any blocks which are * allocated in the bitmap buffer's "last committed data" copy. This @@ -1154,7 +1136,7 @@ out: * bitmap, and then for any free bit if that fails. * This function also updates quota and i_blocks field. */ -int ext3_new_block(handle_t *handle, struct inode *inode, +int ext3_new_block_old(handle_t *handle, struct inode *inode, unsigned long goal, int *errp) { struct buffer_head *bitmap_bh = NULL; Index: linux-2.6.16.i686/fs/ext3/xattr.c =================================================================== --- linux-2.6.16.i686.orig/fs/ext3/xattr.c 2006-03-20 13:53:29.000000000 +0800 +++ linux-2.6.16.i686/fs/ext3/xattr.c 2006-05-30 23:02:59.000000000 +0800 @@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl ea_bdebug(bh, "refcount now=0; freeing"); if (ce) mb_cache_entry_free(ce); - ext3_free_blocks(handle, inode, bh->b_blocknr, 1); + ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1); get_bh(bh); ext3_forget(handle, 1, inode, bh, bh->b_blocknr); } else { @@ -804,7 +804,7 @@ inserted: new_bh = sb_getblk(sb, block); if (!new_bh) { getblk_failed: - ext3_free_blocks(handle, inode, block, 1); + ext3_free_blocks(handle, inode, block, 1, 1); error = -EIO; goto cleanup; } Index: linux-2.6.16.i686/fs/ext3/mballoc.c 
=================================================================== --- linux-2.6.16.i686.orig/fs/ext3/mballoc.c 2006-05-31 04:14:15.752410384 +0800 +++ linux-2.6.16.i686/fs/ext3/mballoc.c 2006-05-30 23:03:38.000000000 +0800 @@ -0,0 +1,2729 @@ +/* + * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +/* + * mballoc.c contains the multiblocks allocation routines + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * TODO: + * - bitmap read-ahead (proposed by Oleg Drokin aka green) + * - track min/max extents in each group for better group selection + * - mb_mark_used() may allocate chunk right after splitting buddy + * - special flag to advise allocator to look for requested + N blocks + * this may improve interaction between extents and mballoc + * - tree of groups sorted by number of free blocks + * - percpu reservation code (hotpath) + * - error handling + */ + +/* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + */ +#define MB_DEBUG__ +#ifdef MB_DEBUG +#define mb_debug(fmt,a...) printk(fmt, ##a) +#else +#define mb_debug(fmt,a...) 
+#endif + +/* + * with EXT3_MB_HISTORY mballoc stores last N allocations in memory + * and you can monitor it in /proc/fs/ext3//mb_history + */ +#define EXT3_MB_HISTORY + +/* + * How long mballoc can look for a best extent (in found extents) + */ +long ext3_mb_max_to_scan = 500; + +/* + * How long mballoc must look for a best extent + */ +long ext3_mb_min_to_scan = 30; + +/* + * with 'ext3_mb_stats' allocator will collect stats that will be + * shown at umount. The collecting costs though! + */ + +long ext3_mb_stats = 1; + +/* + * for which requests use 2^N search using buddies + */ +long ext3_mb_order2_reqs = 8; + +#ifdef EXT3_BB_MAX_BLOCKS +#undef EXT3_BB_MAX_BLOCKS +#endif +#define EXT3_BB_MAX_BLOCKS 30 + +struct ext3_free_metadata { + unsigned short group; + unsigned short num; + unsigned short blocks[EXT3_BB_MAX_BLOCKS]; + struct list_head list; +}; + +struct ext3_group_info { + unsigned long bb_state; + unsigned long bb_tid; + struct ext3_free_metadata *bb_md_cur; + unsigned short bb_first_free; + unsigned short bb_free; + unsigned short bb_fragments; + unsigned short bb_counters[]; +}; + + +#define EXT3_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT3_GROUP_INFO_LOCKED_BIT 1 + +#define EXT3_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state)) + +struct ext3_free_extent { + __u16 fe_start; + __u16 fe_len; + __u16 fe_group; +}; + +struct ext3_allocation_context { + struct super_block *ac_sb; + + /* search goals */ + struct ext3_free_extent ac_g_ex; + + /* the best found extent */ + struct ext3_free_extent ac_b_ex; + + /* number of iterations done. 
we have to track to limit searching */ + unsigned long ac_ex_scanned; + __u16 ac_groups_scanned; + __u16 ac_found; + __u16 ac_tail; + __u16 ac_buddy; + __u8 ac_status; + __u8 ac_flags; /* allocation hints */ + __u8 ac_criteria; + __u8 ac_repeats; + __u8 ac_2order; /* if request is to allocate 2^N blocks and + * N > 0, the field stores N, otherwise 0 */ + + struct page *ac_buddy_page; + struct page *ac_bitmap_page; +}; + +#define AC_STATUS_CONTINUE 1 +#define AC_STATUS_FOUND 2 +#define AC_STATUS_BREAK 3 + +struct ext3_mb_history { + struct ext3_free_extent goal; /* goal allocation */ + struct ext3_free_extent result; /* result allocation */ + unsigned pid; + unsigned ino; + __u16 found; /* how many extents have been found */ + __u16 groups; /* how many groups have been scanned */ + __u16 tail; /* what tail broke some buddy */ + __u16 buddy; /* buddy the tail ^^^ broke */ + __u8 cr; /* which phase the result extent was found at */ + __u8 merged; +}; + +struct ext3_buddy { + struct page *bd_buddy_page; + void *bd_buddy; + struct page *bd_bitmap_page; + void *bd_bitmap; + struct ext3_group_info *bd_info; + struct super_block *bd_sb; + __u16 bd_blkbits; + __u16 bd_group; +}; +#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) +#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) + +#ifndef EXT3_MB_HISTORY +#define ext3_mb_store_history(sb,ino,ac) +#else +static void ext3_mb_store_history(struct super_block *, unsigned ino, + struct ext3_allocation_context *ac); +#endif + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +static struct proc_dir_entry *proc_root_ext3; + +int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); +struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); +int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); +int ext3_mb_reserve_blocks(struct super_block *, int); +void ext3_mb_release_blocks(struct super_block *, int); +void ext3_mb_poll_new_transaction(struct 
super_block *, handle_t *); +void ext3_mb_free_committed_blocks(struct super_block *); + +#if BITS_PER_LONG == 64 +#define mb_correct_addr_and_bit(bit,addr) \ +{ \ + bit += ((unsigned long) addr & 7UL) << 3; \ + addr = (void *) ((unsigned long) addr & ~7UL); \ +} +#elif BITS_PER_LONG == 32 +#define mb_correct_addr_and_bit(bit,addr) \ +{ \ + bit += ((unsigned long) addr & 3UL) << 3; \ + addr = (void *) ((unsigned long) addr & ~3UL); \ +} +#else +#error "how many bits you are?!" +#endif + +static inline int mb_test_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); + return ext2_test_bit(bit, addr); +} + +static inline void mb_set_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); + ext2_set_bit(bit, addr); +} + +static inline void mb_set_bit_atomic(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); + ext2_set_bit_atomic(NULL, bit, addr); +} + +static inline void mb_clear_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); + ext2_clear_bit(bit, addr); +} + +static inline void mb_clear_bit_atomic(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); + ext2_clear_bit_atomic(NULL, bit, addr); +} + +static inline int mb_find_next_zero_bit(void *addr, int max, int start) +{ + int fix; +#if BITS_PER_LONG == 64 + fix = ((unsigned long) addr & 7UL) << 3; + addr = (void *) ((unsigned long) addr & ~7UL); +#elif BITS_PER_LONG == 32 + fix = ((unsigned long) addr & 3UL) << 3; + addr = (void *) ((unsigned long) addr & ~3UL); +#else +#error "how many bits you are?!" 
+#endif + max += fix; + start += fix; + return ext2_find_next_zero_bit(addr, max, start) - fix; +} + +static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) +{ + char *bb; + + J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); + J_ASSERT(max != NULL); + + if (order > e3b->bd_blkbits + 1) { + *max = 0; + return NULL; + } + + /* at order 0 we see each particular block */ + *max = 1 << (e3b->bd_blkbits + 3); + if (order == 0) + return EXT3_MB_BITMAP(e3b); + + bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order]; + *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order]; + + return bb; +} + +#ifdef AGGRESSIVE_CHECK + +static void mb_check_buddy(struct ext3_buddy *e3b) +{ + int order = e3b->bd_blkbits + 1; + int max, max2, i, j, k, count; + int fragments = 0, fstart; + void *buddy, *buddy2; + + if (!test_opt(e3b->bd_sb, MBALLOC)) + return; + + { + static int mb_check_counter = 0; + if (mb_check_counter++ % 300 != 0) + return; + } + + while (order > 1) { + buddy = mb_find_buddy(e3b, order, &max); + J_ASSERT(buddy); + buddy2 = mb_find_buddy(e3b, order - 1, &max2); + J_ASSERT(buddy2); + J_ASSERT(buddy != buddy2); + J_ASSERT(max * 2 == max2); + + count = 0; + for (i = 0; i < max; i++) { + + if (mb_test_bit(i, buddy)) { + /* only single bit in buddy2 may be 1 */ + if (!mb_test_bit(i << 1, buddy2)) + J_ASSERT(mb_test_bit((i<<1)+1, buddy2)); + else if (!mb_test_bit((i << 1) + 1, buddy2)) + J_ASSERT(mb_test_bit(i << 1, buddy2)); + continue; + } + + /* both bits in buddy2 must be 0 */ + J_ASSERT(mb_test_bit(i << 1, buddy2)); + J_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); + + for (j = 0; j < (1 << order); j++) { + k = (i * (1 << order)) + j; + J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b))); + } + count++; + } + J_ASSERT(e3b->bd_info->bb_counters[order] == count); + order--; + } + + fstart = -1; + buddy = mb_find_buddy(e3b, 0, &max); + for (i = 0; i < max; i++) { + if (!mb_test_bit(i, buddy)) { + J_ASSERT(i >= e3b->bd_info->bb_first_free); + if (fstart 
== -1) { + fragments++; + fstart = i; + } + continue; + } + fstart = -1; + /* check used bits only */ + for (j = 0; j < e3b->bd_blkbits + 1; j++) { + buddy2 = mb_find_buddy(e3b, j, &max2); + k = i >> j; + J_ASSERT(k < max2); + J_ASSERT(mb_test_bit(k, buddy2)); + } + } + J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info)); + J_ASSERT(e3b->bd_info->bb_fragments == fragments); +} + +#else +#define mb_check_buddy(e3b) +#endif + +/* find most significant bit */ +static int inline fmsb(unsigned short word) +{ + int order; + + if (word > 255) { + order = 7; + word >>= 8; + } else { + order = -1; + } + + do { + order++; + word >>= 1; + } while (word != 0); + + return order; +} + +static void inline +ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first, + int len, struct ext3_group_info *grp) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + unsigned short min, max, chunk, border; + + mb_debug("mark %u/%u free\n", first, len); + J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb)); + + border = 2 << sb->s_blocksize_bits; + + while (len > 0) { + /* find how many blocks can be covered since this position */ + max = ffs(first | border) - 1; + + /* find how many blocks of power 2 we need to mark */ + min = fmsb(len); + + mb_debug(" %u/%u -> max %u, min %u\n", + first & ((2 << sb->s_blocksize_bits) - 1), + len, max, min); + + if (max < min) + min = max; + chunk = 1 << min; + + /* mark multiblock chunks only */ + grp->bb_counters[min]++; + if (min > 0) { + mb_debug(" set %u at %u \n", first >> min, + sbi->s_mb_offsets[min]); + mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); + } + + len -= chunk; + first += chunk; + } +} + +static void +ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, + int group) +{ + struct ext3_group_info *grp = EXT3_GROUP_INFO(sb, group); + unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); + unsigned short i = 0, first, len; + unsigned free = 0, fragments = 0; + unsigned long long period = get_cycles(); + + i = 
mb_find_next_zero_bit(bitmap, max, 0); + grp->bb_first_free = i; + while (i < max) { + fragments++; + first = i; + i = ext2_find_next_le_bit(bitmap, max, i); + len = i - first; + free += len; + if (len > 1) + ext3_mb_mark_free_simple(sb, buddy, first, len, grp); + else + grp->bb_counters[0]++; + if (i < max) + i = mb_find_next_zero_bit(bitmap, max, i); + } + grp->bb_fragments = fragments; + + /* bb_state shouldn't being modified because all + * others waits for init completion on page lock */ + clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); + if (free != grp->bb_free) { + printk("EXT3-fs: group %u: %u blocks in bitmap, %u in gd\n", + group, free, grp->bb_free); + grp->bb_free = free; + } + + period = get_cycles() - period; + spin_lock(&EXT3_SB(sb)->s_bal_lock); + EXT3_SB(sb)->s_mb_buddies_generated++; + EXT3_SB(sb)->s_mb_generation_time += period; + spin_unlock(&EXT3_SB(sb)->s_bal_lock); +} + +static int ext3_mb_init_cache(struct page *page) +{ + int blocksize, blocks_per_page, groups_per_page; + int err = 0, i, first_group, first_block; + struct super_block *sb; + struct buffer_head *bhs; + struct buffer_head **bh; + struct inode *inode; + char *data, *bitmap; + + mb_debug("init page %lu\n", page->index); + + inode = page->mapping->host; + sb = inode->i_sb; + blocksize = 1 << inode->i_blkbits; + blocks_per_page = PAGE_CACHE_SIZE / blocksize; + + groups_per_page = blocks_per_page >> 1; + if (groups_per_page == 0) + groups_per_page = 1; + + /* allocate buffer_heads to read bitmaps */ + if (groups_per_page > 1) { + err = -ENOMEM; + i = sizeof(struct buffer_head *) * groups_per_page; + bh = kmalloc(i, GFP_NOFS); + if (bh == NULL) + goto out; + memset(bh, 0, i); + } else + bh = &bhs; + + first_group = page->index * blocks_per_page / 2; + + /* read all groups the page covers into the cache */ + for (i = 0; i < groups_per_page; i++) { + struct ext3_group_desc * desc; + + if (first_group + i >= EXT3_SB(sb)->s_groups_count) + break; + + err = -EIO; + desc = 
ext3_get_group_desc(sb, first_group + i, NULL); + if (desc == NULL) + goto out; + + err = -ENOMEM; + bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); + if (bh[i] == NULL) + goto out; + + if (buffer_uptodate(bh[i])) + continue; + + lock_buffer(bh[i]); + if (buffer_uptodate(bh[i])) { + unlock_buffer(bh[i]); + continue; + } + + get_bh(bh[i]); + bh[i]->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh[i]); + mb_debug("read bitmap for group %u\n", first_group + i); + } + + /* wait for I/O completion */ + for (i = 0; i < groups_per_page && bh[i]; i++) + wait_on_buffer(bh[i]); + + err = -EIO; + for (i = 0; i < groups_per_page && bh[i]; i++) + if (!buffer_uptodate(bh[i])) + goto out; + + first_block = page->index * blocks_per_page; + for (i = 0; i < blocks_per_page; i++) { + int group; + + group = (first_block + i) >> 1; + if (group >= EXT3_SB(sb)->s_groups_count) + break; + + data = page_address(page) + (i * blocksize); + bitmap = bh[group - first_group]->b_data; + + if ((first_block + i) & 1) { + /* this is block of buddy */ + mb_debug("put buddy for group %u in page %lu/%x\n", + group, page->index, i * blocksize); + memset(data, 0xff, blocksize); + EXT3_GROUP_INFO(sb, group)->bb_fragments = 0; + memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0, + sizeof(unsigned short)*(sb->s_blocksize_bits+2)); + ext3_mb_generate_buddy(sb, data, bitmap, group); + } else { + /* this is block of bitmap */ + mb_debug("put bitmap for group %u in page %lu/%x\n", + group, page->index, i * blocksize); + memcpy(data, bitmap, blocksize); + } + } + SetPageUptodate(page); + +out: + if (bh) { + for (i = 0; i < groups_per_page && bh[i]; i++) + brelse(bh[i]); + if (bh != &bhs) + kfree(bh); + } + return err; +} + +static int ext3_mb_load_buddy(struct super_block *sb, int group, + struct ext3_buddy *e3b) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + int blocks_per_page, block, pnum, poff; + struct page *page; + + mb_debug("load group %u\n", 
group); + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + + e3b->bd_blkbits = sb->s_blocksize_bits; + e3b->bd_info = EXT3_GROUP_INFO(sb, group); + e3b->bd_sb = sb; + e3b->bd_group = group; + e3b->bd_buddy_page = NULL; + e3b->bd_bitmap_page = NULL; + + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + + /* we could use find_or_create_page(), but it locks page + * what we'd like to avoid in fast path ... */ + page = find_get_page(inode->i_mapping, pnum); + if (page == NULL || !PageUptodate(page)) { + if (page) + page_cache_release(page); + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { + BUG_ON(page->mapping != inode->i_mapping); + if (!PageUptodate(page)) + ext3_mb_init_cache(page); + unlock_page(page); + } + } + if (page == NULL || !PageUptodate(page)) + goto err; + e3b->bd_bitmap_page = page; + e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + mark_page_accessed(page); + + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + + page = find_get_page(inode->i_mapping, pnum); + if (page == NULL || !PageUptodate(page)) { + if (page) + page_cache_release(page); + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { + BUG_ON(page->mapping != inode->i_mapping); + if (!PageUptodate(page)) + ext3_mb_init_cache(page); + unlock_page(page); + } + } + if (page == NULL || !PageUptodate(page)) + goto err; + e3b->bd_buddy_page = page; + e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); + mark_page_accessed(page); + + J_ASSERT(e3b->bd_bitmap_page != NULL); + J_ASSERT(e3b->bd_buddy_page != NULL); + + return 0; + +err: + if (e3b->bd_bitmap_page) + page_cache_release(e3b->bd_bitmap_page); + if (e3b->bd_buddy_page) + page_cache_release(e3b->bd_buddy_page); + e3b->bd_buddy = NULL; + e3b->bd_bitmap = NULL; + return -EIO; +} + +static void ext3_mb_release_desc(struct ext3_buddy *e3b) +{ + if (e3b->bd_bitmap_page) + 
page_cache_release(e3b->bd_bitmap_page); + if (e3b->bd_buddy_page) + page_cache_release(e3b->bd_buddy_page); +} + + +static inline void +ext3_lock_group(struct super_block *sb, int group) +{ + bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, + &EXT3_GROUP_INFO(sb, group)->bb_state); +} + +static inline void +ext3_unlock_group(struct super_block *sb, int group) +{ + bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, + &EXT3_GROUP_INFO(sb, group)->bb_state); +} + +static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) +{ + int order = 1; + void *bb; + + J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); + J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); + + bb = EXT3_MB_BUDDY(e3b); + while (order <= e3b->bd_blkbits + 1) { + block = block >> 1; + if (!mb_test_bit(block, bb)) { + /* this block is part of buddy of order 'order' */ + return order; + } + bb += 1 << (e3b->bd_blkbits - order); + order++; + } + return 0; +} + +static inline void mb_clear_bits(void *bm, int cur, int len) +{ + __u32 *addr; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: clear whole word at once */ + addr = bm + (cur >> 3); + *addr = 0; + cur += 32; + continue; + } + mb_clear_bit_atomic(cur, bm); + cur++; + } +} + +static inline void mb_set_bits(void *bm, int cur, int len) +{ + __u32 *addr; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: set whole word at once */ + addr = bm + (cur >> 3); + *addr = 0xffffffff; + cur += 32; + continue; + } + mb_set_bit_atomic(cur, bm); + cur++; + } +} + +static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) +{ + int block = 0, max = 0, order; + void *buddy, *buddy2; + + mb_check_buddy(e3b); + + e3b->bd_info->bb_free += count; + if (first < e3b->bd_info->bb_first_free) + e3b->bd_info->bb_first_free = first; + + /* let's maintain fragments counter */ + if (first != 0) + block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b)); + if 
(first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) + max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b)); + if (block && max) + e3b->bd_info->bb_fragments--; + else if (!block && !max) + e3b->bd_info->bb_fragments++; + + /* let's maintain buddy itself */ + while (count-- > 0) { + block = first++; + order = 0; + + J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); + mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); + e3b->bd_info->bb_counters[order]++; + + /* start of the buddy */ + buddy = mb_find_buddy(e3b, order, &max); + + do { + block &= ~1UL; + if (mb_test_bit(block, buddy) || + mb_test_bit(block + 1, buddy)) + break; + + /* both the buddies are free, try to coalesce them */ + buddy2 = mb_find_buddy(e3b, order + 1, &max); + + if (!buddy2) + break; + + if (order > 0) { + /* for special purposes, we don't set + * free bits in bitmap */ + mb_set_bit(block, buddy); + mb_set_bit(block + 1, buddy); + } + e3b->bd_info->bb_counters[order]--; + e3b->bd_info->bb_counters[order]--; + + block = block >> 1; + order++; + e3b->bd_info->bb_counters[order]++; + + mb_clear_bit(block, buddy2); + buddy = buddy2; + } while (1); + } + mb_check_buddy(e3b); + + return 0; +} + +static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, + int needed, struct ext3_free_extent *ex) +{ + int next = block, max, ord; + void *buddy; + + J_ASSERT(ex != NULL); + + buddy = mb_find_buddy(e3b, order, &max); + J_ASSERT(buddy); + J_ASSERT(block < max); + if (mb_test_bit(block, buddy)) { + ex->fe_len = 0; + ex->fe_start = 0; + ex->fe_group = 0; + return 0; + } + + if (likely(order == 0)) { + /* find actual order */ + order = mb_find_order_for_block(e3b, block); + block = block >> order; + } + + ex->fe_len = 1 << order; + ex->fe_start = block << order; + ex->fe_group = e3b->bd_group; + + /* calc difference from given start */ + next = next - ex->fe_start; + ex->fe_len -= next; + ex->fe_start += next; + + while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { + + if (block 
+ 1 >= max) + break; + + next = (block + 1) * (1 << order); + if (mb_test_bit(next, EXT3_MB_BITMAP(e3b))) + break; + + ord = mb_find_order_for_block(e3b, next); + + order = ord; + block = next >> order; + ex->fe_len += 1 << order; + } + + J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); + return ex->fe_len; +} + +static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) +{ + int ord, mlen = 0, max = 0, cur; + int start = ex->fe_start; + int len = ex->fe_len; + unsigned ret = 0; + int len0 = len; + void *buddy; + + mb_check_buddy(e3b); + + e3b->bd_info->bb_free -= len; + if (e3b->bd_info->bb_first_free == start) + e3b->bd_info->bb_first_free += len; + + /* let's maintain fragments counter */ + if (start != 0) + mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b)); + if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) + max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b)); + if (mlen && max) + e3b->bd_info->bb_fragments++; + else if (!mlen && !max) + e3b->bd_info->bb_fragments--; + + /* let's maintain buddy itself */ + while (len) { + ord = mb_find_order_for_block(e3b, start); + + if (((start >> ord) << ord) == start && len >= (1 << ord)) { + /* the whole chunk may be allocated at once! 
*/ + mlen = 1 << ord; + buddy = mb_find_buddy(e3b, ord, &max); + J_ASSERT((start >> ord) < max); + mb_set_bit(start >> ord, buddy); + e3b->bd_info->bb_counters[ord]--; + start += mlen; + len -= mlen; + J_ASSERT(len >= 0); + continue; + } + + /* store for history */ + if (ret == 0) + ret = len | (ord << 16); + + /* we have to split large buddy */ + J_ASSERT(ord > 0); + buddy = mb_find_buddy(e3b, ord, &max); + mb_set_bit(start >> ord, buddy); + e3b->bd_info->bb_counters[ord]--; + + ord--; + cur = (start >> ord) & ~1U; + buddy = mb_find_buddy(e3b, ord, &max); + mb_clear_bit(cur, buddy); + mb_clear_bit(cur + 1, buddy); + e3b->bd_info->bb_counters[ord]++; + e3b->bd_info->bb_counters[ord]++; + } + + /* now drop all the bits in bitmap */ + mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); + + mb_check_buddy(e3b); + + return ret; +} + +/* + * Must be called under group lock! + */ +static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, + struct ext3_buddy *e3b) +{ + unsigned long ret; + + ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); + ret = mb_mark_used(e3b, &ac->ac_b_ex); + + ac->ac_status = AC_STATUS_FOUND; + ac->ac_tail = ret & 0xffff; + ac->ac_buddy = ret >> 16; + + /* hold in-core structures until allocated + * blocks are marked non-free in on-disk bitmap */ + ac->ac_buddy_page = e3b->bd_buddy_page; + page_cache_get(e3b->bd_buddy_page); + ac->ac_bitmap_page = e3b->bd_bitmap_page; + page_cache_get(e3b->bd_bitmap_page); +} + +/* + * The routine checks whether found extent is good enough. If it is, + * then the extent gets marked used and flag is set to the context + * to stop scanning. Otherwise, the extent is compared with the + * previous found extent and if new one is better, then it's stored + * in the context. Later, the best found extent will be used, if + * mballoc can't find good enough extent. + * + * FIXME: real allocation policy is to be designed yet! 
+ */ +static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, + struct ext3_free_extent *ex, + struct ext3_buddy *e3b) +{ + struct ext3_free_extent *bex = &ac->ac_b_ex; + struct ext3_free_extent *gex = &ac->ac_g_ex; + + J_ASSERT(ex->fe_len > 0); + J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); + J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); + + ac->ac_found++; + + /* + * The special case - take what you catch first + */ + if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) { + *bex = *ex; + ext3_mb_use_best_found(ac, e3b); + return; + } + + /* + * Let's check whether the chunk is good enough + */ + if (ex->fe_len == gex->fe_len) { + *bex = *ex; + ext3_mb_use_best_found(ac, e3b); + return; + } + + /* + * If this is first found extent, just store it in the context + */ + if (bex->fe_len == 0) { + *bex = *ex; + return; + } + + /* + * If new found extent is better, store it in the context + */ + if (bex->fe_len < gex->fe_len) { + /* if the request isn't satisfied, any found extent + * larger than previous best one is better */ + if (ex->fe_len > bex->fe_len) + *bex = *ex; + } else if (ex->fe_len > gex->fe_len) { + /* if the request is satisfied, then we try to find + * an extent that still satisfy the request, but is + * smaller than previous one */ + *bex = *ex; + } + + /* + * Let's scan at least few extents and don't pick up a first one + */ + if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan) + ac->ac_status = AC_STATUS_BREAK; + + /* + * We don't want to scan for a whole year + */ + if (ac->ac_found > ext3_mb_max_to_scan) + ac->ac_status = AC_STATUS_BREAK; +} + +static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, + struct ext3_buddy *e3b) +{ + struct ext3_free_extent ex = ac->ac_b_ex; + int group = ex.fe_group, max, err; + + J_ASSERT(ex.fe_len > 0); + err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); + if (err) + return err; + + ext3_lock_group(ac->ac_sb, group); + max = 
mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); + + if (max > 0) { + ac->ac_b_ex = ex; + ext3_mb_use_best_found(ac, e3b); + } + + ext3_unlock_group(ac->ac_sb, group); + + ext3_mb_release_desc(e3b); + + return 0; +} + +static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, + struct ext3_buddy *e3b) +{ + int group = ac->ac_g_ex.fe_group, max, err; + struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); + struct ext3_super_block *es = sbi->s_es; + struct ext3_free_extent ex; + + err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); + if (err) + return err; + + ext3_lock_group(ac->ac_sb, group); + max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, + ac->ac_g_ex.fe_len, &ex); + + if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { + unsigned long start; + start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) + + ex.fe_start + le32_to_cpu(es->s_first_data_block)); + if (start % sbi->s_stripe == 0) { + ac->ac_found++; + ac->ac_b_ex = ex; + ext3_mb_use_best_found(ac, e3b); + } + } else if (max >= ac->ac_g_ex.fe_len) { + J_ASSERT(ex.fe_len > 0); + J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); + J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); + ac->ac_found++; + ac->ac_b_ex = ex; + ext3_mb_use_best_found(ac, e3b); + } else if (max > 0 && (ac->ac_flags & EXT3_MB_HINT_MERGE)) { + /* Sometimes, caller may want to merge even small + * number of blocks to an existing extent */ + J_ASSERT(ex.fe_len > 0); + J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); + J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); + ac->ac_found++; + ac->ac_b_ex = ex; + ext3_mb_use_best_found(ac, e3b); + } + ext3_unlock_group(ac->ac_sb, group); + + ext3_mb_release_desc(e3b); + + return 0; +} + +/* + * The routine scans buddy structures (not bitmap!) 
from given order + * to max order and tries to find big enough chunk to satisfy the req + */ +static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac, + struct ext3_buddy *e3b) +{ + struct super_block *sb = ac->ac_sb; + struct ext3_group_info *grp = e3b->bd_info; + void *buddy; + int i, k, max; + + J_ASSERT(ac->ac_2order > 0); + for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { + if (grp->bb_counters[i] == 0) + continue; + + buddy = mb_find_buddy(e3b, i, &max); + if (buddy == NULL) { + printk(KERN_ALERT "looking for wrong order?\n"); + break; + } + + k = mb_find_next_zero_bit(buddy, max, 0); + J_ASSERT(k < max); + + ac->ac_found++; + + ac->ac_b_ex.fe_len = 1 << i; + ac->ac_b_ex.fe_start = k << i; + ac->ac_b_ex.fe_group = e3b->bd_group; + + ext3_mb_use_best_found(ac, e3b); + J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len); + + if (unlikely(ext3_mb_stats)) + atomic_inc(&EXT3_SB(sb)->s_bal_2orders); + + break; + } +} + +/* + * The routine scans the group and measures all found extents. + * In order to optimize scanning, caller must pass number of + * free blocks in the group, so the routine can know upper limit. 
+ */ +static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac, + struct ext3_buddy *e3b) +{ + struct super_block *sb = ac->ac_sb; + void *bitmap = EXT3_MB_BITMAP(e3b); + struct ext3_free_extent ex; + int i, free; + + free = e3b->bd_info->bb_free; + J_ASSERT(free > 0); + + i = e3b->bd_info->bb_first_free; + + while (free && ac->ac_status == AC_STATUS_CONTINUE) { + i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); + if (i >= sb->s_blocksize * 8) { + J_ASSERT(free == 0); + break; + } + + mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); + J_ASSERT(ex.fe_len > 0); + J_ASSERT(free >= ex.fe_len); + + ext3_mb_measure_extent(ac, &ex, e3b); + + i += ex.fe_len; + free -= ex.fe_len; + } +} + +/* + * This is a special case for storages like raid5 + * we try to find stripe-aligned chunks for stripe-size requests + */ +static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac, + struct ext3_buddy *e3b) +{ + struct super_block *sb = ac->ac_sb; + struct ext3_sb_info *sbi = EXT3_SB(sb); + void *bitmap = EXT3_MB_BITMAP(e3b); + struct ext3_free_extent ex; + unsigned long i, max; + + J_ASSERT(sbi->s_stripe != 0); + + /* find first stripe-aligned block */ + i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(sbi->s_es->s_first_data_block); + i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe; + i = (i - le32_to_cpu(sbi->s_es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb); + + while (i < sb->s_blocksize * 8) { + if (!mb_test_bit(i, bitmap)) { + max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex); + if (max >= sbi->s_stripe) { + ac->ac_found++; + ac->ac_b_ex = ex; + ext3_mb_use_best_found(ac, e3b); + break; + } + } + i += sbi->s_stripe; + } +} + +static int ext3_mb_good_group(struct ext3_allocation_context *ac, + int group, int cr) +{ + struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group); + unsigned free, fragments, i, bits; + + J_ASSERT(cr >= 0 && cr < 4); + J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp)); + + free = 
grp->bb_free; + fragments = grp->bb_fragments; + if (free == 0) + return 0; + if (fragments == 0) + return 0; + + switch (cr) { + case 0: + J_ASSERT(ac->ac_2order != 0); + bits = ac->ac_sb->s_blocksize_bits + 1; + for (i = ac->ac_2order; i <= bits; i++) + if (grp->bb_counters[i] > 0) + return 1; + break; + case 1: + if ((free / fragments) >= ac->ac_g_ex.fe_len) + return 1; + break; + case 2: + if (free >= ac->ac_g_ex.fe_len) + return 1; + break; + case 3: + return 1; + default: + BUG(); + } + + return 0; +} + +int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, + unsigned long goal, int *len, int flags, int *errp) +{ + struct buffer_head *bitmap_bh = NULL; + struct ext3_allocation_context ac; + int i, group, block, cr, err = 0; + struct ext3_group_desc *gdp; + struct ext3_super_block *es; + struct buffer_head *gdp_bh; + struct ext3_sb_info *sbi; + struct super_block *sb; + struct ext3_buddy e3b; + + J_ASSERT(len != NULL); + J_ASSERT(*len > 0); + + sb = inode->i_sb; + if (!sb) { + printk("ext3_mb_new_nblocks: nonexistent device"); + return 0; + } + + if (!test_opt(sb, MBALLOC)) { + static int ext3_mballoc_warning = 0; + if (ext3_mballoc_warning == 0) { + printk(KERN_ERR "EXT3-fs: multiblock request with " + "mballoc disabled!\n"); + ext3_mballoc_warning++; + } + *len = 1; + err = ext3_new_block_old(handle, inode, goal, errp); + return err; + } + + ext3_mb_poll_new_transaction(sb, handle); + + sbi = EXT3_SB(sb); + es = EXT3_SB(sb)->s_es; + + /* + * We can't allocate > group size + */ + if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10) + *len = EXT3_BLOCKS_PER_GROUP(sb) - 10; + + if (!(flags & EXT3_MB_HINT_RESERVED)) { + /* someone asks for non-reserved blocks */ + BUG_ON(*len > 1); + err = ext3_mb_reserve_blocks(sb, 1); + if (err) { + *errp = err; + return 0; + } + } + + ac.ac_buddy_page = NULL; + ac.ac_bitmap_page = NULL; + + /* + * Check quota for allocation of this blocks. 
+ */ + while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) + *len -= 1; + if (*len == 0) { + *errp = -EDQUOT; + block = 0; + goto out; + } + + /* start searching from the goal */ + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= le32_to_cpu(es->s_blocks_count)) + goal = le32_to_cpu(es->s_first_data_block); + group = (goal - le32_to_cpu(es->s_first_data_block)) / + EXT3_BLOCKS_PER_GROUP(sb); + block = ((goal - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb)); + + /* set up allocation goals */ + ac.ac_b_ex.fe_group = 0; + ac.ac_b_ex.fe_start = 0; + ac.ac_b_ex.fe_len = 0; + ac.ac_status = AC_STATUS_CONTINUE; + ac.ac_groups_scanned = 0; + ac.ac_ex_scanned = 0; + ac.ac_found = 0; + ac.ac_sb = inode->i_sb; + ac.ac_g_ex.fe_group = group; + ac.ac_g_ex.fe_start = block; + ac.ac_g_ex.fe_len = *len; + ac.ac_flags = flags; + ac.ac_2order = 0; + ac.ac_criteria = 0; + + if (*len == 1 && sbi->s_stripe) { + /* looks like a metadata, let's use a dirty hack for raid5 + * move all metadata in first groups in hope to hit cached + * sectors and thus avoid read-modify cycles in raid5 */ + ac.ac_g_ex.fe_group = group = 0; + } + + /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */ + i = ffs(*len); + if (i >= ext3_mb_order2_reqs) { + i--; + if ((*len & (~(1 << i))) == 0) + ac.ac_2order = i; + } + + /* first, try the goal */ + err = ext3_mb_find_by_goal(&ac, &e3b); + if (err) + goto out_err; + if (ac.ac_status == AC_STATUS_FOUND) + goto found; + + /* Let's just scan groups to find more-less suitable blocks */ + cr = ac.ac_2order ? 
0 : 1; +repeat: + for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { + ac.ac_criteria = cr; + for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { + if (group == EXT3_SB(sb)->s_groups_count) + group = 0; + + if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) { + /* we need full data about the group + * to make a good selection */ + err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); + if (err) + goto out_err; + ext3_mb_release_desc(&e3b); + } + + /* check is group good for our criteries */ + if (!ext3_mb_good_group(&ac, group, cr)) + continue; + + err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); + if (err) + goto out_err; + + ext3_lock_group(sb, group); + if (!ext3_mb_good_group(&ac, group, cr)) { + /* someone did allocation from this group */ + ext3_unlock_group(sb, group); + ext3_mb_release_desc(&e3b); + continue; + } + + ac.ac_groups_scanned++; + if (cr == 0) + ext3_mb_simple_scan_group(&ac, &e3b); + else if (cr == 1 && *len == sbi->s_stripe) + ext3_mb_scan_aligned(&ac, &e3b); + else + ext3_mb_complex_scan_group(&ac, &e3b); + + ext3_unlock_group(sb, group); + + ext3_mb_release_desc(&e3b); + + if (ac.ac_status != AC_STATUS_CONTINUE) + break; + } + } + + if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && + !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { + /* + * We've been searching too long. Let's try to allocate + * the best chunk we've found so far + */ + + /*if (ac.ac_found > ext3_mb_max_to_scan) + printk(KERN_DEBUG "EXT3-fs: too long searching at " + "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len, + ac.ac_g_ex.fe_len);*/ + ext3_mb_try_best_found(&ac, &e3b); + if (ac.ac_status != AC_STATUS_FOUND) { + /* + * Someone more lucky has already allocated it. 
+ * The only thing we can do is just take first + * found block(s) + printk(KERN_DEBUG "EXT3-fs: someone won our chunk\n"); + */ + ac.ac_b_ex.fe_group = 0; + ac.ac_b_ex.fe_start = 0; + ac.ac_b_ex.fe_len = 0; + ac.ac_status = AC_STATUS_CONTINUE; + ac.ac_flags |= EXT3_MB_HINT_FIRST; + cr = 3; + goto repeat; + } + } + + if (ac.ac_status != AC_STATUS_FOUND) { + /* + * We aren't lucky definitely + */ + DQUOT_FREE_BLOCK(inode, *len); + *errp = -ENOSPC; + block = 0; +#if 1 + printk(KERN_ERR "EXT3-fs: can't allocate: status %d flags %d\n", + ac.ac_status, ac.ac_flags); + printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d cr %d\n", + ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, + ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); + printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", + sbi->s_blocks_reserved, ac.ac_found); + printk("EXT3-fs: groups: "); + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) + printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free); + printk("\n"); +#endif + goto out; + } + +found: + J_ASSERT(ac.ac_b_ex.fe_len > 0); + + /* good news - free block(s) have been found. 
now it's time + * to mark block(s) in good old journaled bitmap */ + block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) + + ac.ac_b_ex.fe_start + + le32_to_cpu(es->s_first_data_block); + + /* we made a desicion, now mark found blocks in good old + * bitmap to be journaled */ + + ext3_debug("using block group %d(%d)\n", + ac.ac_b_group.group, gdp->bg_free_blocks_count); + + bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group); + if (!bitmap_bh) { + *errp = -EIO; + goto out_err; + } + + err = ext3_journal_get_write_access(handle, bitmap_bh); + if (err) { + *errp = err; + goto out_err; + } + + gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh); + if (!gdp) { + *errp = -EIO; + goto out_err; + } + + err = ext3_journal_get_write_access(handle, gdp_bh); + if (err) + goto out_err; + + block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) + + ac.ac_b_ex.fe_start + + le32_to_cpu(es->s_first_data_block); + + if (block == le32_to_cpu(gdp->bg_block_bitmap) || + block == le32_to_cpu(gdp->bg_inode_bitmap) || + in_range(block, le32_to_cpu(gdp->bg_inode_table), + EXT3_SB(sb)->s_itb_per_group)) + ext3_error(sb, "ext3_new_block", + "Allocating block in system zone - " + "block = %u", block); +#ifdef AGGRESSIVE_CHECK + for (i = 0; i < ac.ac_b_ex.fe_len; i++) + J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data)); +#endif + mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); + + spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); + gdp->bg_free_blocks_count = + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + - ac.ac_b_ex.fe_len); + spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); + percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len); + + err = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (err) + goto out_err; + err = ext3_journal_dirty_metadata(handle, gdp_bh); + if (err) + goto out_err; + + sb->s_dirt = 1; + *errp = 0; + brelse(bitmap_bh); + + /* drop non-allocated, but dquote'd blocks */ + J_ASSERT(*len 
>= ac.ac_b_ex.fe_len); + DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len); + + *len = ac.ac_b_ex.fe_len; + J_ASSERT(*len > 0); + J_ASSERT(block != 0); + goto out; + +out_err: + /* if we've already allocated something, roll it back */ + if (ac.ac_status == AC_STATUS_FOUND) { + /* FIXME: free blocks here */ + } + + DQUOT_FREE_BLOCK(inode, *len); + brelse(bitmap_bh); + *errp = err; + block = 0; +out: + if (ac.ac_buddy_page) + page_cache_release(ac.ac_buddy_page); + if (ac.ac_bitmap_page) + page_cache_release(ac.ac_bitmap_page); + + if (!(flags & EXT3_MB_HINT_RESERVED)) { + /* block wasn't reserved before and we reserved it + * at the beginning of allocation. it doesn't matter + * whether we allocated anything or we failed: time + * to release reservation. NOTE: because I expect + * any multiblock request from delayed allocation + * path only, here is single block always */ + ext3_mb_release_blocks(sb, 1); + } + + if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { + atomic_inc(&sbi->s_bal_reqs); + atomic_add(*len, &sbi->s_bal_allocated); + if (*len >= ac.ac_g_ex.fe_len) + atomic_inc(&sbi->s_bal_success); + atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned); + if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && + ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) + atomic_inc(&sbi->s_bal_goals); + if (ac.ac_found > ext3_mb_max_to_scan) + atomic_inc(&sbi->s_bal_breaks); + } + + ext3_mb_store_history(sb, inode->i_ino, &ac); + + return block; +} +EXPORT_SYMBOL(ext3_mb_new_blocks); + +#ifdef EXT3_MB_HISTORY +struct ext3_mb_proc_session { + struct ext3_mb_history *history; + struct super_block *sb; + int start; + int max; +}; + +static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s, + struct ext3_mb_history *hs, + int first) +{ + if (hs == s->history + s->max) + hs = s->history; + if (!first && hs == s->history + s->start) + return NULL; + while (hs->goal.fe_len == 0) { + hs++; + if (hs == s->history + s->max) + hs = s->history; + if (hs == s->history + s->start) 
+ return NULL; + } + return hs; +} + +static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos) +{ + struct ext3_mb_proc_session *s = seq->private; + struct ext3_mb_history *hs; + int l = *pos; + + if (l == 0) + return SEQ_START_TOKEN; + hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1); + if (!hs) + return NULL; + while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL); + return hs; +} + +static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ext3_mb_proc_session *s = seq->private; + struct ext3_mb_history *hs = v; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ext3_mb_history_skip_empty(s, s->history + s->start, 1); + else + return ext3_mb_history_skip_empty(s, ++hs, 0); +} + +static int ext3_mb_seq_history_show(struct seq_file *seq, void *v) +{ + struct ext3_mb_history *hs = v; + char buf[20], buf2[20]; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "%-5s %-8s %-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", + "pid", "inode", "goal", "result", "found", "grps", "cr", + "merge", "tail", "broken"); + return 0; + } + + sprintf(buf, "%u/%u/%u", hs->goal.fe_group, + hs->goal.fe_start, hs->goal.fe_len); + sprintf(buf2, "%u/%u/%u", hs->result.fe_group, + hs->result.fe_start, hs->result.fe_len); + seq_printf(seq, "%-5u %-8u %-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", + hs->pid, hs->ino, buf, buf2, hs->found, hs->groups, + hs->cr, hs->merged ? "M" : "", hs->tail, + hs->buddy ? 
1 << hs->buddy : 0);
+	return 0;
+}
+
+static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v)
+{
+}
+
+static struct seq_operations ext3_mb_seq_history_ops = {
+	.start = ext3_mb_seq_history_start,
+	.next = ext3_mb_seq_history_next,
+	.stop = ext3_mb_seq_history_stop,
+	.show = ext3_mb_seq_history_show,
+};
+
+static int ext3_mb_seq_history_open(struct inode *inode, struct file *file)
+{
+	struct super_block *sb = PDE(inode)->data;
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	struct ext3_mb_proc_session *s;
+	int rc, size;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (s == NULL)
+		return -EIO;
+	size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max;
+	s->history = kmalloc(size, GFP_KERNEL);
+	if (s->history == NULL) {
+		kfree(s);
+		return -EIO;
+	}
+
+	spin_lock(&sbi->s_mb_history_lock);
+	memcpy(s->history, sbi->s_mb_history, size);
+	s->max = sbi->s_mb_history_max;
+	s->start = sbi->s_mb_history_cur % s->max;
+	spin_unlock(&sbi->s_mb_history_lock);
+
+	rc = seq_open(file, &ext3_mb_seq_history_ops);
+	if (rc == 0) {
+		struct seq_file *m = (struct seq_file *)file->private_data;
+		m->private = s;
+	} else {
+		kfree(s->history);
+		kfree(s);
+	}
+	return rc;
+
+}
+
+static int ext3_mb_seq_history_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = (struct seq_file *)file->private_data;
+	struct ext3_mb_proc_session *s = seq->private;
+	kfree(s->history);
+	kfree(s);
+	return seq_release(inode, file);
+}
+
+static struct file_operations ext3_mb_seq_history_fops = {
+	.owner = THIS_MODULE,
+	.open = ext3_mb_seq_history_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = ext3_mb_seq_history_release,
+};
+
+static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
+{
+	struct super_block *sb = seq->private;
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	long group;
+
+	if (*pos < 0 || *pos >= sbi->s_groups_count)
+		return NULL;
+
+	group = *pos + 1;
+	return (void *) group;
+}
+
+static void 
*ext3_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct super_block *sb = seq->private;
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	long group;
+
+	++*pos;
+	if (*pos < 0 || *pos >= sbi->s_groups_count)
+		return NULL;
+	group = *pos + 1;
+	return (void *) group;
+}
+
+static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v)
+{
+	struct super_block *sb = seq->private;
+	long group = (long) v, i;
+	struct sg {
+		struct ext3_group_info info;
+		unsigned short counters[16];
+	} sg;
+
+	group--;
+	if (group == 0)
+		seq_printf(seq, "#%-5s: %-5s %-5s %-5s [ %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
+			   "group", "free", "frags", "first", "2^0", "2^1", "2^2",
+			   "2^3", "2^4", "2^5", "2^6", "2^7", "2^8", "2^9", "2^10",
+			   "2^11", "2^12", "2^13");
+
+	i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
+		sizeof(struct ext3_group_info);
+	ext3_lock_group(sb, group);
+	memcpy(&sg, EXT3_GROUP_INFO(sb, group), i);
+	ext3_unlock_group(sb, group);
+
+	if (EXT3_MB_GRP_NEED_INIT(&sg.info))
+		return 0;
+
+	seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free,
+			sg.info.bb_fragments, sg.info.bb_first_free);
+	for (i = 0; i <= 13; i++)
+		seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
+ sg.info.bb_counters[i] : 0); + seq_printf(seq, " ]\n"); + + return 0; +} + +static void ext3_mb_seq_groups_stop(struct seq_file *seq, void *v) +{ +} + +static struct seq_operations ext3_mb_seq_groups_ops = { + .start = ext3_mb_seq_groups_start, + .next = ext3_mb_seq_groups_next, + .stop = ext3_mb_seq_groups_stop, + .show = ext3_mb_seq_groups_show, +}; + +static int ext3_mb_seq_groups_open(struct inode *inode, struct file *file) +{ + struct super_block *sb = PDE(inode)->data; + int rc; + + rc = seq_open(file, &ext3_mb_seq_groups_ops); + if (rc == 0) { + struct seq_file *m = (struct seq_file *)file->private_data; + m->private = sb; + } + return rc; + +} + +static struct file_operations ext3_mb_seq_groups_fops = { + .owner = THIS_MODULE, + .open = ext3_mb_seq_groups_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void ext3_mb_history_release(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char name[64]; + + snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); + remove_proc_entry("mb_groups", sbi->s_mb_proc); + remove_proc_entry("mb_history", sbi->s_mb_proc); + remove_proc_entry(name, proc_root_ext3); + + if (sbi->s_mb_history) + kfree(sbi->s_mb_history); +} + +static void ext3_mb_history_init(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char name[64]; + int i; + + snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); + sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3); + if (sbi->s_mb_proc != NULL) { + struct proc_dir_entry *p; + p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); + if (p) { + p->proc_fops = &ext3_mb_seq_history_fops; + p->data = sb; + } + p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); + if (p) { + p->proc_fops = &ext3_mb_seq_groups_fops; + p->data = sb; + } + } + + sbi->s_mb_history_max = 1000; + sbi->s_mb_history_cur = 0; + spin_lock_init(&sbi->s_mb_history_lock); + i = sbi->s_mb_history_max * sizeof(struct 
ext3_mb_history);
+	sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
+	if (sbi->s_mb_history != NULL) memset(sbi->s_mb_history, 0, i);
+	/* if we can't allocate history, then we simply won't use it */
+}
+
+static void
+ext3_mb_store_history(struct super_block *sb, unsigned ino,
+		      struct ext3_allocation_context *ac)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	struct ext3_mb_history h;
+
+	if (likely(sbi->s_mb_history == NULL))
+		return;
+
+	h.pid = current->pid;
+	h.ino = ino;
+	h.goal = ac->ac_g_ex;
+	h.result = ac->ac_b_ex;
+	h.found = ac->ac_found;
+	h.cr = ac->ac_criteria;
+	h.groups = ac->ac_groups_scanned;
+	h.tail = ac->ac_tail;
+	h.buddy = ac->ac_buddy;
+	h.merged = 0;
+	if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
+	    ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
+		h.merged = 1;
+
+	spin_lock(&sbi->s_mb_history_lock);
+	memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
+	if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
+		sbi->s_mb_history_cur = 0;
+	spin_unlock(&sbi->s_mb_history_lock);
+}
+
+#else
+#define ext3_mb_history_release(sb)
+#define ext3_mb_history_init(sb)
+#endif
+
+int ext3_mb_init_backend(struct super_block *sb)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	int i, j, len, metalen;
+	int num_meta_group_infos =
+		(sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >>
+			EXT3_DESC_PER_BLOCK_BITS(sb);
+	struct ext3_group_info **meta_group_info;
+
+	/* An 8TB filesystem with 64-bit pointers requires a 4096 byte
+	 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
+	 * So a two level scheme suffices for now. 
*/ + sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * + num_meta_group_infos, GFP_KERNEL); + if (sbi->s_group_info == NULL) { + printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n"); + return -ENOMEM; + } + sbi->s_buddy_cache = new_inode(sb); + if (sbi->s_buddy_cache == NULL) { + printk(KERN_ERR "EXT3-fs: can't get new inode\n"); + goto err_freesgi; + } + + metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb); + for (i = 0; i < num_meta_group_infos; i++) { + if ((i + 1) == num_meta_group_infos) + metalen = sizeof(*meta_group_info) * + (sbi->s_groups_count - + (i << EXT3_DESC_PER_BLOCK_BITS(sb))); + meta_group_info = kmalloc(metalen, GFP_KERNEL); + if (meta_group_info == NULL) { + printk(KERN_ERR "EXT3-fs: can't allocate mem for a " + "buddy group\n"); + goto err_freemeta; + } + sbi->s_group_info[i] = meta_group_info; + } + + /* + * calculate needed size. if change bb_counters size, + * don't forget about ext3_mb_generate_buddy() + */ + len = sizeof(struct ext3_group_info); + len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); + for (i = 0; i < sbi->s_groups_count; i++) { + struct ext3_group_desc * desc; + + meta_group_info = + sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)]; + j = i & (EXT3_DESC_PER_BLOCK(sb) - 1); + + meta_group_info[j] = kmalloc(len, GFP_KERNEL); + if (meta_group_info[j] == NULL) { + printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n"); + i--; + goto err_freebuddy; + } + desc = ext3_get_group_desc(sb, i, NULL); + if (desc == NULL) { + printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i); + goto err_freebuddy; + } + memset(meta_group_info[j], 0, len); + set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, + &meta_group_info[j]->bb_state); + meta_group_info[j]->bb_free = + le16_to_cpu(desc->bg_free_blocks_count); + } + + return 0; + +err_freebuddy: + while (i >= 0) { + kfree(EXT3_GROUP_INFO(sb, i)); + i--; + } + i = num_meta_group_infos; +err_freemeta: + while (--i >= 0) + kfree(sbi->s_group_info[i]); + 
iput(sbi->s_buddy_cache);
+err_freesgi:
+	kfree(sbi->s_group_info);
+	return -ENOMEM;
+}
+
+int ext3_mb_init(struct super_block *sb, int needs_recovery)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	struct inode *root = sb->s_root->d_inode;
+	unsigned i, offset, max;
+	struct dentry *dentry;
+
+	if (!test_opt(sb, MBALLOC))
+		return 0;
+
+	i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
+
+	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
+	if (sbi->s_mb_offsets == NULL) {
+		clear_opt(sbi->s_mount_opt, MBALLOC);
+		return -ENOMEM;
+	}
+	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
+	if (sbi->s_mb_maxs == NULL) {
+		clear_opt(sbi->s_mount_opt, MBALLOC);
+		kfree(sbi->s_mb_offsets);
+		return -ENOMEM;
+	}
+
+	/* order 0 is regular bitmap */
+	sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
+	sbi->s_mb_offsets[0] = 0;
+
+	i = 1;
+	offset = 0;
+	max = sb->s_blocksize << 2;
+	do {
+		sbi->s_mb_offsets[i] = offset;
+		sbi->s_mb_maxs[i] = max;
+		offset += 1 << (sb->s_blocksize_bits - i);
+		max = max >> 1;
+		i++;
+	} while (i <= sb->s_blocksize_bits + 1);
+
+	/* init file for buddy data */
+	if ((i = ext3_mb_init_backend(sb))) {
+		clear_opt(sbi->s_mount_opt, MBALLOC);
+		kfree(sbi->s_mb_offsets);
+		kfree(sbi->s_mb_maxs);
+		return i;
+	}
+
+	spin_lock_init(&sbi->s_reserve_lock);
+	spin_lock_init(&sbi->s_md_lock);
+	INIT_LIST_HEAD(&sbi->s_active_transaction);
+	INIT_LIST_HEAD(&sbi->s_closed_transaction);
+	INIT_LIST_HEAD(&sbi->s_committed_transaction);
+	spin_lock_init(&sbi->s_bal_lock);
+
+	/* remove old on-disk buddy file */
+	mutex_lock(&root->i_mutex);
+	dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy"));
+	if (dentry->d_inode != NULL) {
+		i = vfs_unlink(root, dentry);
+		if (i != 0)
+			printk("EXT3-fs: can't remove .buddy file: %d\n", i);
+	}
+	dput(dentry);
+	mutex_unlock(&root->i_mutex);
+
+	ext3_mb_history_init(sb);
+
+	printk("EXT3-fs: mballoc enabled\n");
+	return 0;
+}
+
+int ext3_mb_release(struct super_block *sb)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	int i, 
num_meta_group_infos; + + if (!test_opt(sb, MBALLOC)) + return 0; + + /* release freed, non-committed blocks */ + spin_lock(&sbi->s_md_lock); + list_splice_init(&sbi->s_closed_transaction, + &sbi->s_committed_transaction); + list_splice_init(&sbi->s_active_transaction, + &sbi->s_committed_transaction); + spin_unlock(&sbi->s_md_lock); + ext3_mb_free_committed_blocks(sb); + + if (sbi->s_group_info) { + for (i = 0; i < sbi->s_groups_count; i++) + kfree(EXT3_GROUP_INFO(sb, i)); + num_meta_group_infos = (sbi->s_groups_count + + EXT3_DESC_PER_BLOCK(sb) - 1) >> + EXT3_DESC_PER_BLOCK_BITS(sb); + for (i = 0; i < num_meta_group_infos; i++) + kfree(sbi->s_group_info[i]); + kfree(sbi->s_group_info); + } + if (sbi->s_mb_offsets) + kfree(sbi->s_mb_offsets); + if (sbi->s_mb_maxs) + kfree(sbi->s_mb_maxs); + if (sbi->s_buddy_cache) + iput(sbi->s_buddy_cache); + if (sbi->s_blocks_reserved) + printk("ext3-fs: %ld blocks being reserved at umount!\n", + sbi->s_blocks_reserved); + if (ext3_mb_stats) { + printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n", + atomic_read(&sbi->s_bal_allocated), + atomic_read(&sbi->s_bal_reqs), + atomic_read(&sbi->s_bal_success)); + printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, " + "%u 2^N hits, %u breaks\n", + atomic_read(&sbi->s_bal_ex_scanned), + atomic_read(&sbi->s_bal_goals), + atomic_read(&sbi->s_bal_2orders), + atomic_read(&sbi->s_bal_breaks)); + printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n", + sbi->s_mb_buddies_generated++, + sbi->s_mb_generation_time); + } + + ext3_mb_history_release(sb); + + return 0; +} + +void ext3_mb_free_committed_blocks(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + int err, i, count = 0, count2 = 0; + struct ext3_free_metadata *md; + struct ext3_buddy e3b; + + if (list_empty(&sbi->s_committed_transaction)) + return; + + /* there is committed blocks to be freed yet */ + do { + /* get next array of blocks */ + md = NULL; + spin_lock(&sbi->s_md_lock); + if 
(!list_empty(&sbi->s_committed_transaction)) { + md = list_entry(sbi->s_committed_transaction.next, + struct ext3_free_metadata, list); + list_del(&md->list); + } + spin_unlock(&sbi->s_md_lock); + + if (md == NULL) + break; + + mb_debug("gonna free %u blocks in group %u (0x%p):", + md->num, md->group, md); + + err = ext3_mb_load_buddy(sb, md->group, &e3b); + /* we expect to find existing buddy because it's pinned */ + BUG_ON(err != 0); + + /* there are blocks to put in buddy to make them really free */ + count += md->num; + count2++; + ext3_lock_group(sb, md->group); + for (i = 0; i < md->num; i++) { + mb_debug(" %u", md->blocks[i]); + mb_free_blocks(&e3b, md->blocks[i], 1); + } + mb_debug("\n"); + ext3_unlock_group(sb, md->group); + + /* balance refcounts from ext3_mb_free_metadata() */ + page_cache_release(e3b.bd_buddy_page); + page_cache_release(e3b.bd_bitmap_page); + + kfree(md); + ext3_mb_release_desc(&e3b); + + } while (md); + mb_debug("freed %u blocks in %u structures\n", count, count2); +} + +void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (sbi->s_last_transaction == handle->h_transaction->t_tid) + return; + + /* new transaction! time to close last one and free blocks for + * committed transaction. we know that only transaction can be + * active, so previos transaction can be being logged and we + * know that transaction before previous is known to be already + * logged. this means that now we may free blocks freed in all + * transactions before previous one. hope I'm clear enough ... 
*/ + + spin_lock(&sbi->s_md_lock); + if (sbi->s_last_transaction != handle->h_transaction->t_tid) { + mb_debug("new transaction %lu, old %lu\n", + (unsigned long) handle->h_transaction->t_tid, + (unsigned long) sbi->s_last_transaction); + list_splice_init(&sbi->s_closed_transaction, + &sbi->s_committed_transaction); + list_splice_init(&sbi->s_active_transaction, + &sbi->s_closed_transaction); + sbi->s_last_transaction = handle->h_transaction->t_tid; + } + spin_unlock(&sbi->s_md_lock); + + ext3_mb_free_committed_blocks(sb); +} + +int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, + int group, int block, int count) +{ + struct ext3_group_info *db = e3b->bd_info; + struct super_block *sb = e3b->bd_sb; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_free_metadata *md; + int i; + + J_ASSERT(e3b->bd_bitmap_page != NULL); + J_ASSERT(e3b->bd_buddy_page != NULL); + + ext3_lock_group(sb, group); + for (i = 0; i < count; i++) { + md = db->bb_md_cur; + if (md && db->bb_tid != handle->h_transaction->t_tid) { + db->bb_md_cur = NULL; + md = NULL; + } + + if (md == NULL) { + ext3_unlock_group(sb, group); + md = kmalloc(sizeof(*md), GFP_KERNEL); + if (md == NULL) + return -ENOMEM; + md->num = 0; + md->group = group; + + ext3_lock_group(sb, group); + if (db->bb_md_cur == NULL) { + spin_lock(&sbi->s_md_lock); + list_add(&md->list, &sbi->s_active_transaction); + spin_unlock(&sbi->s_md_lock); + /* protect buddy cache from being freed, + * otherwise we'll refresh it from + * on-disk bitmap and lose not-yet-available + * blocks */ + page_cache_get(e3b->bd_buddy_page); + page_cache_get(e3b->bd_bitmap_page); + db->bb_md_cur = md; + db->bb_tid = handle->h_transaction->t_tid; + mb_debug("new md 0x%p for group %u\n", + md, md->group); + } else { + kfree(md); + md = db->bb_md_cur; + } + } + + BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); + md->blocks[md->num] = block + i; + md->num++; + if (md->num == EXT3_BB_MAX_BLOCKS) { + /* no more space, put full container on a sb's list 
*/ + db->bb_md_cur = NULL; + } + } + ext3_unlock_group(sb, group); + return 0; +} + +void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, + unsigned long block, unsigned long count, + int metadata, int *freed) +{ + struct buffer_head *bitmap_bh = NULL; + struct ext3_group_desc *gdp; + struct ext3_super_block *es; + unsigned long bit, overflow; + struct buffer_head *gd_bh; + unsigned long block_group; + struct ext3_sb_info *sbi; + struct super_block *sb; + struct ext3_buddy e3b; + int err = 0, ret; + + *freed = 0; + sb = inode->i_sb; + if (!sb) { + printk ("ext3_free_blocks: nonexistent device"); + return; + } + + ext3_mb_poll_new_transaction(sb, handle); + + sbi = EXT3_SB(sb); + es = EXT3_SB(sb)->s_es; + if (block < le32_to_cpu(es->s_first_data_block) || + block + count < block || + block + count > le32_to_cpu(es->s_blocks_count)) { + ext3_error (sb, "ext3_free_blocks", + "Freeing blocks not in datazone - " + "block = %lu, count = %lu", block, count); + goto error_return; + } + + ext3_debug("freeing block %lu\n", block); + +do_more: + overflow = 0; + block_group = (block - le32_to_cpu(es->s_first_data_block)) / + EXT3_BLOCKS_PER_GROUP(sb); + bit = (block - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb); + /* + * Check to see if we are freeing blocks across a group + * boundary. 
+ */ + if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { + overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); + count -= overflow; + } + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + gdp = ext3_get_group_desc (sb, block_group, &gd_bh); + if (!gdp) + goto error_return; + + if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || + in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || + in_range (block, le32_to_cpu(gdp->bg_inode_table), + EXT3_SB(sb)->s_itb_per_group) || + in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), + EXT3_SB(sb)->s_itb_per_group)) + ext3_error (sb, "ext3_free_blocks", + "Freeing blocks in system zones - " + "Block = %lu, count = %lu", + block, count); + + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext3_journal_get_write_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + + err = ext3_mb_load_buddy(sb, block_group, &e3b); + if (err) + goto error_return; + +#ifdef AGGRESSIVE_CHECK + { + int i; + for (i = 0; i < count; i++) + J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); + } +#endif + mb_clear_bits(bitmap_bh->b_data, bit, count); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext3_journal_dirty_metadata(handle, bitmap_bh); + + if (metadata) { + /* blocks being freed are metadata. 
these blocks shouldn't + * be used until this transaction is committed */ + ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); + } else { + ext3_lock_group(sb, block_group); + mb_free_blocks(&e3b, bit, count); + ext3_unlock_group(sb, block_group); + } + + spin_lock(sb_bgl_lock(sbi, block_group)); + gdp->bg_free_blocks_count = + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_mod(&sbi->s_freeblocks_counter, count); + + ext3_mb_release_desc(&e3b); + + *freed = count; + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext3_journal_dirty_metadata(handle, gd_bh); + if (!err) err = ret; + + if (overflow && !err) { + block += count; + count = overflow; + goto do_more; + } + sb->s_dirt = 1; +error_return: + brelse(bitmap_bh); + ext3_std_error(sb, err); + return; +} + +int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + int free, ret = -ENOSPC; + + BUG_ON(blocks < 0); + spin_lock(&sbi->s_reserve_lock); + free = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + if (blocks <= free - sbi->s_blocks_reserved) { + sbi->s_blocks_reserved += blocks; + ret = 0; + } + spin_unlock(&sbi->s_reserve_lock); + return ret; +} + +void ext3_mb_release_blocks(struct super_block *sb, int blocks) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + + BUG_ON(blocks < 0); + spin_lock(&sbi->s_reserve_lock); + sbi->s_blocks_reserved -= blocks; + WARN_ON(sbi->s_blocks_reserved < 0); + if (sbi->s_blocks_reserved < 0) + sbi->s_blocks_reserved = 0; + spin_unlock(&sbi->s_reserve_lock); +} + +int ext3_new_block(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) +{ + int ret, len; + + if (!test_opt(inode->i_sb, MBALLOC)) { + ret = ext3_new_block_old(handle, inode, goal, errp); + goto out; + } + len = 1; + ret = ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); +out: + return ret; 
+} + + +void ext3_free_blocks(handle_t *handle, struct inode * inode, + unsigned long block, unsigned long count, int metadata) +{ + struct super_block *sb; + int freed; + + sb = inode->i_sb; + if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info) + ext3_free_blocks_sb(handle, sb, block, count, &freed); + else + ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); + if (freed) + DQUOT_FREE_BLOCK(inode, freed); + return; +} + +#define EXT3_ROOT "ext3" +#define EXT3_MB_STATS_NAME "mb_stats" +#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" +#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" +#define EXT3_MB_ORDER2_REQ "mb_order2_req" + +static int ext3_mb_stats_read(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len; + + *eof = 1; + if (off != 0) + return 0; + + len = sprintf(page, "%ld\n", ext3_mb_stats); + *start = page; + return len; +} + +static int ext3_mb_stats_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + char str[32]; + + if (count >= sizeof(str)) { + printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", + EXT3_MB_STATS_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } + + if (copy_from_user(str, buffer, count)) + return -EFAULT; + + /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ + ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); + return count; +} + +static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len; + + *eof = 1; + if (off != 0) + return 0; + + len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); + *start = page; + return len; +} + +static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + char str[32]; + long value; + + if (count >= sizeof(str)) { + printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", + EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } + + if (copy_from_user(str, buffer, 
count)) + return -EFAULT; + + /* any positive value is accepted (not just 0/1) */ + value = simple_strtol(str, NULL, 0); + if (value <= 0) + return -ERANGE; + + ext3_mb_max_to_scan = value; + + return count; +} + +static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len; + + *eof = 1; + if (off != 0) + return 0; + + len = sprintf(page, "%ld\n", ext3_mb_min_to_scan); + *start = page; + return len; +} + +static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + char str[32]; + long value; + + if (count >= sizeof(str)) { + printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", + EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } + + if (copy_from_user(str, buffer, count)) + return -EFAULT; + + /* any positive value is accepted (not just 0/1) */ + value = simple_strtol(str, NULL, 0); + if (value <= 0) + return -ERANGE; + + ext3_mb_min_to_scan = value; + + return count; +} + +static int ext3_mb_order2_req_read(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len; + + *eof = 1; + if (off != 0) + return 0; + + len = sprintf(page, "%ld\n", ext3_mb_order2_reqs); + *start = page; + return len; +} + +static int ext3_mb_order2_req_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + char str[32]; + long value; + + if (count >= sizeof(str)) { + printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", + EXT3_MB_ORDER2_REQ, (int)sizeof(str)); + return -EOVERFLOW; + } + + if (copy_from_user(str, buffer, count)) + return -EFAULT; + + /* any positive value is accepted (not just 0/1) */ + value = simple_strtol(str, NULL, 0); + if (value <= 0) + return -ERANGE; + + ext3_mb_order2_reqs = value; + + return count; +} + +int __init init_ext3_proc(void) +{ + struct proc_dir_entry *proc_ext3_mb_stats; + struct proc_dir_entry 
*proc_ext3_mb_max_to_scan; + struct proc_dir_entry *proc_ext3_mb_min_to_scan; + struct proc_dir_entry *proc_ext3_mb_order2_req; + + proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); + if (proc_root_ext3 == NULL) { + printk(KERN_ERR "EXT3-fs: Unable to create %s\n", EXT3_ROOT); + return -EIO; + } + + /* Initialize EXT3_MB_STATS_NAME */ + proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, + S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); + if (proc_ext3_mb_stats == NULL) { + printk(KERN_ERR "EXT3-fs: Unable to create %s\n", + EXT3_MB_STATS_NAME); + remove_proc_entry(EXT3_ROOT, proc_root_fs); + return -EIO; + } + + proc_ext3_mb_stats->data = NULL; + proc_ext3_mb_stats->read_proc = ext3_mb_stats_read; + proc_ext3_mb_stats->write_proc = ext3_mb_stats_write; + + /* Initialize EXT3_MAX_TO_SCAN_NAME */ + proc_ext3_mb_max_to_scan = create_proc_entry( + EXT3_MB_MAX_TO_SCAN_NAME, + S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); + if (proc_ext3_mb_max_to_scan == NULL) { + printk(KERN_ERR "EXT3-fs: Unable to create %s\n", + EXT3_MB_MAX_TO_SCAN_NAME); + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); + remove_proc_entry(EXT3_ROOT, proc_root_fs); + return -EIO; + } + + proc_ext3_mb_max_to_scan->data = NULL; + proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; + proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; + + /* Initialize EXT3_MIN_TO_SCAN_NAME */ + proc_ext3_mb_min_to_scan = create_proc_entry( + EXT3_MB_MIN_TO_SCAN_NAME, + S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); + if (proc_ext3_mb_min_to_scan == NULL) { + printk(KERN_ERR "EXT3-fs: Unable to create %s\n", + EXT3_MB_MIN_TO_SCAN_NAME); + remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); + remove_proc_entry(EXT3_ROOT, proc_root_fs); + return -EIO; + } + + proc_ext3_mb_min_to_scan->data = NULL; + proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; + proc_ext3_mb_min_to_scan->write_proc = 
ext3_mb_min_to_scan_write; + + /* Initialize EXT3_ORDER2_REQ */ + proc_ext3_mb_order2_req = create_proc_entry( + EXT3_MB_ORDER2_REQ, + S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); + if (proc_ext3_mb_order2_req == NULL) { + printk(KERN_ERR "EXT3-fs: Unable to create %s\n", + EXT3_MB_ORDER2_REQ); + remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); + remove_proc_entry(EXT3_ROOT, proc_root_fs); + return -EIO; + } + + proc_ext3_mb_order2_req->data = NULL; + proc_ext3_mb_order2_req->read_proc = ext3_mb_order2_req_read; + proc_ext3_mb_order2_req->write_proc = ext3_mb_order2_req_write; + + return 0; +} + +void exit_ext3_proc(void) +{ + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3); + remove_proc_entry(EXT3_ROOT, proc_root_fs); +} Index: linux-2.6.16.i686/fs/ext3/Makefile =================================================================== --- linux-2.6.16.i686.orig/fs/ext3/Makefile 2006-05-30 22:55:32.000000000 +0800 +++ linux-2.6.16.i686/fs/ext3/Makefile 2006-05-30 23:02:59.000000000 +0800 @@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o \ - extents.o + extents.o mballoc.o ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o