Index: linux-2.6.10/fs/ext3/mballoc.c =================================================================== --- linux-2.6.10.orig/fs/ext3/mballoc.c 2005-02-25 17:28:41.836311072 +0200 +++ linux-2.6.10/fs/ext3/mballoc.c 2005-02-25 17:28:41.859307576 +0200 @@ -0,0 +1,1861 @@ +/* + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + */ + + +/* + * mballoc.c contains the multiblocks allocation routines + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * TODO: + * - bitmap/buddy read-ahead (proposed by Oleg Drokin aka green) + * - track min/max extents in each group for better group selection + * - is it worthwhile to use buddies directly if req is 2^N blocks? + * - mb_mark_used() may allocate chunk right after splitting buddy + * - special flag to advice allocator to look for requested + N blocks + * this may improve interaction between extents and mballoc + * - tree of groups sorted by number of free blocks + * - percpu reservation code (hotpath) + * - error handling + */ + +/* + * with AGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + * with MBALLOC_STATS allocator will collect stats that will be + * shown at umount. 
Collecting them is not free, though!
we have to track to limit searching */ + unsigned long ac_ex_scanned; + __u16 ac_groups_scanned; + __u16 ac_found; + __u8 ac_status; + __u8 ac_flags; /* allocation hints */ + __u8 ac_repeats; +}; + +#define AC_STATUS_CONTINUE 1 +#define AC_STATUS_FOUND 2 +#define AC_STATUS_BREAK 3 + +struct ext3_buddy { + struct buffer_head *bd_bh; + struct buffer_head *bd_bh2; + struct ext3_buddy_group_blocks *bd_bd; + struct super_block *bd_sb; + __u16 bd_blkbits; + __u16 bd_group; +}; +#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bh->b_data) +#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_bh2->b_data) + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); +struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); +int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); +int ext3_mb_reserve_blocks(struct super_block *, int); +void ext3_mb_release_blocks(struct super_block *, int); +void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); +void ext3_mb_free_committed_blocks(struct super_block *); + +#if BITS_PER_LONG == 64 +#define mb_correct_addr_and_bit(bit,addr) \ +{ \ + bit += ((unsigned long) addr & 7UL) << 3; \ + addr = (void *) ((unsigned long) addr & ~7UL); \ +} +#elif BITS_PER_LONG == 32 +#define mb_correct_addr_and_bit(bit,addr) \ +{ \ + bit += ((unsigned long) addr & 3UL) << 3; \ + addr = (void *) ((unsigned long) addr & ~3UL); \ +} +#else +#error "how many bits you are?!" 
+#endif + +static inline int mb_test_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); + return ext2_test_bit(bit, addr); +} + +static inline void mb_set_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); + ext2_set_bit(bit, addr); +} + +static inline void mb_set_bit_atomic(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); + ext2_set_bit_atomic(NULL, bit, addr); +} + +static inline void mb_clear_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); + ext2_clear_bit(bit, addr); +} + +static inline void mb_clear_bit_atomic(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); + ext2_clear_bit_atomic(NULL, bit, addr); +} + +static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) +{ + int i = 1; + char *bb; + + J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); + J_ASSERT(max != NULL); + + if (order > e3b->bd_blkbits + 1) { + *max = 0; + return NULL; + } + + /* at order 0 we see each particular block */ + *max = 1 << (e3b->bd_blkbits + 3); + if (order == 0) + return EXT3_MB_BITMAP(e3b); + + bb = EXT3_MB_BUDDY(e3b); + *max = *max >> 1; + while (i < order) { + bb += 1 << (e3b->bd_blkbits - i); + i++; + *max = *max >> 1; + } + J_ASSERT((unsigned) (bb - (char *) EXT3_MB_BUDDY(e3b)) < + e3b->bd_sb->s_blocksize); + return bb; +} + +static int ext3_mb_load_buddy(struct super_block *sb, int group, + struct ext3_buddy *e3b) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + + J_ASSERT(sbi->s_buddy_blocks[group]->bb_bitmap); + J_ASSERT(sbi->s_buddy_blocks[group]->bb_buddy); + + /* load bitmap */ + e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap); + if (e3b->bd_bh == NULL) { + ext3_error(sb, "ext3_mb_load_buddy", + "can't get block for buddy bitmap\n"); + goto out; + } + /* load buddy */ + e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy); + if (e3b->bd_bh2 == NULL) { + ext3_error(sb, "ext3_mb_load_buddy", + "can't get block for buddy bitmap\n"); + goto out; + } + + if 
(!buffer_uptodate(e3b->bd_bh)) + ll_rw_block(READ, 1, &e3b->bd_bh); + if (!buffer_uptodate(e3b->bd_bh2)) + ll_rw_block(READ, 1, &e3b->bd_bh2); + + wait_on_buffer(e3b->bd_bh); + J_ASSERT(buffer_uptodate(e3b->bd_bh)); + wait_on_buffer(e3b->bd_bh2); + J_ASSERT(buffer_uptodate(e3b->bd_bh2)); + + e3b->bd_blkbits = sb->s_blocksize_bits; + e3b->bd_bd = sbi->s_buddy_blocks[group]; + e3b->bd_sb = sb; + e3b->bd_group = group; + + return 0; +out: + brelse(e3b->bd_bh); + brelse(e3b->bd_bh2); + e3b->bd_bh = NULL; + e3b->bd_bh2 = NULL; + return -EIO; +} + +static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b) +{ + mark_buffer_dirty(e3b->bd_bh); + mark_buffer_dirty(e3b->bd_bh2); +} + +static void ext3_mb_release_desc(struct ext3_buddy *e3b) +{ + brelse(e3b->bd_bh); + brelse(e3b->bd_bh2); +} + +#ifdef AGGRESSIVE_CHECK +static void mb_check_buddy(struct ext3_buddy *e3b) +{ + int order = e3b->bd_blkbits + 1; + int max, max2, i, j, k, count; + void *buddy, *buddy2; + + if (!test_opt(e3b->bd_sb, MBALLOC)) + return; + + while (order > 1) { + buddy = mb_find_buddy(e3b, order, &max); + J_ASSERT(buddy); + buddy2 = mb_find_buddy(e3b, order - 1, &max2); + J_ASSERT(buddy2); + J_ASSERT(buddy != buddy2); + J_ASSERT(max * 2 == max2); + + count = 0; + for (i = 0; i < max; i++) { + + if (mb_test_bit(i, buddy)) { + /* only single bit in buddy2 may be 1 */ + if (!mb_test_bit(i << 1, buddy2)) + J_ASSERT(mb_test_bit((i<<1)+1, buddy2)); + else if (!mb_test_bit((i << 1) + 1, buddy2)) + J_ASSERT(mb_test_bit(i << 1, buddy2)); + continue; + } + + /* both bits in buddy2 must be 0 */ + J_ASSERT(mb_test_bit(i << 1, buddy2)); + J_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); + + for (j = 0; j < (1 << order); j++) { + k = (i * (1 << order)) + j; + J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b))); + } + count++; + } + J_ASSERT(e3b->bd_bd->bb_counters[order] == count); + order--; + } + + buddy = mb_find_buddy(e3b, 0, &max); + for (i = 0; i < max; i++) { + if (!mb_test_bit(i, buddy)) + continue; + /* check used bits 
only */ + for (j = 0; j < e3b->bd_blkbits + 1; j++) { + buddy2 = mb_find_buddy(e3b, j, &max2); + k = i >> j; + J_ASSERT(k < max2); + J_ASSERT(mb_test_bit(k, buddy2)); + } + } +} +#else +#define mb_check_buddy(e3b) +#endif + +static inline void +ext3_lock_group(struct super_block *sb, int group) +{ + spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); +} + +static inline void +ext3_unlock_group(struct super_block *sb, int group) +{ + spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); +} + +static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) +{ + int order = 1; + void *bb; + + J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); + J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); + + bb = EXT3_MB_BUDDY(e3b); + while (order <= e3b->bd_blkbits + 1) { + block = block >> 1; + if (!mb_test_bit(block, bb)) { + /* this block is part of buddy of order 'order' */ + return order; + } + bb += 1 << (e3b->bd_blkbits - order); + order++; + } + return 0; +} + +static inline void mb_clear_bits(void *bm, int cur, int len) +{ + __u32 *addr; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: clear whole word at once */ + addr = bm + (cur >> 3); + *addr = 0; + cur += 32; + continue; + } + mb_clear_bit_atomic(cur, bm); + cur++; + } +} + +static inline void mb_set_bits(void *bm, int cur, int len) +{ + __u32 *addr; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: clear whole word at once */ + addr = bm + (cur >> 3); + *addr = 0xffffffff; + cur += 32; + continue; + } + mb_set_bit_atomic(cur, bm); + cur++; + } +} + +static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) +{ + int block, max, order; + void *buddy, *buddy2; + + mb_check_buddy(e3b); + + e3b->bd_bd->bb_free += count; + if (first < e3b->bd_bd->bb_first_free) + e3b->bd_bd->bb_first_free = first; + + while (count-- > 0) { + block = first++; + order = 0; + + 
J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); + mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); + e3b->bd_bd->bb_counters[order]++; + + /* start of the buddy */ + buddy = mb_find_buddy(e3b, order, &max); + + do { + block &= ~1UL; + if (mb_test_bit(block, buddy) || + mb_test_bit(block + 1, buddy)) + break; + + /* both the buddies are free, try to coalesce them */ + buddy2 = mb_find_buddy(e3b, order + 1, &max); + + if (!buddy2) + break; + + if (order > 0) { + /* for special purposes, we don't set + * free bits in bitmap */ + mb_set_bit(block, buddy); + mb_set_bit(block + 1, buddy); + } + e3b->bd_bd->bb_counters[order]--; + e3b->bd_bd->bb_counters[order]--; + + block = block >> 1; + order++; + e3b->bd_bd->bb_counters[order]++; + + mb_clear_bit(block, buddy2); + buddy = buddy2; + } while (1); + } + mb_check_buddy(e3b); + + return 0; +} + +static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, + int needed, struct ext3_free_extent *ex) +{ + int next, max, ord; + void *buddy; + + J_ASSERT(ex != NULL); + + buddy = mb_find_buddy(e3b, order, &max); + J_ASSERT(buddy); + J_ASSERT(block < max); + if (mb_test_bit(block, buddy)) { + ex->fe_len = 0; + ex->fe_start = 0; + ex->fe_group = 0; + return 0; + } + + if (order == 0) { + /* find actual order */ + order = mb_find_order_for_block(e3b, block); + block = block >> order; + } + + ex->fe_len = 1 << order; + ex->fe_start = block << order; + ex->fe_group = e3b->bd_group; + + while ((buddy = mb_find_buddy(e3b, order, &max))) { + + if (block + 1 >= max) + break; + + next = (block + 1) * (1 << order); + if (mb_test_bit(next, EXT3_MB_BITMAP(e3b))) + break; + + ord = mb_find_order_for_block(e3b, next); + + order = ord; + block = next >> order; + ex->fe_len += 1 << order; + } + + J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); + return ex->fe_len; +} + +static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) +{ + int start = ex->fe_start; + int len = ex->fe_len; + int ord, mlen, 
max, cur; + int len0 = len; + void *buddy; + + e3b->bd_bd->bb_free -= len; + if (e3b->bd_bd->bb_first_free == start) + e3b->bd_bd->bb_first_free += len; + + while (len) { + ord = mb_find_order_for_block(e3b, start); + + if (((start >> ord) << ord) == start && len >= (1 << ord)) { + /* the whole chunk may be allocated at once! */ + mlen = 1 << ord; + buddy = mb_find_buddy(e3b, ord, &max); + J_ASSERT((start >> ord) < max); + mb_set_bit(start >> ord, buddy); + e3b->bd_bd->bb_counters[ord]--; + start += mlen; + len -= mlen; + J_ASSERT(len >= 0); + continue; + } + + /* we have to split large buddy */ + J_ASSERT(ord > 0); + buddy = mb_find_buddy(e3b, ord, &max); + mb_set_bit(start >> ord, buddy); + e3b->bd_bd->bb_counters[ord]--; + + ord--; + cur = (start >> ord) & ~1U; + buddy = mb_find_buddy(e3b, ord, &max); + mb_clear_bit(cur, buddy); + mb_clear_bit(cur + 1, buddy); + e3b->bd_bd->bb_counters[ord]++; + e3b->bd_bd->bb_counters[ord]++; + } + + /* now drop all the bits in bitmap */ + mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); + + mb_check_buddy(e3b); + + return 0; +} + +/* + * Must be called under group lock! + */ +static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, + struct ext3_buddy *e3b) +{ + ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); + mb_mark_used(e3b, &ac->ac_b_ex); + ac->ac_status = AC_STATUS_FOUND; +} + +/* + * The routine checks whether found extent is good enough. If it is, + * then the extent gets marked used and flag is set to the context + * to stop scanning. Otherwise, the extent is compared with the + * previous found extent and if new one is better, then it's stored + * in the context. Later, the best found extent will be used, if + * mballoc can't find good enough extent. + * + * FIXME: real allocation policy is to be designed yet! 
+ */ +static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, + struct ext3_free_extent *ex, + struct ext3_buddy *e3b) +{ + int factor = EXT3_SB(ac->ac_sb)->s_mb_factor; + struct ext3_free_extent *bex = &ac->ac_b_ex; + int diff = ac->ac_g_ex.fe_len - ex->fe_len; + + J_ASSERT(ex->fe_len > 0); + J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); + J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); + + ac->ac_found++; + + /* + * The special case - take what you catch first + */ + if (ac->ac_flags & EXT3_MB_HINT_FIRST) { + *bex = *ex; + ext3_mb_use_best_found(ac, e3b); + return; + } + + /* + * Let's check whether the chuck is good enough + */ + if (ex->fe_len >= ac->ac_g_ex.fe_len) { + *bex = *ex; + ext3_mb_use_best_found(ac, e3b); + return; + } + + /* + * If the request is vey large, then it makes sense to use large + * chunks for it. Even if they don't satisfy whole request. + */ + if (ex->fe_len > 1000) { + *bex = *ex; + ext3_mb_use_best_found(ac, e3b); + return; + } + + /* + * Sometimes it's worty to take close chunk + */ + if (factor && (ac->ac_g_ex.fe_len * 100) / (diff * 100) >= factor) { + *bex = *ex; + ext3_mb_use_best_found(ac, e3b); + return; + } + + /* + * If this is first found extent, just store it in the context + */ + if (bex->fe_len == 0) { + *bex = *ex; + return; + } + + /* + * If new found extent is better, store it in the context + * FIXME: possible the policy should be more complex? 
+ */ + if (ex->fe_len > bex->fe_len) { + *bex = *ex; + } + + /* + * We don't want to scan for a whole year + */ + if (ac->ac_found > EXT3_MB_MAX_TO_SCAN) + ac->ac_status = AC_STATUS_BREAK; +} + +static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, + struct ext3_buddy *e3b) +{ + struct ext3_free_extent ex = ac->ac_b_ex; + int group = ex.fe_group, max, err; + + J_ASSERT(ex.fe_len > 0); + err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); + if (err) + return err; + + ext3_lock_group(ac->ac_sb, group); + max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); + + if (max > 0) + ext3_mb_use_best_found(ac, e3b); + + ext3_unlock_group(ac->ac_sb, group); + + if (ac->ac_status == AC_STATUS_FOUND) + ext3_mb_dirty_buddy(e3b); + ext3_mb_release_desc(e3b); + + return 0; +} + +static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, + struct ext3_buddy *e3b) +{ + int group = ac->ac_g_ex.fe_group, max, err; + struct ext3_free_extent ex; + + err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); + if (err) + return err; + + ext3_lock_group(ac->ac_sb, group); + max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, + ac->ac_g_ex.fe_len, &ex); + + if (max > 0) { + J_ASSERT(ex.fe_len > 0); + J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); + J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); + ac->ac_b_ex = ex; + ext3_mb_use_best_found(ac, e3b); + } + ext3_unlock_group(ac->ac_sb, group); + + if (ac->ac_status == AC_STATUS_FOUND) + ext3_mb_dirty_buddy(e3b); + ext3_mb_release_desc(e3b); + + return 0; +} +/* + * The routine scans the group and measures all found extents. + * In order to optimize scanning, caller must pass number of + * free blocks in the group, so the routine can upper limit. 
+ */ +static void ext3_mb_scan_group(struct ext3_allocation_context *ac, + struct ext3_buddy *e3b) +{ + struct super_block *sb = ac->ac_sb; + void *bitmap = EXT3_MB_BITMAP(e3b); + struct ext3_free_extent ex; + int i, free; + + free = e3b->bd_bd->bb_free; + J_ASSERT(free > 0); + + i = e3b->bd_bd->bb_first_free; + + while (free && ac->ac_status != AC_STATUS_FOUND) { + i = ext2_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); + if (i >= sb->s_blocksize * 8) { + J_ASSERT(free == 0); + break; + } + + mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); + J_ASSERT(ex.fe_len > 0); + J_ASSERT(free >= ex.fe_len); + + ext3_mb_measure_extent(ac, &ex, e3b); + + i += ex.fe_len; + free -= ex.fe_len; + } +} + +static int ext3_mb_good_group(struct ext3_allocation_context *ac, + int group, int cr) +{ + int free; + + J_ASSERT(cr >= 0 && cr < 3); + + free = EXT3_SB(ac->ac_sb)->s_buddy_blocks[group]->bb_free; + if (free == 0) + return 0; + + if (cr == 0) { + if (free >= ac->ac_g_ex.fe_len >> 1) + return 1; + } else if (cr == 1) { + if (free >= ac->ac_g_ex.fe_len >> 2) + return 1; + } else if (cr == 2) { + return 1; + } + return 0; +} + +int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, + unsigned long goal, int *len, int flags, int *errp) +{ + struct buffer_head *bitmap_bh = NULL; + struct ext3_allocation_context ac; + int i, group, block, cr, err = 0; + struct ext3_group_desc *gdp; + struct ext3_super_block *es; + struct buffer_head *gdp_bh; + struct ext3_sb_info *sbi; + struct super_block *sb; + struct ext3_buddy e3b; + + J_ASSERT(len != NULL); + J_ASSERT(*len > 0); + + sb = inode->i_sb; + if (!sb) { + printk("ext3_mb_new_nblocks: nonexistent device"); + return 0; + } + + if (!test_opt(sb, MBALLOC)) { + static int ext3_mballoc_warning = 0; + if (ext3_mballoc_warning == 0) { + printk(KERN_ERR "EXT3-fs: multiblock request with " + "mballoc disabled!\n"); + ext3_mballoc_warning++; + } + *len = 1; + err = ext3_new_block_old(handle, inode, goal, errp); + return err; + } + + 
ext3_mb_poll_new_transaction(sb, handle); + + sbi = EXT3_SB(sb); + es = EXT3_SB(sb)->s_es; + + /* + * We can't allocate > group size + */ + if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10) + *len = EXT3_BLOCKS_PER_GROUP(sb) - 10; + + if (!(flags & EXT3_MB_HINT_RESERVED)) { + /* someone asks for non-reserved blocks */ + BUG_ON(*len > 1); + err = ext3_mb_reserve_blocks(sb, 1); + if (err) { + *errp = err; + return 0; + } + } + + /* + * Check quota for allocation of this blocks. + */ + while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) + *len -= 1; + if (*len == 0) { + *errp = -EDQUOT; + block = 0; + goto out; + } + + /* start searching from the goal */ + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= le32_to_cpu(es->s_blocks_count)) + goal = le32_to_cpu(es->s_first_data_block); + group = (goal - le32_to_cpu(es->s_first_data_block)) / + EXT3_BLOCKS_PER_GROUP(sb); + block = ((goal - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb)); + + /* set up allocation goals */ + ac.ac_b_ex.fe_group = 0; + ac.ac_b_ex.fe_start = 0; + ac.ac_b_ex.fe_len = 0; + ac.ac_status = AC_STATUS_CONTINUE; + ac.ac_groups_scanned = 0; + ac.ac_ex_scanned = 0; + ac.ac_found = 0; + ac.ac_sb = inode->i_sb; + ac.ac_g_ex.fe_group = group; + ac.ac_g_ex.fe_start = block; + ac.ac_g_ex.fe_len = *len; + ac.ac_flags = flags; + + /* + * Sometimes, caller may want to merge even small number + * of blocks to an existing extent + */ + if (ac.ac_flags & EXT3_MB_HINT_MERGE) { + err = ext3_mb_find_by_goal(&ac, &e3b); + if (err) + goto out_err; + if (ac.ac_status == AC_STATUS_FOUND) + goto found; + } + + /* + * FIXME + * If requested chunk is power of 2 length, we can try + * to exploit buddy nature to speed allocation up + */ + + + /* + * Let's just scan groups to find more-less suitable blocks + */ + cr = 0; +repeat: + for (; cr < 3 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { + for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { + if (group == EXT3_SB(sb)->s_groups_count) + group = 
/* check whether the group is good for our criteria */
+ * The only thing we can do is just take first + * found block(s) + */ + /*printk(KERN_ERR "EXT3-fs: and someone won our chunk\n");*/ + ac.ac_b_ex.fe_group = 0; + ac.ac_b_ex.fe_start = 0; + ac.ac_b_ex.fe_len = 0; + ac.ac_status = AC_STATUS_CONTINUE; + ac.ac_flags |= EXT3_MB_HINT_FIRST; + cr = 2; + goto repeat; + } + } + + if (ac.ac_status != AC_STATUS_FOUND) { + /* + * We aren't lucky definitely + */ + DQUOT_FREE_BLOCK(inode, *len); + *errp = -ENOSPC; + block = 0; +#if 1 + printk(KERN_ERR "EXT3-fs: cant allocate: status %d, flags %d\n", + ac.ac_status, ac.ac_flags); + printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n", + ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, + ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); + printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", + sbi->s_blocks_reserved, ac.ac_found); + printk("EXT3-fs: groups: "); + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) + printk("%d: %d ", i, + sbi->s_buddy_blocks[i]->bb_free); + printk("\n"); +#endif + goto out; + } + +found: + J_ASSERT(ac.ac_b_ex.fe_len > 0); + + /* good news - free block(s) have been found. 
now it's time + * to mark block(s) in good old journaled bitmap */ + block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) + + ac.ac_b_ex.fe_start + + le32_to_cpu(es->s_first_data_block); + + /* we made a desicion, now mark found blocks in good old + * bitmap to be journaled */ + + ext3_debug("using block group %d(%d)\n", + ac.ac_b_group.group, gdp->bg_free_blocks_count); + + bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group); + if (!bitmap_bh) { + *errp = -EIO; + goto out_err; + } + + err = ext3_journal_get_write_access(handle, bitmap_bh); + if (err) { + *errp = err; + goto out_err; + } + + gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh); + if (!gdp) { + *errp = -EIO; + goto out_err; + } + + err = ext3_journal_get_write_access(handle, gdp_bh); + if (err) + goto out_err; + + block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) + + ac.ac_b_ex.fe_start + + le32_to_cpu(es->s_first_data_block); + + if (block == le32_to_cpu(gdp->bg_block_bitmap) || + block == le32_to_cpu(gdp->bg_inode_bitmap) || + in_range(block, le32_to_cpu(gdp->bg_inode_table), + EXT3_SB(sb)->s_itb_per_group)) + ext3_error(sb, "ext3_new_block", + "Allocating block in system zone - " + "block = %u", block); +#ifdef AGGRESSIVE_CHECK + for (i = 0; i < ac.ac_b_ex.fe_len; i++) + J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data)); +#endif + mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); + + spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); + gdp->bg_free_blocks_count = + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + - ac.ac_b_ex.fe_len); + spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); + percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len); + + err = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (err) + goto out_err; + err = ext3_journal_dirty_metadata(handle, gdp_bh); + if (err) + goto out_err; + + sb->s_dirt = 1; + *errp = 0; + brelse(bitmap_bh); + + /* drop non-allocated, but dquote'd blocks */ + J_ASSERT(*len 
>= ac.ac_b_ex.fe_len); + DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len); + + *len = ac.ac_b_ex.fe_len; + J_ASSERT(*len > 0); + J_ASSERT(block != 0); + goto out; + +out_err: + /* if we've already allocated something, roll it back */ + if (ac.ac_status == AC_STATUS_FOUND) { + /* FIXME: free blocks here */ + } + + DQUOT_FREE_BLOCK(inode, *len); + brelse(bitmap_bh); + *errp = err; + block = 0; +out: + if (!(flags & EXT3_MB_HINT_RESERVED)) { + /* block wasn't reserved before and we reserved it + * at the beginning of allocation. it doesn't matter + * whether we allocated anything or we failed: time + * to release reservation. NOTE: because I expect + * any multiblock request from delayed allocation + * path only, here is single block always */ + ext3_mb_release_blocks(sb, 1); + } +#ifdef MBALLOC_STATS + if (ac.ac_g_ex.fe_len > 1) { + spin_lock(&sbi->s_bal_lock); + sbi->s_bal_reqs++; + sbi->s_bal_allocated += *len; + if (*len >= ac.ac_g_ex.fe_len) + sbi->s_bal_success++; + sbi->s_bal_ex_scanned += ac.ac_found; + if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && + ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) + sbi->s_bal_goals++; + if (ac.ac_found > EXT3_MB_MAX_TO_SCAN) + sbi->s_bal_breaks++; + spin_unlock(&sbi->s_bal_lock); + } +#endif + return block; +} + +int ext3_mb_get_descr_loc(struct ext3_buddy *e3b, struct buffer_head **bh, + struct ext3_mb_group_descr **grp) +{ + struct super_block *sb = e3b->bd_sb; + struct ext3_sb_info *sbi = EXT3_SB(sb); + int descr_per_block, err, offset; + struct ext3_mb_grp_header *hdr; + unsigned long block; + + descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) + / sizeof(struct ext3_mb_group_descr); + block = e3b->bd_group / descr_per_block; + *bh = ext3_bread(NULL, sbi->s_buddy, block, 0, &err); + if (*bh == NULL) { + printk(KERN_ERR "EXT3-fs: cant getblk descr for group %d: %d\n", + e3b->bd_group, err); + return err; + } + + hdr = (struct ext3_mb_grp_header *) (*bh)->b_data; + if (hdr->mh_magic != 
EXT3_MB_MAGIC_V1) { + printk(KERN_ERR "EXT3-fs: invalid magic in group %d!\n", + e3b->bd_group); + brelse(*bh); + *bh = NULL; + return -EIO; + } + + offset = e3b->bd_group % descr_per_block + * sizeof(struct ext3_mb_group_descr) + + sizeof(struct ext3_mb_grp_header); + *grp = (struct ext3_mb_group_descr *) ((*bh)->b_data + offset); + + return 0; +} + +int ext3_mb_load_descr(struct ext3_buddy *e3b) +{ + struct ext3_mb_group_descr *grp; + struct ext3_group_desc *gdp; + struct buffer_head *bh; + int err, i; + + err = ext3_mb_get_descr_loc(e3b, &bh, &grp); + if (err) + return err; + + e3b->bd_bd->bb_first_free = grp->mgd_first_free; + e3b->bd_bd->bb_free = grp->mgd_free; + for (i = 0; i <= e3b->bd_blkbits + 1; i++) { + J_ASSERT(i < 16); + e3b->bd_bd->bb_counters[i] = grp->mgd_counters[i]; + } + brelse(bh); + + /* additional checks against old group descriptor */ + gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); + if (!gdp) + return -EIO; + if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { + printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", + e3b->bd_group, e3b->bd_bd->bb_free, + le16_to_cpu(gdp->bg_free_blocks_count)); + return -ENODATA; + } + + return 0; +} + + +int ext3_mb_update_descr(struct ext3_buddy *e3b) +{ + struct ext3_mb_group_descr *grp; + struct ext3_group_desc *gdp; + struct buffer_head *bh; + handle_t *handle; + int err, i; + + /* additional checks against old group descriptor */ + gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); + if (!gdp) + return -EIO; + if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { + printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", + e3b->bd_group, e3b->bd_bd->bb_free, + le16_to_cpu(gdp->bg_free_blocks_count)); + return -ENODATA; + } + + err = ext3_mb_get_descr_loc(e3b, &bh, &grp); + if (err) + return err; + + handle = ext3_journal_start_sb(e3b->bd_sb, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + handle = NULL; + goto out; + } + + err 
= ext3_journal_get_write_access(handle, bh); + if (err) + goto out; + grp->mgd_first_free = e3b->bd_bd->bb_first_free; + grp->mgd_free = e3b->bd_bd->bb_free; + for (i = 0; i <= e3b->bd_blkbits + 1; i++) { + J_ASSERT(i < 16); + grp->mgd_counters[i] = e3b->bd_bd->bb_counters[i]; + } + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + goto out; + err = 0; +out: + brelse(bh); + if (handle) + ext3_journal_stop(handle); + return err; +} + +int ext3_mb_generate_buddy(struct ext3_buddy *e3b) +{ + struct super_block *sb = e3b->bd_sb; + struct buffer_head *bh; + int i, count = 0; + + mb_debug("generate buddy for group %d\n", e3b->bd_group); + memset(e3b->bd_bh->b_data, 0xff, sb->s_blocksize); + memset(e3b->bd_bh2->b_data, 0xff, sb->s_blocksize); + + bh = read_block_bitmap(sb, e3b->bd_group); + if (bh == NULL) + return -EIO; + + /* mb_free_blocks will set real free */ + e3b->bd_bd->bb_free = 0; + e3b->bd_bd->bb_first_free = 1 << 15; + /* + * if change bb_counters size, don't forget about + * ext3_mb_init_backend() -bzzz + */ + memset(e3b->bd_bd->bb_counters, 0, + sizeof(unsigned) * (sb->s_blocksize_bits + 2)); + + /* loop over the blocks, and create buddies for free ones */ + for (i = 0; i < sb->s_blocksize * 8; i++) { + if (!mb_test_bit(i, (void *) bh->b_data)) { + mb_free_blocks(e3b, i, 1); + count++; + } + } + brelse(bh); + mb_check_buddy(e3b); + ext3_mb_dirty_buddy(e3b); + + return 0; +} + +EXPORT_SYMBOL(ext3_mb_new_blocks); + +#define MB_CREDITS \ + (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \ + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS) + +int ext3_mb_init_backend(struct super_block *sb, int *created) +{ + int err, i, len, descr_per_block, buddy_offset, size; + struct inode *root = sb->s_root->d_inode; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_mb_grp_header *hdr; + struct buffer_head *bh = NULL; + unsigned long block; + struct dentry *db; + handle_t *handle; + tid_t target; + + *created = 0; + len = sizeof(struct ext3_buddy_group_blocks 
*) * sbi->s_groups_count; + sbi->s_buddy_blocks = kmalloc(len, GFP_KERNEL); + if (sbi->s_buddy_blocks == NULL) { + printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); + return -ENOMEM; + } + memset(sbi->s_buddy_blocks, 0, len); + sbi->s_buddy = NULL; + + down(&root->i_sem); + len = strlen(EXT3_BUDDY_FILE); + db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, len); + if (IS_ERR(db)) { + err = PTR_ERR(db); + printk(KERN_ERR "EXT3-fs: cant lookup buddy: %d\n", err); + up(&root->i_sem); + goto out; + } + + if (db->d_inode == NULL) { + err = ext3_create(root, db, S_IFREG, NULL); + if (err) { + printk(KERN_ERR "EXT3-fs: cant create buddy: %d\n", err); + up(&root->i_sem); + goto out; + } + db->d_inode->i_flags |= S_IMMUTABLE | S_NOATIME; + *created = 1; + mb_debug("no buddy file, regenerate\n"); + } + up(&root->i_sem); + sbi->s_buddy = igrab(db->d_inode); + + /* calculate needed size */ + descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) + / sizeof(struct ext3_mb_group_descr); + buddy_offset = (sbi->s_groups_count + descr_per_block - 1) + / descr_per_block; + len = sbi->s_groups_count * sb->s_blocksize * 2 + + buddy_offset * sb->s_blocksize; + if (len != i_size_read(sbi->s_buddy)) { + if (*created == 0) + printk("EXT3-fs: wrong i_size (%u != %u), regenerate\n", + (unsigned) len, + (unsigned) i_size_read(sbi->s_buddy)); + *created = 1; + } + + /* read/create mb group descriptors */ + for (i = 0; i < buddy_offset; i++) { + handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); + if (IS_ERR(handle)) { + printk(KERN_ERR "EXT3-fs: cant start transaction\n"); + err = PTR_ERR(handle); + goto err_out; + } + + bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err); + if (bh == NULL) { + printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err); + goto err_out; + } + hdr = (struct ext3_mb_grp_header *) bh->b_data; + if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto err_out; + if (*created == 0) + 
printk(KERN_ERR + "EXT3-fs: invalid header 0x%x in %d," + "regenerate\n", hdr->mh_magic, i); + *created = 1; + hdr->mh_magic = EXT3_MB_MAGIC_V1; + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + goto err_out; + } + brelse(bh); + ext3_journal_stop(handle); + } + + /* + * if change bb_counters size, don't forget about ext3_mb_generate_buddy() + */ + len = sizeof(struct ext3_buddy_group_blocks); + len += sizeof(unsigned) * (sb->s_blocksize_bits + 2); + for (i = 0; i < sbi->s_groups_count; i++) { + + sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL); + if (sbi->s_buddy_blocks[i] == NULL) { + printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); + err = -ENOMEM; + goto out2; + } + memset(sbi->s_buddy_blocks[i], 0, len); + + handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); + if (IS_ERR(handle)) { + printk(KERN_ERR "EXT3-fs: cant start transaction\n"); + err = PTR_ERR(handle); + goto out2; + } + + /* allocate block for bitmap */ + block = buddy_offset + i * 2; + bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); + if (bh == NULL) { + printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", err); + goto out2; + } + sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr; + brelse(bh); + + /* allocate block for buddy */ + block = buddy_offset + i * 2 + 1; + bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); + if (bh == NULL) { + printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err); + goto out2; + } + sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr; + brelse(bh); + + size = (block + 1) << sbi->s_buddy->i_blkbits; + if (size > sbi->s_buddy->i_size) { + *created = 1; + EXT3_I(sbi->s_buddy)->i_disksize = size; + i_size_write(sbi->s_buddy, size); + mark_inode_dirty(sbi->s_buddy); + } + ext3_journal_stop(handle); + + spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock); + sbi->s_buddy_blocks[i]->bb_md_cur = NULL; + sbi->s_buddy_blocks[i]->bb_tid = 0; + } + + if (journal_start_commit(sbi->s_journal, &target)) + log_wait_commit(sbi->s_journal, target); + 
+out2: + dput(db); +out: + return err; + +err_out: + return err; +} + +int ext3_mb_write_descriptors(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_buddy e3b; + int ret = 0, i, err; + + for (i = 0; i < sbi->s_groups_count; i++) { + if (sbi->s_buddy_blocks[i] == NULL) + continue; + + err = ext3_mb_load_buddy(sb, i, &e3b); + if (err == 0) { + ext3_mb_update_descr(&e3b); + ext3_mb_release_desc(&e3b); + } else + ret = err; + } + return ret; +} + +int ext3_mb_release(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + int i; + + if (!test_opt(sb, MBALLOC)) + return 0; + + /* release freed, non-committed blocks */ + spin_lock(&sbi->s_md_lock); + list_splice_init(&sbi->s_closed_transaction, + &sbi->s_committed_transaction); + list_splice_init(&sbi->s_active_transaction, + &sbi->s_committed_transaction); + spin_unlock(&sbi->s_md_lock); + ext3_mb_free_committed_blocks(sb); + + if (sbi->s_buddy_blocks) { + ext3_mb_write_descriptors(sb); + for (i = 0; i < sbi->s_groups_count; i++) { + if (sbi->s_buddy_blocks[i] == NULL) + continue; + kfree(sbi->s_buddy_blocks[i]); + } + kfree(sbi->s_buddy_blocks); + } + if (sbi->s_buddy) + iput(sbi->s_buddy); + if (sbi->s_blocks_reserved) + printk("ext3-fs: %ld blocks being reserved at umount!\n", + sbi->s_blocks_reserved); +#ifdef MBALLOC_STATS + printk("EXT3-fs: mballoc: %lu blocks %lu reqs (%lu success)\n", + sbi->s_bal_allocated, sbi->s_bal_reqs, sbi->s_bal_success); + printk("EXT3-fs: mballoc: %lu extents scanned, %lu goal hits, %lu breaks\n", + sbi->s_bal_ex_scanned, sbi->s_bal_goals, sbi->s_bal_breaks); +#endif + return 0; +} + +int ext3_mb_init(struct super_block *sb, int needs_recovery) +{ + struct ext3_buddy e3b; + int i, err, created; + + if (!test_opt(sb, MBALLOC)) + return 0; + + /* init file for buddy data */ + clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); + if ((err = ext3_mb_init_backend(sb, &created))) + return err; + +repeat: + for (i = 0; i < 
EXT3_SB(sb)->s_groups_count; i++) { + err = ext3_mb_load_buddy(sb, i, &e3b); + if (err) { + /* FIXME: release backend */ + return err; + } + if (created || needs_recovery) + ext3_mb_generate_buddy(&e3b); + else + err = ext3_mb_load_descr(&e3b); + ext3_mb_release_desc(&e3b); + if (err == -ENODATA) { + created = 1; + goto repeat; + } + } + if (created || needs_recovery) + printk(KERN_ERR "EXT3-fs: generated buddies for %lu groups\n", + EXT3_SB(sb)->s_groups_count); + spin_lock_init(&EXT3_SB(sb)->s_reserve_lock); + spin_lock_init(&EXT3_SB(sb)->s_md_lock); + INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction); + INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction); + INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction); + set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); + +#ifdef MBALLOC_STATS + spin_lock_init(&EXT3_SB(sb)->s_bal_lock); +#define MBALLOC_INFO " (stats)" +#else +#define MBALLOC_INFO "" +#endif + printk("EXT3-fs: mballoc enabled%s\n", MBALLOC_INFO); + return 0; +} + +void ext3_mb_free_committed_blocks(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + int err, i, count = 0, count2 = 0; + struct ext3_free_metadata *md; + struct ext3_buddy e3b; + + if (list_empty(&sbi->s_committed_transaction)) + return; + + /* there is committed blocks to be freed yet */ + do { + /* get next array of blocks */ + md = NULL; + spin_lock(&sbi->s_md_lock); + if (!list_empty(&sbi->s_committed_transaction)) { + md = list_entry(sbi->s_committed_transaction.next, + struct ext3_free_metadata, list); + list_del(&md->list); + } + spin_unlock(&sbi->s_md_lock); + + if (md == NULL) + break; + + mb_debug("gonna free %u blocks in group %u (0x%p):", + md->num, md->group, md); + + err = ext3_mb_load_buddy(sb, md->group, &e3b); + BUG_ON(err != 0); + + /* there are blocks to put in buddy to make them really free */ + count += md->num; + count2++; + ext3_lock_group(sb, md->group); + for (i = 0; i < md->num; i++) { + mb_debug(" %u", md->blocks[i]); + mb_free_blocks(&e3b, 
md->blocks[i], 1); + } + mb_debug("\n"); + ext3_unlock_group(sb, md->group); + + kfree(md); + ext3_mb_dirty_buddy(&e3b); + ext3_mb_release_desc(&e3b); + + } while (md); + mb_debug("freed %u blocks in %u structures\n", count, count2); +} + +void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (sbi->s_last_transaction == handle->h_transaction->t_tid) + return; + + /* new transaction! time to close last one and free blocks for + * committed transaction. we know that only one transaction can be + * active, so the previous transaction can be being logged and we + * know that the transaction before the previous one is known to be already + * logged. this means that now we may free blocks freed in all + * transactions before the previous one. hope I'm clear enough ... */ + + spin_lock(&sbi->s_md_lock); + if (sbi->s_last_transaction != handle->h_transaction->t_tid) { + mb_debug("new transaction %lu, old %lu\n", + (unsigned long) handle->h_transaction->t_tid, + (unsigned long) sbi->s_last_transaction); + list_splice_init(&sbi->s_closed_transaction, + &sbi->s_committed_transaction); + list_splice_init(&sbi->s_active_transaction, + &sbi->s_closed_transaction); + sbi->s_last_transaction = handle->h_transaction->t_tid; + } + spin_unlock(&sbi->s_md_lock); + + ext3_mb_free_committed_blocks(sb); +} + +int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, + int group, int block, int count) +{ + struct ext3_buddy_group_blocks *db = e3b->bd_bd; + struct super_block *sb = e3b->bd_sb; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_free_metadata *md; + int i; + + ext3_lock_group(sb, group); + for (i = 0; i < count; i++) { + md = db->bb_md_cur; + if (md && db->bb_tid != handle->h_transaction->t_tid) { + db->bb_md_cur = NULL; + md = NULL; + } + + if (md == NULL) { + ext3_unlock_group(sb, group); + md = kmalloc(sizeof(*md), GFP_KERNEL); + if (md == NULL) + return -ENOMEM; + md->num = 0; + md->group = group; + 
ext3_lock_group(sb, group); + if (db->bb_md_cur == NULL) { + spin_lock(&sbi->s_md_lock); + list_add(&md->list, &sbi->s_active_transaction); + spin_unlock(&sbi->s_md_lock); + db->bb_md_cur = md; + db->bb_tid = handle->h_transaction->t_tid; + mb_debug("new md 0x%p for group %u\n", + md, md->group); + } else { + kfree(md); + md = db->bb_md_cur; + } + } + + BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); + md->blocks[md->num] = block + i; + md->num++; + if (md->num == EXT3_BB_MAX_BLOCKS) { + /* no more space, put full container on a sb's list */ + db->bb_md_cur = NULL; + } + } + ext3_unlock_group(sb, group); + return 0; +} + +void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, + unsigned long block, unsigned long count, + int metadata, int *freed) +{ + struct buffer_head *bitmap_bh = NULL; + struct ext3_group_desc *gdp; + struct ext3_super_block *es; + unsigned long bit, overflow; + struct buffer_head *gd_bh; + unsigned long block_group; + struct ext3_sb_info *sbi; + struct super_block *sb; + struct ext3_buddy e3b; + int err = 0, ret; + + *freed = 0; + sb = inode->i_sb; + if (!sb) { + printk ("ext3_free_blocks: nonexistent device"); + return; + } + + ext3_mb_poll_new_transaction(sb, handle); + + sbi = EXT3_SB(sb); + es = EXT3_SB(sb)->s_es; + if (block < le32_to_cpu(es->s_first_data_block) || + block + count < block || + block + count > le32_to_cpu(es->s_blocks_count)) { + ext3_error (sb, "ext3_free_blocks", + "Freeing blocks not in datazone - " + "block = %lu, count = %lu", block, count); + goto error_return; + } + + ext3_debug("freeing block %lu\n", block); + +do_more: + overflow = 0; + block_group = (block - le32_to_cpu(es->s_first_data_block)) / + EXT3_BLOCKS_PER_GROUP(sb); + bit = (block - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb); + /* + * Check to see if we are freeing blocks across a group + * boundary. 
+ */ + if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { + overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); + count -= overflow; + } + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + gdp = ext3_get_group_desc (sb, block_group, &gd_bh); + if (!gdp) + goto error_return; + + if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || + in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || + in_range (block, le32_to_cpu(gdp->bg_inode_table), + EXT3_SB(sb)->s_itb_per_group) || + in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), + EXT3_SB(sb)->s_itb_per_group)) + ext3_error (sb, "ext3_free_blocks", + "Freeing blocks in system zones - " + "Block = %lu, count = %lu", + block, count); + + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext3_journal_get_write_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + + err = ext3_mb_load_buddy(sb, block_group, &e3b); + if (err) + goto error_return; + +#ifdef AGGRESSIVE_CHECK + { + int i; + for (i = 0; i < count; i++) + J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); + } +#endif + mb_clear_bits(bitmap_bh->b_data, bit, count); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext3_journal_dirty_metadata(handle, bitmap_bh); + + if (metadata) { + /* blocks being freed are metadata. 
these blocks shouldn't + * be used until this transaction is committed */ + ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); + } else { + ext3_lock_group(sb, block_group); + mb_free_blocks(&e3b, bit, count); + ext3_unlock_group(sb, block_group); + } + + spin_lock(sb_bgl_lock(sbi, block_group)); + gdp->bg_free_blocks_count = + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_mod(&sbi->s_freeblocks_counter, count); + + ext3_mb_dirty_buddy(&e3b); + ext3_mb_release_desc(&e3b); + + *freed = count; + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext3_journal_dirty_metadata(handle, gd_bh); + if (!err) err = ret; + + if (overflow && !err) { + block += count; + count = overflow; + goto do_more; + } + sb->s_dirt = 1; +error_return: + brelse(bitmap_bh); + ext3_std_error(sb, err); + return; +} + +int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + int free, ret = -ENOSPC; + + BUG_ON(blocks < 0); + spin_lock(&sbi->s_reserve_lock); + free = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + if (blocks <= free - sbi->s_blocks_reserved) { + sbi->s_blocks_reserved += blocks; + ret = 0; + } + spin_unlock(&sbi->s_reserve_lock); + return ret; +} + +void ext3_mb_release_blocks(struct super_block *sb, int blocks) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + + BUG_ON(blocks < 0); + spin_lock(&sbi->s_reserve_lock); + sbi->s_blocks_reserved -= blocks; + WARN_ON(sbi->s_blocks_reserved < 0); + if (sbi->s_blocks_reserved < 0) + sbi->s_blocks_reserved = 0; + spin_unlock(&sbi->s_reserve_lock); +} + +int ext3_new_block(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) +{ + int ret, len; + + if (!test_opt(inode->i_sb, MBALLOC)) { + ret = ext3_new_block_old(handle, inode, goal, errp); + goto out; + } + len = 1; + ret = ext3_mb_new_blocks(handle, inode, goal, &len, 
0, errp); +out: + return ret; +} + +void ext3_free_blocks(handle_t *handle, struct inode * inode, + unsigned long block, unsigned long count, int metadata) +{ + struct super_block *sb; + int freed; + + sb = inode->i_sb; + if (!test_opt(sb, MBALLOC) || EXT3_SB(sb)->s_buddy_blocks == NULL) + ext3_free_blocks_sb(handle, sb, block, count, &freed); + else + ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); + if (freed) + DQUOT_FREE_BLOCK(inode, freed); + return; +} Index: linux-2.6.10/fs/ext3/super.c =================================================================== --- linux-2.6.10.orig/fs/ext3/super.c 2005-02-25 17:27:00.231757312 +0200 +++ linux-2.6.10/fs/ext3/super.c 2005-02-25 17:28:41.862307120 +0200 @@ -394,6 +394,7 @@ struct ext3_super_block *es = sbi->s_es; int i; + ext3_mb_release(sb); ext3_ext_release(sb); ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); @@ -592,7 +593,7 @@ Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_pdirops, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, - Opt_extents, Opt_extdebug, + Opt_extents, Opt_extdebug, Opt_mballoc, Opt_mbfactor, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, }; @@ -646,6 +647,8 @@ {Opt_iopen_nopriv, "iopen_nopriv"}, {Opt_extents, "extents"}, {Opt_extdebug, "extdebug"}, + {Opt_mballoc, "mballoc"}, + {Opt_mballoc, "mbfactor=%u"}, {Opt_err, NULL}, {Opt_resize, "resize"}, }; @@ -956,6 +959,16 @@ case Opt_extdebug: set_opt (sbi->s_mount_opt, EXTDEBUG); break; + case Opt_mballoc: + set_opt (sbi->s_mount_opt, MBALLOC); + break; + case Opt_mbfactor: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + sbi->s_mb_factor = option; + break; default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " @@ -1639,8 +1652,9 @@ percpu_counter_mod(&sbi->s_dirs_counter, ext3_count_dirs(sb)); ext3_ext_init(sb); + ext3_mb_init(sb, needs_recovery); return 0; cantfind_ext3: Index: linux-2.6.10/fs/ext3/Makefile 
=================================================================== --- linux-2.6.10.orig/fs/ext3/Makefile 2005-02-25 17:27:00.228757768 +0200 +++ linux-2.6.10/fs/ext3/Makefile 2005-02-25 17:28:41.863306968 +0200 @@ -5,7 +5,7 @@ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o iopen.o \ - extents.o + extents.o mballoc.o ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o extents-in-ea.o ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o ext3-$(CONFIG_EXT3_FS_SECURITY) += xattr_security.o Index: linux-2.6.10/fs/ext3/balloc.c =================================================================== --- linux-2.6.10.orig/fs/ext3/balloc.c 2005-02-25 17:26:58.965949744 +0200 +++ linux-2.6.10/fs/ext3/balloc.c 2005-02-25 17:28:41.865306664 +0200 @@ -79,7 +79,7 @@ * * Return buffer_head on success or NULL in case of failure. */ -static struct buffer_head * +struct buffer_head * read_block_bitmap(struct super_block *sb, unsigned int block_group) { struct ext3_group_desc * desc; @@ -450,24 +450,6 @@ return; } -/* Free given blocks, update quota and i_blocks field */ -void ext3_free_blocks(handle_t *handle, struct inode *inode, - unsigned long block, unsigned long count) -{ - struct super_block * sb; - int dquot_freed_blocks; - - sb = inode->i_sb; - if (!sb) { - printk ("ext3_free_blocks: nonexistent device"); - return; - } - ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); - if (dquot_freed_blocks) - DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); - return; -} - /* * For ext3 allocations, we must not reuse any blocks which are * allocated in the bitmap buffer's "last committed data" copy. This @@ -1140,7 +1122,7 @@ * bitmap, and then for any free bit if that fails. * This function also updates quota and i_blocks field. 
*/ -int ext3_new_block(handle_t *handle, struct inode *inode, +int ext3_new_block_old(handle_t *handle, struct inode *inode, unsigned long goal, int *errp) { struct buffer_head *bitmap_bh = NULL; Index: linux-2.6.10/fs/ext3/namei.c =================================================================== --- linux-2.6.10.orig/fs/ext3/namei.c 2005-02-25 17:26:59.527864320 +0200 +++ linux-2.6.10/fs/ext3/namei.c 2005-02-25 17:28:41.867306360 +0200 @@ -1639,7 +1639,7 @@ * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, +int ext3_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) { handle_t *handle; Index: linux-2.6.10/fs/ext3/inode.c =================================================================== --- linux-2.6.10.orig/fs/ext3/inode.c 2005-02-25 17:27:00.227757920 +0200 +++ linux-2.6.10/fs/ext3/inode.c 2005-02-25 17:28:41.872305600 +0200 @@ -572,7 +572,7 @@ ext3_journal_forget(handle, branch[i].bh); } for (i = 0; i < keys; i++) - ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); + ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); return err; } @@ -673,7 +673,7 @@ if (err == -EAGAIN) for (i = 0; i < num; i++) ext3_free_blocks(handle, inode, - le32_to_cpu(where[i].key), 1); + le32_to_cpu(where[i].key), 1, 1); return err; } @@ -1831,7 +1831,7 @@ } } - ext3_free_blocks(handle, inode, block_to_free, count); + ext3_free_blocks(handle, inode, block_to_free, count, 1); } /** @@ -2004,7 +2004,7 @@ ext3_journal_test_restart(handle, inode); } - ext3_free_blocks(handle, inode, nr, 1); + ext3_free_blocks(handle, inode, nr, 1, 1); if (parent_bh) { /* Index: linux-2.6.10/fs/ext3/extents.c =================================================================== --- linux-2.6.10.orig/fs/ext3/extents.c 2005-02-25 17:27:00.222758680 +0200 +++ linux-2.6.10/fs/ext3/extents.c 2005-02-25 17:29:29.364085752 +0200 @@ -740,7 
+740,7 @@ for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; - ext3_free_blocks(handle, tree->inode, ablocks[i], 1); + ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); } } kfree(ablocks); @@ -1391,7 +1391,7 @@ path->p_idx->ei_leaf); bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); - ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); + ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); return err; } @@ -1879,10 +1879,12 @@ int needed = ext3_remove_blocks_credits(tree, ex, from, to); handle_t *handle = ext3_journal_start(tree->inode, needed); struct buffer_head *bh; - int i; + int i, metadata = 0; if (IS_ERR(handle)) return PTR_ERR(handle); + if (S_ISDIR(tree->inode->i_mode)) + metadata = 1; if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { /* tail removal */ unsigned long num, start; @@ -1894,7 +1896,7 @@ bh = sb_find_get_block(tree->inode->i_sb, start + i); ext3_forget(handle, 0, tree->inode, bh, start + i); } - ext3_free_blocks(handle, tree->inode, start, num); + ext3_free_blocks(handle, tree->inode, start, num, metadata); } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", from, to, ex->ee_block, ex->ee_len); Index: linux-2.6.10/fs/ext3/xattr.c =================================================================== --- linux-2.6.10.orig/fs/ext3/xattr.c 2005-02-25 17:26:59.876811272 +0200 +++ linux-2.6.10/fs/ext3/xattr.c 2005-02-25 17:28:41.878304688 +0200 @@ -1271,7 +1271,7 @@ new_bh = sb_getblk(sb, block); if (!new_bh) { getblk_failed: - ext3_free_blocks(handle, inode, block, 1); + ext3_free_blocks(handle, inode, block, 1, 1); error = -EIO; goto cleanup; } @@ -1318,7 +1318,7 @@ if (ce) mb_cache_entry_free(ce); ea_bdebug(old_bh, "freeing"); - ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1); + ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1, 
1); /* ext3_forget() calls bforget() for us, but we let our caller release old_bh, so we need to @@ -1417,7 +1417,7 @@ if (HDR(bh)->h_refcount == cpu_to_le32(1)) { if (ce) mb_cache_entry_free(ce); - ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1); + ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1, 1); get_bh(bh); ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl); } else { Index: linux-2.6.10/include/linux/ext3_fs.h =================================================================== --- linux-2.6.10.orig/include/linux/ext3_fs.h 2005-02-25 17:27:00.234756856 +0200 +++ linux-2.6.10/include/linux/ext3_fs.h 2005-02-25 17:28:41.881304232 +0200 @@ -57,6 +57,14 @@ #define ext3_debug(f, a...) do {} while (0) #endif +#define EXT3_MULTIBLOCK_ALLOCATOR 1 + +#define EXT3_MB_HINT_MERGE 1 +#define EXT3_MB_HINT_RESERVED 2 +#define EXT3_MB_HINT_METADATA 4 +#define EXT3_MB_HINT_FIRST 8 +#define EXT3_MB_HINT_BEST 16 + /* * Special inodes numbers */ @@ -365,6 +373,7 @@ #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ #define EXT3_MOUNT_EXTENTS 0x100000 /* Extents support */ #define EXT3_MOUNT_EXTDEBUG 0x200000 /* Extents debug */ +#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H @@ -725,7 +734,7 @@ extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, - unsigned long); + unsigned long, int); extern void ext3_free_blocks_sb (handle_t *, struct super_block *, unsigned long, unsigned long, int *); extern unsigned long ext3_count_free_blocks (struct super_block *); @@ -856,6 +865,37 @@ extern struct inode_operations ext3_symlink_inode_operations; extern struct inode_operations ext3_fast_symlink_inode_operations; +/* mballoc.c */ +extern int 
ext3_mb_init(struct super_block *, int); +extern int ext3_mb_release(struct super_block *); +extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); +extern int ext3_mb_reserve_blocks(struct super_block *, int); +extern void ext3_mb_release_blocks(struct super_block *, int); + +/* writeback.c */ +extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); +extern int ext3_wb_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to); +extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); +extern int ext3_wb_writepage(struct page *, struct writeback_control *); +extern int ext3_wb_invalidatepage(struct page *, unsigned long); +extern int ext3_wb_releasepage(struct page *, int); +extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); +extern void ext3_wb_init(struct super_block *); +extern void ext3_wb_release(struct super_block *); + +/* writeback.c */ +extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); +extern int ext3_wb_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to); +extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); +extern int ext3_wb_writepage(struct page *, struct writeback_control *); +extern int ext3_wb_invalidatepage(struct page *, unsigned long); +extern int ext3_wb_releasepage(struct page *, int); +extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); +extern void ext3_wb_init(struct super_block *); +extern void ext3_wb_release(struct super_block *); + /* extents.c */ extern int ext3_ext_writepage_trans_blocks(struct inode *, int); extern int ext3_ext_get_block(handle_t *, struct inode *, long, Index: linux-2.6.10/include/linux/ext3_fs_sb.h =================================================================== --- linux-2.6.10.orig/include/linux/ext3_fs_sb.h 
2005-02-25 17:26:59.641846992 +0200 +++ linux-2.6.10/include/linux/ext3_fs_sb.h 2005-02-25 17:28:41.882304080 +0200 @@ -23,10 +23,30 @@ #define EXT_INCLUDE #include #include +#include #endif #endif #include +#define EXT3_BB_MAX_BLOCKS 30 +struct ext3_free_metadata { + unsigned short group; + unsigned short num; + unsigned short blocks[EXT3_BB_MAX_BLOCKS]; + struct list_head list; +}; + +struct ext3_buddy_group_blocks { + __u32 bb_bitmap; + __u32 bb_buddy; + spinlock_t bb_lock; + unsigned long bb_tid; + struct ext3_free_metadata *bb_md_cur; + unsigned short bb_first_free; + unsigned short bb_free; + unsigned bb_counters[]; +}; + /* * third extended-fs super-block data in memory */ @@ -81,6 +101,27 @@ int s_jquota_fmt; /* Format of quota to use */ #endif u32 s_mdsnum; + + /* for buddy allocator */ + struct ext3_buddy_group_blocks **s_buddy_blocks; + struct inode *s_buddy; + long s_blocks_reserved; + spinlock_t s_reserve_lock; + struct list_head s_active_transaction; + struct list_head s_closed_transaction; + struct list_head s_committed_transaction; + spinlock_t s_md_lock; + tid_t s_last_transaction; + int s_mb_factor; + + /* stats for buddy allocator */ + spinlock_t s_bal_lock; + unsigned long s_bal_reqs; /* number of reqs with len > 1 */ + unsigned long s_bal_success; /* we found long enough chunks */ + unsigned long s_bal_allocated; /* in blocks */ + unsigned long s_bal_ex_scanned; /* total extents scanned */ + unsigned long s_bal_goals; /* goal hits */ + unsigned long s_bal_breaks; /* too long searches */ }; #endif /* _LINUX_EXT3_FS_SB */