From: alex
Date: Sat, 7 Aug 2004 13:42:38 +0000 (+0000)
Subject: b=3733
X-Git-Tag: 1.3.4~501
X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=4232d21aa8d97feb7b1a3ba8e0edf49b0e94c76b;p=fs%2Flustre-release.git

b=3733
- initial backport of mballoc v2 onto 2.4.24
---

diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.4.24.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.4.24.patch
new file mode 100644
index 0000000..4b6d0da
--- /dev/null
+++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.4.24.patch
@@ -0,0 +1,1762 @@
+Index: linux-2.4.24/fs/ext3/mballoc.c
+===================================================================
+--- linux-2.4.24.orig/fs/ext3/mballoc.c	2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.24/fs/ext3/mballoc.c	2004-08-06 04:50:53.000000000 +0400
+@@ -0,0 +1,1399 @@
++/*
++ * Copyright (c) 2004, Cluster File Systems, Inc, info@clusterfs.com
++ * Written by Alex Tomas
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ */
++
++
++/*
++ * mballoc.c contains the multiblock allocation routines
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++/*
++ * TODO:
++ *  - do not scan from the beginning, try to remember the first free block
++ *  - mb_mark_used_* may allocate a chunk right after splitting a buddy
++ *  - a special flag to advise the allocator to look for requested + N blocks;
++ *    this may improve interaction between extents and mballoc
++ */
++
++/*
++ * with AGGRESSIVE_CHECK the allocator runs consistency checks over
++ * its structures. these checks slow things down a lot
++ */
++#define AGGRESSIVE_CHECK__
++
++/*
++ */
++#define MB_DEBUG__
++#ifdef MB_DEBUG
++#define mb_debug(fmt,a...)	printk(fmt, ##a)
++#else
++#define mb_debug(fmt,a...)
++#endif
++
++/*
++ * where to save buddy structures between umount/mount (clean case only)
++ */
++#define EXT3_BUDDY_FILE		".buddy"
++
++/*
++ * max. number of chunks to be tracked in ext3_free_extent struct
++ */
++#define MB_ARR_SIZE	32
++
++struct ext3_allocation_context {
++	struct super_block *ac_sb;
++
++	/* search goals */
++	int ac_g_group;
++	int ac_g_start;
++	int ac_g_len;
++	int ac_g_flags;
++
++	/* the best found extent */
++	int ac_b_group;
++	int ac_b_start;
++	int ac_b_len;
++
++	/* number of iterations done; we have to track this to limit searching */
++	int ac_repeats;
++	int ac_groups_scanned;
++	int ac_status;
++};
++
++#define AC_STATUS_CONTINUE	1
++#define AC_STATUS_FOUND		2
++
++
++struct ext3_buddy {
++	void *bd_bitmap;
++	void *bd_buddy;
++	int bd_blkbits;
++	struct buffer_head *bd_bh;
++	struct buffer_head *bd_bh2;
++	struct ext3_buddy_group_blocks *bd_bd;
++	struct super_block *bd_sb;
++};
++
++struct ext3_free_extent {
++	int fe_start;
++	int fe_len;
++	unsigned char fe_orders[MB_ARR_SIZE];
++	unsigned char fe_nums;
++	unsigned char fe_back;
++};
++
++#define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
++
++
++int ext3_create (struct inode *, struct dentry *, int, struct nameidata *);
++void ext3_free_blocks_old(handle_t *, struct inode *, unsigned long, unsigned long);
++int ext3_new_block_old(handle_t *, struct inode *, unsigned long, u32 *, u32 *, int *);
++int ext3_mb_reserve_blocks(struct super_block *, int);
++void ext3_mb_release_blocks(struct super_block *, int);
++void ext3_mb_poll_new_transaction(struct super_block *, handle_t *);
++void ext3_mb_free_committed_blocks(struct super_block *);
++int load_block_bitmap (struct super_block *, unsigned int);
++
++struct buffer_head *
++read_block_bitmap_bh(struct super_block *sb, unsigned int block_group)
++{
++	struct buffer_head *bh;
++	int bitmap_nr;
++
++	bitmap_nr = load_block_bitmap(sb, block_group);
++	if (bitmap_nr < 0)
++		return NULL;
++
++	bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr];
++	return bh;
++}
++
++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max)
++{
++	int i = 1;
++	void *bb;
++
++	J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy);
++	J_ASSERT(max != NULL);
++
++	if (order > e3b->bd_blkbits + 1)
++		return NULL;
++
++	/* at order 0 we see each particular block */
++	*max = 1 << (e3b->bd_blkbits + 3);
++	if (order == 0)
++		return e3b->bd_bitmap;
++
++	bb = e3b->bd_buddy;
++	*max = *max >> 1;
++	while (i < order) {
++		bb += 1 << (e3b->bd_blkbits - i);
++		i++;
++		*max = *max >> 1;
++	}
++	return bb;
++}
++
++static int ext3_mb_load_desc(struct super_block *sb, int group,
++			     struct ext3_buddy *e3b)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++
++	J_ASSERT(sbi->s_buddy_blocks[group].bb_bitmap);
++	J_ASSERT(sbi->s_buddy_blocks[group].bb_buddy);
++
++	/* load bitmap */
++	e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group].bb_bitmap);
++	if (e3b->bd_bh == NULL) {
++		ext3_error(sb, "ext3_mb_load_desc",
++			   "can't get block for buddy bitmap\n");
++		goto out;
++	}
++	if (!buffer_uptodate(e3b->bd_bh)) {
++		ll_rw_block(READ, 1, &e3b->bd_bh);
++		wait_on_buffer(e3b->bd_bh);
++	}
++	J_ASSERT(buffer_uptodate(e3b->bd_bh));
++
++	/* load buddy */
++	e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group].bb_buddy);
++	if (e3b->bd_bh2 == NULL) {
++		ext3_error(sb, "ext3_mb_load_desc",
++			   "can't get block for buddy\n");
++		goto out;
++	}
++	if (!buffer_uptodate(e3b->bd_bh2)) {
++		ll_rw_block(READ, 1, &e3b->bd_bh2);
++		wait_on_buffer(e3b->bd_bh2);
++	}
++	J_ASSERT(buffer_uptodate(e3b->bd_bh2));
++
++	e3b->bd_bitmap = e3b->bd_bh->b_data;
++	e3b->bd_buddy = e3b->bd_bh2->b_data;
++	e3b->bd_blkbits = sb->s_blocksize_bits;
++	e3b->bd_bd = sbi->s_buddy_blocks + group;
++	e3b->bd_sb = sb;
++
++	return 0;
++out:
++	brelse(e3b->bd_bh);
++	brelse(e3b->bd_bh2);
++	e3b->bd_bh = NULL;
++	e3b->bd_bh2 = NULL;
++	return -EIO;
++}
++
++static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b)
++{
++	mark_buffer_dirty(e3b->bd_bh);
++	mark_buffer_dirty(e3b->bd_bh2);
++}
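The per-order bitmaps that mb_find_buddy() returns are simply packed back to back inside the single buddy block: order 1 starts at byte 0, and each following order starts where the previous one ended, with half as many bits. A standalone userspace sketch of that pointer arithmetic (not part of the patch; the 4096-byte block size is an assumption for illustration):

#include <stdio.h>

int main(void)
{
	int blkbits = 12;	/* assumed 4096-byte filesystem block */
	int order, i, max, offset;

	printf("order  0: %6d bits (served by bd_bitmap itself)\n",
	       1 << (blkbits + 3));
	for (order = 1; order <= blkbits + 1; order++) {
		offset = 0;			/* bb = e3b->bd_buddy */
		max = (1 << (blkbits + 3)) >> 1;
		for (i = 1; i < order; i++) {	/* bb += 1 << (bd_blkbits - i) */
			offset += 1 << (blkbits - i);
			max >>= 1;
		}
		printf("order %2d: %6d bits at byte offset %4d\n",
		       order, max, offset);
	}
	return 0;
}

All the per-order maps together need 4095 bytes, so they just fit in one 4096-byte buddy block.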
++
++static void
++ext3_mb_release_desc(struct ext3_buddy *e3b)
++{
++	brelse(e3b->bd_bh);
++	brelse(e3b->bd_bh2);
++}
++
++#ifdef AGGRESSIVE_CHECK
++static void mb_check_buddy(struct ext3_buddy *e3b)
++{
++	int order = e3b->bd_blkbits + 1;
++	int max, max2, i, j, k, count;
++	void *buddy, *buddy2;
++
++	if (!test_opt(e3b->bd_sb, MBALLOC))
++		return;
++
++	while (order > 1) {
++		buddy = mb_find_buddy(e3b, order, &max);
++		J_ASSERT(buddy);
++		buddy2 = mb_find_buddy(e3b, order - 1, &max2);
++		J_ASSERT(buddy2);
++		J_ASSERT(buddy != buddy2);
++		J_ASSERT(max * 2 == max2);
++
++		count = 0;
++		for (i = 0; i < max; i++) {
++
++			if (!test_bit(i, buddy)) {
++				/* only a single bit in buddy2 may be 1 */
++				if (test_bit(i << 1, buddy2))
++					J_ASSERT(!test_bit((i<<1)+1, buddy2));
++				else if (test_bit((i << 1) + 1, buddy2))
++					J_ASSERT(!test_bit(i << 1, buddy2));
++				continue;
++			}
++
++			/* both bits in buddy2 must be 0 */
++			J_ASSERT(!test_bit(i << 1, buddy2));
++			J_ASSERT(!test_bit((i << 1) + 1, buddy2));
++
++			for (j = 0; j < (1 << order); j++) {
++				k = (i * (1 << order)) + j;
++				J_ASSERT(test_bit(k, e3b->bd_bitmap));
++			}
++			count++;
++		}
++		J_ASSERT(e3b->bd_bd->bb_counters[order] == count);
++		order--;
++	}
++
++	buddy = mb_find_buddy(e3b, 0, &max);
++	for (i = 0; i < max; i++) {
++		if (test_bit(i, buddy))
++			continue;
++		/* check used bits only */
++		for (j = 0; j < e3b->bd_blkbits + 1; j++) {
++			buddy2 = mb_find_buddy(e3b, j, &max2);
++			k = i >> j;
++			J_ASSERT(k < max2);
++			J_ASSERT(!test_bit(k, buddy2));
++		}
++	}
++}
++#else
++#define mb_check_buddy(e3b)
++#endif
++
++static inline void
++ext3_lock_group(struct super_block *sb, int group)
++{
++	spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group].bb_lock);
++}
++
++static inline void
++ext3_unlock_group(struct super_block *sb, int group)
++{
++	spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group].bb_lock);
++}
++
++static int mb_find_order_for_block(struct ext3_buddy *e3b, int block)
++{
++	int order = 1;
++	void *bb;
++
++	J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy);
++	J_ASSERT(block < (1 << (e3b->bd_blkbits + 3)));
++
++	bb = e3b->bd_buddy;
++	while (order <= e3b->bd_blkbits + 1) {
++		block = block >> 1;
++		if (test_bit(block, bb)) {
++			/* this block is part of a buddy of order 'order' */
++			return order;
++		}
++		bb += 1 << (e3b->bd_blkbits - order);
++		order++;
++	}
++	return 0;
++}
++
++static inline void mb_clear_bits(void *bm, int cur, int len)
++{
++	__u32 *addr;
++
++	len = cur + len;
++	while (cur < len) {
++		if ((cur & 31) == 0 && (len - cur) >= 32) {
++			/* fast path: clear whole word at once */
++			addr = bm + (cur >> 3);
++			*addr = 0;
++			cur += 32;
++			continue;
++		}
++		clear_bit(cur, bm);
++		cur++;
++	}
++}
++
++static inline void mb_set_bits(void *bm, int cur, int len)
++{
++	__u32 *addr;
++
++	len = cur + len;
++	while (cur < len) {
++		if ((cur & 31) == 0 && (len - cur) >= 32) {
++			/* fast path: set whole word at once */
++			addr = bm + (cur >> 3);
++			*addr = 0xffffffff;
++			cur += 32;
++			continue;
++		}
++		set_bit(cur, bm);
++		cur++;
++	}
++}
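mb_set_bits() and mb_clear_bits() only take the word-at-a-time fast path when the cursor is 32-bit aligned and at least 32 bits remain; leading and trailing stragglers go bit by bit. A userspace sketch of the same walk (simplified: plain word stores instead of the kernel's atomic set_bit(), which matches the x86 little-endian bitmap layout within a 32-bit word):

#include <stdio.h>
#include <stdint.h>

static void sketch_set_bits(uint32_t *bm, int cur, int len)
{
	len = cur + len;
	while (cur < len) {
		if ((cur & 31) == 0 && (len - cur) >= 32) {
			bm[cur >> 5] = 0xffffffff;	/* fast path: whole word */
			cur += 32;
			continue;
		}
		bm[cur >> 5] |= 1U << (cur & 31);	/* slow path: single bit */
		cur++;
	}
}

int main(void)
{
	uint32_t bm[4] = { 0, 0, 0, 0 };

	/* bits 30..69: two slow bits, one fast word, six slow bits */
	sketch_set_bits(bm, 30, 40);
	printf("%08x %08x %08x %08x\n", bm[0], bm[1], bm[2], bm[3]);
	/* prints: c0000000 ffffffff 0000003f 00000000 */
	return 0;
}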
++
++static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count)
++{
++	int block, max, order;
++	void *buddy, *buddy2;
++
++	mb_check_buddy(e3b);
++	while (count-- > 0) {
++		block = first++;
++		order = 0;
++
++		J_ASSERT(!test_bit(block, e3b->bd_bitmap));
++		set_bit(block, e3b->bd_bitmap);
++		e3b->bd_bd->bb_counters[order]++;
++
++		/* start of the buddy */
++		buddy = mb_find_buddy(e3b, order, &max);
++
++		do {
++			block &= ~1UL;
++			if (!test_bit(block, buddy) ||
++			    !test_bit(block + 1, buddy))
++				break;
++
++			/* both the buddies are free, try to coalesce them */
++			buddy2 = mb_find_buddy(e3b, order + 1, &max);
++
++			if (!buddy2)
++				break;
++
++			if (order > 0) {
++				/* for special purposes, we don't clear
++				 * free bits in the bitmap */
++				clear_bit(block, buddy);
++				clear_bit(block + 1, buddy);
++			}
++			e3b->bd_bd->bb_counters[order]--;
++			e3b->bd_bd->bb_counters[order]--;
++
++			block = block >> 1;
++			order++;
++			e3b->bd_bd->bb_counters[order]++;
++
++			set_bit(block, buddy2);
++			buddy = buddy2;
++		} while (1);
++	}
++	mb_check_buddy(e3b);
++
++	return 0;
++}
++
++/*
++ * returns 1 if the out extent is enough to fill the needed space
++ */
++int mb_make_backward_extent(struct ext3_free_extent *in,
++			    struct ext3_free_extent *out, int needed)
++{
++	int i;
++
++	J_ASSERT(in);
++	J_ASSERT(out);
++	J_ASSERT(in->fe_nums < MB_ARR_SIZE);
++
++	out->fe_len = 0;
++	out->fe_start = in->fe_start + in->fe_len;
++	out->fe_nums = 0;
++
++	/* for a single-chunk extent we don't need a backward order;
++	 * also, if an extent doesn't fill the needed space, then it
++	 * makes no sense to try the backward order, because if we
++	 * select this extent it will be used as is */
++	if (in->fe_nums < 2 || in->fe_len < needed)
++		return 0;
++
++	i = in->fe_nums - 1;
++	while (i >= 0 && out->fe_len < needed) {
++		out->fe_len += (1 << in->fe_orders[i]);
++		out->fe_start -= (1 << in->fe_orders[i]);
++		i--;
++	}
++	/* FIXME: in some situations fe_orders may be too small to hold
++	 * all the buddies */
++	J_ASSERT(out->fe_len >= needed);
++
++	for (i++; i < in->fe_nums; i++)
++		out->fe_orders[out->fe_nums++] = in->fe_orders[i];
++	J_ASSERT(out->fe_nums < MB_ARR_SIZE);
++	out->fe_back = 1;
++
++	return 1;
++}
++
++int mb_find_extent(struct ext3_buddy *e3b, int order, int block,
++		   int needed, struct ext3_free_extent *ex)
++{
++	int space = needed;
++	int next, max, ord;
++	void *buddy;
++
++	J_ASSERT(ex != NULL);
++
++	ex->fe_nums = 0;
++	ex->fe_len = 0;
++
++	buddy = mb_find_buddy(e3b, order, &max);
++	J_ASSERT(buddy);
++	J_ASSERT(block < max);
++	if (!test_bit(block, buddy))
++		goto nofree;
++
++	if (order == 0) {
++		/* find actual order */
++		order = mb_find_order_for_block(e3b, block);
++		block = block >> order;
++	}
++
++	ex->fe_orders[ex->fe_nums++] = order;
++	ex->fe_len = 1 << order;
++	ex->fe_start = block << order;
++	ex->fe_back = 0;
++
++	while ((space = space - (1 << order)) > 0) {
++
++		buddy = mb_find_buddy(e3b, order, &max);
++		J_ASSERT(buddy);
++
++		if (block + 1 >= max)
++			break;
++
++		next = (block + 1) * (1 << order);
++		if (!test_bit(next, e3b->bd_bitmap))
++			break;
++
++		ord = mb_find_order_for_block(e3b, next);
++
++		if ((1 << ord) >= needed) {
++			/* we don't want to coalesce with self-sufficient buddies */
++			break;
++		}
++		order = ord;
++		block = next >> order;
++		ex->fe_len += 1 << order;
++
++		if (ex->fe_nums < MB_ARR_SIZE)
++			ex->fe_orders[ex->fe_nums++] = order;
++	}
++
++nofree:
++	J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3)));
++	return ex->fe_len;
++}
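mb_free_blocks() leans on the classic buddy invariant: at any order the buddy of chunk b is b ^ 1 (the code writes this as block &= ~1UL plus block + 1), and a merged pair becomes chunk b >> 1 one order up. A toy model of the coalescing cascade for a hypothetical 32-block group (one simplification: the toy clears both halves at every order, while the real code keeps order-0 free bits set in the block bitmap, hence its "if (order > 0)" guard):

#include <stdio.h>

#define ORDERS 6	/* orders 0..5 cover a 32-block toy group */

static unsigned char freebit[ORDERS][32];	/* 1 = chunk free at this order */
static int counters[ORDERS];			/* like bb_counters[] */

static void toy_free(int block)
{
	int order = 0, b;

	freebit[0][block] = 1;
	counters[0]++;
	while (order + 1 < ORDERS) {
		b = block & ~1;
		if (!freebit[order][b] || !freebit[order][b + 1])
			break;			/* buddy still in use */
		freebit[order][b] = freebit[order][b + 1] = 0;
		counters[order] -= 2;
		block = b >> 1;			/* parent chunk, next order */
		order++;
		freebit[order][block] = 1;
		counters[order]++;
	}
}

int main(void)
{
	int i, o;

	/* free blocks 8..11: two order-0 pairs become one order-2 chunk */
	for (i = 8; i < 12; i++) {
		toy_free(i);
		printf("freed %2d -> counters:", i);
		for (o = 0; o < ORDERS; o++)
			printf(" %d", counters[o]);
		printf("\n");
	}
	return 0;
}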
++
++static int mb_mark_used_backward(struct ext3_buddy *e3b,
++				 struct ext3_free_extent *ex, int len)
++{
++	int start = ex->fe_start, len0 = len;
++	int ord, mlen, max, cur;
++	void *buddy;
++
++	start = ex->fe_start + ex->fe_len - 1;
++	while (len) {
++		ord = mb_find_order_for_block(e3b, start);
++		if (((start >> ord) << ord) == (start - (1 << ord) + 1) &&
++		    len >= (1 << ord)) {
++			/* the whole chunk may be allocated at once! */
++			mlen = 1 << ord;
++			buddy = mb_find_buddy(e3b, ord, &max);
++			J_ASSERT((start >> ord) < max);
++			clear_bit(start >> ord, buddy);
++			e3b->bd_bd->bb_counters[ord]--;
++			start -= mlen;
++			len -= mlen;
++			J_ASSERT(len >= 0);
++			J_ASSERT(start >= 0);
++			continue;
++		}
++
++		/* we have to split a large buddy */
++		J_ASSERT(ord > 0);
++		buddy = mb_find_buddy(e3b, ord, &max);
++		clear_bit(start >> ord, buddy);
++		e3b->bd_bd->bb_counters[ord]--;
++
++		ord--;
++		cur = (start >> ord) & ~1U;
++		buddy = mb_find_buddy(e3b, ord, &max);
++		set_bit(cur, buddy);
++		set_bit(cur + 1, buddy);
++		e3b->bd_bd->bb_counters[ord]++;
++		e3b->bd_bd->bb_counters[ord]++;
++	}
++
++	/* now drop all the bits in the bitmap */
++	mb_clear_bits(e3b->bd_bitmap, ex->fe_start + ex->fe_len - len0, len0);
++
++	mb_check_buddy(e3b);
++
++	return 0;
++}
++
++static int mb_mark_used_forward(struct ext3_buddy *e3b,
++				struct ext3_free_extent *ex, int len)
++{
++	int start = ex->fe_start, len0 = len;
++	int ord, mlen, max, cur;
++	void *buddy;
++
++	while (len) {
++		ord = mb_find_order_for_block(e3b, start);
++
++		if (((start >> ord) << ord) == start && len >= (1 << ord)) {
++			/* the whole chunk may be allocated at once! */
++			mlen = 1 << ord;
++			buddy = mb_find_buddy(e3b, ord, &max);
++			J_ASSERT((start >> ord) < max);
++			clear_bit(start >> ord, buddy);
++			e3b->bd_bd->bb_counters[ord]--;
++			start += mlen;
++			len -= mlen;
++			J_ASSERT(len >= 0);
++			continue;
++		}
++
++		/* we have to split a large buddy */
++		J_ASSERT(ord > 0);
++		buddy = mb_find_buddy(e3b, ord, &max);
++		clear_bit(start >> ord, buddy);
++		e3b->bd_bd->bb_counters[ord]--;
++
++		ord--;
++		cur = (start >> ord) & ~1U;
++		buddy = mb_find_buddy(e3b, ord, &max);
++		set_bit(cur, buddy);
++		set_bit(cur + 1, buddy);
++		e3b->bd_bd->bb_counters[ord]++;
++		e3b->bd_bd->bb_counters[ord]++;
++	}
++
++	/* now drop all the bits in the bitmap */
++	mb_clear_bits(e3b->bd_bitmap, ex->fe_start, len0);
++
++	mb_check_buddy(e3b);
++
++	return 0;
++}
++
++int inline mb_mark_used(struct ext3_buddy *e3b,
++			struct ext3_free_extent *ex, int len)
++{
++	int err;
++
++	J_ASSERT(ex);
++	if (ex->fe_back == 0)
++		err = mb_mark_used_forward(e3b, ex, len);
++	else
++		err = mb_mark_used_backward(e3b, ex, len);
++	return err;
++}
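Marking an extent used is the mirror image of freeing: while the request's edge is not aligned to the free chunk containing it, or the chunk is too big, mb_mark_used_forward() splits — clear one bit at order ord, set the two halves at ord - 1 — until an aligned piece of the right size appears. A dry-run trace of that loop for a made-up request (ord is tracked by hand here; the real code recomputes it with mb_find_order_for_block()):

#include <stdio.h>

int main(void)
{
	/* hypothetical request: 3 blocks at 20, inside a free order-3
	 * chunk covering blocks 16..23 */
	int start = 20, len = 3, ord = 3;

	while (len > 0) {
		if (((start >> ord) << ord) == start && len >= (1 << ord)) {
			printf("take  order-%d chunk, blocks %d..%d\n",
			       ord, start, start + (1 << ord) - 1);
			start += 1 << ord;
			len -= 1 << ord;
			continue;
		}
		/* split: retire the order-ord chunk, expose its halves */
		ord--;
		printf("split -> order-%d chunks %d and %d now free\n",
		       ord, (start >> ord) & ~1, ((start >> ord) & ~1) + 1);
	}
	return 0;
}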
++
++int ext3_mb_new_in_group(struct ext3_allocation_context *ac,
++			 struct ext3_buddy *e3b, int group)
++{
++	struct super_block *sb = ac->ac_sb;
++	int err, gorder, max, i;
++	struct ext3_free_extent curex;
++
++	/* determine the order of the allocation */
++	gorder = 0;
++	while (ac->ac_g_len > (1 << gorder))
++		gorder++;
++
++	if ((ac->ac_g_flags & 1) && ac->ac_g_group == group) {
++		/* someone is asking for space at this specified block;
++		 * probably they want to merge it into an existing extent */
++		if (test_bit(ac->ac_g_start, e3b->bd_bitmap)) {
++			/* good. at least one block is free */
++			max = mb_find_extent(e3b, 0, ac->ac_g_start,
++					     ac->ac_g_len, &curex);
++			max = min(curex.fe_len, ac->ac_g_len);
++			mb_mark_used(e3b, &curex, max);
++
++			ac->ac_b_group = group;
++			ac->ac_b_start = curex.fe_start;
++			ac->ac_b_len = max;
++			ac->ac_status = AC_STATUS_FOUND;
++			err = 0;
++			goto out;
++		}
++		/* don't try to find the goal anymore */
++		ac->ac_g_flags &= ~1;
++	}
++
++	i = 0;
++	while (1) {
++		i = find_next_bit(e3b->bd_bitmap, sb->s_blocksize * 8, i);
++		if (i >= sb->s_blocksize * 8)
++			break;
++
++		max = mb_find_extent(e3b, 0, i, ac->ac_g_len, &curex);
++		if (max >= ac->ac_g_len) {
++			max = min(curex.fe_len, ac->ac_g_len);
++			mb_mark_used(e3b, &curex, max);
++
++			ac->ac_b_group = group;
++			ac->ac_b_start = curex.fe_start;
++			ac->ac_b_len = max;
++			ac->ac_status = AC_STATUS_FOUND;
++			break;
++		}
++		i += max;
++	}
++
++	return 0;
++
++out:
++	return err;
++}
++
++int mb_good_group(struct ext3_allocation_context *ac, int group, int cr)
++{
++	struct ext3_group_desc *gdp;
++	int free_blocks;
++
++	gdp = ext3_get_group_desc(ac->ac_sb, group, NULL);
++	if (!gdp)
++		return 0;
++	free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
++	if (free_blocks == 0)
++		return 0;
++
++	/* someone wants this block very much */
++	if ((ac->ac_g_flags & 1) && ac->ac_g_group == group)
++		return 1;
++
++	/* FIXME: I'd like to take fragmentation into account here */
++	if (cr == 0) {
++		if (free_blocks >= ac->ac_g_len >> 1)
++			return 1;
++	} else if (cr == 1) {
++		if (free_blocks >= ac->ac_g_len >> 2)
++			return 1;
++	} else if (cr == 2) {
++		return 1;
++	} else {
++		BUG();
++	}
++	return 0;
++}
++
++int ext3_mb_new_blocks(handle_t *handle, struct inode *inode,
++		       unsigned long goal, int *len, int flags, int *errp)
++{
++	struct buffer_head *bitmap_bh = NULL;
++	struct ext3_allocation_context ac;
++	int i, group, block, cr, err = 0;
++	struct ext3_group_desc *gdp;
++	struct ext3_super_block *es;
++	struct buffer_head *gdp_bh;
++	struct ext3_sb_info *sbi;
++	struct super_block *sb;
++	struct ext3_buddy e3b;
++
++	J_ASSERT(len != NULL);
++	J_ASSERT(*len > 0);
++
++	sb = inode->i_sb;
++	if (!sb) {
++		printk("ext3_mb_new_blocks: nonexistent device");
++		return 0;
++	}
++
++	ext3_mb_poll_new_transaction(sb, handle);
++
++	sbi = EXT3_SB(sb);
++	es = EXT3_SB(sb)->s_es;
++
++	if (!(flags & 2)) {
++		/* someone is asking for non-reserved blocks */
++		BUG_ON(*len > 1);
++		err = ext3_mb_reserve_blocks(sb, 1);
++		if (err) {
++			*errp = err;
++			return 0;
++		}
++	}
++
++	/*
++	 * Check quota for allocation of these blocks.
++	 */
++	while (*len && DQUOT_ALLOC_BLOCK(inode, *len))
++		*len -= 1;
++	if (*len == 0) {
++		*errp = -EDQUOT;
++		block = 0;
++		goto out;
++	}
++
++	/* start searching from the goal */
++	if (goal < le32_to_cpu(es->s_first_data_block) ||
++	    goal >= le32_to_cpu(es->s_blocks_count))
++		goal = le32_to_cpu(es->s_first_data_block);
++	group = (goal - le32_to_cpu(es->s_first_data_block)) /
++		EXT3_BLOCKS_PER_GROUP(sb);
++	block = ((goal - le32_to_cpu(es->s_first_data_block)) %
++		 EXT3_BLOCKS_PER_GROUP(sb));
++
++	/* set up allocation goals */
++	ac.ac_b_group = ac.ac_b_start = ac.ac_b_len = 0;
++	ac.ac_status = 0;
++	ac.ac_groups_scanned = 0;
++	ac.ac_sb = inode->i_sb;
++	ac.ac_g_group = group;
++	ac.ac_g_start = block;
++	ac.ac_g_len = *len;
++	ac.ac_g_flags = flags;
++
++	/* loop over the groups */
++	for (cr = 0; cr < 3 && ac.ac_status != AC_STATUS_FOUND; cr++) {
++		for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) {
++			if (group == EXT3_SB(sb)->s_groups_count)
++				group = 0;
++
++			/* check whether the group is good for our criteria */
++			if (!mb_good_group(&ac, group, cr))
++				continue;
++
++			err = ext3_mb_load_desc(ac.ac_sb, group, &e3b);
++			if (err)
++				goto out_err;
++
++			ext3_lock_group(sb, group);
++			if (!mb_good_group(&ac, group, cr)) {
++				/* someone did an allocation from this group */
++				ext3_unlock_group(sb, group);
++				ext3_mb_release_desc(&e3b);
++				continue;
++			}
++
++			err = ext3_mb_new_in_group(&ac, &e3b, group);
++			ext3_unlock_group(sb, group);
++			if (ac.ac_status == AC_STATUS_FOUND)
++				ext3_mb_dirty_buddy(&e3b);
++			ext3_mb_release_desc(&e3b);
++			if (err)
++				goto out_err;
++			if (ac.ac_status == AC_STATUS_FOUND)
++				break;
++		}
++	}
++
++	if (ac.ac_status != AC_STATUS_FOUND) {
++		/* unfortunately, we can't satisfy this request */
++		J_ASSERT(ac.ac_b_len == 0);
++		DQUOT_FREE_BLOCK(inode, *len);
++		*errp = -ENOSPC;
++		block = 0;
++		goto out;
++	}
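The goal arithmetic above is a plain divide/modulo against s_first_data_block and EXT3_BLOCKS_PER_GROUP(), and the allocator inverts it once an extent is found (the block computation below). A quick standalone check of both directions under made-up geometry (32768 blocks per group and first data block 0, as on a 4 KB-block filesystem):

#include <stdio.h>

int main(void)
{
	unsigned long first_data_block = 0;	/* assumed: 4 KB-block fs */
	unsigned long blocks_per_group = 32768;	/* assumed geometry */
	unsigned long goal = 100000;

	unsigned long group = (goal - first_data_block) / blocks_per_group;
	unsigned long offset = (goal - first_data_block) % blocks_per_group;

	printf("goal %lu -> group %lu, offset %lu\n", goal, group, offset);
	printf("inverse: %lu\n",
	       group * blocks_per_group + offset + first_data_block);
	return 0;
}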
++
++	/* good news - free block(s) have been found. now it's time
++	 * to mark the block(s) in the good old journaled bitmap */
++	block = ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb)
++		+ ac.ac_b_start + le32_to_cpu(es->s_first_data_block);
++
++	/* we made a decision, now mark the found blocks in the good old
++	 * bitmap to be journaled */
++
++	ext3_debug("using block group %d\n", ac.ac_b_group);
++
++	bitmap_bh = read_block_bitmap_bh(sb, ac.ac_b_group);
++	if (!bitmap_bh) {
++		*errp = -EIO;
++		goto out_err;
++	}
++
++	err = ext3_journal_get_write_access(handle, bitmap_bh);
++	if (err) {
++		*errp = err;
++		goto out_err;
++	}
++
++	gdp = ext3_get_group_desc(sb, ac.ac_b_group, &gdp_bh);
++	if (!gdp) {
++		*errp = -EIO;
++		goto out_err;
++	}
++
++	err = ext3_journal_get_write_access(handle, gdp_bh);
++	if (err)
++		goto out_err;
++
++	if (block == le32_to_cpu(gdp->bg_block_bitmap) ||
++	    block == le32_to_cpu(gdp->bg_inode_bitmap) ||
++	    in_range(block, le32_to_cpu(gdp->bg_inode_table),
++		     EXT3_SB(sb)->s_itb_per_group))
++		ext3_error(sb, "ext3_new_block",
++			   "Allocating block in system zone - "
++			   "block = %u", block);
++#if 0
++	for (i = 0; i < ac.ac_b_len; i++)
++		J_ASSERT(!test_bit(ac.ac_b_start + i, bitmap_bh->b_data));
++#endif
++	mb_set_bits(bitmap_bh->b_data, ac.ac_b_start, ac.ac_b_len);
++
++	ext3_lock_group(sb, ac.ac_b_group);
++	gdp->bg_free_blocks_count =
++		cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) -
++			    ac.ac_b_len);
++	ext3_unlock_group(sb, ac.ac_b_group);
++	spin_lock(&sbi->s_md_lock);
++	es->s_free_blocks_count =
++		cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - ac.ac_b_len);
++	spin_unlock(&sbi->s_md_lock);
++
++	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
++	if (err)
++		goto out_err;
++	err = ext3_journal_dirty_metadata(handle, gdp_bh);
++	if (err)
++		goto out_err;
++
++	sb->s_dirt = 1;
++	*errp = 0;
++
++	/* drop non-allocated, but dquot'ed blocks */
++	J_ASSERT(*len >= ac.ac_b_len);
++	DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_len);
++
++	*len = ac.ac_b_len;
++	J_ASSERT(block != 0);
++	goto out;
++
++out_err:
++	/* if we've already allocated something, roll it back */
++	if (ac.ac_status == AC_STATUS_FOUND) {
++		/* FIXME: free blocks here */
++	}
++
++	DQUOT_FREE_BLOCK(inode, *len);
++	*errp = err;
++	block = 0;
++out:
++	if (!(flags & 2)) {
++		/* the block wasn't reserved before and we reserved it
++		 * at the beginning of allocation. it doesn't matter
++		 * whether we allocated anything or we failed: it is time
++		 * to release the reservation. NOTE: because I expect
++		 * multiblock requests to come from the delayed allocation
++		 * path only, this is always a single block */
++		ext3_mb_release_blocks(sb, 1);
++	}
++	return block;
++}
++
++int ext3_mb_generate_buddy(struct super_block *sb, int group)
++{
++	struct buffer_head *bh;
++	int i, err, count = 0;
++	struct ext3_buddy e3b;
++
++	err = ext3_mb_load_desc(sb, group, &e3b);
++	if (err)
++		goto out;
++	memset(e3b.bd_bh->b_data, 0, sb->s_blocksize);
++	memset(e3b.bd_bh2->b_data, 0, sb->s_blocksize);
++
++	bh = read_block_bitmap_bh(sb, group);
++	if (bh == NULL) {
++		err = -EIO;
++		goto out2;
++	}
++
++	/* loop over the blocks, and create buddies for free ones */
++	for (i = 0; i < sb->s_blocksize * 8; i++) {
++		if (!test_bit(i, (void *) bh->b_data)) {
++			mb_free_blocks(&e3b, i, 1);
++			count++;
++		}
++	}
++	mb_check_buddy(&e3b);
++	ext3_mb_dirty_buddy(&e3b);
++
++out2:
++	ext3_mb_release_desc(&e3b);
++out:
++	return err;
++}
++
++#define MB_CREDITS	\
++	(EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS)
++
++int ext3_mb_init_backend(struct super_block *sb)
++{
++	struct inode *root = sb->s_root->d_inode;
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	struct dentry *db;
++	tid_t target;
++	int err, i;
++
++	sbi->s_buddy_blocks = kmalloc(sizeof(struct ext3_buddy_group_blocks) *
++				      sbi->s_groups_count, GFP_KERNEL);
++	if (sbi->s_buddy_blocks == NULL) {
++		printk("can't allocate mem for buddy maps\n");
++		return -ENOMEM;
++	}
++	memset(sbi->s_buddy_blocks, 0,
++	       sizeof(struct ext3_buddy_group_blocks) * sbi->s_groups_count);
++	sbi->s_buddy = NULL;
++
++	down(&root->i_sem);
++	db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root,
++			    strlen(EXT3_BUDDY_FILE));
++	if (IS_ERR(db)) {
++		err = PTR_ERR(db);
++		printk("can't lookup buddy file: %d\n", err);
++		goto out;
++	}
++
++	if (db->d_inode != NULL) {
++		sbi->s_buddy = igrab(db->d_inode);
++		goto map;
++	}
++
++	err = ext3_create(root, db, S_IFREG, NULL);
++	if (err) {
++		printk("error while creating buddy file: %d\n", err);
++	} else {
++		sbi->s_buddy = igrab(db->d_inode);
++	}
++
++map:
++	for (i = 0; i < sbi->s_groups_count; i++) {
++		struct buffer_head *bh = NULL;
++		handle_t *handle;
++
++		handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS);
++		if (IS_ERR(handle)) {
++			err = PTR_ERR(handle);
++			goto out2;
++		}
++
++		/* allocate a block for the bitmap */
++		bh = ext3_getblk(handle, sbi->s_buddy, i * 2, 1, &err);
++		if (bh == NULL) {
++			printk("can't get block for buddy bitmap: %d\n", err);
++			goto out2;
++		}
++		sbi->s_buddy_blocks[i].bb_bitmap = bh->b_blocknr;
++		brelse(bh);
++
++		/* allocate a block for the buddy */
++		bh = ext3_getblk(handle, sbi->s_buddy, i * 2 + 1, 1, &err);
++		if (bh == NULL) {
++			printk("can't get block for buddy: %d\n", err);
++			goto out2;
++		}
++		sbi->s_buddy_blocks[i].bb_buddy = bh->b_blocknr;
++		brelse(bh);
++		ext3_journal_stop(handle, sbi->s_buddy);
++		spin_lock_init(&sbi->s_buddy_blocks[i].bb_lock);
++		sbi->s_buddy_blocks[i].bb_md_cur = NULL;
++		sbi->s_buddy_blocks[i].bb_tid = 0;
++	}
++
++	if ((target = log_start_commit(sbi->s_journal, NULL)))
++		log_wait_commit(sbi->s_journal, target);
++
++out2:
++	dput(db);
++out:
++	up(&root->i_sem);
++	return err;
++}
++
++int ext3_mb_release(struct super_block *sb)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++
++	if (!test_opt(sb, MBALLOC))
++		return 0;
++
++	/* release freed, non-committed blocks */
++	spin_lock(&sbi->s_md_lock);
++	list_splice_init(&sbi->s_closed_transaction,
++			 &sbi->s_committed_transaction);
++	list_splice_init(&sbi->s_active_transaction,
++			 &sbi->s_committed_transaction);
++	spin_unlock(&sbi->s_md_lock);
++	ext3_mb_free_committed_blocks(sb);
++
++	if (sbi->s_buddy_blocks)
++		kfree(sbi->s_buddy_blocks);
++	if (sbi->s_buddy)
++		iput(sbi->s_buddy);
++	if (sbi->s_blocks_reserved)
++		printk("ext3-fs: %ld blocks being reserved at umount!\n",
++		       sbi->s_blocks_reserved);
++	return 0;
++}
++
++int ext3_mb_init(struct super_block *sb)
++{
++	struct ext3_super_block *es;
++	int i;
++
++	if (!test_opt(sb, MBALLOC))
++		return 0;
++
++	/* init the file for buddy data */
++	clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
++	ext3_mb_init_backend(sb);
++
++	es = EXT3_SB(sb)->s_es;
++	for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
++		ext3_mb_generate_buddy(sb, i);
++	spin_lock_init(&EXT3_SB(sb)->s_reserve_lock);
++	spin_lock_init(&EXT3_SB(sb)->s_md_lock);
++	INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction);
++	INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction);
++	INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction);
++	set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
++	printk("EXT3-fs: mballoc enabled\n");
++	return 0;
++}
++
++void ext3_mb_free_committed_blocks(struct super_block *sb)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	int err, i, count = 0, count2 = 0;
++	struct ext3_free_metadata *md;
++	struct ext3_buddy e3b;
++
++	if (list_empty(&sbi->s_committed_transaction))
++		return;
++
++	/* there are committed blocks to be freed yet */
++	do {
++		/* get the next array of blocks */
++		md = NULL;
++		spin_lock(&sbi->s_md_lock);
++		if (!list_empty(&sbi->s_committed_transaction)) {
++			md = list_entry(sbi->s_committed_transaction.next,
++					struct ext3_free_metadata, list);
++			list_del(&md->list);
++		}
++		spin_unlock(&sbi->s_md_lock);
++
++		if (md == NULL)
++			break;
++
++		mb_debug("gonna free %u blocks in group %u (0x%p):",
++			 md->num, md->group, md);
++
++		err = ext3_mb_load_desc(sb, md->group, &e3b);
++		BUG_ON(err != 0);
++
++		/* there are blocks to put in buddy to make them really free */
++		count += md->num;
++		count2++;
++		ext3_lock_group(sb, md->group);
++		for (i = 0; i < md->num; i++) {
++			mb_debug(" %u", md->blocks[i]);
++			mb_free_blocks(&e3b, md->blocks[i], 1);
++		}
++		mb_debug("\n");
++		ext3_unlock_group(sb, md->group);
++
++		kfree(md);
++		ext3_mb_dirty_buddy(&e3b);
++		ext3_mb_release_desc(&e3b);
++
++	} while (md);
++	mb_debug("freed %u blocks in %u structures\n", count, count2);
++}
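The three lists rotated by ext3_mb_poll_new_transaction() below implement a two-transaction quarantine: blocks freed under tid N age active -> closed when N+1 starts and closed -> committed when N+2 starts, and only then does ext3_mb_free_committed_blocks() hand them back to the buddy maps. A toy model with plain counters standing in for the ext3_free_metadata records:

#include <stdio.h>

int main(void)
{
	int active = 0, closed = 0, committed = 0;
	unsigned int tid;

	for (tid = 1; tid <= 4; tid++) {
		/* ext3_mb_poll_new_transaction(): rotate on a new tid */
		committed += closed;	/* closed -> committed (list_splice) */
		closed = active;	/* active -> closed (list_splice) */
		active = 0;

		/* ext3_mb_free_committed_blocks(): now safe to reuse */
		printf("tid %u starts: %d blocks become reusable\n",
		       tid, committed);
		committed = 0;

		active += 10;	/* pretend 10 metadata blocks freed under tid */
	}
	return 0;
}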
++
++void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++
++	if (sbi->s_last_transaction == handle->h_transaction->t_tid)
++		return;
++
++	/* a new transaction! time to close the last one and free blocks
++	 * for the committed transaction. we know that only one transaction
++	 * can be active, so the previous transaction may still be being
++	 * logged, and the transaction before the previous one is known to
++	 * be already logged. this means that now we may free blocks freed
++	 * in all transactions before the previous one. hope I'm clear
++	 * enough ... */
++
++	spin_lock(&sbi->s_md_lock);
++	if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
++		mb_debug("new transaction %lu, old %lu\n",
++			 (unsigned long) handle->h_transaction->t_tid,
++			 (unsigned long) sbi->s_last_transaction);
++		list_splice_init(&sbi->s_closed_transaction,
++				 &sbi->s_committed_transaction);
++		list_splice_init(&sbi->s_active_transaction,
++				 &sbi->s_closed_transaction);
++		sbi->s_last_transaction = handle->h_transaction->t_tid;
++	}
++	spin_unlock(&sbi->s_md_lock);
++
++	ext3_mb_free_committed_blocks(sb);
++}
++
++int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b,
++			  int group, int block, int count)
++{
++	struct ext3_buddy_group_blocks *db = e3b->bd_bd;
++	struct super_block *sb = e3b->bd_sb;
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	struct ext3_free_metadata *md;
++	int i;
++
++	ext3_lock_group(sb, group);
++	for (i = 0; i < count; i++) {
++		md = db->bb_md_cur;
++		if (md && db->bb_tid != handle->h_transaction->t_tid) {
++			db->bb_md_cur = NULL;
++			md = NULL;
++		}
++
++		if (md == NULL) {
++			ext3_unlock_group(sb, group);
++			md = kmalloc(sizeof(*md), GFP_KERNEL);
++			if (md == NULL)
++				return -ENOMEM;
++			md->num = 0;
++			md->group = group;
++
++			ext3_lock_group(sb, group);
++			if (db->bb_md_cur == NULL) {
++				spin_lock(&sbi->s_md_lock);
++				list_add(&md->list, &sbi->s_active_transaction);
++				spin_unlock(&sbi->s_md_lock);
++				db->bb_md_cur = md;
++				db->bb_tid = handle->h_transaction->t_tid;
++				mb_debug("new md 0x%p for group %u\n",
++					 md, md->group);
++			} else {
++				kfree(md);
++				md = db->bb_md_cur;
++			}
++		}
++
++		BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS);
++		md->blocks[md->num] = block + i;
++		md->num++;
++		if (md->num == EXT3_BB_MAX_BLOCKS) {
++			/* no more space, put the full container on the sb's list */
++			db->bb_md_cur = NULL;
++		}
++	}
++	ext3_unlock_group(sb, group);
++	return 0;
++}
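ext3_mb_free_metadata() batches freed block numbers into fixed-size ext3_free_metadata containers (EXT3_BB_MAX_BLOCKS entries each); a full container is left on the active-transaction list and the per-group cursor is dropped, so the next free starts a fresh one. The packing logic in miniature (userspace toy, no locking or tid checks; the real code frees the records in ext3_mb_free_committed_blocks(), the toy simply leaks them):

#include <stdio.h>
#include <stdlib.h>

#define MAX_BLOCKS 30	/* EXT3_BB_MAX_BLOCKS */

struct toy_md {
	unsigned short num;
	unsigned short blocks[MAX_BLOCKS];
};

static struct toy_md *cur;
static int records;

static void toy_free_metadata(int block)
{
	if (cur == NULL) {		/* start a fresh container */
		cur = calloc(1, sizeof(*cur));
		records++;
	}
	cur->blocks[cur->num++] = (unsigned short) block;
	if (cur->num == MAX_BLOCKS)
		cur = NULL;		/* full: leave it for the commit sweep */
}

int main(void)
{
	int b;

	for (b = 1000; b < 1075; b++)
		toy_free_metadata(b);
	printf("75 blocks packed into %d containers\n", records);	/* 3 */
	return 0;
}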
++
++void ext3_mb_free_blocks(handle_t *handle, struct inode *inode,
++			 unsigned long block, unsigned long count, int metadata)
++{
++	struct buffer_head *bitmap_bh = NULL;
++	struct ext3_group_desc *gdp;
++	struct ext3_super_block *es;
++	unsigned long bit, overflow;
++	struct buffer_head *gd_bh;
++	unsigned long block_group;
++	struct ext3_sb_info *sbi;
++	struct super_block *sb;
++	struct ext3_buddy e3b;
++	int err = 0, ret;
++
++	sb = inode->i_sb;
++	if (!sb) {
++		printk ("ext3_free_blocks: nonexistent device");
++		return;
++	}
++
++	ext3_mb_poll_new_transaction(sb, handle);
++
++	sbi = EXT3_SB(sb);
++	es = EXT3_SB(sb)->s_es;
++	if (block < le32_to_cpu(es->s_first_data_block) ||
++	    block + count < block ||
++	    block + count > le32_to_cpu(es->s_blocks_count)) {
++		ext3_error (sb, "ext3_free_blocks",
++			    "Freeing blocks not in datazone - "
++			    "block = %lu, count = %lu", block, count);
++		goto error_return;
++	}
++
++	ext3_debug("freeing block %lu\n", block);
++
++do_more:
++	overflow = 0;
++	block_group = (block - le32_to_cpu(es->s_first_data_block)) /
++		      EXT3_BLOCKS_PER_GROUP(sb);
++	bit = (block - le32_to_cpu(es->s_first_data_block)) %
++	      EXT3_BLOCKS_PER_GROUP(sb);
++	/*
++	 * Check to see if we are freeing blocks across a group
++	 * boundary.
++	 */
++	if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
++		overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
++		count -= overflow;
++	}
++	bitmap_bh = read_block_bitmap_bh(sb, block_group);
++	if (!bitmap_bh)
++		goto error_return;
++	gdp = ext3_get_group_desc (sb, block_group, &gd_bh);
++	if (!gdp)
++		goto error_return;
++
++	if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) ||
++	    in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) ||
++	    in_range (block, le32_to_cpu(gdp->bg_inode_table),
++		      EXT3_SB(sb)->s_itb_per_group) ||
++	    in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table),
++		      EXT3_SB(sb)->s_itb_per_group))
++		ext3_error (sb, "ext3_free_blocks",
++			    "Freeing blocks in system zones - "
++			    "Block = %lu, count = %lu",
++			    block, count);
++
++	BUFFER_TRACE(bitmap_bh, "getting write access");
++	err = ext3_journal_get_write_access(handle, bitmap_bh);
++	if (err)
++		goto error_return;
++
++	/*
++	 * We are about to modify some metadata.  Call the journal APIs
++	 * to unshare ->b_data if a currently-committing transaction is
++	 * using it
++	 */
++	BUFFER_TRACE(gd_bh, "get_write_access");
++	err = ext3_journal_get_write_access(handle, gd_bh);
++	if (err)
++		goto error_return;
++
++	err = ext3_mb_load_desc(sb, block_group, &e3b);
++	if (err)
++		goto error_return;
++
++	if (metadata) {
++		/* the blocks being freed are metadata. they shouldn't
++		 * be used until this transaction is committed */
++		ext3_mb_free_metadata(handle, &e3b, block_group, bit, count);
++	} else {
++		ext3_lock_group(sb, block_group);
++		mb_free_blocks(&e3b, bit, count);
++		gdp->bg_free_blocks_count =
++			cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
++		ext3_unlock_group(sb, block_group);
++		spin_lock(&sbi->s_md_lock);
++		es->s_free_blocks_count =
++			cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) + count);
++		spin_unlock(&sbi->s_md_lock);
++	}
++
++	ext3_mb_dirty_buddy(&e3b);
++	ext3_mb_release_desc(&e3b);
++
++	/* FIXME: the undo logic will be implemented later, and another way */
++	mb_clear_bits(bitmap_bh->b_data, bit, count);
++	DQUOT_FREE_BLOCK(inode, count);
++
++	/* We dirtied the bitmap block */
++	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
++	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
++
++	/* And the group descriptor block */
++	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
++	ret = ext3_journal_dirty_metadata(handle, gd_bh);
++	if (!err) err = ret;
++
++	if (overflow && !err) {
++		block += count;
++		count = overflow;
++		goto do_more;
++	}
++	sb->s_dirt = 1;
++error_return:
++	ext3_std_error(sb, err);
++	return;
++}
++
++int ext3_mb_reserve_blocks(struct super_block *sb, int blocks)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	struct ext3_super_block *es;
++	int free, ret = -ENOSPC;
++
++	BUG_ON(blocks < 0);
++	es = EXT3_SB(sb)->s_es;
++	spin_lock(&sbi->s_reserve_lock);
++	free = le32_to_cpu(es->s_free_blocks_count);
++	if (blocks <= free - sbi->s_blocks_reserved) {
++		sbi->s_blocks_reserved += blocks;
++		ret = 0;
++	}
++	spin_unlock(&sbi->s_reserve_lock);
++	return ret;
++}
++
++void ext3_mb_release_blocks(struct super_block *sb, int blocks)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++
++	BUG_ON(blocks < 0);
++	spin_lock(&sbi->s_reserve_lock);
++	sbi->s_blocks_reserved -= blocks;
++	if (sbi->s_blocks_reserved < 0) {
++		printk("EXT3-fs: reserve leak %ld\n", sbi->s_blocks_reserved);
++		sbi->s_blocks_reserved = 0;
++	}
++	spin_unlock(&sbi->s_reserve_lock);
++}
++
++int ext3_new_block(handle_t *handle, struct
inode *inode, ++ unsigned long goal, u32 *pc, u32 *pb, int *errp) ++{ ++ int ret, len; ++ ++ if (!test_opt(inode->i_sb, MBALLOC)) { ++ ret = ext3_new_block_old(handle, inode, goal, pc, pb, errp); ++ goto out; ++ } ++ len = 1; ++ ret = ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); ++out: ++ return ret; ++} ++ ++ ++void ext3_free_blocks(handle_t *handle, struct inode * inode, ++ unsigned long block, unsigned long count, int metadata) ++{ ++ if (!test_opt(inode->i_sb, MBALLOC)) ++ ext3_free_blocks_old(handle, inode, block, count); ++ else ++ ext3_mb_free_blocks(handle, inode, block, count, metadata); ++ return; ++} ++ +Index: linux-2.4.24/fs/ext3/super.c +=================================================================== +--- linux-2.4.24.orig/fs/ext3/super.c 2004-08-06 03:59:09.000000000 +0400 ++++ linux-2.4.24/fs/ext3/super.c 2004-08-06 03:59:09.000000000 +0400 +@@ -529,6 +529,7 @@ + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + ++ ext3_mb_release(sb); + J_ASSERT(sbi->s_delete_inodes == 0); + ext3_ext_release(sb); + ext3_xattr_put_super(sb); +@@ -781,6 +782,8 @@ + else if (want_numeric(value, "journal", inum)) + return 0; + } ++ else if (!strcmp (this_char, "mballoc")) ++ set_opt (*mount_options, MBALLOC); + else if (!strcmp (this_char, "noload")) + set_opt (*mount_options, NOLOAD); + else if (!strcmp (this_char, "data")) { +@@ -1406,6 +1409,7 @@ + "writeback"); + + ext3_ext_init(sb); ++ ext3_mb_init(sb); + + if (test_opt(sb, PDIROPS)) { + printk (KERN_INFO "EXT3-fs: mounted filesystem with parallel dirops\n"); +Index: linux-2.4.24/fs/ext3/Makefile +=================================================================== +--- linux-2.4.24.orig/fs/ext3/Makefile 2004-08-06 03:59:07.000000000 +0400 ++++ linux-2.4.24/fs/ext3/Makefile 2004-08-06 03:59:09.000000000 +0400 +@@ -13,7 +13,7 @@ + + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o \ +- xattr_trusted.o extents.o extents-in-ea.o ++ xattr_trusted.o extents.o extents-in-ea.o mballoc.o + export-objs += extents.o + + obj-m := $(O_TARGET) +Index: linux-2.4.24/fs/ext3/balloc.c +=================================================================== +--- linux-2.4.24.orig/fs/ext3/balloc.c 2004-08-06 00:42:23.000000000 +0400 ++++ linux-2.4.24/fs/ext3/balloc.c 2004-08-06 03:59:09.000000000 +0400 +@@ -203,8 +203,7 @@ + * differentiating between a group for which we have never performed a bitmap + * IO request, and a group for which the last bitmap read request failed. + */ +-static inline int load_block_bitmap (struct super_block * sb, +- unsigned int block_group) ++int load_block_bitmap (struct super_block * sb, unsigned int block_group) + { + int slot; + +@@ -253,8 +252,8 @@ + } + + /* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks (handle_t *handle, struct inode * inode, +- unsigned long block, unsigned long count) ++void ext3_free_blocks_old (handle_t *handle, struct inode * inode, ++ unsigned long block, unsigned long count) + { + struct buffer_head *bitmap_bh; + struct buffer_head *gd_bh; +@@ -528,9 +527,9 @@ + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. 
+ */ +-int ext3_new_block (handle_t *handle, struct inode * inode, +- unsigned long goal, u32 * prealloc_count, +- u32 * prealloc_block, int * errp) ++int ext3_new_block_old (handle_t *handle, struct inode * inode, ++ unsigned long goal, u32 * prealloc_count, ++ u32 * prealloc_block, int * errp) + { + struct buffer_head * bh, *bhtmp; + struct buffer_head * bh2; +Index: linux-2.4.24/fs/ext3/namei.c +=================================================================== +--- linux-2.4.24.orig/fs/ext3/namei.c 2004-08-06 03:59:09.000000000 +0400 ++++ linux-2.4.24/fs/ext3/namei.c 2004-08-06 03:59:09.000000000 +0400 +@@ -1944,7 +1944,7 @@ + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +-static int ext3_create (struct inode * dir, struct dentry * dentry, int mode) ++int ext3_create (struct inode * dir, struct dentry * dentry, int mode) + { + handle_t *handle; + struct inode * inode; +Index: linux-2.4.24/fs/ext3/inode.c +=================================================================== +--- linux-2.4.24.orig/fs/ext3/inode.c 2004-08-06 03:59:09.000000000 +0400 ++++ linux-2.4.24/fs/ext3/inode.c 2004-08-06 03:59:09.000000000 +0400 +@@ -254,7 +254,7 @@ + inode->u.ext3_i.i_prealloc_count = 0; + inode->u.ext3_i.i_prealloc_block = 0; + /* Writer: end */ +- ext3_free_blocks (inode, block, total); ++ ext3_free_blocks (inode, block, total, 1); + } + unlock_kernel(); + #endif +@@ -618,7 +618,7 @@ + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) +- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); + return err; + } + +@@ -722,7 +722,7 @@ + if (err == -EAGAIN) + for (i = 0; i < num; i++) + ext3_free_blocks(handle, inode, +- le32_to_cpu(where[i].key), 1); ++ le32_to_cpu(where[i].key), 1, 1); + return err; + } + +@@ -1650,7 +1650,7 @@ + } + } + +- ext3_free_blocks(handle, inode, block_to_free, count); ++ ext3_free_blocks(handle, inode, block_to_free, count, 1); + } + + /** +@@ -1821,7 +1821,7 @@ + ext3_journal_test_restart(handle, inode); + } + +- ext3_free_blocks(handle, inode, nr, 1); ++ ext3_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* +Index: linux-2.4.24/fs/ext3/extents.c +=================================================================== +--- linux-2.4.24.orig/fs/ext3/extents.c 2004-08-06 03:59:01.000000000 +0400 ++++ linux-2.4.24/fs/ext3/extents.c 2004-08-06 03:59:09.000000000 +0400 +@@ -741,7 +741,7 @@ + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; +- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); + } + } + kfree(ablocks); +@@ -1385,7 +1385,7 @@ + path->p_idx->e_leaf); + bh = sb_get_hash_table(tree->inode->i_sb, path->p_idx->e_leaf); + ext3_forget(handle, 1, tree->inode, bh, path->p_idx->e_leaf); +- ext3_free_blocks(handle, tree->inode, path->p_idx->e_leaf, 1); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->e_leaf, 1, 1); + return err; + } + +@@ -1842,10 +1842,12 @@ + int needed = ext3_remove_blocks_credits(tree, ex, from, to); + handle_t *handle = ext3_journal_start(tree->inode, needed); + struct buffer_head *bh; +- int i; ++ int i, metadata = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); ++ if (S_ISDIR(tree->inode->i_mode)) ++ metadata = 1; + if (from >= ex->e_block && to == ex->e_block + ex->e_num - 1) { + /* tail removal */ + unsigned long num, start; +@@ -1857,7 +1859,7 @@ + bh = sb_get_hash_table(tree->inode->i_sb, start + i); + 
ext3_forget(handle, 0, tree->inode, bh, start + i); + } +- ext3_free_blocks(handle, tree->inode, start, num); ++ ext3_free_blocks(handle, tree->inode, start, num, metadata); + } else if (from == ex->e_block && to <= ex->e_block + ex->e_num - 1) { + printk("strange request: removal %lu-%lu from %u:%u\n", + from, to, ex->e_block, ex->e_num); +Index: linux-2.4.24/fs/ext3/xattr.c +=================================================================== +--- linux-2.4.24.orig/fs/ext3/xattr.c 2004-08-06 03:59:08.000000000 +0400 ++++ linux-2.4.24/fs/ext3/xattr.c 2004-08-06 03:59:09.000000000 +0400 +@@ -174,7 +174,7 @@ + ext3_xattr_free_block(handle_t *handle, struct inode * inode, + unsigned long block) + { +- ext3_free_blocks(handle, inode, block, 1); ++ ext3_free_blocks(handle, inode, block, 1, 1); + inode->i_blocks -= inode->i_sb->s_blocksize >> 9; + } + +@@ -182,7 +182,7 @@ + # define ext3_xattr_quota_free(inode) \ + DQUOT_FREE_BLOCK(inode, 1) + # define ext3_xattr_free_block(handle, inode, block) \ +- ext3_free_blocks(handle, inode, block, 1) ++ ext3_free_blocks(handle, inode, block, 1, 1) + #endif + + #if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18) +Index: linux-2.4.24/include/linux/ext3_fs.h +=================================================================== +--- linux-2.4.24.orig/include/linux/ext3_fs.h 2004-08-06 03:59:09.000000000 +0400 ++++ linux-2.4.24/include/linux/ext3_fs.h 2004-08-06 03:59:09.000000000 +0400 +@@ -343,6 +343,7 @@ + #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ + #define EXT3_MOUNT_EXTENTS 0x40000 /* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x80000 /* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x100000/* buddy allocation support */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -677,7 +678,7 @@ + extern int ext3_new_block (handle_t *, struct inode *, unsigned long, + __u32 *, __u32 *, int *); + extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, +- unsigned long); ++ unsigned long, int); + extern unsigned long ext3_count_free_blocks (struct super_block *); + extern void ext3_check_blocks_bitmap (struct super_block *); + extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, +Index: linux-2.4.24/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-2.4.24.orig/include/linux/ext3_fs_sb.h 2004-08-06 03:59:09.000000000 +0400 ++++ linux-2.4.24/include/linux/ext3_fs_sb.h 2004-08-06 04:01:55.000000000 +0400 +@@ -19,6 +19,7 @@ + #ifdef __KERNEL__ + #include + #include ++#include + #endif + + /* +@@ -31,6 +32,25 @@ + + #define EXT3_DELETE_THREAD + ++#define EXT3_BB_MAX_BLOCKS 30 ++struct ext3_free_metadata { ++ unsigned short group; ++ unsigned short num; ++ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; ++ struct list_head list; ++}; ++ ++#define EXT3_BB_MAX_ORDER 14 ++ ++struct ext3_buddy_group_blocks { ++ unsigned long bb_bitmap; ++ unsigned long bb_buddy; ++ spinlock_t bb_lock; ++ unsigned bb_counters[EXT3_BB_MAX_ORDER]; ++ struct ext3_free_metadata *bb_md_cur; ++ unsigned long bb_tid; ++}; ++ + /* + * third extended-fs super-block data in memory + */ +@@ -87,6 +107,17 @@ + wait_queue_head_t s_delete_waiter_queue; + #endif + u32 s_mdsnum; ++ ++ /* for buddy allocator */ ++ struct ext3_buddy_group_blocks *s_buddy_blocks; ++ struct inode *s_buddy; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct 
list_head s_committed_transaction;
++	spinlock_t s_md_lock;
++	unsigned int s_last_transaction;
+ };
+ 
+ #endif	/* _LINUX_EXT3_FS_SB */
+Index: linux-2.4.24/include/asm-i386/bitops.h
+===================================================================
+--- linux-2.4.24.orig/include/asm-i386/bitops.h	2004-08-06 01:43:20.000000000 +0400
++++ linux-2.4.24/include/asm-i386/bitops.h	2004-08-06 03:59:09.000000000 +0400
+@@ -352,6 +352,67 @@
+ }
+ 
+ /**
++ * find_first_bit - find the first set bit in a memory region
++ * @addr: The address to start the search at
++ * @size: The maximum size to search
++ *
++ * Returns the bit-number of the first set bit, not the number of the byte
++ * containing a bit.
++ */
++static __inline__ int find_first_bit(const unsigned long *addr, unsigned size)
++{
++	int d0, d1;
++	int res;
++
++	/* This looks at memory. Mark it volatile to tell gcc not to move it around */
++	__asm__ __volatile__(
++		"xorl %%eax,%%eax\n\t"
++		"repe; scasl\n\t"
++		"jz 1f\n\t"
++		"leal -4(%%edi),%%edi\n\t"
++		"bsfl (%%edi),%%eax\n"
++		"1:\tsubl %%ebx,%%edi\n\t"
++		"shll $3,%%edi\n\t"
++		"addl %%edi,%%eax"
++		:"=a" (res), "=&c" (d0), "=&D" (d1)
++		:"1" ((size + 31) >> 5), "2" (addr), "b" (addr) : "memory");
++	return res;
++}
++
++/**
++ * find_next_bit - find the next set bit in a memory region
++ * @addr: The address to base the search on
++ * @offset: The bitnumber to start searching at
++ * @size: The maximum size to search
++ */
++static __inline__ int find_next_bit(const unsigned long *addr, int size, int offset)
++{
++	const unsigned long *p = addr + (offset >> 5);
++	int set = 0, bit = offset & 31, res;
++
++	if (bit) {
++		/*
++		 * Look for nonzero in the first 32 bits:
++		 */
++		__asm__("bsfl %1,%0\n\t"
++			"jne 1f\n\t"
++			"movl $32, %0\n"
++			"1:"
++			: "=r" (set)
++			: "r" (*p >> bit));
++		if (set < (32 - bit))
++			return set + offset;
++		set = 32 - bit;
++		p++;
++	}
++	/*
++	 * No set bit yet, search remaining full words for a bit
++	 */
++	res = find_first_bit (p, size - 32 * (p - addr));
++	return (offset + set + res);
++}
++
++/**
+  * hweightN - returns the hamming weight of a N-bit word
+  * @x: the word to weigh
+  *