Index: linux-2.6.5-sles9/fs/ext3/mballoc.c =================================================================== --- linux-2.6.5-sles9.orig/fs/ext3/mballoc.c 2003-01-30 13:24:37.000000000 +0300 +++ linux-2.6.5-sles9/fs/ext3/mballoc.c 2004-11-09 02:34:25.181340632 +0300 @@ -0,0 +1,1428 @@ +/* + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +/* + * mballoc.c contains the multiblocks allocation routines + */ + +#include <linux/config.h> +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/jbd.h> +#include <linux/ext3_fs.h> +#include <linux/ext3_jbd.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> +#include <linux/module.h> + +/* + * TODO: + * - do not scan from the beginning, try to remember first free block + * - mb_mark_used_* may allocate chunk right after splitting buddy + * - special flag to advise the allocator to look for requested + N blocks + * this may improve interaction between extents and mballoc + */ + +/* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + */ +#define MB_DEBUG__ +#ifdef MB_DEBUG +#define mb_debug(fmt,a...) printk(fmt, ##a) +#else +#define mb_debug(fmt,a...) +#endif + +/* + * where to save buddies structures between umount/mount (clean case only) + */ +#define EXT3_BUDDY_FILE ".buddy" + +/* + * max. 
number of chunks to be tracked in ext3_free_extent struct + */ +#define MB_ARR_SIZE 32 + +struct ext3_allocation_context { + struct super_block *ac_sb; + + /* search goals */ + int ac_g_group; + int ac_g_start; + int ac_g_len; + int ac_g_flags; + + /* the best found extent */ + int ac_b_group; + int ac_b_start; + int ac_b_len; + + /* number of iterations done. we have to track to limit searching */ + int ac_repeats; + int ac_groups_scanned; + int ac_status; +}; + +#define AC_STATUS_CONTINUE 1 +#define AC_STATUS_FOUND 2 + + +struct ext3_buddy { + void *bd_bitmap; + void *bd_buddy; + int bd_blkbits; + struct buffer_head *bd_bh; + struct buffer_head *bd_bh2; + struct ext3_buddy_group_blocks *bd_bd; + struct super_block *bd_sb; +}; + +struct ext3_free_extent { + int fe_start; + int fe_len; + unsigned char fe_orders[MB_ARR_SIZE]; + unsigned char fe_nums; + unsigned char fe_back; +}; + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + + +int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); +struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); +void ext3_free_blocks_old(handle_t *, struct inode *, unsigned long, unsigned long); +int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); +int ext3_mb_reserve_blocks(struct super_block *, int); +void ext3_mb_release_blocks(struct super_block *, int); +void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); +void ext3_mb_free_committed_blocks(struct super_block *); + +#define mb_correct_addr_and_bit(bit,addr) \ +{ \ + if ((unsigned) addr & 1) { \ + bit += 8; \ + addr--; \ + } \ + if ((unsigned) addr & 2) { \ + bit += 16; \ + addr--; \ + addr--; \ + } \ +} + +static inline int mb_test_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); + return test_bit(bit, addr); +} + +static inline void mb_set_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); + set_bit(bit, addr); +} + +static inline void 
mb_clear_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); + clear_bit(bit, addr); +} + +static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) +{ + int i = 1; + void *bb; + + J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy); + J_ASSERT(max != NULL); + + if (order > e3b->bd_blkbits + 1) + return NULL; + + /* at order 0 we see each particular block */ + *max = 1 << (e3b->bd_blkbits + 3); + if (order == 0) + return e3b->bd_bitmap; + + bb = e3b->bd_buddy; + *max = *max >> 1; + while (i < order) { + bb += 1 << (e3b->bd_blkbits - i); + i++; + *max = *max >> 1; + } + return bb; +} + +static int ext3_mb_load_desc(struct super_block *sb, int group, + struct ext3_buddy *e3b) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + + J_ASSERT(sbi->s_buddy_blocks[group].bb_bitmap); + J_ASSERT(sbi->s_buddy_blocks[group].bb_buddy); + + /* load bitmap */ + e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group].bb_bitmap); + if (e3b->bd_bh == NULL) { + ext3_error(sb, "ext3_mb_load_desc", + "can't get block for buddy bitmap\n"); + goto out; + } + if (!buffer_uptodate(e3b->bd_bh)) { + ll_rw_block(READ, 1, &e3b->bd_bh); + wait_on_buffer(e3b->bd_bh); + } + J_ASSERT(buffer_uptodate(e3b->bd_bh)); + + /* load buddy */ + e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group].bb_buddy); + if (e3b->bd_bh2 == NULL) { + ext3_error(sb, "ext3_mb_load_desc", + "can't get block for buddy bitmap\n"); + goto out; + } + if (!buffer_uptodate(e3b->bd_bh2)) { + ll_rw_block(READ, 1, &e3b->bd_bh2); + wait_on_buffer(e3b->bd_bh2); + } + J_ASSERT(buffer_uptodate(e3b->bd_bh2)); + + e3b->bd_bitmap = e3b->bd_bh->b_data; + e3b->bd_buddy = e3b->bd_bh2->b_data; + e3b->bd_blkbits = sb->s_blocksize_bits; + e3b->bd_bd = sbi->s_buddy_blocks + group; + e3b->bd_sb = sb; + + return 0; +out: + brelse(e3b->bd_bh); + brelse(e3b->bd_bh2); + e3b->bd_bh = NULL; + e3b->bd_bh2 = NULL; + return -EIO; +} + +static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b) +{ + mark_buffer_dirty(e3b->bd_bh); + 
mark_buffer_dirty(e3b->bd_bh2); +} + +static void ext3_mb_release_desc(struct ext3_buddy *e3b) +{ + brelse(e3b->bd_bh); + brelse(e3b->bd_bh2); +} + +#ifdef AGGRESSIVE_CHECK +static void mb_check_buddy(struct ext3_buddy *e3b) +{ + int order = e3b->bd_blkbits + 1; + int max, max2, i, j, k, count; + void *buddy, *buddy2; + + if (!test_opt(e3b->bd_sb, MBALLOC)) + return; + + while (order > 1) { + buddy = mb_find_buddy(e3b, order, &max); + J_ASSERT(buddy); + buddy2 = mb_find_buddy(e3b, order - 1, &max2); + J_ASSERT(buddy2); + J_ASSERT(buddy != buddy2); + J_ASSERT(max * 2 == max2); + + count = 0; + for (i = 0; i < max; i++) { + + if (!mb_test_bit(i, buddy)) { + /* only single bit in buddy2 may be 1 */ + if (mb_test_bit(i << 1, buddy2)) + J_ASSERT(!mb_test_bit((i<<1)+1, buddy2)); + else if (mb_test_bit((i << 1) + 1, buddy2)) + J_ASSERT(!mb_test_bit(i << 1, buddy2)); + continue; + } + + /* both bits in buddy2 must be 0 */ + J_ASSERT(!mb_test_bit(i << 1, buddy2)); + J_ASSERT(!mb_test_bit((i << 1) + 1, buddy2)); + + for (j = 0; j < (1 << order); j++) { + k = (i * (1 << order)) + j; + J_ASSERT(mb_test_bit(k, e3b->bd_bitmap)); + } + count++; + } + J_ASSERT(e3b->bd_bd->bb_counters[order] == count); + order--; + } + + buddy = mb_find_buddy(e3b, 0, &max); + for (i = 0; i < max; i++) { + if (mb_test_bit(i, buddy)) + continue; + /* check used bits only */ + for (j = 0; j < e3b->bd_blkbits + 1; j++) { + buddy2 = mb_find_buddy(e3b, j, &max2); + k = i >> j; + J_ASSERT(k < max2); + J_ASSERT(!mb_test_bit(k, buddy2)); + } + } +} +#else +#define mb_check_buddy(e3b) +#endif + +static inline void +ext3_lock_group(struct super_block *sb, int group) +{ + spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group].bb_lock); +} + +static inline void +ext3_unlock_group(struct super_block *sb, int group) +{ + spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group].bb_lock); +} + +static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) +{ + int order = 1; + void *bb; + + J_ASSERT(e3b->bd_bitmap != 
e3b->bd_buddy); + J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); + + bb = e3b->bd_buddy; + while (order <= e3b->bd_blkbits + 1) { + block = block >> 1; + if (mb_test_bit(block, bb)) { + /* this block is part of buddy of order 'order' */ + return order; + } + bb += 1 << (e3b->bd_blkbits - order); + order++; + } + return 0; +} + +static inline void mb_clear_bits(void *bm, int cur, int len) +{ + __u32 *addr; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: clear whole word at once */ + addr = bm + (cur >> 3); + *addr = 0; + cur += 32; + continue; + } + mb_clear_bit(cur, bm); + cur++; + } +} + +static inline void mb_set_bits(void *bm, int cur, int len) +{ + __u32 *addr; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: clear whole word at once */ + addr = bm + (cur >> 3); + *addr = 0xffffffff; + cur += 32; + continue; + } + mb_set_bit(cur, bm); + cur++; + } +} + +static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) +{ + int block, max, order; + void *buddy, *buddy2; + + mb_check_buddy(e3b); + while (count-- > 0) { + block = first++; + order = 0; + + J_ASSERT(!mb_test_bit(block, e3b->bd_bitmap)); + mb_set_bit(block, e3b->bd_bitmap); + e3b->bd_bd->bb_counters[order]++; + + /* start of the buddy */ + buddy = mb_find_buddy(e3b, order, &max); + + do { + block &= ~1UL; + if (!mb_test_bit(block, buddy) || + !mb_test_bit(block + 1, buddy)) + break; + + /* both the buddies are free, try to coalesce them */ + buddy2 = mb_find_buddy(e3b, order + 1, &max); + + if (!buddy2) + break; + + if (order > 0) { + /* for special purposes, we don't clear + * free bits in bitmap */ + mb_clear_bit(block, buddy); + mb_clear_bit(block + 1, buddy); + } + e3b->bd_bd->bb_counters[order]--; + e3b->bd_bd->bb_counters[order]--; + + block = block >> 1; + order++; + e3b->bd_bd->bb_counters[order]++; + + mb_set_bit(block, buddy2); + buddy = buddy2; + } while (1); + } + 
mb_check_buddy(e3b); + + return 0; +} + +/* + * returns 1 if out extent is enough to fill needed space + */ +int mb_make_backward_extent(struct ext3_free_extent *in, + struct ext3_free_extent *out, int needed) +{ + int i; + + J_ASSERT(in); + J_ASSERT(out); + J_ASSERT(in->fe_nums < MB_ARR_SIZE); + + out->fe_len = 0; + out->fe_start = in->fe_start + in->fe_len; + out->fe_nums = 0; + + /* for single-chunk extent we need not back order + * also, if an extent doesn't fill needed space + * then it makes no sense to try back order becase + * if we select this extent then it'll be use as is */ + if (in->fe_nums < 2 || in->fe_len < needed) + return 0; + + i = in->fe_nums - 1; + while (i >= 0 && out->fe_len < needed) { + out->fe_len += (1 << in->fe_orders[i]); + out->fe_start -= (1 << in->fe_orders[i]); + i--; + } + /* FIXME: in some situation fe_orders may be too small to hold + * all the buddies */ + J_ASSERT(out->fe_len >= needed); + + for (i++; i < in->fe_nums; i++) + out->fe_orders[out->fe_nums++] = in->fe_orders[i]; + J_ASSERT(out->fe_nums < MB_ARR_SIZE); + out->fe_back = 1; + + return 1; +} + +int mb_find_extent(struct ext3_buddy *e3b, int order, int block, + int needed, struct ext3_free_extent *ex) +{ + int space = needed; + int next, max, ord; + void *buddy; + + J_ASSERT(ex != NULL); + + ex->fe_nums = 0; + ex->fe_len = 0; + + buddy = mb_find_buddy(e3b, order, &max); + J_ASSERT(buddy); + J_ASSERT(block < max); + if (!mb_test_bit(block, buddy)) + goto nofree; + + if (order == 0) { + /* find actual order */ + order = mb_find_order_for_block(e3b, block); + block = block >> order; + } + + ex->fe_orders[ex->fe_nums++] = order; + ex->fe_len = 1 << order; + ex->fe_start = block << order; + ex->fe_back = 0; + + while ((space = space - (1 << order)) > 0) { + + buddy = mb_find_buddy(e3b, order, &max); + J_ASSERT(buddy); + + if (block + 1 >= max) + break; + + next = (block + 1) * (1 << order); + if (!mb_test_bit(next, e3b->bd_bitmap)) + break; + + ord = 
mb_find_order_for_block(e3b, next); + + if ((1 << ord) >= needed) { + /* we dont want to coalesce with self-enough buddies */ + break; + } + order = ord; + block = next >> order; + ex->fe_len += 1 << order; + + if (ex->fe_nums < MB_ARR_SIZE) + ex->fe_orders[ex->fe_nums++] = order; + } + +nofree: + J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); + return ex->fe_len; +} + +static int mb_mark_used_backward(struct ext3_buddy *e3b, + struct ext3_free_extent *ex, int len) +{ + int start = ex->fe_start, len0 = len; + int ord, mlen, max, cur; + void *buddy; + + start = ex->fe_start + ex->fe_len - 1; + while (len) { + ord = mb_find_order_for_block(e3b, start); + if (((start >> ord) << ord) == (start - (1 << ord) + 1) && + len >= (1 << ord)) { + /* the whole chunk may be allocated at once! */ + mlen = 1 << ord; + buddy = mb_find_buddy(e3b, ord, &max); + J_ASSERT((start >> ord) < max); + mb_clear_bit(start >> ord, buddy); + e3b->bd_bd->bb_counters[ord]--; + start -= mlen; + len -= mlen; + J_ASSERT(len >= 0); + J_ASSERT(start >= 0); + continue; + } + + /* we have to split large buddy */ + J_ASSERT(ord > 0); + buddy = mb_find_buddy(e3b, ord, &max); + mb_clear_bit(start >> ord, buddy); + e3b->bd_bd->bb_counters[ord]--; + + ord--; + cur = (start >> ord) & ~1U; + buddy = mb_find_buddy(e3b, ord, &max); + mb_set_bit(cur, buddy); + mb_set_bit(cur + 1, buddy); + e3b->bd_bd->bb_counters[ord]++; + e3b->bd_bd->bb_counters[ord]++; + } + + /* now drop all the bits in bitmap */ + mb_clear_bits(e3b->bd_bitmap, ex->fe_start + ex->fe_len - len0, len0); + + mb_check_buddy(e3b); + + return 0; +} + +static int mb_mark_used_forward(struct ext3_buddy *e3b, + struct ext3_free_extent *ex, int len) +{ + int start = ex->fe_start, len0 = len; + int ord, mlen, max, cur; + void *buddy; + + while (len) { + ord = mb_find_order_for_block(e3b, start); + + if (((start >> ord) << ord) == start && len >= (1 << ord)) { + /* the whole chunk may be allocated at once! 
*/ + mlen = 1 << ord; + buddy = mb_find_buddy(e3b, ord, &max); + J_ASSERT((start >> ord) < max); + mb_clear_bit(start >> ord, buddy); + e3b->bd_bd->bb_counters[ord]--; + start += mlen; + len -= mlen; + J_ASSERT(len >= 0); + continue; + } + + /* we have to split large buddy */ + J_ASSERT(ord > 0); + buddy = mb_find_buddy(e3b, ord, &max); + mb_clear_bit(start >> ord, buddy); + e3b->bd_bd->bb_counters[ord]--; + + ord--; + cur = (start >> ord) & ~1U; + buddy = mb_find_buddy(e3b, ord, &max); + mb_set_bit(cur, buddy); + mb_set_bit(cur + 1, buddy); + e3b->bd_bd->bb_counters[ord]++; + e3b->bd_bd->bb_counters[ord]++; + } + + /* now drop all the bits in bitmap */ + mb_clear_bits(e3b->bd_bitmap, ex->fe_start, len0); + + mb_check_buddy(e3b); + + return 0; +} + +int inline mb_mark_used(struct ext3_buddy *e3b, + struct ext3_free_extent *ex, int len) +{ + int err; + + J_ASSERT(ex); + if (ex->fe_back == 0) + err = mb_mark_used_forward(e3b, ex, len); + else + err = mb_mark_used_backward(e3b, ex, len); + return err; +} + +int ext3_mb_new_in_group(struct ext3_allocation_context *ac, + struct ext3_buddy *e3b, int group) +{ + struct super_block *sb = ac->ac_sb; + int err, gorder, max, i; + struct ext3_free_extent curex; + + /* let's know order of allocation */ + gorder = 0; + while (ac->ac_g_len > (1 << gorder)) + gorder++; + + if ((ac->ac_g_flags & 1) && ac->ac_g_group == group) { + /* someone asks for space at this specified block + * probably he wants to merge it into existing extent */ + if (mb_test_bit(ac->ac_g_start, e3b->bd_bitmap)) { + /* good. 
at least one block is free */ + max = mb_find_extent(e3b, 0, ac->ac_g_start, + ac->ac_g_len, &curex); + max = min(curex.fe_len, ac->ac_g_len); + mb_mark_used(e3b, &curex, max); + + ac->ac_b_group = group; + ac->ac_b_start = curex.fe_start; + ac->ac_b_len = max; + ac->ac_status = AC_STATUS_FOUND; + err = 0; + goto out; + } + /* don't try to find goal anymore */ + ac->ac_g_flags &= ~1; + } + + i = 0; + while (1) { + i = find_next_bit(e3b->bd_bitmap, sb->s_blocksize * 8, i); + if (i >= sb->s_blocksize * 8) + break; + + max = mb_find_extent(e3b, 0, i, ac->ac_g_len, &curex); + if (max >= ac->ac_g_len) { + max = min(curex.fe_len, ac->ac_g_len); + mb_mark_used(e3b, &curex, max); + + ac->ac_b_group = group; + ac->ac_b_start = curex.fe_start; + ac->ac_b_len = max; + ac->ac_status = AC_STATUS_FOUND; + break; + } + i += max; + } + + return 0; + +out: + return err; +} + +int mb_good_group(struct ext3_allocation_context *ac, int group, int cr) +{ + struct ext3_group_desc *gdp; + int free_blocks; + + gdp = ext3_get_group_desc(ac->ac_sb, group, NULL); + if (!gdp) + return 0; + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + if (free_blocks == 0) + return 0; + + /* someone wants this block very much */ + if ((ac->ac_g_flags & 1) && ac->ac_g_group == group) + return 1; + + /* FIXME: I'd like to take fragmentation into account here */ + if (cr == 0) { + if (free_blocks >= ac->ac_g_len >> 1) + return 1; + } else if (cr == 1) { + if (free_blocks >= ac->ac_g_len >> 2) + return 1; + } else if (cr == 2) { + return 1; + } else { + BUG(); + } + return 0; +} + +int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, + unsigned long goal, int *len, int flags, int *errp) +{ + struct buffer_head *bitmap_bh = NULL; + struct ext3_allocation_context ac; + int i, group, block, cr, err = 0; + struct ext3_group_desc *gdp; + struct ext3_super_block *es; + struct buffer_head *gdp_bh; + struct ext3_sb_info *sbi; + struct super_block *sb; + struct ext3_buddy e3b; + + J_ASSERT(len != NULL); 
+ J_ASSERT(*len > 0); + + sb = inode->i_sb; + if (!sb) { + printk("ext3_mb_new_nblocks: nonexistent device"); + return 0; + } + + if (!test_opt(sb, MBALLOC)) { + static int ext3_mballoc_warning = 0; + if (ext3_mballoc_warning == 0) { + printk(KERN_ERR "EXT3-fs: multiblock request with " + "mballoc disabled!\n"); + ext3_mballoc_warning++; + } + *len = 1; + err = ext3_new_block_old(handle, inode, goal, errp); + return err; + } + + ext3_mb_poll_new_transaction(sb, handle); + + sbi = EXT3_SB(sb); + es = EXT3_SB(sb)->s_es; + + if (!(flags & 2)) { + /* someone asks for non-reserved blocks */ + BUG_ON(*len > 1); + err = ext3_mb_reserve_blocks(sb, 1); + if (err) { + *errp = err; + return 0; + } + } + + /* + * Check quota for allocation of this blocks. + */ + while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) + *len -= 1; + if (*len == 0) { + *errp = -EDQUOT; + block = 0; + goto out; + } + + /* start searching from the goal */ + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= le32_to_cpu(es->s_blocks_count)) + goal = le32_to_cpu(es->s_first_data_block); + group = (goal - le32_to_cpu(es->s_first_data_block)) / + EXT3_BLOCKS_PER_GROUP(sb); + block = ((goal - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb)); + + /* set up allocation goals */ + ac.ac_b_group = ac.ac_b_start = ac.ac_b_len = 0; + ac.ac_status = 0; + ac.ac_groups_scanned = 0; + ac.ac_sb = inode->i_sb; + ac.ac_g_group = group; + ac.ac_g_start = block; + ac.ac_g_len = *len; + ac.ac_g_flags = flags; + + /* loop over the groups */ + for (cr = 0; cr < 3 && ac.ac_status != AC_STATUS_FOUND; cr++) { + for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { + if (group == EXT3_SB(sb)->s_groups_count) + group = 0; + + /* check is group good for our criteries */ + if (!mb_good_group(&ac, group, cr)) + continue; + + err = ext3_mb_load_desc(ac.ac_sb, group, &e3b); + if (err) + goto out_err; + + ext3_lock_group(sb, group); + if (!mb_good_group(&ac, group, cr)) { + /* someone did allocation from 
this group */ + ext3_unlock_group(sb, group); + ext3_mb_release_desc(&e3b); + continue; + } + + err = ext3_mb_new_in_group(&ac, &e3b, group); + ext3_unlock_group(sb, group); + if (ac.ac_status == AC_STATUS_FOUND) + ext3_mb_dirty_buddy(&e3b); + ext3_mb_release_desc(&e3b); + if (err) + goto out_err; + if (ac.ac_status == AC_STATUS_FOUND) + break; + } + } + + if (ac.ac_status != AC_STATUS_FOUND) { + /* unfortunately, we can't satisfy this request */ + J_ASSERT(ac.ac_b_len == 0); + DQUOT_FREE_BLOCK(inode, *len); + *errp = -ENOSPC; + block = 0; + goto out; + } + + /* good news - free block(s) have been found. now it's time + * to mark block(s) in good old journaled bitmap */ + block = ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb) + + ac.ac_b_start + le32_to_cpu(es->s_first_data_block); + + /* we made a desicion, now mark found blocks in good old + * bitmap to be journaled */ + + ext3_debug("using block group %d(%d)\n", + ac.ac_b_group.group, gdp->bg_free_blocks_count); + + bitmap_bh = read_block_bitmap(sb, ac.ac_b_group); + if (!bitmap_bh) { + *errp = -EIO; + goto out_err; + } + + err = ext3_journal_get_write_access(handle, bitmap_bh); + if (err) { + *errp = err; + goto out_err; + } + + gdp = ext3_get_group_desc(sb, ac.ac_b_group, &gdp_bh); + if (!gdp) { + *errp = -EIO; + goto out_err; + } + + err = ext3_journal_get_write_access(handle, gdp_bh); + if (err) + goto out_err; + + block = ac.ac_b_start + ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(es->s_first_data_block); + + if (block == le32_to_cpu(gdp->bg_block_bitmap) || + block == le32_to_cpu(gdp->bg_inode_bitmap) || + in_range(block, le32_to_cpu(gdp->bg_inode_table), + EXT3_SB(sb)->s_itb_per_group)) + ext3_error(sb, "ext3_new_block", + "Allocating block in system zone - " + "block = %u", block); +#if 0 + for (i = 0; i < ac.ac_b_len; i++) + J_ASSERT(!mb_test_bit(ac.ac_b_start + i, bitmap_bh->b_data)); +#endif + mb_set_bits(bitmap_bh->b_data, ac.ac_b_start, ac.ac_b_len); + + ext3_lock_group(sb, ac.ac_b_group); 
+ gdp->bg_free_blocks_count = + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - + ac.ac_b_len); + ext3_unlock_group(sb, ac.ac_b_group); + percpu_counter_mod(&sbi->s_freeblocks_counter, -ac.ac_b_len); + + err = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (err) + goto out_err; + err = ext3_journal_dirty_metadata(handle, gdp_bh); + if (err) + goto out_err; + + sb->s_dirt = 1; + *errp = 0; + brelse(bitmap_bh); + + /* drop non-allocated, but dquote'd blocks */ + J_ASSERT(*len >= ac.ac_b_len); + DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_len); + + *len = ac.ac_b_len; + J_ASSERT(block != 0); + goto out; + +out_err: + /* if we've already allocated something, roll it back */ + if (ac.ac_status == AC_STATUS_FOUND) { + /* FIXME: free blocks here */ + } + + DQUOT_FREE_BLOCK(inode, *len); + brelse(bitmap_bh); + *errp = err; + block = 0; +out: + if (!(flags & 2)) { + /* block wasn't reserved before and we reserved it + * at the beginning of allocation. it doesn't matter + * whether we allocated anything or we failed: time + * to release reservation. 
NOTE: because I expect + * any multiblock request from delayed allocation + * path only, here is single block always */ + ext3_mb_release_blocks(sb, 1); + } + return block; +} + +int ext3_mb_generate_buddy(struct super_block *sb, int group) +{ + struct buffer_head *bh; + int i, err, count = 0; + struct ext3_buddy e3b; + + err = ext3_mb_load_desc(sb, group, &e3b); + if (err) + goto out; + memset(e3b.bd_bh->b_data, 0, sb->s_blocksize); + memset(e3b.bd_bh2->b_data, 0, sb->s_blocksize); + + bh = read_block_bitmap(sb, group); + if (bh == NULL) { + err = -EIO; + goto out2; + } + + /* loop over the blocks, nad create buddies for free ones */ + for (i = 0; i < sb->s_blocksize * 8; i++) { + if (!mb_test_bit(i, (void *) bh->b_data)) { + mb_free_blocks(&e3b, i, 1); + count++; + } + } + brelse(bh); + mb_check_buddy(&e3b); + ext3_mb_dirty_buddy(&e3b); + +out2: + ext3_mb_release_desc(&e3b); +out: + return err; +} + +EXPORT_SYMBOL(ext3_mb_new_blocks); + +#define MB_CREDITS \ + (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \ + + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS) + +int ext3_mb_init_backend(struct super_block *sb) +{ + struct inode *root = sb->s_root->d_inode; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct dentry *db; + tid_t target; + int err, i; + + sbi->s_buddy_blocks = kmalloc(sizeof(struct ext3_buddy_group_blocks) * + sbi->s_groups_count, GFP_KERNEL); + if (sbi->s_buddy_blocks == NULL) { + printk("can't allocate mem for buddy maps\n"); + return -ENOMEM; + } + memset(sbi->s_buddy_blocks, 0, + sizeof(struct ext3_buddy_group_blocks) * sbi->s_groups_count); + sbi->s_buddy = NULL; + + down(&root->i_sem); + db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, + strlen(EXT3_BUDDY_FILE)); + if (IS_ERR(db)) { + err = PTR_ERR(db); + printk("can't lookup buddy file: %d\n", err); + goto out; + } + + if (db->d_inode != NULL) { + sbi->s_buddy = igrab(db->d_inode); + goto map; + } + + err = ext3_create(root, db, S_IFREG, NULL); + if (err) { + printk("error while creation 
buddy file: %d\n", err); + } else { + sbi->s_buddy = igrab(db->d_inode); + } + +map: + for (i = 0; i < sbi->s_groups_count; i++) { + struct buffer_head *bh = NULL; + handle_t *handle; + + handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out2; + } + + /* allocate block for bitmap */ + bh = ext3_getblk(handle, sbi->s_buddy, i * 2, 1, &err); + if (bh == NULL) { + printk("can't get block for buddy bitmap: %d\n", err); + goto out2; + } + sbi->s_buddy_blocks[i].bb_bitmap = bh->b_blocknr; + brelse(bh); + + /* allocate block for buddy */ + bh = ext3_getblk(handle, sbi->s_buddy, i * 2 + 1, 1, &err); + if (bh == NULL) { + printk("can't get block for buddy: %d\n", err); + goto out2; + } + sbi->s_buddy_blocks[i].bb_buddy = bh->b_blocknr; + brelse(bh); + ext3_journal_stop(handle); + spin_lock_init(&sbi->s_buddy_blocks[i].bb_lock); + sbi->s_buddy_blocks[i].bb_md_cur = NULL; + sbi->s_buddy_blocks[i].bb_tid = 0; + } + + if (journal_start_commit(sbi->s_journal, &target)) + log_wait_commit(sbi->s_journal, target); + +out2: + dput(db); +out: + up(&root->i_sem); + return err; +} + +int ext3_mb_release(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (!test_opt(sb, MBALLOC)) + return 0; + + /* release freed, non-committed blocks */ + spin_lock(&sbi->s_md_lock); + list_splice_init(&sbi->s_closed_transaction, + &sbi->s_committed_transaction); + list_splice_init(&sbi->s_active_transaction, + &sbi->s_committed_transaction); + spin_unlock(&sbi->s_md_lock); + ext3_mb_free_committed_blocks(sb); + + if (sbi->s_buddy_blocks) + kfree(sbi->s_buddy_blocks); + if (sbi->s_buddy) + iput(sbi->s_buddy); + if (sbi->s_blocks_reserved) + printk("ext3-fs: %ld blocks being reserved at umount!\n", + sbi->s_blocks_reserved); + return 0; +} + +int ext3_mb_init(struct super_block *sb) +{ + struct ext3_super_block *es; + int i; + + if (!test_opt(sb, MBALLOC)) + return 0; + + /* init file for buddy data */ + 
clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); + ext3_mb_init_backend(sb); + + es = EXT3_SB(sb)->s_es; + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) + ext3_mb_generate_buddy(sb, i); + spin_lock_init(&EXT3_SB(sb)->s_reserve_lock); + spin_lock_init(&EXT3_SB(sb)->s_md_lock); + INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction); + INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction); + INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction); + set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); + printk("EXT3-fs: mballoc enabled\n"); + return 0; +} + +void ext3_mb_free_committed_blocks(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + int err, i, count = 0, count2 = 0; + struct ext3_free_metadata *md; + struct ext3_buddy e3b; + + if (list_empty(&sbi->s_committed_transaction)) + return; + + /* there is committed blocks to be freed yet */ + do { + /* get next array of blocks */ + md = NULL; + spin_lock(&sbi->s_md_lock); + if (!list_empty(&sbi->s_committed_transaction)) { + md = list_entry(sbi->s_committed_transaction.next, + struct ext3_free_metadata, list); + list_del(&md->list); + } + spin_unlock(&sbi->s_md_lock); + + if (md == NULL) + break; + + mb_debug("gonna free %u blocks in group %u (0x%p):", + md->num, md->group, md); + + err = ext3_mb_load_desc(sb, md->group, &e3b); + BUG_ON(err != 0); + + /* there are blocks to put in buddy to make them really free */ + count += md->num; + count2++; + ext3_lock_group(sb, md->group); + for (i = 0; i < md->num; i++) { + mb_debug(" %u", md->blocks[i]); + mb_free_blocks(&e3b, md->blocks[i], 1); + } + mb_debug("\n"); + ext3_unlock_group(sb, md->group); + + kfree(md); + ext3_mb_dirty_buddy(&e3b); + ext3_mb_release_desc(&e3b); + + } while (md); + mb_debug("freed %u blocks in %u structures\n", count, count2); +} + +void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (sbi->s_last_transaction == handle->h_transaction->t_tid) + return; + + /* new 
transaction! time to close last one and free blocks for + * committed transaction. we know that only transaction can be + * active, so previos transaction can be being logged and we + * know that transaction before previous is known to be alreade + * logged. this means that now we may free blocks freed in all + * transactions before previous one. hope I'm clear enough ... */ + + spin_lock(&sbi->s_md_lock); + if (sbi->s_last_transaction != handle->h_transaction->t_tid) { + mb_debug("new transaction %lu, old %lu\n", + (unsigned long) handle->h_transaction->t_tid, + (unsigned long) sbi->s_last_transaction); + list_splice_init(&sbi->s_closed_transaction, + &sbi->s_committed_transaction); + list_splice_init(&sbi->s_active_transaction, + &sbi->s_closed_transaction); + sbi->s_last_transaction = handle->h_transaction->t_tid; + } + spin_unlock(&sbi->s_md_lock); + + ext3_mb_free_committed_blocks(sb); +} + +int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, + int group, int block, int count) +{ + struct ext3_buddy_group_blocks *db = e3b->bd_bd; + struct super_block *sb = e3b->bd_sb; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_free_metadata *md; + int i; + + ext3_lock_group(sb, group); + for (i = 0; i < count; i++) { + md = db->bb_md_cur; + if (md && db->bb_tid != handle->h_transaction->t_tid) { + db->bb_md_cur = NULL; + md = NULL; + } + + if (md == NULL) { + ext3_unlock_group(sb, group); + md = kmalloc(sizeof(*md), GFP_KERNEL); + if (md == NULL) + return -ENOMEM; + md->num = 0; + md->group = group; + + ext3_lock_group(sb, group); + if (db->bb_md_cur == NULL) { + spin_lock(&sbi->s_md_lock); + list_add(&md->list, &sbi->s_active_transaction); + spin_unlock(&sbi->s_md_lock); + db->bb_md_cur = md; + db->bb_tid = handle->h_transaction->t_tid; + mb_debug("new md 0x%p for group %u\n", + md, md->group); + } else { + kfree(md); + md = db->bb_md_cur; + } + } + + BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); + md->blocks[md->num] = block + i; + md->num++; + if 
(md->num == EXT3_BB_MAX_BLOCKS) { + /* no more space, put full container on a sb's list */ + db->bb_md_cur = NULL; + } + } + ext3_unlock_group(sb, group); + return 0; +} + +void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, + unsigned long block, unsigned long count, int metadata) +{ + struct buffer_head *bitmap_bh = NULL; + struct ext3_group_desc *gdp; + struct ext3_super_block *es; + unsigned long bit, overflow; + struct buffer_head *gd_bh; + unsigned long block_group; + struct ext3_sb_info *sbi; + struct super_block *sb; + struct ext3_buddy e3b; + int err = 0, ret; + + sb = inode->i_sb; + if (!sb) { + printk ("ext3_free_blocks: nonexistent device"); + return; + } + + ext3_mb_poll_new_transaction(sb, handle); + + sbi = EXT3_SB(sb); + es = EXT3_SB(sb)->s_es; + if (block < le32_to_cpu(es->s_first_data_block) || + block + count < block || + block + count > le32_to_cpu(es->s_blocks_count)) { + ext3_error (sb, "ext3_free_blocks", + "Freeing blocks not in datazone - " + "block = %lu, count = %lu", block, count); + goto error_return; + } + + ext3_debug("freeing block %lu\n", block); + +do_more: + overflow = 0; + block_group = (block - le32_to_cpu(es->s_first_data_block)) / + EXT3_BLOCKS_PER_GROUP(sb); + bit = (block - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb); + /* + * Check to see if we are freeing blocks across a group + * boundary. 
+ */ + if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { + overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); + count -= overflow; + } + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + gdp = ext3_get_group_desc (sb, block_group, &gd_bh); + if (!gdp) + goto error_return; + + if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || + in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || + in_range (block, le32_to_cpu(gdp->bg_inode_table), + EXT3_SB(sb)->s_itb_per_group) || + in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), + EXT3_SB(sb)->s_itb_per_group)) + ext3_error (sb, "ext3_free_blocks", + "Freeing blocks in system zones - " + "Block = %lu, count = %lu", + block, count); + + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext3_journal_get_write_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + + err = ext3_mb_load_desc(sb, block_group, &e3b); + if (err) + goto error_return; + + if (metadata) { + /* blocks being freed are metadata. 
these blocks shouldn't + * be used until this transaction is committed */ + ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); + } else { + ext3_lock_group(sb, block_group); + mb_free_blocks(&e3b, bit, count); + gdp->bg_free_blocks_count = + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); + ext3_unlock_group(sb, block_group); + percpu_counter_mod(&sbi->s_freeblocks_counter, count); + } + + ext3_mb_dirty_buddy(&e3b); + ext3_mb_release_desc(&e3b); + + /* FIXME: undo logic will be implemented later and another way */ + mb_clear_bits(bitmap_bh->b_data, bit, count); + DQUOT_FREE_BLOCK(inode, count); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext3_journal_dirty_metadata(handle, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext3_journal_dirty_metadata(handle, gd_bh); + if (!err) err = ret; + + if (overflow && !err) { + block += count; + count = overflow; + goto do_more; + } + sb->s_dirt = 1; +error_return: + brelse(bitmap_bh); + ext3_std_error(sb, err); + return; +} + +int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + int free, ret = -ENOSPC; + + BUG_ON(blocks < 0); + spin_lock(&sbi->s_reserve_lock); + free = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + if (blocks <= free - sbi->s_blocks_reserved) { + sbi->s_blocks_reserved += blocks; + ret = 0; + } + spin_unlock(&sbi->s_reserve_lock); + return ret; +} + +void ext3_mb_release_blocks(struct super_block *sb, int blocks) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + + BUG_ON(blocks < 0); + spin_lock(&sbi->s_reserve_lock); + sbi->s_blocks_reserved -= blocks; + WARN_ON(sbi->s_blocks_reserved < 0); + if (sbi->s_blocks_reserved < 0) + sbi->s_blocks_reserved = 0; + spin_unlock(&sbi->s_reserve_lock); +} + +int ext3_new_block(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) +{ + int ret, 
len; + + if (!test_opt(inode->i_sb, MBALLOC)) { + ret = ext3_new_block_old(handle, inode, goal, errp); + goto out; + } + len = 1; + ret = ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); +out: + return ret; +} + + +void ext3_free_blocks(handle_t *handle, struct inode * inode, + unsigned long block, unsigned long count, int metadata) +{ + if (!test_opt(inode->i_sb, MBALLOC)) + ext3_free_blocks_old(handle, inode, block, count); + else + ext3_mb_free_blocks(handle, inode, block, count, metadata); + return; +} + Index: linux-2.6.5-sles9/fs/ext3/super.c =================================================================== --- linux-2.6.5-sles9.orig/fs/ext3/super.c 2004-11-09 02:23:21.597220752 +0300 +++ linux-2.6.5-sles9/fs/ext3/super.c 2004-11-09 02:26:12.572228600 +0300 @@ -389,6 +389,7 @@ struct ext3_super_block *es = sbi->s_es; int i; + ext3_mb_release(sb); ext3_ext_release(sb); ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); @@ -542,7 +543,7 @@ Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, - Opt_err, Opt_extents, Opt_extdebug + Opt_err, Opt_extents, Opt_extdebug, Opt_mballoc, }; static match_table_t tokens = { @@ -589,6 +590,7 @@ {Opt_iopen_nopriv, "iopen_nopriv"}, {Opt_extents, "extents"}, {Opt_extdebug, "extdebug"}, + {Opt_mballoc, "mballoc"}, {Opt_err, NULL} }; @@ -810,6 +812,9 @@ case Opt_extdebug: set_opt (sbi->s_mount_opt, EXTDEBUG); break; + case Opt_mballoc: + set_opt (sbi->s_mount_opt, MBALLOC); + break; default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " @@ -1463,7 +1468,8 @@ ext3_count_dirs(sb)); ext3_ext_init(sb); - + ext3_mb_init(sb); + return 0; failed_mount3: Index: linux-2.6.5-sles9/fs/ext3/Makefile =================================================================== --- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2004-11-09 02:23:21.593221360 +0300 +++ 
linux-2.6.5-sles9/fs/ext3/Makefile 2004-11-09 02:26:12.572228600 +0300 @@ -5,7 +5,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o extents.o + ioctl.o namei.o super.o symlink.o hash.o extents.o mballoc.o ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o Index: linux-2.6.5-sles9/fs/ext3/balloc.c =================================================================== --- linux-2.6.5-sles9.orig/fs/ext3/balloc.c 2004-11-03 08:36:51.000000000 +0300 +++ linux-2.6.5-sles9/fs/ext3/balloc.c 2004-11-09 02:26:53.078070776 +0300 @@ -78,7 +78,7 @@ * * Return buffer_head on success or NULL in case of failure. */ -static struct buffer_head * +struct buffer_head * read_block_bitmap(struct super_block *sb, unsigned int block_group) { struct ext3_group_desc * desc; @@ -274,7 +274,7 @@ } /* Free given blocks, update quota and i_blocks field */ -void ext3_free_blocks(handle_t *handle, struct inode *inode, +void ext3_free_blocks_old(handle_t *handle, struct inode *inode, unsigned long block, unsigned long count) { struct buffer_head *bitmap_bh = NULL; @@ -1142,7 +1142,7 @@ * bitmap, and then for any free bit if that fails. * This function also updates quota and i_blocks field. */ -int ext3_new_block(handle_t *handle, struct inode *inode, +int ext3_new_block_old(handle_t *handle, struct inode *inode, unsigned long goal, int *errp) { struct buffer_head *bitmap_bh = NULL; Index: linux-2.6.5-sles9/fs/ext3/namei.c =================================================================== --- linux-2.6.5-sles9.orig/fs/ext3/namei.c 2004-11-09 02:18:27.616912552 +0300 +++ linux-2.6.5-sles9/fs/ext3/namei.c 2004-11-09 02:26:12.580227384 +0300 @@ -1640,7 +1640,7 @@ * If the create succeeds, we fill in the inode information * with d_instantiate(). 
*/ -static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, +int ext3_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) { handle_t *handle; Index: linux-2.6.5-sles9/fs/ext3/inode.c =================================================================== --- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2004-11-09 02:23:21.592221512 +0300 +++ linux-2.6.5-sles9/fs/ext3/inode.c 2004-11-09 02:26:12.587226320 +0300 @@ -572,7 +572,7 @@ ext3_journal_forget(handle, branch[i].bh); } for (i = 0; i < keys; i++) - ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); + ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); return err; } @@ -673,7 +673,7 @@ if (err == -EAGAIN) for (i = 0; i < num; i++) ext3_free_blocks(handle, inode, - le32_to_cpu(where[i].key), 1); + le32_to_cpu(where[i].key), 1, 1); return err; } @@ -1829,7 +1829,7 @@ } } - ext3_free_blocks(handle, inode, block_to_free, count); + ext3_free_blocks(handle, inode, block_to_free, count, 1); } /** @@ -2000,7 +2000,7 @@ ext3_journal_test_restart(handle, inode); } - ext3_free_blocks(handle, inode, nr, 1); + ext3_free_blocks(handle, inode, nr, 1, 1); if (parent_bh) { /* Index: linux-2.6.5-sles9/fs/ext3/extents.c =================================================================== --- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2004-11-09 02:25:56.143726112 +0300 +++ linux-2.6.5-sles9/fs/ext3/extents.c 2004-11-09 02:26:12.591225712 +0300 @@ -740,7 +740,7 @@ for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; - ext3_free_blocks(handle, tree->inode, ablocks[i], 1); + ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); } } kfree(ablocks); @@ -1391,7 +1391,7 @@ path->p_idx->ei_leaf); bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); - ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); + ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); return 
err; } @@ -1879,10 +1879,12 @@ int needed = ext3_remove_blocks_credits(tree, ex, from, to); handle_t *handle = ext3_journal_start(tree->inode, needed); struct buffer_head *bh; - int i; + int i, metadata = 0; if (IS_ERR(handle)) return PTR_ERR(handle); + if (S_ISDIR(tree->inode->i_mode)) + metadata = 1; if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { /* tail removal */ unsigned long num, start; @@ -1894,7 +1896,7 @@ bh = sb_find_get_block(tree->inode->i_sb, start + i); ext3_forget(handle, 0, tree->inode, bh, start + i); } - ext3_free_blocks(handle, tree->inode, start, num); + ext3_free_blocks(handle, tree->inode, start, num, metadata); } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", from, to, ex->ee_block, ex->ee_len); Index: linux-2.6.5-sles9/fs/ext3/xattr.c =================================================================== --- linux-2.6.5-sles9.orig/fs/ext3/xattr.c 2004-11-09 02:22:55.777146000 +0300 +++ linux-2.6.5-sles9/fs/ext3/xattr.c 2004-11-09 02:26:12.593225408 +0300 @@ -1366,7 +1366,7 @@ new_bh = sb_getblk(sb, block); if (!new_bh) { getblk_failed: - ext3_free_blocks(handle, inode, block, 1); + ext3_free_blocks(handle, inode, block, 1, 1); error = -EIO; goto cleanup; } @@ -1408,7 +1408,7 @@ if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { /* Free the old block. 
*/ ea_bdebug(old_bh, "freeing"); - ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1); + ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1, 1); /* ext3_forget() calls bforget() for us, but we let our caller release old_bh, so we need to @@ -1504,7 +1504,7 @@ lock_buffer(bh); if (HDR(bh)->h_refcount == cpu_to_le32(1)) { ext3_xattr_cache_remove(bh); - ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1); + ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1, 1); get_bh(bh); ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl); } else { Index: linux-2.6.5-sles9/include/linux/ext3_fs.h =================================================================== --- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2004-11-09 02:25:17.238640584 +0300 +++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2004-11-09 02:26:12.596224952 +0300 @@ -57,6 +57,8 @@ #define ext3_debug(f, a...) do {} while (0) #endif +#define EXT3_MULTIBLOCK_ALLOCATOR 1 + /* * Special inodes numbers */ @@ -339,6 +341,7 @@ #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ #define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ #define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ +#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef clear_opt @@ -698,7 +701,7 @@ extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, - unsigned long); + unsigned long, int); extern unsigned long ext3_count_free_blocks (struct super_block *); extern void ext3_check_blocks_bitmap (struct super_block *); extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, Index: linux-2.6.5-sles9/include/linux/ext3_fs_sb.h =================================================================== --- 
linux-2.6.5-sles9.orig/include/linux/ext3_fs_sb.h 2004-11-09 02:20:51.598024096 +0300 +++ linux-2.6.5-sles9/include/linux/ext3_fs_sb.h 2004-11-09 02:28:18.753046200 +0300 @@ -23,10 +23,30 @@ #define EXT_INCLUDE #include #include +#include #endif #endif #include +#define EXT3_BB_MAX_BLOCKS 30 +struct ext3_free_metadata { + unsigned short group; + unsigned short num; + unsigned short blocks[EXT3_BB_MAX_BLOCKS]; + struct list_head list; +}; + +#define EXT3_BB_MAX_ORDER 14 + +struct ext3_buddy_group_blocks { + sector_t bb_bitmap; + sector_t bb_buddy; + spinlock_t bb_lock; + unsigned bb_counters[EXT3_BB_MAX_ORDER]; + struct ext3_free_metadata *bb_md_cur; + unsigned long bb_tid; +}; + /* * third extended-fs super-block data in memory */ @@ -78,6 +98,17 @@ struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ #endif + + /* for buddy allocator */ + struct ext3_buddy_group_blocks *s_buddy_blocks; + struct inode *s_buddy; + long s_blocks_reserved; + spinlock_t s_reserve_lock; + struct list_head s_active_transaction; + struct list_head s_closed_transaction; + struct list_head s_committed_transaction; + spinlock_t s_md_lock; + tid_t s_last_transaction; }; #endif /* _LINUX_EXT3_FS_SB */