From 4527a65cc1a46740c8edee7557a3cdd7ce035d87 Mon Sep 17 00:00:00 2001 From: alex Date: Mon, 4 Jul 2005 08:13:53 +0000 Subject: [PATCH] - add mballoc to ldiskfs series - make extents,mballoc a default options for OST --- .../kernel_patches/series/ldiskfs-2.6-fc3.series | 1 + ldiskfs/ldiskfs/Makefile.in | 2 +- .../patches/ext3-mballoc2-2.6.10-fc3.patch | 2249 ++++++++++++++++++++ .../kernel_patches/series/ldiskfs-2.6-fc3.series | 1 + lustre/ldiskfs/Makefile.in | 2 +- lustre/utils/lconf | 2 + 6 files changed, 2255 insertions(+), 2 deletions(-) create mode 100644 lustre/kernel_patches/patches/ext3-mballoc2-2.6.10-fc3.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-fc3.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-fc3.series index 9476f8a..3e96555 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-fc3.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-fc3.series @@ -16,3 +16,4 @@ ext3-mds-num-2.6.10-fc3.patch ext3-fid-2.6.7.patch ext3-raw-lookup-2.6.10.patch ext3-disable-reservation-2.6.10-fc3.patch +ext3-mballoc2-2.6.10-fc3.patch diff --git a/ldiskfs/ldiskfs/Makefile.in b/ldiskfs/ldiskfs/Makefile.in index 80d9efb..acf0b20 100644 --- a/ldiskfs/ldiskfs/Makefile.in +++ b/ldiskfs/ldiskfs/Makefile.in @@ -9,7 +9,7 @@ ext3_headers := $(wildcard @LINUX@/fs/ext3/*.h) linux_headers := $(wildcard @LINUX@/include/linux/ext3*.h) new_linux_hearders := ext3_extents.h ext3_sources := $(filter-out %.mod.c,$(wildcard @LINUX@/fs/ext3/*.c)) -new_sources := iopen.c iopen.h extents.c extents-in-ea.c +new_sources := iopen.c iopen.h extents.c extents-in-ea.c mballoc.c ldiskfs_sources := $(notdir $(ext3_sources) $(ext3_headers)) $(new_sources) ldiskfs-objs := $(filter %.o,$(ldiskfs_sources:.c=.o)) diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.10-fc3.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.10-fc3.patch new file mode 100644 index 0000000..b180172 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.10-fc3.patch @@ -0,0 
+1,2249 @@ +Index: linux-2.6.10/fs/ext3/mballoc.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/mballoc.c 2005-02-25 17:28:41.836311072 +0200 ++++ linux-2.6.10/fs/ext3/mballoc.c 2005-02-25 17:28:41.859307576 +0200 +@@ -0,0 +1,1861 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++ ++/* ++ * mballoc.c contains the multiblocks allocation routines ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * TODO: ++ * - bitmap/buddy read-ahead (proposed by Oleg Drokin aka green) ++ * - track min/max extents in each group for better group selection ++ * - is it worthwhile to use buddies directly if req is 2^N blocks? ++ * - mb_mark_used() may allocate chunk right after splitting buddy ++ * - special flag to advice allocator to look for requested + N blocks ++ * this may improve interaction between extents and mballoc ++ * - tree of groups sorted by number of free blocks ++ * - percpu reservation code (hotpath) ++ * - error handling ++ */ ++ ++/* ++ * with AGRESSIVE_CHECK allocator runs consistency checks over ++ * structures. 
these checks slow things down a lot ++ */ ++#define AGGRESSIVE_CHECK__ ++ ++/* ++ * with MBALLOC_STATS allocator will collect stats that will be ++ * shown at umount. The collecting costs though! ++ */ ++#define MBALLOC_STATS ++ ++/* mb_debug() expands to printk() only when MB_DEBUG is defined; ++ * the MB_DEBUG__ spelling below leaves it expanding to nothing */ ++#define MB_DEBUG__ ++#ifdef MB_DEBUG ++#define mb_debug(fmt,a...) printk(fmt, ##a) ++#else ++#define mb_debug(fmt,a...) ++#endif ++ ++/* ++ * where to save buddy structures between umount/mount (clean case only) ++ */ ++#define EXT3_BUDDY_FILE ".buddy" ++ ++/* ++ * How long mballoc can look for a best extent (in found extents) ++ */ ++#define EXT3_MB_MAX_TO_SCAN 100 ++ ++/* ++ * This structure is the on-disk description of a group for mballoc ++ */ ++struct ext3_mb_group_descr { ++ __u16 mgd_first_free; /* first free block in the group */ ++ __u16 mgd_free; /* number of free blocks in the group */ ++ __u16 mgd_counters[16]; /* number of free blocks by order */ ++}; ++ ++/* ++ * This structure is the header of mballoc's file ++ */ ++struct ext3_mb_grp_header { ++ __u32 mh_magic; ++}; ++ ++#define EXT3_MB_MAGIC_V1 0xbabd16fd ++ ++ ++struct ext3_free_extent { ++ __u16 fe_start; ++ __u16 fe_len; ++ __u16 fe_group; ++}; ++ ++struct ext3_allocation_context { ++ struct super_block *ac_sb; ++ ++ /* search goals */ ++struct ext3_free_extent ac_g_ex; ++ ++ /* the best found extent */ ++ struct ext3_free_extent ac_b_ex; ++ ++ /* number of iterations done.
we have to track to limit searching */ ++ unsigned long ac_ex_scanned; ++ __u16 ac_groups_scanned; ++ __u16 ac_found; ++ __u8 ac_status; ++ __u8 ac_flags; /* allocation hints */ ++ __u8 ac_repeats; ++}; ++ ++#define AC_STATUS_CONTINUE 1 ++#define AC_STATUS_FOUND 2 ++#define AC_STATUS_BREAK 3 ++ ++struct ext3_buddy { ++ struct buffer_head *bd_bh; ++ struct buffer_head *bd_bh2; ++ struct ext3_buddy_group_blocks *bd_bd; ++ struct super_block *bd_sb; ++ __u16 bd_blkbits; ++ __u16 bd_group; ++}; ++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bh->b_data) ++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_bh2->b_data) ++ ++#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) ++ ++int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); ++struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); ++int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); ++int ext3_mb_reserve_blocks(struct super_block *, int); ++void ext3_mb_release_blocks(struct super_block *, int); ++void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); ++void ext3_mb_free_committed_blocks(struct super_block *); ++ ++#if BITS_PER_LONG == 64 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 7UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~7UL); \ ++} ++#elif BITS_PER_LONG == 32 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 3UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~3UL); \ ++} ++#else ++#error "how many bits you are?!" 
++#endif ++ ++static inline int mb_test_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ return ext2_test_bit(bit, addr); ++} ++ ++static inline void mb_set_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_set_bit(bit, addr); ++} ++ ++static inline void mb_set_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_set_bit_atomic(NULL, bit, addr); ++} ++ ++static inline void mb_clear_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_clear_bit(bit, addr); ++} ++ ++static inline void mb_clear_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_clear_bit_atomic(NULL, bit, addr); ++} ++ ++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) ++{ ++ int i = 1; ++ char *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(max != NULL); ++ ++ if (order > e3b->bd_blkbits + 1) { ++ *max = 0; ++ return NULL; ++ } ++ ++ /* at order 0 we see each particular block */ ++ *max = 1 << (e3b->bd_blkbits + 3); ++ if (order == 0) ++ return EXT3_MB_BITMAP(e3b); ++ ++ bb = EXT3_MB_BUDDY(e3b); ++ *max = *max >> 1; ++ while (i < order) { ++ bb += 1 << (e3b->bd_blkbits - i); ++ i++; ++ *max = *max >> 1; ++ } ++ J_ASSERT((unsigned) (bb - (char *) EXT3_MB_BUDDY(e3b)) < ++ e3b->bd_sb->s_blocksize); ++ return bb; ++} ++ ++static int ext3_mb_load_buddy(struct super_block *sb, int group, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ J_ASSERT(sbi->s_buddy_blocks[group]->bb_bitmap); ++ J_ASSERT(sbi->s_buddy_blocks[group]->bb_buddy); ++ ++ /* load bitmap (NOTE(review): on failure, out: brelse()s bd_bh2 before this function has assigned it) */ ++ e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap); ++ if (e3b->bd_bh == NULL) { ++ ext3_error(sb, "ext3_mb_load_buddy", ++ "can't get block for buddy bitmap\n"); ++ goto out; ++ } ++ /* load buddy */ ++ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy); ++ if (e3b->bd_bh2 == NULL) { ++ ext3_error(sb,
"ext3_mb_load_buddy", ++ "can't get block for buddy bitmap\n"); ++ goto out; ++ } ++ ++ if (!buffer_uptodate(e3b->bd_bh)) ++ ll_rw_block(READ, 1, &e3b->bd_bh); ++ if (!buffer_uptodate(e3b->bd_bh2)) ++ ll_rw_block(READ, 1, &e3b->bd_bh2); ++ ++ wait_on_buffer(e3b->bd_bh); ++ J_ASSERT(buffer_uptodate(e3b->bd_bh)); ++ wait_on_buffer(e3b->bd_bh2); ++ J_ASSERT(buffer_uptodate(e3b->bd_bh2)); ++ ++ e3b->bd_blkbits = sb->s_blocksize_bits; ++ e3b->bd_bd = sbi->s_buddy_blocks[group]; ++ e3b->bd_sb = sb; ++ e3b->bd_group = group; ++ ++ return 0; ++out: ++ brelse(e3b->bd_bh); ++ brelse(e3b->bd_bh2); ++ e3b->bd_bh = NULL; ++ e3b->bd_bh2 = NULL; ++ return -EIO; ++} ++ ++static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b) ++{ ++ mark_buffer_dirty(e3b->bd_bh); ++ mark_buffer_dirty(e3b->bd_bh2); ++} ++ ++static void ext3_mb_release_desc(struct ext3_buddy *e3b) ++{ ++ brelse(e3b->bd_bh); ++ brelse(e3b->bd_bh2); ++} ++ ++#ifdef AGGRESSIVE_CHECK ++static void mb_check_buddy(struct ext3_buddy *e3b) ++{ ++ int order = e3b->bd_blkbits + 1; ++ int max, max2, i, j, k, count; ++ void *buddy, *buddy2; ++ ++ if (!test_opt(e3b->bd_sb, MBALLOC)) ++ return; ++ ++ while (order > 1) { ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ buddy2 = mb_find_buddy(e3b, order - 1, &max2); ++ J_ASSERT(buddy2); ++ J_ASSERT(buddy != buddy2); ++ J_ASSERT(max * 2 == max2); ++ ++ count = 0; ++ for (i = 0; i < max; i++) { ++ ++ if (mb_test_bit(i, buddy)) { ++ /* at most one of the two bits in buddy2 may be 0 */ ++ if (!mb_test_bit(i << 1, buddy2)) ++ J_ASSERT(mb_test_bit((i<<1)+1, buddy2)); ++ else if (!mb_test_bit((i << 1) + 1, buddy2)) ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ continue; ++ } ++ ++ /* both bits in buddy2 must be 1 */ ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ J_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); ++ ++ for (j = 0; j < (1 << order); j++) { ++ k = (i * (1 << order)) + j; ++ J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b))); ++ } ++ count++; ++ } ++
J_ASSERT(e3b->bd_bd->bb_counters[order] == count); ++ order--; ++ } ++ ++ buddy = mb_find_buddy(e3b, 0, &max); ++ for (i = 0; i < max; i++) { ++ if (!mb_test_bit(i, buddy)) ++ continue; ++ /* check used bits only */ ++ for (j = 0; j < e3b->bd_blkbits + 1; j++) { ++ buddy2 = mb_find_buddy(e3b, j, &max2); ++ k = i >> j; ++ J_ASSERT(k < max2); ++ J_ASSERT(mb_test_bit(k, buddy2)); ++ } ++ } ++} ++#else ++#define mb_check_buddy(e3b) ++#endif ++ ++static inline void ++ext3_lock_group(struct super_block *sb, int group) ++{ ++ spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); ++} ++ ++static inline void ++ext3_unlock_group(struct super_block *sb, int group) ++{ ++ spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); ++} ++ ++static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) ++{ ++ int order = 1; ++ void *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); ++ ++ bb = EXT3_MB_BUDDY(e3b); ++ while (order <= e3b->bd_blkbits + 1) { ++ block = block >> 1; ++ if (!mb_test_bit(block, bb)) { ++ /* this block is part of buddy of order 'order' */ ++ return order; ++ } ++ bb += 1 << (e3b->bd_blkbits - order); ++ order++; ++ } ++ return 0; ++} ++ ++static inline void mb_clear_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: clear whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0; ++ cur += 32; ++ continue; ++ } ++ mb_clear_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static inline void mb_set_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: set whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0xffffffff; ++ cur += 32; ++ continue; ++ } ++ mb_set_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static int mb_free_blocks(struct ext3_buddy *e3b, int
first, int count) ++{ ++ int block, max, order; ++ void *buddy, *buddy2; ++ ++ mb_check_buddy(e3b); ++ ++ e3b->bd_bd->bb_free += count; ++ if (first < e3b->bd_bd->bb_first_free) ++ e3b->bd_bd->bb_first_free = first; ++ ++ while (count-- > 0) { ++ block = first++; ++ order = 0; ++ ++ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); ++ mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); ++ e3b->bd_bd->bb_counters[order]++; ++ ++ /* start of the buddy */ ++ buddy = mb_find_buddy(e3b, order, &max); ++ ++ do { ++ block &= ~1UL; ++ if (mb_test_bit(block, buddy) || ++ mb_test_bit(block + 1, buddy)) ++ break; ++ ++ /* both the buddies are free, try to coalesce them */ ++ buddy2 = mb_find_buddy(e3b, order + 1, &max); ++ ++ if (!buddy2) ++ break; ++ ++ if (order > 0) { ++ /* for special purposes, we don't set ++ * free bits in bitmap */ ++ mb_set_bit(block, buddy); ++ mb_set_bit(block + 1, buddy); ++ } ++ e3b->bd_bd->bb_counters[order]--; ++ e3b->bd_bd->bb_counters[order]--; ++ ++ block = block >> 1; ++ order++; ++ e3b->bd_bd->bb_counters[order]++; ++ ++ mb_clear_bit(block, buddy2); ++ buddy = buddy2; ++ } while (1); ++ } ++ mb_check_buddy(e3b); ++ ++ return 0; ++} ++ ++static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, ++ int needed, struct ext3_free_extent *ex) ++{ ++ int next, max, ord; ++ void *buddy; ++ ++ J_ASSERT(ex != NULL); ++ ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ J_ASSERT(block < max); ++ if (mb_test_bit(block, buddy)) { ++ ex->fe_len = 0; ++ ex->fe_start = 0; ++ ex->fe_group = 0; ++ return 0; ++ } ++ ++ if (order == 0) { ++ /* find actual order */ ++ order = mb_find_order_for_block(e3b, block); ++ block = block >> order; ++ } ++ ++ ex->fe_len = 1 << order; ++ ex->fe_start = block << order; ++ ex->fe_group = e3b->bd_group; ++ ++ while ((buddy = mb_find_buddy(e3b, order, &max))) { ++ ++ if (block + 1 >= max) ++ break; ++ ++ next = (block + 1) * (1 << order); ++ if (mb_test_bit(next, EXT3_MB_BITMAP(e3b))) ++ break; ++ ++ ord 
= mb_find_order_for_block(e3b, next); ++ ++ order = ord; ++ block = next >> order; ++ ex->fe_len += 1 << order; ++ } ++ ++ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); ++ return ex->fe_len; ++} ++ ++static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) ++{ ++ int start = ex->fe_start; ++ int len = ex->fe_len; ++ int ord, mlen, max, cur; ++ int len0 = len; ++ void *buddy; ++ ++ e3b->bd_bd->bb_free -= len; ++ if (e3b->bd_bd->bb_first_free == start) ++ e3b->bd_bd->bb_first_free += len; ++ ++ while (len) { ++ ord = mb_find_order_for_block(e3b, start); ++ ++ if (((start >> ord) << ord) == start && len >= (1 << ord)) { ++ /* the whole chunk may be allocated at once! */ ++ mlen = 1 << ord; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ J_ASSERT((start >> ord) < max); ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_bd->bb_counters[ord]--; ++ start += mlen; ++ len -= mlen; ++ J_ASSERT(len >= 0); ++ continue; ++ } ++ ++ /* we have to split large buddy */ ++ J_ASSERT(ord > 0); ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_bd->bb_counters[ord]--; ++ ++ ord--; ++ cur = (start >> ord) & ~1U; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_clear_bit(cur, buddy); ++ mb_clear_bit(cur + 1, buddy); ++ e3b->bd_bd->bb_counters[ord]++; ++ e3b->bd_bd->bb_counters[ord]++; ++ } ++ ++ /* now drop all the bits in bitmap */ ++ mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); ++ ++ mb_check_buddy(e3b); ++ ++ return 0; ++} ++ ++/* ++ * Must be called under group lock! ++ */ ++static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ++ mb_mark_used(e3b, &ac->ac_b_ex); ++ ac->ac_status = AC_STATUS_FOUND; ++} ++ ++/* ++ * The routine checks whether found extent is good enough. If it is, ++ * then the extent gets marked used and flag is set to the context ++ * to stop scanning. 
Otherwise, the extent is compared with the ++ * previous found extent and if new one is better, then it's stored ++ * in the context. Later, the best found extent will be used, if ++ * mballoc can't find good enough extent. ++ * ++ * FIXME: real allocation policy is to be designed yet! ++ */ ++static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, ++ struct ext3_free_extent *ex, ++ struct ext3_buddy *e3b) ++{ ++ int factor = EXT3_SB(ac->ac_sb)->s_mb_factor; ++ struct ext3_free_extent *bex = &ac->ac_b_ex; ++ int diff = ac->ac_g_ex.fe_len - ex->fe_len; ++ ++ J_ASSERT(ex->fe_len > 0); ++ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ ++ ac->ac_found++; ++ ++ /* ++ * The special case - take what you catch first ++ */ ++ if (ac->ac_flags & EXT3_MB_HINT_FIRST) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * Let's check whether the chunk is good enough ++ */ ++ if (ex->fe_len >= ac->ac_g_ex.fe_len) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * If the request is very large, then it makes sense to use large ++ * chunks for it. Even if they don't satisfy whole request. ++ */ ++ if (ex->fe_len > 1000) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * Sometimes it's worthwhile to take a close chunk (diff > 0 here) ++ */ ++ if (factor && (ac->ac_g_ex.fe_len * 100) / (diff * 100) >= factor) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * If this is first found extent, just store it in the context ++ */ ++ if (bex->fe_len == 0) { ++ *bex = *ex; ++ return; ++ } ++ ++ /* ++ * If new found extent is better, store it in the context ++ * FIXME: possible the policy should be more complex?
++ */ ++ if (ex->fe_len > bex->fe_len) { ++ *bex = *ex; ++ } ++ ++ /* ++ * We don't want to scan for a whole year ++ */ ++ if (ac->ac_found > EXT3_MB_MAX_TO_SCAN) ++ ac->ac_status = AC_STATUS_BREAK; ++} ++ ++static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_free_extent ex = ac->ac_b_ex; ++ int group = ex.fe_group, max, err; ++ ++ J_ASSERT(ex.fe_len > 0); ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); ++ ++ if (max > 0) ++ ext3_mb_use_best_found(ac, e3b); ++ ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ if (ac->ac_status == AC_STATUS_FOUND) ++ ext3_mb_dirty_buddy(e3b); ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++ ++static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_free_extent ex; ++ ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, ++ ac->ac_g_ex.fe_len, &ex); ++ ++ if (max > 0) { ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ if (ac->ac_status == AC_STATUS_FOUND) ++ ext3_mb_dirty_buddy(e3b); ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++/* ++ * The routine scans the group and measures all found extents. ++ * In order to optimize scanning, caller must pass number of ++ * free blocks in the group, so the routine can upper limit. 
++ */ ++static void ext3_mb_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ int i, free; ++ ++ free = e3b->bd_bd->bb_free; ++ J_ASSERT(free > 0); ++ ++ i = e3b->bd_bd->bb_first_free; ++ ++ while (free && ac->ac_status != AC_STATUS_FOUND) { ++ i = ext2_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); ++ if (i >= sb->s_blocksize * 8) { ++ J_ASSERT(free == 0); ++ break; ++ } ++ ++ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(free >= ex.fe_len); ++ ++ ext3_mb_measure_extent(ac, &ex, e3b); ++ ++ i += ex.fe_len; ++ free -= ex.fe_len; ++ } ++} ++ ++static int ext3_mb_good_group(struct ext3_allocation_context *ac, ++ int group, int cr) ++{ ++ int free; ++ ++ J_ASSERT(cr >= 0 && cr < 3); ++ ++ free = EXT3_SB(ac->ac_sb)->s_buddy_blocks[group]->bb_free; ++ if (free == 0) ++ return 0; ++ ++ if (cr == 0) { ++ if (free >= ac->ac_g_ex.fe_len >> 1) ++ return 1; ++ } else if (cr == 1) { ++ if (free >= ac->ac_g_ex.fe_len >> 2) ++ return 1; ++ } else if (cr == 2) { ++ return 1; ++ } ++ return 0; ++} ++ ++int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *len, int flags, int *errp) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_allocation_context ac; ++ int i, group, block, cr, err = 0; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ struct buffer_head *gdp_bh; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ ++ J_ASSERT(len != NULL); ++ J_ASSERT(*len > 0); ++ ++ sb = inode->i_sb; ++ if (!sb) { ++ printk("ext3_mb_new_nblocks: nonexistent device"); ++ return 0; ++ } ++ ++ if (!test_opt(sb, MBALLOC)) { ++ static int ext3_mballoc_warning = 0; ++ if (ext3_mballoc_warning == 0) { ++ printk(KERN_ERR "EXT3-fs: multiblock request with " ++ "mballoc disabled!\n"); ++ ext3_mballoc_warning++; ++ } ++ 
*len = 1; ++ err = ext3_new_block_old(handle, inode, goal, errp); ++ return err; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ ++ /* ++ * We can't allocate > group size ++ */ ++ if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10) ++ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10; ++ ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* someone asks for non-reserved blocks */ ++ BUG_ON(*len > 1); ++ err = ext3_mb_reserve_blocks(sb, 1); ++ if (err) { ++ *errp = err; ++ return 0; ++ } ++ } ++ ++ /* ++ * Check quota for allocation of this blocks. ++ */ ++ while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) ++ *len -= 1; ++ if (*len == 0) { ++ *errp = -EDQUOT; ++ block = 0; ++ goto out; ++ } ++ ++ /* start searching from the goal */ ++ if (goal < le32_to_cpu(es->s_first_data_block) || ++ goal >= le32_to_cpu(es->s_blocks_count)) ++ goal = le32_to_cpu(es->s_first_data_block); ++ group = (goal - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ block = ((goal - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ /* set up allocation goals */ ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_groups_scanned = 0; ++ ac.ac_ex_scanned = 0; ++ ac.ac_found = 0; ++ ac.ac_sb = inode->i_sb; ++ ac.ac_g_ex.fe_group = group; ++ ac.ac_g_ex.fe_start = block; ++ ac.ac_g_ex.fe_len = *len; ++ ac.ac_flags = flags; ++ ++ /* ++ * Sometimes, caller may want to merge even small number ++ * of blocks to an existing extent ++ */ ++ if (ac.ac_flags & EXT3_MB_HINT_MERGE) { ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; ++ } ++ ++ /* ++ * FIXME ++ * If requested chunk is power of 2 length, we can try ++ * to exploit buddy nature to speed allocation up ++ */ ++ ++ ++ /* ++ * Let's just scan groups to find more-less suitable blocks ++ */ ++ cr = 0; ++repeat: ++ for (; 
cr < 3 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { ++ if (group == EXT3_SB(sb)->s_groups_count) ++ group = 0; ++ ++ /* check whether the group is good for our criteria */ ++ if (!ext3_mb_good_group(&ac, group, cr)) ++ continue; ++ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ++ ext3_lock_group(sb, group); ++ if (!ext3_mb_good_group(&ac, group, cr)) { ++ /* someone did allocation from this group */ ++ ext3_unlock_group(sb, group); ++ ext3_mb_release_desc(&e3b); ++ continue; ++ } ++ ++ ext3_mb_scan_group(&ac, &e3b); ++ ext3_unlock_group(sb, group); ++ ++ if (ac.ac_status == AC_STATUS_FOUND) ++ ext3_mb_dirty_buddy(&e3b); ++ ext3_mb_release_desc(&e3b); ++ ++ if (err) ++ goto out_err; ++ if (ac.ac_status != AC_STATUS_CONTINUE) ++ break; ++ } ++ } ++ ++ if (ac.ac_status == AC_STATUS_BREAK && ++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { ++ /* ++ * We've been searching too long. Let's try to allocate ++ * the best chunk we've found so far ++ */ ++ ext3_warning(inode->i_sb, __FUNCTION__, ++ "too long searching: got %d want %d\n", ++ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len); ++ ext3_mb_try_best_found(&ac, &e3b); ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * Someone more lucky has already allocated it.
++ * The only thing we can do is just take first ++ * found block(s) ++ */ ++ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n"); ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_flags |= EXT3_MB_HINT_FIRST; ++ cr = 2; ++ goto repeat; ++ } ++ } ++ ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * We aren't lucky definitely ++ */ ++ DQUOT_FREE_BLOCK(inode, *len); ++ *errp = -ENOSPC; ++ block = 0; ++#if 1 ++ printk(KERN_ERR "EXT3-fs: cant allocate: status %d, flags %d\n", ++ ac.ac_status, ac.ac_flags); ++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n", ++ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, ++ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); ++ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", ++ sbi->s_blocks_reserved, ac.ac_found); ++ printk("EXT3-fs: groups: "); ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) ++ printk("%d: %d ", i, ++ sbi->s_buddy_blocks[i]->bb_free); ++ printk("\n"); ++#endif ++ goto out; ++ } ++ ++found: ++ J_ASSERT(ac.ac_b_ex.fe_len > 0); ++ ++ /* good news - free block(s) have been found. 
now it's time ++ * to mark block(s) in good old journaled bitmap */ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ /* we made a desicion, now mark found blocks in good old ++ * bitmap to be journaled */ ++ ++ ext3_debug("using block group %d(%d)\n", ++ ac.ac_b_group.group, gdp->bg_free_blocks_count); ++ ++ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group); ++ if (!bitmap_bh) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) { ++ *errp = err; ++ goto out_err; ++ } ++ ++ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh); ++ if (!gdp) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ if (block == le32_to_cpu(gdp->bg_block_bitmap) || ++ block == le32_to_cpu(gdp->bg_inode_bitmap) || ++ in_range(block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error(sb, "ext3_new_block", ++ "Allocating block in system zone - " ++ "block = %u", block); ++#ifdef AGGRESSIVE_CHECK ++ for (i = 0; i < ac.ac_b_ex.fe_len; i++) ++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data)); ++#endif ++ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); ++ ++ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) ++ - ac.ac_b_ex.fe_len); ++ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len); ++ ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ if (err) ++ goto out_err; ++ err = ext3_journal_dirty_metadata(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ sb->s_dirt = 1; ++ *errp = 0; ++ 
brelse(bitmap_bh); ++ ++ /* drop non-allocated, but dquote'd blocks */ ++ J_ASSERT(*len >= ac.ac_b_ex.fe_len); ++ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len); ++ ++ *len = ac.ac_b_ex.fe_len; ++ J_ASSERT(*len > 0); ++ J_ASSERT(block != 0); ++ goto out; ++ ++out_err: ++ /* if we've already allocated something, roll it back */ ++ if (ac.ac_status == AC_STATUS_FOUND) { ++ /* FIXME: free blocks here */ ++ } ++ ++ DQUOT_FREE_BLOCK(inode, *len); ++ brelse(bitmap_bh); ++ *errp = err; ++ block = 0; ++out: ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* block wasn't reserved before and we reserved it ++ * at the beginning of allocation. it doesn't matter ++ * whether we allocated anything or we failed: time ++ * to release reservation. NOTE: because I expect ++ * any multiblock request from delayed allocation ++ * path only, here is single block always */ ++ ext3_mb_release_blocks(sb, 1); ++ } ++#ifdef MBALLOC_STATS ++ if (ac.ac_g_ex.fe_len > 1) { ++ spin_lock(&sbi->s_bal_lock); ++ sbi->s_bal_reqs++; ++ sbi->s_bal_allocated += *len; ++ if (*len >= ac.ac_g_ex.fe_len) ++ sbi->s_bal_success++; ++ sbi->s_bal_ex_scanned += ac.ac_found; ++ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && ++ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) ++ sbi->s_bal_goals++; ++ if (ac.ac_found > EXT3_MB_MAX_TO_SCAN) ++ sbi->s_bal_breaks++; ++ spin_unlock(&sbi->s_bal_lock); ++ } ++#endif ++ return block; ++} ++ ++int ext3_mb_get_descr_loc(struct ext3_buddy *e3b, struct buffer_head **bh, ++ struct ext3_mb_group_descr **grp) ++{ ++ struct super_block *sb = e3b->bd_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int descr_per_block, err, offset; ++ struct ext3_mb_grp_header *hdr; ++ unsigned long block; ++ ++ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) ++ / sizeof(struct ext3_mb_group_descr); ++ block = e3b->bd_group / descr_per_block; ++ *bh = ext3_bread(NULL, sbi->s_buddy, block, 0, &err); ++ if (*bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk descr for 
group %d: %d\n", ++ e3b->bd_group, err); ++ return err; ++ } ++ ++ hdr = (struct ext3_mb_grp_header *) (*bh)->b_data; ++ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { ++ printk(KERN_ERR "EXT3-fs: invalid magic in group %d!\n", ++ e3b->bd_group); ++ brelse(*bh); ++ *bh = NULL; ++ return -EIO; ++ } ++ ++ offset = e3b->bd_group % descr_per_block ++ * sizeof(struct ext3_mb_group_descr) ++ + sizeof(struct ext3_mb_grp_header); ++ *grp = (struct ext3_mb_group_descr *) ((*bh)->b_data + offset); ++ ++ return 0; ++} ++ ++int ext3_mb_load_descr(struct ext3_buddy *e3b) ++{ ++ struct ext3_mb_group_descr *grp; ++ struct ext3_group_desc *gdp; ++ struct buffer_head *bh; ++ int err, i; ++ ++ err = ext3_mb_get_descr_loc(e3b, &bh, &grp); ++ if (err) ++ return err; ++ ++ e3b->bd_bd->bb_first_free = grp->mgd_first_free; ++ e3b->bd_bd->bb_free = grp->mgd_free; ++ for (i = 0; i <= e3b->bd_blkbits + 1; i++) { ++ J_ASSERT(i < 16); ++ e3b->bd_bd->bb_counters[i] = grp->mgd_counters[i]; ++ } ++ brelse(bh); ++ ++ /* additional checks against old group descriptor */ ++ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); ++ if (!gdp) ++ return -EIO; ++ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { ++ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", ++ e3b->bd_group, e3b->bd_bd->bb_free, ++ le16_to_cpu(gdp->bg_free_blocks_count)); ++ return -ENODATA; ++ } ++ ++ return 0; ++} ++ ++ ++int ext3_mb_update_descr(struct ext3_buddy *e3b) ++{ ++ struct ext3_mb_group_descr *grp; ++ struct ext3_group_desc *gdp; ++ struct buffer_head *bh; ++ handle_t *handle; ++ int err, i; ++ ++ /* additional checks against old group descriptor */ ++ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); ++ if (!gdp) ++ return -EIO; ++ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { ++ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", ++ e3b->bd_group, e3b->bd_bd->bb_free, ++ le16_to_cpu(gdp->bg_free_blocks_count)); ++ return -ENODATA; ++ } ++ ++ 
err = ext3_mb_get_descr_loc(e3b, &bh, &grp); ++ if (err) ++ return err; ++ ++ handle = ext3_journal_start_sb(e3b->bd_sb, 1); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ handle = NULL; ++ goto out; ++ } ++ ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto out; ++ grp->mgd_first_free = e3b->bd_bd->bb_first_free; ++ grp->mgd_free = e3b->bd_bd->bb_free; ++ for (i = 0; i <= e3b->bd_blkbits + 1; i++) { ++ J_ASSERT(i < 16); ++ grp->mgd_counters[i] = e3b->bd_bd->bb_counters[i]; ++ } ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ goto out; ++ err = 0; ++out: ++ brelse(bh); ++ if (handle) ++ ext3_journal_stop(handle); ++ return err; ++} ++ ++int ext3_mb_generate_buddy(struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = e3b->bd_sb; ++ struct buffer_head *bh; ++ int i, count = 0; ++ ++ mb_debug("generate buddy for group %d\n", e3b->bd_group); ++ memset(e3b->bd_bh->b_data, 0xff, sb->s_blocksize); ++ memset(e3b->bd_bh2->b_data, 0xff, sb->s_blocksize); ++ ++ bh = read_block_bitmap(sb, e3b->bd_group); ++ if (bh == NULL) ++ return -EIO; ++ ++ /* mb_free_blocks will set real free */ ++ e3b->bd_bd->bb_free = 0; ++ e3b->bd_bd->bb_first_free = 1 << 15; ++ /* ++ * if change bb_counters size, don't forget about ++ * ext3_mb_init_backend() -bzzz ++ */ ++ memset(e3b->bd_bd->bb_counters, 0, ++ sizeof(unsigned) * (sb->s_blocksize_bits + 2)); ++ ++ /* loop over the blocks, and create buddies for free ones */ ++ for (i = 0; i < sb->s_blocksize * 8; i++) { ++ if (!mb_test_bit(i, (void *) bh->b_data)) { ++ mb_free_blocks(e3b, i, 1); ++ count++; ++ } ++ } ++ brelse(bh); ++ mb_check_buddy(e3b); ++ ext3_mb_dirty_buddy(e3b); ++ ++ return 0; ++} ++ ++EXPORT_SYMBOL(ext3_mb_new_blocks); ++ ++#define MB_CREDITS \ ++ (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \ ++ 2 * EXT3_SINGLEDATA_TRANS_BLOCKS) ++ ++int ext3_mb_init_backend(struct super_block *sb, int *created) ++{ ++ int err, i, len, descr_per_block, buddy_offset, size; ++ 
struct inode *root = sb->s_root->d_inode; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_grp_header *hdr; ++ struct buffer_head *bh = NULL; ++ unsigned long block; ++ struct dentry *db; ++ handle_t *handle; ++ tid_t target; ++ ++ *created = 0; ++ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count; ++ sbi->s_buddy_blocks = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_buddy_blocks == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); ++ return -ENOMEM; ++ } ++ memset(sbi->s_buddy_blocks, 0, len); ++ sbi->s_buddy = NULL; ++ ++ down(&root->i_sem); ++ len = strlen(EXT3_BUDDY_FILE); ++ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, len); ++ if (IS_ERR(db)) { ++ err = PTR_ERR(db); ++ printk(KERN_ERR "EXT3-fs: cant lookup buddy: %d\n", err); ++ up(&root->i_sem); ++ goto out; ++ } ++ ++ if (db->d_inode == NULL) { ++ err = ext3_create(root, db, S_IFREG, NULL); ++ if (err) { ++ printk(KERN_ERR "EXT3-fs: cant create buddy: %d\n", err); ++ up(&root->i_sem); ++ goto out; ++ } ++ db->d_inode->i_flags |= S_IMMUTABLE | S_NOATIME; ++ *created = 1; ++ mb_debug("no buddy file, regenerate\n"); ++ } ++ up(&root->i_sem); ++ sbi->s_buddy = igrab(db->d_inode); ++ ++ /* calculate needed size */ ++ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) ++ / sizeof(struct ext3_mb_group_descr); ++ buddy_offset = (sbi->s_groups_count + descr_per_block - 1) ++ / descr_per_block; ++ len = sbi->s_groups_count * sb->s_blocksize * 2 + ++ buddy_offset * sb->s_blocksize; ++ if (len != i_size_read(sbi->s_buddy)) { ++ if (*created == 0) ++ printk("EXT3-fs: wrong i_size (%u != %u), regenerate\n", ++ (unsigned) len, ++ (unsigned) i_size_read(sbi->s_buddy)); ++ *created = 1; ++ } ++ ++ /* read/create mb group descriptors */ ++ for (i = 0; i < buddy_offset; i++) { ++ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); ++ if (IS_ERR(handle)) { ++ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); ++ err = PTR_ERR(handle); ++ goto err_out; 
++ } ++ ++ bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err); ++ if (bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err); ++ goto err_out; ++ } ++ hdr = (struct ext3_mb_grp_header *) bh->b_data; ++ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto err_out; ++ if (*created == 0) ++ printk(KERN_ERR ++ "EXT3-fs: invalid header 0x%x in %d," ++ "regenerate\n", hdr->mh_magic, i); ++ *created = 1; ++ hdr->mh_magic = EXT3_MB_MAGIC_V1; ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ goto err_out; ++ } ++ brelse(bh); ++ ext3_journal_stop(handle); ++ } ++ ++ /* ++ * if change bb_counters size, don't forget about ext3_mb_generate_buddy() ++ */ ++ len = sizeof(struct ext3_buddy_group_blocks); ++ len += sizeof(unsigned) * (sb->s_blocksize_bits + 2); ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ ++ sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_buddy_blocks[i] == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); ++ err = -ENOMEM; ++ goto out2; ++ } ++ memset(sbi->s_buddy_blocks[i], 0, len); ++ ++ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); ++ if (IS_ERR(handle)) { ++ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); ++ err = PTR_ERR(handle); ++ goto out2; ++ } ++ ++ /* allocate block for bitmap */ ++ block = buddy_offset + i * 2; ++ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); ++ if (bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", err); ++ goto out2; ++ } ++ sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr; ++ brelse(bh); ++ ++ /* allocate block for buddy */ ++ block = buddy_offset + i * 2 + 1; ++ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); ++ if (bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err); ++ goto out2; ++ } ++ sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr; ++ brelse(bh); ++ ++ size = (block + 1) << sbi->s_buddy->i_blkbits; ++ if (size > 
sbi->s_buddy->i_size) { ++ *created = 1; ++ EXT3_I(sbi->s_buddy)->i_disksize = size; ++ i_size_write(sbi->s_buddy, size); ++ mark_inode_dirty(sbi->s_buddy); ++ } ++ ext3_journal_stop(handle); ++ ++ spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock); ++ sbi->s_buddy_blocks[i]->bb_md_cur = NULL; ++ sbi->s_buddy_blocks[i]->bb_tid = 0; ++ } ++ ++ if (journal_start_commit(sbi->s_journal, &target)) ++ log_wait_commit(sbi->s_journal, target); ++ ++out2: ++ dput(db); ++out: ++ return err; ++ ++err_out: ++ return err; ++} ++ ++int ext3_mb_write_descriptors(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_buddy e3b; ++ int ret = 0, i, err; ++ ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ if (sbi->s_buddy_blocks[i] == NULL) ++ continue; ++ ++ err = ext3_mb_load_buddy(sb, i, &e3b); ++ if (err == 0) { ++ ext3_mb_update_descr(&e3b); ++ ext3_mb_release_desc(&e3b); ++ } else ++ ret = err; ++ } ++ return ret; ++} ++ ++int ext3_mb_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int i; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ /* release freed, non-committed blocks */ ++ spin_lock(&sbi->s_md_lock); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_committed_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ ext3_mb_free_committed_blocks(sb); ++ ++ if (sbi->s_buddy_blocks) { ++ ext3_mb_write_descriptors(sb); ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ if (sbi->s_buddy_blocks[i] == NULL) ++ continue; ++ kfree(sbi->s_buddy_blocks[i]); ++ } ++ kfree(sbi->s_buddy_blocks); ++ } ++ if (sbi->s_buddy) ++ iput(sbi->s_buddy); ++ if (sbi->s_blocks_reserved) ++ printk("ext3-fs: %ld blocks being reserved at umount!\n", ++ sbi->s_blocks_reserved); ++#ifdef MBALLOC_STATS ++ printk("EXT3-fs: mballoc: %lu blocks %lu reqs (%lu success)\n", ++ sbi->s_bal_allocated, sbi->s_bal_reqs, sbi->s_bal_success); ++ printk("EXT3-fs: 
mballoc: %lu extents scanned, %lu goal hits, %lu breaks\n", ++ sbi->s_bal_ex_scanned, sbi->s_bal_goals, sbi->s_bal_breaks); ++#endif ++ return 0; ++} ++ ++int ext3_mb_init(struct super_block *sb, int needs_recovery) ++{ ++ struct ext3_buddy e3b; ++ int i, err, created; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ /* init file for buddy data */ ++ clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); ++ if ((err = ext3_mb_init_backend(sb, &created))) ++ return err; ++ ++repeat: ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { ++ err = ext3_mb_load_buddy(sb, i, &e3b); ++ if (err) { ++ /* FIXME: release backend */ ++ return err; ++ } ++ if (created || needs_recovery) ++ ext3_mb_generate_buddy(&e3b); ++ else ++ err = ext3_mb_load_descr(&e3b); ++ ext3_mb_release_desc(&e3b); ++ if (err == -ENODATA) { ++ created = 1; ++ goto repeat; ++ } ++ } ++ if (created || needs_recovery) ++ printk(KERN_ERR "EXT3-fs: generated buddies for %lu groups\n", ++ EXT3_SB(sb)->s_groups_count); ++ spin_lock_init(&EXT3_SB(sb)->s_reserve_lock); ++ spin_lock_init(&EXT3_SB(sb)->s_md_lock); ++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction); ++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction); ++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction); ++ set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); ++ ++#ifdef MBALLOC_STATS ++ spin_lock_init(&EXT3_SB(sb)->s_bal_lock); ++#define MBALLOC_INFO " (stats)" ++#else ++#define MBALLOC_INFO "" ++#endif ++ printk("EXT3-fs: mballoc enabled%s\n", MBALLOC_INFO); ++ return 0; ++} ++ ++void ext3_mb_free_committed_blocks(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int err, i, count = 0, count2 = 0; ++ struct ext3_free_metadata *md; ++ struct ext3_buddy e3b; ++ ++ if (list_empty(&sbi->s_committed_transaction)) ++ return; ++ ++ /* there is committed blocks to be freed yet */ ++ do { ++ /* get next array of blocks */ ++ md = NULL; ++ spin_lock(&sbi->s_md_lock); ++ if (!list_empty(&sbi->s_committed_transaction)) { ++ md = 
list_entry(sbi->s_committed_transaction.next, ++ struct ext3_free_metadata, list); ++ list_del(&md->list); ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ if (md == NULL) ++ break; ++ ++ mb_debug("gonna free %u blocks in group %u (0x%p):", ++ md->num, md->group, md); ++ ++ err = ext3_mb_load_buddy(sb, md->group, &e3b); ++ BUG_ON(err != 0); ++ ++ /* there are blocks to put in buddy to make them really free */ ++ count += md->num; ++ count2++; ++ ext3_lock_group(sb, md->group); ++ for (i = 0; i < md->num; i++) { ++ mb_debug(" %u", md->blocks[i]); ++ mb_free_blocks(&e3b, md->blocks[i], 1); ++ } ++ mb_debug("\n"); ++ ext3_unlock_group(sb, md->group); ++ ++ kfree(md); ++ ext3_mb_dirty_buddy(&e3b); ++ ext3_mb_release_desc(&e3b); ++ ++ } while (md); ++ mb_debug("freed %u blocks in %u structures\n", count, count2); ++} ++ ++void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ if (sbi->s_last_transaction == handle->h_transaction->t_tid) ++ return; ++ ++ /* new transaction! time to close last one and free blocks for ++ * committed transaction. we know that only transaction can be ++ * active, so previos transaction can be being logged and we ++ * know that transaction before previous is known to be alreade ++ * logged. this means that now we may free blocks freed in all ++ * transactions before previous one. hope I'm clear enough ... 
*/ ++ ++ spin_lock(&sbi->s_md_lock); ++ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { ++ mb_debug("new transaction %lu, old %lu\n", ++ (unsigned long) handle->h_transaction->t_tid, ++ (unsigned long) sbi->s_last_transaction); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_closed_transaction); ++ sbi->s_last_transaction = handle->h_transaction->t_tid; ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ ext3_mb_free_committed_blocks(sb); ++} ++ ++int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, ++ int group, int block, int count) ++{ ++ struct ext3_buddy_group_blocks *db = e3b->bd_bd; ++ struct super_block *sb = e3b->bd_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_free_metadata *md; ++ int i; ++ ++ ext3_lock_group(sb, group); ++ for (i = 0; i < count; i++) { ++ md = db->bb_md_cur; ++ if (md && db->bb_tid != handle->h_transaction->t_tid) { ++ db->bb_md_cur = NULL; ++ md = NULL; ++ } ++ ++ if (md == NULL) { ++ ext3_unlock_group(sb, group); ++ md = kmalloc(sizeof(*md), GFP_KERNEL); ++ if (md == NULL) ++ return -ENOMEM; ++ md->num = 0; ++ md->group = group; ++ ++ ext3_lock_group(sb, group); ++ if (db->bb_md_cur == NULL) { ++ spin_lock(&sbi->s_md_lock); ++ list_add(&md->list, &sbi->s_active_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ db->bb_md_cur = md; ++ db->bb_tid = handle->h_transaction->t_tid; ++ mb_debug("new md 0x%p for group %u\n", ++ md, md->group); ++ } else { ++ kfree(md); ++ md = db->bb_md_cur; ++ } ++ } ++ ++ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); ++ md->blocks[md->num] = block + i; ++ md->num++; ++ if (md->num == EXT3_BB_MAX_BLOCKS) { ++ /* no more space, put full container on a sb's list */ ++ db->bb_md_cur = NULL; ++ } ++ } ++ ext3_unlock_group(sb, group); ++ return 0; ++} ++ ++void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, ++ unsigned long block, unsigned long count, ++ int metadata, int *freed) 
++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ unsigned long bit, overflow; ++ struct buffer_head *gd_bh; ++ unsigned long block_group; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ int err = 0, ret; ++ ++ *freed = 0; ++ sb = inode->i_sb; ++ if (!sb) { ++ printk ("ext3_free_blocks: nonexistent device"); ++ return; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ if (block < le32_to_cpu(es->s_first_data_block) || ++ block + count < block || ++ block + count > le32_to_cpu(es->s_blocks_count)) { ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks not in datazone - " ++ "block = %lu, count = %lu", block, count); ++ goto error_return; ++ } ++ ++ ext3_debug("freeing block %lu\n", block); ++ ++do_more: ++ overflow = 0; ++ block_group = (block - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ bit = (block - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb); ++ /* ++ * Check to see if we are freeing blocks across a group ++ * boundary. 
++ */ ++ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { ++ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); ++ count -= overflow; ++ } ++ brelse(bitmap_bh); ++ bitmap_bh = read_block_bitmap(sb, block_group); ++ if (!bitmap_bh) ++ goto error_return; ++ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); ++ if (!gdp) ++ goto error_return; ++ ++ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || ++ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || ++ in_range (block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group) || ++ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks in system zones - " ++ "Block = %lu, count = %lu", ++ block, count); ++ ++ BUFFER_TRACE(bitmap_bh, "getting write access"); ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) ++ goto error_return; ++ ++ /* ++ * We are about to modify some metadata. Call the journal APIs ++ * to unshare ->b_data if a currently-committing transaction is ++ * using it ++ */ ++ BUFFER_TRACE(gd_bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, gd_bh); ++ if (err) ++ goto error_return; ++ ++ err = ext3_mb_load_buddy(sb, block_group, &e3b); ++ if (err) ++ goto error_return; ++ ++#ifdef AGGRESSIVE_CHECK ++ { ++ int i; ++ for (i = 0; i < count; i++) ++ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); ++ } ++#endif ++ mb_clear_bits(bitmap_bh->b_data, bit, count); ++ ++ /* We dirtied the bitmap block */ ++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ ++ if (metadata) { ++ /* blocks being freed are metadata. 
these blocks shouldn't ++ * be used until this transaction is committed */ ++ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); ++ } else { ++ ext3_lock_group(sb, block_group); ++ mb_free_blocks(&e3b, bit, count); ++ ext3_unlock_group(sb, block_group); ++ } ++ ++ spin_lock(sb_bgl_lock(sbi, block_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); ++ spin_unlock(sb_bgl_lock(sbi, block_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, count); ++ ++ ext3_mb_dirty_buddy(&e3b); ++ ext3_mb_release_desc(&e3b); ++ ++ *freed += count; /* accumulate: do_more runs once per group crossed */ ++ ++ /* And the group descriptor block */ ++ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); ++ ret = ext3_journal_dirty_metadata(handle, gd_bh); ++ if (!err) err = ret; ++ ++ if (overflow && !err) { ++ block += count; ++ count = overflow; ++ goto do_more; ++ } ++ sb->s_dirt = 1; ++error_return: ++ brelse(bitmap_bh); ++ ext3_std_error(sb, err); ++ return; ++} ++ ++int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int free, ret = -ENOSPC; ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ free = percpu_counter_read_positive(&sbi->s_freeblocks_counter); ++ if (blocks <= free - sbi->s_blocks_reserved) { ++ sbi->s_blocks_reserved += blocks; ++ ret = 0; ++ } ++ spin_unlock(&sbi->s_reserve_lock); ++ return ret; ++} ++ ++void ext3_mb_release_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ sbi->s_blocks_reserved -= blocks; ++ WARN_ON(sbi->s_blocks_reserved < 0); ++ if (sbi->s_blocks_reserved < 0) ++ sbi->s_blocks_reserved = 0; ++ spin_unlock(&sbi->s_reserve_lock); ++} ++ ++int ext3_new_block(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *errp) ++{ ++ int ret, len; ++ ++ if (!test_opt(inode->i_sb, MBALLOC)) { ++ ret = ext3_new_block_old(handle, inode, goal, errp); ++ goto
out; ++ } ++ len = 1; ++ ret = ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); ++out: ++ return ret; ++} ++ ++void ext3_free_blocks(handle_t *handle, struct inode * inode, ++ unsigned long block, unsigned long count, int metadata) ++{ ++ struct super_block *sb; ++ int freed; ++ ++ sb = inode->i_sb; ++ if (!test_opt(sb, MBALLOC)) ++ ext3_free_blocks_sb(handle, sb, block, count, &freed); ++ else ++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); ++ if (freed) ++ DQUOT_FREE_BLOCK(inode, freed); ++ return; ++} +Index: linux-2.6.10/fs/ext3/super.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/super.c 2005-02-25 17:27:00.231757312 +0200 ++++ linux-2.6.10/fs/ext3/super.c 2005-02-25 17:28:41.862307120 +0200 +@@ -394,6 +394,7 @@ + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_mb_release(sb); + ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -592,7 +593,7 @@ + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_pdirops, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, +- Opt_extents, Opt_extdebug, ++ Opt_extents, Opt_extdebug, Opt_mballoc, Opt_mbfactor, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + }; + +@@ -646,6 +647,8 @@ + {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, + {Opt_extdebug, "extdebug"}, ++ {Opt_mballoc, "mballoc"}, ++ {Opt_mbfactor, "mbfactor=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, + }; +@@ -956,6 +959,16 @@ + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; ++ case Opt_mballoc: ++ set_opt (sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_mbfactor: ++ if (match_int(&args[0], &option)) ++ return 0; ++ if (option < 0) ++ return 0; ++ sbi->s_mb_factor = option; ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1639,8 +1652,9 @@ + percpu_counter_mod(&sbi->s_dirs_counter, +
ext3_count_dirs(sb)); + + ext3_ext_init(sb); ++ ext3_mb_init(sb, needs_recovery); + + return 0; + + cantfind_ext3: +Index: linux-2.6.10/fs/ext3/Makefile +=================================================================== +--- linux-2.6.10.orig/fs/ext3/Makefile 2005-02-25 17:27:00.228757768 +0200 ++++ linux-2.6.10/fs/ext3/Makefile 2005-02-25 17:28:41.863306968 +0200 +@@ -5,7 +5,7 @@ + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o iopen.o \ +- extents.o ++ extents.o mballoc.o + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o extents-in-ea.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o + ext3-$(CONFIG_EXT3_FS_SECURITY) += xattr_security.o +Index: linux-2.6.10/fs/ext3/balloc.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/balloc.c 2005-02-25 17:26:58.965949744 +0200 ++++ linux-2.6.10/fs/ext3/balloc.c 2005-02-25 17:28:41.865306664 +0200 +@@ -79,7 +79,7 @@ + * + * Return buffer_head on success or NULL in case of failure. + */ +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) + { + struct ext3_group_desc * desc; +@@ -450,24 +450,6 @@ + return; + } + +-/* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, +- unsigned long block, unsigned long count) +-{ +- struct super_block * sb; +- int dquot_freed_blocks; +- +- sb = inode->i_sb; +- if (!sb) { +- printk ("ext3_free_blocks: nonexistent device"); +- return; +- } +- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); +- if (dquot_freed_blocks) +- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); +- return; +-} +- + /* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This +@@ -1140,7 +1122,7 @@ + * bitmap, and then for any free bit if that fails. 
+ * This function also updates quota and i_blocks field. + */ +-int ext3_new_block(handle_t *handle, struct inode *inode, ++int ext3_new_block_old(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) + { + struct buffer_head *bitmap_bh = NULL; +Index: linux-2.6.10/fs/ext3/namei.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/namei.c 2005-02-25 17:26:59.527864320 +0200 ++++ linux-2.6.10/fs/ext3/namei.c 2005-02-25 17:28:41.867306360 +0200 +@@ -1639,7 +1639,7 @@ + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +-static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, ++int ext3_create (struct inode * dir, struct dentry * dentry, int mode, + struct nameidata *nd) + { + handle_t *handle; +Index: linux-2.6.10/fs/ext3/inode.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/inode.c 2005-02-25 17:27:00.227757920 +0200 ++++ linux-2.6.10/fs/ext3/inode.c 2005-02-25 17:28:41.872305600 +0200 +@@ -572,7 +572,7 @@ + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) +- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); + return err; + } + +@@ -673,7 +673,7 @@ + if (err == -EAGAIN) + for (i = 0; i < num; i++) + ext3_free_blocks(handle, inode, +- le32_to_cpu(where[i].key), 1); ++ le32_to_cpu(where[i].key), 1, 1); + return err; + } + +@@ -1831,7 +1831,7 @@ + } + } + +- ext3_free_blocks(handle, inode, block_to_free, count); ++ ext3_free_blocks(handle, inode, block_to_free, count, 1); + } + + /** +@@ -2004,7 +2004,7 @@ + ext3_journal_test_restart(handle, inode); + } + +- ext3_free_blocks(handle, inode, nr, 1); ++ ext3_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* +Index: linux-2.6.10/fs/ext3/extents.c +=================================================================== +--- 
linux-2.6.10.orig/fs/ext3/extents.c 2005-02-25 17:27:00.222758680 +0200 ++++ linux-2.6.10/fs/ext3/extents.c 2005-02-25 17:29:29.364085752 +0200 +@@ -740,7 +740,7 @@ + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; +- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); + } + } + kfree(ablocks); +@@ -1391,7 +1391,7 @@ + path->p_idx->ei_leaf); + bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); + ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); +- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); + return err; + } + +@@ -1879,10 +1879,12 @@ + int needed = ext3_remove_blocks_credits(tree, ex, from, to); + handle_t *handle = ext3_journal_start(tree->inode, needed); + struct buffer_head *bh; +- int i; ++ int i, metadata = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); ++ if (S_ISDIR(tree->inode->i_mode)) ++ metadata = 1; + if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { + /* tail removal */ + unsigned long num, start; +@@ -1894,7 +1896,7 @@ + bh = sb_find_get_block(tree->inode->i_sb, start + i); + ext3_forget(handle, 0, tree->inode, bh, start + i); + } +- ext3_free_blocks(handle, tree->inode, start, num); ++ ext3_free_blocks(handle, tree->inode, start, num, metadata); + } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { + printk("strange request: removal %lu-%lu from %u:%u\n", + from, to, ex->ee_block, ex->ee_len); +Index: linux-2.6.10/fs/ext3/xattr.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/xattr.c 2005-02-25 17:26:59.876811272 +0200 ++++ linux-2.6.10/fs/ext3/xattr.c 2005-02-25 17:28:41.878304688 +0200 +@@ -1271,7 +1271,7 @@ + new_bh = sb_getblk(sb, block); + if (!new_bh) { + getblk_failed: +- ext3_free_blocks(handle, inode, block, 1); ++ ext3_free_blocks(handle, inode, block, 1, 
1); + error = -EIO; + goto cleanup; + } +@@ -1318,7 +1318,7 @@ + if (ce) + mb_cache_entry_free(ce); + ea_bdebug(old_bh, "freeing"); +- ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1); ++ ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1, 1); + + /* ext3_forget() calls bforget() for us, but we + let our caller release old_bh, so we need to +@@ -1417,7 +1417,7 @@ + if (HDR(bh)->h_refcount == cpu_to_le32(1)) { + if (ce) + mb_cache_entry_free(ce); +- ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1); ++ ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1, 1); + get_bh(bh); + ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl); + } else { +Index: linux-2.6.10/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs.h 2005-02-25 17:27:00.234756856 +0200 ++++ linux-2.6.10/include/linux/ext3_fs.h 2005-02-25 17:28:41.881304232 +0200 +@@ -57,6 +57,14 @@ + #define ext3_debug(f, a...) 
do {} while (0) + #endif + ++#define EXT3_MULTIBLOCK_ALLOCATOR 1 ++ ++#define EXT3_MB_HINT_MERGE 1 ++#define EXT3_MB_HINT_RESERVED 2 ++#define EXT3_MB_HINT_METADATA 4 ++#define EXT3_MB_HINT_FIRST 8 ++#define EXT3_MB_HINT_BEST 16 ++ + /* + * Special inodes numbers + */ +@@ -365,6 +373,7 @@ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x100000 /* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x200000 /* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -725,7 +734,7 @@ + extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); + extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); + extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, +- unsigned long); ++ unsigned long, int); + extern void ext3_free_blocks_sb (handle_t *, struct super_block *, + unsigned long, unsigned long, int *); + extern unsigned long ext3_count_free_blocks (struct super_block *); +@@ -856,6 +865,37 @@ + extern struct inode_operations ext3_symlink_inode_operations; + extern struct inode_operations ext3_fast_symlink_inode_operations; + ++/* mballoc.c */ ++extern int ext3_mb_init(struct super_block *, int); ++extern int ext3_mb_release(struct super_block *); ++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); ++extern int ext3_mb_reserve_blocks(struct super_block *, int); ++extern void ext3_mb_release_blocks(struct super_block *, int); ++ ++/* writeback.c */ ++extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); ++extern int ext3_wb_prepare_write(struct file *file, struct page *page, ++ unsigned from, unsigned to); ++extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); ++extern int ext3_wb_writepage(struct page *, struct 
writeback_control *); ++extern int ext3_wb_invalidatepage(struct page *, unsigned long); ++extern int ext3_wb_releasepage(struct page *, int); ++extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); ++extern void ext3_wb_init(struct super_block *); ++extern void ext3_wb_release(struct super_block *); ++ ++/* writeback.c */ ++extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); ++extern int ext3_wb_prepare_write(struct file *file, struct page *page, ++ unsigned from, unsigned to); ++extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); ++extern int ext3_wb_writepage(struct page *, struct writeback_control *); ++extern int ext3_wb_invalidatepage(struct page *, unsigned long); ++extern int ext3_wb_releasepage(struct page *, int); ++extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); ++extern void ext3_wb_init(struct super_block *); ++extern void ext3_wb_release(struct super_block *); ++ + /* extents.c */ + extern int ext3_ext_writepage_trans_blocks(struct inode *, int); + extern int ext3_ext_get_block(handle_t *, struct inode *, long, +Index: linux-2.6.10/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs_sb.h 2005-02-25 17:26:59.641846992 +0200 ++++ linux-2.6.10/include/linux/ext3_fs_sb.h 2005-02-25 17:28:41.882304080 +0200 +@@ -23,10 +23,30 @@ + #define EXT_INCLUDE + #include + #include ++#include + #endif + #endif + #include + ++#define EXT3_BB_MAX_BLOCKS 30 ++struct ext3_free_metadata { ++ unsigned short group; ++ unsigned short num; ++ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; ++ struct list_head list; ++}; ++ ++struct ext3_buddy_group_blocks { ++ __u32 bb_bitmap; ++ __u32 bb_buddy; ++ spinlock_t bb_lock; ++ unsigned long bb_tid; ++ struct ext3_free_metadata *bb_md_cur; ++ unsigned short bb_first_free; ++ unsigned short 
bb_free; ++ unsigned bb_counters[]; ++}; ++ + /* + * third extended-fs super-block data in memory + */ +@@ -81,6 +101,27 @@ + int s_jquota_fmt; /* Format of quota to use */ + #endif + u32 s_mdsnum; ++ ++ /* for buddy allocator */ ++ struct ext3_buddy_group_blocks **s_buddy_blocks; ++ struct inode *s_buddy; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct list_head s_committed_transaction; ++ spinlock_t s_md_lock; ++ tid_t s_last_transaction; ++ int s_mb_factor; ++ ++ /* stats for buddy allocator */ ++ spinlock_t s_bal_lock; ++ unsigned long s_bal_reqs; /* number of reqs with len > 1 */ ++ unsigned long s_bal_success; /* we found long enough chunks */ ++ unsigned long s_bal_allocated; /* in blocks */ ++ unsigned long s_bal_ex_scanned; /* total extents scanned */ ++ unsigned long s_bal_goals; /* goal hits */ ++ unsigned long s_bal_breaks; /* too long searches */ + }; + + #endif /* _LINUX_EXT3_FS_SB */ diff --git a/lustre/kernel_patches/series/ldiskfs-2.6-fc3.series b/lustre/kernel_patches/series/ldiskfs-2.6-fc3.series index 9476f8a..3e96555 100644 --- a/lustre/kernel_patches/series/ldiskfs-2.6-fc3.series +++ b/lustre/kernel_patches/series/ldiskfs-2.6-fc3.series @@ -16,3 +16,4 @@ ext3-mds-num-2.6.10-fc3.patch ext3-fid-2.6.7.patch ext3-raw-lookup-2.6.10.patch ext3-disable-reservation-2.6.10-fc3.patch +ext3-mballoc2-2.6.10-fc3.patch diff --git a/lustre/ldiskfs/Makefile.in b/lustre/ldiskfs/Makefile.in index 80d9efb..acf0b20 100644 --- a/lustre/ldiskfs/Makefile.in +++ b/lustre/ldiskfs/Makefile.in @@ -9,7 +9,7 @@ ext3_headers := $(wildcard @LINUX@/fs/ext3/*.h) linux_headers := $(wildcard @LINUX@/include/linux/ext3*.h) new_linux_hearders := ext3_extents.h ext3_sources := $(filter-out %.mod.c,$(wildcard @LINUX@/fs/ext3/*.c)) -new_sources := iopen.c iopen.h extents.c extents-in-ea.c +new_sources := iopen.c iopen.h extents.c extents-in-ea.c mballoc.c ldiskfs_sources := 
$(notdir $(ext3_sources) $(ext3_headers)) $(new_sources) ldiskfs-objs := $(filter %.o,$(ldiskfs_sources:.c=.o)) diff --git a/lustre/utils/lconf b/lustre/utils/lconf index 59c517e..08cd122 100755 --- a/lustre/utils/lconf +++ b/lustre/utils/lconf @@ -1053,6 +1053,8 @@ def def_mount_options(fstype, target): mountfsoptions = "errors=remount-ro" if target == 'ost' and sys_get_branch() == '2.4': mountfsoptions = "%s,asyncdel" % (mountfsoptions) + if target == 'ost' and sys_get_branch() == '2.6': + mountfsoptions = "%s,extents,mballoc" % (mountfsoptions) return mountfsoptions return "" -- 1.8.3.1