===================================================================
--- linux-stage.orig/fs/ext3/mballoc.c 2005-02-25 17:28:41.836311072 +0200
+++ linux-stage/fs/ext3/mballoc.c 2005-02-25 17:28:41.859307576 +0200
-@@ -0,0 +1,1847 @@
+@@ -0,0 +1,1860 @@
+/*
+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+
+/*
+ * TODO:
++ * - bitmap/buddy read-ahead (proposed by Oleg Drokin aka green)
+ * - track min/max extents in each group for better group selection
+ * - is it worthwhile to use buddies directly if req is 2^N blocks?
+ * - mb_mark_used() may allocate chunk right after splitting buddy
+ __u32 mh_magic;
+};
+
-+#define EXT3_MB_MAGIC_V1 0xbaad16fc
++#define EXT3_MB_MAGIC_V1 0xbabd16fd
+
+
+struct ext3_free_extent {
+void ext3_mb_poll_new_transaction(struct super_block *, handle_t *);
+void ext3_mb_free_committed_blocks(struct super_block *);
+
-+#define mb_correct_addr_and_bit(bit,addr) \
-+{ \
-+ if ((unsigned long)addr & 1) { \
-+ bit += 8; \
-+ addr--; \
-+ } \
-+ if ((unsigned long)addr & 2) { \
-+ bit += 16; \
-+ addr--; \
-+ addr--; \
-+ } \
++#if BITS_PER_LONG == 64
++#define mb_correct_addr_and_bit(bit,addr) \
++{ \
++ bit += ((unsigned long) addr & 7UL) << 3; \
++ addr = (void *) ((unsigned long) addr & ~7UL); \
+}
++#elif BITS_PER_LONG == 32
++#define mb_correct_addr_and_bit(bit,addr) \
++{ \
++ bit += ((unsigned long) addr & 3UL) << 3; \
++ addr = (void *) ((unsigned long) addr & ~3UL); \
++}
++#else
++#error "how many bits you are?!"
++#endif
+
+static inline int mb_test_bit(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ return test_bit(bit, addr);
++ return ext2_test_bit(bit, addr);
+}
+
+static inline void mb_set_bit(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ __set_bit(bit, addr);
++ ext2_set_bit(bit, addr);
+}
+
+static inline void mb_set_bit_atomic(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ set_bit(bit, addr);
++ ext2_set_bit_atomic(NULL, bit, addr);
+}
+
+static inline void mb_clear_bit(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ __clear_bit(bit, addr);
++ ext2_clear_bit(bit, addr);
+}
+
+static inline void mb_clear_bit_atomic(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ clear_bit(bit, addr);
++ ext2_clear_bit_atomic(NULL, bit, addr);
+}
+
+static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max)
+ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
+ J_ASSERT(max != NULL);
+
-+ if (order > e3b->bd_blkbits + 1)
++ if (order > e3b->bd_blkbits + 1) {
++ *max = 0;
+ return NULL;
++ }
+
+ /* at order 0 we see each particular block */
+ *max = 1 << (e3b->bd_blkbits + 3);
+ "can't get block for buddy bitmap\n");
+ goto out;
+ }
-+ if (!buffer_uptodate(e3b->bd_bh)) {
-+ ll_rw_block(READ, 1, &e3b->bd_bh);
-+ wait_on_buffer(e3b->bd_bh);
-+ }
-+ J_ASSERT(buffer_uptodate(e3b->bd_bh));
-+
+ /* load buddy */
+ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy);
+ if (e3b->bd_bh2 == NULL) {
+ "can't get block for buddy bitmap\n");
+ goto out;
+ }
-+ if (!buffer_uptodate(e3b->bd_bh2)) {
++
++ if (!buffer_uptodate(e3b->bd_bh))
++ ll_rw_block(READ, 1, &e3b->bd_bh);
++ if (!buffer_uptodate(e3b->bd_bh2))
+ ll_rw_block(READ, 1, &e3b->bd_bh2);
-+ wait_on_buffer(e3b->bd_bh2);
-+ }
++
++ wait_on_buffer(e3b->bd_bh);
++ J_ASSERT(buffer_uptodate(e3b->bd_bh));
++ wait_on_buffer(e3b->bd_bh2);
+ J_ASSERT(buffer_uptodate(e3b->bd_bh2));
+
+ e3b->bd_blkbits = sb->s_blocksize_bits;
+ count = 0;
+ for (i = 0; i < max; i++) {
+
-+ if (!mb_test_bit(i, buddy)) {
++ if (mb_test_bit(i, buddy)) {
-+ /* only single bit in buddy2 may be 1 */
++ /* only single bit in buddy2 may be 0 */
-+ if (mb_test_bit(i << 1, buddy2))
-+ J_ASSERT(!mb_test_bit((i<<1)+1, buddy2));
-+ else if (mb_test_bit((i << 1) + 1, buddy2))
-+ J_ASSERT(!mb_test_bit(i << 1, buddy2));
++ if (!mb_test_bit(i << 1, buddy2))
++ J_ASSERT(mb_test_bit((i<<1)+1, buddy2));
++ else if (!mb_test_bit((i << 1) + 1, buddy2))
++ J_ASSERT(mb_test_bit(i << 1, buddy2));
+ continue;
+ }
+
-+ /* both bits in buddy2 must be 0 */
++ /* both bits in buddy2 must be 1 */
-+ J_ASSERT(!mb_test_bit(i << 1, buddy2));
-+ J_ASSERT(!mb_test_bit((i << 1) + 1, buddy2));
++ J_ASSERT(mb_test_bit(i << 1, buddy2));
++ J_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
+
+ for (j = 0; j < (1 << order); j++) {
+ k = (i * (1 << order)) + j;
-+ J_ASSERT(mb_test_bit(k, EXT3_MB_BITMAP(e3b)));
++ J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b)));
+ }
+ count++;
+ }
+
+ buddy = mb_find_buddy(e3b, 0, &max);
+ for (i = 0; i < max; i++) {
-+ if (mb_test_bit(i, buddy))
++ if (!mb_test_bit(i, buddy))
+ continue;
+ /* check used bits only */
+ for (j = 0; j < e3b->bd_blkbits + 1; j++) {
+ buddy2 = mb_find_buddy(e3b, j, &max2);
+ k = i >> j;
+ J_ASSERT(k < max2);
-+ J_ASSERT(!mb_test_bit(k, buddy2));
++ J_ASSERT(mb_test_bit(k, buddy2));
+ }
+ }
+}
+ bb = EXT3_MB_BUDDY(e3b);
+ while (order <= e3b->bd_blkbits + 1) {
+ block = block >> 1;
-+ if (mb_test_bit(block, bb)) {
++ if (!mb_test_bit(block, bb)) {
+ /* this block is part of buddy of order 'order' */
+ return order;
+ }
+ block = first++;
+ order = 0;
+
-+ J_ASSERT(!mb_test_bit(block, EXT3_MB_BITMAP(e3b)));
-+ mb_set_bit(block, EXT3_MB_BITMAP(e3b));
++ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b)));
++ mb_clear_bit(block, EXT3_MB_BITMAP(e3b));
+ e3b->bd_bd->bb_counters[order]++;
+
+ /* start of the buddy */
+
+ do {
+ block &= ~1UL;
-+ if (!mb_test_bit(block, buddy) ||
-+ !mb_test_bit(block + 1, buddy))
++ if (mb_test_bit(block, buddy) ||
++ mb_test_bit(block + 1, buddy))
+ break;
+
+ /* both the buddies are free, try to coalesce them */
+ break;
+
+ if (order > 0) {
-+ /* for special purposes, we don't clear
++ /* for special purposes, we don't set
+ * free bits in bitmap */
-+ mb_clear_bit(block, buddy);
-+ mb_clear_bit(block + 1, buddy);
++ mb_set_bit(block, buddy);
++ mb_set_bit(block + 1, buddy);
+ }
+ e3b->bd_bd->bb_counters[order]--;
+ e3b->bd_bd->bb_counters[order]--;
+ order++;
+ e3b->bd_bd->bb_counters[order]++;
+
-+ mb_set_bit(block, buddy2);
++ mb_clear_bit(block, buddy2);
+ buddy = buddy2;
+ } while (1);
+ }
+ buddy = mb_find_buddy(e3b, order, &max);
+ J_ASSERT(buddy);
+ J_ASSERT(block < max);
-+ if (!mb_test_bit(block, buddy)) {
++ if (mb_test_bit(block, buddy)) {
+ ex->fe_len = 0;
+ ex->fe_start = 0;
+ ex->fe_group = 0;
+ break;
+
+ next = (block + 1) * (1 << order);
-+ if (!mb_test_bit(next, EXT3_MB_BITMAP(e3b)))
++ if (mb_test_bit(next, EXT3_MB_BITMAP(e3b)))
+ break;
+
+ ord = mb_find_order_for_block(e3b, next);
+ mlen = 1 << ord;
+ buddy = mb_find_buddy(e3b, ord, &max);
+ J_ASSERT((start >> ord) < max);
-+ mb_clear_bit(start >> ord, buddy);
++ mb_set_bit(start >> ord, buddy);
+ e3b->bd_bd->bb_counters[ord]--;
+ start += mlen;
+ len -= mlen;
+ /* we have to split large buddy */
+ J_ASSERT(ord > 0);
+ buddy = mb_find_buddy(e3b, ord, &max);
-+ mb_clear_bit(start >> ord, buddy);
++ mb_set_bit(start >> ord, buddy);
+ e3b->bd_bd->bb_counters[ord]--;
+
+ ord--;
+ cur = (start >> ord) & ~1U;
+ buddy = mb_find_buddy(e3b, ord, &max);
-+ mb_set_bit(cur, buddy);
-+ mb_set_bit(cur + 1, buddy);
++ mb_clear_bit(cur, buddy);
++ mb_clear_bit(cur + 1, buddy);
+ e3b->bd_bd->bb_counters[ord]++;
+ e3b->bd_bd->bb_counters[ord]++;
+ }
+
-+ /* now drop all the bits in bitmap */
++ /* now set all the bits in bitmap */
-+ mb_clear_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0);
++ mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0);
+
+ mb_check_buddy(e3b);
+
+ i = e3b->bd_bd->bb_first_free;
+
+ while (free && ac->ac_status != AC_STATUS_FOUND) {
-+ i = find_next_bit(bitmap, sb->s_blocksize * 8, i);
++ i = ext2_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i);
+ if (i >= sb->s_blocksize * 8) {
+ J_ASSERT(free == 0);
+ break;
+ /*
+ * We aren't lucky definitely
+ */
-+ J_ASSERT(ac.ac_b_ex.fe_len == 0);
+ DQUOT_FREE_BLOCK(inode, *len);
+ *errp = -ENOSPC;
+ block = 0;
+ ext3_error(sb, "ext3_new_block",
+ "Allocating block in system zone - "
+ "block = %u", block);
-+#if AGGRESSIVE_CHECK
-+ for (i = 0; i < ac.ac_b_len; i++)
++#ifdef AGGRESSIVE_CHECK
++ for (i = 0; i < ac.ac_b_ex.fe_len; i++)
+ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data));
+#endif
+ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len);
+
+ e3b->bd_bd->bb_first_free = grp->mgd_first_free;
+ e3b->bd_bd->bb_free = grp->mgd_free;
-+ for (i = 0; i < e3b->bd_blkbits; i++) {
++ for (i = 0; i <= e3b->bd_blkbits + 1; i++) {
+ J_ASSERT(i < 16);
+ e3b->bd_bd->bb_counters[i] = grp->mgd_counters[i];
+ }
+ goto out;
+ grp->mgd_first_free = e3b->bd_bd->bb_first_free;
+ grp->mgd_free = e3b->bd_bd->bb_free;
-+ for (i = 0; i < e3b->bd_blkbits; i++) {
++ for (i = 0; i <= e3b->bd_blkbits + 1; i++) {
+ J_ASSERT(i < 16);
+ grp->mgd_counters[i] = e3b->bd_bd->bb_counters[i];
+ }
+ struct super_block *sb = e3b->bd_sb;
+ struct buffer_head *bh;
+ int i, count = 0;
-+
-+ memset(e3b->bd_bh->b_data, 0, sb->s_blocksize);
-+ memset(e3b->bd_bh2->b_data, 0, sb->s_blocksize);
++
++ mb_debug("generate buddy for group %d\n", e3b->bd_group);
++ memset(e3b->bd_bh->b_data, 0xff, sb->s_blocksize);
++ memset(e3b->bd_bh2->b_data, 0xff, sb->s_blocksize);
+
+ bh = read_block_bitmap(sb, e3b->bd_group);
+ if (bh == NULL)
+ if (err)
+ goto error_return;
+
++#ifdef AGGRESSIVE_CHECK
++ {
++ int i;
++ for (i = 0; i < count; i++)
++ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data));
++ }
++#endif
++ mb_clear_bits(bitmap_bh->b_data, bit, count);
++
++ /* We dirtied the bitmap block */
++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
++ err = ext3_journal_dirty_metadata(handle, bitmap_bh);
++
+ if (metadata) {
+ /* blocks being freed are metadata. these blocks shouldn't
+ * be used until this transaction is committed */
+ mb_free_blocks(&e3b, bit, count);
+ ext3_unlock_group(sb, block_group);
+ }
++
+ spin_lock(sb_bgl_lock(sbi, block_group));
+ gdp->bg_free_blocks_count =
+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
+ spin_unlock(sb_bgl_lock(sbi, block_group));
++ percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+
+ ext3_mb_dirty_buddy(&e3b);
+ ext3_mb_release_desc(&e3b);
+
-+ mb_clear_bits(bitmap_bh->b_data, bit, count);
+ *freed = count;
+
-+ /* We dirtied the bitmap block */
-+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-+ err = ext3_journal_dirty_metadata(handle, bitmap_bh);
-+
+ /* And the group descriptor block */
+ BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+ ret = ext3_journal_dirty_metadata(handle, gd_bh);
+ return ret;
+}
+
-+
+void ext3_free_blocks(handle_t *handle, struct inode * inode,
+ unsigned long block, unsigned long count, int metadata)
+{
+ DQUOT_FREE_BLOCK(inode, freed);
+ return;
+}
-+
Index: linux-stage/fs/ext3/super.c
===================================================================
--- linux-stage.orig/fs/ext3/super.c 2005-02-25 17:27:00.231757312 +0200
int i;
+ ext3_mb_release(sb);
- ext3_ext_release(sb);
+ ext3_ext_release(sb);
ext3_xattr_put_super(sb);
journal_destroy(sbi->s_journal);
@@ -592,7 +593,7 @@
default:
printk (KERN_ERR
"EXT3-fs: Unrecognized mount option \"%s\" "
-@@ -1639,7 +1652,8 @@
+@@ -1639,6 +1652,7 @@
ext3_count_dirs(sb));
- ext3_ext_init(sb);
--
+ ext3_ext_init(sb);
+ ext3_mb_init(sb, needs_recovery);
-+
+
return 0;
- failed_mount3:
Index: linux-stage/fs/ext3/Makefile
===================================================================
--- linux-stage.orig/fs/ext3/Makefile 2005-02-25 17:27:00.228757768 +0200
#define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */
+#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */
-
/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
@@ -725,7 +734,7 @@
extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
#endif
#include <linux/rbtree.h>
-+#define EXT3_BB_MAX_BLOCKS 30
++#define EXT3_BB_MAX_BLOCKS 30
+struct ext3_free_metadata {
+ unsigned short group;
+ unsigned short num;
+ spinlock_t s_md_lock;
+ tid_t s_last_transaction;
+ int s_mb_factor;
-+
++
+ /* stats for buddy allocator */
+ spinlock_t s_bal_lock;
+ unsigned long s_bal_reqs; /* number of reqs with len > 1 */