%patch
Index: linux-2.6.5-sles9/fs/ext3/extents.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2003-01-30 13:24:37.000000000 +0300
-+++ linux-2.6.5-sles9/fs/ext3/extents.c 2004-11-09 02:25:56.143726112 +0300
-@@ -0,0 +1,2313 @@
+--- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2005-02-17 22:07:57.023609040 +0300
++++ linux-2.6.5-sles9/fs/ext3/extents.c 2005-02-23 01:02:37.396435640 +0300
+@@ -0,0 +1,2356 @@
+/*
+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+#include <linux/ext3_extents.h>
+#include <asm/uaccess.h>
+
++/* Validate an extent header: returns 0 if sane, else logs the bad field and returns -EIO. */
++static inline int ext3_ext_check_header(struct ext3_extent_header *eh)
++{
++ if (eh->eh_magic != EXT3_EXT_MAGIC) {
++ printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n",
++ (unsigned) eh->eh_magic);
++ return -EIO;
++ }
++ if (eh->eh_max == 0) {
++ printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n",
++ (unsigned) eh->eh_max);
++ return -EIO;
++ }
++ if (eh->eh_entries > eh->eh_max) {
++ printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n",
++ (unsigned) eh->eh_entries);
++ return -EIO;
++ }
++ return 0;
++}
++
+static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed)
+{
+ int err;
+
+ eh = EXT_ROOT_HDR(tree);
+ EXT_ASSERT(eh);
++ if (ext3_ext_check_header(eh))
++ goto err;
++
+ i = depth = EXT_DEPTH(tree);
+ EXT_ASSERT(eh->eh_max);
+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
-+ EXT_ASSERT(i == 0 || eh->eh_entries > 0);
+
+ /* account possible depth increase */
+ if (!path) {
+ path[ppos].p_ext = NULL;
+
+ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block);
-+ if (!bh) {
-+ ext3_ext_drop_refs(path);
-+ kfree(path);
-+ return ERR_PTR(-EIO);
-+ }
++ if (!bh)
++ goto err;
++
+ eh = EXT_BLOCK_HDR(bh);
+ ppos++;
+ EXT_ASSERT(ppos <= depth);
+ path[ppos].p_bh = bh;
+ path[ppos].p_hdr = eh;
+ i--;
++
++ if (ext3_ext_check_header(eh))
++ goto err;
+ }
+
+ path[ppos].p_depth = i;
+ path[ppos].p_hdr = eh;
+ path[ppos].p_ext = NULL;
++ path[ppos].p_idx = NULL;
++
++ if (ext3_ext_check_header(eh))
++ goto err;
+
+ /* find extent */
+ ext3_ext_binsearch(tree, path + ppos, block);
+ ext3_ext_show_path(tree, path);
+
+ return path;
++
++err:
++ printk(KERN_ERR "EXT3-fs: header is corrupted!\n");
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ return ERR_PTR(-EIO);
+}
+
+/*
+ int depth, len, err, next;
+
+ EXT_ASSERT(newext->ee_len > 0);
-+ EXT_ASSERT(newext->ee_len < EXT_CACHE_MARK);
+ depth = EXT_DEPTH(tree);
+ ex = path[depth].p_ext;
+ EXT_ASSERT(path[depth].p_hdr);
+ unsigned long num, ext_prepare_callback func)
+{
+ struct ext3_ext_path *path = NULL;
-+ struct ext3_extent *ex, cbex;
++ struct ext3_ext_cache cbex;
++ struct ext3_extent *ex;
+ unsigned long next, start = 0, end = 0;
+ unsigned long last = block + num;
+ int depth, exists, err = 0;
+ EXT_ASSERT(end > start);
+
+ if (!exists) {
-+ cbex.ee_block = start;
-+ cbex.ee_len = end - start;
-+ cbex.ee_start = 0;
-+ } else
-+ cbex = *ex;
++ cbex.ec_block = start;
++ cbex.ec_len = end - start;
++ cbex.ec_start = 0;
++ cbex.ec_type = EXT3_EXT_CACHE_GAP;
++ } else {
++ cbex.ec_block = ex->ee_block;
++ cbex.ec_len = ex->ee_len;
++ cbex.ec_start = ex->ee_start;
++ cbex.ec_type = EXT3_EXT_CACHE_EXTENT;
++ }
+
++ EXT_ASSERT(cbex.ec_len > 0);
+ EXT_ASSERT(path[depth].p_hdr);
-+ err = func(tree, path, &cbex, exists);
++ err = func(tree, path, &cbex);
+ ext3_ext_drop_refs(path);
+
+ if (err < 0)
+ path = NULL;
+ }
+
-+ block = cbex.ee_block + cbex.ee_len;
++ block = cbex.ec_block + cbex.ec_len;
+ }
+
+ if (path) {
+ tree->root = (void *) EXT3_I(inode)->i_data;
+ tree->buffer = (void *) inode;
+ tree->buffer_len = sizeof(EXT3_I(inode)->i_data);
-+ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent;
++ tree->cex = &EXT3_I(inode)->i_cached_extent;
+ tree->ops = &ext3_blockmap_helpers;
+}
+
+ int goal, newblock, err = 0, depth;
+ struct ext3_extents_tree tree;
+
-+ clear_buffer_new(bh_result);
++ __clear_bit(BH_New, &bh_result->b_state);
+ ext3_init_tree_desc(&tree, inode);
+ ext_debug(&tree, "block %d requested for inode %u\n",
+ (int) iblock, (unsigned) inode->i_ino);
+
+ /* previous routine could use block we allocated */
+ newblock = newex.ee_start;
-+ set_buffer_new(bh_result);
++ __set_bit(BH_New, &bh_result->b_state);
+
+ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len,
+ newex.ee_start, EXT3_EXT_CACHE_EXTENT);
+out:
+ ext3_ext_show_leaf(&tree, path);
-+ map_bh(bh_result, inode->i_sb, newblock);
++ __set_bit(BH_Mapped, &bh_result->b_state);
++ bh_result->b_bdev = inode->i_sb->s_bdev;
++ bh_result->b_blocknr = newblock;
+out2:
+ if (path) {
+ ext3_ext_drop_refs(path);
+static int
+ext3_ext_store_extent_cb(struct ext3_extents_tree *tree,
+ struct ext3_ext_path *path,
-+ struct ext3_extent *newex, int exist)
++ struct ext3_ext_cache *newex)
+{
+ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private;
+
-+ if (!exist)
++ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT)
+ return EXT_CONTINUE;
++
+ if (buf->err < 0)
+ return EXT_BREAK;
+ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen)
+static int
+ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree,
+ struct ext3_ext_path *path,
-+ struct ext3_extent *ex, int exist)
++ struct ext3_ext_cache *ex)
+{
+ struct ext3_extent_tree_stats *buf =
+ (struct ext3_extent_tree_stats *) tree->private;
+ int depth;
+
-+ if (!exist)
++ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT)
+ return EXT_CONTINUE;
+
+ depth = EXT_DEPTH(tree);
+}
+
+int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
-+ unsigned long arg)
++ unsigned long arg)
+{
+ int err = 0;
+
+
Index: linux-2.6.5-sles9/fs/ext3/ialloc.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/ialloc.c 2004-11-09 02:22:55.763148128 +0300
-+++ linux-2.6.5-sles9/fs/ext3/ialloc.c 2004-11-09 02:23:21.587222272 +0300
+--- linux-2.6.5-sles9.orig/fs/ext3/ialloc.c 2005-02-23 01:01:52.366281264 +0300
++++ linux-2.6.5-sles9/fs/ext3/ialloc.c 2005-02-23 01:02:37.398435336 +0300
@@ -647,6 +647,10 @@
DQUOT_FREE_INODE(inode);
goto fail2;
ext3_std_error(sb, err);
Index: linux-2.6.5-sles9/fs/ext3/inode.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2004-11-09 02:22:55.767147520 +0300
-+++ linux-2.6.5-sles9/fs/ext3/inode.c 2004-11-09 02:23:21.592221512 +0300
+--- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2005-02-23 01:01:52.373280200 +0300
++++ linux-2.6.5-sles9/fs/ext3/inode.c 2005-02-23 01:02:37.404434424 +0300
@@ -796,6 +796,17 @@
goto reread;
}
else
Index: linux-2.6.5-sles9/fs/ext3/Makefile
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2004-11-09 02:18:27.604914376 +0300
-+++ linux-2.6.5-sles9/fs/ext3/Makefile 2004-11-09 02:23:21.593221360 +0300
+--- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2005-02-23 01:01:46.501172896 +0300
++++ linux-2.6.5-sles9/fs/ext3/Makefile 2005-02-23 01:02:37.405434272 +0300
@@ -5,7 +5,7 @@
obj-$(CONFIG_EXT3_FS) += ext3.o
ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
Index: linux-2.6.5-sles9/fs/ext3/super.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/super.c 2004-11-09 02:22:56.450043704 +0300
-+++ linux-2.6.5-sles9/fs/ext3/super.c 2004-11-09 02:23:21.597220752 +0300
+--- linux-2.6.5-sles9.orig/fs/ext3/super.c 2005-02-23 01:02:34.072940888 +0300
++++ linux-2.6.5-sles9/fs/ext3/super.c 2005-02-23 01:47:15.291333736 +0300
@@ -389,6 +389,7 @@
struct ext3_super_block *es = sbi->s_es;
int i;
ext3_xattr_put_super(sb);
journal_destroy(sbi->s_journal);
if (!(sb->s_flags & MS_RDONLY)) {
-@@ -447,6 +448,10 @@
+@@ -447,6 +448,8 @@
#endif
ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
ei->vfs_inode.i_version = 1;
-+ ei->i_cached_extent[0] = 0;
-+ ei->i_cached_extent[1] = 0;
-+ ei->i_cached_extent[2] = 0;
-+ ei->i_cached_extent[3] = 0;
++
++ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent));
return &ei->vfs_inode;
}
-@@ -537,7 +542,7 @@
+@@ -537,7 +540,7 @@
Opt_commit, Opt_journal_update, Opt_journal_inum,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
};
static match_table_t tokens = {
-@@ -582,6 +587,8 @@
+@@ -582,6 +585,8 @@
{Opt_iopen, "iopen"},
{Opt_noiopen, "noiopen"},
{Opt_iopen_nopriv, "iopen_nopriv"},
{Opt_err, NULL}
};
-@@ -797,6 +804,12 @@
+@@ -797,6 +802,12 @@
break;
case Opt_ignore:
break;
default:
printk (KERN_ERR
"EXT3-fs: Unrecognized mount option \"%s\" "
-@@ -1449,6 +1462,8 @@
+@@ -1449,6 +1460,8 @@
percpu_counter_mod(&sbi->s_dirs_counter,
ext3_count_dirs(sb));
failed_mount3:
Index: linux-2.6.5-sles9/fs/ext3/ioctl.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/ioctl.c 2004-11-09 02:15:44.610693264 +0300
-+++ linux-2.6.5-sles9/fs/ext3/ioctl.c 2004-11-09 02:23:52.991448104 +0300
+--- linux-2.6.5-sles9.orig/fs/ext3/ioctl.c 2005-02-23 01:01:42.887722224 +0300
++++ linux-2.6.5-sles9/fs/ext3/ioctl.c 2005-02-23 01:02:37.412433208 +0300
@@ -124,6 +124,10 @@
err = ext3_change_inode_journal_flag(inode, jflag);
return err;
return put_user(inode->i_generation, (int *) arg);
Index: linux-2.6.5-sles9/include/linux/ext3_fs.h
===================================================================
---- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2004-11-09 02:22:58.767691368 +0300
-+++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2004-11-09 02:25:17.238640584 +0300
+--- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2005-02-23 01:02:35.823674736 +0300
++++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2005-02-23 01:02:37.414432904 +0300
@@ -186,6 +186,7 @@
#define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
#define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
Index: linux-2.6.5-sles9/include/linux/ext3_extents.h
===================================================================
---- linux-2.6.5-sles9.orig/include/linux/ext3_extents.h 2003-01-30 13:24:37.000000000 +0300
-+++ linux-2.6.5-sles9/include/linux/ext3_extents.h 2004-11-09 02:23:21.606219384 +0300
-@@ -0,0 +1,252 @@
+--- linux-2.6.5-sles9.orig/include/linux/ext3_extents.h 2005-02-17 22:07:57.023609040 +0300
++++ linux-2.6.5-sles9/include/linux/ext3_extents.h 2005-02-23 01:02:37.416432600 +0300
+@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ */
+typedef int (*ext_prepare_callback)(struct ext3_extents_tree *,
+ struct ext3_ext_path *,
-+ struct ext3_extent *, int);
++ struct ext3_ext_cache *);
+
+#define EXT_CONTINUE 0
+#define EXT_BREAK 1
+
+
+#define EXT_MAX_BLOCK 0xffffffff
-+#define EXT_CACHE_MARK 0xffff
+
+
+#define EXT_FIRST_EXTENT(__hdr__) \
+
+#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
+
++/* BUG() unless path[EXT_DEPTH(tree)] holds plausible pointers; expands to ONE statement. */
++#define EXT_CHECK_PATH(tree,path) \
++do { \
++ int __depth = EXT_DEPTH(tree); /* private name: cannot capture a caller's "depth" */ \
++ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \
++ BUG_ON((unsigned long) (path)[__depth].p_idx < \
++ __PAGE_OFFSET && (path)[__depth].p_idx != NULL); \
++ BUG_ON((unsigned long) (path)[__depth].p_ext < \
++ __PAGE_OFFSET && (path)[__depth].p_ext != NULL); \
++ BUG_ON((unsigned long) (path)[__depth].p_hdr < __PAGE_OFFSET); \
++ BUG_ON((unsigned long) (path)[__depth].p_bh < __PAGE_OFFSET \
++ && __depth != 0); \
++ BUG_ON((path)[0].p_depth != __depth); \
++} while (0)
+
+/*
+ * this structure is used to gather extents from the tree via ioctl
+
Index: linux-2.6.5-sles9/include/linux/ext3_fs_i.h
===================================================================
---- linux-2.6.5-sles9.orig/include/linux/ext3_fs_i.h 2004-11-09 02:22:55.780145544 +0300
-+++ linux-2.6.5-sles9/include/linux/ext3_fs_i.h 2004-11-09 02:23:21.606219384 +0300
-@@ -128,6 +128,8 @@
+--- linux-2.6.5-sles9.orig/include/linux/ext3_fs_i.h 2005-02-23 01:01:52.425272296 +0300
++++ linux-2.6.5-sles9/include/linux/ext3_fs_i.h 2005-02-23 01:45:55.611446920 +0300
+@@ -19,6 +19,7 @@
+ #include <linux/rwsem.h>
+ #include <linux/rbtree.h>
+ #include <linux/seqlock.h>
++#include <linux/ext3_extents.h>
+
+ struct reserve_window {
+ __u32 _rsv_start; /* First byte reserved */
+@@ -128,6 +129,8 @@
*/
struct semaphore truncate_sem;
struct inode vfs_inode;
+
-+ __u32 i_cached_extent[4];
++ struct ext3_ext_cache i_cached_extent;
};
#endif /* _LINUX_EXT3_FS_I */
%diffstat
fs/ext3/Makefile | 2
- fs/ext3/extents.c | 2313 +++++++++++++++++++++++++++++++++++++++++++
+ fs/ext3/extents.c | 2356 +++++++++++++++++++++++++++++++++++++++++++
fs/ext3/ialloc.c | 4
fs/ext3/inode.c | 29
fs/ext3/ioctl.c | 4
- fs/ext3/super.c | 17
- include/linux/ext3_extents.h | 252 ++++
- include/linux/ext3_fs.h | 15
- include/linux/ext3_fs_i.h | 2
- 9 files changed, 2630 insertions(+), 8 deletions(-)
+ fs/ext3/super.c | 15
+ include/linux/ext3_extents.h | 265 ++++
+ include/linux/ext3_fs.h | 17
+ include/linux/ext3_fs_i.h | 3
+ 9 files changed, 2687 insertions(+), 8 deletions(-)
Index: linux-2.6.5-sles9/fs/ext3/mballoc.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/mballoc.c 2003-01-30 13:24:37.000000000 +0300
-+++ linux-2.6.5-sles9/fs/ext3/mballoc.c 2004-11-09 02:34:25.181340632 +0300
-@@ -0,0 +1,1441 @@
+--- linux-2.6.5-sles9.orig/fs/ext3/mballoc.c 2005-02-17 22:07:57.023609040 +0300
++++ linux-2.6.5-sles9/fs/ext3/mballoc.c 2005-02-23 01:56:19.101662000 +0300
+@@ -0,0 +1,1835 @@
+/*
+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+
+/*
+ * TODO:
-+ * - do not scan from the beginning, try to remember first free block
-+ * - mb_mark_used_* may allocate chunk right after splitting buddy
++ * - track min/max extents in each group for better group selection
++ * - is it worthwhile to use buddies directly if req is 2^N blocks?
++ * - mb_mark_used() may allocate chunk right after splitting buddy
+ * - special flag to advice allocator to look for requested + N blocks
+ * this may improve interaction between extents and mballoc
++ * - tree of groups sorted by number of free blocks
++ * - percpu reservation code (hotpath)
++ * - error handling
+ */
+
+/*
+ * with AGRESSIVE_CHECK allocator runs consistency checks over
-+ * structures. this checks slow things down a lot
++ * structures. these checks slow things down a lot
+ */
+#define AGGRESSIVE_CHECK__
+
+/*
++ * with MBALLOC_STATS the allocator collects statistics that are
++ * shown at umount. Collecting them has a cost, though!
++ */
++#define MBALLOC_STATS
++
++/*
+ */
+#define MB_DEBUG__
+#ifdef MB_DEBUG
+#define EXT3_BUDDY_FILE ".buddy"
+
+/*
-+ * max. number of chunks to be tracked in ext3_free_extent struct
++ * How long mballoc can look for a best extent (in found extents)
++ */
++#define EXT3_MB_MAX_TO_SCAN 100
++
++/*
++ * This structure is on-disk description of a group for mballoc
++ */
++struct ext3_mb_group_descr {
++ __u16 mgd_first_free; /* first free block in the group */
++ __u16 mgd_free; /* number of free blocks in the group */
++ __u16 mgd_counters[16]; /* number of free blocks by order */
++};
++
++/*
++ * This structure is header of mballoc's file
+ */
-+#define MB_ARR_SIZE 32
++struct ext3_mb_grp_header {
++ __u32 mh_magic;
++};
++
++#define EXT3_MB_MAGIC_V1 0xbaad16fc
++
++/* A free extent as filled in by mb_find_extent(): fe_len blocks at fe_start in group fe_group. */
++struct ext3_free_extent {
++ __u16 fe_start;
++ __u16 fe_len;
++ __u16 fe_group;
++};
+
+struct ext3_allocation_context {
+ struct super_block *ac_sb;
+
+ /* search goals */
-+ int ac_g_group;
-+ int ac_g_start;
-+ int ac_g_len;
-+ int ac_g_flags;
++struct ext3_free_extent ac_g_ex;
+
+ /* the best found extent */
-+ int ac_b_group;
-+ int ac_b_start;
-+ int ac_b_len;
++ struct ext3_free_extent ac_b_ex;
+
+ /* number of iterations done. we have to track to limit searching */
-+ int ac_repeats;
-+ int ac_groups_scanned;
-+ int ac_status;
++ unsigned long ac_ex_scanned;
++ __u16 ac_groups_scanned;
++ __u16 ac_found;
++ __u8 ac_status;
++ __u8 ac_flags; /* allocation hints */
++ __u8 ac_repeats;
+};
+
+#define AC_STATUS_CONTINUE 1
+#define AC_STATUS_FOUND 2
-+
++#define AC_STATUS_BREAK 3
+
+struct ext3_buddy {
-+ void *bd_bitmap;
-+ void *bd_buddy;
-+ int bd_blkbits;
+ struct buffer_head *bd_bh;
+ struct buffer_head *bd_bh2;
+ struct ext3_buddy_group_blocks *bd_bd;
+ struct super_block *bd_sb;
++ __u16 bd_blkbits;
++ __u16 bd_group;
+};
-+
-+struct ext3_free_extent {
-+ int fe_start;
-+ int fe_len;
-+ unsigned char fe_orders[MB_ARR_SIZE];
-+ unsigned char fe_nums;
-+ unsigned char fe_back;
-+};
++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bh->b_data)
++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_bh2->b_data)
+
+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
+
-+
+int ext3_create (struct inode *, struct dentry *, int, struct nameidata *);
+struct buffer_head * read_block_bitmap(struct super_block *, unsigned int);
-+void ext3_free_blocks_old(handle_t *, struct inode *, unsigned long, unsigned long);
+int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *);
+int ext3_mb_reserve_blocks(struct super_block *, int);
++void ext3_mb_release_blocks(struct super_block *, int);
+void ext3_mb_poll_new_transaction(struct super_block *, handle_t *);
+void ext3_mb_free_committed_blocks(struct super_block *);
+
+static inline void mb_set_bit(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
++ __set_bit(bit, addr);
++}
++
++static inline void mb_set_bit_atomic(int bit, void *addr)
++{
++ mb_correct_addr_and_bit(bit,addr);
+ set_bit(bit, addr);
+}
+
+static inline void mb_clear_bit(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
++ __clear_bit(bit, addr);
++}
++
++static inline void mb_clear_bit_atomic(int bit, void *addr)
++{
++ mb_correct_addr_and_bit(bit,addr);
+ clear_bit(bit, addr);
+}
+
+static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max)
+{
+ int i = 1;
-+ void *bb;
++ char *bb;
+
-+ J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy);
++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
+ J_ASSERT(max != NULL);
+
+ if (order > e3b->bd_blkbits + 1)
+ /* at order 0 we see each particular block */
+ *max = 1 << (e3b->bd_blkbits + 3);
+ if (order == 0)
-+ return e3b->bd_bitmap;
++ return EXT3_MB_BITMAP(e3b);
+
-+ bb = e3b->bd_buddy;
++ bb = EXT3_MB_BUDDY(e3b);
+ *max = *max >> 1;
+ while (i < order) {
+ bb += 1 << (e3b->bd_blkbits - i);
+ i++;
+ *max = *max >> 1;
+ }
++ J_ASSERT((unsigned) (bb - (char *) EXT3_MB_BUDDY(e3b)) <
++ e3b->bd_sb->s_blocksize);
+ return bb;
+}
+
-+static int ext3_mb_load_desc(struct super_block *sb, int group,
++static int ext3_mb_load_buddy(struct super_block *sb, int group,
+ struct ext3_buddy *e3b)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+ /* load bitmap */
+ e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap);
+ if (e3b->bd_bh == NULL) {
-+ ext3_error(sb, "ext3_mb_load_desc",
++ ext3_error(sb, "ext3_mb_load_buddy",
+ "can't get block for buddy bitmap\n");
+ goto out;
+ }
+ /* load buddy */
+ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy);
+ if (e3b->bd_bh2 == NULL) {
-+ ext3_error(sb, "ext3_mb_load_desc",
++ ext3_error(sb, "ext3_mb_load_buddy",
+ "can't get block for buddy bitmap\n");
+ goto out;
+ }
+ }
+ J_ASSERT(buffer_uptodate(e3b->bd_bh2));
+
-+ e3b->bd_bitmap = e3b->bd_bh->b_data;
-+ e3b->bd_buddy = e3b->bd_bh2->b_data;
+ e3b->bd_blkbits = sb->s_blocksize_bits;
+ e3b->bd_bd = sbi->s_buddy_blocks[group];
+ e3b->bd_sb = sb;
++ e3b->bd_group = group;
+
+ return 0;
+out:
+
+ for (j = 0; j < (1 << order); j++) {
+ k = (i * (1 << order)) + j;
-+ J_ASSERT(mb_test_bit(k, e3b->bd_bitmap));
++ J_ASSERT(mb_test_bit(k, EXT3_MB_BITMAP(e3b)));
+ }
+ count++;
+ }
+ int order = 1;
+ void *bb;
+
-+ J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy);
++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
+ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3)));
+
-+ bb = e3b->bd_buddy;
++ bb = EXT3_MB_BUDDY(e3b);
+ while (order <= e3b->bd_blkbits + 1) {
+ block = block >> 1;
+ if (mb_test_bit(block, bb)) {
+ cur += 32;
+ continue;
+ }
-+ mb_clear_bit(cur, bm);
++ mb_clear_bit_atomic(cur, bm);
+ cur++;
+ }
+}
+ cur += 32;
+ continue;
+ }
-+ mb_set_bit(cur, bm);
++ mb_set_bit_atomic(cur, bm);
+ cur++;
+ }
+}
+ void *buddy, *buddy2;
+
+ mb_check_buddy(e3b);
++
++ e3b->bd_bd->bb_free += count;
++ if (first < e3b->bd_bd->bb_first_free)
++ e3b->bd_bd->bb_first_free = first;
++
+ while (count-- > 0) {
+ block = first++;
+ order = 0;
+
-+ J_ASSERT(!mb_test_bit(block, e3b->bd_bitmap));
-+ mb_set_bit(block, e3b->bd_bitmap);
++ J_ASSERT(!mb_test_bit(block, EXT3_MB_BITMAP(e3b)));
++ mb_set_bit(block, EXT3_MB_BITMAP(e3b));
+ e3b->bd_bd->bb_counters[order]++;
+
+ /* start of the buddy */
+ return 0;
+}
+
-+/*
-+ * returns 1 if out extent is enough to fill needed space
-+ */
-+int mb_make_backward_extent(struct ext3_free_extent *in,
-+ struct ext3_free_extent *out, int needed)
++static int mb_find_extent(struct ext3_buddy *e3b, int order, int block,
++ int needed, struct ext3_free_extent *ex)
+{
-+ int i;
-+
-+ J_ASSERT(in);
-+ J_ASSERT(out);
-+ J_ASSERT(in->fe_nums < MB_ARR_SIZE);
-+
-+ out->fe_len = 0;
-+ out->fe_start = in->fe_start + in->fe_len;
-+ out->fe_nums = 0;
-+
-+ /* for single-chunk extent we need not back order
-+ * also, if an extent doesn't fill needed space
-+ * then it makes no sense to try back order becase
-+ * if we select this extent then it'll be use as is */
-+ if (in->fe_nums < 2 || in->fe_len < needed)
-+ return 0;
-+
-+ i = in->fe_nums - 1;
-+ while (i >= 0 && out->fe_len < needed) {
-+ out->fe_len += (1 << in->fe_orders[i]);
-+ out->fe_start -= (1 << in->fe_orders[i]);
-+ i--;
-+ }
-+ /* FIXME: in some situation fe_orders may be too small to hold
-+ * all the buddies */
-+ J_ASSERT(out->fe_len >= needed);
-+
-+ for (i++; i < in->fe_nums; i++)
-+ out->fe_orders[out->fe_nums++] = in->fe_orders[i];
-+ J_ASSERT(out->fe_nums < MB_ARR_SIZE);
-+ out->fe_back = 1;
-+
-+ return 1;
-+}
-+
-+int mb_find_extent(struct ext3_buddy *e3b, int order, int block,
-+ int needed, struct ext3_free_extent *ex)
-+{
-+ int space = needed;
+ int next, max, ord;
+ void *buddy;
+
+ J_ASSERT(ex != NULL);
+
-+ ex->fe_nums = 0;
-+ ex->fe_len = 0;
-+
+ buddy = mb_find_buddy(e3b, order, &max);
+ J_ASSERT(buddy);
+ J_ASSERT(block < max);
-+ if (!mb_test_bit(block, buddy))
-+ goto nofree;
++ if (!mb_test_bit(block, buddy)) {
++ ex->fe_len = 0;
++ ex->fe_start = 0;
++ ex->fe_group = 0;
++ return 0;
++ }
+
+ if (order == 0) {
+ /* find actual order */
+ block = block >> order;
+ }
+
-+ ex->fe_orders[ex->fe_nums++] = order;
+ ex->fe_len = 1 << order;
+ ex->fe_start = block << order;
-+ ex->fe_back = 0;
-+
-+ while ((space = space - (1 << order)) > 0) {
++ ex->fe_group = e3b->bd_group;
+
-+ buddy = mb_find_buddy(e3b, order, &max);
-+ J_ASSERT(buddy);
++ while ((buddy = mb_find_buddy(e3b, order, &max))) {
+
+ if (block + 1 >= max)
+ break;
+
+ next = (block + 1) * (1 << order);
-+ if (!mb_test_bit(next, e3b->bd_bitmap))
++ if (!mb_test_bit(next, EXT3_MB_BITMAP(e3b)))
+ break;
+
+ ord = mb_find_order_for_block(e3b, next);
+
-+ if ((1 << ord) >= needed) {
-+ /* we dont want to coalesce with self-enough buddies */
-+ break;
-+ }
+ order = ord;
+ block = next >> order;
+ ex->fe_len += 1 << order;
-+
-+ if (ex->fe_nums < MB_ARR_SIZE)
-+ ex->fe_orders[ex->fe_nums++] = order;
+ }
+
-+nofree:
+ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3)));
+ return ex->fe_len;
+}
+
-+static int mb_mark_used_backward(struct ext3_buddy *e3b,
-+ struct ext3_free_extent *ex, int len)
++static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex)
+{
-+ int start = ex->fe_start, len0 = len;
++ int start = ex->fe_start;
++ int len = ex->fe_len;
+ int ord, mlen, max, cur;
++ int len0 = len;
+ void *buddy;
+
-+ start = ex->fe_start + ex->fe_len - 1;
++ e3b->bd_bd->bb_free -= len;
++ if (e3b->bd_bd->bb_first_free == start)
++ e3b->bd_bd->bb_first_free += len;
++
+ while (len) {
+ ord = mb_find_order_for_block(e3b, start);
-+ if (((start >> ord) << ord) == (start - (1 << ord) + 1) &&
-+ len >= (1 << ord)) {
++
++ if (((start >> ord) << ord) == start && len >= (1 << ord)) {
+ /* the whole chunk may be allocated at once! */
+ mlen = 1 << ord;
+ buddy = mb_find_buddy(e3b, ord, &max);
+ J_ASSERT((start >> ord) < max);
+ mb_clear_bit(start >> ord, buddy);
+ e3b->bd_bd->bb_counters[ord]--;
-+ start -= mlen;
++ start += mlen;
+ len -= mlen;
+ J_ASSERT(len >= 0);
-+ J_ASSERT(start >= 0);
+ continue;
+ }
+
+ }
+
+ /* now drop all the bits in bitmap */
-+ mb_clear_bits(e3b->bd_bitmap, ex->fe_start + ex->fe_len - len0, len0);
++ mb_clear_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0);
+
+ mb_check_buddy(e3b);
+
+ return 0;
+}
+
-+static int mb_mark_used_forward(struct ext3_buddy *e3b,
-+ struct ext3_free_extent *ex, int len)
++/*
++ * Must be called under group lock!
++ */
++static void ext3_mb_use_best_found(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
+{
-+ int start = ex->fe_start, len0 = len;
-+ int ord, mlen, max, cur;
-+ void *buddy;
++ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
++ mb_mark_used(e3b, &ac->ac_b_ex);
++ ac->ac_status = AC_STATUS_FOUND;
++}
+
-+ while (len) {
-+ ord = mb_find_order_for_block(e3b, start);
++/*
++ * The routine checks whether found extent is good enough. If it is,
++ * then the extent gets marked used and flag is set to the context
++ * to stop scanning. Otherwise, the extent is compared with the
++ * previous found extent and if new one is better, then it's stored
++ * in the context. Later, the best found extent will be used, if
++ * mballoc can't find good enough extent.
++ *
++ * FIXME: real allocation policy is to be designed yet!
++ */
++static void ext3_mb_measure_extent(struct ext3_allocation_context *ac,
++ struct ext3_free_extent *ex,
++ struct ext3_buddy *e3b)
++{
++ int factor = EXT3_SB(ac->ac_sb)->s_mb_factor;
++ struct ext3_free_extent *bex = &ac->ac_b_ex;
++ int diff = ac->ac_g_ex.fe_len - ex->fe_len;
+
-+ if (((start >> ord) << ord) == start && len >= (1 << ord)) {
-+ /* the whole chunk may be allocated at once! */
-+ mlen = 1 << ord;
-+ buddy = mb_find_buddy(e3b, ord, &max);
-+ J_ASSERT((start >> ord) < max);
-+ mb_clear_bit(start >> ord, buddy);
-+ e3b->bd_bd->bb_counters[ord]--;
-+ start += mlen;
-+ len -= mlen;
-+ J_ASSERT(len >= 0);
-+ continue;
-+ }
++ J_ASSERT(ex->fe_len > 0);
++ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8);
++ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8);
+
-+ /* we have to split large buddy */
-+ J_ASSERT(ord > 0);
-+ buddy = mb_find_buddy(e3b, ord, &max);
-+ mb_clear_bit(start >> ord, buddy);
-+ e3b->bd_bd->bb_counters[ord]--;
++ ac->ac_found++;
+
-+ ord--;
-+ cur = (start >> ord) & ~1U;
-+ buddy = mb_find_buddy(e3b, ord, &max);
-+ mb_set_bit(cur, buddy);
-+ mb_set_bit(cur + 1, buddy);
-+ e3b->bd_bd->bb_counters[ord]++;
-+ e3b->bd_bd->bb_counters[ord]++;
++ /*
++ * The special case - take what you catch first
++ */
++ if (ac->ac_flags & EXT3_MB_HINT_FIRST) {
++ *bex = *ex;
++ ext3_mb_use_best_found(ac, e3b);
++ return;
+ }
+
-+ /* now drop all the bits in bitmap */
-+ mb_clear_bits(e3b->bd_bitmap, ex->fe_start, len0);
++ /*
++ * Let's check whether the chunk is good enough
++ */
++ if (ex->fe_len >= ac->ac_g_ex.fe_len) {
++ *bex = *ex;
++ ext3_mb_use_best_found(ac, e3b);
++ return;
++ }
+
-+ mb_check_buddy(e3b);
++ /*
++ * If the request is very large, then it makes sense to use large
++ * chunks for it, even if they don't satisfy the whole request.
++ */
++ if (ex->fe_len > 1000) {
++ *bex = *ex;
++ ext3_mb_use_best_found(ac, e3b);
++ return;
++ }
+
-+ return 0;
++ /*
++ * Sometimes it's worth taking a chunk that is merely close in size
++ */
++ if (factor && (ac->ac_g_ex.fe_len * 100) / (diff * 100) >= factor) {
++ *bex = *ex;
++ ext3_mb_use_best_found(ac, e3b);
++ return;
++ }
++
++ /*
++ * If this is first found extent, just store it in the context
++ */
++ if (bex->fe_len == 0) {
++ *bex = *ex;
++ return;
++ }
++
++ /*
++ * If new found extent is better, store it in the context
++ * FIXME: possible the policy should be more complex?
++ */
++ if (ex->fe_len > bex->fe_len) {
++ *bex = *ex;
++ }
++
++ /*
++ * We don't want to scan for a whole year
++ */
++ if (ac->ac_found > EXT3_MB_MAX_TO_SCAN)
++ ac->ac_status = AC_STATUS_BREAK;
+}
+
-+int inline mb_mark_used(struct ext3_buddy *e3b,
-+ struct ext3_free_extent *ex, int len)
++static int ext3_mb_try_best_found(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
+{
-+ int err;
++ struct ext3_free_extent ex = ac->ac_b_ex;
++ int group = ex.fe_group, max, err;
+
-+ J_ASSERT(ex);
-+ if (ex->fe_back == 0)
-+ err = mb_mark_used_forward(e3b, ex, len);
-+ else
-+ err = mb_mark_used_backward(e3b, ex, len);
-+ return err;
++ J_ASSERT(ex.fe_len > 0);
++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b);
++ if (err)
++ return err;
++
++ ext3_lock_group(ac->ac_sb, group);
++ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex);
++
++ if (max > 0)
++ ext3_mb_use_best_found(ac, e3b);
++
++ ext3_unlock_group(ac->ac_sb, group);
++
++ if (ac->ac_status == AC_STATUS_FOUND)
++ ext3_mb_dirty_buddy(e3b);
++ ext3_mb_release_desc(e3b);
++
++ return 0;
+}
+
-+int ext3_mb_new_in_group(struct ext3_allocation_context *ac,
-+ struct ext3_buddy *e3b, int group)
++static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
+{
-+ struct super_block *sb = ac->ac_sb;
-+ int err, gorder, max, i;
-+ struct ext3_free_extent curex;
-+
-+ /* let's know order of allocation */
-+ gorder = 0;
-+ while (ac->ac_g_len > (1 << gorder))
-+ gorder++;
-+
-+ if ((ac->ac_g_flags & 1) && ac->ac_g_group == group) {
-+ /* someone asks for space at this specified block
-+ * probably he wants to merge it into existing extent */
-+ if (mb_test_bit(ac->ac_g_start, e3b->bd_bitmap)) {
-+ /* good. at least one block is free */
-+ max = mb_find_extent(e3b, 0, ac->ac_g_start,
-+ ac->ac_g_len, &curex);
-+ max = min(curex.fe_len, ac->ac_g_len);
-+ mb_mark_used(e3b, &curex, max);
-+
-+ ac->ac_b_group = group;
-+ ac->ac_b_start = curex.fe_start;
-+ ac->ac_b_len = max;
-+ ac->ac_status = AC_STATUS_FOUND;
-+ err = 0;
-+ goto out;
-+ }
-+ /* don't try to find goal anymore */
-+ ac->ac_g_flags &= ~1;
++ int group = ac->ac_g_ex.fe_group, max, err;
++ struct ext3_free_extent ex;
++
++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b);
++ if (err)
++ return err;
++
++ ext3_lock_group(ac->ac_sb, group);
++ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start,
++ ac->ac_g_ex.fe_len, &ex);
++
++ if (max > 0) {
++ J_ASSERT(ex.fe_len > 0);
++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group);
++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start);
++ ac->ac_b_ex = ex;
++ ext3_mb_use_best_found(ac, e3b);
+ }
++ ext3_unlock_group(ac->ac_sb, group);
+
-+ i = 0;
-+ while (1) {
-+ i = find_next_bit(e3b->bd_bitmap, sb->s_blocksize * 8, i);
-+ if (i >= sb->s_blocksize * 8)
-+ break;
++ if (ac->ac_status == AC_STATUS_FOUND)
++ ext3_mb_dirty_buddy(e3b);
++ ext3_mb_release_desc(e3b);
+
-+ max = mb_find_extent(e3b, 0, i, ac->ac_g_len, &curex);
-+ if (max >= ac->ac_g_len) {
-+ max = min(curex.fe_len, ac->ac_g_len);
-+ mb_mark_used(e3b, &curex, max);
-+
-+ ac->ac_b_group = group;
-+ ac->ac_b_start = curex.fe_start;
-+ ac->ac_b_len = max;
-+ ac->ac_status = AC_STATUS_FOUND;
++ return 0;
++}
++/*
++ * The routine scans the group and measures all found extents.
++ * To bound the scan, it reads the group's free block count
++ * (bb_free) and stops once that many free blocks have been seen.
++ */
++static void ext3_mb_scan_group(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
++{
++ struct super_block *sb = ac->ac_sb;
++ void *bitmap = EXT3_MB_BITMAP(e3b);
++ struct ext3_free_extent ex;
++ int i, free;
++
++ free = e3b->bd_bd->bb_free;
++ J_ASSERT(free > 0);
++
++ i = e3b->bd_bd->bb_first_free;
++
++ while (free && ac->ac_status != AC_STATUS_FOUND) {
++ i = find_next_bit(bitmap, sb->s_blocksize * 8, i);
++ if (i >= sb->s_blocksize * 8) {
++ J_ASSERT(free == 0);
+ break;
+ }
-+ i += max;
-+ }
+
-+ return 0;
++ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex);
++ J_ASSERT(ex.fe_len > 0);
++ J_ASSERT(free >= ex.fe_len);
+
-+out:
-+ return err;
++ ext3_mb_measure_extent(ac, &ex, e3b);
++
++ i += ex.fe_len;
++ free -= ex.fe_len;
++ }
+}
+
-+int mb_good_group(struct ext3_allocation_context *ac, int group, int cr)
++static int ext3_mb_good_group(struct ext3_allocation_context *ac,
++ int group, int cr)
+{
-+ struct ext3_group_desc *gdp;
-+ int free_blocks;
++ int free;
+
-+ gdp = ext3_get_group_desc(ac->ac_sb, group, NULL);
-+ if (!gdp)
-+ return 0;
-+ free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
-+ if (free_blocks == 0)
-+ return 0;
++ J_ASSERT(cr >= 0 && cr < 3);
+
-+ /* someone wants this block very much */
-+ if ((ac->ac_g_flags & 1) && ac->ac_g_group == group)
-+ return 1;
++ free = EXT3_SB(ac->ac_sb)->s_buddy_blocks[group]->bb_free;
++ if (free == 0)
++ return 0;
+
-+ /* FIXME: I'd like to take fragmentation into account here */
+ if (cr == 0) {
-+ if (free_blocks >= ac->ac_g_len >> 1)
++ if (free >= ac->ac_g_ex.fe_len >> 1)
+ return 1;
+ } else if (cr == 1) {
-+ if (free_blocks >= ac->ac_g_len >> 2)
++ if (free >= ac->ac_g_ex.fe_len >> 2)
+ return 1;
+ } else if (cr == 2) {
+ return 1;
-+ } else {
-+ BUG();
+ }
+ return 0;
+}
+ sbi = EXT3_SB(sb);
+ es = EXT3_SB(sb)->s_es;
+
-+ if (!(flags & 2)) {
++ /*
++ * We can't allocate > group size
++ */
++ if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10)
++ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10;
++
++ if (!(flags & EXT3_MB_HINT_RESERVED)) {
+ /* someone asks for non-reserved blocks */
+ BUG_ON(*len > 1);
+ err = ext3_mb_reserve_blocks(sb, 1);
+ EXT3_BLOCKS_PER_GROUP(sb));
+
+ /* set up allocation goals */
-+ ac.ac_b_group = ac.ac_b_start = ac.ac_b_len = 0;
-+ ac.ac_status = 0;
++ ac.ac_b_ex.fe_group = 0;
++ ac.ac_b_ex.fe_start = 0;
++ ac.ac_b_ex.fe_len = 0;
++ ac.ac_status = AC_STATUS_CONTINUE;
+ ac.ac_groups_scanned = 0;
++ ac.ac_ex_scanned = 0;
++ ac.ac_found = 0;
+ ac.ac_sb = inode->i_sb;
-+ ac.ac_g_group = group;
-+ ac.ac_g_start = block;
-+ ac.ac_g_len = *len;
-+ ac.ac_g_flags = flags;
++ ac.ac_g_ex.fe_group = group;
++ ac.ac_g_ex.fe_start = block;
++ ac.ac_g_ex.fe_len = *len;
++ ac.ac_flags = flags;
++
++ /*
++ * Sometimes, caller may want to merge even small number
++ * of blocks to an existing extent
++ */
++ if (ac.ac_flags & EXT3_MB_HINT_MERGE) {
++ err = ext3_mb_find_by_goal(&ac, &e3b);
++ if (err)
++ goto out_err;
++ if (ac.ac_status == AC_STATUS_FOUND)
++ goto found;
++ }
+
-+ /* loop over the groups */
-+ for (cr = 0; cr < 3 && ac.ac_status != AC_STATUS_FOUND; cr++) {
++ /*
++ * FIXME
++ * If requested chunk is power of 2 length, we can try
++ * to exploit buddy nature to speed allocation up
++ */
++
++
++ /*
++ * Let's just scan groups to find more or less suitable blocks
++ */
++ cr = 0;
++repeat:
++ for (; cr < 3 && ac.ac_status == AC_STATUS_CONTINUE; cr++) {
+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) {
+ if (group == EXT3_SB(sb)->s_groups_count)
+ group = 0;
+
+ /* check is group good for our criteries */
-+ if (!mb_good_group(&ac, group, cr))
++ if (!ext3_mb_good_group(&ac, group, cr))
+ continue;
+
-+ err = ext3_mb_load_desc(ac.ac_sb, group, &e3b);
++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
+ if (err)
+ goto out_err;
+
+ ext3_lock_group(sb, group);
-+ if (!mb_good_group(&ac, group, cr)) {
++ if (!ext3_mb_good_group(&ac, group, cr)) {
+ /* someone did allocation from this group */
+ ext3_unlock_group(sb, group);
+ ext3_mb_release_desc(&e3b);
+ continue;
+ }
+
-+ err = ext3_mb_new_in_group(&ac, &e3b, group);
++ ext3_mb_scan_group(&ac, &e3b);
+ ext3_unlock_group(sb, group);
++
+ if (ac.ac_status == AC_STATUS_FOUND)
+ ext3_mb_dirty_buddy(&e3b);
+ ext3_mb_release_desc(&e3b);
++
+ if (err)
+ goto out_err;
-+ if (ac.ac_status == AC_STATUS_FOUND)
++ if (ac.ac_status != AC_STATUS_CONTINUE)
+ break;
+ }
+ }
+
++ if (ac.ac_status == AC_STATUS_BREAK &&
++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
++ /*
++ * We've been searching too long. Let's try to allocate
++ * the best chunk we've found so far
++ */
++ printk(KERN_ERR "EXT3-fs: too long searching (%d/%d)\n",
++ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len);
++ ext3_mb_try_best_found(&ac, &e3b);
++ if (ac.ac_status != AC_STATUS_FOUND) {
++ /*
++ * Someone more lucky has already allocated it.
++ * The only thing we can do is just take first
++ * found block(s)
++ */
++ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n");
++ ac.ac_b_ex.fe_group = 0;
++ ac.ac_b_ex.fe_start = 0;
++ ac.ac_b_ex.fe_len = 0;
++ ac.ac_status = AC_STATUS_CONTINUE;
++ ac.ac_flags |= EXT3_MB_HINT_FIRST;
++ cr = 2;
++ goto repeat;
++ }
++ }
++
+ if (ac.ac_status != AC_STATUS_FOUND) {
-+ /* unfortunately, we can't satisfy this request */
-+ J_ASSERT(ac.ac_b_len == 0);
++ /*
++ * We aren't lucky definitely
++ */
++ J_ASSERT(ac.ac_b_ex.fe_len == 0);
+ DQUOT_FREE_BLOCK(inode, *len);
+ *errp = -ENOSPC;
+ block = 0;
++#if 1
++ printk(KERN_ERR "EXT3-fs: cant allocate: status %d, flags %d\n",
++ ac.ac_status, ac.ac_flags);
++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n",
++ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group,
++ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr);
++ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n",
++ sbi->s_blocks_reserved, ac.ac_found);
++ printk("EXT3-fs: groups: ");
++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
++ printk("%d: %d ", i,
++ sbi->s_buddy_blocks[i]->bb_free);
++ printk("\n");
++#endif
+ goto out;
+ }
+
++found:
++ J_ASSERT(ac.ac_b_ex.fe_len > 0);
++
+ /* good news - free block(s) have been found. now it's time
+ * to mark block(s) in good old journaled bitmap */
-+ block = ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb)
-+ + ac.ac_b_start + le32_to_cpu(es->s_first_data_block);
++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb)
++ + ac.ac_b_ex.fe_start
++ + le32_to_cpu(es->s_first_data_block);
+
+ /* we made a desicion, now mark found blocks in good old
+ * bitmap to be journaled */
+ ext3_debug("using block group %d(%d)\n",
+ ac.ac_b_group.group, gdp->bg_free_blocks_count);
+
-+ bitmap_bh = read_block_bitmap(sb, ac.ac_b_group);
++ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group);
+ if (!bitmap_bh) {
+ *errp = -EIO;
+ goto out_err;
+ goto out_err;
+ }
+
-+ gdp = ext3_get_group_desc(sb, ac.ac_b_group, &gdp_bh);
++ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh);
+ if (!gdp) {
+ *errp = -EIO;
+ goto out_err;
+ if (err)
+ goto out_err;
+
-+ block = ac.ac_b_start + ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb)
-+ + le32_to_cpu(es->s_first_data_block);
++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb)
++ + ac.ac_b_ex.fe_start
++ + le32_to_cpu(es->s_first_data_block);
+
+ if (block == le32_to_cpu(gdp->bg_block_bitmap) ||
+ block == le32_to_cpu(gdp->bg_inode_bitmap) ||
+ ext3_error(sb, "ext3_new_block",
+ "Allocating block in system zone - "
+ "block = %u", block);
-+#if 0
++#if AGGRESSIVE_CHECK
+ for (i = 0; i < ac.ac_b_len; i++)
-+ J_ASSERT(!mb_test_bit(ac.ac_b_start + i, bitmap_bh->b_data));
++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data));
+#endif
-+ mb_set_bits(bitmap_bh->b_data, ac.ac_b_start, ac.ac_b_len);
++ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len);
+
-+ ext3_lock_group(sb, ac.ac_b_group);
++ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group));
+ gdp->bg_free_blocks_count =
-+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) -
-+ ac.ac_b_len);
-+ ext3_unlock_group(sb, ac.ac_b_group);
-+ percpu_counter_mod(&sbi->s_freeblocks_counter, -ac.ac_b_len);
++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)
++ - ac.ac_b_ex.fe_len);
++ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group));
++ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len);
+
+ err = ext3_journal_dirty_metadata(handle, bitmap_bh);
+ if (err)
+ brelse(bitmap_bh);
+
+ /* drop non-allocated, but dquote'd blocks */
-+ J_ASSERT(*len >= ac.ac_b_len);
-+ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_len);
++ J_ASSERT(*len >= ac.ac_b_ex.fe_len);
++ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len);
+
-+ *len = ac.ac_b_len;
++ *len = ac.ac_b_ex.fe_len;
++ J_ASSERT(*len > 0);
+ J_ASSERT(block != 0);
+ goto out;
+
+ *errp = err;
+ block = 0;
+out:
-+ if (!(flags & 2)) {
++ if (!(flags & EXT3_MB_HINT_RESERVED)) {
+ /* block wasn't reserved before and we reserved it
+ * at the beginning of allocation. it doesn't matter
+ * whether we allocated anything or we failed: time
+ * path only, here is single block always */
+ ext3_mb_release_blocks(sb, 1);
+ }
++#ifdef MBALLOC_STATS
++ if (ac.ac_g_ex.fe_len > 1) {
++ spin_lock(&sbi->s_bal_lock);
++ sbi->s_bal_reqs++;
++ sbi->s_bal_allocated += *len;
++ if (*len >= ac.ac_g_ex.fe_len)
++ sbi->s_bal_success++;
++ sbi->s_bal_ex_scanned += ac.ac_found;
++ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start &&
++ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group)
++ sbi->s_bal_goals++;
++ if (ac.ac_found > EXT3_MB_MAX_TO_SCAN)
++ sbi->s_bal_breaks++;
++ spin_unlock(&sbi->s_bal_lock);
++ }
++#endif
+ return block;
+}
+
-+int ext3_mb_generate_buddy(struct super_block *sb, int group)
++int ext3_mb_get_descr_loc(struct ext3_buddy *e3b, struct buffer_head **bh,
++ struct ext3_mb_group_descr **grp)
+{
++ struct super_block *sb = e3b->bd_sb;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ int descr_per_block, err, offset;
++ struct ext3_mb_grp_header *hdr;
++ unsigned long block;
++
++ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header))
++ / sizeof(struct ext3_mb_group_descr);
++ block = e3b->bd_group / descr_per_block;
++ *bh = ext3_bread(NULL, sbi->s_buddy, block, 0, &err);
++ if (*bh == NULL) {
++ printk(KERN_ERR "EXT3-fs: cant getblk descr for group %d: %d\n",
++ e3b->bd_group, err);
++ return err;
++ }
++
++ hdr = (struct ext3_mb_grp_header *) (*bh)->b_data;
++ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) {
++ printk(KERN_ERR "EXT3-fs: invalid magic in group %d!\n",
++ e3b->bd_group);
++ brelse(*bh);
++ *bh = NULL;
++ return -EIO;
++ }
++
++ offset = e3b->bd_group % descr_per_block
++ * sizeof(struct ext3_mb_group_descr)
++ + sizeof(struct ext3_mb_grp_header);
++ *grp = (struct ext3_mb_group_descr *) ((*bh)->b_data + offset);
++
++ return 0;
++}
++
++int ext3_mb_load_descr(struct ext3_buddy *e3b)
++{
++ struct ext3_mb_group_descr *grp;
++ struct ext3_group_desc *gdp;
+ struct buffer_head *bh;
-+ int i, err, count = 0;
-+ struct ext3_buddy e3b;
++ int err, i;
++
++ err = ext3_mb_get_descr_loc(e3b, &bh, &grp);
++ if (err)
++ return err;
+
-+ err = ext3_mb_load_desc(sb, group, &e3b);
++ e3b->bd_bd->bb_first_free = grp->mgd_first_free;
++ e3b->bd_bd->bb_free = grp->mgd_free;
++ for (i = 0; i < e3b->bd_blkbits; i++) {
++ J_ASSERT(i < 16);
++ e3b->bd_bd->bb_counters[i] = grp->mgd_counters[i];
++ }
++ brelse(bh);
++
++ /* additional checks against old group descriptor */
++ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL);
++ if (!gdp)
++ return -EIO;
++ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) {
++ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n",
++ e3b->bd_group, e3b->bd_bd->bb_free,
++ le16_to_cpu(gdp->bg_free_blocks_count));
++ BUG();
++ return -ENODATA;
++ }
++
++ return 0;
++}
++
++
++int ext3_mb_update_descr(struct ext3_buddy *e3b)
++{
++ struct ext3_mb_group_descr *grp;
++ struct ext3_group_desc *ogdp;
++ struct buffer_head *bh;
++ handle_t *handle;
++ int err, i;
++
++ /* additional checks against old group descriptor */
++ ogdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL);
++ if (!ogdp)
++ return -EIO;
++ if (e3b->bd_bd->bb_free != le16_to_cpu(ogdp->bg_free_blocks_count)) {
++ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n",
++ e3b->bd_group, e3b->bd_bd->bb_free,
++ le16_to_cpu(ogdp->bg_free_blocks_count));
++ BUG();
++ return -ENODATA;
++ }
++
++ err = ext3_mb_get_descr_loc(e3b, &bh, &grp);
+ if (err)
++ return err;
++
++ handle = journal_start(EXT3_SB(e3b->bd_sb)->s_journal, 1);
++ if (IS_ERR(handle)) {
++ err = PTR_ERR(handle);
++ handle = NULL;
+ goto out;
-+ memset(e3b.bd_bh->b_data, 0, sb->s_blocksize);
-+ memset(e3b.bd_bh2->b_data, 0, sb->s_blocksize);
++ }
+
-+ bh = read_block_bitmap(sb, group);
-+ if (bh == NULL) {
-+ err = -EIO;
-+ goto out2;
++ err = ext3_journal_get_write_access(handle, bh);
++ if (err)
++ goto out;
++ grp->mgd_first_free = e3b->bd_bd->bb_first_free;
++ grp->mgd_free = e3b->bd_bd->bb_free;
++ for (i = 0; i < e3b->bd_blkbits; i++) {
++ J_ASSERT(i < 16);
++ grp->mgd_counters[i] = e3b->bd_bd->bb_counters[i];
+ }
++ err = ext3_journal_dirty_metadata(handle, bh);
++ if (err)
++ goto out;
++ err = 0;
++out:
++ brelse(bh);
++ if (handle)
++ ext3_journal_stop(handle);
++ return err;
++}
++
++int ext3_mb_generate_buddy(struct ext3_buddy *e3b)
++{
++ struct super_block *sb = e3b->bd_sb;
++ struct buffer_head *bh;
++ int i, count = 0;
++
++ memset(e3b->bd_bh->b_data, 0, sb->s_blocksize);
++ memset(e3b->bd_bh2->b_data, 0, sb->s_blocksize);
++
++ bh = read_block_bitmap(sb, e3b->bd_group);
++ if (bh == NULL)
++ return -EIO;
++
++ /* mb_free_blocks will set real free */
++ e3b->bd_bd->bb_first_free = 1 << 15;
+
+ /* loop over the blocks, and create buddies for free ones */
+ for (i = 0; i < sb->s_blocksize * 8; i++) {
+ if (!mb_test_bit(i, (void *) bh->b_data)) {
-+ mb_free_blocks(&e3b, i, 1);
++ mb_free_blocks(e3b, i, 1);
+ count++;
+ }
+ }
+ brelse(bh);
-+ mb_check_buddy(&e3b);
-+ ext3_mb_dirty_buddy(&e3b);
++ mb_check_buddy(e3b);
++ ext3_mb_dirty_buddy(e3b);
+
-+out2:
-+ ext3_mb_release_desc(&e3b);
-+out:
-+ return err;
++ return 0;
+}
+
+EXPORT_SYMBOL(ext3_mb_new_blocks);
+ (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \
+ 2 * EXT3_SINGLEDATA_TRANS_BLOCKS)
+
-+int ext3_mb_init_backend(struct super_block *sb)
++int ext3_mb_init_backend(struct super_block *sb, int *created)
+{
++ int err, i, len, descr_per_block, buddy_offset, size;
+ struct inode *root = sb->s_root->d_inode;
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct ext3_mb_grp_header *hdr;
++ struct buffer_head *bh = NULL;
++ unsigned long block;
+ struct dentry *db;
++ handle_t *handle;
+ tid_t target;
-+ int err, i;
+
-+ sbi->s_buddy_blocks = kmalloc(sizeof(struct ext3_buddy_group_blocks *) *
-+ sbi->s_groups_count, GFP_KERNEL);
++ *created = 0;
++ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count;
++ sbi->s_buddy_blocks = kmalloc(len, GFP_KERNEL);
+ if (sbi->s_buddy_blocks == NULL) {
-+ printk("EXT3-fs: can't allocate mem for buddy maps\n");
++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n");
+ return -ENOMEM;
+ }
-+ memset(sbi->s_buddy_blocks, 0,
-+ sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count);
++ memset(sbi->s_buddy_blocks, 0, len);
+ sbi->s_buddy = NULL;
+
+ down(&root->i_sem);
-+ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root,
-+ strlen(EXT3_BUDDY_FILE));
++ len = strlen(EXT3_BUDDY_FILE);
++ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, len);
+ if (IS_ERR(db)) {
+ err = PTR_ERR(db);
-+ printk("EXT3-fs: can't lookup buddy file: %d\n", err);
++ printk(KERN_ERR "EXT3-fs: cant lookup buddy: %d\n", err);
++ up(&root->i_sem);
+ goto out;
+ }
+
-+ if (db->d_inode != NULL) {
-+ sbi->s_buddy = igrab(db->d_inode);
-+ goto map;
++ if (db->d_inode == NULL) {
++ err = ext3_create(root, db, S_IFREG, NULL);
++ if (err) {
++ printk(KERN_ERR "EXT3-fs: cant create buddy: %d\n", err);
++ up(&root->i_sem);
++ goto out;
++ }
++ db->d_inode->i_flags |= S_IMMUTABLE | S_NOATIME;
++ *created = 1;
++ printk("EXT3-fs: no buddy file, regenerate\n");
++ }
++ up(&root->i_sem);
++ sbi->s_buddy = igrab(db->d_inode);
++
++ /* calculate needed size */
++ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header))
++ / sizeof(struct ext3_mb_group_descr);
++ buddy_offset = (sbi->s_groups_count + descr_per_block - 1)
++ / descr_per_block;
++ len = sbi->s_groups_count * sb->s_blocksize * 2 +
++ buddy_offset * sb->s_blocksize;
++ if (len != i_size_read(sbi->s_buddy)) {
++ printk("EXT3-fs: wrong i_size (%u != %u), regenerate\n",
++ (unsigned) len, (unsigned) i_size_read(sbi->s_buddy));
++ *created = 1;
+ }
+
-+ err = ext3_create(root, db, S_IFREG, NULL);
-+ if (err) {
-+ printk("error while creation buddy file: %d\n", err);
-+ } else {
-+ sbi->s_buddy = igrab(db->d_inode);
++ /* read/create mb group descriptors */
++ for (i = 0; i < buddy_offset; i++) {
++ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS);
++ if (IS_ERR(handle)) {
++ printk(KERN_ERR "EXT3-fs: cant start transaction\n");
++ err = PTR_ERR(handle);
++ goto err_out;
++ }
++
++ bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err);
++ if (bh == NULL) {
++ printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err);
++ goto err_out;
++ }
++ hdr = (struct ext3_mb_grp_header *) bh->b_data;
++ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) {
++ err = ext3_journal_get_write_access(handle, bh);
++ if (err)
++ goto err_out;
++ *created = 1;
++ printk("EXT3-fs: invalid header 0x%x in %d, regenerate\n", hdr->mh_magic, i);
++ hdr->mh_magic = EXT3_MB_MAGIC_V1;
++ err = ext3_journal_dirty_metadata(handle, bh);
++ if (err)
++ goto err_out;
++ }
++ brelse(bh);
++ ext3_journal_stop(handle);
+ }
+
-+map:
++ len = sizeof(struct ext3_buddy_group_blocks);
++ len += sizeof(unsigned) * (sb->s_blocksize_bits + 2);
+ for (i = 0; i < sbi->s_groups_count; i++) {
-+ struct buffer_head *bh = NULL;
-+ handle_t *handle;
+
-+ sbi->s_buddy_blocks[i] =
-+ kmalloc(sizeof(struct ext3_buddy_group_blocks),
-+ GFP_KERNEL);
++ sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL);
+ if (sbi->s_buddy_blocks[i] == NULL) {
-+ printk("EXT3-fs: can't allocate mem for buddy\n");
++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n");
+ err = -ENOMEM;
+ goto out2;
+ }
++ memset(sbi->s_buddy_blocks[i], 0, len);
+
+ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS);
+ if (IS_ERR(handle)) {
++ printk(KERN_ERR "EXT3-fs: cant start transaction\n");
+ err = PTR_ERR(handle);
+ goto out2;
+ }
+
+ /* allocate block for bitmap */
-+ bh = ext3_getblk(handle, sbi->s_buddy, i * 2, 1, &err);
++ block = buddy_offset + i * 2;
++ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err);
+ if (bh == NULL) {
-+ printk("can't get block for buddy bitmap: %d\n", err);
++ printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", err);
+ goto out2;
+ }
+ sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr;
+ brelse(bh);
+
+ /* allocate block for buddy */
-+ bh = ext3_getblk(handle, sbi->s_buddy, i * 2 + 1, 1, &err);
++ block = buddy_offset + i * 2 + 1;
++ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err);
+ if (bh == NULL) {
-+ printk("can't get block for buddy: %d\n", err);
++ printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err);
+ goto out2;
+ }
+ sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr;
+ brelse(bh);
++
++ size = (block + 1) << sbi->s_buddy->i_blkbits;
++ if (size > sbi->s_buddy->i_size) {
++ *created = 1;
++ EXT3_I(sbi->s_buddy)->i_disksize = size;
++ i_size_write(sbi->s_buddy, size);
++ mark_inode_dirty(sbi->s_buddy);
++ }
+ ext3_journal_stop(handle);
++
+ spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock);
+ sbi->s_buddy_blocks[i]->bb_md_cur = NULL;
+ sbi->s_buddy_blocks[i]->bb_tid = 0;
+out2:
+ dput(db);
+out:
-+ up(&root->i_sem);
+ return err;
++
++err_out:
++ return err;
++}
++
++int ext3_mb_write_descriptors(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct ext3_buddy e3b;
++ int ret = 0, i, err;
++
++ for (i = 0; i < sbi->s_groups_count; i++) {
++ if (sbi->s_buddy_blocks[i] == NULL)
++ continue;
++
++ err = ext3_mb_load_buddy(sb, i, &e3b);
++ if (err == 0) {
++ ext3_mb_update_descr(&e3b);
++ ext3_mb_release_desc(&e3b);
++ } else
++ ret = err;
++ }
++ return ret;
+}
+
+int ext3_mb_release(struct super_block *sb)
+ ext3_mb_free_committed_blocks(sb);
+
+ if (sbi->s_buddy_blocks) {
-+ for (i = 0; i < sbi->s_groups_count; i++)
-+ if (sbi->s_buddy_blocks[i])
-+ kfree(sbi->s_buddy_blocks[i]);
++ ext3_mb_write_descriptors(sb);
++ for (i = 0; i < sbi->s_groups_count; i++) {
++ if (sbi->s_buddy_blocks[i] == NULL)
++ continue;
++ kfree(sbi->s_buddy_blocks[i]);
++ }
+ kfree(sbi->s_buddy_blocks);
+ }
+ if (sbi->s_buddy)
+ if (sbi->s_blocks_reserved)
+ printk("ext3-fs: %ld blocks being reserved at umount!\n",
+ sbi->s_blocks_reserved);
++#ifdef MBALLOC_STATS
++ printk("EXT3-fs: mballoc: %lu blocks %lu reqs (%lu success)\n",
++ sbi->s_bal_allocated, sbi->s_bal_reqs, sbi->s_bal_success);
++ printk("EXT3-fs: mballoc: %lu extents scanned, %lu goal hits, %lu breaks\n",
++ sbi->s_bal_ex_scanned, sbi->s_bal_goals, sbi->s_bal_breaks);
++#endif
+ return 0;
+}
+
-+int ext3_mb_init(struct super_block *sb)
++int ext3_mb_init(struct super_block *sb, int needs_recovery)
+{
-+ struct ext3_super_block *es;
-+ int i;
++ struct ext3_buddy e3b;
++ int i, err, created;
+
+ if (!test_opt(sb, MBALLOC))
+ return 0;
+
+ /* init file for buddy data */
+ clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
-+ if (ext3_mb_init_backend(sb))
-+ return 0;
++ if ((err = ext3_mb_init_backend(sb, &created)))
++ return err;
+
-+ es = EXT3_SB(sb)->s_es;
-+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
-+ ext3_mb_generate_buddy(sb, i);
++repeat:
++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
++ err = ext3_mb_load_buddy(sb, i, &e3b);
++ if (err) {
++ /* FIXME: release backend */
++ return err;
++ }
++ if (created || needs_recovery)
++ ext3_mb_generate_buddy(&e3b);
++ else
++ err = ext3_mb_load_descr(&e3b);
++ ext3_mb_release_desc(&e3b);
++ if (err == -ENODATA) {
++ created = 1;
++ goto repeat;
++ }
++ }
++ if (created || needs_recovery)
++ printk(KERN_ERR "EXT3-fs: generated buddies for %lu groups\n",
++ EXT3_SB(sb)->s_groups_count);
+ spin_lock_init(&EXT3_SB(sb)->s_reserve_lock);
+ spin_lock_init(&EXT3_SB(sb)->s_md_lock);
+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction);
+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction);
+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction);
+ set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
-+ printk("EXT3-fs: mballoc enabled\n");
++
++#ifdef MBALLOC_STATS
++ spin_lock_init(&EXT3_SB(sb)->s_bal_lock);
++#define MBALLOC_INFO " (stats)"
++#else
++#define MBALLOC_INFO ""
++#endif
++ printk("EXT3-fs: mballoc enabled%s\n", MBALLOC_INFO);
+ return 0;
+}
+
+ mb_debug("gonna free %u blocks in group %u (0x%p):",
+ md->num, md->group, md);
+
-+ err = ext3_mb_load_desc(sb, md->group, &e3b);
++ err = ext3_mb_load_buddy(sb, md->group, &e3b);
+ BUG_ON(err != 0);
+
+ /* there are blocks to put in buddy to make them really free */
+}
+
+void ext3_mb_free_blocks(handle_t *handle, struct inode *inode,
-+ unsigned long block, unsigned long count, int metadata)
++ unsigned long block, unsigned long count,
++ int metadata, int *freed)
+{
+ struct buffer_head *bitmap_bh = NULL;
+ struct ext3_group_desc *gdp;
+ struct ext3_buddy e3b;
+ int err = 0, ret;
+
++ *freed = 0;
+ sb = inode->i_sb;
+ if (!sb) {
+ printk ("ext3_free_blocks: nonexistent device");
+ if (err)
+ goto error_return;
+
-+ err = ext3_mb_load_desc(sb, block_group, &e3b);
++ err = ext3_mb_load_buddy(sb, block_group, &e3b);
+ if (err)
+ goto error_return;
+
+ } else {
+ ext3_lock_group(sb, block_group);
+ mb_free_blocks(&e3b, bit, count);
-+ gdp->bg_free_blocks_count =
-+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
+ ext3_unlock_group(sb, block_group);
-+ percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+ }
++ spin_lock(sb_bgl_lock(sbi, block_group));
++ gdp->bg_free_blocks_count =
++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
++ spin_unlock(sb_bgl_lock(sbi, block_group));
+
+ ext3_mb_dirty_buddy(&e3b);
+ ext3_mb_release_desc(&e3b);
+
-+ /* FIXME: undo logic will be implemented later and another way */
+ mb_clear_bits(bitmap_bh->b_data, bit, count);
-+ DQUOT_FREE_BLOCK(inode, count);
++ *freed = count;
+
+ /* We dirtied the bitmap block */
+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+}
+
+int ext3_new_block(handle_t *handle, struct inode *inode,
-+ unsigned long goal, int *errp)
++ unsigned long goal, int *errp)
+{
+ int ret, len;
+
+}
+
+
++extern void ext3_free_blocks_old(handle_t *, struct inode *,
++ unsigned long, unsigned long);
+void ext3_free_blocks(handle_t *handle, struct inode * inode,
+ unsigned long block, unsigned long count, int metadata)
+{
++ int freed;
++
+ if (!test_opt(inode->i_sb, MBALLOC))
+ ext3_free_blocks_old(handle, inode, block, count);
-+ else
-+ ext3_mb_free_blocks(handle, inode, block, count, metadata);
++ else {
++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
++ if (freed)
++ DQUOT_FREE_BLOCK(inode, freed);
++ }
+ return;
+}
++
Index: linux-2.6.5-sles9/fs/ext3/super.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/super.c 2004-11-09 02:23:21.597220752 +0300
-+++ linux-2.6.5-sles9/fs/ext3/super.c 2004-11-09 02:26:12.572228600 +0300
+--- linux-2.6.5-sles9.orig/fs/ext3/super.c 2005-02-23 01:47:15.291333736 +0300
++++ linux-2.6.5-sles9/fs/ext3/super.c 2005-02-23 01:48:54.515249408 +0300
@@ -389,6 +389,7 @@
struct ext3_super_block *es = sbi->s_es;
int i;
ext3_ext_release(sb);
ext3_xattr_put_super(sb);
journal_destroy(sbi->s_journal);
-@@ -542,7 +543,7 @@
+@@ -540,6 +541,7 @@
Opt_commit, Opt_journal_update, Opt_journal_inum,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
-- Opt_err, Opt_extents, Opt_extdebug
-+ Opt_err, Opt_extents, Opt_extdebug, Opt_mballoc,
++ Opt_mballoc, Opt_mbfactor,
+ Opt_err, Opt_extents, Opt_extdebug
};
- static match_table_t tokens = {
-@@ -589,6 +590,7 @@
+@@ -587,6 +589,8 @@
{Opt_iopen_nopriv, "iopen_nopriv"},
{Opt_extents, "extents"},
{Opt_extdebug, "extdebug"},
+ {Opt_mballoc, "mballoc"},
++ {Opt_mbfactor, "mbfactor=%u"},
{Opt_err, NULL}
};
-@@ -810,6 +812,9 @@
+@@ -808,6 +812,16 @@
case Opt_extdebug:
set_opt (sbi->s_mount_opt, EXTDEBUG);
break;
+ case Opt_mballoc:
+ set_opt (sbi->s_mount_opt, MBALLOC);
+ break;
++ case Opt_mbfactor:
++ if (match_int(&args[0], &option))
++ return 0;
++ if (option < 0)
++ return 0;
++ sbi->s_mb_factor = option;
++ break;
default:
printk (KERN_ERR
"EXT3-fs: Unrecognized mount option \"%s\" "
-@@ -1463,7 +1468,8 @@
+@@ -1461,7 +1475,8 @@
ext3_count_dirs(sb));
ext3_ext_init(sb);
-
-+ ext3_mb_init(sb);
++ ext3_mb_init(sb, needs_recovery);
+
return 0;
failed_mount3:
Index: linux-2.6.5-sles9/fs/ext3/Makefile
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2004-11-09 02:23:21.593221360 +0300
-+++ linux-2.6.5-sles9/fs/ext3/Makefile 2004-11-09 02:26:12.572228600 +0300
+--- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2005-02-23 01:02:37.405434272 +0300
++++ linux-2.6.5-sles9/fs/ext3/Makefile 2005-02-23 01:48:54.517249104 +0300
@@ -5,7 +5,7 @@
obj-$(CONFIG_EXT3_FS) += ext3.o
Index: linux-2.6.5-sles9/fs/ext3/balloc.c
===================================================================
--- linux-2.6.5-sles9.orig/fs/ext3/balloc.c 2004-11-03 08:36:51.000000000 +0300
-+++ linux-2.6.5-sles9/fs/ext3/balloc.c 2004-11-09 02:26:53.078070776 +0300
++++ linux-2.6.5-sles9/fs/ext3/balloc.c 2005-02-23 01:48:54.520248648 +0300
@@ -78,7 +78,7 @@
*
* Return buffer_head on success or NULL in case of failure.
struct buffer_head *bitmap_bh = NULL;
Index: linux-2.6.5-sles9/fs/ext3/namei.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/namei.c 2004-11-09 02:18:27.616912552 +0300
-+++ linux-2.6.5-sles9/fs/ext3/namei.c 2004-11-09 02:26:12.580227384 +0300
+--- linux-2.6.5-sles9.orig/fs/ext3/namei.c 2005-02-23 01:01:46.551165296 +0300
++++ linux-2.6.5-sles9/fs/ext3/namei.c 2005-02-23 01:48:54.523248192 +0300
@@ -1640,7 +1640,7 @@
* If the create succeeds, we fill in the inode information
* with d_instantiate().
handle_t *handle;
Index: linux-2.6.5-sles9/fs/ext3/inode.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2004-11-09 02:23:21.592221512 +0300
-+++ linux-2.6.5-sles9/fs/ext3/inode.c 2004-11-09 02:26:12.587226320 +0300
+--- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2005-02-23 01:02:37.404434424 +0300
++++ linux-2.6.5-sles9/fs/ext3/inode.c 2005-02-23 01:48:54.529247280 +0300
@@ -572,7 +572,7 @@
ext3_journal_forget(handle, branch[i].bh);
}
/*
Index: linux-2.6.5-sles9/fs/ext3/extents.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2004-11-09 02:25:56.143726112 +0300
-+++ linux-2.6.5-sles9/fs/ext3/extents.c 2004-11-09 02:26:12.591225712 +0300
-@@ -740,7 +740,7 @@
+--- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2005-02-23 01:02:37.396435640 +0300
++++ linux-2.6.5-sles9/fs/ext3/extents.c 2005-02-23 01:48:54.533246672 +0300
+@@ -774,7 +774,7 @@
for (i = 0; i < depth; i++) {
if (!ablocks[i])
continue;
}
}
kfree(ablocks);
-@@ -1391,7 +1391,7 @@
+@@ -1431,7 +1431,7 @@
path->p_idx->ei_leaf);
bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
return err;
}
-@@ -1879,10 +1879,12 @@
+@@ -1919,10 +1919,12 @@
int needed = ext3_remove_blocks_credits(tree, ex, from, to);
handle_t *handle = ext3_journal_start(tree->inode, needed);
struct buffer_head *bh;
if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
/* tail removal */
unsigned long num, start;
-@@ -1894,7 +1896,7 @@
+@@ -1934,7 +1936,7 @@
bh = sb_find_get_block(tree->inode->i_sb, start + i);
ext3_forget(handle, 0, tree->inode, bh, start + i);
}
from, to, ex->ee_block, ex->ee_len);
Index: linux-2.6.5-sles9/fs/ext3/xattr.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/xattr.c 2004-11-09 02:22:55.777146000 +0300
-+++ linux-2.6.5-sles9/fs/ext3/xattr.c 2004-11-09 02:26:12.593225408 +0300
+--- linux-2.6.5-sles9.orig/fs/ext3/xattr.c 2005-02-23 01:01:52.387278072 +0300
++++ linux-2.6.5-sles9/fs/ext3/xattr.c 2005-02-23 01:48:54.537246064 +0300
@@ -1366,7 +1366,7 @@
new_bh = sb_getblk(sb, block);
if (!new_bh) {
} else {
Index: linux-2.6.5-sles9/include/linux/ext3_fs.h
===================================================================
---- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2004-11-09 02:25:17.238640584 +0300
-+++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2004-11-09 02:26:12.596224952 +0300
-@@ -57,6 +57,8 @@
+--- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2005-02-23 01:02:37.414432904 +0300
++++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2005-02-23 01:48:54.539245760 +0300
+@@ -57,6 +57,14 @@
#define ext3_debug(f, a...) do {} while (0)
#endif
+#define EXT3_MULTIBLOCK_ALLOCATOR 1
+
++#define EXT3_MB_HINT_MERGE 1
++#define EXT3_MB_HINT_RESERVED 2
++#define EXT3_MB_HINT_METADATA 4
++#define EXT3_MB_HINT_FIRST 8
++#define EXT3_MB_HINT_BEST 16
++
/*
* Special inodes numbers
*/
-@@ -339,6 +341,7 @@
+@@ -339,6 +347,7 @@
#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */
#define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */
#define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */
-+#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */
++#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */
/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
#ifndef clear_opt
-@@ -698,7 +701,7 @@
+@@ -698,7 +707,7 @@
extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
extern unsigned long ext3_count_free_blocks (struct super_block *);
extern void ext3_check_blocks_bitmap (struct super_block *);
extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
-@@ -743,6 +746,13 @@
- extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
- unsigned long);
+@@ -820,6 +829,37 @@
+ extern int ext3_ext_ioctl(struct inode *inode, struct file *filp,
+ unsigned int cmd, unsigned long arg);
+/* mballoc.c */
-+extern int ext3_mb_init(struct super_block *sb);
-+extern int ext3_mb_new_blocks(handle_t *handle, struct inode *inode,
-+ unsigned long goal,int *len, int flags,int *errp);
-+extern int ext3_mb_release(struct super_block *sb);
++extern int ext3_mb_init(struct super_block *, int);
++extern int ext3_mb_release(struct super_block *);
++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *);
++extern int ext3_mb_reserve_blocks(struct super_block *, int);
+extern void ext3_mb_release_blocks(struct super_block *, int);
+
- /* namei.c */
- extern int ext3_orphan_add(handle_t *, struct inode *);
- extern int ext3_orphan_del(handle_t *, struct inode *);
++/* writeback.c */
++extern int ext3_wb_writepages(struct address_space *, struct writeback_control *);
++extern int ext3_wb_prepare_write(struct file *file, struct page *page,
++ unsigned from, unsigned to);
++extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned);
++extern int ext3_wb_writepage(struct page *, struct writeback_control *);
++extern int ext3_wb_invalidatepage(struct page *, unsigned long);
++extern int ext3_wb_releasepage(struct page *, int);
++extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
++extern void ext3_wb_init(struct super_block *);
++extern void ext3_wb_release(struct super_block *);
++
++/* writeback.c */
++extern int ext3_wb_writepages(struct address_space *, struct writeback_control *);
++extern int ext3_wb_prepare_write(struct file *file, struct page *page,
++ unsigned from, unsigned to);
++extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned);
++extern int ext3_wb_writepage(struct page *, struct writeback_control *);
++extern int ext3_wb_invalidatepage(struct page *, unsigned long);
++extern int ext3_wb_releasepage(struct page *, int);
++extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
++extern void ext3_wb_init(struct super_block *);
++extern void ext3_wb_release(struct super_block *);
++
+ #endif /* __KERNEL__ */
+
+ #define EXT3_IOC_CREATE_INUM _IOW('f', 5, long)
Index: linux-2.6.5-sles9/include/linux/ext3_fs_sb.h
===================================================================
---- linux-2.6.5-sles9.orig/include/linux/ext3_fs_sb.h 2004-11-09 02:20:51.598024096 +0300
-+++ linux-2.6.5-sles9/include/linux/ext3_fs_sb.h 2004-11-09 02:28:18.753046200 +0300
+--- linux-2.6.5-sles9.orig/include/linux/ext3_fs_sb.h 2005-02-23 01:01:48.242908112 +0300
++++ linux-2.6.5-sles9/include/linux/ext3_fs_sb.h 2005-02-23 01:48:54.541245456 +0300
@@ -23,10 +23,30 @@
#define EXT_INCLUDE
#include <linux/blockgroup_lock.h>
+ struct list_head list;
+};
+
-+#define EXT3_BB_MAX_ORDER 14
-+
+struct ext3_buddy_group_blocks {
-+ sector_t bb_bitmap;
-+ sector_t bb_buddy;
++ __u32 bb_bitmap;
++ __u32 bb_buddy;
+ spinlock_t bb_lock;
-+ unsigned bb_counters[EXT3_BB_MAX_ORDER];
++ unsigned long bb_tid;
+ struct ext3_free_metadata *bb_md_cur;
-+ unsigned long bb_tid;
++ unsigned short bb_first_free;
++ unsigned short bb_free;
++ unsigned bb_counters[];
+};
+
/*
* third extended-fs super-block data in memory
*/
-@@ -78,6 +98,17 @@
+@@ -78,6 +98,27 @@
struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
#endif
+ struct list_head s_committed_transaction;
+ spinlock_t s_md_lock;
+ tid_t s_last_transaction;
++ int s_mb_factor;
++
++ /* stats for buddy allocator */
++ spinlock_t s_bal_lock;
++ unsigned long s_bal_reqs; /* number of reqs with len > 1 */
++ unsigned long s_bal_success; /* we found long enough chunks */
++ unsigned long s_bal_allocated; /* in blocks */
++ unsigned long s_bal_ex_scanned; /* total extents scanned */
++ unsigned long s_bal_goals; /* goal hits */
++ unsigned long s_bal_breaks; /* too long searches */
};
#endif /* _LINUX_EXT3_FS_SB */
- hold NS lock when calling handle_ast_error->del_waiting_lock (5746)
- fix setattr mtime regression from lovcleanup merge (4829, 5669)
- workaround for 2.6 crash in ll_unhash_aliases (5687, 5210)
+ - small ext3 extents cleanups and fixes (5733)
+ - improved mballoc code, several small races and bugs fixed (5733, 5638)
* miscellania
- service request history (4965)
- put {ll,lov,osc}_async_page structs in a single slab (4699)
%patch
Index: linux-2.6.5-sles9/fs/ext3/extents.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2003-01-30 13:24:37.000000000 +0300
-+++ linux-2.6.5-sles9/fs/ext3/extents.c 2004-11-09 02:25:56.143726112 +0300
-@@ -0,0 +1,2313 @@
+--- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2005-02-17 22:07:57.023609040 +0300
++++ linux-2.6.5-sles9/fs/ext3/extents.c 2005-02-23 01:02:37.396435640 +0300
+@@ -0,0 +1,2356 @@
+/*
+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+#include <linux/ext3_extents.h>
+#include <asm/uaccess.h>
+
++
++static inline int ext3_ext_check_header(struct ext3_extent_header *eh)
++{
++ if (eh->eh_magic != EXT3_EXT_MAGIC) {
++ printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n",
++ (unsigned) eh->eh_magic);
++ return -EIO;
++ }
++ if (eh->eh_max == 0) {
++ printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n",
++ (unsigned) eh->eh_max);
++ return -EIO;
++ }
++ if (eh->eh_entries > eh->eh_max) {
++ printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n",
++ (unsigned) eh->eh_entries);
++ return -EIO;
++ }
++ return 0;
++}
++
+static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed)
+{
+ int err;
+
+ eh = EXT_ROOT_HDR(tree);
+ EXT_ASSERT(eh);
++ if (ext3_ext_check_header(eh))
++ goto err;
++
+ i = depth = EXT_DEPTH(tree);
+ EXT_ASSERT(eh->eh_max);
+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
-+ EXT_ASSERT(i == 0 || eh->eh_entries > 0);
+
+ /* account possible depth increase */
+ if (!path) {
+ path[ppos].p_ext = NULL;
+
+ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block);
-+ if (!bh) {
-+ ext3_ext_drop_refs(path);
-+ kfree(path);
-+ return ERR_PTR(-EIO);
-+ }
++ if (!bh)
++ goto err;
++
+ eh = EXT_BLOCK_HDR(bh);
+ ppos++;
+ EXT_ASSERT(ppos <= depth);
+ path[ppos].p_bh = bh;
+ path[ppos].p_hdr = eh;
+ i--;
++
++ if (ext3_ext_check_header(eh))
++ goto err;
+ }
+
+ path[ppos].p_depth = i;
+ path[ppos].p_hdr = eh;
+ path[ppos].p_ext = NULL;
++ path[ppos].p_idx = NULL;
++
++ if (ext3_ext_check_header(eh))
++ goto err;
+
+ /* find extent */
+ ext3_ext_binsearch(tree, path + ppos, block);
+ ext3_ext_show_path(tree, path);
+
+ return path;
++
++err:
++ printk(KERN_ERR "EXT3-fs: header is corrupted!\n");
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ return ERR_PTR(-EIO);
+}
+
+/*
+ int depth, len, err, next;
+
+ EXT_ASSERT(newext->ee_len > 0);
-+ EXT_ASSERT(newext->ee_len < EXT_CACHE_MARK);
+ depth = EXT_DEPTH(tree);
+ ex = path[depth].p_ext;
+ EXT_ASSERT(path[depth].p_hdr);
+ unsigned long num, ext_prepare_callback func)
+{
+ struct ext3_ext_path *path = NULL;
-+ struct ext3_extent *ex, cbex;
++ struct ext3_ext_cache cbex;
++ struct ext3_extent *ex;
+ unsigned long next, start = 0, end = 0;
+ unsigned long last = block + num;
+ int depth, exists, err = 0;
+ EXT_ASSERT(end > start);
+
+ if (!exists) {
-+ cbex.ee_block = start;
-+ cbex.ee_len = end - start;
-+ cbex.ee_start = 0;
-+ } else
-+ cbex = *ex;
++ cbex.ec_block = start;
++ cbex.ec_len = end - start;
++ cbex.ec_start = 0;
++ cbex.ec_type = EXT3_EXT_CACHE_GAP;
++ } else {
++ cbex.ec_block = ex->ee_block;
++ cbex.ec_len = ex->ee_len;
++ cbex.ec_start = ex->ee_start;
++ cbex.ec_type = EXT3_EXT_CACHE_EXTENT;
++ }
+
++ EXT_ASSERT(cbex.ec_len > 0);
+ EXT_ASSERT(path[depth].p_hdr);
-+ err = func(tree, path, &cbex, exists);
++ err = func(tree, path, &cbex);
+ ext3_ext_drop_refs(path);
+
+ if (err < 0)
+ path = NULL;
+ }
+
-+ block = cbex.ee_block + cbex.ee_len;
++ block = cbex.ec_block + cbex.ec_len;
+ }
+
+ if (path) {
+ tree->root = (void *) EXT3_I(inode)->i_data;
+ tree->buffer = (void *) inode;
+ tree->buffer_len = sizeof(EXT3_I(inode)->i_data);
-+ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent;
++ tree->cex = &EXT3_I(inode)->i_cached_extent;
+ tree->ops = &ext3_blockmap_helpers;
+}
+
+ int goal, newblock, err = 0, depth;
+ struct ext3_extents_tree tree;
+
-+ clear_buffer_new(bh_result);
++ __clear_bit(BH_New, &bh_result->b_state);
+ ext3_init_tree_desc(&tree, inode);
+ ext_debug(&tree, "block %d requested for inode %u\n",
+ (int) iblock, (unsigned) inode->i_ino);
+
+ /* previous routine could use block we allocated */
+ newblock = newex.ee_start;
-+ set_buffer_new(bh_result);
++ __set_bit(BH_New, &bh_result->b_state);
+
+ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len,
+ newex.ee_start, EXT3_EXT_CACHE_EXTENT);
+out:
+ ext3_ext_show_leaf(&tree, path);
-+ map_bh(bh_result, inode->i_sb, newblock);
++ __set_bit(BH_Mapped, &bh_result->b_state);
++ bh_result->b_bdev = inode->i_sb->s_bdev;
++ bh_result->b_blocknr = newblock;
+out2:
+ if (path) {
+ ext3_ext_drop_refs(path);
+static int
+ext3_ext_store_extent_cb(struct ext3_extents_tree *tree,
+ struct ext3_ext_path *path,
-+ struct ext3_extent *newex, int exist)
++ struct ext3_ext_cache *newex)
+{
+ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private;
+
-+ if (!exist)
++ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT)
+ return EXT_CONTINUE;
++
+ if (buf->err < 0)
+ return EXT_BREAK;
+ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen)
+static int
+ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree,
+ struct ext3_ext_path *path,
-+ struct ext3_extent *ex, int exist)
++ struct ext3_ext_cache *ex)
+{
+ struct ext3_extent_tree_stats *buf =
+ (struct ext3_extent_tree_stats *) tree->private;
+ int depth;
+
-+ if (!exist)
++ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT)
+ return EXT_CONTINUE;
+
+ depth = EXT_DEPTH(tree);
+}
+
+int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
-+ unsigned long arg)
++ unsigned long arg)
+{
+ int err = 0;
+
+
Index: linux-2.6.5-sles9/fs/ext3/ialloc.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/ialloc.c 2004-11-09 02:22:55.763148128 +0300
-+++ linux-2.6.5-sles9/fs/ext3/ialloc.c 2004-11-09 02:23:21.587222272 +0300
+--- linux-2.6.5-sles9.orig/fs/ext3/ialloc.c 2005-02-23 01:01:52.366281264 +0300
++++ linux-2.6.5-sles9/fs/ext3/ialloc.c 2005-02-23 01:02:37.398435336 +0300
@@ -647,6 +647,10 @@
DQUOT_FREE_INODE(inode);
goto fail2;
ext3_std_error(sb, err);
Index: linux-2.6.5-sles9/fs/ext3/inode.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2004-11-09 02:22:55.767147520 +0300
-+++ linux-2.6.5-sles9/fs/ext3/inode.c 2004-11-09 02:23:21.592221512 +0300
+--- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2005-02-23 01:01:52.373280200 +0300
++++ linux-2.6.5-sles9/fs/ext3/inode.c 2005-02-23 01:02:37.404434424 +0300
@@ -796,6 +796,17 @@
goto reread;
}
else
Index: linux-2.6.5-sles9/fs/ext3/Makefile
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2004-11-09 02:18:27.604914376 +0300
-+++ linux-2.6.5-sles9/fs/ext3/Makefile 2004-11-09 02:23:21.593221360 +0300
+--- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2005-02-23 01:01:46.501172896 +0300
++++ linux-2.6.5-sles9/fs/ext3/Makefile 2005-02-23 01:02:37.405434272 +0300
@@ -5,7 +5,7 @@
obj-$(CONFIG_EXT3_FS) += ext3.o
ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
Index: linux-2.6.5-sles9/fs/ext3/super.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/super.c 2004-11-09 02:22:56.450043704 +0300
-+++ linux-2.6.5-sles9/fs/ext3/super.c 2004-11-09 02:23:21.597220752 +0300
+--- linux-2.6.5-sles9.orig/fs/ext3/super.c 2005-02-23 01:02:34.072940888 +0300
++++ linux-2.6.5-sles9/fs/ext3/super.c 2005-02-23 01:47:15.291333736 +0300
@@ -389,6 +389,7 @@
struct ext3_super_block *es = sbi->s_es;
int i;
ext3_xattr_put_super(sb);
journal_destroy(sbi->s_journal);
if (!(sb->s_flags & MS_RDONLY)) {
-@@ -447,6 +448,10 @@
+@@ -447,6 +448,8 @@
#endif
ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
ei->vfs_inode.i_version = 1;
-+ ei->i_cached_extent[0] = 0;
-+ ei->i_cached_extent[1] = 0;
-+ ei->i_cached_extent[2] = 0;
-+ ei->i_cached_extent[3] = 0;
++
++ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent));
return &ei->vfs_inode;
}
-@@ -537,7 +542,7 @@
+@@ -537,7 +540,7 @@
Opt_commit, Opt_journal_update, Opt_journal_inum,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
};
static match_table_t tokens = {
-@@ -582,6 +587,8 @@
+@@ -582,6 +585,8 @@
{Opt_iopen, "iopen"},
{Opt_noiopen, "noiopen"},
{Opt_iopen_nopriv, "iopen_nopriv"},
{Opt_err, NULL}
};
-@@ -797,6 +804,12 @@
+@@ -797,6 +802,12 @@
break;
case Opt_ignore:
break;
default:
printk (KERN_ERR
"EXT3-fs: Unrecognized mount option \"%s\" "
-@@ -1449,6 +1462,8 @@
+@@ -1449,6 +1460,8 @@
percpu_counter_mod(&sbi->s_dirs_counter,
ext3_count_dirs(sb));
failed_mount3:
Index: linux-2.6.5-sles9/fs/ext3/ioctl.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/ioctl.c 2004-11-09 02:15:44.610693264 +0300
-+++ linux-2.6.5-sles9/fs/ext3/ioctl.c 2004-11-09 02:23:52.991448104 +0300
+--- linux-2.6.5-sles9.orig/fs/ext3/ioctl.c 2005-02-23 01:01:42.887722224 +0300
++++ linux-2.6.5-sles9/fs/ext3/ioctl.c 2005-02-23 01:02:37.412433208 +0300
@@ -124,6 +124,10 @@
err = ext3_change_inode_journal_flag(inode, jflag);
return err;
return put_user(inode->i_generation, (int *) arg);
Index: linux-2.6.5-sles9/include/linux/ext3_fs.h
===================================================================
---- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2004-11-09 02:22:58.767691368 +0300
-+++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2004-11-09 02:25:17.238640584 +0300
+--- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2005-02-23 01:02:35.823674736 +0300
++++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2005-02-23 01:02:37.414432904 +0300
@@ -186,6 +186,7 @@
#define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
#define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
Index: linux-2.6.5-sles9/include/linux/ext3_extents.h
===================================================================
---- linux-2.6.5-sles9.orig/include/linux/ext3_extents.h 2003-01-30 13:24:37.000000000 +0300
-+++ linux-2.6.5-sles9/include/linux/ext3_extents.h 2004-11-09 02:23:21.606219384 +0300
-@@ -0,0 +1,252 @@
+--- linux-2.6.5-sles9.orig/include/linux/ext3_extents.h 2005-02-17 22:07:57.023609040 +0300
++++ linux-2.6.5-sles9/include/linux/ext3_extents.h 2005-02-23 01:02:37.416432600 +0300
+@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ */
+typedef int (*ext_prepare_callback)(struct ext3_extents_tree *,
+ struct ext3_ext_path *,
-+ struct ext3_extent *, int);
++ struct ext3_ext_cache *);
+
+#define EXT_CONTINUE 0
+#define EXT_BREAK 1
+
+
+#define EXT_MAX_BLOCK 0xffffffff
-+#define EXT_CACHE_MARK 0xffff
+
+
+#define EXT_FIRST_EXTENT(__hdr__) \
+
+#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
+
++#define EXT_CHECK_PATH(tree,path) \
++{ \
++ int depth = EXT_DEPTH(tree); \
++ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \
++ BUG_ON((unsigned long) (path)[depth].p_idx < \
++ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \
++ BUG_ON((unsigned long) (path)[depth].p_ext < \
++ __PAGE_OFFSET && (path)[depth].p_ext != NULL); \
++ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \
++ BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET \
++ && depth != 0); \
++ BUG_ON((path)[0].p_depth != depth); \
++}
++
+
+/*
+ * this structure is used to gather extents from the tree via ioctl
+
Index: linux-2.6.5-sles9/include/linux/ext3_fs_i.h
===================================================================
---- linux-2.6.5-sles9.orig/include/linux/ext3_fs_i.h 2004-11-09 02:22:55.780145544 +0300
-+++ linux-2.6.5-sles9/include/linux/ext3_fs_i.h 2004-11-09 02:23:21.606219384 +0300
-@@ -128,6 +128,8 @@
+--- linux-2.6.5-sles9.orig/include/linux/ext3_fs_i.h 2005-02-23 01:01:52.425272296 +0300
++++ linux-2.6.5-sles9/include/linux/ext3_fs_i.h 2005-02-23 01:45:55.611446920 +0300
+@@ -19,6 +19,7 @@
+ #include <linux/rwsem.h>
+ #include <linux/rbtree.h>
+ #include <linux/seqlock.h>
++#include <linux/ext3_extents.h>
+
+ struct reserve_window {
+ __u32 _rsv_start; /* First byte reserved */
+@@ -128,6 +129,8 @@
*/
struct semaphore truncate_sem;
struct inode vfs_inode;
+
-+ __u32 i_cached_extent[4];
++ struct ext3_ext_cache i_cached_extent;
};
#endif /* _LINUX_EXT3_FS_I */
%diffstat
fs/ext3/Makefile | 2
- fs/ext3/extents.c | 2313 +++++++++++++++++++++++++++++++++++++++++++
+ fs/ext3/extents.c | 2356 +++++++++++++++++++++++++++++++++++++++++++
fs/ext3/ialloc.c | 4
fs/ext3/inode.c | 29
fs/ext3/ioctl.c | 4
- fs/ext3/super.c | 17
- include/linux/ext3_extents.h | 252 ++++
- include/linux/ext3_fs.h | 15
- include/linux/ext3_fs_i.h | 2
- 9 files changed, 2630 insertions(+), 8 deletions(-)
+ fs/ext3/super.c | 15
+ include/linux/ext3_extents.h | 265 ++++
+ include/linux/ext3_fs.h | 17
+ include/linux/ext3_fs_i.h | 3
+ 9 files changed, 2687 insertions(+), 8 deletions(-)
Index: linux-2.6.5-sles9/fs/ext3/mballoc.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/mballoc.c 2003-01-30 13:24:37.000000000 +0300
-+++ linux-2.6.5-sles9/fs/ext3/mballoc.c 2004-11-09 02:34:25.181340632 +0300
-@@ -0,0 +1,1441 @@
+--- linux-2.6.5-sles9.orig/fs/ext3/mballoc.c 2005-02-17 22:07:57.023609040 +0300
++++ linux-2.6.5-sles9/fs/ext3/mballoc.c 2005-02-23 01:56:19.101662000 +0300
+@@ -0,0 +1,1835 @@
+/*
+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+
+/*
+ * TODO:
-+ * - do not scan from the beginning, try to remember first free block
-+ * - mb_mark_used_* may allocate chunk right after splitting buddy
++ * - track min/max extents in each group for better group selection
++ * - is it worthwhile to use buddies directly if req is 2^N blocks?
++ * - mb_mark_used() may allocate chunk right after splitting buddy
+ * - special flag to advice allocator to look for requested + N blocks
+ * this may improve interaction between extents and mballoc
++ * - tree of groups sorted by number of free blocks
++ * - percpu reservation code (hotpath)
++ * - error handling
+ */
+
+/*
+ * with AGRESSIVE_CHECK allocator runs consistency checks over
-+ * structures. this checks slow things down a lot
++ * structures. these checks slow things down a lot
+ */
+#define AGGRESSIVE_CHECK__
+
+/*
++ * with MBALLOC_STATS allocator will collect stats that will be
++ * shown at umount. The collecting costs though!
++ */
++#define MBALLOC_STATS
++
++/*
+ */
+#define MB_DEBUG__
+#ifdef MB_DEBUG
+#define EXT3_BUDDY_FILE ".buddy"
+
+/*
-+ * max. number of chunks to be tracked in ext3_free_extent struct
++ * How long mballoc can look for a best extent (in found extents)
++ */
++#define EXT3_MB_MAX_TO_SCAN 100
++
++/*
++ * This structure is on-disk description of a group for mballoc
++ */
++struct ext3_mb_group_descr {
++ __u16 mgd_first_free; /* first free block in the group */
++ __u16 mgd_free; /* number of free blocks in the group */
++ __u16 mgd_counters[16]; /* number of free blocks by order */
++};
++
++/*
++ * This structure is header of mballoc's file
+ */
-+#define MB_ARR_SIZE 32
++struct ext3_mb_grp_header {
++ __u32 mh_magic;
++};
++
++#define EXT3_MB_MAGIC_V1 0xbaad16fc
++
++
++struct ext3_free_extent {
++ __u16 fe_start;
++ __u16 fe_len;
++ __u16 fe_group;
++};
+
+struct ext3_allocation_context {
+ struct super_block *ac_sb;
+
+ /* search goals */
-+ int ac_g_group;
-+ int ac_g_start;
-+ int ac_g_len;
-+ int ac_g_flags;
++struct ext3_free_extent ac_g_ex;
+
+ /* the best found extent */
-+ int ac_b_group;
-+ int ac_b_start;
-+ int ac_b_len;
++ struct ext3_free_extent ac_b_ex;
+
+ /* number of iterations done. we have to track to limit searching */
-+ int ac_repeats;
-+ int ac_groups_scanned;
-+ int ac_status;
++ unsigned long ac_ex_scanned;
++ __u16 ac_groups_scanned;
++ __u16 ac_found;
++ __u8 ac_status;
++ __u8 ac_flags; /* allocation hints */
++ __u8 ac_repeats;
+};
+
+#define AC_STATUS_CONTINUE 1
+#define AC_STATUS_FOUND 2
-+
++#define AC_STATUS_BREAK 3
+
+struct ext3_buddy {
-+ void *bd_bitmap;
-+ void *bd_buddy;
-+ int bd_blkbits;
+ struct buffer_head *bd_bh;
+ struct buffer_head *bd_bh2;
+ struct ext3_buddy_group_blocks *bd_bd;
+ struct super_block *bd_sb;
++ __u16 bd_blkbits;
++ __u16 bd_group;
+};
-+
-+struct ext3_free_extent {
-+ int fe_start;
-+ int fe_len;
-+ unsigned char fe_orders[MB_ARR_SIZE];
-+ unsigned char fe_nums;
-+ unsigned char fe_back;
-+};
++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bh->b_data)
++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_bh2->b_data)
+
+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
+
-+
+int ext3_create (struct inode *, struct dentry *, int, struct nameidata *);
+struct buffer_head * read_block_bitmap(struct super_block *, unsigned int);
-+void ext3_free_blocks_old(handle_t *, struct inode *, unsigned long, unsigned long);
+int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *);
+int ext3_mb_reserve_blocks(struct super_block *, int);
++void ext3_mb_release_blocks(struct super_block *, int);
+void ext3_mb_poll_new_transaction(struct super_block *, handle_t *);
+void ext3_mb_free_committed_blocks(struct super_block *);
+
+static inline void mb_set_bit(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
++ __set_bit(bit, addr);
++}
++
++static inline void mb_set_bit_atomic(int bit, void *addr)
++{
++ mb_correct_addr_and_bit(bit,addr);
+ set_bit(bit, addr);
+}
+
+static inline void mb_clear_bit(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
++ __clear_bit(bit, addr);
++}
++
++static inline void mb_clear_bit_atomic(int bit, void *addr)
++{
++ mb_correct_addr_and_bit(bit,addr);
+ clear_bit(bit, addr);
+}
+
+static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max)
+{
+ int i = 1;
-+ void *bb;
++ char *bb;
+
-+ J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy);
++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
+ J_ASSERT(max != NULL);
+
+ if (order > e3b->bd_blkbits + 1)
+ /* at order 0 we see each particular block */
+ *max = 1 << (e3b->bd_blkbits + 3);
+ if (order == 0)
-+ return e3b->bd_bitmap;
++ return EXT3_MB_BITMAP(e3b);
+
-+ bb = e3b->bd_buddy;
++ bb = EXT3_MB_BUDDY(e3b);
+ *max = *max >> 1;
+ while (i < order) {
+ bb += 1 << (e3b->bd_blkbits - i);
+ i++;
+ *max = *max >> 1;
+ }
++ J_ASSERT((unsigned) (bb - (char *) EXT3_MB_BUDDY(e3b)) <
++ e3b->bd_sb->s_blocksize);
+ return bb;
+}
+
-+static int ext3_mb_load_desc(struct super_block *sb, int group,
++static int ext3_mb_load_buddy(struct super_block *sb, int group,
+ struct ext3_buddy *e3b)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+ /* load bitmap */
+ e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap);
+ if (e3b->bd_bh == NULL) {
-+ ext3_error(sb, "ext3_mb_load_desc",
++ ext3_error(sb, "ext3_mb_load_buddy",
+ "can't get block for buddy bitmap\n");
+ goto out;
+ }
+ /* load buddy */
+ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy);
+ if (e3b->bd_bh2 == NULL) {
-+ ext3_error(sb, "ext3_mb_load_desc",
++ ext3_error(sb, "ext3_mb_load_buddy",
+ "can't get block for buddy bitmap\n");
+ goto out;
+ }
+ }
+ J_ASSERT(buffer_uptodate(e3b->bd_bh2));
+
-+ e3b->bd_bitmap = e3b->bd_bh->b_data;
-+ e3b->bd_buddy = e3b->bd_bh2->b_data;
+ e3b->bd_blkbits = sb->s_blocksize_bits;
+ e3b->bd_bd = sbi->s_buddy_blocks[group];
+ e3b->bd_sb = sb;
++ e3b->bd_group = group;
+
+ return 0;
+out:
+
+ for (j = 0; j < (1 << order); j++) {
+ k = (i * (1 << order)) + j;
-+ J_ASSERT(mb_test_bit(k, e3b->bd_bitmap));
++ J_ASSERT(mb_test_bit(k, EXT3_MB_BITMAP(e3b)));
+ }
+ count++;
+ }
+ int order = 1;
+ void *bb;
+
-+ J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy);
++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
+ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3)));
+
-+ bb = e3b->bd_buddy;
++ bb = EXT3_MB_BUDDY(e3b);
+ while (order <= e3b->bd_blkbits + 1) {
+ block = block >> 1;
+ if (mb_test_bit(block, bb)) {
+ cur += 32;
+ continue;
+ }
-+ mb_clear_bit(cur, bm);
++ mb_clear_bit_atomic(cur, bm);
+ cur++;
+ }
+}
+ cur += 32;
+ continue;
+ }
-+ mb_set_bit(cur, bm);
++ mb_set_bit_atomic(cur, bm);
+ cur++;
+ }
+}
+ void *buddy, *buddy2;
+
+ mb_check_buddy(e3b);
++
++ e3b->bd_bd->bb_free += count;
++ if (first < e3b->bd_bd->bb_first_free)
++ e3b->bd_bd->bb_first_free = first;
++
+ while (count-- > 0) {
+ block = first++;
+ order = 0;
+
-+ J_ASSERT(!mb_test_bit(block, e3b->bd_bitmap));
-+ mb_set_bit(block, e3b->bd_bitmap);
++ J_ASSERT(!mb_test_bit(block, EXT3_MB_BITMAP(e3b)));
++ mb_set_bit(block, EXT3_MB_BITMAP(e3b));
+ e3b->bd_bd->bb_counters[order]++;
+
+ /* start of the buddy */
+ return 0;
+}
+
-+/*
-+ * returns 1 if out extent is enough to fill needed space
-+ */
-+int mb_make_backward_extent(struct ext3_free_extent *in,
-+ struct ext3_free_extent *out, int needed)
++static int mb_find_extent(struct ext3_buddy *e3b, int order, int block,
++ int needed, struct ext3_free_extent *ex)
+{
-+ int i;
-+
-+ J_ASSERT(in);
-+ J_ASSERT(out);
-+ J_ASSERT(in->fe_nums < MB_ARR_SIZE);
-+
-+ out->fe_len = 0;
-+ out->fe_start = in->fe_start + in->fe_len;
-+ out->fe_nums = 0;
-+
-+ /* for single-chunk extent we need not back order
-+ * also, if an extent doesn't fill needed space
-+ * then it makes no sense to try back order becase
-+ * if we select this extent then it'll be use as is */
-+ if (in->fe_nums < 2 || in->fe_len < needed)
-+ return 0;
-+
-+ i = in->fe_nums - 1;
-+ while (i >= 0 && out->fe_len < needed) {
-+ out->fe_len += (1 << in->fe_orders[i]);
-+ out->fe_start -= (1 << in->fe_orders[i]);
-+ i--;
-+ }
-+ /* FIXME: in some situation fe_orders may be too small to hold
-+ * all the buddies */
-+ J_ASSERT(out->fe_len >= needed);
-+
-+ for (i++; i < in->fe_nums; i++)
-+ out->fe_orders[out->fe_nums++] = in->fe_orders[i];
-+ J_ASSERT(out->fe_nums < MB_ARR_SIZE);
-+ out->fe_back = 1;
-+
-+ return 1;
-+}
-+
-+int mb_find_extent(struct ext3_buddy *e3b, int order, int block,
-+ int needed, struct ext3_free_extent *ex)
-+{
-+ int space = needed;
+ int next, max, ord;
+ void *buddy;
+
+ J_ASSERT(ex != NULL);
+
-+ ex->fe_nums = 0;
-+ ex->fe_len = 0;
-+
+ buddy = mb_find_buddy(e3b, order, &max);
+ J_ASSERT(buddy);
+ J_ASSERT(block < max);
-+ if (!mb_test_bit(block, buddy))
-+ goto nofree;
++ if (!mb_test_bit(block, buddy)) {
++ ex->fe_len = 0;
++ ex->fe_start = 0;
++ ex->fe_group = 0;
++ return 0;
++ }
+
+ if (order == 0) {
+ /* find actual order */
+ block = block >> order;
+ }
+
-+ ex->fe_orders[ex->fe_nums++] = order;
+ ex->fe_len = 1 << order;
+ ex->fe_start = block << order;
-+ ex->fe_back = 0;
-+
-+ while ((space = space - (1 << order)) > 0) {
++ ex->fe_group = e3b->bd_group;
+
-+ buddy = mb_find_buddy(e3b, order, &max);
-+ J_ASSERT(buddy);
++ while ((buddy = mb_find_buddy(e3b, order, &max))) {
+
+ if (block + 1 >= max)
+ break;
+
+ next = (block + 1) * (1 << order);
-+ if (!mb_test_bit(next, e3b->bd_bitmap))
++ if (!mb_test_bit(next, EXT3_MB_BITMAP(e3b)))
+ break;
+
+ ord = mb_find_order_for_block(e3b, next);
+
-+ if ((1 << ord) >= needed) {
-+ /* we dont want to coalesce with self-enough buddies */
-+ break;
-+ }
+ order = ord;
+ block = next >> order;
+ ex->fe_len += 1 << order;
-+
-+ if (ex->fe_nums < MB_ARR_SIZE)
-+ ex->fe_orders[ex->fe_nums++] = order;
+ }
+
-+nofree:
+ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3)));
+ return ex->fe_len;
+}
+
-+static int mb_mark_used_backward(struct ext3_buddy *e3b,
-+ struct ext3_free_extent *ex, int len)
++static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex)
+{
-+ int start = ex->fe_start, len0 = len;
++ int start = ex->fe_start;
++ int len = ex->fe_len;
+ int ord, mlen, max, cur;
++ int len0 = len;
+ void *buddy;
+
-+ start = ex->fe_start + ex->fe_len - 1;
++ e3b->bd_bd->bb_free -= len;
++ if (e3b->bd_bd->bb_first_free == start)
++ e3b->bd_bd->bb_first_free += len;
++
+ while (len) {
+ ord = mb_find_order_for_block(e3b, start);
-+ if (((start >> ord) << ord) == (start - (1 << ord) + 1) &&
-+ len >= (1 << ord)) {
++
++ if (((start >> ord) << ord) == start && len >= (1 << ord)) {
+ /* the whole chunk may be allocated at once! */
+ mlen = 1 << ord;
+ buddy = mb_find_buddy(e3b, ord, &max);
+ J_ASSERT((start >> ord) < max);
+ mb_clear_bit(start >> ord, buddy);
+ e3b->bd_bd->bb_counters[ord]--;
-+ start -= mlen;
++ start += mlen;
+ len -= mlen;
+ J_ASSERT(len >= 0);
-+ J_ASSERT(start >= 0);
+ continue;
+ }
+
+ }
+
+ /* now drop all the bits in bitmap */
-+ mb_clear_bits(e3b->bd_bitmap, ex->fe_start + ex->fe_len - len0, len0);
++ mb_clear_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0);
+
+ mb_check_buddy(e3b);
+
+ return 0;
+}
+
-+static int mb_mark_used_forward(struct ext3_buddy *e3b,
-+ struct ext3_free_extent *ex, int len)
++/*
++ * Must be called under group lock!
++ */
++static void ext3_mb_use_best_found(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
+{
-+ int start = ex->fe_start, len0 = len;
-+ int ord, mlen, max, cur;
-+ void *buddy;
++ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
++ mb_mark_used(e3b, &ac->ac_b_ex);
++ ac->ac_status = AC_STATUS_FOUND;
++}
+
-+ while (len) {
-+ ord = mb_find_order_for_block(e3b, start);
++/*
++ * The routine checks whether found extent is good enough. If it is,
++ * then the extent gets marked used and flag is set to the context
++ * to stop scanning. Otherwise, the extent is compared with the
++ * previous found extent and if new one is better, then it's stored
++ * in the context. Later, the best found extent will be used, if
++ * mballoc can't find good enough extent.
++ *
++ * FIXME: real allocation policy is to be designed yet!
++ */
++static void ext3_mb_measure_extent(struct ext3_allocation_context *ac,
++ struct ext3_free_extent *ex,
++ struct ext3_buddy *e3b)
++{
++ int factor = EXT3_SB(ac->ac_sb)->s_mb_factor;
++ struct ext3_free_extent *bex = &ac->ac_b_ex;
++ int diff = ac->ac_g_ex.fe_len - ex->fe_len;
+
-+ if (((start >> ord) << ord) == start && len >= (1 << ord)) {
-+ /* the whole chunk may be allocated at once! */
-+ mlen = 1 << ord;
-+ buddy = mb_find_buddy(e3b, ord, &max);
-+ J_ASSERT((start >> ord) < max);
-+ mb_clear_bit(start >> ord, buddy);
-+ e3b->bd_bd->bb_counters[ord]--;
-+ start += mlen;
-+ len -= mlen;
-+ J_ASSERT(len >= 0);
-+ continue;
-+ }
++ J_ASSERT(ex->fe_len > 0);
++ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8);
++ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8);
+
-+ /* we have to split large buddy */
-+ J_ASSERT(ord > 0);
-+ buddy = mb_find_buddy(e3b, ord, &max);
-+ mb_clear_bit(start >> ord, buddy);
-+ e3b->bd_bd->bb_counters[ord]--;
++ ac->ac_found++;
+
-+ ord--;
-+ cur = (start >> ord) & ~1U;
-+ buddy = mb_find_buddy(e3b, ord, &max);
-+ mb_set_bit(cur, buddy);
-+ mb_set_bit(cur + 1, buddy);
-+ e3b->bd_bd->bb_counters[ord]++;
-+ e3b->bd_bd->bb_counters[ord]++;
++ /*
++ * The special case - take what you catch first
++ */
++ if (ac->ac_flags & EXT3_MB_HINT_FIRST) {
++ *bex = *ex;
++ ext3_mb_use_best_found(ac, e3b);
++ return;
+ }
+
-+ /* now drop all the bits in bitmap */
-+ mb_clear_bits(e3b->bd_bitmap, ex->fe_start, len0);
++ /*
++ * Let's check whether the chunk is good enough
++ */
++ if (ex->fe_len >= ac->ac_g_ex.fe_len) {
++ *bex = *ex;
++ ext3_mb_use_best_found(ac, e3b);
++ return;
++ }
+
-+ mb_check_buddy(e3b);
++ /*
++ * If the request is very large, then it makes sense to use large
++ * chunks for it. Even if they don't satisfy whole request.
++ */
++ if (ex->fe_len > 1000) {
++ *bex = *ex;
++ ext3_mb_use_best_found(ac, e3b);
++ return;
++ }
+
-+ return 0;
++ /*
++ * Sometimes it's worth taking a close chunk
++ */
++ if (factor && (ac->ac_g_ex.fe_len * 100) / (diff * 100) >= factor) {
++ *bex = *ex;
++ ext3_mb_use_best_found(ac, e3b);
++ return;
++ }
++
++ /*
++ * If this is first found extent, just store it in the context
++ */
++ if (bex->fe_len == 0) {
++ *bex = *ex;
++ return;
++ }
++
++ /*
++ * If new found extent is better, store it in the context
++ * FIXME: possibly the policy should be more complex?
++ */
++ if (ex->fe_len > bex->fe_len) {
++ *bex = *ex;
++ }
++
++ /*
++ * We don't want to scan for a whole year
++ */
++ if (ac->ac_found > EXT3_MB_MAX_TO_SCAN)
++ ac->ac_status = AC_STATUS_BREAK;
+}
+
-+int inline mb_mark_used(struct ext3_buddy *e3b,
-+ struct ext3_free_extent *ex, int len)
++static int ext3_mb_try_best_found(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
+{
-+ int err;
++ struct ext3_free_extent ex = ac->ac_b_ex;
++ int group = ex.fe_group, max, err;
+
-+ J_ASSERT(ex);
-+ if (ex->fe_back == 0)
-+ err = mb_mark_used_forward(e3b, ex, len);
-+ else
-+ err = mb_mark_used_backward(e3b, ex, len);
-+ return err;
++ J_ASSERT(ex.fe_len > 0);
++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b);
++ if (err)
++ return err;
++
++ ext3_lock_group(ac->ac_sb, group);
++ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex);
++
++ if (max > 0)
++ ext3_mb_use_best_found(ac, e3b);
++
++ ext3_unlock_group(ac->ac_sb, group);
++
++ if (ac->ac_status == AC_STATUS_FOUND)
++ ext3_mb_dirty_buddy(e3b);
++ ext3_mb_release_desc(e3b);
++
++ return 0;
+}
+
-+int ext3_mb_new_in_group(struct ext3_allocation_context *ac,
-+ struct ext3_buddy *e3b, int group)
++static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
+{
-+ struct super_block *sb = ac->ac_sb;
-+ int err, gorder, max, i;
-+ struct ext3_free_extent curex;
-+
-+ /* let's know order of allocation */
-+ gorder = 0;
-+ while (ac->ac_g_len > (1 << gorder))
-+ gorder++;
-+
-+ if ((ac->ac_g_flags & 1) && ac->ac_g_group == group) {
-+ /* someone asks for space at this specified block
-+ * probably he wants to merge it into existing extent */
-+ if (mb_test_bit(ac->ac_g_start, e3b->bd_bitmap)) {
-+ /* good. at least one block is free */
-+ max = mb_find_extent(e3b, 0, ac->ac_g_start,
-+ ac->ac_g_len, &curex);
-+ max = min(curex.fe_len, ac->ac_g_len);
-+ mb_mark_used(e3b, &curex, max);
-+
-+ ac->ac_b_group = group;
-+ ac->ac_b_start = curex.fe_start;
-+ ac->ac_b_len = max;
-+ ac->ac_status = AC_STATUS_FOUND;
-+ err = 0;
-+ goto out;
-+ }
-+ /* don't try to find goal anymore */
-+ ac->ac_g_flags &= ~1;
++ int group = ac->ac_g_ex.fe_group, max, err;
++ struct ext3_free_extent ex;
++
++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b);
++ if (err)
++ return err;
++
++ ext3_lock_group(ac->ac_sb, group);
++ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start,
++ ac->ac_g_ex.fe_len, &ex);
++
++ if (max > 0) {
++ J_ASSERT(ex.fe_len > 0);
++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group);
++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start);
++ ac->ac_b_ex = ex;
++ ext3_mb_use_best_found(ac, e3b);
+ }
++ ext3_unlock_group(ac->ac_sb, group);
+
-+ i = 0;
-+ while (1) {
-+ i = find_next_bit(e3b->bd_bitmap, sb->s_blocksize * 8, i);
-+ if (i >= sb->s_blocksize * 8)
-+ break;
++ if (ac->ac_status == AC_STATUS_FOUND)
++ ext3_mb_dirty_buddy(e3b);
++ ext3_mb_release_desc(e3b);
+
-+ max = mb_find_extent(e3b, 0, i, ac->ac_g_len, &curex);
-+ if (max >= ac->ac_g_len) {
-+ max = min(curex.fe_len, ac->ac_g_len);
-+ mb_mark_used(e3b, &curex, max);
-+
-+ ac->ac_b_group = group;
-+ ac->ac_b_start = curex.fe_start;
-+ ac->ac_b_len = max;
-+ ac->ac_status = AC_STATUS_FOUND;
++ return 0;
++}
++/*
++ * The routine scans the group and measures all found extents.
++ * To bound the scan it reads the group's free-block count from
++ * e3b->bd_bd->bb_free and stops once that many blocks are seen.
++ */
++static void ext3_mb_scan_group(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
++{
++ struct super_block *sb = ac->ac_sb;
++ void *bitmap = EXT3_MB_BITMAP(e3b);
++ struct ext3_free_extent ex;
++ int i, free;
++
++ free = e3b->bd_bd->bb_free;
++ J_ASSERT(free > 0);
++
++ i = e3b->bd_bd->bb_first_free;
++
++ while (free && ac->ac_status != AC_STATUS_FOUND) {
++ i = find_next_bit(bitmap, sb->s_blocksize * 8, i);
++ if (i >= sb->s_blocksize * 8) {
++ J_ASSERT(free == 0);
+ break;
+ }
-+ i += max;
-+ }
+
-+ return 0;
++ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex);
++ J_ASSERT(ex.fe_len > 0);
++ J_ASSERT(free >= ex.fe_len);
+
-+out:
-+ return err;
++ ext3_mb_measure_extent(ac, &ex, e3b);
++
++ i += ex.fe_len;
++ free -= ex.fe_len;
++ }
+}
+
-+int mb_good_group(struct ext3_allocation_context *ac, int group, int cr)
++static int ext3_mb_good_group(struct ext3_allocation_context *ac,
++ int group, int cr)
+{
-+ struct ext3_group_desc *gdp;
-+ int free_blocks;
++ int free;
+
-+ gdp = ext3_get_group_desc(ac->ac_sb, group, NULL);
-+ if (!gdp)
-+ return 0;
-+ free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
-+ if (free_blocks == 0)
-+ return 0;
++ J_ASSERT(cr >= 0 && cr < 3);
+
-+ /* someone wants this block very much */
-+ if ((ac->ac_g_flags & 1) && ac->ac_g_group == group)
-+ return 1;
++ free = EXT3_SB(ac->ac_sb)->s_buddy_blocks[group]->bb_free;
++ if (free == 0)
++ return 0;
+
-+ /* FIXME: I'd like to take fragmentation into account here */
+ if (cr == 0) {
-+ if (free_blocks >= ac->ac_g_len >> 1)
++ if (free >= ac->ac_g_ex.fe_len >> 1)
+ return 1;
+ } else if (cr == 1) {
-+ if (free_blocks >= ac->ac_g_len >> 2)
++ if (free >= ac->ac_g_ex.fe_len >> 2)
+ return 1;
+ } else if (cr == 2) {
+ return 1;
-+ } else {
-+ BUG();
+ }
+ return 0;
+}
+ sbi = EXT3_SB(sb);
+ es = EXT3_SB(sb)->s_es;
+
-+ if (!(flags & 2)) {
++ /*
++ * We can't allocate > group size
++ */
++ if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10)
++ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10;
++
++ if (!(flags & EXT3_MB_HINT_RESERVED)) {
+ /* someone asks for non-reserved blocks */
+ BUG_ON(*len > 1);
+ err = ext3_mb_reserve_blocks(sb, 1);
+ EXT3_BLOCKS_PER_GROUP(sb));
+
+ /* set up allocation goals */
-+ ac.ac_b_group = ac.ac_b_start = ac.ac_b_len = 0;
-+ ac.ac_status = 0;
++ ac.ac_b_ex.fe_group = 0;
++ ac.ac_b_ex.fe_start = 0;
++ ac.ac_b_ex.fe_len = 0;
++ ac.ac_status = AC_STATUS_CONTINUE;
+ ac.ac_groups_scanned = 0;
++ ac.ac_ex_scanned = 0;
++ ac.ac_found = 0;
+ ac.ac_sb = inode->i_sb;
-+ ac.ac_g_group = group;
-+ ac.ac_g_start = block;
-+ ac.ac_g_len = *len;
-+ ac.ac_g_flags = flags;
++ ac.ac_g_ex.fe_group = group;
++ ac.ac_g_ex.fe_start = block;
++ ac.ac_g_ex.fe_len = *len;
++ ac.ac_flags = flags;
++
++ /*
++ * Sometimes, caller may want to merge even small number
++ * of blocks to an existing extent
++ */
++ if (ac.ac_flags & EXT3_MB_HINT_MERGE) {
++ err = ext3_mb_find_by_goal(&ac, &e3b);
++ if (err)
++ goto out_err;
++ if (ac.ac_status == AC_STATUS_FOUND)
++ goto found;
++ }
+
-+ /* loop over the groups */
-+ for (cr = 0; cr < 3 && ac.ac_status != AC_STATUS_FOUND; cr++) {
++ /*
++ * FIXME
++ * If requested chunk is power of 2 length, we can try
++ * to exploit buddy nature to speed allocation up
++ */
++
++
++ /*
++ * Let's just scan groups to find more or less suitable blocks
++ */
++ cr = 0;
++repeat:
++ for (; cr < 3 && ac.ac_status == AC_STATUS_CONTINUE; cr++) {
+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) {
+ if (group == EXT3_SB(sb)->s_groups_count)
+ group = 0;
+
+ /* check is group good for our criteries */
-+ if (!mb_good_group(&ac, group, cr))
++ if (!ext3_mb_good_group(&ac, group, cr))
+ continue;
+
-+ err = ext3_mb_load_desc(ac.ac_sb, group, &e3b);
++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
+ if (err)
+ goto out_err;
+
+ ext3_lock_group(sb, group);
-+ if (!mb_good_group(&ac, group, cr)) {
++ if (!ext3_mb_good_group(&ac, group, cr)) {
+ /* someone did allocation from this group */
+ ext3_unlock_group(sb, group);
+ ext3_mb_release_desc(&e3b);
+ continue;
+ }
+
-+ err = ext3_mb_new_in_group(&ac, &e3b, group);
++ ext3_mb_scan_group(&ac, &e3b);
+ ext3_unlock_group(sb, group);
++
+ if (ac.ac_status == AC_STATUS_FOUND)
+ ext3_mb_dirty_buddy(&e3b);
+ ext3_mb_release_desc(&e3b);
++
+ if (err)
+ goto out_err;
-+ if (ac.ac_status == AC_STATUS_FOUND)
++ if (ac.ac_status != AC_STATUS_CONTINUE)
+ break;
+ }
+ }
+
++ if (ac.ac_status == AC_STATUS_BREAK &&
++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
++ /*
++ * We've been searching too long. Let's try to allocate
++ * the best chunk we've found so far
++ */
++ printk(KERN_ERR "EXT3-fs: too long searching (%d/%d)\n",
++ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len);
++ ext3_mb_try_best_found(&ac, &e3b);
++ if (ac.ac_status != AC_STATUS_FOUND) {
++ /*
++ * Someone more lucky has already allocated it.
++ * The only thing we can do is just take first
++ * found block(s)
++ */
++ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n");
++ ac.ac_b_ex.fe_group = 0;
++ ac.ac_b_ex.fe_start = 0;
++ ac.ac_b_ex.fe_len = 0;
++ ac.ac_status = AC_STATUS_CONTINUE;
++ ac.ac_flags |= EXT3_MB_HINT_FIRST;
++ cr = 2;
++ goto repeat;
++ }
++ }
++
+ if (ac.ac_status != AC_STATUS_FOUND) {
-+ /* unfortunately, we can't satisfy this request */
-+ J_ASSERT(ac.ac_b_len == 0);
++ /*
++ * We aren't lucky definitely
++ */
++ J_ASSERT(ac.ac_b_ex.fe_len == 0);
+ DQUOT_FREE_BLOCK(inode, *len);
+ *errp = -ENOSPC;
+ block = 0;
++#if 1
++ printk(KERN_ERR "EXT3-fs: cant allocate: status %d, flags %d\n",
++ ac.ac_status, ac.ac_flags);
++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n",
++ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group,
++ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr);
++ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n",
++ sbi->s_blocks_reserved, ac.ac_found);
++ printk("EXT3-fs: groups: ");
++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
++ printk("%d: %d ", i,
++ sbi->s_buddy_blocks[i]->bb_free);
++ printk("\n");
++#endif
+ goto out;
+ }
+
++found:
++ J_ASSERT(ac.ac_b_ex.fe_len > 0);
++
+ /* good news - free block(s) have been found. now it's time
+ * to mark block(s) in good old journaled bitmap */
-+ block = ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb)
-+ + ac.ac_b_start + le32_to_cpu(es->s_first_data_block);
++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb)
++ + ac.ac_b_ex.fe_start
++ + le32_to_cpu(es->s_first_data_block);
+
+ /* we made a desicion, now mark found blocks in good old
+ * bitmap to be journaled */
+ ext3_debug("using block group %d(%d)\n",
+ ac.ac_b_group.group, gdp->bg_free_blocks_count);
+
-+ bitmap_bh = read_block_bitmap(sb, ac.ac_b_group);
++ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group);
+ if (!bitmap_bh) {
+ *errp = -EIO;
+ goto out_err;
+ goto out_err;
+ }
+
-+ gdp = ext3_get_group_desc(sb, ac.ac_b_group, &gdp_bh);
++ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh);
+ if (!gdp) {
+ *errp = -EIO;
+ goto out_err;
+ if (err)
+ goto out_err;
+
-+ block = ac.ac_b_start + ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb)
-+ + le32_to_cpu(es->s_first_data_block);
++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb)
++ + ac.ac_b_ex.fe_start
++ + le32_to_cpu(es->s_first_data_block);
+
+ if (block == le32_to_cpu(gdp->bg_block_bitmap) ||
+ block == le32_to_cpu(gdp->bg_inode_bitmap) ||
+ ext3_error(sb, "ext3_new_block",
+ "Allocating block in system zone - "
+ "block = %u", block);
-+#if 0
++#if AGGRESSIVE_CHECK
+ for (i = 0; i < ac.ac_b_len; i++)
-+ J_ASSERT(!mb_test_bit(ac.ac_b_start + i, bitmap_bh->b_data));
++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data));
+#endif
-+ mb_set_bits(bitmap_bh->b_data, ac.ac_b_start, ac.ac_b_len);
++ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len);
+
-+ ext3_lock_group(sb, ac.ac_b_group);
++ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group));
+ gdp->bg_free_blocks_count =
-+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) -
-+ ac.ac_b_len);
-+ ext3_unlock_group(sb, ac.ac_b_group);
-+ percpu_counter_mod(&sbi->s_freeblocks_counter, -ac.ac_b_len);
++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)
++ - ac.ac_b_ex.fe_len);
++ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group));
++ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len);
+
+ err = ext3_journal_dirty_metadata(handle, bitmap_bh);
+ if (err)
+ brelse(bitmap_bh);
+
+ /* drop non-allocated, but dquote'd blocks */
-+ J_ASSERT(*len >= ac.ac_b_len);
-+ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_len);
++ J_ASSERT(*len >= ac.ac_b_ex.fe_len);
++ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len);
+
-+ *len = ac.ac_b_len;
++ *len = ac.ac_b_ex.fe_len;
++ J_ASSERT(*len > 0);
+ J_ASSERT(block != 0);
+ goto out;
+
+ *errp = err;
+ block = 0;
+out:
-+ if (!(flags & 2)) {
++ if (!(flags & EXT3_MB_HINT_RESERVED)) {
+ /* block wasn't reserved before and we reserved it
+ * at the beginning of allocation. it doesn't matter
+ * whether we allocated anything or we failed: time
+ * path only, here is single block always */
+ ext3_mb_release_blocks(sb, 1);
+ }
++#ifdef MBALLOC_STATS
++ if (ac.ac_g_ex.fe_len > 1) {
++ spin_lock(&sbi->s_bal_lock);
++ sbi->s_bal_reqs++;
++ sbi->s_bal_allocated += *len;
++ if (*len >= ac.ac_g_ex.fe_len)
++ sbi->s_bal_success++;
++ sbi->s_bal_ex_scanned += ac.ac_found;
++ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start &&
++ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group)
++ sbi->s_bal_goals++;
++ if (ac.ac_found > EXT3_MB_MAX_TO_SCAN)
++ sbi->s_bal_breaks++;
++ spin_unlock(&sbi->s_bal_lock);
++ }
++#endif
+ return block;
+}
+
-+int ext3_mb_generate_buddy(struct super_block *sb, int group)
++int ext3_mb_get_descr_loc(struct ext3_buddy *e3b, struct buffer_head **bh,
++ struct ext3_mb_group_descr **grp)
+{
++ struct super_block *sb = e3b->bd_sb;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ int descr_per_block, err, offset;
++ struct ext3_mb_grp_header *hdr;
++ unsigned long block;
++
++ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header))
++ / sizeof(struct ext3_mb_group_descr);
++ block = e3b->bd_group / descr_per_block;
++ *bh = ext3_bread(NULL, sbi->s_buddy, block, 0, &err);
++ if (*bh == NULL) {
++ printk(KERN_ERR "EXT3-fs: cant getblk descr for group %d: %d\n",
++ e3b->bd_group, err);
++ return err;
++ }
++
++ hdr = (struct ext3_mb_grp_header *) (*bh)->b_data;
++ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) {
++ printk(KERN_ERR "EXT3-fs: invalid magic in group %d!\n",
++ e3b->bd_group);
++ brelse(*bh);
++ *bh = NULL;
++ return -EIO;
++ }
++
++ offset = e3b->bd_group % descr_per_block
++ * sizeof(struct ext3_mb_group_descr)
++ + sizeof(struct ext3_mb_grp_header);
++ *grp = (struct ext3_mb_group_descr *) ((*bh)->b_data + offset);
++
++ return 0;
++}
++
++int ext3_mb_load_descr(struct ext3_buddy *e3b)
++{
++ struct ext3_mb_group_descr *grp;
++ struct ext3_group_desc *gdp;
+ struct buffer_head *bh;
-+ int i, err, count = 0;
-+ struct ext3_buddy e3b;
++ int err, i;
++
++ err = ext3_mb_get_descr_loc(e3b, &bh, &grp);
++ if (err)
++ return err;
+
-+ err = ext3_mb_load_desc(sb, group, &e3b);
++ e3b->bd_bd->bb_first_free = grp->mgd_first_free;
++ e3b->bd_bd->bb_free = grp->mgd_free;
++ for (i = 0; i < e3b->bd_blkbits; i++) {
++ J_ASSERT(i < 16);
++ e3b->bd_bd->bb_counters[i] = grp->mgd_counters[i];
++ }
++ brelse(bh);
++
++ /* additional checks against old group descriptor */
++ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL);
++ if (!gdp)
++ return -EIO;
++ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) {
++ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n",
++ e3b->bd_group, e3b->bd_bd->bb_free,
++ le16_to_cpu(gdp->bg_free_blocks_count));
++ BUG();
++ return -ENODATA;
++ }
++
++ return 0;
++}
++
++
++int ext3_mb_update_descr(struct ext3_buddy *e3b)
++{
++ struct ext3_mb_group_descr *grp;
++ struct ext3_group_desc *ogdp;
++ struct buffer_head *bh;
++ handle_t *handle;
++ int err, i;
++
++ /* additional checks against old group descriptor */
++ ogdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL);
++ if (!ogdp)
++ return -EIO;
++ if (e3b->bd_bd->bb_free != le16_to_cpu(ogdp->bg_free_blocks_count)) {
++ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n",
++ e3b->bd_group, e3b->bd_bd->bb_free,
++ le16_to_cpu(ogdp->bg_free_blocks_count));
++ BUG();
++ return -ENODATA;
++ }
++
++ err = ext3_mb_get_descr_loc(e3b, &bh, &grp);
+ if (err)
++ return err;
++
++ handle = journal_start(EXT3_SB(e3b->bd_sb)->s_journal, 1);
++ if (IS_ERR(handle)) {
++ err = PTR_ERR(handle);
++ handle = NULL;
+ goto out;
-+ memset(e3b.bd_bh->b_data, 0, sb->s_blocksize);
-+ memset(e3b.bd_bh2->b_data, 0, sb->s_blocksize);
++ }
+
-+ bh = read_block_bitmap(sb, group);
-+ if (bh == NULL) {
-+ err = -EIO;
-+ goto out2;
++ err = ext3_journal_get_write_access(handle, bh);
++ if (err)
++ goto out;
++ grp->mgd_first_free = e3b->bd_bd->bb_first_free;
++ grp->mgd_free = e3b->bd_bd->bb_free;
++ for (i = 0; i < e3b->bd_blkbits; i++) {
++ J_ASSERT(i < 16);
++ grp->mgd_counters[i] = e3b->bd_bd->bb_counters[i];
+ }
++ err = ext3_journal_dirty_metadata(handle, bh);
++ if (err)
++ goto out;
++ err = 0;
++out:
++ brelse(bh);
++ if (handle)
++ ext3_journal_stop(handle);
++ return err;
++}
++
++int ext3_mb_generate_buddy(struct ext3_buddy *e3b)
++{
++ struct super_block *sb = e3b->bd_sb;
++ struct buffer_head *bh;
++ int i, count = 0;
++
++ memset(e3b->bd_bh->b_data, 0, sb->s_blocksize);
++ memset(e3b->bd_bh2->b_data, 0, sb->s_blocksize);
++
++ bh = read_block_bitmap(sb, e3b->bd_group);
++ if (bh == NULL)
++ return -EIO;
++
++ /* mb_free_blocks will set real free */
++ e3b->bd_bd->bb_first_free = 1 << 15;
+
+ /* loop over the blocks, and create buddies for free ones */
+ for (i = 0; i < sb->s_blocksize * 8; i++) {
+ if (!mb_test_bit(i, (void *) bh->b_data)) {
-+ mb_free_blocks(&e3b, i, 1);
++ mb_free_blocks(e3b, i, 1);
+ count++;
+ }
+ }
+ brelse(bh);
-+ mb_check_buddy(&e3b);
-+ ext3_mb_dirty_buddy(&e3b);
++ mb_check_buddy(e3b);
++ ext3_mb_dirty_buddy(e3b);
+
-+out2:
-+ ext3_mb_release_desc(&e3b);
-+out:
-+ return err;
++ return 0;
+}
+
+EXPORT_SYMBOL(ext3_mb_new_blocks);
+ (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \
+ 2 * EXT3_SINGLEDATA_TRANS_BLOCKS)
+
-+int ext3_mb_init_backend(struct super_block *sb)
++int ext3_mb_init_backend(struct super_block *sb, int *created)
+{
++ int err, i, len, descr_per_block, buddy_offset, size;
+ struct inode *root = sb->s_root->d_inode;
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct ext3_mb_grp_header *hdr;
++ struct buffer_head *bh = NULL;
++ unsigned long block;
+ struct dentry *db;
++ handle_t *handle;
+ tid_t target;
-+ int err, i;
+
-+ sbi->s_buddy_blocks = kmalloc(sizeof(struct ext3_buddy_group_blocks *) *
-+ sbi->s_groups_count, GFP_KERNEL);
++ *created = 0;
++ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count;
++ sbi->s_buddy_blocks = kmalloc(len, GFP_KERNEL);
+ if (sbi->s_buddy_blocks == NULL) {
-+ printk("EXT3-fs: can't allocate mem for buddy maps\n");
++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n");
+ return -ENOMEM;
+ }
-+ memset(sbi->s_buddy_blocks, 0,
-+ sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count);
++ memset(sbi->s_buddy_blocks, 0, len);
+ sbi->s_buddy = NULL;
+
+ down(&root->i_sem);
-+ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root,
-+ strlen(EXT3_BUDDY_FILE));
++ len = strlen(EXT3_BUDDY_FILE);
++ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, len);
+ if (IS_ERR(db)) {
+ err = PTR_ERR(db);
-+ printk("EXT3-fs: can't lookup buddy file: %d\n", err);
++ printk(KERN_ERR "EXT3-fs: cant lookup buddy: %d\n", err);
++ up(&root->i_sem);
+ goto out;
+ }
+
-+ if (db->d_inode != NULL) {
-+ sbi->s_buddy = igrab(db->d_inode);
-+ goto map;
++ if (db->d_inode == NULL) {
++ err = ext3_create(root, db, S_IFREG, NULL);
++ if (err) {
++ printk(KERN_ERR "EXT3-fs: cant create buddy: %d\n", err);
++ up(&root->i_sem);
++ goto out;
++ }
++ db->d_inode->i_flags |= S_IMMUTABLE | S_NOATIME;
++ *created = 1;
++ printk("EXT3-fs: no buddy file, regenerate\n");
++ }
++ up(&root->i_sem);
++ sbi->s_buddy = igrab(db->d_inode);
++
++ /* calculate needed size */
++ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header))
++ / sizeof(struct ext3_mb_group_descr);
++ buddy_offset = (sbi->s_groups_count + descr_per_block - 1)
++ / descr_per_block;
++ len = sbi->s_groups_count * sb->s_blocksize * 2 +
++ buddy_offset * sb->s_blocksize;
++ if (len != i_size_read(sbi->s_buddy)) {
++ printk("EXT3-fs: wrong i_size (%u != %u), regenerate\n",
++ (unsigned) len, (unsigned) i_size_read(sbi->s_buddy));
++ *created = 1;
+ }
+
-+ err = ext3_create(root, db, S_IFREG, NULL);
-+ if (err) {
-+ printk("error while creation buddy file: %d\n", err);
-+ } else {
-+ sbi->s_buddy = igrab(db->d_inode);
++ /* read/create mb group descriptors */
++ for (i = 0; i < buddy_offset; i++) {
++ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS);
++ if (IS_ERR(handle)) {
++ printk(KERN_ERR "EXT3-fs: cant start transaction\n");
++ err = PTR_ERR(handle);
++ goto err_out;
++ }
++
++ bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err);
++ if (bh == NULL) {
++ printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err);
++ goto err_out;
++ }
++ hdr = (struct ext3_mb_grp_header *) bh->b_data;
++ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) {
++ err = ext3_journal_get_write_access(handle, bh);
++ if (err)
++ goto err_out;
++ *created = 1;
++ printk("EXT3-fs: invalid header 0x%x in %d, regenerate\n", hdr->mh_magic, i);
++ hdr->mh_magic = EXT3_MB_MAGIC_V1;
++ err = ext3_journal_dirty_metadata(handle, bh);
++ if (err)
++ goto err_out;
++ }
++ brelse(bh);
++ ext3_journal_stop(handle);
+ }
+
-+map:
++ len = sizeof(struct ext3_buddy_group_blocks);
++ len += sizeof(unsigned) * (sb->s_blocksize_bits + 2);
+ for (i = 0; i < sbi->s_groups_count; i++) {
-+ struct buffer_head *bh = NULL;
-+ handle_t *handle;
+
-+ sbi->s_buddy_blocks[i] =
-+ kmalloc(sizeof(struct ext3_buddy_group_blocks),
-+ GFP_KERNEL);
++ sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL);
+ if (sbi->s_buddy_blocks[i] == NULL) {
-+ printk("EXT3-fs: can't allocate mem for buddy\n");
++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n");
+ err = -ENOMEM;
+ goto out2;
+ }
++ memset(sbi->s_buddy_blocks[i], 0, len);
+
+ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS);
+ if (IS_ERR(handle)) {
++ printk(KERN_ERR "EXT3-fs: cant start transaction\n");
+ err = PTR_ERR(handle);
+ goto out2;
+ }
+
+ /* allocate block for bitmap */
-+ bh = ext3_getblk(handle, sbi->s_buddy, i * 2, 1, &err);
++ block = buddy_offset + i * 2;
++ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err);
+ if (bh == NULL) {
-+ printk("can't get block for buddy bitmap: %d\n", err);
++ printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", err);
+ goto out2;
+ }
+ sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr;
+ brelse(bh);
+
+ /* allocate block for buddy */
-+ bh = ext3_getblk(handle, sbi->s_buddy, i * 2 + 1, 1, &err);
++ block = buddy_offset + i * 2 + 1;
++ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err);
+ if (bh == NULL) {
-+ printk("can't get block for buddy: %d\n", err);
++ printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err);
+ goto out2;
+ }
+ sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr;
+ brelse(bh);
++
++ size = (block + 1) << sbi->s_buddy->i_blkbits;
++ if (size > sbi->s_buddy->i_size) {
++ *created = 1;
++ EXT3_I(sbi->s_buddy)->i_disksize = size;
++ i_size_write(sbi->s_buddy, size);
++ mark_inode_dirty(sbi->s_buddy);
++ }
+ ext3_journal_stop(handle);
++
+ spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock);
+ sbi->s_buddy_blocks[i]->bb_md_cur = NULL;
+ sbi->s_buddy_blocks[i]->bb_tid = 0;
+out2:
+ dput(db);
+out:
-+ up(&root->i_sem);
+ return err;
++
++err_out:
++ return err;
++}
++
++int ext3_mb_write_descriptors(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct ext3_buddy e3b;
++ int ret = 0, i, err;
++
++ for (i = 0; i < sbi->s_groups_count; i++) {
++ if (sbi->s_buddy_blocks[i] == NULL)
++ continue;
++
++ err = ext3_mb_load_buddy(sb, i, &e3b);
++ if (err == 0) {
++ ext3_mb_update_descr(&e3b);
++ ext3_mb_release_desc(&e3b);
++ } else
++ ret = err;
++ }
++ return ret;
+}
+
+int ext3_mb_release(struct super_block *sb)
+ ext3_mb_free_committed_blocks(sb);
+
+ if (sbi->s_buddy_blocks) {
-+ for (i = 0; i < sbi->s_groups_count; i++)
-+ if (sbi->s_buddy_blocks[i])
-+ kfree(sbi->s_buddy_blocks[i]);
++ ext3_mb_write_descriptors(sb);
++ for (i = 0; i < sbi->s_groups_count; i++) {
++ if (sbi->s_buddy_blocks[i] == NULL)
++ continue;
++ kfree(sbi->s_buddy_blocks[i]);
++ }
+ kfree(sbi->s_buddy_blocks);
+ }
+ if (sbi->s_buddy)
+ if (sbi->s_blocks_reserved)
+ printk("ext3-fs: %ld blocks being reserved at umount!\n",
+ sbi->s_blocks_reserved);
++#ifdef MBALLOC_STATS
++ printk("EXT3-fs: mballoc: %lu blocks %lu reqs (%lu success)\n",
++ sbi->s_bal_allocated, sbi->s_bal_reqs, sbi->s_bal_success);
++ printk("EXT3-fs: mballoc: %lu extents scanned, %lu goal hits, %lu breaks\n",
++ sbi->s_bal_ex_scanned, sbi->s_bal_goals, sbi->s_bal_breaks);
++#endif
+ return 0;
+}
+
-+int ext3_mb_init(struct super_block *sb)
++int ext3_mb_init(struct super_block *sb, int needs_recovery)
+{
-+ struct ext3_super_block *es;
-+ int i;
++ struct ext3_buddy e3b;
++ int i, err, created;
+
+ if (!test_opt(sb, MBALLOC))
+ return 0;
+
+ /* init file for buddy data */
+ clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
-+ if (ext3_mb_init_backend(sb))
-+ return 0;
++ if ((err = ext3_mb_init_backend(sb, &created)))
++ return err;
+
-+ es = EXT3_SB(sb)->s_es;
-+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
-+ ext3_mb_generate_buddy(sb, i);
++repeat:
++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
++ err = ext3_mb_load_buddy(sb, i, &e3b);
++ if (err) {
++ /* FIXME: release backend */
++ return err;
++ }
++ if (created || needs_recovery)
++ ext3_mb_generate_buddy(&e3b);
++ else
++ err = ext3_mb_load_descr(&e3b);
++ ext3_mb_release_desc(&e3b);
++ if (err == -ENODATA) {
++ created = 1;
++ goto repeat;
++ }
++ }
++ if (created || needs_recovery)
++ printk(KERN_ERR "EXT3-fs: generated buddies for %lu groups\n",
++ EXT3_SB(sb)->s_groups_count);
+ spin_lock_init(&EXT3_SB(sb)->s_reserve_lock);
+ spin_lock_init(&EXT3_SB(sb)->s_md_lock);
+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction);
+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction);
+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction);
+ set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
-+ printk("EXT3-fs: mballoc enabled\n");
++
++#ifdef MBALLOC_STATS
++ spin_lock_init(&EXT3_SB(sb)->s_bal_lock);
++#define MBALLOC_INFO " (stats)"
++#else
++#define MBALLOC_INFO ""
++#endif
++ printk("EXT3-fs: mballoc enabled%s\n", MBALLOC_INFO);
+ return 0;
+}
+
+ mb_debug("gonna free %u blocks in group %u (0x%p):",
+ md->num, md->group, md);
+
-+ err = ext3_mb_load_desc(sb, md->group, &e3b);
++ err = ext3_mb_load_buddy(sb, md->group, &e3b);
+ BUG_ON(err != 0);
+
+ /* there are blocks to put in buddy to make them really free */
+}
+
+void ext3_mb_free_blocks(handle_t *handle, struct inode *inode,
-+ unsigned long block, unsigned long count, int metadata)
++ unsigned long block, unsigned long count,
++ int metadata, int *freed)
+{
+ struct buffer_head *bitmap_bh = NULL;
+ struct ext3_group_desc *gdp;
+ struct ext3_buddy e3b;
+ int err = 0, ret;
+
++ *freed = 0;
+ sb = inode->i_sb;
+ if (!sb) {
+ printk ("ext3_free_blocks: nonexistent device");
+ if (err)
+ goto error_return;
+
-+ err = ext3_mb_load_desc(sb, block_group, &e3b);
++ err = ext3_mb_load_buddy(sb, block_group, &e3b);
+ if (err)
+ goto error_return;
+
+ } else {
+ ext3_lock_group(sb, block_group);
+ mb_free_blocks(&e3b, bit, count);
-+ gdp->bg_free_blocks_count =
-+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
+ ext3_unlock_group(sb, block_group);
-+ percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+ }
++ spin_lock(sb_bgl_lock(sbi, block_group));
++ gdp->bg_free_blocks_count =
++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
++ spin_unlock(sb_bgl_lock(sbi, block_group));
+
+ ext3_mb_dirty_buddy(&e3b);
+ ext3_mb_release_desc(&e3b);
+
-+ /* FIXME: undo logic will be implemented later and another way */
+ mb_clear_bits(bitmap_bh->b_data, bit, count);
-+ DQUOT_FREE_BLOCK(inode, count);
++ *freed = count;
+
+ /* We dirtied the bitmap block */
+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+}
+
+int ext3_new_block(handle_t *handle, struct inode *inode,
-+ unsigned long goal, int *errp)
++ unsigned long goal, int *errp)
+{
+ int ret, len;
+
+}
+
+
++extern void ext3_free_blocks_old(handle_t *, struct inode *,
++ unsigned long, unsigned long);
+void ext3_free_blocks(handle_t *handle, struct inode * inode,
+ unsigned long block, unsigned long count, int metadata)
+{
++ int freed;
++
+ if (!test_opt(inode->i_sb, MBALLOC))
+ ext3_free_blocks_old(handle, inode, block, count);
-+ else
-+ ext3_mb_free_blocks(handle, inode, block, count, metadata);
++ else {
++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
++ if (freed)
++ DQUOT_FREE_BLOCK(inode, freed);
++ }
+ return;
+}
++
Index: linux-2.6.5-sles9/fs/ext3/super.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/super.c 2004-11-09 02:23:21.597220752 +0300
-+++ linux-2.6.5-sles9/fs/ext3/super.c 2004-11-09 02:26:12.572228600 +0300
+--- linux-2.6.5-sles9.orig/fs/ext3/super.c 2005-02-23 01:47:15.291333736 +0300
++++ linux-2.6.5-sles9/fs/ext3/super.c 2005-02-23 01:48:54.515249408 +0300
@@ -389,6 +389,7 @@
struct ext3_super_block *es = sbi->s_es;
int i;
ext3_ext_release(sb);
ext3_xattr_put_super(sb);
journal_destroy(sbi->s_journal);
-@@ -542,7 +543,7 @@
+@@ -540,6 +541,7 @@
Opt_commit, Opt_journal_update, Opt_journal_inum,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
-- Opt_err, Opt_extents, Opt_extdebug
-+ Opt_err, Opt_extents, Opt_extdebug, Opt_mballoc,
++ Opt_mballoc, Opt_mbfactor,
+ Opt_err, Opt_extents, Opt_extdebug
};
- static match_table_t tokens = {
-@@ -589,6 +590,7 @@
+@@ -587,6 +589,8 @@
{Opt_iopen_nopriv, "iopen_nopriv"},
{Opt_extents, "extents"},
{Opt_extdebug, "extdebug"},
+ {Opt_mballoc, "mballoc"},
++ {Opt_mbfactor, "mbfactor=%u"},
{Opt_err, NULL}
};
-@@ -810,6 +812,9 @@
+@@ -808,6 +812,16 @@
case Opt_extdebug:
set_opt (sbi->s_mount_opt, EXTDEBUG);
break;
+ case Opt_mballoc:
+ set_opt (sbi->s_mount_opt, MBALLOC);
+ break;
++ case Opt_mbfactor:
++ if (match_int(&args[0], &option))
++ return 0;
++ if (option < 0)
++ return 0;
++ sbi->s_mb_factor = option;
++ break;
default:
printk (KERN_ERR
"EXT3-fs: Unrecognized mount option \"%s\" "
-@@ -1463,7 +1468,8 @@
+@@ -1461,7 +1475,8 @@
ext3_count_dirs(sb));
ext3_ext_init(sb);
-
-+ ext3_mb_init(sb);
++ ext3_mb_init(sb, needs_recovery);
+
return 0;
failed_mount3:
Index: linux-2.6.5-sles9/fs/ext3/Makefile
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2004-11-09 02:23:21.593221360 +0300
-+++ linux-2.6.5-sles9/fs/ext3/Makefile 2004-11-09 02:26:12.572228600 +0300
+--- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2005-02-23 01:02:37.405434272 +0300
++++ linux-2.6.5-sles9/fs/ext3/Makefile 2005-02-23 01:48:54.517249104 +0300
@@ -5,7 +5,7 @@
obj-$(CONFIG_EXT3_FS) += ext3.o
Index: linux-2.6.5-sles9/fs/ext3/balloc.c
===================================================================
--- linux-2.6.5-sles9.orig/fs/ext3/balloc.c 2004-11-03 08:36:51.000000000 +0300
-+++ linux-2.6.5-sles9/fs/ext3/balloc.c 2004-11-09 02:26:53.078070776 +0300
++++ linux-2.6.5-sles9/fs/ext3/balloc.c 2005-02-23 01:48:54.520248648 +0300
@@ -78,7 +78,7 @@
*
* Return buffer_head on success or NULL in case of failure.
struct buffer_head *bitmap_bh = NULL;
Index: linux-2.6.5-sles9/fs/ext3/namei.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/namei.c 2004-11-09 02:18:27.616912552 +0300
-+++ linux-2.6.5-sles9/fs/ext3/namei.c 2004-11-09 02:26:12.580227384 +0300
+--- linux-2.6.5-sles9.orig/fs/ext3/namei.c 2005-02-23 01:01:46.551165296 +0300
++++ linux-2.6.5-sles9/fs/ext3/namei.c 2005-02-23 01:48:54.523248192 +0300
@@ -1640,7 +1640,7 @@
* If the create succeeds, we fill in the inode information
* with d_instantiate().
handle_t *handle;
Index: linux-2.6.5-sles9/fs/ext3/inode.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2004-11-09 02:23:21.592221512 +0300
-+++ linux-2.6.5-sles9/fs/ext3/inode.c 2004-11-09 02:26:12.587226320 +0300
+--- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2005-02-23 01:02:37.404434424 +0300
++++ linux-2.6.5-sles9/fs/ext3/inode.c 2005-02-23 01:48:54.529247280 +0300
@@ -572,7 +572,7 @@
ext3_journal_forget(handle, branch[i].bh);
}
/*
Index: linux-2.6.5-sles9/fs/ext3/extents.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2004-11-09 02:25:56.143726112 +0300
-+++ linux-2.6.5-sles9/fs/ext3/extents.c 2004-11-09 02:26:12.591225712 +0300
-@@ -740,7 +740,7 @@
+--- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2005-02-23 01:02:37.396435640 +0300
++++ linux-2.6.5-sles9/fs/ext3/extents.c 2005-02-23 01:48:54.533246672 +0300
+@@ -774,7 +774,7 @@
for (i = 0; i < depth; i++) {
if (!ablocks[i])
continue;
}
}
kfree(ablocks);
-@@ -1391,7 +1391,7 @@
+@@ -1431,7 +1431,7 @@
path->p_idx->ei_leaf);
bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
return err;
}
-@@ -1879,10 +1879,12 @@
+@@ -1919,10 +1919,12 @@
int needed = ext3_remove_blocks_credits(tree, ex, from, to);
handle_t *handle = ext3_journal_start(tree->inode, needed);
struct buffer_head *bh;
if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
/* tail removal */
unsigned long num, start;
-@@ -1894,7 +1896,7 @@
+@@ -1934,7 +1936,7 @@
bh = sb_find_get_block(tree->inode->i_sb, start + i);
ext3_forget(handle, 0, tree->inode, bh, start + i);
}
from, to, ex->ee_block, ex->ee_len);
Index: linux-2.6.5-sles9/fs/ext3/xattr.c
===================================================================
---- linux-2.6.5-sles9.orig/fs/ext3/xattr.c 2004-11-09 02:22:55.777146000 +0300
-+++ linux-2.6.5-sles9/fs/ext3/xattr.c 2004-11-09 02:26:12.593225408 +0300
+--- linux-2.6.5-sles9.orig/fs/ext3/xattr.c 2005-02-23 01:01:52.387278072 +0300
++++ linux-2.6.5-sles9/fs/ext3/xattr.c 2005-02-23 01:48:54.537246064 +0300
@@ -1366,7 +1366,7 @@
new_bh = sb_getblk(sb, block);
if (!new_bh) {
} else {
Index: linux-2.6.5-sles9/include/linux/ext3_fs.h
===================================================================
---- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2004-11-09 02:25:17.238640584 +0300
-+++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2004-11-09 02:26:12.596224952 +0300
-@@ -57,6 +57,8 @@
+--- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2005-02-23 01:02:37.414432904 +0300
++++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2005-02-23 01:48:54.539245760 +0300
+@@ -57,6 +57,14 @@
#define ext3_debug(f, a...) do {} while (0)
#endif
+#define EXT3_MULTIBLOCK_ALLOCATOR 1
+
++#define EXT3_MB_HINT_MERGE 1
++#define EXT3_MB_HINT_RESERVED 2
++#define EXT3_MB_HINT_METADATA 4
++#define EXT3_MB_HINT_FIRST 8
++#define EXT3_MB_HINT_BEST 16
++
/*
* Special inodes numbers
*/
-@@ -339,6 +341,7 @@
+@@ -339,6 +347,7 @@
#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */
#define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */
#define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */
-+#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */
++#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */
/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
#ifndef clear_opt
-@@ -698,7 +701,7 @@
+@@ -698,7 +707,7 @@
extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
extern unsigned long ext3_count_free_blocks (struct super_block *);
extern void ext3_check_blocks_bitmap (struct super_block *);
extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
-@@ -743,6 +746,13 @@
- extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
- unsigned long);
+@@ -820,6 +829,37 @@
+ extern int ext3_ext_ioctl(struct inode *inode, struct file *filp,
+ unsigned int cmd, unsigned long arg);
+/* mballoc.c */
-+extern int ext3_mb_init(struct super_block *sb);
-+extern int ext3_mb_new_blocks(handle_t *handle, struct inode *inode,
-+ unsigned long goal,int *len, int flags,int *errp);
-+extern int ext3_mb_release(struct super_block *sb);
++extern int ext3_mb_init(struct super_block *, int);
++extern int ext3_mb_release(struct super_block *);
++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *);
++extern int ext3_mb_reserve_blocks(struct super_block *, int);
+extern void ext3_mb_release_blocks(struct super_block *, int);
+
- /* namei.c */
- extern int ext3_orphan_add(handle_t *, struct inode *);
- extern int ext3_orphan_del(handle_t *, struct inode *);
++/* writeback.c */
++extern int ext3_wb_writepages(struct address_space *, struct writeback_control *);
++extern int ext3_wb_prepare_write(struct file *file, struct page *page,
++ unsigned from, unsigned to);
++extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned);
++extern int ext3_wb_writepage(struct page *, struct writeback_control *);
++extern int ext3_wb_invalidatepage(struct page *, unsigned long);
++extern int ext3_wb_releasepage(struct page *, int);
++extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
++extern void ext3_wb_init(struct super_block *);
++extern void ext3_wb_release(struct super_block *);
++
++/* writeback.c */
++extern int ext3_wb_writepages(struct address_space *, struct writeback_control *);
++extern int ext3_wb_prepare_write(struct file *file, struct page *page,
++ unsigned from, unsigned to);
++extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned);
++extern int ext3_wb_writepage(struct page *, struct writeback_control *);
++extern int ext3_wb_invalidatepage(struct page *, unsigned long);
++extern int ext3_wb_releasepage(struct page *, int);
++extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
++extern void ext3_wb_init(struct super_block *);
++extern void ext3_wb_release(struct super_block *);
++
+ #endif /* __KERNEL__ */
+
+ #define EXT3_IOC_CREATE_INUM _IOW('f', 5, long)
Index: linux-2.6.5-sles9/include/linux/ext3_fs_sb.h
===================================================================
---- linux-2.6.5-sles9.orig/include/linux/ext3_fs_sb.h 2004-11-09 02:20:51.598024096 +0300
-+++ linux-2.6.5-sles9/include/linux/ext3_fs_sb.h 2004-11-09 02:28:18.753046200 +0300
+--- linux-2.6.5-sles9.orig/include/linux/ext3_fs_sb.h 2005-02-23 01:01:48.242908112 +0300
++++ linux-2.6.5-sles9/include/linux/ext3_fs_sb.h 2005-02-23 01:48:54.541245456 +0300
@@ -23,10 +23,30 @@
#define EXT_INCLUDE
#include <linux/blockgroup_lock.h>
+ struct list_head list;
+};
+
-+#define EXT3_BB_MAX_ORDER 14
-+
+struct ext3_buddy_group_blocks {
-+ sector_t bb_bitmap;
-+ sector_t bb_buddy;
++ __u32 bb_bitmap;
++ __u32 bb_buddy;
+ spinlock_t bb_lock;
-+ unsigned bb_counters[EXT3_BB_MAX_ORDER];
++ unsigned long bb_tid;
+ struct ext3_free_metadata *bb_md_cur;
-+ unsigned long bb_tid;
++ unsigned short bb_first_free;
++ unsigned short bb_free;
++ unsigned bb_counters[];
+};
+
/*
* third extended-fs super-block data in memory
*/
-@@ -78,6 +98,17 @@
+@@ -78,6 +98,27 @@
struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
#endif
+ struct list_head s_committed_transaction;
+ spinlock_t s_md_lock;
+ tid_t s_last_transaction;
++ int s_mb_factor;
++
++ /* stats for buddy allocator */
++ spinlock_t s_bal_lock;
++ unsigned long s_bal_reqs; /* number of reqs with len > 1 */
++ unsigned long s_bal_success; /* we found long enough chunks */
++ unsigned long s_bal_allocated; /* in blocks */
++ unsigned long s_bal_ex_scanned; /* total extents scanned */
++ unsigned long s_bal_goals; /* goal hits */
++ unsigned long s_bal_breaks; /* too long searches */
};
#endif /* _LINUX_EXT3_FS_SB */
static int ext3_ext_new_extent_cb(struct ext3_extents_tree *tree,
struct ext3_ext_path *path,
- struct ext3_extent *newex, int exist)
+ struct ext3_ext_cache *cex)
{
struct inode *inode = tree->inode;
struct bpointers *bp = tree->private;
+ struct ext3_extent nex;
int count, err, goal;
unsigned long pblock;
unsigned long tgen;
EXT_ASSERT(i == path->p_depth);
EXT_ASSERT(path[i].p_hdr);
- if (exist) {
+ if (cex->ec_type == EXT3_EXT_CACHE_EXTENT) {
err = EXT_CONTINUE;
goto map;
}
if (bp->create == 0) {
i = 0;
- if (newex->ee_block < bp->start)
- i = bp->start - newex->ee_block;
- if (i >= newex->ee_len)
+ if (cex->ec_block < bp->start)
+ i = bp->start - cex->ec_block;
+ if (i >= cex->ec_len)
CERROR("nothing to do?! i = %d, e_num = %u\n",
- i, newex->ee_len);
- for (; i < newex->ee_len && bp->num; i++) {
+ i, cex->ec_len);
+ for (; i < cex->ec_len && bp->num; i++) {
*(bp->created) = 0;
bp->created++;
*(bp->blocks) = 0;
return PTR_ERR(handle);
}
+ ext3_down_truncate_sem(inode);
if (tgen != EXT_GENERATION(tree)) {
/* the tree has changed. so path can be invalid at moment */
lock_24kernel();
journal_stop(handle);
unlock_24kernel();
- ext3_down_truncate_sem(inode);
return EXT_REPEAT;
}
- ext3_down_truncate_sem(inode);
- count = newex->ee_len;
- goal = ext3_ext_find_goal(inode, path, newex->ee_block, &aflags);
+ count = cex->ec_len;
+ goal = ext3_ext_find_goal(inode, path, cex->ec_block, &aflags);
aflags |= 2; /* block have been already reserved */
pblock = ext3_mb_new_blocks(handle, inode, goal, &count, aflags, &err);
if (!pblock)
goto out;
- EXT_ASSERT(count <= newex->ee_len);
+ EXT_ASSERT(count <= cex->ec_len);
/* insert new extent */
- newex->ee_start = pblock;
- newex->ee_len = count;
- err = ext3_ext_insert_extent(handle, tree, path, newex);
+ nex.ee_block = cex->ec_block;
+ nex.ee_start = pblock;
+ nex.ee_len = count;
+ err = ext3_ext_insert_extent(handle, tree, path, &nex);
if (err)
goto out;
+ /*
+ * Putting len of the actual extent we just inserted,
+ * we are asking ext3_ext_walk_space() to continue
+ * scanning after that block
+ */
+ cex->ec_len = nex.ee_len;
+ cex->ec_start = nex.ee_start;
+ BUG_ON(nex.ee_len == 0);
+ BUG_ON(nex.ee_block != cex->ec_block);
+
/* correct on-disk inode size */
- if (newex->ee_len > 0) {
- new_i_size = (loff_t) newex->ee_block + newex->ee_len;
+ if (nex.ee_len > 0) {
+ new_i_size = (loff_t) nex.ee_block + nex.ee_len;
new_i_size = new_i_size << inode->i_blkbits;
if (new_i_size > EXT3_I(inode)->i_disksize) {
EXT3_I(inode)->i_disksize = new_i_size;
CERROR("initial space: %lu:%u\n",
bp->start, bp->init_num);
CERROR("current extent: %u/%u/%u %d\n",
- newex->ee_block, newex->ee_len,
- newex->ee_start, exist);
+ cex->ec_block, cex->ec_len,
+ cex->ec_start, cex->ec_type);
}
i = 0;
- if (newex->ee_block < bp->start)
- i = bp->start - newex->ee_block;
- if (i >= newex->ee_len)
+ if (cex->ec_block < bp->start)
+ i = bp->start - cex->ec_block;
+ if (i >= cex->ec_len)
CERROR("nothing to do?! i = %d, e_num = %u\n",
- i, newex->ee_len);
- for (; i < newex->ee_len && bp->num; i++) {
- *(bp->created) = (exist == 0 ? 1 : 0);
+ i, cex->ec_len);
+ for (; i < cex->ec_len && bp->num; i++) {
+ if (cex->ec_type == EXT3_EXT_CACHE_EXTENT)
+ *(bp->created) = 0;
+ else
+ *(bp->created) = 1;
bp->created++;
- *(bp->blocks) = newex->ee_start + i;
+ *(bp->blocks) = cex->ec_start + i;
bp->blocks++;
bp->num--;
bp->start++;