From ec4c3830a24ec2d158670bd8b82177637b368b6a Mon Sep 17 00:00:00 2001 From: phil Date: Sat, 26 Feb 2005 03:04:22 +0000 Subject: [PATCH] b=5733,5638 Alex's patches to fix small bugs in extents and mballoc: 1) extents-related fixes: 1) callback's API used in ext3_ext_walk_space() changes a bit to reflect that callback can be given >2^16 extent len (hole) 2) fsfilt_ext3 has changed to use updated callback API 3) minor race in ext3_ext_new_extent_cb() fixed 2) mballoc-related fixes: 1) free space searching has changed to be more smart 2) three possible races have been fixed 3) lots of minor fixes 4) mballoc doesn't regenerate buddies in clean umount case --- .../patches/ext3-extents-2.6.5.patch | 188 ++- .../patches/ext3-mballoc2-2.6-suse.patch | 1211 +++++++++++++------- lustre/ChangeLog | 2 + .../patches/ext3-extents-2.6.5.patch | 188 ++- .../patches/ext3-mballoc2-2.6-suse.patch | 1211 +++++++++++++------- lustre/lvfs/fsfilt_ext3.c | 66 +- 6 files changed, 1944 insertions(+), 922 deletions(-) diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch index b9a01d7..671fbc0 100644 --- a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch +++ b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch @@ -1,9 +1,9 @@ %patch Index: linux-2.6.5-sles9/fs/ext3/extents.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2003-01-30 13:24:37.000000000 +0300 -+++ linux-2.6.5-sles9/fs/ext3/extents.c 2004-11-09 02:25:56.143726112 +0300 -@@ -0,0 +1,2313 @@ +--- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2005-02-17 22:07:57.023609040 +0300 ++++ linux-2.6.5-sles9/fs/ext3/extents.c 2005-02-23 01:02:37.396435640 +0300 +@@ -0,0 +1,2356 @@ +/* + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -49,6 +49,27 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c +#include +#include + ++ ++static inline int 
ext3_ext_check_header(struct ext3_extent_header *eh) ++{ ++ if (eh->eh_magic != EXT3_EXT_MAGIC) { ++ printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n", ++ (unsigned) eh->eh_magic); ++ return -EIO; ++ } ++ if (eh->eh_max == 0) { ++ printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n", ++ (unsigned) eh->eh_max); ++ return -EIO; ++ } ++ if (eh->eh_entries > eh->eh_max) { ++ printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n", ++ (unsigned) eh->eh_entries); ++ return -EIO; ++ } ++ return 0; ++} ++ +static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) +{ + int err; @@ -430,10 +451,12 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + + eh = EXT_ROOT_HDR(tree); + EXT_ASSERT(eh); ++ if (ext3_ext_check_header(eh)) ++ goto err; ++ + i = depth = EXT_DEPTH(tree); + EXT_ASSERT(eh->eh_max); + EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ EXT_ASSERT(i == 0 || eh->eh_entries > 0); + + /* account possible depth increase */ + if (!path) { @@ -455,22 +478,27 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + path[ppos].p_ext = NULL; + + bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); -+ if (!bh) { -+ ext3_ext_drop_refs(path); -+ kfree(path); -+ return ERR_PTR(-EIO); -+ } ++ if (!bh) ++ goto err; ++ + eh = EXT_BLOCK_HDR(bh); + ppos++; + EXT_ASSERT(ppos <= depth); + path[ppos].p_bh = bh; + path[ppos].p_hdr = eh; + i--; ++ ++ if (ext3_ext_check_header(eh)) ++ goto err; + } + + path[ppos].p_depth = i; + path[ppos].p_hdr = eh; + path[ppos].p_ext = NULL; ++ path[ppos].p_idx = NULL; ++ ++ if (ext3_ext_check_header(eh)) ++ goto err; + + /* find extent */ + ext3_ext_binsearch(tree, path + ppos, block); @@ -478,6 +506,12 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + ext3_ext_show_path(tree, path); + + return path; ++ ++err: ++ printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ return ERR_PTR(-EIO); +} + +/* @@ -1047,7 +1081,6 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + int depth, len, err, next; + + 
EXT_ASSERT(newext->ee_len > 0); -+ EXT_ASSERT(newext->ee_len < EXT_CACHE_MARK); + depth = EXT_DEPTH(tree); + ex = path[depth].p_ext; + EXT_ASSERT(path[depth].p_hdr); @@ -1187,7 +1220,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + unsigned long num, ext_prepare_callback func) +{ + struct ext3_ext_path *path = NULL; -+ struct ext3_extent *ex, cbex; ++ struct ext3_ext_cache cbex; ++ struct ext3_extent *ex; + unsigned long next, start = 0, end = 0; + unsigned long last = block + num; + int depth, exists, err = 0; @@ -1246,14 +1280,20 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + EXT_ASSERT(end > start); + + if (!exists) { -+ cbex.ee_block = start; -+ cbex.ee_len = end - start; -+ cbex.ee_start = 0; -+ } else -+ cbex = *ex; ++ cbex.ec_block = start; ++ cbex.ec_len = end - start; ++ cbex.ec_start = 0; ++ cbex.ec_type = EXT3_EXT_CACHE_GAP; ++ } else { ++ cbex.ec_block = ex->ee_block; ++ cbex.ec_len = ex->ee_len; ++ cbex.ec_start = ex->ee_start; ++ cbex.ec_type = EXT3_EXT_CACHE_EXTENT; ++ } + ++ EXT_ASSERT(cbex.ec_len > 0); + EXT_ASSERT(path[depth].p_hdr); -+ err = func(tree, path, &cbex, exists); ++ err = func(tree, path, &cbex); + ext3_ext_drop_refs(path); + + if (err < 0) @@ -1271,7 +1311,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + path = NULL; + } + -+ block = cbex.ee_block + cbex.ee_len; ++ block = cbex.ec_block + cbex.ec_len; + } + + if (path) { @@ -1987,7 +2027,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + tree->root = (void *) EXT3_I(inode)->i_data; + tree->buffer = (void *) inode; + tree->buffer_len = sizeof(EXT3_I(inode)->i_data); -+ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent; ++ tree->cex = &EXT3_I(inode)->i_cached_extent; + tree->ops = &ext3_blockmap_helpers; +} + @@ -2001,7 +2041,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + int goal, newblock, err = 0, depth; + struct ext3_extents_tree tree; + -+ clear_buffer_new(bh_result); ++ __clear_bit(BH_New, &bh_result->b_state); + ext3_init_tree_desc(&tree, inode); + 
ext_debug(&tree, "block %d requested for inode %u\n", + (int) iblock, (unsigned) inode->i_ino); @@ -2087,13 +2127,15 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + + /* previous routine could use block we allocated */ + newblock = newex.ee_start; -+ set_buffer_new(bh_result); ++ __set_bit(BH_New, &bh_result->b_state); + + ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, + newex.ee_start, EXT3_EXT_CACHE_EXTENT); +out: + ext3_ext_show_leaf(&tree, path); -+ map_bh(bh_result, inode->i_sb, newblock); ++ __set_bit(BH_Mapped, &bh_result->b_state); ++ bh_result->b_bdev = inode->i_sb->s_bdev; ++ bh_result->b_blocknr = newblock; +out2: + if (path) { + ext3_ext_drop_refs(path); @@ -2218,12 +2260,13 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c +static int +ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, + struct ext3_ext_path *path, -+ struct ext3_extent *newex, int exist) ++ struct ext3_ext_cache *newex) +{ + struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; + -+ if (!exist) ++ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT) + return EXT_CONTINUE; ++ + if (buf->err < 0) + return EXT_BREAK; + if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) @@ -2242,13 +2285,13 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c +static int +ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, + struct ext3_ext_path *path, -+ struct ext3_extent *ex, int exist) ++ struct ext3_ext_cache *ex) +{ + struct ext3_extent_tree_stats *buf = + (struct ext3_extent_tree_stats *) tree->private; + int depth; + -+ if (!exist) ++ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT) + return EXT_CONTINUE; + + depth = EXT_DEPTH(tree); @@ -2259,7 +2302,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c +} + +int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, -+ unsigned long arg) ++ unsigned long arg) +{ + int err = 0; + @@ -2319,8 +2362,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + Index: linux-2.6.5-sles9/fs/ext3/ialloc.c 
=================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/ialloc.c 2004-11-09 02:22:55.763148128 +0300 -+++ linux-2.6.5-sles9/fs/ext3/ialloc.c 2004-11-09 02:23:21.587222272 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/ialloc.c 2005-02-23 01:01:52.366281264 +0300 ++++ linux-2.6.5-sles9/fs/ext3/ialloc.c 2005-02-23 01:02:37.398435336 +0300 @@ -647,6 +647,10 @@ DQUOT_FREE_INODE(inode); goto fail2; @@ -2334,8 +2377,8 @@ Index: linux-2.6.5-sles9/fs/ext3/ialloc.c ext3_std_error(sb, err); Index: linux-2.6.5-sles9/fs/ext3/inode.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2004-11-09 02:22:55.767147520 +0300 -+++ linux-2.6.5-sles9/fs/ext3/inode.c 2004-11-09 02:23:21.592221512 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2005-02-23 01:01:52.373280200 +0300 ++++ linux-2.6.5-sles9/fs/ext3/inode.c 2005-02-23 01:02:37.404434424 +0300 @@ -796,6 +796,17 @@ goto reread; } @@ -2416,8 +2459,8 @@ Index: linux-2.6.5-sles9/fs/ext3/inode.c else Index: linux-2.6.5-sles9/fs/ext3/Makefile =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2004-11-09 02:18:27.604914376 +0300 -+++ linux-2.6.5-sles9/fs/ext3/Makefile 2004-11-09 02:23:21.593221360 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2005-02-23 01:01:46.501172896 +0300 ++++ linux-2.6.5-sles9/fs/ext3/Makefile 2005-02-23 01:02:37.405434272 +0300 @@ -5,7 +5,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o @@ -2429,8 +2472,8 @@ Index: linux-2.6.5-sles9/fs/ext3/Makefile ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o Index: linux-2.6.5-sles9/fs/ext3/super.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/super.c 2004-11-09 02:22:56.450043704 +0300 -+++ linux-2.6.5-sles9/fs/ext3/super.c 2004-11-09 02:23:21.597220752 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/super.c 2005-02-23 01:02:34.072940888 +0300 ++++ 
linux-2.6.5-sles9/fs/ext3/super.c 2005-02-23 01:47:15.291333736 +0300 @@ -389,6 +389,7 @@ struct ext3_super_block *es = sbi->s_es; int i; @@ -2439,18 +2482,16 @@ Index: linux-2.6.5-sles9/fs/ext3/super.c ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); if (!(sb->s_flags & MS_RDONLY)) { -@@ -447,6 +448,10 @@ +@@ -447,6 +448,8 @@ #endif ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; ei->vfs_inode.i_version = 1; -+ ei->i_cached_extent[0] = 0; -+ ei->i_cached_extent[1] = 0; -+ ei->i_cached_extent[2] = 0; -+ ei->i_cached_extent[3] = 0; ++ ++ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); return &ei->vfs_inode; } -@@ -537,7 +542,7 @@ +@@ -537,7 +540,7 @@ Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, @@ -2459,7 +2500,7 @@ Index: linux-2.6.5-sles9/fs/ext3/super.c }; static match_table_t tokens = { -@@ -582,6 +587,8 @@ +@@ -582,6 +585,8 @@ {Opt_iopen, "iopen"}, {Opt_noiopen, "noiopen"}, {Opt_iopen_nopriv, "iopen_nopriv"}, @@ -2468,7 +2509,7 @@ Index: linux-2.6.5-sles9/fs/ext3/super.c {Opt_err, NULL} }; -@@ -797,6 +804,12 @@ +@@ -797,6 +802,12 @@ break; case Opt_ignore: break; @@ -2481,7 +2522,7 @@ Index: linux-2.6.5-sles9/fs/ext3/super.c default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1449,6 +1462,8 @@ +@@ -1449,6 +1460,8 @@ percpu_counter_mod(&sbi->s_dirs_counter, ext3_count_dirs(sb)); @@ -2492,8 +2533,8 @@ Index: linux-2.6.5-sles9/fs/ext3/super.c failed_mount3: Index: linux-2.6.5-sles9/fs/ext3/ioctl.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/ioctl.c 2004-11-09 02:15:44.610693264 +0300 -+++ linux-2.6.5-sles9/fs/ext3/ioctl.c 2004-11-09 02:23:52.991448104 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/ioctl.c 2005-02-23 01:01:42.887722224 +0300 ++++ linux-2.6.5-sles9/fs/ext3/ioctl.c 2005-02-23 01:02:37.412433208 
+0300 @@ -124,6 +124,10 @@ err = ext3_change_inode_journal_flag(inode, jflag); return err; @@ -2507,8 +2548,8 @@ Index: linux-2.6.5-sles9/fs/ext3/ioctl.c return put_user(inode->i_generation, (int *) arg); Index: linux-2.6.5-sles9/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2004-11-09 02:22:58.767691368 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2004-11-09 02:25:17.238640584 +0300 +--- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2005-02-23 01:02:35.823674736 +0300 ++++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2005-02-23 01:02:37.414432904 +0300 @@ -186,6 +186,7 @@ #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ @@ -2563,9 +2604,9 @@ Index: linux-2.6.5-sles9/include/linux/ext3_fs.h Index: linux-2.6.5-sles9/include/linux/ext3_extents.h =================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_extents.h 2003-01-30 13:24:37.000000000 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_extents.h 2004-11-09 02:23:21.606219384 +0300 -@@ -0,0 +1,252 @@ +--- linux-2.6.5-sles9.orig/include/linux/ext3_extents.h 2005-02-17 22:07:57.023609040 +0300 ++++ linux-2.6.5-sles9/include/linux/ext3_extents.h 2005-02-23 01:02:37.416432600 +0300 +@@ -0,0 +1,265 @@ +/* + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -2738,7 +2779,7 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h + */ +typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, + struct ext3_ext_path *, -+ struct ext3_extent *, int); ++ struct ext3_ext_cache *); + +#define EXT_CONTINUE 0 +#define EXT_BREAK 1 @@ -2746,7 +2787,6 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h + + +#define EXT_MAX_BLOCK 0xffffffff -+#define EXT_CACHE_MARK 0xffff + + +#define EXT_FIRST_EXTENT(__hdr__) \ @@ -2778,6 
+2818,20 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h + +#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); + ++#define EXT_CHECK_PATH(tree,path) \ ++{ \ ++ int depth = EXT_DEPTH(tree); \ ++ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) (path)[depth].p_idx < \ ++ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_ext < \ ++ __PAGE_OFFSET && (path)[depth].p_ext != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET \ ++ && depth != 0); \ ++ BUG_ON((path)[0].p_depth != depth); \ ++} ++ + +/* + * this structure is used to gather extents from the tree via ioctl @@ -2820,27 +2874,35 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h + Index: linux-2.6.5-sles9/include/linux/ext3_fs_i.h =================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_fs_i.h 2004-11-09 02:22:55.780145544 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_fs_i.h 2004-11-09 02:23:21.606219384 +0300 -@@ -128,6 +128,8 @@ +--- linux-2.6.5-sles9.orig/include/linux/ext3_fs_i.h 2005-02-23 01:01:52.425272296 +0300 ++++ linux-2.6.5-sles9/include/linux/ext3_fs_i.h 2005-02-23 01:45:55.611446920 +0300 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + struct reserve_window { + __u32 _rsv_start; /* First byte reserved */ +@@ -128,6 +129,8 @@ */ struct semaphore truncate_sem; struct inode vfs_inode; + -+ __u32 i_cached_extent[4]; ++ struct ext3_ext_cache i_cached_extent; }; #endif /* _LINUX_EXT3_FS_I */ %diffstat fs/ext3/Makefile | 2 - fs/ext3/extents.c | 2313 +++++++++++++++++++++++++++++++++++++++++++ + fs/ext3/extents.c | 2356 +++++++++++++++++++++++++++++++++++++++++++ fs/ext3/ialloc.c | 4 fs/ext3/inode.c | 29 fs/ext3/ioctl.c | 4 - fs/ext3/super.c | 17 - include/linux/ext3_extents.h | 252 ++++ - include/linux/ext3_fs.h | 15 - include/linux/ext3_fs_i.h | 2 - 9 files changed, 
2630 insertions(+), 8 deletions(-) + fs/ext3/super.c | 15 + include/linux/ext3_extents.h | 265 ++++ + include/linux/ext3_fs.h | 17 + include/linux/ext3_fs_i.h | 3 + 9 files changed, 2687 insertions(+), 8 deletions(-) diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch index 363007f..d0ffc5c 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch @@ -1,8 +1,8 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/mballoc.c 2003-01-30 13:24:37.000000000 +0300 -+++ linux-2.6.5-sles9/fs/ext3/mballoc.c 2004-11-09 02:34:25.181340632 +0300 -@@ -0,0 +1,1441 @@ +--- linux-2.6.5-sles9.orig/fs/ext3/mballoc.c 2005-02-17 22:07:57.023609040 +0300 ++++ linux-2.6.5-sles9/fs/ext3/mballoc.c 2005-02-23 01:56:19.101662000 +0300 +@@ -0,0 +1,1835 @@ +/* + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -39,19 +39,29 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + +/* + * TODO: -+ * - do not scan from the beginning, try to remember first free block -+ * - mb_mark_used_* may allocate chunk right after splitting buddy ++ * - track min/max extents in each group for better group selection ++ * - is it worthwhile to use buddies directly if req is 2^N blocks? ++ * - mb_mark_used() may allocate chunk right after splitting buddy + * - special flag to advice allocator to look for requested + N blocks + * this may improve interaction between extents and mballoc ++ * - tree of groups sorted by number of free blocks ++ * - percpu reservation code (hotpath) ++ * - error handling + */ + +/* + * with AGRESSIVE_CHECK allocator runs consistency checks over -+ * structures. this checks slow things down a lot ++ * structures. 
these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* ++ * with MBALLOC_STATS allocator will collect stats that will be ++ * shown at umount. The collecting costs though! ++ */ ++#define MBALLOC_STATS ++ ++/* + */ +#define MB_DEBUG__ +#ifdef MB_DEBUG @@ -66,60 +76,75 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c +#define EXT3_BUDDY_FILE ".buddy" + +/* -+ * max. number of chunks to be tracked in ext3_free_extent struct ++ * How long mballoc can look for a best extent (in found extents) ++ */ ++#define EXT3_MB_MAX_TO_SCAN 100 ++ ++/* ++ * This structure is on-disk description of a group for mballoc ++ */ ++struct ext3_mb_group_descr { ++ __u16 mgd_first_free; /* first free block in the group */ ++ __u16 mgd_free; /* number of free blocks in the group */ ++ __u16 mgd_counters[16]; /* number of free blocks by order */ ++}; ++ ++/* ++ * This structure is header of mballoc's file + */ -+#define MB_ARR_SIZE 32 ++struct ext3_mb_grp_header { ++ __u32 mh_magic; ++}; ++ ++#define EXT3_MB_MAGIC_V1 0xbaad16fc ++ ++ ++struct ext3_free_extent { ++ __u16 fe_start; ++ __u16 fe_len; ++ __u16 fe_group; ++}; + +struct ext3_allocation_context { + struct super_block *ac_sb; + + /* search goals */ -+ int ac_g_group; -+ int ac_g_start; -+ int ac_g_len; -+ int ac_g_flags; ++struct ext3_free_extent ac_g_ex; + + /* the best found extent */ -+ int ac_b_group; -+ int ac_b_start; -+ int ac_b_len; ++ struct ext3_free_extent ac_b_ex; + + /* number of iterations done. 
we have to track to limit searching */ -+ int ac_repeats; -+ int ac_groups_scanned; -+ int ac_status; ++ unsigned long ac_ex_scanned; ++ __u16 ac_groups_scanned; ++ __u16 ac_found; ++ __u8 ac_status; ++ __u8 ac_flags; /* allocation hints */ ++ __u8 ac_repeats; +}; + +#define AC_STATUS_CONTINUE 1 +#define AC_STATUS_FOUND 2 -+ ++#define AC_STATUS_BREAK 3 + +struct ext3_buddy { -+ void *bd_bitmap; -+ void *bd_buddy; -+ int bd_blkbits; + struct buffer_head *bd_bh; + struct buffer_head *bd_bh2; + struct ext3_buddy_group_blocks *bd_bd; + struct super_block *bd_sb; ++ __u16 bd_blkbits; ++ __u16 bd_group; +}; -+ -+struct ext3_free_extent { -+ int fe_start; -+ int fe_len; -+ unsigned char fe_orders[MB_ARR_SIZE]; -+ unsigned char fe_nums; -+ unsigned char fe_back; -+}; ++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bh->b_data) ++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_bh2->b_data) + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + -+ +int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); +struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); -+void ext3_free_blocks_old(handle_t *, struct inode *, unsigned long, unsigned long); +int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); +int ext3_mb_reserve_blocks(struct super_block *, int); ++void ext3_mb_release_blocks(struct super_block *, int); +void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); +void ext3_mb_free_committed_blocks(struct super_block *); + @@ -145,21 +170,33 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c +static inline void mb_set_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); ++ __set_bit(bit, addr); ++} ++ ++static inline void mb_set_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); + set_bit(bit, addr); +} + +static inline void mb_clear_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); ++ __clear_bit(bit, addr); ++} ++ ++static inline void 
mb_clear_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); + clear_bit(bit, addr); +} + +static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) +{ + int i = 1; -+ void *bb; ++ char *bb; + -+ J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy); ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); + J_ASSERT(max != NULL); + + if (order > e3b->bd_blkbits + 1) @@ -168,19 +205,21 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + /* at order 0 we see each particular block */ + *max = 1 << (e3b->bd_blkbits + 3); + if (order == 0) -+ return e3b->bd_bitmap; ++ return EXT3_MB_BITMAP(e3b); + -+ bb = e3b->bd_buddy; ++ bb = EXT3_MB_BUDDY(e3b); + *max = *max >> 1; + while (i < order) { + bb += 1 << (e3b->bd_blkbits - i); + i++; + *max = *max >> 1; + } ++ J_ASSERT((unsigned) (bb - (char *) EXT3_MB_BUDDY(e3b)) < ++ e3b->bd_sb->s_blocksize); + return bb; +} + -+static int ext3_mb_load_desc(struct super_block *sb, int group, ++static int ext3_mb_load_buddy(struct super_block *sb, int group, + struct ext3_buddy *e3b) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); @@ -191,7 +230,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + /* load bitmap */ + e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap); + if (e3b->bd_bh == NULL) { -+ ext3_error(sb, "ext3_mb_load_desc", ++ ext3_error(sb, "ext3_mb_load_buddy", + "can't get block for buddy bitmap\n"); + goto out; + } @@ -204,7 +243,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + /* load buddy */ + e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy); + if (e3b->bd_bh2 == NULL) { -+ ext3_error(sb, "ext3_mb_load_desc", ++ ext3_error(sb, "ext3_mb_load_buddy", + "can't get block for buddy bitmap\n"); + goto out; + } @@ -214,11 +253,10 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + } + J_ASSERT(buffer_uptodate(e3b->bd_bh2)); + -+ e3b->bd_bitmap = e3b->bd_bh->b_data; -+ e3b->bd_buddy = e3b->bd_bh2->b_data; + e3b->bd_blkbits = sb->s_blocksize_bits; + e3b->bd_bd = 
sbi->s_buddy_blocks[group]; + e3b->bd_sb = sb; ++ e3b->bd_group = group; + + return 0; +out: @@ -277,7 +315,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + + for (j = 0; j < (1 << order); j++) { + k = (i * (1 << order)) + j; -+ J_ASSERT(mb_test_bit(k, e3b->bd_bitmap)); ++ J_ASSERT(mb_test_bit(k, EXT3_MB_BITMAP(e3b))); + } + count++; + } @@ -319,10 +357,10 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + int order = 1; + void *bb; + -+ J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy); ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); + J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); + -+ bb = e3b->bd_buddy; ++ bb = EXT3_MB_BUDDY(e3b); + while (order <= e3b->bd_blkbits + 1) { + block = block >> 1; + if (mb_test_bit(block, bb)) { @@ -348,7 +386,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + cur += 32; + continue; + } -+ mb_clear_bit(cur, bm); ++ mb_clear_bit_atomic(cur, bm); + cur++; + } +} @@ -366,7 +404,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + cur += 32; + continue; + } -+ mb_set_bit(cur, bm); ++ mb_set_bit_atomic(cur, bm); + cur++; + } +} @@ -377,12 +415,17 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + void *buddy, *buddy2; + + mb_check_buddy(e3b); ++ ++ e3b->bd_bd->bb_free += count; ++ if (first < e3b->bd_bd->bb_first_free) ++ e3b->bd_bd->bb_first_free = first; ++ + while (count-- > 0) { + block = first++; + order = 0; + -+ J_ASSERT(!mb_test_bit(block, e3b->bd_bitmap)); -+ mb_set_bit(block, e3b->bd_bitmap); ++ J_ASSERT(!mb_test_bit(block, EXT3_MB_BITMAP(e3b))); ++ mb_set_bit(block, EXT3_MB_BITMAP(e3b)); + e3b->bd_bd->bb_counters[order]++; + + /* start of the buddy */ @@ -422,64 +465,23 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + return 0; +} + -+/* -+ * returns 1 if out extent is enough to fill needed space -+ */ -+int mb_make_backward_extent(struct ext3_free_extent *in, -+ struct ext3_free_extent *out, int needed) ++static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, ++ int needed, struct ext3_free_extent *ex) +{ -+ int i; -+ 
-+ J_ASSERT(in); -+ J_ASSERT(out); -+ J_ASSERT(in->fe_nums < MB_ARR_SIZE); -+ -+ out->fe_len = 0; -+ out->fe_start = in->fe_start + in->fe_len; -+ out->fe_nums = 0; -+ -+ /* for single-chunk extent we need not back order -+ * also, if an extent doesn't fill needed space -+ * then it makes no sense to try back order becase -+ * if we select this extent then it'll be use as is */ -+ if (in->fe_nums < 2 || in->fe_len < needed) -+ return 0; -+ -+ i = in->fe_nums - 1; -+ while (i >= 0 && out->fe_len < needed) { -+ out->fe_len += (1 << in->fe_orders[i]); -+ out->fe_start -= (1 << in->fe_orders[i]); -+ i--; -+ } -+ /* FIXME: in some situation fe_orders may be too small to hold -+ * all the buddies */ -+ J_ASSERT(out->fe_len >= needed); -+ -+ for (i++; i < in->fe_nums; i++) -+ out->fe_orders[out->fe_nums++] = in->fe_orders[i]; -+ J_ASSERT(out->fe_nums < MB_ARR_SIZE); -+ out->fe_back = 1; -+ -+ return 1; -+} -+ -+int mb_find_extent(struct ext3_buddy *e3b, int order, int block, -+ int needed, struct ext3_free_extent *ex) -+{ -+ int space = needed; + int next, max, ord; + void *buddy; + + J_ASSERT(ex != NULL); + -+ ex->fe_nums = 0; -+ ex->fe_len = 0; -+ + buddy = mb_find_buddy(e3b, order, &max); + J_ASSERT(buddy); + J_ASSERT(block < max); -+ if (!mb_test_bit(block, buddy)) -+ goto nofree; ++ if (!mb_test_bit(block, buddy)) { ++ ex->fe_len = 0; ++ ex->fe_start = 0; ++ ex->fe_group = 0; ++ return 0; ++ } + + if (order == 0) { + /* find actual order */ @@ -487,64 +489,55 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + block = block >> order; + } + -+ ex->fe_orders[ex->fe_nums++] = order; + ex->fe_len = 1 << order; + ex->fe_start = block << order; -+ ex->fe_back = 0; -+ -+ while ((space = space - (1 << order)) > 0) { ++ ex->fe_group = e3b->bd_group; + -+ buddy = mb_find_buddy(e3b, order, &max); -+ J_ASSERT(buddy); ++ while ((buddy = mb_find_buddy(e3b, order, &max))) { + + if (block + 1 >= max) + break; + + next = (block + 1) * (1 << order); -+ if (!mb_test_bit(next, 
e3b->bd_bitmap)) ++ if (!mb_test_bit(next, EXT3_MB_BITMAP(e3b))) + break; + + ord = mb_find_order_for_block(e3b, next); + -+ if ((1 << ord) >= needed) { -+ /* we dont want to coalesce with self-enough buddies */ -+ break; -+ } + order = ord; + block = next >> order; + ex->fe_len += 1 << order; -+ -+ if (ex->fe_nums < MB_ARR_SIZE) -+ ex->fe_orders[ex->fe_nums++] = order; + } + -+nofree: + J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); + return ex->fe_len; +} + -+static int mb_mark_used_backward(struct ext3_buddy *e3b, -+ struct ext3_free_extent *ex, int len) ++static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) +{ -+ int start = ex->fe_start, len0 = len; ++ int start = ex->fe_start; ++ int len = ex->fe_len; + int ord, mlen, max, cur; ++ int len0 = len; + void *buddy; + -+ start = ex->fe_start + ex->fe_len - 1; ++ e3b->bd_bd->bb_free -= len; ++ if (e3b->bd_bd->bb_first_free == start) ++ e3b->bd_bd->bb_first_free += len; ++ + while (len) { + ord = mb_find_order_for_block(e3b, start); -+ if (((start >> ord) << ord) == (start - (1 << ord) + 1) && -+ len >= (1 << ord)) { ++ ++ if (((start >> ord) << ord) == start && len >= (1 << ord)) { + /* the whole chunk may be allocated at once! */ + mlen = 1 << ord; + buddy = mb_find_buddy(e3b, ord, &max); + J_ASSERT((start >> ord) < max); + mb_clear_bit(start >> ord, buddy); + e3b->bd_bd->bb_counters[ord]--; -+ start -= mlen; ++ start += mlen; + len -= mlen; + J_ASSERT(len >= 0); -+ J_ASSERT(start >= 0); + continue; + } + @@ -564,158 +557,218 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + } + + /* now drop all the bits in bitmap */ -+ mb_clear_bits(e3b->bd_bitmap, ex->fe_start + ex->fe_len - len0, len0); ++ mb_clear_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); + + mb_check_buddy(e3b); + + return 0; +} + -+static int mb_mark_used_forward(struct ext3_buddy *e3b, -+ struct ext3_free_extent *ex, int len) ++/* ++ * Must be called under group lock! 
++ */ ++static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) +{ -+ int start = ex->fe_start, len0 = len; -+ int ord, mlen, max, cur; -+ void *buddy; ++ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ++ mb_mark_used(e3b, &ac->ac_b_ex); ++ ac->ac_status = AC_STATUS_FOUND; ++} + -+ while (len) { -+ ord = mb_find_order_for_block(e3b, start); ++/* ++ * The routine checks whether found extent is good enough. If it is, ++ * then the extent gets marked used and flag is set to the context ++ * to stop scanning. Otherwise, the extent is compared with the ++ * previous found extent and if new one is better, then it's stored ++ * in the context. Later, the best found extent will be used, if ++ * mballoc can't find good enough extent. ++ * ++ * FIXME: real allocation policy is to be designed yet! ++ */ ++static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, ++ struct ext3_free_extent *ex, ++ struct ext3_buddy *e3b) ++{ ++ int factor = EXT3_SB(ac->ac_sb)->s_mb_factor; ++ struct ext3_free_extent *bex = &ac->ac_b_ex; ++ int diff = ac->ac_g_ex.fe_len - ex->fe_len; + -+ if (((start >> ord) << ord) == start && len >= (1 << ord)) { -+ /* the whole chunk may be allocated at once! 
*/ -+ mlen = 1 << ord; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ J_ASSERT((start >> ord) < max); -+ mb_clear_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; -+ start += mlen; -+ len -= mlen; -+ J_ASSERT(len >= 0); -+ continue; -+ } ++ J_ASSERT(ex->fe_len > 0); ++ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); + -+ /* we have to split large buddy */ -+ J_ASSERT(ord > 0); -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_clear_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; ++ ac->ac_found++; + -+ ord--; -+ cur = (start >> ord) & ~1U; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_set_bit(cur, buddy); -+ mb_set_bit(cur + 1, buddy); -+ e3b->bd_bd->bb_counters[ord]++; -+ e3b->bd_bd->bb_counters[ord]++; ++ /* ++ * The special case - take what you catch first ++ */ ++ if (ac->ac_flags & EXT3_MB_HINT_FIRST) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; + } + -+ /* now drop all the bits in bitmap */ -+ mb_clear_bits(e3b->bd_bitmap, ex->fe_start, len0); ++ /* ++ * Let's check whether the chunk is good enough ++ */ ++ if (ex->fe_len >= ac->ac_g_ex.fe_len) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } + -+ mb_check_buddy(e3b); ++ /* ++ * If the request is very large, then it makes sense to use large ++ * chunks for it. Even if they don't satisfy whole request. ++ */ ++ if (ex->fe_len > 1000) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } + -+ return 0; ++ /* ++ * Sometimes it's worth taking a close chunk ++ */ ++ if (factor && (ac->ac_g_ex.fe_len * 100) / (diff * 100) >= factor) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * If this is first found extent, just store it in the context ++ */ ++ if (bex->fe_len == 0) { ++ *bex = *ex; ++ return; ++ } ++ ++ /* ++ * If new found extent is better, store it in the context ++ * FIXME: possible the policy should be more complex? 
++ */ ++ if (ex->fe_len > bex->fe_len) { ++ *bex = *ex; ++ } ++ ++ /* ++ * We don't want to scan for a whole year ++ */ ++ if (ac->ac_found > EXT3_MB_MAX_TO_SCAN) ++ ac->ac_status = AC_STATUS_BREAK; +} + -+int inline mb_mark_used(struct ext3_buddy *e3b, -+ struct ext3_free_extent *ex, int len) ++static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) +{ -+ int err; ++ struct ext3_free_extent ex = ac->ac_b_ex; ++ int group = ex.fe_group, max, err; + -+ J_ASSERT(ex); -+ if (ex->fe_back == 0) -+ err = mb_mark_used_forward(e3b, ex, len); -+ else -+ err = mb_mark_used_backward(e3b, ex, len); -+ return err; ++ J_ASSERT(ex.fe_len > 0); ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); ++ ++ if (max > 0) ++ ext3_mb_use_best_found(ac, e3b); ++ ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ if (ac->ac_status == AC_STATUS_FOUND) ++ ext3_mb_dirty_buddy(e3b); ++ ext3_mb_release_desc(e3b); ++ ++ return 0; +} + -+int ext3_mb_new_in_group(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b, int group) ++static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) +{ -+ struct super_block *sb = ac->ac_sb; -+ int err, gorder, max, i; -+ struct ext3_free_extent curex; -+ -+ /* let's know order of allocation */ -+ gorder = 0; -+ while (ac->ac_g_len > (1 << gorder)) -+ gorder++; -+ -+ if ((ac->ac_g_flags & 1) && ac->ac_g_group == group) { -+ /* someone asks for space at this specified block -+ * probably he wants to merge it into existing extent */ -+ if (mb_test_bit(ac->ac_g_start, e3b->bd_bitmap)) { -+ /* good. 
at least one block is free */ -+ max = mb_find_extent(e3b, 0, ac->ac_g_start, -+ ac->ac_g_len, &curex); -+ max = min(curex.fe_len, ac->ac_g_len); -+ mb_mark_used(e3b, &curex, max); -+ -+ ac->ac_b_group = group; -+ ac->ac_b_start = curex.fe_start; -+ ac->ac_b_len = max; -+ ac->ac_status = AC_STATUS_FOUND; -+ err = 0; -+ goto out; -+ } -+ /* don't try to find goal anymore */ -+ ac->ac_g_flags &= ~1; ++ int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_free_extent ex; ++ ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, ++ ac->ac_g_ex.fe_len, &ex); ++ ++ if (max > 0) { ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); + } ++ ext3_unlock_group(ac->ac_sb, group); + -+ i = 0; -+ while (1) { -+ i = find_next_bit(e3b->bd_bitmap, sb->s_blocksize * 8, i); -+ if (i >= sb->s_blocksize * 8) -+ break; ++ if (ac->ac_status == AC_STATUS_FOUND) ++ ext3_mb_dirty_buddy(e3b); ++ ext3_mb_release_desc(e3b); + -+ max = mb_find_extent(e3b, 0, i, ac->ac_g_len, &curex); -+ if (max >= ac->ac_g_len) { -+ max = min(curex.fe_len, ac->ac_g_len); -+ mb_mark_used(e3b, &curex, max); -+ -+ ac->ac_b_group = group; -+ ac->ac_b_start = curex.fe_start; -+ ac->ac_b_len = max; -+ ac->ac_status = AC_STATUS_FOUND; ++ return 0; ++} ++/* ++ * The routine scans the group and measures all found extents. ++ * In order to optimize scanning, caller must pass number of ++ * free blocks in the group, so the routine can upper limit. 
++ */ ++static void ext3_mb_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ int i, free; ++ ++ free = e3b->bd_bd->bb_free; ++ J_ASSERT(free > 0); ++ ++ i = e3b->bd_bd->bb_first_free; ++ ++ while (free && ac->ac_status != AC_STATUS_FOUND) { ++ i = find_next_bit(bitmap, sb->s_blocksize * 8, i); ++ if (i >= sb->s_blocksize * 8) { ++ J_ASSERT(free == 0); + break; + } -+ i += max; -+ } + -+ return 0; ++ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(free >= ex.fe_len); + -+out: -+ return err; ++ ext3_mb_measure_extent(ac, &ex, e3b); ++ ++ i += ex.fe_len; ++ free -= ex.fe_len; ++ } +} + -+int mb_good_group(struct ext3_allocation_context *ac, int group, int cr) ++static int ext3_mb_good_group(struct ext3_allocation_context *ac, ++ int group, int cr) +{ -+ struct ext3_group_desc *gdp; -+ int free_blocks; ++ int free; + -+ gdp = ext3_get_group_desc(ac->ac_sb, group, NULL); -+ if (!gdp) -+ return 0; -+ free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); -+ if (free_blocks == 0) -+ return 0; ++ J_ASSERT(cr >= 0 && cr < 3); + -+ /* someone wants this block very much */ -+ if ((ac->ac_g_flags & 1) && ac->ac_g_group == group) -+ return 1; ++ free = EXT3_SB(ac->ac_sb)->s_buddy_blocks[group]->bb_free; ++ if (free == 0) ++ return 0; + -+ /* FIXME: I'd like to take fragmentation into account here */ + if (cr == 0) { -+ if (free_blocks >= ac->ac_g_len >> 1) ++ if (free >= ac->ac_g_ex.fe_len >> 1) + return 1; + } else if (cr == 1) { -+ if (free_blocks >= ac->ac_g_len >> 2) ++ if (free >= ac->ac_g_ex.fe_len >> 2) + return 1; + } else if (cr == 2) { + return 1; -+ } else { -+ BUG(); + } + return 0; +} @@ -759,7 +812,13 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + sbi = EXT3_SB(sb); + es = EXT3_SB(sb)->s_es; + -+ if (!(flags & 2)) { ++ /* ++ * We can't allocate > group size ++ */ ++ if (*len >= 
EXT3_BLOCKS_PER_GROUP(sb) - 10) ++ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10; ++ ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { + /* someone asks for non-reserved blocks */ + BUG_ON(*len > 1); + err = ext3_mb_reserve_blocks(sb, 1); @@ -790,62 +849,137 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + EXT3_BLOCKS_PER_GROUP(sb)); + + /* set up allocation goals */ -+ ac.ac_b_group = ac.ac_b_start = ac.ac_b_len = 0; -+ ac.ac_status = 0; ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; + ac.ac_groups_scanned = 0; ++ ac.ac_ex_scanned = 0; ++ ac.ac_found = 0; + ac.ac_sb = inode->i_sb; -+ ac.ac_g_group = group; -+ ac.ac_g_start = block; -+ ac.ac_g_len = *len; -+ ac.ac_g_flags = flags; ++ ac.ac_g_ex.fe_group = group; ++ ac.ac_g_ex.fe_start = block; ++ ac.ac_g_ex.fe_len = *len; ++ ac.ac_flags = flags; ++ ++ /* ++ * Sometimes, caller may want to merge even small number ++ * of blocks to an existing extent ++ */ ++ if (ac.ac_flags & EXT3_MB_HINT_MERGE) { ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; ++ } + -+ /* loop over the groups */ -+ for (cr = 0; cr < 3 && ac.ac_status != AC_STATUS_FOUND; cr++) { ++ /* ++ * FIXME ++ * If requested chunk is power of 2 length, we can try ++ * to exploit buddy nature to speed allocation up ++ */ ++ ++ ++ /* ++ * Let's just scan groups to find more-less suitable blocks ++ */ ++ cr = 0; ++repeat: ++ for (; cr < 3 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { + for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { + if (group == EXT3_SB(sb)->s_groups_count) + group = 0; + + /* check is group good for our criteries */ -+ if (!mb_good_group(&ac, group, cr)) ++ if (!ext3_mb_good_group(&ac, group, cr)) + continue; + -+ err = ext3_mb_load_desc(ac.ac_sb, group, &e3b); ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); + if (err) + goto out_err; + + ext3_lock_group(sb, group); -+ if (!mb_good_group(&ac, 
group, cr)) { ++ if (!ext3_mb_good_group(&ac, group, cr)) { + /* someone did allocation from this group */ + ext3_unlock_group(sb, group); + ext3_mb_release_desc(&e3b); + continue; + } + -+ err = ext3_mb_new_in_group(&ac, &e3b, group); ++ ext3_mb_scan_group(&ac, &e3b); + ext3_unlock_group(sb, group); ++ + if (ac.ac_status == AC_STATUS_FOUND) + ext3_mb_dirty_buddy(&e3b); + ext3_mb_release_desc(&e3b); ++ + if (err) + goto out_err; -+ if (ac.ac_status == AC_STATUS_FOUND) ++ if (ac.ac_status != AC_STATUS_CONTINUE) + break; + } + } + ++ if (ac.ac_status == AC_STATUS_BREAK && ++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { ++ /* ++ * We've been searching too long. Let's try to allocate ++ * the best chunk we've found so far ++ */ ++ printk(KERN_ERR "EXT3-fs: too long searching (%d/%d)\n", ++ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len); ++ ext3_mb_try_best_found(&ac, &e3b); ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * Someone more lucky has already allocated it. ++ * The only thing we can do is just take first ++ * found block(s) ++ */ ++ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n"); ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_flags |= EXT3_MB_HINT_FIRST; ++ cr = 2; ++ goto repeat; ++ } ++ } ++ + if (ac.ac_status != AC_STATUS_FOUND) { -+ /* unfortunately, we can't satisfy this request */ -+ J_ASSERT(ac.ac_b_len == 0); ++ /* ++ * We aren't lucky definitely ++ */ ++ J_ASSERT(ac.ac_b_ex.fe_len == 0); + DQUOT_FREE_BLOCK(inode, *len); + *errp = -ENOSPC; + block = 0; ++#if 1 ++ printk(KERN_ERR "EXT3-fs: cant allocate: status %d, flags %d\n", ++ ac.ac_status, ac.ac_flags); ++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n", ++ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, ++ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); ++ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", ++ sbi->s_blocks_reserved, ac.ac_found); ++ printk("EXT3-fs: groups: "); ++ for (i = 0; i < 
EXT3_SB(sb)->s_groups_count; i++) ++ printk("%d: %d ", i, ++ sbi->s_buddy_blocks[i]->bb_free); ++ printk("\n"); ++#endif + goto out; + } + ++found: ++ J_ASSERT(ac.ac_b_ex.fe_len > 0); ++ + /* good news - free block(s) have been found. now it's time + * to mark block(s) in good old journaled bitmap */ -+ block = ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + ac.ac_b_start + le32_to_cpu(es->s_first_data_block); ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); + + /* we made a desicion, now mark found blocks in good old + * bitmap to be journaled */ @@ -853,7 +987,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + ext3_debug("using block group %d(%d)\n", + ac.ac_b_group.group, gdp->bg_free_blocks_count); + -+ bitmap_bh = read_block_bitmap(sb, ac.ac_b_group); ++ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group); + if (!bitmap_bh) { + *errp = -EIO; + goto out_err; @@ -865,7 +999,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + goto out_err; + } + -+ gdp = ext3_get_group_desc(sb, ac.ac_b_group, &gdp_bh); ++ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh); + if (!gdp) { + *errp = -EIO; + goto out_err; @@ -875,8 +1009,9 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + if (err) + goto out_err; + -+ block = ac.ac_b_start + ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + le32_to_cpu(es->s_first_data_block); ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); + + if (block == le32_to_cpu(gdp->bg_block_bitmap) || + block == le32_to_cpu(gdp->bg_inode_bitmap) || @@ -885,18 +1020,18 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + ext3_error(sb, "ext3_new_block", + "Allocating block in system zone - " + "block = %u", block); -+#if 0 ++#if AGGRESSIVE_CHECK + for (i = 0; i < ac.ac_b_len; i++) -+ J_ASSERT(!mb_test_bit(ac.ac_b_start + i, bitmap_bh->b_data)); ++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, 
bitmap_bh->b_data)); +#endif -+ mb_set_bits(bitmap_bh->b_data, ac.ac_b_start, ac.ac_b_len); ++ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); + -+ ext3_lock_group(sb, ac.ac_b_group); ++ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); + gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - -+ ac.ac_b_len); -+ ext3_unlock_group(sb, ac.ac_b_group); -+ percpu_counter_mod(&sbi->s_freeblocks_counter, -ac.ac_b_len); ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) ++ - ac.ac_b_ex.fe_len); ++ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len); + + err = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (err) @@ -910,10 +1045,11 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + brelse(bitmap_bh); + + /* drop non-allocated, but dquote'd blocks */ -+ J_ASSERT(*len >= ac.ac_b_len); -+ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_len); ++ J_ASSERT(*len >= ac.ac_b_ex.fe_len); ++ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len); + -+ *len = ac.ac_b_len; ++ *len = ac.ac_b_ex.fe_len; ++ J_ASSERT(*len > 0); + J_ASSERT(block != 0); + goto out; + @@ -928,7 +1064,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + *errp = err; + block = 0; +out: -+ if (!(flags & 2)) { ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { + /* block wasn't reserved before and we reserved it + * at the beginning of allocation. 
it doesn't matter + * whether we allocated anything or we failed: time @@ -937,42 +1073,175 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + * path only, here is single block always */ + ext3_mb_release_blocks(sb, 1); + } ++#ifdef MBALLOC_STATS ++ if (ac.ac_g_ex.fe_len > 1) { ++ spin_lock(&sbi->s_bal_lock); ++ sbi->s_bal_reqs++; ++ sbi->s_bal_allocated += *len; ++ if (*len >= ac.ac_g_ex.fe_len) ++ sbi->s_bal_success++; ++ sbi->s_bal_ex_scanned += ac.ac_found; ++ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && ++ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) ++ sbi->s_bal_goals++; ++ if (ac.ac_found > EXT3_MB_MAX_TO_SCAN) ++ sbi->s_bal_breaks++; ++ spin_unlock(&sbi->s_bal_lock); ++ } ++#endif + return block; +} + -+int ext3_mb_generate_buddy(struct super_block *sb, int group) ++int ext3_mb_get_descr_loc(struct ext3_buddy *e3b, struct buffer_head **bh, ++ struct ext3_mb_group_descr **grp) +{ ++ struct super_block *sb = e3b->bd_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int descr_per_block, err, offset; ++ struct ext3_mb_grp_header *hdr; ++ unsigned long block; ++ ++ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) ++ / sizeof(struct ext3_mb_group_descr); ++ block = e3b->bd_group / descr_per_block; ++ *bh = ext3_bread(NULL, sbi->s_buddy, block, 0, &err); ++ if (*bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk descr for group %d: %d\n", ++ e3b->bd_group, err); ++ return err; ++ } ++ ++ hdr = (struct ext3_mb_grp_header *) (*bh)->b_data; ++ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { ++ printk(KERN_ERR "EXT3-fs: invalid magic in group %d!\n", ++ e3b->bd_group); ++ brelse(*bh); ++ *bh = NULL; ++ return -EIO; ++ } ++ ++ offset = e3b->bd_group % descr_per_block ++ * sizeof(struct ext3_mb_group_descr) ++ + sizeof(struct ext3_mb_grp_header); ++ *grp = (struct ext3_mb_group_descr *) ((*bh)->b_data + offset); ++ ++ return 0; ++} ++ ++int ext3_mb_load_descr(struct ext3_buddy *e3b) ++{ ++ struct ext3_mb_group_descr *grp; ++ struct ext3_group_desc 
*gdp; + struct buffer_head *bh; -+ int i, err, count = 0; -+ struct ext3_buddy e3b; ++ int err, i; ++ ++ err = ext3_mb_get_descr_loc(e3b, &bh, &grp); ++ if (err) ++ return err; + -+ err = ext3_mb_load_desc(sb, group, &e3b); ++ e3b->bd_bd->bb_first_free = grp->mgd_first_free; ++ e3b->bd_bd->bb_free = grp->mgd_free; ++ for (i = 0; i < e3b->bd_blkbits; i++) { ++ J_ASSERT(i < 16); ++ e3b->bd_bd->bb_counters[i] = grp->mgd_counters[i]; ++ } ++ brelse(bh); ++ ++ /* additional checks against old group descriptor */ ++ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); ++ if (!gdp) ++ return -EIO; ++ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { ++ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", ++ e3b->bd_group, e3b->bd_bd->bb_free, ++ le16_to_cpu(gdp->bg_free_blocks_count)); ++ BUG(); ++ return -ENODATA; ++ } ++ ++ return 0; ++} ++ ++ ++int ext3_mb_update_descr(struct ext3_buddy *e3b) ++{ ++ struct ext3_mb_group_descr *grp; ++ struct ext3_group_desc *ogdp; ++ struct buffer_head *bh; ++ handle_t *handle; ++ int err, i; ++ ++ /* additional checks against old group descriptor */ ++ ogdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); ++ if (!ogdp) ++ return -EIO; ++ if (e3b->bd_bd->bb_free != le16_to_cpu(ogdp->bg_free_blocks_count)) { ++ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", ++ e3b->bd_group, e3b->bd_bd->bb_free, ++ le16_to_cpu(ogdp->bg_free_blocks_count)); ++ BUG(); ++ return -ENODATA; ++ } ++ ++ err = ext3_mb_get_descr_loc(e3b, &bh, &grp); + if (err) ++ return err; ++ ++ handle = journal_start(EXT3_SB(e3b->bd_sb)->s_journal, 1); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ handle = NULL; + goto out; -+ memset(e3b.bd_bh->b_data, 0, sb->s_blocksize); -+ memset(e3b.bd_bh2->b_data, 0, sb->s_blocksize); ++ } + -+ bh = read_block_bitmap(sb, group); -+ if (bh == NULL) { -+ err = -EIO; -+ goto out2; ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto out; ++ 
grp->mgd_first_free = e3b->bd_bd->bb_first_free; ++ grp->mgd_free = e3b->bd_bd->bb_free; ++ for (i = 0; i < e3b->bd_blkbits; i++) { ++ J_ASSERT(i < 16); ++ grp->mgd_counters[i] = e3b->bd_bd->bb_counters[i]; + } ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ goto out; ++ err = 0; ++out: ++ brelse(bh); ++ if (handle) ++ ext3_journal_stop(handle); ++ return err; ++} ++ ++int ext3_mb_generate_buddy(struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = e3b->bd_sb; ++ struct buffer_head *bh; ++ int i, count = 0; ++ ++ memset(e3b->bd_bh->b_data, 0, sb->s_blocksize); ++ memset(e3b->bd_bh2->b_data, 0, sb->s_blocksize); ++ ++ bh = read_block_bitmap(sb, e3b->bd_group); ++ if (bh == NULL) ++ return -EIO; ++ ++ /* mb_free_blocks will set real free */ ++ e3b->bd_bd->bb_first_free = 1 << 15; + + /* loop over the blocks, and create buddies for free ones */ + for (i = 0; i < sb->s_blocksize * 8; i++) { + if (!mb_test_bit(i, (void *) bh->b_data)) { -+ mb_free_blocks(&e3b, i, 1); ++ mb_free_blocks(e3b, i, 1); + count++; + } + } + brelse(bh); -+ mb_check_buddy(&e3b); -+ ext3_mb_dirty_buddy(&e3b); ++ mb_check_buddy(e3b); ++ ext3_mb_dirty_buddy(e3b); + -+out2: -+ ext3_mb_release_desc(&e3b); -+out: -+ return err; ++ return 0; +} + +EXPORT_SYMBOL(ext3_mb_new_blocks); @@ -981,83 +1250,143 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \ + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS) + -+int ext3_mb_init_backend(struct super_block *sb) ++int ext3_mb_init_backend(struct super_block *sb, int *created) +{ ++ int err, i, len, descr_per_block, buddy_offset, size; + struct inode *root = sb->s_root->d_inode; + struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_grp_header *hdr; ++ struct buffer_head *bh = NULL; ++ unsigned long block; + struct dentry *db; ++ handle_t *handle; + tid_t target; -+ int err, i; + -+ sbi->s_buddy_blocks = kmalloc(sizeof(struct ext3_buddy_group_blocks *) * -+ sbi->s_groups_count, GFP_KERNEL); 
++ *created = 0; ++ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count; ++ sbi->s_buddy_blocks = kmalloc(len, GFP_KERNEL); + if (sbi->s_buddy_blocks == NULL) { -+ printk("EXT3-fs: can't allocate mem for buddy maps\n"); ++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); + return -ENOMEM; + } -+ memset(sbi->s_buddy_blocks, 0, -+ sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count); ++ memset(sbi->s_buddy_blocks, 0, len); + sbi->s_buddy = NULL; + + down(&root->i_sem); -+ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, -+ strlen(EXT3_BUDDY_FILE)); ++ len = strlen(EXT3_BUDDY_FILE); ++ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, len); + if (IS_ERR(db)) { + err = PTR_ERR(db); -+ printk("EXT3-fs: can't lookup buddy file: %d\n", err); ++ printk(KERN_ERR "EXT3-fs: cant lookup buddy: %d\n", err); ++ up(&root->i_sem); + goto out; + } + -+ if (db->d_inode != NULL) { -+ sbi->s_buddy = igrab(db->d_inode); -+ goto map; ++ if (db->d_inode == NULL) { ++ err = ext3_create(root, db, S_IFREG, NULL); ++ if (err) { ++ printk(KERN_ERR "EXT3-fs: cant create buddy: %d\n", err); ++ up(&root->i_sem); ++ goto out; ++ } ++ db->d_inode->i_flags |= S_IMMUTABLE | S_NOATIME; ++ *created = 1; ++ printk("EXT3-fs: no buddy file, regenerate\n"); ++ } ++ up(&root->i_sem); ++ sbi->s_buddy = igrab(db->d_inode); ++ ++ /* calculate needed size */ ++ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) ++ / sizeof(struct ext3_mb_group_descr); ++ buddy_offset = (sbi->s_groups_count + descr_per_block - 1) ++ / descr_per_block; ++ len = sbi->s_groups_count * sb->s_blocksize * 2 + ++ buddy_offset * sb->s_blocksize; ++ if (len != i_size_read(sbi->s_buddy)) { ++ printk("EXT3-fs: wrong i_size (%u != %u), regenerate\n", ++ (unsigned) len, (unsigned) i_size_read(sbi->s_buddy)); ++ *created = 1; + } + -+ err = ext3_create(root, db, S_IFREG, NULL); -+ if (err) { -+ printk("error while creation buddy file: %d\n", err); -+ } else { -+ sbi->s_buddy = 
igrab(db->d_inode); ++ /* read/create mb group descriptors */ ++ for (i = 0; i < buddy_offset; i++) { ++ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); ++ if (IS_ERR(handle)) { ++ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); ++ err = PTR_ERR(handle); ++ goto err_out; ++ } ++ ++ bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err); ++ if (bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err); ++ goto err_out; ++ } ++ hdr = (struct ext3_mb_grp_header *) bh->b_data; ++ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto err_out; ++ *created = 1; ++ printk("EXT3-fs: invalid header 0x%x in %d, regenerate\n", hdr->mh_magic, i); ++ hdr->mh_magic = EXT3_MB_MAGIC_V1; ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ goto err_out; ++ } ++ brelse(bh); ++ ext3_journal_stop(handle); + } + -+map: ++ len = sizeof(struct ext3_buddy_group_blocks); ++ len += sizeof(unsigned) * (sb->s_blocksize_bits + 2); + for (i = 0; i < sbi->s_groups_count; i++) { -+ struct buffer_head *bh = NULL; -+ handle_t *handle; + -+ sbi->s_buddy_blocks[i] = -+ kmalloc(sizeof(struct ext3_buddy_group_blocks), -+ GFP_KERNEL); ++ sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL); + if (sbi->s_buddy_blocks[i] == NULL) { -+ printk("EXT3-fs: can't allocate mem for buddy\n"); ++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); + err = -ENOMEM; + goto out2; + } ++ memset(sbi->s_buddy_blocks[i], 0, len); + + handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); + if (IS_ERR(handle)) { ++ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); + err = PTR_ERR(handle); + goto out2; + } + + /* allocate block for bitmap */ -+ bh = ext3_getblk(handle, sbi->s_buddy, i * 2, 1, &err); ++ block = buddy_offset + i * 2; ++ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); + if (bh == NULL) { -+ printk("can't get block for buddy bitmap: %d\n", err); ++ printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", 
err); + goto out2; + } + sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr; + brelse(bh); + + /* allocate block for buddy */ -+ bh = ext3_getblk(handle, sbi->s_buddy, i * 2 + 1, 1, &err); ++ block = buddy_offset + i * 2 + 1; ++ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); + if (bh == NULL) { -+ printk("can't get block for buddy: %d\n", err); ++ printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err); + goto out2; + } + sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr; + brelse(bh); ++ ++ size = (block + 1) << sbi->s_buddy->i_blkbits; ++ if (size > sbi->s_buddy->i_size) { ++ *created = 1; ++ EXT3_I(sbi->s_buddy)->i_disksize = size; ++ i_size_write(sbi->s_buddy, size); ++ mark_inode_dirty(sbi->s_buddy); ++ } + ext3_journal_stop(handle); ++ + spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock); + sbi->s_buddy_blocks[i]->bb_md_cur = NULL; + sbi->s_buddy_blocks[i]->bb_tid = 0; @@ -1069,8 +1398,30 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c +out2: + dput(db); +out: -+ up(&root->i_sem); + return err; ++ ++err_out: ++ return err; ++} ++ ++int ext3_mb_write_descriptors(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_buddy e3b; ++ int ret = 0, i, err; ++ ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ if (sbi->s_buddy_blocks[i] == NULL) ++ continue; ++ ++ err = ext3_mb_load_buddy(sb, i, &e3b); ++ if (err == 0) { ++ ext3_mb_update_descr(&e3b); ++ ext3_mb_release_desc(&e3b); ++ } else ++ ret = err; ++ } ++ return ret; +} + +int ext3_mb_release(struct super_block *sb) @@ -1091,9 +1442,12 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + ext3_mb_free_committed_blocks(sb); + + if (sbi->s_buddy_blocks) { -+ for (i = 0; i < sbi->s_groups_count; i++) -+ if (sbi->s_buddy_blocks[i]) -+ kfree(sbi->s_buddy_blocks[i]); ++ ext3_mb_write_descriptors(sb); ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ if (sbi->s_buddy_blocks[i] == NULL) ++ continue; ++ kfree(sbi->s_buddy_blocks[i]); ++ } + kfree(sbi->s_buddy_blocks); + } + if 
(sbi->s_buddy) @@ -1101,32 +1455,62 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + if (sbi->s_blocks_reserved) + printk("ext3-fs: %ld blocks being reserved at umount!\n", + sbi->s_blocks_reserved); ++#ifdef MBALLOC_STATS ++ printk("EXT3-fs: mballoc: %lu blocks %lu reqs (%lu success)\n", ++ sbi->s_bal_allocated, sbi->s_bal_reqs, sbi->s_bal_success); ++ printk("EXT3-fs: mballoc: %lu extents scanned, %lu goal hits, %lu breaks\n", ++ sbi->s_bal_ex_scanned, sbi->s_bal_goals, sbi->s_bal_breaks); ++#endif + return 0; +} + -+int ext3_mb_init(struct super_block *sb) ++int ext3_mb_init(struct super_block *sb, int needs_recovery) +{ -+ struct ext3_super_block *es; -+ int i; ++ struct ext3_buddy e3b; ++ int i, err, created; + + if (!test_opt(sb, MBALLOC)) + return 0; + + /* init file for buddy data */ + clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); -+ if (ext3_mb_init_backend(sb)) -+ return 0; ++ if ((err = ext3_mb_init_backend(sb, &created))) ++ return err; + -+ es = EXT3_SB(sb)->s_es; -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) -+ ext3_mb_generate_buddy(sb, i); ++repeat: ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { ++ err = ext3_mb_load_buddy(sb, i, &e3b); ++ if (err) { ++ /* FIXME: release backend */ ++ return err; ++ } ++ if (created || needs_recovery) ++ ext3_mb_generate_buddy(&e3b); ++ else ++ err = ext3_mb_load_descr(&e3b); ++ ext3_mb_release_desc(&e3b); ++ if (err == -ENODATA) { ++ created = 1; ++ goto repeat; ++ } ++ } ++ if (created || needs_recovery) ++ printk(KERN_ERR "EXT3-fs: generated buddies for %lu groups\n", ++ EXT3_SB(sb)->s_groups_count); + spin_lock_init(&EXT3_SB(sb)->s_reserve_lock); + spin_lock_init(&EXT3_SB(sb)->s_md_lock); + INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction); + INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction); + INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction); + set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); -+ printk("EXT3-fs: mballoc enabled\n"); ++ ++#ifdef MBALLOC_STATS ++ 
spin_lock_init(&EXT3_SB(sb)->s_bal_lock); ++#define MBALLOC_INFO " (stats)" ++#else ++#define MBALLOC_INFO "" ++#endif ++ printk("EXT3-fs: mballoc enabled%s\n", MBALLOC_INFO); + return 0; +} + @@ -1158,7 +1542,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + mb_debug("gonna free %u blocks in group %u (0x%p):", + md->num, md->group, md); + -+ err = ext3_mb_load_desc(sb, md->group, &e3b); ++ err = ext3_mb_load_buddy(sb, md->group, &e3b); + BUG_ON(err != 0); + + /* there are blocks to put in buddy to make them really free */ @@ -1263,7 +1647,8 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c +} + +void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, -+ unsigned long block, unsigned long count, int metadata) ++ unsigned long block, unsigned long count, ++ int metadata, int *freed) +{ + struct buffer_head *bitmap_bh = NULL; + struct ext3_group_desc *gdp; @@ -1276,6 +1661,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + struct ext3_buddy e3b; + int err = 0, ret; + ++ *freed = 0; + sb = inode->i_sb; + if (!sb) { + printk ("ext3_free_blocks: nonexistent device"); @@ -1345,7 +1731,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + if (err) + goto error_return; + -+ err = ext3_mb_load_desc(sb, block_group, &e3b); ++ err = ext3_mb_load_buddy(sb, block_group, &e3b); + if (err) + goto error_return; + @@ -1356,18 +1742,18 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + } else { + ext3_lock_group(sb, block_group); + mb_free_blocks(&e3b, bit, count); -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); + ext3_unlock_group(sb, block_group); -+ percpu_counter_mod(&sbi->s_freeblocks_counter, count); + } ++ spin_lock(sb_bgl_lock(sbi, block_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); ++ spin_unlock(sb_bgl_lock(sbi, block_group)); + + ext3_mb_dirty_buddy(&e3b); + ext3_mb_release_desc(&e3b); + -+ /* FIXME: undo logic will be implemented later and another way */ + 
mb_clear_bits(bitmap_bh->b_data, bit, count); -+ DQUOT_FREE_BLOCK(inode, count); ++ *freed = count; + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); @@ -1420,7 +1806,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c +} + +int ext3_new_block(handle_t *handle, struct inode *inode, -+ unsigned long goal, int *errp) ++ unsigned long goal, int *errp) +{ + int ret, len; + @@ -1435,19 +1821,27 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c +} + + ++extern void ext3_free_blocks_old(handle_t *, struct inode *, ++ unsigned long, unsigned long); +void ext3_free_blocks(handle_t *handle, struct inode * inode, + unsigned long block, unsigned long count, int metadata) +{ ++ int freed; ++ + if (!test_opt(inode->i_sb, MBALLOC)) + ext3_free_blocks_old(handle, inode, block, count); -+ else -+ ext3_mb_free_blocks(handle, inode, block, count, metadata); ++ else { ++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); ++ if (freed) ++ DQUOT_FREE_BLOCK(inode, freed); ++ } + return; +} ++ Index: linux-2.6.5-sles9/fs/ext3/super.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/super.c 2004-11-09 02:23:21.597220752 +0300 -+++ linux-2.6.5-sles9/fs/ext3/super.c 2004-11-09 02:26:12.572228600 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/super.c 2005-02-23 01:47:15.291333736 +0300 ++++ linux-2.6.5-sles9/fs/ext3/super.c 2005-02-23 01:48:54.515249408 +0300 @@ -389,6 +389,7 @@ struct ext3_super_block *es = sbi->s_es; int i; @@ -1456,47 +1850,54 @@ Index: linux-2.6.5-sles9/fs/ext3/super.c ext3_ext_release(sb); ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); -@@ -542,7 +543,7 @@ +@@ -540,6 +541,7 @@ Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -- Opt_err, Opt_extents, Opt_extdebug -+ Opt_err, Opt_extents, Opt_extdebug, Opt_mballoc, ++ Opt_mballoc, 
Opt_mbfactor, + Opt_err, Opt_extents, Opt_extdebug }; - static match_table_t tokens = { -@@ -589,6 +590,7 @@ +@@ -587,6 +589,8 @@ {Opt_iopen_nopriv, "iopen_nopriv"}, {Opt_extents, "extents"}, {Opt_extdebug, "extdebug"}, + {Opt_mballoc, "mballoc"}, ++ {Opt_mbfactor, "mbfactor=%u"}, {Opt_err, NULL} }; -@@ -810,6 +812,9 @@ +@@ -808,6 +812,16 @@ case Opt_extdebug: set_opt (sbi->s_mount_opt, EXTDEBUG); break; + case Opt_mballoc: + set_opt (sbi->s_mount_opt, MBALLOC); + break; ++ case Opt_mbfactor: ++ if (match_int(&args[0], &option)) ++ return 0; ++ if (option < 0) ++ return 0; ++ sbi->s_mb_factor = option; ++ break; default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1463,7 +1468,8 @@ +@@ -1461,7 +1475,8 @@ ext3_count_dirs(sb)); ext3_ext_init(sb); - -+ ext3_mb_init(sb); ++ ext3_mb_init(sb, needs_recovery); + return 0; failed_mount3: Index: linux-2.6.5-sles9/fs/ext3/Makefile =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2004-11-09 02:23:21.593221360 +0300 -+++ linux-2.6.5-sles9/fs/ext3/Makefile 2004-11-09 02:26:12.572228600 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2005-02-23 01:02:37.405434272 +0300 ++++ linux-2.6.5-sles9/fs/ext3/Makefile 2005-02-23 01:48:54.517249104 +0300 @@ -5,7 +5,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o @@ -1509,7 +1910,7 @@ Index: linux-2.6.5-sles9/fs/ext3/Makefile Index: linux-2.6.5-sles9/fs/ext3/balloc.c =================================================================== --- linux-2.6.5-sles9.orig/fs/ext3/balloc.c 2004-11-03 08:36:51.000000000 +0300 -+++ linux-2.6.5-sles9/fs/ext3/balloc.c 2004-11-09 02:26:53.078070776 +0300 ++++ linux-2.6.5-sles9/fs/ext3/balloc.c 2005-02-23 01:48:54.520248648 +0300 @@ -78,7 +78,7 @@ * * Return buffer_head on success or NULL in case of failure. 
@@ -1539,8 +1940,8 @@ Index: linux-2.6.5-sles9/fs/ext3/balloc.c struct buffer_head *bitmap_bh = NULL; Index: linux-2.6.5-sles9/fs/ext3/namei.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/namei.c 2004-11-09 02:18:27.616912552 +0300 -+++ linux-2.6.5-sles9/fs/ext3/namei.c 2004-11-09 02:26:12.580227384 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/namei.c 2005-02-23 01:01:46.551165296 +0300 ++++ linux-2.6.5-sles9/fs/ext3/namei.c 2005-02-23 01:48:54.523248192 +0300 @@ -1640,7 +1640,7 @@ * If the create succeeds, we fill in the inode information * with d_instantiate(). @@ -1552,8 +1953,8 @@ Index: linux-2.6.5-sles9/fs/ext3/namei.c handle_t *handle; Index: linux-2.6.5-sles9/fs/ext3/inode.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2004-11-09 02:23:21.592221512 +0300 -+++ linux-2.6.5-sles9/fs/ext3/inode.c 2004-11-09 02:26:12.587226320 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2005-02-23 01:02:37.404434424 +0300 ++++ linux-2.6.5-sles9/fs/ext3/inode.c 2005-02-23 01:48:54.529247280 +0300 @@ -572,7 +572,7 @@ ext3_journal_forget(handle, branch[i].bh); } @@ -1592,9 +1993,9 @@ Index: linux-2.6.5-sles9/fs/ext3/inode.c /* Index: linux-2.6.5-sles9/fs/ext3/extents.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2004-11-09 02:25:56.143726112 +0300 -+++ linux-2.6.5-sles9/fs/ext3/extents.c 2004-11-09 02:26:12.591225712 +0300 -@@ -740,7 +740,7 @@ +--- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2005-02-23 01:02:37.396435640 +0300 ++++ linux-2.6.5-sles9/fs/ext3/extents.c 2005-02-23 01:48:54.533246672 +0300 +@@ -774,7 +774,7 @@ for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; @@ -1603,7 +2004,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c } } kfree(ablocks); -@@ -1391,7 +1391,7 @@ +@@ -1431,7 +1431,7 @@ path->p_idx->ei_leaf); bh = sb_find_get_block(tree->inode->i_sb, 
path->p_idx->ei_leaf); ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); @@ -1612,7 +2013,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c return err; } -@@ -1879,10 +1879,12 @@ +@@ -1919,10 +1919,12 @@ int needed = ext3_remove_blocks_credits(tree, ex, from, to); handle_t *handle = ext3_journal_start(tree->inode, needed); struct buffer_head *bh; @@ -1626,7 +2027,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { /* tail removal */ unsigned long num, start; -@@ -1894,7 +1896,7 @@ +@@ -1934,7 +1936,7 @@ bh = sb_find_get_block(tree->inode->i_sb, start + i); ext3_forget(handle, 0, tree->inode, bh, start + i); } @@ -1637,8 +2038,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c from, to, ex->ee_block, ex->ee_len); Index: linux-2.6.5-sles9/fs/ext3/xattr.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/xattr.c 2004-11-09 02:22:55.777146000 +0300 -+++ linux-2.6.5-sles9/fs/ext3/xattr.c 2004-11-09 02:26:12.593225408 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/xattr.c 2005-02-23 01:01:52.387278072 +0300 ++++ linux-2.6.5-sles9/fs/ext3/xattr.c 2005-02-23 01:48:54.537246064 +0300 @@ -1366,7 +1366,7 @@ new_bh = sb_getblk(sb, block); if (!new_bh) { @@ -1668,26 +2069,32 @@ Index: linux-2.6.5-sles9/fs/ext3/xattr.c } else { Index: linux-2.6.5-sles9/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2004-11-09 02:25:17.238640584 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2004-11-09 02:26:12.596224952 +0300 -@@ -57,6 +57,8 @@ +--- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2005-02-23 01:02:37.414432904 +0300 ++++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2005-02-23 01:48:54.539245760 +0300 +@@ -57,6 +57,14 @@ #define ext3_debug(f, a...) 
do {} while (0) #endif +#define EXT3_MULTIBLOCK_ALLOCATOR 1 + ++#define EXT3_MB_HINT_MERGE 1 ++#define EXT3_MB_HINT_RESERVED 2 ++#define EXT3_MB_HINT_METADATA 4 ++#define EXT3_MB_HINT_FIRST 8 ++#define EXT3_MB_HINT_BEST 16 ++ /* * Special inodes numbers */ -@@ -339,6 +341,7 @@ +@@ -339,6 +347,7 @@ #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ #define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ #define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ -+#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */ ++#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef clear_opt -@@ -698,7 +701,7 @@ +@@ -698,7 +707,7 @@ extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, @@ -1696,24 +2103,48 @@ Index: linux-2.6.5-sles9/include/linux/ext3_fs.h extern unsigned long ext3_count_free_blocks (struct super_block *); extern void ext3_check_blocks_bitmap (struct super_block *); extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, -@@ -743,6 +746,13 @@ - extern int ext3_ioctl (struct inode *, struct file *, unsigned int, - unsigned long); +@@ -820,6 +829,37 @@ + extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); +/* mballoc.c */ -+extern int ext3_mb_init(struct super_block *sb); -+extern int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, -+ unsigned long goal,int *len, int flags,int *errp); -+extern int ext3_mb_release(struct super_block *sb); ++extern int ext3_mb_init(struct super_block *, int); ++extern int ext3_mb_release(struct super_block *); ++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); ++extern int ext3_mb_reserve_blocks(struct 
super_block *, int); +extern void ext3_mb_release_blocks(struct super_block *, int); + - /* namei.c */ - extern int ext3_orphan_add(handle_t *, struct inode *); - extern int ext3_orphan_del(handle_t *, struct inode *); ++/* writeback.c */ ++extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); ++extern int ext3_wb_prepare_write(struct file *file, struct page *page, ++ unsigned from, unsigned to); ++extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); ++extern int ext3_wb_writepage(struct page *, struct writeback_control *); ++extern int ext3_wb_invalidatepage(struct page *, unsigned long); ++extern int ext3_wb_releasepage(struct page *, int); ++extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); ++extern void ext3_wb_init(struct super_block *); ++extern void ext3_wb_release(struct super_block *); ++ ++/* writeback.c */ ++extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); ++extern int ext3_wb_prepare_write(struct file *file, struct page *page, ++ unsigned from, unsigned to); ++extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); ++extern int ext3_wb_writepage(struct page *, struct writeback_control *); ++extern int ext3_wb_invalidatepage(struct page *, unsigned long); ++extern int ext3_wb_releasepage(struct page *, int); ++extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); ++extern void ext3_wb_init(struct super_block *); ++extern void ext3_wb_release(struct super_block *); ++ + #endif /* __KERNEL__ */ + + #define EXT3_IOC_CREATE_INUM _IOW('f', 5, long) Index: linux-2.6.5-sles9/include/linux/ext3_fs_sb.h =================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_fs_sb.h 2004-11-09 02:20:51.598024096 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_fs_sb.h 2004-11-09 02:28:18.753046200 +0300 +--- 
linux-2.6.5-sles9.orig/include/linux/ext3_fs_sb.h 2005-02-23 01:01:48.242908112 +0300 ++++ linux-2.6.5-sles9/include/linux/ext3_fs_sb.h 2005-02-23 01:48:54.541245456 +0300 @@ -23,10 +23,30 @@ #define EXT_INCLUDE #include @@ -1731,21 +2162,21 @@ Index: linux-2.6.5-sles9/include/linux/ext3_fs_sb.h + struct list_head list; +}; + -+#define EXT3_BB_MAX_ORDER 14 -+ +struct ext3_buddy_group_blocks { -+ sector_t bb_bitmap; -+ sector_t bb_buddy; ++ __u32 bb_bitmap; ++ __u32 bb_buddy; + spinlock_t bb_lock; -+ unsigned bb_counters[EXT3_BB_MAX_ORDER]; ++ unsigned long bb_tid; + struct ext3_free_metadata *bb_md_cur; -+ unsigned long bb_tid; ++ unsigned short bb_first_free; ++ unsigned short bb_free; ++ unsigned bb_counters[]; +}; + /* * third extended-fs super-block data in memory */ -@@ -78,6 +98,17 @@ +@@ -78,6 +98,27 @@ struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ #endif @@ -1760,6 +2191,16 @@ Index: linux-2.6.5-sles9/include/linux/ext3_fs_sb.h + struct list_head s_committed_transaction; + spinlock_t s_md_lock; + tid_t s_last_transaction; ++ int s_mb_factor; ++ ++ /* stats for buddy allocator */ ++ spinlock_t s_bal_lock; ++ unsigned long s_bal_reqs; /* number of reqs with len > 1 */ ++ unsigned long s_bal_success; /* we found long enough chunks */ ++ unsigned long s_bal_allocated; /* in blocks */ ++ unsigned long s_bal_ex_scanned; /* total extents scanned */ ++ unsigned long s_bal_goals; /* goal hits */ ++ unsigned long s_bal_breaks; /* too long searches */ }; #endif /* _LINUX_EXT3_FS_SB */ diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 349f5ba..1b9be20 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -27,6 +27,8 @@ tbd Cluster File Systems, Inc. 
- hold NS lock when calling handle_ast_error->del_waiting_lock (5746) - fix setattr mtime regression from lovcleanup merge (4829, 5669) - workaround for 2.6 crash in ll_unhash_aliases (5687, 5210) + - small ext3 extents cleanups and fixes (5733) + - improved mballoc code, several small races and bugs fixed (5733, 5638) * miscellania - service request history (4965) - put {ll,lov,osc}_async_page structs in a single slab (4699) diff --git a/lustre/kernel_patches/patches/ext3-extents-2.6.5.patch b/lustre/kernel_patches/patches/ext3-extents-2.6.5.patch index b9a01d7..671fbc0 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.6.5.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.6.5.patch @@ -1,9 +1,9 @@ %patch Index: linux-2.6.5-sles9/fs/ext3/extents.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2003-01-30 13:24:37.000000000 +0300 -+++ linux-2.6.5-sles9/fs/ext3/extents.c 2004-11-09 02:25:56.143726112 +0300 -@@ -0,0 +1,2313 @@ +--- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2005-02-17 22:07:57.023609040 +0300 ++++ linux-2.6.5-sles9/fs/ext3/extents.c 2005-02-23 01:02:37.396435640 +0300 +@@ -0,0 +1,2356 @@ +/* + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -49,6 +49,27 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c +#include +#include + ++ ++static inline int ext3_ext_check_header(struct ext3_extent_header *eh) ++{ ++ if (eh->eh_magic != EXT3_EXT_MAGIC) { ++ printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n", ++ (unsigned) eh->eh_magic); ++ return -EIO; ++ } ++ if (eh->eh_max == 0) { ++ printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n", ++ (unsigned) eh->eh_max); ++ return -EIO; ++ } ++ if (eh->eh_entries > eh->eh_max) { ++ printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n", ++ (unsigned) eh->eh_entries); ++ return -EIO; ++ } ++ return 0; ++} ++ +static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) +{ + int err; @@ -430,10 
+451,12 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + + eh = EXT_ROOT_HDR(tree); + EXT_ASSERT(eh); ++ if (ext3_ext_check_header(eh)) ++ goto err; ++ + i = depth = EXT_DEPTH(tree); + EXT_ASSERT(eh->eh_max); + EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ EXT_ASSERT(i == 0 || eh->eh_entries > 0); + + /* account possible depth increase */ + if (!path) { @@ -455,22 +478,27 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + path[ppos].p_ext = NULL; + + bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); -+ if (!bh) { -+ ext3_ext_drop_refs(path); -+ kfree(path); -+ return ERR_PTR(-EIO); -+ } ++ if (!bh) ++ goto err; ++ + eh = EXT_BLOCK_HDR(bh); + ppos++; + EXT_ASSERT(ppos <= depth); + path[ppos].p_bh = bh; + path[ppos].p_hdr = eh; + i--; ++ ++ if (ext3_ext_check_header(eh)) ++ goto err; + } + + path[ppos].p_depth = i; + path[ppos].p_hdr = eh; + path[ppos].p_ext = NULL; ++ path[ppos].p_idx = NULL; ++ ++ if (ext3_ext_check_header(eh)) ++ goto err; + + /* find extent */ + ext3_ext_binsearch(tree, path + ppos, block); @@ -478,6 +506,12 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + ext3_ext_show_path(tree, path); + + return path; ++ ++err: ++ printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ return ERR_PTR(-EIO); +} + +/* @@ -1047,7 +1081,6 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + int depth, len, err, next; + + EXT_ASSERT(newext->ee_len > 0); -+ EXT_ASSERT(newext->ee_len < EXT_CACHE_MARK); + depth = EXT_DEPTH(tree); + ex = path[depth].p_ext; + EXT_ASSERT(path[depth].p_hdr); @@ -1187,7 +1220,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + unsigned long num, ext_prepare_callback func) +{ + struct ext3_ext_path *path = NULL; -+ struct ext3_extent *ex, cbex; ++ struct ext3_ext_cache cbex; ++ struct ext3_extent *ex; + unsigned long next, start = 0, end = 0; + unsigned long last = block + num; + int depth, exists, err = 0; @@ -1246,14 +1280,20 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + EXT_ASSERT(end > start); + + 
if (!exists) { -+ cbex.ee_block = start; -+ cbex.ee_len = end - start; -+ cbex.ee_start = 0; -+ } else -+ cbex = *ex; ++ cbex.ec_block = start; ++ cbex.ec_len = end - start; ++ cbex.ec_start = 0; ++ cbex.ec_type = EXT3_EXT_CACHE_GAP; ++ } else { ++ cbex.ec_block = ex->ee_block; ++ cbex.ec_len = ex->ee_len; ++ cbex.ec_start = ex->ee_start; ++ cbex.ec_type = EXT3_EXT_CACHE_EXTENT; ++ } + ++ EXT_ASSERT(cbex.ec_len > 0); + EXT_ASSERT(path[depth].p_hdr); -+ err = func(tree, path, &cbex, exists); ++ err = func(tree, path, &cbex); + ext3_ext_drop_refs(path); + + if (err < 0) @@ -1271,7 +1311,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + path = NULL; + } + -+ block = cbex.ee_block + cbex.ee_len; ++ block = cbex.ec_block + cbex.ec_len; + } + + if (path) { @@ -1987,7 +2027,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + tree->root = (void *) EXT3_I(inode)->i_data; + tree->buffer = (void *) inode; + tree->buffer_len = sizeof(EXT3_I(inode)->i_data); -+ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent; ++ tree->cex = &EXT3_I(inode)->i_cached_extent; + tree->ops = &ext3_blockmap_helpers; +} + @@ -2001,7 +2041,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + int goal, newblock, err = 0, depth; + struct ext3_extents_tree tree; + -+ clear_buffer_new(bh_result); ++ __clear_bit(BH_New, &bh_result->b_state); + ext3_init_tree_desc(&tree, inode); + ext_debug(&tree, "block %d requested for inode %u\n", + (int) iblock, (unsigned) inode->i_ino); @@ -2087,13 +2127,15 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + + /* previous routine could use block we allocated */ + newblock = newex.ee_start; -+ set_buffer_new(bh_result); ++ __set_bit(BH_New, &bh_result->b_state); + + ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, + newex.ee_start, EXT3_EXT_CACHE_EXTENT); +out: + ext3_ext_show_leaf(&tree, path); -+ map_bh(bh_result, inode->i_sb, newblock); ++ __set_bit(BH_Mapped, &bh_result->b_state); ++ bh_result->b_bdev = inode->i_sb->s_bdev; ++ 
bh_result->b_blocknr = newblock; +out2: + if (path) { + ext3_ext_drop_refs(path); @@ -2218,12 +2260,13 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c +static int +ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, + struct ext3_ext_path *path, -+ struct ext3_extent *newex, int exist) ++ struct ext3_ext_cache *newex) +{ + struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; + -+ if (!exist) ++ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT) + return EXT_CONTINUE; ++ + if (buf->err < 0) + return EXT_BREAK; + if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) @@ -2242,13 +2285,13 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c +static int +ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, + struct ext3_ext_path *path, -+ struct ext3_extent *ex, int exist) ++ struct ext3_ext_cache *ex) +{ + struct ext3_extent_tree_stats *buf = + (struct ext3_extent_tree_stats *) tree->private; + int depth; + -+ if (!exist) ++ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT) + return EXT_CONTINUE; + + depth = EXT_DEPTH(tree); @@ -2259,7 +2302,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c +} + +int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, -+ unsigned long arg) ++ unsigned long arg) +{ + int err = 0; + @@ -2319,8 +2362,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + Index: linux-2.6.5-sles9/fs/ext3/ialloc.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/ialloc.c 2004-11-09 02:22:55.763148128 +0300 -+++ linux-2.6.5-sles9/fs/ext3/ialloc.c 2004-11-09 02:23:21.587222272 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/ialloc.c 2005-02-23 01:01:52.366281264 +0300 ++++ linux-2.6.5-sles9/fs/ext3/ialloc.c 2005-02-23 01:02:37.398435336 +0300 @@ -647,6 +647,10 @@ DQUOT_FREE_INODE(inode); goto fail2; @@ -2334,8 +2377,8 @@ Index: linux-2.6.5-sles9/fs/ext3/ialloc.c ext3_std_error(sb, err); Index: linux-2.6.5-sles9/fs/ext3/inode.c 
=================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2004-11-09 02:22:55.767147520 +0300 -+++ linux-2.6.5-sles9/fs/ext3/inode.c 2004-11-09 02:23:21.592221512 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2005-02-23 01:01:52.373280200 +0300 ++++ linux-2.6.5-sles9/fs/ext3/inode.c 2005-02-23 01:02:37.404434424 +0300 @@ -796,6 +796,17 @@ goto reread; } @@ -2416,8 +2459,8 @@ Index: linux-2.6.5-sles9/fs/ext3/inode.c else Index: linux-2.6.5-sles9/fs/ext3/Makefile =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2004-11-09 02:18:27.604914376 +0300 -+++ linux-2.6.5-sles9/fs/ext3/Makefile 2004-11-09 02:23:21.593221360 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2005-02-23 01:01:46.501172896 +0300 ++++ linux-2.6.5-sles9/fs/ext3/Makefile 2005-02-23 01:02:37.405434272 +0300 @@ -5,7 +5,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o @@ -2429,8 +2472,8 @@ Index: linux-2.6.5-sles9/fs/ext3/Makefile ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o Index: linux-2.6.5-sles9/fs/ext3/super.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/super.c 2004-11-09 02:22:56.450043704 +0300 -+++ linux-2.6.5-sles9/fs/ext3/super.c 2004-11-09 02:23:21.597220752 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/super.c 2005-02-23 01:02:34.072940888 +0300 ++++ linux-2.6.5-sles9/fs/ext3/super.c 2005-02-23 01:47:15.291333736 +0300 @@ -389,6 +389,7 @@ struct ext3_super_block *es = sbi->s_es; int i; @@ -2439,18 +2482,16 @@ Index: linux-2.6.5-sles9/fs/ext3/super.c ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); if (!(sb->s_flags & MS_RDONLY)) { -@@ -447,6 +448,10 @@ +@@ -447,6 +448,8 @@ #endif ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; ei->vfs_inode.i_version = 1; -+ ei->i_cached_extent[0] = 0; -+ ei->i_cached_extent[1] = 0; -+ ei->i_cached_extent[2] = 0; -+ ei->i_cached_extent[3] = 0; ++ ++ 
memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); return &ei->vfs_inode; } -@@ -537,7 +542,7 @@ +@@ -537,7 +540,7 @@ Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, @@ -2459,7 +2500,7 @@ Index: linux-2.6.5-sles9/fs/ext3/super.c }; static match_table_t tokens = { -@@ -582,6 +587,8 @@ +@@ -582,6 +585,8 @@ {Opt_iopen, "iopen"}, {Opt_noiopen, "noiopen"}, {Opt_iopen_nopriv, "iopen_nopriv"}, @@ -2468,7 +2509,7 @@ Index: linux-2.6.5-sles9/fs/ext3/super.c {Opt_err, NULL} }; -@@ -797,6 +804,12 @@ +@@ -797,6 +802,12 @@ break; case Opt_ignore: break; @@ -2481,7 +2522,7 @@ Index: linux-2.6.5-sles9/fs/ext3/super.c default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1449,6 +1462,8 @@ +@@ -1449,6 +1460,8 @@ percpu_counter_mod(&sbi->s_dirs_counter, ext3_count_dirs(sb)); @@ -2492,8 +2533,8 @@ Index: linux-2.6.5-sles9/fs/ext3/super.c failed_mount3: Index: linux-2.6.5-sles9/fs/ext3/ioctl.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/ioctl.c 2004-11-09 02:15:44.610693264 +0300 -+++ linux-2.6.5-sles9/fs/ext3/ioctl.c 2004-11-09 02:23:52.991448104 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/ioctl.c 2005-02-23 01:01:42.887722224 +0300 ++++ linux-2.6.5-sles9/fs/ext3/ioctl.c 2005-02-23 01:02:37.412433208 +0300 @@ -124,6 +124,10 @@ err = ext3_change_inode_journal_flag(inode, jflag); return err; @@ -2507,8 +2548,8 @@ Index: linux-2.6.5-sles9/fs/ext3/ioctl.c return put_user(inode->i_generation, (int *) arg); Index: linux-2.6.5-sles9/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2004-11-09 02:22:58.767691368 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2004-11-09 02:25:17.238640584 +0300 +--- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2005-02-23 
01:02:35.823674736 +0300 ++++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2005-02-23 01:02:37.414432904 +0300 @@ -186,6 +186,7 @@ #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ @@ -2563,9 +2604,9 @@ Index: linux-2.6.5-sles9/include/linux/ext3_fs.h Index: linux-2.6.5-sles9/include/linux/ext3_extents.h =================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_extents.h 2003-01-30 13:24:37.000000000 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_extents.h 2004-11-09 02:23:21.606219384 +0300 -@@ -0,0 +1,252 @@ +--- linux-2.6.5-sles9.orig/include/linux/ext3_extents.h 2005-02-17 22:07:57.023609040 +0300 ++++ linux-2.6.5-sles9/include/linux/ext3_extents.h 2005-02-23 01:02:37.416432600 +0300 +@@ -0,0 +1,265 @@ +/* + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -2738,7 +2779,7 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h + */ +typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, + struct ext3_ext_path *, -+ struct ext3_extent *, int); ++ struct ext3_ext_cache *); + +#define EXT_CONTINUE 0 +#define EXT_BREAK 1 @@ -2746,7 +2787,6 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h + + +#define EXT_MAX_BLOCK 0xffffffff -+#define EXT_CACHE_MARK 0xffff + + +#define EXT_FIRST_EXTENT(__hdr__) \ @@ -2778,6 +2818,20 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h + +#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); + ++#define EXT_CHECK_PATH(tree,path) \ ++{ \ ++ int depth = EXT_DEPTH(tree); \ ++ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) (path)[depth].p_idx < \ ++ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_ext < \ ++ __PAGE_OFFSET && (path)[depth].p_ext != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) 
(path)[depth].p_bh < __PAGE_OFFSET \ ++ && depth != 0); \ ++ BUG_ON((path)[0].p_depth != depth); \ ++} ++ + +/* + * this structure is used to gather extents from the tree via ioctl @@ -2820,27 +2874,35 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h + Index: linux-2.6.5-sles9/include/linux/ext3_fs_i.h =================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_fs_i.h 2004-11-09 02:22:55.780145544 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_fs_i.h 2004-11-09 02:23:21.606219384 +0300 -@@ -128,6 +128,8 @@ +--- linux-2.6.5-sles9.orig/include/linux/ext3_fs_i.h 2005-02-23 01:01:52.425272296 +0300 ++++ linux-2.6.5-sles9/include/linux/ext3_fs_i.h 2005-02-23 01:45:55.611446920 +0300 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + struct reserve_window { + __u32 _rsv_start; /* First byte reserved */ +@@ -128,6 +129,8 @@ */ struct semaphore truncate_sem; struct inode vfs_inode; + -+ __u32 i_cached_extent[4]; ++ struct ext3_ext_cache i_cached_extent; }; #endif /* _LINUX_EXT3_FS_I */ %diffstat fs/ext3/Makefile | 2 - fs/ext3/extents.c | 2313 +++++++++++++++++++++++++++++++++++++++++++ + fs/ext3/extents.c | 2356 +++++++++++++++++++++++++++++++++++++++++++ fs/ext3/ialloc.c | 4 fs/ext3/inode.c | 29 fs/ext3/ioctl.c | 4 - fs/ext3/super.c | 17 - include/linux/ext3_extents.h | 252 ++++ - include/linux/ext3_fs.h | 15 - include/linux/ext3_fs_i.h | 2 - 9 files changed, 2630 insertions(+), 8 deletions(-) + fs/ext3/super.c | 15 + include/linux/ext3_extents.h | 265 ++++ + include/linux/ext3_fs.h | 17 + include/linux/ext3_fs_i.h | 3 + 9 files changed, 2687 insertions(+), 8 deletions(-) diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch index 363007f..d0ffc5c 100644 --- a/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch @@ -1,8 +1,8 @@ Index: 
linux-2.6.5-sles9/fs/ext3/mballoc.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/mballoc.c 2003-01-30 13:24:37.000000000 +0300 -+++ linux-2.6.5-sles9/fs/ext3/mballoc.c 2004-11-09 02:34:25.181340632 +0300 -@@ -0,0 +1,1441 @@ +--- linux-2.6.5-sles9.orig/fs/ext3/mballoc.c 2005-02-17 22:07:57.023609040 +0300 ++++ linux-2.6.5-sles9/fs/ext3/mballoc.c 2005-02-23 01:56:19.101662000 +0300 +@@ -0,0 +1,1835 @@ +/* + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -39,19 +39,29 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + +/* + * TODO: -+ * - do not scan from the beginning, try to remember first free block -+ * - mb_mark_used_* may allocate chunk right after splitting buddy ++ * - track min/max extents in each group for better group selection ++ * - is it worthwhile to use buddies directly if req is 2^N blocks? ++ * - mb_mark_used() may allocate chunk right after splitting buddy + * - special flag to advice allocator to look for requested + N blocks + * this may improve interaction between extents and mballoc ++ * - tree of groups sorted by number of free blocks ++ * - percpu reservation code (hotpath) ++ * - error handling + */ + +/* + * with AGRESSIVE_CHECK allocator runs consistency checks over -+ * structures. this checks slow things down a lot ++ * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* ++ * with MBALLOC_STATS allocator will collect stats that will be ++ * shown at umount. The collecting costs though! ++ */ ++#define MBALLOC_STATS ++ ++/* + */ +#define MB_DEBUG__ +#ifdef MB_DEBUG @@ -66,60 +76,75 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c +#define EXT3_BUDDY_FILE ".buddy" + +/* -+ * max. 
number of chunks to be tracked in ext3_free_extent struct ++ * How long mballoc can look for a best extent (in found extents) ++ */ ++#define EXT3_MB_MAX_TO_SCAN 100 ++ ++/* ++ * This structure is on-disk description of a group for mballoc ++ */ ++struct ext3_mb_group_descr { ++ __u16 mgd_first_free; /* first free block in the group */ ++ __u16 mgd_free; /* number of free blocks in the group */ ++ __u16 mgd_counters[16]; /* number of free blocks by order */ ++}; ++ ++/* ++ * This structure is header of mballoc's file + */ -+#define MB_ARR_SIZE 32 ++struct ext3_mb_grp_header { ++ __u32 mh_magic; ++}; ++ ++#define EXT3_MB_MAGIC_V1 0xbaad16fc ++ ++ ++struct ext3_free_extent { ++ __u16 fe_start; ++ __u16 fe_len; ++ __u16 fe_group; ++}; + +struct ext3_allocation_context { + struct super_block *ac_sb; + + /* search goals */ -+ int ac_g_group; -+ int ac_g_start; -+ int ac_g_len; -+ int ac_g_flags; ++struct ext3_free_extent ac_g_ex; + + /* the best found extent */ -+ int ac_b_group; -+ int ac_b_start; -+ int ac_b_len; ++ struct ext3_free_extent ac_b_ex; + + /* number of iterations done. 
we have to track to limit searching */ -+ int ac_repeats; -+ int ac_groups_scanned; -+ int ac_status; ++ unsigned long ac_ex_scanned; ++ __u16 ac_groups_scanned; ++ __u16 ac_found; ++ __u8 ac_status; ++ __u8 ac_flags; /* allocation hints */ ++ __u8 ac_repeats; +}; + +#define AC_STATUS_CONTINUE 1 +#define AC_STATUS_FOUND 2 -+ ++#define AC_STATUS_BREAK 3 + +struct ext3_buddy { -+ void *bd_bitmap; -+ void *bd_buddy; -+ int bd_blkbits; + struct buffer_head *bd_bh; + struct buffer_head *bd_bh2; + struct ext3_buddy_group_blocks *bd_bd; + struct super_block *bd_sb; ++ __u16 bd_blkbits; ++ __u16 bd_group; +}; -+ -+struct ext3_free_extent { -+ int fe_start; -+ int fe_len; -+ unsigned char fe_orders[MB_ARR_SIZE]; -+ unsigned char fe_nums; -+ unsigned char fe_back; -+}; ++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bh->b_data) ++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_bh2->b_data) + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + -+ +int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); +struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); -+void ext3_free_blocks_old(handle_t *, struct inode *, unsigned long, unsigned long); +int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); +int ext3_mb_reserve_blocks(struct super_block *, int); ++void ext3_mb_release_blocks(struct super_block *, int); +void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); +void ext3_mb_free_committed_blocks(struct super_block *); + @@ -145,21 +170,33 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c +static inline void mb_set_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); ++ __set_bit(bit, addr); ++} ++ ++static inline void mb_set_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); + set_bit(bit, addr); +} + +static inline void mb_clear_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); ++ __clear_bit(bit, addr); ++} ++ ++static inline void 
mb_clear_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); + clear_bit(bit, addr); +} + +static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) +{ + int i = 1; -+ void *bb; ++ char *bb; + -+ J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy); ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); + J_ASSERT(max != NULL); + + if (order > e3b->bd_blkbits + 1) @@ -168,19 +205,21 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + /* at order 0 we see each particular block */ + *max = 1 << (e3b->bd_blkbits + 3); + if (order == 0) -+ return e3b->bd_bitmap; ++ return EXT3_MB_BITMAP(e3b); + -+ bb = e3b->bd_buddy; ++ bb = EXT3_MB_BUDDY(e3b); + *max = *max >> 1; + while (i < order) { + bb += 1 << (e3b->bd_blkbits - i); + i++; + *max = *max >> 1; + } ++ J_ASSERT((unsigned) (bb - (char *) EXT3_MB_BUDDY(e3b)) < ++ e3b->bd_sb->s_blocksize); + return bb; +} + -+static int ext3_mb_load_desc(struct super_block *sb, int group, ++static int ext3_mb_load_buddy(struct super_block *sb, int group, + struct ext3_buddy *e3b) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); @@ -191,7 +230,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + /* load bitmap */ + e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap); + if (e3b->bd_bh == NULL) { -+ ext3_error(sb, "ext3_mb_load_desc", ++ ext3_error(sb, "ext3_mb_load_buddy", + "can't get block for buddy bitmap\n"); + goto out; + } @@ -204,7 +243,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + /* load buddy */ + e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy); + if (e3b->bd_bh2 == NULL) { -+ ext3_error(sb, "ext3_mb_load_desc", ++ ext3_error(sb, "ext3_mb_load_buddy", + "can't get block for buddy bitmap\n"); + goto out; + } @@ -214,11 +253,10 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + } + J_ASSERT(buffer_uptodate(e3b->bd_bh2)); + -+ e3b->bd_bitmap = e3b->bd_bh->b_data; -+ e3b->bd_buddy = e3b->bd_bh2->b_data; + e3b->bd_blkbits = sb->s_blocksize_bits; + e3b->bd_bd = 
sbi->s_buddy_blocks[group]; + e3b->bd_sb = sb; ++ e3b->bd_group = group; + + return 0; +out: @@ -277,7 +315,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + + for (j = 0; j < (1 << order); j++) { + k = (i * (1 << order)) + j; -+ J_ASSERT(mb_test_bit(k, e3b->bd_bitmap)); ++ J_ASSERT(mb_test_bit(k, EXT3_MB_BITMAP(e3b))); + } + count++; + } @@ -319,10 +357,10 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + int order = 1; + void *bb; + -+ J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy); ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); + J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); + -+ bb = e3b->bd_buddy; ++ bb = EXT3_MB_BUDDY(e3b); + while (order <= e3b->bd_blkbits + 1) { + block = block >> 1; + if (mb_test_bit(block, bb)) { @@ -348,7 +386,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + cur += 32; + continue; + } -+ mb_clear_bit(cur, bm); ++ mb_clear_bit_atomic(cur, bm); + cur++; + } +} @@ -366,7 +404,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + cur += 32; + continue; + } -+ mb_set_bit(cur, bm); ++ mb_set_bit_atomic(cur, bm); + cur++; + } +} @@ -377,12 +415,17 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + void *buddy, *buddy2; + + mb_check_buddy(e3b); ++ ++ e3b->bd_bd->bb_free += count; ++ if (first < e3b->bd_bd->bb_first_free) ++ e3b->bd_bd->bb_first_free = first; ++ + while (count-- > 0) { + block = first++; + order = 0; + -+ J_ASSERT(!mb_test_bit(block, e3b->bd_bitmap)); -+ mb_set_bit(block, e3b->bd_bitmap); ++ J_ASSERT(!mb_test_bit(block, EXT3_MB_BITMAP(e3b))); ++ mb_set_bit(block, EXT3_MB_BITMAP(e3b)); + e3b->bd_bd->bb_counters[order]++; + + /* start of the buddy */ @@ -422,64 +465,23 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + return 0; +} + -+/* -+ * returns 1 if out extent is enough to fill needed space -+ */ -+int mb_make_backward_extent(struct ext3_free_extent *in, -+ struct ext3_free_extent *out, int needed) ++static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, ++ int needed, struct ext3_free_extent *ex) +{ -+ int i; -+ 
-+ J_ASSERT(in); -+ J_ASSERT(out); -+ J_ASSERT(in->fe_nums < MB_ARR_SIZE); -+ -+ out->fe_len = 0; -+ out->fe_start = in->fe_start + in->fe_len; -+ out->fe_nums = 0; -+ -+ /* for single-chunk extent we need not back order -+ * also, if an extent doesn't fill needed space -+ * then it makes no sense to try back order becase -+ * if we select this extent then it'll be use as is */ -+ if (in->fe_nums < 2 || in->fe_len < needed) -+ return 0; -+ -+ i = in->fe_nums - 1; -+ while (i >= 0 && out->fe_len < needed) { -+ out->fe_len += (1 << in->fe_orders[i]); -+ out->fe_start -= (1 << in->fe_orders[i]); -+ i--; -+ } -+ /* FIXME: in some situation fe_orders may be too small to hold -+ * all the buddies */ -+ J_ASSERT(out->fe_len >= needed); -+ -+ for (i++; i < in->fe_nums; i++) -+ out->fe_orders[out->fe_nums++] = in->fe_orders[i]; -+ J_ASSERT(out->fe_nums < MB_ARR_SIZE); -+ out->fe_back = 1; -+ -+ return 1; -+} -+ -+int mb_find_extent(struct ext3_buddy *e3b, int order, int block, -+ int needed, struct ext3_free_extent *ex) -+{ -+ int space = needed; + int next, max, ord; + void *buddy; + + J_ASSERT(ex != NULL); + -+ ex->fe_nums = 0; -+ ex->fe_len = 0; -+ + buddy = mb_find_buddy(e3b, order, &max); + J_ASSERT(buddy); + J_ASSERT(block < max); -+ if (!mb_test_bit(block, buddy)) -+ goto nofree; ++ if (!mb_test_bit(block, buddy)) { ++ ex->fe_len = 0; ++ ex->fe_start = 0; ++ ex->fe_group = 0; ++ return 0; ++ } + + if (order == 0) { + /* find actual order */ @@ -487,64 +489,55 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + block = block >> order; + } + -+ ex->fe_orders[ex->fe_nums++] = order; + ex->fe_len = 1 << order; + ex->fe_start = block << order; -+ ex->fe_back = 0; -+ -+ while ((space = space - (1 << order)) > 0) { ++ ex->fe_group = e3b->bd_group; + -+ buddy = mb_find_buddy(e3b, order, &max); -+ J_ASSERT(buddy); ++ while ((buddy = mb_find_buddy(e3b, order, &max))) { + + if (block + 1 >= max) + break; + + next = (block + 1) * (1 << order); -+ if (!mb_test_bit(next, 
e3b->bd_bitmap)) ++ if (!mb_test_bit(next, EXT3_MB_BITMAP(e3b))) + break; + + ord = mb_find_order_for_block(e3b, next); + -+ if ((1 << ord) >= needed) { -+ /* we dont want to coalesce with self-enough buddies */ -+ break; -+ } + order = ord; + block = next >> order; + ex->fe_len += 1 << order; -+ -+ if (ex->fe_nums < MB_ARR_SIZE) -+ ex->fe_orders[ex->fe_nums++] = order; + } + -+nofree: + J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); + return ex->fe_len; +} + -+static int mb_mark_used_backward(struct ext3_buddy *e3b, -+ struct ext3_free_extent *ex, int len) ++static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) +{ -+ int start = ex->fe_start, len0 = len; ++ int start = ex->fe_start; ++ int len = ex->fe_len; + int ord, mlen, max, cur; ++ int len0 = len; + void *buddy; + -+ start = ex->fe_start + ex->fe_len - 1; ++ e3b->bd_bd->bb_free -= len; ++ if (e3b->bd_bd->bb_first_free == start) ++ e3b->bd_bd->bb_first_free += len; ++ + while (len) { + ord = mb_find_order_for_block(e3b, start); -+ if (((start >> ord) << ord) == (start - (1 << ord) + 1) && -+ len >= (1 << ord)) { ++ ++ if (((start >> ord) << ord) == start && len >= (1 << ord)) { + /* the whole chunk may be allocated at once! */ + mlen = 1 << ord; + buddy = mb_find_buddy(e3b, ord, &max); + J_ASSERT((start >> ord) < max); + mb_clear_bit(start >> ord, buddy); + e3b->bd_bd->bb_counters[ord]--; -+ start -= mlen; ++ start += mlen; + len -= mlen; + J_ASSERT(len >= 0); -+ J_ASSERT(start >= 0); + continue; + } + @@ -564,158 +557,218 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + } + + /* now drop all the bits in bitmap */ -+ mb_clear_bits(e3b->bd_bitmap, ex->fe_start + ex->fe_len - len0, len0); ++ mb_clear_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); + + mb_check_buddy(e3b); + + return 0; +} + -+static int mb_mark_used_forward(struct ext3_buddy *e3b, -+ struct ext3_free_extent *ex, int len) ++/* ++ * Must be called under group lock! 
++ */ ++static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) +{ -+ int start = ex->fe_start, len0 = len; -+ int ord, mlen, max, cur; -+ void *buddy; ++ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ++ mb_mark_used(e3b, &ac->ac_b_ex); ++ ac->ac_status = AC_STATUS_FOUND; ++} + -+ while (len) { -+ ord = mb_find_order_for_block(e3b, start); ++/* ++ * The routine checks whether found extent is good enough. If it is, ++ * then the extent gets marked used and flag is set to the context ++ * to stop scanning. Otherwise, the extent is compared with the ++ * previous found extent and if new one is better, then it's stored ++ * in the context. Later, the best found extent will be used, if ++ * mballoc can't find good enough extent. ++ * ++ * FIXME: real allocation policy is to be designed yet! ++ */ ++static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, ++ struct ext3_free_extent *ex, ++ struct ext3_buddy *e3b) ++{ ++ int factor = EXT3_SB(ac->ac_sb)->s_mb_factor; ++ struct ext3_free_extent *bex = &ac->ac_b_ex; ++ int diff = ac->ac_g_ex.fe_len - ex->fe_len; + -+ if (((start >> ord) << ord) == start && len >= (1 << ord)) { -+ /* the whole chunk may be allocated at once! 
*/ -+ mlen = 1 << ord; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ J_ASSERT((start >> ord) < max); -+ mb_clear_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; -+ start += mlen; -+ len -= mlen; -+ J_ASSERT(len >= 0); -+ continue; -+ } ++ J_ASSERT(ex->fe_len > 0); ++ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); + -+ /* we have to split large buddy */ -+ J_ASSERT(ord > 0); -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_clear_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; ++ ac->ac_found++; + -+ ord--; -+ cur = (start >> ord) & ~1U; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_set_bit(cur, buddy); -+ mb_set_bit(cur + 1, buddy); -+ e3b->bd_bd->bb_counters[ord]++; -+ e3b->bd_bd->bb_counters[ord]++; ++ /* ++ * The special case - take what you catch first ++ */ ++ if (ac->ac_flags & EXT3_MB_HINT_FIRST) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; + } + -+ /* now drop all the bits in bitmap */ -+ mb_clear_bits(e3b->bd_bitmap, ex->fe_start, len0); ++ /* ++ * Let's check whether the chuck is good enough ++ */ ++ if (ex->fe_len >= ac->ac_g_ex.fe_len) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } + -+ mb_check_buddy(e3b); ++ /* ++ * If the request is vey large, then it makes sense to use large ++ * chunks for it. Even if they don't satisfy whole request. ++ */ ++ if (ex->fe_len > 1000) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } + -+ return 0; ++ /* ++ * Sometimes it's worty to take close chunk ++ */ ++ if (factor && (ac->ac_g_ex.fe_len * 100) / (diff * 100) >= factor) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * If this is first found extent, just store it in the context ++ */ ++ if (bex->fe_len == 0) { ++ *bex = *ex; ++ return; ++ } ++ ++ /* ++ * If new found extent is better, store it in the context ++ * FIXME: possible the policy should be more complex? 
++ */ ++ if (ex->fe_len > bex->fe_len) { ++ *bex = *ex; ++ } ++ ++ /* ++ * We don't want to scan for a whole year ++ */ ++ if (ac->ac_found > EXT3_MB_MAX_TO_SCAN) ++ ac->ac_status = AC_STATUS_BREAK; +} + -+int inline mb_mark_used(struct ext3_buddy *e3b, -+ struct ext3_free_extent *ex, int len) ++static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) +{ -+ int err; ++ struct ext3_free_extent ex = ac->ac_b_ex; ++ int group = ex.fe_group, max, err; + -+ J_ASSERT(ex); -+ if (ex->fe_back == 0) -+ err = mb_mark_used_forward(e3b, ex, len); -+ else -+ err = mb_mark_used_backward(e3b, ex, len); -+ return err; ++ J_ASSERT(ex.fe_len > 0); ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); ++ ++ if (max > 0) ++ ext3_mb_use_best_found(ac, e3b); ++ ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ if (ac->ac_status == AC_STATUS_FOUND) ++ ext3_mb_dirty_buddy(e3b); ++ ext3_mb_release_desc(e3b); ++ ++ return 0; +} + -+int ext3_mb_new_in_group(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b, int group) ++static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) +{ -+ struct super_block *sb = ac->ac_sb; -+ int err, gorder, max, i; -+ struct ext3_free_extent curex; -+ -+ /* let's know order of allocation */ -+ gorder = 0; -+ while (ac->ac_g_len > (1 << gorder)) -+ gorder++; -+ -+ if ((ac->ac_g_flags & 1) && ac->ac_g_group == group) { -+ /* someone asks for space at this specified block -+ * probably he wants to merge it into existing extent */ -+ if (mb_test_bit(ac->ac_g_start, e3b->bd_bitmap)) { -+ /* good. 
at least one block is free */ -+ max = mb_find_extent(e3b, 0, ac->ac_g_start, -+ ac->ac_g_len, &curex); -+ max = min(curex.fe_len, ac->ac_g_len); -+ mb_mark_used(e3b, &curex, max); -+ -+ ac->ac_b_group = group; -+ ac->ac_b_start = curex.fe_start; -+ ac->ac_b_len = max; -+ ac->ac_status = AC_STATUS_FOUND; -+ err = 0; -+ goto out; -+ } -+ /* don't try to find goal anymore */ -+ ac->ac_g_flags &= ~1; ++ int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_free_extent ex; ++ ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, ++ ac->ac_g_ex.fe_len, &ex); ++ ++ if (max > 0) { ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); + } ++ ext3_unlock_group(ac->ac_sb, group); + -+ i = 0; -+ while (1) { -+ i = find_next_bit(e3b->bd_bitmap, sb->s_blocksize * 8, i); -+ if (i >= sb->s_blocksize * 8) -+ break; ++ if (ac->ac_status == AC_STATUS_FOUND) ++ ext3_mb_dirty_buddy(e3b); ++ ext3_mb_release_desc(e3b); + -+ max = mb_find_extent(e3b, 0, i, ac->ac_g_len, &curex); -+ if (max >= ac->ac_g_len) { -+ max = min(curex.fe_len, ac->ac_g_len); -+ mb_mark_used(e3b, &curex, max); -+ -+ ac->ac_b_group = group; -+ ac->ac_b_start = curex.fe_start; -+ ac->ac_b_len = max; -+ ac->ac_status = AC_STATUS_FOUND; ++ return 0; ++} ++/* ++ * The routine scans the group and measures all found extents. ++ * In order to optimize scanning, caller must pass number of ++ * free blocks in the group, so the routine can upper limit. 
++ */ ++static void ext3_mb_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ int i, free; ++ ++ free = e3b->bd_bd->bb_free; ++ J_ASSERT(free > 0); ++ ++ i = e3b->bd_bd->bb_first_free; ++ ++ while (free && ac->ac_status != AC_STATUS_FOUND) { ++ i = find_next_bit(bitmap, sb->s_blocksize * 8, i); ++ if (i >= sb->s_blocksize * 8) { ++ J_ASSERT(free == 0); + break; + } -+ i += max; -+ } + -+ return 0; ++ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(free >= ex.fe_len); + -+out: -+ return err; ++ ext3_mb_measure_extent(ac, &ex, e3b); ++ ++ i += ex.fe_len; ++ free -= ex.fe_len; ++ } +} + -+int mb_good_group(struct ext3_allocation_context *ac, int group, int cr) ++static int ext3_mb_good_group(struct ext3_allocation_context *ac, ++ int group, int cr) +{ -+ struct ext3_group_desc *gdp; -+ int free_blocks; ++ int free; + -+ gdp = ext3_get_group_desc(ac->ac_sb, group, NULL); -+ if (!gdp) -+ return 0; -+ free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); -+ if (free_blocks == 0) -+ return 0; ++ J_ASSERT(cr >= 0 && cr < 3); + -+ /* someone wants this block very much */ -+ if ((ac->ac_g_flags & 1) && ac->ac_g_group == group) -+ return 1; ++ free = EXT3_SB(ac->ac_sb)->s_buddy_blocks[group]->bb_free; ++ if (free == 0) ++ return 0; + -+ /* FIXME: I'd like to take fragmentation into account here */ + if (cr == 0) { -+ if (free_blocks >= ac->ac_g_len >> 1) ++ if (free >= ac->ac_g_ex.fe_len >> 1) + return 1; + } else if (cr == 1) { -+ if (free_blocks >= ac->ac_g_len >> 2) ++ if (free >= ac->ac_g_ex.fe_len >> 2) + return 1; + } else if (cr == 2) { + return 1; -+ } else { -+ BUG(); + } + return 0; +} @@ -759,7 +812,13 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + sbi = EXT3_SB(sb); + es = EXT3_SB(sb)->s_es; + -+ if (!(flags & 2)) { ++ /* ++ * We can't allocate > group size ++ */ ++ if (*len >= 
EXT3_BLOCKS_PER_GROUP(sb) - 10) ++ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10; ++ ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { + /* someone asks for non-reserved blocks */ + BUG_ON(*len > 1); + err = ext3_mb_reserve_blocks(sb, 1); @@ -790,62 +849,137 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + EXT3_BLOCKS_PER_GROUP(sb)); + + /* set up allocation goals */ -+ ac.ac_b_group = ac.ac_b_start = ac.ac_b_len = 0; -+ ac.ac_status = 0; ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; + ac.ac_groups_scanned = 0; ++ ac.ac_ex_scanned = 0; ++ ac.ac_found = 0; + ac.ac_sb = inode->i_sb; -+ ac.ac_g_group = group; -+ ac.ac_g_start = block; -+ ac.ac_g_len = *len; -+ ac.ac_g_flags = flags; ++ ac.ac_g_ex.fe_group = group; ++ ac.ac_g_ex.fe_start = block; ++ ac.ac_g_ex.fe_len = *len; ++ ac.ac_flags = flags; ++ ++ /* ++ * Sometimes, caller may want to merge even small number ++ * of blocks to an existing extent ++ */ ++ if (ac.ac_flags & EXT3_MB_HINT_MERGE) { ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; ++ } + -+ /* loop over the groups */ -+ for (cr = 0; cr < 3 && ac.ac_status != AC_STATUS_FOUND; cr++) { ++ /* ++ * FIXME ++ * If requested chunk is power of 2 length, we can try ++ * to exploit buddy nature to speed allocation up ++ */ ++ ++ ++ /* ++ * Let's just scan groups to find more-less suitable blocks ++ */ ++ cr = 0; ++repeat: ++ for (; cr < 3 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { + for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { + if (group == EXT3_SB(sb)->s_groups_count) + group = 0; + + /* check is group good for our criteries */ -+ if (!mb_good_group(&ac, group, cr)) ++ if (!ext3_mb_good_group(&ac, group, cr)) + continue; + -+ err = ext3_mb_load_desc(ac.ac_sb, group, &e3b); ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); + if (err) + goto out_err; + + ext3_lock_group(sb, group); -+ if (!mb_good_group(&ac, 
group, cr)) { ++ if (!ext3_mb_good_group(&ac, group, cr)) { + /* someone did allocation from this group */ + ext3_unlock_group(sb, group); + ext3_mb_release_desc(&e3b); + continue; + } + -+ err = ext3_mb_new_in_group(&ac, &e3b, group); ++ ext3_mb_scan_group(&ac, &e3b); + ext3_unlock_group(sb, group); ++ + if (ac.ac_status == AC_STATUS_FOUND) + ext3_mb_dirty_buddy(&e3b); + ext3_mb_release_desc(&e3b); ++ + if (err) + goto out_err; -+ if (ac.ac_status == AC_STATUS_FOUND) ++ if (ac.ac_status != AC_STATUS_CONTINUE) + break; + } + } + ++ if (ac.ac_status == AC_STATUS_BREAK && ++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { ++ /* ++ * We've been searching too long. Let's try to allocate ++ * the best chunk we've found so far ++ */ ++ printk(KERN_ERR "EXT3-fs: too long searching (%d/%d)\n", ++ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len); ++ ext3_mb_try_best_found(&ac, &e3b); ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * Someone more lucky has already allocated it. ++ * The only thing we can do is just take first ++ * found block(s) ++ */ ++ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n"); ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_flags |= EXT3_MB_HINT_FIRST; ++ cr = 2; ++ goto repeat; ++ } ++ } ++ + if (ac.ac_status != AC_STATUS_FOUND) { -+ /* unfortunately, we can't satisfy this request */ -+ J_ASSERT(ac.ac_b_len == 0); ++ /* ++ * We aren't lucky definitely ++ */ ++ J_ASSERT(ac.ac_b_ex.fe_len == 0); + DQUOT_FREE_BLOCK(inode, *len); + *errp = -ENOSPC; + block = 0; ++#if 1 ++ printk(KERN_ERR "EXT3-fs: cant allocate: status %d, flags %d\n", ++ ac.ac_status, ac.ac_flags); ++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n", ++ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, ++ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); ++ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", ++ sbi->s_blocks_reserved, ac.ac_found); ++ printk("EXT3-fs: groups: "); ++ for (i = 0; i < 
EXT3_SB(sb)->s_groups_count; i++) ++ printk("%d: %d ", i, ++ sbi->s_buddy_blocks[i]->bb_free); ++ printk("\n"); ++#endif + goto out; + } + ++found: ++ J_ASSERT(ac.ac_b_ex.fe_len > 0); ++ + /* good news - free block(s) have been found. now it's time + * to mark block(s) in good old journaled bitmap */ -+ block = ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + ac.ac_b_start + le32_to_cpu(es->s_first_data_block); ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); + + /* we made a desicion, now mark found blocks in good old + * bitmap to be journaled */ @@ -853,7 +987,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + ext3_debug("using block group %d(%d)\n", + ac.ac_b_group.group, gdp->bg_free_blocks_count); + -+ bitmap_bh = read_block_bitmap(sb, ac.ac_b_group); ++ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group); + if (!bitmap_bh) { + *errp = -EIO; + goto out_err; @@ -865,7 +999,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + goto out_err; + } + -+ gdp = ext3_get_group_desc(sb, ac.ac_b_group, &gdp_bh); ++ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh); + if (!gdp) { + *errp = -EIO; + goto out_err; @@ -875,8 +1009,9 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + if (err) + goto out_err; + -+ block = ac.ac_b_start + ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + le32_to_cpu(es->s_first_data_block); ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); + + if (block == le32_to_cpu(gdp->bg_block_bitmap) || + block == le32_to_cpu(gdp->bg_inode_bitmap) || @@ -885,18 +1020,18 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + ext3_error(sb, "ext3_new_block", + "Allocating block in system zone - " + "block = %u", block); -+#if 0 ++#if AGGRESSIVE_CHECK + for (i = 0; i < ac.ac_b_len; i++) -+ J_ASSERT(!mb_test_bit(ac.ac_b_start + i, bitmap_bh->b_data)); ++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, 
bitmap_bh->b_data)); +#endif -+ mb_set_bits(bitmap_bh->b_data, ac.ac_b_start, ac.ac_b_len); ++ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); + -+ ext3_lock_group(sb, ac.ac_b_group); ++ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); + gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - -+ ac.ac_b_len); -+ ext3_unlock_group(sb, ac.ac_b_group); -+ percpu_counter_mod(&sbi->s_freeblocks_counter, -ac.ac_b_len); ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) ++ - ac.ac_b_ex.fe_len); ++ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len); + + err = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (err) @@ -910,10 +1045,11 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + brelse(bitmap_bh); + + /* drop non-allocated, but dquote'd blocks */ -+ J_ASSERT(*len >= ac.ac_b_len); -+ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_len); ++ J_ASSERT(*len >= ac.ac_b_ex.fe_len); ++ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len); + -+ *len = ac.ac_b_len; ++ *len = ac.ac_b_ex.fe_len; ++ J_ASSERT(*len > 0); + J_ASSERT(block != 0); + goto out; + @@ -928,7 +1064,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + *errp = err; + block = 0; +out: -+ if (!(flags & 2)) { ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { + /* block wasn't reserved before and we reserved it + * at the beginning of allocation. 
it doesn't matter + * whether we allocated anything or we failed: time @@ -937,42 +1073,175 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + * path only, here is single block always */ + ext3_mb_release_blocks(sb, 1); + } ++#ifdef MBALLOC_STATS ++ if (ac.ac_g_ex.fe_len > 1) { ++ spin_lock(&sbi->s_bal_lock); ++ sbi->s_bal_reqs++; ++ sbi->s_bal_allocated += *len; ++ if (*len >= ac.ac_g_ex.fe_len) ++ sbi->s_bal_success++; ++ sbi->s_bal_ex_scanned += ac.ac_found; ++ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && ++ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) ++ sbi->s_bal_goals++; ++ if (ac.ac_found > EXT3_MB_MAX_TO_SCAN) ++ sbi->s_bal_breaks++; ++ spin_unlock(&sbi->s_bal_lock); ++ } ++#endif + return block; +} + -+int ext3_mb_generate_buddy(struct super_block *sb, int group) ++int ext3_mb_get_descr_loc(struct ext3_buddy *e3b, struct buffer_head **bh, ++ struct ext3_mb_group_descr **grp) +{ ++ struct super_block *sb = e3b->bd_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int descr_per_block, err, offset; ++ struct ext3_mb_grp_header *hdr; ++ unsigned long block; ++ ++ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) ++ / sizeof(struct ext3_mb_group_descr); ++ block = e3b->bd_group / descr_per_block; ++ *bh = ext3_bread(NULL, sbi->s_buddy, block, 0, &err); ++ if (*bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk descr for group %d: %d\n", ++ e3b->bd_group, err); ++ return err; ++ } ++ ++ hdr = (struct ext3_mb_grp_header *) (*bh)->b_data; ++ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { ++ printk(KERN_ERR "EXT3-fs: invalid magic in group %d!\n", ++ e3b->bd_group); ++ brelse(*bh); ++ *bh = NULL; ++ return -EIO; ++ } ++ ++ offset = e3b->bd_group % descr_per_block ++ * sizeof(struct ext3_mb_group_descr) ++ + sizeof(struct ext3_mb_grp_header); ++ *grp = (struct ext3_mb_group_descr *) ((*bh)->b_data + offset); ++ ++ return 0; ++} ++ ++int ext3_mb_load_descr(struct ext3_buddy *e3b) ++{ ++ struct ext3_mb_group_descr *grp; ++ struct ext3_group_desc 
*gdp; + struct buffer_head *bh; -+ int i, err, count = 0; -+ struct ext3_buddy e3b; ++ int err, i; ++ ++ err = ext3_mb_get_descr_loc(e3b, &bh, &grp); ++ if (err) ++ return err; + -+ err = ext3_mb_load_desc(sb, group, &e3b); ++ e3b->bd_bd->bb_first_free = grp->mgd_first_free; ++ e3b->bd_bd->bb_free = grp->mgd_free; ++ for (i = 0; i < e3b->bd_blkbits; i++) { ++ J_ASSERT(i < 16); ++ e3b->bd_bd->bb_counters[i] = grp->mgd_counters[i]; ++ } ++ brelse(bh); ++ ++ /* additional checks against old group descriptor */ ++ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); ++ if (!gdp) ++ return -EIO; ++ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { ++ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", ++ e3b->bd_group, e3b->bd_bd->bb_free, ++ le16_to_cpu(gdp->bg_free_blocks_count)); ++ BUG(); ++ return -ENODATA; ++ } ++ ++ return 0; ++} ++ ++ ++int ext3_mb_update_descr(struct ext3_buddy *e3b) ++{ ++ struct ext3_mb_group_descr *grp; ++ struct ext3_group_desc *ogdp; ++ struct buffer_head *bh; ++ handle_t *handle; ++ int err, i; ++ ++ /* additional checks against old group descriptor */ ++ ogdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); ++ if (!ogdp) ++ return -EIO; ++ if (e3b->bd_bd->bb_free != le16_to_cpu(ogdp->bg_free_blocks_count)) { ++ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", ++ e3b->bd_group, e3b->bd_bd->bb_free, ++ le16_to_cpu(ogdp->bg_free_blocks_count)); ++ BUG(); ++ return -ENODATA; ++ } ++ ++ err = ext3_mb_get_descr_loc(e3b, &bh, &grp); + if (err) ++ return err; ++ ++ handle = journal_start(EXT3_SB(e3b->bd_sb)->s_journal, 1); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ handle = NULL; + goto out; -+ memset(e3b.bd_bh->b_data, 0, sb->s_blocksize); -+ memset(e3b.bd_bh2->b_data, 0, sb->s_blocksize); ++ } + -+ bh = read_block_bitmap(sb, group); -+ if (bh == NULL) { -+ err = -EIO; -+ goto out2; ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto out; ++ 
grp->mgd_first_free = e3b->bd_bd->bb_first_free; ++ grp->mgd_free = e3b->bd_bd->bb_free; ++ for (i = 0; i < e3b->bd_blkbits; i++) { ++ J_ASSERT(i < 16); ++ grp->mgd_counters[i] = e3b->bd_bd->bb_counters[i]; + } ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ goto out; ++ err = 0; ++out: ++ brelse(bh); ++ if (handle) ++ ext3_journal_stop(handle); ++ return err; ++} ++ ++int ext3_mb_generate_buddy(struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = e3b->bd_sb; ++ struct buffer_head *bh; ++ int i, count = 0; ++ ++ memset(e3b->bd_bh->b_data, 0, sb->s_blocksize); ++ memset(e3b->bd_bh2->b_data, 0, sb->s_blocksize); ++ ++ bh = read_block_bitmap(sb, e3b->bd_group); ++ if (bh == NULL) ++ return -EIO; ++ ++ /* mb_free_blocks will set real free */ ++ e3b->bd_bd->bb_first_free = 1 << 15; + + /* loop over the blocks, and create buddies for free ones */ + for (i = 0; i < sb->s_blocksize * 8; i++) { + if (!mb_test_bit(i, (void *) bh->b_data)) { -+ mb_free_blocks(&e3b, i, 1); ++ mb_free_blocks(e3b, i, 1); + count++; + } + } + brelse(bh); -+ mb_check_buddy(&e3b); -+ ext3_mb_dirty_buddy(&e3b); ++ mb_check_buddy(e3b); ++ ext3_mb_dirty_buddy(e3b); + -+out2: -+ ext3_mb_release_desc(&e3b); -+out: -+ return err; ++ return 0; +} + +EXPORT_SYMBOL(ext3_mb_new_blocks); @@ -981,83 +1250,143 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \ + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS) + -+int ext3_mb_init_backend(struct super_block *sb) ++int ext3_mb_init_backend(struct super_block *sb, int *created) +{ ++ int err, i, len, descr_per_block, buddy_offset, size; + struct inode *root = sb->s_root->d_inode; + struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_grp_header *hdr; ++ struct buffer_head *bh = NULL; ++ unsigned long block; + struct dentry *db; ++ handle_t *handle; + tid_t target; -+ int err, i; + -+ sbi->s_buddy_blocks = kmalloc(sizeof(struct ext3_buddy_group_blocks *) * -+ sbi->s_groups_count, GFP_KERNEL); 
++ *created = 0; ++ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count; ++ sbi->s_buddy_blocks = kmalloc(len, GFP_KERNEL); + if (sbi->s_buddy_blocks == NULL) { -+ printk("EXT3-fs: can't allocate mem for buddy maps\n"); ++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); + return -ENOMEM; + } -+ memset(sbi->s_buddy_blocks, 0, -+ sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count); ++ memset(sbi->s_buddy_blocks, 0, len); + sbi->s_buddy = NULL; + + down(&root->i_sem); -+ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, -+ strlen(EXT3_BUDDY_FILE)); ++ len = strlen(EXT3_BUDDY_FILE); ++ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, len); + if (IS_ERR(db)) { + err = PTR_ERR(db); -+ printk("EXT3-fs: can't lookup buddy file: %d\n", err); ++ printk(KERN_ERR "EXT3-fs: cant lookup buddy: %d\n", err); ++ up(&root->i_sem); + goto out; + } + -+ if (db->d_inode != NULL) { -+ sbi->s_buddy = igrab(db->d_inode); -+ goto map; ++ if (db->d_inode == NULL) { ++ err = ext3_create(root, db, S_IFREG, NULL); ++ if (err) { ++ printk(KERN_ERR "EXT3-fs: cant create buddy: %d\n", err); ++ up(&root->i_sem); ++ goto out; ++ } ++ db->d_inode->i_flags |= S_IMMUTABLE | S_NOATIME; ++ *created = 1; ++ printk("EXT3-fs: no buddy file, regenerate\n"); ++ } ++ up(&root->i_sem); ++ sbi->s_buddy = igrab(db->d_inode); ++ ++ /* calculate needed size */ ++ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) ++ / sizeof(struct ext3_mb_group_descr); ++ buddy_offset = (sbi->s_groups_count + descr_per_block - 1) ++ / descr_per_block; ++ len = sbi->s_groups_count * sb->s_blocksize * 2 + ++ buddy_offset * sb->s_blocksize; ++ if (len != i_size_read(sbi->s_buddy)) { ++ printk("EXT3-fs: wrong i_size (%u != %u), regenerate\n", ++ (unsigned) len, (unsigned) i_size_read(sbi->s_buddy)); ++ *created = 1; + } + -+ err = ext3_create(root, db, S_IFREG, NULL); -+ if (err) { -+ printk("error while creation buddy file: %d\n", err); -+ } else { -+ sbi->s_buddy = 
igrab(db->d_inode); ++ /* read/create mb group descriptors */ ++ for (i = 0; i < buddy_offset; i++) { ++ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); ++ if (IS_ERR(handle)) { ++ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); ++ err = PTR_ERR(handle); ++ goto err_out; ++ } ++ ++ bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err); ++ if (bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err); ++ goto err_out; ++ } ++ hdr = (struct ext3_mb_group_hdr *) bh->b_data; ++ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto err_out; ++ *created = 1; ++ printk("EXT3-fs: invalid header 0x%x in %d, regenerate\n", hdr->mh_magic, i); ++ hdr->mh_magic = EXT3_MB_MAGIC_V1; ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ goto err_out; ++ } ++ brelse(bh); ++ ext3_journal_stop(handle); + } + -+map: ++ len = sizeof(struct ext3_buddy_group_blocks); ++ len += sizeof(unsigned) * (sb->s_blocksize_bits + 2); + for (i = 0; i < sbi->s_groups_count; i++) { -+ struct buffer_head *bh = NULL; -+ handle_t *handle; + -+ sbi->s_buddy_blocks[i] = -+ kmalloc(sizeof(struct ext3_buddy_group_blocks), -+ GFP_KERNEL); ++ sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL); + if (sbi->s_buddy_blocks[i] == NULL) { -+ printk("EXT3-fs: can't allocate mem for buddy\n"); ++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); + err = -ENOMEM; + goto out2; + } ++ memset(sbi->s_buddy_blocks[i], 0, len); + + handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); + if (IS_ERR(handle)) { ++ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); + err = PTR_ERR(handle); + goto out2; + } + + /* allocate block for bitmap */ -+ bh = ext3_getblk(handle, sbi->s_buddy, i * 2, 1, &err); ++ block = buddy_offset + i * 2; ++ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); + if (bh == NULL) { -+ printk("can't get block for buddy bitmap: %d\n", err); ++ printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", 
err); + goto out2; + } + sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr; + brelse(bh); + + /* allocate block for buddy */ -+ bh = ext3_getblk(handle, sbi->s_buddy, i * 2 + 1, 1, &err); ++ block = buddy_offset + i * 2 + 1; ++ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); + if (bh == NULL) { -+ printk("can't get block for buddy: %d\n", err); ++ printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err); + goto out2; + } + sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr; + brelse(bh); ++ ++ size = (block + 1) << sbi->s_buddy->i_blkbits; ++ if (size > sbi->s_buddy->i_size) { ++ *created = 1; ++ EXT3_I(sbi->s_buddy)->i_disksize = size; ++ i_size_write(sbi->s_buddy, size); ++ mark_inode_dirty(sbi->s_buddy); ++ } + ext3_journal_stop(handle); ++ + spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock); + sbi->s_buddy_blocks[i]->bb_md_cur = NULL; + sbi->s_buddy_blocks[i]->bb_tid = 0; @@ -1069,8 +1398,30 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c +out2: + dput(db); +out: -+ up(&root->i_sem); + return err; ++ ++err_out: ++ return err; ++} ++ ++int ext3_mb_write_descriptors(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_buddy e3b; ++ int ret = 0, i, err; ++ ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ if (sbi->s_buddy_blocks[i] == NULL) ++ continue; ++ ++ err = ext3_mb_load_buddy(sb, i, &e3b); ++ if (err == 0) { ++ ext3_mb_update_descr(&e3b); ++ ext3_mb_release_desc(&e3b); ++ } else ++ ret = err; ++ } ++ return ret; +} + +int ext3_mb_release(struct super_block *sb) @@ -1091,9 +1442,12 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + ext3_mb_free_committed_blocks(sb); + + if (sbi->s_buddy_blocks) { -+ for (i = 0; i < sbi->s_groups_count; i++) -+ if (sbi->s_buddy_blocks[i]) -+ kfree(sbi->s_buddy_blocks[i]); ++ ext3_mb_write_descriptors(sb); ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ if (sbi->s_buddy_blocks[i] == NULL) ++ continue; ++ kfree(sbi->s_buddy_blocks[i]); ++ } + kfree(sbi->s_buddy_blocks); + } + if 
(sbi->s_buddy) @@ -1101,32 +1455,62 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + if (sbi->s_blocks_reserved) + printk("ext3-fs: %ld blocks being reserved at umount!\n", + sbi->s_blocks_reserved); ++#ifdef MBALLOC_STATS ++ printk("EXT3-fs: mballoc: %lu blocks %lu reqs (%lu success)\n", ++ sbi->s_bal_allocated, sbi->s_bal_reqs, sbi->s_bal_success); ++ printk("EXT3-fs: mballoc: %lu extents scanned, %lu goal hits, %lu breaks\n", ++ sbi->s_bal_ex_scanned, sbi->s_bal_goals, sbi->s_bal_breaks); ++#endif + return 0; +} + -+int ext3_mb_init(struct super_block *sb) ++int ext3_mb_init(struct super_block *sb, int needs_recovery) +{ -+ struct ext3_super_block *es; -+ int i; ++ struct ext3_buddy e3b; ++ int i, err, created; + + if (!test_opt(sb, MBALLOC)) + return 0; + + /* init file for buddy data */ + clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); -+ if (ext3_mb_init_backend(sb)) -+ return 0; ++ if ((err = ext3_mb_init_backend(sb, &created))) ++ return err; + -+ es = EXT3_SB(sb)->s_es; -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) -+ ext3_mb_generate_buddy(sb, i); ++repeat: ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { ++ err = ext3_mb_load_buddy(sb, i, &e3b); ++ if (err) { ++ /* FIXME: release backend */ ++ return err; ++ } ++ if (created || needs_recovery) ++ ext3_mb_generate_buddy(&e3b); ++ else ++ err = ext3_mb_load_descr(&e3b); ++ ext3_mb_release_desc(&e3b); ++ if (err == -ENODATA) { ++ created = 1; ++ goto repeat; ++ } ++ } ++ if (created || needs_recovery) ++ printk(KERN_ERR "EXT3-fs: generated buddies for %lu groups\n", ++ EXT3_SB(sb)->s_groups_count); + spin_lock_init(&EXT3_SB(sb)->s_reserve_lock); + spin_lock_init(&EXT3_SB(sb)->s_md_lock); + INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction); + INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction); + INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction); + set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); -+ printk("EXT3-fs: mballoc enabled\n"); ++ ++#ifdef MBALLOC_STATS ++ 
spin_lock_init(&EXT3_SB(sb)->s_bal_lock); ++#define MBALLOC_INFO " (stats)" ++#else ++#define MBALLOC_INFO "" ++#endif ++ printk("EXT3-fs: mballoc enabled%s\n", MBALLOC_INFO); + return 0; +} + @@ -1158,7 +1542,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + mb_debug("gonna free %u blocks in group %u (0x%p):", + md->num, md->group, md); + -+ err = ext3_mb_load_desc(sb, md->group, &e3b); ++ err = ext3_mb_load_buddy(sb, md->group, &e3b); + BUG_ON(err != 0); + + /* there are blocks to put in buddy to make them really free */ @@ -1263,7 +1647,8 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c +} + +void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, -+ unsigned long block, unsigned long count, int metadata) ++ unsigned long block, unsigned long count, ++ int metadata, int *freed) +{ + struct buffer_head *bitmap_bh = NULL; + struct ext3_group_desc *gdp; @@ -1276,6 +1661,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + struct ext3_buddy e3b; + int err = 0, ret; + ++ *freed = 0; + sb = inode->i_sb; + if (!sb) { + printk ("ext3_free_blocks: nonexistent device"); @@ -1345,7 +1731,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + if (err) + goto error_return; + -+ err = ext3_mb_load_desc(sb, block_group, &e3b); ++ err = ext3_mb_load_buddy(sb, block_group, &e3b); + if (err) + goto error_return; + @@ -1356,18 +1742,18 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + } else { + ext3_lock_group(sb, block_group); + mb_free_blocks(&e3b, bit, count); -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); + ext3_unlock_group(sb, block_group); -+ percpu_counter_mod(&sbi->s_freeblocks_counter, count); + } ++ spin_lock(sb_bgl_lock(sbi, block_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); ++ spin_unlock(sb_bgl_lock(sbi, block_group)); + + ext3_mb_dirty_buddy(&e3b); + ext3_mb_release_desc(&e3b); + -+ /* FIXME: undo logic will be implemented later and another way */ + 
mb_clear_bits(bitmap_bh->b_data, bit, count); -+ DQUOT_FREE_BLOCK(inode, count); ++ *freed = count; + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); @@ -1420,7 +1806,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c +} + +int ext3_new_block(handle_t *handle, struct inode *inode, -+ unsigned long goal, int *errp) ++ unsigned long goal, int *errp) +{ + int ret, len; + @@ -1435,19 +1821,27 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c +} + + ++extern void ext3_free_blocks_old(handle_t *, struct inode *, ++ unsigned long, unsigned long); +void ext3_free_blocks(handle_t *handle, struct inode * inode, + unsigned long block, unsigned long count, int metadata) +{ ++ int freed; ++ + if (!test_opt(inode->i_sb, MBALLOC)) + ext3_free_blocks_old(handle, inode, block, count); -+ else -+ ext3_mb_free_blocks(handle, inode, block, count, metadata); ++ else { ++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); ++ if (freed) ++ DQUOT_FREE_BLOCK(inode, freed); ++ } + return; +} ++ Index: linux-2.6.5-sles9/fs/ext3/super.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/super.c 2004-11-09 02:23:21.597220752 +0300 -+++ linux-2.6.5-sles9/fs/ext3/super.c 2004-11-09 02:26:12.572228600 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/super.c 2005-02-23 01:47:15.291333736 +0300 ++++ linux-2.6.5-sles9/fs/ext3/super.c 2005-02-23 01:48:54.515249408 +0300 @@ -389,6 +389,7 @@ struct ext3_super_block *es = sbi->s_es; int i; @@ -1456,47 +1850,54 @@ Index: linux-2.6.5-sles9/fs/ext3/super.c ext3_ext_release(sb); ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); -@@ -542,7 +543,7 @@ +@@ -540,6 +541,7 @@ Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -- Opt_err, Opt_extents, Opt_extdebug -+ Opt_err, Opt_extents, Opt_extdebug, Opt_mballoc, ++ Opt_mballoc, 
Opt_mbfactor, + Opt_err, Opt_extents, Opt_extdebug }; - static match_table_t tokens = { -@@ -589,6 +590,7 @@ +@@ -587,6 +589,8 @@ {Opt_iopen_nopriv, "iopen_nopriv"}, {Opt_extents, "extents"}, {Opt_extdebug, "extdebug"}, + {Opt_mballoc, "mballoc"}, ++ {Opt_mballoc, "mbfactor=%u"}, {Opt_err, NULL} }; -@@ -810,6 +812,9 @@ +@@ -808,6 +812,16 @@ case Opt_extdebug: set_opt (sbi->s_mount_opt, EXTDEBUG); break; + case Opt_mballoc: + set_opt (sbi->s_mount_opt, MBALLOC); + break; ++ case Opt_mbfactor: ++ if (match_int(&args[0], &option)) ++ return 0; ++ if (option < 0) ++ return 0; ++ sbi->s_mb_factor = option; ++ break; default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1463,7 +1468,8 @@ +@@ -1461,7 +1475,8 @@ ext3_count_dirs(sb)); ext3_ext_init(sb); - -+ ext3_mb_init(sb); ++ ext3_mb_init(sb, needs_recovery); + return 0; failed_mount3: Index: linux-2.6.5-sles9/fs/ext3/Makefile =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2004-11-09 02:23:21.593221360 +0300 -+++ linux-2.6.5-sles9/fs/ext3/Makefile 2004-11-09 02:26:12.572228600 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2005-02-23 01:02:37.405434272 +0300 ++++ linux-2.6.5-sles9/fs/ext3/Makefile 2005-02-23 01:48:54.517249104 +0300 @@ -5,7 +5,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o @@ -1509,7 +1910,7 @@ Index: linux-2.6.5-sles9/fs/ext3/Makefile Index: linux-2.6.5-sles9/fs/ext3/balloc.c =================================================================== --- linux-2.6.5-sles9.orig/fs/ext3/balloc.c 2004-11-03 08:36:51.000000000 +0300 -+++ linux-2.6.5-sles9/fs/ext3/balloc.c 2004-11-09 02:26:53.078070776 +0300 ++++ linux-2.6.5-sles9/fs/ext3/balloc.c 2005-02-23 01:48:54.520248648 +0300 @@ -78,7 +78,7 @@ * * Return buffer_head on success or NULL in case of failure. 
@@ -1539,8 +1940,8 @@ Index: linux-2.6.5-sles9/fs/ext3/balloc.c struct buffer_head *bitmap_bh = NULL; Index: linux-2.6.5-sles9/fs/ext3/namei.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/namei.c 2004-11-09 02:18:27.616912552 +0300 -+++ linux-2.6.5-sles9/fs/ext3/namei.c 2004-11-09 02:26:12.580227384 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/namei.c 2005-02-23 01:01:46.551165296 +0300 ++++ linux-2.6.5-sles9/fs/ext3/namei.c 2005-02-23 01:48:54.523248192 +0300 @@ -1640,7 +1640,7 @@ * If the create succeeds, we fill in the inode information * with d_instantiate(). @@ -1552,8 +1953,8 @@ Index: linux-2.6.5-sles9/fs/ext3/namei.c handle_t *handle; Index: linux-2.6.5-sles9/fs/ext3/inode.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2004-11-09 02:23:21.592221512 +0300 -+++ linux-2.6.5-sles9/fs/ext3/inode.c 2004-11-09 02:26:12.587226320 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2005-02-23 01:02:37.404434424 +0300 ++++ linux-2.6.5-sles9/fs/ext3/inode.c 2005-02-23 01:48:54.529247280 +0300 @@ -572,7 +572,7 @@ ext3_journal_forget(handle, branch[i].bh); } @@ -1592,9 +1993,9 @@ Index: linux-2.6.5-sles9/fs/ext3/inode.c /* Index: linux-2.6.5-sles9/fs/ext3/extents.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2004-11-09 02:25:56.143726112 +0300 -+++ linux-2.6.5-sles9/fs/ext3/extents.c 2004-11-09 02:26:12.591225712 +0300 -@@ -740,7 +740,7 @@ +--- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2005-02-23 01:02:37.396435640 +0300 ++++ linux-2.6.5-sles9/fs/ext3/extents.c 2005-02-23 01:48:54.533246672 +0300 +@@ -774,7 +774,7 @@ for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; @@ -1603,7 +2004,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c } } kfree(ablocks); -@@ -1391,7 +1391,7 @@ +@@ -1431,7 +1431,7 @@ path->p_idx->ei_leaf); bh = sb_find_get_block(tree->inode->i_sb, 
path->p_idx->ei_leaf); ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); @@ -1612,7 +2013,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c return err; } -@@ -1879,10 +1879,12 @@ +@@ -1919,10 +1919,12 @@ int needed = ext3_remove_blocks_credits(tree, ex, from, to); handle_t *handle = ext3_journal_start(tree->inode, needed); struct buffer_head *bh; @@ -1626,7 +2027,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { /* tail removal */ unsigned long num, start; -@@ -1894,7 +1896,7 @@ +@@ -1934,7 +1936,7 @@ bh = sb_find_get_block(tree->inode->i_sb, start + i); ext3_forget(handle, 0, tree->inode, bh, start + i); } @@ -1637,8 +2038,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c from, to, ex->ee_block, ex->ee_len); Index: linux-2.6.5-sles9/fs/ext3/xattr.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/xattr.c 2004-11-09 02:22:55.777146000 +0300 -+++ linux-2.6.5-sles9/fs/ext3/xattr.c 2004-11-09 02:26:12.593225408 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/xattr.c 2005-02-23 01:01:52.387278072 +0300 ++++ linux-2.6.5-sles9/fs/ext3/xattr.c 2005-02-23 01:48:54.537246064 +0300 @@ -1366,7 +1366,7 @@ new_bh = sb_getblk(sb, block); if (!new_bh) { @@ -1668,26 +2069,32 @@ Index: linux-2.6.5-sles9/fs/ext3/xattr.c } else { Index: linux-2.6.5-sles9/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2004-11-09 02:25:17.238640584 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2004-11-09 02:26:12.596224952 +0300 -@@ -57,6 +57,8 @@ +--- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2005-02-23 01:02:37.414432904 +0300 ++++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2005-02-23 01:48:54.539245760 +0300 +@@ -57,6 +57,14 @@ #define ext3_debug(f, a...) 
do {} while (0) #endif +#define EXT3_MULTIBLOCK_ALLOCATOR 1 + ++#define EXT3_MB_HINT_MERGE 1 ++#define EXT3_MB_HINT_RESERVED 2 ++#define EXT3_MB_HINT_METADATA 4 ++#define EXT3_MB_HINT_FIRST 8 ++#define EXT3_MB_HINT_BEST 16 ++ /* * Special inodes numbers */ -@@ -339,6 +341,7 @@ +@@ -339,6 +347,7 @@ #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ #define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ #define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ -+#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */ ++#define EXT3_MOUNT_MBALLOC 0x100000/* Buddy allocation support */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef clear_opt -@@ -698,7 +701,7 @@ +@@ -698,7 +707,7 @@ extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, @@ -1696,24 +2103,48 @@ Index: linux-2.6.5-sles9/include/linux/ext3_fs.h extern unsigned long ext3_count_free_blocks (struct super_block *); extern void ext3_check_blocks_bitmap (struct super_block *); extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, -@@ -743,6 +746,13 @@ - extern int ext3_ioctl (struct inode *, struct file *, unsigned int, - unsigned long); +@@ -820,6 +829,37 @@ + extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); +/* mballoc.c */ -+extern int ext3_mb_init(struct super_block *sb); -+extern int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, -+ unsigned long goal,int *len, int flags,int *errp); -+extern int ext3_mb_release(struct super_block *sb); ++extern int ext3_mb_init(struct super_block *, int); ++extern int ext3_mb_release(struct super_block *); ++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); ++extern int ext3_mb_reserve_blocks(struct 
super_block *, int); +extern void ext3_mb_release_blocks(struct super_block *, int); + - /* namei.c */ - extern int ext3_orphan_add(handle_t *, struct inode *); - extern int ext3_orphan_del(handle_t *, struct inode *); ++/* writeback.c */ ++extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); ++extern int ext3_wb_prepare_write(struct file *file, struct page *page, ++ unsigned from, unsigned to); ++extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); ++extern int ext3_wb_writepage(struct page *, struct writeback_control *); ++extern int ext3_wb_invalidatepage(struct page *, unsigned long); ++extern int ext3_wb_releasepage(struct page *, int); ++extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); ++extern void ext3_wb_init(struct super_block *); ++extern void ext3_wb_release(struct super_block *); ++ ++/* writeback.c */ ++extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); ++extern int ext3_wb_prepare_write(struct file *file, struct page *page, ++ unsigned from, unsigned to); ++extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); ++extern int ext3_wb_writepage(struct page *, struct writeback_control *); ++extern int ext3_wb_invalidatepage(struct page *, unsigned long); ++extern int ext3_wb_releasepage(struct page *, int); ++extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); ++extern void ext3_wb_init(struct super_block *); ++extern void ext3_wb_release(struct super_block *); ++ + #endif /* __KERNEL__ */ + + #define EXT3_IOC_CREATE_INUM _IOW('f', 5, long) Index: linux-2.6.5-sles9/include/linux/ext3_fs_sb.h =================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_fs_sb.h 2004-11-09 02:20:51.598024096 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_fs_sb.h 2004-11-09 02:28:18.753046200 +0300 +--- 
linux-2.6.5-sles9.orig/include/linux/ext3_fs_sb.h 2005-02-23 01:01:48.242908112 +0300 ++++ linux-2.6.5-sles9/include/linux/ext3_fs_sb.h 2005-02-23 01:48:54.541245456 +0300 @@ -23,10 +23,30 @@ #define EXT_INCLUDE #include @@ -1731,21 +2162,21 @@ Index: linux-2.6.5-sles9/include/linux/ext3_fs_sb.h + struct list_head list; +}; + -+#define EXT3_BB_MAX_ORDER 14 -+ +struct ext3_buddy_group_blocks { -+ sector_t bb_bitmap; -+ sector_t bb_buddy; ++ __u32 bb_bitmap; ++ __u32 bb_buddy; + spinlock_t bb_lock; -+ unsigned bb_counters[EXT3_BB_MAX_ORDER]; ++ unsigned long bb_tid; + struct ext3_free_metadata *bb_md_cur; -+ unsigned long bb_tid; ++ unsigned short bb_first_free; ++ unsigned short bb_free; ++ unsigned bb_counters[]; +}; + /* * third extended-fs super-block data in memory */ -@@ -78,6 +98,17 @@ +@@ -78,6 +98,27 @@ struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ #endif @@ -1760,6 +2191,16 @@ Index: linux-2.6.5-sles9/include/linux/ext3_fs_sb.h + struct list_head s_committed_transaction; + spinlock_t s_md_lock; + tid_t s_last_transaction; ++ int s_mb_factor; ++ ++ /* stats for buddy allocator */ ++ spinlock_t s_bal_lock; ++ unsigned long s_bal_reqs; /* number of reqs with len > 1 */ ++ unsigned long s_bal_success; /* we found long enough chunks */ ++ unsigned long s_bal_allocated; /* in blocks */ ++ unsigned long s_bal_ex_scanned; /* total extents scanned */ ++ unsigned long s_bal_goals; /* goal hits */ ++ unsigned long s_bal_breaks; /* too long searches */ }; #endif /* _LINUX_EXT3_FS_SB */ diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c index 8c5bc89..6ac3090 100644 --- a/lustre/lvfs/fsfilt_ext3.c +++ b/lustre/lvfs/fsfilt_ext3.c @@ -706,10 +706,11 @@ static int ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path, static int ext3_ext_new_extent_cb(struct ext3_extents_tree *tree, struct ext3_ext_path *path, - struct 
ext3_extent *newex, int exist) + struct ext3_ext_cache *cex) { struct inode *inode = tree->inode; struct bpointers *bp = tree->private; + struct ext3_extent nex; int count, err, goal; unsigned long pblock; unsigned long tgen; @@ -721,19 +722,19 @@ static int ext3_ext_new_extent_cb(struct ext3_extents_tree *tree, EXT_ASSERT(i == path->p_depth); EXT_ASSERT(path[i].p_hdr); - if (exist) { + if (cex->ec_type == EXT3_EXT_CACHE_EXTENT) { err = EXT_CONTINUE; goto map; } if (bp->create == 0) { i = 0; - if (newex->ee_block < bp->start) - i = bp->start - newex->ee_block; - if (i >= newex->ee_len) + if (cex->ec_block < bp->start) + i = bp->start - cex->ec_block; + if (i >= cex->ec_len) CERROR("nothing to do?! i = %d, e_num = %u\n", - i, newex->ee_len); - for (; i < newex->ee_len && bp->num; i++) { + i, cex->ec_len); + for (; i < cex->ec_len && bp->num; i++) { *(bp->created) = 0; bp->created++; *(bp->blocks) = 0; @@ -757,34 +758,44 @@ static int ext3_ext_new_extent_cb(struct ext3_extents_tree *tree, return PTR_ERR(handle); } + ext3_down_truncate_sem(inode); if (tgen != EXT_GENERATION(tree)) { /* the tree has changed. 
so path can be invalid at moment */ lock_24kernel(); journal_stop(handle); unlock_24kernel(); - ext3_down_truncate_sem(inode); return EXT_REPEAT; } - ext3_down_truncate_sem(inode); - count = newex->ee_len; - goal = ext3_ext_find_goal(inode, path, newex->ee_block, &aflags); + count = cex->ec_len; + goal = ext3_ext_find_goal(inode, path, cex->ec_block, &aflags); aflags |= 2; /* block have been already reserved */ pblock = ext3_mb_new_blocks(handle, inode, goal, &count, aflags, &err); if (!pblock) goto out; - EXT_ASSERT(count <= newex->ee_len); + EXT_ASSERT(count <= cex->ec_len); /* insert new extent */ - newex->ee_start = pblock; - newex->ee_len = count; - err = ext3_ext_insert_extent(handle, tree, path, newex); + nex.ee_block = cex->ec_block; + nex.ee_start = pblock; + nex.ee_len = count; + err = ext3_ext_insert_extent(handle, tree, path, &nex); if (err) goto out; + /* + * Putting len of the actual extent we just inserted, + * we are asking ext3_ext_walk_space() to continue + * scaning after that block + */ + cex->ec_len = nex.ee_len; + cex->ec_start = nex.ee_start; + BUG_ON(nex.ee_len == 0); + BUG_ON(nex.ee_block != cex->ec_block); + /* correct on-disk inode size */ - if (newex->ee_len > 0) { - new_i_size = (loff_t) newex->ee_block + newex->ee_len; + if (nex.ee_len > 0) { + new_i_size = (loff_t) nex.ee_block + nex.ee_len; new_i_size = new_i_size << inode->i_blkbits; if (new_i_size > EXT3_I(inode)->i_disksize) { EXT3_I(inode)->i_disksize = new_i_size; @@ -804,19 +815,22 @@ map: CERROR("initial space: %lu:%u\n", bp->start, bp->init_num); CERROR("current extent: %u/%u/%u %d\n", - newex->ee_block, newex->ee_len, - newex->ee_start, exist); + cex->ec_block, cex->ec_len, + cex->ec_start, cex->ec_type); } i = 0; - if (newex->ee_block < bp->start) - i = bp->start - newex->ee_block; - if (i >= newex->ee_len) + if (cex->ec_block < bp->start) + i = bp->start - cex->ec_block; + if (i >= cex->ec_len) CERROR("nothing to do?! 
i = %d, e_num = %u\n", - i, newex->ee_len); - for (; i < newex->ee_len && bp->num; i++) { - *(bp->created) = (exist == 0 ? 1 : 0); + i, cex->ec_len); + for (; i < cex->ec_len && bp->num; i++) { + if (cex->ec_type == EXT3_EXT_CACHE_EXTENT) + *(bp->created) = 0; + else + *(bp->created) = 1; bp->created++; - *(bp->blocks) = newex->ee_start + i; + *(bp->blocks) = cex->ec_start + i; bp->blocks++; bp->num--; bp->start++; -- 1.8.3.1