X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=ldiskfs%2Fkernel_patches%2Fpatches%2Fext3-mballoc3-core.patch;h=678d87849f878d868e3fa2847396193bbd05aa23;hb=6bdb62e3d7ed206bcaef0cd3499f9cb56dc1fb92;hp=ac840d064df4605a533a053d45f7b9352e1d882c;hpb=31e1dbf5a16d19ac34a95f6ccd4a815e10306cef;p=fs%2Flustre-release.git diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc3-core.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc3-core.patch index ac840d0..678d878 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc3-core.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc3-core.patch @@ -1,8 +1,8 @@ -Index: linux-2.6.9-full/include/linux/ext3_fs.h +Index: linux-stage/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.9-full.orig/include/linux/ext3_fs.h 2007-06-08 23:44:08.000000000 +0400 -+++ linux-2.6.9-full/include/linux/ext3_fs.h 2007-10-17 22:25:01.000000000 +0400 -@@ -57,6 +57,30 @@ struct statfs; +--- linux-stage.orig/include/linux/ext3_fs.h ++++ linux-stage/include/linux/ext3_fs.h +@@ -53,6 +53,31 @@ #define ext3_debug(f, a...) do {} while (0) #endif @@ -17,6 +17,7 @@ Index: linux-2.6.9-full/include/linux/ext3_fs.h +#define EXT3_MB_HINT_NOPREALLOC 64 /* don't preallocate (for tails) */ +#define EXT3_MB_HINT_GROUP_ALLOC 128 /* allocate for locality group */ +#define EXT3_MB_HINT_GOAL_ONLY 256 /* allocate goal blocks or none */ ++#define EXT3_MB_HINT_TRY_GOAL 512 /* goal is meaningful */ + +struct ext3_allocation_request { + struct inode *inode; /* target inode for block we're allocating */ @@ -33,7 +34,7 @@ Index: linux-2.6.9-full/include/linux/ext3_fs.h /* * Special inodes numbers */ -@@ -387,6 +411,14 @@ struct ext3_inode { +@@ -398,6 +423,14 @@ struct ext3_inode { #define ext3_find_first_zero_bit ext2_find_first_zero_bit #define ext3_find_next_zero_bit ext2_find_next_zero_bit @@ -48,7 +49,7 @@ Index: linux-2.6.9-full/include/linux/ext3_fs.h /* * Maximal mount counts between two filesystem checks */ -@@ -763,6 +795,20 @@ extern unsigned long ext3_count_dirs (st +@@ -799,6 +832,20 @@ extern unsigned long ext3_count_dirs (st extern void ext3_check_inodes_bitmap (struct super_block *); extern unsigned long ext3_count_free (struct buffer_head *, unsigned); @@ -68,9 +69,9 @@ Index: linux-2.6.9-full/include/linux/ext3_fs.h + /* inode.c */ - extern int ext3_block_truncate_page(handle_t *, struct page *, -@@ -804,6 +850,10 @@ extern int ext3_group_extend(struct supe - unsigned long n_blocks_count); + int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, +@@ -845,6 +892,10 @@ extern int ext3_group_extend(struct supe + ext3_fsblk_t n_blocks_count); /* super.c */ +extern struct proc_dir_entry *proc_root_ext3; @@ -80,13 +81,13 @@ Index: linux-2.6.9-full/include/linux/ext3_fs.h extern void ext3_error (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); extern void __ext3_std_error (struct super_block *, const char *, int); -Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h +Index: linux-stage/include/linux/ext3_fs_sb.h =================================================================== ---- linux-2.6.9-full.orig/include/linux/ext3_fs_sb.h 2007-06-08 23:44:07.000000000 +0400 -+++ linux-2.6.9-full/include/linux/ext3_fs_sb.h 2007-10-17 22:25:01.000000000 +0400 -@@ -81,6 +81,61 @@ struct ext3_sb_info { - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ - int s_jquota_fmt; /* Format of quota to use */ +--- linux-stage.orig/include/linux/ext3_fs_sb.h ++++ linux-stage/include/linux/ext3_fs_sb.h +@@ -89,6 +89,68 @@ struct ext3_sb_info { + unsigned long s_ext_blocks; + unsigned long s_ext_extents; #endif + + /* for buddy allocator */ @@ -104,12 +105,19 @@ Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h + /* tunables */ + unsigned long s_mb_factor; + unsigned long s_stripe; -+ unsigned long s_mb_stream_request; ++ unsigned long s_mb_small_req; ++ unsigned long s_mb_large_req; + unsigned long s_mb_max_to_scan; + unsigned long s_mb_min_to_scan; + unsigned long s_mb_max_groups_to_scan; + unsigned long s_mb_stats; + unsigned long s_mb_order2_reqs; ++ unsigned long *s_mb_prealloc_table; ++ unsigned long s_mb_prealloc_table_size; ++ unsigned long s_mb_group_prealloc; ++ /* where last allocation was done - for stream allocation */ ++ unsigned long s_mb_last_group; ++ unsigned long s_mb_last_start; + + /* history to debug policy */ + struct ext3_mb_history *s_mb_history; @@ -146,61 +154,82 @@ Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h + [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)] + #endif /* _LINUX_EXT3_FS_SB */ -Index: linux-2.6.9-full/fs/ext3/super.c +Index: linux-stage/fs/ext3/super.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/super.c 2007-06-08 23:44:08.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/super.c 2007-10-17 22:26:27.000000000 +0400 -@@ -394,6 +394,7 @@ void ext3_put_super (struct super_block +--- linux-stage.orig/fs/ext3/super.c ++++ linux-stage/fs/ext3/super.c +@@ -392,6 +392,7 @@ static void ext3_put_super (struct super struct ext3_super_block *es = sbi->s_es; - int i; + int i, err; + ext3_mb_release(sb); ext3_ext_release(sb); ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); -@@ -438,6 +439,8 @@ void ext3_put_super (struct super_block + err = journal_destroy(sbi->s_journal); +@@ -438,6 +439,10 @@ static void ext3_put_super (struct super invalidate_bdev(sbi->journal_bdev, 0); ext3_blkdev_remove(sbi); } -+ remove_proc_entry(sb->s_id, proc_root_ext3); -+ sbi->s_dev_proc = NULL; ++ if (sbi->s_dev_proc) { ++ remove_proc_entry(sbi->s_dev_proc->name, proc_root_ext3); ++ sbi->s_dev_proc = NULL; ++ } sb->s_fs_info = NULL; kfree(sbi); return; -@@ -463,6 +466,8 @@ static struct inode *ext3_alloc_inode(st +@@ -463,6 +468,8 @@ static struct inode *ext3_alloc_inode(st ei->vfs_inode.i_version = 1; - + memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); + INIT_LIST_HEAD(&ei->i_prealloc_list); + spin_lock_init(&ei->i_prealloc_lock); return &ei->vfs_inode; } -@@ -1353,6 +1358,13 @@ static int ext3_fill_super (struct super +@@ -1465,6 +1472,7 @@ static int ext3_fill_super (struct super + unsigned long journal_devnum = 0; + unsigned long def_mount_opts; + struct inode *root; ++ char *devname; + int blocksize; + int hblock; + int db_count; +@@ -1480,6 +1488,22 @@ static int ext3_fill_super (struct super sbi->s_mount_opt = 0; sbi->s_resuid = EXT3_DEF_RESUID; sbi->s_resgid = EXT3_DEF_RESGID; -+ sbi->s_dev_proc = proc_mkdir(sb->s_id, proc_root_ext3); -+ if (sbi->s_dev_proc == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", sb->s_id); -+ sb->s_fs_info = NULL; -+ kfree(sbi); -+ return -ENOMEM; ++ devname = kstrdup(sb->s_id, GFP_KERNEL); ++ if (devname) { ++ char *p = devname; ++ while ((p = strchr(p, '/'))) ++ *p = '!'; ++ sbi->s_dev_proc = proc_mkdir(devname, proc_root_ext3); ++ if (sbi->s_dev_proc == NULL) ++ printk(KERN_WARNING "EXT3-fs warning: unable to create " ++ "procfs entry for %s(%s)\n", ++ sb->s_id, devname); ++ kfree(devname); ++ } else { ++ printk(KERN_WARNING "EXT3-fs warning: cannot allocate memory " ++ "to create procfs entry for %s\n", ++ sb->s_id); + } - blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); - if (!blocksize) { -@@ -1729,6 +1741,8 @@ failed_mount: + unlock_kernel(); + +@@ -1885,6 +1909,10 @@ failed_mount: ext3_blkdev_remove(sbi); brelse(bh); out_fail: -+ remove_proc_entry(sb->s_id, proc_root_ext3); -+ sbi->s_dev_proc = NULL; ++ if (sbi->s_dev_proc) { ++ remove_proc_entry(sbi->s_dev_proc->name, proc_root_ext3); ++ sbi->s_dev_proc = NULL; ++ } sb->s_fs_info = NULL; kfree(sbi); - return -EINVAL; -@@ -2593,9 +2607,47 @@ static struct file_system_type ext3_fs_t - .fs_flags = FS_REQUIRES_DEV, + lock_kernel(); +@@ -2850,9 +2878,46 @@ static struct file_system_type ext3_fs_t + |FS_HAS_TRYTOFREE, }; +#define EXT3_ROOT "ext3" @@ -208,7 +237,6 @@ Index: linux-2.6.9-full/fs/ext3/super.c + +int __init init_ext3_proc(void) +{ -+ struct proc_dir_entry *proc; + int ret; + + if ((ret = init_ext3_mb_proc())) @@ -248,7 +276,7 @@ Index: linux-2.6.9-full/fs/ext3/super.c if (err) return err; err = init_inodecache(); -@@ -2617,6 +2669,7 @@ static void __exit exit_ext3_fs(void) +@@ -2877,6 +2942,7 @@ static void __exit exit_ext3_fs(void) unregister_filesystem(&ext3_fs_type); destroy_inodecache(); exit_ext3_xattr(); @@ -256,14 +284,14 @@ Index: linux-2.6.9-full/fs/ext3/super.c } int ext3_map_inode_page(struct inode *inode, struct page *page, -Index: linux-2.6.9-full/fs/ext3/mballoc.c +Index: linux-stage/fs/ext3/mballoc.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/mballoc.c 2007-10-17 21:59:51.072534980 +0400 -+++ linux-2.6.9-full/fs/ext3/mballoc.c 2007-10-17 23:09:22.000000000 +0400 -@@ -0,0 +1,4380 @@ +--- /dev/null ++++ linux-stage/fs/ext3/mballoc.c +@@ -0,0 +1,4486 @@ +/* -+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com -+ * Written by Alex Tomas ++ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Written by Alex Zhuravlev + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as @@ -576,7 +604,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +#define EXT3_BB_MAX_BLOCKS 30 + +struct ext3_free_metadata { -+ unsigned short group; ++ unsigned group; + unsigned short num; + unsigned short blocks[EXT3_BB_MAX_BLOCKS]; + struct list_head list; @@ -590,6 +618,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + unsigned short bb_free; + unsigned short bb_fragments; + struct list_head bb_prealloc_list; ++ unsigned long bb_prealloc_nr; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif @@ -693,7 +722,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + __u16 tail; /* what tail broke some buddy */ + __u16 buddy; /* buddy the tail ^^^ broke */ + __u16 flags; -+ __u8 cr:3; /* which phase the result extent was found at */ ++ __u8 cr:8; /* which phase the result extent was found at */ + __u8 op:4; + __u8 merged:1; +}; @@ -705,8 +734,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + void *bd_bitmap; + struct ext3_group_info *bd_info; + struct super_block *bd_sb; -+ __u16 bd_blkbits; -+ __u16 bd_group; ++ unsigned bd_group; ++ unsigned bd_blkbits; +}; +#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) +#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) @@ -726,7 +755,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +void ext3_mb_release_blocks(struct super_block *, int); +void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); +void ext3_mb_free_committed_blocks(struct super_block *); -+void ext3_mb_generate_from_pa(struct super_block *sb, void *bitmap, int group); ++int ext3_mb_generate_from_pa(struct super_block *sb, void *bitmap, int group); +void ext3_mb_free_consumed_preallocations(struct ext3_allocation_context *ac); +void ext3_mb_return_to_preallocation(struct inode *inode, struct ext3_buddy *e3b, + sector_t block, int count); @@ -1119,7 +1148,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + } +} + -+static void ++static int +ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, + int group) +{ @@ -1137,6 +1166,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + fragments++; + first = i; + i = ext2_find_next_le_bit(bitmap, max, i); ++ if (i > max) ++ i = max; + len = i - first; + free += len; + if (len > 1) @@ -1149,9 +1180,14 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + grp->bb_fragments = fragments; + + if (free != grp->bb_free) { -+ printk("EXT3-fs: group %u: %u blocks in bitmap, %u in gd\n", -+ group, free, grp->bb_free); -+ grp->bb_free = free; ++ struct ext3_group_desc *gdp; ++ gdp = ext3_get_group_desc (sb, group, NULL); ++ ext3_error(sb, __FUNCTION__, ++ "group %u: %u blocks in bitmap, %u in bb, " ++ "%u in gd, %lu pa's\n", group, free, grp->bb_free, ++ le16_to_cpu(gdp->bg_free_blocks_count), ++ grp->bb_prealloc_nr); ++ return -EIO; + } + + clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); @@ -1161,6 +1197,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + EXT3_SB(sb)->s_mb_buddies_generated++; + EXT3_SB(sb)->s_mb_generation_time += period; + spin_unlock(&EXT3_SB(sb)->s_bal_lock); ++ ++ return 0; +} + +static int ext3_mb_init_cache(struct page *page, char *incore) @@ -1238,8 +1276,9 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + if (!buffer_uptodate(bh[i])) + goto out; + ++ err = 0; + first_block = page->index * blocks_per_page; -+ for (i = 0; i < blocks_per_page; i++) { ++ for (i = 0; i < blocks_per_page && err == 0; i++) { + int group; + + group = (first_block + i) >> 1; @@ -1258,7 +1297,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + EXT3_GROUP_INFO(sb, group)->bb_fragments = 0; + memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0, + sizeof(unsigned short)*(sb->s_blocksize_bits+2)); -+ ext3_mb_generate_buddy(sb, data, incore, group); ++ err = ext3_mb_generate_buddy(sb, data, incore, group); + incore = NULL; + } else { + /* this is block of bitmap */ @@ -1271,13 +1310,14 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + memcpy(data, bitmap, blocksize); + + /* mark all preallocated blocks used in in-core bitmap */ -+ ext3_mb_generate_from_pa(sb, data, group); ++ err = ext3_mb_generate_from_pa(sb, data, group); + ext3_unlock_group(sb, group); + + incore = data; + } + } -+ SetPageUptodate(page); ++ if (likely(err == 0)) ++ SetPageUptodate(page); + +out: + if (bh) { @@ -1329,9 +1369,9 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + unlock_page(page); + } + } ++ e3b->bd_bitmap_page = page; + if (page == NULL || !PageUptodate(page)) + goto err; -+ e3b->bd_bitmap_page = page; + e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + mark_page_accessed(page); + @@ -1352,9 +1392,9 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + unlock_page(page); + } + } ++ e3b->bd_buddy_page = page; + if (page == NULL || !PageUptodate(page)) + goto err; -+ e3b->bd_buddy_page = page; + e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); + mark_page_accessed(page); + @@ -1416,7 +1456,10 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + cur += 32; + continue; + } -+ mb_clear_bit_atomic(lock, cur, bm); ++ if (lock) ++ mb_clear_bit_atomic(lock, cur, bm); ++ else ++ mb_clear_bit(cur, bm); + cur++; + } +} @@ -1434,7 +1477,10 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + cur += 32; + continue; + } -+ mb_set_bit_atomic(lock, cur, bm); ++ if (lock) ++ mb_set_bit_atomic(lock, cur, bm); ++ else ++ mb_set_bit(cur, bm); + cur++; + } +} @@ -1588,6 +1634,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + BUG_ON(start + len > (e3b->bd_sb->s_blocksize << 3)); + BUG_ON(e3b->bd_group != ex->fe_group); + BUG_ON(!ext3_is_group_locked(e3b->bd_sb, e3b->bd_group)); ++ spin_lock(sb_bgl_lock(EXT3_SB(e3b->bd_sb), ex->fe_group)); + mb_check_buddy(e3b); + mb_mark_used_double(e3b, start, len); + @@ -1641,9 +1688,9 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + e3b->bd_info->bb_counters[ord]++; + } + -+ mb_set_bits(sb_bgl_lock(EXT3_SB(e3b->bd_sb), ex->fe_group), -+ EXT3_MB_BITMAP(e3b), ex->fe_start, len0); ++ mb_set_bits(NULL, EXT3_MB_BITMAP(e3b), ex->fe_start, len0); + mb_check_buddy(e3b); ++ spin_unlock(sb_bgl_lock(EXT3_SB(e3b->bd_sb), ex->fe_group)); + + return ret; +} @@ -1654,6 +1701,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, + struct ext3_buddy *e3b) +{ ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); + unsigned long ret; + + BUG_ON(ac->ac_b_ex.fe_group != e3b->bd_group); @@ -1676,6 +1724,14 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + get_page(ac->ac_bitmap_page); + ac->ac_buddy_page = e3b->bd_buddy_page; + get_page(ac->ac_buddy_page); ++ ++ /* store last allocated for subsequent stream allocation */ ++ if ((ac->ac_flags & EXT3_MB_HINT_DATA)) { ++ spin_lock(&sbi->s_md_lock); ++ sbi->s_mb_last_group = ac->ac_f_ex.fe_group; ++ sbi->s_mb_last_start = ac->ac_f_ex.fe_start; ++ spin_unlock(&sbi->s_md_lock); ++ } +} + +/* @@ -1782,7 +1838,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + /* if the request is satisfied, then we try to find + * an extent that still satisfy the request, but is + * smaller than previous one */ -+ *bex = *ex; ++ if (ex->fe_len < bex->fe_len) ++ *bex = *ex; + } + + ext3_mb_check_limits(ac, e3b, 0); @@ -1821,6 +1878,9 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + struct ext3_super_block *es = sbi->s_es; + struct ext3_free_extent ex; + ++ if (!(ac->ac_flags & EXT3_MB_HINT_TRY_GOAL)) ++ return 0; ++ + err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); + if (err) + return err; @@ -2043,6 +2103,16 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + ac->ac_2order = i; + } + ++ /* if stream allocation is enabled, use global goal */ ++ if ((ac->ac_g_ex.fe_len < sbi->s_mb_large_req) && ++ (ac->ac_flags & EXT3_MB_HINT_DATA)) { ++ /* TBD: may be hot point */ ++ spin_lock(&sbi->s_md_lock); ++ ac->ac_g_ex.fe_group = sbi->s_mb_last_group; ++ ac->ac_g_ex.fe_start = sbi->s_mb_last_start; ++ spin_unlock(&sbi->s_md_lock); ++ } ++ + group = ac->ac_g_ex.fe_group; + + /* Let's just scan groups to find more-less suitable blocks */ @@ -2056,9 +2126,12 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + if (group == EXT3_SB(sb)->s_groups_count) + group = 0; + -+ /* quick check to skip empty groups */ ++ /* If there's no chance that this group has a better ++ * extent, just skip it instead of seeking to read ++ * block bitmap from disk. Initially ac_b_ex.fe_len = 0, ++ * so this always skips groups with no free space. */ + grp = EXT3_GROUP_INFO(ac->ac_sb, group); -+ if (grp->bb_free == 0) ++ if (grp->bb_free <= ac->ac_b_ex.fe_len) + continue; + + if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) { @@ -2232,6 +2305,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + hs->result.fe_start, hs->result.fe_len); + seq_printf(seq, "%-5u %-8u %-23s free\n", + hs->pid, hs->ino, buf2); ++ } else { ++ seq_printf(seq, "unknown op %d\n", hs->op); + } + return 0; +} @@ -2357,8 +2432,9 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v) +{ + struct super_block *sb = seq->private; ++ struct ext3_group_desc *gdp; + long group = (long) v; -+ int i, err; ++ int i, err, free = 0; + struct ext3_buddy e3b; + struct sg { + struct ext3_group_info info; @@ -2367,10 +2443,10 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + + group--; + if (group == 0) -+ seq_printf(seq, "#%-5s: %-5s %-5s %-5s " ++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s %-5s %-5s " + "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " + "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", -+ "group", "free", "frags", "first", ++ "group", "free", "ingd", "frags", "first", "pa", + "2^0", "2^1", "2^2", "2^3", "2^4", "2^5","2^6", + "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); + @@ -2381,13 +2457,20 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + seq_printf(seq, "#%-5lu: I/O error\n", group); + return 0; + } ++ ++ gdp = ext3_get_group_desc(sb, group, NULL); ++ if (gdp != NULL) ++ free = le16_to_cpu(gdp->bg_free_blocks_count); ++ + ext3_lock_group(sb, group); + memcpy(&sg, EXT3_GROUP_INFO(sb, group), i); + ext3_unlock_group(sb, group); + ext3_mb_release_desc(&e3b); + -+ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, -+ sg.info.bb_fragments, sg.info.bb_first_free); ++ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u %-5u %-5lu [", group, ++ sg.info.bb_free, free, ++ sg.info.bb_fragments, sg.info.bb_first_free, ++ sg.info.bb_prealloc_nr); + for (i = 0; i <= 13; i++) + seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? + sg.info.bb_counters[i] : 0); @@ -2487,7 +2570,13 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + h.orig = ac->ac_o_ex; + h.result = ac->ac_b_ex; + h.flags = ac->ac_flags; ++ h.found = ac->ac_found; ++ h.groups = ac->ac_groups_scanned; ++ h.cr = ac->ac_criteria; ++ h.tail = ac->ac_tail; ++ h.buddy = ac->ac_buddy; + h.merged = 0; ++ h.cr = ac->ac_criteria; + if (ac->ac_op == EXT3_MB_HISTORY_ALLOC) { + if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && + ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) @@ -2616,6 +2705,25 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + return -ENOMEM; +} + ++static void ext3_mb_prealloc_table_add(struct ext3_sb_info *sbi, int value) ++{ ++ int i; ++ ++ if (value > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group)) ++ return; ++ ++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) { ++ if (sbi->s_mb_prealloc_table[i] == 0) { ++ sbi->s_mb_prealloc_table[i] = value; ++ return; ++ } ++ ++ /* they should add values in order */ ++ if (value <= sbi->s_mb_prealloc_table[i]) ++ return; ++ } ++} ++ +int ext3_mb_init(struct super_block *sb, int needs_recovery) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); @@ -2671,14 +2779,59 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; + sbi->s_mb_max_groups_to_scan = MB_DEFAULT_MAX_GROUPS_TO_SCAN; + sbi->s_mb_stats = MB_DEFAULT_STATS; -+ sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; + sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; + sbi->s_mb_history_filter = EXT3_MB_HISTORY_DEFAULT; + ++ if (sbi->s_stripe == 0) { ++ sbi->s_mb_prealloc_table_size = 8; ++ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long); ++ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS); ++ if (sbi->s_mb_prealloc_table == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return -ENOMEM; ++ } ++ memset(sbi->s_mb_prealloc_table, 0, i); ++ ++ ext3_mb_prealloc_table_add(sbi, 4); ++ ext3_mb_prealloc_table_add(sbi, 8); ++ ext3_mb_prealloc_table_add(sbi, 16); ++ ext3_mb_prealloc_table_add(sbi, 32); ++ ext3_mb_prealloc_table_add(sbi, 64); ++ ext3_mb_prealloc_table_add(sbi, 128); ++ ext3_mb_prealloc_table_add(sbi, 256); ++ ext3_mb_prealloc_table_add(sbi, 512); ++ ++ sbi->s_mb_small_req = 256; ++ sbi->s_mb_large_req = 1024; ++ sbi->s_mb_group_prealloc = 256; ++ } else { ++ sbi->s_mb_prealloc_table_size = 3; ++ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long); ++ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS); ++ if (sbi->s_mb_prealloc_table == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return -ENOMEM; ++ } ++ memset(sbi->s_mb_prealloc_table, 0, i); ++ ++ ext3_mb_prealloc_table_add(sbi, sbi->s_stripe); ++ ext3_mb_prealloc_table_add(sbi, sbi->s_stripe * 2); ++ ext3_mb_prealloc_table_add(sbi, sbi->s_stripe * 4); ++ ++ sbi->s_mb_small_req = sbi->s_stripe; ++ sbi->s_mb_large_req = sbi->s_stripe * 8; ++ sbi->s_mb_group_prealloc = sbi->s_stripe; ++ } ++ + i = sizeof(struct ext3_locality_group) * num_possible_cpus(); + sbi->s_locality_groups = kmalloc(i, GFP_NOFS); + if (sbi->s_locality_groups == NULL) { + clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_prealloc_table); + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); + return -ENOMEM; @@ -2843,259 +2996,164 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +#define EXT3_MB_MAX_TO_SCAN_NAME "max_to_scan" +#define EXT3_MB_MIN_TO_SCAN_NAME "min_to_scan" +#define EXT3_MB_ORDER2_REQ "order2_req" -+#define EXT3_MB_STREAM_REQ "stream_req" ++#define EXT3_MB_SMALL_REQ "small_req" ++#define EXT3_MB_LARGE_REQ "large_req" ++#define EXT3_MB_PREALLOC_TABLE "prealloc_table" ++#define EXT3_MB_GROUP_PREALLOC "group_prealloc" + -+static int ext3_mb_stats_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) ++static int ext3_mb_read_prealloc_table(char *page, char **start, off_t off, ++ int count, int *eof, void *data) +{ + struct ext3_sb_info *sbi = data; -+ int len; ++ int len = 0; ++ int i; + + *eof = 1; + if (off != 0) + return 0; + -+ len = sprintf(page, "%ld\n", sbi->s_mb_stats); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_stats_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ struct ext3_sb_info *sbi = data; -+ char str[32]; -+ -+ if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_STATS_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ sbi->s_mb_stats = (simple_strtol(str, NULL, 0) != 0); -+ return count; -+} -+ -+static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ struct ext3_sb_info *sbi = data; -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; ++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) ++ len += sprintf(page + len, "%ld ", ++ sbi->s_mb_prealloc_table[i]); ++ len += sprintf(page + len, "\n"); + -+ len = sprintf(page, "%ld\n", sbi->s_mb_max_to_scan); + *start = page; + return len; +} + -+static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) ++static int ext3_mb_write_prealloc_table(struct file *file, ++ const char __user *buf, ++ unsigned long cnt, void *data) +{ + struct ext3_sb_info *sbi = data; -+ char str[32]; -+ long value; -+ -+ if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) ++ unsigned long value; ++ unsigned long prev = 0; ++ char str[128]; ++ char *cur; ++ char *end; ++ unsigned long *new_table; ++ int num = 0; ++ int i = 0; ++ ++ if (cnt >= sizeof(str)) ++ return -EINVAL; ++ if (copy_from_user(str, buf, cnt)) + return -EFAULT; + -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ value = simple_strtol(str, NULL, 0); -+ if (value <= 0) -+ return -ERANGE; -+ -+ sbi->s_mb_max_to_scan = value; -+ -+ return count; -+} -+ -+static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ struct ext3_sb_info *sbi = data; -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", sbi->s_mb_min_to_scan); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_order2_req_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ struct ext3_sb_info *sbi = data; -+ char str[32]; -+ long value; -+ -+ if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; ++ num = 0; ++ cur = str; ++ end = str + cnt; ++ while (cur < end) { ++ while ((cur < end) && (*cur == ' ')) cur++; ++ value = simple_strtol(cur, &cur, 0); ++ if (value == 0) ++ break; ++ if (value <= prev) ++ return -EINVAL; ++ prev = value; ++ num++; + } + -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ value = simple_strtol(str, NULL, 0); -+ if (value <= 0) -+ return -ERANGE; -+ -+ sbi->s_mb_order2_reqs = value; -+ -+ return count; -+} -+ -+static int ext3_mb_order2_req_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ struct ext3_sb_info *sbi = data; -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", sbi->s_mb_order2_reqs); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ struct ext3_sb_info *sbi = data; -+ char str[32]; -+ long value; -+ -+ if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; ++ new_table = kmalloc(num * sizeof(*new_table), GFP_KERNEL); ++ if (new_table == NULL) ++ return -ENOMEM; ++ kfree(sbi->s_mb_prealloc_table); ++ memset(new_table, 0, num * sizeof(*new_table)); ++ sbi->s_mb_prealloc_table = new_table; ++ sbi->s_mb_prealloc_table_size = num; ++ cur = str; ++ end = str + cnt; ++ while (cur < end && i < num) { ++ while ((cur < end) && (*cur == ' ')) cur++; ++ value = simple_strtol(cur, &cur, 0); ++ ext3_mb_prealloc_table_add(sbi, value); ++ i++; + } + -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ value = simple_strtol(str, NULL, 0); -+ if (value <= 0) -+ return -ERANGE; -+ -+ sbi->s_mb_min_to_scan = value; -+ -+ return count; ++ return cnt; +} + -+static int ext3_mb_stream_req_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ struct ext3_sb_info *sbi = data; -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", sbi->s_mb_stream_request); -+ *start = page; -+ return len; ++#define MB_PROC_VALUE_READ(name) \ ++static int ext3_mb_read_##name(char *page, char **start, \ ++ off_t off, int count, int *eof, void *data) \ ++{ \ ++ struct ext3_sb_info *sbi = data; \ ++ int len; \ ++ *eof = 1; \ ++ if (off != 0) \ ++ return 0; \ ++ len = sprintf(page, "%ld\n", sbi->s_mb_##name); \ ++ *start = page; \ ++ return len; \ +} + -+static int ext3_mb_stream_req_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ struct ext3_sb_info *sbi = data; -+ char str[32]; -+ long value; -+ -+ if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_STREAM_REQ, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ value = simple_strtol(str, NULL, 0); -+ if (value <= 0) -+ return -ERANGE; -+ -+ sbi->s_mb_stream_request = value; -+ -+ return count; ++#define MB_PROC_VALUE_WRITE(name) \ ++static int ext3_mb_write_##name(struct file *file, \ ++ const char __user *buf, unsigned long cnt, void *data) \ ++{ \ ++ struct ext3_sb_info *sbi = data; \ ++ char str[32]; \ ++ long value; \ ++ if (cnt >= sizeof(str)) \ ++ return -EINVAL; \ ++ if (copy_from_user(str, buf, cnt)) \ ++ return -EFAULT; \ ++ value = simple_strtol(str, NULL, 0); \ ++ if (value <= 0) \ ++ return -ERANGE; \ ++ sbi->s_mb_##name = value; \ ++ return cnt; \ +} + ++MB_PROC_VALUE_READ(stats); ++MB_PROC_VALUE_WRITE(stats); ++MB_PROC_VALUE_READ(max_to_scan); ++MB_PROC_VALUE_WRITE(max_to_scan); ++MB_PROC_VALUE_READ(min_to_scan); ++MB_PROC_VALUE_WRITE(min_to_scan); ++MB_PROC_VALUE_READ(order2_reqs); ++MB_PROC_VALUE_WRITE(order2_reqs); ++MB_PROC_VALUE_READ(small_req); ++MB_PROC_VALUE_WRITE(small_req); ++MB_PROC_VALUE_READ(large_req); ++MB_PROC_VALUE_WRITE(large_req); ++MB_PROC_VALUE_READ(group_prealloc); ++MB_PROC_VALUE_WRITE(group_prealloc); ++ ++#define MB_PROC_HANDLER(name, var) \ ++do { \ ++ proc = create_proc_entry(name, mode, sbi->s_dev_proc); \ ++ if (proc == NULL) { \ ++ printk(KERN_ERR "EXT3-fs: can't to create %s\n", name); \ ++ goto err_out; \ ++ } \ ++ proc->data = sbi; \ ++ proc->read_proc = ext3_mb_read_##var ; \ ++ proc->write_proc = ext3_mb_write_##var; \ ++} while (0) ++ +int ext3_mb_init_per_dev_proc(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; + struct proc_dir_entry *proc; -+ char *name; -+ -+ name = EXT3_MB_STATS_NAME; -+ proc = create_proc_entry(name, mode, sbi->s_dev_proc); -+ if (proc == NULL) -+ goto err_out; -+ proc->data = sbi; -+ proc->read_proc = ext3_mb_stats_read; -+ proc->write_proc = ext3_mb_stats_write; -+ -+ name = EXT3_MB_MAX_TO_SCAN_NAME; -+ proc = create_proc_entry(name, mode, sbi->s_dev_proc); -+ if (proc == NULL) -+ goto err_out; -+ proc->data = sbi; -+ proc->read_proc = ext3_mb_max_to_scan_read; -+ proc->write_proc = ext3_mb_max_to_scan_write; -+ -+ name = EXT3_MB_MIN_TO_SCAN_NAME; -+ proc = create_proc_entry(name, mode, sbi->s_dev_proc); -+ if (proc == NULL) -+ goto err_out; -+ proc->data = sbi; -+ proc->read_proc = ext3_mb_min_to_scan_read; -+ proc->write_proc = ext3_mb_min_to_scan_write; -+ -+ name = EXT3_MB_ORDER2_REQ; -+ proc = create_proc_entry(name, mode, sbi->s_dev_proc); -+ if (proc == NULL) -+ goto err_out; -+ proc->data = sbi; -+ proc->read_proc = ext3_mb_order2_req_read; -+ proc->write_proc = ext3_mb_order2_req_write; -+ -+ name = EXT3_MB_STREAM_REQ; -+ proc = create_proc_entry(name, mode, sbi->s_dev_proc); -+ if (proc == NULL) -+ goto err_out; -+ proc->data = sbi; -+ proc->read_proc = ext3_mb_stream_req_read; -+ proc->write_proc = ext3_mb_stream_req_write; ++ ++ MB_PROC_HANDLER(EXT3_MB_STATS_NAME, stats); ++ MB_PROC_HANDLER(EXT3_MB_MAX_TO_SCAN_NAME, max_to_scan); ++ MB_PROC_HANDLER(EXT3_MB_MIN_TO_SCAN_NAME, min_to_scan); ++ MB_PROC_HANDLER(EXT3_MB_ORDER2_REQ, order2_reqs); ++ MB_PROC_HANDLER(EXT3_MB_SMALL_REQ, small_req); ++ MB_PROC_HANDLER(EXT3_MB_LARGE_REQ, large_req); ++ MB_PROC_HANDLER(EXT3_MB_PREALLOC_TABLE, prealloc_table); ++ MB_PROC_HANDLER(EXT3_MB_GROUP_PREALLOC, group_prealloc); + + return 0; + +err_out: -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", name); -+ remove_proc_entry(EXT3_MB_STREAM_REQ, sbi->s_dev_proc); ++ remove_proc_entry(EXT3_MB_GROUP_PREALLOC, sbi->s_dev_proc); ++ remove_proc_entry(EXT3_MB_PREALLOC_TABLE, sbi->s_dev_proc); ++ remove_proc_entry(EXT3_MB_LARGE_REQ, sbi->s_dev_proc); ++ remove_proc_entry(EXT3_MB_SMALL_REQ, sbi->s_dev_proc); + remove_proc_entry(EXT3_MB_ORDER2_REQ, sbi->s_dev_proc); + remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, sbi->s_dev_proc); + remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, sbi->s_dev_proc); @@ -3111,7 +3169,10 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + if (sbi->s_dev_proc == NULL) + return -EINVAL; + -+ remove_proc_entry(EXT3_MB_STREAM_REQ, sbi->s_dev_proc); ++ remove_proc_entry(EXT3_MB_GROUP_PREALLOC, sbi->s_dev_proc); ++ remove_proc_entry(EXT3_MB_PREALLOC_TABLE, sbi->s_dev_proc); ++ remove_proc_entry(EXT3_MB_SMALL_REQ, sbi->s_dev_proc); ++ remove_proc_entry(EXT3_MB_LARGE_REQ, sbi->s_dev_proc); + remove_proc_entry(EXT3_MB_ORDER2_REQ, sbi->s_dev_proc); + remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, sbi->s_dev_proc); + remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, sbi->s_dev_proc); @@ -3193,6 +3254,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + ext3_error(sb, __FUNCTION__, + "Allocating block in system zone - block = %lu", + (unsigned long) block); ++ ext3_lock_group(sb, ac->ac_b_ex.fe_group); ++ spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); +#ifdef AGGRESSIVE_CHECK + { + int i; @@ -3202,15 +3265,15 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + } + } +#endif -+ mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data, ++ mb_set_bits(NULL, bitmap_bh->b_data, + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); + -+ spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); + gdp->bg_free_blocks_count = + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + - ac->ac_b_ex.fe_len); + spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); + percpu_counter_mod(&sbi->s_freeblocks_counter, - ac->ac_b_ex.fe_len); ++ ext3_unlock_group(sb, ac->ac_b_ex.fe_group); + + err = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (err) @@ -3233,10 +3296,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + struct ext3_locality_group *lg = ac->ac_lg; + + BUG_ON(lg == NULL); -+ if (EXT3_SB(sb)->s_stripe) -+ ac->ac_g_ex.fe_len = EXT3_SB(sb)->s_stripe; -+ else -+ ac->ac_g_ex.fe_len = (1024 * 1024) >> sb->s_blocksize_bits; ++ ac->ac_g_ex.fe_len = EXT3_SB(sb)->s_mb_group_prealloc; + + mb_debug("#%u: goal %u blocks for locality group\n", + current->pid, ac->ac_g_ex.fe_len); @@ -3250,9 +3310,10 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + struct ext3_allocation_request *ar) +{ + struct ext3_inode_info *ei = EXT3_I(ac->ac_inode); ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); + loff_t start, end, size, orig_size, orig_start; + struct list_head *cur; -+ int bsbits, max; ++ int bsbits, i, wind; + + /* do normalize only data requests, metadata requests + do not need preallocation */ @@ -3279,51 +3340,36 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + size = size << bsbits; + if (size < i_size_read(ac->ac_inode)) + size = i_size_read(ac->ac_inode); ++ size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits; + -+ /* max available blocks in a free group */ -+ max = EXT3_BLOCKS_PER_GROUP(ac->ac_sb) - 1 - 1 -+ - EXT3_SB(ac->ac_sb)->s_itb_per_group; -+ -+#define NRL_CHECK_SIZE(req,size,max,bits) \ -+ (req <= (size) || max <= ((size) >> bits)) -+ -+ /* first, try to predict filesize */ -+ /* XXX: should this table be tunable? */ + start = 0; -+ if (size <= 16 * 1024) { -+ size = 16 * 1024; -+ } else if (size <= 32 * 1024) { -+ size = 32 * 1024; -+ } else if (size <= 64 * 1024) { -+ size = 64 * 1024; -+ } else if (size <= 128 * 1024) { -+ size = 128 * 1024; -+ } else if (size <= 256 * 1024) { -+ size = 256 * 1024; -+ } else if (size <= 512 * 1024) { -+ size = 512 * 1024; -+ } else if (size <= 1024 * 1024) { -+ size = 1024 * 1024; -+ } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, bsbits)) { -+ start = ac->ac_o_ex.fe_logical << bsbits; -+ start = (start / (1024 * 1024)) * (1024 * 1024); -+ size = 1024 * 1024; -+ } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, bsbits)) { -+ start = ac->ac_o_ex.fe_logical << bsbits; -+ start = (start / (4 * (1024 * 1024))) * 4 * (1024 * 1024); -+ size = 4 * 1024 * 1024; -+ } else if(NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,(8<<20)>>bsbits,max,bsbits)){ -+ start = ac->ac_o_ex.fe_logical; -+ start = start << bsbits; -+ start = (start / (8 * (1024 * 1024))) * 8 * (1024 * 1024); -+ size = 8 * 1024 * 1024; -+ } else { -+ start = ac->ac_o_ex.fe_logical; -+ start = start << bsbits; -+ size = ac->ac_o_ex.fe_len << bsbits; ++ wind = 0; ++ ++ /* let's choose preallocation window depending on file size */ ++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) { ++ if (size <= sbi->s_mb_prealloc_table[i]) { ++ wind = sbi->s_mb_prealloc_table[i]; ++ break; ++ } + } -+ orig_size = size = size >> bsbits; -+ orig_start = start = start >> bsbits; ++ size = wind; ++ ++ if (wind == 0) { ++ __u64 tstart, tend; ++ /* file is quite large, we now preallocate with ++ * the biggest configured window with regart to ++ * logical offset */ ++ wind = sbi->s_mb_prealloc_table[i - 1]; ++ tstart = ac->ac_o_ex.fe_logical; ++ do_div(tstart, wind); ++ start = tstart * wind; ++ tend = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len - 1; ++ do_div(tend, wind); ++ tend = tend * wind + wind; ++ size = tend - start; ++ } ++ orig_size = size; ++ orig_start = start; + + /* don't cover already allocated blocks in selected range */ + if (ar->pleft && start <= ar->lleft) { @@ -3407,16 +3453,28 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + start > ac->ac_o_ex.fe_logical); + + /* now prepare goal request */ -+ BUG_ON(size <= 0 || size >= EXT3_BLOCKS_PER_GROUP(ac->ac_sb)); -+ if (size < ac->ac_o_ex.fe_len) { -+ /* XXX: don't normalize tails? */ -+ } + -+ /* XXX: is it better to align blocks WRT to logical placement -+ * or satisfy big request as is */ ++ /* XXX: is it better to align blocks WRT to logical ++ * placement or satisfy big request as is */ + ac->ac_g_ex.fe_logical = start; + ac->ac_g_ex.fe_len = size; + ++ /* define goal start in order to merge */ ++ if (ar->pright && (ar->lright == (start + size))) { ++ /* merge to the right */ ++ ext3_get_group_no_and_offset(ac->ac_sb, ar->pright - size, ++ &ac->ac_f_ex.fe_group, ++ &ac->ac_f_ex.fe_start); ++ ac->ac_flags |= EXT3_MB_HINT_TRY_GOAL; ++ } ++ if (ar->pleft && (ar->lleft + 1 == start)) { ++ /* merge to the left */ ++ ext3_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, ++ &ac->ac_f_ex.fe_group, ++ &ac->ac_f_ex.fe_start); ++ ac->ac_flags |= EXT3_MB_HINT_TRY_GOAL; ++ } ++ + mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size, + (unsigned) orig_size, (unsigned) start); +} @@ -3557,17 +3615,62 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +} + +/* ++ * check free blocks in bitmap match free block in group descriptor ++ * do this before taking preallocated blocks into account to be able ++ * to detect on-disk corruptions ++ */ ++int ext3_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap, ++ struct ext3_group_desc *gdp, int group) ++{ ++ unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); ++ unsigned short i, first, free = 0; ++ ++ spin_lock(sb_bgl_lock(EXT3_SB(sb), group)); ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ ++ while (i < max) { ++ first = i; ++ i = ext2_find_next_le_bit(bitmap, max, i); ++ if (i > max) ++ i = max; ++ free += i - first; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ ++ if (free != le16_to_cpu(gdp->bg_free_blocks_count)) { ++ spin_unlock(sb_bgl_lock(EXT3_SB(sb), group)); ++ ext3_error(sb, __FUNCTION__, "on-disk bitmap for group %d" ++ "corrupted: %u blocks free in bitmap, %u - in gd\n", ++ group, free, le16_to_cpu(gdp->bg_free_blocks_count)); ++ return -EIO; ++ } ++ spin_unlock(sb_bgl_lock(EXT3_SB(sb), group)); ++ return 0; ++} ++ ++/* + * the function goes through all preallocation in this group and marks them + * used in in-core bitmap. buddy must be generated from this bitmap + */ -+void ext3_mb_generate_from_pa(struct super_block *sb, void *bitmap, int group) ++int ext3_mb_generate_from_pa(struct super_block *sb, void *bitmap, int group) +{ + struct ext3_group_info *grp = EXT3_GROUP_INFO(sb, group); + struct ext3_prealloc_space *pa; ++ struct ext3_group_desc *gdp; + struct list_head *cur; + unsigned long groupnr; + unsigned long start; -+ int preallocated = 0, count = 0, len; ++ int preallocated = 0, count = 0, len, skip = 0, err; ++ ++ gdp = ext3_get_group_desc (sb, group, NULL); ++ if (gdp == NULL) ++ return -EIO; ++ ++ /* before applying preallocations, check bitmap consistency */ ++ err = ext3_mb_check_ondisk_bitmap(sb, bitmap, gdp, group); ++ if (err) ++ return err; + + /* all form of preallocation discards first load group, + * so the only competing code is preallocation use. @@ -3583,14 +3686,23 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + ext3_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &start); + len = pa->pa_len; + spin_unlock(&pa->pa_lock); -+ if (unlikely(len == 0)) ++ if (unlikely(len == 0)) { ++ skip++; + continue; ++ } + BUG_ON(groupnr != group && len != 0); + mb_set_bits(sb_bgl_lock(EXT3_SB(sb), group), bitmap, start,len); + preallocated += len; + count++; + } ++ if (count + skip != grp->bb_prealloc_nr) { ++ ext3_error(sb, __FUNCTION__, "lost preallocations: " ++ "count %d, bb_prealloc_nr %lu, skip %d\n", ++ count, grp->bb_prealloc_nr, skip); ++ return -EIO; ++ } + mb_debug("prellocated %u for group %u\n", preallocated, group); ++ return 0; +} + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,5) @@ -3623,7 +3735,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + + /* in this short window concurrent discard can set pa_deleted */ + spin_lock(&pa->pa_lock); -+ if (pa->pa_deleted == 0) { ++ if (pa->pa_deleted == 1) { + spin_unlock(&pa->pa_lock); + return; + } @@ -3650,6 +3762,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + */ + ext3_lock_group(sb, grp); + list_del_rcu(&pa->pa_group_list); ++ EXT3_GROUP_INFO(sb, grp)->bb_prealloc_nr--; + ext3_unlock_group(sb, grp); + + spin_lock(pa->pa_obj_lock); @@ -3734,6 +3847,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + + ext3_lock_group(sb, ac->ac_b_ex.fe_group); + list_add_rcu(&pa->pa_group_list, &grp->bb_prealloc_list); ++ grp->bb_prealloc_nr++; + ext3_unlock_group(sb, ac->ac_b_ex.fe_group); + + spin_lock(pa->pa_obj_lock); @@ -3791,6 +3905,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + + ext3_lock_group(sb, ac->ac_b_ex.fe_group); + list_add_rcu(&pa->pa_group_list, &grp->bb_prealloc_list); ++ grp->bb_prealloc_nr++; + ext3_unlock_group(sb, ac->ac_b_ex.fe_group); + + spin_lock(pa->pa_obj_lock); @@ -3838,6 +3953,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + ac.ac_sb = sb; + ac.ac_inode = pa->pa_inode; + ac.ac_op = EXT3_MB_HISTORY_DISCARD; ++ ac.ac_o_ex.fe_len = 1; + + while (bit < end) { + bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); @@ -3933,7 +4049,10 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + } + + err = ext3_mb_load_buddy(sb, group, &e3b); -+ BUG_ON(err != 0); /* error handling here */ ++ if (err) { ++ brelse(bitmap_bh); ++ return err; ++ } + + if (needed == 0) + needed = EXT3_BLOCKS_PER_GROUP(sb) + 1; @@ -3964,6 +4083,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + + spin_unlock(&pa->pa_lock); + ++ BUG_ON(grp->bb_prealloc_nr == 0); ++ grp->bb_prealloc_nr--; + list_del_rcu(&pa->pa_group_list); + list_add(&pa->u.pa_tmp_list, &list); + } @@ -4080,11 +4201,14 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + ext3_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + + err = ext3_mb_load_buddy(sb, group, &e3b); -+ BUG_ON(err != 0); /* error handling here */ ++ if (err) ++ return; + + bitmap_bh = read_block_bitmap(sb, group); + + ext3_lock_group(sb, group); ++ BUG_ON(e3b.bd_info->bb_prealloc_nr == 0); ++ e3b.bd_info->bb_prealloc_nr--; + list_del_rcu(&pa->pa_group_list); + + /* can be NULL due to IO error, at worst @@ -4161,24 +4285,35 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +void ext3_mb_group_or_file(struct ext3_allocation_context *ac) +{ + struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); -+ int bsbits = ac->ac_sb->s_blocksize_bits; -+ loff_t size, isize; ++ loff_t size; ++ int bsbits; + + if (!(ac->ac_flags & EXT3_MB_HINT_DATA)) + return; + -+ size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; -+ isize = i_size_read(ac->ac_inode) >> bsbits; -+ if (size < isize) -+ size = isize; -+ -+ /* don't use group allocation for large files */ -+ if (size >= sbi->s_mb_stream_request) ++ if (ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) + return; + + if (unlikely(ac->ac_flags & EXT3_MB_HINT_GOAL_ONLY)) + return; + ++ /* request is so large that we don't care about ++ * streaming - it overweights any possible seek */ ++ if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req) ++ return; ++ ++ bsbits = ac->ac_sb->s_blocksize_bits; ++ ++ size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; ++ size = size << bsbits; ++ if (size < i_size_read(ac->ac_inode)) ++ size = i_size_read(ac->ac_inode); ++ size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits; ++ ++ /* don't use group allocation for large files */ ++ if (size >= sbi->s_mb_large_req) ++ return; ++ + BUG_ON(ac->ac_lg != NULL); + ac->ac_lg = &sbi->s_locality_groups[smp_processor_id()]; + @@ -4446,7 +4581,6 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + BUG_ON(e3b->bd_bitmap_page == NULL); + BUG_ON(e3b->bd_buddy_page == NULL); + -+ ext3_lock_group(sb, group); + for (i = 0; i < count; i++) { + md = db->bb_md_cur; + if (md && db->bb_tid != handle->h_transaction->t_tid) { @@ -4491,7 +4625,6 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + db->bb_md_cur = NULL; + } + } -+ ext3_unlock_group(sb, group); + return 0; +} + @@ -4584,6 +4717,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + if (err) + goto error_return; + ++ ext3_lock_group(sb, block_group); ++ spin_lock(sb_bgl_lock(sbi, block_group)); +#ifdef AGGRESSIVE_CHECK + { + int i; @@ -4591,35 +4726,31 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); + } +#endif -+ mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, bit, -+ count); -+ -+ /* We dirtied the bitmap block */ -+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); -+ err = ext3_journal_dirty_metadata(handle, bitmap_bh); -+ -+ ac.ac_b_ex.fe_group = block_group; -+ ac.ac_b_ex.fe_start = bit; -+ ac.ac_b_ex.fe_len = count; -+ ext3_mb_store_history(&ac); ++ mb_clear_bits(NULL, bitmap_bh->b_data, bit, count); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); ++ spin_unlock(sb_bgl_lock(sbi, block_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, count); + + if (metadata) { + /* blocks being freed are metadata. these blocks shouldn't + * be used until this transaction is committed */ + ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); + } else { -+ ext3_lock_group(sb, block_group); + err = mb_free_blocks(inode, &e3b, bit, count); + ext3_mb_return_to_preallocation(inode, &e3b, block, count); -+ ext3_unlock_group(sb, block_group); + BUG_ON(err != 0); + } ++ ext3_unlock_group(sb, block_group); + -+ spin_lock(sb_bgl_lock(sbi, block_group)); -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); -+ spin_unlock(sb_bgl_lock(sbi, block_group)); -+ percpu_counter_mod(&sbi->s_freeblocks_counter, count); ++ ac.ac_b_ex.fe_group = block_group; ++ ac.ac_b_ex.fe_start = bit; ++ ac.ac_b_ex.fe_len = count; ++ ext3_mb_store_history(&ac); ++ ++ /* We dirtied the bitmap block */ ++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); + + ext3_mb_release_desc(&e3b); + @@ -4641,3 +4772,6 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + ext3_std_error(sb, err); + return; +} ++ ++EXPORT_SYMBOL(ext3_free_blocks); ++EXPORT_SYMBOL(ext3_mb_discard_inode_preallocations);