From 1f152c64f52b9ceba86377f2fe3146885e45c3f3 Mon Sep 17 00:00:00 2001
From: yangsheng
Date: Thu, 6 Dec 2007 10:57:38 +0000
Subject: [PATCH] ldiskfs: remove ext3-mballoc3-core-2.6.22-vanilla.patch

---
 .../ext3-mballoc3-core-2.6.22-vanilla.patch | 4590 --------------------
 1 file changed, 4590 deletions(-)
 delete mode 100644 ldiskfs/kernel_patches/patches/ext3-mballoc3-core-2.6.22-vanilla.patch

diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc3-core-2.6.22-vanilla.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc3-core-2.6.22-vanilla.patch
deleted file mode 100644
index 93e3c94..0000000
--- a/ldiskfs/kernel_patches/patches/ext3-mballoc3-core-2.6.22-vanilla.patch
+++ /dev/null
@@ -1,4590 +0,0 @@
-Index: linux-2.6.9-full/include/linux/ext3_fs.h
-===================================================================
---- linux-2.6.9-full.orig/include/linux/ext3_fs.h 2007-06-08 23:44:08.000000000 +0400
-+++ linux-2.6.9-full/include/linux/ext3_fs.h 2007-10-17 22:25:01.000000000 +0400
-@@ -57,6 +57,30 @@ struct statfs;
- #define ext3_debug(f, a...) do {} while (0)
- #endif
- 
-+#define EXT3_MULTIBLOCK_ALLOCATOR 1
-+
-+#define EXT3_MB_HINT_MERGE 1 /* prefer goal again. length */
-+#define EXT3_MB_HINT_RESERVED 2 /* blocks already reserved */
-+#define EXT3_MB_HINT_METADATA 4 /* metadata is being allocated */
-+#define EXT3_MB_HINT_FIRST 8 /* first blocks in the file */
-+#define EXT3_MB_HINT_BEST 16 /* search for the best chunk */
-+#define EXT3_MB_HINT_DATA 32 /* data is being allocated */
-+#define EXT3_MB_HINT_NOPREALLOC 64 /* don't preallocate (for tails) */
-+#define EXT3_MB_HINT_GROUP_ALLOC 128 /* allocate for locality group */
-+#define EXT3_MB_HINT_GOAL_ONLY 256 /* allocate goal blocks or none */
-+
-+struct ext3_allocation_request {
-+ struct inode *inode; /* target inode for block we're allocating */
-+ unsigned long logical; /* logical block in target inode */
-+ unsigned long goal; /* phys. target (a hint) */
-+ unsigned long lleft; /* the closest logical allocated block to the left */
-+ unsigned long pleft; /* phys. block for ^^^ */
-+ unsigned long lright; /* the closest logical allocated block to the right */
-+ unsigned long pright; /* phys. block for ^^^ */
-+ unsigned long len; /* how many blocks we want to allocate */
-+ unsigned long flags; /* flags.
see above EXT3_MB_HINT_* */ -+}; -+ - /* - * Special inodes numbers - */ -@@ -387,6 +411,14 @@ struct ext3_inode { - #define ext3_find_first_zero_bit ext2_find_first_zero_bit - #define ext3_find_next_zero_bit ext2_find_next_zero_bit - -+#ifndef ext2_find_next_le_bit -+#ifdef __LITTLE_ENDIAN -+#define ext2_find_next_le_bit(addr, size, off) find_next_bit((addr), (size), (off)) -+#else -+#error "mballoc needs a patch for big-endian systems - CFS bug 10634" -+#endif /* __LITTLE_ENDIAN */ -+#endif /* !ext2_find_next_le_bit */ -+ - /* - * Maximal mount counts between two filesystem checks - */ -@@ -763,6 +795,20 @@ extern unsigned long ext3_count_dirs (st - extern void ext3_check_inodes_bitmap (struct super_block *); - extern unsigned long ext3_count_free (struct buffer_head *, unsigned); - -+/* mballoc.c */ -+extern long ext3_mb_stats; -+extern long ext3_mb_max_to_scan; -+extern int ext3_mb_init(struct super_block *, int); -+extern int ext3_mb_release(struct super_block *); -+extern unsigned long ext3_mb_new_blocks(handle_t *, struct ext3_allocation_request *, int *); -+extern int ext3_mb_reserve_blocks(struct super_block *, int); -+extern void ext3_mb_release_blocks(struct super_block *, int); -+extern void ext3_mb_release_blocks(struct super_block *, int); -+extern void ext3_mb_discard_inode_preallocations(struct inode *); -+extern int __init init_ext3_proc(void); -+extern void exit_ext3_proc(void); -+extern void ext3_mb_free_blocks(handle_t *, struct inode *, unsigned long, unsigned long, int, int *); -+ - - /* inode.c */ - extern int ext3_block_truncate_page(handle_t *, struct page *, -Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-2.6.9-full.orig/include/linux/ext3_fs_sb.h 2007-06-08 23:44:07.000000000 +0400 -+++ linux-2.6.9-full/include/linux/ext3_fs_sb.h 2007-10-17 22:25:01.000000000 +0400 -@@ -81,6 +81,61 @@ struct ext3_sb_info { - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ - int s_jquota_fmt; /* Format of quota to use */ - #endif -+ -+ /* for buddy allocator */ -+ struct ext3_group_info ***s_group_info; -+ struct inode *s_buddy_cache; -+ long s_blocks_reserved; -+ spinlock_t s_reserve_lock; -+ struct list_head s_active_transaction; -+ struct list_head s_closed_transaction; -+ struct list_head s_committed_transaction; -+ spinlock_t s_md_lock; -+ tid_t s_last_transaction; -+ unsigned short *s_mb_offsets, *s_mb_maxs; -+ -+ /* tunables */ -+ unsigned long s_mb_factor; -+ unsigned long s_stripe; -+ unsigned long s_mb_stream_request; -+ unsigned long s_mb_max_to_scan; -+ unsigned long s_mb_min_to_scan; -+ unsigned long s_mb_max_groups_to_scan; -+ unsigned long s_mb_stats; -+ unsigned long s_mb_order2_reqs; -+ -+ /* history to debug policy */ -+ struct ext3_mb_history *s_mb_history; -+ int s_mb_history_cur; -+ int s_mb_history_max; -+ int s_mb_history_num; -+ struct proc_dir_entry *s_mb_proc; -+ spinlock_t s_mb_history_lock; -+ int s_mb_history_filter; -+ -+ /* stats for buddy allocator */ -+ spinlock_t s_mb_pa_lock; -+ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ -+ atomic_t s_bal_success; /* we found long enough chunks */ -+ atomic_t s_bal_allocated; /* in blocks */ -+ atomic_t s_bal_ex_scanned; /* total extents scanned */ -+ atomic_t s_bal_goals; /* goal hits */ -+ atomic_t s_bal_breaks; /* too long searches */ -+ atomic_t s_bal_2orders; /* 2^order hits */ -+ spinlock_t s_bal_lock; -+ unsigned long s_mb_buddies_generated; -+ unsigned long long 
s_mb_generation_time;
-+ atomic_t s_mb_lost_chunks;
-+ atomic_t s_mb_preallocated;
-+ atomic_t s_mb_discarded;
-+
-+ /* locality groups */
-+ struct ext3_locality_group *s_locality_groups;
-+
- };
- 
-+#define EXT3_GROUP_INFO(sb, group) \
-+ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \
-+ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)]
-+
- #endif /* _LINUX_EXT3_FS_SB */
-Index: linux-2.6.9-full/fs/ext3/super.c
-===================================================================
---- linux-2.6.9-full.orig/fs/ext3/super.c 2007-06-08 23:44:08.000000000 +0400
-+++ linux-2.6.9-full/fs/ext3/super.c 2007-10-17 22:26:27.000000000 +0400
-@@ -394,6 +394,7 @@ void ext3_put_super (struct super_block
- 	struct ext3_super_block *es = sbi->s_es;
- 	int i;
- 
-+	ext3_mb_release(sb);
- 	ext3_ext_release(sb);
- 	ext3_xattr_put_super(sb);
- 	journal_destroy(sbi->s_journal);
-@@ -463,6 +464,8 @@ static struct inode *ext3_alloc_inode(st
- 	ei->vfs_inode.i_version = 1;
- 
- 	memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent));
-+	INIT_LIST_HEAD(&ei->i_prealloc_list);
-+	spin_lock_init(&ei->i_prealloc_lock);
- 	return &ei->vfs_inode;
- }
- 
-@@ -2576,7 +2579,13 @@ static struct file_system_type ext3_fs_t
- 
- static int __init init_ext3_fs(void)
- {
--	int err = init_ext3_xattr();
-+	int err;
-+
-+	err = init_ext3_proc();
-+	if (err)
-+		return err;
-+
-+	err = init_ext3_xattr();
- 	if (err)
- 		return err;
- 	err = init_inodecache();
-@@ -2598,6 +2607,7 @@ static void __exit exit_ext3_fs(void)
- 	unregister_filesystem(&ext3_fs_type);
- 	destroy_inodecache();
- 	exit_ext3_xattr();
-+	exit_ext3_proc();
- }
- 
- int ext3_prep_san_write(struct inode *inode, long *blocks,
-Index: linux-2.6.9-full/fs/ext3/mballoc.c
-===================================================================
---- linux-2.6.9-full.orig/fs/ext3/mballoc.c 2007-10-17 21:59:51.072534980 +0400
-+++ linux-2.6.9-full/fs/ext3/mballoc.c 2007-10-17 23:09:22.000000000 +0400
-@@ -0,0 +1,4404 @@
-+/*
-+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
-+ * Written by Alex Tomas <alex@clusterfs.com>
-+ *
-+ * This program is free software; you can redistribute it and/or modify
-+ * it under the terms of the GNU General Public License version 2 as
-+ * published by the Free Software Foundation.
-+ *
-+ * This program is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-+ * GNU General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU General Public License
-+ * along with this program; if not, write to the Free Software
-+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-+ */
-+
-+
-+/*
-+ * mballoc.c contains the multiblocks allocation routines
-+ */
-+
-+#include <linux/time.h>
-+#include <linux/fs.h>
-+#include <linux/namei.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/jbd.h>
-+#include <linux/ext3_fs.h>
-+#include <linux/quotaops.h>
-+#include <linux/buffer_head.h>
-+#include <linux/module.h>
-+#include <linux/swap.h>
-+#include <linux/proc_fs.h>
-+#include <linux/pagemap.h>
-+#include <linux/seq_file.h>
-+#include <linux/version.h>
-+
-+/*
-+ * MUSTDO:
-+ *   - test ext3_ext_search_left() and ext3_ext_search_right()
-+ *   - search for metadata in few groups
-+ *
-+ * TODO v4:
-+ *   - normalization should take into account whether file is still open
-+ *   - discard preallocations if no free space left (policy?)
-+ *   - don't normalize tails
-+ *   - quota
-+ *   - reservation for superuser
-+ *
-+ * TODO v3:
-+ *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
-+ *   - track min/max extents in each group for better group selection
-+ *   - mb_mark_used() may allocate chunk right after splitting buddy
-+ *   - tree of groups sorted by number of free blocks
-+ *   - error handling
-+ */
-+
-+/*
-+ * mballoc operates on the following data:
-+ *  - on-disk bitmap
-+ *  - in-core buddy (actually includes buddy and bitmap)
-+ *  - preallocation descriptors (PAs)
-+ *
-+ * there are two types of preallocations:
-+ *  - inode
-+ *    assigned to a specific inode and can be used for this inode only.
-+ *    it describes part of inode's space preallocated to specific
-+ *    physical blocks. any block from that preallocation can be used
-+ *    independently. the descriptor just tracks number of blocks left
-+ *    unused. so, before taking some block from descriptor, one must
-+ *    make sure the corresponding logical block isn't allocated yet. this
-+ *    also means that freeing any block within descriptor's range
-+ *    must discard all preallocated blocks.
-+ *  - locality group
-+ *    assigned to a specific locality group which does not translate to
-+ *    permanent set of inodes: inode can join and leave group. space
-+ *    from this type of preallocation can be used for any inode. thus
-+ *    it's consumed from the beginning to the end.
-+ *
-+ * relation between them can be expressed as:
-+ *    in-core buddy = on-disk bitmap + preallocation descriptors
-+ *
-+ * this means the blocks mballoc considers used are:
-+ *  - allocated blocks (persistent)
-+ *  - preallocated blocks (non-persistent)
-+ *
-+ * consistency in mballoc world means that at any time a block is either
-+ * free or used in ALL structures. notice: "any time" should not be read
-+ * literally -- time is discrete and delimited by locks.
-+ *
-+ * to keep it simple, we don't use block numbers, instead we count number of
-+ * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
-+ *
-+ * all operations can be expressed as:
-+ *  - init buddy:			buddy = on-disk + PAs
-+ *  - new PA:				buddy += N; PA = N
-+ *  - use inode PA:			on-disk += N; PA -= N
-+ *  - discard inode PA:			buddy -= on-disk - PA; PA = 0
-+ *  - use locality group PA:		on-disk += N; PA -= N
-+ *  - discard locality group PA:	buddy -= PA; PA = 0
-+ *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
-+ *        is used in real operation because we can't know actual used
-+ *        bits from PA, only from on-disk bitmap
-+ *
-+ * if we follow this strict logic, then all operations above should be atomic.
-+ * given some of them can block, we'd have to use something like semaphores,
-+ * killing performance on high-end SMP hardware. let's try to relax it using
-+ * the following knowledge:
-+ *  1) if buddy is referenced, it's already initialized
-+ *  2) while block is used in buddy and the buddy is referenced,
-+ *     nobody can re-allocate that block
-+ *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
-+ *     bit set and PA claims same block, it's OK. IOW, one can set bit in
-+ *     on-disk bitmap if buddy has same bit set or/and PA covers the
-+ *     corresponding block
-+ *
-+ * so, now we're building a concurrency table:
-+ *  - init buddy vs.
-+ * - new PA -+ * blocks for PA are allocated in the buddy, buddy must be referenced -+ * until PA is linked to allocation group to avoid concurrent buddy init -+ * - use inode PA -+ * we need to make sure that either on-disk bitmap or PA has uptodate data -+ * given (3) we care that PA-=N operation doesn't interfere with init -+ * - discard inode PA -+ * the simplest way would be to have buddy initialized by the discard -+ * - use locality group PA -+ * again PA-=N must be serialized with init -+ * - discard locality group PA -+ * the simplest way would be to have buddy initialized by the discard -+ * - new PA vs. -+ * - use inode PA -+ * i_truncate_mutex serializes them -+ * - discard inode PA -+ * discard process must wait until PA isn't used by another process -+ * - use locality group PA -+ * some mutex should serialize them -+ * - discard locality group PA -+ * discard process must wait until PA isn't used by another process -+ * - use inode PA -+ * - use inode PA -+ * i_truncate_mutex or another mutex should serializes them -+ * - discard inode PA -+ * discard process must wait until PA isn't used by another process -+ * - use locality group PA -+ * nothing wrong here -- they're different PAs covering different blocks -+ * - discard locality group PA -+ * discard process must wait until PA isn't used by another process -+ * -+ * now we're ready to make few consequences: -+ * - PA is referenced and while it is no discard is possible -+ * - PA is referenced until block isn't marked in on-disk bitmap -+ * - PA changes only after on-disk bitmap -+ * - discard must not compete with init. either init is done before -+ * any discard or they're serialized somehow -+ * - buddy init as sum of on-disk bitmap and PAs is done atomically -+ * -+ * a special case when we've used PA to emptiness. no need to modify buddy -+ * in this case, but we should care about concurrent init -+ * -+ */ -+ -+ /* -+ * Logic in few words: -+ * -+ * - allocation: -+ * load group -+ * find blocks -+ * mark bits in on-disk bitmap -+ * release group -+ * -+ * - use preallocation: -+ * find proper PA (per-inode or group) -+ * load group -+ * mark bits in on-disk bitmap -+ * release group -+ * release PA -+ * -+ * - free: -+ * load group -+ * mark bits in on-disk bitmap -+ * release group -+ * -+ * - discard preallocations in group: -+ * mark PAs deleted -+ * move them onto local list -+ * load on-disk bitmap -+ * load group -+ * remove PA from object (inode or locality group) -+ * mark free blocks in-core -+ * -+ * - discard inode's preallocations: -+ */ -+ -+/* -+ * Locking rules -+ * -+ * Locks: -+ * - bitlock on a group (group) -+ * - object (inode/locality) (object) -+ * - per-pa lock (pa) -+ * -+ * Paths: -+ * - new pa -+ * object -+ * group -+ * -+ * - find and use pa: -+ * pa -+ * -+ * - release consumed pa: -+ * pa -+ * group -+ * object -+ * -+ * - generate in-core bitmap: -+ * group -+ * pa -+ * -+ * - discard all for given object (inode, locality group): -+ * object -+ * pa -+ * group -+ * -+ * - discard all for given group: -+ * group -+ * pa -+ * group -+ * object -+ * -+ */ -+ -+/* -+ * with AGGRESSIVE_CHECK allocator runs consistency checks over -+ * structures. these checks slow things down a lot -+ */ -+#define AGGRESSIVE_CHECK__ -+ -+/* -+ * with DOUBLE_CHECK defined mballoc creates persistent in-core -+ * bitmaps, maintains and uses them to check for double allocations -+ */ -+#define DOUBLE_CHECK__ -+ -+/* -+ */ -+#define MB_DEBUG__ -+#ifdef MB_DEBUG -+#define mb_debug(fmt,a...) 
printk(fmt, ##a) -+#else -+#define mb_debug(fmt,a...) -+#endif -+ -+/* -+ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory -+ * and you can monitor it in /proc/fs/ext3//mb_history -+ */ -+#define EXT3_MB_HISTORY -+#define EXT3_MB_HISTORY_ALLOC 1 /* allocation */ -+#define EXT3_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ -+#define EXT3_MB_HISTORY_DISCARD 4 /* preallocation discarded */ -+#define EXT3_MB_HISTORY_FREE 8 /* free */ -+ -+#define EXT3_MB_HISTORY_DEFAULT (EXT3_MB_HISTORY_ALLOC | \ -+ EXT3_MB_HISTORY_PREALLOC | \ -+ EXT3_MB_HISTORY_DISCARD | \ -+ EXT3_MB_HISTORY_FREE) -+ -+/* -+ * How long mballoc can look for a best extent (in found extents) -+ */ -+#define MB_DEFAULT_MAX_TO_SCAN 200 -+ -+/* -+ * How long mballoc must look for a best extent -+ */ -+#define MB_DEFAULT_MIN_TO_SCAN 10 -+ -+/* -+ * How many groups mballoc will scan looking for the best chunk -+ */ -+#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5 -+ -+/* -+ * with 'ext3_mb_stats' allocator will collect stats that will be -+ * shown at umount. The collecting costs though! -+ */ -+#define MB_DEFAULT_STATS 1 -+ -+/* -+ * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served -+ * by the stream allocator, which purpose is to pack requests -+ * as close each to other as possible to produce smooth I/O traffic -+ */ -+#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ -+ -+/* -+ * for which requests use 2^N search using buddies -+ */ -+#define MB_DEFAULT_ORDER2_REQS 8 -+ -+/* -+ * default stripe size = 1MB -+ */ -+#define MB_DEFAULT_STRIPE 256 -+ -+static struct kmem_cache *ext3_pspace_cachep = NULL; -+ -+#ifdef EXT3_BB_MAX_BLOCKS -+#undef EXT3_BB_MAX_BLOCKS -+#endif -+#define EXT3_BB_MAX_BLOCKS 30 -+ -+struct ext3_free_metadata { -+ unsigned short group; -+ unsigned short num; -+ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; -+ struct list_head list; -+}; -+ -+struct ext3_group_info { -+ unsigned long bb_state; -+ unsigned long bb_tid; -+ struct ext3_free_metadata *bb_md_cur; -+ unsigned short bb_first_free; -+ unsigned short bb_free; -+ unsigned short bb_fragments; -+ struct list_head bb_prealloc_list; -+#ifdef DOUBLE_CHECK -+ void *bb_bitmap; -+#endif -+ unsigned short bb_counters[]; -+}; -+ -+#define EXT3_GROUP_INFO_NEED_INIT_BIT 0 -+#define EXT3_GROUP_INFO_LOCKED_BIT 1 -+ -+#define EXT3_MB_GRP_NEED_INIT(grp) \ -+ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state)) -+ -+ -+struct ext3_prealloc_space { -+ struct list_head pa_inode_list; -+ struct list_head pa_group_list; -+ union { -+ struct list_head pa_tmp_list; -+ struct rcu_head pa_rcu; -+ } u; -+ spinlock_t pa_lock; -+ atomic_t pa_count; -+ unsigned pa_deleted; -+ unsigned long pa_pstart; /* phys. block */ -+ unsigned long pa_lstart; /* log. 
block */ -+ unsigned short pa_len; /* len of preallocated chunk */ -+ unsigned short pa_free; /* how many blocks are free */ -+ unsigned short pa_linear; /* consumed in one direction -+ * strictly, for group prealloc */ -+ spinlock_t *pa_obj_lock; -+ struct inode *pa_inode; /* hack, for history only */ -+}; -+ -+ -+struct ext3_free_extent { -+ unsigned long fe_logical; -+ unsigned long fe_start; -+ unsigned long fe_group; -+ unsigned long fe_len; -+}; -+ -+/* -+ * Locality group: -+ * we try to group all related changes together -+ * so that writeback can flush/allocate them together as well -+ */ -+struct ext3_locality_group { -+ /* for allocator */ -+ struct semaphore lg_sem; /* to serialize allocates */ -+ struct list_head lg_prealloc_list;/* list of preallocations */ -+ spinlock_t lg_prealloc_lock; -+}; -+ -+struct ext3_allocation_context { -+ struct inode *ac_inode; -+ struct super_block *ac_sb; -+ -+ /* original request */ -+ struct ext3_free_extent ac_o_ex; -+ -+ /* goal request (after normalization) */ -+ struct ext3_free_extent ac_g_ex; -+ -+ /* the best found extent */ -+ struct ext3_free_extent ac_b_ex; -+ -+ /* copy of the bext found extent taken before preallocation efforts */ -+ struct ext3_free_extent ac_f_ex; -+ -+ /* number of iterations done. we have to track to limit searching */ -+ unsigned long ac_ex_scanned; -+ __u16 ac_groups_scanned; -+ __u16 ac_found; -+ __u16 ac_tail; -+ __u16 ac_buddy; -+ __u16 ac_flags; /* allocation hints */ -+ __u8 ac_status; -+ __u8 ac_criteria; -+ __u8 ac_repeats; -+ __u8 ac_2order; /* if request is to allocate 2^N blocks and -+ * N > 0, the field stores N, otherwise 0 */ -+ __u8 ac_op; /* operation, for history only */ -+ struct page *ac_bitmap_page; -+ struct page *ac_buddy_page; -+ struct ext3_prealloc_space *ac_pa; -+ struct ext3_locality_group *ac_lg; -+}; -+ -+#define AC_STATUS_CONTINUE 1 -+#define AC_STATUS_FOUND 2 -+#define AC_STATUS_BREAK 3 -+ -+struct ext3_mb_history { -+ struct ext3_free_extent orig; /* orig allocation */ -+ struct ext3_free_extent goal; /* goal allocation */ -+ struct ext3_free_extent result; /* result allocation */ -+ unsigned pid; -+ unsigned ino; -+ __u16 found; /* how many extents have been found */ -+ __u16 groups; /* how many groups have been scanned */ -+ __u16 tail; /* what tail broke some buddy */ -+ __u16 buddy; /* buddy the tail ^^^ broke */ -+ __u16 flags; -+ __u8 cr:3; /* which phase the result extent was found at */ -+ __u8 op:4; -+ __u8 merged:1; -+}; -+ -+struct ext3_buddy { -+ struct page *bd_buddy_page; -+ void *bd_buddy; -+ struct page *bd_bitmap_page; -+ void *bd_bitmap; -+ struct ext3_group_info *bd_info; -+ struct super_block *bd_sb; -+ __u16 bd_blkbits; -+ __u16 bd_group; -+}; -+#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) -+#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) -+ -+#ifndef EXT3_MB_HISTORY -+#define ext3_mb_store_history(ac) -+#else -+static void ext3_mb_store_history(struct ext3_allocation_context *ac); -+#endif -+ -+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) -+ -+static struct proc_dir_entry *proc_root_ext3; -+ -+int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); -+struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); -+unsigned long ext3_new_blocks_old(handle_t *handle, struct inode *inode, -+ unsigned long goal, unsigned long *count, int *errp); -+void ext3_mb_release_blocks(struct super_block *, int); -+void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); -+void 
ext3_mb_free_committed_blocks(struct super_block *); -+void ext3_mb_generate_from_pa(struct super_block *sb, void *bitmap, int group); -+void ext3_mb_free_consumed_preallocations(struct ext3_allocation_context *ac); -+void ext3_mb_return_to_preallocation(struct inode *inode, struct ext3_buddy *e3b, -+ sector_t block, int count); -+void ext3_mb_show_ac(struct ext3_allocation_context *ac); -+void ext3_mb_check_with_pa(struct ext3_buddy *e3b, int first, int count); -+void ext3_mb_put_pa(struct ext3_allocation_context *, struct super_block *, struct ext3_prealloc_space *pa); -+int ext3_mb_init_per_dev_proc(struct super_block *sb); -+int ext3_mb_destroy_per_dev_proc(struct super_block *sb); -+ -+/* -+ * Calculate the block group number and offset, given a block number -+ */ -+static void ext3_get_group_no_and_offset(struct super_block *sb, -+ unsigned long blocknr, -+ unsigned long *blockgrpp, -+ unsigned long *offsetp) -+{ -+ struct ext3_super_block *es = EXT3_SB(sb)->s_es; -+ unsigned long offset; -+ -+ blocknr = blocknr - le32_to_cpu(es->s_first_data_block); -+ offset = blocknr % EXT3_BLOCKS_PER_GROUP(sb); -+ blocknr = blocknr / EXT3_BLOCKS_PER_GROUP(sb); -+ if (offsetp) -+ *offsetp = offset; -+ if (blockgrpp) -+ *blockgrpp = blocknr; -+ -+} -+ -+static inline void -+ext3_lock_group(struct super_block *sb, int group) -+{ -+ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_GROUP_INFO(sb, group)->bb_state); -+} -+ -+static inline void -+ext3_unlock_group(struct super_block *sb, int group) -+{ -+ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_GROUP_INFO(sb, group)->bb_state); -+} -+ -+static inline int -+ext3_is_group_locked(struct super_block *sb, int group) -+{ -+ return bit_spin_is_locked(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_GROUP_INFO(sb, group)->bb_state); -+} -+ -+unsigned long ext3_grp_offs_to_block(struct super_block *sb, -+ struct ext3_free_extent *fex) -+{ -+ unsigned long block; -+ -+ block = (unsigned long) fex->fe_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + fex->fe_start -+ + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block); -+ return block; -+} -+ -+#if BITS_PER_LONG == 64 -+#define mb_correct_addr_and_bit(bit,addr) \ -+{ \ -+ bit += ((unsigned long) addr & 7UL) << 3; \ -+ addr = (void *) ((unsigned long) addr & ~7UL); \ -+} -+#elif BITS_PER_LONG == 32 -+#define mb_correct_addr_and_bit(bit,addr) \ -+{ \ -+ bit += ((unsigned long) addr & 3UL) << 3; \ -+ addr = (void *) ((unsigned long) addr & ~3UL); \ -+} -+#else -+#error "how many bits you are?!" 
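-+/*
-+ * Illustrative note, not part of the original patch: the
-+ * mb_correct_addr_and_bit() macros above fold any byte misalignment
-+ * of 'addr' into 'bit', so the ext2_*_bit() helpers below always see
-+ * a long-aligned pointer. E.g. on a 64-bit machine, addr = base + 3,
-+ * bit = 5 becomes addr = base, bit = 5 + 3 * 8 = 29 -- the same bit
-+ * in memory, just renumbered relative to the aligned base.
-+ */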
-+#endif -+ -+static inline int mb_test_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ return ext2_test_bit(bit, addr); -+} -+ -+static inline void mb_set_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_set_bit(bit, addr); -+} -+ -+static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_set_bit_atomic(lock, bit, addr); -+} -+ -+static inline void mb_clear_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_clear_bit(bit, addr); -+} -+ -+static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_clear_bit_atomic(lock, bit, addr); -+} -+ -+static inline int mb_find_next_zero_bit(void *addr, int max, int start) -+{ -+ int fix; -+#if BITS_PER_LONG == 64 -+ fix = ((unsigned long) addr & 7UL) << 3; -+ addr = (void *) ((unsigned long) addr & ~7UL); -+#elif BITS_PER_LONG == 32 -+ fix = ((unsigned long) addr & 3UL) << 3; -+ addr = (void *) ((unsigned long) addr & ~3UL); -+#else -+#error "how many bits you are?!" -+#endif -+ max += fix; -+ start += fix; -+ return ext2_find_next_zero_bit(addr, max, start) - fix; -+} -+ -+static inline int mb_find_next_bit(void *addr, int max, int start) -+{ -+ int fix; -+#if BITS_PER_LONG == 64 -+ fix = ((unsigned long) addr & 7UL) << 3; -+ addr = (void *) ((unsigned long) addr & ~7UL); -+#elif BITS_PER_LONG == 32 -+ fix = ((unsigned long) addr & 3UL) << 3; -+ addr = (void *) ((unsigned long) addr & ~3UL); -+#else -+#error "how many bits you are?!" -+#endif -+ max += fix; -+ start += fix; -+ -+#ifdef __BIG_ENDIAN -+#else -+ return find_next_bit(addr, max, start) - fix; -+#endif -+} -+ -+static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) -+{ -+ char *bb; -+ -+ BUG_ON(EXT3_MB_BITMAP(e3b) == EXT3_MB_BUDDY(e3b)); -+ BUG_ON(max == NULL); -+ -+ if (order > e3b->bd_blkbits + 1) { -+ *max = 0; -+ return NULL; -+ } -+ -+ /* at order 0 we see each particular block */ -+ *max = 1 << (e3b->bd_blkbits + 3); -+ if (order == 0) -+ return EXT3_MB_BITMAP(e3b); -+ -+ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order]; -+ *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order]; -+ -+ return bb; -+} -+ -+#ifdef DOUBLE_CHECK -+void mb_free_blocks_double(struct inode *inode, struct ext3_buddy *e3b, -+ int first, int count) -+{ -+ int i; -+ struct super_block *sb = e3b->bd_sb; -+ -+ if (unlikely(e3b->bd_info->bb_bitmap == NULL)) -+ return; -+ BUG_ON(!ext3_is_group_locked(sb, e3b->bd_group)); -+ for (i = 0; i < count; i++) { -+ if (!mb_test_bit(first + i, e3b->bd_info->bb_bitmap)) { -+ unsigned long blocknr; -+ blocknr = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb); -+ blocknr += first + i; -+ blocknr += -+ le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block); -+ -+ ext3_error(sb, __FUNCTION__, "double-free of inode" -+ " %lu's block %lu(bit %u in group %u)\n", -+ inode ? 
inode->i_ino : 0, blocknr, -+ first + i, e3b->bd_group); -+ } -+ mb_clear_bit(first + i, e3b->bd_info->bb_bitmap); -+ } -+} -+ -+void mb_mark_used_double(struct ext3_buddy *e3b, int first, int count) -+{ -+ int i; -+ if (unlikely(e3b->bd_info->bb_bitmap == NULL)) -+ return; -+ BUG_ON(!ext3_is_group_locked(e3b->bd_sb, e3b->bd_group)); -+ for (i = 0; i < count; i++) { -+ BUG_ON(mb_test_bit(first + i, e3b->bd_info->bb_bitmap)); -+ mb_set_bit(first + i, e3b->bd_info->bb_bitmap); -+ } -+} -+ -+void mb_cmp_bitmaps(struct ext3_buddy *e3b, void *bitmap) -+{ -+ if (memcmp(e3b->bd_info->bb_bitmap, bitmap, e3b->bd_sb->s_blocksize)) { -+ unsigned char *b1, *b2; -+ int i; -+ b1 = (unsigned char *) e3b->bd_info->bb_bitmap; -+ b2 = (unsigned char *) bitmap; -+ for (i = 0; i < e3b->bd_sb->s_blocksize; i++) { -+ if (b1[i] != b2[i]) { -+ printk("corruption in group %u at byte %u(%u): " -+ "%x in copy != %x on disk/prealloc\n", -+ e3b->bd_group, i, i * 8, b1[i], b2[i]); -+ BUG(); -+ } -+ } -+ } -+} -+ -+#else -+#define mb_free_blocks_double(a,b,c,d) -+#define mb_mark_used_double(a,b,c) -+#define mb_cmp_bitmaps(a,b) -+#endif -+ -+#ifdef AGGRESSIVE_CHECK -+ -+#define MB_CHECK_ASSERT(assert) \ -+do { \ -+ if (!(assert)) { \ -+ printk (KERN_EMERG \ -+ "Assertion failure in %s() at %s:%d: \"%s\"\n", \ -+ function, file, line, # assert); \ -+ BUG(); \ -+ } \ -+} while (0) -+ -+static int __mb_check_buddy(struct ext3_buddy *e3b, char *file, -+ const char *function, int line) -+{ -+ struct super_block *sb = e3b->bd_sb; -+ int order = e3b->bd_blkbits + 1; -+ int max, max2, i, j, k, count; -+ struct ext3_group_info *grp; -+ int fragments = 0, fstart; -+ struct list_head *cur; -+ void *buddy, *buddy2; -+ -+ if (!test_opt(sb, MBALLOC)) -+ return 0; -+ -+ { -+ static int mb_check_counter = 0; -+ if (mb_check_counter++ % 100 != 0) -+ return 0; -+ } -+ -+ while (order > 1) { -+ buddy = mb_find_buddy(e3b, order, &max); -+ MB_CHECK_ASSERT(buddy); -+ buddy2 = mb_find_buddy(e3b, order - 1, &max2); -+ MB_CHECK_ASSERT(buddy2); -+ MB_CHECK_ASSERT(buddy != buddy2); -+ MB_CHECK_ASSERT(max * 2 == max2); -+ -+ count = 0; -+ for (i = 0; i < max; i++) { -+ -+ if (mb_test_bit(i, buddy)) { -+ /* only single bit in buddy2 may be 1 */ -+ if (!mb_test_bit(i << 1, buddy2)) -+ MB_CHECK_ASSERT(mb_test_bit((i<<1)+1, buddy2)); -+ else if (!mb_test_bit((i << 1) + 1, buddy2)) -+ MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); -+ continue; -+ } -+ -+ /* both bits in buddy2 must be 0 */ -+ MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); -+ MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); -+ -+ for (j = 0; j < (1 << order); j++) { -+ k = (i * (1 << order)) + j; -+ MB_CHECK_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b))); -+ } -+ count++; -+ } -+ MB_CHECK_ASSERT(e3b->bd_info->bb_counters[order] == count); -+ order--; -+ } -+ -+ fstart = -1; -+ buddy = mb_find_buddy(e3b, 0, &max); -+ for (i = 0; i < max; i++) { -+ if (!mb_test_bit(i, buddy)) { -+ MB_CHECK_ASSERT(i >= e3b->bd_info->bb_first_free); -+ if (fstart == -1) { -+ fragments++; -+ fstart = i; -+ } -+ continue; -+ } -+ fstart = -1; -+ /* check used bits only */ -+ for (j = 0; j < e3b->bd_blkbits + 1; j++) { -+ buddy2 = mb_find_buddy(e3b, j, &max2); -+ k = i >> j; -+ MB_CHECK_ASSERT(k < max2); -+ MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); -+ } -+ } -+ MB_CHECK_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info)); -+ MB_CHECK_ASSERT(e3b->bd_info->bb_fragments == fragments); -+ -+ grp = EXT3_GROUP_INFO(sb, e3b->bd_group); -+ buddy = mb_find_buddy(e3b, 0, &max); -+ list_for_each(cur, &grp->bb_prealloc_list) { -+ 
unsigned long groupnr; -+ struct ext3_prealloc_space *pa; -+ pa = list_entry(cur, struct ext3_prealloc_space, group_list); -+ ext3_get_group_no_and_offset(sb, pa->pstart, &groupnr, &k); -+ MB_CHECK_ASSERT(groupnr == e3b->bd_group); -+ for (i = 0; i < pa->len; i++) -+ MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); -+ } -+ return 0; -+} -+#undef MB_CHECK_ASSERT -+#define mb_check_buddy(e3b) __mb_check_buddy(e3b,__FILE__,__FUNCTION__,__LINE__) -+#else -+#define mb_check_buddy(e3b) -+#endif -+ -+/* find most significant bit */ -+static int inline fmsb(unsigned short word) -+{ -+ int order; -+ -+ if (word > 255) { -+ order = 7; -+ word >>= 8; -+ } else { -+ order = -1; -+ } -+ -+ do { -+ order++; -+ word >>= 1; -+ } while (word != 0); -+ -+ return order; -+} -+ -+static void inline -+ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first, -+ int len, struct ext3_group_info *grp) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ unsigned short min, max, chunk, border; -+ -+ BUG_ON(len >= EXT3_BLOCKS_PER_GROUP(sb)); -+ -+ border = 2 << sb->s_blocksize_bits; -+ -+ while (len > 0) { -+ /* find how many blocks can be covered since this position */ -+ max = ffs(first | border) - 1; -+ -+ /* find how many blocks of power 2 we need to mark */ -+ min = fmsb(len); -+ -+ if (max < min) -+ min = max; -+ chunk = 1 << min; -+ -+ /* mark multiblock chunks only */ -+ grp->bb_counters[min]++; -+ if (min > 0) -+ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); -+ -+ len -= chunk; -+ first += chunk; -+ } -+} -+ -+static void -+ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, -+ int group) -+{ -+ struct ext3_group_info *grp = EXT3_GROUP_INFO(sb, group); -+ unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); -+ unsigned short i = 0, first, len; -+ unsigned free = 0, fragments = 0; -+ unsigned long long period = get_cycles(); -+ -+ /* initialize buddy from bitmap which is aggregation -+ * of on-disk bitmap and preallocations */ -+ i = mb_find_next_zero_bit(bitmap, max, 0); -+ grp->bb_first_free = i; -+ while (i < max) { -+ fragments++; -+ first = i; -+ i = ext2_find_next_le_bit(bitmap, max, i); -+ len = i - first; -+ free += len; -+ if (len > 1) -+ ext3_mb_mark_free_simple(sb, buddy, first, len, grp); -+ else -+ grp->bb_counters[0]++; -+ if (i < max) -+ i = mb_find_next_zero_bit(bitmap, max, i); -+ } -+ grp->bb_fragments = fragments; -+ -+ if (free != grp->bb_free) { -+ printk("EXT3-fs: group %u: %u blocks in bitmap, %u in gd\n", -+ group, free, grp->bb_free); -+ grp->bb_free = free; -+ } -+ -+ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); -+ -+ period = get_cycles() - period; -+ spin_lock(&EXT3_SB(sb)->s_bal_lock); -+ EXT3_SB(sb)->s_mb_buddies_generated++; -+ EXT3_SB(sb)->s_mb_generation_time += period; -+ spin_unlock(&EXT3_SB(sb)->s_bal_lock); -+} -+ -+static int ext3_mb_init_cache(struct page *page, char *incore) -+{ -+ int blocksize, blocks_per_page, groups_per_page; -+ int err = 0, i, first_group, first_block; -+ struct super_block *sb; -+ struct buffer_head *bhs; -+ struct buffer_head **bh; -+ struct inode *inode; -+ char *data, *bitmap; -+ -+ mb_debug("init page %lu\n", page->index); -+ -+ inode = page->mapping->host; -+ sb = inode->i_sb; -+ blocksize = 1 << inode->i_blkbits; -+ blocks_per_page = PAGE_CACHE_SIZE / blocksize; -+ -+ groups_per_page = blocks_per_page >> 1; -+ if (groups_per_page == 0) -+ groups_per_page = 1; -+ -+ /* allocate buffer_heads to read bitmaps */ -+ if (groups_per_page > 1) { -+ err = -ENOMEM; -+ i = sizeof(struct 
buffer_head *) * groups_per_page; -+ bh = kmalloc(i, GFP_NOFS); -+ if (bh == NULL) -+ goto out; -+ memset(bh, 0, i); -+ } else -+ bh = &bhs; -+ -+ first_group = page->index * blocks_per_page / 2; -+ -+ /* read all groups the page covers into the cache */ -+ for (i = 0; i < groups_per_page; i++) { -+ struct ext3_group_desc * desc; -+ -+ if (first_group + i >= EXT3_SB(sb)->s_groups_count) -+ break; -+ -+ err = -EIO; -+ desc = ext3_get_group_desc(sb, first_group + i, NULL); -+ if (desc == NULL) -+ goto out; -+ -+ err = -ENOMEM; -+ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); -+ if (bh[i] == NULL) -+ goto out; -+ -+ if (buffer_uptodate(bh[i])) -+ continue; -+ -+ lock_buffer(bh[i]); -+ if (buffer_uptodate(bh[i])) { -+ unlock_buffer(bh[i]); -+ continue; -+ } -+ -+ get_bh(bh[i]); -+ bh[i]->b_end_io = end_buffer_read_sync; -+ submit_bh(READ, bh[i]); -+ mb_debug("read bitmap for group %u\n", first_group + i); -+ } -+ -+ /* wait for I/O completion */ -+ for (i = 0; i < groups_per_page && bh[i]; i++) -+ wait_on_buffer(bh[i]); -+ -+ err = -EIO; -+ for (i = 0; i < groups_per_page && bh[i]; i++) -+ if (!buffer_uptodate(bh[i])) -+ goto out; -+ -+ first_block = page->index * blocks_per_page; -+ for (i = 0; i < blocks_per_page; i++) { -+ int group; -+ -+ group = (first_block + i) >> 1; -+ if (group >= EXT3_SB(sb)->s_groups_count) -+ break; -+ -+ data = page_address(page) + (i * blocksize); -+ bitmap = bh[group - first_group]->b_data; -+ -+ if ((first_block + i) & 1) { -+ /* this is block of buddy */ -+ BUG_ON(incore == NULL); -+ mb_debug("put buddy for group %u in page %lu/%x\n", -+ group, page->index, i * blocksize); -+ memset(data, 0xff, blocksize); -+ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0; -+ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0, -+ sizeof(unsigned short)*(sb->s_blocksize_bits+2)); -+ ext3_mb_generate_buddy(sb, data, incore, group); -+ incore = NULL; -+ } else { -+ /* this is block of bitmap */ -+ BUG_ON(incore != NULL); -+ mb_debug("put bitmap for group %u in page %lu/%x\n", -+ group, page->index, i * blocksize); -+ -+ /* see comments in ext3_mb_put_pa() */ -+ ext3_lock_group(sb, group); -+ memcpy(data, bitmap, blocksize); -+ -+ /* mark all preallocated blocks used in in-core bitmap */ -+ ext3_mb_generate_from_pa(sb, data, group); -+ ext3_unlock_group(sb, group); -+ -+ incore = data; -+ } -+ } -+ SetPageUptodate(page); -+ -+out: -+ if (bh) { -+ for (i = 0; i < groups_per_page && bh[i]; i++) -+ brelse(bh[i]); -+ if (bh != &bhs) -+ kfree(bh); -+ } -+ return err; -+} -+ -+static int ext3_mb_load_buddy(struct super_block *sb, int group, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct inode *inode = sbi->s_buddy_cache; -+ int blocks_per_page, block, pnum, poff; -+ struct page *page; -+ -+ mb_debug("load group %u\n", group); -+ -+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; -+ -+ e3b->bd_blkbits = sb->s_blocksize_bits; -+ e3b->bd_info = EXT3_GROUP_INFO(sb, group); -+ e3b->bd_sb = sb; -+ e3b->bd_group = group; -+ e3b->bd_buddy_page = NULL; -+ e3b->bd_bitmap_page = NULL; -+ -+ block = group * 2; -+ pnum = block / blocks_per_page; -+ poff = block % blocks_per_page; -+ -+ /* we could use find_or_create_page(), but it locks page -+ * what we'd like to avoid in fast path ... 
*/ -+ page = find_get_page(inode->i_mapping, pnum); -+ if (page == NULL || !PageUptodate(page)) { -+ if (page) -+ page_cache_release(page); -+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); -+ if (page) { -+ BUG_ON(page->mapping != inode->i_mapping); -+ if (!PageUptodate(page)) { -+ ext3_mb_init_cache(page, NULL); -+ mb_cmp_bitmaps(e3b, page_address(page) + -+ (poff * sb->s_blocksize)); -+ } -+ unlock_page(page); -+ } -+ } -+ if (page == NULL || !PageUptodate(page)) -+ goto err; -+ e3b->bd_bitmap_page = page; -+ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); -+ mark_page_accessed(page); -+ -+ block++; -+ pnum = block / blocks_per_page; -+ poff = block % blocks_per_page; -+ -+ page = find_get_page(inode->i_mapping, pnum); -+ if (page == NULL || !PageUptodate(page)) { -+ if (page) -+ page_cache_release(page); -+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); -+ if (page) { -+ BUG_ON(page->mapping != inode->i_mapping); -+ if (!PageUptodate(page)) -+ ext3_mb_init_cache(page, e3b->bd_bitmap); -+ -+ unlock_page(page); -+ } -+ } -+ if (page == NULL || !PageUptodate(page)) -+ goto err; -+ e3b->bd_buddy_page = page; -+ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); -+ mark_page_accessed(page); -+ -+ BUG_ON(e3b->bd_bitmap_page == NULL); -+ BUG_ON(e3b->bd_buddy_page == NULL); -+ -+ return 0; -+ -+err: -+ if (e3b->bd_bitmap_page) -+ page_cache_release(e3b->bd_bitmap_page); -+ if (e3b->bd_buddy_page) -+ page_cache_release(e3b->bd_buddy_page); -+ e3b->bd_buddy = NULL; -+ e3b->bd_bitmap = NULL; -+ return -EIO; -+} -+ -+static void ext3_mb_release_desc(struct ext3_buddy *e3b) -+{ -+ if (e3b->bd_bitmap_page) -+ page_cache_release(e3b->bd_bitmap_page); -+ if (e3b->bd_buddy_page) -+ page_cache_release(e3b->bd_buddy_page); -+} -+ -+ -+static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) -+{ -+ int order = 1; -+ void *bb; -+ -+ BUG_ON(EXT3_MB_BITMAP(e3b) == EXT3_MB_BUDDY(e3b)); -+ BUG_ON(block >= (1 << (e3b->bd_blkbits + 3))); -+ -+ bb = EXT3_MB_BUDDY(e3b); -+ while (order <= e3b->bd_blkbits + 1) { -+ block = block >> 1; -+ if (!mb_test_bit(block, bb)) { -+ /* this block is part of buddy of order 'order' */ -+ return order; -+ } -+ bb += 1 << (e3b->bd_blkbits - order); -+ order++; -+ } -+ return 0; -+} -+ -+static inline void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) -+{ -+ __u32 *addr; -+ -+ len = cur + len; -+ while (cur < len) { -+ if ((cur & 31) == 0 && (len - cur) >= 32) { -+ /* fast path: clear whole word at once */ -+ addr = bm + (cur >> 3); -+ *addr = 0; -+ cur += 32; -+ continue; -+ } -+ mb_clear_bit_atomic(lock, cur, bm); -+ cur++; -+ } -+} -+ -+static inline void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) -+{ -+ __u32 *addr; -+ -+ len = cur + len; -+ while (cur < len) { -+ if ((cur & 31) == 0 && (len - cur) >= 32) { -+ /* fast path: clear whole word at once */ -+ addr = bm + (cur >> 3); -+ *addr = 0xffffffff; -+ cur += 32; -+ continue; -+ } -+ mb_set_bit_atomic(lock, cur, bm); -+ cur++; -+ } -+} -+ -+static int mb_free_blocks(struct inode *inode, struct ext3_buddy *e3b, -+ int first, int count) -+{ -+ int block = 0, max = 0, order; -+ void *buddy, *buddy2; -+ struct super_block *sb = e3b->bd_sb; -+ -+ BUG_ON(first + count > (sb->s_blocksize << 3)); -+ BUG_ON(!ext3_is_group_locked(sb, e3b->bd_group)); -+ mb_check_buddy(e3b); -+ mb_free_blocks_double(inode, e3b, first, count); -+ -+ e3b->bd_info->bb_free += count; -+ if (first < e3b->bd_info->bb_first_free) -+ e3b->bd_info->bb_first_free = 
first; -+ -+ /* let's maintain fragments counter */ -+ if (first != 0) -+ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b)); -+ if (first + count < EXT3_SB(sb)->s_mb_maxs[0]) -+ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b)); -+ if (block && max) -+ e3b->bd_info->bb_fragments--; -+ else if (!block && !max) -+ e3b->bd_info->bb_fragments++; -+ -+ /* let's maintain buddy itself */ -+ while (count-- > 0) { -+ block = first++; -+ order = 0; -+ -+ if (!mb_test_bit(block, EXT3_MB_BITMAP(e3b))) { -+ unsigned long blocknr; -+ blocknr = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb); -+ blocknr += block; -+ blocknr += -+ le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block); -+ -+ ext3_error(sb, __FUNCTION__, "double-free of inode" -+ " %lu's block %lu(bit %u in group %u)\n", -+ inode ? inode->i_ino : 0, blocknr, block, -+ e3b->bd_group); -+ } -+ mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); -+ e3b->bd_info->bb_counters[order]++; -+ -+ /* start of the buddy */ -+ buddy = mb_find_buddy(e3b, order, &max); -+ -+ do { -+ block &= ~1UL; -+ if (mb_test_bit(block, buddy) || -+ mb_test_bit(block + 1, buddy)) -+ break; -+ -+ /* both the buddies are free, try to coalesce them */ -+ buddy2 = mb_find_buddy(e3b, order + 1, &max); -+ -+ if (!buddy2) -+ break; -+ -+ if (order > 0) { -+ /* for special purposes, we don't set -+ * free bits in bitmap */ -+ mb_set_bit(block, buddy); -+ mb_set_bit(block + 1, buddy); -+ } -+ e3b->bd_info->bb_counters[order]--; -+ e3b->bd_info->bb_counters[order]--; -+ -+ block = block >> 1; -+ order++; -+ e3b->bd_info->bb_counters[order]++; -+ -+ mb_clear_bit(block, buddy2); -+ buddy = buddy2; -+ } while (1); -+ } -+ mb_check_buddy(e3b); -+ -+ return 0; -+} -+ -+static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, -+ int needed, struct ext3_free_extent *ex) -+{ -+ int next = block, max, ord; -+ void *buddy; -+ -+ BUG_ON(!ext3_is_group_locked(e3b->bd_sb, e3b->bd_group)); -+ BUG_ON(ex == NULL); -+ -+ buddy = mb_find_buddy(e3b, order, &max); -+ BUG_ON(buddy == NULL); -+ BUG_ON(block >= max); -+ if (mb_test_bit(block, buddy)) { -+ ex->fe_len = 0; -+ ex->fe_start = 0; -+ ex->fe_group = 0; -+ return 0; -+ } -+ -+ if (likely(order == 0)) { -+ /* find actual order */ -+ order = mb_find_order_for_block(e3b, block); -+ block = block >> order; -+ } -+ -+ ex->fe_len = 1 << order; -+ ex->fe_start = block << order; -+ ex->fe_group = e3b->bd_group; -+ -+ /* calc difference from given start */ -+ next = next - ex->fe_start; -+ ex->fe_len -= next; -+ ex->fe_start += next; -+ -+ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { -+ -+ if (block + 1 >= max) -+ break; -+ -+ next = (block + 1) * (1 << order); -+ if (mb_test_bit(next, EXT3_MB_BITMAP(e3b))) -+ break; -+ -+ ord = mb_find_order_for_block(e3b, next); -+ -+ order = ord; -+ block = next >> order; -+ ex->fe_len += 1 << order; -+ } -+ -+ BUG_ON(ex->fe_start + ex->fe_len > (1 << (e3b->bd_blkbits + 3))); -+ return ex->fe_len; -+} -+ -+static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) -+{ -+ int ord, mlen = 0, max = 0, cur; -+ int start = ex->fe_start; -+ int len = ex->fe_len; -+ unsigned ret = 0; -+ int len0 = len; -+ void *buddy; -+ -+ BUG_ON(start + len > (e3b->bd_sb->s_blocksize << 3)); -+ BUG_ON(e3b->bd_group != ex->fe_group); -+ BUG_ON(!ext3_is_group_locked(e3b->bd_sb, e3b->bd_group)); -+ mb_check_buddy(e3b); -+ mb_mark_used_double(e3b, start, len); -+ -+ e3b->bd_info->bb_free -= len; -+ if (e3b->bd_info->bb_first_free == start) -+ e3b->bd_info->bb_first_free += len; -+ -+ /* 
let's maintain fragments counter */ -+ if (start != 0) -+ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b)); -+ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) -+ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b)); -+ if (mlen && max) -+ e3b->bd_info->bb_fragments++; -+ else if (!mlen && !max) -+ e3b->bd_info->bb_fragments--; -+ -+ /* let's maintain buddy itself */ -+ while (len) { -+ ord = mb_find_order_for_block(e3b, start); -+ -+ if (((start >> ord) << ord) == start && len >= (1 << ord)) { -+ /* the whole chunk may be allocated at once! */ -+ mlen = 1 << ord; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ BUG_ON((start >> ord) >= max); -+ mb_set_bit(start >> ord, buddy); -+ e3b->bd_info->bb_counters[ord]--; -+ start += mlen; -+ len -= mlen; -+ BUG_ON(len < 0); -+ continue; -+ } -+ -+ /* store for history */ -+ if (ret == 0) -+ ret = len | (ord << 16); -+ -+ /* we have to split large buddy */ -+ BUG_ON(ord <= 0); -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_set_bit(start >> ord, buddy); -+ e3b->bd_info->bb_counters[ord]--; -+ -+ ord--; -+ cur = (start >> ord) & ~1U; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_clear_bit(cur, buddy); -+ mb_clear_bit(cur + 1, buddy); -+ e3b->bd_info->bb_counters[ord]++; -+ e3b->bd_info->bb_counters[ord]++; -+ } -+ -+ mb_set_bits(sb_bgl_lock(EXT3_SB(e3b->bd_sb), ex->fe_group), -+ EXT3_MB_BITMAP(e3b), ex->fe_start, len0); -+ mb_check_buddy(e3b); -+ -+ return ret; -+} -+ -+/* -+ * Must be called under group lock! -+ */ -+static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ unsigned long ret; -+ -+ BUG_ON(ac->ac_b_ex.fe_group != e3b->bd_group); -+ BUG_ON(ac->ac_status == AC_STATUS_FOUND); -+ -+ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); -+ ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; -+ ret = mb_mark_used(e3b, &ac->ac_b_ex); -+ -+ /* preallocation can change ac_b_ex, thus we store actually -+ * allocated blocks for history */ -+ ac->ac_f_ex = ac->ac_b_ex; -+ -+ ac->ac_status = AC_STATUS_FOUND; -+ ac->ac_tail = ret & 0xffff; -+ ac->ac_buddy = ret >> 16; -+ -+ /* XXXXXXX: SUCH A HORRIBLE **CK */ -+ ac->ac_bitmap_page = e3b->bd_bitmap_page; -+ get_page(ac->ac_bitmap_page); -+ ac->ac_buddy_page = e3b->bd_buddy_page; -+ get_page(ac->ac_buddy_page); -+} -+ -+/* -+ * regular allocator, for general purposes allocation -+ */ -+ -+void ext3_mb_check_limits(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b, -+ int finish_group) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); -+ struct ext3_free_extent *bex = &ac->ac_b_ex; -+ struct ext3_free_extent *gex = &ac->ac_g_ex; -+ struct ext3_free_extent ex; -+ int max; -+ -+ /* -+ * We don't want to scan for a whole year -+ */ -+ if (ac->ac_found > sbi->s_mb_max_to_scan && -+ !(ac->ac_flags & EXT3_MB_HINT_FIRST)) { -+ ac->ac_status = AC_STATUS_BREAK; -+ return; -+ } -+ -+ /* -+ * Haven't found good chunk so far, let's continue -+ */ -+ if (bex->fe_len < gex->fe_len) -+ return; -+ -+ if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan) -+ && bex->fe_group == e3b->bd_group) { -+ /* recheck chunk's availability - we don't know -+ * when it was found (within this lock-unlock -+ * period or not) */ -+ max = mb_find_extent(e3b, 0, bex->fe_start, gex->fe_len, &ex); -+ if (max >= gex->fe_len) { -+ ext3_mb_use_best_found(ac, e3b); -+ return; -+ } -+ } -+} -+ -+/* -+ * The routine checks whether found extent is good enough. If it is, -+ * then the extent gets marked used and flag is set to the context -+ * to stop scanning. 
Otherwise, the extent is compared with the -+ * previous found extent and if new one is better, then it's stored -+ * in the context. Later, the best found extent will be used, if -+ * mballoc can't find good enough extent. -+ * -+ * FIXME: real allocation policy is to be designed yet! -+ */ -+static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, -+ struct ext3_free_extent *ex, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_free_extent *bex = &ac->ac_b_ex; -+ struct ext3_free_extent *gex = &ac->ac_g_ex; -+ -+ BUG_ON(ex->fe_len <= 0); -+ BUG_ON(ex->fe_len >= EXT3_BLOCKS_PER_GROUP(ac->ac_sb)); -+ BUG_ON(ex->fe_start >= EXT3_BLOCKS_PER_GROUP(ac->ac_sb)); -+ BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); -+ -+ ac->ac_found++; -+ -+ /* -+ * The special case - take what you catch first -+ */ -+ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) { -+ *bex = *ex; -+ ext3_mb_use_best_found(ac, e3b); -+ return; -+ } -+ -+ /* -+ * Let's check whether the chuck is good enough -+ */ -+ if (ex->fe_len == gex->fe_len) { -+ *bex = *ex; -+ ext3_mb_use_best_found(ac, e3b); -+ return; -+ } -+ -+ /* -+ * If this is first found extent, just store it in the context -+ */ -+ if (bex->fe_len == 0) { -+ *bex = *ex; -+ return; -+ } -+ -+ /* -+ * If new found extent is better, store it in the context -+ */ -+ if (bex->fe_len < gex->fe_len) { -+ /* if the request isn't satisfied, any found extent -+ * larger than previous best one is better */ -+ if (ex->fe_len > bex->fe_len) -+ *bex = *ex; -+ } else if (ex->fe_len > gex->fe_len) { -+ /* if the request is satisfied, then we try to find -+ * an extent that still satisfy the request, but is -+ * smaller than previous one */ -+ *bex = *ex; -+ } -+ -+ ext3_mb_check_limits(ac, e3b, 0); -+} -+ -+static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_free_extent ex = ac->ac_b_ex; -+ int group = ex.fe_group, max, err; -+ -+ BUG_ON(ex.fe_len <= 0); -+ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); -+ if (err) -+ return err; -+ -+ ext3_lock_group(ac->ac_sb, group); -+ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); -+ -+ if (max > 0) { -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } -+ -+ ext3_unlock_group(ac->ac_sb, group); -+ ext3_mb_release_desc(e3b); -+ -+ return 0; -+} -+ -+static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ int group = ac->ac_g_ex.fe_group, max, err; -+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); -+ struct ext3_super_block *es = sbi->s_es; -+ struct ext3_free_extent ex; -+ -+ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); -+ if (err) -+ return err; -+ -+ ext3_lock_group(ac->ac_sb, group); -+ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, -+ ac->ac_g_ex.fe_len, &ex); -+ -+ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { -+ unsigned long start; -+ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) + -+ ex.fe_start + le32_to_cpu(es->s_first_data_block)); -+ if (start % sbi->s_stripe == 0) { -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } -+ } else if (max >= ac->ac_g_ex.fe_len) { -+ BUG_ON(ex.fe_len <= 0); -+ BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); -+ BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } else if (max > 0 && (ac->ac_flags & EXT3_MB_HINT_MERGE)) { -+ /* Sometimes, caller may want to merge even small -+ * number of blocks to an existing extent */ -+ 
BUG_ON(ex.fe_len <= 0); -+ BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); -+ BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } -+ ext3_unlock_group(ac->ac_sb, group); -+ ext3_mb_release_desc(e3b); -+ -+ return 0; -+} -+ -+/* -+ * The routine scans buddy structures (not bitmap!) from given order -+ * to max order and tries to find big enough chunk to satisfy the req -+ */ -+static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct super_block *sb = ac->ac_sb; -+ struct ext3_group_info *grp = e3b->bd_info; -+ void *buddy; -+ int i, k, max; -+ -+ BUG_ON(ac->ac_2order <= 0); -+ for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { -+ if (grp->bb_counters[i] == 0) -+ continue; -+ -+ buddy = mb_find_buddy(e3b, i, &max); -+ BUG_ON(buddy == NULL); -+ -+ k = mb_find_next_zero_bit(buddy, max, 0); -+ BUG_ON(k >= max); -+ -+ ac->ac_found++; -+ -+ ac->ac_b_ex.fe_len = 1 << i; -+ ac->ac_b_ex.fe_start = k << i; -+ ac->ac_b_ex.fe_group = e3b->bd_group; -+ -+ ext3_mb_use_best_found(ac, e3b); -+ -+ BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len); -+ -+ if (EXT3_SB(sb)->s_mb_stats) -+ atomic_inc(&EXT3_SB(sb)->s_bal_2orders); -+ -+ break; -+ } -+} -+ -+/* -+ * The routine scans the group and measures all found extents. -+ * In order to optimize scanning, caller must pass number of -+ * free blocks in the group, so the routine can know upper limit. -+ */ -+static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct super_block *sb = ac->ac_sb; -+ void *bitmap = EXT3_MB_BITMAP(e3b); -+ struct ext3_free_extent ex; -+ int i, free; -+ -+ free = e3b->bd_info->bb_free; -+ BUG_ON(free <= 0); -+ -+ i = e3b->bd_info->bb_first_free; -+ -+ while (free && ac->ac_status == AC_STATUS_CONTINUE) { -+ i = mb_find_next_zero_bit(bitmap, EXT3_BLOCKS_PER_GROUP(sb), i); -+ if (i >= EXT3_BLOCKS_PER_GROUP(sb)) { -+ BUG_ON(free != 0); -+ break; -+ } -+ -+ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); -+ BUG_ON(ex.fe_len <= 0); -+ BUG_ON(free < ex.fe_len); -+ -+ ext3_mb_measure_extent(ac, &ex, e3b); -+ -+ i += ex.fe_len; -+ free -= ex.fe_len; -+ } -+ -+ ext3_mb_check_limits(ac, e3b, 1); -+} -+ -+/* -+ * This is a special case for storages like raid5 -+ * we try to find stripe-aligned chunks for stripe-size requests -+ */ -+static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct super_block *sb = ac->ac_sb; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ void *bitmap = EXT3_MB_BITMAP(e3b); -+ struct ext3_free_extent ex; -+ unsigned long i, max; -+ -+ BUG_ON(sbi->s_stripe == 0); -+ -+ /* find first stripe-aligned block */ -+ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + le32_to_cpu(sbi->s_es->s_first_data_block); -+ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe; -+ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block)) -+ % EXT3_BLOCKS_PER_GROUP(sb); -+ -+ while (i < EXT3_BLOCKS_PER_GROUP(sb)) { -+ if (!mb_test_bit(i, bitmap)) { -+ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex); -+ if (max >= sbi->s_stripe) { -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ break; -+ } -+ } -+ i += sbi->s_stripe; -+ } -+} -+ -+static int ext3_mb_good_group(struct ext3_allocation_context *ac, -+ int group, int cr) -+{ -+ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group); -+ unsigned free, fragments, i, bits; -+ -+ BUG_ON(cr < 0 || cr >= 
4); -+ BUG_ON(EXT3_MB_GRP_NEED_INIT(grp)); -+ -+ free = grp->bb_free; -+ fragments = grp->bb_fragments; -+ if (free == 0) -+ return 0; -+ if (fragments == 0) -+ return 0; -+ -+ switch (cr) { -+ case 0: -+ BUG_ON(ac->ac_2order == 0); -+ bits = ac->ac_sb->s_blocksize_bits + 1; -+ for (i = ac->ac_2order; i <= bits; i++) -+ if (grp->bb_counters[i] > 0) -+ return 1; -+ break; -+ case 1: -+ if ((free / fragments) >= ac->ac_g_ex.fe_len) -+ return 1; -+ break; -+ case 2: -+ if (free >= ac->ac_g_ex.fe_len) -+ return 1; -+ break; -+ case 3: -+ return 1; -+ default: -+ BUG(); -+ } -+ -+ return 0; -+} -+ -+int ext3_mb_regular_allocator(struct ext3_allocation_context *ac) -+{ -+ int group, i, cr, err = 0; -+ struct ext3_sb_info *sbi; -+ struct super_block *sb; -+ struct ext3_buddy e3b; -+ -+ sb = ac->ac_sb; -+ sbi = EXT3_SB(sb); -+ BUG_ON(ac->ac_status == AC_STATUS_FOUND); -+ -+ /* first, try the goal */ -+ err = ext3_mb_find_by_goal(ac, &e3b); -+ if (err || ac->ac_status == AC_STATUS_FOUND) -+ goto out; -+ -+ if (unlikely(ac->ac_flags & EXT3_MB_HINT_GOAL_ONLY)) -+ goto out; -+ -+ i = ffs(ac->ac_g_ex.fe_len); -+ ac->ac_2order = 0; -+ if (i >= sbi->s_mb_order2_reqs) { -+ i--; -+ if ((ac->ac_g_ex.fe_len & (~(1 << i))) == 0) -+ ac->ac_2order = i; -+ } -+ -+ group = ac->ac_g_ex.fe_group; -+ -+ /* Let's just scan groups to find more-less suitable blocks */ -+ cr = ac->ac_2order ? 0 : 1; -+repeat: -+ for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { -+ ac->ac_criteria = cr; -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { -+ struct ext3_group_info *grp; -+ -+ if (group == EXT3_SB(sb)->s_groups_count) -+ group = 0; -+ -+ /* quick check to skip empty groups */ -+ grp = EXT3_GROUP_INFO(ac->ac_sb, group); -+ if (grp->bb_free == 0) -+ continue; -+ -+ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) { -+ /* we need full data about the group -+ * to make a good selection */ -+ err = ext3_mb_load_buddy(sb, group, &e3b); -+ if (err) -+ goto out; -+ ext3_mb_release_desc(&e3b); -+ } -+ -+ /* check is group good for our criteries */ -+ if (!ext3_mb_good_group(ac, group, cr)) -+ continue; -+ -+ err = ext3_mb_load_buddy(sb, group, &e3b); -+ if (err) -+ goto out; -+ -+ ext3_lock_group(sb, group); -+ if (!ext3_mb_good_group(ac, group, cr)) { -+ /* someone did allocation from this group */ -+ ext3_unlock_group(sb, group); -+ ext3_mb_release_desc(&e3b); -+ continue; -+ } -+ -+ ac->ac_groups_scanned++; -+ if (cr == 0) -+ ext3_mb_simple_scan_group(ac, &e3b); -+ else if (cr == 1 && ac->ac_g_ex.fe_len == sbi->s_stripe) -+ ext3_mb_scan_aligned(ac, &e3b); -+ else -+ ext3_mb_complex_scan_group(ac, &e3b); -+ -+ ext3_unlock_group(sb, group); -+ ext3_mb_release_desc(&e3b); -+ -+ if (ac->ac_status != AC_STATUS_CONTINUE) -+ break; -+ } -+ } -+ -+ if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && -+ !(ac->ac_flags & EXT3_MB_HINT_FIRST)) { -+ /* -+ * We've been searching too long. Let's try to allocate -+ * the best chunk we've found so far -+ */ -+ -+ ext3_mb_try_best_found(ac, &e3b); -+ if (ac->ac_status != AC_STATUS_FOUND) { -+ /* -+ * Someone more lucky has already allocated it. 
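-+ * (the group lock is dropped between scans, so a racing
-+ * allocation may have consumed the extent we remembered)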
-+ * The only thing we can do is just take first -+ * found block(s) -+ printk(KERN_DEBUG "EXT3-fs: someone won our chunk\n"); -+ */ -+ ac->ac_b_ex.fe_group = 0; -+ ac->ac_b_ex.fe_start = 0; -+ ac->ac_b_ex.fe_len = 0; -+ ac->ac_status = AC_STATUS_CONTINUE; -+ ac->ac_flags |= EXT3_MB_HINT_FIRST; -+ cr = 3; -+ atomic_inc(&sbi->s_mb_lost_chunks); -+ goto repeat; -+ } -+ } -+out: -+ return err; -+} -+ -+#ifdef EXT3_MB_HISTORY -+struct ext3_mb_proc_session { -+ struct ext3_mb_history *history; -+ struct super_block *sb; -+ int start; -+ int max; -+}; -+ -+static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s, -+ struct ext3_mb_history *hs, -+ int first) -+{ -+ if (hs == s->history + s->max) -+ hs = s->history; -+ if (!first && hs == s->history + s->start) -+ return NULL; -+ while (hs->orig.fe_len == 0) { -+ hs++; -+ if (hs == s->history + s->max) -+ hs = s->history; -+ if (hs == s->history + s->start) -+ return NULL; -+ } -+ return hs; -+} -+ -+static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos) -+{ -+ struct ext3_mb_proc_session *s = seq->private; -+ struct ext3_mb_history *hs; -+ int l = *pos; -+ -+ if (l == 0) -+ return SEQ_START_TOKEN; -+ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1); -+ if (!hs) -+ return NULL; -+ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL); -+ return hs; -+} -+ -+static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) -+{ -+ struct ext3_mb_proc_session *s = seq->private; -+ struct ext3_mb_history *hs = v; -+ -+ ++*pos; -+ if (v == SEQ_START_TOKEN) -+ return ext3_mb_history_skip_empty(s, s->history + s->start, 1); -+ else -+ return ext3_mb_history_skip_empty(s, ++hs, 0); -+} -+ -+static int ext3_mb_seq_history_show(struct seq_file *seq, void *v) -+{ -+ char buf[25], buf2[25], buf3[25], *fmt; -+ struct ext3_mb_history *hs = v; -+ -+ if (v == SEQ_START_TOKEN) { -+ seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s " -+ "%-5s %-2s %-5s %-5s %-5s %-6s\n", -+ "pid", "inode", "original", "goal", "result","found", -+ "grps", "cr", "flags", "merge", "tail", "broken"); -+ return 0; -+ } -+ -+ if (hs->op == EXT3_MB_HISTORY_ALLOC) { -+ fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " -+ "%-5u %-5s %-5u %-6u\n"; -+ sprintf(buf2, "%lu/%lu/%lu@%lu", hs->result.fe_group, -+ hs->result.fe_start, hs->result.fe_len, -+ hs->result.fe_logical); -+ sprintf(buf, "%lu/%lu/%lu@%lu", hs->orig.fe_group, -+ hs->orig.fe_start, hs->orig.fe_len, -+ hs->orig.fe_logical); -+ sprintf(buf3, "%lu/%lu/%lu@%lu", hs->goal.fe_group, -+ hs->goal.fe_start, hs->goal.fe_len, -+ hs->goal.fe_logical); -+ seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, -+ hs->found, hs->groups, hs->cr, hs->flags, -+ hs->merged ? "M" : "", hs->tail, -+ hs->buddy ? 
1 << hs->buddy : 0); -+ } else if (hs->op == EXT3_MB_HISTORY_PREALLOC) { -+ fmt = "%-5u %-8u %-23s %-23s %-23s\n"; -+ sprintf(buf2, "%lu/%lu/%lu@%lu", hs->result.fe_group, -+ hs->result.fe_start, hs->result.fe_len, -+ hs->result.fe_logical); -+ sprintf(buf, "%lu/%lu/%lu@%lu", hs->orig.fe_group, -+ hs->orig.fe_start, hs->orig.fe_len, -+ hs->orig.fe_logical); -+ seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); -+ } else if (hs->op == EXT3_MB_HISTORY_DISCARD) { -+ sprintf(buf2, "%lu/%lu/%lu", hs->result.fe_group, -+ hs->result.fe_start, hs->result.fe_len); -+ seq_printf(seq, "%-5u %-8u %-23s discard\n", -+ hs->pid, hs->ino, buf2); -+ } else if (hs->op == EXT3_MB_HISTORY_FREE) { -+ sprintf(buf2, "%lu/%lu/%lu", hs->result.fe_group, -+ hs->result.fe_start, hs->result.fe_len); -+ seq_printf(seq, "%-5u %-8u %-23s free\n", -+ hs->pid, hs->ino, buf2); -+ } -+ return 0; -+} -+ -+static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v) -+{ -+} -+ -+static struct seq_operations ext3_mb_seq_history_ops = { -+ .start = ext3_mb_seq_history_start, -+ .next = ext3_mb_seq_history_next, -+ .stop = ext3_mb_seq_history_stop, -+ .show = ext3_mb_seq_history_show, -+}; -+ -+static int ext3_mb_seq_history_open(struct inode *inode, struct file *file) -+{ -+ struct super_block *sb = PDE(inode)->data; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_mb_proc_session *s; -+ int rc, size; -+ -+ s = kmalloc(sizeof(*s), GFP_KERNEL); -+ if (s == NULL) -+ return -ENOMEM; -+ s->sb = sb; -+ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max; -+ s->history = kmalloc(size, GFP_KERNEL); -+ if (s->history == NULL) { -+ kfree(s); -+ return -ENOMEM; -+ } -+ -+ spin_lock(&sbi->s_mb_history_lock); -+ memcpy(s->history, sbi->s_mb_history, size); -+ s->max = sbi->s_mb_history_max; -+ s->start = sbi->s_mb_history_cur % s->max; -+ spin_unlock(&sbi->s_mb_history_lock); -+ -+ rc = seq_open(file, &ext3_mb_seq_history_ops); -+ if (rc == 0) { -+ struct seq_file *m = (struct seq_file *)file->private_data; -+ m->private = s; -+ } else { -+ kfree(s->history); -+ kfree(s); -+ } -+ return rc; -+ -+} -+ -+static int ext3_mb_seq_history_release(struct inode *inode, struct file *file) -+{ -+ struct seq_file *seq = (struct seq_file *)file->private_data; -+ struct ext3_mb_proc_session *s = seq->private; -+ kfree(s->history); -+ kfree(s); -+ return seq_release(inode, file); -+} -+ -+static ssize_t ext3_mb_seq_history_write(struct file *file, -+ const char __user *buffer, -+ size_t count, loff_t *ppos) -+{ -+ struct seq_file *seq = (struct seq_file *)file->private_data; -+ struct ext3_mb_proc_session *s = seq->private; -+ struct super_block *sb = s->sb; -+ char str[32]; -+ int value; -+ -+ if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ "mb_history", (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ value = simple_strtol(str, NULL, 0); -+ if (value < 0) -+ return -ERANGE; -+ EXT3_SB(sb)->s_mb_history_filter = value; -+ -+ return count; -+} -+ -+static struct file_operations ext3_mb_seq_history_fops = { -+ .owner = THIS_MODULE, -+ .open = ext3_mb_seq_history_open, -+ .read = seq_read, -+ .write = ext3_mb_seq_history_write, -+ .llseek = seq_lseek, -+ .release = ext3_mb_seq_history_release, -+}; -+ -+static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) -+{ -+ struct super_block *sb = seq->private; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ long group; -+ -+ if (*pos < 0 || *pos >= 
sbi->s_groups_count) -+ return NULL; -+ -+ group = *pos + 1; -+ return (void *) group; -+} -+ -+static void *ext3_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) -+{ -+ struct super_block *sb = seq->private; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ long group; -+ -+ ++*pos; -+ if (*pos < 0 || *pos >= sbi->s_groups_count) -+ return NULL; -+ group = *pos + 1; -+ return (void *) group;; -+} -+ -+static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v) -+{ -+ struct super_block *sb = seq->private; -+ long group = (long) v; -+ int i, err; -+ struct ext3_buddy e3b; -+ struct sg { -+ struct ext3_group_info info; -+ unsigned short counters[16]; -+ } sg; -+ -+ group--; -+ if (group == 0) -+ seq_printf(seq, "#%-5s: %-5s %-5s %-5s " -+ "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " -+ "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", -+ "group", "free", "frags", "first", -+ "2^0", "2^1", "2^2", "2^3", "2^4", "2^5","2^6", -+ "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); -+ -+ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + -+ sizeof(struct ext3_group_info); -+ err = ext3_mb_load_buddy(sb, group, &e3b); -+ if (err) { -+ seq_printf(seq, "#%-5lu: I/O error\n", group); -+ return 0; -+ } -+ ext3_lock_group(sb, group); -+ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i); -+ ext3_unlock_group(sb, group); -+ ext3_mb_release_desc(&e3b); -+ -+ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, -+ sg.info.bb_fragments, sg.info.bb_first_free); -+ for (i = 0; i <= 13; i++) -+ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? -+ sg.info.bb_counters[i] : 0); -+ seq_printf(seq, " ]\n"); -+ -+ return 0; -+} -+ -+static void ext3_mb_seq_groups_stop(struct seq_file *seq, void *v) -+{ -+} -+ -+static struct seq_operations ext3_mb_seq_groups_ops = { -+ .start = ext3_mb_seq_groups_start, -+ .next = ext3_mb_seq_groups_next, -+ .stop = ext3_mb_seq_groups_stop, -+ .show = ext3_mb_seq_groups_show, -+}; -+ -+static int ext3_mb_seq_groups_open(struct inode *inode, struct file *file) -+{ -+ struct super_block *sb = PDE(inode)->data; -+ int rc; -+ -+ rc = seq_open(file, &ext3_mb_seq_groups_ops); -+ if (rc == 0) { -+ struct seq_file *m = (struct seq_file *)file->private_data; -+ m->private = sb; -+ } -+ return rc; -+ -+} -+ -+static struct file_operations ext3_mb_seq_groups_fops = { -+ .owner = THIS_MODULE, -+ .open = ext3_mb_seq_groups_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = seq_release, -+}; -+ -+static void ext3_mb_history_release(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ remove_proc_entry("mb_groups", sbi->s_mb_proc); -+ remove_proc_entry("mb_history", sbi->s_mb_proc); -+ -+ if (sbi->s_mb_history) -+ kfree(sbi->s_mb_history); -+} -+ -+static void ext3_mb_history_init(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i; -+ -+ if (sbi->s_mb_proc != NULL) { -+ struct proc_dir_entry *p; -+ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); -+ if (p) { -+ p->proc_fops = &ext3_mb_seq_history_fops; -+ p->data = sb; -+ } -+ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); -+ if (p) { -+ p->proc_fops = &ext3_mb_seq_groups_fops; -+ p->data = sb; -+ } -+ } -+ -+ sbi->s_mb_history_max = 1000; -+ sbi->s_mb_history_cur = 0; -+ spin_lock_init(&sbi->s_mb_history_lock); -+ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history); -+ sbi->s_mb_history = kmalloc(i, GFP_KERNEL); -+ if (likely(sbi->s_mb_history != NULL)) -+ memset(sbi->s_mb_history, 0, i); -+ /* if we can't 
allocate history, then we simple won't use it */ -+} -+ -+static void -+ext3_mb_store_history(struct ext3_allocation_context *ac) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); -+ struct ext3_mb_history h; -+ -+ if (unlikely(sbi->s_mb_history == NULL)) -+ return; -+ -+ if (!(ac->ac_op & sbi->s_mb_history_filter)) -+ return; -+ -+ h.op = ac->ac_op; -+ h.pid = current->pid; -+ h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0; -+ h.orig = ac->ac_o_ex; -+ h.result = ac->ac_b_ex; -+ h.flags = ac->ac_flags; -+ h.merged = 0; -+ if (ac->ac_op == EXT3_MB_HISTORY_ALLOC) { -+ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && -+ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) -+ h.merged = 1; -+ h.goal = ac->ac_g_ex; -+ h.result = ac->ac_f_ex; -+ } -+ -+ spin_lock(&sbi->s_mb_history_lock); -+ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); -+ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) -+ sbi->s_mb_history_cur = 0; -+ spin_unlock(&sbi->s_mb_history_lock); -+} -+ -+#else -+#define ext3_mb_history_release(sb) -+#define ext3_mb_history_init(sb) -+#endif -+ -+int ext3_mb_init_backend(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i, j, len, metalen; -+ int num_meta_group_infos = -+ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >> -+ EXT3_DESC_PER_BLOCK_BITS(sb); -+ struct ext3_group_info **meta_group_info; -+ -+ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte -+ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. -+ * So a two level scheme suffices for now. */ -+ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * -+ num_meta_group_infos, GFP_KERNEL); -+ if (sbi->s_group_info == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n"); -+ return -ENOMEM; -+ } -+ sbi->s_buddy_cache = new_inode(sb); -+ if (sbi->s_buddy_cache == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't get new inode\n"); -+ goto err_freesgi; -+ } -+ EXT3_I(sbi->s_buddy_cache)->i_disksize = 0; -+ -+ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb); -+ for (i = 0; i < num_meta_group_infos; i++) { -+ if ((i + 1) == num_meta_group_infos) -+ metalen = sizeof(*meta_group_info) * -+ (sbi->s_groups_count - -+ (i << EXT3_DESC_PER_BLOCK_BITS(sb))); -+ meta_group_info = kmalloc(metalen, GFP_KERNEL); -+ if (meta_group_info == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate mem for a " -+ "buddy group\n"); -+ goto err_freemeta; -+ } -+ sbi->s_group_info[i] = meta_group_info; -+ } -+ -+ /* -+ * calculate needed size. 
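-+	 * as a rough sketch (not from the patch itself), a group's info is
-+	 * then reached by two-level indexing, which is what the
-+	 * EXT3_GROUP_INFO() macro used throughout hides:
-+	 *
-+	 *	meta = sbi->s_group_info[group >> EXT3_DESC_PER_BLOCK_BITS(sb)];
-+	 *	grp  = meta[group & (EXT3_DESC_PER_BLOCK(sb) - 1)];
-+	 *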
if you change the bb_counters size,
-+	 * don't forget about ext3_mb_generate_buddy()
-+	 */
-+	len = sizeof(struct ext3_group_info);
-+	len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
-+	for (i = 0; i < sbi->s_groups_count; i++) {
-+		struct ext3_group_desc * desc;
-+
-+		meta_group_info =
-+			sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)];
-+		j = i & (EXT3_DESC_PER_BLOCK(sb) - 1);
-+
-+		meta_group_info[j] = kmalloc(len, GFP_KERNEL);
-+		if (meta_group_info[j] == NULL) {
-+			printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n");
-+			i--;
-+			goto err_freebuddy;
-+		}
-+		desc = ext3_get_group_desc(sb, i, NULL);
-+		if (desc == NULL) {
-+			printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i);
-+			goto err_freebuddy;
-+		}
-+		memset(meta_group_info[j], 0, len);
-+		set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT,
-+			&meta_group_info[j]->bb_state);
-+
-+		/* initialize bb_free to be able to skip
-+		 * empty groups without initialization */
-+		meta_group_info[j]->bb_free =
-+			le16_to_cpu(desc->bg_free_blocks_count);
-+
-+		INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
-+
-+#ifdef DOUBLE_CHECK
-+		{
-+			struct buffer_head *bh;
-+			meta_group_info[j]->bb_bitmap =
-+				kmalloc(sb->s_blocksize, GFP_KERNEL);
-+			BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
-+			bh = read_block_bitmap(sb, i);
-+			BUG_ON(bh == NULL);
-+			memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
-+					sb->s_blocksize);
-+			brelse(bh);
-+		}
-+#endif
-+
-+	}
-+
-+	return 0;
-+
-+err_freebuddy:
-+	while (i >= 0) {
-+		kfree(EXT3_GROUP_INFO(sb, i));
-+		i--;
-+	}
-+	i = num_meta_group_infos;
-+err_freemeta:
-+	while (--i >= 0)
-+		kfree(sbi->s_group_info[i]);
-+	iput(sbi->s_buddy_cache);
-+err_freesgi:
-+	kfree(sbi->s_group_info);
-+	return -ENOMEM;
-+}
-+
-+int ext3_mb_init(struct super_block *sb, int needs_recovery)
-+{
-+	struct ext3_sb_info *sbi = EXT3_SB(sb);
-+	unsigned i, offset, max;
-+
-+	if (!test_opt(sb, MBALLOC))
-+		return 0;
-+
-+	i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
-+
-+	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
-+	if (sbi->s_mb_offsets == NULL) {
-+		clear_opt(sbi->s_mount_opt, MBALLOC);
-+		return -ENOMEM;
-+	}
-+	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
-+	if (sbi->s_mb_maxs == NULL) {
-+		clear_opt(sbi->s_mount_opt, MBALLOC);
-+		kfree(sbi->s_mb_offsets);
-+		return -ENOMEM;
-+	}
-+
-+	/* order 0 is regular bitmap */
-+	sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
-+	sbi->s_mb_offsets[0] = 0;
-+
-+	i = 1;
-+	offset = 0;
-+	max = sb->s_blocksize << 2;
-+	do {
-+		sbi->s_mb_offsets[i] = offset;
-+		sbi->s_mb_maxs[i] = max;
-+		offset += 1 << (sb->s_blocksize_bits - i);
-+		max = max >> 1;
-+		i++;
-+	} while (i <= sb->s_blocksize_bits + 1);
-+
-+	/* init file for buddy data */
-+	if ((i = ext3_mb_init_backend(sb))) {
-+		clear_opt(sbi->s_mount_opt, MBALLOC);
-+		kfree(sbi->s_mb_offsets);
-+		kfree(sbi->s_mb_maxs);
-+		return i;
-+	}
-+
-+	spin_lock_init(&sbi->s_md_lock);
-+	INIT_LIST_HEAD(&sbi->s_active_transaction);
-+	INIT_LIST_HEAD(&sbi->s_closed_transaction);
-+	INIT_LIST_HEAD(&sbi->s_committed_transaction);
-+	spin_lock_init(&sbi->s_bal_lock);
-+
-+	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
-+	sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
-+	sbi->s_mb_max_groups_to_scan = MB_DEFAULT_MAX_GROUPS_TO_SCAN;
-+	sbi->s_mb_stats = MB_DEFAULT_STATS;
-+	sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
-+	sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
-+	sbi->s_mb_history_filter = EXT3_MB_HISTORY_DEFAULT;
-+
-+	i = sizeof(struct ext3_locality_group) * NR_CPUS;
-+	sbi->s_locality_groups = kmalloc(i, GFP_NOFS);
-+	if
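-+	/*
-+	 * Worked example (not part of the original patch) for a 4KB block
-+	 * size: s_mb_maxs[0] is the 32768-bit block bitmap, and the loop
-+	 * above packs the per-order buddy bitmaps into a single block:
-+	 *
-+	 *	order:	 1	2	3	...	13
-+	 *	offset:	 0	2048	3072	...	4095	(bytes)
-+	 *	max:	 16384	8192	4096	...	4	(bits)
-+	 */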
(sbi->s_locality_groups == NULL) { -+ clear_opt(sbi->s_mount_opt, MBALLOC); -+ kfree(sbi->s_mb_offsets); -+ kfree(sbi->s_mb_maxs); -+ return -ENOMEM; -+ } -+ for (i = 0; i < NR_CPUS; i++) { -+ struct ext3_locality_group *lg; -+ lg = &sbi->s_locality_groups[i]; -+ sema_init(&lg->lg_sem, 1); -+ INIT_LIST_HEAD(&lg->lg_prealloc_list); -+ spin_lock_init(&lg->lg_prealloc_lock); -+ } -+ -+ ext3_mb_init_per_dev_proc(sb); -+ ext3_mb_history_init(sb); -+ -+ printk("EXT3-fs: mballoc enabled\n"); -+ return 0; -+} -+ -+void ext3_mb_cleanup_pa(struct ext3_group_info *grp) -+{ -+ struct ext3_prealloc_space *pa; -+ struct list_head *cur, *tmp; -+ int count = 0; -+ -+ list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { -+ pa = list_entry(cur, struct ext3_prealloc_space, pa_group_list); -+ list_del_rcu(&pa->pa_group_list); -+ count++; -+ kfree(pa); -+ } -+ if (count) -+ mb_debug("mballoc: %u PAs left\n", count); -+ -+} -+ -+int ext3_mb_release(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i, num_meta_group_infos; -+ -+ if (!test_opt(sb, MBALLOC)) -+ return 0; -+ -+ /* release freed, non-committed blocks */ -+ spin_lock(&sbi->s_md_lock); -+ list_splice_init(&sbi->s_closed_transaction, -+ &sbi->s_committed_transaction); -+ list_splice_init(&sbi->s_active_transaction, -+ &sbi->s_committed_transaction); -+ spin_unlock(&sbi->s_md_lock); -+ ext3_mb_free_committed_blocks(sb); -+ -+ if (sbi->s_group_info) { -+ for (i = 0; i < sbi->s_groups_count; i++) { -+#ifdef DOUBLE_CHECK -+ if (EXT3_GROUP_INFO(sb, i)->bb_bitmap) -+ kfree(EXT3_GROUP_INFO(sb, i)->bb_bitmap); -+#endif -+ ext3_mb_cleanup_pa(EXT3_GROUP_INFO(sb, i)); -+ kfree(EXT3_GROUP_INFO(sb, i)); -+ } -+ num_meta_group_infos = (sbi->s_groups_count + -+ EXT3_DESC_PER_BLOCK(sb) - 1) >> -+ EXT3_DESC_PER_BLOCK_BITS(sb); -+ for (i = 0; i < num_meta_group_infos; i++) -+ kfree(sbi->s_group_info[i]); -+ kfree(sbi->s_group_info); -+ } -+ if (sbi->s_mb_offsets) -+ kfree(sbi->s_mb_offsets); -+ if (sbi->s_mb_maxs) -+ kfree(sbi->s_mb_maxs); -+ if (sbi->s_buddy_cache) -+ iput(sbi->s_buddy_cache); -+ if (sbi->s_mb_stats) { -+ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n", -+ atomic_read(&sbi->s_bal_allocated), -+ atomic_read(&sbi->s_bal_reqs), -+ atomic_read(&sbi->s_bal_success)); -+ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, " -+ "%u 2^N hits, %u breaks, %u lost\n", -+ atomic_read(&sbi->s_bal_ex_scanned), -+ atomic_read(&sbi->s_bal_goals), -+ atomic_read(&sbi->s_bal_2orders), -+ atomic_read(&sbi->s_bal_breaks), -+ atomic_read(&sbi->s_mb_lost_chunks)); -+ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n", -+ sbi->s_mb_buddies_generated++, -+ sbi->s_mb_generation_time); -+ printk("EXT3-fs: mballoc: %u preallocated, %u discarded\n", -+ atomic_read(&sbi->s_mb_preallocated), -+ atomic_read(&sbi->s_mb_discarded)); -+ } -+ -+ if (sbi->s_locality_groups) -+ kfree(sbi->s_locality_groups); -+ -+ ext3_mb_history_release(sb); -+ ext3_mb_destroy_per_dev_proc(sb); -+ -+ return 0; -+} -+ -+void ext3_mb_free_committed_blocks(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int err, i, count = 0, count2 = 0; -+ struct ext3_free_metadata *md; -+ struct ext3_buddy e3b; -+ -+ if (list_empty(&sbi->s_committed_transaction)) -+ return; -+ -+ /* there is committed blocks to be freed yet */ -+ do { -+ /* get next array of blocks */ -+ md = NULL; -+ spin_lock(&sbi->s_md_lock); -+ if (!list_empty(&sbi->s_committed_transaction)) { -+ md = list_entry(sbi->s_committed_transaction.next, -+ struct 
ext3_free_metadata, list);
-+			list_del(&md->list);
-+		}
-+		spin_unlock(&sbi->s_md_lock);
-+
-+		if (md == NULL)
-+			break;
-+
-+		mb_debug("gonna free %u blocks in group %u (0x%p):",
-+				md->num, md->group, md);
-+
-+		err = ext3_mb_load_buddy(sb, md->group, &e3b);
-+		/* we expect to find existing buddy because it's pinned */
-+		BUG_ON(err != 0);
-+
-+		/* there are blocks to put in buddy to make them really free */
-+		count += md->num;
-+		count2++;
-+		ext3_lock_group(sb, md->group);
-+		for (i = 0; i < md->num; i++) {
-+			mb_debug(" %u", md->blocks[i]);
-+			err = mb_free_blocks(NULL, &e3b, md->blocks[i], 1);
-+			BUG_ON(err != 0);
-+		}
-+		mb_debug("\n");
-+		ext3_unlock_group(sb, md->group);
-+
-+		/* balance refcounts from ext3_mb_free_metadata() */
-+		page_cache_release(e3b.bd_buddy_page);
-+		page_cache_release(e3b.bd_bitmap_page);
-+
-+		kfree(md);
-+		ext3_mb_release_desc(&e3b);
-+
-+	} while (md);
-+
-+	mb_debug("freed %u blocks in %u structures\n", count, count2);
-+}
-+
-+#define EXT3_ROOT			"ext3"
-+#define EXT3_MB_STATS_NAME		"stats"
-+#define EXT3_MB_MAX_TO_SCAN_NAME	"max_to_scan"
-+#define EXT3_MB_MIN_TO_SCAN_NAME	"min_to_scan"
-+#define EXT3_MB_ORDER2_REQ		"order2_req"
-+#define EXT3_MB_STREAM_REQ		"stream_req"
-+
-+static int ext3_mb_stats_read(char *page, char **start, off_t off,
-+		int count, int *eof, void *data)
-+{
-+	struct ext3_sb_info *sbi = data;
-+	int len;
-+
-+	*eof = 1;
-+	if (off != 0)
-+		return 0;
-+
-+	len = sprintf(page, "%ld\n", sbi->s_mb_stats);
-+	*start = page;
-+	return len;
-+}
-+
-+static int ext3_mb_stats_write(struct file *file, const char *buffer,
-+		unsigned long count, void *data)
-+{
-+	struct ext3_sb_info *sbi = data;
-+	char str[32];
-+
-+	if (count >= sizeof(str)) {
-+		printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n",
-+				EXT3_MB_STATS_NAME, (int)sizeof(str));
-+		return -EOVERFLOW;
-+	}
-+
-+	if (copy_from_user(str, buffer, count))
-+		return -EFAULT;
-+
-+	/* any non-zero value enables stats, zero disables them */
-+	sbi->s_mb_stats = (simple_strtol(str, NULL, 0) != 0);
-+	return count;
-+}
-+
-+static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off,
-+		int count, int *eof, void *data)
-+{
-+	struct ext3_sb_info *sbi = data;
-+	int len;
-+
-+	*eof = 1;
-+	if (off != 0)
-+		return 0;
-+
-+	len = sprintf(page, "%ld\n", sbi->s_mb_max_to_scan);
-+	*start = page;
-+	return len;
-+}
-+
-+static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer,
-+		unsigned long count, void *data)
-+{
-+	struct ext3_sb_info *sbi = data;
-+	char str[32];
-+	long value;
-+
-+	if (count >= sizeof(str)) {
-+		printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n",
-+				EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str));
-+		return -EOVERFLOW;
-+	}
-+
-+	if (copy_from_user(str, buffer, count))
-+		return -EFAULT;
-+
-+	/* accept any positive value */
-+	value = simple_strtol(str, NULL, 0);
-+	if (value <= 0)
-+		return -ERANGE;
-+
-+	sbi->s_mb_max_to_scan = value;
-+
-+	return count;
-+}
-+
-+static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off,
-+		int count, int *eof, void *data)
-+{
-+	struct ext3_sb_info *sbi = data;
-+	int len;
-+
-+	*eof = 1;
-+	if (off != 0)
-+		return 0;
-+
-+	len = sprintf(page, "%ld\n", sbi->s_mb_min_to_scan);
-+	*start = page;
-+	return len;
-+}
-+
-+static int ext3_mb_order2_req_write(struct file *file, const char *buffer,
-+		unsigned long count, void *data)
-+{
-+	struct ext3_sb_info *sbi = data;
-+	char str[32];
-+	long value;
-+
-+	if (count >= sizeof(str)) {
-+		printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n",
-+				EXT3_MB_ORDER2_REQ, (int)sizeof(str));
-+		return -EOVERFLOW;
-+	}
-+
-+	if (copy_from_user(str, buffer, count))
-+		return -EFAULT;
-+
-+	/* accept any positive value */
-+	value = simple_strtol(str, NULL, 0);
-+	if (value <= 0)
-+		return -ERANGE;
-+
-+	sbi->s_mb_order2_reqs = value;
-+
-+	return count;
-+}
-+
-+static int ext3_mb_order2_req_read(char *page, char **start, off_t off,
-+		int count, int *eof, void *data)
-+{
-+	struct ext3_sb_info *sbi = data;
-+	int len;
-+
-+	*eof = 1;
-+	if (off != 0)
-+		return 0;
-+
-+	len = sprintf(page, "%ld\n", sbi->s_mb_order2_reqs);
-+	*start = page;
-+	return len;
-+}
-+
-+static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer,
-+		unsigned long count, void *data)
-+{
-+	struct ext3_sb_info *sbi = data;
-+	char str[32];
-+	long value;
-+
-+	if (count >= sizeof(str)) {
-+		printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n",
-+				EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str));
-+		return -EOVERFLOW;
-+	}
-+
-+	if (copy_from_user(str, buffer, count))
-+		return -EFAULT;
-+
-+	/* accept any positive value */
-+	value = simple_strtol(str, NULL, 0);
-+	if (value <= 0)
-+		return -ERANGE;
-+
-+	sbi->s_mb_min_to_scan = value;
-+
-+	return count;
-+}
-+
-+static int ext3_mb_stream_req_read(char *page, char **start, off_t off,
-+		int count, int *eof, void *data)
-+{
-+	struct ext3_sb_info *sbi = data;
-+	int len;
-+
-+	*eof = 1;
-+	if (off != 0)
-+		return 0;
-+
-+	len = sprintf(page, "%ld\n", sbi->s_mb_stream_request);
-+	*start = page;
-+	return len;
-+}
-+
-+static int ext3_mb_stream_req_write(struct file *file, const char *buffer,
-+		unsigned long count, void *data)
-+{
-+	struct ext3_sb_info *sbi = data;
-+	char str[32];
-+	long value;
-+
-+	if (count >= sizeof(str)) {
-+		printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n",
-+				EXT3_MB_STREAM_REQ, (int)sizeof(str));
-+		return -EOVERFLOW;
-+	}
-+
-+	if (copy_from_user(str, buffer, count))
-+		return -EFAULT;
-+
-+	/* accept any positive value */
-+	value = simple_strtol(str, NULL, 0);
-+	if (value <= 0)
-+		return -ERANGE;
-+
-+	sbi->s_mb_stream_request = value;
-+
-+	return count;
-+}
-+
-+int ext3_mb_init_per_dev_proc(struct super_block *sb)
-+{
-+	mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
-+	struct ext3_sb_info *sbi = EXT3_SB(sb);
-+	struct proc_dir_entry *proc;
-+	char devname[64], *name;
-+
-+	snprintf(devname, sizeof(devname) - 1, "%s",
-+			bdevname(sb->s_bdev, devname));
-+	sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext3);
-+
-+	name = EXT3_MB_STATS_NAME;
-+	proc = create_proc_entry(name, mode, sbi->s_mb_proc);
-+	if (proc == NULL)
-+		goto err_out;
-+	proc->data = sbi;
-+	proc->read_proc = ext3_mb_stats_read;
-+	proc->write_proc = ext3_mb_stats_write;
-+
-+	name = EXT3_MB_MAX_TO_SCAN_NAME;
-+	proc = create_proc_entry(name, mode, sbi->s_mb_proc);
-+	if (proc == NULL)
-+		goto err_out;
-+	proc->data = sbi;
-+	proc->read_proc = ext3_mb_max_to_scan_read;
-+	proc->write_proc = ext3_mb_max_to_scan_write;
-+
-+	name = EXT3_MB_MIN_TO_SCAN_NAME;
-+	proc = create_proc_entry(name, mode, sbi->s_mb_proc);
-+	if (proc == NULL)
-+		goto err_out;
-+	proc->data = sbi;
-+	proc->read_proc = ext3_mb_min_to_scan_read;
-+	proc->write_proc = ext3_mb_min_to_scan_write;
-+
-+	name = EXT3_MB_ORDER2_REQ;
-+	proc = create_proc_entry(name, mode, sbi->s_mb_proc);
-+	if (proc == NULL)
-+		goto err_out;
-+	proc->data = sbi;
-+	proc->read_proc =
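-+	/*
-+	 * Each entry registered here pairs one of the *_read handlers above
-+	 * with its *_write counterpart, so the tunables appear as regular
-+	 * files under /proc/fs/ext3/<bdev>/, e.g. (illustrative device name):
-+	 *
-+	 *	cat /proc/fs/ext3/sda1/max_to_scan
-+	 *	echo 200 > /proc/fs/ext3/sda1/max_to_scan
-+	 */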
ext3_mb_order2_req_read; -+ proc->write_proc = ext3_mb_order2_req_write; -+ -+ name = EXT3_MB_STREAM_REQ; -+ proc = create_proc_entry(name, mode, sbi->s_mb_proc); -+ if (proc == NULL) -+ goto err_out; -+ proc->data = sbi; -+ proc->read_proc = ext3_mb_stream_req_read; -+ proc->write_proc = ext3_mb_stream_req_write; -+ -+ return 0; -+ -+err_out: -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", name); -+ remove_proc_entry(EXT3_MB_STREAM_REQ, sbi->s_mb_proc); -+ remove_proc_entry(EXT3_MB_ORDER2_REQ, sbi->s_mb_proc); -+ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); -+ remove_proc_entry(EXT3_MB_STATS_NAME, sbi->s_mb_proc); -+ remove_proc_entry(devname, proc_root_ext3); -+ sbi->s_mb_proc = NULL; -+ -+ return -ENOMEM; -+} -+ -+int ext3_mb_destroy_per_dev_proc(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ char devname[64]; -+ -+ if (sbi->s_mb_proc == NULL) -+ return -EINVAL; -+ -+ snprintf(devname, sizeof(devname) - 1, "%s", -+ bdevname(sb->s_bdev, devname)); -+ remove_proc_entry(EXT3_MB_STREAM_REQ, sbi->s_mb_proc); -+ remove_proc_entry(EXT3_MB_ORDER2_REQ, sbi->s_mb_proc); -+ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); -+ remove_proc_entry(EXT3_MB_STATS_NAME, sbi->s_mb_proc); -+ remove_proc_entry(devname, proc_root_ext3); -+ -+ return 0; -+} -+ -+int __init init_ext3_proc(void) -+{ -+ ext3_pspace_cachep = -+ kmem_cache_create("ext3_prealloc_space", -+ sizeof(struct ext3_prealloc_space), -+ 0, SLAB_RECLAIM_ACCOUNT, NULL, NULL); -+ if (ext3_pspace_cachep == NULL) -+ return -ENOMEM; -+ -+ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); -+ if (proc_root_ext3 == NULL) -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", EXT3_ROOT); -+ -+ return 0; -+} -+ -+void exit_ext3_proc(void) -+{ -+ /* XXX: synchronize_rcu(); */ -+ kmem_cache_destroy(ext3_pspace_cachep); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+} -+ -+ -+/* -+ * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps -+ * Returns 0 if success or error code -+ */ -+int ext3_mb_mark_diskspace_used(struct ext3_allocation_context *ac, handle_t *handle) -+{ -+ struct buffer_head *bitmap_bh = NULL; -+ struct ext3_super_block *es; -+ struct ext3_group_desc *gdp; -+ struct buffer_head *gdp_bh; -+ struct ext3_sb_info *sbi; -+ struct super_block *sb; -+ sector_t block; -+ int err; -+ -+ BUG_ON(ac->ac_status != AC_STATUS_FOUND); -+ BUG_ON(ac->ac_b_ex.fe_len <= 0); -+ -+ sb = ac->ac_sb; -+ sbi = EXT3_SB(sb); -+ es = sbi->s_es; -+ -+ ext3_debug("using block group %d(%d)\n", ac->ac_b_group.group, -+ gdp->bg_free_blocks_count); -+ -+ err = -EIO; -+ bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group); -+ if (!bitmap_bh) -+ goto out_err; -+ -+ err = ext3_journal_get_write_access(handle, bitmap_bh); -+ if (err) -+ goto out_err; -+ -+ err = -EIO; -+ gdp = ext3_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); -+ if (!gdp) -+ goto out_err; -+ -+ err = ext3_journal_get_write_access(handle, gdp_bh); -+ if (err) -+ goto out_err; -+ -+ block = ac->ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + ac->ac_b_ex.fe_start -+ + le32_to_cpu(es->s_first_data_block); -+ -+ if (block == le32_to_cpu(gdp->bg_block_bitmap) || -+ block == le32_to_cpu(gdp->bg_inode_bitmap) || -+ in_range(block, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group)) -+ ext3_error(sb, __FUNCTION__, -+ "Allocating block in system zone - block = %lu", -+ (unsigned long) 
block); -+#ifdef AGGRESSIVE_CHECK -+ { -+ int i; -+ for (i = 0; i < ac->ac_b_ex.fe_len; i++) { -+ BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, -+ bitmap_bh->b_data)); -+ } -+ } -+#endif -+ mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data, -+ ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); -+ -+ spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) -+ - ac->ac_b_ex.fe_len); -+ spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); -+ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac->ac_b_ex.fe_len); -+ -+ err = ext3_journal_dirty_metadata(handle, bitmap_bh); -+ if (err) -+ goto out_err; -+ err = ext3_journal_dirty_metadata(handle, gdp_bh); -+ -+out_err: -+ sb->s_dirt = 1; -+ brelse(bitmap_bh); -+ return err; -+} -+ -+/* -+ * here we normalize request for locality group -+ * XXX: should we try to preallocate more than the group has now? -+ */ -+void ext3_mb_normalize_group_request(struct ext3_allocation_context *ac) -+{ -+ struct super_block *sb = ac->ac_sb; -+ struct ext3_locality_group *lg = ac->ac_lg; -+ -+ BUG_ON(lg == NULL); -+ if (EXT3_SB(sb)->s_stripe) -+ ac->ac_g_ex.fe_len = EXT3_SB(sb)->s_stripe; -+ else -+ ac->ac_g_ex.fe_len = (1024 * 1024) >> sb->s_blocksize_bits; -+ -+ mb_debug("#%u: goal %u blocks for locality group\n", -+ current->pid, ac->ac_g_ex.fe_len); -+} -+ -+/* -+ * Normalization means making request better in terms of -+ * size and alignment -+ */ -+void ext3_mb_normalize_request(struct ext3_allocation_context *ac, -+ struct ext3_allocation_request *ar) -+{ -+ struct ext3_inode_info *ei = EXT3_I(ac->ac_inode); -+ loff_t start, end, size, orig_size, orig_start; -+ struct list_head *cur; -+ int bsbits, max; -+ -+ /* do normalize only data requests, metadata requests -+ do not need preallocation */ -+ if (!(ac->ac_flags & EXT3_MB_HINT_DATA)) -+ return; -+ -+ /* sometime caller may want exact blocks */ -+ if (unlikely(ac->ac_flags & EXT3_MB_HINT_GOAL_ONLY)) -+ return; -+ -+ /* caller may indicate that preallocation isn't -+ * required (it's a tail, for example) */ -+ if (ac->ac_flags & EXT3_MB_HINT_NOPREALLOC) -+ return; -+ -+ if (ac->ac_flags & EXT3_MB_HINT_GROUP_ALLOC) -+ return ext3_mb_normalize_group_request(ac); -+ -+ bsbits = ac->ac_sb->s_blocksize_bits; -+ -+ /* first, let's learn actual file size -+ * given current request is allocated */ -+ size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; -+ size = size << bsbits; -+ if (size < i_size_read(ac->ac_inode)) -+ size = i_size_read(ac->ac_inode); -+ -+ /* max available blocks in a free group */ -+ max = EXT3_BLOCKS_PER_GROUP(ac->ac_sb) - 1 - 1 -+ - EXT3_SB(ac->ac_sb)->s_itb_per_group; -+ -+#define NRL_CHECK_SIZE(req,size,max,bits) \ -+ (req <= (size) || max <= ((size) >> bits)) -+ -+ /* first, try to predict filesize */ -+ /* XXX: should this table be tunable? 
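-+	 * (for instance, a file whose projected size lands at 100KB is
-+	 * normalized by the steps below to a 128KB goal, i.e. 32 blocks on
-+	 * a 4KB-block filesystem, before the goal is trimmed against
-+	 * neighbouring blocks and existing preallocations)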
*/ -+ start = 0; -+ if (size <= 16 * 1024) { -+ size = 16 * 1024; -+ } else if (size <= 32 * 1024) { -+ size = 32 * 1024; -+ } else if (size <= 64 * 1024) { -+ size = 64 * 1024; -+ } else if (size <= 128 * 1024) { -+ size = 128 * 1024; -+ } else if (size <= 256 * 1024) { -+ size = 256 * 1024; -+ } else if (size <= 512 * 1024) { -+ size = 512 * 1024; -+ } else if (size <= 1024 * 1024) { -+ size = 1024 * 1024; -+ } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, bsbits)) { -+ start = ac->ac_o_ex.fe_logical << bsbits; -+ start = (start / (1024 * 1024)) * (1024 * 1024); -+ size = 1024 * 1024; -+ } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, bsbits)) { -+ start = ac->ac_o_ex.fe_logical << bsbits; -+ start = (start / (4 * (1024 * 1024))) * 4 * (1024 * 1024); -+ size = 4 * 1024 * 1024; -+ } else if(NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,(8<<20)>>bsbits,max,bsbits)){ -+ start = ac->ac_o_ex.fe_logical; -+ start = start << bsbits; -+ start = (start / (8 * (1024 * 1024))) * 8 * (1024 * 1024); -+ size = 8 * 1024 * 1024; -+ } else { -+ start = ac->ac_o_ex.fe_logical; -+ start = start << bsbits; -+ size = ac->ac_o_ex.fe_len << bsbits; -+ } -+ orig_size = size = size >> bsbits; -+ orig_start = start = start >> bsbits; -+ -+ /* don't cover already allocated blocks in selected range */ -+ if (ar->pleft && start <= ar->lleft) { -+ size -= ar->lleft + 1 - start; -+ start = ar->lleft + 1; -+ } -+ if (ar->pright && start + size - 1 >= ar->lright) -+ size -= start + size - ar->lright; -+ -+ end = start + size; -+ -+ /* check we don't cross already preallocated blocks */ -+ rcu_read_lock(); -+ list_for_each_rcu(cur, &ei->i_prealloc_list) { -+ struct ext3_prealloc_space *pa; -+ unsigned long pa_end; -+ -+ pa = list_entry(cur, struct ext3_prealloc_space, pa_inode_list); -+ -+ if (pa->pa_deleted) -+ continue; -+ spin_lock(&pa->pa_lock); -+ if (pa->pa_deleted) { -+ spin_unlock(&pa->pa_lock); -+ continue; -+ } -+ -+ pa_end = pa->pa_lstart + pa->pa_len; -+ -+ /* PA must not overlap original request */ -+ BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || -+ ac->ac_o_ex.fe_logical < pa->pa_lstart)); -+ -+ /* skip PA normalized request doesn't overlap with */ -+ if (pa->pa_lstart >= end) { -+ spin_unlock(&pa->pa_lock); -+ continue; -+ } -+ if (pa_end <= start) { -+ spin_unlock(&pa->pa_lock); -+ continue; -+ } -+ BUG_ON(pa->pa_lstart <= start && pa_end >= end); -+ -+ if (pa_end <= ac->ac_o_ex.fe_logical) { -+ BUG_ON(pa_end < start); -+ start = pa_end; -+ } -+ -+ if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { -+ BUG_ON(pa->pa_lstart > end); -+ end = pa->pa_lstart; -+ } -+ spin_unlock(&pa->pa_lock); -+ } -+ rcu_read_unlock(); -+ size = end - start; -+ -+ /* XXX: extra loop to check we really don't overlap preallocations */ -+ rcu_read_lock(); -+ list_for_each_rcu(cur, &ei->i_prealloc_list) { -+ struct ext3_prealloc_space *pa; -+ unsigned long pa_end; -+ pa = list_entry(cur, struct ext3_prealloc_space, pa_inode_list); -+ spin_lock(&pa->pa_lock); -+ if (pa->pa_deleted == 0) { -+ pa_end = pa->pa_lstart + pa->pa_len; -+ BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); -+ } -+ spin_unlock(&pa->pa_lock); -+ } -+ rcu_read_unlock(); -+ -+ if (start + size <= ac->ac_o_ex.fe_logical && -+ start > ac->ac_o_ex.fe_logical) { -+ printk("start %lu, size %lu, fe_logical %lu\n", -+ (unsigned long) start, (unsigned long) size, -+ (unsigned long) ac->ac_o_ex.fe_logical); -+ } -+ BUG_ON(start + size <= ac->ac_o_ex.fe_logical && -+ start > ac->ac_o_ex.fe_logical); -+ -+ /* now prepare goal request */ -+ BUG_ON(size <= 0 || size >= 
EXT3_BLOCKS_PER_GROUP(ac->ac_sb)); -+ if (size < ac->ac_o_ex.fe_len) { -+ /* XXX: don't normalize tails? */ -+ } -+ -+ /* XXX: is it better to align blocks WRT to logical placement -+ * or satisfy big request as is */ -+ ac->ac_g_ex.fe_logical = start; -+ ac->ac_g_ex.fe_len = size; -+ -+ mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size, -+ (unsigned) orig_size, (unsigned) start); -+} -+ -+void ext3_mb_collect_stats(struct ext3_allocation_context *ac) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); -+ -+ if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { -+ atomic_inc(&sbi->s_bal_reqs); -+ atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); -+ if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len) -+ atomic_inc(&sbi->s_bal_success); -+ atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); -+ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && -+ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) -+ atomic_inc(&sbi->s_bal_goals); -+ if (ac->ac_found > sbi->s_mb_max_to_scan) -+ atomic_inc(&sbi->s_bal_breaks); -+ } -+ -+ ext3_mb_store_history(ac); -+} -+ -+/* -+ * use blocks preallocated to inode -+ */ -+void ext3_mb_use_inode_pa(struct ext3_allocation_context *ac, -+ struct ext3_prealloc_space *pa) -+{ -+ unsigned long start, len; -+ -+ /* found preallocated blocks, use them */ -+ start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); -+ len = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len); -+ len = len - start; -+ ext3_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, -+ &ac->ac_b_ex.fe_start); -+ ac->ac_b_ex.fe_len = len; -+ ac->ac_status = AC_STATUS_FOUND; -+ ac->ac_pa = pa; -+ -+ BUG_ON(start < pa->pa_pstart); -+ BUG_ON(start + len > pa->pa_pstart + pa->pa_len); -+ BUG_ON(pa->pa_free < len); -+ pa->pa_free -= len; -+ -+ mb_debug("use %lu/%lu from inode pa %p\n", start, len, pa); -+} -+ -+/* -+ * use blocks preallocated to locality group -+ */ -+void ext3_mb_use_group_pa(struct ext3_allocation_context *ac, -+ struct ext3_prealloc_space *pa) -+{ -+ unsigned len = ac->ac_o_ex.fe_len; -+ -+ ext3_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, -+ &ac->ac_b_ex.fe_group, -+ &ac->ac_b_ex.fe_start); -+ ac->ac_b_ex.fe_len = len; -+ ac->ac_status = AC_STATUS_FOUND; -+ ac->ac_pa = pa; -+ -+ /* we don't correct pa_pstart or pa_plen here to avoid -+ * possible race when tte group is being loaded concurrently -+ * instead we correct pa later, after blocks are marked -+ * in on-disk bitmap -- see ext3_mb_release_context() */ -+ mb_debug("use %lu/%lu from group pa %p\n", pa->pa_lstart-len, len, pa); -+} -+ -+/* -+ * search goal blocks in preallocated space -+ */ -+int ext3_mb_use_preallocated(struct ext3_allocation_context *ac) -+{ -+ struct ext3_inode_info *ei = EXT3_I(ac->ac_inode); -+ struct ext3_locality_group *lg; -+ struct ext3_prealloc_space *pa; -+ struct list_head *cur; -+ -+ /* only data can be preallocated */ -+ if (!(ac->ac_flags & EXT3_MB_HINT_DATA)) -+ return 0; -+ -+ /* first, try per-file preallocation */ -+ rcu_read_lock(); -+ list_for_each_rcu(cur, &ei->i_prealloc_list) { -+ pa = list_entry(cur, struct ext3_prealloc_space, pa_inode_list); -+ -+ /* all fields in this condition don't change, -+ * so we can skip locking for them */ -+ if (ac->ac_o_ex.fe_logical < pa->pa_lstart || -+ ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) -+ continue; -+ -+ /* found preallocated blocks, use them */ -+ spin_lock(&pa->pa_lock); -+ if (pa->pa_deleted == 0 && pa->pa_free) { -+ atomic_inc(&pa->pa_count); -+ ext3_mb_use_inode_pa(ac, pa); -+ 
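-+			/*
-+			 * Worked example (not from the patch): with
-+			 * pa_lstart 100, pa_pstart 5000, pa_len 16 and a
-+			 * 4-block request at logical 106,
-+			 * ext3_mb_use_inode_pa() yields physical start
-+			 * 5000 + (106 - 100) = 5006 and
-+			 * len = min(5016, 5010) - 5006 = 4.
-+			 */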
spin_unlock(&pa->pa_lock); -+ ac->ac_criteria = 10; -+ rcu_read_unlock(); -+ return 1; -+ } -+ spin_unlock(&pa->pa_lock); -+ } -+ rcu_read_unlock(); -+ -+ /* can we use group allocation? */ -+ if (!(ac->ac_flags & EXT3_MB_HINT_GROUP_ALLOC)) -+ return 0; -+ -+ /* inode may have no locality group for some reason */ -+ lg = ac->ac_lg; -+ if (lg == NULL) -+ return 0; -+ -+ rcu_read_lock(); -+ list_for_each_rcu(cur, &lg->lg_prealloc_list) { -+ pa = list_entry(cur, struct ext3_prealloc_space, pa_inode_list); -+ spin_lock(&pa->pa_lock); -+ if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) { -+ atomic_inc(&pa->pa_count); -+ ext3_mb_use_group_pa(ac, pa); -+ spin_unlock(&pa->pa_lock); -+ ac->ac_criteria = 20; -+ rcu_read_unlock(); -+ return 1; -+ } -+ spin_unlock(&pa->pa_lock); -+ } -+ rcu_read_unlock(); -+ -+ return 0; -+} -+ -+/* -+ * the function goes through all preallocation in this group and marks them -+ * used in in-core bitmap. buddy must be generated from this bitmap -+ */ -+void ext3_mb_generate_from_pa(struct super_block *sb, void *bitmap, int group) -+{ -+ struct ext3_group_info *grp = EXT3_GROUP_INFO(sb, group); -+ struct ext3_prealloc_space *pa; -+ struct list_head *cur; -+ unsigned long groupnr; -+ unsigned long start; -+ int preallocated = 0, count = 0, len; -+ -+ /* all form of preallocation discards first load group, -+ * so the only competing code is preallocation use. -+ * we don't need any locking here -+ * notice we do NOT ignore preallocations with pa_deleted -+ * otherwise we could leave used blocks available for -+ * allocation in buddy when concurrent ext3_mb_put_pa() -+ * is dropping preallocation -+ */ -+ list_for_each_rcu(cur, &grp->bb_prealloc_list) { -+ pa = list_entry(cur, struct ext3_prealloc_space, pa_group_list); -+ spin_lock(&pa->pa_lock); -+ ext3_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &start); -+ len = pa->pa_len; -+ spin_unlock(&pa->pa_lock); -+ if (unlikely(len == 0)) -+ continue; -+ BUG_ON(groupnr != group && len != 0); -+ mb_set_bits(sb_bgl_lock(EXT3_SB(sb), group), bitmap, start,len); -+ preallocated += len; -+ count++; -+ } -+ mb_debug("prellocated %u for group %u\n", preallocated, group); -+} -+ -+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,5) -+static void ext3_mb_pa_callback(struct rcu_head *head) -+{ -+ struct ext3_prealloc_space *pa; -+ pa = container_of(head, struct ext3_prealloc_space, u.pa_rcu); -+ kmem_cache_free(ext3_pspace_cachep, pa); -+} -+#define mb_call_rcu(__pa) call_rcu(&(__pa)->u.pa_rcu, ext3_mb_pa_callback) -+#else -+static void ext3_mb_pa_callback(void *pa) -+{ -+ kmem_cache_free(ext3_pspace_cachep, pa); -+} -+#define mb_call_rcu(__pa) call_rcu(&(__pa)->u.pa_rcu, ext3_mb_pa_callback, pa) -+#endif -+ -+/* -+ * drops a reference to preallocated space descriptor -+ * if this was the last reference and the space is consumed -+ */ -+void ext3_mb_put_pa(struct ext3_allocation_context *ac, -+ struct super_block *sb, struct ext3_prealloc_space *pa) -+{ -+ unsigned long grp; -+ -+ if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) -+ return; -+ -+ /* in this short window concurrent discard can set pa_deleted */ -+ spin_lock(&pa->pa_lock); -+ if (pa->pa_deleted == 0) { -+ spin_unlock(&pa->pa_lock); -+ return; -+ } -+ -+ pa->pa_deleted = 1; -+ spin_unlock(&pa->pa_lock); -+ -+ /* -1 is to protect from crossing allocation group */ -+ ext3_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL); -+ -+ /* -+ * possible race: -+ * -+ * P1 (buddy init) P2 (regular allocation) -+ * find block B in PA -+ * copy 
on-disk bitmap to buddy -+ * mark B in on-disk bitmap -+ * drop PA from group -+ * mark all PAs in buddy -+ * -+ * thus, P1 initializes buddy with B available. to prevent this -+ * we make "copy" and "mark all PAs" atomic and serialize "drop PA" -+ * against that pair -+ */ -+ ext3_lock_group(sb, grp); -+ list_del_rcu(&pa->pa_group_list); -+ ext3_unlock_group(sb, grp); -+ -+ spin_lock(pa->pa_obj_lock); -+ list_del_rcu(&pa->pa_inode_list); -+ spin_unlock(pa->pa_obj_lock); -+ -+ mb_call_rcu(pa); -+} -+ -+/* -+ * creates new preallocated space for given inode -+ */ -+int ext3_mb_new_inode_pa(struct ext3_allocation_context *ac) -+{ -+ struct super_block *sb = ac->ac_sb; -+ struct ext3_prealloc_space *pa; -+ struct ext3_group_info *grp; -+ struct ext3_inode_info *ei; -+ -+ /* preallocate only when found space is larger then requested */ -+ BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); -+ BUG_ON(ac->ac_status != AC_STATUS_FOUND); -+ BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); -+ -+ pa = kmem_cache_alloc(ext3_pspace_cachep, GFP_NOFS); -+ if (pa == NULL) -+ return -ENOMEM; -+ -+ if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { -+ int winl, wins, win, offs; -+ -+ /* we can't allocate as much as normalizer wants. -+ * so, found space must get proper lstart -+ * to cover original request */ -+ BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); -+ BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); -+ -+ /* we're limited by original request in that -+ * logical block must be covered any way -+ * winl is window we can move our chunk within */ -+ winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; -+ -+ /* also, we should cover whole original request */ -+ wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len; -+ -+ /* the smallest one defines real window */ -+ win = min(winl, wins); -+ -+ offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len; -+ if (offs && offs < win) -+ win = offs; -+ -+ ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win; -+ BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); -+ BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); -+ } -+ -+ /* preallocation can change ac_b_ex, thus we store actually -+ * allocated blocks for history */ -+ ac->ac_f_ex = ac->ac_b_ex; -+ -+ pa->pa_lstart = ac->ac_b_ex.fe_logical; -+ pa->pa_pstart = ext3_grp_offs_to_block(sb, &ac->ac_b_ex); -+ pa->pa_len = ac->ac_b_ex.fe_len; -+ pa->pa_free = pa->pa_len; -+ atomic_set(&pa->pa_count, 1); -+ spin_lock_init(&pa->pa_lock); -+ pa->pa_deleted = 0; -+ pa->pa_linear = 0; -+ -+ mb_debug("new inode pa %p: %lu/%lu for %lu\n", pa, -+ pa->pa_pstart, pa->pa_len, pa->pa_lstart); -+ -+ ext3_mb_use_inode_pa(ac, pa); -+ atomic_add(pa->pa_free, &EXT3_SB(sb)->s_mb_preallocated); -+ -+ ei = EXT3_I(ac->ac_inode); -+ grp = EXT3_GROUP_INFO(sb, ac->ac_b_ex.fe_group); -+ -+ pa->pa_obj_lock = &ei->i_prealloc_lock; -+ pa->pa_inode = ac->ac_inode; -+ -+ ext3_lock_group(sb, ac->ac_b_ex.fe_group); -+ list_add_rcu(&pa->pa_group_list, &grp->bb_prealloc_list); -+ ext3_unlock_group(sb, ac->ac_b_ex.fe_group); -+ -+ spin_lock(pa->pa_obj_lock); -+ list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); -+ spin_unlock(pa->pa_obj_lock); -+ -+ return 0; -+} -+ -+/* -+ * creates new preallocated space for locality group inodes belongs to -+ */ -+int ext3_mb_new_group_pa(struct ext3_allocation_context *ac) -+{ -+ struct super_block *sb = ac->ac_sb; -+ struct ext3_locality_group *lg; -+ struct ext3_prealloc_space *pa; -+ struct ext3_group_info *grp; -+ -+ /* preallocate only when found space is larger then requested */ -+ BUG_ON(ac->ac_o_ex.fe_len 
>= ac->ac_b_ex.fe_len); -+ BUG_ON(ac->ac_status != AC_STATUS_FOUND); -+ BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); -+ -+ BUG_ON(ext3_pspace_cachep == NULL); -+ pa = kmem_cache_alloc(ext3_pspace_cachep, GFP_NOFS); -+ if (pa == NULL) -+ return -ENOMEM; -+ -+ /* preallocation can change ac_b_ex, thus we store actually -+ * allocated blocks for history */ -+ ac->ac_f_ex = ac->ac_b_ex; -+ -+ pa->pa_pstart = ext3_grp_offs_to_block(sb, &ac->ac_b_ex); -+ pa->pa_lstart = pa->pa_pstart; -+ pa->pa_len = ac->ac_b_ex.fe_len; -+ pa->pa_free = pa->pa_len; -+ atomic_set(&pa->pa_count, 1); -+ spin_lock_init(&pa->pa_lock); -+ pa->pa_deleted = 0; -+ pa->pa_linear = 1; -+ -+ mb_debug("new group pa %p: %lu/%lu for %lu\n", pa, -+ pa->pa_pstart, pa->pa_len, pa->pa_lstart); -+ -+ ext3_mb_use_group_pa(ac, pa); -+ atomic_add(pa->pa_free, &EXT3_SB(sb)->s_mb_preallocated); -+ -+ grp = EXT3_GROUP_INFO(sb, ac->ac_b_ex.fe_group); -+ lg = ac->ac_lg; -+ BUG_ON(lg == NULL); -+ -+ pa->pa_obj_lock = &lg->lg_prealloc_lock; -+ pa->pa_inode = NULL; -+ -+ ext3_lock_group(sb, ac->ac_b_ex.fe_group); -+ list_add_rcu(&pa->pa_group_list, &grp->bb_prealloc_list); -+ ext3_unlock_group(sb, ac->ac_b_ex.fe_group); -+ -+ spin_lock(pa->pa_obj_lock); -+ list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list); -+ spin_unlock(pa->pa_obj_lock); -+ -+ return 0; -+} -+ -+int ext3_mb_new_preallocation(struct ext3_allocation_context *ac) -+{ -+ int err; -+ -+ if (ac->ac_flags & EXT3_MB_HINT_GROUP_ALLOC) -+ err = ext3_mb_new_group_pa(ac); -+ else -+ err = ext3_mb_new_inode_pa(ac); -+ return err; -+} -+ -+/* -+ * finds all unused blocks in on-disk bitmap, frees them in -+ * in-core bitmap and buddy. -+ * @pa must be unlinked from inode and group lists, so that -+ * nobody else can find/use it. -+ * the caller MUST hold group/inode locks. -+ * TODO: optimize the case when there are no in-core structures yet -+ */ -+int ext3_mb_release_inode_pa(struct ext3_buddy *e3b, -+ struct buffer_head *bitmap_bh, -+ struct ext3_prealloc_space *pa) -+{ -+ struct ext3_allocation_context ac; -+ struct super_block *sb = e3b->bd_sb; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ unsigned long bit, end, next, group; -+ sector_t start; -+ int err = 0, free = 0; -+ -+ BUG_ON(pa->pa_deleted == 0); -+ ext3_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); -+ BUG_ON(group != e3b->bd_group && pa->pa_len != 0); -+ end = bit + pa->pa_len; -+ -+ ac.ac_sb = sb; -+ ac.ac_inode = pa->pa_inode; -+ ac.ac_op = EXT3_MB_HISTORY_DISCARD; -+ -+ while (bit < end) { -+ bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); -+ if (bit >= end) -+ break; -+ next = mb_find_next_bit(bitmap_bh->b_data, end, bit); -+ if (next > end) -+ next = end; -+ start = group * EXT3_BLOCKS_PER_GROUP(sb) + bit + -+ le32_to_cpu(sbi->s_es->s_first_data_block); -+ mb_debug(" free preallocated %u/%u in group %u\n", -+ (unsigned) start, (unsigned) next - bit, -+ (unsigned) group); -+ free += next - bit; -+ -+ ac.ac_b_ex.fe_group = group; -+ ac.ac_b_ex.fe_start = bit; -+ ac.ac_b_ex.fe_len = next - bit; -+ ac.ac_b_ex.fe_logical = 0; -+ ext3_mb_store_history(&ac); -+ -+ mb_free_blocks(pa->pa_inode, e3b, bit, next - bit); -+ bit = next + 1; -+ } -+ if (free != pa->pa_free) { -+ printk("pa %p: logic %lu, phys. 
%lu, len %lu\n", -+ pa, (unsigned long) pa->pa_lstart, -+ (unsigned long) pa->pa_pstart, -+ (unsigned long) pa->pa_len); -+ printk("free %u, pa_free %u\n", free, pa->pa_free); -+ } -+ BUG_ON(free != pa->pa_free); -+ atomic_add(free, &sbi->s_mb_discarded); -+ -+ return err; -+} -+ -+int ext3_mb_release_group_pa(struct ext3_buddy *e3b, -+ struct ext3_prealloc_space *pa) -+{ -+ struct ext3_allocation_context ac; -+ struct super_block *sb = e3b->bd_sb; -+ unsigned long bit, group; -+ -+ ac.ac_op = EXT3_MB_HISTORY_DISCARD; -+ -+ BUG_ON(pa->pa_deleted == 0); -+ ext3_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); -+ BUG_ON(group != e3b->bd_group && pa->pa_len != 0); -+ mb_free_blocks(pa->pa_inode, e3b, bit, pa->pa_len); -+ atomic_add(pa->pa_len, &EXT3_SB(sb)->s_mb_discarded); -+ -+ ac.ac_sb = sb; -+ ac.ac_inode = NULL; -+ ac.ac_b_ex.fe_group = group; -+ ac.ac_b_ex.fe_start = bit; -+ ac.ac_b_ex.fe_len = pa->pa_len; -+ ac.ac_b_ex.fe_logical = 0; -+ ext3_mb_store_history(&ac); -+ -+ return 0; -+} -+ -+/* -+ * releases all preallocations in given group -+ * -+ * first, we need to decide discard policy: -+ * - when do we discard -+ * 1) ENOSPC -+ * - how many do we discard -+ * 1) how many requested -+ */ -+int ext3_mb_discard_group_preallocations(struct super_block *sb, -+ int group, int needed) -+{ -+ struct ext3_group_info *grp = EXT3_GROUP_INFO(sb, group); -+ struct buffer_head *bitmap_bh = NULL; -+ struct ext3_prealloc_space *pa, *tmp; -+ struct list_head list; -+ struct ext3_buddy e3b; -+ int err, busy, free = 0; -+ -+ mb_debug("discard preallocation for group %lu\n", group); -+ -+ if (list_empty(&grp->bb_prealloc_list)) -+ return 0; -+ -+ bitmap_bh = read_block_bitmap(sb, group); -+ if (bitmap_bh == NULL) { -+ /* error handling here */ -+ ext3_mb_release_desc(&e3b); -+ BUG_ON(bitmap_bh == NULL); -+ } -+ -+ err = ext3_mb_load_buddy(sb, group, &e3b); -+ BUG_ON(err != 0); /* error handling here */ -+ -+ if (needed == 0) -+ needed = EXT3_BLOCKS_PER_GROUP(sb) + 1; -+ -+ grp = EXT3_GROUP_INFO(sb, group); -+ INIT_LIST_HEAD(&list); -+ -+repeat: -+ busy = 0; -+ ext3_lock_group(sb, group); -+ list_for_each_entry_safe (pa, tmp, &grp->bb_prealloc_list, pa_group_list) { -+ spin_lock(&pa->pa_lock); -+ if (atomic_read(&pa->pa_count)) { -+ spin_unlock(&pa->pa_lock); -+ printk("uh! busy PA\n"); -+ dump_stack(); -+ busy = 1; -+ continue; -+ } -+ if (pa->pa_deleted) { -+ spin_unlock(&pa->pa_lock); -+ continue; -+ } -+ -+ /* seems this one can be freed ... */ -+ pa->pa_deleted = 1; -+ -+ /* we can trust pa_free ... */ -+ free += pa->pa_free; -+ -+ spin_unlock(&pa->pa_lock); -+ -+ list_del_rcu(&pa->pa_group_list); -+ list_add(&pa->u.pa_tmp_list, &list); -+ } -+ -+ /* if we still need more blocks and some PAs were used, try again */ -+ if (free < needed && busy) { -+ ext3_unlock_group(sb, group); -+ goto repeat; -+ } -+ -+ /* found anything to free? 
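-+	 * (PAs marked pa_deleted above were moved to the local list and
-+	 * unlinked from the group; PAs still referenced, pa_count != 0,
-+	 * were skipped, and the scan was repeated while more space was
-+	 * still needed)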
*/ -+ if (list_empty(&list)) { -+ BUG_ON(free != 0); -+ goto out; -+ } -+ -+ /* now free all selected PAs */ -+ list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { -+ -+ /* remove from object (inode or locality group) */ -+ spin_lock(pa->pa_obj_lock); -+ list_del_rcu(&pa->pa_inode_list); -+ spin_unlock(pa->pa_obj_lock); -+ -+ if (pa->pa_linear) -+ ext3_mb_release_group_pa(&e3b, pa); -+ else -+ ext3_mb_release_inode_pa(&e3b, bitmap_bh, pa); -+ -+ list_del(&pa->u.pa_tmp_list); -+ mb_call_rcu(pa); -+ } -+ -+out: -+ ext3_unlock_group(sb, group); -+ ext3_mb_release_desc(&e3b); -+ brelse(bitmap_bh); -+ return free; -+} -+ -+/* -+ * releases all non-used preallocated blocks for given inode -+ */ -+void ext3_mb_discard_inode_preallocations(struct inode *inode) -+{ -+ struct ext3_inode_info *ei = EXT3_I(inode); -+ struct super_block *sb = inode->i_sb; -+ struct buffer_head *bitmap_bh = NULL; -+ struct ext3_prealloc_space *pa, *tmp; -+ unsigned long group = 0; -+ struct list_head list; -+ struct ext3_buddy e3b; -+ int err; -+ -+ if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) { -+ /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ -+ return; -+ } -+ -+ mb_debug("discard preallocation for inode %lu\n", inode->i_ino); -+ -+ INIT_LIST_HEAD(&list); -+ -+repeat: -+ /* first, collect all pa's in the inode */ -+ spin_lock(&ei->i_prealloc_lock); -+ while (!list_empty(&ei->i_prealloc_list)) { -+ pa = list_entry(ei->i_prealloc_list.next, -+ struct ext3_prealloc_space, pa_inode_list); -+ BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); -+ spin_lock(&pa->pa_lock); -+ if (atomic_read(&pa->pa_count)) { -+ /* this shouldn't happen often - nobody should -+ * use preallocation while we're discarding it */ -+ spin_unlock(&pa->pa_lock); -+ spin_unlock(&ei->i_prealloc_lock); -+ printk("uh-oh! used pa while discarding\n"); -+ dump_stack(); -+ current->state = TASK_UNINTERRUPTIBLE; -+ schedule_timeout(HZ); -+ goto repeat; -+ -+ } -+ if (pa->pa_deleted == 0) { -+ pa->pa_deleted = 1; -+ spin_unlock(&pa->pa_lock); -+ list_del_rcu(&pa->pa_inode_list); -+ list_add(&pa->u.pa_tmp_list, &list); -+ continue; -+ } -+ -+ /* someone is deleting pa right now */ -+ spin_unlock(&pa->pa_lock); -+ spin_unlock(&ei->i_prealloc_lock); -+ -+ /* we have to wait here because pa_deleted -+ * doesn't mean pa is already unlinked from -+ * the list. as we might be called from -+ * ->clear_inode() the inode will get freed -+ * and concurrent thread which is unlinking -+ * pa from inode's list may access already -+ * freed memory, bad-bad-bad */ -+ -+ /* XXX: if this happens too often, we can -+ * add a flag to force wait only in case -+ * of ->clear_inode(), but not in case of -+ * regular truncate */ -+ printk("uh-oh! 
some one just deleted it\n"); -+ dump_stack(); -+ current->state = TASK_UNINTERRUPTIBLE; -+ schedule_timeout(HZ); -+ goto repeat; -+ } -+ spin_unlock(&ei->i_prealloc_lock); -+ -+ list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { -+ BUG_ON(pa->pa_linear != 0); -+ ext3_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); -+ -+ err = ext3_mb_load_buddy(sb, group, &e3b); -+ BUG_ON(err != 0); /* error handling here */ -+ -+ bitmap_bh = read_block_bitmap(sb, group); -+ if (bitmap_bh == NULL) { -+ /* error handling here */ -+ ext3_mb_release_desc(&e3b); -+ BUG_ON(bitmap_bh == NULL); -+ } -+ -+ ext3_lock_group(sb, group); -+ list_del_rcu(&pa->pa_group_list); -+ ext3_mb_release_inode_pa(&e3b, bitmap_bh, pa); -+ ext3_unlock_group(sb, group); -+ -+ ext3_mb_release_desc(&e3b); -+ brelse(bitmap_bh); -+ -+ list_del(&pa->u.pa_tmp_list); -+ mb_call_rcu(pa); -+ } -+} -+ -+/* -+ * finds all preallocated spaces and return blocks being freed to them -+ * if preallocated space becomes full (no block is used from the space) -+ * then the function frees space in buddy -+ * XXX: at the moment, truncate (which is the only way to free blocks) -+ * discards all preallocations -+ */ -+void ext3_mb_return_to_preallocation(struct inode *inode, struct ext3_buddy *e3b, -+ sector_t block, int count) -+{ -+ BUG_ON(!list_empty(&EXT3_I(inode)->i_prealloc_list)); -+} -+ -+void ext3_mb_show_ac(struct ext3_allocation_context *ac) -+{ -+#if 0 -+ struct super_block *sb = ac->ac_sb; -+ int i; -+ -+ printk(KERN_ERR "EXT3-fs: can't allocate: status %d flags %d\n", -+ ac->ac_status, ac->ac_flags); -+ printk(KERN_ERR "EXT3-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " -+ "best %lu/%lu/%lu@%lu cr %d\n", -+ ac->ac_o_ex.fe_group, ac->ac_o_ex.fe_start, -+ ac->ac_o_ex.fe_len, ac->ac_o_ex.fe_logical, -+ ac->ac_g_ex.fe_group, ac->ac_g_ex.fe_start, -+ ac->ac_g_ex.fe_len, ac->ac_g_ex.fe_logical, -+ ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, -+ ac->ac_b_ex.fe_len, ac->ac_b_ex.fe_logical, -+ ac->ac_criteria); -+ printk(KERN_ERR "EXT3-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, -+ ac->ac_found); -+ printk("EXT3-fs: groups: "); -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { -+ struct ext3_group_info *grp = EXT3_GROUP_INFO(sb, i); -+ struct ext3_prealloc_space *pa; -+ unsigned long start; -+ struct list_head *cur; -+ list_for_each_rcu(cur, &grp->bb_prealloc_list) { -+ pa = list_entry(cur, struct ext3_prealloc_space, -+ pa_group_list); -+ spin_lock(&pa->pa_lock); -+ ext3_get_group_no_and_offset(sb, pa->pa_pstart, NULL, &start); -+ spin_unlock(&pa->pa_lock); -+ printk("PA:%u:%lu:%u ", i, start, pa->pa_len); -+ } -+ -+ if (grp->bb_free == 0) -+ continue; -+ printk("%d: %d/%d ", i, grp->bb_free, grp->bb_fragments); -+ } -+ printk("\n"); -+ //dump_stack(); -+#endif -+} -+ -+void ext3_mb_group_or_file(struct ext3_allocation_context *ac) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); -+ int bsbits = ac->ac_sb->s_blocksize_bits; -+ loff_t size, isize; -+ -+ if (!(ac->ac_flags & EXT3_MB_HINT_DATA)) -+ return; -+ -+ size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; -+ isize = i_size_read(ac->ac_inode) >> bsbits; -+ if (size < isize) -+ size = isize; -+ -+ /* don't use group allocation for large files */ -+ if (size >= sbi->s_mb_stream_request) -+ return; -+ -+ if (unlikely(ac->ac_flags & EXT3_MB_HINT_GOAL_ONLY)) -+ return; -+ -+ BUG_ON(ac->ac_lg != NULL); -+ ac->ac_lg = &sbi->s_locality_groups[smp_processor_id()]; -+ -+ /* we're going to use group allocation */ -+ ac->ac_flags |= EXT3_MB_HINT_GROUP_ALLOC; -+ -+ /* 
-+
-+int ext3_mb_initialize_context(struct ext3_allocation_context *ac,
-+			       struct ext3_allocation_request *ar)
-+{
-+	struct super_block *sb = ar->inode->i_sb;
-+	struct ext3_sb_info *sbi = EXT3_SB(sb);
-+	struct ext3_super_block *es = sbi->s_es;
-+	unsigned long group, len, goal;
-+	unsigned long block;
-+
-+	/* we can't allocate > group size */
-+	len = ar->len;
-+	if (len >= EXT3_BLOCKS_PER_GROUP(sb) - 10)
-+		len = EXT3_BLOCKS_PER_GROUP(sb) - 10;
-+
-+	/* start searching from the goal */
-+	goal = ar->goal;
-+	if (goal < le32_to_cpu(es->s_first_data_block) ||
-+	    goal >= le32_to_cpu(es->s_blocks_count))
-+		goal = le32_to_cpu(es->s_first_data_block);
-+	ext3_get_group_no_and_offset(sb, goal, &group, &block);
-+
-+	/* set up allocation goals */
-+	ac->ac_b_ex.fe_logical = ar->logical;
-+	ac->ac_b_ex.fe_group = 0;
-+	ac->ac_b_ex.fe_start = 0;
-+	ac->ac_b_ex.fe_len = 0;
-+	ac->ac_status = AC_STATUS_CONTINUE;
-+	ac->ac_groups_scanned = 0;
-+	ac->ac_ex_scanned = 0;
-+	ac->ac_found = 0;
-+	ac->ac_sb = sb;
-+	ac->ac_inode = ar->inode;
-+	ac->ac_o_ex.fe_logical = ar->logical;
-+	ac->ac_o_ex.fe_group = group;
-+	ac->ac_o_ex.fe_start = block;
-+	ac->ac_o_ex.fe_len = len;
-+	ac->ac_g_ex.fe_logical = ar->logical;
-+	ac->ac_g_ex.fe_group = group;
-+	ac->ac_g_ex.fe_start = block;
-+	ac->ac_g_ex.fe_len = len;
-+	ac->ac_f_ex.fe_len = 0;
-+	ac->ac_flags = ar->flags;
-+	ac->ac_2order = 0;
-+	ac->ac_criteria = 0;
-+	ac->ac_pa = NULL;
-+	ac->ac_bitmap_page = NULL;
-+	ac->ac_buddy_page = NULL;
-+	ac->ac_lg = NULL;
-+
-+	/* we have to define the context: will we work with a file or
-+	 * a locality group? this is a policy, actually */
-+	ext3_mb_group_or_file(ac);
-+
-+	mb_debug("init ac: %u blocks @ %llu, goal %llu, flags %x, 2^%d, "
-+		 "left: %llu/%llu, right %llu/%llu to %swritable\n",
-+		 (unsigned) ar->len, (unsigned) ar->logical,
-+		 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
-+		 (unsigned) ar->lleft, (unsigned) ar->pleft,
-+		 (unsigned) ar->lright, (unsigned) ar->pright,
-+		 atomic_read(&ar->inode->i_writecount) ? "" : "non-");
-+	return 0;
-+}
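
The set-up above does three cheap things before any searching: clamp the request to slightly less than a group, fall back to the first data block on a bogus goal, and split the goal into a (group, offset) pair. For ext2/3's layout that split is just division and remainder relative to s_first_data_block. A small sketch with hypothetical parameter names in place of the superblock fields:

	#include <stdio.h>

	static void init_goal(unsigned long goal, unsigned long len,
			      unsigned long first_data_block,
			      unsigned long blocks_count,
			      unsigned long blocks_per_group,
			      unsigned long *group, unsigned long *offset,
			      unsigned long *out_len)
	{
		/* we can't allocate (almost) more than one group */
		if (len >= blocks_per_group - 10)
			len = blocks_per_group - 10;

		/* an out-of-range goal falls back to the first data block */
		if (goal < first_data_block || goal >= blocks_count)
			goal = first_data_block;

		/* what ext3_get_group_no_and_offset() computes */
		*group = (goal - first_data_block) / blocks_per_group;
		*offset = (goal - first_data_block) % blocks_per_group;
		*out_len = len;
	}

	int main(void)
	{
		unsigned long grp, off, len;

		/* 32768 blocks per group, goal block 100000 */
		init_goal(100000, 70000, 1, 1000000, 32768, &grp, &off, &len);
		printf("group %lu offset %lu len %lu\n", grp, off, len);
		/* prints: group 3 offset 1695 len 32758 */
		return 0;
	}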
"" : "non-"); -+ return 0; -+ -+} -+ -+/* -+ * release all resource we used in allocation -+ */ -+int ext3_mb_release_context(struct ext3_allocation_context *ac) -+{ -+ if (ac->ac_pa) { -+ if (ac->ac_pa->pa_linear) { -+ /* see comment in ext3_mb_use_group_pa() */ -+ spin_lock(&ac->ac_pa->pa_lock); -+ ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len; -+ ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len; -+ ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len; -+ ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len; -+ spin_unlock(&ac->ac_pa->pa_lock); -+ } -+ ext3_mb_put_pa(ac, ac->ac_sb, ac->ac_pa); -+ } -+ if (ac->ac_bitmap_page) -+ page_cache_release(ac->ac_bitmap_page); -+ if (ac->ac_buddy_page) -+ page_cache_release(ac->ac_buddy_page); -+ if (ac->ac_flags & EXT3_MB_HINT_GROUP_ALLOC) -+ up(&ac->ac_lg->lg_sem); -+ ext3_mb_collect_stats(ac); -+ return 0; -+} -+ -+int ext3_mb_discard_preallocations(struct super_block *sb, int needed) -+{ -+ int i, ret, freed = 0; -+ -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count && needed > 0; i++) { -+ ret = ext3_mb_discard_group_preallocations(sb, i, needed); -+ freed += ret; -+ needed -= ret; -+ } -+ -+ return freed; -+} -+ -+/* -+ * Main entry point into mballoc to allocate blocks -+ * it tries to use preallocation first, then falls back -+ * to usual allocation -+ */ -+unsigned long ext3_mb_new_blocks(handle_t *handle, -+ struct ext3_allocation_request *ar, int *errp) -+{ -+ struct ext3_allocation_context ac; -+ struct ext3_sb_info *sbi; -+ struct super_block *sb; -+ unsigned long block = 0; -+ int freed, inquota; -+ -+ sb = ar->inode->i_sb; -+ sbi = EXT3_SB(sb); -+ -+ if (!test_opt(sb, MBALLOC)) { -+ static int ext3_mballoc_warning = 0; -+ if (ext3_mballoc_warning++ == 0) -+ printk(KERN_ERR "EXT3-fs: multiblock request with " -+ "mballoc disabled!\n"); -+ ar->len = 1; -+ block = ext3_new_block_old(handle, ar->inode, ar->goal, errp); -+ return block; -+ } -+ -+ while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { -+ ar->flags |= EXT3_MB_HINT_NOPREALLOC; -+ ar->len--; -+ } -+ if (ar->len == 0) { -+ *errp = -EDQUOT; -+ return 0; -+ } -+ inquota = ar->len; -+ -+ ext3_mb_poll_new_transaction(sb, handle); -+ -+ if ((*errp = ext3_mb_initialize_context(&ac, ar))) { -+ ar->len = 0; -+ goto out; -+ } -+ -+ ac.ac_op = EXT3_MB_HISTORY_PREALLOC; -+ if (!ext3_mb_use_preallocated(&ac)) { -+ -+ ac.ac_op = EXT3_MB_HISTORY_ALLOC; -+ ext3_mb_normalize_request(&ac, ar); -+ -+repeat: -+ /* allocate space in core */ -+ ext3_mb_regular_allocator(&ac); -+ -+ /* as we've just preallocated more space than -+ * user requested orinally, we store allocated -+ * space in a special descriptor */ -+ if (ac.ac_status == AC_STATUS_FOUND && -+ ac.ac_o_ex.fe_len < ac.ac_b_ex.fe_len) -+ ext3_mb_new_preallocation(&ac); -+ } -+ -+ if (likely(ac.ac_status == AC_STATUS_FOUND)) { -+ ext3_mb_mark_diskspace_used(&ac, handle); -+ *errp = 0; -+ block = ext3_grp_offs_to_block(sb, &ac.ac_b_ex); -+ ar->len = ac.ac_b_ex.fe_len; -+ } else { -+ freed = ext3_mb_discard_preallocations(sb, ac.ac_o_ex.fe_len); -+ if (freed) -+ goto repeat; -+ *errp = -ENOSPC; -+ ac.ac_b_ex.fe_len = 0; -+ ar->len = 0; -+ ext3_mb_show_ac(&ac); -+ } -+ -+ ext3_mb_release_context(&ac); -+ -+out: -+ if (ar->len < inquota) -+ DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); -+ -+ return block; -+} -+EXPORT_SYMBOL(ext3_mb_new_blocks); -+ -+int ext3_new_block(handle_t *handle, struct inode *inode, -+ unsigned long goal, int *errp) -+{ -+ struct ext3_allocation_request ar; -+ unsigned long ret; -+ -+ if (!test_opt(inode->i_sb, MBALLOC)) { -+ ret = 
-+
-+void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle)
-+{
-+	struct ext3_sb_info *sbi = EXT3_SB(sb);
-+
-+	if (sbi->s_last_transaction == handle->h_transaction->t_tid)
-+		return;
-+
-+	/* new transaction! time to close the last one and free blocks
-+	 * from the committed transaction. we know that only one
-+	 * transaction can be active, so the previous transaction may
-+	 * still be being logged, and we know that the transaction
-+	 * before the previous one is already logged. this means we may
-+	 * now free blocks freed in all transactions before the
-+	 * previous one. hope I'm clear enough ... */
-+
-+	spin_lock(&sbi->s_md_lock);
-+	if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
-+		mb_debug("new transaction %lu, old %lu\n",
-+			 (unsigned long) handle->h_transaction->t_tid,
-+			 (unsigned long) sbi->s_last_transaction);
-+		list_splice_init(&sbi->s_closed_transaction,
-+				 &sbi->s_committed_transaction);
-+		list_splice_init(&sbi->s_active_transaction,
-+				 &sbi->s_closed_transaction);
-+		sbi->s_last_transaction = handle->h_transaction->t_tid;
-+	}
-+	spin_unlock(&sbi->s_md_lock);
-+
-+	ext3_mb_free_committed_blocks(sb);
-+}
-+
-+int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b,
-+			  int group, int block, int count)
-+{
-+	struct ext3_group_info *db = e3b->bd_info;
-+	struct super_block *sb = e3b->bd_sb;
-+	struct ext3_sb_info *sbi = EXT3_SB(sb);
-+	struct ext3_free_metadata *md;
-+	int i;
-+
-+	BUG_ON(e3b->bd_bitmap_page == NULL);
-+	BUG_ON(e3b->bd_buddy_page == NULL);
-+
-+	ext3_lock_group(sb, group);
-+	for (i = 0; i < count; i++) {
-+		md = db->bb_md_cur;
-+		if (md && db->bb_tid != handle->h_transaction->t_tid) {
-+			db->bb_md_cur = NULL;
-+			md = NULL;
-+		}
-+
-+		if (md == NULL) {
-+			ext3_unlock_group(sb, group);
-+			md = kmalloc(sizeof(*md), GFP_KERNEL);
-+			if (md == NULL)
-+				return -ENOMEM;
-+			md->num = 0;
-+			md->group = group;
-+
-+			ext3_lock_group(sb, group);
-+			if (db->bb_md_cur == NULL) {
-+				spin_lock(&sbi->s_md_lock);
-+				list_add(&md->list, &sbi->s_active_transaction);
-+				spin_unlock(&sbi->s_md_lock);
-+				/* protect buddy cache from being freed,
-+				 * otherwise we'll refresh it from the
-+				 * on-disk bitmap and lose not-yet-available
-+				 * blocks */
-+				page_cache_get(e3b->bd_buddy_page);
-+				page_cache_get(e3b->bd_bitmap_page);
-+				db->bb_md_cur = md;
-+				db->bb_tid = handle->h_transaction->t_tid;
-+				mb_debug("new md 0x%p for group %u\n",
-+					 md, md->group);
-+			} else {
-+				kfree(md);
-+				md = db->bb_md_cur;
-+			}
-+		}
-+
-+		BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS);
-+		md->blocks[md->num] = block + i;
-+		md->num++;
-+		if (md->num == EXT3_BB_MAX_BLOCKS) {
-+			/* no more space, put the full container on the sb's list */
-+			db->bb_md_cur = NULL;
-+		}
-+	}
-+	ext3_unlock_group(sb, group);
-+	return 0;
-+}
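
The two-stage list rotation in ext3_mb_poll_new_transaction() above is why freed metadata blocks only become reusable two transactions after they were freed: on each new tid, the closed list (previous transaction, possibly still logging) is promoted to committed (now known safe), and the active list becomes closed. A toy userspace model of just that rotation, with a hypothetical struct md and no locking:

	#include <stdio.h>

	struct md {
		int block;
		struct md *next;
	};

	static struct md *active, *closed, *committed;
	static unsigned long last_tid;

	/* move every entry of `from` onto `to` (order doesn't matter here) */
	static void splice(struct md **from, struct md **to)
	{
		while (*from) {
			struct md *m = *from;
			*from = m->next;
			m->next = *to;
			*to = m;
		}
	}

	static void poll_new_transaction(unsigned long tid)
	{
		if (tid == last_tid)
			return;
		splice(&closed, &committed);	/* now safe to reuse */
		splice(&active, &closed);	/* previous tx, may still be logging */
		last_tid = tid;
		/* the real code now frees all blocks on the committed list */
	}

	int main(void)
	{
		struct md m1 = { 100, NULL };

		active = &m1;
		poll_new_transaction(1);	/* m1: active -> closed */
		poll_new_transaction(2);	/* m1: closed -> committed */
		printf("committed head: block %d\n", committed->block);
		return 0;
	}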
-+
-+/*
-+ * Main entry point into mballoc to free blocks
-+ */
-+void ext3_mb_free_blocks(handle_t *handle, struct inode *inode,
-+			 unsigned long block, unsigned long count,
-+			 int metadata, int *freed)
-+{
-+	struct buffer_head *bitmap_bh = NULL;
-+	struct super_block *sb = inode->i_sb;
-+	struct ext3_allocation_context ac;
-+	struct ext3_group_desc *gdp;
-+	struct ext3_super_block *es;
-+	unsigned long bit, overflow;
-+	struct buffer_head *gd_bh;
-+	unsigned long block_group;
-+	struct ext3_sb_info *sbi;
-+	struct ext3_buddy e3b;
-+	int err = 0, ret;
-+
-+	*freed = 0;
-+
-+	ext3_mb_poll_new_transaction(sb, handle);
-+
-+	sbi = EXT3_SB(sb);
-+	es = EXT3_SB(sb)->s_es;
-+	if (block < le32_to_cpu(es->s_first_data_block) ||
-+	    block + count < block ||
-+	    block + count > le32_to_cpu(es->s_blocks_count)) {
-+		ext3_error (sb, __FUNCTION__,
-+			    "Freeing blocks not in datazone - "
-+			    "block = %lu, count = %lu", block, count);
-+		goto error_return;
-+	}
-+
-+	ext3_debug("freeing block %lu\n", block);
-+
-+	ac.ac_op = EXT3_MB_HISTORY_FREE;
-+	ac.ac_inode = inode;
-+	ac.ac_sb = sb;
-+
-+do_more:
-+	overflow = 0;
-+	ext3_get_group_no_and_offset(sb, block, &block_group, &bit);
-+
-+	/*
-+	 * Check to see if we are freeing blocks across a group
-+	 * boundary.
-+	 */
-+	if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
-+		overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
-+		count -= overflow;
-+	}
-+	brelse(bitmap_bh);
-+	bitmap_bh = read_block_bitmap(sb, block_group);
-+	if (!bitmap_bh)
-+		goto error_return;
-+	gdp = ext3_get_group_desc (sb, block_group, &gd_bh);
-+	if (!gdp)
-+		goto error_return;
-+
-+	if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) ||
-+	    in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) ||
-+	    in_range (block, le32_to_cpu(gdp->bg_inode_table),
-+		      EXT3_SB(sb)->s_itb_per_group) ||
-+	    in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table),
-+		      EXT3_SB(sb)->s_itb_per_group))
-+		ext3_error(sb, __FUNCTION__,
-+			   "Freeing blocks in system zone - "
-+			   "Block = %lu, count = %lu", block, count);
-+
-+	BUFFER_TRACE(bitmap_bh, "getting write access");
-+	err = ext3_journal_get_write_access(handle, bitmap_bh);
-+	if (err)
-+		goto error_return;
-+
-+	/*
-+	 * We are about to modify some metadata. Call the journal APIs
-+	 * to unshare ->b_data if a currently-committing transaction is
-+	 * using it
-+	 */
-+	BUFFER_TRACE(gd_bh, "get_write_access");
-+	err = ext3_journal_get_write_access(handle, gd_bh);
-+	if (err)
-+		goto error_return;
-+
-+	err = ext3_mb_load_buddy(sb, block_group, &e3b);
-+	if (err)
-+		goto error_return;
-+
-+#ifdef AGGRESSIVE_CHECK
-+	{
-+		int i;
-+		for (i = 0; i < count; i++)
-+			BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
-+	}
-+#endif
-+	mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, bit,
-+		      count);
-+
-+	/* We dirtied the bitmap block */
-+	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-+	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
-+
-+	ac.ac_b_ex.fe_group = block_group;
-+	ac.ac_b_ex.fe_start = bit;
-+	ac.ac_b_ex.fe_len = count;
-+	ext3_mb_store_history(&ac);
-+
-+	if (metadata) {
-+		/* blocks being freed are metadata. these blocks shouldn't
-+		 * be used until this transaction is committed */
-+		ext3_mb_free_metadata(handle, &e3b, block_group, bit, count);
-+	} else {
-+		ext3_lock_group(sb, block_group);
-+		err = mb_free_blocks(inode, &e3b, bit, count);
-+		ext3_mb_return_to_preallocation(inode, &e3b, block, count);
-+		ext3_unlock_group(sb, block_group);
-+		BUG_ON(err != 0);
-+	}
-+
-+	spin_lock(sb_bgl_lock(sbi, block_group));
-+	gdp->bg_free_blocks_count =
-+		cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
-+	spin_unlock(sb_bgl_lock(sbi, block_group));
-+	percpu_counter_mod(&sbi->s_freeblocks_counter, count);
-+
-+	ext3_mb_release_desc(&e3b);
-+
-+	*freed += count;
-+
-+	/* And the group descriptor block */
-+	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
-+	ret = ext3_journal_dirty_metadata(handle, gd_bh);
-+	if (!err) err = ret;
-+
-+	if (overflow && !err) {
-+		block += count;
-+		count = overflow;
-+		goto do_more;
-+	}
-+	sb->s_dirt = 1;
-+error_return:
-+	brelse(bitmap_bh);
-+	ext3_std_error(sb, err);
-+	return;
-+}
-- 
1.8.3.1