--- /dev/null
+This contains the following kernel patches:
+
+a078dff87013 ("ext4: fixup possible uninitialized variable access in
+ ext4_mb_choose_next_group_cr1()")
+80fa46d6b9e7 ("ext4: limit the number of retries after discarding
+ preallocations blocks")
+820897258ad3 ("ext4: Refactor code related to freeing PAs")
+cf5e2ca6c990 ("ext4: mballoc: refactor
+ ext4_mb_discard_preallocations()")
+83e80a6e3543 ("ext4: use buckets for cr 1 block scan instead of
+ rbtree")
+a9f2a2931d0e ("ext4: use locality group preallocation for small
+ closed files")
+1940265ede66 ("ext4: avoid unnecessary spreading of allocations among
+ groups")
+4fca50d440cc ("ext4: make mballoc try target group first even with
+ mb_optimize_scan")
+3fa5d23e68a3 ("ext4: reflect mb_optimize_scan value in options file")
+077d0c2c78df ("ext4: make mb_optimize_scan performance mount option
+ work with extents")
+196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning")
+21175ca434c5 ("ext4: make prefetch_block_bitmaps default")
+3d392b2676bf ("ext4: add prefetch_block_bitmaps mount option")
+cfd732377221 ("ext4: add prefetching for block allocation bitmaps")
+4b68f6df1059 ("ext4: add MB_NUM_ORDERS macro")
+dddcd2f9ebde ("ext4: optimize the implementation of ext4_mb_good_group()")
+a6c75eaf1103 ("ext4: add mballoc stats proc file")
+67d251860461 ("ext4: drop s_mb_bal_lock and convert protected fields
+ to atomic")
+
+Index: linux-stage/fs/ext4/ext4.h
+===================================================================
+--- linux-stage.orig/fs/ext4/ext4.h
++++ linux-stage/fs/ext4/ext4.h
+@@ -151,6 +151,10 @@ enum SHIFT_DIRECTION {
+ #define EXT4_MB_USE_RESERVED 0x2000
+ /* Do strict check for free blocks while retrying block allocation */
+ #define EXT4_MB_STRICT_CHECK 0x4000
++/* Large fragment size list lookup succeeded at least once for cr = 0 */
++#define EXT4_MB_CR0_OPTIMIZED 0x8000
++/* Avg fragment size list lookup succeeded at least once for cr = 1 */
++#define EXT4_MB_CR1_OPTIMIZED 0x00010000
+
+ struct ext4_allocation_request {
+ /* target inode for block we're allocating */
+@@ -1167,6 +1170,7 @@ struct ext4_inode_info {
+ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
+ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
+ #define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */
++#define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000
+ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
+ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
+ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
+@@ -1193,7 +1197,9 @@ struct ext4_inode_info {
+
+ #define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */
+ #define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */
+-
++#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group
++ * scanning in mballoc
++ */
+
+ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
+ ~EXT4_MOUNT_##opt
+@@ -1480,9 +1486,15 @@ struct ext4_sb_info {
+ unsigned int s_mb_free_pending;
+ struct list_head s_freed_data_list; /* List of blocks to be freed
+ after commit completed */
++ struct list_head *s_mb_avg_fragment_size;
++ rwlock_t *s_mb_avg_fragment_size_locks;
++ struct list_head *s_mb_largest_free_orders;
++ rwlock_t *s_mb_largest_free_orders_locks;
+
+ /* tunables */
+ unsigned long s_stripe;
++ unsigned int s_mb_max_linear_groups;
++ unsigned int s_mb_stream_request;
+ unsigned long s_mb_small_req;
+ unsigned long s_mb_large_req;
+ unsigned int s_mb_max_to_scan;
+@@ -1508,15 +1520,18 @@ struct ext4_sb_info {
+ atomic_t s_bal_success; /* we found long enough chunks */
+ atomic_t s_bal_allocated; /* in blocks */
+ atomic_t s_bal_ex_scanned; /* total extents scanned */
++ atomic_t s_bal_groups_scanned; /* number of groups scanned */
+ atomic_t s_bal_goals; /* goal hits */
+ atomic_t s_bal_breaks; /* too long searches */
+ atomic_t s_bal_2orders; /* 2^order hits */
+- /* cX loop didn't find blocks */
+- atomic64_t s_bal_cX_failed[3];
++ atomic_t s_bal_cr0_bad_suggestions;
++ atomic_t s_bal_cr1_bad_suggestions;
++ atomic64_t s_bal_cX_groups_considered[4];
++ atomic64_t s_bal_cX_hits[4];
++ atomic64_t s_bal_cX_failed[4]; /* cX loop didn't find blocks */
+ atomic64_t s_bal_cX_skipped[3];
+- spinlock_t s_bal_lock;
+- unsigned long s_mb_buddies_generated;
+- unsigned long long s_mb_generation_time;
++ atomic_t s_mb_buddies_generated; /* number of buddies generated */
++ atomic64_t s_mb_generation_time;
+ atomic_t s_mb_lost_chunks;
+ atomic_t s_mb_preallocated;
+ atomic_t s_mb_discarded;
+@@ -2393,9 +2408,15 @@ struct ext4_lazy_init {
+ struct mutex li_list_mtx;
+ };
+
++enum ext4_li_mode {
++ EXT4_LI_MODE_PREFETCH_BBITMAP,
++ EXT4_LI_MODE_ITABLE,
++};
++
+ struct ext4_li_request {
+ struct super_block *lr_super;
+- struct ext4_sb_info *lr_sbi;
++ enum ext4_li_mode lr_mode;
++ ext4_group_t lr_first_not_zeroed;
+ ext4_group_t lr_next_group;
+ struct list_head lr_request;
+ unsigned long lr_next_sched;
+@@ -2685,6 +2706,7 @@ extern const struct file_operations ext4
+ extern int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v);
+ extern long ext4_mb_stats;
+ extern long ext4_mb_max_to_scan;
++extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
+ extern int ext4_mb_init(struct super_block *);
+ extern int ext4_mb_release(struct super_block *);
+ extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
+@@ -2693,6 +2715,12 @@ extern int ext4_mb_reserve_blocks(struct
+ extern void ext4_discard_preallocations(struct inode *, unsigned int);
+ extern int __init ext4_init_mballoc(void);
+ extern void ext4_exit_mballoc(void);
++extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
++ ext4_group_t group,
++ unsigned int nr, int *cnt);
++extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
++ unsigned int nr);
++
+ extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t block,
+ unsigned long count, int flags);
+@@ -3178,13 +3206,18 @@ struct ext4_group_info {
+ ext4_grpblk_t bb_first_free; /* first free block */
+ ext4_grpblk_t bb_free; /* total free blocks */
+ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
++ int bb_avg_fragment_size_order; /* order of average
++ fragment in BG */
+ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
++ ext4_group_t bb_group; /* Group number */
+ struct list_head bb_prealloc_list;
+ unsigned long bb_prealloc_nr;
+ #ifdef DOUBLE_CHECK
+ void *bb_bitmap;
+ #endif
+ struct rw_semaphore alloc_sem;
++ struct list_head bb_avg_fragment_size_node;
++ struct list_head bb_largest_free_order_node;
+ ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
+ * regions, index is order.
+ * bb_counters[3] = 5 means
+Index: linux-stage/fs/ext4/mballoc.c
+===================================================================
+--- linux-stage.orig/fs/ext4/mballoc.c
++++ linux-stage/fs/ext4/mballoc.c
+@@ -127,11 +127,53 @@
+ * the smallest multiple of the stripe value (sbi->s_stripe) which is
+ * greater than the default mb_group_prealloc.
+ *
++ * If "mb_optimize_scan" mount option is set, we maintain in memory group info
++ * structures in two data structures:
++ *
++ * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders)
++ *
++ * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks)
++ *
++ * This is an array of lists where the index in the array represents the
++ * largest free order in the buddy bitmap of the participating group infos of
++ * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total
++ * number of buddy bitmap orders possible) number of lists. Group-infos are
++ * placed in appropriate lists.
++ *
++ * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size)
++ *
++ * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks)
++ *
++ * This is an array of lists where in the i-th list there are groups with
++ * average fragment size >= 2^i and < 2^(i+1). The average fragment size
++ * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments.
++ * Note that we don't bother with a special list for completely empty groups
++ * so we only have MB_NUM_ORDERS(sb) lists.
++ *
++ * When "mb_optimize_scan" mount option is set, mballoc consults the above data
++ * structures to decide the order in which groups are to be traversed for
++ * fulfilling an allocation request.
++ *
++ * At CR = 0, we look for groups which have the largest_free_order >= the order
++ * of the request. We directly look at the largest free order list in the data
++ * structure (1) above where largest_free_order = order of the request. If that
++ * list is empty, we look at the remaining lists in increasing order of
++ * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time.
++ *
++ * At CR = 1, we only consider groups where average fragment size > request
++ * size. So, we lookup a group which has average fragment size just above or
++ * equal to request size using our average fragment size group lists (data
++ * structure 2) in O(1) time.
++ *
++ * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
++ * linear order which requires O(N) search time for each CR 0 and CR 1 phase.
++ *
+ * The regular allocator (using the buddy cache) supports a few tunables.
+ *
+ * /sys/fs/ext4/<partition>/mb_min_to_scan
+ * /sys/fs/ext4/<partition>/mb_max_to_scan
+ * /sys/fs/ext4/<partition>/mb_order2_req
++ * /sys/fs/ext4/<partition>/mb_max_linear_groups
+ *
+ * The regular allocator uses buddy scan only if the request len is power of
+ * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
+@@ -149,6 +191,16 @@
+ * can be used for allocation. ext4_mb_good_group explains how the groups are
+ * checked.
+ *
++ * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not
++ * get traversed linearly. That may result in subsequent allocations being not
++ * close to each other. And so, the underlying device may get filled up in a
++ * non-linear fashion. While that may not matter on non-rotational devices, for
++ * rotational devices that may result in higher seek times. "mb_max_linear_groups"
++ * tells mballoc how many groups it should search linearly before
++ * consulting the above data structures for more efficient lookups. For
++ * non-rotational devices, this value defaults to 0 and for rotational devices
++ * this is set to MB_DEFAULT_LINEAR_LIMIT.
++ *
+ * Both the prealloc space are getting populated as above. So for the first
+ * request we will hit the buddy cache which will result in this prealloc
+ * space getting filled. The prealloc space is then later used for the
+@@ -299,6 +351,8 @@
+ * - bitlock on a group (group)
+ * - object (inode/locality) (object)
+ * - per-pa lock (pa)
++ * - cr0 lists lock (cr0)
++ * - cr1 tree lock (cr1)
+ *
+ * Paths:
+ * - new pa
+@@ -328,6 +382,9 @@
+ * group
+ * object
+ *
++ * - allocation path (ext4_mb_regular_allocator)
++ * group
++ * cr0/cr1
+ */
+ static struct kmem_cache *ext4_pspace_cachep;
+ static struct kmem_cache *ext4_ac_cachep;
+@@ -351,6 +408,9 @@ static void ext4_mb_generate_from_freeli
+ ext4_group_t group);
+ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
+
++static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
++ ext4_group_t group, int cr);
++
+ /*
+ * The algorithm using this percpu seq counter goes below:
+ * 1. We sample the percpu discard_pa_seq counter before trying for block
+@@ -747,6 +807,221 @@ static void ext4_mb_mark_free_simple(str
+ }
+ }
+
++static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
++{
++ int order;
++
++ /*
++	 * We don't bother with special lists for groups whose free extents
++	 * are only 1 block long, nor for completely empty groups.
++ */
++ order = fls(len) - 2;
++ if (order < 0)
++ return 0;
++ if (order == MB_NUM_ORDERS(sb))
++ order--;
++ return order;
++}
++
++/* Move group to appropriate avg_fragment_size list */
++static void
++mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
++{
++ struct ext4_sb_info *sbi = EXT4_SB(sb);
++ int new_order;
++
++ if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0)
++ return;
++
++ new_order = mb_avg_fragment_size_order(sb,
++ grp->bb_free / grp->bb_fragments);
++ if (new_order == grp->bb_avg_fragment_size_order)
++ return;
++
++ if (grp->bb_avg_fragment_size_order != -1) {
++ write_lock(&sbi->s_mb_avg_fragment_size_locks[
++ grp->bb_avg_fragment_size_order]);
++ list_del(&grp->bb_avg_fragment_size_node);
++ write_unlock(&sbi->s_mb_avg_fragment_size_locks[
++ grp->bb_avg_fragment_size_order]);
++ }
++ grp->bb_avg_fragment_size_order = new_order;
++ write_lock(&sbi->s_mb_avg_fragment_size_locks[
++ grp->bb_avg_fragment_size_order]);
++ list_add_tail(&grp->bb_avg_fragment_size_node,
++ &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]);
++ write_unlock(&sbi->s_mb_avg_fragment_size_locks[
++ grp->bb_avg_fragment_size_order]);
++}
++
++/*
++ * Choose next group by traversing largest_free_order lists. Updates *new_cr if
++ * cr level needs an update.
++ */
++static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
++ int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
++{
++ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
++ struct ext4_group_info *iter, *grp;
++ int i;
++
++ if (ac->ac_status == AC_STATUS_FOUND)
++ return;
++
++ if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED))
++ atomic_inc(&sbi->s_bal_cr0_bad_suggestions);
++
++ grp = NULL;
++ for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
++ if (list_empty(&sbi->s_mb_largest_free_orders[i]))
++ continue;
++ read_lock(&sbi->s_mb_largest_free_orders_locks[i]);
++ if (list_empty(&sbi->s_mb_largest_free_orders[i])) {
++ read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
++ continue;
++ }
++ grp = NULL;
++ list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i],
++ bb_largest_free_order_node) {
++ if (sbi->s_mb_stats)
++ atomic64_inc(&sbi->s_bal_cX_groups_considered[0]);
++ if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) {
++ grp = iter;
++ break;
++ }
++ }
++ read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
++ if (grp)
++ break;
++ }
++
++ if (!grp) {
++ /* Increment cr and search again */
++ *new_cr = 1;
++ } else {
++ *group = grp->bb_group;
++ ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED;
++ }
++}
++
++/*
++ * Choose next group by traversing average fragment size list of suitable
++ * order. Updates *new_cr if cr level needs an update.
++ */
++static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
++ int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
++{
++ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
++ struct ext4_group_info *grp = NULL, *iter;
++ int i;
++
++ if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) {
++ if (sbi->s_mb_stats)
++ atomic_inc(&sbi->s_bal_cr1_bad_suggestions);
++ }
++
++ for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
++ i < MB_NUM_ORDERS(ac->ac_sb); i++) {
++ if (list_empty(&sbi->s_mb_avg_fragment_size[i]))
++ continue;
++ read_lock(&sbi->s_mb_avg_fragment_size_locks[i]);
++ if (list_empty(&sbi->s_mb_avg_fragment_size[i])) {
++ read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
++ continue;
++ }
++ list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i],
++ bb_avg_fragment_size_node) {
++ if (sbi->s_mb_stats)
++ atomic64_inc(&sbi->s_bal_cX_groups_considered[1]);
++ if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) {
++ grp = iter;
++ break;
++ }
++ }
++ read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
++ if (grp)
++ break;
++ }
++
++ if (grp) {
++ *group = grp->bb_group;
++ ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED;
++ } else {
++ *new_cr = 2;
++ }
++}
++
++static inline int should_optimize_scan(struct ext4_allocation_context *ac)
++{
++ if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN)))
++ return 0;
++ if (ac->ac_criteria >= 2)
++ return 0;
++ if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
++ return 0;
++ return 1;
++}
++
++/*
++ * Return next linear group for allocation. If linear traversal should not be
++ * performed, this function just returns the same group
++ */
++static int
++next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups)
++{
++ if (!should_optimize_scan(ac))
++ goto inc_and_return;
++
++ if (ac->ac_groups_linear_remaining) {
++ ac->ac_groups_linear_remaining--;
++ goto inc_and_return;
++ }
++
++ return group;
++inc_and_return:
++ /*
++ * Artificially restricted ngroups for non-extent
++ * files makes group > ngroups possible on first loop.
++ */
++ return group + 1 >= ngroups ? 0 : group + 1;
++}
++
++/*
++ * ext4_mb_choose_next_group: choose next group for allocation.
++ *
++ * @ac Allocation Context
++ * @new_cr	This is an output parameter. If there is no good group
++ * available at current CR level, this field is updated to indicate
++ * the new cr level that should be used.
++ * @group This is an input / output parameter. As an input it indicates the
++ * next group that the allocator intends to use for allocation. As
++ * output, this field indicates the next group that should be used as
++ * determined by the optimization functions.
++ * @ngroups Total number of groups
++ */
++static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
++ int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
++{
++ *new_cr = ac->ac_criteria;
++
++ if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) {
++ *group = next_linear_group(ac, *group, ngroups);
++ return;
++ }
++
++ if (*new_cr == 0) {
++ ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups);
++ } else if (*new_cr == 1) {
++ ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups);
++ } else {
++ /*
++ * TODO: For CR=2, we can arrange groups in an rb tree sorted by
++ * bb_free. But until that happens, we should never come here.
++ */
++ WARN_ON(1);
++ }
++}
++
+ /*
+ * Cache the order of the largest free extent we have available in this block
+ * group.
+@@ -754,22 +1029,39 @@ static void ext4_mb_mark_free_simple(str
+ static void
+ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
+ {
++ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int i;
+- int bits;
+
+- grp->bb_largest_free_order = -1; /* uninit */
+-
+- bits = sb->s_blocksize_bits + 1;
+- for (i = bits; i >= 0; i--) {
+- if (grp->bb_counters[i] > 0) {
+- grp->bb_largest_free_order = i;
++ for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--)
++ if (grp->bb_counters[i] > 0)
+ break;
+- }
++ /* No need to move between order lists? */
++ if (!test_opt2(sb, MB_OPTIMIZE_SCAN) ||
++ i == grp->bb_largest_free_order) {
++ grp->bb_largest_free_order = i;
++ return;
++ }
++
++ if (grp->bb_largest_free_order >= 0) {
++ write_lock(&sbi->s_mb_largest_free_orders_locks[
++ grp->bb_largest_free_order]);
++ list_del_init(&grp->bb_largest_free_order_node);
++ write_unlock(&sbi->s_mb_largest_free_orders_locks[
++ grp->bb_largest_free_order]);
++ }
++ grp->bb_largest_free_order = i;
++ if (grp->bb_largest_free_order >= 0 && grp->bb_free) {
++ write_lock(&sbi->s_mb_largest_free_orders_locks[
++ grp->bb_largest_free_order]);
++ list_add_tail(&grp->bb_largest_free_order_node,
++ &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]);
++ write_unlock(&sbi->s_mb_largest_free_orders_locks[
++ grp->bb_largest_free_order]);
+ }
+ }
+
+ static noinline_for_stack
+-int ext4_mb_generate_buddy(struct super_block *sb,
++void ext4_mb_generate_buddy(struct super_block *sb,
+ void *buddy, void *bitmap, ext4_group_t group)
+ {
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+@@ -817,19 +1109,15 @@ int ext4_mb_generate_buddy(struct super_
+ grp->bb_free = free;
+ ext4_mark_group_bitmap_corrupted(sb, group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
+- return -EIO;
+ }
+ mb_set_largest_free_order(sb, grp);
++ mb_update_avg_fragment_size(sb, grp);
+
+ clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+
+ period = get_cycles() - period;
+- spin_lock(&sbi->s_bal_lock);
+- sbi->s_mb_buddies_generated++;
+- sbi->s_mb_generation_time += period;
+- spin_unlock(&sbi->s_bal_lock);
+-
+- return 0;
++ atomic_inc(&sbi->s_mb_buddies_generated);
++ atomic64_add(period, &sbi->s_mb_generation_time);
+ }
+
+ static void mb_regenerate_buddy(struct ext4_buddy *e4b)
+@@ -987,14 +1275,14 @@ static int ext4_mb_init_cache(struct pag
+ grinfo->bb_fragments = 0;
+ memset(grinfo->bb_counters, 0,
+ sizeof(*grinfo->bb_counters) *
+- (sb->s_blocksize_bits+2));
++ (MB_NUM_ORDERS(sb)));
+ /*
+ * incore got set to the group block bitmap below
+ */
+ ext4_lock_group(sb, group);
+ /* init the buddy */
+ memset(data, 0xff, blocksize);
+- err = ext4_mb_generate_buddy(sb, data, incore, group);
++ ext4_mb_generate_buddy(sb, data, incore, group);
+ ext4_unlock_group(sb, group);
+ incore = NULL;
+ } else {
+@@ -1558,6 +1846,7 @@ static void mb_free_blocks(struct inode
+
+ done:
+ mb_set_largest_free_order(sb, e4b->bd_info);
++ mb_update_avg_fragment_size(sb, e4b->bd_info);
+ mb_check_buddy(e4b);
+ }
+
+@@ -1695,6 +1984,7 @@ static int mb_mark_used(struct ext4_budd
+ }
+ mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
+
++ mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info);
+ ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
+ mb_check_buddy(e4b);
+
+@@ -1989,7 +2279,7 @@ void ext4_mb_simple_scan_group(struct ex
+ int max;
+
+ BUG_ON(ac->ac_2order <= 0);
+- for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
++ for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) {
+ if (grp->bb_counters[i] == 0)
+ continue;
+
+@@ -2135,13 +2425,11 @@ static bool ext4_mb_good_group(struct ex
+
+ BUG_ON(cr < 0 || cr >= 4);
+
+- free = grp->bb_free;
+- if (free == 0)
+- return false;
+- if (cr <= 2 && free < ac->ac_g_ex.fe_len)
++ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+ return false;
+
+- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
++ free = grp->bb_free;
++ if (free == 0)
+ return false;
+
+ fragments = grp->bb_fragments;
+@@ -2158,8 +2446,10 @@ static bool ext4_mb_good_group(struct ex
+ ((group % flex_size) == 0))
+ return false;
+
+- if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
+- (free / fragments) >= ac->ac_g_ex.fe_len)
++ if (free < ac->ac_g_ex.fe_len)
++ return false;
++
++ if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb))
+ return true;
+
+ if (grp->bb_largest_free_order < ac->ac_2order)
+@@ -2193,10 +2483,13 @@ static int ext4_mb_good_group_nolock(str
+ {
+ struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
+ struct super_block *sb = ac->ac_sb;
++ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
+ ext4_grpblk_t free;
+ int ret = 0;
+
++ if (sbi->s_mb_stats)
++ atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
+ if (should_lock)
+ ext4_lock_group(sb, group);
+ free = grp->bb_free;
+@@ -2246,97 +2539,90 @@ static u64 available_blocks_count(struct
+ }
+
+ /*
+- * each allocation context (i.e. a thread doing allocation) has own
+- * sliding prefetch window of @s_mb_prefetch size which starts at the
+- * very first goal and moves ahead of scaning.
+- * a side effect is that subsequent allocations will likely find
+- * the bitmaps in cache or at least in-flight.
++ * Start prefetching @nr block bitmaps starting at @group.
++ * Return the next group which needs to be prefetched.
+ */
+-static void
+-ext4_mb_prefetch(struct ext4_allocation_context *ac,
+- ext4_group_t start)
++ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
++ unsigned int nr, int *cnt)
+ {
+- struct super_block *sb = ac->ac_sb;
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+- struct ext4_sb_info *sbi = EXT4_SB(sb);
+- struct ext4_group_info *grp;
+- ext4_group_t group = start;
+ struct buffer_head *bh;
+- int nr;
+-
+- /* limit prefetching at cr=0, otherwise mballoc can
+- * spend a lot of time loading imperfect groups */
+- if (ac->ac_criteria < 2 && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit)
+- return;
+-
+- /* batch prefetching to get few READs in flight */
+- nr = ac->ac_prefetch - group;
+- if (ac->ac_prefetch < group)
+- /* wrapped to the first groups */
+- nr += ngroups;
+- if (nr > 0)
+- return;
+- BUG_ON(nr < 0);
++ struct blk_plug plug;
+
+- nr = sbi->s_mb_prefetch;
+- if (ext4_has_feature_flex_bg(sb)) {
+- /* align to flex_bg to get more bitmas with a single IO */
+- nr = (group / sbi->s_mb_prefetch) * sbi->s_mb_prefetch;
+- nr = nr + sbi->s_mb_prefetch - group;
+- }
++ blk_start_plug(&plug);
+ while (nr-- > 0) {
+- grp = ext4_get_group_info(sb, group);
+- /* prevent expensive getblk() on groups w/ IO in progress */
+- if (EXT4_MB_GRP_TEST(grp) || EXT4_MB_GRP_TEST_AND_SET_READ(grp))
+- goto next;
+-
+- /* ignore empty groups - those will be skipped
+- * during the scanning as well */
+- if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
+- bh = ext4_read_block_bitmap_nowait(sb, group, 1);
++ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
++ NULL);
++ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
++
++ /*
++ * Prefetch block groups with free blocks; but don't
++ * bother if it is marked uninitialized on disk, since
++ * it won't require I/O to read. Also only try to
++ * prefetch once, so we avoid getblk() call, which can
++ * be expensive.
++ */
++ if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
++ EXT4_MB_GRP_NEED_INIT(grp) &&
++ ext4_free_group_clusters(sb, gdp) > 0 &&
++ !(ext4_has_group_desc_csum(sb) &&
++ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
++ bh = ext4_read_block_bitmap_nowait(sb, group, true);
+ if (bh && !IS_ERR(bh)) {
+- if (!buffer_uptodate(bh))
+- ac->ac_prefetch_ios++;
++ if (!buffer_uptodate(bh) && cnt)
++ (*cnt)++;
+ brelse(bh);
+ }
+ }
+-next:
+ if (++group >= ngroups)
+ group = 0;
+ }
+- ac->ac_prefetch = group;
++ blk_finish_plug(&plug);
++ return group;
+ }
+
+-static void
+-ext4_mb_prefetch_fini(struct ext4_allocation_context *ac)
++/*
++ * Prefetching reads the block bitmap into the buffer cache; but we
++ * need to make sure that the buddy bitmap in the page cache has been
++ * initialized. Note that ext4_mb_init_group() will block if the I/O
++ * is not yet completed, or indeed if it was never initiated because
++ * ext4_mb_prefetch did not start the I/O.
++ *
++ * TODO: We should actually kick off the buddy bitmap setup in a work
++ * queue when the buffer I/O is completed, so that we don't block
++ * waiting for the block allocation bitmap read to finish when
++ * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
++ */
++void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
++ unsigned int nr)
+ {
+- struct ext4_group_info *grp;
+- ext4_group_t group;
+- int nr, rc;
+-
+- /* initialize last window of prefetched groups */
+- nr = ac->ac_prefetch_ios;
+- if (nr > EXT4_SB(ac->ac_sb)->s_mb_prefetch)
+- nr = EXT4_SB(ac->ac_sb)->s_mb_prefetch;
+- group = ac->ac_prefetch;
+ while (nr-- > 0) {
+- grp = ext4_get_group_info(ac->ac_sb, group);
+- if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
+- rc = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
+- if (rc)
++ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
++ NULL);
++ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
++
++ if (!group)
++ group = ext4_get_groups_count(sb);
++ group--;
++ grp = ext4_get_group_info(sb, group);
++
++ if (EXT4_MB_GRP_NEED_INIT(grp) &&
++ ext4_free_group_clusters(sb, gdp) > 0 &&
++ !(ext4_has_group_desc_csum(sb) &&
++ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
++ if (ext4_mb_init_group(sb, group, GFP_NOFS))
+ break;
+ }
+- if (group-- == 0)
+- group = ext4_get_groups_count(ac->ac_sb) - 1;
+ }
+ }
+
+ static noinline_for_stack int
+ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
+ {
+- ext4_group_t ngroups, group, i;
+- int cr = -1;
++ ext4_group_t prefetch_grp = 0, ngroups, group, i;
++ int cr = -1, new_cr;
+ int err = 0, first_err = 0;
++ unsigned int nr = 0, prefetch_ios = 0;
+ struct ext4_sb_info *sbi;
+ struct super_block *sb;
+ struct ext4_buddy e4b;
+@@ -2373,13 +2659,13 @@ ext4_mb_regular_allocator(struct ext4_al
+ * We also support searching for power-of-two requests only for
+ * requests upto maximum buddy size we have constructed.
+ */
+- if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) {
++ if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
+ /*
+ * This should tell if fe_len is exactly power of 2
+ */
+ if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
+ ac->ac_2order = array_index_nospec(i - 1,
+- sb->s_blocksize_bits + 2);
++ MB_NUM_ORDERS(sb));
+ }
+
+ /* if stream allocation is enabled, use global goal */
+@@ -2420,19 +2706,41 @@ repeat:
+ * from the goal value specified
+ */
+ group = ac->ac_g_ex.fe_group;
+- ac->ac_prefetch = group;
++ ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
++ prefetch_grp = group;
+
+- for (i = 0; i < ngroups; group++, i++) {
++ for (i = 0, new_cr = cr; i < ngroups; i++,
++ ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) {
+ int ret = 0;
++
+ cond_resched();
++ if (new_cr != cr) {
++ cr = new_cr;
++ goto repeat;
++ }
++
+ /*
+- * Artificially restricted ngroups for non-extent
+- * files makes group > ngroups possible on first loop.
++ * Batch reads of the block allocation bitmaps
++ * to get multiple READs in flight; limit
++ * prefetching at cr=0/1, otherwise mballoc can
++ * spend a lot of time loading imperfect groups
+ */
+- if (group >= ngroups)
+- group = 0;
+-
+- ext4_mb_prefetch(ac, group);
++ if ((prefetch_grp == group) &&
++ (cr > 1 ||
++ prefetch_ios < sbi->s_mb_prefetch_limit)) {
++ unsigned int curr_ios = prefetch_ios;
++
++ nr = sbi->s_mb_prefetch;
++ if (ext4_has_feature_flex_bg(sb)) {
++ nr = (group / sbi->s_mb_prefetch) *
++ sbi->s_mb_prefetch;
++ nr = nr + sbi->s_mb_prefetch - group;
++ }
++ prefetch_grp = ext4_mb_prefetch(sb, group,
++ nr, &prefetch_ios);
++ if (prefetch_ios == curr_ios)
++ nr = 0;
++ }
+
+ /* This now checks without needing the buddy page */
+ ret = ext4_mb_good_group_nolock(ac, group, cr);
+@@ -2503,7 +2811,13 @@ repeat:
+ atomic_inc(&sbi->s_mb_lost_chunks);
+ goto repeat;
+ }
++ /* Processed all groups and haven't found blocks */
++ if (sbi->s_mb_stats && i == ngroups)
++ atomic64_inc(&sbi->s_bal_cX_failed[cr]);
+ }
++
++ if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
++ atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
+ out:
+ if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
+ err = first_err;
+@@ -2512,8 +2826,9 @@ out:
+ ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
+ ac->ac_flags, cr, err);
+
+- /* use prefetched bitmaps to init buddy so that read info is not lost */
+- ext4_mb_prefetch_fini(ac);
++ if (nr)
++ ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
++
+ return err;
+ }
+
+@@ -2704,6 +3019,77 @@ const struct file_operations ext4_seq_pr
+ .write = ext4_mb_prealloc_table_proc_write,
+ };
+
++int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
++{
++ struct super_block *sb = (struct super_block *)seq->private;
++ struct ext4_sb_info *sbi = EXT4_SB(sb);
++
++ seq_puts(seq, "mballoc:\n");
++ if (!sbi->s_mb_stats) {
++ seq_puts(seq, "\tmb stats collection turned off.\n");
++ seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
++ return 0;
++ }
++ seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
++ seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
++
++ seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned));
++
++ seq_puts(seq, "\tcr0_stats:\n");
++ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0]));
++ seq_printf(seq, "\t\tgroups_considered: %llu\n",
++ atomic64_read(&sbi->s_bal_cX_groups_considered[0]));
++ seq_printf(seq, "\t\tuseless_loops: %llu\n",
++ atomic64_read(&sbi->s_bal_cX_failed[0]));
++ seq_printf(seq, "\t\tbad_suggestions: %u\n",
++ atomic_read(&sbi->s_bal_cr0_bad_suggestions));
++ seq_printf(seq, "\t\tskipped_loops: %llu\n",
++ atomic64_read(&sbi->s_bal_cX_skipped[0]));
++
++ seq_puts(seq, "\tcr1_stats:\n");
++ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1]));
++ seq_printf(seq, "\t\tgroups_considered: %llu\n",
++ atomic64_read(&sbi->s_bal_cX_groups_considered[1]));
++ seq_printf(seq, "\t\tuseless_loops: %llu\n",
++ atomic64_read(&sbi->s_bal_cX_failed[1]));
++ seq_printf(seq, "\t\tbad_suggestions: %u\n",
++ atomic_read(&sbi->s_bal_cr1_bad_suggestions));
++ seq_printf(seq, "\t\tskipped_loops: %llu\n",
++ atomic64_read(&sbi->s_bal_cX_skipped[1]));
++
++ seq_puts(seq, "\tcr2_stats:\n");
++ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2]));
++ seq_printf(seq, "\t\tgroups_considered: %llu\n",
++ atomic64_read(&sbi->s_bal_cX_groups_considered[2]));
++ seq_printf(seq, "\t\tuseless_loops: %llu\n",
++ atomic64_read(&sbi->s_bal_cX_failed[2]));
++ seq_printf(seq, "\t\tskipped_loops: %llu\n",
++ atomic64_read(&sbi->s_bal_cX_skipped[2]));
++
++ seq_puts(seq, "\tcr3_stats:\n");
++ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3]));
++ seq_printf(seq, "\t\tgroups_considered: %llu\n",
++ atomic64_read(&sbi->s_bal_cX_groups_considered[3]));
++ seq_printf(seq, "\t\tuseless_loops: %llu\n",
++ atomic64_read(&sbi->s_bal_cX_failed[3]));
++ seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned));
++ seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
++ seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
++ seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
++ seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
++
++ seq_printf(seq, "\tbuddies_generated: %u/%u\n",
++ atomic_read(&sbi->s_mb_buddies_generated),
++ ext4_get_groups_count(sb));
++ seq_printf(seq, "\tbuddies_time_used: %llu\n",
++ atomic64_read(&sbi->s_mb_generation_time));
++ seq_printf(seq, "\tpreallocated: %u\n",
++ atomic_read(&sbi->s_mb_preallocated));
++ seq_printf(seq, "\tdiscarded: %u\n",
++ atomic_read(&sbi->s_mb_discarded));
++ return 0;
++}
++
+ static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
+ {
+ int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+@@ -2764,93 +3150,6 @@ const struct file_operations ext4_seq_mb
+ .write = ext4_mb_last_group_write,
+ };
+
+-static int mb_seq_alloc_show(struct seq_file *seq, void *v)
+-{
+- struct super_block *sb = seq->private;
+- struct ext4_sb_info *sbi = EXT4_SB(sb);
+-
+- seq_printf(seq, "mballoc:\n");
+- seq_printf(seq, "\tblocks: %u\n", atomic_read(&sbi->s_bal_allocated));
+- seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
+- seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
+-
+- seq_printf(seq, "\textents_scanned: %u\n",
+- atomic_read(&sbi->s_bal_ex_scanned));
+- seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
+- seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
+- seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
+- seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
+-
+- seq_printf(seq, "\tuseless_c1_loops: %llu\n",
+- (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[0]));
+- seq_printf(seq, "\tuseless_c2_loops: %llu\n",
+- (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[1]));
+- seq_printf(seq, "\tuseless_c3_loops: %llu\n",
+- (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[2]));
+- seq_printf(seq, "\tskipped_c1_loops: %llu\n",
+- (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[0]));
+- seq_printf(seq, "\tskipped_c2_loops: %llu\n",
+- (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[1]));
+- seq_printf(seq, "\tskipped_c3_loops: %llu\n",
+- (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[2]));
+- seq_printf(seq, "\tbuddies_generated: %lu\n",
+- sbi->s_mb_buddies_generated);
+- seq_printf(seq, "\tbuddies_time_used: %llu\n", sbi->s_mb_generation_time);
+- seq_printf(seq, "\tpreallocated: %u\n",
+- atomic_read(&sbi->s_mb_preallocated));
+- seq_printf(seq, "\tdiscarded: %u\n",
+- atomic_read(&sbi->s_mb_discarded));
+- return 0;
+-}
+-
+-static ssize_t mb_seq_alloc_write(struct file *file,
+- const char __user *buf,
+- size_t cnt, loff_t *pos)
+-{
+- struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file)));
+-
+- atomic_set(&sbi->s_bal_allocated, 0),
+- atomic_set(&sbi->s_bal_reqs, 0),
+- atomic_set(&sbi->s_bal_success, 0);
+-
+- atomic_set(&sbi->s_bal_ex_scanned, 0),
+- atomic_set(&sbi->s_bal_goals, 0),
+- atomic_set(&sbi->s_bal_2orders, 0),
+- atomic_set(&sbi->s_bal_breaks, 0),
+- atomic_set(&sbi->s_mb_lost_chunks, 0);
+-
+- atomic64_set(&sbi->s_bal_cX_failed[0], 0),
+- atomic64_set(&sbi->s_bal_cX_failed[1], 0),
+- atomic64_set(&sbi->s_bal_cX_failed[2], 0);
+-
+- atomic64_set(&sbi->s_bal_cX_skipped[0], 0),
+- atomic64_set(&sbi->s_bal_cX_skipped[1], 0),
+- atomic64_set(&sbi->s_bal_cX_skipped[2], 0);
+-
+-
+- sbi->s_mb_buddies_generated = 0;
+- sbi->s_mb_generation_time = 0;
+-
+- atomic_set(&sbi->s_mb_preallocated, 0),
+- atomic_set(&sbi->s_mb_discarded, 0);
+-
+- return cnt;
+-}
+-
+-static int mb_seq_alloc_open(struct inode *inode, struct file *file)
+-{
+- return single_open(file, mb_seq_alloc_show, PDE_DATA(inode));
+-}
+-
+-const struct file_operations ext4_mb_seq_alloc_fops = {
+- .owner = THIS_MODULE,
+- .open = mb_seq_alloc_open,
+- .read = seq_read,
+- .llseek = seq_lseek,
+- .release = single_release,
+- .write = mb_seq_alloc_write,
+-};
+-
+ int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v)
+ {
+ struct ext4_sb_info *sbi = EXT4_SB(m->private);
+@@ -2952,7 +3251,11 @@ int ext4_mb_add_groupinfo(struct super_b
+ INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+ init_rwsem(&meta_group_info[i]->alloc_sem);
+ meta_group_info[i]->bb_free_root = RB_ROOT;
++ INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
++ INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node);
+ meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
++ meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */
++ meta_group_info[i]->bb_group = group;
+
+ mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
+ return 0;
+@@ -3008,6 +3311,26 @@ static int ext4_mb_init_backend(struct s
+ goto err_freebuddy;
+ }
+
++ if (ext4_has_feature_flex_bg(sb)) {
++ /* a single flex group is supposed to be read by a single IO */
++ sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
++ sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
++ } else {
++ sbi->s_mb_prefetch = 32;
++ }
++ if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
++ sbi->s_mb_prefetch = ext4_get_groups_count(sb);
++	/* How many real IOs to prefetch within a single allocation at cr=0.
++	 * Given cr=0 is a CPU-related optimization we shouldn't try to
++	 * load too many groups; at some point we should start to use what
++	 * we've got in memory.
++	 * With an average random access time of 5ms, it'd take a second to
++	 * get 200 groups (* N with flex_bg), so let's make this limit 4.
++ */
++ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
++ if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
++ sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
++
+ return 0;
+
+ err_freebuddy:
+@@ -3086,7 +3409,7 @@ int ext4_mb_init(struct super_block *sb)
+ unsigned max;
+ int ret;
+
+- i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
++ i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets);
+
+ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
+ if (sbi->s_mb_offsets == NULL) {
+@@ -3094,7 +3417,7 @@ int ext4_mb_init(struct super_block *sb)
+ goto out;
+ }
+
+- i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
++ i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs);
+ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
+ if (sbi->s_mb_maxs == NULL) {
+ ret = -ENOMEM;
+@@ -3120,16 +3443,53 @@ int ext4_mb_init(struct super_block *sb)
+ offset_incr = offset_incr >> 1;
+ max = max >> 1;
+ i++;
+- } while (i <= sb->s_blocksize_bits + 1);
++ } while (i < MB_NUM_ORDERS(sb));
++
++ sbi->s_mb_avg_fragment_size =
++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
++ GFP_KERNEL);
++ if (!sbi->s_mb_avg_fragment_size) {
++ ret = -ENOMEM;
++ goto out;
++ }
++ sbi->s_mb_avg_fragment_size_locks =
++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
++ GFP_KERNEL);
++ if (!sbi->s_mb_avg_fragment_size_locks) {
++ ret = -ENOMEM;
++ goto out;
++ }
++ for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
++ INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]);
++ rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]);
++ }
++ sbi->s_mb_largest_free_orders =
++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
++ GFP_KERNEL);
++ if (!sbi->s_mb_largest_free_orders) {
++ ret = -ENOMEM;
++ goto out;
++ }
++ sbi->s_mb_largest_free_orders_locks =
++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
++ GFP_KERNEL);
++ if (!sbi->s_mb_largest_free_orders_locks) {
++ ret = -ENOMEM;
++ goto out;
++ }
++ for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
++ INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
++ rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
++ }
+
+ spin_lock_init(&sbi->s_md_lock);
+- spin_lock_init(&sbi->s_bal_lock);
+ sbi->s_mb_free_pending = 0;
+ INIT_LIST_HEAD(&sbi->s_freed_data_list);
+
+ sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
+ sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
+ sbi->s_mb_stats = MB_DEFAULT_STATS;
++ sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
+ sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+ sbi->s_mb_c1_blocks = THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C1_THRESHOLD);
+ sbi->s_mb_c2_blocks = THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C2_THRESHOLD);
+@@ -3214,6 +3574,10 @@ int ext4_mb_init(struct super_block *sb)
+ spin_lock_init(&lg->lg_prealloc_lock);
+ }
+
++ if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev)))
++ sbi->s_mb_max_linear_groups = 0;
++ else
++ sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT;
+ /* init file for buddy data */
+ ret = ext4_mb_init_backend(sb);
+ if (ret != 0)
+@@ -3225,6 +3589,10 @@ out_free_locality_groups:
+ free_percpu(sbi->s_locality_groups);
+ sbi->s_locality_groups = NULL;
+ out:
++ kfree(sbi->s_mb_avg_fragment_size);
++ kfree(sbi->s_mb_avg_fragment_size_locks);
++ kfree(sbi->s_mb_largest_free_orders);
++ kfree(sbi->s_mb_largest_free_orders_locks);
+ kfree(sbi->s_mb_prealloc_table);
+ kfree(sbi->s_mb_offsets);
+ sbi->s_mb_offsets = NULL;
+@@ -3282,6 +3650,10 @@ int ext4_mb_release(struct super_block *
+ kvfree(group_info);
+ rcu_read_unlock();
+ }
++ kfree(sbi->s_mb_avg_fragment_size);
++ kfree(sbi->s_mb_avg_fragment_size_locks);
++ kfree(sbi->s_mb_largest_free_orders);
++ kfree(sbi->s_mb_largest_free_orders_locks);
+ kfree(sbi->s_mb_offsets);
+ kfree(sbi->s_mb_maxs);
+ iput(sbi->s_buddy_cache);
+@@ -3302,17 +3674,18 @@ int ext4_mb_release(struct super_block *
+ (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[1]),
+ (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[2]));
+ ext4_msg(sb, KERN_INFO,
+- "mballoc: %u extents scanned, %u goal hits, "
++ "mballoc: %u extents scanned, %u groups scanned, %u goal hits, "
+ "%u 2^N hits, %u breaks, %u lost",
+ atomic_read(&sbi->s_bal_ex_scanned),
++ atomic_read(&sbi->s_bal_groups_scanned),
+ atomic_read(&sbi->s_bal_goals),
+ atomic_read(&sbi->s_bal_2orders),
+ atomic_read(&sbi->s_bal_breaks),
+ atomic_read(&sbi->s_mb_lost_chunks));
+ ext4_msg(sb, KERN_INFO,
+- "mballoc: %lu generated and it took %Lu",
+- sbi->s_mb_buddies_generated,
+- sbi->s_mb_generation_time);
++ "mballoc: %u generated and it took %llu",
++ atomic_read(&sbi->s_mb_buddies_generated),
++ atomic64_read(&sbi->s_mb_generation_time));
+ ext4_msg(sb, KERN_INFO,
+ "mballoc: %u preallocated, %u discarded",
+ atomic_read(&sbi->s_mb_preallocated),
+@@ -3832,12 +4205,13 @@ static void ext4_mb_collect_stats(struct
+ {
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+
+- if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
++ if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
+ atomic_inc(&sbi->s_bal_reqs);
+ atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
+ if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
+ atomic_inc(&sbi->s_bal_success);
+ atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
++ atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
+ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
+ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
+ atomic_inc(&sbi->s_bal_goals);
+@@ -4206,16 +4580,22 @@ static void ext4_mb_mark_pa_deleted(stru
+ }
+ }
+
+-static void ext4_mb_pa_callback(struct rcu_head *head)
++static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa)
+ {
+- struct ext4_prealloc_space *pa;
+- pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+-
++ BUG_ON(!pa);
+ BUG_ON(atomic_read(&pa->pa_count));
+ BUG_ON(pa->pa_deleted == 0);
+ kmem_cache_free(ext4_pspace_cachep, pa);
+ }
+
++static void ext4_mb_pa_callback(struct rcu_head *head)
++{
++ struct ext4_prealloc_space *pa;
++
++ pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
++ ext4_mb_pa_free(pa);
++}
++
+ /*
+ * drops a reference to preallocated space descriptor
+ * if this was the last reference and the space is consumed
+@@ -4741,14 +5121,20 @@ static int ext4_mb_pa_alloc(struct ext4_
+ return 0;
+ }
+
+-static void ext4_mb_pa_free(struct ext4_allocation_context *ac)
++static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac)
+ {
+ struct ext4_prealloc_space *pa = ac->ac_pa;
+
+ BUG_ON(!pa);
+ ac->ac_pa = NULL;
+ WARN_ON(!atomic_dec_and_test(&pa->pa_count));
+- kmem_cache_free(ext4_pspace_cachep, pa);
++ /*
++ * current function is only called due to an error or due to
++ * len of found blocks < len of requested blocks hence the PA has not
++ * been added to grp->bb_prealloc_list. So we don't need to lock it
++ */
++ pa->pa_deleted = 1;
++ ext4_mb_pa_free(pa);
+ }
+
+ #ifdef CONFIG_EXT4_DEBUG
+@@ -4837,6 +5223,7 @@ static void ext4_mb_group_or_file(struct
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ int bsbits = ac->ac_sb->s_blocksize_bits;
+ loff_t size, isize;
++ bool inode_pa_eligible, group_pa_eligible;
+
+ if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+ return;
+@@ -4844,26 +5231,27 @@ static void ext4_mb_group_or_file(struct
+ if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+ return;
+
++ group_pa_eligible = sbi->s_mb_group_prealloc > 0;
++ inode_pa_eligible = true;
+ size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
+ isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
+ >> bsbits;
+
++ /* No point in using inode preallocation for closed files */
+ if ((size == isize) && !ext4_fs_is_busy(sbi) &&
+- !inode_is_open_for_write(ac->ac_inode)) {
+- ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
+- return;
+- }
+-
+- if (sbi->s_mb_group_prealloc <= 0) {
+- ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+- return;
+- }
++ !inode_is_open_for_write(ac->ac_inode))
++ inode_pa_eligible = false;
+
+- /* don't use group allocation for large files */
+ size = max(size, isize);
+- if ((ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) ||
+- (size >= sbi->s_mb_large_req)) {
+- ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
++ /* Don't use group allocation for large files */
++ if (size > sbi->s_mb_stream_request)
++ group_pa_eligible = false;
++
++ if (!group_pa_eligible) {
++ if (inode_pa_eligible)
++ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
++ else
++ ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
+ return;
+ }
+
+@@ -5211,6 +5599,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t
+ ext4_fsblk_t block = 0;
+ unsigned int inquota = 0;
+ unsigned int reserv_clstrs = 0;
++ int retries = 0;
+ u64 seq;
+
+ might_sleep();
+@@ -5293,13 +5682,13 @@ repeat:
+ * So we have to free this pa here itself.
+ */
+ if (*errp) {
+- ext4_mb_pa_free(ac);
++ ext4_mb_pa_put_free(ac);
+ ext4_discard_allocated_blocks(ac);
+ goto errout;
+ }
+ if (ac->ac_status == AC_STATUS_FOUND &&
+ ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
+- ext4_mb_pa_free(ac);
++ ext4_mb_pa_put_free(ac);
+ }
+ if (likely(ac->ac_status == AC_STATUS_FOUND)) {
+ *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
+@@ -5311,13 +5700,14 @@ repeat:
+ ar->len = ac->ac_b_ex.fe_len;
+ }
+ } else {
+- if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
++ if (++retries < 3 &&
++ ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
+ goto repeat;
+ /*
+ * If block allocation fails then the pa allocated above
+ * needs to be freed here itself.
+ */
+- ext4_mb_pa_free(ac);
++ ext4_mb_pa_put_free(ac);
+ *errp = -ENOSPC;
+ }
+
+@@ -5988,6 +6378,7 @@ ext4_trim_all_free(struct super_block *s
+ ret = count;
+ EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
+ }
++
+ out:
+ ext4_unlock_group(sb, group);
+ ext4_mb_unload_buddy(&e4b);
+Index: linux-stage/fs/ext4/sysfs.c
+===================================================================
+--- linux-stage.orig/fs/ext4/sysfs.c
++++ linux-stage/fs/ext4/sysfs.c
+@@ -223,6 +223,7 @@ EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_s
+ EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req);
+ EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+ EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc);
++EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups);
+ EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
+ EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
+ EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
+@@ -262,6 +263,7 @@ static struct attribute *ext4_attrs[] =
+ ATTR_LIST(mb_large_req),
+ ATTR_LIST(mb_group_prealloc),
+ ATTR_LIST(mb_max_inode_prealloc),
++ ATTR_LIST(mb_max_linear_groups),
+ ATTR_LIST(max_writeback_mb_bump),
+ ATTR_LIST(extent_max_zeroout_kb),
+ ATTR_LIST(trigger_fs_error),
+@@ -475,14 +477,14 @@ int ext4_register_sysfs(struct super_blo
+ sb);
+ proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc,
+ &ext4_mb_seq_groups_ops, sb);
++ proc_create_single_data("mb_stats", 0444, sbi->s_proc,
++ ext4_seq_mb_stats_show, sb);
+ proc_create_data("prealloc_table", S_IRUGO, sbi->s_proc,
+ &ext4_seq_prealloc_table_fops, sb);
+ proc_create_data("mb_last_group", S_IRUGO, sbi->s_proc,
+ &ext4_seq_mb_last_group_fops, sb);
+ proc_create_single_data("mb_last_start", S_IRUGO, sbi->s_proc,
+ ext4_mb_seq_last_start_seq_show, sb);
+- proc_create_data("mb_alloc_stats", S_IFREG | S_IRUGO | S_IWUSR,
+- sbi->s_proc, &ext4_mb_seq_alloc_fops, sb);
+ }
+ return 0;
+ }
+Index: linux-stage/fs/ext4/mballoc.h
+===================================================================
+--- linux-stage.orig/fs/ext4/mballoc.h
++++ linux-stage/fs/ext4/mballoc.h
+@@ -82,6 +82,23 @@
+ */
+ #define MB_DEFAULT_MAX_INODE_PREALLOC 512
+
++/*
++ * Number of groups to search linearly before performing group scanning
++ * optimization.
++ */
++#define MB_DEFAULT_LINEAR_LIMIT 4
++
++/*
++ * Minimum number of groups that should be present in the file system to perform
++ * group scanning optimizations.
++ */
++#define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16
++
++/*
++ * Number of valid buddy orders
++ */
++#define MB_NUM_ORDERS(sb) ((sb)->s_blocksize_bits + 2)
++
+ struct ext4_free_data {
+ /* this links the free block information from sb_info */
+ struct list_head efd_list;
+@@ -166,11 +183,13 @@ struct ext4_allocation_context {
+ /* copy of the best found extent taken before preallocation efforts */
+ struct ext4_free_extent ac_f_ex;
+
++ __u32 ac_groups_considered;
++ __u32 ac_flags; /* allocation hints */
+ __u16 ac_groups_scanned;
++ __u16 ac_groups_linear_remaining;
+ __u16 ac_found;
+ __u16 ac_tail;
+ __u16 ac_buddy;
+- __u16 ac_flags; /* allocation hints */
+ __u8 ac_status;
+ __u8 ac_criteria;
+ __u8 ac_2order; /* if request is to allocate 2^N blocks and
+Index: linux-stage/fs/ext4/balloc.c
+===================================================================
+--- linux-stage.orig/fs/ext4/balloc.c
++++ linux-stage/fs/ext4/balloc.c
+@@ -441,6 +441,12 @@ ext4_read_block_bitmap_nowait(struct sup
+ return ERR_PTR(-ENOMEM);
+ }
+
++ if (ignore_locked && buffer_locked(bh)) {
++ /* buffer under IO already, return if called for prefetching */
++ put_bh(bh);
++ return NULL;
++ }
++
+ if (bitmap_uptodate(bh))
+ goto verify;
+
+@@ -498,7 +504,8 @@ ext4_read_block_bitmap_nowait(struct sup
+ trace_ext4_read_block_bitmap_load(sb, block_group);
+ bh->b_end_io = ext4_end_bitmap_read;
+ get_bh(bh);
+- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
++ submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO |
++ (ignore_locked ? REQ_RAHEAD : 0), bh);
+ return bh;
+ verify:
+ err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
+Index: linux-stage/fs/ext4/super.c
+===================================================================
+--- linux-stage.orig/fs/ext4/super.c
++++ linux-stage/fs/ext4/super.c
+@@ -1526,6 +1526,7 @@ enum {
+ Opt_dioread_nolock, Opt_dioread_lock,
+ Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
+ Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
++ Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
+ };
+
+ static const match_table_t tokens = {
+@@ -1616,6 +1617,9 @@ static const match_table_t tokens = {
+ {Opt_test_dummy_encryption, "test_dummy_encryption"},
+ {Opt_nombcache, "nombcache"},
+ {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
++ {Opt_removed, "prefetch_block_bitmaps"},
++ {Opt_no_prefetch_block_bitmaps, "no_prefetch_block_bitmaps"},
++ {Opt_mb_optimize_scan, "mb_optimize_scan=%d"},
+ {Opt_removed, "check=none"}, /* mount option from ext2/3 */
+ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */
+ {Opt_removed, "reservation"}, /* mount option from ext2/3 */
+@@ -1648,6 +1652,8 @@ static ext4_fsblk_t get_sb_block(void **
+ }
+
+ #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
++#define DEFAULT_MB_OPTIMIZE_SCAN (-1)
++
+ static const char deprecated_msg[] =
+ "Mount option \"%s\" will be removed by %s\n"
+ "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
+@@ -1835,12 +1841,16 @@ static const struct mount_opts {
+ {Opt_max_dir_size_kb, 0, MOPT_GTE0},
+ {Opt_test_dummy_encryption, 0, MOPT_GTE0},
+ {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
++ {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS,
++ MOPT_SET},
++ {Opt_mb_optimize_scan, EXT4_MOUNT2_MB_OPTIMIZE_SCAN, MOPT_GTE0},
+ {Opt_err, 0, 0}
+ };
+
+ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
+ substring_t *args, unsigned long *journal_devnum,
+- unsigned int *journal_ioprio, int is_remount)
++ unsigned int *journal_ioprio,
++ int *mb_optimize_scan, int is_remount)
+ {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ const struct mount_opts *m;
+@@ -2129,6 +2139,13 @@ static int handle_mount_opt(struct super
+ sbi->s_mount_opt |= m->mount_opt;
+ } else if (token == Opt_data_err_ignore) {
+ sbi->s_mount_opt &= ~m->mount_opt;
++ } else if (token == Opt_mb_optimize_scan) {
++ if (arg != 0 && arg != 1) {
++ ext4_msg(sb, KERN_WARNING,
++ "mb_optimize_scan should be set to 0 or 1.");
++ return -1;
++ }
++ *mb_optimize_scan = arg;
+ } else {
+ if (!args->from)
+ arg = 1;
+@@ -2151,6 +2168,7 @@ static int handle_mount_opt(struct super
+ static int parse_options(char *options, struct super_block *sb,
+ unsigned long *journal_devnum,
+ unsigned int *journal_ioprio,
++ int *mb_optimize_scan,
+ int is_remount)
+ {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+@@ -2171,7 +2189,8 @@ static int parse_options(char *options,
+ args[0].to = args[0].from = NULL;
+ token = match_token(p, tokens, args);
+ if (handle_mount_opt(sb, p, token, args, journal_devnum,
+- journal_ioprio, is_remount) < 0)
++ journal_ioprio, mb_optimize_scan,
++ is_remount) < 0)
+ return 0;
+ }
+ #ifdef CONFIG_QUOTA
+@@ -2359,6 +2378,14 @@ static int _ext4_show_options(struct seq
+ SEQ_OPTS_PUTS("dax=inode");
+ }
+
++ if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
++ !test_opt2(sb, MB_OPTIMIZE_SCAN)) {
++ SEQ_OPTS_PUTS("mb_optimize_scan=0");
++ } else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
++ test_opt2(sb, MB_OPTIMIZE_SCAN)) {
++ SEQ_OPTS_PUTS("mb_optimize_scan=1");
++ }
++
+ ext4_show_quota_options(seq, sb);
+ return 0;
+ }
+@@ -3140,15 +3167,34 @@ static void print_daily_error_info(struc
+ static int ext4_run_li_request(struct ext4_li_request *elr)
+ {
+ struct ext4_group_desc *gdp = NULL;
+- ext4_group_t group, ngroups;
+- struct super_block *sb;
++ struct super_block *sb = elr->lr_super;
++ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
++ ext4_group_t group = elr->lr_next_group;
++ unsigned int prefetch_ios = 0;
+ int ret = 0;
+ u64 start_time;
+
+- sb = elr->lr_super;
+- ngroups = EXT4_SB(sb)->s_groups_count;
++ if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
++ elr->lr_next_group = ext4_mb_prefetch(sb, group,
++ EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios);
++ if (prefetch_ios)
++ ext4_mb_prefetch_fini(sb, elr->lr_next_group,
++ prefetch_ios);
++ trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group,
++ prefetch_ios);
++ if (group >= elr->lr_next_group) {
++ ret = 1;
++ if (elr->lr_first_not_zeroed != ngroups &&
++ !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) {
++ elr->lr_next_group = elr->lr_first_not_zeroed;
++ elr->lr_mode = EXT4_LI_MODE_ITABLE;
++ ret = 0;
++ }
++ }
++ return ret;
++ }
+
+- for (group = elr->lr_next_group; group < ngroups; group++) {
++ for (; group < ngroups; group++) {
+ gdp = ext4_get_group_desc(sb, group, NULL);
+ if (!gdp) {
+ ret = 1;
+@@ -3166,9 +3212,10 @@ static int ext4_run_li_request(struct ex
+ start_time = ktime_get_real_ns();
+ ret = ext4_init_inode_table(sb, group,
+ elr->lr_timeout ? 0 : 1);
++ trace_ext4_lazy_itable_init(sb, group);
+ if (elr->lr_timeout == 0) {
+ elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
+- elr->lr_sbi->s_li_wait_mult);
++ EXT4_SB(elr->lr_super)->s_li_wait_mult);
+ }
+ elr->lr_next_sched = jiffies + elr->lr_timeout;
+ elr->lr_next_group = group + 1;
+@@ -3182,15 +3229,11 @@ static int ext4_run_li_request(struct ex
+ */
+ static void ext4_remove_li_request(struct ext4_li_request *elr)
+ {
+- struct ext4_sb_info *sbi;
+-
+ if (!elr)
+ return;
+
+- sbi = elr->lr_sbi;
+-
+ list_del(&elr->lr_request);
+- sbi->s_li_request = NULL;
++ EXT4_SB(elr->lr_super)->s_li_request = NULL;
+ kfree(elr);
+ }
+
+@@ -3399,7 +3442,6 @@ static int ext4_li_info_new(void)
+ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
+ ext4_group_t start)
+ {
+- struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_li_request *elr;
+
+ elr = kzalloc(sizeof(*elr), GFP_KERNEL);
+@@ -3407,8 +3449,13 @@ static struct ext4_li_request *ext4_li_r
+ return NULL;
+
+ elr->lr_super = sb;
+- elr->lr_sbi = sbi;
+- elr->lr_next_group = start;
++ elr->lr_first_not_zeroed = start;
++ if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) {
++ elr->lr_mode = EXT4_LI_MODE_ITABLE;
++ elr->lr_next_group = start;
++ } else {
++ elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP;
++ }
+
+ /*
+ * Randomize first schedule time of the request to
+@@ -3438,8 +3485,9 @@ int ext4_register_li_request(struct supe
+ goto out;
+ }
+
+- if (first_not_zeroed == ngroups || sb_rdonly(sb) ||
+- !test_opt(sb, INIT_INODE_TABLE))
++ if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
++ (first_not_zeroed == ngroups || sb_rdonly(sb) ||
++ !test_opt(sb, INIT_INODE_TABLE)))
+ goto out;
+
+ elr = ext4_li_request_new(sb, first_not_zeroed);
+@@ -3725,6 +3773,7 @@ static int ext4_fill_super(struct super_
+ __u64 blocks_count;
+ int err = 0;
+ unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
++ int mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN;
+ ext4_group_t first_not_zeroed;
+
+ if ((data && !orig_data) || !sbi)
+@@ -3958,7 +4007,7 @@ static int ext4_fill_super(struct super_
+ if (!s_mount_opts)
+ goto failed_mount;
+ if (!parse_options(s_mount_opts, sb, &journal_devnum,
+- &journal_ioprio, 0)) {
++ &journal_ioprio, &mb_optimize_scan, 0)) {
+ ext4_msg(sb, KERN_WARNING,
+ "failed to parse options in superblock: %s",
+ s_mount_opts);
+@@ -3967,7 +4016,7 @@ static int ext4_fill_super(struct super_
+ }
+ sbi->s_def_mount_opt = sbi->s_mount_opt;
+ if (!parse_options((char *) data, sb, &journal_devnum,
+- &journal_ioprio, 0))
++ &journal_ioprio, &mb_optimize_scan, 0))
+ goto failed_mount;
+
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+@@ -4628,6 +4677,19 @@ no_journal:
+ }
+
+ ext4_ext_init(sb);
++
++ /*
++ * Enable optimize_scan if number of groups is > threshold. This can be
++ * turned off by passing "mb_optimize_scan=0". This can also be
++ * turned on forcefully by passing "mb_optimize_scan=1".
++ */
++ if (mb_optimize_scan == 1)
++ set_opt2(sb, MB_OPTIMIZE_SCAN);
++ else if (mb_optimize_scan == 0)
++ clear_opt2(sb, MB_OPTIMIZE_SCAN);
++ else if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
++ set_opt2(sb, MB_OPTIMIZE_SCAN);
++
+ err = ext4_mb_init(sb);
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
+@@ -5393,6 +5455,7 @@ static int ext4_remount(struct super_blo
+ int enable_quota = 0;
+ ext4_group_t g;
+ unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
++ int mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN;
+ int err = 0;
+ #ifdef CONFIG_QUOTA
+ int i, j;
+@@ -5439,7 +5502,8 @@ static int ext4_remount(struct super_blo
+ vfs_flags = SB_LAZYTIME | SB_I_VERSION;
+ sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags);
+
+- if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
++ if (!parse_options(data, sb, NULL, &journal_ioprio, &mb_optimize_scan,
++ 1)) {
+ err = -EINVAL;
+ goto restore_opts;
+ }
+Index: linux-stage/include/trace/events/ext4.h
+===================================================================
+--- linux-stage.orig/include/trace/events/ext4.h
++++ linux-stage/include/trace/events/ext4.h
+@@ -2712,6 +2712,50 @@ TRACE_EVENT(ext4_error,
+ __entry->function, __entry->line)
+ );
+
++TRACE_EVENT(ext4_prefetch_bitmaps,
++ TP_PROTO(struct super_block *sb, ext4_group_t group,
++ ext4_group_t next, unsigned int prefetch_ios),
++
++ TP_ARGS(sb, group, next, prefetch_ios),
++
++ TP_STRUCT__entry(
++ __field( dev_t, dev )
++ __field( __u32, group )
++ __field( __u32, next )
++ __field( __u32, ios )
++ ),
++
++ TP_fast_assign(
++ __entry->dev = sb->s_dev;
++ __entry->group = group;
++ __entry->next = next;
++ __entry->ios = prefetch_ios;
++ ),
++
++ TP_printk("dev %d,%d group %u next %u ios %u",
++ MAJOR(__entry->dev), MINOR(__entry->dev),
++ __entry->group, __entry->next, __entry->ios)
++);
++
++TRACE_EVENT(ext4_lazy_itable_init,
++ TP_PROTO(struct super_block *sb, ext4_group_t group),
++
++ TP_ARGS(sb, group),
++
++ TP_STRUCT__entry(
++ __field( dev_t, dev )
++ __field( __u32, group )
++ ),
++
++ TP_fast_assign(
++ __entry->dev = sb->s_dev;
++ __entry->group = group;
++ ),
++
++ TP_printk("dev %d,%d group %u",
++ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group)
++);
++
+ #endif /* _TRACE_EXT4_H */
+
+ /* This part must be outside protection */