LU-14438 ldiskfs: backport ldiskfs mballoc patches
author Bobi Jam <bobijam@whamcloud.com>
Fri, 21 Jul 2023 07:34:20 +0000 (15:34 +0800)
committer Andreas Dilger <adilger@whamcloud.com>
Mon, 18 Sep 2023 06:26:07 +0000 (06:26 +0000)
This contains the following kernel patches:

a078dff87013 ("ext4: fixup possible uninitialized variable access in
                     ext4_mb_choose_next_group_cr1()")
80fa46d6b9e7 ("ext4: limit the number of retries after discarding
                     preallocations blocks")
820897258ad3 ("ext4: Refactor code related to freeing PAs")
cf5e2ca6c990 ("ext4: mballoc: refactor
                     ext4_mb_discard_preallocations()")
83e80a6e3543 ("ext4: use buckets for cr 1 block scan instead of
                     rbtree")
a9f2a2931d0e ("ext4: use locality group preallocation for small
                     closed files")
1940265ede66 ("ext4: avoid unnecessary spreading of allocations among
                     groups")
4fca50d440cc ("ext4: make mballoc try target group first even with
                     mb_optimize_scan")
3fa5d23e68a3 ("ext4: reflect mb_optimize_scan value in options file")
077d0c2c78df ("ext4: make mb_optimize_scan performance mount option
                     work with extents")
196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning")
21175ca434c5 ("ext4: make prefetch_block_bitmaps default")
3d392b2676bf ("ext4: add prefetch_block_bitmaps mount option")
cfd732377221 ("ext4: add prefetching for block allocation bitmaps")
4b68f6df1059 ("ext4: add MB_NUM_ORDERS macro")
dddcd2f9ebde ("ext4: optimize the implementation of
                     ext4_mb_good_group()")
a6c75eaf1103 ("ext4: add mballoc stats proc file")
67d251860461 ("ext4: drop s_mb_bal_lock and convert protected fields
                     to atomic")

Lustre-change: https://review.whamcloud.com/51472
Lustre-commit: TBD (from 8da59fc988f0cebcac10e8ef1faab1e4c913de03)

Signed-off-by: Bobi Jam <bobijam@whamcloud.com>
Change-Id: I079dfb74bd743894934484803cedb683073e4d94
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/52120
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-improve.patch [new file with mode: 0644]
ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series

diff --git a/ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-improve.patch b/ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-improve.patch
new file mode 100644 (file)
index 0000000..89aae1e
--- /dev/null
@@ -0,0 +1,1824 @@
+This contains the following kernel patches:
+
+a078dff87013 ("ext4: fixup possible uninitialized variable access in
+                     ext4_mb_choose_next_group_cr1()")
+80fa46d6b9e7 ("ext4: limit the number of retries after discarding
+                     preallocations blocks")
+820897258ad3 ("ext4: Refactor code related to freeing PAs")
+cf5e2ca6c990 ("ext4: mballoc: refactor
+                     ext4_mb_discard_preallocations()")
+83e80a6e3543 ("ext4: use buckets for cr 1 block scan instead of
+                     rbtree")
+a9f2a2931d0e ("ext4: use locality group preallocation for small
+                     closed files")
+1940265ede66 ("ext4: avoid unnecessary spreading of allocations among
+                     groups")
+4fca50d440cc ("ext4: make mballoc try target group first even with
+                     mb_optimize_scan")
+3fa5d23e68a3 ("ext4: reflect mb_optimize_scan value in options file")
+077d0c2c78df ("ext4: make mb_optimize_scan performance mount option
+                     work with extents")
+196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning")
+21175ca434c5 ("ext4: make prefetch_block_bitmaps default")
+3d392b2676bf ("ext4: add prefetch_block_bitmaps mount option")
+cfd732377221 ("ext4: add prefetching for block allocation bitmaps")
+4b68f6df1059 ("ext4: add MB_NUM_ORDERS macro")
+dddcd2f9ebde ("ext4: optimize the implementation of ext4_mb_good_group()")
+a6c75eaf1103 ("ext4: add mballoc stats proc file")
+67d251860461 ("ext4: drop s_mb_bal_lock and convert protected fields
+                     to atomic")
+
+Index: linux-stage/fs/ext4/ext4.h
+===================================================================
+--- linux-stage.orig/fs/ext4/ext4.h
++++ linux-stage/fs/ext4/ext4.h
+@@ -151,6 +151,10 @@ enum SHIFT_DIRECTION {
+ #define EXT4_MB_USE_RESERVED          0x2000
+ /* Do strict check for free blocks while retrying block allocation */
+ #define EXT4_MB_STRICT_CHECK          0x4000
++/* Large fragment size list lookup succeeded at least once for cr = 0 */
++#define EXT4_MB_CR0_OPTIMIZED         0x8000
++/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */
++#define EXT4_MB_CR1_OPTIMIZED         0x00010000
+ struct ext4_allocation_request {
+       /* target inode for block we're allocating */
+@@ -1167,6 +1170,7 @@ struct ext4_inode_info {
+ #define EXT4_MOUNT_JOURNAL_CHECKSUM   0x800000 /* Journal checksums */
+ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT       0x1000000 /* Journal Async Commit */
+ #define EXT4_MOUNT_WARN_ON_ERROR      0x2000000 /* Trigger WARN_ON on error */
++#define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000
+ #define EXT4_MOUNT_DELALLOC           0x8000000 /* Delalloc support */
+ #define EXT4_MOUNT_DATA_ERR_ABORT     0x10000000 /* Abort on file data write */
+ #define EXT4_MOUNT_BLOCK_VALIDITY     0x20000000 /* Block validity checking */
+@@ -1193,7 +1197,9 @@ struct ext4_inode_info {
+ #define EXT4_MOUNT2_DAX_NEVER         0x00000020 /* Do not allow Direct Access */
+ #define EXT4_MOUNT2_DAX_INODE         0x00000040 /* For printing options only */
+-
++#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN  0x00000080 /* Optimize group
++                                                  * scanning in mballoc
++                                                  */
+ #define clear_opt(sb, opt)            EXT4_SB(sb)->s_mount_opt &= \
+                                               ~EXT4_MOUNT_##opt
+@@ -1480,9 +1486,15 @@ struct ext4_sb_info {
+       unsigned int s_mb_free_pending;
+       struct list_head s_freed_data_list;     /* List of blocks to be freed
+                                                  after commit completed */
++      struct list_head *s_mb_avg_fragment_size;
++      rwlock_t *s_mb_avg_fragment_size_locks;
++      struct list_head *s_mb_largest_free_orders;
++      rwlock_t *s_mb_largest_free_orders_locks;
+       /* tunables */
+       unsigned long s_stripe;
++      unsigned int s_mb_max_linear_groups;
++      unsigned int s_mb_stream_request;
+       unsigned long s_mb_small_req;
+       unsigned long s_mb_large_req;
+       unsigned int s_mb_max_to_scan;
+@@ -1508,15 +1520,18 @@ struct ext4_sb_info {
+       atomic_t s_bal_success; /* we found long enough chunks */
+       atomic_t s_bal_allocated;       /* in blocks */
+       atomic_t s_bal_ex_scanned;      /* total extents scanned */
++      atomic_t s_bal_groups_scanned;  /* number of groups scanned */
+       atomic_t s_bal_goals;   /* goal hits */
+       atomic_t s_bal_breaks;  /* too long searches */
+       atomic_t s_bal_2orders; /* 2^order hits */
+-      /* cX loop didn't find blocks */
+-      atomic64_t s_bal_cX_failed[3];
++      atomic_t s_bal_cr0_bad_suggestions;
++      atomic_t s_bal_cr1_bad_suggestions;
++      atomic64_t s_bal_cX_groups_considered[4];
++      atomic64_t s_bal_cX_hits[4];
++      atomic64_t s_bal_cX_failed[4];          /* cX loop didn't find blocks */
+       atomic64_t s_bal_cX_skipped[3];
+-      spinlock_t s_bal_lock;
+-      unsigned long s_mb_buddies_generated;
+-      unsigned long long s_mb_generation_time;
++      atomic_t s_mb_buddies_generated;        /* number of buddies generated */
++      atomic64_t s_mb_generation_time;
+       atomic_t s_mb_lost_chunks;
+       atomic_t s_mb_preallocated;
+       atomic_t s_mb_discarded;
+@@ -2393,9 +2408,15 @@ struct ext4_lazy_init {
+       struct mutex            li_list_mtx;
+ };
++enum ext4_li_mode {
++      EXT4_LI_MODE_PREFETCH_BBITMAP,
++      EXT4_LI_MODE_ITABLE,
++};
++
+ struct ext4_li_request {
+       struct super_block      *lr_super;
+-      struct ext4_sb_info     *lr_sbi;
++      enum ext4_li_mode       lr_mode;
++      ext4_group_t            lr_first_not_zeroed;
+       ext4_group_t            lr_next_group;
+       struct list_head        lr_request;
+       unsigned long           lr_next_sched;
+@@ -2685,6 +2706,7 @@ extern const struct file_operations ext4
+ extern int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v);
+ extern long ext4_mb_stats;
+ extern long ext4_mb_max_to_scan;
++extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
+ extern int ext4_mb_init(struct super_block *);
+ extern int ext4_mb_release(struct super_block *);
+ extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
+@@ -2693,6 +2715,12 @@ extern int ext4_mb_reserve_blocks(struct
+ extern void ext4_discard_preallocations(struct inode *, unsigned int);
+ extern int __init ext4_init_mballoc(void);
+ extern void ext4_exit_mballoc(void);
++extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
++                                   ext4_group_t group,
++                                   unsigned int nr, int *cnt);
++extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
++                                unsigned int nr);
++
+ extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh, ext4_fsblk_t block,
+                            unsigned long count, int flags);
+@@ -3178,13 +3206,18 @@ struct ext4_group_info {
+       ext4_grpblk_t   bb_first_free;  /* first free block */
+       ext4_grpblk_t   bb_free;        /* total free blocks */
+       ext4_grpblk_t   bb_fragments;   /* nr of freespace fragments */
++      int             bb_avg_fragment_size_order;     /* order of average
++                                                         fragment in BG */
+       ext4_grpblk_t   bb_largest_free_order;/* order of largest frag in BG */
++      ext4_group_t    bb_group;       /* Group number */
+       struct          list_head bb_prealloc_list;
+       unsigned long   bb_prealloc_nr;
+ #ifdef DOUBLE_CHECK
+       void            *bb_bitmap;
+ #endif
+       struct rw_semaphore alloc_sem;
++      struct list_head bb_avg_fragment_size_node;
++      struct list_head bb_largest_free_order_node;
+       ext4_grpblk_t   bb_counters[];  /* Nr of free power-of-two-block
+                                        * regions, index is order.
+                                        * bb_counters[3] = 5 means
+Index: linux-stage/fs/ext4/mballoc.c
+===================================================================
+--- linux-stage.orig/fs/ext4/mballoc.c
++++ linux-stage/fs/ext4/mballoc.c
+@@ -127,11 +127,53 @@
+  * the smallest multiple of the stripe value (sbi->s_stripe) which is
+  * greater than the default mb_group_prealloc.
+  *
++ * If "mb_optimize_scan" mount option is set, we maintain in memory group info
++ * structures in two data structures:
++ *
++ * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders)
++ *
++ *    Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks)
++ *
++ *    This is an array of lists where the index in the array represents the
++ *    largest free order in the buddy bitmap of the participating group infos of
++ *    that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total
++ *    number of buddy bitmap orders possible) number of lists. Group-infos are
++ *    placed in appropriate lists.
++ *
++ * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size)
++ *
++ *    Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks)
++ *
++ *    This is an array of lists where in the i-th list there are groups with
++ *    average fragment size >= 2^i and < 2^(i+1). The average fragment size
++ *    is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments.
++ *    Note that we don't bother with a special list for completely empty groups
++ *    so we only have MB_NUM_ORDERS(sb) lists.
++ *
++ * When "mb_optimize_scan" mount option is set, mballoc consults the above data
++ * structures to decide the order in which groups are to be traversed for
++ * fulfilling an allocation request.
++ *
++ * At CR = 0, we look for groups which have the largest_free_order >= the order
++ * of the request. We directly look at the largest free order list in the data
++ * structure (1) above where largest_free_order = order of the request. If that
++ * list is empty, we look at remaining list in the increasing order of
++ * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time.
++ *
++ * At CR = 1, we only consider groups where average fragment size > request
++ * size. So, we lookup a group which has average fragment size just above or
++ * equal to request size using our average fragment size group lists (data
++ * structure 2) in O(1) time.
++ *
++ * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
++ * linear order which requires O(N) search time for each CR 0 and CR 1 phase.
++ *
+  * The regular allocator (using the buddy cache) supports a few tunables.
+  *
+  * /sys/fs/ext4/<partition>/mb_min_to_scan
+  * /sys/fs/ext4/<partition>/mb_max_to_scan
+  * /sys/fs/ext4/<partition>/mb_order2_req
++ * /sys/fs/ext4/<partition>/mb_linear_limit
+  *
+  * The regular allocator uses buddy scan only if the request len is power of
+  * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
+@@ -149,6 +191,16 @@
+  * can be used for allocation. ext4_mb_good_group explains how the groups are
+  * checked.
+  *
++ * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not
++ * get traversed linearly. That may result in subsequent allocations being not
++ * close to each other. And so, the underlying device may get filled up in a
++ * non-linear fashion. While that may not matter on non-rotational devices, for
++ * rotational devices that may result in higher seek times. "mb_linear_limit"
++ * tells mballoc how many groups mballoc should search linearly before
++ * performing consulting above data structures for more efficient lookups. For
++ * non rotational devices, this value defaults to 0 and for rotational devices
++ * this is set to MB_DEFAULT_LINEAR_LIMIT.
++ *
+  * Both the prealloc space are getting populated as above. So for the first
+  * request we will hit the buddy cache which will result in this prealloc
+  * space getting filled. The prealloc space is then later used for the
+@@ -299,6 +351,8 @@
+  *  - bitlock on a group      (group)
+  *  - object (inode/locality) (object)
+  *  - per-pa lock             (pa)
++ *  - cr0 lists lock          (cr0)
++ *  - cr1 tree lock           (cr1)
+  *
+  * Paths:
+  *  - new pa
+@@ -328,6 +382,9 @@
+  *    group
+  *        object
+  *
++ *  - allocation path (ext4_mb_regular_allocator)
++ *    group
++ *    cr0/cr1
+  */
+ static struct kmem_cache *ext4_pspace_cachep;
+ static struct kmem_cache *ext4_ac_cachep;
+@@ -351,6 +408,9 @@ static void ext4_mb_generate_from_freeli
+                                               ext4_group_t group);
+ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
++static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
++                             ext4_group_t group, int cr);
++
+ /*
+  * The algorithm using this percpu seq counter goes below:
+  * 1. We sample the percpu discard_pa_seq counter before trying for block
+@@ -747,6 +807,221 @@ static void ext4_mb_mark_free_simple(str
+       }
+ }
++static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
++{
++      int order;
++
++      /*
++       * We don't bother with a special lists groups with only 1 block free
++       * extents and for completely empty groups.
++       */
++      order = fls(len) - 2;
++      if (order < 0)
++              return 0;
++      if (order == MB_NUM_ORDERS(sb))
++              order--;
++      return order;
++}
++
++/* Move group to appropriate avg_fragment_size list */
++static void
++mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
++{
++      struct ext4_sb_info *sbi = EXT4_SB(sb);
++      int new_order;
++
++      if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0)
++              return;
++
++      new_order = mb_avg_fragment_size_order(sb,
++                                      grp->bb_free / grp->bb_fragments);
++      if (new_order == grp->bb_avg_fragment_size_order)
++              return;
++
++      if (grp->bb_avg_fragment_size_order != -1) {
++              write_lock(&sbi->s_mb_avg_fragment_size_locks[
++                                      grp->bb_avg_fragment_size_order]);
++              list_del(&grp->bb_avg_fragment_size_node);
++              write_unlock(&sbi->s_mb_avg_fragment_size_locks[
++                                      grp->bb_avg_fragment_size_order]);
++      }
++      grp->bb_avg_fragment_size_order = new_order;
++      write_lock(&sbi->s_mb_avg_fragment_size_locks[
++                                      grp->bb_avg_fragment_size_order]);
++      list_add_tail(&grp->bb_avg_fragment_size_node,
++              &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]);
++      write_unlock(&sbi->s_mb_avg_fragment_size_locks[
++                                      grp->bb_avg_fragment_size_order]);
++}
++
++/*
++ * Choose next group by traversing largest_free_order lists. Updates *new_cr if
++ * cr level needs an update.
++ */
++static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
++                      int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
++{
++      struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
++      struct ext4_group_info *iter, *grp;
++      int i;
++
++      if (ac->ac_status == AC_STATUS_FOUND)
++              return;
++
++      if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED))
++              atomic_inc(&sbi->s_bal_cr0_bad_suggestions);
++
++      grp = NULL;
++      for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
++              if (list_empty(&sbi->s_mb_largest_free_orders[i]))
++                      continue;
++              read_lock(&sbi->s_mb_largest_free_orders_locks[i]);
++              if (list_empty(&sbi->s_mb_largest_free_orders[i])) {
++                      read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
++                      continue;
++              }
++              grp = NULL;
++              list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i],
++                                  bb_largest_free_order_node) {
++                      if (sbi->s_mb_stats)
++                              atomic64_inc(&sbi->s_bal_cX_groups_considered[0]);
++                      if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) {
++                              grp = iter;
++                              break;
++                      }
++              }
++              read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
++              if (grp)
++                      break;
++      }
++
++      if (!grp) {
++              /* Increment cr and search again */
++              *new_cr = 1;
++      } else {
++              *group = grp->bb_group;
++              ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED;
++      }
++}
++
++/*
++ * Choose next group by traversing average fragment size list of suitable
++ * order. Updates *new_cr if cr level needs an update.
++ */
++static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
++              int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
++{
++      struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
++      struct ext4_group_info *grp = NULL, *iter;
++      int i;
++
++      if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) {
++              if (sbi->s_mb_stats)
++                      atomic_inc(&sbi->s_bal_cr1_bad_suggestions);
++      }
++
++      for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
++           i < MB_NUM_ORDERS(ac->ac_sb); i++) {
++              if (list_empty(&sbi->s_mb_avg_fragment_size[i]))
++                      continue;
++              read_lock(&sbi->s_mb_avg_fragment_size_locks[i]);
++              if (list_empty(&sbi->s_mb_avg_fragment_size[i])) {
++                      read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
++                      continue;
++              }
++              list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i],
++                                  bb_avg_fragment_size_node) {
++                      if (sbi->s_mb_stats)
++                              atomic64_inc(&sbi->s_bal_cX_groups_considered[1]);
++                      if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) {
++                              grp = iter;
++                              break;
++                      }
++              }
++              read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
++              if (grp)
++                      break;
++      }
++
++      if (grp) {
++              *group = grp->bb_group;
++              ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED;
++      } else {
++              *new_cr = 2;
++      }
++}
++
++static inline int should_optimize_scan(struct ext4_allocation_context *ac)
++{
++      if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN)))
++              return 0;
++      if (ac->ac_criteria >= 2)
++              return 0;
++      if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
++              return 0;
++      return 1;
++}
++
++/*
++ * Return next linear group for allocation. If linear traversal should not be
++ * performed, this function just returns the same group
++ */
++static int
++next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups)
++{
++      if (!should_optimize_scan(ac))
++              goto inc_and_return;
++
++      if (ac->ac_groups_linear_remaining) {
++              ac->ac_groups_linear_remaining--;
++              goto inc_and_return;
++      }
++
++      return group;
++inc_and_return:
++      /*
++       * Artificially restricted ngroups for non-extent
++       * files makes group > ngroups possible on first loop.
++       */
++      return group + 1 >= ngroups ? 0 : group + 1;
++}
++
++/*
++ * ext4_mb_choose_next_group: choose next group for allocation.
++ *
++ * @ac        Allocation Context
++ * @new_cr    This is an output parameter. If the there is no good group
++ *            available at current CR level, this field is updated to indicate
++ *            the new cr level that should be used.
++ * @group     This is an input / output parameter. As an input it indicates the
++ *            next group that the allocator intends to use for allocation. As
++ *            output, this field indicates the next group that should be used as
++ *            determined by the optimization functions.
++ * @ngroups   Total number of groups
++ */
++static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
++              int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
++{
++      *new_cr = ac->ac_criteria;
++
++      if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) {
++              *group = next_linear_group(ac, *group, ngroups);
++              return;
++      }
++
++      if (*new_cr == 0) {
++              ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups);
++      } else if (*new_cr == 1) {
++              ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups);
++      } else {
++              /*
++               * TODO: For CR=2, we can arrange groups in an rb tree sorted by
++               * bb_free. But until that happens, we should never come here.
++               */
++              WARN_ON(1);
++      }
++}
++
+ /*
+  * Cache the order of the largest free extent we have available in this block
+  * group.
+@@ -754,22 +1029,39 @@ static void ext4_mb_mark_free_simple(str
+ static void
+ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
+ {
++      struct ext4_sb_info *sbi = EXT4_SB(sb);
+       int i;
+-      int bits;
+-      grp->bb_largest_free_order = -1; /* uninit */
+-
+-      bits = sb->s_blocksize_bits + 1;
+-      for (i = bits; i >= 0; i--) {
+-              if (grp->bb_counters[i] > 0) {
+-                      grp->bb_largest_free_order = i;
++      for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--)
++              if (grp->bb_counters[i] > 0)
+                       break;
+-              }
++      /* No need to move between order lists? */
++      if (!test_opt2(sb, MB_OPTIMIZE_SCAN) ||
++          i == grp->bb_largest_free_order) {
++              grp->bb_largest_free_order = i;
++              return;
++      }
++
++      if (grp->bb_largest_free_order >= 0) {
++              write_lock(&sbi->s_mb_largest_free_orders_locks[
++                                            grp->bb_largest_free_order]);
++              list_del_init(&grp->bb_largest_free_order_node);
++              write_unlock(&sbi->s_mb_largest_free_orders_locks[
++                                            grp->bb_largest_free_order]);
++      }
++      grp->bb_largest_free_order = i;
++      if (grp->bb_largest_free_order >= 0 && grp->bb_free) {
++              write_lock(&sbi->s_mb_largest_free_orders_locks[
++                                            grp->bb_largest_free_order]);
++              list_add_tail(&grp->bb_largest_free_order_node,
++                    &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]);
++              write_unlock(&sbi->s_mb_largest_free_orders_locks[
++                                            grp->bb_largest_free_order]);
+       }
+ }
+ static noinline_for_stack
+-int ext4_mb_generate_buddy(struct super_block *sb,
++void ext4_mb_generate_buddy(struct super_block *sb,
+                               void *buddy, void *bitmap, ext4_group_t group)
+ {
+       struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+@@ -817,19 +1109,15 @@ int ext4_mb_generate_buddy(struct super_
+               grp->bb_free = free;
+               ext4_mark_group_bitmap_corrupted(sb, group,
+                                       EXT4_GROUP_INFO_BBITMAP_CORRUPT);
+-              return -EIO;
+       }
+       mb_set_largest_free_order(sb, grp);
++      mb_update_avg_fragment_size(sb, grp);
+       clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+       period = get_cycles() - period;
+-      spin_lock(&sbi->s_bal_lock);
+-      sbi->s_mb_buddies_generated++;
+-      sbi->s_mb_generation_time += period;
+-      spin_unlock(&sbi->s_bal_lock);
+-
+-      return 0;
++      atomic_inc(&sbi->s_mb_buddies_generated);
++      atomic64_add(period, &sbi->s_mb_generation_time);
+ }
+ static void mb_regenerate_buddy(struct ext4_buddy *e4b)
+@@ -987,14 +1275,14 @@ static int ext4_mb_init_cache(struct pag
+                       grinfo->bb_fragments = 0;
+                       memset(grinfo->bb_counters, 0,
+                              sizeof(*grinfo->bb_counters) *
+-                              (sb->s_blocksize_bits+2));
++                             (MB_NUM_ORDERS(sb)));
+                       /*
+                        * incore got set to the group block bitmap below
+                        */
+                       ext4_lock_group(sb, group);
+                       /* init the buddy */
+                       memset(data, 0xff, blocksize);
+-                      err = ext4_mb_generate_buddy(sb, data, incore, group);
++                      ext4_mb_generate_buddy(sb, data, incore, group);
+                       ext4_unlock_group(sb, group);
+                       incore = NULL;
+               } else {
+@@ -1558,6 +1846,7 @@ static void mb_free_blocks(struct inode
+ done:
+       mb_set_largest_free_order(sb, e4b->bd_info);
++      mb_update_avg_fragment_size(sb, e4b->bd_info);
+       mb_check_buddy(e4b);
+ }
+@@ -1695,6 +1984,7 @@ static int mb_mark_used(struct ext4_budd
+       }
+       mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
++      mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info);
+       ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
+       mb_check_buddy(e4b);
+@@ -1989,7 +2279,7 @@ void ext4_mb_simple_scan_group(struct ex
+       int max;
+       BUG_ON(ac->ac_2order <= 0);
+-      for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
++      for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) {
+               if (grp->bb_counters[i] == 0)
+                       continue;
+@@ -2135,13 +2425,11 @@ static bool ext4_mb_good_group(struct ex
+       BUG_ON(cr < 0 || cr >= 4);
+-      free = grp->bb_free;
+-      if (free == 0)
+-              return false;
+-      if (cr <= 2 && free < ac->ac_g_ex.fe_len)
++      if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+               return false;
+-      if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
++      free = grp->bb_free;
++      if (free == 0)
+               return false;
+       fragments = grp->bb_fragments;
+@@ -2158,8 +2446,10 @@ static bool ext4_mb_good_group(struct ex
+                   ((group % flex_size) == 0))
+                       return false;
+-              if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
+-                  (free / fragments) >= ac->ac_g_ex.fe_len)
++              if (free < ac->ac_g_ex.fe_len)
++                      return false;
++
++              if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb))
+                       return true;
+               if (grp->bb_largest_free_order < ac->ac_2order)
+@@ -2193,10 +2483,13 @@ static int ext4_mb_good_group_nolock(str
+ {
+       struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
+       struct super_block *sb = ac->ac_sb;
++      struct ext4_sb_info *sbi = EXT4_SB(sb);
+       bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
+       ext4_grpblk_t free;
+       int ret = 0;
++      if (sbi->s_mb_stats)
++              atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
+       if (should_lock)
+               ext4_lock_group(sb, group);
+       free = grp->bb_free;
+@@ -2246,97 +2539,90 @@ static u64 available_blocks_count(struct
+ }
+ /*
+- * each allocation context (i.e. a thread doing allocation) has own
+- * sliding prefetch window of @s_mb_prefetch size which starts at the
+- * very first goal and moves ahead of scaning.
+- * a side effect is that subsequent allocations will likely find
+- * the bitmaps in cache or at least in-flight.
++ * Start prefetching @nr block bitmaps starting at @group.
++ * Return the next group which needs to be prefetched.
+  */
+-static void
+-ext4_mb_prefetch(struct ext4_allocation_context *ac,
+-                  ext4_group_t start)
++ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
++                            unsigned int nr, int *cnt)
+ {
+-      struct super_block *sb = ac->ac_sb;
+       ext4_group_t ngroups = ext4_get_groups_count(sb);
+-      struct ext4_sb_info *sbi = EXT4_SB(sb);
+-      struct ext4_group_info *grp;
+-      ext4_group_t group = start;
+       struct buffer_head *bh;
+-      int nr;
+-
+-      /* limit prefetching at cr=0, otherwise mballoc can
+-       * spend a lot of time loading imperfect groups */
+-      if (ac->ac_criteria < 2 && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit)
+-              return;
+-
+-      /* batch prefetching to get few READs in flight */
+-      nr = ac->ac_prefetch - group;
+-      if (ac->ac_prefetch < group)
+-              /* wrapped to the first groups */
+-              nr += ngroups;
+-      if (nr > 0)
+-              return;
+-      BUG_ON(nr < 0);
++      struct blk_plug plug;
+-      nr = sbi->s_mb_prefetch;
+-      if (ext4_has_feature_flex_bg(sb)) {
+-              /* align to flex_bg to get more bitmas with a single IO */
+-              nr = (group / sbi->s_mb_prefetch) * sbi->s_mb_prefetch;
+-              nr = nr + sbi->s_mb_prefetch - group;
+-      }
++      blk_start_plug(&plug);
+       while (nr-- > 0) {
+-              grp = ext4_get_group_info(sb, group);
+-              /* prevent expensive getblk() on groups w/ IO in progress */
+-              if (EXT4_MB_GRP_TEST(grp) || EXT4_MB_GRP_TEST_AND_SET_READ(grp))
+-                      goto next;
+-
+-              /* ignore empty groups - those will be skipped
+-               * during the scanning as well */
+-              if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
+-                      bh = ext4_read_block_bitmap_nowait(sb, group, 1);
++              struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
++                                                                NULL);
++              struct ext4_group_info *grp = ext4_get_group_info(sb, group);
++
++              /*
++               * Prefetch block groups with free blocks; but don't
++               * bother if it is marked uninitialized on disk, since
++               * it won't require I/O to read.  Also only try to
++               * prefetch once, so we avoid getblk() call, which can
++               * be expensive.
++               */
++              if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
++                  EXT4_MB_GRP_NEED_INIT(grp) &&
++                  ext4_free_group_clusters(sb, gdp) > 0 &&
++                  !(ext4_has_group_desc_csum(sb) &&
++                    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
++                      bh = ext4_read_block_bitmap_nowait(sb, group, true);
+                       if (bh && !IS_ERR(bh)) {
+-                              if (!buffer_uptodate(bh))
+-                                      ac->ac_prefetch_ios++;
++                              if (!buffer_uptodate(bh) && cnt)
++                                      (*cnt)++;
+                               brelse(bh);
+                       }
+               }
+-next:
+               if (++group >= ngroups)
+                       group = 0;
+       }
+-      ac->ac_prefetch = group;
++      blk_finish_plug(&plug);
++      return group;
+ }
+-static void
+-ext4_mb_prefetch_fini(struct ext4_allocation_context *ac)
++/*
++ * Prefetching reads the block bitmap into the buffer cache; but we
++ * need to make sure that the buddy bitmap in the page cache has been
++ * initialized.  Note that ext4_mb_init_group() will block if the I/O
++ * is not yet completed, or indeed if it was not initiated by
++ * ext4_mb_prefetch did not start the I/O.
++ *
++ * TODO: We should actually kick off the buddy bitmap setup in a work
++ * queue when the buffer I/O is completed, so that we don't block
++ * waiting for the block allocation bitmap read to finish when
++ * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
++ */
++void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
++                         unsigned int nr)
+ {
+-      struct ext4_group_info *grp;
+-      ext4_group_t group;
+-      int nr, rc;
+-
+-      /* initialize last window of prefetched groups */
+-      nr = ac->ac_prefetch_ios;
+-      if (nr > EXT4_SB(ac->ac_sb)->s_mb_prefetch)
+-              nr = EXT4_SB(ac->ac_sb)->s_mb_prefetch;
+-      group = ac->ac_prefetch;
+       while (nr-- > 0) {
+-              grp = ext4_get_group_info(ac->ac_sb, group);
+-              if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
+-                      rc = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
+-                      if (rc)
++              struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
++                                                                NULL);
++              struct ext4_group_info *grp = ext4_get_group_info(sb, group);
++
++              if (!group)
++                      group = ext4_get_groups_count(sb);
++              group--;
++              grp = ext4_get_group_info(sb, group);
++
++              if (EXT4_MB_GRP_NEED_INIT(grp) &&
++                  ext4_free_group_clusters(sb, gdp) > 0 &&
++                  !(ext4_has_group_desc_csum(sb) &&
++                    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
++                      if (ext4_mb_init_group(sb, group, GFP_NOFS))
+                               break;
+               }
+-              if (group-- == 0)
+-                      group = ext4_get_groups_count(ac->ac_sb) - 1;
+       }
+ }
+ static noinline_for_stack int
+ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
+ {
+-      ext4_group_t ngroups, group, i;
+-      int cr = -1;
++      ext4_group_t prefetch_grp = 0, ngroups, group, i;
++      int cr = -1, new_cr;
+       int err = 0, first_err = 0;
++      unsigned int nr = 0, prefetch_ios = 0;
+       struct ext4_sb_info *sbi;
+       struct super_block *sb;
+       struct ext4_buddy e4b;
+@@ -2373,13 +2659,13 @@ ext4_mb_regular_allocator(struct ext4_al
+        * We also support searching for power-of-two requests only for
+        * requests upto maximum buddy size we have constructed.
+        */
+-      if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) {
++      if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
+               /*
+                * This should tell if fe_len is exactly power of 2
+                */
+               if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
+                       ac->ac_2order = array_index_nospec(i - 1,
+-                                                         sb->s_blocksize_bits + 2);
++                                                         MB_NUM_ORDERS(sb));
+       }
+       /* if stream allocation is enabled, use global goal */
+@@ -2420,19 +2706,41 @@ repeat:
+                * from the goal value specified
+                */
+               group = ac->ac_g_ex.fe_group;
+-              ac->ac_prefetch = group;
++              ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
++              prefetch_grp = group;
+-              for (i = 0; i < ngroups; group++, i++) {
++              for (i = 0, new_cr = cr; i < ngroups; i++,
++                   ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) {
+                       int ret = 0;
++
+                       cond_resched();
++                      if (new_cr != cr) {
++                              cr = new_cr;
++                              goto repeat;
++                      }
++
+                       /*
+-                       * Artificially restricted ngroups for non-extent
+-                       * files makes group > ngroups possible on first loop.
++                       * Batch reads of the block allocation bitmaps
++                       * to get multiple READs in flight; limit
++                       * prefetching at cr=0/1, otherwise mballoc can
++                       * spend a lot of time loading imperfect groups
+                        */
+-                      if (group >= ngroups)
+-                              group = 0;
+-
+-                      ext4_mb_prefetch(ac, group);
++                      if ((prefetch_grp == group) &&
++                          (cr > 1 ||
++                           prefetch_ios < sbi->s_mb_prefetch_limit)) {
++                              unsigned int curr_ios = prefetch_ios;
++
++                              nr = sbi->s_mb_prefetch;
++                              if (ext4_has_feature_flex_bg(sb)) {
++                                      nr = (group / sbi->s_mb_prefetch) *
++                                              sbi->s_mb_prefetch;
++                                      nr = nr + sbi->s_mb_prefetch - group;
++                              }
++                              prefetch_grp = ext4_mb_prefetch(sb, group,
++                                                      nr, &prefetch_ios);
++                              if (prefetch_ios == curr_ios)
++                                      nr = 0;
++                      }
+                       /* This now checks without needing the buddy page */
+                       ret = ext4_mb_good_group_nolock(ac, group, cr);
+@@ -2503,7 +2811,13 @@ repeat:
+                       atomic_inc(&sbi->s_mb_lost_chunks);
+                       goto repeat;
+               }
++              /* Processed all groups and haven't found blocks */
++              if (sbi->s_mb_stats && i == ngroups)
++                      atomic64_inc(&sbi->s_bal_cX_failed[cr]);
+       }
++
++      if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
++              atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
+ out:
+       if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
+               err = first_err;
+@@ -2512,8 +2826,9 @@ out:
+                ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
+                ac->ac_flags, cr, err);
+-      /* use prefetched bitmaps to init buddy so that read info is not lost */
+-      ext4_mb_prefetch_fini(ac);
++      if (nr)
++              ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
++
+       return err;
+ }
+@@ -2704,6 +3019,77 @@ const struct file_operations ext4_seq_pr
+       .write   = ext4_mb_prealloc_table_proc_write,
+ };
++int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
++{
++      struct super_block *sb = (struct super_block *)seq->private;
++      struct ext4_sb_info *sbi = EXT4_SB(sb);
++
++      seq_puts(seq, "mballoc:\n");
++      if (!sbi->s_mb_stats) {
++              seq_puts(seq, "\tmb stats collection turned off.\n");
++              seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
++              return 0;
++      }
++      seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
++      seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
++
++      seq_printf(seq, "\tgroups_scanned: %u\n",  atomic_read(&sbi->s_bal_groups_scanned));
++
++      seq_puts(seq, "\tcr0_stats:\n");
++      seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0]));
++      seq_printf(seq, "\t\tgroups_considered: %llu\n",
++                 atomic64_read(&sbi->s_bal_cX_groups_considered[0]));
++      seq_printf(seq, "\t\tuseless_loops: %llu\n",
++                 atomic64_read(&sbi->s_bal_cX_failed[0]));
++      seq_printf(seq, "\t\tbad_suggestions: %u\n",
++                 atomic_read(&sbi->s_bal_cr0_bad_suggestions));
++      seq_printf(seq, "\t\tskipped_loops: %llu\n",
++                 atomic64_read(&sbi->s_bal_cX_skipped[0]));
++
++      seq_puts(seq, "\tcr1_stats:\n");
++      seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1]));
++      seq_printf(seq, "\t\tgroups_considered: %llu\n",
++                 atomic64_read(&sbi->s_bal_cX_groups_considered[1]));
++      seq_printf(seq, "\t\tuseless_loops: %llu\n",
++                 atomic64_read(&sbi->s_bal_cX_failed[1]));
++      seq_printf(seq, "\t\tbad_suggestions: %u\n",
++                 atomic_read(&sbi->s_bal_cr1_bad_suggestions));
++      seq_printf(seq, "\t\tskipped_loops: %llu\n",
++                 atomic64_read(&sbi->s_bal_cX_skipped[1]));
++
++      seq_puts(seq, "\tcr2_stats:\n");
++      seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2]));
++      seq_printf(seq, "\t\tgroups_considered: %llu\n",
++                 atomic64_read(&sbi->s_bal_cX_groups_considered[2]));
++      seq_printf(seq, "\t\tuseless_loops: %llu\n",
++                 atomic64_read(&sbi->s_bal_cX_failed[2]));
++      seq_printf(seq, "\t\tskipped_loops: %llu\n",
++                 atomic64_read(&sbi->s_bal_cX_skipped[2]));
++
++      seq_puts(seq, "\tcr3_stats:\n");
++      seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3]));
++      seq_printf(seq, "\t\tgroups_considered: %llu\n",
++                 atomic64_read(&sbi->s_bal_cX_groups_considered[3]));
++      seq_printf(seq, "\t\tuseless_loops: %llu\n",
++                 atomic64_read(&sbi->s_bal_cX_failed[3]));
++      seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned));
++      seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
++      seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
++      seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
++      seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
++
++      seq_printf(seq, "\tbuddies_generated: %u/%u\n",
++                 atomic_read(&sbi->s_mb_buddies_generated),
++                 ext4_get_groups_count(sb));
++      seq_printf(seq, "\tbuddies_time_used: %llu\n",
++                 atomic64_read(&sbi->s_mb_generation_time));
++      seq_printf(seq, "\tpreallocated: %u\n",
++                 atomic_read(&sbi->s_mb_preallocated));
++      seq_printf(seq, "\tdiscarded: %u\n",
++                 atomic_read(&sbi->s_mb_discarded));
++      return 0;
++}
++
+ static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
+ {
+       int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+@@ -2764,93 +3150,6 @@ const struct file_operations ext4_seq_mb
+       .write         = ext4_mb_last_group_write,
+ };
+-static int mb_seq_alloc_show(struct seq_file *seq, void *v)
+-{
+-      struct super_block *sb = seq->private;
+-      struct ext4_sb_info *sbi = EXT4_SB(sb);
+-
+-      seq_printf(seq, "mballoc:\n");
+-      seq_printf(seq, "\tblocks: %u\n", atomic_read(&sbi->s_bal_allocated));
+-      seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
+-      seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
+-
+-      seq_printf(seq, "\textents_scanned: %u\n",
+-                 atomic_read(&sbi->s_bal_ex_scanned));
+-      seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
+-      seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
+-      seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
+-      seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
+-
+-      seq_printf(seq, "\tuseless_c1_loops: %llu\n",
+-                 (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[0]));
+-      seq_printf(seq, "\tuseless_c2_loops: %llu\n",
+-                 (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[1]));
+-      seq_printf(seq, "\tuseless_c3_loops: %llu\n",
+-                 (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[2]));
+-      seq_printf(seq, "\tskipped_c1_loops: %llu\n",
+-                 (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[0]));
+-      seq_printf(seq, "\tskipped_c2_loops: %llu\n",
+-                 (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[1]));
+-      seq_printf(seq, "\tskipped_c3_loops: %llu\n",
+-                 (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[2]));
+-      seq_printf(seq, "\tbuddies_generated: %lu\n",
+-                 sbi->s_mb_buddies_generated);
+-      seq_printf(seq, "\tbuddies_time_used: %llu\n", sbi->s_mb_generation_time);
+-      seq_printf(seq, "\tpreallocated: %u\n",
+-                 atomic_read(&sbi->s_mb_preallocated));
+-      seq_printf(seq, "\tdiscarded: %u\n",
+-                 atomic_read(&sbi->s_mb_discarded));
+-      return 0;
+-}
+-
+-static ssize_t mb_seq_alloc_write(struct file *file,
+-                            const char __user *buf,
+-                            size_t cnt, loff_t *pos)
+-{
+-      struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file)));
+-
+-      atomic_set(&sbi->s_bal_allocated, 0),
+-      atomic_set(&sbi->s_bal_reqs, 0),
+-      atomic_set(&sbi->s_bal_success, 0);
+-
+-      atomic_set(&sbi->s_bal_ex_scanned, 0),
+-      atomic_set(&sbi->s_bal_goals, 0),
+-      atomic_set(&sbi->s_bal_2orders, 0),
+-      atomic_set(&sbi->s_bal_breaks, 0),
+-      atomic_set(&sbi->s_mb_lost_chunks, 0);
+-
+-      atomic64_set(&sbi->s_bal_cX_failed[0], 0),
+-      atomic64_set(&sbi->s_bal_cX_failed[1], 0),
+-      atomic64_set(&sbi->s_bal_cX_failed[2], 0);
+-
+-      atomic64_set(&sbi->s_bal_cX_skipped[0], 0),
+-      atomic64_set(&sbi->s_bal_cX_skipped[1], 0),
+-      atomic64_set(&sbi->s_bal_cX_skipped[2], 0);
+-
+-
+-      sbi->s_mb_buddies_generated = 0;
+-      sbi->s_mb_generation_time = 0;
+-
+-      atomic_set(&sbi->s_mb_preallocated, 0),
+-      atomic_set(&sbi->s_mb_discarded, 0);
+-
+-      return cnt;
+-}
+-
+-static int mb_seq_alloc_open(struct inode *inode, struct file *file)
+-{
+-      return single_open(file, mb_seq_alloc_show, PDE_DATA(inode));
+-}
+-
+-const struct file_operations ext4_mb_seq_alloc_fops = {
+-      .owner          = THIS_MODULE,
+-      .open           = mb_seq_alloc_open,
+-      .read           = seq_read,
+-      .llseek         = seq_lseek,
+-      .release        = single_release,
+-      .write          = mb_seq_alloc_write,
+-};
+-
+ int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v)
+ {
+       struct ext4_sb_info *sbi = EXT4_SB(m->private);
+@@ -2952,7 +3251,11 @@ int ext4_mb_add_groupinfo(struct super_b
+       INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+       init_rwsem(&meta_group_info[i]->alloc_sem);
+       meta_group_info[i]->bb_free_root = RB_ROOT;
++      INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
++      INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node);
+       meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
++      meta_group_info[i]->bb_avg_fragment_size_order = -1;  /* uninit */
++      meta_group_info[i]->bb_group = group;
+       mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
+       return 0;
+@@ -3008,6 +3311,26 @@ static int ext4_mb_init_backend(struct s
+                       goto err_freebuddy;
+       }
++      if (ext4_has_feature_flex_bg(sb)) {
++              /* a single flex group is supposed to be read by a single IO */
++              sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
++              sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
++      } else {
++              sbi->s_mb_prefetch = 32;
++      }
++      if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
++              sbi->s_mb_prefetch = ext4_get_groups_count(sb);
++      /* now many real IOs to prefetch within a single allocation at cr=0
++       * given cr=0 is an CPU-related optimization we shouldn't try to
++       * load too many groups, at some point we should start to use what
++       * we've got in memory.
++       * with an average random access time 5ms, it'd take a second to get
++       * 200 groups (* N with flex_bg), so let's make this limit 4
++       */
++      sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
++      if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
++              sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
++
+       return 0;
+ err_freebuddy:
+@@ -3086,7 +3409,7 @@ int ext4_mb_init(struct super_block *sb)
+       unsigned max;
+       int ret;
+-      i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
++      i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets);
+       sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
+       if (sbi->s_mb_offsets == NULL) {
+@@ -3094,7 +3417,7 @@ int ext4_mb_init(struct super_block *sb)
+               goto out;
+       }
+-      i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
++      i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs);
+       sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
+       if (sbi->s_mb_maxs == NULL) {
+               ret = -ENOMEM;
+@@ -3120,16 +3443,53 @@ int ext4_mb_init(struct super_block *sb)
+               offset_incr = offset_incr >> 1;
+               max = max >> 1;
+               i++;
+-      } while (i <= sb->s_blocksize_bits + 1);
++      } while (i < MB_NUM_ORDERS(sb));
++
++      sbi->s_mb_avg_fragment_size =
++              kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
++                      GFP_KERNEL);
++      if (!sbi->s_mb_avg_fragment_size) {
++              ret = -ENOMEM;
++              goto out;
++      }
++      sbi->s_mb_avg_fragment_size_locks =
++              kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
++                      GFP_KERNEL);
++      if (!sbi->s_mb_avg_fragment_size_locks) {
++              ret = -ENOMEM;
++              goto out;
++      }
++      for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
++              INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]);
++              rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]);
++      }
++      sbi->s_mb_largest_free_orders =
++              kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
++                      GFP_KERNEL);
++      if (!sbi->s_mb_largest_free_orders) {
++              ret = -ENOMEM;
++              goto out;
++      }
++      sbi->s_mb_largest_free_orders_locks =
++              kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
++                      GFP_KERNEL);
++      if (!sbi->s_mb_largest_free_orders_locks) {
++              ret = -ENOMEM;
++              goto out;
++      }
++      for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
++              INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
++              rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
++      }
+       spin_lock_init(&sbi->s_md_lock);
+-      spin_lock_init(&sbi->s_bal_lock);
+       sbi->s_mb_free_pending = 0;
+       INIT_LIST_HEAD(&sbi->s_freed_data_list);
+       sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
+       sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
+       sbi->s_mb_stats = MB_DEFAULT_STATS;
++      sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
+       sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+       sbi->s_mb_c1_blocks = THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C1_THRESHOLD);
+       sbi->s_mb_c2_blocks = THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C2_THRESHOLD);
+@@ -3214,6 +3574,10 @@ int ext4_mb_init(struct super_block *sb)
+               spin_lock_init(&lg->lg_prealloc_lock);
+       }
++      if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev)))
++              sbi->s_mb_max_linear_groups = 0;
++      else
++              sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT;
+       /* init file for buddy data */
+       ret = ext4_mb_init_backend(sb);
+       if (ret != 0)
+@@ -3225,6 +3589,10 @@ out_free_locality_groups:
+       free_percpu(sbi->s_locality_groups);
+       sbi->s_locality_groups = NULL;
+ out:
++      kfree(sbi->s_mb_avg_fragment_size);
++      kfree(sbi->s_mb_avg_fragment_size_locks);
++      kfree(sbi->s_mb_largest_free_orders);
++      kfree(sbi->s_mb_largest_free_orders_locks);
+       kfree(sbi->s_mb_prealloc_table);
+       kfree(sbi->s_mb_offsets);
+       sbi->s_mb_offsets = NULL;
+@@ -3282,6 +3650,10 @@ int ext4_mb_release(struct super_block *
+               kvfree(group_info);
+               rcu_read_unlock();
+       }
++      kfree(sbi->s_mb_avg_fragment_size);
++      kfree(sbi->s_mb_avg_fragment_size_locks);
++      kfree(sbi->s_mb_largest_free_orders);
++      kfree(sbi->s_mb_largest_free_orders_locks);
+       kfree(sbi->s_mb_offsets);
+       kfree(sbi->s_mb_maxs);
+       iput(sbi->s_buddy_cache);
+@@ -3302,17 +3674,18 @@ int ext4_mb_release(struct super_block *
+                               (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[1]),
+                               (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[2]));
+               ext4_msg(sb, KERN_INFO,
+-                    "mballoc: %u extents scanned, %u goal hits, "
++                    "mballoc: %u extents scanned, %u groups scanned, %u goal hits, "
+                               "%u 2^N hits, %u breaks, %u lost",
+                               atomic_read(&sbi->s_bal_ex_scanned),
++                              atomic_read(&sbi->s_bal_groups_scanned),
+                               atomic_read(&sbi->s_bal_goals),
+                               atomic_read(&sbi->s_bal_2orders),
+                               atomic_read(&sbi->s_bal_breaks),
+                               atomic_read(&sbi->s_mb_lost_chunks));
+               ext4_msg(sb, KERN_INFO,
+-                     "mballoc: %lu generated and it took %Lu",
+-                              sbi->s_mb_buddies_generated,
+-                              sbi->s_mb_generation_time);
++                     "mballoc: %u generated and it took %llu",
++                              atomic_read(&sbi->s_mb_buddies_generated),
++                              atomic64_read(&sbi->s_mb_generation_time));
+               ext4_msg(sb, KERN_INFO,
+                      "mballoc: %u preallocated, %u discarded",
+                               atomic_read(&sbi->s_mb_preallocated),
+@@ -3832,12 +4205,13 @@ static void ext4_mb_collect_stats(struct
+ {
+       struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+-      if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
++      if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
+               atomic_inc(&sbi->s_bal_reqs);
+               atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
+               if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
+                       atomic_inc(&sbi->s_bal_success);
+               atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
++              atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
+               if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
+                               ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
+                       atomic_inc(&sbi->s_bal_goals);
+@@ -4206,16 +4580,22 @@ static void ext4_mb_mark_pa_deleted(stru
+       }
+ }
+-static void ext4_mb_pa_callback(struct rcu_head *head)
++static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa)
+ {
+-      struct ext4_prealloc_space *pa;
+-      pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+-
++      BUG_ON(!pa);
+       BUG_ON(atomic_read(&pa->pa_count));
+       BUG_ON(pa->pa_deleted == 0);
+       kmem_cache_free(ext4_pspace_cachep, pa);
+ }
++static void ext4_mb_pa_callback(struct rcu_head *head)
++{
++      struct ext4_prealloc_space *pa;
++
++      pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
++      ext4_mb_pa_free(pa);
++}
++
+ /*
+  * drops a reference to preallocated space descriptor
+  * if this was the last reference and the space is consumed
+@@ -4741,14 +5121,20 @@ static int ext4_mb_pa_alloc(struct ext4_
+       return 0;
+ }
+-static void ext4_mb_pa_free(struct ext4_allocation_context *ac)
++static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac)
+ {
+       struct ext4_prealloc_space *pa = ac->ac_pa;
+       BUG_ON(!pa);
+       ac->ac_pa = NULL;
+       WARN_ON(!atomic_dec_and_test(&pa->pa_count));
+-      kmem_cache_free(ext4_pspace_cachep, pa);
++      /*
++       * This function is only called on error, or when the length of the
++       * found blocks is less than the requested length; hence the PA has not
++       * been added to grp->bb_prealloc_list and we don't need to lock it.
++       */
++      pa->pa_deleted = 1;
++      ext4_mb_pa_free(pa);
+ }
+ #ifdef CONFIG_EXT4_DEBUG
+@@ -4837,6 +5223,7 @@ static void ext4_mb_group_or_file(struct
+       struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+       int bsbits = ac->ac_sb->s_blocksize_bits;
+       loff_t size, isize;
++      bool inode_pa_eligible, group_pa_eligible;
+       if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+               return;
+@@ -4844,26 +5231,27 @@ static void ext4_mb_group_or_file(struct
+       if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+               return;
++      group_pa_eligible = sbi->s_mb_group_prealloc > 0;
++      inode_pa_eligible = true;
+       size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
+       isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
+               >> bsbits;
++      /* No point in using inode preallocation for closed files */
+       if ((size == isize) && !ext4_fs_is_busy(sbi) &&
+-          !inode_is_open_for_write(ac->ac_inode)) {
+-              ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
+-              return;
+-      }
+-
+-      if (sbi->s_mb_group_prealloc <= 0) {
+-              ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+-              return;
+-      }
++          !inode_is_open_for_write(ac->ac_inode))
++              inode_pa_eligible = false;
+-      /* don't use group allocation for large files */
+       size = max(size, isize);
+-      if ((ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) ||
+-          (size >= sbi->s_mb_large_req)) {
+-              ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
++      /* Don't use group allocation for large files */
++      if (size > sbi->s_mb_stream_request)
++              group_pa_eligible = false;
++
++      if (!group_pa_eligible) {
++              if (inode_pa_eligible)
++                      ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
++              else
++                      ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
+               return;
+       }
+@@ -5211,6 +5599,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t
+       ext4_fsblk_t block = 0;
+       unsigned int inquota = 0;
+       unsigned int reserv_clstrs = 0;
++      int retries = 0;
+       u64 seq;
+       might_sleep();
+@@ -5293,13 +5682,13 @@ repeat:
+                * So we have to free this pa here itself.
+                */
+               if (*errp) {
+-                      ext4_mb_pa_free(ac);
++                      ext4_mb_pa_put_free(ac);
+                       ext4_discard_allocated_blocks(ac);
+                       goto errout;
+               }
+               if (ac->ac_status == AC_STATUS_FOUND &&
+                       ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
+-                      ext4_mb_pa_free(ac);
++                      ext4_mb_pa_put_free(ac);
+       }
+       if (likely(ac->ac_status == AC_STATUS_FOUND)) {
+               *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
+@@ -5311,13 +5700,14 @@ repeat:
+                       ar->len = ac->ac_b_ex.fe_len;
+               }
+       } else {
+-              if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
++              if (++retries < 3 &&
++                  ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
+                       goto repeat;
+               /*
+                * If block allocation fails then the pa allocated above
+                * needs to be freed here itself.
+                */
+-              ext4_mb_pa_free(ac);
++              ext4_mb_pa_put_free(ac);
+               *errp = -ENOSPC;
+       }
+@@ -5988,6 +6378,7 @@ ext4_trim_all_free(struct super_block *s
+               ret = count;
+               EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
+       }
++
+ out:
+       ext4_unlock_group(sb, group);
+       ext4_mb_unload_buddy(&e4b);
+Index: linux-stage/fs/ext4/sysfs.c
+===================================================================
+--- linux-stage.orig/fs/ext4/sysfs.c
++++ linux-stage/fs/ext4/sysfs.c
+@@ -223,6 +223,7 @@ EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_s
+ EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req);
+ EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+ EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc);
++EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups);
+ EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
+ EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
+ EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
+@@ -262,6 +263,7 @@ static struct attribute *ext4_attrs[] =
+       ATTR_LIST(mb_large_req),
+       ATTR_LIST(mb_group_prealloc),
+       ATTR_LIST(mb_max_inode_prealloc),
++      ATTR_LIST(mb_max_linear_groups),
+       ATTR_LIST(max_writeback_mb_bump),
+       ATTR_LIST(extent_max_zeroout_kb),
+       ATTR_LIST(trigger_fs_error),
+@@ -475,14 +477,14 @@ int ext4_register_sysfs(struct super_blo
+                               sb);
+               proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc,
+                               &ext4_mb_seq_groups_ops, sb);
++              proc_create_single_data("mb_stats", 0444, sbi->s_proc,
++                              ext4_seq_mb_stats_show, sb);
+               proc_create_data("prealloc_table", S_IRUGO, sbi->s_proc,
+                               &ext4_seq_prealloc_table_fops, sb);
+               proc_create_data("mb_last_group", S_IRUGO, sbi->s_proc,
+                               &ext4_seq_mb_last_group_fops, sb);
+               proc_create_single_data("mb_last_start", S_IRUGO, sbi->s_proc,
+                               ext4_mb_seq_last_start_seq_show, sb);
+-              proc_create_data("mb_alloc_stats", S_IFREG | S_IRUGO | S_IWUSR,
+-                               sbi->s_proc, &ext4_mb_seq_alloc_fops, sb);
+       }
+       return 0;
+ }
+Index: linux-stage/fs/ext4/mballoc.h
+===================================================================
+--- linux-stage.orig/fs/ext4/mballoc.h
++++ linux-stage/fs/ext4/mballoc.h
+@@ -82,6 +82,23 @@
+  */
+ #define MB_DEFAULT_MAX_INODE_PREALLOC 512
++/*
++ * Number of groups to search linearly before performing group scanning
++ * optimization.
++ */
++#define MB_DEFAULT_LINEAR_LIMIT               4
++
++/*
++ * Minimum number of groups that should be present in the file system to perform
++ * group scanning optimizations.
++ */
++#define MB_DEFAULT_LINEAR_SCAN_THRESHOLD      16
++
++/*
++ * Number of valid buddy orders
++ */
++#define MB_NUM_ORDERS(sb)             ((sb)->s_blocksize_bits + 2)
++
+ struct ext4_free_data {
+       /* this links the free block information from sb_info */
+       struct list_head                efd_list;
+@@ -166,11 +183,13 @@ struct ext4_allocation_context {
+       /* copy of the best found extent taken before preallocation efforts */
+       struct ext4_free_extent ac_f_ex;
++      __u32 ac_groups_considered;
++      __u32 ac_flags;         /* allocation hints */
+       __u16 ac_groups_scanned;
++      __u16 ac_groups_linear_remaining;
+       __u16 ac_found;
+       __u16 ac_tail;
+       __u16 ac_buddy;
+-      __u16 ac_flags;         /* allocation hints */
+       __u8 ac_status;
+       __u8 ac_criteria;
+       __u8 ac_2order;         /* if request is to allocate 2^N blocks and
+Index: linux-stage/fs/ext4/balloc.c
+===================================================================
+--- linux-stage.orig/fs/ext4/balloc.c
++++ linux-stage/fs/ext4/balloc.c
+@@ -441,6 +441,12 @@ ext4_read_block_bitmap_nowait(struct sup
+               return ERR_PTR(-ENOMEM);
+       }
++      if (ignore_locked && buffer_locked(bh)) {
++              /* buffer under IO already, return if called for prefetching */
++              put_bh(bh);
++              return NULL;
++      }
++
+       if (bitmap_uptodate(bh))
+               goto verify;
+@@ -498,7 +504,8 @@ ext4_read_block_bitmap_nowait(struct sup
+       trace_ext4_read_block_bitmap_load(sb, block_group);
+       bh->b_end_io = ext4_end_bitmap_read;
+       get_bh(bh);
+-      submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
++      submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO |
++                (ignore_locked ? REQ_RAHEAD : 0), bh);
+       return bh;
+ verify:
+       err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
+Index: linux-stage/fs/ext4/super.c
+===================================================================
+--- linux-stage.orig/fs/ext4/super.c
++++ linux-stage/fs/ext4/super.c
+@@ -1526,6 +1526,7 @@ enum {
+       Opt_dioread_nolock, Opt_dioread_lock,
+       Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
+       Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
++      Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
+ };
+ static const match_table_t tokens = {
+@@ -1616,6 +1617,9 @@ static const match_table_t tokens = {
+       {Opt_test_dummy_encryption, "test_dummy_encryption"},
+       {Opt_nombcache, "nombcache"},
+       {Opt_nombcache, "no_mbcache"},  /* for backward compatibility */
++      {Opt_removed, "prefetch_block_bitmaps"},
++      {Opt_no_prefetch_block_bitmaps, "no_prefetch_block_bitmaps"},
++      {Opt_mb_optimize_scan, "mb_optimize_scan=%d"},
+       {Opt_removed, "check=none"},    /* mount option from ext2/3 */
+       {Opt_removed, "nocheck"},       /* mount option from ext2/3 */
+       {Opt_removed, "reservation"},   /* mount option from ext2/3 */
+@@ -1648,6 +1652,8 @@ static ext4_fsblk_t get_sb_block(void **
+ }
+ #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
++#define DEFAULT_MB_OPTIMIZE_SCAN      (-1)
++
+ static const char deprecated_msg[] =
+       "Mount option \"%s\" will be removed by %s\n"
+       "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
+@@ -1835,12 +1841,16 @@ static const struct mount_opts {
+       {Opt_max_dir_size_kb, 0, MOPT_GTE0},
+       {Opt_test_dummy_encryption, 0, MOPT_GTE0},
+       {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
++      {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS,
++       MOPT_SET},
++      {Opt_mb_optimize_scan, EXT4_MOUNT2_MB_OPTIMIZE_SCAN, MOPT_GTE0},
+       {Opt_err, 0, 0}
+ };
+ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
+                           substring_t *args, unsigned long *journal_devnum,
+-                          unsigned int *journal_ioprio, int is_remount)
++                          unsigned int *journal_ioprio,
++                          int *mb_optimize_scan, int is_remount)
+ {
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       const struct mount_opts *m;
+@@ -2129,6 +2139,13 @@ static int handle_mount_opt(struct super
+               sbi->s_mount_opt |= m->mount_opt;
+       } else if (token == Opt_data_err_ignore) {
+               sbi->s_mount_opt &= ~m->mount_opt;
++      } else if (token == Opt_mb_optimize_scan) {
++              if (arg != 0 && arg != 1) {
++                      ext4_msg(sb, KERN_WARNING,
++                               "mb_optimize_scan should be set to 0 or 1.");
++                      return -1;
++              }
++              *mb_optimize_scan = arg;
+       } else {
+               if (!args->from)
+                       arg = 1;
+@@ -2151,6 +2168,7 @@ static int handle_mount_opt(struct super
+ static int parse_options(char *options, struct super_block *sb,
+                        unsigned long *journal_devnum,
+                        unsigned int *journal_ioprio,
++                       int *mb_optimize_scan,
+                        int is_remount)
+ {
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+@@ -2171,7 +2189,8 @@ static int parse_options(char *options,
+               args[0].to = args[0].from = NULL;
+               token = match_token(p, tokens, args);
+               if (handle_mount_opt(sb, p, token, args, journal_devnum,
+-                                   journal_ioprio, is_remount) < 0)
++                                   journal_ioprio, mb_optimize_scan,
++                                   is_remount) < 0)
+                       return 0;
+       }
+ #ifdef CONFIG_QUOTA
+@@ -2359,6 +2378,14 @@ static int _ext4_show_options(struct seq
+               SEQ_OPTS_PUTS("dax=inode");
+       }
++      if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
++                      !test_opt2(sb, MB_OPTIMIZE_SCAN)) {
++              SEQ_OPTS_PUTS("mb_optimize_scan=0");
++      } else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
++                      test_opt2(sb, MB_OPTIMIZE_SCAN)) {
++              SEQ_OPTS_PUTS("mb_optimize_scan=1");
++      }
++
+       ext4_show_quota_options(seq, sb);
+       return 0;
+ }
+@@ -3140,15 +3167,34 @@ static void print_daily_error_info(struc
+ static int ext4_run_li_request(struct ext4_li_request *elr)
+ {
+       struct ext4_group_desc *gdp = NULL;
+-      ext4_group_t group, ngroups;
+-      struct super_block *sb;
++      struct super_block *sb = elr->lr_super;
++      ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
++      ext4_group_t group = elr->lr_next_group;
++      unsigned int prefetch_ios = 0;
+       int ret = 0;
+       u64 start_time;
+-      sb = elr->lr_super;
+-      ngroups = EXT4_SB(sb)->s_groups_count;
++      if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
++              elr->lr_next_group = ext4_mb_prefetch(sb, group,
++                              EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios);
++              if (prefetch_ios)
++                      ext4_mb_prefetch_fini(sb, elr->lr_next_group,
++                                            prefetch_ios);
++              trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group,
++                                          prefetch_ios);
++              if (group >= elr->lr_next_group) {
++                      ret = 1;
++                      if (elr->lr_first_not_zeroed != ngroups &&
++                          !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) {
++                              elr->lr_next_group = elr->lr_first_not_zeroed;
++                              elr->lr_mode = EXT4_LI_MODE_ITABLE;
++                              ret = 0;
++                      }
++              }
++              return ret;
++      }
+-      for (group = elr->lr_next_group; group < ngroups; group++) {
++      for (; group < ngroups; group++) {
+               gdp = ext4_get_group_desc(sb, group, NULL);
+               if (!gdp) {
+                       ret = 1;
+@@ -3166,9 +3212,10 @@ static int ext4_run_li_request(struct ex
+               start_time = ktime_get_real_ns();
+               ret = ext4_init_inode_table(sb, group,
+                                           elr->lr_timeout ? 0 : 1);
++              trace_ext4_lazy_itable_init(sb, group);
+               if (elr->lr_timeout == 0) {
+                       elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
+-                              elr->lr_sbi->s_li_wait_mult);
++                              EXT4_SB(elr->lr_super)->s_li_wait_mult);
+               }
+               elr->lr_next_sched = jiffies + elr->lr_timeout;
+               elr->lr_next_group = group + 1;
+@@ -3182,15 +3229,11 @@ static int ext4_run_li_request(struct ex
+  */
+ static void ext4_remove_li_request(struct ext4_li_request *elr)
+ {
+-      struct ext4_sb_info *sbi;
+-
+       if (!elr)
+               return;
+-      sbi = elr->lr_sbi;
+-
+       list_del(&elr->lr_request);
+-      sbi->s_li_request = NULL;
++      EXT4_SB(elr->lr_super)->s_li_request = NULL;
+       kfree(elr);
+ }
+@@ -3399,7 +3442,6 @@ static int ext4_li_info_new(void)
+ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
+                                           ext4_group_t start)
+ {
+-      struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_li_request *elr;
+       elr = kzalloc(sizeof(*elr), GFP_KERNEL);
+@@ -3407,8 +3449,13 @@ static struct ext4_li_request *ext4_li_r
+               return NULL;
+       elr->lr_super = sb;
+-      elr->lr_sbi = sbi;
+-      elr->lr_next_group = start;
++      elr->lr_first_not_zeroed = start;
++      if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) {
++              elr->lr_mode = EXT4_LI_MODE_ITABLE;
++              elr->lr_next_group = start;
++      } else {
++              elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP;
++      }
+       /*
+        * Randomize first schedule time of the request to
+@@ -3438,8 +3485,9 @@ int ext4_register_li_request(struct supe
+               goto out;
+       }
+-      if (first_not_zeroed == ngroups || sb_rdonly(sb) ||
+-          !test_opt(sb, INIT_INODE_TABLE))
++      if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
++          (first_not_zeroed == ngroups || sb_rdonly(sb) ||
++           !test_opt(sb, INIT_INODE_TABLE)))
+               goto out;
+       elr = ext4_li_request_new(sb, first_not_zeroed);
+@@ -3725,6 +3773,7 @@ static int ext4_fill_super(struct super_
+       __u64 blocks_count;
+       int err = 0;
+       unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
++      int mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN;
+       ext4_group_t first_not_zeroed;
+       if ((data && !orig_data) || !sbi)
+@@ -3958,7 +4007,7 @@ static int ext4_fill_super(struct super_
+               if (!s_mount_opts)
+                       goto failed_mount;
+               if (!parse_options(s_mount_opts, sb, &journal_devnum,
+-                                 &journal_ioprio, 0)) {
++                                 &journal_ioprio, &mb_optimize_scan, 0)) {
+                       ext4_msg(sb, KERN_WARNING,
+                                "failed to parse options in superblock: %s",
+                                s_mount_opts);
+@@ -3967,7 +4016,7 @@ static int ext4_fill_super(struct super_
+       }
+       sbi->s_def_mount_opt = sbi->s_mount_opt;
+       if (!parse_options((char *) data, sb, &journal_devnum,
+-                         &journal_ioprio, 0))
++                         &journal_ioprio, &mb_optimize_scan, 0))
+               goto failed_mount;
+       if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+@@ -4628,6 +4677,19 @@ no_journal:
+       }
+       ext4_ext_init(sb);
++
++      /*
++       * Enable optimize_scan if the number of groups is >= threshold. This can be
++       * turned off by passing "mb_optimize_scan=0". This can also be
++       * turned on forcefully by passing "mb_optimize_scan=1".
++       */
++      if (mb_optimize_scan == 1)
++              set_opt2(sb, MB_OPTIMIZE_SCAN);
++      else if (mb_optimize_scan == 0)
++              clear_opt2(sb, MB_OPTIMIZE_SCAN);
++      else if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
++              set_opt2(sb, MB_OPTIMIZE_SCAN);
++
+       err = ext4_mb_init(sb);
+       if (err) {
+               ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
+@@ -5393,6 +5455,7 @@ static int ext4_remount(struct super_blo
+       int enable_quota = 0;
+       ext4_group_t g;
+       unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
++      int mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN;
+       int err = 0;
+ #ifdef CONFIG_QUOTA
+       int i, j;
+@@ -5439,7 +5502,8 @@ static int ext4_remount(struct super_blo
+       vfs_flags = SB_LAZYTIME | SB_I_VERSION;
+       sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags);
+-      if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
++      if (!parse_options(data, sb, NULL, &journal_ioprio, &mb_optimize_scan,
++                         1)) {
+               err = -EINVAL;
+               goto restore_opts;
+       }
+Index: linux-stage/include/trace/events/ext4.h
+===================================================================
+--- linux-stage.orig/include/trace/events/ext4.h
++++ linux-stage/include/trace/events/ext4.h
+@@ -2712,6 +2712,50 @@ TRACE_EVENT(ext4_error,
+                 __entry->function, __entry->line)
+ );
++TRACE_EVENT(ext4_prefetch_bitmaps,
++          TP_PROTO(struct super_block *sb, ext4_group_t group,
++                   ext4_group_t next, unsigned int prefetch_ios),
++
++      TP_ARGS(sb, group, next, prefetch_ios),
++
++      TP_STRUCT__entry(
++              __field(        dev_t,  dev                     )
++              __field(        __u32,  group                   )
++              __field(        __u32,  next                    )
++              __field(        __u32,  ios                     )
++      ),
++
++      TP_fast_assign(
++              __entry->dev    = sb->s_dev;
++              __entry->group  = group;
++              __entry->next   = next;
++              __entry->ios    = prefetch_ios;
++      ),
++
++      TP_printk("dev %d,%d group %u next %u ios %u",
++                MAJOR(__entry->dev), MINOR(__entry->dev),
++                __entry->group, __entry->next, __entry->ios)
++);
++
++TRACE_EVENT(ext4_lazy_itable_init,
++          TP_PROTO(struct super_block *sb, ext4_group_t group),
++
++      TP_ARGS(sb, group),
++
++      TP_STRUCT__entry(
++              __field(        dev_t,  dev                     )
++              __field(        __u32,  group                   )
++      ),
++
++      TP_fast_assign(
++              __entry->dev    = sb->s_dev;
++              __entry->group  = group;
++      ),
++
++      TP_printk("dev %d,%d group %u",
++                MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group)
++);
++
+ #endif /* _TRACE_EXT4_H */
+ /* This part must be outside protection */
diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series
index 5613c88..965c819 100644
--- a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series
+++ b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series
@@ -36,3 +36,4 @@ rhel8.7/ext4-filename-encode.patch
 rhel8/ext4-old_ea_inodes_handling_fix.patch
 rhel8.4/ext4-optimize-find_delayed_extent.patch
 rhel8/ext4-limit-per-inode-preallocation-list.patch
+rhel8/ext4-mballoc-improve.patch