From 8da59fc988f0cebcac10e8ef1faab1e4c913de03 Mon Sep 17 00:00:00 2001
From: Bobi Jam
Date: Fri, 21 Jul 2023 15:34:20 +0800
Subject: [PATCH] LU-14438 ldiskfs: backport ldiskfs mballoc patches

This contains the following kernel patches:

a078dff87013 ("ext4: fixup possible uninitialized variable access in
 ext4_mb_choose_next_group_cr1()")
80fa46d6b9e7 ("ext4: limit the number of retries after discarding
 preallocations blocks")
820897258ad3 ("ext4: Refactor code related to freeing PAs")
cf5e2ca6c990 ("ext4: mballoc: refactor
 ext4_mb_discard_preallocations()")
83e80a6e3543 ("ext4: use buckets for cr 1 block scan instead of
 rbtree")
a9f2a2931d0e ("ext4: use locality group preallocation for small
 closed files")
1940265ede66 ("ext4: avoid unnecessary spreading of allocations among
 groups")
4fca50d440cc ("ext4: make mballoc try target group first even with
 mb_optimize_scan")
3fa5d23e68a3 ("ext4: reflect mb_optimize_scan value in options file")
077d0c2c78df ("ext4: make mb_optimize_scan performance mount option
 work with extents")
196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning")
21175ca434c5 ("ext4: make prefetch_block_bitmaps default")
3d392b2676bf ("ext4: add prefetch_block_bitmaps mount option")
cfd732377221 ("ext4: add prefetching for block allocation bitmaps")
4b68f6df1059 ("ext4: add MB_NUM_ORDERS macro")
dddcd2f9ebde ("ext4: optimize the implementation of ext4_mb_good_group()")
a6c75eaf1103 ("ext4: add mballoc stats proc file")
67d251860461 ("ext4: drop s_mb_bal_lock and convert protected fields
 to atomic")

Signed-off-by: Bobi Jam
Change-Id: I079dfb74bd743894934484803cedb683073e4d94
---
 .../patches/rhel8/ext4-mballoc-improve.patch | 1825 ++++++++++++++++++++
 .../series/ldiskfs-4.18-rhel8.8.series       |    1 +
 2 files changed, 1826 insertions(+)
 create mode 100644 ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-improve.patch

diff --git a/ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-improve.patch b/ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-improve.patch
new file mode 100644
index 0000000..a00522a
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-improve.patch
@@ -0,0 +1,1825 @@
+This contains the following kernel patches:
+
+a078dff87013 ("ext4: fixup possible uninitialized variable access in
+ ext4_mb_choose_next_group_cr1()")
+80fa46d6b9e7 ("ext4: limit the number of retries after discarding
+ preallocations blocks")
+820897258ad3 ("ext4: Refactor code related to freeing PAs")
+cf5e2ca6c990 ("ext4: mballoc: refactor
+ ext4_mb_discard_preallocations()")
+83e80a6e3543 ("ext4: use buckets for cr 1 block scan instead of
+ rbtree")
+a9f2a2931d0e ("ext4: use locality group preallocation for small
+ closed files")
+1940265ede66 ("ext4: avoid unnecessary spreading of allocations among
+ groups")
+4fca50d440cc ("ext4: make mballoc try target group first even with
+ mb_optimize_scan")
+3fa5d23e68a3 ("ext4: reflect mb_optimize_scan value in options file")
+077d0c2c78df ("ext4: make mb_optimize_scan performance mount option
+ work with extents")
+196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning")
+21175ca434c5 ("ext4: make prefetch_block_bitmaps default")
+3d392b2676bf ("ext4: add prefetch_block_bitmaps mount option")
+cfd732377221 ("ext4: add prefetching for block allocation bitmaps")
+4b68f6df1059 ("ext4: add MB_NUM_ORDERS macro")
+dddcd2f9ebde ("ext4: optimize the implementation of ext4_mb_good_group()")
+a6c75eaf1103 ("ext4: add mballoc stats proc file")
+67d251860461 ("ext4: drop s_mb_bal_lock and convert protected fields
+ to atomic")
+
+Index:
linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h ++++ linux-stage/fs/ext4/ext4.h +@@ -151,7 +151,10 @@ enum SHIFT_DIRECTION { + #define EXT4_MB_USE_RESERVED 0x2000 + /* Do strict check for free blocks while retrying block allocation */ + #define EXT4_MB_STRICT_CHECK 0x4000 +- ++/* Large fragment size list lookup succeeded at least once for cr = 0 */ ++#define EXT4_MB_CR0_OPTIMIZED 0x8000 ++/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ ++#define EXT4_MB_CR1_OPTIMIZED 0x00010000 + struct ext4_allocation_request { + /* target inode for block we're allocating */ + struct inode *inode; +@@ -1166,6 +1169,7 @@ struct ext4_inode_info { + #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ + #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ + #define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ ++#define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000 + #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ + #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ + #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +@@ -1192,7 +1196,9 @@ struct ext4_inode_info { + + #define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */ + #define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */ +- ++#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group ++ * scanning in mballoc ++ */ + + #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ + ~EXT4_MOUNT_##opt +@@ -1479,9 +1485,15 @@ struct ext4_sb_info { + unsigned int s_mb_free_pending; + struct list_head s_freed_data_list; /* List of blocks to be freed + after commit completed */ ++ struct list_head *s_mb_avg_fragment_size; ++ rwlock_t *s_mb_avg_fragment_size_locks; ++ struct list_head *s_mb_largest_free_orders; ++ rwlock_t *s_mb_largest_free_orders_locks; + + /* tunables */ + unsigned long s_stripe; ++ unsigned int s_mb_max_linear_groups; ++ unsigned int s_mb_stream_request; + unsigned long s_mb_small_req; + unsigned long s_mb_large_req; + unsigned int s_mb_max_to_scan; +@@ -1506,15 +1518,18 @@ struct ext4_sb_info { + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ ++ atomic_t s_bal_groups_scanned; /* number of groups scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ +- /* cX loop didn't find blocks */ +- atomic64_t s_bal_cX_failed[3]; ++ atomic_t s_bal_cr0_bad_suggestions; ++ atomic_t s_bal_cr1_bad_suggestions; ++ atomic64_t s_bal_cX_groups_considered[4]; ++ atomic64_t s_bal_cX_hits[4]; ++ atomic64_t s_bal_cX_failed[4]; /* cX loop didn't find blocks */ + atomic64_t s_bal_cX_skipped[3]; +- spinlock_t s_bal_lock; +- unsigned long s_mb_buddies_generated; +- unsigned long long s_mb_generation_time; ++ atomic_t s_mb_buddies_generated; /* number of buddies generated */ ++ atomic64_t s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; +@@ -2391,9 +2406,15 @@ struct ext4_lazy_init { + struct mutex li_list_mtx; + }; + ++enum ext4_li_mode { ++ EXT4_LI_MODE_PREFETCH_BBITMAP, ++ EXT4_LI_MODE_ITABLE, ++}; ++ + struct ext4_li_request { + struct super_block *lr_super; +- struct ext4_sb_info *lr_sbi; ++ enum ext4_li_mode lr_mode; ++ ext4_group_t 
lr_first_not_zeroed; + ext4_group_t lr_next_group; + struct list_head lr_request; + unsigned long lr_next_sched; +@@ -2685,6 +2706,7 @@ extern const struct file_operations ext4 + extern int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v); + extern long ext4_mb_stats; + extern long ext4_mb_max_to_scan; ++extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); + extern int ext4_mb_init(struct super_block *); + extern int ext4_mb_release(struct super_block *); + extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, +@@ -2693,6 +2715,12 @@ extern int ext4_mb_reserve_blocks(struct + extern void ext4_discard_preallocations(struct inode *); + extern int __init ext4_init_mballoc(void); + extern void ext4_exit_mballoc(void); ++extern ext4_group_t ext4_mb_prefetch(struct super_block *sb, ++ ext4_group_t group, ++ unsigned int nr, int *cnt); ++extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, ++ unsigned int nr); ++ + extern void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags); +@@ -3178,13 +3206,18 @@ struct ext4_group_info { + ext4_grpblk_t bb_first_free; /* first free block */ + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ ++ int bb_avg_fragment_size_order; /* order of average ++ fragment in BG */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ ++ ext4_group_t bb_group; /* Group number */ + struct list_head bb_prealloc_list; + unsigned long bb_prealloc_nr; + #ifdef DOUBLE_CHECK + void *bb_bitmap; + #endif + struct rw_semaphore alloc_sem; ++ struct list_head bb_avg_fragment_size_node; ++ struct list_head bb_largest_free_order_node; + ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block + * regions, index is order. + * bb_counters[3] = 5 means +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -127,11 +127,53 @@ + * the smallest multiple of the stripe value (sbi->s_stripe) which is + * greater than the default mb_group_prealloc. + * ++ * If "mb_optimize_scan" mount option is set, we maintain in memory group info ++ * structures in two data structures: ++ * ++ * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders) ++ * ++ * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks) ++ * ++ * This is an array of lists where the index in the array represents the ++ * largest free order in the buddy bitmap of the participating group infos of ++ * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total ++ * number of buddy bitmap orders possible) number of lists. Group-infos are ++ * placed in appropriate lists. ++ * ++ * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size) ++ * ++ * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks) ++ * ++ * This is an array of lists where in the i-th list there are groups with ++ * average fragment size >= 2^i and < 2^(i+1). The average fragment size ++ * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. ++ * Note that we don't bother with a special list for completely empty groups ++ * so we only have MB_NUM_ORDERS(sb) lists. ++ * ++ * When "mb_optimize_scan" mount option is set, mballoc consults the above data ++ * structures to decide the order in which groups are to be traversed for ++ * fulfilling an allocation request. 
++ * ++ * At CR = 0, we look for groups which have the largest_free_order >= the order ++ * of the request. We directly look at the largest free order list in the data ++ * structure (1) above where largest_free_order = order of the request. If that ++ * list is empty, we look at remaining list in the increasing order of ++ * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time. ++ * ++ * At CR = 1, we only consider groups where average fragment size > request ++ * size. So, we lookup a group which has average fragment size just above or ++ * equal to request size using our average fragment size group lists (data ++ * structure 2) in O(1) time. ++ * ++ * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in ++ * linear order which requires O(N) search time for each CR 0 and CR 1 phase. ++ * + * The regular allocator (using the buddy cache) supports a few tunables. + * + * /sys/fs/ext4//mb_min_to_scan + * /sys/fs/ext4//mb_max_to_scan + * /sys/fs/ext4//mb_order2_req ++ * /sys/fs/ext4//mb_linear_limit + * + * The regular allocator uses buddy scan only if the request len is power of + * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The +@@ -149,6 +191,16 @@ + * can be used for allocation. ext4_mb_good_group explains how the groups are + * checked. + * ++ * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not ++ * get traversed linearly. That may result in subsequent allocations being not ++ * close to each other. And so, the underlying device may get filled up in a ++ * non-linear fashion. While that may not matter on non-rotational devices, for ++ * rotational devices that may result in higher seek times. "mb_linear_limit" ++ * tells mballoc how many groups mballoc should search linearly before ++ * performing consulting above data structures for more efficient lookups. For ++ * non rotational devices, this value defaults to 0 and for rotational devices ++ * this is set to MB_DEFAULT_LINEAR_LIMIT. ++ * + * Both the prealloc space are getting populated as above. So for the first + * request we will hit the buddy cache which will result in this prealloc + * space getting filled. The prealloc space is then later used for the +@@ -299,6 +351,8 @@ + * - bitlock on a group (group) + * - object (inode/locality) (object) + * - per-pa lock (pa) ++ * - cr0 lists lock (cr0) ++ * - cr1 tree lock (cr1) + * + * Paths: + * - new pa +@@ -328,6 +382,9 @@ + * group + * object + * ++ * - allocation path (ext4_mb_regular_allocator) ++ * group ++ * cr0/cr1 + */ + static struct kmem_cache *ext4_pspace_cachep; + static struct kmem_cache *ext4_ac_cachep; +@@ -351,6 +408,9 @@ static void ext4_mb_generate_from_freeli + ext4_group_t group); + static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); + ++static bool ext4_mb_good_group(struct ext4_allocation_context *ac, ++ ext4_group_t group, int cr); ++ + /* + * The algorithm using this percpu seq counter goes below: + * 1. We sample the percpu discard_pa_seq counter before trying for block +@@ -747,6 +807,221 @@ static void ext4_mb_mark_free_simple(str + } + } + ++static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) ++{ ++ int order; ++ ++ /* ++ * We don't bother with a special lists groups with only 1 block free ++ * extents and for completely empty groups. 
++ */ ++ order = fls(len) - 2; ++ if (order < 0) ++ return 0; ++ if (order == MB_NUM_ORDERS(sb)) ++ order--; ++ return order; ++} ++ ++/* Move group to appropriate avg_fragment_size list */ ++static void ++mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(sb); ++ int new_order; ++ ++ if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) ++ return; ++ ++ new_order = mb_avg_fragment_size_order(sb, ++ grp->bb_free / grp->bb_fragments); ++ if (new_order == grp->bb_avg_fragment_size_order) ++ return; ++ ++ if (grp->bb_avg_fragment_size_order != -1) { ++ write_lock(&sbi->s_mb_avg_fragment_size_locks[ ++ grp->bb_avg_fragment_size_order]); ++ list_del(&grp->bb_avg_fragment_size_node); ++ write_unlock(&sbi->s_mb_avg_fragment_size_locks[ ++ grp->bb_avg_fragment_size_order]); ++ } ++ grp->bb_avg_fragment_size_order = new_order; ++ write_lock(&sbi->s_mb_avg_fragment_size_locks[ ++ grp->bb_avg_fragment_size_order]); ++ list_add_tail(&grp->bb_avg_fragment_size_node, ++ &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]); ++ write_unlock(&sbi->s_mb_avg_fragment_size_locks[ ++ grp->bb_avg_fragment_size_order]); ++} ++ ++/* ++ * Choose next group by traversing largest_free_order lists. Updates *new_cr if ++ * cr level needs an update. ++ */ ++static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, ++ int *new_cr, ext4_group_t *group, ext4_group_t ngroups) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ++ struct ext4_group_info *iter, *grp; ++ int i; ++ ++ if (ac->ac_status == AC_STATUS_FOUND) ++ return; ++ ++ if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED)) ++ atomic_inc(&sbi->s_bal_cr0_bad_suggestions); ++ ++ grp = NULL; ++ for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { ++ if (list_empty(&sbi->s_mb_largest_free_orders[i])) ++ continue; ++ read_lock(&sbi->s_mb_largest_free_orders_locks[i]); ++ if (list_empty(&sbi->s_mb_largest_free_orders[i])) { ++ read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); ++ continue; ++ } ++ grp = NULL; ++ list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], ++ bb_largest_free_order_node) { ++ if (sbi->s_mb_stats) ++ atomic64_inc(&sbi->s_bal_cX_groups_considered[0]); ++ if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) { ++ grp = iter; ++ break; ++ } ++ } ++ read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); ++ if (grp) ++ break; ++ } ++ ++ if (!grp) { ++ /* Increment cr and search again */ ++ *new_cr = 1; ++ } else { ++ *group = grp->bb_group; ++ ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; ++ } ++} ++ ++/* ++ * Choose next group by traversing average fragment size list of suitable ++ * order. Updates *new_cr if cr level needs an update. 
++ */ ++static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, ++ int *new_cr, ext4_group_t *group, ext4_group_t ngroups) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ++ struct ext4_group_info *grp = NULL, *iter; ++ int i; ++ ++ if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { ++ if (sbi->s_mb_stats) ++ atomic_inc(&sbi->s_bal_cr1_bad_suggestions); ++ } ++ ++ for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); ++ i < MB_NUM_ORDERS(ac->ac_sb); i++) { ++ if (list_empty(&sbi->s_mb_avg_fragment_size[i])) ++ continue; ++ read_lock(&sbi->s_mb_avg_fragment_size_locks[i]); ++ if (list_empty(&sbi->s_mb_avg_fragment_size[i])) { ++ read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); ++ continue; ++ } ++ list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i], ++ bb_avg_fragment_size_node) { ++ if (sbi->s_mb_stats) ++ atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); ++ if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) { ++ grp = iter; ++ break; ++ } ++ } ++ read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); ++ if (grp) ++ break; ++ } ++ ++ if (grp) { ++ *group = grp->bb_group; ++ ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; ++ } else { ++ *new_cr = 2; ++ } ++} ++ ++static inline int should_optimize_scan(struct ext4_allocation_context *ac) ++{ ++ if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) ++ return 0; ++ if (ac->ac_criteria >= 2) ++ return 0; ++ if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) ++ return 0; ++ return 1; ++} ++ ++/* ++ * Return next linear group for allocation. If linear traversal should not be ++ * performed, this function just returns the same group ++ */ ++static int ++next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) ++{ ++ if (!should_optimize_scan(ac)) ++ goto inc_and_return; ++ ++ if (ac->ac_groups_linear_remaining) { ++ ac->ac_groups_linear_remaining--; ++ goto inc_and_return; ++ } ++ ++ return group; ++inc_and_return: ++ /* ++ * Artificially restricted ngroups for non-extent ++ * files makes group > ngroups possible on first loop. ++ */ ++ return group + 1 >= ngroups ? 0 : group + 1; ++} ++ ++/* ++ * ext4_mb_choose_next_group: choose next group for allocation. ++ * ++ * @ac Allocation Context ++ * @new_cr This is an output parameter. If the there is no good group ++ * available at current CR level, this field is updated to indicate ++ * the new cr level that should be used. ++ * @group This is an input / output parameter. As an input it indicates the ++ * next group that the allocator intends to use for allocation. As ++ * output, this field indicates the next group that should be used as ++ * determined by the optimization functions. ++ * @ngroups Total number of groups ++ */ ++static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, ++ int *new_cr, ext4_group_t *group, ext4_group_t ngroups) ++{ ++ *new_cr = ac->ac_criteria; ++ ++ if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) { ++ *group = next_linear_group(ac, *group, ngroups); ++ return; ++ } ++ ++ if (*new_cr == 0) { ++ ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); ++ } else if (*new_cr == 1) { ++ ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); ++ } else { ++ /* ++ * TODO: For CR=2, we can arrange groups in an rb tree sorted by ++ * bb_free. But until that happens, we should never come here. ++ */ ++ WARN_ON(1); ++ } ++} ++ + /* + * Cache the order of the largest free extent we have available in this block + * group. 
+@@ -754,22 +1029,39 @@ static void ext4_mb_mark_free_simple(str + static void + mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) + { ++ struct ext4_sb_info *sbi = EXT4_SB(sb); + int i; +- int bits; + +- grp->bb_largest_free_order = -1; /* uninit */ +- +- bits = sb->s_blocksize_bits + 1; +- for (i = bits; i >= 0; i--) { +- if (grp->bb_counters[i] > 0) { +- grp->bb_largest_free_order = i; ++ for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) ++ if (grp->bb_counters[i] > 0) + break; +- } ++ /* No need to move between order lists? */ ++ if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || ++ i == grp->bb_largest_free_order) { ++ grp->bb_largest_free_order = i; ++ return; ++ } ++ ++ if (grp->bb_largest_free_order >= 0) { ++ write_lock(&sbi->s_mb_largest_free_orders_locks[ ++ grp->bb_largest_free_order]); ++ list_del_init(&grp->bb_largest_free_order_node); ++ write_unlock(&sbi->s_mb_largest_free_orders_locks[ ++ grp->bb_largest_free_order]); ++ } ++ grp->bb_largest_free_order = i; ++ if (grp->bb_largest_free_order >= 0 && grp->bb_free) { ++ write_lock(&sbi->s_mb_largest_free_orders_locks[ ++ grp->bb_largest_free_order]); ++ list_add_tail(&grp->bb_largest_free_order_node, ++ &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]); ++ write_unlock(&sbi->s_mb_largest_free_orders_locks[ ++ grp->bb_largest_free_order]); + } + } + + static noinline_for_stack +-int ext4_mb_generate_buddy(struct super_block *sb, ++void ext4_mb_generate_buddy(struct super_block *sb, + void *buddy, void *bitmap, ext4_group_t group) + { + struct ext4_group_info *grp = ext4_get_group_info(sb, group); +@@ -817,19 +1109,15 @@ int ext4_mb_generate_buddy(struct super_ + grp->bb_free = free; + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); +- return -EIO; + } + mb_set_largest_free_order(sb, grp); ++ mb_update_avg_fragment_size(sb, grp); + + clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); + + period = get_cycles() - period; +- spin_lock(&sbi->s_bal_lock); +- sbi->s_mb_buddies_generated++; +- sbi->s_mb_generation_time += period; +- spin_unlock(&sbi->s_bal_lock); +- +- return 0; ++ atomic_inc(&sbi->s_mb_buddies_generated); ++ atomic64_add(period, &sbi->s_mb_generation_time); + } + + static void mb_regenerate_buddy(struct ext4_buddy *e4b) +@@ -987,14 +1275,14 @@ static int ext4_mb_init_cache(struct pag + grinfo->bb_fragments = 0; + memset(grinfo->bb_counters, 0, + sizeof(*grinfo->bb_counters) * +- (sb->s_blocksize_bits+2)); ++ (MB_NUM_ORDERS(sb))); + /* + * incore got set to the group block bitmap below + */ + ext4_lock_group(sb, group); + /* init the buddy */ + memset(data, 0xff, blocksize); +- err = ext4_mb_generate_buddy(sb, data, incore, group); ++ ext4_mb_generate_buddy(sb, data, incore, group); + ext4_unlock_group(sb, group); + incore = NULL; + } else { +@@ -1558,6 +1846,7 @@ static void mb_free_blocks(struct inode + + done: + mb_set_largest_free_order(sb, e4b->bd_info); ++ mb_update_avg_fragment_size(sb, e4b->bd_info); + mb_check_buddy(e4b); + } + +@@ -1695,6 +1984,7 @@ static int mb_mark_used(struct ext4_budd + } + mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); + ++ mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info); + ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); + mb_check_buddy(e4b); + +@@ -1989,7 +2279,7 @@ void ext4_mb_simple_scan_group(struct ex + int max; + + BUG_ON(ac->ac_2order <= 0); +- for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { ++ for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) { + if (grp->bb_counters[i] == 0) + 
continue; + +@@ -2135,13 +2425,11 @@ static bool ext4_mb_good_group(struct ex + + BUG_ON(cr < 0 || cr >= 4); + +- free = grp->bb_free; +- if (free == 0) +- return false; +- if (cr <= 2 && free < ac->ac_g_ex.fe_len) ++ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) + return false; + +- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) ++ free = grp->bb_free; ++ if (free == 0) + return false; + + fragments = grp->bb_fragments; +@@ -2158,8 +2446,10 @@ static bool ext4_mb_good_group(struct ex + ((group % flex_size) == 0)) + return false; + +- if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) || +- (free / fragments) >= ac->ac_g_ex.fe_len) ++ if (free < ac->ac_g_ex.fe_len) ++ return false; ++ ++ if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb)) + return true; + + if (grp->bb_largest_free_order < ac->ac_2order) +@@ -2193,10 +2483,13 @@ static int ext4_mb_good_group_nolock(str + { + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); + struct super_block *sb = ac->ac_sb; ++ struct ext4_sb_info *sbi = EXT4_SB(sb); + bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK; + ext4_grpblk_t free; + int ret = 0; + ++ if (sbi->s_mb_stats) ++ atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); + if (should_lock) + ext4_lock_group(sb, group); + free = grp->bb_free; +@@ -2246,97 +2539,90 @@ static u64 available_blocks_count(struct + } + + /* +- * each allocation context (i.e. a thread doing allocation) has own +- * sliding prefetch window of @s_mb_prefetch size which starts at the +- * very first goal and moves ahead of scaning. +- * a side effect is that subsequent allocations will likely find +- * the bitmaps in cache or at least in-flight. ++ * Start prefetching @nr block bitmaps starting at @group. ++ * Return the next group which needs to be prefetched. 
+ */ +-static void +-ext4_mb_prefetch(struct ext4_allocation_context *ac, +- ext4_group_t start) ++ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, ++ unsigned int nr, int *cnt) + { +- struct super_block *sb = ac->ac_sb; + ext4_group_t ngroups = ext4_get_groups_count(sb); +- struct ext4_sb_info *sbi = EXT4_SB(sb); +- struct ext4_group_info *grp; +- ext4_group_t group = start; + struct buffer_head *bh; +- int nr; +- +- /* limit prefetching at cr=0, otherwise mballoc can +- * spend a lot of time loading imperfect groups */ +- if (ac->ac_criteria < 2 && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit) +- return; +- +- /* batch prefetching to get few READs in flight */ +- nr = ac->ac_prefetch - group; +- if (ac->ac_prefetch < group) +- /* wrapped to the first groups */ +- nr += ngroups; +- if (nr > 0) +- return; +- BUG_ON(nr < 0); ++ struct blk_plug plug; + +- nr = sbi->s_mb_prefetch; +- if (ext4_has_feature_flex_bg(sb)) { +- /* align to flex_bg to get more bitmas with a single IO */ +- nr = (group / sbi->s_mb_prefetch) * sbi->s_mb_prefetch; +- nr = nr + sbi->s_mb_prefetch - group; +- } ++ blk_start_plug(&plug); + while (nr-- > 0) { +- grp = ext4_get_group_info(sb, group); +- /* prevent expensive getblk() on groups w/ IO in progress */ +- if (EXT4_MB_GRP_TEST(grp) || EXT4_MB_GRP_TEST_AND_SET_READ(grp)) +- goto next; +- +- /* ignore empty groups - those will be skipped +- * during the scanning as well */ +- if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) { +- bh = ext4_read_block_bitmap_nowait(sb, group, 1); ++ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, ++ NULL); ++ struct ext4_group_info *grp = ext4_get_group_info(sb, group); ++ ++ /* ++ * Prefetch block groups with free blocks; but don't ++ * bother if it is marked uninitialized on disk, since ++ * it won't require I/O to read. Also only try to ++ * prefetch once, so we avoid getblk() call, which can ++ * be expensive. ++ */ ++ if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) && ++ EXT4_MB_GRP_NEED_INIT(grp) && ++ ext4_free_group_clusters(sb, gdp) > 0 && ++ !(ext4_has_group_desc_csum(sb) && ++ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { ++ bh = ext4_read_block_bitmap_nowait(sb, group, true); + if (bh && !IS_ERR(bh)) { +- if (!buffer_uptodate(bh)) +- ac->ac_prefetch_ios++; ++ if (!buffer_uptodate(bh) && cnt) ++ (*cnt)++; + brelse(bh); + } + } +-next: + if (++group >= ngroups) + group = 0; + } +- ac->ac_prefetch = group; ++ blk_finish_plug(&plug); ++ return group; + } + +-static void +-ext4_mb_prefetch_fini(struct ext4_allocation_context *ac) ++/* ++ * Prefetching reads the block bitmap into the buffer cache; but we ++ * need to make sure that the buddy bitmap in the page cache has been ++ * initialized. Note that ext4_mb_init_group() will block if the I/O ++ * is not yet completed, or indeed if it was not initiated by ++ * ext4_mb_prefetch did not start the I/O. ++ * ++ * TODO: We should actually kick off the buddy bitmap setup in a work ++ * queue when the buffer I/O is completed, so that we don't block ++ * waiting for the block allocation bitmap read to finish when ++ * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator(). 
++ */ ++void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, ++ unsigned int nr) + { +- struct ext4_group_info *grp; +- ext4_group_t group; +- int nr, rc; +- +- /* initialize last window of prefetched groups */ +- nr = ac->ac_prefetch_ios; +- if (nr > EXT4_SB(ac->ac_sb)->s_mb_prefetch) +- nr = EXT4_SB(ac->ac_sb)->s_mb_prefetch; +- group = ac->ac_prefetch; + while (nr-- > 0) { +- grp = ext4_get_group_info(ac->ac_sb, group); +- if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) { +- rc = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS); +- if (rc) ++ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, ++ NULL); ++ struct ext4_group_info *grp = ext4_get_group_info(sb, group); ++ ++ if (!group) ++ group = ext4_get_groups_count(sb); ++ group--; ++ grp = ext4_get_group_info(sb, group); ++ ++ if (EXT4_MB_GRP_NEED_INIT(grp) && ++ ext4_free_group_clusters(sb, gdp) > 0 && ++ !(ext4_has_group_desc_csum(sb) && ++ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { ++ if (ext4_mb_init_group(sb, group, GFP_NOFS)) + break; + } +- if (group-- == 0) +- group = ext4_get_groups_count(ac->ac_sb) - 1; + } + } + + static noinline_for_stack int + ext4_mb_regular_allocator(struct ext4_allocation_context *ac) + { +- ext4_group_t ngroups, group, i; +- int cr = -1; ++ ext4_group_t prefetch_grp = 0, ngroups, group, i; ++ int cr = -1, new_cr; + int err = 0, first_err = 0; ++ unsigned int nr = 0, prefetch_ios = 0; + struct ext4_sb_info *sbi; + struct super_block *sb; + struct ext4_buddy e4b; +@@ -2373,13 +2659,13 @@ ext4_mb_regular_allocator(struct ext4_al + * We also support searching for power-of-two requests only for + * requests upto maximum buddy size we have constructed. + */ +- if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) { ++ if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { + /* + * This should tell if fe_len is exactly power of 2 + */ + if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) + ac->ac_2order = array_index_nospec(i - 1, +- sb->s_blocksize_bits + 2); ++ MB_NUM_ORDERS(sb)); + } + + /* if stream allocation is enabled, use global goal */ +@@ -2420,19 +2706,41 @@ repeat: + * from the goal value specified + */ + group = ac->ac_g_ex.fe_group; +- ac->ac_prefetch = group; ++ ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; ++ prefetch_grp = group; + +- for (i = 0; i < ngroups; group++, i++) { ++ for (i = 0, new_cr = cr; i < ngroups; i++, ++ ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { + int ret = 0; ++ + cond_resched(); ++ if (new_cr != cr) { ++ cr = new_cr; ++ goto repeat; ++ } ++ + /* +- * Artificially restricted ngroups for non-extent +- * files makes group > ngroups possible on first loop. 
++ * Batch reads of the block allocation bitmaps ++ * to get multiple READs in flight; limit ++ * prefetching at cr=0/1, otherwise mballoc can ++ * spend a lot of time loading imperfect groups + */ +- if (group >= ngroups) +- group = 0; +- +- ext4_mb_prefetch(ac, group); ++ if ((prefetch_grp == group) && ++ (cr > 1 || ++ prefetch_ios < sbi->s_mb_prefetch_limit)) { ++ unsigned int curr_ios = prefetch_ios; ++ ++ nr = sbi->s_mb_prefetch; ++ if (ext4_has_feature_flex_bg(sb)) { ++ nr = (group / sbi->s_mb_prefetch) * ++ sbi->s_mb_prefetch; ++ nr = nr + sbi->s_mb_prefetch - group; ++ } ++ prefetch_grp = ext4_mb_prefetch(sb, group, ++ nr, &prefetch_ios); ++ if (prefetch_ios == curr_ios) ++ nr = 0; ++ } + + /* This now checks without needing the buddy page */ + ret = ext4_mb_good_group_nolock(ac, group, cr); +@@ -2503,7 +2811,13 @@ repeat: + atomic_inc(&sbi->s_mb_lost_chunks); + goto repeat; + } ++ /* Processed all groups and haven't found blocks */ ++ if (sbi->s_mb_stats && i == ngroups) ++ atomic64_inc(&sbi->s_bal_cX_failed[cr]); + } ++ ++ if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) ++ atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]); + out: + if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) + err = first_err; +@@ -2512,8 +2826,9 @@ out: + ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, + ac->ac_flags, cr, err); + +- /* use prefetched bitmaps to init buddy so that read info is not lost */ +- ext4_mb_prefetch_fini(ac); ++ if (nr) ++ ext4_mb_prefetch_fini(sb, prefetch_grp, nr); ++ + return err; + } + +@@ -2704,6 +3019,77 @@ const struct file_operations ext4_seq_pr + .write = ext4_mb_prealloc_table_proc_write, + }; + ++int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) ++{ ++ struct super_block *sb = (struct super_block *)seq->private; ++ struct ext4_sb_info *sbi = EXT4_SB(sb); ++ ++ seq_puts(seq, "mballoc:\n"); ++ if (!sbi->s_mb_stats) { ++ seq_puts(seq, "\tmb stats collection turned off.\n"); ++ seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); ++ return 0; ++ } ++ seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); ++ seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); ++ ++ seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); ++ ++ seq_puts(seq, "\tcr0_stats:\n"); ++ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0])); ++ seq_printf(seq, "\t\tgroups_considered: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_groups_considered[0])); ++ seq_printf(seq, "\t\tuseless_loops: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_failed[0])); ++ seq_printf(seq, "\t\tbad_suggestions: %u\n", ++ atomic_read(&sbi->s_bal_cr0_bad_suggestions)); ++ seq_printf(seq, "\t\tskipped_loops: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_skipped[0])); ++ ++ seq_puts(seq, "\tcr1_stats:\n"); ++ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1])); ++ seq_printf(seq, "\t\tgroups_considered: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_groups_considered[1])); ++ seq_printf(seq, "\t\tuseless_loops: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_failed[1])); ++ seq_printf(seq, "\t\tbad_suggestions: %u\n", ++ atomic_read(&sbi->s_bal_cr1_bad_suggestions)); ++ seq_printf(seq, "\t\tskipped_loops: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_skipped[1])); ++ ++ seq_puts(seq, "\tcr2_stats:\n"); ++ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2])); ++ seq_printf(seq, "\t\tgroups_considered: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_groups_considered[2])); ++ 
seq_printf(seq, "\t\tuseless_loops: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_failed[2])); ++ seq_printf(seq, "\t\tskipped_loops: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_skipped[2])); ++ ++ seq_puts(seq, "\tcr3_stats:\n"); ++ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3])); ++ seq_printf(seq, "\t\tgroups_considered: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_groups_considered[3])); ++ seq_printf(seq, "\t\tuseless_loops: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_failed[3])); ++ seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); ++ seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); ++ seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); ++ seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); ++ seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); ++ ++ seq_printf(seq, "\tbuddies_generated: %u/%u\n", ++ atomic_read(&sbi->s_mb_buddies_generated), ++ ext4_get_groups_count(sb)); ++ seq_printf(seq, "\tbuddies_time_used: %llu\n", ++ atomic64_read(&sbi->s_mb_generation_time)); ++ seq_printf(seq, "\tpreallocated: %u\n", ++ atomic_read(&sbi->s_mb_preallocated)); ++ seq_printf(seq, "\tdiscarded: %u\n", ++ atomic_read(&sbi->s_mb_discarded)); ++ return 0; ++} ++ + static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) + { + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; +@@ -2764,93 +3150,6 @@ const struct file_operations ext4_seq_mb + .write = ext4_mb_last_group_write, + }; + +-static int mb_seq_alloc_show(struct seq_file *seq, void *v) +-{ +- struct super_block *sb = seq->private; +- struct ext4_sb_info *sbi = EXT4_SB(sb); +- +- seq_printf(seq, "mballoc:\n"); +- seq_printf(seq, "\tblocks: %u\n", atomic_read(&sbi->s_bal_allocated)); +- seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); +- seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); +- +- seq_printf(seq, "\textents_scanned: %u\n", +- atomic_read(&sbi->s_bal_ex_scanned)); +- seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); +- seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); +- seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); +- seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); +- +- seq_printf(seq, "\tuseless_c1_loops: %llu\n", +- (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[0])); +- seq_printf(seq, "\tuseless_c2_loops: %llu\n", +- (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[1])); +- seq_printf(seq, "\tuseless_c3_loops: %llu\n", +- (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[2])); +- seq_printf(seq, "\tskipped_c1_loops: %llu\n", +- (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[0])); +- seq_printf(seq, "\tskipped_c2_loops: %llu\n", +- (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[1])); +- seq_printf(seq, "\tskipped_c3_loops: %llu\n", +- (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[2])); +- seq_printf(seq, "\tbuddies_generated: %lu\n", +- sbi->s_mb_buddies_generated); +- seq_printf(seq, "\tbuddies_time_used: %llu\n", sbi->s_mb_generation_time); +- seq_printf(seq, "\tpreallocated: %u\n", +- atomic_read(&sbi->s_mb_preallocated)); +- seq_printf(seq, "\tdiscarded: %u\n", +- atomic_read(&sbi->s_mb_discarded)); +- return 0; +-} +- +-static ssize_t mb_seq_alloc_write(struct file *file, +- const char __user *buf, +- size_t cnt, loff_t *pos) +-{ +- struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file))); +- +- 
atomic_set(&sbi->s_bal_allocated, 0), +- atomic_set(&sbi->s_bal_reqs, 0), +- atomic_set(&sbi->s_bal_success, 0); +- +- atomic_set(&sbi->s_bal_ex_scanned, 0), +- atomic_set(&sbi->s_bal_goals, 0), +- atomic_set(&sbi->s_bal_2orders, 0), +- atomic_set(&sbi->s_bal_breaks, 0), +- atomic_set(&sbi->s_mb_lost_chunks, 0); +- +- atomic64_set(&sbi->s_bal_cX_failed[0], 0), +- atomic64_set(&sbi->s_bal_cX_failed[1], 0), +- atomic64_set(&sbi->s_bal_cX_failed[2], 0); +- +- atomic64_set(&sbi->s_bal_cX_skipped[0], 0), +- atomic64_set(&sbi->s_bal_cX_skipped[1], 0), +- atomic64_set(&sbi->s_bal_cX_skipped[2], 0); +- +- +- sbi->s_mb_buddies_generated = 0; +- sbi->s_mb_generation_time = 0; +- +- atomic_set(&sbi->s_mb_preallocated, 0), +- atomic_set(&sbi->s_mb_discarded, 0); +- +- return cnt; +-} +- +-static int mb_seq_alloc_open(struct inode *inode, struct file *file) +-{ +- return single_open(file, mb_seq_alloc_show, PDE_DATA(inode)); +-} +- +-const struct file_operations ext4_mb_seq_alloc_fops = { +- .owner = THIS_MODULE, +- .open = mb_seq_alloc_open, +- .read = seq_read, +- .llseek = seq_lseek, +- .release = single_release, +- .write = mb_seq_alloc_write, +-}; +- + int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v) + { + struct ext4_sb_info *sbi = EXT4_SB(m->private); +@@ -2952,7 +3251,11 @@ int ext4_mb_add_groupinfo(struct super_b + INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + init_rwsem(&meta_group_info[i]->alloc_sem); + meta_group_info[i]->bb_free_root = RB_ROOT; ++ INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); ++ INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node); + meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ ++ meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ ++ meta_group_info[i]->bb_group = group; + + mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); + return 0; +@@ -3008,6 +3311,26 @@ static int ext4_mb_init_backend(struct s + goto err_freebuddy; + } + ++ if (ext4_has_feature_flex_bg(sb)) { ++ /* a single flex group is supposed to be read by a single IO */ ++ sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex; ++ sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ ++ } else { ++ sbi->s_mb_prefetch = 32; ++ } ++ if (sbi->s_mb_prefetch > ext4_get_groups_count(sb)) ++ sbi->s_mb_prefetch = ext4_get_groups_count(sb); ++ /* now many real IOs to prefetch within a single allocation at cr=0 ++ * given cr=0 is an CPU-related optimization we shouldn't try to ++ * load too many groups, at some point we should start to use what ++ * we've got in memory. 
++ * with an average random access time 5ms, it'd take a second to get ++ * 200 groups (* N with flex_bg), so let's make this limit 4 ++ */ ++ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4; ++ if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb)) ++ sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb); ++ + return 0; + + err_freebuddy: +@@ -3087,7 +3410,7 @@ int ext4_mb_init(struct super_block *sb) + unsigned max; + int ret; + +- i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); ++ i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets); + + sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); + if (sbi->s_mb_offsets == NULL) { +@@ -3095,7 +3418,7 @@ int ext4_mb_init(struct super_block *sb) + goto out; + } + +- i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); ++ i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs); + sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); + if (sbi->s_mb_maxs == NULL) { + ret = -ENOMEM; +@@ -3121,16 +3444,53 @@ int ext4_mb_init(struct super_block *sb) + offset_incr = offset_incr >> 1; + max = max >> 1; + i++; +- } while (i <= sb->s_blocksize_bits + 1); ++ } while (i < MB_NUM_ORDERS(sb)); ++ ++ sbi->s_mb_avg_fragment_size = ++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), ++ GFP_KERNEL); ++ if (!sbi->s_mb_avg_fragment_size) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ sbi->s_mb_avg_fragment_size_locks = ++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), ++ GFP_KERNEL); ++ if (!sbi->s_mb_avg_fragment_size_locks) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ for (i = 0; i < MB_NUM_ORDERS(sb); i++) { ++ INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]); ++ rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]); ++ } ++ sbi->s_mb_largest_free_orders = ++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), ++ GFP_KERNEL); ++ if (!sbi->s_mb_largest_free_orders) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ sbi->s_mb_largest_free_orders_locks = ++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), ++ GFP_KERNEL); ++ if (!sbi->s_mb_largest_free_orders_locks) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ for (i = 0; i < MB_NUM_ORDERS(sb); i++) { ++ INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); ++ rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); ++ } + + spin_lock_init(&sbi->s_md_lock); +- spin_lock_init(&sbi->s_bal_lock); + sbi->s_mb_free_pending = 0; + INIT_LIST_HEAD(&sbi->s_freed_data_list); + + sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; + sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; + sbi->s_mb_stats = MB_DEFAULT_STATS; ++ sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; + sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; + if (!sbi->s_mb_c1_blocks) + sbi->s_mb_c1_blocks = +@@ -3220,6 +3580,10 @@ int ext4_mb_init(struct super_block *sb) + spin_lock_init(&lg->lg_prealloc_lock); + } + ++ if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev))) ++ sbi->s_mb_max_linear_groups = 0; ++ else ++ sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; + /* init file for buddy data */ + ret = ext4_mb_init_backend(sb); + if (ret != 0) +@@ -3231,6 +3595,10 @@ out_free_locality_groups: + free_percpu(sbi->s_locality_groups); + sbi->s_locality_groups = NULL; + out: ++ kfree(sbi->s_mb_avg_fragment_size); ++ kfree(sbi->s_mb_avg_fragment_size_locks); ++ kfree(sbi->s_mb_largest_free_orders); ++ kfree(sbi->s_mb_largest_free_orders_locks); + kfree(sbi->s_mb_prealloc_table); + kfree(sbi->s_mb_offsets); + sbi->s_mb_offsets = NULL; +@@ -3288,6 +3656,10 @@ int ext4_mb_release(struct super_block * + kvfree(group_info); + rcu_read_unlock(); + } ++ kfree(sbi->s_mb_avg_fragment_size); 
++ kfree(sbi->s_mb_avg_fragment_size_locks); ++ kfree(sbi->s_mb_largest_free_orders); ++ kfree(sbi->s_mb_largest_free_orders_locks); + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); + iput(sbi->s_buddy_cache); +@@ -3308,17 +3680,18 @@ int ext4_mb_release(struct super_block * + (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[1]), + (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[2])); + ext4_msg(sb, KERN_INFO, +- "mballoc: %u extents scanned, %u goal hits, " ++ "mballoc: %u extents scanned, %u groups scanned, %u goal hits, " + "%u 2^N hits, %u breaks, %u lost", + atomic_read(&sbi->s_bal_ex_scanned), ++ atomic_read(&sbi->s_bal_groups_scanned), + atomic_read(&sbi->s_bal_goals), + atomic_read(&sbi->s_bal_2orders), + atomic_read(&sbi->s_bal_breaks), + atomic_read(&sbi->s_mb_lost_chunks)); + ext4_msg(sb, KERN_INFO, +- "mballoc: %lu generated and it took %Lu", +- sbi->s_mb_buddies_generated, +- sbi->s_mb_generation_time); ++ "mballoc: %u generated and it took %llu", ++ atomic_read(&sbi->s_mb_buddies_generated), ++ atomic64_read(&sbi->s_mb_generation_time)); + ext4_msg(sb, KERN_INFO, + "mballoc: %u preallocated, %u discarded", + atomic_read(&sbi->s_mb_preallocated), +@@ -3838,12 +4211,13 @@ static void ext4_mb_collect_stats(struct + { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + +- if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { ++ if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) { + atomic_inc(&sbi->s_bal_reqs); + atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); + if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) + atomic_inc(&sbi->s_bal_success); + atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); ++ atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned); + if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && + ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) + atomic_inc(&sbi->s_bal_goals); +@@ -4192,16 +4566,22 @@ int ext4_mb_generate_from_pa(struct supe + return 0; + } + +-static void ext4_mb_pa_callback(struct rcu_head *head) ++static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa) + { +- struct ext4_prealloc_space *pa; +- pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); +- ++ BUG_ON(!pa); + BUG_ON(atomic_read(&pa->pa_count)); + BUG_ON(pa->pa_deleted == 0); + kmem_cache_free(ext4_pspace_cachep, pa); + } + ++static void ext4_mb_pa_callback(struct rcu_head *head) ++{ ++ struct ext4_prealloc_space *pa; ++ ++ pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); ++ ext4_mb_pa_free(pa); ++} ++ + /* + * drops a reference to preallocated space descriptor + * if this was the last reference and the space is consumed +@@ -4721,14 +5101,20 @@ static int ext4_mb_pa_alloc(struct ext4_ + return 0; + } + +-static void ext4_mb_pa_free(struct ext4_allocation_context *ac) ++static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac) + { + struct ext4_prealloc_space *pa = ac->ac_pa; + + BUG_ON(!pa); + ac->ac_pa = NULL; + WARN_ON(!atomic_dec_and_test(&pa->pa_count)); +- kmem_cache_free(ext4_pspace_cachep, pa); ++ /* ++ * current function is only called due to an error or due to ++ * len of found blocks < len of requested blocks hence the PA has not ++ * been added to grp->bb_prealloc_list. 
So we don't need to lock it ++ */ ++ pa->pa_deleted = 1; ++ ext4_mb_pa_free(pa); + } + + #ifdef CONFIG_EXT4_DEBUG +@@ -4817,6 +5203,7 @@ static void ext4_mb_group_or_file(struct + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int bsbits = ac->ac_sb->s_blocksize_bits; + loff_t size, isize; ++ bool inode_pa_eligible, group_pa_eligible; + + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return; +@@ -4824,26 +5211,27 @@ static void ext4_mb_group_or_file(struct + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + ++ group_pa_eligible = sbi->s_mb_group_prealloc > 0; ++ inode_pa_eligible = true; + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) + >> bsbits; + ++ /* No point in using inode preallocation for closed files */ + if ((size == isize) && !ext4_fs_is_busy(sbi) && +- !inode_is_open_for_write(ac->ac_inode)) { +- ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; +- return; +- } +- +- if (sbi->s_mb_group_prealloc <= 0) { +- ac->ac_flags |= EXT4_MB_STREAM_ALLOC; +- return; +- } ++ !inode_is_open_for_write(ac->ac_inode)) ++ inode_pa_eligible = false; + +- /* don't use group allocation for large files */ + size = max(size, isize); +- if ((ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) || +- (size >= sbi->s_mb_large_req)) { +- ac->ac_flags |= EXT4_MB_STREAM_ALLOC; ++ /* Don't use group allocation for large files */ ++ if (size > sbi->s_mb_stream_request) ++ group_pa_eligible = false; ++ ++ if (!group_pa_eligible) { ++ if (inode_pa_eligible) ++ ac->ac_flags |= EXT4_MB_STREAM_ALLOC; ++ else ++ ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; + return; + } + +@@ -5160,6 +5548,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t + ext4_fsblk_t block = 0; + unsigned int inquota = 0; + unsigned int reserv_clstrs = 0; ++ int retries = 0; + u64 seq; + + might_sleep(); +@@ -5242,13 +5631,13 @@ repeat: + * So we have to free this pa here itself. + */ + if (*errp) { +- ext4_mb_pa_free(ac); ++ ext4_mb_pa_put_free(ac); + ext4_discard_allocated_blocks(ac); + goto errout; + } + if (ac->ac_status == AC_STATUS_FOUND && + ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len) +- ext4_mb_pa_free(ac); ++ ext4_mb_pa_put_free(ac); + } + if (likely(ac->ac_status == AC_STATUS_FOUND)) { + *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); +@@ -5260,13 +5649,14 @@ repeat: + ar->len = ac->ac_b_ex.fe_len; + } + } else { +- if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) ++ if (++retries < 3 && ++ ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) + goto repeat; + /* + * If block allocation fails then the pa allocated above + * needs to be freed here itself. 
+ */ +- ext4_mb_pa_free(ac); ++ ext4_mb_pa_put_free(ac); + *errp = -ENOSPC; + } + +@@ -5937,6 +6327,7 @@ ext4_trim_all_free(struct super_block *s + ret = count; + EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); + } ++ + out: + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); +Index: linux-stage/fs/ext4/sysfs.c +=================================================================== +--- linux-stage.orig/fs/ext4/sysfs.c ++++ linux-stage/fs/ext4/sysfs.c +@@ -222,6 +222,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_ + EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_small_req); + EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req); + EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); ++EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); + EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); + EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); + EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); +@@ -260,6 +261,7 @@ static struct attribute *ext4_attrs[] = + ATTR_LIST(mb_small_req), + ATTR_LIST(mb_large_req), + ATTR_LIST(mb_group_prealloc), ++ ATTR_LIST(mb_max_linear_groups), + ATTR_LIST(max_writeback_mb_bump), + ATTR_LIST(extent_max_zeroout_kb), + ATTR_LIST(trigger_fs_error), +@@ -473,14 +475,14 @@ int ext4_register_sysfs(struct super_blo + sb); + proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_ops, sb); ++ proc_create_single_data("mb_stats", 0444, sbi->s_proc, ++ ext4_seq_mb_stats_show, sb); + proc_create_data("prealloc_table", S_IRUGO, sbi->s_proc, + &ext4_seq_prealloc_table_fops, sb); + proc_create_data("mb_last_group", S_IRUGO, sbi->s_proc, + &ext4_seq_mb_last_group_fops, sb); + proc_create_single_data("mb_last_start", S_IRUGO, sbi->s_proc, + ext4_mb_seq_last_start_seq_show, sb); +- proc_create_data("mb_alloc_stats", S_IFREG | S_IRUGO | S_IWUSR, +- sbi->s_proc, &ext4_mb_seq_alloc_fops, sb); + } + return 0; + } +Index: linux-stage/fs/ext4/mballoc.h +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.h ++++ linux-stage/fs/ext4/mballoc.h +@@ -78,6 +78,23 @@ + #define MB_DEFAULT_GROUP_PREALLOC 512 + + ++/* ++ * Number of groups to search linearly before performing group scanning ++ * optimization. ++ */ ++#define MB_DEFAULT_LINEAR_LIMIT 4 ++ ++/* ++ * Minimum number of groups that should be present in the file system to perform ++ * group scanning optimizations. 
++ */ ++#define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16 ++ ++/* ++ * Number of valid buddy orders ++ */ ++#define MB_NUM_ORDERS(sb) ((sb)->s_blocksize_bits + 2) ++ + struct ext4_free_data { + /* this links the free block information from sb_info */ + struct list_head efd_list; +@@ -162,11 +179,13 @@ struct ext4_allocation_context { + /* copy of the best found extent taken before preallocation efforts */ + struct ext4_free_extent ac_f_ex; + ++ __u32 ac_groups_considered; ++ __u32 ac_flags; /* allocation hints */ + __u16 ac_groups_scanned; ++ __u16 ac_groups_linear_remaining; + __u16 ac_found; + __u16 ac_tail; + __u16 ac_buddy; +- __u16 ac_flags; /* allocation hints */ + __u8 ac_status; + __u8 ac_criteria; + __u8 ac_2order; /* if request is to allocate 2^N blocks and +Index: linux-stage/fs/ext4/balloc.c +=================================================================== +--- linux-stage.orig/fs/ext4/balloc.c ++++ linux-stage/fs/ext4/balloc.c +@@ -441,6 +441,12 @@ ext4_read_block_bitmap_nowait(struct sup + return ERR_PTR(-ENOMEM); + } + ++ if (ignore_locked && buffer_locked(bh)) { ++ /* buffer under IO already, return if called for prefetching */ ++ put_bh(bh); ++ return NULL; ++ } ++ + if (bitmap_uptodate(bh)) + goto verify; + +@@ -498,7 +504,8 @@ ext4_read_block_bitmap_nowait(struct sup + trace_ext4_read_block_bitmap_load(sb, block_group); + bh->b_end_io = ext4_end_bitmap_read; + get_bh(bh); +- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); ++ submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO | ++ (ignore_locked ? REQ_RAHEAD : 0), bh); + return bh; + verify: + err = ext4_validate_block_bitmap(sb, desc, block_group, bh); +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c ++++ linux-stage/fs/ext4/super.c +@@ -1526,6 +1526,7 @@ enum { + Opt_mb_c1_threshold, Opt_mb_c2_threshold, Opt_mb_c3_threshold, + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, + Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, ++ Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, + }; + + static const match_table_t tokens = { +@@ -1619,6 +1620,9 @@ static const match_table_t tokens = { + {Opt_test_dummy_encryption, "test_dummy_encryption"}, + {Opt_nombcache, "nombcache"}, + {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ ++ {Opt_removed, "prefetch_block_bitmaps"}, ++ {Opt_no_prefetch_block_bitmaps, "no_prefetch_block_bitmaps"}, ++ {Opt_mb_optimize_scan, "mb_optimize_scan=%d"}, + {Opt_removed, "check=none"}, /* mount option from ext2/3 */ + {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ + {Opt_removed, "reservation"}, /* mount option from ext2/3 */ +@@ -1651,6 +1655,8 @@ static ext4_fsblk_t get_sb_block(void ** + } + + #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) ++#define DEFAULT_MB_OPTIMIZE_SCAN (-1) ++ + static const char deprecated_msg[] = + "Mount option \"%s\" will be removed by %s\n" + "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; +@@ -1841,12 +1847,16 @@ static const struct mount_opts { + {Opt_mb_c3_threshold, 0, MOPT_STRING}, + {Opt_test_dummy_encryption, 0, MOPT_GTE0}, + {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, ++ {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS, ++ MOPT_SET}, ++ {Opt_mb_optimize_scan, EXT4_MOUNT2_MB_OPTIMIZE_SCAN, MOPT_GTE0}, + {Opt_err, 0, 0} + }; + + static int handle_mount_opt(struct super_block *sb, char *opt, int token, + substring_t *args, unsigned long *journal_devnum, +- 
unsigned int *journal_ioprio, int is_remount) ++ unsigned int *journal_ioprio, ++ int *mb_optimize_scan, int is_remount) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); + const struct mount_opts *m; +@@ -2141,6 +2151,13 @@ static int handle_mount_opt(struct super + sbi->s_mount_opt |= m->mount_opt; + } else if (token == Opt_data_err_ignore) { + sbi->s_mount_opt &= ~m->mount_opt; ++ } else if (token == Opt_mb_optimize_scan) { ++ if (arg != 0 && arg != 1) { ++ ext4_msg(sb, KERN_WARNING, ++ "mb_optimize_scan should be set to 0 or 1."); ++ return -1; ++ } ++ *mb_optimize_scan = arg; + } else { + if (!args->from) + arg = 1; +@@ -2163,6 +2180,7 @@ static int handle_mount_opt(struct super + static int parse_options(char *options, struct super_block *sb, + unsigned long *journal_devnum, + unsigned int *journal_ioprio, ++ int *mb_optimize_scan, + int is_remount) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); +@@ -2183,7 +2201,8 @@ static int parse_options(char *options, + args[0].to = args[0].from = NULL; + token = match_token(p, tokens, args); + if (handle_mount_opt(sb, p, token, args, journal_devnum, +- journal_ioprio, is_remount) < 0) ++ journal_ioprio, mb_optimize_scan, ++ is_remount) < 0) + return 0; + } + #ifdef CONFIG_QUOTA +@@ -2371,6 +2390,14 @@ static int _ext4_show_options(struct seq + SEQ_OPTS_PUTS("dax=inode"); + } + ++ if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD && ++ !test_opt2(sb, MB_OPTIMIZE_SCAN)) { ++ SEQ_OPTS_PUTS("mb_optimize_scan=0"); ++ } else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD && ++ test_opt2(sb, MB_OPTIMIZE_SCAN)) { ++ SEQ_OPTS_PUTS("mb_optimize_scan=1"); ++ } ++ + ext4_show_quota_options(seq, sb); + return 0; + } +@@ -3152,15 +3179,34 @@ static void print_daily_error_info(struc + static int ext4_run_li_request(struct ext4_li_request *elr) + { + struct ext4_group_desc *gdp = NULL; +- ext4_group_t group, ngroups; +- struct super_block *sb; ++ struct super_block *sb = elr->lr_super; ++ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; ++ ext4_group_t group = elr->lr_next_group; ++ unsigned int prefetch_ios = 0; + int ret = 0; + u64 start_time; + +- sb = elr->lr_super; +- ngroups = EXT4_SB(sb)->s_groups_count; ++ if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) { ++ elr->lr_next_group = ext4_mb_prefetch(sb, group, ++ EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios); ++ if (prefetch_ios) ++ ext4_mb_prefetch_fini(sb, elr->lr_next_group, ++ prefetch_ios); ++ trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, ++ prefetch_ios); ++ if (group >= elr->lr_next_group) { ++ ret = 1; ++ if (elr->lr_first_not_zeroed != ngroups && ++ !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) { ++ elr->lr_next_group = elr->lr_first_not_zeroed; ++ elr->lr_mode = EXT4_LI_MODE_ITABLE; ++ ret = 0; ++ } ++ } ++ return ret; ++ } + +- for (group = elr->lr_next_group; group < ngroups; group++) { ++ for (; group < ngroups; group++) { + gdp = ext4_get_group_desc(sb, group, NULL); + if (!gdp) { + ret = 1; +@@ -3178,9 +3224,10 @@ static int ext4_run_li_request(struct ex + start_time = ktime_get_real_ns(); + ret = ext4_init_inode_table(sb, group, + elr->lr_timeout ? 
0 : 1); ++ trace_ext4_lazy_itable_init(sb, group); + if (elr->lr_timeout == 0) { + elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) * +- elr->lr_sbi->s_li_wait_mult); ++ EXT4_SB(elr->lr_super)->s_li_wait_mult); + } + elr->lr_next_sched = jiffies + elr->lr_timeout; + elr->lr_next_group = group + 1; +@@ -3194,15 +3241,11 @@ static int ext4_run_li_request(struct ex + */ + static void ext4_remove_li_request(struct ext4_li_request *elr) + { +- struct ext4_sb_info *sbi; +- + if (!elr) + return; + +- sbi = elr->lr_sbi; +- + list_del(&elr->lr_request); +- sbi->s_li_request = NULL; ++ EXT4_SB(elr->lr_super)->s_li_request = NULL; + kfree(elr); + } + +@@ -3411,7 +3454,6 @@ static int ext4_li_info_new(void) + static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, + ext4_group_t start) + { +- struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_li_request *elr; + + elr = kzalloc(sizeof(*elr), GFP_KERNEL); +@@ -3419,8 +3461,13 @@ static struct ext4_li_request *ext4_li_r + return NULL; + + elr->lr_super = sb; +- elr->lr_sbi = sbi; +- elr->lr_next_group = start; ++ elr->lr_first_not_zeroed = start; ++ if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) { ++ elr->lr_mode = EXT4_LI_MODE_ITABLE; ++ elr->lr_next_group = start; ++ } else { ++ elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; ++ } + + /* + * Randomize first schedule time of the request to +@@ -3450,8 +3497,9 @@ int ext4_register_li_request(struct supe + goto out; + } + +- if (first_not_zeroed == ngroups || sb_rdonly(sb) || +- !test_opt(sb, INIT_INODE_TABLE)) ++ if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && ++ (first_not_zeroed == ngroups || sb_rdonly(sb) || ++ !test_opt(sb, INIT_INODE_TABLE))) + goto out; + + elr = ext4_li_request_new(sb, first_not_zeroed); +@@ -3737,6 +3785,7 @@ static int ext4_fill_super(struct super_ + __u64 blocks_count; + int err = 0; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; ++ int mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN; + ext4_group_t first_not_zeroed; + + if ((data && !orig_data) || !sbi) +@@ -3970,7 +4019,7 @@ static int ext4_fill_super(struct super_ + if (!s_mount_opts) + goto failed_mount; + if (!parse_options(s_mount_opts, sb, &journal_devnum, +- &journal_ioprio, 0)) { ++ &journal_ioprio, &mb_optimize_scan, 0)) { + ext4_msg(sb, KERN_WARNING, + "failed to parse options in superblock: %s", + s_mount_opts); +@@ -3979,7 +4028,7 @@ static int ext4_fill_super(struct super_ + } + sbi->s_def_mount_opt = sbi->s_mount_opt; + if (!parse_options((char *) data, sb, &journal_devnum, +- &journal_ioprio, 0)) ++ &journal_ioprio, &mb_optimize_scan, 0)) + goto failed_mount; + + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { +@@ -4640,6 +4689,19 @@ no_journal: + } + + ext4_ext_init(sb); ++ ++ /* ++ * Enable optimize_scan if number of groups is > threshold. This can be ++ * turned off by passing "mb_optimize_scan=0". This can also be ++ * turned on forcefully by passing "mb_optimize_scan=1". 
++ */ ++ if (mb_optimize_scan == 1) ++ set_opt2(sb, MB_OPTIMIZE_SCAN); ++ else if (mb_optimize_scan == 0) ++ clear_opt2(sb, MB_OPTIMIZE_SCAN); ++ else if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD) ++ set_opt2(sb, MB_OPTIMIZE_SCAN); ++ + err = ext4_mb_init(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", +@@ -5405,6 +5467,7 @@ static int ext4_remount(struct super_blo + int enable_quota = 0; + ext4_group_t g; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; ++ int mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN; + int err = 0; + #ifdef CONFIG_QUOTA + int i, j; +@@ -5451,7 +5514,8 @@ static int ext4_remount(struct super_blo + vfs_flags = SB_LAZYTIME | SB_I_VERSION; + sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags); + +- if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { ++ if (!parse_options(data, sb, NULL, &journal_ioprio, &mb_optimize_scan, ++ 1)) { + err = -EINVAL; + goto restore_opts; + } +Index: linux-stage/include/trace/events/ext4.h +=================================================================== +--- linux-stage.orig/include/trace/events/ext4.h ++++ linux-stage/include/trace/events/ext4.h +@@ -2707,6 +2707,50 @@ TRACE_EVENT(ext4_error, + __entry->function, __entry->line) + ); + ++TRACE_EVENT(ext4_prefetch_bitmaps, ++ TP_PROTO(struct super_block *sb, ext4_group_t group, ++ ext4_group_t next, unsigned int prefetch_ios), ++ ++ TP_ARGS(sb, group, next, prefetch_ios), ++ ++ TP_STRUCT__entry( ++ __field( dev_t, dev ) ++ __field( __u32, group ) ++ __field( __u32, next ) ++ __field( __u32, ios ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = sb->s_dev; ++ __entry->group = group; ++ __entry->next = next; ++ __entry->ios = prefetch_ios; ++ ), ++ ++ TP_printk("dev %d,%d group %u next %u ios %u", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->group, __entry->next, __entry->ios) ++); ++ ++TRACE_EVENT(ext4_lazy_itable_init, ++ TP_PROTO(struct super_block *sb, ext4_group_t group), ++ ++ TP_ARGS(sb, group), ++ ++ TP_STRUCT__entry( ++ __field( dev_t, dev ) ++ __field( __u32, group ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = sb->s_dev; ++ __entry->group = group; ++ ), ++ ++ TP_printk("dev %d,%d group %u", ++ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group) ++); ++ + #endif /* _TRACE_EXT4_H */ + + /* This part must be outside protection */ diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series index ab02384..c4f4ad6 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series @@ -37,3 +37,4 @@ rhel8.7/ext4-filename-encode.patch rhel8/ext4-old_ea_inodes_handling_fix.patch rhel8.4/ext4-optimize-find_delayed_extent.patch rhel8/ext4-encdata.patch +rhel8/ext4-mballoc-improve.patch -- 1.8.3.1
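
Note on the mb_optimize_scan default introduced in the ext4_fill_super() hunk above: the decision it encodes can be read as the small standalone sketch below. This is an illustration only, not part of the applied patch; it is plain userspace C with the kernel's set_opt2()/clear_opt2() calls replaced by a return value, and the two constants are the ones this patch defines (DEFAULT_MB_OPTIMIZE_SCAN in super.c, MB_DEFAULT_LINEAR_SCAN_THRESHOLD in mballoc.h).

/*
 * Sketch of the MB_OPTIMIZE_SCAN decision made at mount time by this
 * patch.  mount_opt is the parsed mb_optimize_scan= value, or -1
 * (DEFAULT_MB_OPTIMIZE_SCAN) when the option was not given on the
 * mount command line.
 */
#include <stdio.h>

#define DEFAULT_MB_OPTIMIZE_SCAN	 (-1)
#define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16

static int mb_optimize_scan_enabled(int mount_opt, unsigned int groups_count)
{
	if (mount_opt == 1)		/* mb_optimize_scan=1: force on */
		return 1;
	if (mount_opt == 0)		/* mb_optimize_scan=0: force off */
		return 0;
	/* option absent: enable only on filesystems with many block groups */
	return groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD;
}

int main(void)
{
	printf("8 groups, option absent:  %d\n",
	       mb_optimize_scan_enabled(DEFAULT_MB_OPTIMIZE_SCAN, 8));
	printf("64 groups, option absent: %d\n",
	       mb_optimize_scan_enabled(DEFAULT_MB_OPTIMIZE_SCAN, 64));
	printf("8 groups, mb_optimize_scan=1: %d\n",
	       mb_optimize_scan_enabled(1, 8));
	return 0;
}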