From 1534c43ccb034048d8ab0a22cb55635116eebe09 Mon Sep 17 00:00:00 2001 From: Bobi Jam Date: Tue, 3 Sep 2024 23:42:43 +0800 Subject: [PATCH] LU-14438 ldiskfs: backport ldiskfs mballoc patches This contains following kernel patches: cfd732377221 ("ext4: add prefetching for block allocation bitmaps") 3d392b2676bf ("ext4: add prefetch_block_bitmaps mount option") dddcd2f9ebde ("ext4: optimize the implementation of ext4_mb_good_group()") 67d251860461 ("ext4: drop s_mb_bal_lock and convert protected fields to atomic") a6c75eaf1103 ("ext4: add mballoc stats proc file") 4b68f6df1059 ("ext4: add MB_NUM_ORDERS macro") 196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning") 21175ca434c5 ("ext4: make prefetch_block_bitmaps default") 077d0c2c78df ("ext4: make mb_optimize_scan performance mount option work with extents") 3fa5d23e68a3 ("ext4: reflect mb_optimize_scan value in options file") 4fca50d440cc ("ext4: make mballoc try target group first even with mb_optimize_scan") 1940265ede66 ("ext4: avoid unnecessary spreading of allocations among groups") a9f2a2931d0e ("ext4: use locality group preallocation for small closed files") 83e80a6e3543 ("ext4: use buckets for cr 1 block scan instead of rbtree") 80fa46d6b9e7 ("ext4: limit the number of retries after discarding preallocations blocks") a078dff87013 ("ext4: fixup possible uninitialized variable access in ext4_mb_choose_next_group_cr1()") 820897258ad3 ("ext4: Refactor code related to freeing PAs") 5354b2af3406 ("ext4: allow ext4_get_group_info() to fail") 3c6296046c85 ("ext4: Don't skip prefetching BLOCK_UNINIT groups") 4f3d1e4533b0 ("ext4: Ensure ext4_mb_prefetch_fini() is called for all prefetched BGs") Signed-off-by: Bobi Jam Change-Id: I079dfb74bd743894934484803cedb683073e4d94 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51472 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Li Dongyang Reviewed-by: Alex Zhuravlev Reviewed-by: Oleg Drokin --- .../rhel8.8/ext4-add-MB_NUM_ORDERS-macro.patch | 118 +++ .../rhel8.8/ext4-add-mballoc-stats-proc-file.patch | 333 ++++++++ ...t4-add-prefetch-block-bitmap-mount-option.patch | 278 +++++++ ...-prefetching-for-block-allocation-bitmaps.patch | 291 +++++++ .../ext4-allow-ext4_get_group_info-to-fail.patch | 449 +++++++++++ ...oiod-unnecessary-spreading-of-allocations.patch | 69 ++ ...l_lock-convert-protected-fields-to-atomic.patch | 102 +++ ...refetch_fini-called-for-all-prefetched-bg.patch | 85 ++ ...init-var-in-ext4_mb_choose_next_group_cr1.patch | 40 + .../ext4-improve-cr0-cr1-group-scanning.patch | 867 +++++++++++++++++++++ ...-of-retries-after-discard-prealloc-blocks.patch | 71 ++ ...mb_optimize_scan-performance-with-extents.patch | 118 +++ .../ext4-make-mballoc-try-target-group-first.patch | 86 ++ .../ext4-make-prefetch_block_bitmaps-default.patch | 87 +++ ...-not-skip-prefetching-BLOCK_UNINIT-groups.patch | 66 ++ .../ext4-optimize-the-ext4_mb_good_group.patch | 59 ++ .../ext4-refactor-code-related-to-freeing-pa.patch | 104 +++ ...reflect-mb_optimize_scan-value-in-options.patch | 35 + .../ext4-use-buckets-for-cr1-block-scan.patch | 440 +++++++++++ ...ality-group-preallocation-for-small-files.patch | 79 ++ .../ext4-allow-ext4_get_group_info-to-fail.patch | 433 ++++++++++ ...-not-skip-prefetching-BLOCK_UNINIT-groups.patch | 51 ++ .../series/ldiskfs-4.18-rhel8.10.series | 19 + .../series/ldiskfs-4.18-rhel8.8.series | 20 + .../series/ldiskfs-5.14-rhel9.4.series | 3 + .../series/ldiskfs-5.14-rhel9.5.series | 2 + 26 files changed, 4305 insertions(+) create mode 
100644 ldiskfs/kernel_patches/patches/rhel8.8/ext4-add-MB_NUM_ORDERS-macro.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel8.8/ext4-add-mballoc-stats-proc-file.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel8.8/ext4-add-prefetch-block-bitmap-mount-option.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel8.8/ext4-add-prefetching-for-block-allocation-bitmaps.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel8.8/ext4-allow-ext4_get_group_info-to-fail.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel8.8/ext4-avoiod-unnecessary-spreading-of-allocations.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel8.8/ext4-drop-s_mb_bal_lock-convert-protected-fields-to-atomic.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel8.8/ext4-ensure-ext4_mb_prefetch_fini-called-for-all-prefetched-bg.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel8.8/ext4-fixup-possible-uninit-var-in-ext4_mb_choose_next_group_cr1.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel8.8/ext4-improve-cr0-cr1-group-scanning.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel8.8/ext4-limit-number-of-retries-after-discard-prealloc-blocks.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel8.8/ext4-make-mb_optimize_scan-performance-with-extents.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel8.8/ext4-make-mballoc-try-target-group-first.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel8.8/ext4-make-prefetch_block_bitmaps-default.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel8.8/ext4-not-skip-prefetching-BLOCK_UNINIT-groups.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel8.8/ext4-optimize-the-ext4_mb_good_group.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel8.8/ext4-refactor-code-related-to-freeing-pa.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel8.8/ext4-reflect-mb_optimize_scan-value-in-options.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel8.8/ext4-use-buckets-for-cr1-block-scan.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel8.8/ext4-use-locality-group-preallocation-for-small-files.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel9.4/ext4-allow-ext4_get_group_info-to-fail.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel9.4/ext4-not-skip-prefetching-BLOCK_UNINIT-groups.patch diff --git a/ldiskfs/kernel_patches/patches/rhel8.8/ext4-add-MB_NUM_ORDERS-macro.patch b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-add-MB_NUM_ORDERS-macro.patch new file mode 100644 index 0000000..64868c9 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-add-MB_NUM_ORDERS-macro.patch @@ -0,0 +1,118 @@ +commit 4b68f6df105966f04f45f1eca0561b86f2b3551d +Author: Harshad Shirwadkar +AuthorDate: Thu Apr 1 10:21:26 2021 -0700 +Commit: Theodore Ts'o +CommitDate: Fri Apr 9 11:34:59 2021 -0400 + +ext4: add MB_NUM_ORDERS macro + +A few arrays in mballoc.c use the total number of valid orders as +their size. Currently, this value is set as "sb->s_blocksize_bits + +2". This makes code harder to read. So, instead add a new macro +MB_NUM_ORDERS(sb) to make the code more readable. 
+ +Signed-off-by: Harshad Shirwadkar +Reviewed-by: Andreas Dilger +Reviewed-by: Ritesh Harjani +Link: https://lore.kernel.org/r/20210401172129.189766-5-harshadshirwadkar@gmail.com +Signed-off-by: Theodore Ts'o +--- +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -759,7 +759,7 @@ mb_set_largest_free_order(struct super_b + + grp->bb_largest_free_order = -1; /* uninit */ + +- bits = sb->s_blocksize_bits + 1; ++ bits = MB_NUM_ORDERS(sb) - 1; + for (i = bits; i >= 0; i--) { + if (grp->bb_counters[i] > 0) { + grp->bb_largest_free_order = i; +@@ -985,7 +985,7 @@ static int ext4_mb_init_cache(struct pag + grinfo->bb_fragments = 0; + memset(grinfo->bb_counters, 0, + sizeof(*grinfo->bb_counters) * +- (sb->s_blocksize_bits+2)); ++ (MB_NUM_ORDERS(sb))); + /* + * incore got set to the group block bitmap below + */ +@@ -1987,7 +1987,7 @@ void ext4_mb_simple_scan_group(struct ex + int max; + + BUG_ON(ac->ac_2order <= 0); +- for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { ++ for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) { + if (grp->bb_counters[i] == 0) + continue; + +@@ -2157,7 +2157,7 @@ static bool ext4_mb_good_group(struct ex + if (free < ac->ac_g_ex.fe_len) + return false; + +- if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ++ if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb)) + return true; + + if (grp->bb_largest_free_order < ac->ac_2order) +@@ -2368,13 +2368,13 @@ ext4_mb_regular_allocator(struct ext4_al + * We also support searching for power-of-two requests only for + * requests upto maximum buddy size we have constructed. + */ +- if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) { ++ if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { + /* + * This should tell if fe_len is exactly power of 2 + */ + if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) + ac->ac_2order = array_index_nospec(i - 1, +- sb->s_blocksize_bits + 2); ++ MB_NUM_ORDERS(sb)); + } + + /* if stream allocation is enabled, use global goal */ +@@ -3101,7 +3101,7 @@ int ext4_mb_init(struct super_block *sb) + unsigned max; + int ret; + +- i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); ++ i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets); + + sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); + if (sbi->s_mb_offsets == NULL) { +@@ -3109,7 +3109,7 @@ int ext4_mb_init(struct super_block *sb) + goto out; + } + +- i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); ++ i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs); + sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); + if (sbi->s_mb_maxs == NULL) { + ret = -ENOMEM; +@@ -3135,7 +3135,7 @@ int ext4_mb_init(struct super_block *sb) + offset_incr = offset_incr >> 1; + max = max >> 1; + i++; +- } while (i <= sb->s_blocksize_bits + 1); ++ } while (i < MB_NUM_ORDERS(sb)); + + spin_lock_init(&sbi->s_md_lock); + sbi->s_mb_free_pending = 0; +Index: linux-stage/fs/ext4/mballoc.h +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.h ++++ linux-stage/fs/ext4/mballoc.h +@@ -82,6 +82,11 @@ + */ + #define MB_DEFAULT_MAX_INODE_PREALLOC 512 + ++/* ++ * Number of valid buddy orders ++ */ ++#define MB_NUM_ORDERS(sb) ((sb)->s_blocksize_bits + 2) ++ + struct ext4_free_data { + /* this links the free block information from sb_info */ + struct list_head efd_list; diff --git a/ldiskfs/kernel_patches/patches/rhel8.8/ext4-add-mballoc-stats-proc-file.patch 
b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-add-mballoc-stats-proc-file.patch new file mode 100644 index 0000000..bc5492b --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-add-mballoc-stats-proc-file.patch @@ -0,0 +1,333 @@ +commit a6c75eaf11032f4a3d2b3ce2265a194ac6e4a7f0 +Author: Harshad Shirwadkar +AuthorDate: Thu Apr 1 10:21:25 2021 -0700 +Commit: Theodore Ts'o +CommitDate: Fri Apr 9 11:34:59 2021 -0400 + +ext4: add mballoc stats proc file + +Add new stats for measuring the performance of mballoc. This patch is +forked from Artem Blagodarenko's work that can be found here: + +https://github.com/lustre/lustre-release/blob/master/ldiskfs/kernel_patches/patches/rhel8/ext4-simple-blockalloc.patch + +This patch reorganizes the stats by cr level. This is how the output +looks like: + + mballoc: + reqs: 0 + success: 0 + groups_scanned: 0 + cr0_stats: + hits: 0 + groups_considered: 0 + useless_loops: 0 + bad_suggestions: 0 + cr1_stats: + hits: 0 + groups_considered: 0 + useless_loops: 0 + bad_suggestions: 0 + cr2_stats: + hits: 0 + groups_considered: 0 + useless_loops: 0 + cr3_stats: + hits: 0 + groups_considered: 0 + useless_loops: 0 + extents_scanned: 0 + goal_hits: 0 + 2^n_hits: 0 + breaks: 0 + lost: 0 + buddies_generated: 0/40 + buddies_time_used: 0 + preallocated: 0 + discarded: 0 + +Signed-off-by: Harshad Shirwadkar +Reviewed-by: Andreas Dilger +Reviewed-by: Ritesh Harjani +Link: https://lore.kernel.org/r/20210401172129.189766-4-harshadshirwadkar@gmail.com +Signed-off-by: Theodore Ts'o +--- +Index: linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h ++++ linux-stage/fs/ext4/ext4.h +@@ -1516,11 +1516,13 @@ struct ext4_sb_info { + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ ++ atomic_t s_bal_groups_scanned; /* number of groups scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ +- /* cX loop didn't find blocks */ +- atomic64_t s_bal_cX_failed[4]; ++ atomic64_t s_bal_cX_groups_considered[4]; ++ atomic64_t s_bal_cX_hits[4]; ++ atomic64_t s_bal_cX_failed[4]; /* cX loop didn't find blocks */ + atomic64_t s_bal_cX_skipped[3]; + atomic_t s_mb_buddies_generated; /* number of buddies generated */ + atomic64_t s_mb_generation_time; +@@ -2702,6 +2704,7 @@ extern const struct file_operations ext4 + extern int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v); + extern long ext4_mb_stats; + extern long ext4_mb_max_to_scan; ++extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); + extern int ext4_mb_init(struct super_block *); + extern int ext4_mb_release(struct super_block *); + extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -2191,10 +2191,13 @@ static int ext4_mb_good_group_nolock(str + { + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); + struct super_block *sb = ac->ac_sb; ++ struct ext4_sb_info *sbi = EXT4_SB(sb); + bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK; + ext4_grpblk_t free; + int ret = 0; + ++ if (sbi->s_mb_stats) ++ atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); + if (should_lock) + ext4_lock_group(sb, group); + free = 
grp->bb_free; +@@ -2488,7 +2491,7 @@ repeat: + break; + } + /* Processed all groups and haven't found blocks */ +- if (i == ngroups) ++ if (sbi->s_mb_stats && i == ngroups) + atomic64_inc(&sbi->s_bal_cX_failed[cr]); + } + +@@ -2517,6 +2520,9 @@ repeat: + goto repeat; + } + } ++ ++ if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) ++ atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]); + out: + if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) + err = first_err; +@@ -2623,6 +2629,67 @@ const struct seq_operations ext4_mb_seq_ + .show = ext4_mb_seq_groups_show, + }; + ++int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) ++{ ++ struct super_block *sb = (struct super_block *)seq->private; ++ struct ext4_sb_info *sbi = EXT4_SB(sb); ++ ++ seq_puts(seq, "mballoc:\n"); ++ if (!sbi->s_mb_stats) { ++ seq_puts(seq, "\tmb stats collection turned off.\n"); ++ seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); ++ return 0; ++ } ++ seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); ++ seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); ++ ++ seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); ++ ++ seq_puts(seq, "\tcr0_stats:\n"); ++ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0])); ++ seq_printf(seq, "\t\tgroups_considered: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_groups_considered[0])); ++ seq_printf(seq, "\t\tuseless_loops: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_failed[0])); ++ ++ seq_puts(seq, "\tcr1_stats:\n"); ++ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1])); ++ seq_printf(seq, "\t\tgroups_considered: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_groups_considered[1])); ++ seq_printf(seq, "\t\tuseless_loops: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_failed[1])); ++ ++ seq_puts(seq, "\tcr2_stats:\n"); ++ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2])); ++ seq_printf(seq, "\t\tgroups_considered: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_groups_considered[2])); ++ seq_printf(seq, "\t\tuseless_loops: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_failed[2])); ++ ++ seq_puts(seq, "\tcr3_stats:\n"); ++ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3])); ++ seq_printf(seq, "\t\tgroups_considered: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_groups_considered[3])); ++ seq_printf(seq, "\t\tuseless_loops: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_failed[3])); ++ seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); ++ seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); ++ seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); ++ seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); ++ seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); ++ ++ seq_printf(seq, "\tbuddies_generated: %u/%u\n", ++ atomic_read(&sbi->s_mb_buddies_generated), ++ ext4_get_groups_count(sb)); ++ seq_printf(seq, "\tbuddies_time_used: %llu\n", ++ atomic64_read(&sbi->s_mb_generation_time)); ++ seq_printf(seq, "\tpreallocated: %u\n", ++ atomic_read(&sbi->s_mb_preallocated)); ++ seq_printf(seq, "\tdiscarded: %u\n", ++ atomic_read(&sbi->s_mb_discarded)); ++ return 0; ++} ++ + static int ext4_mb_check_and_update_prealloc(struct ext4_sb_info *sbi, + char *str, size_t cnt, + int update) +@@ -2777,97 +2844,6 @@ const struct file_operations ext4_seq_mb + .write = ext4_mb_last_group_write, + }; + +-static int mb_seq_alloc_show(struct seq_file *seq, 
void *v) +-{ +- struct super_block *sb = seq->private; +- struct ext4_sb_info *sbi = EXT4_SB(sb); +- +- seq_printf(seq, "mballoc:\n"); +- seq_printf(seq, "\tblocks: %u\n", atomic_read(&sbi->s_bal_allocated)); +- seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); +- seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); +- +- seq_printf(seq, "\textents_scanned: %u\n", +- atomic_read(&sbi->s_bal_ex_scanned)); +- seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); +- seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); +- seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); +- seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); +- +- seq_printf(seq, "\tuseless_c0_loops: %llu\n", +- (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[0])); +- seq_printf(seq, "\tuseless_c1_loops: %llu\n", +- (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[1])); +- seq_printf(seq, "\tuseless_c2_loops: %llu\n", +- (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[2])); +- seq_printf(seq, "\tuseless_c3_loops: %llu\n", +- (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[3])); +- seq_printf(seq, "\tskipped_c0_loops: %llu\n", +- (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[0])); +- seq_printf(seq, "\tskipped_c1_loops: %llu\n", +- (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[1])); +- seq_printf(seq, "\tskipped_c2_loops: %llu\n", +- (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[2])); +- seq_printf(seq, "\tbuddies_generated: %u\n", +- atomic_read(&sbi->s_mb_buddies_generated)); +- seq_printf(seq, "\tbuddies_time_used: %llu\n", +- atomic64_read(&sbi->s_mb_generation_time)); +- seq_printf(seq, "\tpreallocated: %u\n", +- atomic_read(&sbi->s_mb_preallocated)); +- seq_printf(seq, "\tdiscarded: %u\n", +- atomic_read(&sbi->s_mb_discarded)); +- return 0; +-} +- +-static ssize_t mb_seq_alloc_write(struct file *file, +- const char __user *buf, +- size_t cnt, loff_t *pos) +-{ +- struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file))); +- +- atomic_set(&sbi->s_bal_allocated, 0), +- atomic_set(&sbi->s_bal_reqs, 0), +- atomic_set(&sbi->s_bal_success, 0); +- +- atomic_set(&sbi->s_bal_ex_scanned, 0), +- atomic_set(&sbi->s_bal_goals, 0), +- atomic_set(&sbi->s_bal_2orders, 0), +- atomic_set(&sbi->s_bal_breaks, 0), +- atomic_set(&sbi->s_mb_lost_chunks, 0); +- +- atomic64_set(&sbi->s_bal_cX_failed[0], 0), +- atomic64_set(&sbi->s_bal_cX_failed[1], 0), +- atomic64_set(&sbi->s_bal_cX_failed[2], 0); +- atomic64_set(&sbi->s_bal_cX_failed[3], 0); +- +- atomic64_set(&sbi->s_bal_cX_skipped[0], 0), +- atomic64_set(&sbi->s_bal_cX_skipped[1], 0), +- atomic64_set(&sbi->s_bal_cX_skipped[2], 0); +- +- +- atomic_set(&sbi->s_mb_buddies_generated, 0); +- atomic64_set(&sbi->s_mb_generation_time, 0); +- +- atomic_set(&sbi->s_mb_preallocated, 0), +- atomic_set(&sbi->s_mb_discarded, 0); +- +- return cnt; +-} +- +-static int mb_seq_alloc_open(struct inode *inode, struct file *file) +-{ +- return single_open(file, mb_seq_alloc_show, PDE_DATA(inode)); +-} +- +-const struct file_operations ext4_mb_seq_alloc_fops = { +- .owner = THIS_MODULE, +- .open = mb_seq_alloc_open, +- .read = seq_read, +- .llseek = seq_lseek, +- .release = single_release, +- .write = mb_seq_alloc_write, +-}; +- + int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v) + { + struct ext4_sb_info *sbi = EXT4_SB(m->private); +@@ -3350,9 +3326,10 @@ int ext4_mb_release(struct super_block * + (unsigned long 
long)atomic64_read(&sbi->s_bal_cX_skipped[1]), + (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[2])); + ext4_msg(sb, KERN_INFO, +- "mballoc: %u extents scanned, %u goal hits, " ++ "mballoc: %u extents scanned, %u groups scanned, %u goal hits, " + "%u 2^N hits, %u breaks, %u lost", + atomic_read(&sbi->s_bal_ex_scanned), ++ atomic_read(&sbi->s_bal_groups_scanned), + atomic_read(&sbi->s_bal_goals), + atomic_read(&sbi->s_bal_2orders), + atomic_read(&sbi->s_bal_breaks), +@@ -3871,12 +3848,13 @@ static void ext4_mb_collect_stats(struct + { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + +- if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { ++ if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) { + atomic_inc(&sbi->s_bal_reqs); + atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); + if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) + atomic_inc(&sbi->s_bal_success); + atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); ++ atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned); + if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && + ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) + atomic_inc(&sbi->s_bal_goals); +Index: linux-stage/fs/ext4/sysfs.c +=================================================================== +--- linux-stage.orig/fs/ext4/sysfs.c ++++ linux-stage/fs/ext4/sysfs.c +@@ -477,14 +477,14 @@ int ext4_register_sysfs(struct super_blo + sb); + proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_ops, sb); ++ proc_create_single_data("mb_stats", 0444, sbi->s_proc, ++ ext4_seq_mb_stats_show, sb); + proc_create_data("prealloc_table", S_IRUGO, sbi->s_proc, + &ext4_seq_prealloc_table_fops, sb); + proc_create_data("mb_last_group", S_IRUGO, sbi->s_proc, + &ext4_seq_mb_last_group_fops, sb); + proc_create_single_data("mb_last_start", S_IRUGO, sbi->s_proc, + ext4_mb_seq_last_start_seq_show, sb); +- proc_create_data("mb_alloc_stats", S_IFREG | S_IRUGO | S_IWUSR, +- sbi->s_proc, &ext4_mb_seq_alloc_fops, sb); + } + return 0; + } diff --git a/ldiskfs/kernel_patches/patches/rhel8.8/ext4-add-prefetch-block-bitmap-mount-option.patch b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-add-prefetch-block-bitmap-mount-option.patch new file mode 100644 index 0000000..7634b7b --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-add-prefetch-block-bitmap-mount-option.patch @@ -0,0 +1,278 @@ +commit 3d392b2676bf3199863a1e5efb2c087ad9d442a4 +Author: Theodore Ts'o +AuthorDate: Fri Jul 17 00:14:40 2020 -0400 +Commit: Theodore Ts'o +CommitDate: Fri Aug 7 14:12:35 2020 -0400 + +ext4: add prefetch_block_bitmaps mount option + +For file systems where we can afford to keep the buddy bitmaps cached, +we can speed up initial writes to large file systems by starting to +load the block allocation bitmaps as soon as the file system is +mounted. This won't work well for _super_ large file systems, or +memory constrained systems, so we only enable this when it is +requested via a mount option. 
+ +Addresses-Google-Bug: 159488342 +Signed-off-by: Theodore Ts'o +Reviewed-by: Andreas Dilger +--- +Index: linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h ++++ linux-stage/fs/ext4/ext4.h +@@ -1171,6 +1171,7 @@ struct ext4_inode_info { + #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ + #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ + #define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ ++#define EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS 0x4000000 + #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ + #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ + #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +@@ -2402,9 +2403,15 @@ struct ext4_lazy_init { + struct mutex li_list_mtx; + }; + ++enum ext4_li_mode { ++ EXT4_LI_MODE_PREFETCH_BBITMAP, ++ EXT4_LI_MODE_ITABLE, ++}; ++ + struct ext4_li_request { + struct super_block *lr_super; +- struct ext4_sb_info *lr_sbi; ++ enum ext4_li_mode lr_mode; ++ ext4_group_t lr_first_not_zeroed; + ext4_group_t lr_next_group; + struct list_head lr_request; + unsigned long lr_next_sched; +@@ -2704,6 +2711,12 @@ extern int ext4_mb_reserve_blocks(struct + extern void ext4_discard_preallocations(struct inode *, unsigned int); + extern int __init ext4_init_mballoc(void); + extern void ext4_exit_mballoc(void); ++extern ext4_group_t ext4_mb_prefetch(struct super_block *sb, ++ ext4_group_t group, ++ unsigned int nr, int *cnt); ++extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, ++ unsigned int nr); ++ + extern void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags); +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -2249,9 +2249,8 @@ static u64 available_blocks_count(struct + * Start prefetching @nr block bitmaps starting at @group. + * Return the next group which needs to be prefetched. + */ +-static ext4_group_t +-ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, +- unsigned int nr, int *cnt) ++ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, ++ unsigned int nr, int *cnt) + { + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct buffer_head *bh; +@@ -2302,9 +2301,8 @@ ext4_mb_prefetch(struct super_block *sb, + * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator(). 
+ */ + +-static void +-ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, +- unsigned int nr) ++void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, ++ unsigned int nr) + { + while (nr-- > 0) { + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c ++++ linux-stage/fs/ext4/super.c +@@ -1623,6 +1623,7 @@ enum { + Opt_mb_c1_threshold, Opt_mb_c2_threshold, Opt_mb_c3_threshold, + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, + Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, ++ Opt_prefetch_block_bitmaps, + }; + + static const match_table_t tokens = { +@@ -1716,6 +1717,7 @@ static const match_table_t tokens = { + {Opt_test_dummy_encryption, "test_dummy_encryption"}, + {Opt_nombcache, "nombcache"}, + {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ ++ {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"}, + {Opt_removed, "check=none"}, /* mount option from ext2/3 */ + {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ + {Opt_removed, "reservation"}, /* mount option from ext2/3 */ +@@ -1938,6 +1940,8 @@ static const struct mount_opts { + {Opt_mb_c3_threshold, 0, MOPT_STRING}, + {Opt_test_dummy_encryption, 0, MOPT_GTE0}, + {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, ++ {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS, ++ MOPT_SET}, + {Opt_err, 0, 0} + }; + +@@ -3249,15 +3253,34 @@ static void print_daily_error_info(struc + static int ext4_run_li_request(struct ext4_li_request *elr) + { + struct ext4_group_desc *gdp = NULL; +- ext4_group_t group, ngroups; +- struct super_block *sb; ++ struct super_block *sb = elr->lr_super; ++ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; ++ ext4_group_t group = elr->lr_next_group; ++ unsigned int prefetch_ios = 0; + int ret = 0; + u64 start_time; + +- sb = elr->lr_super; +- ngroups = EXT4_SB(sb)->s_groups_count; ++ if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) { ++ elr->lr_next_group = ext4_mb_prefetch(sb, group, ++ EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios); ++ if (prefetch_ios) ++ ext4_mb_prefetch_fini(sb, elr->lr_next_group, ++ prefetch_ios); ++ trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, ++ prefetch_ios); ++ if (group >= elr->lr_next_group) { ++ ret = 1; ++ if (elr->lr_first_not_zeroed != ngroups && ++ !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) { ++ elr->lr_next_group = elr->lr_first_not_zeroed; ++ elr->lr_mode = EXT4_LI_MODE_ITABLE; ++ ret = 0; ++ } ++ } ++ return ret; ++ } + +- for (group = elr->lr_next_group; group < ngroups; group++) { ++ for (; group < ngroups; group++) { + gdp = ext4_get_group_desc(sb, group, NULL); + if (!gdp) { + ret = 1; +@@ -3275,9 +3298,10 @@ static int ext4_run_li_request(struct ex + start_time = ktime_get_real_ns(); + ret = ext4_init_inode_table(sb, group, + elr->lr_timeout ? 
0 : 1); ++ trace_ext4_lazy_itable_init(sb, group); + if (elr->lr_timeout == 0) { + elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) * +- elr->lr_sbi->s_li_wait_mult); ++ EXT4_SB(elr->lr_super)->s_li_wait_mult); + } + elr->lr_next_sched = jiffies + elr->lr_timeout; + elr->lr_next_group = group + 1; +@@ -3291,15 +3315,11 @@ static int ext4_run_li_request(struct ex + */ + static void ext4_remove_li_request(struct ext4_li_request *elr) + { +- struct ext4_sb_info *sbi; +- + if (!elr) + return; + +- sbi = elr->lr_sbi; +- + list_del(&elr->lr_request); +- sbi->s_li_request = NULL; ++ EXT4_SB(elr->lr_super)->s_li_request = NULL; + kfree(elr); + } + +@@ -3508,7 +3528,6 @@ static int ext4_li_info_new(void) + static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, + ext4_group_t start) + { +- struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_li_request *elr; + + elr = kzalloc(sizeof(*elr), GFP_KERNEL); +@@ -3516,8 +3535,13 @@ static struct ext4_li_request *ext4_li_r + return NULL; + + elr->lr_super = sb; +- elr->lr_sbi = sbi; +- elr->lr_next_group = start; ++ elr->lr_first_not_zeroed = start; ++ if (test_opt(sb, PREFETCH_BLOCK_BITMAPS)) ++ elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; ++ else { ++ elr->lr_mode = EXT4_LI_MODE_ITABLE; ++ elr->lr_next_group = start; ++ } + + /* + * Randomize first schedule time of the request to +@@ -3547,8 +3571,9 @@ int ext4_register_li_request(struct supe + goto out; + } + +- if (first_not_zeroed == ngroups || sb_rdonly(sb) || +- !test_opt(sb, INIT_INODE_TABLE)) ++ if (!test_opt(sb, PREFETCH_BLOCK_BITMAPS) && ++ (first_not_zeroed == ngroups || sb_rdonly(sb) || ++ !test_opt(sb, INIT_INODE_TABLE))) + goto out; + + elr = ext4_li_request_new(sb, first_not_zeroed); +Index: linux-stage/include/trace/events/ext4.h +=================================================================== +--- linux-stage.orig/include/trace/events/ext4.h ++++ linux-stage/include/trace/events/ext4.h +@@ -2712,6 +2712,50 @@ TRACE_EVENT(ext4_error, + __entry->function, __entry->line) + ); + ++TRACE_EVENT(ext4_prefetch_bitmaps, ++ TP_PROTO(struct super_block *sb, ext4_group_t group, ++ ext4_group_t next, unsigned int prefetch_ios), ++ ++ TP_ARGS(sb, group, next, prefetch_ios), ++ ++ TP_STRUCT__entry( ++ __field( dev_t, dev ) ++ __field( __u32, group ) ++ __field( __u32, next ) ++ __field( __u32, ios ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = sb->s_dev; ++ __entry->group = group; ++ __entry->next = next; ++ __entry->ios = prefetch_ios; ++ ), ++ ++ TP_printk("dev %d,%d group %u next %u ios %u", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->group, __entry->next, __entry->ios) ++); ++ ++TRACE_EVENT(ext4_lazy_itable_init, ++ TP_PROTO(struct super_block *sb, ext4_group_t group), ++ ++ TP_ARGS(sb, group), ++ ++ TP_STRUCT__entry( ++ __field( dev_t, dev ) ++ __field( __u32, group ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = sb->s_dev; ++ __entry->group = group; ++ ), ++ ++ TP_printk("dev %d,%d group %u", ++ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group) ++); ++ + #endif /* _TRACE_EXT4_H */ + + /* This part must be outside protection */ diff --git a/ldiskfs/kernel_patches/patches/rhel8.8/ext4-add-prefetching-for-block-allocation-bitmaps.patch b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-add-prefetching-for-block-allocation-bitmaps.patch new file mode 100644 index 0000000..9360a72 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-add-prefetching-for-block-allocation-bitmaps.patch @@ -0,0 +1,291 @@ +commit 
cfd73237722135807967f389bcbda558a60a30d6 +Author: Alex Zhuravlev +AuthorDate: Tue Apr 21 10:54:07 2020 +0300 +Commit: Theodore Ts'o +CommitDate: Thu Aug 6 01:44:48 2020 -0400 + +ext4: add prefetching for block allocation bitmaps + +This should significantly improve bitmap loading, especially for flex +groups as it tries to load all bitmaps within a flex.group instead of +one by one synchronously. + +Prefetching is done in 8 * flex_bg groups, so it should be 8 +read-ahead reads for a single allocating thread. At the end of +allocation the thread waits for read-ahead completion and initializes +buddy information so that read-aheads are not lost in case of memory +pressure. + +At cr=0 the number of prefetching IOs is limited per allocation +context to prevent a situation when mballoc loads thousands of bitmaps +looking for a perfect group and ignoring groups with good chunks. + +Together with the patch "ext4: limit scanning of uninitialized groups" +the mount time (which includes few tiny allocations) of a 1PB +filesystem is reduced significantly: + + 0% full 50%-full unpatched patched + mount time 33s 9279s 563s + +[ Restructured by tytso; removed the state flags in the allocation +context, so it can be used to lazily prefetch the allocation bitmaps +immediately after the file system is mounted. Skip prefetching +block groups which are uninitialized. Finally pass in the +REQ_RAHEAD flag to the block layer while prefetching. ] + +Signed-off-by: Alex Zhuravlev +Reviewed-by: Andreas Dilger +Signed-off-by: Theodore Ts'o + +--- +Index: linux-stage/fs/ext4/balloc.c +=================================================================== +--- linux-stage.orig/fs/ext4/balloc.c ++++ linux-stage/fs/ext4/balloc.c +@@ -498,7 +498,8 @@ ext4_read_block_bitmap_nowait(struct sup + trace_ext4_read_block_bitmap_load(sb, block_group); + bh->b_end_io = ext4_end_bitmap_read; + get_bh(bh); +- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); ++ submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO | ++ (ignore_locked ? REQ_RAHEAD : 0), bh); + return bh; + verify: + err = ext4_validate_block_bitmap(sb, desc, block_group, bh); +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -2246,97 +2246,93 @@ static u64 available_blocks_count(struct + } + + /* +- * each allocation context (i.e. a thread doing allocation) has own +- * sliding prefetch window of @s_mb_prefetch size which starts at the +- * very first goal and moves ahead of scaning. +- * a side effect is that subsequent allocations will likely find +- * the bitmaps in cache or at least in-flight. ++ * Start prefetching @nr block bitmaps starting at @group. ++ * Return the next group which needs to be prefetched. 
+ */ +-static void +-ext4_mb_prefetch(struct ext4_allocation_context *ac, +- ext4_group_t start) ++static ext4_group_t ++ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, ++ unsigned int nr, int *cnt) + { +- struct super_block *sb = ac->ac_sb; + ext4_group_t ngroups = ext4_get_groups_count(sb); +- struct ext4_sb_info *sbi = EXT4_SB(sb); +- struct ext4_group_info *grp; +- ext4_group_t group = start; + struct buffer_head *bh; +- int nr; +- +- /* limit prefetching at cr=0, otherwise mballoc can +- * spend a lot of time loading imperfect groups */ +- if (ac->ac_criteria < 2 && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit) +- return; +- +- /* batch prefetching to get few READs in flight */ +- nr = ac->ac_prefetch - group; +- if (ac->ac_prefetch < group) +- /* wrapped to the first groups */ +- nr += ngroups; +- if (nr > 0) +- return; +- BUG_ON(nr < 0); ++ struct blk_plug plug; + +- nr = sbi->s_mb_prefetch; +- if (ext4_has_feature_flex_bg(sb)) { +- /* align to flex_bg to get more bitmas with a single IO */ +- nr = (group / sbi->s_mb_prefetch) * sbi->s_mb_prefetch; +- nr = nr + sbi->s_mb_prefetch - group; +- } ++ blk_start_plug(&plug); + while (nr-- > 0) { +- grp = ext4_get_group_info(sb, group); +- /* prevent expensive getblk() on groups w/ IO in progress */ +- if (EXT4_MB_GRP_TEST(grp) || EXT4_MB_GRP_TEST_AND_SET_READ(grp)) +- goto next; +- +- /* ignore empty groups - those will be skipped +- * during the scanning as well */ +- if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) { ++ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, ++ NULL); ++ struct ext4_group_info *grp = ext4_get_group_info(sb, group); ++ ++ /* ++ * Prefetch block groups with free blocks; but don't ++ * bother if it is marked uninitialized on disk, since ++ * it won't require I/O to read. Also only try to ++ * prefetch once, so we avoid getblk() call, which can ++ * be expensive. ++ */ ++ if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) && ++ EXT4_MB_GRP_NEED_INIT(grp) && ++ ext4_free_group_clusters(sb, gdp) > 0 && ++ !(ext4_has_group_desc_csum(sb) && ++ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { + bh = ext4_read_block_bitmap_nowait(sb, group, 1); + if (bh && !IS_ERR(bh)) { +- if (!buffer_uptodate(bh)) +- ac->ac_prefetch_ios++; ++ if (!buffer_uptodate(bh) && cnt) ++ (*cnt)++; + brelse(bh); + } + } +-next: + if (++group >= ngroups) + group = 0; + } +- ac->ac_prefetch = group; ++ blk_finish_plug(&plug); ++ return group; + } + ++/* ++ * Prefetching reads the block bitmap into the buffer cache; but we ++ * need to make sure that the buddy bitmap in the page cache has been ++ * initialized. Note that ext4_mb_init_group() will block if the I/O ++ * is not yet completed, or indeed if it was not initiated by ++ * ext4_mb_prefetch did not start the I/O. ++ * ++ * TODO: We should actually kick off the buddy bitmap setup in a work ++ * queue when the buffer I/O is completed, so that we don't block ++ * waiting for the block allocation bitmap read to finish when ++ * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator(). 
++ */ ++ + static void +-ext4_mb_prefetch_fini(struct ext4_allocation_context *ac) ++ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, ++ unsigned int nr) + { +- struct ext4_group_info *grp; +- ext4_group_t group; +- int nr, rc; +- +- /* initialize last window of prefetched groups */ +- nr = ac->ac_prefetch_ios; +- if (nr > EXT4_SB(ac->ac_sb)->s_mb_prefetch) +- nr = EXT4_SB(ac->ac_sb)->s_mb_prefetch; +- group = ac->ac_prefetch; + while (nr-- > 0) { +- grp = ext4_get_group_info(ac->ac_sb, group); +- if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) { +- rc = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS); +- if (rc) ++ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, ++ NULL); ++ struct ext4_group_info *grp = ext4_get_group_info(sb, group); ++ ++ if (!group) ++ group = ext4_get_groups_count(sb); ++ group--; ++ grp = ext4_get_group_info(sb, group); ++ ++ if (EXT4_MB_GRP_NEED_INIT(grp) && ++ ext4_free_group_clusters(sb, gdp) > 0 && ++ !(ext4_has_group_desc_csum(sb) && ++ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { ++ if (ext4_mb_init_group(sb, group, GFP_NOFS)) + break; + } +- if (group-- == 0) +- group = ext4_get_groups_count(ac->ac_sb) - 1; + } + } + + static noinline_for_stack int + ext4_mb_regular_allocator(struct ext4_allocation_context *ac) + { +- ext4_group_t ngroups, group, i; ++ ext4_group_t prefetch_grp = 0, ngroups, group, i; + int cr = -1; + int err = 0, first_err = 0; ++ unsigned int nr = 0, prefetch_ios = 0; + struct ext4_sb_info *sbi; + struct super_block *sb; + struct ext4_buddy e4b; +@@ -2420,7 +2416,7 @@ repeat: + * from the goal value specified + */ + group = ac->ac_g_ex.fe_group; +- ac->ac_prefetch = group; ++ prefetch_grp = group; + + for (i = 0; i < ngroups; group++, i++) { + int ret = 0; +@@ -2432,7 +2428,28 @@ repeat: + if (group >= ngroups) + group = 0; + +- ext4_mb_prefetch(ac, group); ++ /* ++ * Batch reads of the block allocation bitmaps ++ * to get multiple READs in flight; limit ++ * prefetching at cr=0/1, otherwise mballoc can ++ * spend a lot of time loading imperfect groups ++ */ ++ if ((prefetch_grp == group) && ++ (cr > 1 || ++ prefetch_ios < sbi->s_mb_prefetch_limit)) { ++ unsigned int curr_ios = prefetch_ios; ++ ++ nr = sbi->s_mb_prefetch; ++ if (ext4_has_feature_flex_bg(sb)) { ++ nr = (group / sbi->s_mb_prefetch) * ++ sbi->s_mb_prefetch; ++ nr = nr + sbi->s_mb_prefetch - group; ++ } ++ prefetch_grp = ext4_mb_prefetch(sb, group, ++ nr, &prefetch_ios); ++ if (prefetch_ios == curr_ios) ++ nr = 0; ++ } + + /* This now checks without needing the buddy page */ + ret = ext4_mb_good_group_nolock(ac, group, cr); +@@ -2512,8 +2529,8 @@ out: + ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, + ac->ac_flags, cr, err); + +- /* use prefetched bitmaps to init buddy so that read info is not lost */ +- ext4_mb_prefetch_fini(ac); ++ if (nr) ++ ext4_mb_prefetch_fini(sb, prefetch_grp, nr); + return err; + } + +@@ -3012,6 +3029,26 @@ static int ext4_mb_init_backend(struct s + goto err_freebuddy; + } + ++ if (ext4_has_feature_flex_bg(sb)) { ++ /* a single flex group is supposed to be read by a single IO */ ++ sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex; ++ sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ ++ } else { ++ sbi->s_mb_prefetch = 32; ++ } ++ if (sbi->s_mb_prefetch > ext4_get_groups_count(sb)) ++ sbi->s_mb_prefetch = ext4_get_groups_count(sb); ++ /* now many real IOs to prefetch within a single allocation at cr=0 ++ * given cr=0 is an CPU-related optimization we shouldn't try to ++ * load 
too many groups, at some point we should start to use what ++ * we've got in memory. ++ * with an average random access time 5ms, it'd take a second to get ++ * 200 groups (* N with flex_bg), so let's make this limit 4 ++ */ ++ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4; ++ if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb)) ++ sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb); ++ + return 0; + + err_freebuddy: diff --git a/ldiskfs/kernel_patches/patches/rhel8.8/ext4-allow-ext4_get_group_info-to-fail.patch b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-allow-ext4_get_group_info-to-fail.patch new file mode 100644 index 0000000..f3fb31e --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-allow-ext4_get_group_info-to-fail.patch @@ -0,0 +1,449 @@ +commit 5354b2af34064a4579be8bc0e2f15a7b70f14b5f +Author: Theodore Ts'o +AuthorDate: Sat Apr 29 00:06:28 2023 -0400 +Commit: Theodore Ts'o +CommitDate: Sat May 13 18:02:46 2023 -0400 + +ext4: allow ext4_get_group_info() to fail + +Previously, ext4_get_group_info() would treat an invalid group number +as BUG(), since in theory it should never happen. However, if a +malicious attaker (or fuzzer) modifies the superblock via the block +device while it is the file system is mounted, it is possible for +s_first_data_block to get set to a very large number. In that case, +when calculating the block group of some block number (such as the +starting block of a preallocation region), could result in an +underflow and very large block group number. Then the BUG_ON check in +ext4_get_group_info() would fire, resutling in a denial of service +attack that can be triggered by root or someone with write access to +the block device. + +For a quality of implementation perspective, it's best that even if +the system administrator does something that they shouldn't, that it +will not trigger a BUG. So instead of BUG'ing, ext4_get_group_info() +will call ext4_error and return NULL. We also add fallback code in +all of the callers of ext4_get_group_info() that it might NULL. + +Also, since ext4_get_group_info() was already borderline to be an +inline function, un-inline it. The results in a next reduction of the +compiled text size of ext4 by roughly 2k. + +Cc: stable@kernel.org +Link: https://lore.kernel.org/r/20230430154311.579720-2-tytso@mit.edu +Reported-by: syzbot+e2efa3efc15a1c9e95c3@syzkaller.appspotmail.com +Link: https://syzkaller.appspot.com/bug?id=69b28112e098b070f639efb356393af3ffec4220 +Signed-off-by: Theodore Ts'o +Reviewed-by: Jan Kara +--- +Index: linux-stage/fs/ext4/balloc.c +=================================================================== +--- linux-stage.orig/fs/ext4/balloc.c ++++ linux-stage/fs/ext4/balloc.c +@@ -303,6 +303,22 @@ struct ext4_group_desc * ext4_get_group_ + return desc; + } + ++struct ext4_group_info *ext4_get_group_info(struct super_block *sb, ++ ext4_group_t group) ++{ ++ struct ext4_group_info **grp_info; ++ long indexv, indexh; ++ ++ if (unlikely(group >= EXT4_SB(sb)->s_groups_count)) { ++ ext4_error(sb, "invalid group %u", group); ++ return NULL; ++ } ++ indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); ++ indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); ++ grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv); ++ return grp_info[indexh]; ++} ++ + /* + * Return the block number which was discovered to be invalid, or 0 if + * the block bitmap is valid. 
+@@ -372,7 +388,7 @@ static int ext4_validate_block_bitmap(st + + if (buffer_verified(bh)) + return 0; +- if (EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) ++ if (!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + return -EFSCORRUPTED; + + ext4_lock_group(sb, block_group); +Index: linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h ++++ linux-stage/fs/ext4/ext4.h +@@ -2554,6 +2554,8 @@ extern void ext4_check_blocks_bitmap(str + extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, + ext4_group_t block_group, + struct buffer_head ** bh); ++extern struct ext4_group_info *ext4_get_group_info(struct super_block *sb, ++ ext4_group_t group); + extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); + + extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, +@@ -3129,19 +3131,6 @@ static inline void ext4_isize_set(struct + raw_inode->i_size_high = cpu_to_le32(i_size >> 32); + } + +-static inline +-struct ext4_group_info *ext4_get_group_info(struct super_block *sb, +- ext4_group_t group) +-{ +- struct ext4_group_info **grp_info; +- long indexv, indexh; +- BUG_ON(group >= EXT4_SB(sb)->s_groups_count); +- indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); +- indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); +- grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv); +- return grp_info[indexh]; +-} +- + /* + * Reading s_groups_count requires using smp_rmb() afterwards. See + * the locking protocol documented in the comments of ext4_group_add() +Index: linux-stage/fs/ext4/ialloc.c +=================================================================== +--- linux-stage.orig/fs/ext4/ialloc.c ++++ linux-stage/fs/ext4/ialloc.c +@@ -87,7 +87,7 @@ static int ext4_validate_inode_bitmap(st + + if (buffer_verified(bh)) + return 0; +- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) ++ if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + return -EFSCORRUPTED; + + ext4_lock_group(sb, block_group); +@@ -296,7 +296,7 @@ void ext4_free_inode(handle_t *handle, s + bitmap_bh = NULL; + goto error_return; + } +- if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) { ++ if (!grp || unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) { + fatal = -EFSCORRUPTED; + goto error_return; + } +@@ -916,13 +916,13 @@ got_group: + + grp = ext4_get_group_info(sb, group); + /* Skip groups with already-known suspicious inode tables */ +- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) ++ if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + goto next_group; + + brelse(inode_bitmap_bh); + inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); + /* Skip groups with suspicious inode tables */ +- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || ++ if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || + IS_ERR(inode_bitmap_bh)) { + inode_bitmap_bh = NULL; + goto next_group; +@@ -1047,6 +1047,10 @@ got: + int free; + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + ++ if (!grp) { ++ err = -EFSCORRUPTED; ++ goto out; ++ } + down_read(&grp->alloc_sem); /* protect vs itable lazyinit */ + ext4_lock_group(sb, group); /* while we modify the bg desc */ + free = EXT4_INODES_PER_GROUP(sb) - +@@ -1395,7 +1399,7 @@ int ext4_init_inode_table(struct super_b + } + + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); +- if (!gdp) ++ if (!gdp || !grp) + goto out; + + /* +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -746,6 +746,8 
@@ static int __mb_check_buddy(struct ext4_ + MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); + + grp = ext4_get_group_info(sb, e4b->bd_group); ++ if (!grp) ++ return NULL; + list_for_each(cur, &grp->bb_prealloc_list) { + ext4_group_t groupnr; + struct ext4_prealloc_space *pa; +@@ -1060,10 +1062,10 @@ mb_set_largest_free_order(struct super_b + } + + static noinline_for_stack +-int ext4_mb_generate_buddy(struct super_block *sb, +- void *buddy, void *bitmap, ext4_group_t group) ++void ext4_mb_generate_buddy(struct super_block *sb, ++ void *buddy, void *bitmap, ext4_group_t group, ++ struct ext4_group_info *grp) + { +- struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_grpblk_t i = 0; +@@ -1108,7 +1110,6 @@ int ext4_mb_generate_buddy(struct super_ + grp->bb_free = free; + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); +- return -EIO; + } + mb_set_largest_free_order(sb, grp); + mb_update_avg_fragment_size(sb, grp); +@@ -1118,8 +1119,6 @@ int ext4_mb_generate_buddy(struct super_ + period = get_cycles() - period; + atomic_inc(&sbi->s_mb_buddies_generated); + atomic64_add(period, &sbi->s_mb_generation_time); +- +- return 0; + } + + static void mb_regenerate_buddy(struct ext4_buddy *e4b) +@@ -1137,7 +1136,7 @@ static void mb_regenerate_buddy(struct e + (e4b->bd_sb->s_blocksize_bits + 2)); + + ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, +- e4b->bd_bitmap, e4b->bd_group); ++ e4b->bd_bitmap, e4b->bd_group, e4b->bd_info); + } + + /* The buddy information is attached the buddy cache inode +@@ -1209,6 +1208,8 @@ static int ext4_mb_init_cache(struct pag + break; + + grinfo = ext4_get_group_info(sb, group); ++ if (!grinfo) ++ continue; + /* + * If page is uptodate then we came here after online resize + * which added some new uninitialized group info structs, so +@@ -1274,6 +1275,10 @@ static int ext4_mb_init_cache(struct pag + group, page->index, i * blocksize); + trace_ext4_mb_buddy_bitmap_load(sb, group); + grinfo = ext4_get_group_info(sb, group); ++ if (!grinfo) { ++ err = -EFSCORRUPTED; ++ goto out; ++ } + grinfo->bb_fragments = 0; + memset(grinfo->bb_counters, 0, + sizeof(*grinfo->bb_counters) * +@@ -1284,7 +1289,7 @@ static int ext4_mb_init_cache(struct pag + ext4_lock_group(sb, group); + /* init the buddy */ + memset(data, 0xff, blocksize); +- err = ext4_mb_generate_buddy(sb, data, incore, group); ++ ext4_mb_generate_buddy(sb, data, incore, group, grinfo); + ext4_unlock_group(sb, group); + incore = NULL; + } else { +@@ -1399,6 +1404,9 @@ int ext4_mb_init_group(struct super_bloc + might_sleep(); + mb_debug(sb, "init group %u\n", group); + this_grp = ext4_get_group_info(sb, group); ++ if (!this_grp) ++ return -EFSCORRUPTED; ++ + /* + * This ensures that we don't reinit the buddy cache + * page which map to the group from which we are already +@@ -1473,6 +1481,8 @@ ext4_mb_load_buddy_gfp(struct super_bloc + + blocks_per_page = PAGE_SIZE / sb->s_blocksize; + grp = ext4_get_group_info(sb, group); ++ if (!grp) ++ return -EFSCORRUPTED; + + e4b->bd_blkbits = sb->s_blocksize_bits; + e4b->bd_info = grp; +@@ -2197,6 +2207,8 @@ int ext4_mb_find_by_goal(struct ext4_all + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); + struct ext4_free_extent ex; + ++ if (!grp) ++ return -EFSCORRUPTED; + if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) + return 0; + if (grp->bb_free == 0) +@@ -2427,7 +2439,7 @@ static bool ext4_mb_good_group(struct ex 
+ + BUG_ON(cr < 0 || cr >= 4); + +- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) ++ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) || !grp) + return false; + + free = grp->bb_free; +@@ -2490,6 +2502,8 @@ static int ext4_mb_good_group_nolock(str + ext4_grpblk_t free; + int ret = 0; + ++ if (!grp) ++ return -EFSCORRUPTED; + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); + if (should_lock) +@@ -2564,7 +2578,7 @@ ext4_group_t ext4_mb_prefetch(struct sup + * prefetch once, so we avoid getblk() call, which can + * be expensive. + */ +- if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) && ++ if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) && + EXT4_MB_GRP_NEED_INIT(grp) && + ext4_free_group_clusters(sb, gdp) > 0 && + !(ext4_has_group_desc_csum(sb) && +@@ -2609,7 +2623,7 @@ void ext4_mb_prefetch_fini(struct super_ + group--; + grp = ext4_get_group_info(sb, group); + +- if (EXT4_MB_GRP_NEED_INIT(grp) && ++ if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) && + ext4_free_group_clusters(sb, gdp) > 0 && + !(ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { +@@ -2883,6 +2897,8 @@ static int ext4_mb_seq_groups_show(struc + sizeof(struct ext4_group_info); + + grinfo = ext4_get_group_info(sb, group); ++ if (!grinfo) ++ return 0; + /* Load the group info in memory only if not already loaded. */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { + err = ext4_mb_load_buddy(sb, group, &e4b); +@@ -2897,7 +2913,7 @@ static int ext4_mb_seq_groups_show(struc + if (gdp != NULL) + free = ext4_free_group_clusters(sb, gdp); + +- memcpy(&sg, ext4_get_group_info(sb, group), i); ++ memcpy(&sg, grinfo, i); + + if (buddy_loaded) + ext4_mb_unload_buddy(&e4b); +@@ -3330,8 +3346,12 @@ static int ext4_mb_init_backend(struct s + + err_freebuddy: + cachep = get_groupinfo_cache(sb->s_blocksize_bits); +- while (i-- > 0) +- kmem_cache_free(cachep, ext4_get_group_info(sb, i)); ++ while (i-- > 0) { ++ struct ext4_group_info *grp = ext4_get_group_info(sb, i); ++ ++ if (grp) ++ kmem_cache_free(cachep, grp); ++ } + i = sbi->s_group_info_size; + rcu_read_lock(); + group_info = rcu_dereference(sbi->s_group_info); +@@ -3634,6 +3654,8 @@ int ext4_mb_release(struct super_block * + for (i = 0; i < ngroups; i++) { + cond_resched(); + grinfo = ext4_get_group_info(sb, i); ++ if (!grinfo) ++ continue; + mb_group_bb_bitmap_free(grinfo); + ext4_lock_group(sb, i); + count = ext4_mb_cleanup_pa(grinfo); +@@ -4480,6 +4502,8 @@ static void ext4_mb_generate_from_freeli + struct ext4_free_data *entry; + + grp = ext4_get_group_info(sb, group); ++ if (!grp) ++ return; + n = rb_first(&(grp->bb_free_root)); + + while (n) { +@@ -4549,6 +4573,9 @@ int ext4_mb_generate_from_pa(struct supe + int err; + int len; + ++ if (!grp) ++ return -EIO; ++ + gdp = ext4_get_group_desc(sb, group, NULL); + if (gdp == NULL) + return -EIO; +@@ -4769,6 +4796,8 @@ ext4_mb_new_inode_pa(struct ext4_allocat + + ei = EXT4_I(ac->ac_inode); + grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); ++ if (!grp) ++ return; + + pa->pa_obj_lock = &ei->i_prealloc_lock; + pa->pa_inode = ac->ac_inode; +@@ -4825,6 +4854,8 @@ ext4_mb_new_group_pa(struct ext4_allocat + atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); + + grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); ++ if (!grp) ++ return; + lg = ac->ac_lg; + BUG_ON(lg == NULL); + +@@ -4953,6 +4984,8 @@ ext4_mb_discard_group_preallocations(str + int err; + int free = 0; + ++ if (!grp) ++ return 0; + mb_debug(sb, "discard preallocation for group %u\n", group); + 
if (list_empty(&grp->bb_prealloc_list)) + goto out_dbg; +@@ -5187,6 +5220,9 @@ static inline void ext4_mb_show_pa(struc + struct ext4_prealloc_space *pa; + ext4_grpblk_t start; + struct list_head *cur; ++ ++ if (!grp) ++ continue; + ext4_lock_group(sb, i); + list_for_each(cur, &grp->bb_prealloc_list) { + pa = list_entry(cur, struct ext4_prealloc_space, +@@ -5906,6 +5942,7 @@ void ext4_free_blocks(handle_t *handle, + struct buffer_head *bitmap_bh = NULL; + struct super_block *sb = inode->i_sb; + struct ext4_group_desc *gdp; ++ struct ext4_group_info *grp; + unsigned int overflow; + ext4_grpblk_t bit; + struct buffer_head *gd_bh; +@@ -5990,8 +6027,8 @@ do_more: + overflow = 0; + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + +- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT( +- ext4_get_group_info(sb, block_group)))) ++ grp = ext4_get_group_info(sb, block_group); ++ if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) + return; + + /* +@@ -6537,6 +6574,8 @@ int ext4_trim_fs(struct super_block *sb, + + for (group = first_group; group <= last_group; group++) { + grp = ext4_get_group_info(sb, group); ++ if (!grp) ++ continue; + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + ret = ext4_mb_init_group(sb, group, GFP_NOFS); +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c ++++ linux-stage/fs/ext4/super.c +@@ -967,6 +967,8 @@ void ext4_mark_group_bitmap_corrupted(st + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); + int ret; + ++ if (!grp || !gdp) ++ return; + if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) { + ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, + &grp->bb_state); diff --git a/ldiskfs/kernel_patches/patches/rhel8.8/ext4-avoiod-unnecessary-spreading-of-allocations.patch b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-avoiod-unnecessary-spreading-of-allocations.patch new file mode 100644 index 0000000..9bb5e2f --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-avoiod-unnecessary-spreading-of-allocations.patch @@ -0,0 +1,69 @@ +commit 1940265ede6683f6317cba0d428ce6505eaca944 +Author: Jan Kara +AuthorDate: Thu Sep 8 11:21:25 2022 +0200 +Commit: Theodore Ts'o +CommitDate: Wed Sep 21 22:11:41 2022 -0400 + +ext4: avoid unnecessary spreading of allocations among groups + +mb_set_largest_free_order() updates lists containing groups with largest +chunk of free space of given order. The way it updates it leads to +always moving the group to the tail of the list. Thus allocations +looking for free space of given order effectively end up cycling through +all groups (and due to initialization in last to first order). This +spreads allocations among block groups which reduces performance for +rotating disks or low-end flash media. Change +mb_set_largest_free_order() to only update lists if the order of the +largest free chunk in the group changed. 
+ +Fixes: 196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning") +CC: stable@kernel.org +Reported-and-tested-by: Stefan Wahren +Tested-by: Ojaswin Mujoo +Reviewed-by: Ritesh Harjani (IBM) +Signed-off-by: Jan Kara +Link: https://lore.kernel.org/all/0d81a7c2-46b7-6010-62a4-3e6cfc1628d6@i2se.com/ +Link: https://lore.kernel.org/r/20220908092136.11770-2-jack@suse.cz +Signed-off-by: Theodore Ts'o +--- +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -1078,23 +1078,25 @@ mb_set_largest_free_order(struct super_b + struct ext4_sb_info *sbi = EXT4_SB(sb); + int i; + +- if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) { ++ for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) ++ if (grp->bb_counters[i] > 0) ++ break; ++ /* No need to move between order lists? */ ++ if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || ++ i == grp->bb_largest_free_order) { ++ grp->bb_largest_free_order = i; ++ return; ++ } ++ ++ if (grp->bb_largest_free_order >= 0) { + write_lock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + list_del_init(&grp->bb_largest_free_order_node); + write_unlock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + } +- grp->bb_largest_free_order = -1; /* uninit */ +- +- for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) { +- if (grp->bb_counters[i] > 0) { +- grp->bb_largest_free_order = i; +- break; +- } +- } +- if (test_opt2(sb, MB_OPTIMIZE_SCAN) && +- grp->bb_largest_free_order >= 0 && grp->bb_free) { ++ grp->bb_largest_free_order = i; ++ if (grp->bb_largest_free_order >= 0 && grp->bb_free) { + write_lock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + list_add_tail(&grp->bb_largest_free_order_node, diff --git a/ldiskfs/kernel_patches/patches/rhel8.8/ext4-drop-s_mb_bal_lock-convert-protected-fields-to-atomic.patch b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-drop-s_mb_bal_lock-convert-protected-fields-to-atomic.patch new file mode 100644 index 0000000..d029ee1 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-drop-s_mb_bal_lock-convert-protected-fields-to-atomic.patch @@ -0,0 +1,102 @@ +commit 67d25186046145748d5fe4c5019d832215e01c1e +Author: Harshad Shirwadkar +AuthorDate: Thu Apr 1 10:21:23 2021 -0700 +Commit: Theodore Ts'o +CommitDate: Fri Apr 9 11:34:58 2021 -0400 + +ext4: drop s_mb_bal_lock and convert protected fields to atomic + +s_mb_buddies_generated gets used later in this patch series to +determine if the cr 0 and cr 1 optimziations should be performed or +not. Currently, s_mb_buddies_generated is protected under a +spin_lock. In the allocation path, it is better if we don't depend on +the lock and instead read the value atomically. In order to do that, +we drop s_bal_lock altogether and we convert the only two protected +fields by it s_mb_buddies_generated and s_mb_generation_time to atomic +type. 
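+
+A userspace sketch of the same pattern (illustrative only, using C11 atomics in place of the kernel's atomic_t/atomic64_t): the two statistics are updated independently and the shared spinlock disappears from the buddy-generation path.
+
+        #include <stdatomic.h>
+        #include <stdio.h>
+
+        /* Previously: plain counters bumped under a single spinlock. */
+        static atomic_uint buddies_generated;
+        static atomic_ullong generation_time;
+
+        static void account_buddy_generated(unsigned long long cycles)
+        {
+                /* Lock-free: each counter is incremented atomically on its own. */
+                atomic_fetch_add(&buddies_generated, 1);
+                atomic_fetch_add(&generation_time, cycles);
+        }
+
+        int main(void)
+        {
+                account_buddy_generated(1234);
+                printf("%u buddies generated in %llu cycles\n",
+                       atomic_load(&buddies_generated),
+                       (unsigned long long)atomic_load(&generation_time));
+                return 0;
+        }
+
+Readers in the allocation path can likewise sample the counters with a plain atomic read instead of taking a lock, which is the point of the conversion.
+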
+ +Signed-off-by: Harshad Shirwadkar +Reviewed-by: Andreas Dilger +Reviewed-by: Ritesh Harjani +Link: https://lore.kernel.org/r/20210401172129.189766-2-harshadshirwadkar@gmail.com +Signed-off-by: Theodore Ts'o +--- +Index: linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h ++++ linux-stage/fs/ext4/ext4.h +@@ -1522,9 +1522,8 @@ struct ext4_sb_info { + /* cX loop didn't find blocks */ + atomic64_t s_bal_cX_failed[4]; + atomic64_t s_bal_cX_skipped[3]; +- spinlock_t s_bal_lock; +- unsigned long s_mb_buddies_generated; +- unsigned long long s_mb_generation_time; ++ atomic_t s_mb_buddies_generated; /* number of buddies generated */ ++ atomic64_t s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -824,10 +824,8 @@ int ext4_mb_generate_buddy(struct super_ + clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); + + period = get_cycles() - period; +- spin_lock(&sbi->s_bal_lock); +- sbi->s_mb_buddies_generated++; +- sbi->s_mb_generation_time += period; +- spin_unlock(&sbi->s_bal_lock); ++ atomic_inc(&sbi->s_mb_buddies_generated); ++ atomic64_add(period, &sbi->s_mb_generation_time); + + return 0; + } +@@ -2810,9 +2808,10 @@ static int mb_seq_alloc_show(struct seq_ + (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[1])); + seq_printf(seq, "\tskipped_c2_loops: %llu\n", + (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[2])); +- seq_printf(seq, "\tbuddies_generated: %lu\n", +- sbi->s_mb_buddies_generated); +- seq_printf(seq, "\tbuddies_time_used: %llu\n", sbi->s_mb_generation_time); ++ seq_printf(seq, "\tbuddies_generated: %u\n", ++ atomic_read(&sbi->s_mb_buddies_generated)); ++ seq_printf(seq, "\tbuddies_time_used: %llu\n", ++ atomic64_read(&sbi->s_mb_generation_time)); + seq_printf(seq, "\tpreallocated: %u\n", + atomic_read(&sbi->s_mb_preallocated)); + seq_printf(seq, "\tdiscarded: %u\n", +@@ -2846,8 +2845,8 @@ static ssize_t mb_seq_alloc_write(struct + atomic64_set(&sbi->s_bal_cX_skipped[2], 0); + + +- sbi->s_mb_buddies_generated = 0; +- sbi->s_mb_generation_time = 0; ++ atomic_set(&sbi->s_mb_buddies_generated, 0); ++ atomic64_set(&sbi->s_mb_generation_time, 0); + + atomic_set(&sbi->s_mb_preallocated, 0), + atomic_set(&sbi->s_mb_discarded, 0); +@@ -3163,7 +3162,6 @@ int ext4_mb_init(struct super_block *sb) + } while (i <= sb->s_blocksize_bits + 1); + + spin_lock_init(&sbi->s_md_lock); +- spin_lock_init(&sbi->s_bal_lock); + sbi->s_mb_free_pending = 0; + INIT_LIST_HEAD(&sbi->s_freed_data_list); + +@@ -3360,9 +3358,9 @@ int ext4_mb_release(struct super_block * + atomic_read(&sbi->s_bal_breaks), + atomic_read(&sbi->s_mb_lost_chunks)); + ext4_msg(sb, KERN_INFO, +- "mballoc: %lu generated and it took %Lu", +- sbi->s_mb_buddies_generated, +- sbi->s_mb_generation_time); ++ "mballoc: %u generated and it took %Lu", ++ atomic_read(&sbi->s_mb_buddies_generated), ++ atomic64_read(&sbi->s_mb_generation_time)); + ext4_msg(sb, KERN_INFO, + "mballoc: %u preallocated, %u discarded", + atomic_read(&sbi->s_mb_preallocated), diff --git a/ldiskfs/kernel_patches/patches/rhel8.8/ext4-ensure-ext4_mb_prefetch_fini-called-for-all-prefetched-bg.patch b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-ensure-ext4_mb_prefetch_fini-called-for-all-prefetched-bg.patch new file mode 100644 index 
0000000..6f9604f --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-ensure-ext4_mb_prefetch_fini-called-for-all-prefetched-bg.patch @@ -0,0 +1,85 @@ +commit 4f3d1e4533b0982034f316ace85415d3bc57e3da +Author: Ojaswin Mujoo +AuthorDate: Tue May 30 18:03:47 2023 +0530 +Commit: Theodore Ts'o +CommitDate: Mon Jun 26 19:34:56 2023 -0400 + +ext4: Ensure ext4_mb_prefetch_fini() is called for all prefetched BGs + +Before this patch, the call stack in ext4_run_li_request is as follows: + + /* + * nr = no. of BGs we want to fetch (=s_mb_prefetch) + * prefetch_ios = no. of BGs not uptodate after + * ext4_read_block_bitmap_nowait() + */ + next_group = ext4_mb_prefetch(sb, group, nr, prefetch_ios); + ext4_mb_prefetch_fini(sb, next_group prefetch_ios); + +ext4_mb_prefetch_fini() will only try to initialize buddies for BGs in +range [next_group - prefetch_ios, next_group). This is incorrect since +sometimes (prefetch_ios < nr), which causes ext4_mb_prefetch_fini() to +incorrectly ignore some of the BGs that might need initialization. This +issue is more notable now with the previous patch enabling "fetching" of +BLOCK_UNINIT BGs which are marked buffer_uptodate by default. + +Fix this by passing nr to ext4_mb_prefetch_fini() instead of +prefetch_ios so that it considers the right range of groups. + +Similarly, make sure we don't pass nr=0 to ext4_mb_prefetch_fini() in +ext4_mb_regular_allocator() since we might have prefetched BLOCK_UNINIT +groups that would need buddy initialization. + +Signed-off-by: Ojaswin Mujoo +Reviewed-by: Ritesh Harjani (IBM) +Reviewed-by: Jan Kara +Link: https://lore.kernel.org/r/05e648ae04ec5b754207032823e9c1de9a54f87a.1685449706.git.ojaswin@linux.ibm.com +Signed-off-by: Theodore Ts'o +--- +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -2742,8 +2742,6 @@ repeat: + if ((prefetch_grp == group) && + (cr > 1 || + prefetch_ios < sbi->s_mb_prefetch_limit)) { +- unsigned int curr_ios = prefetch_ios; +- + nr = sbi->s_mb_prefetch; + if (ext4_has_feature_flex_bg(sb)) { + nr = (group / sbi->s_mb_prefetch) * +@@ -2752,8 +2750,6 @@ repeat: + } + prefetch_grp = ext4_mb_prefetch(sb, group, + nr, &prefetch_ios); +- if (prefetch_ios == curr_ios) +- nr = 0; + } + + /* This now checks without needing the buddy page */ +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c ++++ linux-stage/fs/ext4/super.c +@@ -3284,16 +3284,13 @@ static int ext4_run_li_request(struct ex + ext4_group_t group = elr->lr_next_group; + unsigned int prefetch_ios = 0; + int ret = 0; ++ int nr = EXT4_SB(sb)->s_mb_prefetch; + u64 start_time; + + if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) { +- elr->lr_next_group = ext4_mb_prefetch(sb, group, +- EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios); +- if (prefetch_ios) +- ext4_mb_prefetch_fini(sb, elr->lr_next_group, +- prefetch_ios); +- trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, +- prefetch_ios); ++ elr->lr_next_group = ext4_mb_prefetch(sb, group, nr, &prefetch_ios); ++ ext4_mb_prefetch_fini(sb, elr->lr_next_group, nr); ++ trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, nr); + if (group >= elr->lr_next_group) { + ret = 1; + if (elr->lr_first_not_zeroed != ngroups && diff --git a/ldiskfs/kernel_patches/patches/rhel8.8/ext4-fixup-possible-uninit-var-in-ext4_mb_choose_next_group_cr1.patch 
b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-fixup-possible-uninit-var-in-ext4_mb_choose_next_group_cr1.patch new file mode 100644 index 0000000..db7f3bf --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-fixup-possible-uninit-var-in-ext4_mb_choose_next_group_cr1.patch @@ -0,0 +1,40 @@ +commit a078dff870136090b5779ca2831870a6c5539d36 +Author: Jan Kara +AuthorDate: Thu Sep 22 11:09:29 2022 +0200 +Commit: Theodore Ts'o +CommitDate: Mon Sep 26 13:21:05 2022 -0400 + +ext4: fixup possible uninitialized variable access in ext4_mb_choose_next_group_cr1() + +Variable 'grp' may be left uninitialized if there's no group with +suitable average fragment size (or larger). Fix the problem by +initializing it earlier. + +Link: https://lore.kernel.org/r/20220922091542.pkhedytey7wzp5fi@quack3 +Fixes: 83e80a6e3543 ("ext4: use buckets for cr 1 block scan instead of rbtree") +Cc: stable@kernel.org +Reported-by: Dan Carpenter +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -911,7 +911,7 @@ static void ext4_mb_choose_next_group_cr + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) + { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); +- struct ext4_group_info *grp, *iter; ++ struct ext4_group_info *grp = NULL, *iter; + int i; + + if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { +@@ -928,7 +928,6 @@ static void ext4_mb_choose_next_group_cr + read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); + continue; + } +- grp = NULL; + list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i], + bb_avg_fragment_size_node) { + if (sbi->s_mb_stats) diff --git a/ldiskfs/kernel_patches/patches/rhel8.8/ext4-improve-cr0-cr1-group-scanning.patch b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-improve-cr0-cr1-group-scanning.patch new file mode 100644 index 0000000..8c2f5ca --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-improve-cr0-cr1-group-scanning.patch @@ -0,0 +1,867 @@ +commit 196e402adf2e4cd66f101923409f1970ec5f1af3 +Author: Harshad Shirwadkar +AuthorDate: Thu Apr 1 10:21:27 2021 -0700 +Commit: Theodore Ts'o +CommitDate: Fri Apr 9 11:34:59 2021 -0400 + +ext4: improve cr 0 / cr 1 group scanning + +Instead of traversing through groups linearly, scan groups in specific +orders at cr 0 and cr 1. At cr 0, we want to find groups that have the +largest free order >= the order of the request. So, with this patch, +we maintain lists for each possible order and insert each group into a +list based on the largest free order in its buddy bitmap. During cr 0 +allocation, we traverse these lists in the increasing order of largest +free orders. This allows us to find a group with the best available cr +0 match in constant time. If nothing can be found, we fallback to cr 1 +immediately. + +At CR1, the story is slightly different. We want to traverse in the +order of increasing average fragment size. For CR1, we maintain a rb +tree of groupinfos which is sorted by average fragment size. Instead +of traversing linearly, at CR1, we traverse in the order of increasing +average fragment size, starting at the most optimal group. This brings +down cr 1 search complexity to log(num groups). + +For cr >= 2, we just perform the linear search as before. Also, in +case of lock contention, we intermittently fallback to linear search +even in CR 0 and CR 1 cases. 
This allows us to proceed during the +allocation path even in case of high contention. + +There is an opportunity to do optimization at CR2 too. That's because +at CR2 we only consider groups where bb_free counter (number of free +blocks) is greater than the request extent size. That's left as future +work. + +All the changes introduced in this patch are protected under a new +mount option "mb_optimize_scan". + +With this patchset, following experiment was performed: + +Created a highly fragmented disk of size 65TB. The disk had no +contiguous 2M regions. Following command was run consecutively for 3 +times: + +time dd if=/dev/urandom of=file bs=2M count=10 + +Here are the results with and without cr 0/1 optimizations introduced +in this patch: + +|---------+------------------------------+---------------------------| +| | Without CR 0/1 Optimizations | With CR 0/1 Optimizations | +|---------+------------------------------+---------------------------| +| 1st run | 5m1.871s | 2m47.642s | +| 2nd run | 2m28.390s | 0m0.611s | +| 3rd run | 2m26.530s | 0m1.255s | +|---------+------------------------------+---------------------------| + +Signed-off-by: Harshad Shirwadkar +Reported-by: kernel test robot +Reported-by: Dan Carpenter +Reviewed-by: Andreas Dilger +Link: https://lore.kernel.org/r/20210401172129.189766-6-harshadshirwadkar@gmail.com +Signed-off-by: Theodore Ts'o +--- +Index: linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h ++++ linux-stage/fs/ext4/ext4.h +@@ -151,6 +151,13 @@ enum SHIFT_DIRECTION { + #define EXT4_MB_USE_RESERVED 0x2000 + /* Do strict check for free blocks while retrying block allocation */ + #define EXT4_MB_STRICT_CHECK 0x4000 ++ ++/* Large fragment size list lookup succeeded at least once for cr = 0 */ ++#define EXT4_MB_CR0_OPTIMIZED 0x8000 ++/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ ++#define EXT4_MB_CR1_OPTIMIZED 0x00010000 ++/* Perform linear traversal for one group */ ++#define EXT4_MB_SEARCH_NEXT_LINEAR 0x00020000 + #define EXT4_MB_VERY_DENSE 0x80000 + + struct ext4_allocation_request { +@@ -1199,6 +1206,8 @@ struct ext4_inode_info { + #define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */ + #define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */ + ++#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group ++ scanning in mballoc */ + + #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ + ~EXT4_MOUNT_##opt +@@ -1488,9 +1497,14 @@ struct ext4_sb_info { + unsigned int s_mb_free_pending; + struct list_head s_freed_data_list; /* List of blocks to be freed + after commit completed */ ++ struct rb_root s_mb_avg_fragment_size_root; ++ rwlock_t s_mb_rb_lock; ++ struct list_head *s_mb_largest_free_orders; ++ rwlock_t *s_mb_largest_free_orders_locks; + + /* tunables */ + unsigned long s_stripe; ++ unsigned int s_mb_max_linear_groups; + unsigned long s_mb_small_req; + unsigned long s_mb_large_req; + unsigned int s_mb_max_to_scan; +@@ -1520,6 +1534,8 @@ struct ext4_sb_info { + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ ++ atomic_t s_bal_cr0_bad_suggestions; ++ atomic_t s_bal_cr1_bad_suggestions; + atomic64_t s_bal_cX_groups_considered[4]; + atomic64_t s_bal_cX_hits[4]; + atomic64_t s_bal_cX_failed[4]; /* cX loop didn't find blocks */ +@@ -3206,12 +3222,15 @@ struct ext4_group_info { + ext4_grpblk_t bb_freed_since_trim; /* 
blocks freed since last trim */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ ++ ext4_group_t bb_group; /* Group number */ + struct list_head bb_prealloc_list; + unsigned long bb_prealloc_nr; + #ifdef DOUBLE_CHECK + void *bb_bitmap; + #endif + struct rw_semaphore alloc_sem; ++ struct rb_node bb_avg_fragment_size_rb; ++ struct list_head bb_largest_free_order_node; + ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block + * regions, index is order. + * bb_counters[3] = 5 means +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -127,11 +127,50 @@ + * the smallest multiple of the stripe value (sbi->s_stripe) which is + * greater than the default mb_group_prealloc. + * ++ * If "mb_optimize_scan" mount option is set, we maintain in memory group info ++ * structures in two data structures: ++ * ++ * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders) ++ * ++ * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks) ++ * ++ * This is an array of lists where the index in the array represents the ++ * largest free order in the buddy bitmap of the participating group infos of ++ * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total ++ * number of buddy bitmap orders possible) number of lists. Group-infos are ++ * placed in appropriate lists. ++ * ++ * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root) ++ * ++ * Locking: sbi->s_mb_rb_lock (rwlock) ++ * ++ * This is a red black tree consisting of group infos and the tree is sorted ++ * by average fragment sizes (which is calculated as ext4_group_info->bb_free ++ * / ext4_group_info->bb_fragments). ++ * ++ * When "mb_optimize_scan" mount option is set, mballoc consults the above data ++ * structures to decide the order in which groups are to be traversed for ++ * fulfilling an allocation request. ++ * ++ * At CR = 0, we look for groups which have the largest_free_order >= the order ++ * of the request. We directly look at the largest free order list in the data ++ * structure (1) above where largest_free_order = order of the request. If that ++ * list is empty, we look at remaining list in the increasing order of ++ * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time. ++ * ++ * At CR = 1, we only consider groups where average fragment size > request ++ * size. So, we lookup a group which has average fragment size just above or ++ * equal to request size using our rb tree (data structure 2) in O(log N) time. ++ * ++ * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in ++ * linear order which requires O(N) search time for each CR 0 and CR 1 phase. ++ * + * The regular allocator (using the buddy cache) supports a few tunables. + * + * /sys/fs/ext4//mb_min_to_scan + * /sys/fs/ext4//mb_max_to_scan + * /sys/fs/ext4//mb_order2_req ++ * /sys/fs/ext4//mb_linear_limit + * + * The regular allocator uses buddy scan only if the request len is power of + * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The +@@ -149,6 +188,16 @@ + * can be used for allocation. ext4_mb_good_group explains how the groups are + * checked. + * ++ * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not ++ * get traversed linearly. That may result in subsequent allocations being not ++ * close to each other. 
And so, the underlying device may get filled up in a ++ * non-linear fashion. While that may not matter on non-rotational devices, for ++ * rotational devices that may result in higher seek times. "mb_linear_limit" ++ * tells mballoc how many groups mballoc should search linearly before ++ * performing consulting above data structures for more efficient lookups. For ++ * non rotational devices, this value defaults to 0 and for rotational devices ++ * this is set to MB_DEFAULT_LINEAR_LIMIT. ++ * + * Both the prealloc space are getting populated as above. So for the first + * request we will hit the buddy cache which will result in this prealloc + * space getting filled. The prealloc space is then later used for the +@@ -299,6 +348,8 @@ + * - bitlock on a group (group) + * - object (inode/locality) (object) + * - per-pa lock (pa) ++ * - cr0 lists lock (cr0) ++ * - cr1 tree lock (cr1) + * + * Paths: + * - new pa +@@ -328,6 +379,9 @@ + * group + * object + * ++ * - allocation path (ext4_mb_regular_allocator) ++ * group ++ * cr0/cr1 + */ + static struct kmem_cache *ext4_pspace_cachep; + static struct kmem_cache *ext4_ac_cachep; +@@ -351,6 +405,8 @@ static void ext4_mb_generate_from_freeli + ext4_group_t group); + static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); + ++static bool ext4_mb_good_group(struct ext4_allocation_context *ac, ++ ext4_group_t group, int cr); + /* + * The algorithm using this percpu seq counter goes below: + * 1. We sample the percpu discard_pa_seq counter before trying for block +@@ -747,6 +803,269 @@ static void ext4_mb_mark_free_simple(str + } + } + ++static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new, ++ int (*cmp)(struct rb_node *, struct rb_node *)) ++{ ++ struct rb_node **iter = &root->rb_node, *parent = NULL; ++ ++ while (*iter) { ++ parent = *iter; ++ if (cmp(new, *iter) > 0) ++ iter = &((*iter)->rb_left); ++ else ++ iter = &((*iter)->rb_right); ++ } ++ ++ rb_link_node(new, parent, iter); ++ rb_insert_color(new, root); ++} ++ ++static int ++ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2) ++{ ++ struct ext4_group_info *grp1 = rb_entry(rb1, ++ struct ext4_group_info, ++ bb_avg_fragment_size_rb); ++ struct ext4_group_info *grp2 = rb_entry(rb2, ++ struct ext4_group_info, ++ bb_avg_fragment_size_rb); ++ int num_frags_1, num_frags_2; ++ ++ num_frags_1 = grp1->bb_fragments ? ++ grp1->bb_free / grp1->bb_fragments : 0; ++ num_frags_2 = grp2->bb_fragments ? ++ grp2->bb_free / grp2->bb_fragments : 0; ++ ++ return (num_frags_2 - num_frags_1); ++} ++ ++/* ++ * Reinsert grpinfo into the avg_fragment_size tree with new average ++ * fragment size. ++ */ ++static void ++mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(sb); ++ ++ if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) ++ return; ++ ++ write_lock(&sbi->s_mb_rb_lock); ++ if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) { ++ rb_erase(&grp->bb_avg_fragment_size_rb, ++ &sbi->s_mb_avg_fragment_size_root); ++ RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb); ++ } ++ ++ ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root, ++ &grp->bb_avg_fragment_size_rb, ++ ext4_mb_avg_fragment_size_cmp); ++ write_unlock(&sbi->s_mb_rb_lock); ++} ++ ++/* ++ * Choose next group by traversing largest_free_order lists. Updates *new_cr if ++ * cr level needs an update. 
++ */ ++static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, ++ int *new_cr, ext4_group_t *group, ext4_group_t ngroups) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ++ struct ext4_group_info *iter, *grp; ++ int i; ++ ++ if (ac->ac_status == AC_STATUS_FOUND) ++ return; ++ ++ if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED)) ++ atomic_inc(&sbi->s_bal_cr0_bad_suggestions); ++ ++ grp = NULL; ++ for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { ++ if (list_empty(&sbi->s_mb_largest_free_orders[i])) ++ continue; ++ read_lock(&sbi->s_mb_largest_free_orders_locks[i]); ++ if (list_empty(&sbi->s_mb_largest_free_orders[i])) { ++ read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); ++ continue; ++ } ++ grp = NULL; ++ list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], ++ bb_largest_free_order_node) { ++ if (sbi->s_mb_stats) ++ atomic64_inc(&sbi->s_bal_cX_groups_considered[0]); ++ if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) { ++ grp = iter; ++ break; ++ } ++ } ++ read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); ++ if (grp) ++ break; ++ } ++ ++ if (!grp) { ++ /* Increment cr and search again */ ++ *new_cr = 1; ++ } else { ++ *group = grp->bb_group; ++ ac->ac_last_optimal_group = *group; ++ ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; ++ } ++} ++ ++/* ++ * Choose next group by traversing average fragment size tree. Updates *new_cr ++ * if cr lvel needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that ++ * the linear search should continue for one iteration since there's lock ++ * contention on the rb tree lock. ++ */ ++static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, ++ int *new_cr, ext4_group_t *group, ext4_group_t ngroups) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ++ int avg_fragment_size, best_so_far; ++ struct rb_node *node, *found; ++ struct ext4_group_info *grp; ++ ++ /* ++ * If there is contention on the lock, instead of waiting for the lock ++ * to become available, just continue searching lineraly. We'll resume ++ * our rb tree search later starting at ac->ac_last_optimal_group. ++ */ ++ if (!read_trylock(&sbi->s_mb_rb_lock)) { ++ ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR; ++ return; ++ } ++ ++ if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { ++ if (sbi->s_mb_stats) ++ atomic_inc(&sbi->s_bal_cr1_bad_suggestions); ++ /* We have found something at CR 1 in the past */ ++ grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group); ++ for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL; ++ found = rb_next(found)) { ++ grp = rb_entry(found, struct ext4_group_info, ++ bb_avg_fragment_size_rb); ++ if (sbi->s_mb_stats) ++ atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); ++ if (likely(ext4_mb_good_group(ac, grp->bb_group, 1))) ++ break; ++ } ++ goto done; ++ } ++ ++ node = sbi->s_mb_avg_fragment_size_root.rb_node; ++ best_so_far = 0; ++ found = NULL; ++ ++ while (node) { ++ grp = rb_entry(node, struct ext4_group_info, ++ bb_avg_fragment_size_rb); ++ avg_fragment_size = 0; ++ if (ext4_mb_good_group(ac, grp->bb_group, 1)) { ++ avg_fragment_size = grp->bb_fragments ? 
++ grp->bb_free / grp->bb_fragments : 0; ++ if (!best_so_far || avg_fragment_size < best_so_far) { ++ best_so_far = avg_fragment_size; ++ found = node; ++ } ++ } ++ if (avg_fragment_size > ac->ac_g_ex.fe_len) ++ node = node->rb_right; ++ else ++ node = node->rb_left; ++ } ++ ++done: ++ if (found) { ++ grp = rb_entry(found, struct ext4_group_info, ++ bb_avg_fragment_size_rb); ++ *group = grp->bb_group; ++ ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; ++ } else { ++ *new_cr = 2; ++ } ++ ++ read_unlock(&sbi->s_mb_rb_lock); ++ ac->ac_last_optimal_group = *group; ++} ++ ++static inline int should_optimize_scan(struct ext4_allocation_context *ac) ++{ ++ if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) ++ return 0; ++ if (ac->ac_criteria >= 2) ++ return 0; ++ if (ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) ++ return 0; ++ return 1; ++} ++ ++/* ++ * Return next linear group for allocation. If linear traversal should not be ++ * performed, this function just returns the same group ++ */ ++static int ++next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) ++{ ++ if (!should_optimize_scan(ac)) ++ goto inc_and_return; ++ ++ if (ac->ac_groups_linear_remaining) { ++ ac->ac_groups_linear_remaining--; ++ goto inc_and_return; ++ } ++ ++ if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) { ++ ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR; ++ goto inc_and_return; ++ } ++ ++ return group; ++inc_and_return: ++ /* ++ * Artificially restricted ngroups for non-extent ++ * files makes group > ngroups possible on first loop. ++ */ ++ return group + 1 >= ngroups ? 0 : group + 1; ++} ++ ++/* ++ * ext4_mb_choose_next_group: choose next group for allocation. ++ * ++ * @ac Allocation Context ++ * @new_cr This is an output parameter. If the there is no good group ++ * available at current CR level, this field is updated to indicate ++ * the new cr level that should be used. ++ * @group This is an input / output parameter. As an input it indicates the ++ * next group that the allocator intends to use for allocation. As ++ * output, this field indicates the next group that should be used as ++ * determined by the optimization functions. ++ * @@ngroups Total number of groups ++ */ ++static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, ++ int *new_cr, ext4_group_t *group, ext4_group_t ngroups) ++{ ++ *new_cr = ac->ac_criteria; ++ ++ if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) ++ return; ++ ++ if (*new_cr == 0) { ++ ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); ++ } else if (*new_cr == 1) { ++ ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); ++ } else { ++ /* ++ * TODO: For CR=2, we can arrange groups in an rb tree sorted by ++ * bb_free. But until that happens, we should never come here. ++ */ ++ WARN_ON(1); ++ } ++} ++ + /* + * Cache the order of the largest free extent we have available in this block + * group. 
+@@ -754,18 +1073,33 @@ static void ext4_mb_mark_free_simple(str + static void + mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) + { ++ struct ext4_sb_info *sbi = EXT4_SB(sb); + int i; +- int bits; + ++ if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) { ++ write_lock(&sbi->s_mb_largest_free_orders_locks[ ++ grp->bb_largest_free_order]); ++ list_del_init(&grp->bb_largest_free_order_node); ++ write_unlock(&sbi->s_mb_largest_free_orders_locks[ ++ grp->bb_largest_free_order]); ++ } + grp->bb_largest_free_order = -1; /* uninit */ + +- bits = MB_NUM_ORDERS(sb) - 1; +- for (i = bits; i >= 0; i--) { ++ for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) { + if (grp->bb_counters[i] > 0) { + grp->bb_largest_free_order = i; + break; + } + } ++ if (test_opt2(sb, MB_OPTIMIZE_SCAN) && ++ grp->bb_largest_free_order >= 0 && grp->bb_free) { ++ write_lock(&sbi->s_mb_largest_free_orders_locks[ ++ grp->bb_largest_free_order]); ++ list_add_tail(&grp->bb_largest_free_order_node, ++ &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]); ++ write_unlock(&sbi->s_mb_largest_free_orders_locks[ ++ grp->bb_largest_free_order]); ++ } + } + + static noinline_for_stack +@@ -826,6 +1160,7 @@ int ext4_mb_generate_buddy(struct super_ + period = get_cycles() - period; + atomic_inc(&sbi->s_mb_buddies_generated); + atomic64_add(period, &sbi->s_mb_generation_time); ++ mb_update_avg_fragment_size(sb, grp); + + return 0; + } +@@ -1556,6 +1891,7 @@ static void mb_free_blocks(struct inode + + done: + mb_set_largest_free_order(sb, e4b->bd_info); ++ mb_update_avg_fragment_size(sb, e4b->bd_info); + mb_check_buddy(e4b); + } + +@@ -1693,6 +2029,7 @@ static int mb_mark_used(struct ext4_budd + } + mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); + ++ mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info); + ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); + mb_check_buddy(e4b); + +@@ -2415,17 +2752,21 @@ repeat: + * from the goal value specified + */ + group = ac->ac_g_ex.fe_group; ++ ac->ac_last_optimal_group = group; ++ ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; + prefetch_grp = group; + +- for (i = 0; i < ngroups; group++, i++) { +- int ret = 0; ++ for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups), ++ i++) { ++ int ret = 0, new_cr; ++ + cond_resched(); +- /* +- * Artificially restricted ngroups for non-extent +- * files makes group > ngroups possible on first loop. 
+- */ +- if (group >= ngroups) +- group = 0; ++ ++ ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups); ++ if (new_cr != cr) { ++ cr = new_cr; ++ goto repeat; ++ } + + /* + * Batch reads of the block allocation bitmaps +@@ -2651,6 +2992,8 @@ int ext4_seq_mb_stats_show(struct seq_fi + atomic64_read(&sbi->s_bal_cX_groups_considered[0])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[0])); ++ seq_printf(seq, "\t\tbad_suggestions: %u\n", ++ atomic_read(&sbi->s_bal_cr0_bad_suggestions)); + + seq_puts(seq, "\tcr1_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1])); +@@ -2658,6 +3001,8 @@ int ext4_seq_mb_stats_show(struct seq_fi + atomic64_read(&sbi->s_bal_cX_groups_considered[1])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[1])); ++ seq_printf(seq, "\t\tbad_suggestions: %u\n", ++ atomic_read(&sbi->s_bal_cr1_bad_suggestions)); + + seq_puts(seq, "\tcr2_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2])); +@@ -2945,8 +3290,11 @@ int ext4_mb_add_groupinfo(struct super_b + INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + init_rwsem(&meta_group_info[i]->alloc_sem); + meta_group_info[i]->bb_free_root = RB_ROOT; ++ INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); ++ RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb); + meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ + meta_group_info[i]->bb_freed_since_trim = 0; ++ meta_group_info[i]->bb_group = group; + + mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); + return 0; +@@ -3137,6 +3485,27 @@ int ext4_mb_init(struct super_block *sb) + i++; + } while (i < MB_NUM_ORDERS(sb)); + ++ sbi->s_mb_avg_fragment_size_root = RB_ROOT; ++ sbi->s_mb_largest_free_orders = ++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), ++ GFP_KERNEL); ++ if (!sbi->s_mb_largest_free_orders) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ sbi->s_mb_largest_free_orders_locks = ++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), ++ GFP_KERNEL); ++ if (!sbi->s_mb_largest_free_orders_locks) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ for (i = 0; i < MB_NUM_ORDERS(sb); i++) { ++ INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); ++ rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); ++ } ++ rwlock_init(&sbi->s_mb_rb_lock); ++ + spin_lock_init(&sbi->s_md_lock); + sbi->s_mb_free_pending = 0; + INIT_LIST_HEAD(&sbi->s_freed_data_list); +@@ -3236,6 +3605,10 @@ int ext4_mb_init(struct super_block *sb) + spin_lock_init(&lg->lg_prealloc_lock); + } + ++ if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev))) ++ sbi->s_mb_max_linear_groups = 0; ++ else ++ sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; + /* init file for buddy data */ + ret = ext4_mb_init_backend(sb); + if (ret != 0) +@@ -3247,6 +3620,8 @@ out_free_locality_groups: + free_percpu(sbi->s_locality_groups); + sbi->s_locality_groups = NULL; + out: ++ kfree(sbi->s_mb_largest_free_orders); ++ kfree(sbi->s_mb_largest_free_orders_locks); + kfree(sbi->s_mb_prealloc_table); + kfree(sbi->s_mb_offsets); + sbi->s_mb_offsets = NULL; +@@ -3304,6 +3679,8 @@ int ext4_mb_release(struct super_block * + kvfree(group_info); + rcu_read_unlock(); + } ++ kfree(sbi->s_mb_largest_free_orders); ++ kfree(sbi->s_mb_largest_free_orders_locks); + kfree(sbi->s_mb_prealloc_table); + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); +Index: linux-stage/fs/ext4/mballoc.h +=================================================================== +--- 
linux-stage.orig/fs/ext4/mballoc.h ++++ linux-stage/fs/ext4/mballoc.h +@@ -83,6 +83,18 @@ + #define MB_DEFAULT_MAX_INODE_PREALLOC 512 + + /* ++ * Number of groups to search linearly before performing group scanning ++ * optimization. ++ */ ++#define MB_DEFAULT_LINEAR_LIMIT 4 ++ ++/* ++ * Minimum number of groups that should be present in the file system to perform ++ * group scanning optimizations. ++ */ ++#define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16 ++ ++/* + * Number of valid buddy orders + */ + #define MB_NUM_ORDERS(sb) ((sb)->s_blocksize_bits + 2) +@@ -173,11 +185,14 @@ struct ext4_allocation_context { + /* copy of the best found extent taken before preallocation efforts */ + struct ext4_free_extent ac_f_ex; + ++ ext4_group_t ac_last_optimal_group; ++ __u32 ac_groups_considered; ++ __u32 ac_flags; /* allocation hints */ + __u16 ac_groups_scanned; ++ __u16 ac_groups_linear_remaining; + __u16 ac_found; + __u16 ac_tail; + __u16 ac_buddy; +- __u32 ac_flags; /* allocation hints */ + __u8 ac_status; + __u8 ac_criteria; + __u8 ac_2order; /* if request is to allocate 2^N blocks and +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c ++++ linux-stage/fs/ext4/super.c +@@ -1623,7 +1623,7 @@ enum { + Opt_mb_c1_threshold, Opt_mb_c2_threshold, Opt_mb_c3_threshold, + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, + Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, +- Opt_prefetch_block_bitmaps, ++ Opt_prefetch_block_bitmaps, Opt_mb_optimize_scan, + }; + + static const match_table_t tokens = { +@@ -1718,6 +1718,7 @@ static const match_table_t tokens = { + {Opt_nombcache, "nombcache"}, + {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ + {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"}, ++ {Opt_mb_optimize_scan, "mb_optimize_scan=%d"}, + {Opt_removed, "check=none"}, /* mount option from ext2/3 */ + {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ + {Opt_removed, "reservation"}, /* mount option from ext2/3 */ +@@ -1750,6 +1751,8 @@ static ext4_fsblk_t get_sb_block(void ** + } + + #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) ++#define DEFAULT_MB_OPTIMIZE_SCAN (-1) ++ + static const char deprecated_msg[] = + "Mount option \"%s\" will be removed by %s\n" + "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; +@@ -1942,12 +1945,14 @@ static const struct mount_opts { + {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, + {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS, + MOPT_SET}, ++ {Opt_mb_optimize_scan, EXT4_MOUNT2_MB_OPTIMIZE_SCAN, MOPT_GTE0}, + {Opt_err, 0, 0} + }; + + static int handle_mount_opt(struct super_block *sb, char *opt, int token, + substring_t *args, unsigned long *journal_devnum, +- unsigned int *journal_ioprio, int is_remount) ++ unsigned int *journal_ioprio, int *mb_optimize_scan, ++ int is_remount) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); + const struct mount_opts *m; +@@ -2242,6 +2247,14 @@ static int handle_mount_opt(struct super + sbi->s_mount_opt |= m->mount_opt; + } else if (token == Opt_data_err_ignore) { + sbi->s_mount_opt &= ~m->mount_opt; ++ } else if (token == Opt_mb_optimize_scan) { ++ if (arg != 0 && arg != 1) { ++ ext4_msg(sb, KERN_WARNING, ++ "mb_optimize_scan should be set to 0 or 1."); ++ return -1; ++ } ++ if (mb_optimize_scan) ++ *mb_optimize_scan = arg; + } else { + if (!args->from) + arg = 1; +@@ -2264,6 +2277,7 @@ static int handle_mount_opt(struct super + 
static int parse_options(char *options, struct super_block *sb, + unsigned long *journal_devnum, + unsigned int *journal_ioprio, ++ int *mb_optimize_scan, + int is_remount) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); +@@ -2284,7 +2298,8 @@ static int parse_options(char *options, + args[0].to = args[0].from = NULL; + token = match_token(p, tokens, args); + if (handle_mount_opt(sb, p, token, args, journal_devnum, +- journal_ioprio, is_remount) < 0) ++ journal_ioprio, mb_optimize_scan, ++ is_remount) < 0) + return 0; + } + #ifdef CONFIG_QUOTA +@@ -3859,6 +3874,7 @@ static int ext4_fill_super(struct super_ + __u64 blocks_count; + int err = 0; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; ++ int mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN; + ext4_group_t first_not_zeroed; + + if ((data && !orig_data) || !sbi) +@@ -4092,7 +4108,7 @@ static int ext4_fill_super(struct super_ + if (!s_mount_opts) + goto failed_mount; + if (!parse_options(s_mount_opts, sb, &journal_devnum, +- &journal_ioprio, 0)) { ++ &journal_ioprio, &mb_optimize_scan, 0)) { + ext4_msg(sb, KERN_WARNING, + "failed to parse options in superblock: %s", + s_mount_opts); +@@ -4101,7 +4117,7 @@ static int ext4_fill_super(struct super_ + } + sbi->s_def_mount_opt = sbi->s_mount_opt; + if (!parse_options((char *) data, sb, &journal_devnum, +- &journal_ioprio, 0)) ++ &journal_ioprio, &mb_optimize_scan, 0)) + goto failed_mount; + + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { +@@ -4763,6 +4779,14 @@ no_journal: + } + + ext4_ext_init(sb); ++ ++ if (mb_optimize_scan == 1) ++ set_opt2(sb, MB_OPTIMIZE_SCAN); ++ else if (mb_optimize_scan == 0) ++ clear_opt2(sb, MB_OPTIMIZE_SCAN); ++ else if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD) ++ set_opt2(sb, MB_OPTIMIZE_SCAN); ++ + err = ext4_mb_init(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", +@@ -5593,7 +5617,7 @@ static int ext4_remount(struct super_blo + vfs_flags = SB_LAZYTIME | SB_I_VERSION; + sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags); + +- if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { ++ if (!parse_options(data, sb, NULL, &journal_ioprio, NULL, 1)) { + err = -EINVAL; + goto restore_opts; + } +Index: linux-stage/fs/ext4/sysfs.c +=================================================================== +--- linux-stage.orig/fs/ext4/sysfs.c ++++ linux-stage/fs/ext4/sysfs.c +@@ -223,6 +223,7 @@ EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_s + EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req); + EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); + EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc); ++EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); + EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); + EXT4_RW_ATTR_SBI_UI(bg_trimmed_threshold, s_bg_trimmed_threshold); + EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); +@@ -263,6 +264,7 @@ static struct attribute *ext4_attrs[] = + ATTR_LIST(mb_large_req), + ATTR_LIST(mb_group_prealloc), + ATTR_LIST(mb_max_inode_prealloc), ++ ATTR_LIST(mb_max_linear_groups), + ATTR_LIST(max_writeback_mb_bump), + ATTR_LIST(extent_max_zeroout_kb), + ATTR_LIST(bg_trimmed_threshold), diff --git a/ldiskfs/kernel_patches/patches/rhel8.8/ext4-limit-number-of-retries-after-discard-prealloc-blocks.patch b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-limit-number-of-retries-after-discard-prealloc-blocks.patch new file mode 100644 index 0000000..a5e4d92 --- /dev/null +++ 
b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-limit-number-of-retries-after-discard-prealloc-blocks.patch @@ -0,0 +1,71 @@ +commit 80fa46d6b9e7b1527bfd2197d75431fd9c382161 +Author: Theodore Ts'o +AuthorDate: Thu Sep 1 18:03:14 2022 -0400 +Commit: Theodore Ts'o +CommitDate: Thu Sep 22 10:51:19 2022 -0400 + +ext4: limit the number of retries after discarding preallocations blocks + +This patch avoids threads live-locking for hours when a large number +threads are competing over the last few free extents as they blocks +getting added and removed from preallocation pools. From our bug +reporter: + + A reliable way for triggering this has multiple writers + continuously write() to files when the filesystem is full, while + small amounts of space are freed (e.g. by truncating a large file + -1MiB at a time). In the local filesystem, this can be done by + simply not checking the return code of write (0) and/or the error + (ENOSPACE) that is set. Over NFS with an async mount, even clients + with proper error checking will behave this way since the linux NFS + client implementation will not propagate the server errors [the + write syscalls immediately return success] until the file handle is + closed. This leads to a situation where NFS clients send a + continuous stream of WRITE rpcs which result in ERRNOSPACE -- but + since the client isn't seeing this, the stream of writes continues + at maximum network speed. + + When some space does appear, multiple writers will all attempt to + claim it for their current write. For NFS, we may see dozens to + hundreds of threads that do this. + + The real-world scenario of this is database backup tooling (in + particular, github.com/mdkent/percona-xtrabackup) which may write + large files (>1TiB) to NFS for safe keeping. Some temporary files + are written, rewound, and read back -- all before closing the file + handle (the temp file is actually unlinked, to trigger automatic + deletion on close/crash.) An application like this operating on an + async NFS mount will not see an error code until TiB have been + written/read. + + The lockup was observed when running this database backup on large + filesystems (64 TiB in this case) with a high number of block + groups and no free space. Fragmentation is generally not a factor + in this filesystem (~thousands of large files, mostly contiguous + except for the parts written while the filesystem is at capacity.) 
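+
+The shape of the fix as a standalone sketch (the two helpers are hypothetical stand-ins, not ext4 functions): cap the retry counter at three, matching the bound in the hunk below, instead of looping back after every preallocation discard.
+
+        #include <stdbool.h>
+        #include <stdio.h>
+
+        /* Stand-ins: the allocator keeps failing, and discarding preallocations
+         * always claims a retry might help, as in the ENOSPC livelock above. */
+        static bool try_allocate(void) { return false; }
+        static bool discard_preallocations_should_retry(void) { return true; }
+
+        static int allocate_with_bounded_retries(void)
+        {
+                int retries = 0;
+
+                while (!try_allocate()) {
+                        /* Bound the retries so competing writers cannot spin for
+                         * hours on preallocations being created and torn down. */
+                        if (++retries >= 3 || !discard_preallocations_should_retry())
+                                return -1;  /* give up and report the failure */
+                }
+                return 0;
+        }
+
+        int main(void)
+        {
+                printf("allocation result: %d\n", allocate_with_bounded_retries());
+                return 0;
+        }
+
+Hitting the cap surfaces ENOSPC to the caller rather than leaving the thread spinning in the allocator.
+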
+ +Signed-off-by: Theodore Ts'o +Cc: stable@kernel.org +--- +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -5623,6 +5623,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t + ext4_fsblk_t block = 0; + unsigned int inquota = 0; + unsigned int reserv_clstrs = 0; ++ int retries = 0; + u64 seq; + + might_sleep(); +@@ -5723,7 +5724,8 @@ repeat: + ar->len = ac->ac_b_ex.fe_len; + } + } else { +- if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) ++ if (++retries < 3 && ++ ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) + goto repeat; + /* + * If block allocation fails then the pa allocated above diff --git a/ldiskfs/kernel_patches/patches/rhel8.8/ext4-make-mb_optimize_scan-performance-with-extents.patch b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-make-mb_optimize_scan-performance-with-extents.patch new file mode 100644 index 0000000..b4e9869 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-make-mb_optimize_scan-performance-with-extents.patch @@ -0,0 +1,118 @@ +commit 077d0c2c78df6f7260cdd015a991327efa44d8ad +Author: Ojaswin Mujoo +AuthorDate: Tue Mar 8 15:22:01 2022 +0530 +Commit: Theodore Ts'o +CommitDate: Sat Mar 12 20:54:21 2022 -0500 + +ext4: make mb_optimize_scan performance mount option work with extents + +Currently mb_optimize_scan scan feature which improves filesystem +performance heavily (when FS is fragmented), seems to be not working +with files with extents (ext4 by default has files with extents). + +This patch fixes that and makes mb_optimize_scan feature work +for files with extents. + +Below are some performance numbers obtained when allocating a 10M and 100M +file with and w/o this patch on a filesytem with no 1M contiguous block. + + +=============== +Workload: dd if=/dev/urandom of=test conv=fsync bs=1M count=10/100 + +Time taken +===================================================== +no. 
Size without-patch with-patch Diff(%) +1 10M 0m8.401s 0m5.623s 33.06% +2 100M 1m40.465s 1m14.737s 25.6% + + +============= +w/o patch: + mballoc: + reqs: 17056 + success: 11407 + groups_scanned: 13643 + cr0_stats: + hits: 37 + groups_considered: 9472 + useless_loops: 36 + bad_suggestions: 0 + cr1_stats: + hits: 11418 + groups_considered: 908560 + useless_loops: 1894 + bad_suggestions: 0 + cr2_stats: + hits: 1873 + groups_considered: 6913 + useless_loops: 21 + cr3_stats: + hits: 21 + groups_considered: 5040 + useless_loops: 21 + extents_scanned: 417364 + goal_hits: 3707 + 2^n_hits: 37 + breaks: 1873 + lost: 0 + buddies_generated: 239/240 + buddies_time_used: 651080 + preallocated: 705 + discarded: 478 + +with patch: + mballoc: + reqs: 12768 + success: 11305 + groups_scanned: 12768 + cr0_stats: + hits: 1 + groups_considered: 18 + useless_loops: 0 + bad_suggestions: 0 + cr1_stats: + hits: 5829 + groups_considered: 50626 + useless_loops: 0 + bad_suggestions: 0 + cr2_stats: + hits: 6938 + groups_considered: 580363 + useless_loops: 0 + cr3_stats: + hits: 0 + groups_considered: 0 + useless_loops: 0 + extents_scanned: 309059 + goal_hits: 0 + 2^n_hits: 1 + breaks: 1463 + lost: 0 + buddies_generated: 239/240 + buddies_time_used: 791392 + preallocated: 673 + discarded: 446 + +Fixes: 196e402 (ext4: improve cr 0 / cr 1 group scanning) +Cc: stable@kernel.org +Reported-by: Geetika Moolchandani +Reported-by: Nageswara R Sastry +Suggested-by: Ritesh Harjani +Signed-off-by: Ojaswin Mujoo +Link: https://lore.kernel.org/r/fc9a48f7f8dcfc83891a8b21f6dd8cdf056ed810.1646732698.git.ojaswin@linux.ibm.com +Signed-off-by: Theodore Ts'o +--- +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -998,7 +998,7 @@ static inline int should_optimize_scan(s + return 0; + if (ac->ac_criteria >= 2) + return 0; +- if (ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) ++ if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) + return 0; + return 1; + } diff --git a/ldiskfs/kernel_patches/patches/rhel8.8/ext4-make-mballoc-try-target-group-first.patch b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-make-mballoc-try-target-group-first.patch new file mode 100644 index 0000000..717c1f6 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-make-mballoc-try-target-group-first.patch @@ -0,0 +1,86 @@ +commit 4fca50d440cc5d4dc570ad5484cc0b70b381bc2a +Author: Jan Kara +AuthorDate: Thu Sep 8 11:21:24 2022 +0200 +Commit: Theodore Ts'o +CommitDate: Wed Sep 21 22:11:34 2022 -0400 + +ext4: make mballoc try target group first even with mb_optimize_scan + +One of the side-effects of mb_optimize_scan was that the optimized +functions to select next group to try were called even before we tried +the goal group. As a result we no longer allocate files close to +corresponding inodes as well as we don't try to expand currently +allocated extent in the same group. 
This results in reaim regression +with workfile.disk workload of upto 8% with many clients on my test +machine: + + baseline mb_optimize_scan +Hmean disk-1 2114.16 ( 0.00%) 2099.37 ( -0.70%) +Hmean disk-41 87794.43 ( 0.00%) 83787.47 * -4.56%* +Hmean disk-81 148170.73 ( 0.00%) 135527.05 * -8.53%* +Hmean disk-121 177506.11 ( 0.00%) 166284.93 * -6.32%* +Hmean disk-161 220951.51 ( 0.00%) 207563.39 * -6.06%* +Hmean disk-201 208722.74 ( 0.00%) 203235.59 ( -2.63%) +Hmean disk-241 222051.60 ( 0.00%) 217705.51 ( -1.96%) +Hmean disk-281 252244.17 ( 0.00%) 241132.72 * -4.41%* +Hmean disk-321 255844.84 ( 0.00%) 245412.84 * -4.08%* + +Also this is causing huge regression (time increased by a factor of 5 or +so) when untarring archive with lots of small files on some eMMC storage +cards. + +Fix the problem by making sure we try goal group first. + +Fixes: 196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning") +CC: stable@kernel.org +Reported-and-tested-by: Stefan Wahren +Tested-by: Ojaswin Mujoo +Reviewed-by: Ritesh Harjani (IBM) +Link: https://lore.kernel.org/all/20220727105123.ckwrhbilzrxqpt24@quack3/ +Link: https://lore.kernel.org/all/0d81a7c2-46b7-6010-62a4-3e6cfc1628d6@i2se.com/ +Signed-off-by: Jan Kara +Link: https://lore.kernel.org/r/20220908092136.11770-1-jack@suse.cz +Signed-off-by: Theodore Ts'o +--- +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -1050,8 +1050,10 @@ static void ext4_mb_choose_next_group(st + { + *new_cr = ac->ac_criteria; + +- if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) ++ if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) { ++ *group = next_linear_group(ac, *group, ngroups); + return; ++ } + + if (*new_cr == 0) { + ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); +@@ -2666,7 +2668,7 @@ static noinline_for_stack int + ext4_mb_regular_allocator(struct ext4_allocation_context *ac) + { + ext4_group_t prefetch_grp = 0, ngroups, group, i; +- int cr = -1; ++ int cr = -1, new_cr; + int err = 0, first_err = 0; + unsigned int nr = 0, prefetch_ios = 0; + struct ext4_sb_info *sbi; +@@ -2756,13 +2758,12 @@ repeat: + ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; + prefetch_grp = group; + +- for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups), +- i++) { +- int ret = 0, new_cr; ++ for (i = 0, new_cr = cr; i < ngroups; i++, ++ ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { ++ int ret = 0; + + cond_resched(); + +- ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups); + if (new_cr != cr) { + cr = new_cr; + goto repeat; diff --git a/ldiskfs/kernel_patches/patches/rhel8.8/ext4-make-prefetch_block_bitmaps-default.patch b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-make-prefetch_block_bitmaps-default.patch new file mode 100644 index 0000000..3972631 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-make-prefetch_block_bitmaps-default.patch @@ -0,0 +1,87 @@ +commit 21175ca434c5d49509b73cf473618b01b0b85437 +Author: Harshad Shirwadkar +AuthorDate: Thu Apr 1 10:21:29 2021 -0700 +Commit: Theodore Ts'o +CommitDate: Fri Apr 9 11:34:59 2021 -0400 + +ext4: make prefetch_block_bitmaps default + +Block bitmap prefetching is needed for these allocator optimization +data structures to get populated and provide better group scanning +order. So, turn it on bu default. 
prefetch_block_bitmaps mount option +is now marked as removed and a new option no_prefetch_block_bitmaps is +added to disable block bitmap prefetching. + +Signed-off-by: Harshad Shirwadkar +Link: https://lore.kernel.org/r/20210401172129.189766-8-harshadshirwadkar@gmail.com +Signed-off-by: Theodore Ts'o +--- +Index: linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h ++++ linux-stage/fs/ext4/ext4.h +@@ -1178,7 +1178,7 @@ struct ext4_inode_info { + #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ + #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ + #define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ +-#define EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS 0x4000000 ++#define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000 + #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ + #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ + #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c ++++ linux-stage/fs/ext4/super.c +@@ -1623,7 +1623,7 @@ enum { + Opt_mb_c1_threshold, Opt_mb_c2_threshold, Opt_mb_c3_threshold, + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, + Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, +- Opt_prefetch_block_bitmaps, Opt_mb_optimize_scan, ++ Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, + }; + + static const match_table_t tokens = { +@@ -1717,7 +1717,8 @@ static const match_table_t tokens = { + {Opt_test_dummy_encryption, "test_dummy_encryption"}, + {Opt_nombcache, "nombcache"}, + {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ +- {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"}, ++ {Opt_removed, "prefetch_block_bitmaps"}, ++ {Opt_no_prefetch_block_bitmaps, "no_prefetch_block_bitmaps"}, + {Opt_mb_optimize_scan, "mb_optimize_scan=%d"}, + {Opt_removed, "check=none"}, /* mount option from ext2/3 */ + {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ +@@ -1943,7 +1944,7 @@ static const struct mount_opts { + {Opt_mb_c3_threshold, 0, MOPT_STRING}, + {Opt_test_dummy_encryption, 0, MOPT_GTE0}, + {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, +- {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS, ++ {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS, + MOPT_SET}, + {Opt_mb_optimize_scan, EXT4_MOUNT2_MB_OPTIMIZE_SCAN, MOPT_GTE0}, + {Opt_err, 0, 0} +@@ -3551,11 +3552,11 @@ static struct ext4_li_request *ext4_li_r + + elr->lr_super = sb; + elr->lr_first_not_zeroed = start; +- if (test_opt(sb, PREFETCH_BLOCK_BITMAPS)) +- elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; +- else { ++ if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) { + elr->lr_mode = EXT4_LI_MODE_ITABLE; + elr->lr_next_group = start; ++ } else { ++ elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; + } + + /* +@@ -3586,7 +3587,7 @@ int ext4_register_li_request(struct supe + goto out; + } + +- if (!test_opt(sb, PREFETCH_BLOCK_BITMAPS) && ++ if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && + (first_not_zeroed == ngroups || sb_rdonly(sb) || + !test_opt(sb, INIT_INODE_TABLE))) + goto out; diff --git a/ldiskfs/kernel_patches/patches/rhel8.8/ext4-not-skip-prefetching-BLOCK_UNINIT-groups.patch b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-not-skip-prefetching-BLOCK_UNINIT-groups.patch new file mode 100644 index 
0000000..452a13b --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-not-skip-prefetching-BLOCK_UNINIT-groups.patch @@ -0,0 +1,66 @@ +commit 3c6296046c85333bc52555a670a9093d9e2657bb +Author: Ojaswin Mujoo +AuthorDate: Tue May 30 18:03:46 2023 +0530 +Commit: Theodore Ts'o +CommitDate: Mon Jun 26 19:34:56 2023 -0400 + +ext4: Don't skip prefetching BLOCK_UNINIT groups + +Currently, ext4_mb_prefetch() and ext4_mb_prefetch_fini() skip +BLOCK_UNINIT groups since fetching their bitmaps doesn't need disk IO. +As a consequence, we end not initializing the buddy structures and CR0/1 +lists for these BGs, even though it can be done without any disk IO +overhead. Hence, don't skip such BGs during prefetch and prefetch_fini. + +This improves the accuracy of CR0/1 allocation as earlier, we could have +essentially empty BLOCK_UNINIT groups being ignored by CR0/1 due to their buddy +not being initialized, leading to slower CR2 allocations. With this patch CR0/1 +will be able to discover these groups as well, thus improving performance. + +Signed-off-by: Ojaswin Mujoo +Reviewed-by: Ritesh Harjani (IBM) +Reviewed-by: Jan Kara +Link: https://lore.kernel.org/r/dc3130b8daf45ffe63d8a3c1edcf00eb8ba70e1f.1685449706.git.ojaswin@linux.ibm.com +Signed-off-by: Theodore Ts'o +--- +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -2580,9 +2580,7 @@ ext4_group_t ext4_mb_prefetch(struct sup + */ + if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) && + EXT4_MB_GRP_NEED_INIT(grp) && +- ext4_free_group_clusters(sb, gdp) > 0 && +- !(ext4_has_group_desc_csum(sb) && +- (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { ++ ext4_free_group_clusters(sb, gdp) > 0 ) { + bh = ext4_read_block_bitmap_nowait(sb, group, 1); + if (bh && !IS_ERR(bh)) { + if (!buffer_uptodate(bh) && cnt) +@@ -2613,20 +2611,18 @@ ext4_group_t ext4_mb_prefetch(struct sup + void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, + unsigned int nr) + { +- while (nr-- > 0) { +- struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, +- NULL); +- struct ext4_group_info *grp = ext4_get_group_info(sb, group); ++ struct ext4_group_desc *gdp; ++ struct ext4_group_info *grp; + ++ while (nr-- > 0) { + if (!group) + group = ext4_get_groups_count(sb); + group--; ++ gdp = ext4_get_group_desc(sb, group, NULL); + grp = ext4_get_group_info(sb, group); + + if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) && +- ext4_free_group_clusters(sb, gdp) > 0 && +- !(ext4_has_group_desc_csum(sb) && +- (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { ++ ext4_free_group_clusters(sb, gdp) > 0) { + if (ext4_mb_init_group(sb, group, GFP_NOFS)) + break; + } diff --git a/ldiskfs/kernel_patches/patches/rhel8.8/ext4-optimize-the-ext4_mb_good_group.patch b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-optimize-the-ext4_mb_good_group.patch new file mode 100644 index 0000000..bbee71d --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-optimize-the-ext4_mb_good_group.patch @@ -0,0 +1,59 @@ +commit dddcd2f9ebdeca9fbd36526e950bbcd0f7c1765f +Author: brookxu +AuthorDate: Fri Aug 7 22:01:39 2020 +0800 +Commit: Theodore Ts'o +CommitDate: Tue Aug 18 14:18:36 2020 -0400 + +ext4: optimize the implementation of ext4_mb_good_group() + +It might be better to adjust the code in two places: +1. Determine whether grp is currupt or not should be placed first. +2. 
(cr <= 2 && free < ac->ac_g_ex.fe_len) may belong to the crx
+ strategy, and it may be more appropriate to put it in the
+ subsequent switch statement block. For cr1, cr2, the conditions
+ in switch potentially realize the above judgment. For cr0, we
+ should add a (free < ac->ac_g_ex.fe_len) judgment, and then delete
+ ((free / fragments) >= ac->ac_g_ex.fe_len), because cr0 returns
+ true by default.
+
+Signed-off-by: Chunguang Xu
+Reviewed-by: Andreas Dilger
+Reviewed-by: Ritesh Harjani
+Link: https://lore.kernel.org/r/e20b2d8f-1154-adb7-3831-a9e11ba842e9@gmail.com
+Signed-off-by: Theodore Ts'o
+---
+
+Index: linux-stage/fs/ext4/mballoc.c
+===================================================================
+--- linux-stage.orig/fs/ext4/mballoc.c
++++ linux-stage/fs/ext4/mballoc.c
+@@ -2135,13 +2135,11 @@ static bool ext4_mb_good_group(struct ex
+
+ BUG_ON(cr < 0 || cr >= 4);
+
+- free = grp->bb_free;
+- if (free == 0)
+- return false;
+- if (cr <= 2 && free < ac->ac_g_ex.fe_len)
++ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+ return false;
+
+- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
++ free = grp->bb_free;
++ if (free == 0)
+ return false;
+
+ fragments = grp->bb_fragments;
+@@ -2158,8 +2156,10 @@ static bool ext4_mb_good_group(struct ex
+ ((group % flex_size) == 0))
+ return false;
+
+- if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
+- (free / fragments) >= ac->ac_g_ex.fe_len)
++ if (free < ac->ac_g_ex.fe_len)
++ return false;
++
++ if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1)
+ return true;
+
+ if (grp->bb_largest_free_order < ac->ac_2order)
diff --git a/ldiskfs/kernel_patches/patches/rhel8.8/ext4-refactor-code-related-to-freeing-pa.patch b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-refactor-code-related-to-freeing-pa.patch
new file mode 100644
index 0000000..a078949
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-refactor-code-related-to-freeing-pa.patch
@@ -0,0 +1,104 @@
+commit 820897258ad342e78388ee9f5814fc485e79102a
+Author: Ojaswin Mujoo
+AuthorDate: Sat Mar 25 13:43:35 2023 +0530
+Commit: Theodore Ts'o
+CommitDate: Thu Apr 6 01:13:12 2023 -0400
+
+ext4: Refactor code related to freeing PAs
+
+This patch makes the following changes:
+
+* Rename ext4_mb_pa_free to ext4_mb_pa_put_free
+ to better reflect its purpose
+
+* Add new ext4_mb_pa_free() which only handles freeing
+
+* Refactor ext4_mb_pa_callback() to use ext4_mb_pa_free()
+
+There are no functional changes in this patch
+
+Signed-off-by: Ojaswin Mujoo
+Reviewed-by: Ritesh Harjani (IBM)
+Reviewed-by: Jan Kara
+Link: https://lore.kernel.org/r/b273bc9cbf5bd278f641fa5bc6c0cc9e6cb3330c.1679731817.git.ojaswin@linux.ibm.com
+Signed-off-by: Theodore Ts'o
+---
+Index: linux-stage/fs/ext4/mballoc.c
+===================================================================
+--- linux-stage.orig/fs/ext4/mballoc.c
++++ linux-stage/fs/ext4/mballoc.c
+@@ -4612,16 +4612,22 @@ static void ext4_mb_mark_pa_deleted(stru
+ }
+ }
+
+-static void ext4_mb_pa_callback(struct rcu_head *head)
++static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa)
+ {
+- struct ext4_prealloc_space *pa;
+- pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+-
++ BUG_ON(!pa);
+ BUG_ON(atomic_read(&pa->pa_count));
+ BUG_ON(pa->pa_deleted == 0);
+ kmem_cache_free(ext4_pspace_cachep, pa);
+ }
+
++static void ext4_mb_pa_callback(struct rcu_head *head)
++{
++ struct ext4_prealloc_space *pa;
++
++ pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
++ ext4_mb_pa_free(pa);
++}
++
+ /*
+ * drops a reference to
preallocated space descriptor + * if this was the last reference and the space is consumed +@@ -5150,14 +5156,20 @@ static int ext4_mb_pa_alloc(struct ext4_ + return 0; + } + +-static void ext4_mb_pa_free(struct ext4_allocation_context *ac) ++static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac) + { + struct ext4_prealloc_space *pa = ac->ac_pa; + + BUG_ON(!pa); + ac->ac_pa = NULL; + WARN_ON(!atomic_dec_and_test(&pa->pa_count)); +- kmem_cache_free(ext4_pspace_cachep, pa); ++ /* ++ * current function is only called due to an error or due to ++ * len of found blocks < len of requested blocks hence the PA has not ++ * been added to grp->bb_prealloc_list. So we don't need to lock it ++ */ ++ pa->pa_deleted = 1; ++ ext4_mb_pa_free(pa); + } + + #ifdef CONFIG_EXT4_DEBUG +@@ -5705,13 +5717,13 @@ repeat: + * So we have to free this pa here itself. + */ + if (*errp) { +- ext4_mb_pa_free(ac); ++ ext4_mb_pa_put_free(ac); + ext4_discard_allocated_blocks(ac); + goto errout; + } + if (ac->ac_status == AC_STATUS_FOUND && + ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len) +- ext4_mb_pa_free(ac); ++ ext4_mb_pa_put_free(ac); + } + if (likely(ac->ac_status == AC_STATUS_FOUND)) { + *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); +@@ -5730,7 +5742,7 @@ repeat: + * If block allocation fails then the pa allocated above + * needs to be freed here itself. + */ +- ext4_mb_pa_free(ac); ++ ext4_mb_pa_put_free(ac); + *errp = -ENOSPC; + } + diff --git a/ldiskfs/kernel_patches/patches/rhel8.8/ext4-reflect-mb_optimize_scan-value-in-options.patch b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-reflect-mb_optimize_scan-value-in-options.patch new file mode 100644 index 0000000..dc5691a --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-reflect-mb_optimize_scan-value-in-options.patch @@ -0,0 +1,35 @@ +commit 3fa5d23e68a34dae9df2be168750dc5e03e0e40d +Author: Ojaswin Mujoo +AuthorDate: Mon Jul 4 11:16:03 2022 +0530 +Commit: Theodore Ts'o +CommitDate: Tue Aug 2 23:56:17 2022 -0400 + +ext4: reflect mb_optimize_scan value in options file + +Add support to display the mb_optimize_scan value in +/proc/fs/ext4//options file. The option is only +displayed when the value is non default. 
+ +Signed-off-by: Ojaswin Mujoo +Link: https://lore.kernel.org/r/20220704054603.21462-1-ojaswin@linux.ibm.com +Signed-off-by: Theodore Ts'o +--- +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c ++++ linux-stage/fs/ext4/super.c +@@ -2488,6 +2488,14 @@ static int _ext4_show_options(struct seq + SEQ_OPTS_PUTS("dax=inode"); + } + ++ if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD && ++ !test_opt2(sb, MB_OPTIMIZE_SCAN)) { ++ SEQ_OPTS_PUTS("mb_optimize_scan=0"); ++ } else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD && ++ test_opt2(sb, MB_OPTIMIZE_SCAN)) { ++ SEQ_OPTS_PUTS("mb_optimize_scan=1"); ++ } ++ + ext4_show_quota_options(seq, sb); + return 0; + } diff --git a/ldiskfs/kernel_patches/patches/rhel8.8/ext4-use-buckets-for-cr1-block-scan.patch b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-use-buckets-for-cr1-block-scan.patch new file mode 100644 index 0000000..9af065e --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-use-buckets-for-cr1-block-scan.patch @@ -0,0 +1,440 @@ +commit 83e80a6e3543f37f74c8e48a5f305b054b65ce2a +Author: Jan Kara +AuthorDate: Thu Sep 8 11:21:28 2022 +0200 +Commit: Theodore Ts'o +CommitDate: Wed Sep 21 22:12:03 2022 -0400 + +ext4: use buckets for cr 1 block scan instead of rbtree + +Using rbtree for sorting groups by average fragment size is relatively +expensive (needs rbtree update on every block freeing or allocation) and +leads to wide spreading of allocations because selection of block group +is very sentitive both to changes in free space and amount of blocks +allocated. Furthermore selecting group with the best matching average +fragment size is not necessary anyway, even more so because the +variability of fragment sizes within a group is likely large so average +is not telling much. We just need a group with large enough average +fragment size so that we have high probability of finding large enough +free extent and we don't want average fragment size to be too big so +that we are likely to find free extent only somewhat larger than what we +need. + +So instead of maintaing rbtree of groups sorted by fragment size keep +bins (lists) or groups where average fragment size is in the interval +[2^i, 2^(i+1)). This structure requires less updates on block allocation +/ freeing, generally avoids chaotic spreading of allocations into block +groups, and still is able to quickly (even faster that the rbtree) +provide a block group which is likely to have a suitably sized free +space extent. + +This patch reduces number of block groups used when untarring archive +with medium sized files (size somewhat above 64k which is default +mballoc limit for avoiding locality group preallocation) to about half +and thus improves write speeds for eMMC flash significantly. 
+ +Fixes: 196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning") +CC: stable@kernel.org +Reported-and-tested-by: Stefan Wahren +Tested-by: Ojaswin Mujoo +Signed-off-by: Jan Kara +Reviewed-by: Ritesh Harjani (IBM) +Link: https://lore.kernel.org/all/0d81a7c2-46b7-6010-62a4-3e6cfc1628d6@i2se.com/ +Link: https://lore.kernel.org/r/20220908092136.11770-5-jack@suse.cz +Signed-off-by: Theodore Ts'o +--- +Index: linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h ++++ linux-stage/fs/ext4/ext4.h +@@ -156,8 +156,6 @@ enum SHIFT_DIRECTION { + #define EXT4_MB_CR0_OPTIMIZED 0x8000 + /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ + #define EXT4_MB_CR1_OPTIMIZED 0x00010000 +-/* Perform linear traversal for one group */ +-#define EXT4_MB_SEARCH_NEXT_LINEAR 0x00020000 + #define EXT4_MB_VERY_DENSE 0x80000 + + struct ext4_allocation_request { +@@ -1497,8 +1495,8 @@ struct ext4_sb_info { + unsigned int s_mb_free_pending; + struct list_head s_freed_data_list; /* List of blocks to be freed + after commit completed */ +- struct rb_root s_mb_avg_fragment_size_root; +- rwlock_t s_mb_rb_lock; ++ struct list_head *s_mb_avg_fragment_size; ++ rwlock_t *s_mb_avg_fragment_size_locks; + struct list_head *s_mb_largest_free_orders; + rwlock_t *s_mb_largest_free_orders_locks; + +@@ -3221,6 +3219,8 @@ struct ext4_group_info { + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_freed_since_trim; /* blocks freed since last trim */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ ++ int bb_avg_fragment_size_order; /* order of average ++ fragment in BG */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + ext4_group_t bb_group; /* Group number */ + struct list_head bb_prealloc_list; +@@ -3229,7 +3229,7 @@ struct ext4_group_info { + void *bb_bitmap; + #endif + struct rw_semaphore alloc_sem; +- struct rb_node bb_avg_fragment_size_rb; ++ struct list_head bb_avg_fragment_size_node; + struct list_head bb_largest_free_order_node; + ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block + * regions, index is order. +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -140,13 +140,15 @@ + * number of buddy bitmap orders possible) number of lists. Group-infos are + * placed in appropriate lists. + * +- * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root) ++ * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size) + * +- * Locking: sbi->s_mb_rb_lock (rwlock) ++ * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks) + * +- * This is a red black tree consisting of group infos and the tree is sorted +- * by average fragment sizes (which is calculated as ext4_group_info->bb_free +- * / ext4_group_info->bb_fragments). ++ * This is an array of lists where in the i-th list there are groups with ++ * average fragment size >= 2^i and < 2^(i+1). The average fragment size ++ * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. ++ * Note that we don't bother with a special list for completely empty groups ++ * so we only have MB_NUM_ORDERS(sb) lists. 
+ * + * When "mb_optimize_scan" mount option is set, mballoc consults the above data + * structures to decide the order in which groups are to be traversed for +@@ -160,7 +162,8 @@ + * + * At CR = 1, we only consider groups where average fragment size > request + * size. So, we lookup a group which has average fragment size just above or +- * equal to request size using our rb tree (data structure 2) in O(log N) time. ++ * equal to request size using our average fragment size group lists (data ++ * structure 2) in O(1) time. + * + * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in + * linear order which requires O(N) search time for each CR 0 and CR 1 phase. +@@ -803,65 +806,51 @@ static void ext4_mb_mark_free_simple(str + } + } + +-static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new, +- int (*cmp)(struct rb_node *, struct rb_node *)) ++static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) + { +- struct rb_node **iter = &root->rb_node, *parent = NULL; ++ int order; + +- while (*iter) { +- parent = *iter; +- if (cmp(new, *iter) > 0) +- iter = &((*iter)->rb_left); +- else +- iter = &((*iter)->rb_right); +- } +- +- rb_link_node(new, parent, iter); +- rb_insert_color(new, root); +-} +- +-static int +-ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2) +-{ +- struct ext4_group_info *grp1 = rb_entry(rb1, +- struct ext4_group_info, +- bb_avg_fragment_size_rb); +- struct ext4_group_info *grp2 = rb_entry(rb2, +- struct ext4_group_info, +- bb_avg_fragment_size_rb); +- int num_frags_1, num_frags_2; +- +- num_frags_1 = grp1->bb_fragments ? +- grp1->bb_free / grp1->bb_fragments : 0; +- num_frags_2 = grp2->bb_fragments ? +- grp2->bb_free / grp2->bb_fragments : 0; +- +- return (num_frags_2 - num_frags_1); ++ /* ++ * We don't bother with a special lists groups with only 1 block free ++ * extents and for completely empty groups. ++ */ ++ order = fls(len) - 2; ++ if (order < 0) ++ return 0; ++ if (order == MB_NUM_ORDERS(sb)) ++ order--; ++ return order; + } + +-/* +- * Reinsert grpinfo into the avg_fragment_size tree with new average +- * fragment size. 
+- */ ++/* Move group to appropriate avg_fragment_size list */ + static void + mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); ++ int new_order; + + if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) + return; + +- write_lock(&sbi->s_mb_rb_lock); +- if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) { +- rb_erase(&grp->bb_avg_fragment_size_rb, +- &sbi->s_mb_avg_fragment_size_root); +- RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb); +- } ++ new_order = mb_avg_fragment_size_order(sb, ++ grp->bb_free / grp->bb_fragments); ++ if (new_order == grp->bb_avg_fragment_size_order) ++ return; + +- ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root, +- &grp->bb_avg_fragment_size_rb, +- ext4_mb_avg_fragment_size_cmp); +- write_unlock(&sbi->s_mb_rb_lock); ++ if (grp->bb_avg_fragment_size_order != -1) { ++ write_lock(&sbi->s_mb_avg_fragment_size_locks[ ++ grp->bb_avg_fragment_size_order]); ++ list_del(&grp->bb_avg_fragment_size_node); ++ write_unlock(&sbi->s_mb_avg_fragment_size_locks[ ++ grp->bb_avg_fragment_size_order]); ++ } ++ grp->bb_avg_fragment_size_order = new_order; ++ write_lock(&sbi->s_mb_avg_fragment_size_locks[ ++ grp->bb_avg_fragment_size_order]); ++ list_add_tail(&grp->bb_avg_fragment_size_node, ++ &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]); ++ write_unlock(&sbi->s_mb_avg_fragment_size_locks[ ++ grp->bb_avg_fragment_size_order]); + } + + /* +@@ -910,86 +899,56 @@ static void ext4_mb_choose_next_group_cr + *new_cr = 1; + } else { + *group = grp->bb_group; +- ac->ac_last_optimal_group = *group; + ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; + } + } + + /* +- * Choose next group by traversing average fragment size tree. Updates *new_cr +- * if cr lvel needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that +- * the linear search should continue for one iteration since there's lock +- * contention on the rb tree lock. ++ * Choose next group by traversing average fragment size list of suitable ++ * order. Updates *new_cr if cr level needs an update. + */ + static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) + { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); +- int avg_fragment_size, best_so_far; +- struct rb_node *node, *found; +- struct ext4_group_info *grp; +- +- /* +- * If there is contention on the lock, instead of waiting for the lock +- * to become available, just continue searching lineraly. We'll resume +- * our rb tree search later starting at ac->ac_last_optimal_group. 
+- */ +- if (!read_trylock(&sbi->s_mb_rb_lock)) { +- ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR; +- return; +- } ++ struct ext4_group_info *grp, *iter; ++ int i; + + if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { + if (sbi->s_mb_stats) + atomic_inc(&sbi->s_bal_cr1_bad_suggestions); +- /* We have found something at CR 1 in the past */ +- grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group); +- for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL; +- found = rb_next(found)) { +- grp = rb_entry(found, struct ext4_group_info, +- bb_avg_fragment_size_rb); ++ } ++ ++ for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); ++ i < MB_NUM_ORDERS(ac->ac_sb); i++) { ++ if (list_empty(&sbi->s_mb_avg_fragment_size[i])) ++ continue; ++ read_lock(&sbi->s_mb_avg_fragment_size_locks[i]); ++ if (list_empty(&sbi->s_mb_avg_fragment_size[i])) { ++ read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); ++ continue; ++ } ++ grp = NULL; ++ list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i], ++ bb_avg_fragment_size_node) { + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); +- if (likely(ext4_mb_good_group(ac, grp->bb_group, 1))) ++ if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) { ++ grp = iter; + break; +- } +- goto done; +- } +- +- node = sbi->s_mb_avg_fragment_size_root.rb_node; +- best_so_far = 0; +- found = NULL; +- +- while (node) { +- grp = rb_entry(node, struct ext4_group_info, +- bb_avg_fragment_size_rb); +- avg_fragment_size = 0; +- if (ext4_mb_good_group(ac, grp->bb_group, 1)) { +- avg_fragment_size = grp->bb_fragments ? +- grp->bb_free / grp->bb_fragments : 0; +- if (!best_so_far || avg_fragment_size < best_so_far) { +- best_so_far = avg_fragment_size; +- found = node; + } + } +- if (avg_fragment_size > ac->ac_g_ex.fe_len) +- node = node->rb_right; +- else +- node = node->rb_left; ++ read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); ++ if (grp) ++ break; + } + +-done: +- if (found) { +- grp = rb_entry(found, struct ext4_group_info, +- bb_avg_fragment_size_rb); ++ if (grp) { + *group = grp->bb_group; + ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; + } else { + *new_cr = 2; + } +- +- read_unlock(&sbi->s_mb_rb_lock); +- ac->ac_last_optimal_group = *group; + } + + static inline int should_optimize_scan(struct ext4_allocation_context *ac) +@@ -1018,11 +977,6 @@ next_linear_group(struct ext4_allocation + goto inc_and_return; + } + +- if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) { +- ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR; +- goto inc_and_return; +- } +- + return group; + inc_and_return: + /* +@@ -1158,13 +1112,13 @@ int ext4_mb_generate_buddy(struct super_ + return -EIO; + } + mb_set_largest_free_order(sb, grp); ++ mb_update_avg_fragment_size(sb, grp); + + clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); + + period = get_cycles() - period; + atomic_inc(&sbi->s_mb_buddies_generated); + atomic64_add(period, &sbi->s_mb_generation_time); +- mb_update_avg_fragment_size(sb, grp); + + return 0; + } +@@ -2756,7 +2710,6 @@ repeat: + * from the goal value specified + */ + group = ac->ac_g_ex.fe_group; +- ac->ac_last_optimal_group = group; + ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; + prefetch_grp = group; + +@@ -3294,8 +3247,9 @@ int ext4_mb_add_groupinfo(struct super_b + init_rwsem(&meta_group_info[i]->alloc_sem); + meta_group_info[i]->bb_free_root = RB_ROOT; + INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); +- RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb); ++ 
INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node); + meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ ++ meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ + meta_group_info[i]->bb_freed_since_trim = 0; + meta_group_info[i]->bb_group = group; + +@@ -3488,7 +3442,24 @@ int ext4_mb_init(struct super_block *sb) + i++; + } while (i < MB_NUM_ORDERS(sb)); + +- sbi->s_mb_avg_fragment_size_root = RB_ROOT; ++ sbi->s_mb_avg_fragment_size = ++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), ++ GFP_KERNEL); ++ if (!sbi->s_mb_avg_fragment_size) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ sbi->s_mb_avg_fragment_size_locks = ++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), ++ GFP_KERNEL); ++ if (!sbi->s_mb_avg_fragment_size_locks) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ for (i = 0; i < MB_NUM_ORDERS(sb); i++) { ++ INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]); ++ rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]); ++ } + sbi->s_mb_largest_free_orders = + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), + GFP_KERNEL); +@@ -3507,7 +3478,6 @@ int ext4_mb_init(struct super_block *sb) + INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); + rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); + } +- rwlock_init(&sbi->s_mb_rb_lock); + + spin_lock_init(&sbi->s_md_lock); + sbi->s_mb_free_pending = 0; +@@ -3623,6 +3593,8 @@ out_free_locality_groups: + free_percpu(sbi->s_locality_groups); + sbi->s_locality_groups = NULL; + out: ++ kfree(sbi->s_mb_avg_fragment_size); ++ kfree(sbi->s_mb_avg_fragment_size_locks); + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); + kfree(sbi->s_mb_prealloc_table); +@@ -3682,6 +3654,8 @@ int ext4_mb_release(struct super_block * + kvfree(group_info); + rcu_read_unlock(); + } ++ kfree(sbi->s_mb_avg_fragment_size); ++ kfree(sbi->s_mb_avg_fragment_size_locks); + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); + kfree(sbi->s_mb_prealloc_table); +Index: linux-stage/fs/ext4/mballoc.h +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.h ++++ linux-stage/fs/ext4/mballoc.h +@@ -185,7 +185,6 @@ struct ext4_allocation_context { + /* copy of the best found extent taken before preallocation efforts */ + struct ext4_free_extent ac_f_ex; + +- ext4_group_t ac_last_optimal_group; + __u32 ac_groups_considered; + __u32 ac_flags; /* allocation hints */ + __u16 ac_groups_scanned; diff --git a/ldiskfs/kernel_patches/patches/rhel8.8/ext4-use-locality-group-preallocation-for-small-files.patch b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-use-locality-group-preallocation-for-small-files.patch new file mode 100644 index 0000000..d4bd0c6 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel8.8/ext4-use-locality-group-preallocation-for-small-files.patch @@ -0,0 +1,79 @@ +commit a9f2a2931d0e197ab28c6007966053fdababd53f +Author: Jan Kara +AuthorDate: Thu Sep 8 11:21:27 2022 +0200 +Commit: Theodore Ts'o +CommitDate: Wed Sep 21 22:12:00 2022 -0400 + +ext4: use locality group preallocation for small closed files + +Curently we don't use any preallocation when a file is already closed +when allocating blocks (from writeback code when converting delayed +allocation). However for small files, using locality group preallocation +is actually desirable as that is not specific to a particular file. 
+Rather it is a method to pack small files together to reduce +fragmentation and for that the fact the file is closed is actually even +stronger hint the file would benefit from packing. So change the logic +to allow locality group preallocation in this case. + +Fixes: 196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning") +CC: stable@kernel.org +Reported-and-tested-by: Stefan Wahren +Tested-by: Ojaswin Mujoo +Reviewed-by: Ritesh Harjani (IBM) +Signed-off-by: Jan Kara +Link: https://lore.kernel.org/all/0d81a7c2-46b7-6010-62a4-3e6cfc1628d6@i2se.com/ +Link: https://lore.kernel.org/r/20220908092136.11770-4-jack@suse.cz +Signed-off-by: Theodore Ts'o +--- +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -5273,6 +5273,7 @@ static void ext4_mb_group_or_file(struct + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int bsbits = ac->ac_sb->s_blocksize_bits; + loff_t size, isize; ++ bool inode_pa_eligible, group_pa_eligible; + + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return; +@@ -5280,26 +5281,27 @@ static void ext4_mb_group_or_file(struct + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + ++ group_pa_eligible = sbi->s_mb_group_prealloc > 0; ++ inode_pa_eligible = true; + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) + >> bsbits; + ++ /* No point in using inode preallocation for closed files */ + if ((size == isize) && !ext4_fs_is_busy(sbi) && +- !inode_is_open_for_write(ac->ac_inode)) { +- ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; +- return; +- } +- +- if (sbi->s_mb_group_prealloc <= 0) { +- ac->ac_flags |= EXT4_MB_STREAM_ALLOC; +- return; +- } ++ !inode_is_open_for_write(ac->ac_inode)) ++ inode_pa_eligible = false; + +- /* don't use group allocation for large files */ + size = max(size, isize); + if ((ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) || +- (size >= sbi->s_mb_large_req)) { +- ac->ac_flags |= EXT4_MB_STREAM_ALLOC; ++ (size >= sbi->s_mb_large_req)) ++ group_pa_eligible = false; ++ ++ if (!group_pa_eligible) { ++ if (inode_pa_eligible) ++ ac->ac_flags |= EXT4_MB_STREAM_ALLOC; ++ else ++ ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; + return; + } + diff --git a/ldiskfs/kernel_patches/patches/rhel9.4/ext4-allow-ext4_get_group_info-to-fail.patch b/ldiskfs/kernel_patches/patches/rhel9.4/ext4-allow-ext4_get_group_info-to-fail.patch new file mode 100644 index 0000000..fa52298 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel9.4/ext4-allow-ext4_get_group_info-to-fail.patch @@ -0,0 +1,433 @@ +commit 5354b2af34064a4579be8bc0e2f15a7b70f14b5f +Author: Theodore Ts'o +AuthorDate: Sat Apr 29 00:06:28 2023 -0400 +Commit: Theodore Ts'o +CommitDate: Sat May 13 18:02:46 2023 -0400 + +ext4: allow ext4_get_group_info() to fail + +Previously, ext4_get_group_info() would treat an invalid group number +as BUG(), since in theory it should never happen. However, if a +malicious attaker (or fuzzer) modifies the superblock via the block +device while it is the file system is mounted, it is possible for +s_first_data_block to get set to a very large number. In that case, +when calculating the block group of some block number (such as the +starting block of a preallocation region), could result in an +underflow and very large block group number. 
Then the BUG_ON check in
+ext4_get_group_info() would fire, resulting in a denial of service
+attack that can be triggered by root or someone with write access to
+the block device.
+
+From a quality of implementation perspective, it's best that even if
+the system administrator does something that they shouldn't, it
+will not trigger a BUG. So instead of BUG'ing, ext4_get_group_info()
+will call ext4_error and return NULL. We also add fallback code in
+all of the callers of ext4_get_group_info() in case it returns NULL.
+
+Also, since ext4_get_group_info() was already borderline to be an
+inline function, un-inline it. This results in a net reduction of the
+compiled text size of ext4 by roughly 2k.
+
+Cc: stable@kernel.org
+Link: https://lore.kernel.org/r/20230430154311.579720-2-tytso@mit.edu
+Reported-by: syzbot+e2efa3efc15a1c9e95c3@syzkaller.appspotmail.com
+Link: https://syzkaller.appspot.com/bug?id=69b28112e098b070f639efb356393af3ffec4220
+Signed-off-by: Theodore Ts'o
+Reviewed-by: Jan Kara
+---
+Index: linux-stage/fs/ext4/balloc.c
+===================================================================
+--- linux-stage.orig/fs/ext4/balloc.c
++++ linux-stage/fs/ext4/balloc.c
+@@ -321,6 +321,22 @@ static ext4_fsblk_t ext4_valid_block_bit
+ return (next_zero_bit < bitmap_size ? next_zero_bit : 0);
+ }
+
++struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
++ ext4_group_t group)
++{
++ struct ext4_group_info **grp_info;
++ long indexv, indexh;
++
++ if (unlikely(group >= EXT4_SB(sb)->s_groups_count)) {
++ ext4_error(sb, "invalid group %u", group);
++ return NULL;
++ }
++ indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
++ indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
++ grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv);
++ return grp_info[indexh];
++}
++
+ /*
+ * Return the block number which was discovered to be invalid, or 0 if
+ * the block bitmap is valid.
+@@ -395,7 +411,7 @@ static int ext4_validate_block_bitmap(st
+
+ if (buffer_verified(bh))
+ return 0;
+- if (EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
++ if (!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ return -EFSCORRUPTED;
+
+ ext4_lock_group(sb, block_group);
+Index: linux-stage/fs/ext4/ext4.h
+===================================================================
+--- linux-stage.orig/fs/ext4/ext4.h
++++ linux-stage/fs/ext4/ext4.h
+@@ -2903,6 +2903,8 @@ extern void ext4_check_blocks_bitmap(str
+ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
+ ext4_group_t block_group,
+ struct buffer_head ** bh);
++extern struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
++ ext4_group_t group);
+ extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
+
+ extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
+@@ -3546,19 +3548,6 @@ static inline void ext4_isize_set(struct
+ raw_inode->i_size_high = cpu_to_le32(i_size >> 32);
+ }
+
+-static inline
+-struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
+- ext4_group_t group)
+-{
+- struct ext4_group_info **grp_info;
+- long indexv, indexh;
+- BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
+- indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
+- indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
+- grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv);
+- return grp_info[indexh];
+-}
+-
+ /*
+ * Reading s_groups_count requires using smp_rmb() afterwards.
See + * the locking protocol documented in the comments of ext4_group_add() +Index: linux-stage/fs/ext4/ialloc.c +=================================================================== +--- linux-stage.orig/fs/ext4/ialloc.c ++++ linux-stage/fs/ext4/ialloc.c +@@ -92,7 +92,7 @@ static int ext4_validate_inode_bitmap(st + + if (buffer_verified(bh)) + return 0; +- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) ++ if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + return -EFSCORRUPTED; + + ext4_lock_group(sb, block_group); +@@ -295,7 +295,7 @@ void ext4_free_inode(handle_t *handle, s + } + if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { + grp = ext4_get_group_info(sb, block_group); +- if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) { ++ if (!grp || unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) { + fatal = -EFSCORRUPTED; + goto error_return; + } +@@ -1048,7 +1048,7 @@ got_group: + * Skip groups with already-known suspicious inode + * tables + */ +- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) ++ if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + goto next_group; + } + +@@ -1185,6 +1185,10 @@ got: + + if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { + grp = ext4_get_group_info(sb, group); ++ if (!grp) { ++ err = -EFSCORRUPTED; ++ goto out; ++ } + down_read(&grp->alloc_sem); /* + * protect vs itable + * lazyinit +@@ -1538,7 +1542,7 @@ int ext4_init_inode_table(struct super_b + } + + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); +- if (!gdp) ++ if (!gdp || !grp) + goto out; + + /* +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -745,6 +745,8 @@ static int __mb_check_buddy(struct ext4_ + MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); + + grp = ext4_get_group_info(sb, e4b->bd_group); ++ if (!grp) ++ return NULL; + list_for_each(cur, &grp->bb_prealloc_list) { + ext4_group_t groupnr; + struct ext4_prealloc_space *pa; +@@ -1059,10 +1061,10 @@ mb_set_largest_free_order(struct super_b + } + + static noinline_for_stack +-int ext4_mb_generate_buddy(struct super_block *sb, +- void *buddy, void *bitmap, ext4_group_t group) ++void ext4_mb_generate_buddy(struct super_block *sb, ++ void *buddy, void *bitmap, ext4_group_t group, ++ struct ext4_group_info *grp) + { +- struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_grpblk_t i = 0; +@@ -1107,7 +1109,6 @@ int ext4_mb_generate_buddy(struct super_ + grp->bb_free = free; + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); +- return -EIO; + } + mb_set_largest_free_order(sb, grp); + mb_update_avg_fragment_size(sb, grp); +@@ -1117,8 +1118,6 @@ int ext4_mb_generate_buddy(struct super_ + period = get_cycles() - period; + atomic_inc(&sbi->s_mb_buddies_generated); + atomic64_add(period, &sbi->s_mb_generation_time); +- +- return 0; + } + + /* The buddy information is attached the buddy cache inode +@@ -1188,6 +1187,8 @@ static int ext4_mb_init_cache(struct pag + break; + + grinfo = ext4_get_group_info(sb, group); ++ if (!grinfo) ++ continue; + /* + * If page is uptodate then we came here after online resize + * which added some new uninitialized group info structs, so +@@ -1253,6 +1254,10 @@ static int ext4_mb_init_cache(struct pag + group, page->index, i * blocksize); + trace_ext4_mb_buddy_bitmap_load(sb, group); + grinfo = ext4_get_group_info(sb, group); ++ if (!grinfo) { ++ err = -EFSCORRUPTED; ++ goto out; ++ 
} + grinfo->bb_fragments = 0; + memset(grinfo->bb_counters, 0, + sizeof(*grinfo->bb_counters) * +@@ -1263,7 +1268,7 @@ static int ext4_mb_init_cache(struct pag + ext4_lock_group(sb, group); + /* init the buddy */ + memset(data, 0xff, blocksize); +- err = ext4_mb_generate_buddy(sb, data, incore, group); ++ ext4_mb_generate_buddy(sb, data, incore, group, grinfo); + ext4_unlock_group(sb, group); + incore = NULL; + } else { +@@ -1378,6 +1383,9 @@ int ext4_mb_init_group(struct super_bloc + might_sleep(); + mb_debug(sb, "init group %u\n", group); + this_grp = ext4_get_group_info(sb, group); ++ if (!this_grp) ++ return -EFSCORRUPTED; ++ + /* + * This ensures that we don't reinit the buddy cache + * page which map to the group from which we are already +@@ -1452,6 +1460,8 @@ ext4_mb_load_buddy_gfp(struct super_bloc + + blocks_per_page = PAGE_SIZE / sb->s_blocksize; + grp = ext4_get_group_info(sb, group); ++ if (!grp) ++ return -EFSCORRUPTED; + + e4b->bd_blkbits = sb->s_blocksize_bits; + e4b->bd_info = grp; +@@ -2182,6 +2192,8 @@ int ext4_mb_find_by_goal(struct ext4_all + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); + struct ext4_free_extent ex; + ++ if (!grp) ++ return -EFSCORRUPTED; + if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY))) + return 0; + if (grp->bb_free == 0) +@@ -2410,7 +2422,7 @@ static bool ext4_mb_good_group(struct ex + + BUG_ON(cr < 0 || cr >= 4); + +- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) ++ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) || !grp) + return false; + + free = grp->bb_free; +@@ -2479,6 +2491,8 @@ static int ext4_mb_good_group_nolock(str + ext4_grpblk_t free; + int ret = 0; + ++ if (!grp) ++ return -EFSCORRUPTED; + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); + if (should_lock) { +@@ -2559,7 +2573,7 @@ ext4_group_t ext4_mb_prefetch(struct sup + * prefetch once, so we avoid getblk() call, which can + * be expensive. + */ +- if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) && ++ if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) && + EXT4_MB_GRP_NEED_INIT(grp) && + ext4_free_group_clusters(sb, gdp) > 0 && + !(ext4_has_group_desc_csum(sb) && +@@ -2603,7 +2617,7 @@ void ext4_mb_prefetch_fini(struct super_ + gdp = ext4_get_group_desc(sb, group, NULL); + grp = ext4_get_group_info(sb, group); + +- if (EXT4_MB_GRP_NEED_INIT(grp) && ++ if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) && + ext4_free_group_clusters(sb, gdp) > 0 && + !(ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { +@@ -2865,6 +2879,8 @@ static int ext4_mb_seq_groups_show(struc + sizeof(struct ext4_group_info); + + grinfo = ext4_get_group_info(sb, group); ++ if (!grinfo) ++ return 0; + /* Load the group info in memory only if not already loaded. 
*/ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { + err = ext4_mb_load_buddy(sb, group, &e4b); +@@ -2879,7 +2895,7 @@ static int ext4_mb_seq_groups_show(struc + if (gdp != NULL) + free = ext4_free_group_clusters(sb, gdp); + +- memcpy(&sg, ext4_get_group_info(sb, group), i); ++ memcpy(&sg, grinfo, i); + + if (buddy_loaded) + ext4_mb_unload_buddy(&e4b); +@@ -3394,8 +3410,12 @@ static int ext4_mb_init_backend(struct s + + err_freebuddy: + cachep = get_groupinfo_cache(sb->s_blocksize_bits); +- while (i-- > 0) +- kmem_cache_free(cachep, ext4_get_group_info(sb, i)); ++ while (i-- > 0) { ++ struct ext4_group_info *grp = ext4_get_group_info(sb, i); ++ ++ if (grp) ++ kmem_cache_free(cachep, grp); ++ } + i = sbi->s_group_info_size; + rcu_read_lock(); + group_info = rcu_dereference(sbi->s_group_info); +@@ -3730,6 +3750,8 @@ int ext4_mb_release(struct super_block * + for (i = 0; i < ngroups; i++) { + cond_resched(); + grinfo = ext4_get_group_info(sb, i); ++ if (!grinfo) ++ continue; + mb_group_bb_bitmap_free(grinfo); + ext4_lock_group(sb, i); + count = ext4_mb_cleanup_pa(grinfo); +@@ -4934,6 +4956,8 @@ static void ext4_mb_generate_from_freeli + struct ext4_free_data *entry; + + grp = ext4_get_group_info(sb, group); ++ if (!grp) ++ return; + n = rb_first(&(grp->bb_free_root)); + + while (n) { +@@ -5003,6 +5027,9 @@ int ext4_mb_generate_from_pa(struct supe + int err; + int len; + ++ if (!grp) ++ return -EIO; ++ + gdp = ext4_get_group_desc(sb, group, NULL); + if (gdp == NULL) + return -EIO; +@@ -5256,6 +5283,8 @@ adjust_bex: + + ei = EXT4_I(ac->ac_inode); + grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); ++ if (!grp) ++ return; + + pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock; + pa->pa_inode = ac->ac_inode; +@@ -5307,6 +5336,8 @@ ext4_mb_new_group_pa(struct ext4_allocat + atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); + + grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); ++ if (!grp) ++ return; + lg = ac->ac_lg; + BUG_ON(lg == NULL); + +@@ -5435,6 +5466,8 @@ ext4_mb_discard_group_preallocations(str + int err; + int free = 0; + ++ if (!grp) ++ return 0; + mb_debug(sb, "discard preallocation for group %u\n", group); + if (list_empty(&grp->bb_prealloc_list)) + goto out_dbg; +@@ -5683,6 +5716,9 @@ static inline void ext4_mb_show_pa(struc + struct ext4_prealloc_space *pa; + ext4_grpblk_t start; + struct list_head *cur; ++ ++ if (!grp) ++ continue; + ext4_lock_group(sb, i); + list_for_each(cur, &grp->bb_prealloc_list) { + pa = list_entry(cur, struct ext4_prealloc_space, +@@ -6481,6 +6517,7 @@ static void ext4_mb_clear_bb(handle_t *h + struct buffer_head *bitmap_bh = NULL; + struct super_block *sb = inode->i_sb; + struct ext4_group_desc *gdp; ++ struct ext4_group_info *grp; + unsigned int overflow; + ext4_grpblk_t bit; + struct buffer_head *gd_bh; +@@ -6506,8 +6543,8 @@ do_more: + overflow = 0; + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + +- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT( +- ext4_get_group_info(sb, block_group)))) ++ grp = ext4_get_group_info(sb, block_group); ++ if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) + return; + + /* +@@ -7169,6 +7206,8 @@ int ext4_trim_fs(struct super_block *sb, + + for (group = first_group; group <= last_group; group++) { + grp = ext4_get_group_info(sb, group); ++ if (!grp) ++ continue; + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + ret = ext4_mb_init_group(sb, group, GFP_NOFS); +Index: linux-stage/fs/ext4/super.c 
+=================================================================== +--- linux-stage.orig/fs/ext4/super.c ++++ linux-stage/fs/ext4/super.c +@@ -1094,6 +1094,8 @@ void ext4_mark_group_bitmap_corrupted(st + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); + int ret; + ++ if (!grp || !gdp) ++ return; + if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) { + ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, + &grp->bb_state); diff --git a/ldiskfs/kernel_patches/patches/rhel9.4/ext4-not-skip-prefetching-BLOCK_UNINIT-groups.patch b/ldiskfs/kernel_patches/patches/rhel9.4/ext4-not-skip-prefetching-BLOCK_UNINIT-groups.patch new file mode 100644 index 0000000..5981ee3 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel9.4/ext4-not-skip-prefetching-BLOCK_UNINIT-groups.patch @@ -0,0 +1,51 @@ +commit 3c6296046c85333bc52555a670a9093d9e2657bb +Author: Ojaswin Mujoo +AuthorDate: Tue May 30 18:03:46 2023 +0530 +Commit: Theodore Ts'o +CommitDate: Mon Jun 26 19:34:56 2023 -0400 + +ext4: Don't skip prefetching BLOCK_UNINIT groups + +Currently, ext4_mb_prefetch() and ext4_mb_prefetch_fini() skip +BLOCK_UNINIT groups since fetching their bitmaps doesn't need disk IO. +As a consequence, we end not initializing the buddy structures and CR0/1 +lists for these BGs, even though it can be done without any disk IO +overhead. Hence, don't skip such BGs during prefetch and prefetch_fini. + +This improves the accuracy of CR0/1 allocation as earlier, we could have +essentially empty BLOCK_UNINIT groups being ignored by CR0/1 due to their buddy +not being initialized, leading to slower CR2 allocations. With this patch CR0/1 +will be able to discover these groups as well, thus improving performance. + +Signed-off-by: Ojaswin Mujoo +Reviewed-by: Ritesh Harjani (IBM) +Reviewed-by: Jan Kara +Link: https://lore.kernel.org/r/dc3130b8daf45ffe63d8a3c1edcf00eb8ba70e1f.1685449706.git.ojaswin@linux.ibm.com +Signed-off-by: Theodore Ts'o +--- +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -2575,9 +2575,7 @@ ext4_group_t ext4_mb_prefetch(struct sup + */ + if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) && + EXT4_MB_GRP_NEED_INIT(grp) && +- ext4_free_group_clusters(sb, gdp) > 0 && +- !(ext4_has_group_desc_csum(sb) && +- (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { ++ ext4_free_group_clusters(sb, gdp) > 0 ) { + bh = ext4_read_block_bitmap_nowait(sb, group, true); + if (bh && !IS_ERR(bh)) { + if (!buffer_uptodate(bh) && cnt) +@@ -2618,9 +2616,7 @@ void ext4_mb_prefetch_fini(struct super_ + grp = ext4_get_group_info(sb, group); + + if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) && +- ext4_free_group_clusters(sb, gdp) > 0 && +- !(ext4_has_group_desc_csum(sb) && +- (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { ++ ext4_free_group_clusters(sb, gdp) > 0) { + if (ext4_mb_init_group(sb, group, GFP_NOFS)) + break; + } diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.10.series b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.10.series index 948b74b..0c4cb3a 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.10.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.10.series @@ -42,3 +42,22 @@ rhel8/ext4-encdata.patch rhel8/ext4-race-in-ext4-destroy-inode.patch rhel8/ext4-mballoc-dense.patch rhel8/ext4-limit-per-inode-preallocation-list.patch +rhel8.8/ext4-add-prefetching-for-block-allocation-bitmaps.patch 
+rhel8.8/ext4-add-prefetch-block-bitmap-mount-option.patch +rhel8.8/ext4-optimize-the-ext4_mb_good_group.patch +rhel8.8/ext4-drop-s_mb_bal_lock-convert-protected-fields-to-atomic.patch +rhel8.8/ext4-add-mballoc-stats-proc-file.patch +rhel8.8/ext4-add-MB_NUM_ORDERS-macro.patch +rhel8.8/ext4-improve-cr0-cr1-group-scanning.patch +rhel8.8/ext4-make-prefetch_block_bitmaps-default.patch +rhel8.8/ext4-make-mb_optimize_scan-performance-with-extents.patch +rhel8.8/ext4-reflect-mb_optimize_scan-value-in-options.patch +rhel8.8/ext4-make-mballoc-try-target-group-first.patch +rhel8.8/ext4-avoiod-unnecessary-spreading-of-allocations.patch +rhel8.8/ext4-use-locality-group-preallocation-for-small-files.patch +rhel8.8/ext4-use-buckets-for-cr1-block-scan.patch +rhel8.8/ext4-fixup-possible-uninit-var-in-ext4_mb_choose_next_group_cr1.patch +rhel8.8/ext4-refactor-code-related-to-freeing-pa.patch +rhel8.8/ext4-allow-ext4_get_group_info-to-fail.patch +rhel8.8/ext4-not-skip-prefetching-BLOCK_UNINIT-groups.patch +rhel8.8/ext4-ensure-ext4_mb_prefetch_fini-called-for-all-prefetched-bg.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series index 9fef225..de867e7 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series @@ -42,3 +42,23 @@ rhel8/ext4-encdata.patch rhel8/ext4-race-in-ext4-destroy-inode.patch rhel8/ext4-mballoc-dense.patch rhel8/ext4-limit-per-inode-preallocation-list.patch +rhel8.8/ext4-add-prefetching-for-block-allocation-bitmaps.patch +rhel8.8/ext4-add-prefetch-block-bitmap-mount-option.patch +rhel8.8/ext4-optimize-the-ext4_mb_good_group.patch +rhel8.8/ext4-drop-s_mb_bal_lock-convert-protected-fields-to-atomic.patch +rhel8.8/ext4-add-mballoc-stats-proc-file.patch +rhel8.8/ext4-add-MB_NUM_ORDERS-macro.patch +rhel8.8/ext4-improve-cr0-cr1-group-scanning.patch +rhel8.8/ext4-make-prefetch_block_bitmaps-default.patch +rhel8.8/ext4-make-mb_optimize_scan-performance-with-extents.patch +rhel8.8/ext4-reflect-mb_optimize_scan-value-in-options.patch +rhel8.8/ext4-make-mballoc-try-target-group-first.patch +rhel8.8/ext4-avoiod-unnecessary-spreading-of-allocations.patch +rhel8.8/ext4-use-locality-group-preallocation-for-small-files.patch +rhel8.8/ext4-use-buckets-for-cr1-block-scan.patch +rhel8.8/ext4-limit-number-of-retries-after-discard-prealloc-blocks.patch +rhel8.8/ext4-fixup-possible-uninit-var-in-ext4_mb_choose_next_group_cr1.patch +rhel8.8/ext4-refactor-code-related-to-freeing-pa.patch +rhel8.8/ext4-allow-ext4_get_group_info-to-fail.patch +rhel8.8/ext4-not-skip-prefetching-BLOCK_UNINIT-groups.patch +rhel8.8/ext4-ensure-ext4_mb_prefetch_fini-called-for-all-prefetched-bg.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-5.14-rhel9.4.series b/ldiskfs/kernel_patches/series/ldiskfs-5.14-rhel9.4.series index f8014c7..5fa1ef9 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-5.14-rhel9.4.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-5.14-rhel9.4.series @@ -38,3 +38,6 @@ rhel9.1/ext4-enc-flag.patch rhel9.2/ext4-encdata.patch rhel9/ext4-add-periodic-superblock-update.patch rhel9.4/ext4-add-IGET_NO_CHECKS-flag.patch +rhel9.4/ext4-allow-ext4_get_group_info-to-fail.patch +rhel9.4/ext4-not-skip-prefetching-BLOCK_UNINIT-groups.patch +rhel8.8/ext4-ensure-ext4_mb_prefetch_fini-called-for-all-prefetched-bg.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-5.14-rhel9.5.series b/ldiskfs/kernel_patches/series/ldiskfs-5.14-rhel9.5.series index 
42c95ac..cae7767 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-5.14-rhel9.5.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-5.14-rhel9.5.series @@ -38,3 +38,5 @@ rhel9.1/ext4-enc-flag.patch rhel9.2/ext4-encdata.patch rhel9.5/ext4-add-periodic-superblock-update.patch rhel9.4/ext4-add-IGET_NO_CHECKS-flag.patch +rhel9.4/ext4-not-skip-prefetching-BLOCK_UNINIT-groups.patch +rhel8.8/ext4-ensure-ext4_mb_prefetch_fini-called-for-all-prefetched-bg.patch -- 1.8.3.1