From cb8761fa4e33973a95e310c45f935630b85f359d Mon Sep 17 00:00:00 2001 From: Shaun Tancheff Date: Tue, 23 Jul 2024 09:09:42 +0700 Subject: [PATCH] LU-16350 ldiskfs: Server support for linux v6.10 Updated patch series for Linux v6.10: ext4-corrupted-inode-block-bitmaps-handling-patches.patch ext4-delayed-iput.patch ext4-filename-encode.patch ext4-max-dir-size.patch ext4-mballoc-extra-checks.patch ext4-misc.patch ext4-prealloc.patch The same updates apply to the Ubuntu 6.10.0 kernel Test-Parameters: trivial HPE-bug-id: LUS-11376 Signed-off-by: Shaun Tancheff Change-Id: I456ec723f04aaf57cb64965cc9d53fbea23a8c27 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/55729 Tested-by: jenkins Tested-by: Maloo Tested-by: Shuichi Ihara Reviewed-by: Shuichi Ihara Reviewed-by: Jian Yu Reviewed-by: Oleg Drokin --- config/lustre-build-ldiskfs.m4 | 11 +- ...pted-inode-block-bitmaps-handling-patches.patch | 298 +++++++++++++ .../patches/linux-6.10/ext4-delayed-iput.patch | 185 ++++++++ .../patches/linux-6.10/ext4-filename-encode.patch | 474 +++++++++++++++++++++ .../patches/linux-6.10/ext4-max-dir-size.patch | 50 +++ .../linux-6.10/ext4-mballoc-extra-checks.patch | 320 ++++++++++++++ .../patches/linux-6.10/ext4-misc.patch | 208 +++++++++ .../patches/linux-6.10/ext4-prealloc.patch | 409 ++++++++++++++++++ .../kernel_patches/series/ldiskfs-6.10-ml.series | 37 ++ lustre/ChangeLog | 5 + 10 files changed, 1995 insertions(+), 2 deletions(-) create mode 100644 ldiskfs/kernel_patches/patches/linux-6.10/ext4-corrupted-inode-block-bitmaps-handling-patches.patch create mode 100644 ldiskfs/kernel_patches/patches/linux-6.10/ext4-delayed-iput.patch create mode 100644 ldiskfs/kernel_patches/patches/linux-6.10/ext4-filename-encode.patch create mode 100644 ldiskfs/kernel_patches/patches/linux-6.10/ext4-max-dir-size.patch create mode 100644 ldiskfs/kernel_patches/patches/linux-6.10/ext4-mballoc-extra-checks.patch create mode 100644 ldiskfs/kernel_patches/patches/linux-6.10/ext4-misc.patch 
create mode 100644 ldiskfs/kernel_patches/patches/linux-6.10/ext4-prealloc.patch create mode 100644 ldiskfs/kernel_patches/series/ldiskfs-6.10-ml.series diff --git a/config/lustre-build-ldiskfs.m4 b/config/lustre-build-ldiskfs.m4 index 093b69d..41f7f34 100644 --- a/config/lustre-build-ldiskfs.m4 +++ b/config/lustre-build-ldiskfs.m4 @@ -91,6 +91,7 @@ AS_IF([test x$RHEL_KERNEL = xyes], [ ]) ], [test x$UBUNTU_KERNEL = xyes], [ BASEVER=$(echo $LINUXRELEASE | cut -d'-' -f1) + AS_VERSION_COMPARE([$BASEVER],[6.10.0],[ AS_VERSION_COMPARE([$BASEVER],[6.8.0],[ AS_VERSION_COMPARE([$BASEVER],[5.19.0],[ AS_VERSION_COMPARE([$BASEVER],[5.15.0],[ @@ -157,7 +158,9 @@ AS_IF([test x$RHEL_KERNEL = xyes], [ [LDISKFS_SERIES="5.19.0-35-ubuntu.series"], [LDISKFS_SERIES="5.19.0-35-ubuntu.series"])], [LDISKFS_SERIES="6.7-ml.series"], - [LDISKFS_SERIES="6.7-ml.series"]) + [LDISKFS_SERIES="6.7-ml.series"])], + [LDISKFS_SERIES="6.10-ml.series"], + [LDISKFS_SERIES="6.10-ml.series"]) ], [test x$OPENEULER_KERNEL = xyes], [ case $OPENEULER_VERSION_NO in 2203.0) LDISKFS_SERIES="5.10.0-oe2203.series" ;; @@ -186,7 +189,11 @@ AS_IF([test -z "$LDISKFS_SERIES"], AS_VERSION_COMPARE([$LINUXRELEASE],[6.7.0], [ LDISKFS_SERIES="6.6-ml.series"], [ LDISKFS_SERIES="6.7-ml.series"], [ - LDISKFS_SERIES="6.7-ml.series"] + AS_VERSION_COMPARE([$LINUXRELEASE],[6.10.0], [ + LDISKFS_SERIES="6.7-ml.series"], [ + LDISKFS_SERIES="6.10-ml.series"], [ + LDISKFS_SERIES="6.10-ml.series"] + )] # 6.10 )] # 6.7 )] # 6.6 )] # 6.1 diff --git a/ldiskfs/kernel_patches/patches/linux-6.10/ext4-corrupted-inode-block-bitmaps-handling-patches.patch b/ldiskfs/kernel_patches/patches/linux-6.10/ext4-corrupted-inode-block-bitmaps-handling-patches.patch new file mode 100644 index 0000000..9d1e4b0 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-6.10/ext4-corrupted-inode-block-bitmaps-handling-patches.patch @@ -0,0 +1,298 @@ +commit 2963f3d09eb3a0817f87386c0bd7be7ce086809d +Author: Wang Shilong +AuthorDate: Tue Sep 8 21:54:29 2015 
+0800 +LU-7114 ldiskfs: corrupted bitmaps handling patches + +This patch backports the following patches from upstream: + +163a203ddb36c36d4a1c942aececda0cc8d06aa7 +ext4: mark block group as corrupt on block bitmap error + +87a39389be3e3b007d341be510a7e4a0542bdf05 +ext4: mark block group as corrupt on inode bitmap error + +bdfb6ff4a255dcebeb09a901250e13a97eff75af +ext4: mark group corrupt on group descriptor checksum + +Also use ext4_warning() instead of ext4_error() so that the +filesystem doesn't become RO by default, and together +with these patches, the FS will still be usable even if such +bad things happen. + +Signed-off-by: Wang Shilong +Change-Id: Ib4075aba7df6f7f59e89a90475405080acd43dd0 +Reviewed-on: http://review.whamcloud.com/16312 +Reviewed-by: Andreas Dilger +Reviewed-by: Yang Sheng + +NOTE: Ported to linux 6.7 keeps the ext4_warning() updates. +--- + fs/ext4/balloc.c | 18 +++++++-------- + fs/ext4/ialloc.c | 6 ++--- + fs/ext4/mballoc.c | 59 ++++++++++++++++++----------------------------- + 3 files changed, 34 insertions(+), 49 deletions(-) + +diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c +index 591fb3f7..6f19cefd 100644 +--- a/fs/ext4/balloc.c ++++ b/fs/ext4/balloc.c +@@ -420,7 +420,7 @@ static int ext4_validate_block_bitmap(struct super_block *sb, + if (unlikely(!ext4_block_bitmap_csum_verify(sb, desc, bh) || + ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) { + ext4_unlock_group(sb, block_group); +- ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); ++ ext4_warning(sb, "bg %u: bad block bitmap checksum", block_group); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EFSBADCRC; +@@ -428,8 +428,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb, + blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); + if (unlikely(blk != 0)) { + ext4_unlock_group(sb, block_group); +- ext4_error(sb, "bg %u: block %llu: invalid block bitmap", +- block_group, blk); ++ ext4_warning(sb, "bg %u: block %llu: 
invalid block bitmap", ++ block_group, blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EFSCORRUPTED; +@@ -519,18 +519,16 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, + goto out; + } + err = ext4_init_block_bitmap(sb, bh, block_group, desc); +- if (err) { +- ext4_unlock_group(sb, block_group); +- unlock_buffer(bh); +- ext4_error(sb, "Failed to init block bitmap for group " +- "%u: %d", block_group, err); +- goto out; +- } + set_bitmap_uptodate(bh); + set_buffer_uptodate(bh); + set_buffer_verified(bh); + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); ++ if (err) { ++ ext4_warning(sb, "Failed to init block bitmap for group " ++ "%u: %d", block_group, err); ++ goto out; ++ } + return bh; + } + ext4_unlock_group(sb, block_group); +diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c +index 31480792..c725ade0 100644 +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -102,8 +102,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, + EXT4_INODES_PER_GROUP(sb) / 8) || + ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { + ext4_unlock_group(sb, block_group); +- ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " +- "inode_bitmap = %llu", block_group, blk); ++ ext4_warning(sb, "Corrupt inode bitmap - block_group = %u, " ++ "inode_bitmap = %llu", block_group, blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + return -EFSBADCRC; +@@ -353,7 +353,7 @@ out: + if (!fatal) + fatal = err; + } else { +- ext4_error(sb, "bit already cleared for inode %lu", ino); ++ ext4_warning(sb, "bit already cleared for inode %lu", ino); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + } +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index 11551a01..3bcfb5d1 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -1214,10 +1214,14 @@ int ext4_mb_generate_buddy(struct super_block *sb, + 
grp->bb_fragments = fragments; + + if (free != grp->bb_free) { +- ext4_grp_locked_error(sb, group, 0, 0, +- "block bitmap and bg descriptor " +- "inconsistent: %u vs %u free clusters", +- free, grp->bb_free); ++ struct ext4_group_desc *gdp; ++ gdp = ext4_get_group_desc(sb, group, NULL); ++ ext4_warning(sb, "group %lu: block bitmap and bg descriptor " ++ "inconsistent: %u vs %u free clusters " ++ "%u in gd, %lu pa's", ++ (long unsigned int)group, free, grp->bb_free, ++ ext4_free_group_clusters(sb, gdp), ++ grp->bb_prealloc_nr); + /* + * If we intend to continue, we consider group descriptor + * corrupt and update bb_free using bitmap value +@@ -1588,7 +1592,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, + int block; + int pnum; + int poff; +- struct folio *folio; ++ struct folio *folio = NULL; + int ret; + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); +@@ -1616,7 +1620,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, + */ + ret = ext4_mb_init_group(sb, group, gfp); + if (ret) +- return ret; ++ goto err; + } + + /* +@@ -1728,6 +1732,7 @@ err: + + e4b->bd_buddy = NULL; + e4b->bd_bitmap = NULL; ++ ext4_warning(sb, "Error loading buddy information for %u", group); + return ret; + } + +@@ -5129,9 +5134,11 @@ int ext4_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap, + } + + if (free != free_in_gdp) { +- ext4_error(sb, "on-disk bitmap for group %d" ++ ext4_warning(sb, "on-disk bitmap for group %d" + "corrupted: %u blocks free in bitmap, %u - in gd\n", + group, free, free_in_gdp); ++ ext4_mark_group_bitmap_corrupted(sb, group, ++ EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EIO; + } + return 0; +@@ -5547,16 +5554,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, + /* "free < pa->pa_free" means we maybe double alloc the same blocks, + * otherwise maybe leave some free blocks unavailable, no need to BUG.*/ + if ((free > pa->pa_free && !pa->pa_error) || (free 
< pa->pa_free)) { +- ext4_error(sb, "pa free mismatch: [pa %p] " +- "[phy %lu] [logic %lu] [len %u] [free %u] " +- "[error %u] [inode %d] [freed %u]", pa, +- (unsigned long)pa->pa_pstart, +- (unsigned long)pa->pa_lstart, +- pa->pa_len, (unsigned)pa->pa_free, +- (unsigned)pa->pa_error, pa->pa_inode->i_ino, +- free); + ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", +- free, pa->pa_free); ++ free, pa->pa_free); + /* + * pa is already deleted so we use the value obtained + * from the bitmap and continue. +@@ -5619,16 +5618,11 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); +- ext4_error_err(sb, -err, +- "Error %d reading block bitmap for %u", +- err, group); + goto out_dbg; + } + + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { +- ext4_warning(sb, "Error %d loading buddy information for %u", +- err, group); + put_bh(bitmap_bh); + goto out_dbg; + } +@@ -5788,17 +5782,12 @@ repeat: + + err = ext4_mb_load_buddy_gfp(sb, group, &e4b, + GFP_NOFS|__GFP_NOFAIL); +- if (err) { +- ext4_error_err(sb, -err, "Error %d loading buddy information for %u", +- err, group); ++ if (err) + return; +- } + + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); +- ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", +- err, group); + ext4_mb_unload_buddy(&e4b); + continue; + } +@@ -6103,11 +6092,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, + group = ext4_get_group_number(sb, pa->pa_pstart); + err = ext4_mb_load_buddy_gfp(sb, group, &e4b, + GFP_NOFS|__GFP_NOFAIL); +- if (err) { +- ext4_error_err(sb, -err, "Error %d loading buddy information for %u", +- err, group); ++ if (err) + continue; +- } + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); + ext4_get_group_info(sb, group)->bb_prealloc_nr--; +@@ -6471,7 +6457,7 @@ errout: + * been updated or not when fail case. 
So can + * not revert pa_free back, just mark pa_error*/ + pa->pa_error++; +- ext4_error(sb, ++ ext4_warning(sb, + "Updating bitmap error: [err %d] " + "[pa %p] [phy %lu] [logic %lu] " + "[len %u] [free %u] [error %u] " +@@ -6482,6 +6468,7 @@ errout: + (unsigned)pa->pa_free, + (unsigned)pa->pa_error, + pa->pa_inode ? pa->pa_inode->i_ino : 0); ++ ext4_mark_group_bitmap_corrupted(sb, 0, 0); + } + } + ext4_mb_release_context(ac); +@@ -6677,7 +6664,7 @@ do_more: + err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, + GFP_NOFS|__GFP_NOFAIL); + if (err) +- goto error_out; ++ goto error_quiet; + + if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && + !ext4_inode_block_valid(inode, block, count)) { +@@ -6773,6 +6760,7 @@ error_clean: + ext4_mb_unload_buddy(&e4b); + error_out: + ext4_std_error(sb, err); ++error_quiet: + } + + /** +@@ -6918,7 +6906,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) +- goto error_out; ++ goto error_quiet; + + if (!ext4_sb_block_valid(sb, NULL, block, count)) { + ext4_error(sb, "Adding blocks in system zones - " +@@ -6947,6 +6935,7 @@ error_clean: + ext4_mb_unload_buddy(&e4b); + error_out: + ext4_std_error(sb, err); ++error_quiet: + return err; + } + +@@ -7095,8 +7084,6 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, + + ret = ext4_mb_load_buddy(sb, group, &e4b); + if (ret) { +- ext4_warning(sb, "Error %d loading buddy information for %u", +- ret, group); + return ret; + } + +-- +2.34.1 + diff --git a/ldiskfs/kernel_patches/patches/linux-6.10/ext4-delayed-iput.patch b/ldiskfs/kernel_patches/patches/linux-6.10/ext4-delayed-iput.patch new file mode 100644 index 0000000..ad576c3 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-6.10/ext4-delayed-iput.patch @@ -0,0 +1,185 @@ +commit e239a14001b62d96c186ae2c9f58402f73e63dcc +Author: Andrew Perepechko +AuthorDate: Mon Jan 31 19:55:31 2022 +0300 +LU-15404 ldiskfs: truncate during setxattr leads to 
kernel panic + +When changing a large xattr value to a different large xattr value, +the old xattr inode is freed. Truncate during the final iput causes +current transaction restart. Eventually, parent inode bh is marked +dirty and kernel panic happens when jbd2 figures out that this bh +belongs to the committed transaction. + +A possible fix is to call this final iput in a separate thread. +This way, setxattr transactions will never be split into two. +Since the setxattr code adds xattr inodes with nlink=0 into the +orphan list, old xattr inodes will be properly cleaned up in +any case. + +Change-Id: Idd70befa6a83818ece06daccf9bb6256812674b9 +Signed-off-by: Andrew Perepechko +HPE-bug-id: LUS-10534 +Reviewed-on: https://review.whamcloud.com/46358 +Reviewed-by: Andreas Dilger +Reviewed-by: Alexander Zarochentsev +--- + fs/ext4/ext4.h | 7 +++++-- + fs/ext4/page-io.c | 2 +- + fs/ext4/super.c | 15 ++++++++------- + fs/ext4/xattr.c | 39 +++++++++++++++++++++++++++++++++++++-- + 4 files changed, 51 insertions(+), 12 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index a3276ccc..25fb849f 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1665,8 +1665,11 @@ struct ext4_sb_info { + struct flex_groups * __rcu *s_flex_groups; + ext4_group_t s_flex_groups_allocated; + +- /* workqueue for reserved extent conversions (buffered io) */ +- struct workqueue_struct *rsv_conversion_wq; ++ /* ++ * workqueue for reserved extent conversions (buffered io) ++ * and large ea inodes reclaim ++ */ ++ struct workqueue_struct *s_misc_wq; + + /* timer for periodic error stats printing */ + struct timer_list s_err_report; +diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c +index ad554386..aea9ce61 100644 +--- a/fs/ext4/page-io.c ++++ b/fs/ext4/page-io.c +@@ -229,7 +229,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) + WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + WARN_ON(!io_end->handle && sbi->s_journal); + spin_lock_irqsave(&ei->i_completed_io_lock, 
flags); +- wq = sbi->rsv_conversion_wq; ++ wq = sbi->s_misc_wq; + if (list_empty(&ei->i_rsv_conversion_list)) + queue_work(wq, &ei->i_rsv_conversion_work); + list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index 56dfb89a..8465e403 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -1296,10 +1296,11 @@ static void ext4_put_super(struct super_block *sb) + &sb->s_uuid); + + ext4_unregister_li_request(sb); ++ flush_workqueue(sbi->s_misc_wq); + ext4_quotas_off(sb, EXT4_MAXQUOTAS); + + flush_work(&sbi->s_sb_upd_work); +- destroy_workqueue(sbi->rsv_conversion_wq); ++ destroy_workqueue(sbi->s_misc_wq); + ext4_release_orphan_info(sb); + + if (sbi->s_journal) { +@@ -5443,9 +5444,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) + * The maximum number of concurrent works can be high and + * concurrency isn't really necessary. Limit it to 1. + */ +- EXT4_SB(sb)->rsv_conversion_wq = +- alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); +- if (!EXT4_SB(sb)->rsv_conversion_wq) { ++ EXT4_SB(sb)->s_misc_wq = ++ alloc_workqueue("ext4-misc", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); ++ if (!EXT4_SB(sb)->s_misc_wq) { + printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); + err = -ENOMEM; + goto failed_mount4; +@@ -5618,8 +5619,8 @@ failed_mount4a: + sb->s_root = NULL; + failed_mount4: + ext4_msg(sb, KERN_ERR, "mount failed"); +- if (EXT4_SB(sb)->rsv_conversion_wq) +- destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); ++ if (EXT4_SB(sb)->s_misc_wq) ++ destroy_workqueue(EXT4_SB(sb)->s_misc_wq); + failed_mount_wq: + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); + sbi->s_ea_inode_cache = NULL; +@@ -6306,7 +6307,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) + return 0; + + trace_ext4_sync_fs(sb, wait); +- flush_workqueue(sbi->rsv_conversion_wq); ++ flush_workqueue(sbi->s_misc_wq); + /* + * Writeback quota in non-journalled quota case - journalled quota has + 
* no dirty dquots +diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c +index 7dcb257b..7e6da3a6 100644 +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -1658,6 +1658,36 @@ out_err: + return ERR_PTR(err); + } + ++struct delayed_iput_work { ++ struct work_struct work; ++ struct inode *inode; ++}; ++ ++static void delayed_iput_fn(struct work_struct *work) ++{ ++ struct delayed_iput_work *diwork; ++ ++ diwork = container_of(work, struct delayed_iput_work, work); ++ iput(diwork->inode); ++ kfree(diwork); ++} ++ ++static void delayed_iput(struct inode *inode, struct delayed_iput_work *work) ++{ ++ if (!inode) { ++ kfree(work); ++ return; ++ } ++ ++ if (!work) { ++ iput(inode); ++ } else { ++ INIT_WORK(&work->work, delayed_iput_fn); ++ work->inode = inode; ++ queue_work(EXT4_SB(inode->i_sb)->s_misc_wq, &work->work); ++ } ++} ++ + /* + * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode + * feature is enabled. +@@ -1675,6 +1705,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, + size_t min_offs = s->end - s->base, name_len = strlen(i->name); + int in_inode = i->in_inode; + struct inode *old_ea_inode = NULL; ++ struct delayed_iput_work *diwork = NULL; + size_t old_size, new_size; + int ret; + +@@ -1751,7 +1782,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, + * Finish that work before doing any modifications to the xattr data. 
+ */ + if (!s->not_found && here->e_value_inum) { +- ret = ext4_xattr_inode_iget(inode, ++ diwork = kmalloc(sizeof(*diwork), GFP_NOFS); ++ if (!diwork) ++ ret = -ENOMEM; ++ else ++ ret = ext4_xattr_inode_iget(inode, + le32_to_cpu(here->e_value_inum), + le32_to_cpu(here->e_hash), + &old_ea_inode); +@@ -1886,7 +1921,7 @@ update_hash: + + ret = 0; + out: +- iput(old_ea_inode); ++ delayed_iput(old_ea_inode, diwork); + return ret; + } + +-- +2.34.1 + diff --git a/ldiskfs/kernel_patches/patches/linux-6.10/ext4-filename-encode.patch b/ldiskfs/kernel_patches/patches/linux-6.10/ext4-filename-encode.patch new file mode 100644 index 0000000..dda4847 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-6.10/ext4-filename-encode.patch @@ -0,0 +1,474 @@ +From d0a722cb8fb886380e24e8261e8efca09a3262d6 Mon Sep 17 00:00:00 2001 +From: Sebastien Buisson +Date: Tue, 20 Dec 2022 15:40:52 +0100 +Subject: [PATCH] LU-16374 ldiskfs: implement security.encdata xattr + +security.encdata is a virtual xattr containing information related +to encrypted files. It is expressed as ASCII text with a "key: value" +format, and space as field separator. For instance: + + { encoding: base64url, size: 3012, enc_ctx: YWJjZGVmZ2hpamtsbW + 5vcHFyc3R1dnd4eXphYmNkZWZnaGlqa2xtbg, enc_name: ZmlsZXdpdGh2ZX + J5bG9uZ25hbWVmaWxld2l0aHZlcnlsb25nbmFtZWZpbGV3aXRodmVyeWxvbmdu + YW1lZmlsZXdpdGg } + +'encoding' is the encoding method used for binary data, assume name +can be up to 255 chars. +'size' is the clear text file data length in bytes. +'enc_ctx' is encoded encryption context, 40 bytes for v2. +'enc_name' is encoded encrypted name, 256 bytes max. +So overall, this xattr is at most 727 chars plus the terminating '\0'. + +On get, the value of the security.encdata xattr is computed from +encrypted file's information. +On set, encrypted file's information is restored from xattr value. 
+The encrypted name is stored temporarily in a dedicated xattr +LDISKFS_XATTR_NAME_RAWENCNAME, which will be used to set the correct name +at linkat. + +Signed-off-by: Sebastien Buisson +Change-Id: Ia318c39d403b1c448e71bcd5b29862d022d05d0a +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49456 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Andreas Dilger +Reviewed-by: Li Dongyang +Reviewed-by: Oleg Drokin +--- + fs/ext4/critical_encode.h | 170 ++++++++++++++++++++++++++++++++++++++ + fs/ext4/dir.c | 33 +++++--- + fs/ext4/ialloc.c | 1 + + fs/ext4/namei.c | 56 +++++++++---- + 4 files changed, 235 insertions(+), 25 deletions(-) + create mode 100644 fs/ext4/critical_encode.h + +diff --git a/fs/ext4/critical_encode.h b/fs/ext4/critical_encode.h +new file mode 100644 +index 00000000..f75aedab +--- /dev/null ++++ b/fs/ext4/critical_encode.h +@@ -0,0 +1,170 @@ ++/* ++ * critical_encode.h ++ * ++ * Copyright (c) 2022 Whamcloud ++ */ ++ ++#ifndef _CRITICAL_ENCODE_H ++#define _CRITICAL_ENCODE_H ++ ++#include ++ ++/* Encoding/decoding routines inspired from yEnc principles. ++ * We just take care of a few critical characters: ++ * NULL, LF, CR, /, DEL and =. ++ * If such a char is found, it is replaced with '=' followed by ++ * the char value + 64. ++ * All other chars are left untouched. ++ * Efficiency of this encoding depends on the occurrences of the ++ * critical chars, but statistically on binary data it can be much higher ++ * than base64 for instance. 
++ */ ++static inline int critical_encode(const u8 *src, int len, char *dst) ++{ ++ u8 *p = (u8 *)src, *q = dst; ++ ++ while (p - src < len) { ++ /* escape NULL, LF, CR, /, DEL and = */ ++ if (unlikely(*p == 0x0 || *p == 0xA || *p == 0xD || ++ *p == '/' || *p == 0x7F || *p == '=')) { ++ *(q++) = '='; ++ *(q++) = *(p++) + 64; ++ } else { ++ *(q++) = *(p++); ++ } ++ } ++ ++ return (char *)q - dst; ++} ++ ++/* returns the number of chars encoding would produce */ ++static inline int critical_chars(const u8 *src, int len) ++{ ++ u8 *p = (u8 *)src; ++ int newlen = len; ++ ++ while (p - src < len) { ++ /* NULL, LF, CR, /, DEL and = cost an additional '=' */ ++ if (unlikely(*p == 0x0 || *p == 0xA || *p == 0xD || ++ *p == '/' || *p == 0x7F || *p == '=')) ++ newlen++; ++ p++; ++ } ++ ++ return newlen; ++} ++ ++/* decoding routine - returns the number of chars in output */ ++static inline int critical_decode(const u8 *src, int len, char *dst) ++{ ++ u8 *p = (u8 *)src, *q = dst; ++ ++ while (p - src < len) { ++ if (unlikely(*p == '=')) { ++ *(q++) = *(++p) - 64; ++ p++; ++ } else { ++ *(q++) = *(p++); ++ } ++ } ++ ++ return (char *)q - dst; ++} ++ ++#define fscrypt_get_encryption_info(inode) \ ++ (unlikely(!IS_LUSTRE_MOUNT(inode->i_sb)) ? 
0 : -EOPNOTSUPP) ++ ++static inline int ext4_has_permitted_context(struct inode *parent, ++ struct inode *child) ++{ ++ if (unlikely(!IS_LUSTRE_MOUNT(parent->i_sb))) ++ return 1; ++ return fscrypt_has_permitted_context(parent, child); ++} ++ ++struct ext4_filename; ++ ++static inline int ext4_prepare_readdir(struct inode *dir) ++{ ++ if (unlikely(!IS_LUSTRE_MOUNT(dir->i_sb))) ++ return 0; ++ return fscrypt_prepare_readdir(dir); ++} ++ ++static inline int ext4_fname_alloc_buffer(const struct inode *inode, ++ u32 max_encrypted_len, ++ struct fscrypt_str *crypto_str) ++{ ++ crypto_str->name = kmalloc(max_encrypted_len + 1, GFP_NOFS); ++ if (!crypto_str->name) ++ return -ENOMEM; ++ crypto_str->len = max_encrypted_len; ++ return 0; ++} ++ ++static inline void ext4_fname_free_buffer(struct fscrypt_str *crypto_str) ++{ ++ if (!crypto_str) ++ return; ++ kfree(crypto_str->name); ++ crypto_str->name = NULL; ++} ++ ++static inline int ext4_fname_disk_to_usr(struct inode *inode, ++ u32 hash, u32 minor_hash, ++ const struct fscrypt_str *iname, ++ struct fscrypt_str *oname) ++{ ++ int presented_len; ++ ++ presented_len = critical_encode(iname->name, iname->len, oname->name); ++ if (presented_len > NAME_MAX) { ++ /* truncate at NAME_MAX, ++ * or NAME_MAX-1 if name ends with '=' to avoid decoding issue ++ */ ++ presented_len = NAME_MAX; ++ if (oname->name[presented_len - 1] == '=') ++ presented_len--; ++ oname->len = presented_len; ++ } ++ oname->name[presented_len] = '\0'; ++ ++ return 0; ++} ++ ++static inline int ext4_setup_filename(struct inode *dir, ++ const struct qstr *iname, ++ int lookup, ++ struct ext4_filename *fname) ++{ ++ fname->usr_fname = iname; ++ ++ if (lookup && IS_ENCRYPTED(dir) && ++ unlikely(!IS_LUSTRE_MOUNT(dir->i_sb) && ++ strnchr(iname->name, iname->len, '='))) { ++ /* Only proceed to critical decode if ++ * iname contains escape char '='. 
++ */ ++ int len = iname->len; ++ char *buf; ++ ++ buf = kmalloc(len, GFP_NOFS); ++ if (!buf) ++ return -ENOMEM; ++ ++ len = critical_decode(iname->name, len, buf); ++ fname->disk_name.name = (unsigned char *)buf; ++ fname->disk_name.len = len; ++ return 0; ++ } ++ ++ fname->disk_name.name = (unsigned char *) iname->name; ++ fname->disk_name.len = iname->len; ++ ++#ifdef CONFIG_UNICODE ++ ext4_fname_setup_ci_filename(dir, iname, fname); ++#endif ++ return 0; ++} ++ ++#endif /* _CRITICAL_ENCODE_H */ +diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c +index ae110d71..849b1e0e 100644 +--- a/fs/ext4/dir.c ++++ b/fs/ext4/dir.c +@@ -29,6 +29,7 @@ + #include + #include "ext4.h" + #include "xattr.h" ++#include "critical_encode.h" + + static int ext4_dx_readdir(struct file *, struct dir_context *); + +@@ -134,7 +135,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) + struct buffer_head *bh = NULL; + struct fscrypt_str fstr = FSTR_INIT(NULL, 0); + +- err = fscrypt_prepare_readdir(inode); ++ err = ext4_prepare_readdir(inode); + if (err) + return err; + +@@ -161,7 +162,8 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) + return err; + } + +- if (IS_ENCRYPTED(inode)) { ++ /* disable decryption of filename, present only escaped name */ ++ if (0 && IS_ENCRYPTED(inode)) { + err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr); + if (err < 0) + return err; +@@ -275,24 +277,33 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) + get_dtype(sb, de->file_type))) + goto done; + } else { +- int save_len = fstr.len; + struct fscrypt_str de_name = + FSTR_INIT(de->name, + de->name_len); ++ int presented_len; + + /* Directory is encrypted */ +- err = fscrypt_fname_disk_to_usr(inode, +- EXT4_DIRENT_HASH(de), +- EXT4_DIRENT_MINOR_HASH(de), +- &de_name, &fstr); +- de_name = fstr; +- fstr.len = save_len; ++ presented_len = critical_chars(de->name, ++ de->name_len); ++ err = ext4_fname_alloc_buffer(inode, ++ presented_len, ++ &fstr); + 
if (err) + goto errout; +- if (!dir_emit(ctx, ++ ++ err = ext4_fname_disk_to_usr(inode, ++ 0, 0, &de_name, &fstr); ++ de_name = fstr; ++ if (err) { ++ ext4_fname_free_buffer(&fstr); ++ goto errout; ++ } ++ err = dir_emit(ctx, + de_name.name, de_name.len, + le32_to_cpu(de->inode), +- get_dtype(sb, de->file_type))) ++ get_dtype(sb, de->file_type)); ++ ext4_fname_free_buffer(&fstr); ++ if (!err) + goto done; + } + } +diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c +index fc1f09fe..b12a4324 100644 +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -30,6 +30,7 @@ + #include "ext4_jbd2.h" + #include "xattr.h" + #include "acl.h" ++#include "critical_encode.h" + + #include + +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index 24b07d9d..1986ab5a 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -41,6 +41,7 @@ + + #include "xattr.h" + #include "acl.h" ++#include "critical_encode.h" + + #include + /* +@@ -1441,7 +1442,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, + ext4_dir_rec_len(0, + csum ? 
NULL : dir)); + /* Check if the directory is encrypted */ +- if (IS_ENCRYPTED(dir)) { ++ if (0 && IS_ENCRYPTED(dir)) { + err = fscrypt_prepare_readdir(dir); + if (err < 0) { + brelse(bh); +@@ -1492,22 +1493,31 @@ static int htree_dirblock_to_tree(struct file *dir_file, + hinfo->hash, hinfo->minor_hash, de, + &tmp_str); + } else { +- int save_len = fname_crypto_str.len; + struct fscrypt_str de_name = FSTR_INIT(de->name, + de->name_len); ++ int presented_len; + + /* Directory is encrypted */ +- err = fscrypt_fname_disk_to_usr(dir, hinfo->hash, ++ presented_len = critical_chars(de->name, de->name_len); ++ err = ext4_fname_alloc_buffer(dir, presented_len, ++ &fname_crypto_str); ++ if (err) { ++ count = err; ++ goto errout; ++ } ++ ++ err = ext4_fname_disk_to_usr(dir, hinfo->hash, + hinfo->minor_hash, &de_name, + &fname_crypto_str); + if (err) { ++ ext4_fname_free_buffer(&fname_crypto_str); + count = err; + goto errout; + } + err = ext4_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de, + &fname_crypto_str); +- fname_crypto_str.len = save_len; ++ ext4_fname_free_buffer(&fname_crypto_str); + } + if (err != 0) { + count = err; +@@ -1837,7 +1847,7 @@ int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, + */ + static bool ext4_match(struct inode *parent, + const struct ext4_filename *fname, +- struct ext4_dir_entry_2 *de) ++ struct ext4_dir_entry_2 *de, int denamelen) + { + struct fscrypt_name f; + +@@ -1872,7 +1882,7 @@ static bool ext4_match(struct inode *parent, + } + #endif + +- return fscrypt_match_name(&f, de->name, de->name_len); ++ return fscrypt_match_name(&f, de->name, denamelen); + } + + /* +@@ -1883,16 +1893,30 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, + unsigned int offset, struct ext4_dir_entry_2 **res_dir) + { + struct ext4_dir_entry_2 * de; ++ bool probablytrunc; + char * dlimit; +- int de_len; ++ int de_len, denamelen; + + de = (struct ext4_dir_entry_2 *)search_buf; + dlimit = 
search_buf + buf_size; ++ /* fname is probably truncated if it is the decoded representation of ++ * an encrypted filename not aligned on a 32-byte boundary ++ */ ++ probablytrunc = !IS_LUSTRE_MOUNT(dir->i_sb) && IS_ENCRYPTED(dir) && ++ fname->disk_name.len & 31; + while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) { + /* this code is executed quadratically often */ + /* do minimal checking `by hand' */ ++ denamelen = de->name_len; ++ if (unlikely(probablytrunc) && ++ de->name_len > fname->disk_name.len) ++ /* Adjust name len to look for a partial match. ++ * Since it is binary encrypted names, there ++ * should not be any collision between names. ++ */ ++ denamelen = fname->disk_name.len; + if (de->name + de->name_len <= dlimit && +- ext4_match(dir, fname, de)) { ++ ext4_match(dir, fname, de, denamelen)) { + /* found a match - just to be sure, do + * a full check */ + if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf, +@@ -2093,7 +2117,7 @@ struct buffer_head *ext4_find_entry_locked(struct inode *dir, + struct ext4_filename fname; + struct buffer_head *bh; + +- err = ext4_fname_setup_filename(dir, d_name, 1, &fname); ++ err = ext4_setup_filename(dir, d_name, 1, &fname); + if (err == -ENOENT) + return NULL; + if (err) +@@ -2101,7 +2125,9 @@ struct buffer_head *ext4_find_entry_locked(struct inode *dir, + + bh = __ext4_find_entry(dir, &fname, res_dir, inlined, lck); + +- ext4_fname_free_filename(&fname); ++ if (fname.disk_name.name != d_name->name) ++ kfree(fname.disk_name.name); ++ + return bh; + } + +@@ -2115,7 +2141,7 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, + struct ext4_filename fname; + struct buffer_head *bh; + +- err = ext4_fname_prepare_lookup(dir, dentry, &fname); ++ err = ext4_setup_filename(dir, &dentry->d_name, 1, &fname); + if (err == -ENOENT) + return NULL; + if (err) +@@ -2123,7 +2149,9 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, + + bh = __ext4_find_entry(dir, &fname, res_dir, NULL, NULL); + +- 
ext4_fname_free_filename(&fname); ++ if (fname.disk_name.name != dentry->d_name.name) ++ kfree(fname.disk_name.name); ++ + return bh; + } + +@@ -2215,7 +2243,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi + } + if (!IS_ERR(inode) && IS_ENCRYPTED(dir) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && +- !fscrypt_has_permitted_context(dir, inode)) { ++ !ext4_has_permitted_context(dir, inode)) { + ext4_warning(inode->i_sb, + "Inconsistent encryption contexts: %lu/%lu", + dir->i_ino, inode->i_ino); +@@ -2508,7 +2536,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, + if (ext4_check_dir_entry(dir, NULL, de, bh, + buf, buf_size, offset)) + return -EFSCORRUPTED; +- if (ext4_match(dir, fname, de)) ++ if (ext4_match(dir, fname, de, de->name_len)) + return -EEXIST; + nlen = EXT4_DIR_ENTRY_LEN(de, dir); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); +-- +2.34.1 + diff --git a/ldiskfs/kernel_patches/patches/linux-6.10/ext4-max-dir-size.patch b/ldiskfs/kernel_patches/patches/linux-6.10/ext4-max-dir-size.patch new file mode 100644 index 0000000..bd46b0e --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-6.10/ext4-max-dir-size.patch @@ -0,0 +1,50 @@ +Add a proc interface for max_dir_size. 
+ +--- + fs/ext4/sysfs.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c +index 2aeff069..17c391ec 100644 +--- a/fs/ext4/sysfs.c ++++ b/fs/ext4/sysfs.c +@@ -218,6 +218,8 @@ EXT4_ATTR_OFFSET(mb_group_prealloc, 0644, clusters_in_group, + EXT4_ATTR_OFFSET(mb_best_avail_max_trim_order, 0644, mb_order, + ext4_sb_info, s_mb_best_avail_max_trim_order); + EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); ++EXT4_RW_ATTR_SBI_UI(max_dir_size, s_max_dir_size_kb); ++EXT4_RW_ATTR_SBI_UI(max_dir_size_kb, s_max_dir_size_kb); + EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); + EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); +@@ -267,6 +269,8 @@ static struct attribute *ext4_attrs[] = { + ATTR_LIST(sra_exceeded_retry_limit), + ATTR_LIST(inode_readahead_blks), + ATTR_LIST(inode_goal), ++ ATTR_LIST(max_dir_size), ++ ATTR_LIST(max_dir_size_kb), + ATTR_LIST(mb_stats), + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), +@@ -392,6 +396,9 @@ static ssize_t ext4_generic_attr_show(struct ext4_attr *a, + case attr_pointer_ui: + if (a->attr_ptr == ptr_ext4_super_block_offset) + return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr)); ++ if (strcmp("max_dir_size", a->attr.name) == 0) ++ return sysfs_emit(buf, "%u\n", ++ (*((unsigned int *) ptr)) << 10); + return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr)); + case attr_pointer_ul: + return sysfs_emit(buf, "%lu\n", *((unsigned long *) ptr)); +@@ -471,6 +478,8 @@ static ssize_t ext4_generic_attr_store(struct ext4_attr *a, + ret = kstrtouint(skip_spaces(buf), 0, &t); + if (ret) + return ret; ++ if (strcmp("max_dir_size", a->attr.name) == 0) ++ t >>= 10; + if (a->attr_ptr == ptr_ext4_super_block_offset) + *((__le32 *) ptr) = cpu_to_le32(t); + else +-- +2.34.1 + diff --git a/ldiskfs/kernel_patches/patches/linux-6.10/ext4-mballoc-extra-checks.patch b/ldiskfs/kernel_patches/patches/linux-6.10/ext4-mballoc-extra-checks.patch new file mode 
100644 index 0000000..e957f37 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-6.10/ext4-mballoc-extra-checks.patch @@ -0,0 +1,320 @@ +commit f2f28f1d09c0a00b3fc569422f881931d857fac9 +Author: Alex Zhuravlev +AuthorDate: Tue Oct 28 17:59:09 2008 +0000 +Subject: ext4: detect on-disk corruption of block bitmap +Detect on-disk corruption of block bitmap and better checking of +preallocated blocks. +Bugzilla-ID: b=16680 +Signed-off-by: Alex Zhuravlev +Reviewed-by: Kalpak Shah +Signed-off-by: Andreas Dilger +--- + fs/ext4/ext4.h | 1 + + fs/ext4/mballoc.c | 105 ++++++++++++++++++++++++++++++++++++++++------ + fs/ext4/mballoc.h | 2 +- + 3 files changed, 94 insertions(+), 14 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 95bbfd52..860680e4 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -3449,6 +3449,7 @@ struct ext4_group_info { + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + ext4_group_t bb_group; /* Group number */ + struct list_head bb_prealloc_list; ++ unsigned long bb_prealloc_nr; + #ifdef DOUBLE_CHECK + void *bb_bitmap; + #endif +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index e64b31e5..838f5303 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -416,7 +416,7 @@ static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { + "ext4_groupinfo_64k", "ext4_groupinfo_128k" + }; + +-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ++static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group); + static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); + +@@ -1181,7 +1181,7 @@ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) + } + + static noinline_for_stack +-void ext4_mb_generate_buddy(struct super_block *sb, ++int ext4_mb_generate_buddy(struct super_block *sb, + void *buddy, void *bitmap, ext4_group_t group, + struct ext4_group_info *grp) + { +@@ -1225,6 +1225,7 @@ void 
ext4_mb_generate_buddy(struct super_block *sb, + grp->bb_free = free; + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ++ return -EIO; + } + mb_set_largest_free_order(sb, grp); + mb_update_avg_fragment_size(sb, grp); +@@ -1234,6 +1235,8 @@ void ext4_mb_generate_buddy(struct super_block *sb, + period = get_cycles() - period; + atomic_inc(&sbi->s_mb_buddies_generated); + atomic64_add(period, &sbi->s_mb_generation_time); ++ ++ return 0; + } + + static void mb_regenerate_buddy(struct ext4_buddy *e4b) +@@ -1355,7 +1358,7 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) + } + + first_block = folio->index * blocks_per_page; +- for (i = 0; i < blocks_per_page; i++) { ++ for (i = 0; i < blocks_per_page && err == 0; i++) { + group = (first_block + i) >> 1; + if (group >= ngroups) + break; +@@ -1403,7 +1406,7 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) + ext4_lock_group(sb, group); + /* init the buddy */ + memset(data, 0xff, blocksize); +- ext4_mb_generate_buddy(sb, data, incore, group, grinfo); ++ err = ext4_mb_generate_buddy(sb, data, incore, group, grinfo); + ext4_unlock_group(sb, group); + incore = NULL; + } else { +@@ -1418,7 +1421,7 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) + memcpy(data, bitmap, blocksize); + + /* mark all preallocated blks used in in-core bitmap */ +- ext4_mb_generate_from_pa(sb, data, group); ++ err = ext4_mb_generate_from_pa(sb, data, group); + WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root)); + ext4_unlock_group(sb, group); + +@@ -1428,7 +1431,8 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) + incore = data; + } + } +- folio_mark_uptodate(folio); ++ if (likely(err == 0)) ++ folio_mark_uptodate(folio); + + out: + if (bh) { +@@ -3028,8 +3032,10 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) + { + struct super_block *sb = pde_data(file_inode(seq->file)); + 
ext4_group_t group = (ext4_group_t) ((unsigned long) v); ++ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group - 1, NULL); + int i, err; + char nbuf[16]; ++ int free = 0; + struct ext4_buddy e4b; + struct ext4_group_info *grinfo; + unsigned char blocksize_bits = min_t(unsigned char, +@@ -3040,9 +3046,12 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) + ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; + } sg; + ++ if (gdp) ++ free = ext4_free_group_clusters(sb, gdp); ++ + group--; + if (group == 0) +- seq_puts(seq, "#group: free frags first [" ++ seq_puts(seq, "#group: bfree gfree frags first pa [" + " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " + " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); + +@@ -3067,8 +3076,10 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) + * these are safe to access even after the buddy has been unloaded + */ + memcpy(&sg, grinfo, i); +- seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, +- sg.info.bb_fragments, sg.info.bb_first_free); ++ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u %-5u %-5lu [", ++ (long unsigned int)group, sg.info.bb_free, free, ++ sg.info.bb_fragments, sg.info.bb_first_free, ++ sg.info.bb_prealloc_nr); + for (i = 0; i <= 13; i++) + seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? + sg.info.bb_counters[i] : 0); +@@ -5088,25 +5099,75 @@ try_group_pa: + return false; + } + ++/* ++ * check free blocks in bitmap match free block in group descriptor ++ * do this before taking preallocated blocks into account to be able ++ * to detect on-disk corruptions. The group lock should be held by the ++ * caller. 
++ */ ++static ++int ext4_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap, ++ struct ext4_group_desc *gdp, int group) ++{ ++ unsigned int max = EXT4_CLUSTERS_PER_GROUP(sb); ++ unsigned int i, first, free = 0; ++ unsigned int free_in_gdp = ext4_free_group_clusters(sb, gdp); ++ ++ if (free_in_gdp == 0 && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) ++ return 0; ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ ++ while (i < max) { ++ first = i; ++ i = mb_find_next_bit(bitmap, max, i); ++ if (i > max) ++ i = max; ++ free += i - first; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ ++ if (free != free_in_gdp) { ++ ext4_error(sb, "on-disk bitmap for group %d " ++ "corrupted: %u blocks free in bitmap, %u - in gd\n", ++ group, free, free_in_gdp); ++ return -EIO; ++ } ++ return 0; ++} ++ + /* + * the function goes through all preallocation in this group and marks them + * used in in-core bitmap. buddy must be generated from this bitmap + * Need to be called with ext4 group lock held + */ + static noinline_for_stack +-void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ++int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group) + { + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_prealloc_space *pa; ++ struct ext4_group_desc *gdp; + struct list_head *cur; + ext4_group_t groupnr; + ext4_grpblk_t start; + int preallocated = 0; ++ int skip = 0, count = 0; ++ int err; + int len; + + if (!grp) +- return; ++ return -EIO; ++ ++ gdp = ext4_get_group_desc(sb, group, NULL); ++ if (gdp == NULL) ++ return -EIO; ++ ++ /* before applying preallocations, check bitmap consistency */ ++ err = ext4_mb_check_ondisk_bitmap(sb, bitmap, gdp, group); ++ if (err) ++ return err; + + /* all form of preallocation discards first load group, + * so the only competing code is preallocation use. 
+@@ -5123,13 +5184,23 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + &groupnr, &start); + len = pa->pa_len; + spin_unlock(&pa->pa_lock); +- if (unlikely(len == 0)) ++ if (unlikely(len == 0)) { ++ skip++; + continue; ++ } + BUG_ON(groupnr != group); + mb_set_bits(bitmap, start, len); + preallocated += len; ++ count++; ++ } ++ if (count + skip != grp->bb_prealloc_nr) { ++ ext4_error(sb, "lost preallocations: " ++ "count %d, bb_prealloc_nr %lu, skip %d\n", ++ count, grp->bb_prealloc_nr, skip); ++ return -EIO; + } + mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); ++ return 0; + } + + static void ext4_mb_mark_pa_deleted(struct super_block *sb, +@@ -5220,6 +5291,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, + */ + ext4_lock_group(sb, grp); + list_del(&pa->pa_group_list); ++ ext4_get_group_info(sb, grp)->bb_prealloc_nr--; + ext4_unlock_group(sb, grp); + + if (pa->pa_type == MB_INODE_PA) { +@@ -5353,6 +5425,7 @@ adjust_bex: + pa->pa_inode = ac->ac_inode; + + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ++ grp->bb_prealloc_nr++; + + write_lock(pa->pa_node_lock.inode_lock); + ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node); +@@ -5406,6 +5479,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) + pa->pa_inode = NULL; + + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ++ grp->bb_prealloc_nr++; + + /* + * We will later add the new pa to the right bucket +@@ -5572,6 +5646,8 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, + + spin_unlock(&pa->pa_lock); + ++ BUG_ON(grp->bb_prealloc_nr == 0); ++ grp->bb_prealloc_nr--; + list_del(&pa->pa_group_list); + list_add(&pa->u.pa_tmp_list, &list); + } +@@ -5703,7 +5779,7 @@ repeat: + if (err) { + ext4_error_err(sb, -err, "Error %d loading buddy information for %u", + err, group); +- continue; ++ return; + } + + bitmap_bh = ext4_read_block_bitmap(sb, group); +@@ -5716,6 +5792,8 @@ repeat: + } + + 
ext4_lock_group(sb, group); ++ BUG_ON(e4b.bd_info->bb_prealloc_nr == 0); ++ e4b.bd_info->bb_prealloc_nr--; + list_del(&pa->pa_group_list); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); + ext4_unlock_group(sb, group); +@@ -6020,6 +6098,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, + } + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); ++ ext4_get_group_info(sb, group)->bb_prealloc_nr--; + ext4_mb_release_group_pa(&e4b, pa); + ext4_unlock_group(sb, group); + +diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h +index d8553f14..fec1b8c2 100644 +--- a/fs/ext4/mballoc.h ++++ b/fs/ext4/mballoc.h +@@ -66,7 +66,7 @@ + /* + * for which requests use 2^N search using buddies + */ +-#define MB_DEFAULT_ORDER2_REQS 2 ++#define MB_DEFAULT_ORDER2_REQS 8 + + /* + * default group prealloc size 512 blocks +-- +2.34.1 + diff --git a/ldiskfs/kernel_patches/patches/linux-6.10/ext4-misc.patch b/ldiskfs/kernel_patches/patches/linux-6.10/ext4-misc.patch new file mode 100644 index 0000000..a81589e --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-6.10/ext4-misc.patch @@ -0,0 +1,208 @@ +commit b175e2441b0cd9fae60341ba92b0f7f192e71446 +Author: girish + +b=16893 +i=adilger +i=johann + +ext4 ldiskfs patches for rhel5 + +ported to linux 6.10 +--- + fs/ext4/ext4.h | 23 ++++++++++++++++++++++- + fs/ext4/ialloc.c | 3 ++- + fs/ext4/inode.c | 17 +++++++++++++++++ + fs/ext4/namei.c | 9 ++++++--- + fs/ext4/super.c | 6 ------ + fs/ext4/xattr.c | 2 ++ + 6 files changed, 49 insertions(+), 11 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index b5827d01..95bbfd52 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -2225,7 +2225,21 @@ static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_bl + + EXTN_FEATURE_FUNCS(2) + EXTN_FEATURE_FUNCS(3) +-EXTN_FEATURE_FUNCS(4) ++static inline bool ext4_has_unknown_ext4_compat_features(struct super_block *sb) ++{ ++ return ((EXT4_SB(sb)->s_es->s_feature_compat & ++ 
cpu_to_le32(~EXT4_FEATURE_COMPAT_SUPP)) != 0); ++} ++static inline bool ext4_has_unknown_ext4_ro_compat_features(struct super_block *sb) ++{ ++ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & ++ cpu_to_le32(~EXT4_FEATURE_RO_COMPAT_SUPP)) != 0); ++} ++static inline bool ext4_has_unknown_ext4_incompat_features(struct super_block *sb) ++{ ++ return ((EXT4_SB(sb)->s_es->s_feature_incompat & ++ cpu_to_le32(~EXT4_FEATURE_INCOMPAT_SUPP)) != 0); ++} + + static inline bool ext4_has_compat_features(struct super_block *sb) + { +@@ -3695,6 +3709,13 @@ struct ext4_extent; + #define EXT_MAX_BLOCKS 0xffffffff + + extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode); ++extern struct buffer_head *ext4_read_inode_bitmap(struct super_block *sb, ++ ext4_group_t block_group); ++extern void ext4_inc_count(struct inode *inode); ++extern void ext4_dec_count(struct inode *inode); ++extern struct buffer_head *ext4_append(handle_t *handle, ++ struct inode *inode, ++ ext4_lblk_t *block); + extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); + extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c +index 93689dae..31480792 100644 +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -120,7 +120,7 @@ verified: + * + * Return buffer_head of bitmap on success, or an ERR_PTR on error. + */ +-static struct buffer_head * ++struct buffer_head * + ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) + { + struct ext4_group_desc *desc; +@@ -215,6 +215,7 @@ out: + put_bh(bh); + return ERR_PTR(err); + } ++EXPORT_SYMBOL(ext4_read_inode_bitmap); + + /* + * NOTE! 
When we get the inode, we're the only people +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 339bdfac..cd8ab8d3 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -6166,3 +6166,20 @@ out_error: + ext4_journal_stop(handle); + goto out; + } ++EXPORT_SYMBOL(ext4_map_blocks); ++EXPORT_SYMBOL(ext4_truncate); ++EXPORT_SYMBOL(ext4_iget); ++EXPORT_SYMBOL(ext4_bread); ++EXPORT_SYMBOL(ext4_itable_unused_count); ++EXPORT_SYMBOL(ext4_force_commit); ++EXPORT_SYMBOL(__ext4_mark_inode_dirty); ++EXPORT_SYMBOL(ext4_get_group_desc); ++EXPORT_SYMBOL(__ext4_journal_get_write_access); ++EXPORT_SYMBOL(__ext4_journal_start_sb); ++EXPORT_SYMBOL(__ext4_journal_stop); ++EXPORT_SYMBOL(__ext4_handle_dirty_metadata); ++EXPORT_SYMBOL(__ext4_std_error); ++EXPORT_SYMBOL(ext4fs_dirhash); ++EXPORT_SYMBOL(ext4_get_inode_loc); ++EXPORT_SYMBOL(__ext4_journal_ensure_credits); ++EXPORT_SYMBOL(ext4_chunk_trans_blocks); +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index 8f9c3c0e..bfd849ca 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -50,7 +50,7 @@ + #define NAMEI_RA_BLOCKS 4 + #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + +-static struct buffer_head *ext4_append(handle_t *handle, ++struct buffer_head *ext4_append(handle_t *handle, + struct inode *inode, + ext4_lblk_t *block) + { +@@ -210,6 +210,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, + } + return bh; + } ++EXPORT_SYMBOL(ext4_append); + + #ifdef DX_DEBUG + #define dxtrace(command) command +@@ -2786,23 +2787,25 @@ EXPORT_SYMBOL(ext4_delete_entry); + * for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set + * on regular files) and to avoid creating huge/slow non-HTREE directories. 
+ */ +-static void ext4_inc_count(struct inode *inode) ++void ext4_inc_count(struct inode *inode) + { + inc_nlink(inode); + if (is_dx(inode) && + (inode->i_nlink > EXT4_LINK_MAX || inode->i_nlink == 2)) + set_nlink(inode, 1); + } ++EXPORT_SYMBOL(ext4_inc_count); + + /* + * If a directory had nlink == 1, then we should let it be 1. This indicates + * directory has >EXT4_LINK_MAX subdirs. + */ +-static void ext4_dec_count(struct inode *inode) ++void ext4_dec_count(struct inode *inode) + { + if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) + drop_nlink(inode); + } ++EXPORT_SYMBOL(ext4_dec_count); + + + /* +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index c682fb92..5250fa60 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -7365,16 +7365,12 @@ static int __init ext4_init_fs(void) + if (err) + goto out05; + +- register_as_ext3(); +- register_as_ext2(); + err = register_filesystem(&ext4_fs_type); + if (err) + goto out; + + return 0; + out: +- unregister_as_ext2(); +- unregister_as_ext3(); + ext4_fc_destroy_dentry_cache(); + out05: + destroy_inodecache(); +@@ -7399,8 +7395,6 @@ out7: + static void __exit ext4_exit_fs(void) + { + ext4_destroy_lazyinit_thread(); +- unregister_as_ext2(); +- unregister_as_ext3(); + unregister_filesystem(&ext4_fs_type); + ext4_fc_destroy_dentry_cache(); + destroy_inodecache(); +diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c +index 6460879b..4b94e270 100644 +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -716,6 +716,7 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name, + up_read(&EXT4_I(inode)->xattr_sem); + return error; + } ++EXPORT_SYMBOL(ext4_xattr_get); + + static int + ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, +@@ -2490,6 +2491,7 @@ cleanup: + ext4_write_unlock_xattr(inode, &no_expand); + return error; + } ++EXPORT_SYMBOL(ext4_xattr_set_handle); + + int ext4_xattr_set_credits(struct inode *inode, size_t value_len, + bool is_create, int *credits) +-- +2.34.1 + diff 
--git a/ldiskfs/kernel_patches/patches/linux-6.10/ext4-prealloc.patch b/ldiskfs/kernel_patches/patches/linux-6.10/ext4-prealloc.patch new file mode 100644 index 0000000..45cc9e1 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-6.10/ext4-prealloc.patch @@ -0,0 +1,409 @@ +commit d8d8fd9192a54c7b8caef8cca9b7a1eb5e5e3298 +Author: Alex Zhuravlev +AuthorDate: Thu Oct 23 10:02:19 2008 +0000 +Subject: ext4: support for tunable preallocation window +Add support for tunable preallocation window and new tunables +for large/small requests. +Bugzilla-ID: b=12800 +Signed-off-by: Alex Zhuravlev +Reviewed-by: Kalpak Shah +Reviewed-by: Andreas Dilger +--- + fs/ext4/ext4.h | 7 +- + fs/ext4/inode.c | 3 + + fs/ext4/mballoc.c | 220 +++++++++++++++++++++++++++++++++++----------- + fs/ext4/sysfs.c | 8 +- + 4 files changed, 182 insertions(+), 56 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 7332e538..b0723244 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1287,6 +1287,8 @@ extern void mb_set_bits(void *bm, int cur, int len); + #define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ + #define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + ++#define EXT4_MAX_PREALLOC_TABLE 64 ++ + /* + * Behaviour when detecting errors + */ +@@ -1595,11 +1597,13 @@ struct ext4_sb_info { + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_max_linear_groups; +- unsigned int s_mb_stream_request; ++ unsigned long s_mb_small_req; ++ unsigned long s_mb_large_req; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; ++ unsigned long *s_mb_prealloc_table; + unsigned int s_mb_group_prealloc; + unsigned int s_max_dir_size_kb; + /* where last allocation was done - for stream allocation */ +@@ -2915,6 +2919,7 @@ int ext4_fc_record_regions(struct super_block *sb, int ino, + int len, int replay); + + /* mballoc.c */ ++extern const struct proc_ops ext4_seq_prealloc_table_fops; + extern 
const struct seq_operations ext4_mb_seq_groups_ops; + extern const struct seq_operations ext4_mb_seq_structs_summary_ops; + extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 401cf597..339bdfac 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -2595,6 +2595,9 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) + PAGE_SIZE >> inode->i_blkbits); + } + ++ if (wbc->nr_to_write < sbi->s_mb_small_req) ++ wbc->nr_to_write = sbi->s_mb_small_req; ++ + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index 9dda9cd6..e64b31e5 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -3274,6 +3274,99 @@ const struct seq_operations ext4_mb_seq_structs_summary_ops = { + .show = ext4_mb_seq_structs_summary_show, + }; + ++static int ext4_mb_check_and_update_prealloc(struct ext4_sb_info *sbi, ++ char *str, size_t cnt, ++ int update) ++{ ++ unsigned long value; ++ unsigned long prev = 0; ++ char *cur; ++ char *next; ++ char *end; ++ int num = 0; ++ ++ cur = str; ++ end = str + cnt; ++ while (cur < end) { ++ while ((cur < end) && (*cur == ' ')) cur++; ++ value = simple_strtol(cur, &next, 0); ++ if (value == 0) ++ break; ++ if (cur == next) ++ return -EINVAL; ++ ++ cur = next; ++ ++ if (value > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group)) ++ return -EINVAL; ++ ++ /* they should add values in order */ ++ if (value <= prev) ++ return -EINVAL; ++ ++ if (update) ++ sbi->s_mb_prealloc_table[num] = value; ++ ++ prev = value; ++ num++; ++ } ++ ++ if (num > EXT4_MAX_PREALLOC_TABLE - 1) ++ return -EOVERFLOW; ++ ++ if (update) ++ sbi->s_mb_prealloc_table[num] = 0; ++ ++ return 0; ++} ++ ++static ssize_t ext4_mb_prealloc_table_proc_write(struct file *file, ++ const char __user *buf, ++ size_t cnt, loff_t *pos) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(pde_data(file_inode(file))); ++ char str[128] = { 0 }; ++ int rc; 
++ ++ if (cnt >= sizeof(str)) ++ return -EINVAL; ++ if (copy_from_user(str, buf, cnt)) ++ return -EFAULT; ++ ++ rc = ext4_mb_check_and_update_prealloc(sbi, str, cnt, 0); ++ if (rc) ++ return rc; ++ ++ rc = ext4_mb_check_and_update_prealloc(sbi, str, cnt, 1); ++ return rc ? rc : cnt; ++} ++ ++static int mb_prealloc_table_seq_show(struct seq_file *m, void *v) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(m->private); ++ int i; ++ ++ for (i = 0; i < EXT4_MAX_PREALLOC_TABLE && ++ sbi->s_mb_prealloc_table[i] != 0; i++) ++ seq_printf(m, "%ld ", sbi->s_mb_prealloc_table[i]); ++ seq_printf(m, "\n"); ++ ++ return 0; ++} ++ ++static int mb_prealloc_table_seq_open(struct inode *inode, struct file *file) ++{ ++ return single_open(file, mb_prealloc_table_seq_show, pde_data(inode)); ++} ++ ++const struct proc_ops ext4_seq_prealloc_table_fops = { ++ .proc_open = mb_prealloc_table_seq_open, ++ .proc_read = seq_read, ++ .proc_lseek = seq_lseek, ++ .proc_release = single_release, ++ .proc_write = ext4_mb_prealloc_table_proc_write, ++}; ++ + static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) + { + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; +@@ -3590,7 +3683,7 @@ static void ext4_discard_work(struct work_struct *work) + int ext4_mb_init(struct super_block *sb) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); +- unsigned i, j; ++ unsigned i, j, k, l; + unsigned offset, offset_incr; + unsigned max; + int ret; +@@ -3679,7 +3772,6 @@ int ext4_mb_init(struct super_block *sb) + sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; + sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; + sbi->s_mb_stats = MB_DEFAULT_STATS; +- sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; + sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; + sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER; + +@@ -3705,9 +3797,29 @@ int ext4_mb_init(struct super_block *sb) + * RAID stripe size so that preallocations don't fragment + * the stripes. 
+ */ +- if (sbi->s_stripe > 1) { +- sbi->s_mb_group_prealloc = roundup( +- sbi->s_mb_group_prealloc, EXT4_B2C(sbi, sbi->s_stripe)); ++ ++ /* Allocate table once */ ++ sbi->s_mb_prealloc_table = kzalloc( ++ EXT4_MAX_PREALLOC_TABLE * sizeof(unsigned long), GFP_NOFS); ++ if (sbi->s_mb_prealloc_table == NULL) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ if (sbi->s_stripe == 0) { ++ for (k = 0, l = 4; k <= 9; ++k, l *= 2) ++ sbi->s_mb_prealloc_table[k] = l; ++ ++ sbi->s_mb_small_req = 256; ++ sbi->s_mb_large_req = 1024; ++ sbi->s_mb_group_prealloc = 512; ++ } else { ++ for (k = 0, l = sbi->s_stripe; k <= 2; ++k, l *= 2) ++ sbi->s_mb_prealloc_table[k] = l; ++ ++ sbi->s_mb_small_req = sbi->s_stripe; ++ sbi->s_mb_large_req = sbi->s_stripe * 8; ++ sbi->s_mb_group_prealloc = sbi->s_stripe * 4; + } + + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); +@@ -3743,6 +3855,7 @@ out: + kfree(sbi->s_mb_avg_fragment_size_locks); + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); ++ kfree(sbi->s_mb_prealloc_table); + kfree(sbi->s_mb_offsets); + sbi->s_mb_offsets = NULL; + kfree(sbi->s_mb_maxs); +@@ -4099,7 +4212,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, + ext4_grpblk_t changed; + + BUG_ON(ac->ac_status != AC_STATUS_FOUND); +- BUG_ON(ac->ac_b_ex.fe_len <= 0); + + sb = ac->ac_sb; + sbi = EXT4_SB(sb); +@@ -4423,10 +4535,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, + { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_super_block *es = sbi->s_es; +- int bsbits, max; +- loff_t size, start_off, end; ++ int bsbits, i, wind; ++ loff_t size, end; + loff_t orig_size __maybe_unused; + ext4_lblk_t start; ++ unsigned long value, last_non_zero; + + /* do normalize only data requests, metadata requests + do not need preallocation */ +@@ -4455,51 +4568,46 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, + size = size << bsbits; + if (size < i_size_read(ac->ac_inode)) + 
size = i_size_read(ac->ac_inode); +- orig_size = size; ++ size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits; ++ ++ start = wind = 0; ++ value = last_non_zero = 0; + +- /* max size of free chunks */ +- max = 2 << bsbits; +- +-#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ +- (req <= (size) || max <= (chunk_size)) +- +- /* first, try to predict filesize */ +- /* XXX: should this table be tunable? */ +- start_off = 0; +- if (size <= 16 * 1024) { +- size = 16 * 1024; +- } else if (size <= 32 * 1024) { +- size = 32 * 1024; +- } else if (size <= 64 * 1024) { +- size = 64 * 1024; +- } else if (size <= 128 * 1024) { +- size = 128 * 1024; +- } else if (size <= 256 * 1024) { +- size = 256 * 1024; +- } else if (size <= 512 * 1024) { +- size = 512 * 1024; +- } else if (size <= 1024 * 1024) { +- size = 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (21 - bsbits)) << 21; +- size = 2 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (22 - bsbits)) << 22; +- size = 4 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len), +- (8<<20)>>bsbits, max, 8 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (23 - bsbits)) << 23; +- size = 8 * 1024 * 1024; ++ /* let's choose preallocation window depending on file size */ ++ for (i = 0; i < EXT4_MAX_PREALLOC_TABLE; i++) { ++ value = sbi->s_mb_prealloc_table[i]; ++ if (value == 0) ++ break; ++ else ++ last_non_zero = value; ++ ++ if (size <= value) { ++ wind = value; ++ break; ++ } ++ } ++ ++ if (wind == 0) { ++ if (last_non_zero != 0) { ++ __u64 tstart, tend; ++ /* file is quite large, we now preallocate with ++ * the biggest configured window with regard to ++ * logical offset */ ++ wind = last_non_zero; ++ tstart = ac->ac_o_ex.fe_logical; ++ do_div(tstart, wind); ++ start = tstart * wind; ++ tend = ac->ac_o_ex.fe_logical 
+ ac->ac_o_ex.fe_len - 1; ++ do_div(tend, wind); ++ tend = tend * wind + wind; ++ size = tend - start; ++ } + } else { +- start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; +- size = (loff_t) EXT4_C2B(sbi, +- ac->ac_o_ex.fe_len) << bsbits; ++ size = wind; + } +- size = size >> bsbits; +- start = start_off >> bsbits; ++ ++ ++ orig_size = size; + + /* + * For tiny groups (smaller than 8MB) the chosen allocation +@@ -4558,7 +4666,6 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, + (unsigned long) ac->ac_o_ex.fe_logical); + BUG(); + } +- BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + + /* now prepare goal request */ + +@@ -5761,8 +5868,8 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) + inode_pa_eligible = false; + + size = max(size, isize); +- /* Don't use group allocation for large files */ +- if (size > sbi->s_mb_stream_request) ++ if ((ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) || ++ (size >= sbi->s_mb_large_req)) + group_pa_eligible = false; + + if (!group_pa_eligible) { +@@ -5773,6 +5880,13 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) + return; + } + ++ /* ++ * request is so large that we don't care about ++ * streaming - it overweights any possible seek ++ */ ++ if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req) ++ return; ++ + BUG_ON(ac->ac_lg != NULL); + /* + * locality group prealloc space are per cpu. 
The reason for having +diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c +index ddb54608..2aeff069 100644 +--- a/fs/ext4/sysfs.c ++++ b/fs/ext4/sysfs.c +@@ -222,7 +222,8 @@ EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); + EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); +-EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); ++EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_small_req); ++EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req); + EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); + EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); + EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); +@@ -270,7 +271,8 @@ static struct attribute *ext4_attrs[] = { + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), + ATTR_LIST(mb_order2_req), +- ATTR_LIST(mb_stream_req), ++ ATTR_LIST(mb_small_req), ++ ATTR_LIST(mb_large_req), + ATTR_LIST(mb_group_prealloc), + ATTR_LIST(mb_max_linear_groups), + ATTR_LIST(max_writeback_mb_bump), +@@ -584,6 +586,8 @@ int ext4_register_sysfs(struct super_block *sb) + ext4_fc_info_show, sb); + proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_ops, sb); ++ proc_create_data("prealloc_table", S_IRUGO, sbi->s_proc, ++ &ext4_seq_prealloc_table_fops, sb); + proc_create_single_data("mb_stats", 0444, sbi->s_proc, + ext4_seq_mb_stats_show, sb); + proc_create_seq_data("mb_structs_summary", 0444, sbi->s_proc, +-- +2.34.1 + diff --git a/ldiskfs/kernel_patches/series/ldiskfs-6.10-ml.series b/ldiskfs/kernel_patches/series/ldiskfs-6.10-ml.series new file mode 100644 index 0000000..99db9cb --- /dev/null +++ b/ldiskfs/kernel_patches/series/ldiskfs-6.10-ml.series @@ -0,0 +1,37 @@ +linux-5.16/ext4-inode-version.patch +linux-5.18/ext4-lookup-dotdot.patch +linux-5.14/ext4-print-inum-in-htree-warning.patch +linux-6.10/ext4-prealloc.patch +linux-5.16/ext4-osd-iop-common.patch 
+linux-6.10/ext4-misc.patch +linux-6.10/ext4-mballoc-extra-checks.patch +sles15sp4/ext4-hash-indexed-dir-dotdot-update.patch +linux-5.14/ext4-kill-dx-root.patch +linux-6.5/ext4-mballoc-pa-free-mismatch.patch +linux-6.5/ext4-data-in-dirent.patch +linux-6.6/ext4-nocmtime.patch +base/ext4-htree-lock.patch +linux-6.5/ext4-pdirop.patch +linux-6.10/ext4-max-dir-size.patch +linux-6.10/ext4-corrupted-inode-block-bitmaps-handling-patches.patch +rhel9/ext4-give-warning-with-dir-htree-growing.patch +ubuntu18/ext4-jcb-optimization.patch +linux-6.2/ext4-attach-jinode-in-writepages.patch +linux-6.5/ext4-dont-check-before-replay.patch +rhel7.6/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch +rhel7.6/ext4-export-orphan-add.patch +linux-5.18/ext4-export-mb-stream-allocator-variables.patch +ubuntu19/ext4-iget-with-flags.patch +linux-5.14/export-ext4fs-dirhash-helper.patch +linux-5.8/ext4-no-max-dir-size-limit-for-iam-objects.patch +rhel9/ext4-dquot-commit-speedup.patch +linux-6.7/ext4-ialloc-uid-gid-and-pass-owner-down.patch +linux-5.14/ext4-projid-xattrs.patch +linux-6.10/ext4-delayed-iput.patch +rhel8/ext4-ext-merge.patch +linux-5.14/ext4-xattr-disable-credits-check.patch +rhel9.2/ext4-fiemap-kernel-data.patch +rhel8/ext4-old_ea_inodes_handling_fix.patch +linux-6.10/ext4-filename-encode.patch +rhel9.1/ext4-enc-flag.patch +linux-6.6/ext4-encdata.patch diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 250d474..53965c5 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -26,12 +26,17 @@ TBD Whamcloud 5.14.0-284.30.1.el9 (RHEL9.2) 4.4.120-92.70 (SLES12 SP2) 4.4.180-94.100 (SLES12 SP3) + 5.14.21-150500.55.65 (SLES15 SP5) 4.4.0-131 (Ubuntu 16.04) 4.15.0-32 (Ubuntu 18.04) 5.4.0-48 (Ubuntu 20.04) + 6.8.0-38 (Ubuntu 24.04) + 6.10.0-15 (Ubuntu 24.04) vanilla linux 5.4.0 (ZFS + ldiskfs) vanilla linux 5.4.21 (ZFS + ldiskfs) vanilla linux 5.4.136 (ZFS + ldiskfs) + vanilla linux 6.1.36 (ZFS + ldiskfs) + vanilla linux 6.6.13 (ZFS + ldiskfs) 5.10.0-60.94.0.118.oe2203 (openEuler 22.03 
LTS) 5.10.0-136.32.0.108.oe2203sp1 (openEuler 22.03 LTS SP1) 5.10.0-153.19.0.95.oe2203sp2 (openEuler 22.03 LTS SP2) -- 1.8.3.1