Whamcloud - gitweb
LU-16350 ldiskfs: Server support for linux v6.10 29/55729/3
author Shaun Tancheff <shaun.tancheff@hpe.com>
Tue, 23 Jul 2024 02:09:42 +0000 (09:09 +0700)
committer Oleg Drokin <green@whamcloud.com>
Thu, 8 Aug 2024 00:17:37 +0000 (00:17 +0000)
Updated patch series for Linux v6.10:
   ext4-corrupted-inode-block-bitmaps-handling-patches.patch
   ext4-delayed-iput.patch
   ext4-filename-encode.patch
   ext4-max-dir-size.patch
   ext4-mballoc-extra-checks.patch
   ext4-misc.patch
   ext4-prealloc.patch

The same updates apply to the Ubuntu 6.10.0 kernel.

Test-Parameters: trivial
HPE-bug-id: LUS-11376
Signed-off-by: Shaun Tancheff <shaun.tancheff@hpe.com>
Change-Id: I456ec723f04aaf57cb64965cc9d53fbea23a8c27
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/55729
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Tested-by: Shuichi Ihara <sihara@ddn.com>
Reviewed-by: Shuichi Ihara <sihara@ddn.com>
Reviewed-by: Jian Yu <yujian@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
config/lustre-build-ldiskfs.m4
ldiskfs/kernel_patches/patches/linux-6.10/ext4-corrupted-inode-block-bitmaps-handling-patches.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/linux-6.10/ext4-delayed-iput.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/linux-6.10/ext4-filename-encode.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/linux-6.10/ext4-max-dir-size.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/linux-6.10/ext4-mballoc-extra-checks.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/linux-6.10/ext4-misc.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/linux-6.10/ext4-prealloc.patch [new file with mode: 0644]
ldiskfs/kernel_patches/series/ldiskfs-6.10-ml.series [new file with mode: 0644]
lustre/ChangeLog

index 093b69d..41f7f34 100644 (file)
@@ -91,6 +91,7 @@ AS_IF([test x$RHEL_KERNEL = xyes], [
            ])
 ], [test x$UBUNTU_KERNEL = xyes], [
         BASEVER=$(echo $LINUXRELEASE | cut -d'-' -f1)
+       AS_VERSION_COMPARE([$BASEVER],[6.10.0],[
        AS_VERSION_COMPARE([$BASEVER],[6.8.0],[
        AS_VERSION_COMPARE([$BASEVER],[5.19.0],[
        AS_VERSION_COMPARE([$BASEVER],[5.15.0],[
@@ -157,7 +158,9 @@ AS_IF([test x$RHEL_KERNEL = xyes], [
        [LDISKFS_SERIES="5.19.0-35-ubuntu.series"],
        [LDISKFS_SERIES="5.19.0-35-ubuntu.series"])],
        [LDISKFS_SERIES="6.7-ml.series"],
-       [LDISKFS_SERIES="6.7-ml.series"])
+       [LDISKFS_SERIES="6.7-ml.series"])],
+       [LDISKFS_SERIES="6.10-ml.series"],
+       [LDISKFS_SERIES="6.10-ml.series"])
 ], [test x$OPENEULER_KERNEL = xyes], [
        case $OPENEULER_VERSION_NO in
        2203.0) LDISKFS_SERIES="5.10.0-oe2203.series" ;;
@@ -186,7 +189,11 @@ AS_IF([test -z "$LDISKFS_SERIES"],
        AS_VERSION_COMPARE([$LINUXRELEASE],[6.7.0], [
                LDISKFS_SERIES="6.6-ml.series"], [
                LDISKFS_SERIES="6.7-ml.series"], [
-               LDISKFS_SERIES="6.7-ml.series"]
+       AS_VERSION_COMPARE([$LINUXRELEASE],[6.10.0], [
+               LDISKFS_SERIES="6.7-ml.series"], [
+               LDISKFS_SERIES="6.10-ml.series"], [
+               LDISKFS_SERIES="6.10-ml.series"]
+       )] # 6.10
        )] # 6.7
        )] # 6.6
        )] # 6.1
diff --git a/ldiskfs/kernel_patches/patches/linux-6.10/ext4-corrupted-inode-block-bitmaps-handling-patches.patch b/ldiskfs/kernel_patches/patches/linux-6.10/ext4-corrupted-inode-block-bitmaps-handling-patches.patch
new file mode 100644 (file)
index 0000000..9d1e4b0
--- /dev/null
@@ -0,0 +1,298 @@
+commit 2963f3d09eb3a0817f87386c0bd7be7ce086809d
+Author:     Wang Shilong <wshilong@whamcloud.com>
+AuthorDate: Tue Sep 8 21:54:29 2015 +0800
+LU-7114 ldiskfs: corrupted bitmaps handling patches
+
+This patch backported following patches from upstream:
+
+163a203ddb36c36d4a1c942aececda0cc8d06aa7
+ext4: mark block group as corrupt on block bitmap error
+
+87a39389be3e3b007d341be510a7e4a0542bdf05
+ext4: mark block group as corrupt on inode bitmap error
+
+bdfb6ff4a255dcebeb09a901250e13a97eff75af
+ext4: mark group corrupt on group descriptor checksum
+
+Also use ext4_warning() instead of ext4_error() so that the
+filesystem doesn't become RO by default, and together
+with these patches, the FS will still be usable even if such
+bad things happen.
+
+Signed-off-by: Wang Shilong <wshilong@ddn.com>
+Change-Id: Ib4075aba7df6f7f59e89a90475405080acd43dd0
+Reviewed-on: http://review.whamcloud.com/16312
+Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
+Reviewed-by: Yang Sheng <yang.sheng@intel.com>
+
+NOTE: Ported to linux 6.7 keeps the ext4_warning() updates.
+---
+ fs/ext4/balloc.c  | 18 +++++++--------
+ fs/ext4/ialloc.c  |  6 ++---
+ fs/ext4/mballoc.c | 59 ++++++++++++++++++-----------------------------
+ 3 files changed, 34 insertions(+), 49 deletions(-)
+
+diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
+index 591fb3f7..6f19cefd 100644
+--- a/fs/ext4/balloc.c
++++ b/fs/ext4/balloc.c
+@@ -420,7 +420,7 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
+       if (unlikely(!ext4_block_bitmap_csum_verify(sb, desc, bh) ||
+                    ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) {
+               ext4_unlock_group(sb, block_group);
+-              ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
++              ext4_warning(sb, "bg %u: bad block bitmap checksum", block_group);
+               ext4_mark_group_bitmap_corrupted(sb, block_group,
+                                       EXT4_GROUP_INFO_BBITMAP_CORRUPT);
+               return -EFSBADCRC;
+@@ -428,8 +428,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
+       blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
+       if (unlikely(blk != 0)) {
+               ext4_unlock_group(sb, block_group);
+-              ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
+-                         block_group, blk);
++              ext4_warning(sb, "bg %u: block %llu: invalid block bitmap",
++                           block_group, blk);
+               ext4_mark_group_bitmap_corrupted(sb, block_group,
+                                       EXT4_GROUP_INFO_BBITMAP_CORRUPT);
+               return -EFSCORRUPTED;
+@@ -519,18 +519,16 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
+                       goto out;
+               }
+               err = ext4_init_block_bitmap(sb, bh, block_group, desc);
+-              if (err) {
+-                      ext4_unlock_group(sb, block_group);
+-                      unlock_buffer(bh);
+-                      ext4_error(sb, "Failed to init block bitmap for group "
+-                                 "%u: %d", block_group, err);
+-                      goto out;
+-              }
+               set_bitmap_uptodate(bh);
+               set_buffer_uptodate(bh);
+               set_buffer_verified(bh);
+               ext4_unlock_group(sb, block_group);
+               unlock_buffer(bh);
++              if (err) {
++                      ext4_warning(sb, "Failed to init block bitmap for group "
++                                 "%u: %d", block_group, err);
++                      goto out;
++              }
+               return bh;
+       }
+       ext4_unlock_group(sb, block_group);
+diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
+index 31480792..c725ade0 100644
+--- a/fs/ext4/ialloc.c
++++ b/fs/ext4/ialloc.c
+@@ -102,8 +102,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
+                                          EXT4_INODES_PER_GROUP(sb) / 8) ||
+           ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) {
+               ext4_unlock_group(sb, block_group);
+-              ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
+-                         "inode_bitmap = %llu", block_group, blk);
++              ext4_warning(sb, "Corrupt inode bitmap - block_group = %u, "
++                           "inode_bitmap = %llu", block_group, blk);
+               ext4_mark_group_bitmap_corrupted(sb, block_group,
+                                       EXT4_GROUP_INFO_IBITMAP_CORRUPT);
+               return -EFSBADCRC;
+@@ -353,7 +353,7 @@ out:
+               if (!fatal)
+                       fatal = err;
+       } else {
+-              ext4_error(sb, "bit already cleared for inode %lu", ino);
++              ext4_warning(sb, "bit already cleared for inode %lu", ino);
+               ext4_mark_group_bitmap_corrupted(sb, block_group,
+                                       EXT4_GROUP_INFO_IBITMAP_CORRUPT);
+       }
+diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
+index 11551a01..3bcfb5d1 100644
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -1214,10 +1214,14 @@ int ext4_mb_generate_buddy(struct super_block *sb,
+       grp->bb_fragments = fragments;
+       if (free != grp->bb_free) {
+-              ext4_grp_locked_error(sb, group, 0, 0,
+-                                    "block bitmap and bg descriptor "
+-                                    "inconsistent: %u vs %u free clusters",
+-                                    free, grp->bb_free);
++              struct ext4_group_desc *gdp;
++              gdp = ext4_get_group_desc(sb, group, NULL);
++              ext4_warning(sb, "group %lu: block bitmap and bg descriptor "
++                           "inconsistent: %u vs %u free clusters "
++                           "%u in gd, %lu pa's",
++                           (long unsigned int)group, free, grp->bb_free,
++                           ext4_free_group_clusters(sb, gdp),
++                           grp->bb_prealloc_nr);
+               /*
+                * If we intend to continue, we consider group descriptor
+                * corrupt and update bb_free using bitmap value
+@@ -1588,7 +1592,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
+       int block;
+       int pnum;
+       int poff;
+-      struct folio *folio;
++      struct folio *folio = NULL;
+       int ret;
+       struct ext4_group_info *grp;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+@@ -1616,7 +1620,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
+                */
+               ret = ext4_mb_init_group(sb, group, gfp);
+               if (ret)
+-                      return ret;
++                      goto err;
+       }
+       /*
+@@ -1728,6 +1732,7 @@ err:
+       e4b->bd_buddy = NULL;
+       e4b->bd_bitmap = NULL;
++      ext4_warning(sb, "Error loading buddy information for %u", group);
+       return ret;
+ }
+@@ -5129,9 +5134,11 @@ int ext4_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap,
+       }
+       if (free != free_in_gdp) {
+-              ext4_error(sb, "on-disk bitmap for group %d"
++              ext4_warning(sb, "on-disk bitmap for group %d"
+                       "corrupted: %u blocks free in bitmap, %u - in gd\n",
+                       group, free, free_in_gdp);
++              ext4_mark_group_bitmap_corrupted(sb, group,
++                                      EXT4_GROUP_INFO_BBITMAP_CORRUPT);
+               return -EIO;
+       }
+       return 0;
+@@ -5547,16 +5554,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
+       /* "free < pa->pa_free" means we maybe double alloc the same blocks,
+        * otherwise maybe leave some free blocks unavailable, no need to BUG.*/
+       if ((free > pa->pa_free && !pa->pa_error) || (free < pa->pa_free)) {
+-              ext4_error(sb, "pa free mismatch: [pa %p] "
+-                              "[phy %lu] [logic %lu] [len %u] [free %u] "
+-                              "[error %u] [inode %d] [freed %u]", pa,
+-                              (unsigned long)pa->pa_pstart,
+-                              (unsigned long)pa->pa_lstart,
+-                              pa->pa_len, (unsigned)pa->pa_free,
+-                              (unsigned)pa->pa_error, pa->pa_inode->i_ino,
+-                              free);
+               ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
+-                                      free, pa->pa_free);
++                                    free, pa->pa_free);
+               /*
+                * pa is already deleted so we use the value obtained
+                * from the bitmap and continue.
+@@ -5619,16 +5618,11 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
+       bitmap_bh = ext4_read_block_bitmap(sb, group);
+       if (IS_ERR(bitmap_bh)) {
+               err = PTR_ERR(bitmap_bh);
+-              ext4_error_err(sb, -err,
+-                             "Error %d reading block bitmap for %u",
+-                             err, group);
+               goto out_dbg;
+       }
+       err = ext4_mb_load_buddy(sb, group, &e4b);
+       if (err) {
+-              ext4_warning(sb, "Error %d loading buddy information for %u",
+-                           err, group);
+               put_bh(bitmap_bh);
+               goto out_dbg;
+       }
+@@ -5788,17 +5782,12 @@ repeat:
+               err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
+                                            GFP_NOFS|__GFP_NOFAIL);
+-              if (err) {
+-                      ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
+-                                     err, group);
++              if (err)
+                       return;
+-              }
+               bitmap_bh = ext4_read_block_bitmap(sb, group);
+               if (IS_ERR(bitmap_bh)) {
+                       err = PTR_ERR(bitmap_bh);
+-                      ext4_error_err(sb, -err, "Error %d reading block bitmap for %u",
+-                                     err, group);
+                       ext4_mb_unload_buddy(&e4b);
+                       continue;
+               }
+@@ -6103,11 +6092,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
+               group = ext4_get_group_number(sb, pa->pa_pstart);
+               err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
+                                            GFP_NOFS|__GFP_NOFAIL);
+-              if (err) {
+-                      ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
+-                                     err, group);
++              if (err)
+                       continue;
+-              }
+               ext4_lock_group(sb, group);
+               list_del(&pa->pa_group_list);
+               ext4_get_group_info(sb, group)->bb_prealloc_nr--;
+@@ -6471,7 +6457,7 @@ errout:
+                        * been updated or not when fail case. So can
+                        * not revert pa_free back, just mark pa_error*/
+                       pa->pa_error++;
+-                      ext4_error(sb,
++                      ext4_warning(sb,
+                               "Updating bitmap error: [err %d] "
+                               "[pa %p] [phy %lu] [logic %lu] "
+                               "[len %u] [free %u] [error %u] "
+@@ -6482,6 +6468,7 @@ errout:
+                               (unsigned)pa->pa_free,
+                               (unsigned)pa->pa_error,
+                               pa->pa_inode ? pa->pa_inode->i_ino : 0);
++                      ext4_mark_group_bitmap_corrupted(sb, 0, 0);
+               }
+       }
+       ext4_mb_release_context(ac);
+@@ -6677,7 +6664,7 @@ do_more:
+       err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
+                                    GFP_NOFS|__GFP_NOFAIL);
+       if (err)
+-              goto error_out;
++              goto error_quiet;
+       if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+           !ext4_inode_block_valid(inode, block, count)) {
+@@ -6773,6 +6760,7 @@ error_clean:
+       ext4_mb_unload_buddy(&e4b);
+ error_out:
+       ext4_std_error(sb, err);
++error_quiet:
+ }
+ /**
+@@ -6918,7 +6906,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
+       err = ext4_mb_load_buddy(sb, block_group, &e4b);
+       if (err)
+-              goto error_out;
++              goto error_quiet;
+       if (!ext4_sb_block_valid(sb, NULL, block, count)) {
+               ext4_error(sb, "Adding blocks in system zones - "
+@@ -6947,6 +6935,7 @@ error_clean:
+       ext4_mb_unload_buddy(&e4b);
+ error_out:
+       ext4_std_error(sb, err);
++error_quiet:
+       return err;
+ }
+@@ -7095,8 +7084,6 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
+       ret = ext4_mb_load_buddy(sb, group, &e4b);
+       if (ret) {
+-              ext4_warning(sb, "Error %d loading buddy information for %u",
+-                           ret, group);
+               return ret;
+       }
+-- 
+2.34.1
+
diff --git a/ldiskfs/kernel_patches/patches/linux-6.10/ext4-delayed-iput.patch b/ldiskfs/kernel_patches/patches/linux-6.10/ext4-delayed-iput.patch
new file mode 100644 (file)
index 0000000..ad576c3
--- /dev/null
@@ -0,0 +1,185 @@
+commit e239a14001b62d96c186ae2c9f58402f73e63dcc
+Author:     Andrew Perepechko <andrew.perepechko@hpe.com>
+AuthorDate: Mon Jan 31 19:55:31 2022 +0300
+LU-15404 ldiskfs: truncate during setxattr leads to kernel panic
+
+When changing a large xattr value to a different large xattr value,
+the old xattr inode is freed. Truncate during the final iput causes
+current transaction restart. Eventually, parent inode bh is marked
+dirty and kernel panic happens when jbd2 figures out that this bh
+belongs to the committed transaction.
+
+A possible fix is to call this final iput in a separate thread.
+This way, setxattr transactions will never be split into two.
+Since the setxattr code adds xattr inodes with nlink=0 into the
+orphan list, old xattr inodes will be properly cleaned up in
+any case.
+
+Change-Id: Idd70befa6a83818ece06daccf9bb6256812674b9
+Signed-off-by: Andrew Perepechko <andrew.perepechko@hpe.com>
+HPE-bug-id: LUS-10534
+Reviewed-on: https://review.whamcloud.com/46358
+Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
+Reviewed-by: Alexander Zarochentsev <alexander.zarochentsev@hpe.com>
+---
+ fs/ext4/ext4.h    |  7 +++++--
+ fs/ext4/page-io.c |  2 +-
+ fs/ext4/super.c   | 15 ++++++++-------
+ fs/ext4/xattr.c   | 39 +++++++++++++++++++++++++++++++++++++--
+ 4 files changed, 51 insertions(+), 12 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index a3276ccc..25fb849f 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1665,8 +1665,11 @@ struct ext4_sb_info {
+       struct flex_groups * __rcu *s_flex_groups;
+       ext4_group_t s_flex_groups_allocated;
+-      /* workqueue for reserved extent conversions (buffered io) */
+-      struct workqueue_struct *rsv_conversion_wq;
++      /*
++       * workqueue for reserved extent conversions (buffered io)
++       * and large ea inodes reclaim
++       */
++      struct workqueue_struct *s_misc_wq;
+       /* timer for periodic error stats printing */
+       struct timer_list s_err_report;
+diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
+index ad554386..aea9ce61 100644
+--- a/fs/ext4/page-io.c
++++ b/fs/ext4/page-io.c
+@@ -229,7 +229,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
+       WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+       WARN_ON(!io_end->handle && sbi->s_journal);
+       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+-      wq = sbi->rsv_conversion_wq;
++      wq = sbi->s_misc_wq;
+       if (list_empty(&ei->i_rsv_conversion_list))
+               queue_work(wq, &ei->i_rsv_conversion_work);
+       list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index 56dfb89a..8465e403 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -1296,10 +1296,11 @@ static void ext4_put_super(struct super_block *sb)
+                        &sb->s_uuid);
+       ext4_unregister_li_request(sb);
++      flush_workqueue(sbi->s_misc_wq);
+       ext4_quotas_off(sb, EXT4_MAXQUOTAS);
+       flush_work(&sbi->s_sb_upd_work);
+-      destroy_workqueue(sbi->rsv_conversion_wq);
++      destroy_workqueue(sbi->s_misc_wq);
+       ext4_release_orphan_info(sb);
+       if (sbi->s_journal) {
+@@ -5443,9 +5444,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
+        * The maximum number of concurrent works can be high and
+        * concurrency isn't really necessary.  Limit it to 1.
+        */
+-      EXT4_SB(sb)->rsv_conversion_wq =
+-              alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+-      if (!EXT4_SB(sb)->rsv_conversion_wq) {
++      EXT4_SB(sb)->s_misc_wq =
++              alloc_workqueue("ext4-misc", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
++      if (!EXT4_SB(sb)->s_misc_wq) {
+               printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
+               err = -ENOMEM;
+               goto failed_mount4;
+@@ -5618,8 +5619,8 @@ failed_mount4a:
+       sb->s_root = NULL;
+ failed_mount4:
+       ext4_msg(sb, KERN_ERR, "mount failed");
+-      if (EXT4_SB(sb)->rsv_conversion_wq)
+-              destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
++      if (EXT4_SB(sb)->s_misc_wq)
++              destroy_workqueue(EXT4_SB(sb)->s_misc_wq);
+ failed_mount_wq:
+       ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
+       sbi->s_ea_inode_cache = NULL;
+@@ -6306,7 +6307,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
+               return 0;
+       trace_ext4_sync_fs(sb, wait);
+-      flush_workqueue(sbi->rsv_conversion_wq);
++      flush_workqueue(sbi->s_misc_wq);
+       /*
+        * Writeback quota in non-journalled quota case - journalled quota has
+        * no dirty dquots
+diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
+index 7dcb257b..7e6da3a6 100644
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -1658,6 +1658,36 @@ out_err:
+       return ERR_PTR(err);
+ }
++struct delayed_iput_work {
++      struct work_struct work;
++      struct inode *inode;
++};
++
++static void delayed_iput_fn(struct work_struct *work)
++{
++      struct delayed_iput_work *diwork;
++
++      diwork = container_of(work, struct delayed_iput_work, work);
++      iput(diwork->inode);
++      kfree(diwork);
++}
++
++static void delayed_iput(struct inode *inode, struct delayed_iput_work *work)
++{
++      if (!inode) {
++              kfree(work);
++              return;
++      }
++
++      if (!work) {
++              iput(inode);
++      } else {
++              INIT_WORK(&work->work, delayed_iput_fn);
++              work->inode = inode;
++              queue_work(EXT4_SB(inode->i_sb)->s_misc_wq, &work->work);
++      }
++}
++
+ /*
+  * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode
+  * feature is enabled.
+@@ -1675,6 +1705,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
+       size_t min_offs = s->end - s->base, name_len = strlen(i->name);
+       int in_inode = i->in_inode;
+       struct inode *old_ea_inode = NULL;
++      struct delayed_iput_work *diwork = NULL;
+       size_t old_size, new_size;
+       int ret;
+@@ -1751,7 +1782,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
+        * Finish that work before doing any modifications to the xattr data.
+        */
+       if (!s->not_found && here->e_value_inum) {
+-              ret = ext4_xattr_inode_iget(inode,
++              diwork = kmalloc(sizeof(*diwork), GFP_NOFS);
++              if (!diwork)
++                      ret = -ENOMEM;
++              else
++                      ret = ext4_xattr_inode_iget(inode,
+                                           le32_to_cpu(here->e_value_inum),
+                                           le32_to_cpu(here->e_hash),
+                                           &old_ea_inode);
+@@ -1886,7 +1921,7 @@ update_hash:
+       ret = 0;
+ out:
+-      iput(old_ea_inode);
++      delayed_iput(old_ea_inode, diwork);
+       return ret;
+ }
+-- 
+2.34.1
+
diff --git a/ldiskfs/kernel_patches/patches/linux-6.10/ext4-filename-encode.patch b/ldiskfs/kernel_patches/patches/linux-6.10/ext4-filename-encode.patch
new file mode 100644 (file)
index 0000000..dda4847
--- /dev/null
@@ -0,0 +1,474 @@
+From d0a722cb8fb886380e24e8261e8efca09a3262d6 Mon Sep 17 00:00:00 2001
+From: Sebastien Buisson <sbuisson@ddn.com>
+Date: Tue, 20 Dec 2022 15:40:52 +0100
+Subject: [PATCH] LU-16374 ldiskfs: implement security.encdata xattr
+
+security.encdata is a virtual xattr containing information related
+to encrypted files. It is expressed as ASCII text with a "key: value"
+format, and space as field separator. For instance:
+
+   { encoding: base64url, size: 3012, enc_ctx: YWJjZGVmZ2hpamtsbW
+   5vcHFyc3R1dnd4eXphYmNkZWZnaGlqa2xtbg, enc_name: ZmlsZXdpdGh2ZX
+   J5bG9uZ25hbWVmaWxld2l0aHZlcnlsb25nbmFtZWZpbGV3aXRodmVyeWxvbmdu
+   YW1lZmlsZXdpdGg }
+
+'encoding' is the encoding method used for binary data, assume name
+can be up to 255 chars.
+'size' is the clear text file data length in bytes.
+'enc_ctx' is encoded encryption context, 40 bytes for v2.
+'enc_name' is encoded encrypted name, 256 bytes max.
+So overall, this xattr is at most 727 chars plus the terminating '\0'.
+
+On get, the value of the security.encdata xattr is computed from
+encrypted file's information.
+On set, encrypted file's information is restored from xattr value.
+The encrypted name is stored temporarily in a dedicated xattr
+LDISKFS_XATTR_NAME_RAWENCNAME, that will be used to set correct name
+at linkat.
+
+Signed-off-by: Sebastien Buisson <sbuisson@ddn.com>
+Change-Id: Ia318c39d403b1c448e71bcd5b29862d022d05d0a
+Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49456
+Tested-by: jenkins <devops@whamcloud.com>
+Tested-by: Maloo <maloo@whamcloud.com>
+Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
+Reviewed-by: Li Dongyang <dongyangli@ddn.com>
+Reviewed-by: Oleg Drokin <green@whamcloud.com>
+---
+ fs/ext4/critical_encode.h | 170 ++++++++++++++++++++++++++++++++++++++
+ fs/ext4/dir.c             |  33 +++++---
+ fs/ext4/ialloc.c          |   1 +
+ fs/ext4/namei.c           |  56 +++++++++----
+ 4 files changed, 235 insertions(+), 25 deletions(-)
+ create mode 100644 fs/ext4/critical_encode.h
+
+diff --git a/fs/ext4/critical_encode.h b/fs/ext4/critical_encode.h
+new file mode 100644
+index 00000000..f75aedab
+--- /dev/null
++++ b/fs/ext4/critical_encode.h
+@@ -0,0 +1,170 @@
++/*
++ *  critical_encode.h
++ *
++ *  Copyright (c) 2022 Whamcloud
++ */
++
++#ifndef _CRITICAL_ENCODE_H
++#define _CRITICAL_ENCODE_H
++
++#include <linux/ctype.h>
++
++/* Encoding/decoding routines inspired from yEnc principles.
++ * We just take care of a few critical characters:
++ * NULL, LF, CR, /, DEL and =.
++ * If such a char is found, it is replaced with '=' followed by
++ * the char value + 64.
++ * All other chars are left untouched.
++ * Efficiency of this encoding depends on the occurrences of the
++ * critical chars, but statistically on binary data it can be much higher
++ * than base64 for instance.
++ */
++static inline int critical_encode(const u8 *src, int len, char *dst)
++{
++      u8 *p = (u8 *)src, *q = dst;
++
++      while (p - src < len) {
++              /* escape NULL, LF, CR, /, DEL and = */
++              if (unlikely(*p == 0x0 || *p == 0xA || *p == 0xD ||
++                           *p == '/' || *p == 0x7F || *p == '=')) {
++                      *(q++) = '=';
++                      *(q++) = *(p++) + 64;
++              } else {
++                      *(q++) = *(p++);
++              }
++      }
++
++      return (char *)q - dst;
++}
++
++/* returns the number of chars encoding would produce */
++static inline int critical_chars(const u8 *src, int len)
++{
++      u8 *p = (u8 *)src;
++      int newlen = len;
++
++      while (p - src < len) {
++              /* NULL, LF, CR, /, DEL and = cost an additional '=' */
++              if (unlikely(*p == 0x0 || *p == 0xA || *p == 0xD ||
++                           *p == '/' || *p == 0x7F || *p == '='))
++                      newlen++;
++              p++;
++      }
++
++      return newlen;
++}
++
++/* decoding routine - returns the number of chars in output */
++static inline int critical_decode(const u8 *src, int len, char *dst)
++{
++      u8 *p = (u8 *)src, *q = dst;
++
++      while (p - src < len) {
++              if (unlikely(*p == '=')) {
++                      *(q++) = *(++p) - 64;
++                      p++;
++              } else {
++                      *(q++) = *(p++);
++              }
++      }
++
++      return (char *)q - dst;
++}
++
++#define fscrypt_get_encryption_info(inode) \
++      (unlikely(!IS_LUSTRE_MOUNT(inode->i_sb)) ? 0 : -EOPNOTSUPP)
++
++static inline int ext4_has_permitted_context(struct inode *parent,
++                                      struct inode *child)
++{
++      if (unlikely(!IS_LUSTRE_MOUNT(parent->i_sb)))
++              return 1;
++      return fscrypt_has_permitted_context(parent, child);
++}
++
++struct ext4_filename;
++
++static inline int ext4_prepare_readdir(struct inode *dir)
++{
++      if (unlikely(!IS_LUSTRE_MOUNT(dir->i_sb)))
++              return 0;
++      return fscrypt_prepare_readdir(dir);
++}
++
++static inline int ext4_fname_alloc_buffer(const struct inode *inode,
++                                           u32 max_encrypted_len,
++                                           struct fscrypt_str *crypto_str)
++{
++      crypto_str->name = kmalloc(max_encrypted_len + 1, GFP_NOFS);
++      if (!crypto_str->name)
++              return -ENOMEM;
++      crypto_str->len = max_encrypted_len;
++      return 0;
++}
++
++static inline void ext4_fname_free_buffer(struct fscrypt_str *crypto_str)
++{
++      if (!crypto_str)
++              return;
++      kfree(crypto_str->name);
++      crypto_str->name = NULL;
++}
++
++static inline int ext4_fname_disk_to_usr(struct inode *inode,
++                                          u32 hash, u32 minor_hash,
++                                          const struct fscrypt_str *iname,
++                                          struct fscrypt_str *oname)
++{
++      int presented_len;
++
++      presented_len = critical_encode(iname->name, iname->len, oname->name);
++      if (presented_len > NAME_MAX) {
++              /* truncate at NAME_MAX,
++               * or NAME_MAX-1 if name ends with '=' to avoid decoding issue
++               */
++              presented_len = NAME_MAX;
++              if (oname->name[presented_len - 1] == '=')
++                      presented_len--;
++              oname->len = presented_len;
++      }
++      oname->name[presented_len] = '\0';
++
++      return 0;
++}
++
++static inline int ext4_setup_filename(struct inode *dir,
++                                       const struct qstr *iname,
++                                       int lookup,
++                                       struct ext4_filename *fname)
++{
++      fname->usr_fname = iname;
++
++      if (lookup && IS_ENCRYPTED(dir) &&
++          unlikely(!IS_LUSTRE_MOUNT(dir->i_sb) &&
++                   strnchr(iname->name, iname->len, '='))) {
++              /* Only proceed to critical decode if
++               * iname contains escape char '='.
++               */
++              int len = iname->len;
++              char *buf;
++
++              buf = kmalloc(len, GFP_NOFS);
++              if (!buf)
++                      return -ENOMEM;
++
++              len = critical_decode(iname->name, len, buf);
++              fname->disk_name.name = (unsigned char *)buf;
++              fname->disk_name.len = len;
++              return 0;
++      }
++
++      fname->disk_name.name = (unsigned char *) iname->name;
++      fname->disk_name.len = iname->len;
++
++#ifdef CONFIG_UNICODE
++      ext4_fname_setup_ci_filename(dir, iname, fname);
++#endif
++      return 0;
++}
++
++#endif /* _CRITICAL_ENCODE_H */
+diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
+index ae110d71..849b1e0e 100644
+--- a/fs/ext4/dir.c
++++ b/fs/ext4/dir.c
+@@ -29,6 +29,7 @@
+ #include <linux/unicode.h>
+ #include "ext4.h"
+ #include "xattr.h"
++#include "critical_encode.h"
+ static int ext4_dx_readdir(struct file *, struct dir_context *);
+@@ -134,7 +135,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
+       struct buffer_head *bh = NULL;
+       struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
+-      err = fscrypt_prepare_readdir(inode);
++      err = ext4_prepare_readdir(inode);
+       if (err)
+               return err;
+@@ -161,7 +162,8 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
+                       return err;
+       }
+-      if (IS_ENCRYPTED(inode)) {
++      /* disable decryption of filename, present only escaped name */
++      if (0 && IS_ENCRYPTED(inode)) {
+               err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr);
+               if (err < 0)
+                       return err;
+@@ -275,24 +277,33 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
+                                           get_dtype(sb, de->file_type)))
+                                               goto done;
+                               } else {
+-                                      int save_len = fstr.len;
+                                       struct fscrypt_str de_name =
+                                                       FSTR_INIT(de->name,
+                                                               de->name_len);
++                                      int presented_len;
+                                       /* Directory is encrypted */
+-                                      err = fscrypt_fname_disk_to_usr(inode,
+-                                              EXT4_DIRENT_HASH(de),
+-                                              EXT4_DIRENT_MINOR_HASH(de),
+-                                              &de_name, &fstr);
+-                                      de_name = fstr;
+-                                      fstr.len = save_len;
++                                      presented_len = critical_chars(de->name,
++                                                                de->name_len);
++                                      err = ext4_fname_alloc_buffer(inode,
++                                                                presented_len,
++                                                                &fstr);
+                                       if (err)
+                                               goto errout;
+-                                      if (!dir_emit(ctx,
++
++                                      err = ext4_fname_disk_to_usr(inode,
++                                              0, 0, &de_name, &fstr);
++                                      de_name = fstr;
++                                      if (err) {
++                                              ext4_fname_free_buffer(&fstr);
++                                              goto errout;
++                                      }
++                                      err = dir_emit(ctx,
+                                           de_name.name, de_name.len,
+                                           le32_to_cpu(de->inode),
+-                                          get_dtype(sb, de->file_type)))
++                                          get_dtype(sb, de->file_type));
++                                      ext4_fname_free_buffer(&fstr);
++                                      if (!err)
+                                               goto done;
+                               }
+                       }
+diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
+index fc1f09fe..b12a4324 100644
+--- a/fs/ext4/ialloc.c
++++ b/fs/ext4/ialloc.c
+@@ -30,6 +30,7 @@
+ #include "ext4_jbd2.h"
+ #include "xattr.h"
+ #include "acl.h"
++#include "critical_encode.h"
+ #include <trace/events/ext4.h>
+diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
+index 24b07d9d..1986ab5a 100644
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -41,6 +41,7 @@
+ #include "xattr.h"
+ #include "acl.h"
++#include "critical_encode.h"
+ #include <trace/events/ext4.h>
+ /*
+@@ -1441,7 +1442,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
+                                          ext4_dir_rec_len(0,
+                                                          csum ? NULL : dir));
+       /* Check if the directory is encrypted */
+-      if (IS_ENCRYPTED(dir)) {
++      if (0 && IS_ENCRYPTED(dir)) {
+               err = fscrypt_prepare_readdir(dir);
+               if (err < 0) {
+                       brelse(bh);
+@@ -1492,22 +1493,31 @@ static int htree_dirblock_to_tree(struct file *dir_file,
+                                  hinfo->hash, hinfo->minor_hash, de,
+                                  &tmp_str);
+               } else {
+-                      int save_len = fname_crypto_str.len;
+                       struct fscrypt_str de_name = FSTR_INIT(de->name,
+                                                               de->name_len);
++                      int presented_len;
+                       /* Directory is encrypted */
+-                      err = fscrypt_fname_disk_to_usr(dir, hinfo->hash,
++                      presented_len = critical_chars(de->name, de->name_len);
++                      err = ext4_fname_alloc_buffer(dir, presented_len,
++                                                       &fname_crypto_str);
++                      if (err) {
++                              count = err;
++                              goto errout;
++                      }
++
++                      err = ext4_fname_disk_to_usr(dir, hinfo->hash,
+                                       hinfo->minor_hash, &de_name,
+                                       &fname_crypto_str);
+                       if (err) {
++                              ext4_fname_free_buffer(&fname_crypto_str);
+                               count = err;
+                               goto errout;
+                       }
+                       err = ext4_htree_store_dirent(dir_file,
+                                  hinfo->hash, hinfo->minor_hash, de,
+                                       &fname_crypto_str);
+-                      fname_crypto_str.len = save_len;
++                      ext4_fname_free_buffer(&fname_crypto_str);
+               }
+               if (err != 0) {
+                       count = err;
+@@ -1837,7 +1847,7 @@ int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
+  */
+ static bool ext4_match(struct inode *parent,
+                             const struct ext4_filename *fname,
+-                            struct ext4_dir_entry_2 *de)
++                            struct ext4_dir_entry_2 *de, int denamelen)
+ {
+       struct fscrypt_name f;
+@@ -1872,7 +1882,7 @@ static bool ext4_match(struct inode *parent,
+       }
+ #endif
+-      return fscrypt_match_name(&f, de->name, de->name_len);
++      return fscrypt_match_name(&f, de->name, denamelen);
+ }
+ /*
+@@ -1883,16 +1893,30 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
+                   unsigned int offset, struct ext4_dir_entry_2 **res_dir)
+ {
+       struct ext4_dir_entry_2 * de;
++      bool probablytrunc;
+       char * dlimit;
+-      int de_len;
++      int de_len, denamelen;
+       de = (struct ext4_dir_entry_2 *)search_buf;
+       dlimit = search_buf + buf_size;
++      /* fname is probably truncated if it is the decoded representation of
++       * an encrypted filename not aligned on a 32-byte boundary
++       */
++      probablytrunc = !IS_LUSTRE_MOUNT(dir->i_sb) && IS_ENCRYPTED(dir) &&
++              fname->disk_name.len & 31;
+       while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) {
+               /* this code is executed quadratically often */
+               /* do minimal checking `by hand' */
++              denamelen = de->name_len;
++              if (unlikely(probablytrunc) &&
++                  de->name_len > fname->disk_name.len)
++                      /* Adjust name len to look for a partial match.
++                       * Since it is binary encrypted names, there
++                       * should not be any collision between names.
++                       */
++                      denamelen = fname->disk_name.len;
+               if (de->name + de->name_len <= dlimit &&
+-                  ext4_match(dir, fname, de)) {
++                  ext4_match(dir, fname, de, denamelen)) {
+                       /* found a match - just to be sure, do
+                        * a full check */
+                       if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf,
+@@ -2093,7 +2117,7 @@ struct buffer_head *ext4_find_entry_locked(struct inode *dir,
+       struct ext4_filename fname;
+       struct buffer_head *bh;
+-      err = ext4_fname_setup_filename(dir, d_name, 1, &fname);
++      err = ext4_setup_filename(dir, d_name, 1, &fname);
+       if (err == -ENOENT)
+               return NULL;
+       if (err)
+@@ -2101,7 +2125,9 @@ struct buffer_head *ext4_find_entry_locked(struct inode *dir,
+       bh = __ext4_find_entry(dir, &fname, res_dir, inlined, lck);
+-      ext4_fname_free_filename(&fname);
++      if (fname.disk_name.name != d_name->name)
++              kfree(fname.disk_name.name);
++
+       return bh;
+ }
+@@ -2115,7 +2141,7 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir,
+       struct ext4_filename fname;
+       struct buffer_head *bh;
+-      err = ext4_fname_prepare_lookup(dir, dentry, &fname);
++      err = ext4_setup_filename(dir, &dentry->d_name, 1, &fname);
+       if (err == -ENOENT)
+               return NULL;
+       if (err)
+@@ -2123,7 +2149,9 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir,
+       bh = __ext4_find_entry(dir, &fname, res_dir, NULL, NULL);
+-      ext4_fname_free_filename(&fname);
++      if (fname.disk_name.name != dentry->d_name.name)
++              kfree(fname.disk_name.name);
++
+       return bh;
+ }
+@@ -2215,7 +2243,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
+               }
+               if (!IS_ERR(inode) && IS_ENCRYPTED(dir) &&
+                   (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
+-                  !fscrypt_has_permitted_context(dir, inode)) {
++                  !ext4_has_permitted_context(dir, inode)) {
+                       ext4_warning(inode->i_sb,
+                                    "Inconsistent encryption contexts: %lu/%lu",
+                                    dir->i_ino, inode->i_ino);
+@@ -2508,7 +2536,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
+               if (ext4_check_dir_entry(dir, NULL, de, bh,
+                                        buf, buf_size, offset))
+                       return -EFSCORRUPTED;
+-              if (ext4_match(dir, fname, de))
++              if (ext4_match(dir, fname, de, de->name_len))
+                       return -EEXIST;
+               nlen = EXT4_DIR_ENTRY_LEN(de, dir);
+               rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+-- 
+2.34.1
+
diff --git a/ldiskfs/kernel_patches/patches/linux-6.10/ext4-max-dir-size.patch b/ldiskfs/kernel_patches/patches/linux-6.10/ext4-max-dir-size.patch
new file mode 100644 (file)
index 0000000..bd46b0e
--- /dev/null
@@ -0,0 +1,50 @@
+Add a sysfs interface for max_dir_size (in bytes), backed by s_max_dir_size_kb.
+
+---
+ fs/ext4/sysfs.c | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
+index 2aeff069..17c391ec 100644
+--- a/fs/ext4/sysfs.c
++++ b/fs/ext4/sysfs.c
+@@ -218,6 +218,8 @@ EXT4_ATTR_OFFSET(mb_group_prealloc, 0644, clusters_in_group,
+ EXT4_ATTR_OFFSET(mb_best_avail_max_trim_order, 0644, mb_order,
+                ext4_sb_info, s_mb_best_avail_max_trim_order);
+ EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
++EXT4_RW_ATTR_SBI_UI(max_dir_size, s_max_dir_size_kb);
++EXT4_RW_ATTR_SBI_UI(max_dir_size_kb, s_max_dir_size_kb);
+ EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
+ EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
+ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
+@@ -267,6 +269,8 @@ static struct attribute *ext4_attrs[] = {
+       ATTR_LIST(sra_exceeded_retry_limit),
+       ATTR_LIST(inode_readahead_blks),
+       ATTR_LIST(inode_goal),
++      ATTR_LIST(max_dir_size),
++      ATTR_LIST(max_dir_size_kb),
+       ATTR_LIST(mb_stats),
+       ATTR_LIST(mb_max_to_scan),
+       ATTR_LIST(mb_min_to_scan),
+@@ -392,6 +396,9 @@ static ssize_t ext4_generic_attr_show(struct ext4_attr *a,
+       case attr_pointer_ui:
+               if (a->attr_ptr == ptr_ext4_super_block_offset)
+                       return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr));
++              if (strcmp("max_dir_size", a->attr.name) == 0)
++                      return sysfs_emit(buf, "%u\n",
++                                        (*((unsigned int *) ptr)) << 10);
+               return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr));
+       case attr_pointer_ul:
+               return sysfs_emit(buf, "%lu\n", *((unsigned long *) ptr));
+@@ -471,6 +478,8 @@ static ssize_t ext4_generic_attr_store(struct ext4_attr *a,
+               ret = kstrtouint(skip_spaces(buf), 0, &t);
+               if (ret)
+                       return ret;
++              if (strcmp("max_dir_size", a->attr.name) == 0)
++                      t >>= 10;
+               if (a->attr_ptr == ptr_ext4_super_block_offset)
+                       *((__le32 *) ptr) = cpu_to_le32(t);
+               else
+-- 
+2.34.1
+
diff --git a/ldiskfs/kernel_patches/patches/linux-6.10/ext4-mballoc-extra-checks.patch b/ldiskfs/kernel_patches/patches/linux-6.10/ext4-mballoc-extra-checks.patch
new file mode 100644 (file)
index 0000000..e957f37
--- /dev/null
@@ -0,0 +1,320 @@
+commit f2f28f1d09c0a00b3fc569422f881931d857fac9
+Author:     Alex Zhuravlev <alex.zhuravlev@sun.com>
+AuthorDate: Tue Oct 28 17:59:09 2008 +0000
+Subject: ext4: detect on-disk corruption of block bitmap
+Detect on-disk corruption of block bitmap and better checking of
+preallocated blocks.
+Bugzilla-ID: b=16680
+Signed-off-by: Alex Zhuravlev <alex.zhuravlev@sun.com>
+Reviewed-by: Kalpak Shah <kalpak.shah@sun.com>
+Signed-off-by: Andreas Dilger <andreas.dilger@sun.com>
+---
+ fs/ext4/ext4.h    |   1 +
+ fs/ext4/mballoc.c | 105 ++++++++++++++++++++++++++++++++++++++++------
+ fs/ext4/mballoc.h |   2 +-
+ 3 files changed, 94 insertions(+), 14 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 95bbfd52..860680e4 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -3449,6 +3449,7 @@ struct ext4_group_info {
+       ext4_grpblk_t   bb_largest_free_order;/* order of largest frag in BG */
+       ext4_group_t    bb_group;       /* Group number */
+       struct          list_head bb_prealloc_list;
++      unsigned long   bb_prealloc_nr;
+ #ifdef DOUBLE_CHECK
+       void            *bb_bitmap;
+ #endif
+diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
+index e64b31e5..838f5303 100644
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -416,7 +416,7 @@ static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
+       "ext4_groupinfo_64k", "ext4_groupinfo_128k"
+ };
+-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
++static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+                                       ext4_group_t group);
+ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
+@@ -1181,7 +1181,7 @@ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
+ }
+ static noinline_for_stack
+-void ext4_mb_generate_buddy(struct super_block *sb,
++int ext4_mb_generate_buddy(struct super_block *sb,
+                           void *buddy, void *bitmap, ext4_group_t group,
+                           struct ext4_group_info *grp)
+ {
+@@ -1225,6 +1225,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
+               grp->bb_free = free;
+               ext4_mark_group_bitmap_corrupted(sb, group,
+                                       EXT4_GROUP_INFO_BBITMAP_CORRUPT);
++              return -EIO;
+       }
+       mb_set_largest_free_order(sb, grp);
+       mb_update_avg_fragment_size(sb, grp);
+@@ -1234,6 +1235,8 @@ void ext4_mb_generate_buddy(struct super_block *sb,
+       period = get_cycles() - period;
+       atomic_inc(&sbi->s_mb_buddies_generated);
+       atomic64_add(period, &sbi->s_mb_generation_time);
++
++      return 0;
+ }
+ static void mb_regenerate_buddy(struct ext4_buddy *e4b)
+@@ -1355,7 +1358,7 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
+       }
+       first_block = folio->index * blocks_per_page;
+-      for (i = 0; i < blocks_per_page; i++) {
++      for (i = 0; i < blocks_per_page && err == 0; i++) {
+               group = (first_block + i) >> 1;
+               if (group >= ngroups)
+                       break;
+@@ -1403,7 +1406,7 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
+                       ext4_lock_group(sb, group);
+                       /* init the buddy */
+                       memset(data, 0xff, blocksize);
+-                      ext4_mb_generate_buddy(sb, data, incore, group, grinfo);
++                      err = ext4_mb_generate_buddy(sb, data, incore, group, grinfo);
+                       ext4_unlock_group(sb, group);
+                       incore = NULL;
+               } else {
+@@ -1418,7 +1421,7 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
+                       memcpy(data, bitmap, blocksize);
+                       /* mark all preallocated blks used in in-core bitmap */
+-                      ext4_mb_generate_from_pa(sb, data, group);
++                      err = ext4_mb_generate_from_pa(sb, data, group);
+                       WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root));
+                       ext4_unlock_group(sb, group);
+@@ -1428,7 +1431,8 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
+                       incore = data;
+               }
+       }
+-      folio_mark_uptodate(folio);
++      if (likely(err == 0))
++              folio_mark_uptodate(folio);
+ out:
+       if (bh) {
+@@ -3028,8 +3032,10 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
+ {
+       struct super_block *sb = pde_data(file_inode(seq->file));
+       ext4_group_t group = (ext4_group_t) ((unsigned long) v);
++      struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
+       int i, err;
+       char nbuf[16];
++      int free = 0;
+       struct ext4_buddy e4b;
+       struct ext4_group_info *grinfo;
+       unsigned char blocksize_bits = min_t(unsigned char,
+@@ -3040,9 +3046,12 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
+               ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
+       } sg;
++      if (gdp)
++              free = ext4_free_group_clusters(sb, gdp);
++
+       group--;
+       if (group == 0)
+-              seq_puts(seq, "#group: free  frags first ["
++              seq_puts(seq, "#group: bfree gfree frags first pa    ["
+                             " 2^0   2^1   2^2   2^3   2^4   2^5   2^6  "
+                             " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]\n");
+@@ -3067,8 +3076,10 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
+        * these are safe to access even after the buddy has been unloaded
+        */
+       memcpy(&sg, grinfo, i);
+-      seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
+-                      sg.info.bb_fragments, sg.info.bb_first_free);
++      seq_printf(seq, "#%-5lu: %-5u %-5u %-5u %-5u %-5lu [",
++                      (long unsigned int)group, sg.info.bb_free, free,
++                      sg.info.bb_fragments, sg.info.bb_first_free,
++                      sg.info.bb_prealloc_nr);
+       for (i = 0; i <= 13; i++)
+               seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
+                               sg.info.bb_counters[i] : 0);
+@@ -5088,25 +5099,75 @@ try_group_pa:
+       return false;
+ }
++/*
++ * Check that the number of zero (free) bits in the bitmap matches the free
++ * cluster count in the group descriptor. Do this before taking preallocated
++ * blocks into account to be able to detect on-disk corruption. The group
++ * lock must be held by the caller.
++ *
++ * Returns 0 when the counts agree, or when the descriptor reports zero free
++ * clusters and has EXT4_BG_BLOCK_UNINIT set; -EIO on a mismatch.
++ */
++static
++int ext4_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap,
++                              struct ext4_group_desc *gdp, int group)
++{
++      /* unsigned int: cluster counts must not be truncated to 16 bits */
++      unsigned int max = EXT4_CLUSTERS_PER_GROUP(sb);
++      unsigned int i, first, free = 0;
++      unsigned int free_in_gdp = ext4_free_group_clusters(sb, gdp);
++
++      if (free_in_gdp == 0 && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
++              return 0;
++
++      i = mb_find_next_zero_bit(bitmap, max, 0);
++
++      /* add up the length of every run of zero bits below max */
++      while (i < max) {
++              first = i;
++              i = mb_find_next_bit(bitmap, max, i);
++              if (i > max)
++                      i = max;
++              free += i - first;
++              if (i < max)
++                      i = mb_find_next_zero_bit(bitmap, max, i);
++      }
++
++      if (free != free_in_gdp) {
++              ext4_error(sb,
++                      "on-disk bitmap for group %d corrupted: %u blocks free in bitmap, %u - in gd\n",
++                      group, free, free_in_gdp);
++              return -EIO;
++      }
++      return 0;
++}
++
+ /*
+  * the function goes through all preallocation in this group and marks them
+  * used in in-core bitmap. buddy must be generated from this bitmap
+  * Need to be called with ext4 group lock held
+  */
+ static noinline_for_stack
+-void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
++int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+                                       ext4_group_t group)
+ {
+       struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+       struct ext4_prealloc_space *pa;
++      struct ext4_group_desc *gdp;
+       struct list_head *cur;
+       ext4_group_t groupnr;
+       ext4_grpblk_t start;
+       int preallocated = 0;
++      int skip = 0, count = 0;
++      int err;
+       int len;
+       if (!grp)
+-              return;
++              return -EIO;
++
++      gdp = ext4_get_group_desc(sb, group, NULL);
++      if (gdp == NULL)
++              return -EIO;
++
++      /* before applying preallocations, check bitmap consistency */
++      err = ext4_mb_check_ondisk_bitmap(sb, bitmap, gdp, group);
++      if (err)
++              return err;
+       /* all form of preallocation discards first load group,
+        * so the only competing code is preallocation use.
+@@ -5123,13 +5184,23 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+                                            &groupnr, &start);
+               len = pa->pa_len;
+               spin_unlock(&pa->pa_lock);
+-              if (unlikely(len == 0))
++              if (unlikely(len == 0)) {
++                      skip++;
+                       continue;
++              }
+               BUG_ON(groupnr != group);
+               mb_set_bits(bitmap, start, len);
+               preallocated += len;
++              count++;
++      }
++      if (count + skip != grp->bb_prealloc_nr) {
++              ext4_error(sb, "lost preallocations: "
++                         "count %d, bb_prealloc_nr %lu, skip %d\n",
++                         count, grp->bb_prealloc_nr, skip);
++              return -EIO;
+       }
+       mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
++      return 0;
+ }
+ static void ext4_mb_mark_pa_deleted(struct super_block *sb,
+@@ -5220,6 +5291,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
+        */
+       ext4_lock_group(sb, grp);
+       list_del(&pa->pa_group_list);
++      ext4_get_group_info(sb, grp)->bb_prealloc_nr--;
+       ext4_unlock_group(sb, grp);
+       if (pa->pa_type == MB_INODE_PA) {
+@@ -5353,6 +5425,7 @@ adjust_bex:
+       pa->pa_inode = ac->ac_inode;
+       list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
++      grp->bb_prealloc_nr++;
+       write_lock(pa->pa_node_lock.inode_lock);
+       ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node);
+@@ -5406,6 +5479,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
+       pa->pa_inode = NULL;
+       list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
++      grp->bb_prealloc_nr++;
+       /*
+        * We will later add the new pa to the right bucket
+@@ -5572,6 +5646,8 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
+               spin_unlock(&pa->pa_lock);
++              BUG_ON(grp->bb_prealloc_nr == 0);
++              grp->bb_prealloc_nr--;
+               list_del(&pa->pa_group_list);
+               list_add(&pa->u.pa_tmp_list, &list);
+       }
+@@ -5703,7 +5779,7 @@ repeat:
+               if (err) {
+                       ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
+                                      err, group);
+-                      continue;
++                      return;
+               }
+               bitmap_bh = ext4_read_block_bitmap(sb, group);
+@@ -5716,6 +5792,8 @@ repeat:
+               }
+               ext4_lock_group(sb, group);
++              BUG_ON(e4b.bd_info->bb_prealloc_nr == 0);
++              e4b.bd_info->bb_prealloc_nr--;
+               list_del(&pa->pa_group_list);
+               ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
+               ext4_unlock_group(sb, group);
+@@ -6020,6 +6098,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
+               }
+               ext4_lock_group(sb, group);
+               list_del(&pa->pa_group_list);
++              ext4_get_group_info(sb, group)->bb_prealloc_nr--;
+               ext4_mb_release_group_pa(&e4b, pa);
+               ext4_unlock_group(sb, group);
+diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
+index d8553f14..fec1b8c2 100644
+--- a/fs/ext4/mballoc.h
++++ b/fs/ext4/mballoc.h
+@@ -66,7 +66,7 @@
+ /*
+  * for which requests use 2^N search using buddies
+  */
+-#define MB_DEFAULT_ORDER2_REQS                2
++#define MB_DEFAULT_ORDER2_REQS                8
+ /*
+  * default group prealloc size 512 blocks
+-- 
+2.34.1
+
diff --git a/ldiskfs/kernel_patches/patches/linux-6.10/ext4-misc.patch b/ldiskfs/kernel_patches/patches/linux-6.10/ext4-misc.patch
new file mode 100644 (file)
index 0000000..a81589e
--- /dev/null
@@ -0,0 +1,208 @@
+commit b175e2441b0cd9fae60341ba92b0f7f192e71446
+Author: girish <girish>
+
+b=16893
+i=adilger
+i=johann
+
+ext4 ldiskfs patches for rhel5
+
+ported to linux 6.10
+---
+ fs/ext4/ext4.h   | 23 ++++++++++++++++++++++-
+ fs/ext4/ialloc.c |  3 ++-
+ fs/ext4/inode.c  | 17 +++++++++++++++++
+ fs/ext4/namei.c  |  9 ++++++---
+ fs/ext4/super.c  |  6 ------
+ fs/ext4/xattr.c  |  2 ++
+ 6 files changed, 49 insertions(+), 11 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index b5827d01..95bbfd52 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -2225,7 +2225,21 @@ static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_bl
+ EXTN_FEATURE_FUNCS(2)
+ EXTN_FEATURE_FUNCS(3)
+-EXTN_FEATURE_FUNCS(4)
++/* True if s_feature_compat has any bit outside EXT4_FEATURE_COMPAT_SUPP. */
++static inline bool ext4_has_unknown_ext4_compat_features(struct super_block *sb)
++{
++      struct ext4_super_block *es = EXT4_SB(sb)->s_es;
++
++      return (es->s_feature_compat &
++              cpu_to_le32(~EXT4_FEATURE_COMPAT_SUPP)) != 0;
++}
++/* True if s_feature_ro_compat has any bit outside EXT4_FEATURE_RO_COMPAT_SUPP. */
++static inline bool ext4_has_unknown_ext4_ro_compat_features(struct super_block *sb)
++{
++      struct ext4_super_block *es = EXT4_SB(sb)->s_es;
++
++      return (es->s_feature_ro_compat &
++              cpu_to_le32(~EXT4_FEATURE_RO_COMPAT_SUPP)) != 0;
++}
++/* True if s_feature_incompat has any bit outside EXT4_FEATURE_INCOMPAT_SUPP. */
++static inline bool ext4_has_unknown_ext4_incompat_features(struct super_block *sb)
++{
++      struct ext4_super_block *es = EXT4_SB(sb)->s_es;
++
++      return (es->s_feature_incompat &
++              cpu_to_le32(~EXT4_FEATURE_INCOMPAT_SUPP)) != 0;
++}
+ static inline bool ext4_has_compat_features(struct super_block *sb)
+ {
+@@ -3695,6 +3709,13 @@ struct ext4_extent;
+ #define EXT_MAX_BLOCKS        0xffffffff
+ extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode);
++extern struct buffer_head *ext4_read_inode_bitmap(struct super_block *sb,
++                                                ext4_group_t block_group);
++extern void ext4_inc_count(struct inode *inode);
++extern void ext4_dec_count(struct inode *inode);
++extern struct buffer_head *ext4_append(handle_t *handle,
++                                     struct inode *inode,
++                                     ext4_lblk_t *block);
+ extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
+ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
+                              struct ext4_map_blocks *map, int flags);
+diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
+index 93689dae..31480792 100644
+--- a/fs/ext4/ialloc.c
++++ b/fs/ext4/ialloc.c
+@@ -120,7 +120,7 @@ verified:
+  *
+  * Return buffer_head of bitmap on success, or an ERR_PTR on error.
+  */
+-static struct buffer_head *
++struct buffer_head *
+ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
+ {
+       struct ext4_group_desc *desc;
+@@ -215,6 +215,7 @@ out:
+       put_bh(bh);
+       return ERR_PTR(err);
+ }
++EXPORT_SYMBOL(ext4_read_inode_bitmap);
+ /*
+  * NOTE! When we get the inode, we're the only people
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 339bdfac..cd8ab8d3 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -6166,3 +6166,20 @@ out_error:
+       ext4_journal_stop(handle);
+       goto out;
+ }
++EXPORT_SYMBOL(ext4_map_blocks);
++EXPORT_SYMBOL(ext4_truncate);
++EXPORT_SYMBOL(ext4_iget);
++EXPORT_SYMBOL(ext4_bread);
++EXPORT_SYMBOL(ext4_itable_unused_count);
++EXPORT_SYMBOL(ext4_force_commit);
++EXPORT_SYMBOL(__ext4_mark_inode_dirty);
++EXPORT_SYMBOL(ext4_get_group_desc);
++EXPORT_SYMBOL(__ext4_journal_get_write_access);
++EXPORT_SYMBOL(__ext4_journal_start_sb);
++EXPORT_SYMBOL(__ext4_journal_stop);
++EXPORT_SYMBOL(__ext4_handle_dirty_metadata);
++EXPORT_SYMBOL(__ext4_std_error);
++EXPORT_SYMBOL(ext4fs_dirhash);
++EXPORT_SYMBOL(ext4_get_inode_loc);
++EXPORT_SYMBOL(__ext4_journal_ensure_credits);
++EXPORT_SYMBOL(ext4_chunk_trans_blocks);
+diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
+index 8f9c3c0e..bfd849ca 100644
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -50,7 +50,7 @@
+ #define NAMEI_RA_BLOCKS  4
+ #define NAMEI_RA_SIZE      (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+-static struct buffer_head *ext4_append(handle_t *handle,
++struct buffer_head *ext4_append(handle_t *handle,
+                                       struct inode *inode,
+                                       ext4_lblk_t *block)
+ {
+@@ -210,6 +210,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
+       }
+       return bh;
+ }
++EXPORT_SYMBOL(ext4_append);
+ #ifdef DX_DEBUG
+ #define dxtrace(command) command
+@@ -2786,23 +2787,25 @@ EXPORT_SYMBOL(ext4_delete_entry);
+  * for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set
+  * on regular files) and to avoid creating huge/slow non-HTREE directories.
+  */
+-static void ext4_inc_count(struct inode *inode)
++void ext4_inc_count(struct inode *inode)
+ {
+       inc_nlink(inode);
+       if (is_dx(inode) &&
+           (inode->i_nlink > EXT4_LINK_MAX || inode->i_nlink == 2))
+               set_nlink(inode, 1);
+ }
++EXPORT_SYMBOL(ext4_inc_count);
+ /*
+  * If a directory had nlink == 1, then we should let it be 1. This indicates
+  * directory has >EXT4_LINK_MAX subdirs.
+  */
+-static void ext4_dec_count(struct inode *inode)
++void ext4_dec_count(struct inode *inode)
+ {
+       if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
+               drop_nlink(inode);
+ }
++EXPORT_SYMBOL(ext4_dec_count);
+ /*
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index c682fb92..5250fa60 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -7365,16 +7365,12 @@ static int __init ext4_init_fs(void)
+       if (err)
+               goto out05;
+-      register_as_ext3();
+-      register_as_ext2();
+       err = register_filesystem(&ext4_fs_type);
+       if (err)
+               goto out;
+       return 0;
+ out:
+-      unregister_as_ext2();
+-      unregister_as_ext3();
+       ext4_fc_destroy_dentry_cache();
+ out05:
+       destroy_inodecache();
+@@ -7399,8 +7395,6 @@ out7:
+ static void __exit ext4_exit_fs(void)
+ {
+       ext4_destroy_lazyinit_thread();
+-      unregister_as_ext2();
+-      unregister_as_ext3();
+       unregister_filesystem(&ext4_fs_type);
+       ext4_fc_destroy_dentry_cache();
+       destroy_inodecache();
+diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
+index 6460879b..4b94e270 100644
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -716,6 +716,7 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name,
+       up_read(&EXT4_I(inode)->xattr_sem);
+       return error;
+ }
++EXPORT_SYMBOL(ext4_xattr_get);
+ static int
+ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
+@@ -2490,6 +2491,7 @@ cleanup:
+       ext4_write_unlock_xattr(inode, &no_expand);
+       return error;
+ }
++EXPORT_SYMBOL(ext4_xattr_set_handle);
+ int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
+                          bool is_create, int *credits)
+-- 
+2.34.1
+
diff --git a/ldiskfs/kernel_patches/patches/linux-6.10/ext4-prealloc.patch b/ldiskfs/kernel_patches/patches/linux-6.10/ext4-prealloc.patch
new file mode 100644 (file)
index 0000000..45cc9e1
--- /dev/null
@@ -0,0 +1,409 @@
+commit d8d8fd9192a54c7b8caef8cca9b7a1eb5e5e3298
+Author: Alex Zhuravlev <alex.zhuravlev@sun.com>
+AuthorDate: Thu Oct 23 10:02:19 2008 +0000
+Subject: ext4: support for tunable preallocation window
+Add support for tunable preallocation window and new tunables
+for large/small requests.
+Bugzilla-ID: b=12800
+Signed-off-by: Alex Zhuravlev <alex.zhuravlev@sun.com>
+Reviewed-by: Kalpak Shah <kalpak@clusterfs.com>
+Reviewed-by: Andreas Dilger <andreas.dilger@sun.com>
+---
+ fs/ext4/ext4.h    |   7 +-
+ fs/ext4/inode.c   |   3 +
+ fs/ext4/mballoc.c | 220 +++++++++++++++++++++++++++++++++++-----------
+ fs/ext4/sysfs.c   |   8 +-
+ 4 files changed, 182 insertions(+), 56 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 7332e538..b0723244 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1287,6 +1287,8 @@ extern void mb_set_bits(void *bm, int cur, int len);
+ #define EXT4_DFL_MAX_MNT_COUNT                20      /* Allow 20 mounts */
+ #define EXT4_DFL_CHECKINTERVAL                0       /* Don't use interval check */
++#define EXT4_MAX_PREALLOC_TABLE       64
++
+ /*
+  * Behaviour when detecting errors
+  */
+@@ -1595,11 +1597,13 @@ struct ext4_sb_info {
+       /* tunables */
+       unsigned long s_stripe;
+       unsigned int s_mb_max_linear_groups;
+-      unsigned int s_mb_stream_request;
++      unsigned long s_mb_small_req;
++      unsigned long s_mb_large_req;
+       unsigned int s_mb_max_to_scan;
+       unsigned int s_mb_min_to_scan;
+       unsigned int s_mb_stats;
+       unsigned int s_mb_order2_reqs;
++      unsigned long *s_mb_prealloc_table;
+       unsigned int s_mb_group_prealloc;
+       unsigned int s_max_dir_size_kb;
+       /* where last allocation was done - for stream allocation */
+@@ -2915,6 +2919,7 @@ int ext4_fc_record_regions(struct super_block *sb, int ino,
+                          int len, int replay);
+ /* mballoc.c */
++extern const struct proc_ops ext4_seq_prealloc_table_fops;
+ extern const struct seq_operations ext4_mb_seq_groups_ops;
+ extern const struct seq_operations ext4_mb_seq_structs_summary_ops;
+ extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 401cf597..339bdfac 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -2595,6 +2595,9 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
+                                               PAGE_SIZE >> inode->i_blkbits);
+       }
++      if (wbc->nr_to_write < sbi->s_mb_small_req)
++              wbc->nr_to_write = sbi->s_mb_small_req;
++
+       if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+               range_whole = 1;
+diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
+index 9dda9cd6..e64b31e5 100644
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -3274,6 +3274,99 @@ const struct seq_operations ext4_mb_seq_structs_summary_ops = {
+       .show   = ext4_mb_seq_structs_summary_show,
+ };
++/*
++ * Parse a space-separated list of preallocation window sizes and, when
++ * @update is non-zero, store it in sbi->s_mb_prealloc_table.
++ *
++ * @sbi:    filesystem info supplying the limits and the table
++ * @str:    text to parse; simple_strtol() takes no length, so the caller
++ *          must make sure parsing cannot run past @str + @cnt (e.g. by
++ *          NUL-terminating the buffer)
++ * @cnt:    number of input bytes in @str
++ * @update: 0 to validate only, non-zero to also write the table
++ *
++ * Each value must be strictly greater than the previous one and must not
++ * exceed s_blocks_per_group - 2 - s_itb_per_group.  A token that parses
++ * as 0 ends the list.  When updating, the entry after the last stored
++ * value is set to 0 as a terminator.
++ *
++ * Returns 0 on success, -EINVAL for a malformed, out-of-range or
++ * unordered value, and -EOVERFLOW when the list would not leave room for
++ * the terminator in a table of EXT4_MAX_PREALLOC_TABLE entries.
++ */
++static int ext4_mb_check_and_update_prealloc(struct ext4_sb_info *sbi,
++                                               char *str, size_t cnt,
++                                               int update)
++{
++      unsigned long value;
++      unsigned long prev = 0;
++      char *cur;
++      char *next;
++      char *end;
++      int num = 0;
++
++      cur = str;
++      end = str + cnt;
++      while (cur < end) {
++              /* skip the blanks separating two values */
++              while ((cur < end) && (*cur == ' '))
++                      cur++;
++              value = simple_strtol(cur, &next, 0);
++              if (value == 0)
++                      break;
++              if (cur == next)
++                      return -EINVAL;
++
++              cur = next;
++
++              if (value > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group))
++                      return -EINVAL;
++
++              /* they should add values in order */
++              if (value <= prev)
++                      return -EINVAL;
++
++              /*
++               * Check the bound before storing, so that the table is never
++               * overrun even if the caller did not pre-validate the input;
++               * the last slot is reserved for the terminating 0.
++               */
++              if (num >= EXT4_MAX_PREALLOC_TABLE - 1)
++                      return -EOVERFLOW;
++
++              if (update)
++                      sbi->s_mb_prealloc_table[num] = value;
++
++              prev = value;
++              num++;
++      }
++
++      if (update)
++              sbi->s_mb_prealloc_table[num] = 0;
++
++      return 0;
++}
++
++/*
++ * Write handler for the "prealloc_table" proc file.
++ *
++ * Copies at most sizeof(str) - 1 bytes from user space, validates the
++ * list once without touching the table and only then applies it, so a
++ * rejected write leaves the current table unchanged.
++ *
++ * Returns @cnt on success, -EINVAL if the input does not fit the local
++ * buffer, -EFAULT if the user buffer cannot be read, or the error from
++ * ext4_mb_check_and_update_prealloc().
++ */
++static ssize_t ext4_mb_prealloc_table_proc_write(struct file *file,
++                                           const char __user *buf,
++                                           size_t cnt, loff_t *pos)
++{
++      struct ext4_sb_info *sbi = EXT4_SB(pde_data(file_inode(file)));
++      char str[128];
++      int rc;
++
++      if (cnt >= sizeof(str))
++              return -EINVAL;
++      if (copy_from_user(str, buf, cnt))
++              return -EFAULT;
++      /*
++       * The parser uses simple_strtol(), which has no length limit;
++       * terminate the buffer so it cannot read uninitialised stack
++       * bytes past the user data.  cnt < sizeof(str) was checked above.
++       */
++      str[cnt] = '\0';
++
++      /* first pass: validate only */
++      rc = ext4_mb_check_and_update_prealloc(sbi, str, cnt, 0);
++      if (rc)
++              return rc;
++
++      /* second pass: store the validated values */
++      rc = ext4_mb_check_and_update_prealloc(sbi, str, cnt, 1);
++      return rc ? rc : cnt;
++}
++
++/*
++ * seq_file show callback: print the entries of s_mb_prealloc_table on
++ * one line, each followed by a space, stopping at the first 0 entry or
++ * after EXT4_MAX_PREALLOC_TABLE entries, then a newline.
++ *
++ * Always returns 0.
++ */
++static int mb_prealloc_table_seq_show(struct seq_file *m, void *v)
++{
++      struct ext4_sb_info *sbi = EXT4_SB(m->private);
++      int i;
++
++      /* entries are unsigned long, so print them with %lu */
++      for (i = 0; i < EXT4_MAX_PREALLOC_TABLE &&
++                      sbi->s_mb_prealloc_table[i] != 0; i++)
++              seq_printf(m, "%lu ", sbi->s_mb_prealloc_table[i]);
++      seq_putc(m, '\n');
++
++      return 0;
++}
++
++/*
++ * Open callback for the "prealloc_table" proc file: binds the show
++ * routine to the proc entry's private data through single_open().
++ */
++static int mb_prealloc_table_seq_open(struct inode *inode, struct file *file)
++{
++      void *private = pde_data(inode);
++
++      return single_open(file, mb_prealloc_table_seq_show, private);
++}
++
++/*
++ * proc operations for the preallocation table file: reads go through the
++ * seq_file single_open()/seq_read() path, writes are parsed by
++ * ext4_mb_prealloc_table_proc_write().
++ */
++const struct proc_ops ext4_seq_prealloc_table_fops = {
++      .proc_open      = mb_prealloc_table_seq_open,
++      .proc_read      = seq_read,
++      .proc_lseek     = seq_lseek,
++      .proc_release   = single_release,
++      .proc_write     = ext4_mb_prealloc_table_proc_write,
++};
++
+ static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
+ {
+       int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+@@ -3590,7 +3683,7 @@ static void ext4_discard_work(struct work_struct *work)
+ int ext4_mb_init(struct super_block *sb)
+ {
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+-      unsigned i, j;
++      unsigned i, j, k, l;
+       unsigned offset, offset_incr;
+       unsigned max;
+       int ret;
+@@ -3679,7 +3772,6 @@ int ext4_mb_init(struct super_block *sb)
+       sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
+       sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
+       sbi->s_mb_stats = MB_DEFAULT_STATS;
+-      sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
+       sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+       sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER;
+@@ -3705,9 +3797,29 @@ int ext4_mb_init(struct super_block *sb)
+        * RAID stripe size so that preallocations don't fragment
+        * the stripes.
+        */
+-      if (sbi->s_stripe > 1) {
+-              sbi->s_mb_group_prealloc = roundup(
+-                      sbi->s_mb_group_prealloc, EXT4_B2C(sbi, sbi->s_stripe));
++
++      /* Allocate table once */
++      sbi->s_mb_prealloc_table = kzalloc(
++              EXT4_MAX_PREALLOC_TABLE * sizeof(unsigned long), GFP_NOFS);
++      if (sbi->s_mb_prealloc_table == NULL) {
++              ret = -ENOMEM;
++              goto out;
++      }
++
++      if (sbi->s_stripe == 0) {
++              for (k = 0, l = 4; k <= 9; ++k, l *= 2)
++                      sbi->s_mb_prealloc_table[k] = l;
++
++              sbi->s_mb_small_req = 256;
++              sbi->s_mb_large_req = 1024;
++              sbi->s_mb_group_prealloc = 512;
++      } else {
++              for (k = 0, l = sbi->s_stripe; k <= 2; ++k, l *= 2)
++                      sbi->s_mb_prealloc_table[k] = l;
++
++              sbi->s_mb_small_req = sbi->s_stripe;
++              sbi->s_mb_large_req = sbi->s_stripe * 8;
++              sbi->s_mb_group_prealloc = sbi->s_stripe * 4;
+       }
+       sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
+@@ -3743,6 +3855,7 @@ out:
+       kfree(sbi->s_mb_avg_fragment_size_locks);
+       kfree(sbi->s_mb_largest_free_orders);
+       kfree(sbi->s_mb_largest_free_orders_locks);
++      kfree(sbi->s_mb_prealloc_table);
+       kfree(sbi->s_mb_offsets);
+       sbi->s_mb_offsets = NULL;
+       kfree(sbi->s_mb_maxs);
+@@ -4099,7 +4212,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
+       ext4_grpblk_t changed;
+       BUG_ON(ac->ac_status != AC_STATUS_FOUND);
+-      BUG_ON(ac->ac_b_ex.fe_len <= 0);
+       sb = ac->ac_sb;
+       sbi = EXT4_SB(sb);
+@@ -4423,10 +4535,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
+ {
+       struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+       struct ext4_super_block *es = sbi->s_es;
+-      int bsbits, max;
+-      loff_t size, start_off, end;
++      int bsbits, i, wind;
++      loff_t size, end;
+       loff_t orig_size __maybe_unused;
+       ext4_lblk_t start;
++      unsigned long value, last_non_zero;
+       /* do normalize only data requests, metadata requests
+          do not need preallocation */
+@@ -4455,51 +4568,46 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
+       size = size << bsbits;
+       if (size < i_size_read(ac->ac_inode))
+               size = i_size_read(ac->ac_inode);
+-      orig_size = size;
++      size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits;
++
++      start = wind = 0;
++      value = last_non_zero = 0;
+-      /* max size of free chunks */
+-      max = 2 << bsbits;
+-
+-#define NRL_CHECK_SIZE(req, size, max, chunk_size)    \
+-              (req <= (size) || max <= (chunk_size))
+-
+-      /* first, try to predict filesize */
+-      /* XXX: should this table be tunable? */
+-      start_off = 0;
+-      if (size <= 16 * 1024) {
+-              size = 16 * 1024;
+-      } else if (size <= 32 * 1024) {
+-              size = 32 * 1024;
+-      } else if (size <= 64 * 1024) {
+-              size = 64 * 1024;
+-      } else if (size <= 128 * 1024) {
+-              size = 128 * 1024;
+-      } else if (size <= 256 * 1024) {
+-              size = 256 * 1024;
+-      } else if (size <= 512 * 1024) {
+-              size = 512 * 1024;
+-      } else if (size <= 1024 * 1024) {
+-              size = 1024 * 1024;
+-      } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
+-              start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+-                                              (21 - bsbits)) << 21;
+-              size = 2 * 1024 * 1024;
+-      } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
+-              start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+-                                                      (22 - bsbits)) << 22;
+-              size = 4 * 1024 * 1024;
+-      } else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len),
+-                                      (8<<20)>>bsbits, max, 8 * 1024)) {
+-              start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+-                                                      (23 - bsbits)) << 23;
+-              size = 8 * 1024 * 1024;
++      /* let's choose preallocation window depending on file size */
++      for (i = 0; i < EXT4_MAX_PREALLOC_TABLE; i++) {
++              value = sbi->s_mb_prealloc_table[i];
++              if (value == 0)
++                      break;
++              else
++                      last_non_zero = value;
++
++              if (size <= value) {
++                      wind = value;
++                      break;
++              }
++      }
++
++      if (wind == 0) {
++              if (last_non_zero != 0) {
++                      __u64 tstart, tend;
++                      /* file is quite large, we now preallocate with
++                      * the biggest configured window with regard to
++                      * logical offset */
++                      wind = last_non_zero;
++                      tstart = ac->ac_o_ex.fe_logical;
++                      do_div(tstart, wind);
++                      start = tstart * wind;
++                      tend = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len - 1;
++                      do_div(tend, wind);
++                      tend = tend * wind + wind;
++                      size = tend - start;
++              }
+       } else {
+-              start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
+-              size      = (loff_t) EXT4_C2B(sbi,
+-                                            ac->ac_o_ex.fe_len) << bsbits;
++              size = wind;
+       }
+-      size = size >> bsbits;
+-      start = start_off >> bsbits;
++
++
++      orig_size = size;
+       /*
+        * For tiny groups (smaller than 8MB) the chosen allocation
+@@ -4558,7 +4666,6 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
+                        (unsigned long) ac->ac_o_ex.fe_logical);
+               BUG();
+       }
+-      BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+       /* now prepare goal request */
+@@ -5761,8 +5868,8 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
+               inode_pa_eligible = false;
+       size = max(size, isize);
+-      /* Don't use group allocation for large files */
+-      if (size > sbi->s_mb_stream_request)
++      if ((ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) ||
++          (size >= sbi->s_mb_large_req))
+               group_pa_eligible = false;
+       if (!group_pa_eligible) {
+@@ -5773,6 +5880,13 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
+               return;
+       }
++      /*
++       * request is so large that we don't care about
++       * streaming - it outweighs any possible seek
++       */
++      if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req)
++              return;
++
+       BUG_ON(ac->ac_lg != NULL);
+       /*
+        * locality group prealloc space are per cpu. The reason for having
+diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
+index ddb54608..2aeff069 100644
+--- a/fs/ext4/sysfs.c
++++ b/fs/ext4/sysfs.c
+@@ -222,7 +222,8 @@ EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
+ EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
+ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
+ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
+-EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
++EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_small_req);
++EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req);
+ EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups);
+ EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
+ EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
+@@ -270,7 +271,8 @@ static struct attribute *ext4_attrs[] = {
+       ATTR_LIST(mb_max_to_scan),
+       ATTR_LIST(mb_min_to_scan),
+       ATTR_LIST(mb_order2_req),
+-      ATTR_LIST(mb_stream_req),
++      ATTR_LIST(mb_small_req),
++      ATTR_LIST(mb_large_req),
+       ATTR_LIST(mb_group_prealloc),
+       ATTR_LIST(mb_max_linear_groups),
+       ATTR_LIST(max_writeback_mb_bump),
+@@ -584,6 +586,8 @@ int ext4_register_sysfs(struct super_block *sb)
+                                       ext4_fc_info_show, sb);
+               proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc,
+                               &ext4_mb_seq_groups_ops, sb);
++              proc_create_data("prealloc_table", S_IRUGO, sbi->s_proc,
++                              &ext4_seq_prealloc_table_fops, sb);
+               proc_create_single_data("mb_stats", 0444, sbi->s_proc,
+                               ext4_seq_mb_stats_show, sb);
+               proc_create_seq_data("mb_structs_summary", 0444, sbi->s_proc,
+-- 
+2.34.1
+
diff --git a/ldiskfs/kernel_patches/series/ldiskfs-6.10-ml.series b/ldiskfs/kernel_patches/series/ldiskfs-6.10-ml.series
new file mode 100644 (file)
index 0000000..99db9cb
--- /dev/null
@@ -0,0 +1,37 @@
+linux-5.16/ext4-inode-version.patch
+linux-5.18/ext4-lookup-dotdot.patch
+linux-5.14/ext4-print-inum-in-htree-warning.patch
+linux-6.10/ext4-prealloc.patch
+linux-5.16/ext4-osd-iop-common.patch
+linux-6.10/ext4-misc.patch
+linux-6.10/ext4-mballoc-extra-checks.patch
+sles15sp4/ext4-hash-indexed-dir-dotdot-update.patch
+linux-5.14/ext4-kill-dx-root.patch
+linux-6.5/ext4-mballoc-pa-free-mismatch.patch
+linux-6.5/ext4-data-in-dirent.patch
+linux-6.6/ext4-nocmtime.patch
+base/ext4-htree-lock.patch
+linux-6.5/ext4-pdirop.patch
+linux-6.10/ext4-max-dir-size.patch
+linux-6.10/ext4-corrupted-inode-block-bitmaps-handling-patches.patch
+rhel9/ext4-give-warning-with-dir-htree-growing.patch
+ubuntu18/ext4-jcb-optimization.patch
+linux-6.2/ext4-attach-jinode-in-writepages.patch
+linux-6.5/ext4-dont-check-before-replay.patch
+rhel7.6/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch
+rhel7.6/ext4-export-orphan-add.patch
+linux-5.18/ext4-export-mb-stream-allocator-variables.patch
+ubuntu19/ext4-iget-with-flags.patch
+linux-5.14/export-ext4fs-dirhash-helper.patch
+linux-5.8/ext4-no-max-dir-size-limit-for-iam-objects.patch
+rhel9/ext4-dquot-commit-speedup.patch
+linux-6.7/ext4-ialloc-uid-gid-and-pass-owner-down.patch
+linux-5.14/ext4-projid-xattrs.patch
+linux-6.10/ext4-delayed-iput.patch
+rhel8/ext4-ext-merge.patch
+linux-5.14/ext4-xattr-disable-credits-check.patch
+rhel9.2/ext4-fiemap-kernel-data.patch
+rhel8/ext4-old_ea_inodes_handling_fix.patch
+linux-6.10/ext4-filename-encode.patch
+rhel9.1/ext4-enc-flag.patch
+linux-6.6/ext4-encdata.patch
index 250d474..53965c5 100644 (file)
@@ -26,12 +26,17 @@ TBD Whamcloud
          5.14.0-284.30.1.el9  (RHEL9.2)
          4.4.120-92.70        (SLES12 SP2)
          4.4.180-94.100       (SLES12 SP3)
+         5.14.21-150500.55.65 (SLES15 SP5)
          4.4.0-131            (Ubuntu 16.04)
          4.15.0-32            (Ubuntu 18.04)
          5.4.0-48             (Ubuntu 20.04)
+         6.8.0-38             (Ubuntu 24.04)
+         6.10.0-15            (Ubuntu 24.04)
          vanilla linux 5.4.0  (ZFS + ldiskfs)
          vanilla linux 5.4.21  (ZFS + ldiskfs)
          vanilla linux 5.4.136  (ZFS + ldiskfs)
+         vanilla linux 6.1.36 (ZFS + ldiskfs)
+         vanilla linux 6.6.13 (ZFS + ldiskfs)
          5.10.0-60.94.0.118.oe2203 (openEuler 22.03 LTS)
          5.10.0-136.32.0.108.oe2203sp1 (openEuler 22.03 LTS SP1)
          5.10.0-153.19.0.95.oe2203sp2 (openEuler 22.03 LTS SP2)