Whamcloud - gitweb
LU-16691 ldiskfs: limit length of per-inode prealloc list
author Alex Zhuravlev <bzzz@whamcloud.com>
Fri, 31 Mar 2023 05:41:07 +0000 (08:41 +0300)
committer Andreas Dilger <adilger@whamcloud.com>
Sat, 29 Jul 2023 07:57:49 +0000 (07:57 +0000)
When writing sparse files, the per-inode prealloc list may become
very long, resulting in high overhead for ext4_mb_use_preallocated().
To circumvent this problem, we limit the maximum length of the per-inode
prealloc list to 512 and allow users to modify it.

After patching, we observed that the share of CPU time spent in sys
dropped and the system throughput increased significantly. We created a
process that writes a sparse file, and its running time on the fixed
kernel was significantly reduced, as follows:

Running time on unfixed kernel:
    # time taskset 0x01 ./sparse /data1/sparce.dat
    real    0m2.051s
    user    0m0.008s
    sys     0m2.026s

Running time on fixed kernel:
    # time taskset 0x01 ./sparse /data1/sparce.dat
    real    0m0.471s
    user    0m0.004s
    sys     0m0.395s

Link: https://lore.kernel.org/r/d7a98178-056b-6db5-6bce-4ead23f4a257@gmail.com
Lustre-change: https://review.whamcloud.com/50481
Lustre-commit: b16c9333a00802faea419dfe6fbb013c4477c9c6

Signed-off-by: Chunguang Xu <brookxu@tencent.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Change-Id: I5e4ea3acfc07f6e69890690211bf6a34c1230979
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Sergey Cheremencev <scherementsev@ddn.com>
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/51712
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
ldiskfs/kernel_patches/patches/rhel8/ext4-limit-per-inode-preallocation-list.patch [new file with mode: 0644]
ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series

diff --git a/ldiskfs/kernel_patches/patches/rhel8/ext4-limit-per-inode-preallocation-list.patch b/ldiskfs/kernel_patches/patches/rhel8/ext4-limit-per-inode-preallocation-list.patch
new file mode 100644 (file)
index 0000000..fcfecd8
--- /dev/null
@@ -0,0 +1,466 @@
+commit 27bc446e2def38db3244a6eb4bb1d6312936610a
+Author: brookxu <brookxu.cn@gmail.com>
+Date:   Mon Aug 17 15:36:15 2020 +0800
+
+ext4: limit the length of per-inode prealloc list
+
+In the scenario of writing sparse files, the per-inode prealloc list may
+be very long, resulting in high overhead for ext4_mb_use_preallocated().
+To circumvent this problem, we limit the maximum length of per-inode
+prealloc list to 512 and allow users to modify it.
+
+After patching, we observed that the sys ratio of cpu has dropped, and
+the system throughput has increased significantly. We created a process
+to write the sparse file, and the running time of the process on the
+fixed kernel was significantly reduced, as follows:
+
+Running time on unfixed kernel:
+    [root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat
+    real    0m2.051s
+    user    0m0.008s
+    sys     0m2.026s
+
+Running time on fixed kernel:
+    [root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat
+    real    0m0.471s
+    user    0m0.004s
+    sys     0m0.395s
+
+Signed-off-by: Chunguang Xu <brookxu@tencent.com>
+Link: https://lore.kernel.org/r/d7a98178-056b-6db5-6bce-4ead23f4a257@gmail.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 71b4370a3f91..523e00d7b392 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1044,6 +1044,7 @@ struct ext4_inode_info {
+       struct timespec64 i_crtime;
+       /* mballoc */
++      atomic_t i_prealloc_active;
+       struct list_head i_prealloc_list;
+       spinlock_t i_prealloc_lock;
+@@ -1493,6 +1494,7 @@ struct ext4_sb_info {
+       ext4_fsblk_t s_mb_c3_blocks;
+       unsigned long *s_mb_prealloc_table;
+       unsigned int s_mb_group_prealloc;
++      unsigned int s_mb_max_inode_prealloc;
+       unsigned int s_max_dir_size_kb;
+       unsigned long s_warning_dir_size;
+       /* where last allocation was done - for stream allocation */
+@@ -2688,7 +2690,7 @@ extern int ext4_mb_release(struct super_
+ extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
+                               struct ext4_allocation_request *, int *);
+ extern int ext4_mb_reserve_blocks(struct super_block *, int);
+-extern void ext4_discard_preallocations(struct inode *);
++extern void ext4_discard_preallocations(struct inode *, unsigned int);
+ extern int __init ext4_init_mballoc(void);
+ extern void ext4_exit_mballoc(void);
+ extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
+diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
+index 0eea09aa0f26..a0481582187a 100644
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -109,7 +109,7 @@ static int ext4_ext_trunc_restart_fn(str
+        * i_mutex. So we can safely drop the i_data_sem here.
+        */
+       BUG_ON(EXT4_JOURNAL(inode) == NULL);
+-      ext4_discard_preallocations(inode);
++      ext4_discard_preallocations(inode, 0);
+       up_write(&EXT4_I(inode)->i_data_sem);
+       *dropped = 1;
+       return 0;
+@@ -4506,7 +4506,7 @@ got_allocated_blocks:
+               /* free data blocks we just allocated */
+               /* not a good idea to call discard here directly,
+                * but otherwise we'd need to call it every free() */
+-              ext4_discard_preallocations(inode);
++              ext4_discard_preallocations(inode, 0);
+               ext4_free_blocks(handle, inode, NULL, newblock,
+                                EXT4_C2B(sbi, allocated_clusters), fb_flags);
+               goto out2;
+@@ -5528,7 +5528,7 @@ static int ext4_collapse_range(struct fi
+       }
+       down_write(&EXT4_I(inode)->i_data_sem);
+-      ext4_discard_preallocations(inode);
++      ext4_discard_preallocations(inode, 0);
+       ret = ext4_es_remove_extent(inode, punch_start,
+                                   EXT_MAX_BLOCKS - punch_start);
+@@ -5542,7 +5542,7 @@ static int ext4_collapse_range(struct fi
+               up_write(&EXT4_I(inode)->i_data_sem);
+               goto out_stop;
+       }
+-      ext4_discard_preallocations(inode);
++      ext4_discard_preallocations(inode, 0);
+       ret = ext4_ext_shift_extents(inode, handle, punch_stop,
+                                    punch_stop - punch_start, SHIFT_LEFT);
+@@ -5679,7 +5679,7 @@ static int ext4_insert_range(struct file
+               goto out_stop;
+       down_write(&EXT4_I(inode)->i_data_sem);
+-      ext4_discard_preallocations(inode);
++      ext4_discard_preallocations(inode, 0);
+       path = ext4_find_extent(inode, offset_lblk, NULL, 0);
+       if (IS_ERR(path)) {
+diff --git a/fs/ext4/file.c b/fs/ext4/file.c
+index 7a2720517bbb..e608ce3fb535 100644
+--- a/fs/ext4/file.c
++++ b/fs/ext4/file.c
+@@ -95,7 +95,7 @@ static int ext4_release_file(struct inod
+                       !EXT4_I(inode)->i_reserved_data_blocks)
+       {
+               down_write(&EXT4_I(inode)->i_data_sem);
+-              ext4_discard_preallocations(inode);
++              ext4_discard_preallocations(inode, 0);
+               up_write(&EXT4_I(inode)->i_data_sem);
+       }
+       if (is_dx(inode) && filp->private_data)
+diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
+index 433ca8415c5a..80c9f33800be 100644
+--- a/fs/ext4/indirect.c
++++ b/fs/ext4/indirect.c
+@@ -700,7 +700,7 @@ static int ext4_ind_trunc_restart_fn(han
+        * i_mutex. So we can safely drop the i_data_sem here.
+        */
+       BUG_ON(EXT4_JOURNAL(inode) == NULL);
+-      ext4_discard_preallocations(inode);
++      ext4_discard_preallocations(inode, 0);
+       up_write(&EXT4_I(inode)->i_data_sem);
+       *dropped = 1;
+       return 0;
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 0b07576af3bf..77543f988258 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -392,7 +392,7 @@ void ext4_da_update_reserve_space(struct
+        */
+       if ((ei->i_reserved_data_blocks == 0) &&
+           !inode_is_open_for_write(inode))
+-              ext4_discard_preallocations(inode);
++              ext4_discard_preallocations(inode, 0);
+ }
+ static int __check_block_validity(struct inode *inode, const char *func,
+@@ -4410,7 +4410,7 @@ int ext4_punch_hole(struct file *file, l
+       if (stop_block > first_block) {
+               down_write(&EXT4_I(inode)->i_data_sem);
+-              ext4_discard_preallocations(inode);
++              ext4_discard_preallocations(inode, 0);
+               ret = ext4_es_remove_extent(inode, first_block,
+                                           stop_block - first_block);
+@@ -4566,7 +4566,7 @@ int ext4_truncate(struct inode *inode)
+       down_write(&EXT4_I(inode)->i_data_sem);
+-      ext4_discard_preallocations(inode);
++      ext4_discard_preallocations(inode, 0);
+       if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+               err = ext4_ext_truncate(handle, inode);
+diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
+index 6e70a63dcca7..36eca3bc036a 100644
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -202,7 +202,7 @@ static long swap_inode_boot_loader(struc
+       reset_inode_seed(inode);
+       reset_inode_seed(inode_bl);
+-      ext4_discard_preallocations(inode);
++      ext4_discard_preallocations(inode, 0);
+       err = ext4_mark_inode_dirty(handle, inode);
+       if (err < 0) {
+diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
+index 45ac6088b4ac..132c118d12e1 100644
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -3134,6 +3134,7 @@ int ext4_mb_init(struct super_block *sb)
+       sbi->s_mb_c1_blocks = THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C1_THRESHOLD);
+       sbi->s_mb_c2_blocks = THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C2_THRESHOLD);
+       sbi->s_mb_c3_blocks = THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C3_THRESHOLD);
++      sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
+       /*
+        * The default group preallocation is 512, which for 4k block
+        * sizes translates to 2 megabytes.  However for bigalloc file
+@@ -4185,6 +4186,26 @@ int ext4_mb_generate_from_pa(struct supe
+       return 0;
+ }
++static void ext4_mb_mark_pa_deleted(struct super_block *sb,
++                                  struct ext4_prealloc_space *pa)
++{
++      struct ext4_inode_info *ei;
++
++      if (pa->pa_deleted) {
++              ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
++                           pa->pa_type, pa->pa_pstart, pa->pa_lstart,
++                           pa->pa_len);
++              return;
++      }
++
++      pa->pa_deleted = 1;
++
++      if (pa->pa_type == MB_INODE_PA) {
++              ei = EXT4_I(pa->pa_inode);
++              atomic_dec(&ei->i_prealloc_active);
++      }
++}
++
+ static void ext4_mb_pa_callback(struct rcu_head *head)
+ {
+       struct ext4_prealloc_space *pa;
+@@ -4217,7 +4238,7 @@ static void ext4_mb_put_pa(struct ext4_a
+               return;
+       }
+-      pa->pa_deleted = 1;
++      ext4_mb_mark_pa_deleted(sb, pa);
+       spin_unlock(&pa->pa_lock);
+       grp_blk = pa->pa_pstart;
+@@ -4344,6 +4365,7 @@ ext4_mb_new_inode_pa(struct ext4_allocat
+       spin_lock(pa->pa_obj_lock);
+       list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
+       spin_unlock(pa->pa_obj_lock);
++      atomic_inc(&ei->i_prealloc_active);
+ }
+ /*
+@@ -4548,7 +4570,7 @@ ext4_mb_discard_group_preallocations(str
+               }
+               /* seems this one can be freed ... */
+-              pa->pa_deleted = 1;
++              ext4_mb_mark_pa_deleted(sb, pa);
+               if (!free)
+                       this_cpu_inc(discard_pa_seq);
+@@ -4599,7 +4621,7 @@ out_dbg:
+  *
+  * FIXME!! Make sure it is valid at all the call sites
+  */
+-void ext4_discard_preallocations(struct inode *inode)
++void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
+ {
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       struct super_block *sb = inode->i_sb;
+@@ -4617,15 +4639,19 @@ void ext4_discard_preallocations(struct
+       mb_debug(sb, "discard preallocation for inode %lu\n",
+                inode->i_ino);
+-      trace_ext4_discard_preallocations(inode);
++      trace_ext4_discard_preallocations(inode,
++                      atomic_read(&ei->i_prealloc_active), needed);
+       INIT_LIST_HEAD(&list);
++      if (needed == 0)
++              needed = UINT_MAX;
++
+ repeat:
+       /* first, collect all pa's in the inode */
+       spin_lock(&ei->i_prealloc_lock);
+-      while (!list_empty(&ei->i_prealloc_list)) {
+-              pa = list_entry(ei->i_prealloc_list.next,
++      while (!list_empty(&ei->i_prealloc_list) && needed) {
++              pa = list_entry(ei->i_prealloc_list.prev,
+                               struct ext4_prealloc_space, pa_inode_list);
+               BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
+               spin_lock(&pa->pa_lock);
+@@ -4642,10 +4668,11 @@ repeat:
+               }
+               if (pa->pa_deleted == 0) {
+-                      pa->pa_deleted = 1;
++                      ext4_mb_mark_pa_deleted(sb, pa);
+                       spin_unlock(&pa->pa_lock);
+                       list_del_rcu(&pa->pa_inode_list);
+                       list_add(&pa->u.pa_tmp_list, &list);
++                      needed--;
+                       continue;
+               }
+@@ -4950,7 +4977,7 @@ ext4_mb_discard_lg_preallocations(struct
+               BUG_ON(pa->pa_type != MB_GROUP_PA);
+               /* seems this one can be freed ... */
+-              pa->pa_deleted = 1;
++              ext4_mb_mark_pa_deleted(sb, pa);
+               spin_unlock(&pa->pa_lock);
+               list_del_rcu(&pa->pa_inode_list);
+@@ -5046,10 +5073,29 @@ static void ext4_mb_add_n_trim(struct ex
+ }
+ /*
++ * if per-inode prealloc list is too long, trim some PA
++ */
++static void ext4_mb_trim_inode_pa(struct inode *inode)
++{
++      struct ext4_inode_info *ei = EXT4_I(inode);
++      struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
++      int count, delta;
++
++      count = atomic_read(&ei->i_prealloc_active);
++      delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
++      if (count > sbi->s_mb_max_inode_prealloc + delta) {
++              count -= sbi->s_mb_max_inode_prealloc;
++              ext4_discard_preallocations(inode, count);
++      }
++}
++
++/*
+  * release all resource we used in allocation
+  */
+ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
+ {
++      struct inode *inode = ac->ac_inode;
++      struct ext4_inode_info *ei = EXT4_I(inode);
+       struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+       struct ext4_prealloc_space *pa = ac->ac_pa;
+       if (pa) {
+@@ -5076,6 +5122,17 @@ static int ext4_mb_release_context(struc
+                       spin_unlock(pa->pa_obj_lock);
+                       ext4_mb_add_n_trim(ac);
+               }
++
++              if (pa->pa_type == MB_INODE_PA) {
++                      /*
++                       * treat per-inode prealloc list as a lru list, then try
++                       * to trim the least recently used PA.
++                       */
++                      spin_lock(pa->pa_obj_lock);
++                      list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
++                      spin_unlock(pa->pa_obj_lock);
++              }
++
+               ext4_mb_put_pa(ac, ac->ac_sb, pa);
+       }
+       if (ac->ac_bitmap_page)
+@@ -5085,6 +5142,7 @@ static int ext4_mb_release_context(struc
+       if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
+               mutex_unlock(&ac->ac_lg->lg_mutex);
+       ext4_mb_collect_stats(ac);
++      ext4_mb_trim_inode_pa(inode);
+       return 0;
+ }
+diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
+index 6b4d17c2935d..e75b4749aa1c 100644
+--- a/fs/ext4/mballoc.h
++++ b/fs/ext4/mballoc.h
+@@ -77,6 +77,10 @@
+  */
+ #define MB_DEFAULT_GROUP_PREALLOC     512
++/*
++ * maximum length of inode prealloc list
++ */
++#define MB_DEFAULT_MAX_INODE_PREALLOC 512
+ struct ext4_free_data {
+       /* this links the free block information from sb_info */
+diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
+index 1ed86fb6c302..0d601b822875 100644
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -686,8 +686,8 @@ ext4_move_extents(struct file *o_filp, s
+ out:
+       if (*moved_len) {
+-              ext4_discard_preallocations(orig_inode);
+-              ext4_discard_preallocations(donor_inode);
++              ext4_discard_preallocations(orig_inode, 0);
++              ext4_discard_preallocations(donor_inode, 0);
+       }
+       ext4_ext_drop_refs(path);
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index daa94c7f7271..13bdddc081e0 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -1139,6 +1139,7 @@ static struct inode *ext4_alloc_inode(st
+       spin_lock_init(&ei->i_raw_lock);
+       sema_init(&ei->i_append_sem, 1);
+       INIT_LIST_HEAD(&ei->i_prealloc_list);
++      atomic_set(&ei->i_prealloc_active, 0);
+       spin_lock_init(&ei->i_prealloc_lock);
+       ext4_es_init_tree(&ei->i_es_tree);
+       rwlock_init(&ei->i_es_lock);
+@@ -1237,7 +1238,7 @@ void ext4_clear_inode(struct inode *inod
+       invalidate_inode_buffers(inode);
+       clear_inode(inode);
+       dquot_drop(inode);
+-      ext4_discard_preallocations(inode);
++      ext4_discard_preallocations(inode, 0);
+       ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+       if (EXT4_I(inode)->jinode) {
+               jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
+diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
+index 7fee11cc30e7..bfabb799fa45 100644
+--- a/fs/ext4/sysfs.c
++++ b/fs/ext4/sysfs.c
+@@ -222,6 +222,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_
+ EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_small_req);
+ EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req);
+ EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
++EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc);
+ EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
+ EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
+ EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
+@@ -260,6 +261,7 @@ static struct attribute *ext4_attrs[] =
+       ATTR_LIST(mb_small_req),
+       ATTR_LIST(mb_large_req),
+       ATTR_LIST(mb_group_prealloc),
++      ATTR_LIST(mb_max_inode_prealloc),
+       ATTR_LIST(max_writeback_mb_bump),
+       ATTR_LIST(extent_max_zeroout_kb),
+       ATTR_LIST(trigger_fs_error),
+diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
+index 8008d2e116b9..4c8b99ec8606 100644
+--- a/include/trace/events/ext4.h
++++ b/include/trace/events/ext4.h
+@@ -727,24 +727,29 @@ TRACE_EVENT(ext4_mb_release_group_pa,
+ );
+ TRACE_EVENT(ext4_discard_preallocations,
+-      TP_PROTO(struct inode *inode),
++      TP_PROTO(struct inode *inode, unsigned int len, unsigned int needed),
+-      TP_ARGS(inode),
++      TP_ARGS(inode, len, needed),
+       TP_STRUCT__entry(
+-              __field(        dev_t,  dev                     )
+-              __field(        ino_t,  ino                     )
++              __field(        dev_t,          dev             )
++              __field(        ino_t,          ino             )
++              __field(        unsigned int,   len             )
++              __field(        unsigned int,   needed          )
+       ),
+       TP_fast_assign(
+               __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
++              __entry->len    = len;
++              __entry->needed = needed;
+       ),
+-      TP_printk("dev %d,%d ino %lu",
++      TP_printk("dev %d,%d ino %lu len: %u needed %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+-                (unsigned long) __entry->ino)
++                (unsigned long) __entry->ino, __entry->len,
++                __entry->needed)
+ );
+ TRACE_EVENT(ext4_mb_discard_preallocations,
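diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series
index 7b71bc6..ff380a7 100644 (file)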
@@ -34,3 +34,4 @@ base/ext4-delayed-iput.patch
 rhel8.7/ext4-filename-encode.patch
 rhel8/ext4-old_ea_inodes_handling_fix.patch
 rhel8.4/ext4-optimize-find_delayed_extent.patch
+rhel8/ext4-limit-per-inode-preallocation-list.patch