--- /dev/null
+commit 27bc446e2def38db3244a6eb4bb1d6312936610a
+Author: brookxu <brookxu.cn@gmail.com>
+Date: Mon Aug 17 15:36:15 2020 +0800
+
+ext4: limit the length of per-inode prealloc list
+
+In the scenario of writing sparse files, the per-inode prealloc list may
+be very long, resulting in high overhead for ext4_mb_use_preallocated().
+To circumvent this problem, we limit the maximum length of per-inode
+prealloc list to 512 and allow users to modify it.
+
+After patching, we observed that the sys ratio of cpu has dropped, and
+the system throughput has increased significantly. We created a process
+to write the sparse file, and the running time of the process on the
+fixed kernel was significantly reduced, as follows:
+
+Running time on unfixed kernel:
+ [root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat
+ real 0m2.051s
+ user 0m0.008s
+ sys 0m2.026s
+
+Running time on fixed kernel:
+ [root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat
+ real 0m0.471s
+ user 0m0.004s
+ sys 0m0.395s
+
+Signed-off-by: Chunguang Xu <brookxu@tencent.com>
+Link: https://lore.kernel.org/r/d7a98178-056b-6db5-6bce-4ead23f4a257@gmail.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 71b4370a3f91..523e00d7b392 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1070,6 +1070,7 @@ struct ext4_inode_info {
+ struct timespec64 i_crtime;
+
+ /* mballoc */
++ atomic_t i_prealloc_active;
+ struct list_head i_prealloc_list;
+ spinlock_t i_prealloc_lock;
+
+@@ -1518,6 +1519,7 @@ struct ext4_sb_info {
+ unsigned int s_mb_stats;
+ unsigned int s_mb_order2_reqs;
+ unsigned int s_mb_group_prealloc;
++ unsigned int s_mb_max_inode_prealloc;
+ unsigned int s_max_dir_size_kb;
+ /* where last allocation was done - for stream allocation */
+ unsigned long s_mb_last_group;
+@@ -2682,7 +2684,7 @@ extern int ext4_mb_release(struct super_block *);
+ extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
+ struct ext4_allocation_request *, int *);
+ extern int ext4_mb_reserve_blocks(struct super_block *, int);
+-extern void ext4_discard_preallocations(struct inode *);
++extern void ext4_discard_preallocations(struct inode *, unsigned int);
+ extern int __init ext4_init_mballoc(void);
+ extern void ext4_exit_mballoc(void);
+ extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
+diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
+index 0eea09aa0f26..a0481582187a 100644
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -100,7 +100,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
+ * i_mutex. So we can safely drop the i_data_sem here.
+ */
+ BUG_ON(EXT4_JOURNAL(inode) == NULL);
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ *dropped = 1;
+ return 0;
+@@ -4266,7 +4266,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
+ /* free data blocks we just allocated */
+ /* not a good idea to call discard here directly,
+ * but otherwise we'd need to call it every free() */
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+ ext4_free_blocks(handle, inode, NULL, newblock,
+ EXT4_C2B(sbi, allocated_clusters), fb_flags);
+ goto out2;
+@@ -5293,7 +5293,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+ }
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+
+ ret = ext4_es_remove_extent(inode, punch_start,
+ EXT_MAX_BLOCKS - punch_start);
+@@ -5307,7 +5307,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+
+ ret = ext4_ext_shift_extents(inode, handle, punch_stop,
+ punch_stop - punch_start, SHIFT_LEFT);
+@@ -5439,7 +5439,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
+ goto out_stop;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+
+ path = ext4_find_extent(inode, offset_lblk, NULL, 0);
+ if (IS_ERR(path)) {
+diff --git a/fs/ext4/file.c b/fs/ext4/file.c
+index 7a2720517bbb..e608ce3fb535 100644
+--- a/fs/ext4/file.c
++++ b/fs/ext4/file.c
+@@ -147,7 +147,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
+ (atomic_read(&inode->i_writecount) == 1) &&
+ !EXT4_I(inode)->i_reserved_data_blocks) {
+ down_write(&EXT4_I(inode)->i_data_sem);
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ }
+ if (is_dx(inode) && filp->private_data)
+diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
+index 433ca8415c5a..80c9f33800be 100644
+--- a/fs/ext4/indirect.c
++++ b/fs/ext4/indirect.c
+@@ -696,7 +696,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
+ * i_mutex. So we can safely drop the i_data_sem here.
+ */
+ BUG_ON(EXT4_JOURNAL(inode) == NULL);
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ *dropped = 1;
+ return 0;
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 0b07576af3bf..77543f988258 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -383,7 +383,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
+ */
+ if ((ei->i_reserved_data_blocks == 0) &&
+ !inode_is_open_for_write(inode))
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+ }
+
+ static int __check_block_validity(struct inode *inode, const char *func,
+@@ -4055,7 +4055,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
+ if (stop_block > first_block) {
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+
+ ret = ext4_es_remove_extent(inode, first_block,
+ stop_block - first_block);
+@@ -4210,7 +4210,7 @@ int ext4_truncate(struct inode *inode)
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ err = ext4_ext_truncate(handle, inode);
+diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
+index 6e70a63dcca7..36eca3bc036a 100644
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -202,7 +202,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
+ reset_inode_seed(inode);
+ reset_inode_seed(inode_bl);
+
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err < 0) {
+diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
+index 45ac6088b4ac..132c118d12e1 100644
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -2878,6 +2878,7 @@ int ext4_mb_init(struct super_block *sb)
+ if (!sbi->s_mb_c3_blocks)
+ sbi->s_mb_c3_blocks =
+ THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C3_THRESHOLD);
++ sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
+ /*
+ * The default group preallocation is 512, which for 4k block
+ * sizes translates to 2 megabytes. However for bigalloc file
+@@ -3816,6 +3817,26 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+ mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
+ }
+
++static void ext4_mb_mark_pa_deleted(struct super_block *sb,
++ struct ext4_prealloc_space *pa)
++{
++ struct ext4_inode_info *ei;
++
++ if (pa->pa_deleted) {
++ ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
++ pa->pa_type, pa->pa_pstart, pa->pa_lstart,
++ pa->pa_len);
++ return;
++ }
++
++ pa->pa_deleted = 1;
++
++ if (pa->pa_type == MB_INODE_PA) {
++ ei = EXT4_I(pa->pa_inode);
++ atomic_dec(&ei->i_prealloc_active);
++ }
++}
++
+ static void ext4_mb_pa_callback(struct rcu_head *head)
+ {
+ struct ext4_prealloc_space *pa;
+@@ -3848,7 +3869,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
+ return;
+ }
+
+- pa->pa_deleted = 1;
++ ext4_mb_mark_pa_deleted(sb, pa);
+ spin_unlock(&pa->pa_lock);
+
+ grp_blk = pa->pa_pstart;
+@@ -3972,6 +3993,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
+ spin_lock(pa->pa_obj_lock);
+ list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
+ spin_unlock(pa->pa_obj_lock);
++ atomic_inc(&ei->i_prealloc_active);
+ }
+
+ /*
+@@ -4182,7 +4204,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
+ }
+
+ /* seems this one can be freed ... */
+- pa->pa_deleted = 1;
++ ext4_mb_mark_pa_deleted(sb, pa);
+
+ /* we can trust pa_free ... */
+ free += pa->pa_free;
+@@ -4245,7 +4267,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
+ *
+ * FIXME!! Make sure it is valid at all the call sites
+ */
+-void ext4_discard_preallocations(struct inode *inode)
++void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
+ {
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct super_block *sb = inode->i_sb;
+@@ -4263,14 +4285,18 @@ void ext4_discard_preallocations(struct inode *inode)
+
+ mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
+- trace_ext4_discard_preallocations(inode);
++ trace_ext4_discard_preallocations(inode,
++ atomic_read(&ei->i_prealloc_active), needed);
+
+ INIT_LIST_HEAD(&list);
+
++ if (needed == 0)
++ needed = UINT_MAX;
++
+ repeat:
+ /* first, collect all pa's in the inode */
+ spin_lock(&ei->i_prealloc_lock);
+- while (!list_empty(&ei->i_prealloc_list)) {
+- pa = list_entry(ei->i_prealloc_list.next,
++ while (!list_empty(&ei->i_prealloc_list) && needed) {
++ pa = list_entry(ei->i_prealloc_list.prev,
+ struct ext4_prealloc_space, pa_inode_list);
+ BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
+ spin_lock(&pa->pa_lock);
+@@ -4288,10 +4314,11 @@ void ext4_discard_preallocations(struct inode *inode)
+
+ }
+ if (pa->pa_deleted == 0) {
+- pa->pa_deleted = 1;
++ ext4_mb_mark_pa_deleted(sb, pa);
+ spin_unlock(&pa->pa_lock);
+ list_del_rcu(&pa->pa_inode_list);
+ list_add(&pa->u.pa_tmp_list, &list);
++ needed--;
+ continue;
+ }
+
+@@ -4592,7 +4619,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
+ BUG_ON(pa->pa_type != MB_GROUP_PA);
+
+ /* seems this one can be freed ... */
+- pa->pa_deleted = 1;
++ ext4_mb_mark_pa_deleted(sb, pa);
+ spin_unlock(&pa->pa_lock);
+
+ list_del_rcu(&pa->pa_inode_list);
+@@ -4690,11 +4717,30 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
+ return ;
+ }
+
++/*
++ * if per-inode prealloc list is too long, trim some PA
++ */
++static void ext4_mb_trim_inode_pa(struct inode *inode)
++{
++ struct ext4_inode_info *ei = EXT4_I(inode);
++ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
++ int count, delta;
++
++ count = atomic_read(&ei->i_prealloc_active);
++ delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
++ if (count > sbi->s_mb_max_inode_prealloc + delta) {
++ count -= sbi->s_mb_max_inode_prealloc;
++ ext4_discard_preallocations(inode, count);
++ }
++}
++
+ /*
+ * release all resource we used in allocation
+ */
+ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
+ {
++ struct inode *inode = ac->ac_inode;
++ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_prealloc_space *pa = ac->ac_pa;
+ if (pa) {
+@@ -4720,6 +4766,17 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
+ ext4_mb_add_n_trim(ac);
+ }
+ }
++
++ if (pa->pa_type == MB_INODE_PA) {
++ /*
++ * treat per-inode prealloc list as a lru list, then try
++ * to trim the least recently used PA.
++ */
++ spin_lock(pa->pa_obj_lock);
++ list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
++ spin_unlock(pa->pa_obj_lock);
++ }
++
+ ext4_mb_put_pa(ac, ac->ac_sb, pa);
+ }
+ if (ac->ac_bitmap_page)
+@@ -4729,6 +4786,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
+ if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
+ mutex_unlock(&ac->ac_lg->lg_mutex);
+ ext4_mb_collect_stats(ac);
++ ext4_mb_trim_inode_pa(inode);
+ return 0;
+ }
+
+diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
+index 6b4d17c2935d..e75b4749aa1c 100644
+--- a/fs/ext4/mballoc.h
++++ b/fs/ext4/mballoc.h
+@@ -73,6 +73,10 @@
+ */
+ #define MB_DEFAULT_GROUP_PREALLOC 512
+
++/*
++ * maximum length of inode prealloc list
++ */
++#define MB_DEFAULT_MAX_INODE_PREALLOC 512
+
+ struct ext4_free_data {
+ /* this links the free block information from sb_info */
+diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
+index 1ed86fb6c302..0d601b822875 100644
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -686,8 +686,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
+
+ out:
+ if (*moved_len) {
+- ext4_discard_preallocations(orig_inode);
+- ext4_discard_preallocations(donor_inode);
++ ext4_discard_preallocations(orig_inode, 0);
++ ext4_discard_preallocations(donor_inode, 0);
+ }
+
+ ext4_ext_drop_refs(path);
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index daa94c7f7271..13bdddc081e0 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -1127,6 +1127,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
+ spin_lock_init(&ei->i_raw_lock);
+ sema_init(&ei->i_append_sem, 1);
+ INIT_LIST_HEAD(&ei->i_prealloc_list);
++ atomic_set(&ei->i_prealloc_active, 0);
+ spin_lock_init(&ei->i_prealloc_lock);
+ ext4_es_init_tree(&ei->i_es_tree);
+ rwlock_init(&ei->i_es_lock);
+@@ -1220,7 +1221,7 @@ void ext4_clear_inode(struct inode *inode)
+ invalidate_inode_buffers(inode);
+ clear_inode(inode);
+ dquot_drop(inode);
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+ ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+ if (EXT4_I(inode)->jinode) {
+ jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
+diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
+index 7fee11cc30e7..bfabb799fa45 100644
+--- a/fs/ext4/sysfs.c
++++ b/fs/ext4/sysfs.c
+@@ -218,6 +218,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
+ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
+ EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
+ EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
++EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc);
+ EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
+ EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
+ EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
+@@ -264,6 +265,7 @@ static struct attribute *ext4_attrs[] = {
+ ATTR_LIST(mb_order2_req),
+ ATTR_LIST(mb_stream_req),
+ ATTR_LIST(mb_group_prealloc),
++ ATTR_LIST(mb_max_inode_prealloc),
+ ATTR_LIST(max_writeback_mb_bump),
+ ATTR_LIST(extent_max_zeroout_kb),
+ ATTR_LIST(trigger_fs_error),
+diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
+index 8008d2e116b9..4c8b99ec8606 100644
+--- a/include/trace/events/ext4.h
++++ b/include/trace/events/ext4.h
+@@ -746,24 +746,29 @@ TRACE_EVENT(ext4_mb_release_group_pa,
+ );
+
+ TRACE_EVENT(ext4_discard_preallocations,
+- TP_PROTO(struct inode *inode),
++ TP_PROTO(struct inode *inode, unsigned int len, unsigned int needed),
+
+- TP_ARGS(inode),
++ TP_ARGS(inode, len, needed),
+
+ TP_STRUCT__entry(
+- __field( dev_t, dev )
+- __field( ino_t, ino )
++ __field( dev_t, dev )
++ __field( ino_t, ino )
++ __field( unsigned int, len )
++ __field( unsigned int, needed )
+
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
++ __entry->len = len;
++ __entry->needed = needed;
+ ),
+
+- TP_printk("dev %d,%d ino %lu",
++ TP_printk("dev %d,%d ino %lu len: %u needed %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+- (unsigned long) __entry->ino)
++ (unsigned long) __entry->ino, __entry->len,
++ __entry->needed)
+ );
+
+ TRACE_EVENT(ext4_mb_discard_preallocations,
--- /dev/null
+commit 27bc446e2def38db3244a6eb4bb1d6312936610a
+Author: brookxu <brookxu.cn@gmail.com>
+Date: Mon Aug 17 15:36:15 2020 +0800
+
+ext4: limit the length of per-inode prealloc list
+
+In the scenario of writing sparse files, the per-inode prealloc list may
+be very long, resulting in high overhead for ext4_mb_use_preallocated().
+To circumvent this problem, we limit the maximum length of per-inode
+prealloc list to 512 and allow users to modify it.
+
+After patching, we observed that the sys ratio of cpu has dropped, and
+the system throughput has increased significantly. We created a process
+to write the sparse file, and the running time of the process on the
+fixed kernel was significantly reduced, as follows:
+
+Running time on unfixed kernel:
+ [root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat
+ real 0m2.051s
+ user 0m0.008s
+ sys 0m2.026s
+
+Running time on fixed kernel:
+ [root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat
+ real 0m0.471s
+ user 0m0.004s
+ sys 0m0.395s
+
+Signed-off-by: Chunguang Xu <brookxu@tencent.com>
+Link: https://lore.kernel.org/r/d7a98178-056b-6db5-6bce-4ead23f4a257@gmail.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 71b4370a3f91..523e00d7b392 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1070,6 +1070,7 @@ struct ext4_inode_info {
+ struct timespec64 i_crtime;
+
+ /* mballoc */
++ atomic_t i_prealloc_active;
+ struct list_head i_prealloc_list;
+ spinlock_t i_prealloc_lock;
+
+@@ -1518,6 +1519,7 @@ struct ext4_sb_info {
+ unsigned int s_mb_stats;
+ unsigned int s_mb_order2_reqs;
+ unsigned int s_mb_group_prealloc;
++ unsigned int s_mb_max_inode_prealloc;
+ unsigned int s_max_dir_size_kb;
+ /* where last allocation was done - for stream allocation */
+ unsigned long s_mb_last_group;
+@@ -2682,7 +2684,7 @@ extern int ext4_mb_release(struct super_block *);
+ extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
+ struct ext4_allocation_request *, int *);
+ extern int ext4_mb_reserve_blocks(struct super_block *, int);
+-extern void ext4_discard_preallocations(struct inode *);
++extern void ext4_discard_preallocations(struct inode *, unsigned int);
+ extern int __init ext4_init_mballoc(void);
+ extern void ext4_exit_mballoc(void);
+ extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
+diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
+index 0eea09aa0f26..a0481582187a 100644
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -100,7 +100,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
+ * i_mutex. So we can safely drop the i_data_sem here.
+ */
+ BUG_ON(EXT4_JOURNAL(inode) == NULL);
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ *dropped = 1;
+ return 0;
+@@ -4266,7 +4266,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
+ /* free data blocks we just allocated */
+ /* not a good idea to call discard here directly,
+ * but otherwise we'd need to call it every free() */
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+ ext4_free_blocks(handle, inode, NULL, newblock,
+ EXT4_C2B(sbi, allocated_clusters), fb_flags);
+ goto out2;
+@@ -5293,7 +5293,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+ }
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+
+ ret = ext4_es_remove_extent(inode, punch_start,
+ EXT_MAX_BLOCKS - punch_start);
+@@ -5307,7 +5307,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+
+ ret = ext4_ext_shift_extents(inode, handle, punch_stop,
+ punch_stop - punch_start, SHIFT_LEFT);
+@@ -5439,7 +5439,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
+ goto out_stop;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+
+ path = ext4_find_extent(inode, offset_lblk, NULL, 0);
+ if (IS_ERR(path)) {
+diff --git a/fs/ext4/file.c b/fs/ext4/file.c
+index 7a2720517bbb..e608ce3fb535 100644
+--- a/fs/ext4/file.c
++++ b/fs/ext4/file.c
+@@ -147,7 +147,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
+ (atomic_read(&inode->i_writecount) == 1) &&
+ !EXT4_I(inode)->i_reserved_data_blocks) {
+ down_write(&EXT4_I(inode)->i_data_sem);
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ }
+ if (is_dx(inode) && filp->private_data)
+diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
+index 433ca8415c5a..80c9f33800be 100644
+--- a/fs/ext4/indirect.c
++++ b/fs/ext4/indirect.c
+@@ -696,7 +696,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
+ * i_mutex. So we can safely drop the i_data_sem here.
+ */
+ BUG_ON(EXT4_JOURNAL(inode) == NULL);
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ *dropped = 1;
+ return 0;
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 0b07576af3bf..77543f988258 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -383,7 +383,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
+ */
+ if ((ei->i_reserved_data_blocks == 0) &&
+ !inode_is_open_for_write(inode))
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+ }
+
+ static int __check_block_validity(struct inode *inode, const char *func,
+@@ -4055,7 +4055,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
+ if (stop_block > first_block) {
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+
+ ret = ext4_es_remove_extent(inode, first_block,
+ stop_block - first_block);
+@@ -4210,7 +4210,7 @@ int ext4_truncate(struct inode *inode)
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ err = ext4_ext_truncate(handle, inode);
+diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
+index 6e70a63dcca7..36eca3bc036a 100644
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -202,7 +202,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
+ reset_inode_seed(inode);
+ reset_inode_seed(inode_bl);
+
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err < 0) {
+diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
+index 45ac6088b4ac..132c118d12e1 100644
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -2878,6 +2878,7 @@ int ext4_mb_init(struct super_block *sb)
+ if (!sbi->s_mb_c3_blocks)
+ sbi->s_mb_c3_blocks =
+ THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C3_THRESHOLD);
++ sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
+ /*
+ * The default group preallocation is 512, which for 4k block
+ * sizes translates to 2 megabytes. However for bigalloc file
+@@ -3816,6 +3817,26 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+ mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
+ }
+
++static void ext4_mb_mark_pa_deleted(struct super_block *sb,
++ struct ext4_prealloc_space *pa)
++{
++ struct ext4_inode_info *ei;
++
++ if (pa->pa_deleted) {
++ ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
++ pa->pa_type, pa->pa_pstart, pa->pa_lstart,
++ pa->pa_len);
++ return;
++ }
++
++ pa->pa_deleted = 1;
++
++ if (pa->pa_type == MB_INODE_PA) {
++ ei = EXT4_I(pa->pa_inode);
++ atomic_dec(&ei->i_prealloc_active);
++ }
++}
++
+ static void ext4_mb_pa_callback(struct rcu_head *head)
+ {
+ struct ext4_prealloc_space *pa;
+@@ -3848,7 +3869,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
+ return;
+ }
+
+- pa->pa_deleted = 1;
++ ext4_mb_mark_pa_deleted(sb, pa);
+ spin_unlock(&pa->pa_lock);
+
+ grp_blk = pa->pa_pstart;
+@@ -3972,6 +3993,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
+ spin_lock(pa->pa_obj_lock);
+ list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
+ spin_unlock(pa->pa_obj_lock);
++ atomic_inc(&ei->i_prealloc_active);
+ }
+
+ /*
+@@ -4182,7 +4204,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
+ }
+
+ /* seems this one can be freed ... */
+- pa->pa_deleted = 1;
++ ext4_mb_mark_pa_deleted(sb, pa);
+
+ /* we can trust pa_free ... */
+ free += pa->pa_free;
+@@ -4245,7 +4267,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
+ *
+ * FIXME!! Make sure it is valid at all the call sites
+ */
+-void ext4_discard_preallocations(struct inode *inode)
++void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
+ {
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct super_block *sb = inode->i_sb;
+@@ -4263,15 +4285,19 @@ void ext4_discard_preallocations(struct inode *inode)
+
+ mb_debug(sb, "discard preallocation for inode %lu\n",
+ inode->i_ino);
+- trace_ext4_discard_preallocations(inode);
++ trace_ext4_discard_preallocations(inode,
++ atomic_read(&ei->i_prealloc_active), needed);
+
+ INIT_LIST_HEAD(&list);
+
++ if (needed == 0)
++ needed = UINT_MAX;
++
+ repeat:
+ /* first, collect all pa's in the inode */
+ spin_lock(&ei->i_prealloc_lock);
+- while (!list_empty(&ei->i_prealloc_list)) {
+- pa = list_entry(ei->i_prealloc_list.next,
++ while (!list_empty(&ei->i_prealloc_list) && needed) {
++ pa = list_entry(ei->i_prealloc_list.prev,
+ struct ext4_prealloc_space, pa_inode_list);
+ BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
+ spin_lock(&pa->pa_lock);
+@@ -4288,10 +4314,11 @@ void ext4_discard_preallocations(struct inode *inode)
+
+ }
+ if (pa->pa_deleted == 0) {
+- pa->pa_deleted = 1;
++ ext4_mb_mark_pa_deleted(sb, pa);
+ spin_unlock(&pa->pa_lock);
+ list_del_rcu(&pa->pa_inode_list);
+ list_add(&pa->u.pa_tmp_list, &list);
++ needed--;
+ continue;
+ }
+
+@@ -4592,7 +4619,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
+ BUG_ON(pa->pa_type != MB_GROUP_PA);
+
+ /* seems this one can be freed ... */
+- pa->pa_deleted = 1;
++ ext4_mb_mark_pa_deleted(sb, pa);
+ spin_unlock(&pa->pa_lock);
+
+ list_del_rcu(&pa->pa_inode_list);
+@@ -4690,11 +4717,30 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
+ return ;
+ }
+
++/*
++ * if per-inode prealloc list is too long, trim some PA
++ */
++static void ext4_mb_trim_inode_pa(struct inode *inode)
++{
++ struct ext4_inode_info *ei = EXT4_I(inode);
++ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
++ int count, delta;
++
++ count = atomic_read(&ei->i_prealloc_active);
++ delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
++ if (count > sbi->s_mb_max_inode_prealloc + delta) {
++ count -= sbi->s_mb_max_inode_prealloc;
++ ext4_discard_preallocations(inode, count);
++ }
++}
++
+ /*
+ * release all resource we used in allocation
+ */
+ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
+ {
++ struct inode *inode = ac->ac_inode;
++ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_prealloc_space *pa = ac->ac_pa;
+ if (pa) {
+@@ -4720,6 +4766,17 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
+ ext4_mb_add_n_trim(ac);
+ }
+ }
++
++ if (pa->pa_type == MB_INODE_PA) {
++ /*
++ * treat per-inode prealloc list as a lru list, then try
++ * to trim the least recently used PA.
++ */
++ spin_lock(pa->pa_obj_lock);
++ list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
++ spin_unlock(pa->pa_obj_lock);
++ }
++
+ ext4_mb_put_pa(ac, ac->ac_sb, pa);
+ }
+ if (ac->ac_bitmap_page)
+@@ -4729,6 +4786,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
+ if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
+ mutex_unlock(&ac->ac_lg->lg_mutex);
+ ext4_mb_collect_stats(ac);
++ ext4_mb_trim_inode_pa(inode);
+ return 0;
+ }
+
+diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
+index 6b4d17c2935d..e75b4749aa1c 100644
+--- a/fs/ext4/mballoc.h
++++ b/fs/ext4/mballoc.h
+@@ -73,6 +73,10 @@
+ */
+ #define MB_DEFAULT_GROUP_PREALLOC 512
+
++/*
++ * maximum length of inode prealloc list
++ */
++#define MB_DEFAULT_MAX_INODE_PREALLOC 512
+
+ struct ext4_free_data {
+ /* this links the free block information from sb_info */
+diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
+index 1ed86fb6c302..0d601b822875 100644
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -686,8 +686,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
+
+ out:
+ if (*moved_len) {
+- ext4_discard_preallocations(orig_inode);
+- ext4_discard_preallocations(donor_inode);
++ ext4_discard_preallocations(orig_inode, 0);
++ ext4_discard_preallocations(donor_inode, 0);
+ }
+
+ ext4_ext_drop_refs(path);
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index daa94c7f7271..13bdddc081e0 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -1127,6 +1127,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
+ spin_lock_init(&ei->i_raw_lock);
+ sema_init(&ei->i_append_sem, 1);
+ INIT_LIST_HEAD(&ei->i_prealloc_list);
++ atomic_set(&ei->i_prealloc_active, 0);
+ spin_lock_init(&ei->i_prealloc_lock);
+ ext4_es_init_tree(&ei->i_es_tree);
+ rwlock_init(&ei->i_es_lock);
+@@ -1220,7 +1221,7 @@ void ext4_clear_inode(struct inode *inode)
+ invalidate_inode_buffers(inode);
+ clear_inode(inode);
+ dquot_drop(inode);
+- ext4_discard_preallocations(inode);
++ ext4_discard_preallocations(inode, 0);
+ ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+ if (EXT4_I(inode)->jinode) {
+ jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
+diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
+index 7fee11cc30e7..bfabb799fa45 100644
+--- a/fs/ext4/sysfs.c
++++ b/fs/ext4/sysfs.c
+@@ -218,6 +218,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
+ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
+ EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
+ EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
++EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc);
+ EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
+ EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
+ EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
+@@ -264,6 +265,7 @@ static struct attribute *ext4_attrs[] = {
+ ATTR_LIST(mb_order2_req),
+ ATTR_LIST(mb_stream_req),
+ ATTR_LIST(mb_group_prealloc),
++ ATTR_LIST(mb_max_inode_prealloc),
+ ATTR_LIST(max_writeback_mb_bump),
+ ATTR_LIST(extent_max_zeroout_kb),
+ ATTR_LIST(trigger_fs_error),
+diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
+index 8008d2e116b9..4c8b99ec8606 100644
+--- a/include/trace/events/ext4.h
++++ b/include/trace/events/ext4.h
+@@ -746,24 +746,29 @@ TRACE_EVENT(ext4_mb_release_group_pa,
+ );
+
+ TRACE_EVENT(ext4_discard_preallocations,
+- TP_PROTO(struct inode *inode),
++ TP_PROTO(struct inode *inode, unsigned int len, unsigned int needed),
+
+- TP_ARGS(inode),
++ TP_ARGS(inode, len, needed),
+
+ TP_STRUCT__entry(
+- __field( dev_t, dev )
+- __field( ino_t, ino )
++ __field( dev_t, dev )
++ __field( ino_t, ino )
++ __field( unsigned int, len )
++ __field( unsigned int, needed )
+
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
++ __entry->len = len;
++ __entry->needed = needed;
+ ),
+
+- TP_printk("dev %d,%d ino %lu",
++ TP_printk("dev %d,%d ino %lu len: %u needed %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+- (unsigned long) __entry->ino)
++ (unsigned long) __entry->ino, __entry->len,
++ __entry->needed)
+ );
+
+ TRACE_EVENT(ext4_mb_discard_preallocations,