1 commit 27bc446e2def38db3244a6eb4bb1d6312936610a
2 Author: brookxu <brookxu.cn@gmail.com>
3 Date: Mon Aug 17 15:36:15 2020 +0800
5 ext4: limit the length of per-inode prealloc list
7 When writing sparse files, the per-inode prealloc list may become
8 very long, resulting in high overhead for ext4_mb_use_preallocated().
9 To circumvent this problem, we limit the maximum length of the per-inode
10 prealloc list to 512 by default and allow users to modify it via the mb_max_inode_prealloc sysfs attribute.
12 After patching, we observed that the share of CPU time spent in system
13 mode dropped and system throughput increased significantly. We created a
14 process that writes a sparse file; its running time on the
15 fixed kernel was significantly reduced, as follows:
17 Running time on unfixed kernel:
18 [root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat
23 Running time on fixed kernel:
24 [root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat
29 Signed-off-by: Chunguang Xu <brookxu@tencent.com>
30 Link: https://lore.kernel.org/r/d7a98178-056b-6db5-6bce-4ead23f4a257@gmail.com
31 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
33 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
34 index 71b4370a3f91..523e00d7b392 100644
37 @@ -1070,6 +1070,7 @@ struct ext4_inode_info {
38 struct timespec64 i_crtime;
41 + atomic_t i_prealloc_active;
42 struct list_head i_prealloc_list;
43 spinlock_t i_prealloc_lock;
45 @@ -1518,6 +1519,7 @@ struct ext4_sb_info {
46 unsigned int s_mb_stats;
47 unsigned int s_mb_order2_reqs;
48 unsigned int s_mb_group_prealloc;
49 + unsigned int s_mb_max_inode_prealloc;
50 unsigned int s_max_dir_size_kb;
51 /* where last allocation was done - for stream allocation */
52 unsigned long s_mb_last_group;
53 @@ -2682,7 +2684,7 @@ extern int ext4_mb_release(struct super_block *);
54 extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
55 struct ext4_allocation_request *, int *);
56 extern int ext4_mb_reserve_blocks(struct super_block *, int);
57 -extern void ext4_discard_preallocations(struct inode *);
58 +extern void ext4_discard_preallocations(struct inode *, unsigned int);
59 extern int __init ext4_init_mballoc(void);
60 extern void ext4_exit_mballoc(void);
61 extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
62 diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
63 index 0eea09aa0f26..a0481582187a 100644
64 --- a/fs/ext4/extents.c
65 +++ b/fs/ext4/extents.c
66 @@ -100,7 +100,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
67 * i_mutex. So we can safely drop the i_data_sem here.
69 BUG_ON(EXT4_JOURNAL(inode) == NULL);
70 - ext4_discard_preallocations(inode);
71 + ext4_discard_preallocations(inode, 0);
72 up_write(&EXT4_I(inode)->i_data_sem);
75 @@ -4266,7 +4266,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
76 /* free data blocks we just allocated */
77 /* not a good idea to call discard here directly,
78 * but otherwise we'd need to call it every free() */
79 - ext4_discard_preallocations(inode);
80 + ext4_discard_preallocations(inode, 0);
81 ext4_free_blocks(handle, inode, NULL, newblock,
82 EXT4_C2B(sbi, allocated_clusters), fb_flags);
84 @@ -5293,7 +5293,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
87 down_write(&EXT4_I(inode)->i_data_sem);
88 - ext4_discard_preallocations(inode);
89 + ext4_discard_preallocations(inode, 0);
91 ret = ext4_es_remove_extent(inode, punch_start,
92 EXT_MAX_BLOCKS - punch_start);
93 @@ -5307,7 +5307,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
94 up_write(&EXT4_I(inode)->i_data_sem);
97 - ext4_discard_preallocations(inode);
98 + ext4_discard_preallocations(inode, 0);
100 ret = ext4_ext_shift_extents(inode, handle, punch_stop,
101 punch_stop - punch_start, SHIFT_LEFT);
102 @@ -5439,7 +5439,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
105 down_write(&EXT4_I(inode)->i_data_sem);
106 - ext4_discard_preallocations(inode);
107 + ext4_discard_preallocations(inode, 0);
109 path = ext4_find_extent(inode, offset_lblk, NULL, 0);
111 diff --git a/fs/ext4/file.c b/fs/ext4/file.c
112 index 7a2720517bbb..e608ce3fb535 100644
115 @@ -147,7 +147,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
116 (atomic_read(&inode->i_writecount) == 1) &&
117 !EXT4_I(inode)->i_reserved_data_blocks) {
118 down_write(&EXT4_I(inode)->i_data_sem);
119 - ext4_discard_preallocations(inode);
120 + ext4_discard_preallocations(inode, 0);
121 up_write(&EXT4_I(inode)->i_data_sem);
123 if (is_dx(inode) && filp->private_data)
124 diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
125 index 433ca8415c5a..80c9f33800be 100644
126 --- a/fs/ext4/indirect.c
127 +++ b/fs/ext4/indirect.c
128 @@ -696,7 +696,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
129 * i_mutex. So we can safely drop the i_data_sem here.
131 BUG_ON(EXT4_JOURNAL(inode) == NULL);
132 - ext4_discard_preallocations(inode);
133 + ext4_discard_preallocations(inode, 0);
134 up_write(&EXT4_I(inode)->i_data_sem);
137 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
138 index 0b07576af3bf..77543f988258 100644
139 --- a/fs/ext4/inode.c
140 +++ b/fs/ext4/inode.c
141 @@ -383,7 +383,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
143 if ((ei->i_reserved_data_blocks == 0) &&
144 !inode_is_open_for_write(inode))
145 - ext4_discard_preallocations(inode);
146 + ext4_discard_preallocations(inode, 0);
149 static int __check_block_validity(struct inode *inode, const char *func,
150 @@ -4055,7 +4055,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
151 if (stop_block > first_block) {
153 down_write(&EXT4_I(inode)->i_data_sem);
154 - ext4_discard_preallocations(inode);
155 + ext4_discard_preallocations(inode, 0);
157 ret = ext4_es_remove_extent(inode, first_block,
158 stop_block - first_block);
159 @@ -4210,7 +4210,7 @@ int ext4_truncate(struct inode *inode)
161 down_write(&EXT4_I(inode)->i_data_sem);
163 - ext4_discard_preallocations(inode);
164 + ext4_discard_preallocations(inode, 0);
166 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
167 err = ext4_ext_truncate(handle, inode);
168 diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
169 index 6e70a63dcca7..36eca3bc036a 100644
170 --- a/fs/ext4/ioctl.c
171 +++ b/fs/ext4/ioctl.c
172 @@ -202,7 +202,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
173 reset_inode_seed(inode);
174 reset_inode_seed(inode_bl);
176 - ext4_discard_preallocations(inode);
177 + ext4_discard_preallocations(inode, 0);
179 err = ext4_mark_inode_dirty(handle, inode);
181 diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
182 index 45ac6088b4ac..132c118d12e1 100644
183 --- a/fs/ext4/mballoc.c
184 +++ b/fs/ext4/mballoc.c
185 @@ -2878,6 +2878,7 @@ int ext4_mb_init(struct super_block *sb)
186 if (!sbi->s_mb_c3_blocks)
187 sbi->s_mb_c3_blocks =
188 THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C3_THRESHOLD);
189 + sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
191 * The default group preallocation is 512, which for 4k block
192 * sizes translates to 2 megabytes. However for bigalloc file
193 @@ -3816,6 +3817,26 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
194 mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
197 +static void ext4_mb_mark_pa_deleted(struct super_block *sb,
198 + struct ext4_prealloc_space *pa)
200 + struct ext4_inode_info *ei;
202 + if (pa->pa_deleted) {
203 + ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
204 + pa->pa_type, pa->pa_pstart, pa->pa_lstart,
209 + pa->pa_deleted = 1;
211 + if (pa->pa_type == MB_INODE_PA) {
212 + ei = EXT4_I(pa->pa_inode);
213 + atomic_dec(&ei->i_prealloc_active);
217 static void ext4_mb_pa_callback(struct rcu_head *head)
219 struct ext4_prealloc_space *pa;
220 @@ -3848,7 +3869,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
224 - pa->pa_deleted = 1;
225 + ext4_mb_mark_pa_deleted(sb, pa);
226 spin_unlock(&pa->pa_lock);
228 grp_blk = pa->pa_pstart;
229 @@ -3972,6 +3993,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
230 spin_lock(pa->pa_obj_lock);
231 list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
232 spin_unlock(pa->pa_obj_lock);
233 + atomic_inc(&ei->i_prealloc_active);
237 @@ -4182,7 +4204,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
240 /* seems this one can be freed ... */
241 - pa->pa_deleted = 1;
242 + ext4_mb_mark_pa_deleted(sb, pa);
244 /* we can trust pa_free ... */
246 @@ -4245,7 +4267,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
248 * FIXME!! Make sure it is valid at all the call sites
250 -void ext4_discard_preallocations(struct inode *inode)
251 +void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
253 struct ext4_inode_info *ei = EXT4_I(inode);
254 struct super_block *sb = inode->i_sb;
255 @@ -4263,15 +4285,19 @@ void ext4_discard_preallocations(struct inode *inode)
257 mb_debug(sb, "discard preallocation for inode %lu\n",
259 - trace_ext4_discard_preallocations(inode);
260 + trace_ext4_discard_preallocations(inode,
261 + atomic_read(&ei->i_prealloc_active), needed);
263 INIT_LIST_HEAD(&list);
269 /* first, collect all pa's in the inode */
270 spin_lock(&ei->i_prealloc_lock);
271 - while (!list_empty(&ei->i_prealloc_list)) {
272 - pa = list_entry(ei->i_prealloc_list.next,
273 + while (!list_empty(&ei->i_prealloc_list) && needed) {
274 + pa = list_entry(ei->i_prealloc_list.prev,
275 struct ext4_prealloc_space, pa_inode_list);
276 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
277 spin_lock(&pa->pa_lock);
278 @@ -4288,10 +4314,11 @@ void ext4_discard_preallocations(struct inode *inode)
281 if (pa->pa_deleted == 0) {
282 - pa->pa_deleted = 1;
283 + ext4_mb_mark_pa_deleted(sb, pa);
284 spin_unlock(&pa->pa_lock);
285 list_del_rcu(&pa->pa_inode_list);
286 list_add(&pa->u.pa_tmp_list, &list);
291 @@ -4592,7 +4619,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
292 BUG_ON(pa->pa_type != MB_GROUP_PA);
294 /* seems this one can be freed ... */
295 - pa->pa_deleted = 1;
296 + ext4_mb_mark_pa_deleted(sb, pa);
297 spin_unlock(&pa->pa_lock);
299 list_del_rcu(&pa->pa_inode_list);
300 @@ -4690,11 +4717,30 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
305 + * if per-inode prealloc list is too long, trim some PA
307 +static void ext4_mb_trim_inode_pa(struct inode *inode)
309 + struct ext4_inode_info *ei = EXT4_I(inode);
310 + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
313 + count = atomic_read(&ei->i_prealloc_active);
314 + delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
315 + if (count > sbi->s_mb_max_inode_prealloc + delta) {
316 + count -= sbi->s_mb_max_inode_prealloc;
317 + ext4_discard_preallocations(inode, count);
322 * release all resource we used in allocation
324 static int ext4_mb_release_context(struct ext4_allocation_context *ac)
326 + struct inode *inode = ac->ac_inode;
327 + struct ext4_inode_info *ei = EXT4_I(inode);
328 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
329 struct ext4_prealloc_space *pa = ac->ac_pa;
331 @@ -4720,6 +4766,17 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
332 ext4_mb_add_n_trim(ac);
336 + if (pa->pa_type == MB_INODE_PA) {
338 + * treat per-inode prealloc list as a lru list, then try
339 + * to trim the least recently used PA.
341 + spin_lock(pa->pa_obj_lock);
342 + list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
343 + spin_unlock(pa->pa_obj_lock);
346 ext4_mb_put_pa(ac, ac->ac_sb, pa);
348 if (ac->ac_bitmap_page)
349 @@ -4729,6 +4786,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
350 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
351 mutex_unlock(&ac->ac_lg->lg_mutex);
352 ext4_mb_collect_stats(ac);
353 + ext4_mb_trim_inode_pa(inode);
357 diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
358 index 6b4d17c2935d..e75b4749aa1c 100644
359 --- a/fs/ext4/mballoc.h
360 +++ b/fs/ext4/mballoc.h
363 #define MB_DEFAULT_GROUP_PREALLOC 512
366 + * maximum length of inode prealloc list
368 +#define MB_DEFAULT_MAX_INODE_PREALLOC 512
370 struct ext4_free_data {
371 /* this links the free block information from sb_info */
372 diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
373 index 1ed86fb6c302..0d601b822875 100644
374 --- a/fs/ext4/move_extent.c
375 +++ b/fs/ext4/move_extent.c
376 @@ -686,8 +686,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
380 - ext4_discard_preallocations(orig_inode);
381 - ext4_discard_preallocations(donor_inode);
382 + ext4_discard_preallocations(orig_inode, 0);
383 + ext4_discard_preallocations(donor_inode, 0);
386 ext4_ext_drop_refs(path);
387 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
388 index daa94c7f7271..13bdddc081e0 100644
389 --- a/fs/ext4/super.c
390 +++ b/fs/ext4/super.c
391 @@ -1127,6 +1127,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
392 spin_lock_init(&ei->i_raw_lock);
393 sema_init(&ei->i_append_sem, 1);
394 INIT_LIST_HEAD(&ei->i_prealloc_list);
395 + atomic_set(&ei->i_prealloc_active, 0);
396 spin_lock_init(&ei->i_prealloc_lock);
397 ldiskfs_es_init_tree(&ei->i_es_tree);
398 rwlock_init(&ei->i_es_lock);
399 @@ -1220,7 +1221,7 @@ void ext4_clear_inode(struct inode *inode)
400 invalidate_inode_buffers(inode);
403 - ext4_discard_preallocations(inode);
404 + ext4_discard_preallocations(inode, 0);
405 ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
406 if (EXT4_I(inode)->jinode) {
407 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
408 diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
409 index 7fee11cc30e7..bfabb799fa45 100644
410 --- a/fs/ext4/sysfs.c
411 +++ b/fs/ext4/sysfs.c
412 @@ -218,6 +218,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
413 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
414 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
415 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
416 +EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc);
417 EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
418 EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
419 EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
420 @@ -264,6 +265,7 @@ static struct attribute *ext4_attrs[] = {
421 ATTR_LIST(mb_order2_req),
422 ATTR_LIST(mb_stream_req),
423 ATTR_LIST(mb_group_prealloc),
424 + ATTR_LIST(mb_max_inode_prealloc),
425 ATTR_LIST(max_writeback_mb_bump),
426 ATTR_LIST(extent_max_zeroout_kb),
427 ATTR_LIST(trigger_fs_error),
428 diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
429 index 8008d2e116b9..4c8b99ec8606 100644
430 --- a/include/trace/events/ext4.h
431 +++ b/include/trace/events/ext4.h
432 @@ -746,24 +746,29 @@ TRACE_EVENT(ext4_mb_release_group_pa,
435 TRACE_EVENT(ext4_discard_preallocations,
436 - TP_PROTO(struct inode *inode),
437 + TP_PROTO(struct inode *inode, unsigned int len, unsigned int needed),
440 + TP_ARGS(inode, len, needed),
443 - __field( dev_t, dev )
444 - __field( ino_t, ino )
445 + __field( dev_t, dev )
446 + __field( ino_t, ino )
447 + __field( unsigned int, len )
448 + __field( unsigned int, needed )
453 __entry->dev = inode->i_sb->s_dev;
454 __entry->ino = inode->i_ino;
455 + __entry->len = len;
456 + __entry->needed = needed;
459 - TP_printk("dev %d,%d ino %lu",
460 + TP_printk("dev %d,%d ino %lu len: %u needed %u",
461 MAJOR(__entry->dev), MINOR(__entry->dev),
462 - (unsigned long) __entry->ino)
463 + (unsigned long) __entry->ino, __entry->len,
467 TRACE_EVENT(ext4_mb_discard_preallocations,