Whamcloud - gitweb
LU-16691 ldiskfs: limit length of per-inode prealloc list
[fs/lustre-release.git] / ldiskfs / kernel_patches / patches / rhel8 / ext4-limit-per-inode-preallocation-list.patch
1 commit 27bc446e2def38db3244a6eb4bb1d6312936610a
2 Author: brookxu <brookxu.cn@gmail.com>
3 Date:   Mon Aug 17 15:36:15 2020 +0800
4
5 ext4: limit the length of per-inode prealloc list
6
7 In the scenario of writing sparse files, the per-inode prealloc list may
8 be very long, resulting in high overhead for ext4_mb_use_preallocated().
9 To circumvent this problem, we limit the maximum length of per-inode
10 prealloc list to 512 and allow users to modify it.
11
12 After patching, we observed that the share of CPU time spent in sys has
13 dropped and the system throughput has increased significantly. We created
14 a process that writes a sparse file; its running time on the fixed kernel
15 was significantly reduced, as follows:
16
17 Running time on unfixed kernel:
18     [root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat
19     real    0m2.051s
20     user    0m0.008s
21     sys     0m2.026s
22
23 Running time on fixed kernel:
24     [root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat
25     real    0m0.471s
26     user    0m0.004s
27     sys     0m0.395s
28
29 Signed-off-by: Chunguang Xu <brookxu@tencent.com>
30 Link: https://lore.kernel.org/r/d7a98178-056b-6db5-6bce-4ead23f4a257@gmail.com
31 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
32
33 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
34 index 71b4370a3f91..523e00d7b392 100644
35 --- a/fs/ext4/ext4.h
36 +++ b/fs/ext4/ext4.h
37 @@ -1070,6 +1070,7 @@ struct ext4_inode_info {
38         struct timespec64 i_crtime;
39  
40         /* mballoc */
41 +       atomic_t i_prealloc_active;
42         struct list_head i_prealloc_list;
43         spinlock_t i_prealloc_lock;
44  
45 @@ -1518,6 +1519,7 @@ struct ext4_sb_info {
46         unsigned int s_mb_stats;
47         unsigned int s_mb_order2_reqs;
48         unsigned int s_mb_group_prealloc;
49 +       unsigned int s_mb_max_inode_prealloc;
50         unsigned int s_max_dir_size_kb;
51         /* where last allocation was done - for stream allocation */
52         unsigned long s_mb_last_group;
53 @@ -2682,7 +2684,7 @@ extern int ext4_mb_release(struct super_block *);
54  extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
55                                 struct ext4_allocation_request *, int *);
56  extern int ext4_mb_reserve_blocks(struct super_block *, int);
57 -extern void ext4_discard_preallocations(struct inode *);
58 +extern void ext4_discard_preallocations(struct inode *, unsigned int);
59  extern int __init ext4_init_mballoc(void);
60  extern void ext4_exit_mballoc(void);
61  extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
62 diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
63 index 0eea09aa0f26..a0481582187a 100644
64 --- a/fs/ext4/extents.c
65 +++ b/fs/ext4/extents.c
66 @@ -100,7 +100,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
67          * i_mutex. So we can safely drop the i_data_sem here.
68          */
69         BUG_ON(EXT4_JOURNAL(inode) == NULL);
70 -       ext4_discard_preallocations(inode);
71 +       ext4_discard_preallocations(inode, 0);
72         up_write(&EXT4_I(inode)->i_data_sem);
73         *dropped = 1;
74         return 0;
75 @@ -4266,7 +4266,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
76                 /* free data blocks we just allocated */
77                 /* not a good idea to call discard here directly,
78                  * but otherwise we'd need to call it every free() */
79 -               ext4_discard_preallocations(inode);
80 +               ext4_discard_preallocations(inode, 0);
81                 ext4_free_blocks(handle, inode, NULL, newblock,
82                                  EXT4_C2B(sbi, allocated_clusters), fb_flags);
83                 goto out2;
84 @@ -5293,7 +5293,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
85         }
86  
87         down_write(&EXT4_I(inode)->i_data_sem);
88 -       ext4_discard_preallocations(inode);
89 +       ext4_discard_preallocations(inode, 0);
90  
91         ret = ext4_es_remove_extent(inode, punch_start,
92                                     EXT_MAX_BLOCKS - punch_start);
93 @@ -5307,7 +5307,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
94                 up_write(&EXT4_I(inode)->i_data_sem);
95                 goto out_stop;
96         }
97 -       ext4_discard_preallocations(inode);
98 +       ext4_discard_preallocations(inode, 0);
99  
100         ret = ext4_ext_shift_extents(inode, handle, punch_stop,
101                                      punch_stop - punch_start, SHIFT_LEFT);
102 @@ -5439,7 +5439,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
103                 goto out_stop;
104  
105         down_write(&EXT4_I(inode)->i_data_sem);
106 -       ext4_discard_preallocations(inode);
107 +       ext4_discard_preallocations(inode, 0);
108  
109         path = ext4_find_extent(inode, offset_lblk, NULL, 0);
110         if (IS_ERR(path)) {
111 diff --git a/fs/ext4/file.c b/fs/ext4/file.c
112 index 7a2720517bbb..e608ce3fb535 100644
113 --- a/fs/ext4/file.c
114 +++ b/fs/ext4/file.c
115 @@ -147,7 +147,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
116                         (atomic_read(&inode->i_writecount) == 1) &&
117                         !EXT4_I(inode)->i_reserved_data_blocks) {
118                 down_write(&EXT4_I(inode)->i_data_sem);
119 -               ext4_discard_preallocations(inode);
120 +               ext4_discard_preallocations(inode, 0);
121                 up_write(&EXT4_I(inode)->i_data_sem);
122         }
123         if (is_dx(inode) && filp->private_data)
124 diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
125 index 433ca8415c5a..80c9f33800be 100644
126 --- a/fs/ext4/indirect.c
127 +++ b/fs/ext4/indirect.c
128 @@ -696,7 +696,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
129          * i_mutex. So we can safely drop the i_data_sem here.
130          */
131         BUG_ON(EXT4_JOURNAL(inode) == NULL);
132 -       ext4_discard_preallocations(inode);
133 +       ext4_discard_preallocations(inode, 0);
134         up_write(&EXT4_I(inode)->i_data_sem);
135         *dropped = 1;
136         return 0;
137 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
138 index 0b07576af3bf..77543f988258 100644
139 --- a/fs/ext4/inode.c
140 +++ b/fs/ext4/inode.c
141 @@ -383,7 +383,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
142          */
143         if ((ei->i_reserved_data_blocks == 0) &&
144             !inode_is_open_for_write(inode))
145 -               ext4_discard_preallocations(inode);
146 +               ext4_discard_preallocations(inode, 0);
147  }
148  
149  static int __check_block_validity(struct inode *inode, const char *func,
150 @@ -4055,7 +4055,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
151         if (stop_block > first_block) {
152  
153                 down_write(&EXT4_I(inode)->i_data_sem);
154 -               ext4_discard_preallocations(inode);
155 +               ext4_discard_preallocations(inode, 0);
156  
157                 ret = ext4_es_remove_extent(inode, first_block,
158                                             stop_block - first_block);
159 @@ -4210,7 +4210,7 @@ int ext4_truncate(struct inode *inode)
160  
161         down_write(&EXT4_I(inode)->i_data_sem);
162  
163 -       ext4_discard_preallocations(inode);
164 +       ext4_discard_preallocations(inode, 0);
165  
166         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
167                 err = ext4_ext_truncate(handle, inode);
168 diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
169 index 6e70a63dcca7..36eca3bc036a 100644
170 --- a/fs/ext4/ioctl.c
171 +++ b/fs/ext4/ioctl.c
172 @@ -202,7 +202,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
173         reset_inode_seed(inode);
174         reset_inode_seed(inode_bl);
175  
176 -       ext4_discard_preallocations(inode);
177 +       ext4_discard_preallocations(inode, 0);
178  
179         err = ext4_mark_inode_dirty(handle, inode);
180         if (err < 0) {
181 diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
182 index 45ac6088b4ac..132c118d12e1 100644
183 --- a/fs/ext4/mballoc.c
184 +++ b/fs/ext4/mballoc.c
185 @@ -2878,6 +2878,7 @@ int ext4_mb_init(struct super_block *sb)
186         if (!sbi->s_mb_c3_blocks)
187                 sbi->s_mb_c3_blocks =
188                         THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C3_THRESHOLD);
189 +       sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
190         /*
191          * The default group preallocation is 512, which for 4k block
192          * sizes translates to 2 megabytes.  However for bigalloc file
193 @@ -3816,6 +3817,26 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
194         mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
195  }
196  
197 +static void ext4_mb_mark_pa_deleted(struct super_block *sb,
198 +                                   struct ext4_prealloc_space *pa)
199 +{
200 +       struct ext4_inode_info *ei;
201 +
202 +       if (pa->pa_deleted) {
203 +               ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
204 +                            pa->pa_type, pa->pa_pstart, pa->pa_lstart,
205 +                            pa->pa_len);
206 +               return;
207 +       }
208 +
209 +       pa->pa_deleted = 1;
210 +
211 +       if (pa->pa_type == MB_INODE_PA) {
212 +               ei = EXT4_I(pa->pa_inode);
213 +               atomic_dec(&ei->i_prealloc_active);
214 +       }
215 +}
216 +
217  static void ext4_mb_pa_callback(struct rcu_head *head)
218  {
219         struct ext4_prealloc_space *pa;
220 @@ -3848,7 +3869,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
221                 return;
222         }
223  
224 -       pa->pa_deleted = 1;
225 +       ext4_mb_mark_pa_deleted(sb, pa);
226         spin_unlock(&pa->pa_lock);
227  
228         grp_blk = pa->pa_pstart;
229 @@ -3972,6 +3993,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
230         spin_lock(pa->pa_obj_lock);
231         list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
232         spin_unlock(pa->pa_obj_lock);
233 +       atomic_inc(&ei->i_prealloc_active);
234  }
235  
236  /*
237 @@ -4182,7 +4204,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
238                 }
239  
240                 /* seems this one can be freed ... */
241 -               pa->pa_deleted = 1;
242 +               ext4_mb_mark_pa_deleted(sb, pa);
243  
244                 /* we can trust pa_free ... */
245                 free += pa->pa_free;
246 @@ -4245,7 +4267,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
247   *
248   * FIXME!! Make sure it is valid at all the call sites
249   */
250 -void ext4_discard_preallocations(struct inode *inode)
251 +void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
252  {
253         struct ext4_inode_info *ei = EXT4_I(inode);
254         struct super_block *sb = inode->i_sb;
255 @@ -4263,15 +4285,19 @@ void ext4_discard_preallocations(struct inode *inode)
256  
257         mb_debug(sb, "discard preallocation for inode %lu\n",
258                  inode->i_ino);
259 -       trace_ext4_discard_preallocations(inode);
260 +       trace_ext4_discard_preallocations(inode,
261 +                       atomic_read(&ei->i_prealloc_active), needed);
262  
263         INIT_LIST_HEAD(&list);
264  
265 +       if (needed == 0)
266 +               needed = UINT_MAX;
267 +
268  repeat:
269         /* first, collect all pa's in the inode */
270         spin_lock(&ei->i_prealloc_lock);
271 -       while (!list_empty(&ei->i_prealloc_list)) {
272 -               pa = list_entry(ei->i_prealloc_list.next,
273 +       while (!list_empty(&ei->i_prealloc_list) && needed) {
274 +               pa = list_entry(ei->i_prealloc_list.prev,
275                                 struct ext4_prealloc_space, pa_inode_list);
276                 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
277                 spin_lock(&pa->pa_lock);
278 @@ -4288,10 +4314,11 @@ void ext4_discard_preallocations(struct inode *inode)
279  
280                 }
281                 if (pa->pa_deleted == 0) {
282 -                       pa->pa_deleted = 1;
283 +                       ext4_mb_mark_pa_deleted(sb, pa);
284                         spin_unlock(&pa->pa_lock);
285                         list_del_rcu(&pa->pa_inode_list);
286                         list_add(&pa->u.pa_tmp_list, &list);
287 +                       needed--;
288                         continue;
289                 }
290  
291 @@ -4592,7 +4619,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
292                 BUG_ON(pa->pa_type != MB_GROUP_PA);
293  
294                 /* seems this one can be freed ... */
295 -               pa->pa_deleted = 1;
296 +               ext4_mb_mark_pa_deleted(sb, pa);
297                 spin_unlock(&pa->pa_lock);
298  
299                 list_del_rcu(&pa->pa_inode_list);
300 @@ -4690,11 +4717,30 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
301         return ;
302  }
303  
304 +/*
305 + * once the per-inode PA list exceeds the limit by over ~25%, trim it back
306 + */
307 +static void ext4_mb_trim_inode_pa(struct inode *inode)
308 +{
309 +       struct ext4_inode_info *ei = EXT4_I(inode);
310 +       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
311 +       int count, delta;
312 +
313 +       count = atomic_read(&ei->i_prealloc_active);
314 +       delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
315 +       if (count > sbi->s_mb_max_inode_prealloc + delta) {
316 +               count -= sbi->s_mb_max_inode_prealloc;
317 +               ext4_discard_preallocations(inode, count);
318 +       }
319 +}
320 +
321  /*
322   * release all resource we used in allocation
323   */
324  static int ext4_mb_release_context(struct ext4_allocation_context *ac)
325  {
326 +       struct inode *inode = ac->ac_inode;
327 +       struct ext4_inode_info *ei = EXT4_I(inode);
328         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
329         struct ext4_prealloc_space *pa = ac->ac_pa;
330         if (pa) {
331 @@ -4720,6 +4766,17 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
332                                 ext4_mb_add_n_trim(ac);
333                         }
334                 }
335 +
336 +               if (pa->pa_type == MB_INODE_PA) {
337 +                       /*
338 +                        * treat per-inode prealloc list as a lru list, then try
339 +                        * to trim the least recently used PA.
340 +                        */
341 +                       spin_lock(pa->pa_obj_lock);
342 +                       list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
343 +                       spin_unlock(pa->pa_obj_lock);
344 +               }
345 +
346                 ext4_mb_put_pa(ac, ac->ac_sb, pa);
347         }
348         if (ac->ac_bitmap_page)
349 @@ -4729,6 +4786,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
350         if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
351                 mutex_unlock(&ac->ac_lg->lg_mutex);
352         ext4_mb_collect_stats(ac);
353 +       ext4_mb_trim_inode_pa(inode);
354         return 0;
355  }
356  
357 diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
358 index 6b4d17c2935d..e75b4749aa1c 100644
359 --- a/fs/ext4/mballoc.h
360 +++ b/fs/ext4/mballoc.h
361 @@ -73,6 +73,10 @@
362   */
363  #define MB_DEFAULT_GROUP_PREALLOC      512
364  
365 +/*
366 + * maximum length of inode prealloc list
367 + */
368 +#define MB_DEFAULT_MAX_INODE_PREALLOC  512
369  
370  struct ext4_free_data {
371         /* this links the free block information from sb_info */
372 diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
373 index 1ed86fb6c302..0d601b822875 100644
374 --- a/fs/ext4/move_extent.c
375 +++ b/fs/ext4/move_extent.c
376 @@ -686,8 +686,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
377  
378  out:
379         if (*moved_len) {
380 -               ext4_discard_preallocations(orig_inode);
381 -               ext4_discard_preallocations(donor_inode);
382 +               ext4_discard_preallocations(orig_inode, 0);
383 +               ext4_discard_preallocations(donor_inode, 0);
384         }
385  
386         ext4_ext_drop_refs(path);
387 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
388 index daa94c7f7271..13bdddc081e0 100644
389 --- a/fs/ext4/super.c
390 +++ b/fs/ext4/super.c
391 @@ -1127,6 +1127,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
392         spin_lock_init(&ei->i_raw_lock);
393         sema_init(&ei->i_append_sem, 1);
394         INIT_LIST_HEAD(&ei->i_prealloc_list);
395 +       atomic_set(&ei->i_prealloc_active, 0);
396         spin_lock_init(&ei->i_prealloc_lock);
397         ldiskfs_es_init_tree(&ei->i_es_tree);
398         rwlock_init(&ei->i_es_lock);
399 @@ -1220,7 +1221,7 @@ void ext4_clear_inode(struct inode *inode)
400         invalidate_inode_buffers(inode);
401         clear_inode(inode);
402         dquot_drop(inode);
403 -       ext4_discard_preallocations(inode);
404 +       ext4_discard_preallocations(inode, 0);
405         ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
406         if (EXT4_I(inode)->jinode) {
407                 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
408 diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
409 index 7fee11cc30e7..bfabb799fa45 100644
410 --- a/fs/ext4/sysfs.c
411 +++ b/fs/ext4/sysfs.c
412 @@ -218,6 +218,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
413  EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
414  EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
415  EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
416 +EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc);
417  EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
418  EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
419  EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
420 @@ -264,6 +265,7 @@ static struct attribute *ext4_attrs[] = {
421         ATTR_LIST(mb_order2_req),
422         ATTR_LIST(mb_stream_req),
423         ATTR_LIST(mb_group_prealloc),
424 +       ATTR_LIST(mb_max_inode_prealloc),
425         ATTR_LIST(max_writeback_mb_bump),
426         ATTR_LIST(extent_max_zeroout_kb),
427         ATTR_LIST(trigger_fs_error),
428 diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
429 index 8008d2e116b9..4c8b99ec8606 100644
430 --- a/include/trace/events/ext4.h
431 +++ b/include/trace/events/ext4.h
432 @@ -746,24 +746,29 @@ TRACE_EVENT(ext4_mb_release_group_pa,
433  );
434  
435  TRACE_EVENT(ext4_discard_preallocations,
436 -       TP_PROTO(struct inode *inode),
437 +       TP_PROTO(struct inode *inode, unsigned int len, unsigned int needed),
438  
439 -       TP_ARGS(inode),
440 +       TP_ARGS(inode, len, needed),
441  
442         TP_STRUCT__entry(
443 -               __field(        dev_t,  dev                     )
444 -               __field(        ino_t,  ino                     )
445 +               __field(        dev_t,          dev             )
446 +               __field(        ino_t,          ino             )
447 +               __field(        unsigned int,   len             )
448 +               __field(        unsigned int,   needed          )
449  
450         ),
451  
452         TP_fast_assign(
453                 __entry->dev    = inode->i_sb->s_dev;
454                 __entry->ino    = inode->i_ino;
455 +               __entry->len    = len;
456 +               __entry->needed = needed;
457         ),
458  
459 -       TP_printk("dev %d,%d ino %lu",
460 +       TP_printk("dev %d,%d ino %lu len: %u needed %u",
461                   MAJOR(__entry->dev), MINOR(__entry->dev),
462 -                 (unsigned long) __entry->ino)
463 +                 (unsigned long) __entry->ino, __entry->len,
464 +                 __entry->needed)
465  );
466  
467  TRACE_EVENT(ext4_mb_discard_preallocations,