1 From f33e0fa5ab6cad962d3b88376f4611b9aba1d030 Mon Sep 17 00:00:00 2001
2 From: Wang Shilong <wshilong@ddn.com>
3 Date: Thu, 27 Feb 2020 17:08:04 +0800
4 Subject: [PATCH 1/7] ext4: track extent status tree shrinker delay statistics
6 This commit adds some statistics in extent status tree shrinker. The
7 purpose of adding these is that we want to collect more details when we
8 encounter a stall caused by extent status tree shrinker. Here we count
9 the following statistics:
11 the number of all objects on all extent status trees
12 the number of reclaimable objects on lru list
14 the last sorted interval
15 the number of inodes on lru list
17 scan time for shrinking some objects
18 the number of shrunk objects
20 the inode that has max nr. of objects on lru list
21 the maximum scan time for shrinking some objects
23 The output looks like below:
24 $ cat /proc/fs/ext4/sda1/es_shrinker_info
27 6341 reclaimable objects
28 5281/631 cache hits/misses
29 586 ms last sorted interval
30 250 inodes on lru list
35 255 inode (255 objects, 198 reclaimable)
36 125723 us max scan time
38 If the lru list has never been sorted, the following line will not be printed:
40 586ms last sorted interval
41 If there is an empty lru list, the following lines also will not be printed:
43 250 inodes on lru list
46 255 inode (255 objects, 198 reclaimable)
49 Meanwhile in this commit a new trace point is defined to print some
50 details in __ext4_es_shrink().
52 [Shilong removed trace point parts of this patch]
54 Cc: Andreas Dilger <adilger.kernel@dilger.ca>
55 Cc: Jan Kara <jack@suse.cz>
56 Reviewed-by: Jan Kara <jack@suse.cz>
57 Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
58 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
61 fs/ext4/extents_status.c | 179 +++++++++++++++++++++++++++++++++++++--
62 fs/ext4/extents_status.h | 13 ++-
63 fs/ext4/super.c | 13 +--
64 4 files changed, 187 insertions(+), 22 deletions(-)
66 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
67 index 763276e2..cc5ba587 100644
70 @@ -1018,6 +1018,7 @@ struct ext4_inode_info {
71 struct ext4_es_tree i_es_tree;
73 struct list_head i_es_lru;
74 + unsigned int i_es_all_nr; /* protected by i_es_lock */
75 unsigned int i_es_lru_nr; /* protected by i_es_lock */
76 unsigned long i_touch_when; /* jiffies of last accessing */
78 @@ -1482,8 +1483,7 @@ struct ext4_sb_info {
79 /* Reclaim extents from extent status tree */
80 struct shrinker s_es_shrinker;
81 struct list_head s_es_lru;
82 - unsigned long s_es_last_sorted;
83 - struct percpu_counter s_extent_cache_cnt;
84 + struct ext4_es_stats s_es_stats;
85 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
87 /* Ratelimit ext4 messages. */
88 diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
89 index 3ef7f932..7dfed27b 100644
90 --- a/fs/ext4/extents_status.c
91 +++ b/fs/ext4/extents_status.c
94 #include <linux/rbtree.h>
95 #include <linux/list_sort.h>
96 +#include <linux/proc_fs.h>
97 +#include <linux/seq_file.h>
99 #include "extents_status.h"
101 @@ -313,19 +315,27 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
103 if (!ext4_es_is_delayed(es)) {
104 EXT4_I(inode)->i_es_lru_nr++;
105 - percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
106 + percpu_counter_inc(&EXT4_SB(inode->i_sb)->
107 + s_es_stats.es_stats_lru_cnt);
110 + EXT4_I(inode)->i_es_all_nr++;
111 + percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
116 static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
118 + EXT4_I(inode)->i_es_all_nr--;
119 + percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
121 /* Decrease the lru counter when this es is not delayed */
122 if (!ext4_es_is_delayed(es)) {
123 BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
124 EXT4_I(inode)->i_es_lru_nr--;
125 - percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
126 + percpu_counter_dec(&EXT4_SB(inode->i_sb)->
127 + s_es_stats.es_stats_lru_cnt);
130 kmem_cache_free(ext4_es_cachep, es);
131 @@ -739,6 +749,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
132 struct extent_status *es)
134 struct ext4_es_tree *tree;
135 + struct ext4_es_stats *stats;
136 struct extent_status *es1 = NULL;
137 struct rb_node *node;
139 @@ -775,11 +786,15 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
143 + stats = &EXT4_SB(inode->i_sb)->s_es_stats;
146 es->es_lblk = es1->es_lblk;
147 es->es_len = es1->es_len;
148 es->es_pblk = es1->es_pblk;
149 + stats->es_stats_cache_hits++;
151 + stats->es_stats_cache_misses++;
154 read_unlock(&EXT4_I(inode)->i_es_lock);
155 @@ -941,11 +956,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
156 struct ext4_inode_info *locked_ei)
158 struct ext4_inode_info *ei;
159 + struct ext4_es_stats *es_stats;
160 struct list_head *cur, *tmp;
162 + ktime_t start_time;
164 int ret, nr_shrunk = 0;
165 int retried = 0, skip_precached = 1, nr_skipped = 0;
167 + es_stats = &sbi->s_es_stats;
168 + start_time = ktime_get();
169 spin_lock(&sbi->s_es_lru_lock);
172 @@ -954,7 +974,8 @@ retry:
173 * If we have already reclaimed all extents from extent
174 * status tree, just stop the loop immediately.
176 - if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
177 + if (percpu_counter_read_positive(
178 + &es_stats->es_stats_lru_cnt) == 0)
181 ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
182 @@ -964,7 +985,7 @@ retry:
183 * time. Normally we try hard to avoid shrinking
184 * precached inodes, but we will as a last resort.
186 - if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
187 + if ((es_stats->es_stats_last_sorted < ei->i_touch_when) ||
188 (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
189 EXT4_STATE_EXT_PRECACHED))) {
191 @@ -998,7 +1019,7 @@ retry:
192 if ((nr_shrunk == 0) && nr_skipped && !retried) {
194 list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
195 - sbi->s_es_last_sorted = jiffies;
196 + es_stats->es_stats_last_sorted = jiffies;
197 ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
200 @@ -1016,6 +1037,20 @@ retry:
201 if (locked_ei && nr_shrunk == 0)
202 nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
204 + scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
205 + if (likely(es_stats->es_stats_scan_time))
206 + es_stats->es_stats_scan_time = (scan_time +
207 + es_stats->es_stats_scan_time*3) / 4;
209 + es_stats->es_stats_scan_time = scan_time;
210 + if (scan_time > es_stats->es_stats_max_scan_time)
211 + es_stats->es_stats_max_scan_time = scan_time;
212 + if (likely(es_stats->es_stats_shrunk))
213 + es_stats->es_stats_shrunk = (nr_shrunk +
214 + es_stats->es_stats_shrunk*3) / 4;
216 + es_stats->es_stats_shrunk = nr_shrunk;
221 @@ -1026,7 +1061,7 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
222 int nr_to_scan = sc->nr_to_scan;
225 - ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
226 + ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
227 trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
230 @@ -1034,23 +1069,150 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
232 nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
234 - ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
235 + ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
236 trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
240 -void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
241 +static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
243 + return *pos ? NULL : SEQ_START_TOKEN;
247 +ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
253 +static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
255 + struct ext4_sb_info *sbi = seq->private;
256 + struct ext4_es_stats *es_stats = &sbi->s_es_stats;
257 + struct ext4_inode_info *ei, *max = NULL;
258 + unsigned int inode_cnt = 0;
260 + if (v != SEQ_START_TOKEN)
263 + /* here we just find an inode that has the max nr. of objects */
264 + spin_lock(&sbi->s_es_lru_lock);
265 + list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) {
267 + if (max && max->i_es_all_nr < ei->i_es_all_nr)
272 + spin_unlock(&sbi->s_es_lru_lock);
274 + seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n",
275 + percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
276 + percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt));
277 + seq_printf(seq, " %lu/%lu cache hits/misses\n",
278 + es_stats->es_stats_cache_hits,
279 + es_stats->es_stats_cache_misses);
280 + if (es_stats->es_stats_last_sorted != 0)
281 + seq_printf(seq, " %u ms last sorted interval\n",
282 + jiffies_to_msecs(jiffies -
283 + es_stats->es_stats_last_sorted));
285 + seq_printf(seq, " %d inodes on lru list\n", inode_cnt);
287 + seq_printf(seq, "average:\n %llu us scan time\n",
288 + div_u64(es_stats->es_stats_scan_time, 1000));
289 + seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk);
292 + "maximum:\n %lu inode (%u objects, %u reclaimable)\n"
293 + " %llu us max scan time\n",
294 + max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr,
295 + div_u64(es_stats->es_stats_max_scan_time, 1000));
300 +static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
304 +static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
305 + .start = ext4_es_seq_shrinker_info_start,
306 + .next = ext4_es_seq_shrinker_info_next,
307 + .stop = ext4_es_seq_shrinker_info_stop,
308 + .show = ext4_es_seq_shrinker_info_show,
312 +ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
316 + ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
318 + struct seq_file *m = file->private_data;
319 + m->private = PDE_DATA(inode);
326 +ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
328 + return seq_release(inode, file);
331 +static const struct file_operations ext4_es_seq_shrinker_info_fops = {
332 + .owner = THIS_MODULE,
333 + .open = ext4_es_seq_shrinker_info_open,
335 + .llseek = seq_lseek,
336 + .release = ext4_es_seq_shrinker_info_release,
339 +int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
343 INIT_LIST_HEAD(&sbi->s_es_lru);
344 spin_lock_init(&sbi->s_es_lru_lock);
345 - sbi->s_es_last_sorted = 0;
346 + sbi->s_es_stats.es_stats_last_sorted = 0;
347 + sbi->s_es_stats.es_stats_shrunk = 0;
348 + sbi->s_es_stats.es_stats_cache_hits = 0;
349 + sbi->s_es_stats.es_stats_cache_misses = 0;
350 + sbi->s_es_stats.es_stats_scan_time = 0;
351 + sbi->s_es_stats.es_stats_max_scan_time = 0;
352 + err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt,
356 + err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt,
360 sbi->s_es_shrinker.shrink = ext4_es_shrink;
361 sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
362 register_shrinker(&sbi->s_es_shrinker);
365 + proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
366 + &ext4_es_seq_shrinker_info_fops, sbi);
371 + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
375 void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
378 + remove_proc_entry("es_shrinker_info", sbi->s_proc);
379 + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
380 + percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
381 unregister_shrinker(&sbi->s_es_shrinker);
384 diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
385 index f1b62a41..efd5f970 100644
386 --- a/fs/ext4/extents_status.h
387 +++ b/fs/ext4/extents_status.h
388 @@ -64,6 +64,17 @@ struct ext4_es_tree {
389 struct extent_status *cache_es; /* recently accessed extent */
392 +struct ext4_es_stats {
393 + unsigned long es_stats_last_sorted;
394 + unsigned long es_stats_shrunk;
395 + unsigned long es_stats_cache_hits;
396 + unsigned long es_stats_cache_misses;
397 + u64 es_stats_scan_time;
398 + u64 es_stats_max_scan_time;
399 + struct percpu_counter es_stats_all_cnt;
400 + struct percpu_counter es_stats_lru_cnt;
403 extern int __init ext4_init_es(void);
404 extern void ext4_exit_es(void);
405 extern void ext4_es_init_tree(struct ext4_es_tree *tree);
406 @@ -138,7 +149,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,
410 -extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
411 +extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
412 extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
413 extern void ext4_es_lru_add(struct inode *inode);
414 extern void ext4_es_lru_del(struct inode *inode);
415 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
416 index 18fe358c..bcdb48cf 100644
417 --- a/fs/ext4/super.c
418 +++ b/fs/ext4/super.c
419 @@ -880,7 +880,6 @@ static void ext4_put_super(struct super_block *sb)
420 percpu_counter_destroy(&sbi->s_freeinodes_counter);
421 percpu_counter_destroy(&sbi->s_dirs_counter);
422 percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
423 - percpu_counter_destroy(&sbi->s_extent_cache_cnt);
425 for (i = 0; i < EXT4_MAXQUOTAS; i++)
426 kfree(sbi->s_qf_names[i]);
427 @@ -944,6 +943,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
428 ext4_es_init_tree(&ei->i_es_tree);
429 rwlock_init(&ei->i_es_lock);
430 INIT_LIST_HEAD(&ei->i_es_lru);
431 + ei->i_es_all_nr = 0;
433 ei->i_touch_when = 0;
434 ei->i_reserved_data_blocks = 0;
435 @@ -4289,14 +4289,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
436 sbi->s_err_report.function = print_daily_error_info;
437 sbi->s_err_report.data = (unsigned long) sb;
439 - /* Register extent status tree shrinker */
440 - ext4_es_register_shrinker(sbi);
442 - err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL);
444 - ext4_msg(sb, KERN_ERR, "insufficient memory");
445 + if (ext4_es_register_shrinker(sbi))
449 sbi->s_stripe = ext4_get_stripe_size(sbi);
450 sbi->s_extent_max_zeroout_kb = 32;
451 @@ -4641,10 +4635,9 @@ failed_mount_wq:
452 jbd2_journal_destroy(sbi->s_journal);
453 sbi->s_journal = NULL;
456 ext4_es_unregister_shrinker(sbi);
458 del_timer_sync(&sbi->s_err_report);
459 - percpu_counter_destroy(&sbi->s_extent_cache_cnt);
461 kthread_stop(sbi->s_mmp_tsk);