X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=ldiskfs%2Fkernel_patches%2Fpatches%2Frhel7.6%2Fext4-track-extent-status-tree-shrinker-delay-statict.patch;fp=ldiskfs%2Fkernel_patches%2Fpatches%2Frhel7.6%2Fext4-track-extent-status-tree-shrinker-delay-statict.patch;h=9cb4d182e74cba000d1abc8b81404e9db2a34f1d;hb=cc058b253ac7987a8350880e1c34a39cdf0625e8;hp=0000000000000000000000000000000000000000;hpb=c7c83582aeee6f8bc54915cce6d64d3307bf1ff6;p=fs%2Flustre-release.git

diff --git a/ldiskfs/kernel_patches/patches/rhel7.6/ext4-track-extent-status-tree-shrinker-delay-statict.patch b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-track-extent-status-tree-shrinker-delay-statict.patch
new file mode 100644
index 0000000..9cb4d18
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-track-extent-status-tree-shrinker-delay-statict.patch
@@ -0,0 +1,464 @@
+From f33e0fa5ab6cad962d3b88376f4611b9aba1d030 Mon Sep 17 00:00:00 2001
+From: Wang Shilong
+Date: Thu, 27 Feb 2020 17:08:04 +0800
+Subject: [PATCH 1/7] ext4: track extent status tree shrinker delay statistics
+
+This commit adds some statistics to the extent status tree shrinker.
+The purpose of adding these is to collect more details when we
+encounter a stall caused by the extent status tree shrinker. Here we
+count the following statistics:
+  stats:
+    the number of all objects on all extent status trees
+    the number of reclaimable objects on the lru list
+    cache hits/misses
+    the last sorted interval
+    the number of inodes on the lru list
+  average:
+    scan time for shrinking some objects
+    the number of shrunk objects
+  maximum:
+    the inode that has the max nr. of objects on the lru list
+    the maximum scan time for shrinking some objects
+
+The output looks like the following:
+  $ cat /proc/fs/ext4/sda1/es_shrinker_info
+  stats:
+    28228 objects
+    6341 reclaimable objects
+    5281/631 cache hits/misses
+    586 ms last sorted interval
+    250 inodes on lru list
+  average:
+    153 us scan time
+    128 shrunk objects
+  maximum:
+    255 inode (255 objects, 198 reclaimable)
+    125723 us max scan time
+
+If the lru list has never been sorted, the following line will not be
+printed:
+  586 ms last sorted interval
+If the lru list is empty, the following lines also will not be
+printed:
+  250 inodes on lru list
+  ...
+  maximum:
+    255 inode (255 objects, 198 reclaimable)
+    0 us max scan time
+
+Meanwhile, this commit defines a new trace point to print some
+details in __ext4_es_shrink().
+
+[Shilong: removed trace point parts of this patch]
+
+Cc: Andreas Dilger
+Cc: Jan Kara
+Reviewed-by: Jan Kara
+Signed-off-by: Zheng Liu
+Signed-off-by: Theodore Ts'o
+---
+ fs/ext4/ext4.h           |   4 +-
+ fs/ext4/extents_status.c | 179 +++++++++++++++++++++++++++++++++++++--
+ fs/ext4/extents_status.h |  13 ++-
+ fs/ext4/super.c          |  13 +--
+ 4 files changed, 187 insertions(+), 22 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 763276e2..cc5ba587 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1018,6 +1018,7 @@ struct ext4_inode_info {
+ 	struct ext4_es_tree i_es_tree;
+ 	rwlock_t i_es_lock;
+ 	struct list_head i_es_lru;
++	unsigned int i_es_all_nr;	/* protected by i_es_lock */
+ 	unsigned int i_es_lru_nr;	/* protected by i_es_lock */
+ 	unsigned long i_touch_when;	/* jiffies of last accessing */
+
+@@ -1482,8 +1483,7 @@ struct ext4_sb_info {
+ 	/* Reclaim extents from extent status tree */
+ 	struct shrinker s_es_shrinker;
+ 	struct list_head s_es_lru;
+-	unsigned long s_es_last_sorted;
+-	struct percpu_counter s_extent_cache_cnt;
++	struct ext4_es_stats s_es_stats;
+ 	spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
+
+ 	/* Ratelimit ext4 messages. */
+diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
+index 3ef7f932..7dfed27b 100644
+--- a/fs/ext4/extents_status.c
++++ b/fs/ext4/extents_status.c
+@@ -11,6 +11,8 @@
+  */
+ #include <linux/rbtree.h>
+ #include <linux/list_sort.h>
++#include <linux/proc_fs.h>
++#include <linux/seq_file.h>
+ #include "ext4.h"
+ #include "extents_status.h"
+
+@@ -313,19 +315,27 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
+ 	 */
+ 	if (!ext4_es_is_delayed(es)) {
+ 		EXT4_I(inode)->i_es_lru_nr++;
+-		percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
++		percpu_counter_inc(&EXT4_SB(inode->i_sb)->
++				   s_es_stats.es_stats_lru_cnt);
+ 	}
+
++	EXT4_I(inode)->i_es_all_nr++;
++	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
++
+ 	return es;
+ }
+
+ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
+ {
++	EXT4_I(inode)->i_es_all_nr--;
++	percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
++
+ 	/* Decrease the lru counter when this es is not delayed */
+ 	if (!ext4_es_is_delayed(es)) {
+ 		BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
+ 		EXT4_I(inode)->i_es_lru_nr--;
+-		percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
++		percpu_counter_dec(&EXT4_SB(inode->i_sb)->
++				   s_es_stats.es_stats_lru_cnt);
+ 	}
+
+ 	kmem_cache_free(ext4_es_cachep, es);
+@@ -739,6 +749,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+ 			  struct extent_status *es)
+ {
+ 	struct ext4_es_tree *tree;
++	struct ext4_es_stats *stats;
+ 	struct extent_status *es1 = NULL;
+ 	struct rb_node *node;
+ 	int found = 0;
+@@ -775,11 +786,15 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+ 	}
+
+ out:
++	stats = &EXT4_SB(inode->i_sb)->s_es_stats;
+ 	if (found) {
+ 		BUG_ON(!es1);
+ 		es->es_lblk = es1->es_lblk;
+ 		es->es_len = es1->es_len;
+ 		es->es_pblk = es1->es_pblk;
++		stats->es_stats_cache_hits++;
++	} else {
++		stats->es_stats_cache_misses++;
+ 	}
+
+ 	read_unlock(&EXT4_I(inode)->i_es_lock);
+@@ -941,11 +956,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+ 			    struct ext4_inode_info *locked_ei)
+ {
+ 	struct ext4_inode_info *ei;
++	struct ext4_es_stats *es_stats;
+ 	struct list_head *cur, *tmp;
+ 	LIST_HEAD(skipped);
++	ktime_t start_time;
++	u64 scan_time;
+ 	int ret, nr_shrunk = 0;
+ 	int retried = 0, skip_precached = 1, nr_skipped = 0;
+
++	es_stats = &sbi->s_es_stats;
++	start_time = ktime_get();
+ 	spin_lock(&sbi->s_es_lru_lock);
+
+retry:
+@@ -954,7 +974,8 @@
+ 		 * If we have already reclaimed all extents from extent
+ 		 * status tree, just stop the loop immediately.
+ 		 */
+-		if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
++		if (percpu_counter_read_positive(
++				&es_stats->es_stats_lru_cnt) == 0)
+ 			break;
+
+ 		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
+@@ -964,7 +985,7 @@
+ 		 * time. Normally we try hard to avoid shrinking
+ 		 * precached inodes, but we will as a last resort.
+ 		 */
+-		if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
++		if ((es_stats->es_stats_last_sorted < ei->i_touch_when) ||
+ 		    (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
+ 						EXT4_STATE_EXT_PRECACHED))) {
+ 			nr_skipped++;
+@@ -998,7 +1019,7 @@ retry:
+ 	if ((nr_shrunk == 0) && nr_skipped && !retried) {
+ 		retried++;
+ 		list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+-		sbi->s_es_last_sorted = jiffies;
++		es_stats->es_stats_last_sorted = jiffies;
+ 		ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
+ 				      i_es_lru);
+ 		/*
+@@ -1016,6 +1037,20 @@ retry:
+ 	if (locked_ei && nr_shrunk == 0)
+ 		nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
+
++	scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
++	if (likely(es_stats->es_stats_scan_time))
++		es_stats->es_stats_scan_time = (scan_time +
++				es_stats->es_stats_scan_time*3) / 4;
++	else
++		es_stats->es_stats_scan_time = scan_time;
++	if (scan_time > es_stats->es_stats_max_scan_time)
++		es_stats->es_stats_max_scan_time = scan_time;
++	if (likely(es_stats->es_stats_shrunk))
++		es_stats->es_stats_shrunk = (nr_shrunk +
++				es_stats->es_stats_shrunk*3) / 4;
++	else
++		es_stats->es_stats_shrunk = nr_shrunk;
++
+ 	return nr_shrunk;
+ }
+
+@@ -1026,7 +1061,7 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+ 	int nr_to_scan = sc->nr_to_scan;
+ 	int ret, nr_shrunk;
+
+-	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
++	ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
+ 	trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
+
+ 	if (!nr_to_scan)
+@@ -1034,23 +1069,149 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+
+ 	nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
+
+-	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
++	ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
+ 	trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
+ 	return ret;
+ }
+
+-void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
++static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
++{
++	return *pos ? NULL : SEQ_START_TOKEN;
++}
++
++static void *
++ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
++{
++	return NULL;
++}
++
++static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
++{
++	struct ext4_sb_info *sbi = seq->private;
++	struct ext4_es_stats *es_stats = &sbi->s_es_stats;
++	struct ext4_inode_info *ei, *max = NULL;
++	unsigned int inode_cnt = 0;
++
++	if (v != SEQ_START_TOKEN)
++		return 0;
++
++	/* here we just find an inode that has the max nr. of objects */
++	spin_lock(&sbi->s_es_lru_lock);
++	list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) {
++		inode_cnt++;
++		if (max && max->i_es_all_nr < ei->i_es_all_nr)
++			max = ei;
++		else if (!max)
++			max = ei;
++	}
++	spin_unlock(&sbi->s_es_lru_lock);
++
++	seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
++		   percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
++		   percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt));
++	seq_printf(seq, "  %lu/%lu cache hits/misses\n",
++		   es_stats->es_stats_cache_hits,
++		   es_stats->es_stats_cache_misses);
++	if (es_stats->es_stats_last_sorted != 0)
++		seq_printf(seq, "  %u ms last sorted interval\n",
++			   jiffies_to_msecs(jiffies -
++					    es_stats->es_stats_last_sorted));
++	if (inode_cnt)
++		seq_printf(seq, "  %d inodes on lru list\n", inode_cnt);
++
++	seq_printf(seq, "average:\n  %llu us scan time\n",
++		   div_u64(es_stats->es_stats_scan_time, 1000));
++	seq_printf(seq, "  %lu shrunk objects\n", es_stats->es_stats_shrunk);
++	if (inode_cnt)
++		seq_printf(seq,
++			   "maximum:\n  %lu inode (%u objects, %u reclaimable)\n"
++			   "  %llu us max scan time\n",
++			   max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr,
++			   div_u64(es_stats->es_stats_max_scan_time, 1000));
++
++	return 0;
++}
++
++static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
++{
++}
++
++static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
++	.start = ext4_es_seq_shrinker_info_start,
++	.next  = ext4_es_seq_shrinker_info_next,
++	.stop  = ext4_es_seq_shrinker_info_stop,
++	.show  = ext4_es_seq_shrinker_info_show,
++};
++
++static int
++ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
++{
++	int ret;
++
++	ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
++	if (!ret) {
++		struct seq_file *m = file->private_data;
++		m->private = PDE_DATA(inode);
++	}
++
++	return ret;
++}
++
++static int
++ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
+ {
++	return seq_release(inode, file);
++}
++
++static const struct file_operations ext4_es_seq_shrinker_info_fops = {
++	.owner		= THIS_MODULE,
++	.open		= ext4_es_seq_shrinker_info_open,
++	.read		= seq_read,
++	.llseek		= seq_lseek,
++	.release	= ext4_es_seq_shrinker_info_release,
++};
++
++int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
++{
++	int err;
++
+ 	INIT_LIST_HEAD(&sbi->s_es_lru);
+ 	spin_lock_init(&sbi->s_es_lru_lock);
+-	sbi->s_es_last_sorted = 0;
++	sbi->s_es_stats.es_stats_last_sorted = 0;
++	sbi->s_es_stats.es_stats_shrunk = 0;
++	sbi->s_es_stats.es_stats_cache_hits = 0;
++	sbi->s_es_stats.es_stats_cache_misses = 0;
++	sbi->s_es_stats.es_stats_scan_time = 0;
++	sbi->s_es_stats.es_stats_max_scan_time = 0;
++	err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt,
++				  0, GFP_KERNEL);
++	if (err)
++		return err;
++	err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt,
++				  0, GFP_KERNEL);
++	if (err)
++		goto err;
+ 	sbi->s_es_shrinker.shrink = ext4_es_shrink;
+ 	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
+ 	register_shrinker(&sbi->s_es_shrinker);
++
++	if (sbi->s_proc)
++		proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
++				 &ext4_es_seq_shrinker_info_fops, sbi);
++
++	return 0;
++
++err:
++	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
++	return err;
+ }
+
+ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
+ {
++	if (sbi->s_proc)
++		remove_proc_entry("es_shrinker_info", sbi->s_proc);
++	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
++	percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
+ 	unregister_shrinker(&sbi->s_es_shrinker);
+ }
+
+diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
+index f1b62a41..efd5f970 100644
+--- a/fs/ext4/extents_status.h
++++ b/fs/ext4/extents_status.h
+@@ -64,6 +64,17 @@ struct ext4_es_tree {
+ 	struct extent_status *cache_es;	/* recently accessed extent */
+ };
+
++struct ext4_es_stats {
++	unsigned long es_stats_last_sorted;
++	unsigned long es_stats_shrunk;
++	unsigned long es_stats_cache_hits;
++	unsigned long es_stats_cache_misses;
++	u64 es_stats_scan_time;
++	u64 es_stats_max_scan_time;
++	struct percpu_counter es_stats_all_cnt;
++	struct percpu_counter es_stats_lru_cnt;
++};
++
+ extern int __init ext4_init_es(void);
+ extern void ext4_exit_es(void);
+ extern void ext4_es_init_tree(struct ext4_es_tree *tree);
+@@ -138,7 +149,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,
+ 				       (pb & ~ES_MASK));
+ }
+
+-extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
++extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+ extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
+ extern void ext4_es_lru_add(struct inode *inode);
+ extern void ext4_es_lru_del(struct inode *inode);
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index 18fe358c..bcdb48cf 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -880,7 +880,6 @@ static void ext4_put_super(struct super_block *sb)
+ 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+ 	percpu_counter_destroy(&sbi->s_dirs_counter);
+ 	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+-	percpu_counter_destroy(&sbi->s_extent_cache_cnt);
+ #ifdef CONFIG_QUOTA
+ 	for (i = 0; i < EXT4_MAXQUOTAS; i++)
+ 		kfree(sbi->s_qf_names[i]);
+@@ -944,6 +943,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
+ 	ext4_es_init_tree(&ei->i_es_tree);
+ 	rwlock_init(&ei->i_es_lock);
+ 	INIT_LIST_HEAD(&ei->i_es_lru);
++	ei->i_es_all_nr = 0;
+ 	ei->i_es_lru_nr = 0;
+ 	ei->i_touch_when = 0;
+ 	ei->i_reserved_data_blocks = 0;
+@@ -4289,14 +4289,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+ 	sbi->s_err_report.function = print_daily_error_info;
+ 	sbi->s_err_report.data = (unsigned long) sb;
+
+-	/* Register extent status tree shrinker */
+-	ext4_es_register_shrinker(sbi);
+-
+-	err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL);
+-	if (err) {
+-		ext4_msg(sb, KERN_ERR, "insufficient memory");
++	if (ext4_es_register_shrinker(sbi))
+ 		goto failed_mount3;
+-	}
+
+ 	sbi->s_stripe = ext4_get_stripe_size(sbi);
+ 	sbi->s_extent_max_zeroout_kb = 32;
+@@ -4641,10 +4635,9 @@ failed_mount_wq:
+ 		jbd2_journal_destroy(sbi->s_journal);
+ 		sbi->s_journal = NULL;
+ 	}
+-failed_mount3:
+ 	ext4_es_unregister_shrinker(sbi);
++failed_mount3:
+ 	del_timer_sync(&sbi->s_err_report);
+-	percpu_counter_destroy(&sbi->s_extent_cache_cnt);
+ 	if (sbi->s_mmp_tsk)
+ 		kthread_stop(sbi->s_mmp_tsk);
+ failed_mount2:
+-- 
+2.24.1
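
A note on the averaging used in __ext4_es_shrink() above: both the scan
time and the shrunk-object count are smoothed with the same integer
weighted average, avg = (sample + 3*avg) / 4, where the first nonzero
sample seeds the average. Below is a minimal userspace C sketch of that
formula; the sample values are hypothetical and the helper name
ewma_update is invented for illustration, it is not part of the patch:

  #include <stdio.h>
  #include <stdint.h>

  /*
   * avg = (sample + 3*avg) / 4, the first sample seeds the average,
   * mirroring the es_stats_scan_time/es_stats_shrunk updates above.
   */
  static uint64_t ewma_update(uint64_t avg, uint64_t sample)
  {
          if (avg == 0)                 /* mirrors the likely(...) seed check */
                  return sample;
          return (sample + avg * 3) / 4;
  }

  int main(void)
  {
          /* hypothetical per-shrink scan times, in nanoseconds */
          uint64_t samples[] = { 150000, 90000, 400000, 120000 };
          uint64_t avg = 0, max = 0;
          unsigned int i;

          for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                  avg = ewma_update(avg, samples[i]);
                  if (samples[i] > max)
                          max = samples[i];
                  printf("sample=%8llu ns avg=%8llu ns max=%8llu ns\n",
                         (unsigned long long)samples[i],
                         (unsigned long long)avg,
                         (unsigned long long)max);
          }
          return 0;
  }

Each update decays the old average by 3/4, so the reported figures react
quickly to a new stall while requiring no stored sample history.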
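Once a patched kernel has an ext4 (ldiskfs) filesystem mounted, the report
is an ordinary procfs read: the cat command from the commit message is
enough, or programmatically as in this small sketch (the device name sda1
follows the commit message example and is an assumption; substitute your
own device):

  #include <stdio.h>

  int main(void)
  {
          /* path format per the commit message; adjust the device name */
          const char *path = "/proc/fs/ext4/sda1/es_shrinker_info";
          char buf[256];
          FILE *f = fopen(path, "r");

          if (!f) {
                  perror(path);
                  return 1;
          }
          while (fgets(buf, sizeof(buf), f))
                  fputs(buf, stdout);
          fclose(f);
          return 0;
  }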