Whamcloud - gitweb
LU-13300 ldiskfs: port patches to improve extent status shrink
[fs/lustre-release.git] / ldiskfs / kernel_patches / patches / rhel7.6 / ext4-track-extent-status-tree-shrinker-delay-statict.patch
diff --git a/ldiskfs/kernel_patches/patches/rhel7.6/ext4-track-extent-status-tree-shrinker-delay-statict.patch b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-track-extent-status-tree-shrinker-delay-statict.patch
new file mode 100644 (file)
index 0000000..9cb4d18
--- /dev/null
@@ -0,0 +1,464 @@
+From f33e0fa5ab6cad962d3b88376f4611b9aba1d030 Mon Sep 17 00:00:00 2001
+From: Wang Shilong <wshilong@ddn.com>
+Date: Thu, 27 Feb 2020 17:08:04 +0800
+Subject: [PATCH 1/7] ext4: track extent status tree shrinker delay statictics
+
+This commit adds some statictics in extent status tree shrinker.  The
+purpose to add these is that we want to collect more details when we
+encounter a stall caused by extent status tree shrinker.  Here we count
+the following statictics:
+  stats:
+    the number of all objects on all extent status trees
+    the number of reclaimable objects on lru list
+    cache hits/misses
+    the last sorted interval
+    the number of inodes on lru list
+  average:
+    scan time for shrinking some objects
+    the number of shrunk objects
+  maximum:
+    the inode that has max nr. of objects on lru list
+    the maximum scan time for shrinking some objects
+
+The output looks like below:
+  $ cat /proc/fs/ext4/sda1/es_shrinker_info
+  stats:
+    28228 objects
+    6341 reclaimable objects
+    5281/631 cache hits/misses
+    586 ms last sorted interval
+    250 inodes on lru list
+  average:
+    153 us scan time
+    128 shrunk objects
+  maximum:
+    255 inode (255 objects, 198 reclaimable)
+    125723 us max scan time
+
+If the lru list has never been sorted, the following line will not be
+printed:
+    586ms last sorted interval
+If there is an empty lru list, the following lines also will not be
+printed:
+    250 inodes on lru list
+  ...
+  maximum:
+    255 inode (255 objects, 198 reclaimable)
+    0 us max scan time
+
+Meanwhile in this commit a new trace point is defined to print some
+details in __ext4_es_shrink().
+
+[Shilong remove trace point parts of this patch]
+
+Cc: Andreas Dilger <adilger.kernel@dilger.ca>
+Cc: Jan Kara <jack@suse.cz>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/ext4.h           |   4 +-
+ fs/ext4/extents_status.c | 179 +++++++++++++++++++++++++++++++++++++--
+ fs/ext4/extents_status.h |  13 ++-
+ fs/ext4/super.c          |  13 +--
+ 4 files changed, 187 insertions(+), 22 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 763276e2..cc5ba587 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1018,6 +1018,7 @@ struct ext4_inode_info {
+       struct ext4_es_tree i_es_tree;
+       rwlock_t i_es_lock;
+       struct list_head i_es_lru;
++      unsigned int i_es_all_nr;       /* protected by i_es_lock */
+       unsigned int i_es_lru_nr;       /* protected by i_es_lock */
+       unsigned long i_touch_when;     /* jiffies of last accessing */
+@@ -1482,8 +1483,7 @@ struct ext4_sb_info {
+       /* Reclaim extents from extent status tree */
+       struct shrinker s_es_shrinker;
+       struct list_head s_es_lru;
+-      unsigned long s_es_last_sorted;
+-      struct percpu_counter s_extent_cache_cnt;
++      struct ext4_es_stats s_es_stats;
+       spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
+       /* Ratelimit ext4 messages. */
+diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
+index 3ef7f932..7dfed27b 100644
+--- a/fs/ext4/extents_status.c
++++ b/fs/ext4/extents_status.c
+@@ -11,6 +11,8 @@
+  */
+ #include <linux/rbtree.h>
+ #include <linux/list_sort.h>
++#include <linux/proc_fs.h>
++#include <linux/seq_file.h>
+ #include "ext4.h"
+ #include "extents_status.h"
+@@ -313,19 +315,27 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
+        */
+       if (!ext4_es_is_delayed(es)) {
+               EXT4_I(inode)->i_es_lru_nr++;
+-              percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
++              percpu_counter_inc(&EXT4_SB(inode->i_sb)->
++                                      s_es_stats.es_stats_lru_cnt);
+       }
++      EXT4_I(inode)->i_es_all_nr++;
++      percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
++
+       return es;
+ }
+ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
+ {
++      EXT4_I(inode)->i_es_all_nr--;
++      percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
++
+       /* Decrease the lru counter when this es is not delayed */
+       if (!ext4_es_is_delayed(es)) {
+               BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
+               EXT4_I(inode)->i_es_lru_nr--;
+-              percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
++              percpu_counter_dec(&EXT4_SB(inode->i_sb)->
++                                      s_es_stats.es_stats_lru_cnt);
+       }
+       kmem_cache_free(ext4_es_cachep, es);
+@@ -739,6 +749,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+                         struct extent_status *es)
+ {
+       struct ext4_es_tree *tree;
++      struct ext4_es_stats *stats;
+       struct extent_status *es1 = NULL;
+       struct rb_node *node;
+       int found = 0;
+@@ -775,11 +786,15 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+       }
+ out:
++      stats = &EXT4_SB(inode->i_sb)->s_es_stats;
+       if (found) {
+               BUG_ON(!es1);
+               es->es_lblk = es1->es_lblk;
+               es->es_len = es1->es_len;
+               es->es_pblk = es1->es_pblk;
++              stats->es_stats_cache_hits++;
++      } else {
++              stats->es_stats_cache_misses++;
+       }
+       read_unlock(&EXT4_I(inode)->i_es_lock);
+@@ -941,11 +956,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+                           struct ext4_inode_info *locked_ei)
+ {
+       struct ext4_inode_info *ei;
++      struct ext4_es_stats *es_stats;
+       struct list_head *cur, *tmp;
+       LIST_HEAD(skipped);
++      ktime_t start_time;
++      u64 scan_time;
+       int ret, nr_shrunk = 0;
+       int retried = 0, skip_precached = 1, nr_skipped = 0;
++      es_stats = &sbi->s_es_stats;
++      start_time = ktime_get();
+       spin_lock(&sbi->s_es_lru_lock);
+ retry:
+@@ -954,7 +974,8 @@ retry:
+                * If we have already reclaimed all extents from extent
+                * status tree, just stop the loop immediately.
+                */
+-              if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
++              if (percpu_counter_read_positive(
++                              &es_stats->es_stats_lru_cnt) == 0)
+                       break;
+               ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
+@@ -964,7 +985,7 @@ retry:
+                * time.  Normally we try hard to avoid shrinking
+                * precached inodes, but we will as a last resort.
+                */
+-              if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
++              if ((es_stats->es_stats_last_sorted < ei->i_touch_when) ||
+                   (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
+                                               EXT4_STATE_EXT_PRECACHED))) {
+                       nr_skipped++;
+@@ -998,7 +1019,7 @@ retry:
+       if ((nr_shrunk == 0) && nr_skipped && !retried) {
+               retried++;
+               list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+-              sbi->s_es_last_sorted = jiffies;
++              es_stats->es_stats_last_sorted = jiffies;
+               ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
+                                     i_es_lru);
+               /*
+@@ -1016,6 +1037,20 @@ retry:
+       if (locked_ei && nr_shrunk == 0)
+               nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
++      scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
++      if (likely(es_stats->es_stats_scan_time))
++              es_stats->es_stats_scan_time = (scan_time +
++                              es_stats->es_stats_scan_time*3) / 4;
++      else
++              es_stats->es_stats_scan_time = scan_time;
++      if (scan_time > es_stats->es_stats_max_scan_time)
++              es_stats->es_stats_max_scan_time = scan_time;
++      if (likely(es_stats->es_stats_shrunk))
++              es_stats->es_stats_shrunk = (nr_shrunk +
++                              es_stats->es_stats_shrunk*3) / 4;
++      else
++              es_stats->es_stats_shrunk = nr_shrunk;
++
+       return nr_shrunk;
+ }
+@@ -1026,7 +1061,7 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+       int nr_to_scan = sc->nr_to_scan;
+       int ret, nr_shrunk;
+-      ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
++      ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
+       trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
+       if (!nr_to_scan)
+@@ -1034,23 +1069,149 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+       nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
+-      ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
++      ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
+       trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
+       return ret;
+ }
+-void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
++static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
++{
++      return *pos ? NULL : SEQ_START_TOKEN;
++}
++
++static void *
++ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
++{
++      return NULL;
++}
++
++static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
++{
++      struct ext4_sb_info *sbi = seq->private;
++      struct ext4_es_stats *es_stats = &sbi->s_es_stats;
++      struct ext4_inode_info *ei, *max = NULL;
++      unsigned int inode_cnt = 0;
++
++      if (v != SEQ_START_TOKEN)
++              return 0;
++
++      /* here we just find an inode that has the max nr. of objects */
++      spin_lock(&sbi->s_es_lru_lock);
++      list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) {
++              inode_cnt++;
++              if (max && max->i_es_all_nr < ei->i_es_all_nr)
++                      max = ei;
++              else if (!max)
++                      max = ei;
++      }
++      spin_unlock(&sbi->s_es_lru_lock);
++
++      seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
++                 percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
++                 percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt));
++      seq_printf(seq, "  %lu/%lu cache hits/misses\n",
++                 es_stats->es_stats_cache_hits,
++                 es_stats->es_stats_cache_misses);
++      if (es_stats->es_stats_last_sorted != 0)
++              seq_printf(seq, "  %u ms last sorted interval\n",
++                         jiffies_to_msecs(jiffies -
++                                          es_stats->es_stats_last_sorted));
++      if (inode_cnt)
++              seq_printf(seq, "  %d inodes on lru list\n", inode_cnt);
++
++      seq_printf(seq, "average:\n  %llu us scan time\n",
++          div_u64(es_stats->es_stats_scan_time, 1000));
++      seq_printf(seq, "  %lu shrunk objects\n", es_stats->es_stats_shrunk);
++      if (inode_cnt)
++              seq_printf(seq,
++                  "maximum:\n  %lu inode (%u objects, %u reclaimable)\n"
++                  "  %llu us max scan time\n",
++                  max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr,
++                  div_u64(es_stats->es_stats_max_scan_time, 1000));
++
++      return 0;
++}
++
++static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
++{
++}
++
++static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
++      .start = ext4_es_seq_shrinker_info_start,
++      .next  = ext4_es_seq_shrinker_info_next,
++      .stop  = ext4_es_seq_shrinker_info_stop,
++      .show  = ext4_es_seq_shrinker_info_show,
++};
++
++static int
++ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
++{
++      int ret;
++
++      ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
++      if (!ret) {
++              struct seq_file *m = file->private_data;
++              m->private = PDE_DATA(inode);
++      }
++
++      return ret;
++}
++
++static int
++ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
+ {
++      return seq_release(inode, file);
++}
++
++static const struct file_operations ext4_es_seq_shrinker_info_fops = {
++      .owner          = THIS_MODULE,
++      .open           = ext4_es_seq_shrinker_info_open,
++      .read           = seq_read,
++      .llseek         = seq_lseek,
++      .release        = ext4_es_seq_shrinker_info_release,
++};
++
++int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
++{
++      int err;
++
+       INIT_LIST_HEAD(&sbi->s_es_lru);
+       spin_lock_init(&sbi->s_es_lru_lock);
+-      sbi->s_es_last_sorted = 0;
++      sbi->s_es_stats.es_stats_last_sorted = 0;
++      sbi->s_es_stats.es_stats_shrunk = 0;
++      sbi->s_es_stats.es_stats_cache_hits = 0;
++      sbi->s_es_stats.es_stats_cache_misses = 0;
++      sbi->s_es_stats.es_stats_scan_time = 0;
++      sbi->s_es_stats.es_stats_max_scan_time = 0;
++      err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt,
++                                0, GFP_KERNEL);
++      if (err)
++              return err;
++      err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt,
++                                0, GFP_KERNEL);
++      if (err)
++              goto err;
+       sbi->s_es_shrinker.shrink = ext4_es_shrink;
+       sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
+       register_shrinker(&sbi->s_es_shrinker);
++
++      if (sbi->s_proc)
++              proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
++                               &ext4_es_seq_shrinker_info_fops, sbi);
++
++      return 0;
++
++err:
++      percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
++      return err;
+ }
+ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
+ {
++      if (sbi->s_proc)
++              remove_proc_entry("es_shrinker_info", sbi->s_proc);
++      percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
++      percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
+       unregister_shrinker(&sbi->s_es_shrinker);
+ }
+diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
+index f1b62a41..efd5f970 100644
+--- a/fs/ext4/extents_status.h
++++ b/fs/ext4/extents_status.h
+@@ -64,6 +64,17 @@ struct ext4_es_tree {
+       struct extent_status *cache_es; /* recently accessed extent */
+ };
++struct ext4_es_stats {
++      unsigned long es_stats_last_sorted;
++      unsigned long es_stats_shrunk;
++      unsigned long es_stats_cache_hits;
++      unsigned long es_stats_cache_misses;
++      u64 es_stats_scan_time;
++      u64 es_stats_max_scan_time;
++      struct percpu_counter es_stats_all_cnt;
++      struct percpu_counter es_stats_lru_cnt;
++};
++
+ extern int __init ext4_init_es(void);
+ extern void ext4_exit_es(void);
+ extern void ext4_es_init_tree(struct ext4_es_tree *tree);
+@@ -138,7 +149,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,
+                      (pb & ~ES_MASK));
+ }
+-extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
++extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+ extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
+ extern void ext4_es_lru_add(struct inode *inode);
+ extern void ext4_es_lru_del(struct inode *inode);
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index 18fe358c..bcdb48cf 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -880,7 +880,6 @@ static void ext4_put_super(struct super_block *sb)
+       percpu_counter_destroy(&sbi->s_freeinodes_counter);
+       percpu_counter_destroy(&sbi->s_dirs_counter);
+       percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+-      percpu_counter_destroy(&sbi->s_extent_cache_cnt);
+ #ifdef CONFIG_QUOTA
+       for (i = 0; i < EXT4_MAXQUOTAS; i++)
+               kfree(sbi->s_qf_names[i]);
+@@ -944,6 +943,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
+       ext4_es_init_tree(&ei->i_es_tree);
+       rwlock_init(&ei->i_es_lock);
+       INIT_LIST_HEAD(&ei->i_es_lru);
++      ei->i_es_all_nr = 0;
+       ei->i_es_lru_nr = 0;
+       ei->i_touch_when = 0;
+       ei->i_reserved_data_blocks = 0;
+@@ -4289,14 +4289,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+       sbi->s_err_report.function = print_daily_error_info;
+       sbi->s_err_report.data = (unsigned long) sb;
+-      /* Register extent status tree shrinker */
+-      ext4_es_register_shrinker(sbi);
+-
+-      err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL);
+-      if (err) {
+-              ext4_msg(sb, KERN_ERR, "insufficient memory");
++      if (ext4_es_register_shrinker(sbi))
+               goto failed_mount3;
+-      }
+       sbi->s_stripe = ext4_get_stripe_size(sbi);
+       sbi->s_extent_max_zeroout_kb = 32;
+@@ -4641,10 +4635,9 @@ failed_mount_wq:
+               jbd2_journal_destroy(sbi->s_journal);
+               sbi->s_journal = NULL;
+       }
+-failed_mount3:
+       ext4_es_unregister_shrinker(sbi);
++failed_mount3:
+       del_timer_sync(&sbi->s_err_report);
+-      percpu_counter_destroy(&sbi->s_extent_cache_cnt);
+       if (sbi->s_mmp_tsk)
+               kthread_stop(sbi->s_mmp_tsk);
+ failed_mount2:
+-- 
+2.24.1
+