Whamcloud - gitweb
LU-13300 ldiskfs: port patches to improve extent status shrink 49/37749/7
authorWang Shilong <wshilong@ddn.com>
Thu, 27 Feb 2020 09:21:53 +0000 (17:21 +0800)
committerOleg Drokin <green@whamcloud.com>
Sat, 14 Mar 2020 17:25:57 +0000 (17:25 +0000)
We see a serious extent status shrink problem on some customer sites;
the backtrace looks like:

NMI watchdog: BUG: soft lockup - CPU#6 stuck for 22s! [kswapd0:106]
[<ffffffffae1958c2>] merge+0x62/0xc0
[<ffffffffc123fc50>] ? ldiskfs_init_inode_table+0x410/0x410 [ldiskfs]
[<ffffffffae1959bb>] list_sort+0x9b/0x250
[<ffffffffc124014e>] __ldiskfs_es_shrink+0x1ce/0x2a0 [ldiskfs]
[<ffffffffc12402d4>] ldiskfs_es_shrink+0xb4/0x130 [ldiskfs]
[<ffffffffadfcf805>] shrink_slab+0x175/0x340
[<ffffffffae03e1d7>] ? vmpressure+0x87/0x90
[<ffffffffadfd3538>] balance_pgdat+0x3a8/0x5e0

Backport the following Linux upstream commits since v3.18 to RHEL7:

Linux-commit: eb68d0e2fc5a4e5c06324ea5f485fccbae626d05
ext4: track extent status tree shrinker delay statictics

Linux-commit: 50460fe8c6d1d95b16427936e351f277a1c72d43
ext4: remove extent status procfs files if journal load fails

Linux-commit: edaa53cac8fd4b96ed4b8f96c4933158ff2dd337
ext4: change LRU to round-robin in extent status tree shrinker

Linux-commit: b0dea4c1651f3cdb6d17604fa473e72cb74cdc6b
ext4: move handling of list of shrinkable inodes into extent status code

Linux-commit: dd4759255188771e60cf3455982959a1ba04f4eb
ext4: limit number of scanned extents in status tree shrinker

Linux-commit: 624d0f1dd7c80d2bac4fc3066b2ff3947f890883
ext4: cleanup flag definitions for extent status tree

Linux-commit: 2be12de98a1cc21c4de4e2d6fb2bf5aa0a279947
ext4: introduce aging to extent status tree

Test-Parameters: fstype=ldiskfs serverdistro=el7.7
Change-Id: Idd97872b1663bc001a63274a430eaade66efd37d
Signed-off-by: Wang Shilong <wshilong@ddn.com>
Reviewed-on: https://review.whamcloud.com/37749
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Li Dongyang <dongyangli@ddn.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
ldiskfs/kernel_patches/patches/rhel7.6/ext4-change-LRU-to-round-robin-in-extent-status-tree.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/rhel7.6/ext4-cleanup-flag-definitions-for-extent-status-tree.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/rhel7.6/ext4-introduce-aging-to-extent-status-tree.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/rhel7.6/ext4-limit-number-of-scanned-extents-in-status-tree-.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/rhel7.6/ext4-move-handling-of-list-of-shrinkable-inodes-into.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/rhel7.6/ext4-remove-extent-status-procfs-files-if-journal-lo.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/rhel7.6/ext4-track-extent-status-tree-shrinker-delay-statict.patch [new file with mode: 0644]
ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.6.series
ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.7.series

diff --git a/ldiskfs/kernel_patches/patches/rhel7.6/ext4-change-LRU-to-round-robin-in-extent-status-tree.patch b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-change-LRU-to-round-robin-in-extent-status-tree.patch
new file mode 100644 (file)
index 0000000..bc04f8b
--- /dev/null
@@ -0,0 +1,570 @@
+From 4fac310a77c918d6a235a55cb76cf2f9bb22de71 Mon Sep 17 00:00:00 2001
+From: Zheng Liu <wenqing.lz@taobao.com>
+Date: Tue, 25 Nov 2014 11:45:37 -0500
+Subject: [PATCH 3/7] ext4: change LRU to round-robin in extent status tree
+ shrinker
+
+In this commit we discard the lru algorithm for inodes with extent
+status tree because it takes significant effort to maintain a lru list
+in extent status tree shrinker and the shrinker can take a long time to
+scan this lru list in order to reclaim some objects.
+
+We replace the lru ordering with a simple round-robin.  After that we
+never need to keep a lru list.  That means that the list needn't be
+sorted if the shrinker can not reclaim any objects in the first round.
+
+Cc: Andreas Dilger <adilger.kernel@dilger.ca>
+Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/ext4.h           |  10 +-
+ fs/ext4/extents.c        |   4 +-
+ fs/ext4/extents_status.c | 221 +++++++++++++++++----------------------
+ fs/ext4/extents_status.h |   7 +-
+ fs/ext4/inode.c          |   4 +-
+ fs/ext4/ioctl.c          |   4 +-
+ fs/ext4/super.c          |   7 +-
+ 7 files changed, 112 insertions(+), 145 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index cc5ba587..0813afd6 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1017,10 +1017,9 @@ struct ext4_inode_info {
+       /* extents status tree */
+       struct ext4_es_tree i_es_tree;
+       rwlock_t i_es_lock;
+-      struct list_head i_es_lru;
++      struct list_head i_es_list;
+       unsigned int i_es_all_nr;       /* protected by i_es_lock */
+-      unsigned int i_es_lru_nr;       /* protected by i_es_lock */
+-      unsigned long i_touch_when;     /* jiffies of last accessing */
++      unsigned int i_es_shk_nr;       /* protected by i_es_lock */
+       /* ialloc */
+       ext4_group_t    i_last_alloc_group;
+@@ -1482,9 +1481,10 @@ struct ext4_sb_info {
+       /* Reclaim extents from extent status tree */
+       struct shrinker s_es_shrinker;
+-      struct list_head s_es_lru;
++      struct list_head s_es_list;
++      long s_es_nr_inode;
+       struct ext4_es_stats s_es_stats;
+-      spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
++      spinlock_t s_es_lock ____cacheline_aligned_in_smp;
+       /* Ratelimit ext4 messages. */
+       struct ratelimit_state s_err_ratelimit_state;
+diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
+index f618d0ba..c012dc51 100644
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -4689,7 +4689,7 @@ out2:
+       trace_ext4_ext_map_blocks_exit(inode, flags, map,
+                                      err ? err : allocated);
+-      ext4_es_lru_add(inode);
++      ext4_es_list_add(inode);
+       return err ? err : allocated;
+ }
+@@ -5263,7 +5263,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+               error = ext4_fill_fiemap_extents(inode, start_blk,
+                                                len_blks, fieinfo);
+       }
+-      ext4_es_lru_add(inode);
++      ext4_es_list_add(inode);
+       return error;
+ }
+diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
+index 7dfed27b..382a7bf9 100644
+--- a/fs/ext4/extents_status.c
++++ b/fs/ext4/extents_status.c
+@@ -149,8 +149,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+                             ext4_lblk_t end);
+ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+                                      int nr_to_scan);
+-static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+-                          struct ext4_inode_info *locked_ei);
++static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
++                     struct ext4_inode_info *locked_ei);
+ int __init ext4_init_es(void)
+ {
+@@ -298,6 +298,36 @@ out:
+       trace_ext4_es_find_delayed_extent_range_exit(inode, es);
+ }
++void ext4_es_list_add(struct inode *inode)
++{
++      struct ext4_inode_info *ei = EXT4_I(inode);
++      struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
++
++      if (!list_empty(&ei->i_es_list))
++              return;
++
++      spin_lock(&sbi->s_es_lock);
++      if (list_empty(&ei->i_es_list)) {
++              list_add_tail(&ei->i_es_list, &sbi->s_es_list);
++              sbi->s_es_nr_inode++;
++      }
++      spin_unlock(&sbi->s_es_lock);
++}
++
++void ext4_es_list_del(struct inode *inode)
++{
++      struct ext4_inode_info *ei = EXT4_I(inode);
++      struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
++
++      spin_lock(&sbi->s_es_lock);
++      if (!list_empty(&ei->i_es_list)) {
++              list_del_init(&ei->i_es_list);
++              sbi->s_es_nr_inode--;
++              WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
++      }
++      spin_unlock(&sbi->s_es_lock);
++}
++
+ static struct extent_status *
+ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
+                    ext4_fsblk_t pblk)
+@@ -314,9 +344,9 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
+        * We don't count delayed extent because we never try to reclaim them
+        */
+       if (!ext4_es_is_delayed(es)) {
+-              EXT4_I(inode)->i_es_lru_nr++;
++              EXT4_I(inode)->i_es_shk_nr++;
+               percpu_counter_inc(&EXT4_SB(inode->i_sb)->
+-                                      s_es_stats.es_stats_lru_cnt);
++                                      s_es_stats.es_stats_shk_cnt);
+       }
+       EXT4_I(inode)->i_es_all_nr++;
+@@ -330,12 +360,12 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
+       EXT4_I(inode)->i_es_all_nr--;
+       percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
+-      /* Decrease the lru counter when this es is not delayed */
++      /* Decrease the shrink counter when this es is not delayed */
+       if (!ext4_es_is_delayed(es)) {
+-              BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
+-              EXT4_I(inode)->i_es_lru_nr--;
++              BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
++              EXT4_I(inode)->i_es_shk_nr--;
+               percpu_counter_dec(&EXT4_SB(inode->i_sb)->
+-                                      s_es_stats.es_stats_lru_cnt);
++                                      s_es_stats.es_stats_shk_cnt);
+       }
+       kmem_cache_free(ext4_es_cachep, es);
+@@ -693,8 +723,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+               goto error;
+ retry:
+       err = __es_insert_extent(inode, &newes);
+-      if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+-                                             EXT4_I(inode)))
++      if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
++                                        1, EXT4_I(inode)))
+               goto retry;
+       if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
+               err = 0;
+@@ -851,8 +881,8 @@ retry:
+                               es->es_lblk = orig_es.es_lblk;
+                               es->es_len = orig_es.es_len;
+                               if ((err == -ENOMEM) &&
+-                                  __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+-                                                   EXT4_I(inode)))
++                                  __es_shrink(EXT4_SB(inode->i_sb),
++                                                      1, EXT4_I(inode)))
+                                       goto retry;
+                               goto out;
+                       }
+@@ -924,6 +954,11 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+       end = lblk + len - 1;
+       BUG_ON(end < lblk);
++      /*
++       * ext4_clear_inode() depends on us taking i_es_lock unconditionally
++       * so that we are sure __es_shrink() is done with the inode before it
++       * is reclaimed.
++       */
+       write_lock(&EXT4_I(inode)->i_es_lock);
+       err = __es_remove_extent(inode, lblk, end);
+       write_unlock(&EXT4_I(inode)->i_es_lock);
+@@ -931,112 +966,77 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+       return err;
+ }
+-static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
+-                                   struct list_head *b)
+-{
+-      struct ext4_inode_info *eia, *eib;
+-      eia = list_entry(a, struct ext4_inode_info, i_es_lru);
+-      eib = list_entry(b, struct ext4_inode_info, i_es_lru);
+-
+-      if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+-          !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+-              return 1;
+-      if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+-          ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+-              return -1;
+-      if (eia->i_touch_when == eib->i_touch_when)
+-              return 0;
+-      if (time_after(eia->i_touch_when, eib->i_touch_when))
+-              return 1;
+-      else
+-              return -1;
+-}
+-
+-static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+-                          struct ext4_inode_info *locked_ei)
++static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
++                     struct ext4_inode_info *locked_ei)
+ {
+       struct ext4_inode_info *ei;
+       struct ext4_es_stats *es_stats;
+-      struct list_head *cur, *tmp;
+-      LIST_HEAD(skipped);
+       ktime_t start_time;
+       u64 scan_time;
++      int nr_to_walk;
+       int ret, nr_shrunk = 0;
+-      int retried = 0, skip_precached = 1, nr_skipped = 0;
++      int retried = 0, nr_skipped = 0;
+       es_stats = &sbi->s_es_stats;
+       start_time = ktime_get();
+-      spin_lock(&sbi->s_es_lru_lock);
+ retry:
+-      list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
+-              /*
+-               * If we have already reclaimed all extents from extent
+-               * status tree, just stop the loop immediately.
+-               */
+-              if (percpu_counter_read_positive(
+-                              &es_stats->es_stats_lru_cnt) == 0)
+-                      break;
+-
+-              ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
++      spin_lock(&sbi->s_es_lock);
++      nr_to_walk = sbi->s_es_nr_inode;
++      while (nr_to_walk-- > 0) {
++              if (list_empty(&sbi->s_es_list)) {
++                      spin_unlock(&sbi->s_es_lock);
++                      goto out;
++              }
++              ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
++                                    i_es_list);
++              /* Move the inode to the tail */
++              list_move(&ei->i_es_list, sbi->s_es_list.prev);
+               /*
+-               * Skip the inode that is newer than the last_sorted
+-               * time.  Normally we try hard to avoid shrinking
+-               * precached inodes, but we will as a last resort.
++               * Normally we try hard to avoid shrinking precached inodes,
++               * but we will as a last resort.
+                */
+-              if ((es_stats->es_stats_last_sorted < ei->i_touch_when) ||
+-                  (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
+-                                              EXT4_STATE_EXT_PRECACHED))) {
++              if (!retried && ext4_test_inode_state(&ei->vfs_inode,
++                                              EXT4_STATE_EXT_PRECACHED)) {
+                       nr_skipped++;
+-                      list_move_tail(cur, &skipped);
+                       continue;
+               }
+-              if (ei->i_es_lru_nr == 0 || ei == locked_ei ||
+-                  !write_trylock(&ei->i_es_lock))
+-                      continue;
++              if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
++                      nr_skipped++;
++                      continue;
++              }
++              /*
++               * Now we hold i_es_lock which protects us from inode reclaim
++               * freeing inode under us
++               */
++              spin_unlock(&sbi->s_es_lock);
+               ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
+-              if (ei->i_es_lru_nr == 0)
+-                      list_del_init(&ei->i_es_lru);
+               write_unlock(&ei->i_es_lock);
+               nr_shrunk += ret;
+               nr_to_scan -= ret;
+               if (nr_to_scan == 0)
+-                      break;
++                      goto out;
++              spin_lock(&sbi->s_es_lock);
+       }
+-      /* Move the newer inodes into the tail of the LRU list. */
+-      list_splice_tail(&skipped, &sbi->s_es_lru);
+-      INIT_LIST_HEAD(&skipped);
++      spin_unlock(&sbi->s_es_lock);
+       /*
+        * If we skipped any inodes, and we weren't able to make any
+-       * forward progress, sort the list and try again.
++       * forward progress, try again to scan precached inodes.
+        */
+       if ((nr_shrunk == 0) && nr_skipped && !retried) {
+               retried++;
+-              list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+-              es_stats->es_stats_last_sorted = jiffies;
+-              ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
+-                                    i_es_lru);
+-              /*
+-               * If there are no non-precached inodes left on the
+-               * list, start releasing precached extents.
+-               */
+-              if (ext4_test_inode_state(&ei->vfs_inode,
+-                                        EXT4_STATE_EXT_PRECACHED))
+-                      skip_precached = 0;
+               goto retry;
+       }
+-      spin_unlock(&sbi->s_es_lru_lock);
+-
+       if (locked_ei && nr_shrunk == 0)
+               nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
+-
++out:
+       scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+       if (likely(es_stats->es_stats_scan_time))
+               es_stats->es_stats_scan_time = (scan_time +
+@@ -1061,15 +1061,15 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+       int nr_to_scan = sc->nr_to_scan;
+       int ret, nr_shrunk;
+-      ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
++      ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
+       trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
+       if (!nr_to_scan)
+               return ret;
+-      nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
++      nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);
+-      ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
++      ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
+       trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
+       return ret;
+ }
+@@ -1096,28 +1096,24 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
+               return 0;
+       /* here we just find an inode that has the max nr. of objects */
+-      spin_lock(&sbi->s_es_lru_lock);
+-      list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) {
++      spin_lock(&sbi->s_es_lock);
++      list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
+               inode_cnt++;
+               if (max && max->i_es_all_nr < ei->i_es_all_nr)
+                       max = ei;
+               else if (!max)
+                       max = ei;
+       }
+-      spin_unlock(&sbi->s_es_lru_lock);
++      spin_unlock(&sbi->s_es_lock);
+       seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
+                  percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
+-                 percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt));
++                 percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
+       seq_printf(seq, "  %lu/%lu cache hits/misses\n",
+                  es_stats->es_stats_cache_hits,
+                  es_stats->es_stats_cache_misses);
+-      if (es_stats->es_stats_last_sorted != 0)
+-              seq_printf(seq, "  %u ms last sorted interval\n",
+-                         jiffies_to_msecs(jiffies -
+-                                          es_stats->es_stats_last_sorted));
+       if (inode_cnt)
+-              seq_printf(seq, "  %d inodes on lru list\n", inode_cnt);
++              seq_printf(seq, "  %d inodes on list\n", inode_cnt);
+       seq_printf(seq, "average:\n  %llu us scan time\n",
+           div_u64(es_stats->es_stats_scan_time, 1000));
+@@ -1126,7 +1122,7 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
+               seq_printf(seq,
+                   "maximum:\n  %lu inode (%u objects, %u reclaimable)\n"
+                   "  %llu us max scan time\n",
+-                  max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr,
++                  max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr,
+                   div_u64(es_stats->es_stats_max_scan_time, 1000));
+       return 0;
+@@ -1175,9 +1171,9 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
+ {
+       int err;
+-      INIT_LIST_HEAD(&sbi->s_es_lru);
+-      spin_lock_init(&sbi->s_es_lru_lock);
+-      sbi->s_es_stats.es_stats_last_sorted = 0;
++      INIT_LIST_HEAD(&sbi->s_es_list);
++      sbi->s_es_nr_inode = 0;
++      spin_lock_init(&sbi->s_es_lock);
+       sbi->s_es_stats.es_stats_shrunk = 0;
+       sbi->s_es_stats.es_stats_cache_hits = 0;
+       sbi->s_es_stats.es_stats_cache_misses = 0;
+@@ -1187,7 +1183,7 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
+                                 0, GFP_KERNEL);
+       if (err)
+               return err;
+-      err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt,
++      err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt,
+                                 0, GFP_KERNEL);
+       if (err)
+               goto err;
+@@ -1211,37 +1207,10 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
+       if (sbi->s_proc)
+               remove_proc_entry("es_shrinker_info", sbi->s_proc);
+       percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
+-      percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
++      percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
+       unregister_shrinker(&sbi->s_es_shrinker);
+ }
+-void ext4_es_lru_add(struct inode *inode)
+-{
+-      struct ext4_inode_info *ei = EXT4_I(inode);
+-      struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+-
+-      ei->i_touch_when = jiffies;
+-
+-      if (!list_empty(&ei->i_es_lru))
+-              return;
+-
+-      spin_lock(&sbi->s_es_lru_lock);
+-      if (list_empty(&ei->i_es_lru))
+-              list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
+-      spin_unlock(&sbi->s_es_lru_lock);
+-}
+-
+-void ext4_es_lru_del(struct inode *inode)
+-{
+-      struct ext4_inode_info *ei = EXT4_I(inode);
+-      struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+-
+-      spin_lock(&sbi->s_es_lru_lock);
+-      if (!list_empty(&ei->i_es_lru))
+-              list_del_init(&ei->i_es_lru);
+-      spin_unlock(&sbi->s_es_lru_lock);
+-}
+-
+ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+                                      int nr_to_scan)
+ {
+@@ -1253,7 +1222,7 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+       static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+                                     DEFAULT_RATELIMIT_BURST);
+-      if (ei->i_es_lru_nr == 0)
++      if (ei->i_es_shk_nr == 0)
+               return 0;
+       if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
+diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
+index efd5f970..0e6a33e8 100644
+--- a/fs/ext4/extents_status.h
++++ b/fs/ext4/extents_status.h
+@@ -65,14 +65,13 @@ struct ext4_es_tree {
+ };
+ struct ext4_es_stats {
+-      unsigned long es_stats_last_sorted;
+       unsigned long es_stats_shrunk;
+       unsigned long es_stats_cache_hits;
+       unsigned long es_stats_cache_misses;
+       u64 es_stats_scan_time;
+       u64 es_stats_max_scan_time;
+       struct percpu_counter es_stats_all_cnt;
+-      struct percpu_counter es_stats_lru_cnt;
++      struct percpu_counter es_stats_shk_cnt;
+ };
+ extern int __init ext4_init_es(void);
+@@ -151,7 +150,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,
+ extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+ extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
+-extern void ext4_es_lru_add(struct inode *inode);
+-extern void ext4_es_lru_del(struct inode *inode);
++extern void ext4_es_list_add(struct inode *inode);
++extern void ext4_es_list_del(struct inode *inode);
+ #endif /* _EXT4_EXTENTS_STATUS_H */
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 21db5952..f6a2764c 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -523,7 +523,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
+       /* Lookup extent status tree firstly */
+       if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+-              ext4_es_lru_add(inode);
++              ext4_es_list_add(inode);
+               if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
+                       map->m_pblk = ext4_es_pblock(&es) +
+                                       map->m_lblk - es.es_lblk;
+@@ -1519,7 +1519,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
+       /* Lookup extent status tree firstly */
+       if (ext4_es_lookup_extent(inode, iblock, &es)) {
+-              ext4_es_lru_add(inode);
++              ext4_es_list_add(inode);
+               if (ext4_es_is_hole(&es)) {
+                       retval = 0;
+                       down_read(&EXT4_I(inode)->i_data_sem);
+diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
+index 858cf709..122d517c 100644
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -80,8 +80,8 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
+       memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
+       ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
+       ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
+-      ext4_es_lru_del(inode1);
+-      ext4_es_lru_del(inode2);
++      ext4_es_list_del(inode1);
++      ext4_es_list_del(inode2);
+       isize = i_size_read(inode1);
+       i_size_write(inode1, i_size_read(inode2));
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index 95a01d56..ea2a1026 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -942,10 +942,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
+       spin_lock_init(&ei->i_prealloc_lock);
+       ext4_es_init_tree(&ei->i_es_tree);
+       rwlock_init(&ei->i_es_lock);
+-      INIT_LIST_HEAD(&ei->i_es_lru);
++      INIT_LIST_HEAD(&ei->i_es_list);
+       ei->i_es_all_nr = 0;
+-      ei->i_es_lru_nr = 0;
+-      ei->i_touch_when = 0;
++      ei->i_es_shk_nr = 0;
+       ei->i_reserved_data_blocks = 0;
+       ei->i_reserved_meta_blocks = 0;
+       ei->i_allocated_meta_blocks = 0;
+@@ -1034,7 +1033,7 @@ void ext4_clear_inode(struct inode *inode)
+       dquot_drop(inode);
+       ext4_discard_preallocations(inode);
+       ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+-      ext4_es_lru_del(inode);
++      ext4_es_list_del(inode);
+       if (EXT4_I(inode)->jinode) {
+               jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
+                                              EXT4_I(inode)->jinode);
+-- 
+2.24.1
+
diff --git a/ldiskfs/kernel_patches/patches/rhel7.6/ext4-cleanup-flag-definitions-for-extent-status-tree.patch b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-cleanup-flag-definitions-for-extent-status-tree.patch
new file mode 100644 (file)
index 0000000..4d28b12
--- /dev/null
@@ -0,0 +1,139 @@
+From dd5c7af957dd0b9b3b04ef8aacffd601b46bc26c Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Tue, 25 Nov 2014 11:53:47 -0500
+Subject: [PATCH 6/7] ext4: cleanup flag definitions for extent status tree
+
+Currently flags for extent status tree are defined twice, once shifted
+and once without a being shifted. Consolidate these definitions into one
+place and make some computations automatic to make adding flags less
+error prone. Compiler should be clever enough to figure out these are
+constants and generate the same code.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/extents_status.c |  2 ++
+ fs/ext4/extents_status.h | 58 ++++++++++++++++++----------------------
+ 2 files changed, 28 insertions(+), 32 deletions(-)
+
+diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
+index b78eec2a..a29708c0 100644
+--- a/fs/ext4/extents_status.c
++++ b/fs/ext4/extents_status.c
+@@ -1170,6 +1170,8 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
+ {
+       int err;
++      /* Make sure we have enough bits for physical block number */
++      BUILD_BUG_ON(ES_SHIFT < 48);
+       INIT_LIST_HEAD(&sbi->s_es_list);
+       sbi->s_es_nr_inode = 0;
+       spin_lock_init(&sbi->s_es_lock);
+diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
+index b0b78b95..e86b1f34 100644
+--- a/fs/ext4/extents_status.h
++++ b/fs/ext4/extents_status.h
+@@ -29,25 +29,21 @@
+ /*
+  * These flags live in the high bits of extent_status.es_pblk
+  */
+-#define ES_SHIFT      60
+-
+-#define EXTENT_STATUS_WRITTEN (1 << 3)
+-#define EXTENT_STATUS_UNWRITTEN (1 << 2)
+-#define EXTENT_STATUS_DELAYED (1 << 1)
+-#define EXTENT_STATUS_HOLE    (1 << 0)
+-
+-#define EXTENT_STATUS_FLAGS   (EXTENT_STATUS_WRITTEN | \
+-                               EXTENT_STATUS_UNWRITTEN | \
+-                               EXTENT_STATUS_DELAYED | \
+-                               EXTENT_STATUS_HOLE)
++enum {
++      ES_WRITTEN_B,
++      ES_UNWRITTEN_B,
++      ES_DELAYED_B,
++      ES_HOLE_B,
++      ES_FLAGS
++};
+-#define ES_WRITTEN            (1ULL << 63)
+-#define ES_UNWRITTEN          (1ULL << 62)
+-#define ES_DELAYED            (1ULL << 61)
+-#define ES_HOLE                       (1ULL << 60)
++#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS)
++#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT)
+-#define ES_MASK                       (ES_WRITTEN | ES_UNWRITTEN | \
+-                               ES_DELAYED | ES_HOLE)
++#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B)
++#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B)
++#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B)
++#define EXTENT_STATUS_HOLE    (1 << ES_HOLE_B)
+ struct ext4_sb_info;
+ struct ext4_extent;
+@@ -92,29 +88,29 @@ extern void ext4_es_find_delayed_extent_range(struct inode *inode,
+ extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+                                struct extent_status *es);
++static inline unsigned int ext4_es_status(struct extent_status *es)
++{
++      return es->es_pblk >> ES_SHIFT;
++}
++
+ static inline int ext4_es_is_written(struct extent_status *es)
+ {
+-      return (es->es_pblk & ES_WRITTEN) != 0;
++      return (ext4_es_status(es) & EXTENT_STATUS_WRITTEN) != 0;
+ }
+ static inline int ext4_es_is_unwritten(struct extent_status *es)
+ {
+-      return (es->es_pblk & ES_UNWRITTEN) != 0;
++      return (ext4_es_status(es) & EXTENT_STATUS_UNWRITTEN) != 0;
+ }
+ static inline int ext4_es_is_delayed(struct extent_status *es)
+ {
+-      return (es->es_pblk & ES_DELAYED) != 0;
++      return (ext4_es_status(es) & EXTENT_STATUS_DELAYED) != 0;
+ }
+ static inline int ext4_es_is_hole(struct extent_status *es)
+ {
+-      return (es->es_pblk & ES_HOLE) != 0;
+-}
+-
+-static inline unsigned int ext4_es_status(struct extent_status *es)
+-{
+-      return es->es_pblk >> ES_SHIFT;
++      return (ext4_es_status(es) & EXTENT_STATUS_HOLE) != 0;
+ }
+ static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
+@@ -134,18 +130,16 @@ static inline void ext4_es_store_pblock(struct extent_status *es,
+ static inline void ext4_es_store_status(struct extent_status *es,
+                                       unsigned int status)
+ {
+-      es->es_pblk = (((ext4_fsblk_t)
+-                      (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
+-                     (es->es_pblk & ~ES_MASK));
++      es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
++                    (es->es_pblk & ~ES_MASK);
+ }
+ static inline void ext4_es_store_pblock_status(struct extent_status *es,
+                                              ext4_fsblk_t pb,
+                                              unsigned int status)
+ {
+-      es->es_pblk = (((ext4_fsblk_t)
+-                      (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
+-                     (pb & ~ES_MASK));
++      es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
++                    (pb & ~ES_MASK);
+ }
+ extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+-- 
+2.24.1
+
diff --git a/ldiskfs/kernel_patches/patches/rhel7.6/ext4-introduce-aging-to-extent-status-tree.patch b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-introduce-aging-to-extent-status-tree.patch
new file mode 100644 (file)
index 0000000..ddbe7e7
--- /dev/null
@@ -0,0 +1,156 @@
+From 1da6da1563df986dd35080d7edcf59b739696c40 Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Tue, 25 Nov 2014 11:55:24 -0500
+Subject: [PATCH 7/7] ext4: introduce aging to extent status tree
+
+Introduce a simple aging to extent status tree. Each extent has a
+REFERENCED bit which gets set when the extent is used. Shrinker then
+skips entries with referenced bit set and clears the bit. Thus
+frequently used extents have higher chances of staying in memory.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/extents_status.c | 22 +++++++++++++++++-----
+ fs/ext4/extents_status.h | 35 +++++++++++++++++++++++++++++++----
+ 2 files changed, 48 insertions(+), 9 deletions(-)
+
+diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
+index a29708c0..0305f308 100644
+--- a/fs/ext4/extents_status.c
++++ b/fs/ext4/extents_status.c
+@@ -382,7 +382,7 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
+ static int ext4_es_can_be_merged(struct extent_status *es1,
+                                struct extent_status *es2)
+ {
+-      if (ext4_es_status(es1) != ext4_es_status(es2))
++      if (ext4_es_type(es1) != ext4_es_type(es2))
+               return 0;
+       if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
+@@ -425,6 +425,8 @@ ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
+       es1 = rb_entry(node, struct extent_status, rb_node);
+       if (ext4_es_can_be_merged(es1, es)) {
+               es1->es_len += es->es_len;
++              if (ext4_es_is_referenced(es))
++                      ext4_es_set_referenced(es1);
+               rb_erase(&es->rb_node, &tree->root);
+               ext4_es_free_extent(inode, es);
+               es = es1;
+@@ -447,6 +449,8 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
+       es1 = rb_entry(node, struct extent_status, rb_node);
+       if (ext4_es_can_be_merged(es, es1)) {
+               es->es_len += es1->es_len;
++              if (ext4_es_is_referenced(es1))
++                      ext4_es_set_referenced(es);
+               rb_erase(node, &tree->root);
+               ext4_es_free_extent(inode, es1);
+       }
+@@ -823,6 +827,8 @@ out:
+               es->es_lblk = es1->es_lblk;
+               es->es_len = es1->es_len;
+               es->es_pblk = es1->es_pblk;
++              if (!ext4_es_is_referenced(es))
++                      ext4_es_set_referenced(es);
+               stats->es_stats_cache_hits++;
+       } else {
+               stats->es_stats_cache_misses++;
+@@ -1243,11 +1249,17 @@ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
+                * We can't reclaim delayed extent from status tree because
+                * fiemap, bigallic, and seek_data/hole need to use it.
+                */
+-              if (!ext4_es_is_delayed(es)) {
+-                      rb_erase(&es->rb_node, &tree->root);
+-                      ext4_es_free_extent(inode, es);
+-                      (*nr_shrunk)++;
++              if (ext4_es_is_delayed(es))
++                      goto next;
++              if (ext4_es_is_referenced(es)) {
++                      ext4_es_clear_referenced(es);
++                      goto next;
+               }
++
++              rb_erase(&es->rb_node, &tree->root);
++              ext4_es_free_extent(inode, es);
++              (*nr_shrunk)++;
++next:
+               if (!node)
+                       goto out_wrap;
+               es = rb_entry(node, struct extent_status, rb_node);
+diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
+index e86b1f34..691b5261 100644
+--- a/fs/ext4/extents_status.h
++++ b/fs/ext4/extents_status.h
+@@ -34,6 +34,7 @@ enum {
+       ES_UNWRITTEN_B,
+       ES_DELAYED_B,
+       ES_HOLE_B,
++      ES_REFERENCED_B,
+       ES_FLAGS
+ };
+@@ -44,6 +45,12 @@ enum {
+ #define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B)
+ #define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B)
+ #define EXTENT_STATUS_HOLE    (1 << ES_HOLE_B)
++#define EXTENT_STATUS_REFERENCED      (1 << ES_REFERENCED_B)
++
++#define ES_TYPE_MASK  ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \
++                        EXTENT_STATUS_UNWRITTEN | \
++                        EXTENT_STATUS_DELAYED | \
++                        EXTENT_STATUS_HOLE) << ES_SHIFT)
+ struct ext4_sb_info;
+ struct ext4_extent;
+@@ -93,24 +100,44 @@ static inline unsigned int ext4_es_status(struct extent_status *es)
+       return es->es_pblk >> ES_SHIFT;
+ }
++static inline unsigned int ext4_es_type(struct extent_status *es)
++{
++      return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT;
++}
++
+ static inline int ext4_es_is_written(struct extent_status *es)
+ {
+-      return (ext4_es_status(es) & EXTENT_STATUS_WRITTEN) != 0;
++      return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0;
+ }
+ static inline int ext4_es_is_unwritten(struct extent_status *es)
+ {
+-      return (ext4_es_status(es) & EXTENT_STATUS_UNWRITTEN) != 0;
++      return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0;
+ }
+ static inline int ext4_es_is_delayed(struct extent_status *es)
+ {
+-      return (ext4_es_status(es) & EXTENT_STATUS_DELAYED) != 0;
++      return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0;
+ }
+ static inline int ext4_es_is_hole(struct extent_status *es)
+ {
+-      return (ext4_es_status(es) & EXTENT_STATUS_HOLE) != 0;
++      return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
++}
++
++static inline void ext4_es_set_referenced(struct extent_status *es)
++{
++      es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
++}
++
++static inline void ext4_es_clear_referenced(struct extent_status *es)
++{
++      es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT);
++}
++
++static inline int ext4_es_is_referenced(struct extent_status *es)
++{
++      return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0;
+ }
+ static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
+-- 
+2.24.1
+
diff --git a/ldiskfs/kernel_patches/patches/rhel7.6/ext4-limit-number-of-scanned-extents-in-status-tree-.patch b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-limit-number-of-scanned-extents-in-status-tree-.patch
new file mode 100644 (file)
index 0000000..30d3306
--- /dev/null
@@ -0,0 +1,235 @@
+From b72242d714ac3968bbb25867718e731be217e87b Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Tue, 25 Nov 2014 11:51:23 -0500
+Subject: [PATCH 5/7] ext4: limit number of scanned extents in status tree
+ shrinker
+
+Currently we scan extent status trees of inodes until we reclaim nr_to_scan
+extents. This can however require a lot of scanning when there are lots
+of delayed extents (as those cannot be reclaimed).
+
+Change shrinker to work as shrinkers are supposed to and *scan* only
+nr_to_scan extents regardless of how many extents did we actually
+reclaim. We however need to be careful and avoid scanning each status
+tree from the beginning - that could lead to a situation where we would
+not be able to reclaim anything at all when first nr_to_scan extents in
+the tree are always unreclaimable. We remember with each inode offset
+where we stopped scanning and continue from there when we next come
+across the inode.
+
+Note that we also need to update places calling __es_shrink() manually
+to pass reasonable nr_to_scan to have a chance of reclaiming anything and
+not just 1.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/ext4.h           |  5 ++-
+ fs/ext4/extents_status.c | 91 ++++++++++++++++++++++++++--------------
+ fs/ext4/super.c          |  1 +
+ 3 files changed, 65 insertions(+), 32 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 0813afd6..2893a168 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1020,6 +1020,9 @@ struct ext4_inode_info {
+       struct list_head i_es_list;
+       unsigned int i_es_all_nr;       /* protected by i_es_lock */
+       unsigned int i_es_shk_nr;       /* protected by i_es_lock */
++      ext4_lblk_t i_es_shrink_lblk;   /* Offset where we start searching for
++                                         extents to shrink. Protected by
++                                         i_es_lock  */
+       /* ialloc */
+       ext4_group_t    i_last_alloc_group;
+@@ -1481,7 +1484,7 @@ struct ext4_sb_info {
+       /* Reclaim extents from extent status tree */
+       struct shrinker s_es_shrinker;
+-      struct list_head s_es_list;
++      struct list_head s_es_list;     /* List of inodes with reclaimable extents */
+       long s_es_nr_inode;
+       struct ext4_es_stats s_es_stats;
+       spinlock_t s_es_lock ____cacheline_aligned_in_smp;
+diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
+index edd49793..b78eec2a 100644
+--- a/fs/ext4/extents_status.c
++++ b/fs/ext4/extents_status.c
+@@ -147,8 +147,7 @@ static struct kmem_cache *ext4_es_cachep;
+ static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
+ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+                             ext4_lblk_t end);
+-static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+-                                     int nr_to_scan);
++static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
+ static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+                      struct ext4_inode_info *locked_ei);
+@@ -726,7 +725,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ retry:
+       err = __es_insert_extent(inode, &newes);
+       if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
+-                                        1, EXT4_I(inode)))
++                                        128, EXT4_I(inode)))
+               goto retry;
+       if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
+               err = 0;
+@@ -884,7 +883,7 @@ retry:
+                               es->es_len = orig_es.es_len;
+                               if ((err == -ENOMEM) &&
+                                   __es_shrink(EXT4_SB(inode->i_sb),
+-                                                      1, EXT4_I(inode)))
++                                                      128, EXT4_I(inode)))
+                                       goto retry;
+                               goto out;
+                       }
+@@ -976,7 +975,7 @@ static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+       ktime_t start_time;
+       u64 scan_time;
+       int nr_to_walk;
+-      int ret, nr_shrunk = 0;
++      int nr_shrunk = 0;
+       int retried = 0, nr_skipped = 0;
+       es_stats = &sbi->s_es_stats;
+@@ -994,7 +993,7 @@ retry:
+               ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
+                                     i_es_list);
+               /* Move the inode to the tail */
+-              list_move(&ei->i_es_list, sbi->s_es_list.prev);
++              list_move_tail(&ei->i_es_list, &sbi->s_es_list);
+               /*
+                * Normally we try hard to avoid shrinking precached inodes,
+                * but we will as a last resort.
+@@ -1015,12 +1014,10 @@ retry:
+                */
+               spin_unlock(&sbi->s_es_lock);
+-              ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
++              nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
+               write_unlock(&ei->i_es_lock);
+-              nr_shrunk += ret;
+-              nr_to_scan -= ret;
+-              if (nr_to_scan == 0)
++              if (nr_to_scan <= 0)
+                       goto out;
+               spin_lock(&sbi->s_es_lock);
+       }
+@@ -1037,7 +1034,7 @@ retry:
+       }
+       if (locked_ei && nr_shrunk == 0)
+-              nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
++              nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);
+ out:
+       scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+       if (likely(es_stats->es_stats_scan_time))
+@@ -1213,27 +1210,32 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
+       unregister_shrinker(&sbi->s_es_shrinker);
+ }
+-static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+-                                     int nr_to_scan)
++/*
++ * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at
++ * most *nr_to_scan extents, update *nr_to_scan accordingly.
++ *
++ * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan.
++ * Increment *nr_shrunk by the number of reclaimed extents. Also update
++ * ei->i_es_shrink_lblk to where we should continue scanning.
++ */
++static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
++                               int *nr_to_scan, int *nr_shrunk)
+ {
+       struct inode *inode = &ei->vfs_inode;
+       struct ext4_es_tree *tree = &ei->i_es_tree;
+-      struct rb_node *node;
+       struct extent_status *es;
+-      int nr_shrunk = 0;
+-      static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+-                                    DEFAULT_RATELIMIT_BURST);
+-
+-      if (ei->i_es_shk_nr == 0)
+-              return 0;
+-
+-      if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
+-          __ratelimit(&_rs))
+-              ext4_warning(inode->i_sb, "forced shrink of precached extents");
++      struct rb_node *node;
+-      node = rb_first(&tree->root);
+-      while (node != NULL) {
+-              es = rb_entry(node, struct extent_status, rb_node);
++      es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
++      if (!es)
++              goto out_wrap;
++      node = &es->rb_node;
++      while (*nr_to_scan > 0) {
++              if (es->es_lblk > end) {
++                      ei->i_es_shrink_lblk = end + 1;
++                      return 0;
++              }
++              (*nr_to_scan)--;
+               node = rb_next(&es->rb_node);
+               /*
+                * We can't reclaim delayed extent from status tree because
+@@ -1242,11 +1244,38 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+               if (!ext4_es_is_delayed(es)) {
+                       rb_erase(&es->rb_node, &tree->root);
+                       ext4_es_free_extent(inode, es);
+-                      nr_shrunk++;
+-                      if (--nr_to_scan == 0)
+-                              break;
++                      (*nr_shrunk)++;
+               }
++              if (!node)
++                      goto out_wrap;
++              es = rb_entry(node, struct extent_status, rb_node);
+       }
+-      tree->cache_es = NULL;
++      ei->i_es_shrink_lblk = es->es_lblk;
++      return 1;
++out_wrap:
++      ei->i_es_shrink_lblk = 0;
++      return 0;
++}
++
++static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
++{
++      struct inode *inode = &ei->vfs_inode;
++      int nr_shrunk = 0;
++      ext4_lblk_t start = ei->i_es_shrink_lblk;
++      static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
++                                    DEFAULT_RATELIMIT_BURST);
++
++      if (ei->i_es_shk_nr == 0)
++              return 0;
++
++      if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
++          __ratelimit(&_rs))
++              ext4_warning(inode->i_sb, "forced shrink of precached extents");
++
++      if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
++          start != 0)
++              es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);
++
++      ei->i_es_tree.cache_es = NULL;
+       return nr_shrunk;
+ }
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index 8a81fa73..d9cd4ff9 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -945,6 +945,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
+       INIT_LIST_HEAD(&ei->i_es_list);
+       ei->i_es_all_nr = 0;
+       ei->i_es_shk_nr = 0;
++      ei->i_es_shrink_lblk = 0;
+       ei->i_reserved_data_blocks = 0;
+       ei->i_reserved_meta_blocks = 0;
+       ei->i_allocated_meta_blocks = 0;
+-- 
+2.24.1
+
diff --git a/ldiskfs/kernel_patches/patches/rhel7.6/ext4-move-handling-of-list-of-shrinkable-inodes-into.patch b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-move-handling-of-list-of-shrinkable-inodes-into.patch
new file mode 100644 (file)
index 0000000..6418198
--- /dev/null
@@ -0,0 +1,147 @@
+From 8d5847463404eb2d6b24f748d521d1930a432da9 Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Tue, 25 Nov 2014 11:49:25 -0500
+Subject: [PATCH 4/7] ext4: move handling of list of shrinkable inodes into
+ extent status code
+
+Currently callers adding extents to extent status tree were responsible
+for adding the inode to the list of inodes with freeable extents. This
+is error prone and puts list handling in unnecessarily many places.
+
+Just add inode to the list automatically when the first non-delay extent
+is added to the tree and remove inode from the list when the last
+non-delay extent is removed.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/extents.c        |  2 --
+ fs/ext4/extents_status.c | 10 ++++++----
+ fs/ext4/extents_status.h |  2 --
+ fs/ext4/inode.c          |  2 --
+ fs/ext4/ioctl.c          |  2 --
+ fs/ext4/super.c          |  1 -
+ 6 files changed, 6 insertions(+), 13 deletions(-)
+
+diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
+index c012dc51..d9d51a5b 100644
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -4689,7 +4689,6 @@ out2:
+       trace_ext4_ext_map_blocks_exit(inode, flags, map,
+                                      err ? err : allocated);
+-      ext4_es_list_add(inode);
+       return err ? err : allocated;
+ }
+@@ -5263,7 +5262,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+               error = ext4_fill_fiemap_extents(inode, start_blk,
+                                                len_blks, fieinfo);
+       }
+-      ext4_es_list_add(inode);
+       return error;
+ }
+diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
+index 382a7bf9..edd49793 100644
+--- a/fs/ext4/extents_status.c
++++ b/fs/ext4/extents_status.c
+@@ -298,7 +298,7 @@ out:
+       trace_ext4_es_find_delayed_extent_range_exit(inode, es);
+ }
+-void ext4_es_list_add(struct inode *inode)
++static void ext4_es_list_add(struct inode *inode)
+ {
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+@@ -314,7 +314,7 @@ void ext4_es_list_add(struct inode *inode)
+       spin_unlock(&sbi->s_es_lock);
+ }
+-void ext4_es_list_del(struct inode *inode)
++static void ext4_es_list_del(struct inode *inode)
+ {
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+@@ -344,7 +344,8 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
+        * We don't count delayed extent because we never try to reclaim them
+        */
+       if (!ext4_es_is_delayed(es)) {
+-              EXT4_I(inode)->i_es_shk_nr++;
++              if (!EXT4_I(inode)->i_es_shk_nr++)
++                      ext4_es_list_add(inode);
+               percpu_counter_inc(&EXT4_SB(inode->i_sb)->
+                                       s_es_stats.es_stats_shk_cnt);
+       }
+@@ -363,7 +364,8 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
+       /* Decrease the shrink counter when this es is not delayed */
+       if (!ext4_es_is_delayed(es)) {
+               BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
+-              EXT4_I(inode)->i_es_shk_nr--;
++              if (!--EXT4_I(inode)->i_es_shk_nr)
++                      ext4_es_list_del(inode);
+               percpu_counter_dec(&EXT4_SB(inode->i_sb)->
+                                       s_es_stats.es_stats_shk_cnt);
+       }
+diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
+index 0e6a33e8..b0b78b95 100644
+--- a/fs/ext4/extents_status.h
++++ b/fs/ext4/extents_status.h
+@@ -150,7 +150,5 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,
+ extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+ extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
+-extern void ext4_es_list_add(struct inode *inode);
+-extern void ext4_es_list_del(struct inode *inode);
+ #endif /* _EXT4_EXTENTS_STATUS_H */
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index f6a2764c..9bbdc9e5 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -523,7 +523,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
+       /* Lookup extent status tree firstly */
+       if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+-              ext4_es_list_add(inode);
+               if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
+                       map->m_pblk = ext4_es_pblock(&es) +
+                                       map->m_lblk - es.es_lblk;
+@@ -1519,7 +1518,6 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
+       /* Lookup extent status tree firstly */
+       if (ext4_es_lookup_extent(inode, iblock, &es)) {
+-              ext4_es_list_add(inode);
+               if (ext4_es_is_hole(&es)) {
+                       retval = 0;
+                       down_read(&EXT4_I(inode)->i_data_sem);
+diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
+index 122d517c..6a6a9588 100644
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -80,8 +80,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
+       memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
+       ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
+       ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
+-      ext4_es_list_del(inode1);
+-      ext4_es_list_del(inode2);
+       isize = i_size_read(inode1);
+       i_size_write(inode1, i_size_read(inode2));
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index ea2a1026..8a81fa73 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -1033,7 +1033,6 @@ void ext4_clear_inode(struct inode *inode)
+       dquot_drop(inode);
+       ext4_discard_preallocations(inode);
+       ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+-      ext4_es_list_del(inode);
+       if (EXT4_I(inode)->jinode) {
+               jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
+                                              EXT4_I(inode)->jinode);
+-- 
+2.24.1
+
diff --git a/ldiskfs/kernel_patches/patches/rhel7.6/ext4-remove-extent-status-procfs-files-if-journal-lo.patch b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-remove-extent-status-procfs-files-if-journal-lo.patch
new file mode 100644 (file)
index 0000000..de226af
--- /dev/null
@@ -0,0 +1,49 @@
+From fabafc86567c2165c5b2165dcbf835edd6f81e72 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 30 Oct 2014 10:53:16 -0400
+Subject: [PATCH 2/7] ext4: remove extent status procfs files if journal load
+ fails
+
+If we can't load the journal, remove the procfs files for the extent
+status information file to avoid leaking resources.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Cc: stable@vger.kernel.org
+---
+ fs/ext4/super.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index bcdb48cf..95a01d56 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -4326,7 +4326,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
+           !(sb->s_flags & MS_RDONLY))
+               if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
+-                      goto failed_mount3;
++                      goto failed_mount3a;
+       ext4_ext_init(sb); /* needed before using extent-mapped journal */
+@@ -4338,7 +4338,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+           EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+               err = ext4_load_journal(sb, es, journal_devnum);
+               if (err)
+-                      goto failed_mount3;
++                      goto failed_mount3a;
+       } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
+             EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
+               ext4_msg(sb, KERN_ERR, "required journal recovery "
+@@ -4635,6 +4635,7 @@ failed_mount_wq:
+               jbd2_journal_destroy(sbi->s_journal);
+               sbi->s_journal = NULL;
+       }
++failed_mount3a:
+       ext4_es_unregister_shrinker(sbi);
+ failed_mount3:
+       del_timer_sync(&sbi->s_err_report);
+-- 
+2.24.1
+
diff --git a/ldiskfs/kernel_patches/patches/rhel7.6/ext4-track-extent-status-tree-shrinker-delay-statict.patch b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-track-extent-status-tree-shrinker-delay-statict.patch
new file mode 100644 (file)
index 0000000..9cb4d18
--- /dev/null
@@ -0,0 +1,464 @@
+From f33e0fa5ab6cad962d3b88376f4611b9aba1d030 Mon Sep 17 00:00:00 2001
+From: Wang Shilong <wshilong@ddn.com>
+Date: Thu, 27 Feb 2020 17:08:04 +0800
+Subject: [PATCH 1/7] ext4: track extent status tree shrinker delay statictics
+
+This commit adds some statictics in extent status tree shrinker.  The
+purpose to add these is that we want to collect more details when we
+encounter a stall caused by extent status tree shrinker.  Here we count
+the following statictics:
+  stats:
+    the number of all objects on all extent status trees
+    the number of reclaimable objects on lru list
+    cache hits/misses
+    the last sorted interval
+    the number of inodes on lru list
+  average:
+    scan time for shrinking some objects
+    the number of shrunk objects
+  maximum:
+    the inode that has max nr. of objects on lru list
+    the maximum scan time for shrinking some objects
+
+The output looks like below:
+  $ cat /proc/fs/ext4/sda1/es_shrinker_info
+  stats:
+    28228 objects
+    6341 reclaimable objects
+    5281/631 cache hits/misses
+    586 ms last sorted interval
+    250 inodes on lru list
+  average:
+    153 us scan time
+    128 shrunk objects
+  maximum:
+    255 inode (255 objects, 198 reclaimable)
+    125723 us max scan time
+
+If the lru list has never been sorted, the following line will not be
+printed:
+    586ms last sorted interval
+If there is an empty lru list, the following lines also will not be
+printed:
+    250 inodes on lru list
+  ...
+  maximum:
+    255 inode (255 objects, 198 reclaimable)
+    0 us max scan time
+
+Meanwhile in this commit a new trace point is defined to print some
+details in __ext4_es_shrink().
+
+[Shilong remove trace point parts of this patch]
+
+Cc: Andreas Dilger <adilger.kernel@dilger.ca>
+Cc: Jan Kara <jack@suse.cz>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/ext4.h           |   4 +-
+ fs/ext4/extents_status.c | 179 +++++++++++++++++++++++++++++++++++++--
+ fs/ext4/extents_status.h |  13 ++-
+ fs/ext4/super.c          |  13 +--
+ 4 files changed, 187 insertions(+), 22 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 763276e2..cc5ba587 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1018,6 +1018,7 @@ struct ext4_inode_info {
+       struct ext4_es_tree i_es_tree;
+       rwlock_t i_es_lock;
+       struct list_head i_es_lru;
++      unsigned int i_es_all_nr;       /* protected by i_es_lock */
+       unsigned int i_es_lru_nr;       /* protected by i_es_lock */
+       unsigned long i_touch_when;     /* jiffies of last accessing */
+@@ -1482,8 +1483,7 @@ struct ext4_sb_info {
+       /* Reclaim extents from extent status tree */
+       struct shrinker s_es_shrinker;
+       struct list_head s_es_lru;
+-      unsigned long s_es_last_sorted;
+-      struct percpu_counter s_extent_cache_cnt;
++      struct ext4_es_stats s_es_stats;
+       spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
+       /* Ratelimit ext4 messages. */
+diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
+index 3ef7f932..7dfed27b 100644
+--- a/fs/ext4/extents_status.c
++++ b/fs/ext4/extents_status.c
+@@ -11,6 +11,8 @@
+  */
+ #include <linux/rbtree.h>
+ #include <linux/list_sort.h>
++#include <linux/proc_fs.h>
++#include <linux/seq_file.h>
+ #include "ext4.h"
+ #include "extents_status.h"
+@@ -313,19 +315,27 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
+        */
+       if (!ext4_es_is_delayed(es)) {
+               EXT4_I(inode)->i_es_lru_nr++;
+-              percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
++              percpu_counter_inc(&EXT4_SB(inode->i_sb)->
++                                      s_es_stats.es_stats_lru_cnt);
+       }
++      EXT4_I(inode)->i_es_all_nr++;
++      percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
++
+       return es;
+ }
+ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
+ {
++      EXT4_I(inode)->i_es_all_nr--;
++      percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
++
+       /* Decrease the lru counter when this es is not delayed */
+       if (!ext4_es_is_delayed(es)) {
+               BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
+               EXT4_I(inode)->i_es_lru_nr--;
+-              percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
++              percpu_counter_dec(&EXT4_SB(inode->i_sb)->
++                                      s_es_stats.es_stats_lru_cnt);
+       }
+       kmem_cache_free(ext4_es_cachep, es);
+@@ -739,6 +749,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+                         struct extent_status *es)
+ {
+       struct ext4_es_tree *tree;
++      struct ext4_es_stats *stats;
+       struct extent_status *es1 = NULL;
+       struct rb_node *node;
+       int found = 0;
+@@ -775,11 +786,15 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+       }
+ out:
++      stats = &EXT4_SB(inode->i_sb)->s_es_stats;
+       if (found) {
+               BUG_ON(!es1);
+               es->es_lblk = es1->es_lblk;
+               es->es_len = es1->es_len;
+               es->es_pblk = es1->es_pblk;
++              stats->es_stats_cache_hits++;
++      } else {
++              stats->es_stats_cache_misses++;
+       }
+       read_unlock(&EXT4_I(inode)->i_es_lock);
+@@ -941,11 +956,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+                           struct ext4_inode_info *locked_ei)
+ {
+       struct ext4_inode_info *ei;
++      struct ext4_es_stats *es_stats;
+       struct list_head *cur, *tmp;
+       LIST_HEAD(skipped);
++      ktime_t start_time;
++      u64 scan_time;
+       int ret, nr_shrunk = 0;
+       int retried = 0, skip_precached = 1, nr_skipped = 0;
++      es_stats = &sbi->s_es_stats;
++      start_time = ktime_get();
+       spin_lock(&sbi->s_es_lru_lock);
+ retry:
+@@ -954,7 +974,8 @@ retry:
+                * If we have already reclaimed all extents from extent
+                * status tree, just stop the loop immediately.
+                */
+-              if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
++              if (percpu_counter_read_positive(
++                              &es_stats->es_stats_lru_cnt) == 0)
+                       break;
+               ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
+@@ -964,7 +985,7 @@ retry:
+                * time.  Normally we try hard to avoid shrinking
+                * precached inodes, but we will as a last resort.
+                */
+-              if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
++              if ((es_stats->es_stats_last_sorted < ei->i_touch_when) ||
+                   (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
+                                               EXT4_STATE_EXT_PRECACHED))) {
+                       nr_skipped++;
+@@ -998,7 +1019,7 @@ retry:
+       if ((nr_shrunk == 0) && nr_skipped && !retried) {
+               retried++;
+               list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+-              sbi->s_es_last_sorted = jiffies;
++              es_stats->es_stats_last_sorted = jiffies;
+               ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
+                                     i_es_lru);
+               /*
+@@ -1016,6 +1037,20 @@ retry:
+       if (locked_ei && nr_shrunk == 0)
+               nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
++      scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
++      if (likely(es_stats->es_stats_scan_time))
++              es_stats->es_stats_scan_time = (scan_time +
++                              es_stats->es_stats_scan_time*3) / 4;
++      else
++              es_stats->es_stats_scan_time = scan_time;
++      if (scan_time > es_stats->es_stats_max_scan_time)
++              es_stats->es_stats_max_scan_time = scan_time;
++      if (likely(es_stats->es_stats_shrunk))
++              es_stats->es_stats_shrunk = (nr_shrunk +
++                              es_stats->es_stats_shrunk*3) / 4;
++      else
++              es_stats->es_stats_shrunk = nr_shrunk;
++
+       return nr_shrunk;
+ }
+@@ -1026,7 +1061,7 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+       int nr_to_scan = sc->nr_to_scan;
+       int ret, nr_shrunk;
+-      ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
++      ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
+       trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
+       if (!nr_to_scan)
+@@ -1034,23 +1069,149 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+       nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
+-      ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
++      ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
+       trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
+       return ret;
+ }
+-void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
++static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
++{
++      return *pos ? NULL : SEQ_START_TOKEN;
++}
++
++static void *
++ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
++{
++      return NULL;
++}
++
++static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
++{
++      struct ext4_sb_info *sbi = seq->private;
++      struct ext4_es_stats *es_stats = &sbi->s_es_stats;
++      struct ext4_inode_info *ei, *max = NULL;
++      unsigned int inode_cnt = 0;
++
++      if (v != SEQ_START_TOKEN)
++              return 0;
++
++      /* here we just find an inode that has the max nr. of objects */
++      spin_lock(&sbi->s_es_lru_lock);
++      list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) {
++              inode_cnt++;
++              if (max && max->i_es_all_nr < ei->i_es_all_nr)
++                      max = ei;
++              else if (!max)
++                      max = ei;
++      }
++      spin_unlock(&sbi->s_es_lru_lock);
++
++      seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
++                 percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
++                 percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt));
++      seq_printf(seq, "  %lu/%lu cache hits/misses\n",
++                 es_stats->es_stats_cache_hits,
++                 es_stats->es_stats_cache_misses);
++      if (es_stats->es_stats_last_sorted != 0)
++              seq_printf(seq, "  %u ms last sorted interval\n",
++                         jiffies_to_msecs(jiffies -
++                                          es_stats->es_stats_last_sorted));
++      if (inode_cnt)
++              seq_printf(seq, "  %d inodes on lru list\n", inode_cnt);
++
++      seq_printf(seq, "average:\n  %llu us scan time\n",
++          div_u64(es_stats->es_stats_scan_time, 1000));
++      seq_printf(seq, "  %lu shrunk objects\n", es_stats->es_stats_shrunk);
++      if (inode_cnt)
++              seq_printf(seq,
++                  "maximum:\n  %lu inode (%u objects, %u reclaimable)\n"
++                  "  %llu us max scan time\n",
++                  max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr,
++                  div_u64(es_stats->es_stats_max_scan_time, 1000));
++
++      return 0;
++}
++
++static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
++{
++}
++
++static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
++      .start = ext4_es_seq_shrinker_info_start,
++      .next  = ext4_es_seq_shrinker_info_next,
++      .stop  = ext4_es_seq_shrinker_info_stop,
++      .show  = ext4_es_seq_shrinker_info_show,
++};
++
++static int
++ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
++{
++      int ret;
++
++      ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
++      if (!ret) {
++              struct seq_file *m = file->private_data;
++              m->private = PDE_DATA(inode);
++      }
++
++      return ret;
++}
++
++static int
++ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
+ {
++      return seq_release(inode, file);
++}
++
++static const struct file_operations ext4_es_seq_shrinker_info_fops = {
++      .owner          = THIS_MODULE,
++      .open           = ext4_es_seq_shrinker_info_open,
++      .read           = seq_read,
++      .llseek         = seq_lseek,
++      .release        = ext4_es_seq_shrinker_info_release,
++};
++
++int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
++{
++      int err;
++
+       INIT_LIST_HEAD(&sbi->s_es_lru);
+       spin_lock_init(&sbi->s_es_lru_lock);
+-      sbi->s_es_last_sorted = 0;
++      sbi->s_es_stats.es_stats_last_sorted = 0;
++      sbi->s_es_stats.es_stats_shrunk = 0;
++      sbi->s_es_stats.es_stats_cache_hits = 0;
++      sbi->s_es_stats.es_stats_cache_misses = 0;
++      sbi->s_es_stats.es_stats_scan_time = 0;
++      sbi->s_es_stats.es_stats_max_scan_time = 0;
++      err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt,
++                                0, GFP_KERNEL);
++      if (err)
++              return err;
++      err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt,
++                                0, GFP_KERNEL);
++      if (err)
++              goto err;
+       sbi->s_es_shrinker.shrink = ext4_es_shrink;
+       sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
+       register_shrinker(&sbi->s_es_shrinker);
++
++      if (sbi->s_proc)
++              proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
++                               &ext4_es_seq_shrinker_info_fops, sbi);
++
++      return 0;
++
++err:
++      percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
++      return err;
+ }
+ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
+ {
++      if (sbi->s_proc)
++              remove_proc_entry("es_shrinker_info", sbi->s_proc);
++      percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
++      percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
+       unregister_shrinker(&sbi->s_es_shrinker);
+ }
+diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
+index f1b62a41..efd5f970 100644
+--- a/fs/ext4/extents_status.h
++++ b/fs/ext4/extents_status.h
+@@ -64,6 +64,17 @@ struct ext4_es_tree {
+       struct extent_status *cache_es; /* recently accessed extent */
+ };
++struct ext4_es_stats {
++      unsigned long es_stats_last_sorted;
++      unsigned long es_stats_shrunk;
++      unsigned long es_stats_cache_hits;
++      unsigned long es_stats_cache_misses;
++      u64 es_stats_scan_time;
++      u64 es_stats_max_scan_time;
++      struct percpu_counter es_stats_all_cnt;
++      struct percpu_counter es_stats_lru_cnt;
++};
++
+ extern int __init ext4_init_es(void);
+ extern void ext4_exit_es(void);
+ extern void ext4_es_init_tree(struct ext4_es_tree *tree);
+@@ -138,7 +149,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,
+                      (pb & ~ES_MASK));
+ }
+-extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
++extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+ extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
+ extern void ext4_es_lru_add(struct inode *inode);
+ extern void ext4_es_lru_del(struct inode *inode);
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index 18fe358c..bcdb48cf 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -880,7 +880,6 @@ static void ext4_put_super(struct super_block *sb)
+       percpu_counter_destroy(&sbi->s_freeinodes_counter);
+       percpu_counter_destroy(&sbi->s_dirs_counter);
+       percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+-      percpu_counter_destroy(&sbi->s_extent_cache_cnt);
+ #ifdef CONFIG_QUOTA
+       for (i = 0; i < EXT4_MAXQUOTAS; i++)
+               kfree(sbi->s_qf_names[i]);
+@@ -944,6 +943,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
+       ext4_es_init_tree(&ei->i_es_tree);
+       rwlock_init(&ei->i_es_lock);
+       INIT_LIST_HEAD(&ei->i_es_lru);
++      ei->i_es_all_nr = 0;
+       ei->i_es_lru_nr = 0;
+       ei->i_touch_when = 0;
+       ei->i_reserved_data_blocks = 0;
+@@ -4289,14 +4289,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+       sbi->s_err_report.function = print_daily_error_info;
+       sbi->s_err_report.data = (unsigned long) sb;
+-      /* Register extent status tree shrinker */
+-      ext4_es_register_shrinker(sbi);
+-
+-      err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL);
+-      if (err) {
+-              ext4_msg(sb, KERN_ERR, "insufficient memory");
++      if (ext4_es_register_shrinker(sbi))
+               goto failed_mount3;
+-      }
+       sbi->s_stripe = ext4_get_stripe_size(sbi);
+       sbi->s_extent_max_zeroout_kb = 32;
+@@ -4641,10 +4635,9 @@ failed_mount_wq:
+               jbd2_journal_destroy(sbi->s_journal);
+               sbi->s_journal = NULL;
+       }
+-failed_mount3:
+       ext4_es_unregister_shrinker(sbi);
++failed_mount3:
+       del_timer_sync(&sbi->s_err_report);
+-      percpu_counter_destroy(&sbi->s_extent_cache_cnt);
+       if (sbi->s_mmp_tsk)
+               kthread_stop(sbi->s_mmp_tsk);
+ failed_mount2:
+-- 
+2.24.1
+
index 45cd224..97651e9 100644 (file)
@@ -40,3 +40,10 @@ rhel7/ext4-optimize-ext4_find_delalloc_range-in-nodelalloc.patch
 rhel7.2/ext4-simple-blockalloc.patch
 rhel7/ext4-mballoc-skip-uninit-groups-cr0.patch
 rhel7/ext4-mballoc-prefetch.patch
+rhel7.6/ext4-track-extent-status-tree-shrinker-delay-statict.patch
+rhel7.6/ext4-remove-extent-status-procfs-files-if-journal-lo.patch
+rhel7.6/ext4-change-LRU-to-round-robin-in-extent-status-tree.patch
+rhel7.6/ext4-move-handling-of-list-of-shrinkable-inodes-into.patch
+rhel7.6/ext4-limit-number-of-scanned-extents-in-status-tree-.patch
+rhel7.6/ext4-cleanup-flag-definitions-for-extent-status-tree.patch
+rhel7.6/ext4-introduce-aging-to-extent-status-tree.patch
index 0dc2498..70220a2 100644 (file)
@@ -40,3 +40,10 @@ rhel7.7/ext4-fix-project-with-unpatched-kernel.patch
 rhel7.2/ext4-simple-blockalloc.patch
 rhel7/ext4-mballoc-skip-uninit-groups-cr0.patch
 rhel7.7/ext4-mballoc-prefetch.patch
+rhel7.6/ext4-track-extent-status-tree-shrinker-delay-statict.patch
+rhel7.6/ext4-remove-extent-status-procfs-files-if-journal-lo.patch
+rhel7.6/ext4-change-LRU-to-round-robin-in-extent-status-tree.patch
+rhel7.6/ext4-move-handling-of-list-of-shrinkable-inodes-into.patch
+rhel7.6/ext4-limit-number-of-scanned-extents-in-status-tree-.patch
+rhel7.6/ext4-cleanup-flag-definitions-for-extent-status-tree.patch
+rhel7.6/ext4-introduce-aging-to-extent-status-tree.patch