--- /dev/null
+From 4fac310a77c918d6a235a55cb76cf2f9bb22de71 Mon Sep 17 00:00:00 2001
+From: Zheng Liu <wenqing.lz@taobao.com>
+Date: Tue, 25 Nov 2014 11:45:37 -0500
+Subject: [PATCH 3/7] ext4: change LRU to round-robin in extent status tree
+ shrinker
+
+In this commit we discard the LRU algorithm for inodes with an extent
+status tree because it takes significant effort to maintain an LRU list
+in the extent status tree shrinker, and the shrinker can take a long
+time to scan this LRU list in order to reclaim some objects.
+
+We replace the LRU ordering with a simple round-robin. After that we
+never need to keep an LRU list. That means that the list needn't be
+sorted if the shrinker cannot reclaim any objects in the first round.
+
+Cc: Andreas Dilger <adilger.kernel@dilger.ca>
+Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/ext4.h | 10 +-
+ fs/ext4/extents.c | 4 +-
+ fs/ext4/extents_status.c | 221 +++++++++++++++++----------------------
+ fs/ext4/extents_status.h | 7 +-
+ fs/ext4/inode.c | 4 +-
+ fs/ext4/ioctl.c | 4 +-
+ fs/ext4/super.c | 7 +-
+ 7 files changed, 112 insertions(+), 145 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index cc5ba587..0813afd6 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1017,10 +1017,9 @@ struct ext4_inode_info {
+ /* extents status tree */
+ struct ext4_es_tree i_es_tree;
+ rwlock_t i_es_lock;
+- struct list_head i_es_lru;
++ struct list_head i_es_list;
+ unsigned int i_es_all_nr; /* protected by i_es_lock */
+- unsigned int i_es_lru_nr; /* protected by i_es_lock */
+- unsigned long i_touch_when; /* jiffies of last accessing */
++ unsigned int i_es_shk_nr; /* protected by i_es_lock */
+
+ /* ialloc */
+ ext4_group_t i_last_alloc_group;
+@@ -1482,9 +1481,10 @@ struct ext4_sb_info {
+
+ /* Reclaim extents from extent status tree */
+ struct shrinker s_es_shrinker;
+- struct list_head s_es_lru;
++ struct list_head s_es_list;
++ long s_es_nr_inode;
+ struct ext4_es_stats s_es_stats;
+- spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
++ spinlock_t s_es_lock ____cacheline_aligned_in_smp;
+
+ /* Ratelimit ext4 messages. */
+ struct ratelimit_state s_err_ratelimit_state;
+diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
+index f618d0ba..c012dc51 100644
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -4689,7 +4689,7 @@ out2:
+
+ trace_ext4_ext_map_blocks_exit(inode, flags, map,
+ err ? err : allocated);
+- ext4_es_lru_add(inode);
++ ext4_es_list_add(inode);
+ return err ? err : allocated;
+ }
+
+@@ -5263,7 +5263,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ error = ext4_fill_fiemap_extents(inode, start_blk,
+ len_blks, fieinfo);
+ }
+- ext4_es_lru_add(inode);
++ ext4_es_list_add(inode);
+ return error;
+ }
+
+diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
+index 7dfed27b..382a7bf9 100644
+--- a/fs/ext4/extents_status.c
++++ b/fs/ext4/extents_status.c
+@@ -149,8 +149,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t end);
+ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+ int nr_to_scan);
+-static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+- struct ext4_inode_info *locked_ei);
++static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
++ struct ext4_inode_info *locked_ei);
+
+ int __init ext4_init_es(void)
+ {
+@@ -298,6 +298,36 @@ out:
+ trace_ext4_es_find_delayed_extent_range_exit(inode, es);
+ }
+
++void ext4_es_list_add(struct inode *inode)
++{
++ struct ext4_inode_info *ei = EXT4_I(inode);
++ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
++
++ if (!list_empty(&ei->i_es_list))
++ return;
++
++ spin_lock(&sbi->s_es_lock);
++ if (list_empty(&ei->i_es_list)) {
++ list_add_tail(&ei->i_es_list, &sbi->s_es_list);
++ sbi->s_es_nr_inode++;
++ }
++ spin_unlock(&sbi->s_es_lock);
++}
++
++void ext4_es_list_del(struct inode *inode)
++{
++ struct ext4_inode_info *ei = EXT4_I(inode);
++ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
++
++ spin_lock(&sbi->s_es_lock);
++ if (!list_empty(&ei->i_es_list)) {
++ list_del_init(&ei->i_es_list);
++ sbi->s_es_nr_inode--;
++ WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
++ }
++ spin_unlock(&sbi->s_es_lock);
++}
++
+ static struct extent_status *
+ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
+ ext4_fsblk_t pblk)
+@@ -314,9 +344,9 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
+ * We don't count delayed extent because we never try to reclaim them
+ */
+ if (!ext4_es_is_delayed(es)) {
+- EXT4_I(inode)->i_es_lru_nr++;
++ EXT4_I(inode)->i_es_shk_nr++;
+ percpu_counter_inc(&EXT4_SB(inode->i_sb)->
+- s_es_stats.es_stats_lru_cnt);
++ s_es_stats.es_stats_shk_cnt);
+ }
+
+ EXT4_I(inode)->i_es_all_nr++;
+@@ -330,12 +360,12 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
+ EXT4_I(inode)->i_es_all_nr--;
+ percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
+
+- /* Decrease the lru counter when this es is not delayed */
++ /* Decrease the shrink counter when this es is not delayed */
+ if (!ext4_es_is_delayed(es)) {
+- BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
+- EXT4_I(inode)->i_es_lru_nr--;
++ BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
++ EXT4_I(inode)->i_es_shk_nr--;
+ percpu_counter_dec(&EXT4_SB(inode->i_sb)->
+- s_es_stats.es_stats_lru_cnt);
++ s_es_stats.es_stats_shk_cnt);
+ }
+
+ kmem_cache_free(ext4_es_cachep, es);
+@@ -693,8 +723,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ goto error;
+ retry:
+ err = __es_insert_extent(inode, &newes);
+- if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+- EXT4_I(inode)))
++ if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
++ 1, EXT4_I(inode)))
+ goto retry;
+ if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
+ err = 0;
+@@ -851,8 +881,8 @@ retry:
+ es->es_lblk = orig_es.es_lblk;
+ es->es_len = orig_es.es_len;
+ if ((err == -ENOMEM) &&
+- __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+- EXT4_I(inode)))
++ __es_shrink(EXT4_SB(inode->i_sb),
++ 1, EXT4_I(inode)))
+ goto retry;
+ goto out;
+ }
+@@ -924,6 +954,11 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ end = lblk + len - 1;
+ BUG_ON(end < lblk);
+
++ /*
++ * ext4_clear_inode() depends on us taking i_es_lock unconditionally
++ * so that we are sure __es_shrink() is done with the inode before it
++ * is reclaimed.
++ */
+ write_lock(&EXT4_I(inode)->i_es_lock);
+ err = __es_remove_extent(inode, lblk, end);
+ write_unlock(&EXT4_I(inode)->i_es_lock);
+@@ -931,112 +966,77 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ return err;
+ }
+
+-static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
+- struct list_head *b)
+-{
+- struct ext4_inode_info *eia, *eib;
+- eia = list_entry(a, struct ext4_inode_info, i_es_lru);
+- eib = list_entry(b, struct ext4_inode_info, i_es_lru);
+-
+- if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+- !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+- return 1;
+- if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+- ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+- return -1;
+- if (eia->i_touch_when == eib->i_touch_when)
+- return 0;
+- if (time_after(eia->i_touch_when, eib->i_touch_when))
+- return 1;
+- else
+- return -1;
+-}
+-
+-static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+- struct ext4_inode_info *locked_ei)
++static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
++ struct ext4_inode_info *locked_ei)
+ {
+ struct ext4_inode_info *ei;
+ struct ext4_es_stats *es_stats;
+- struct list_head *cur, *tmp;
+- LIST_HEAD(skipped);
+ ktime_t start_time;
+ u64 scan_time;
++ int nr_to_walk;
+ int ret, nr_shrunk = 0;
+- int retried = 0, skip_precached = 1, nr_skipped = 0;
++ int retried = 0, nr_skipped = 0;
+
+ es_stats = &sbi->s_es_stats;
+ start_time = ktime_get();
+- spin_lock(&sbi->s_es_lru_lock);
+
+ retry:
+- list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
+- /*
+- * If we have already reclaimed all extents from extent
+- * status tree, just stop the loop immediately.
+- */
+- if (percpu_counter_read_positive(
+- &es_stats->es_stats_lru_cnt) == 0)
+- break;
+-
+- ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
++ spin_lock(&sbi->s_es_lock);
++ nr_to_walk = sbi->s_es_nr_inode;
++ while (nr_to_walk-- > 0) {
+
++ if (list_empty(&sbi->s_es_list)) {
++ spin_unlock(&sbi->s_es_lock);
++ goto out;
++ }
++ ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
++ i_es_list);
++ /* Move the inode to the tail */
++ list_move(&ei->i_es_list, sbi->s_es_list.prev);
+ /*
+- * Skip the inode that is newer than the last_sorted
+- * time. Normally we try hard to avoid shrinking
+- * precached inodes, but we will as a last resort.
++ * Normally we try hard to avoid shrinking precached inodes,
++ * but we will as a last resort.
+ */
+- if ((es_stats->es_stats_last_sorted < ei->i_touch_when) ||
+- (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
+- EXT4_STATE_EXT_PRECACHED))) {
++ if (!retried && ext4_test_inode_state(&ei->vfs_inode,
++ EXT4_STATE_EXT_PRECACHED)) {
+ nr_skipped++;
+- list_move_tail(cur, &skipped);
+ continue;
+ }
+
+- if (ei->i_es_lru_nr == 0 || ei == locked_ei ||
+- !write_trylock(&ei->i_es_lock))
+- continue;
++ if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
++ nr_skipped++;
++ continue;
++ }
++ /*
++ * Now we hold i_es_lock which protects us from inode reclaim
++ * freeing inode under us
++ */
++ spin_unlock(&sbi->s_es_lock);
+
+ ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
+- if (ei->i_es_lru_nr == 0)
+- list_del_init(&ei->i_es_lru);
+ write_unlock(&ei->i_es_lock);
+
+ nr_shrunk += ret;
+ nr_to_scan -= ret;
+ if (nr_to_scan == 0)
+- break;
++ goto out;
++ spin_lock(&sbi->s_es_lock);
+ }
+
+- /* Move the newer inodes into the tail of the LRU list. */
+- list_splice_tail(&skipped, &sbi->s_es_lru);
+- INIT_LIST_HEAD(&skipped);
++ spin_unlock(&sbi->s_es_lock);
+
+ /*
+ * If we skipped any inodes, and we weren't able to make any
+- * forward progress, sort the list and try again.
++ * forward progress, try again to scan precached inodes.
+ */
+ if ((nr_shrunk == 0) && nr_skipped && !retried) {
+ retried++;
+- list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+- es_stats->es_stats_last_sorted = jiffies;
+- ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
+- i_es_lru);
+- /*
+- * If there are no non-precached inodes left on the
+- * list, start releasing precached extents.
+- */
+- if (ext4_test_inode_state(&ei->vfs_inode,
+- EXT4_STATE_EXT_PRECACHED))
+- skip_precached = 0;
+ goto retry;
+ }
+
+- spin_unlock(&sbi->s_es_lru_lock);
+-
+ if (locked_ei && nr_shrunk == 0)
+ nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
+-
++out:
+ scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+ if (likely(es_stats->es_stats_scan_time))
+ es_stats->es_stats_scan_time = (scan_time +
+@@ -1061,15 +1061,15 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+ int nr_to_scan = sc->nr_to_scan;
+ int ret, nr_shrunk;
+
+- ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
++ ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
+ trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
+
+ if (!nr_to_scan)
+ return ret;
+
+- nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
++ nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);
+
+- ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
++ ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
+ trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
+ return ret;
+ }
+@@ -1096,28 +1096,24 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
+ return 0;
+
+ /* here we just find an inode that has the max nr. of objects */
+- spin_lock(&sbi->s_es_lru_lock);
+- list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) {
++ spin_lock(&sbi->s_es_lock);
++ list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
+ inode_cnt++;
+ if (max && max->i_es_all_nr < ei->i_es_all_nr)
+ max = ei;
+ else if (!max)
+ max = ei;
+ }
+- spin_unlock(&sbi->s_es_lru_lock);
++ spin_unlock(&sbi->s_es_lock);
+
+ seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n",
+ percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
+- percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt));
++ percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
+ seq_printf(seq, " %lu/%lu cache hits/misses\n",
+ es_stats->es_stats_cache_hits,
+ es_stats->es_stats_cache_misses);
+- if (es_stats->es_stats_last_sorted != 0)
+- seq_printf(seq, " %u ms last sorted interval\n",
+- jiffies_to_msecs(jiffies -
+- es_stats->es_stats_last_sorted));
+ if (inode_cnt)
+- seq_printf(seq, " %d inodes on lru list\n", inode_cnt);
++ seq_printf(seq, " %d inodes on list\n", inode_cnt);
+
+ seq_printf(seq, "average:\n %llu us scan time\n",
+ div_u64(es_stats->es_stats_scan_time, 1000));
+@@ -1126,7 +1122,7 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
+ seq_printf(seq,
+ "maximum:\n %lu inode (%u objects, %u reclaimable)\n"
+ " %llu us max scan time\n",
+- max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr,
++ max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr,
+ div_u64(es_stats->es_stats_max_scan_time, 1000));
+
+ return 0;
+@@ -1175,9 +1171,9 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
+ {
+ int err;
+
+- INIT_LIST_HEAD(&sbi->s_es_lru);
+- spin_lock_init(&sbi->s_es_lru_lock);
+- sbi->s_es_stats.es_stats_last_sorted = 0;
++ INIT_LIST_HEAD(&sbi->s_es_list);
++ sbi->s_es_nr_inode = 0;
++ spin_lock_init(&sbi->s_es_lock);
+ sbi->s_es_stats.es_stats_shrunk = 0;
+ sbi->s_es_stats.es_stats_cache_hits = 0;
+ sbi->s_es_stats.es_stats_cache_misses = 0;
+@@ -1187,7 +1183,7 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
+ 0, GFP_KERNEL);
+ if (err)
+ return err;
+- err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt,
++ err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt,
+ 0, GFP_KERNEL);
+ if (err)
+ goto err;
+@@ -1211,37 +1207,10 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
+ if (sbi->s_proc)
+ remove_proc_entry("es_shrinker_info", sbi->s_proc);
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
+- percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
++ percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
+ unregister_shrinker(&sbi->s_es_shrinker);
+ }
+
+-void ext4_es_lru_add(struct inode *inode)
+-{
+- struct ext4_inode_info *ei = EXT4_I(inode);
+- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+-
+- ei->i_touch_when = jiffies;
+-
+- if (!list_empty(&ei->i_es_lru))
+- return;
+-
+- spin_lock(&sbi->s_es_lru_lock);
+- if (list_empty(&ei->i_es_lru))
+- list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
+- spin_unlock(&sbi->s_es_lru_lock);
+-}
+-
+-void ext4_es_lru_del(struct inode *inode)
+-{
+- struct ext4_inode_info *ei = EXT4_I(inode);
+- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+-
+- spin_lock(&sbi->s_es_lru_lock);
+- if (!list_empty(&ei->i_es_lru))
+- list_del_init(&ei->i_es_lru);
+- spin_unlock(&sbi->s_es_lru_lock);
+-}
+-
+ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+ int nr_to_scan)
+ {
+@@ -1253,7 +1222,7 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+ static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+- if (ei->i_es_lru_nr == 0)
++ if (ei->i_es_shk_nr == 0)
+ return 0;
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
+diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
+index efd5f970..0e6a33e8 100644
+--- a/fs/ext4/extents_status.h
++++ b/fs/ext4/extents_status.h
+@@ -65,14 +65,13 @@ struct ext4_es_tree {
+ };
+
+ struct ext4_es_stats {
+- unsigned long es_stats_last_sorted;
+ unsigned long es_stats_shrunk;
+ unsigned long es_stats_cache_hits;
+ unsigned long es_stats_cache_misses;
+ u64 es_stats_scan_time;
+ u64 es_stats_max_scan_time;
+ struct percpu_counter es_stats_all_cnt;
+- struct percpu_counter es_stats_lru_cnt;
++ struct percpu_counter es_stats_shk_cnt;
+ };
+
+ extern int __init ext4_init_es(void);
+@@ -151,7 +150,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,
+
+ extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+ extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
+-extern void ext4_es_lru_add(struct inode *inode);
+-extern void ext4_es_lru_del(struct inode *inode);
++extern void ext4_es_list_add(struct inode *inode);
++extern void ext4_es_list_del(struct inode *inode);
+
+ #endif /* _EXT4_EXTENTS_STATUS_H */
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 21db5952..f6a2764c 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -523,7 +523,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
+
+ /* Lookup extent status tree firstly */
+ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+- ext4_es_lru_add(inode);
++ ext4_es_list_add(inode);
+ if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
+ map->m_pblk = ext4_es_pblock(&es) +
+ map->m_lblk - es.es_lblk;
+@@ -1519,7 +1519,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
+
+ /* Lookup extent status tree firstly */
+ if (ext4_es_lookup_extent(inode, iblock, &es)) {
+- ext4_es_lru_add(inode);
++ ext4_es_list_add(inode);
+ if (ext4_es_is_hole(&es)) {
+ retval = 0;
+ down_read(&EXT4_I(inode)->i_data_sem);
+diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
+index 858cf709..122d517c 100644
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -80,8 +80,8 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
+ memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
+ ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
+ ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
+- ext4_es_lru_del(inode1);
+- ext4_es_lru_del(inode2);
++ ext4_es_list_del(inode1);
++ ext4_es_list_del(inode2);
+
+ isize = i_size_read(inode1);
+ i_size_write(inode1, i_size_read(inode2));
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index 95a01d56..ea2a1026 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -942,10 +942,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
+ spin_lock_init(&ei->i_prealloc_lock);
+ ext4_es_init_tree(&ei->i_es_tree);
+ rwlock_init(&ei->i_es_lock);
+- INIT_LIST_HEAD(&ei->i_es_lru);
++ INIT_LIST_HEAD(&ei->i_es_list);
+ ei->i_es_all_nr = 0;
+- ei->i_es_lru_nr = 0;
+- ei->i_touch_when = 0;
++ ei->i_es_shk_nr = 0;
+ ei->i_reserved_data_blocks = 0;
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_allocated_meta_blocks = 0;
+@@ -1034,7 +1033,7 @@ void ext4_clear_inode(struct inode *inode)
+ dquot_drop(inode);
+ ext4_discard_preallocations(inode);
+ ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+- ext4_es_lru_del(inode);
++ ext4_es_list_del(inode);
+ if (EXT4_I(inode)->jinode) {
+ jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
+ EXT4_I(inode)->jinode);
+--
+2.24.1
+
--- /dev/null
+From dd5c7af957dd0b9b3b04ef8aacffd601b46bc26c Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Tue, 25 Nov 2014 11:53:47 -0500
+Subject: [PATCH 6/7] ext4: cleanup flag definitions for extent status tree
+
+Currently flags for the extent status tree are defined twice, once
+shifted and once without being shifted. Consolidate these definitions
+into one place and make some computations automatic to make adding flags
+less error prone. The compiler should be clever enough to figure out
+these are constants and generate the same code.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/extents_status.c | 2 ++
+ fs/ext4/extents_status.h | 58 ++++++++++++++++++----------------------
+ 2 files changed, 28 insertions(+), 32 deletions(-)
+
+diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
+index b78eec2a..a29708c0 100644
+--- a/fs/ext4/extents_status.c
++++ b/fs/ext4/extents_status.c
+@@ -1170,6 +1170,8 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
+ {
+ int err;
+
++ /* Make sure we have enough bits for physical block number */
++ BUILD_BUG_ON(ES_SHIFT < 48);
+ INIT_LIST_HEAD(&sbi->s_es_list);
+ sbi->s_es_nr_inode = 0;
+ spin_lock_init(&sbi->s_es_lock);
+diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
+index b0b78b95..e86b1f34 100644
+--- a/fs/ext4/extents_status.h
++++ b/fs/ext4/extents_status.h
+@@ -29,25 +29,21 @@
+ /*
+ * These flags live in the high bits of extent_status.es_pblk
+ */
+-#define ES_SHIFT 60
+-
+-#define EXTENT_STATUS_WRITTEN (1 << 3)
+-#define EXTENT_STATUS_UNWRITTEN (1 << 2)
+-#define EXTENT_STATUS_DELAYED (1 << 1)
+-#define EXTENT_STATUS_HOLE (1 << 0)
+-
+-#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \
+- EXTENT_STATUS_UNWRITTEN | \
+- EXTENT_STATUS_DELAYED | \
+- EXTENT_STATUS_HOLE)
++enum {
++ ES_WRITTEN_B,
++ ES_UNWRITTEN_B,
++ ES_DELAYED_B,
++ ES_HOLE_B,
++ ES_FLAGS
++};
+
+-#define ES_WRITTEN (1ULL << 63)
+-#define ES_UNWRITTEN (1ULL << 62)
+-#define ES_DELAYED (1ULL << 61)
+-#define ES_HOLE (1ULL << 60)
++#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS)
++#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT)
+
+-#define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \
+- ES_DELAYED | ES_HOLE)
++#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B)
++#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B)
++#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B)
++#define EXTENT_STATUS_HOLE (1 << ES_HOLE_B)
+
+ struct ext4_sb_info;
+ struct ext4_extent;
+@@ -92,29 +88,29 @@ extern void ext4_es_find_delayed_extent_range(struct inode *inode,
+ extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+ struct extent_status *es);
+
++static inline unsigned int ext4_es_status(struct extent_status *es)
++{
++ return es->es_pblk >> ES_SHIFT;
++}
++
+ static inline int ext4_es_is_written(struct extent_status *es)
+ {
+- return (es->es_pblk & ES_WRITTEN) != 0;
++ return (ext4_es_status(es) & EXTENT_STATUS_WRITTEN) != 0;
+ }
+
+ static inline int ext4_es_is_unwritten(struct extent_status *es)
+ {
+- return (es->es_pblk & ES_UNWRITTEN) != 0;
++ return (ext4_es_status(es) & EXTENT_STATUS_UNWRITTEN) != 0;
+ }
+
+ static inline int ext4_es_is_delayed(struct extent_status *es)
+ {
+- return (es->es_pblk & ES_DELAYED) != 0;
++ return (ext4_es_status(es) & EXTENT_STATUS_DELAYED) != 0;
+ }
+
+ static inline int ext4_es_is_hole(struct extent_status *es)
+ {
+- return (es->es_pblk & ES_HOLE) != 0;
+-}
+-
+-static inline unsigned int ext4_es_status(struct extent_status *es)
+-{
+- return es->es_pblk >> ES_SHIFT;
++ return (ext4_es_status(es) & EXTENT_STATUS_HOLE) != 0;
+ }
+
+ static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
+@@ -134,18 +130,16 @@ static inline void ext4_es_store_pblock(struct extent_status *es,
+ static inline void ext4_es_store_status(struct extent_status *es,
+ unsigned int status)
+ {
+- es->es_pblk = (((ext4_fsblk_t)
+- (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
+- (es->es_pblk & ~ES_MASK));
++ es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
++ (es->es_pblk & ~ES_MASK);
+ }
+
+ static inline void ext4_es_store_pblock_status(struct extent_status *es,
+ ext4_fsblk_t pb,
+ unsigned int status)
+ {
+- es->es_pblk = (((ext4_fsblk_t)
+- (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
+- (pb & ~ES_MASK));
++ es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
++ (pb & ~ES_MASK);
+ }
+
+ extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+--
+2.24.1
+
--- /dev/null
+From 1da6da1563df986dd35080d7edcf59b739696c40 Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Tue, 25 Nov 2014 11:55:24 -0500
+Subject: [PATCH 7/7] ext4: introduce aging to extent status tree
+
+Introduce a simple aging to extent status tree. Each extent has a
+REFERENCED bit which gets set when the extent is used. Shrinker then
+skips entries with referenced bit set and clears the bit. Thus
+frequently used extents have higher chances of staying in memory.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/extents_status.c | 22 +++++++++++++++++-----
+ fs/ext4/extents_status.h | 35 +++++++++++++++++++++++++++++++----
+ 2 files changed, 48 insertions(+), 9 deletions(-)
+
+diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
+index a29708c0..0305f308 100644
+--- a/fs/ext4/extents_status.c
++++ b/fs/ext4/extents_status.c
+@@ -382,7 +382,7 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
+ static int ext4_es_can_be_merged(struct extent_status *es1,
+ struct extent_status *es2)
+ {
+- if (ext4_es_status(es1) != ext4_es_status(es2))
++ if (ext4_es_type(es1) != ext4_es_type(es2))
+ return 0;
+
+ if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
+@@ -425,6 +425,8 @@ ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
+ es1 = rb_entry(node, struct extent_status, rb_node);
+ if (ext4_es_can_be_merged(es1, es)) {
+ es1->es_len += es->es_len;
++ if (ext4_es_is_referenced(es))
++ ext4_es_set_referenced(es1);
+ rb_erase(&es->rb_node, &tree->root);
+ ext4_es_free_extent(inode, es);
+ es = es1;
+@@ -447,6 +449,8 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
+ es1 = rb_entry(node, struct extent_status, rb_node);
+ if (ext4_es_can_be_merged(es, es1)) {
+ es->es_len += es1->es_len;
++ if (ext4_es_is_referenced(es1))
++ ext4_es_set_referenced(es);
+ rb_erase(node, &tree->root);
+ ext4_es_free_extent(inode, es1);
+ }
+@@ -823,6 +827,8 @@ out:
+ es->es_lblk = es1->es_lblk;
+ es->es_len = es1->es_len;
+ es->es_pblk = es1->es_pblk;
++ if (!ext4_es_is_referenced(es))
++ ext4_es_set_referenced(es);
+ stats->es_stats_cache_hits++;
+ } else {
+ stats->es_stats_cache_misses++;
+@@ -1243,11 +1249,17 @@ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
+ * We can't reclaim delayed extent from status tree because
+ * fiemap, bigallic, and seek_data/hole need to use it.
+ */
+- if (!ext4_es_is_delayed(es)) {
+- rb_erase(&es->rb_node, &tree->root);
+- ext4_es_free_extent(inode, es);
+- (*nr_shrunk)++;
++ if (ext4_es_is_delayed(es))
++ goto next;
++ if (ext4_es_is_referenced(es)) {
++ ext4_es_clear_referenced(es);
++ goto next;
+ }
++
++ rb_erase(&es->rb_node, &tree->root);
++ ext4_es_free_extent(inode, es);
++ (*nr_shrunk)++;
++next:
+ if (!node)
+ goto out_wrap;
+ es = rb_entry(node, struct extent_status, rb_node);
+diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
+index e86b1f34..691b5261 100644
+--- a/fs/ext4/extents_status.h
++++ b/fs/ext4/extents_status.h
+@@ -34,6 +34,7 @@ enum {
+ ES_UNWRITTEN_B,
+ ES_DELAYED_B,
+ ES_HOLE_B,
++ ES_REFERENCED_B,
+ ES_FLAGS
+ };
+
+@@ -44,6 +45,12 @@ enum {
+ #define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B)
+ #define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B)
+ #define EXTENT_STATUS_HOLE (1 << ES_HOLE_B)
++#define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B)
++
++#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \
++ EXTENT_STATUS_UNWRITTEN | \
++ EXTENT_STATUS_DELAYED | \
++ EXTENT_STATUS_HOLE) << ES_SHIFT)
+
+ struct ext4_sb_info;
+ struct ext4_extent;
+@@ -93,24 +100,44 @@ static inline unsigned int ext4_es_status(struct extent_status *es)
+ return es->es_pblk >> ES_SHIFT;
+ }
+
++static inline unsigned int ext4_es_type(struct extent_status *es)
++{
++ return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT;
++}
++
+ static inline int ext4_es_is_written(struct extent_status *es)
+ {
+- return (ext4_es_status(es) & EXTENT_STATUS_WRITTEN) != 0;
++ return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0;
+ }
+
+ static inline int ext4_es_is_unwritten(struct extent_status *es)
+ {
+- return (ext4_es_status(es) & EXTENT_STATUS_UNWRITTEN) != 0;
++ return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0;
+ }
+
+ static inline int ext4_es_is_delayed(struct extent_status *es)
+ {
+- return (ext4_es_status(es) & EXTENT_STATUS_DELAYED) != 0;
++ return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0;
+ }
+
+ static inline int ext4_es_is_hole(struct extent_status *es)
+ {
+- return (ext4_es_status(es) & EXTENT_STATUS_HOLE) != 0;
++ return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
++}
++
++static inline void ext4_es_set_referenced(struct extent_status *es)
++{
++ es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
++}
++
++static inline void ext4_es_clear_referenced(struct extent_status *es)
++{
++ es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT);
++}
++
++static inline int ext4_es_is_referenced(struct extent_status *es)
++{
++ return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0;
+ }
+
+ static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
+--
+2.24.1
+
--- /dev/null
+From b72242d714ac3968bbb25867718e731be217e87b Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Tue, 25 Nov 2014 11:51:23 -0500
+Subject: [PATCH 5/7] ext4: limit number of scanned extents in status tree
+ shrinker
+
+Currently we scan extent status trees of inodes until we reclaim nr_to_scan
+extents. This can however require a lot of scanning when there are lots
+of delayed extents (as those cannot be reclaimed).
+
+Change shrinker to work as shrinkers are supposed to and *scan* only
+nr_to_scan extents regardless of how many extents did we actually
+reclaim. We however need to be careful and avoid scanning each status
+tree from the beginning - that could lead to a situation where we would
+not be able to reclaim anything at all when the first nr_to_scan extents
+in the tree are always unreclaimable. We remember, with each inode, the
+offset where we stopped scanning and continue from there when we next
+come across the inode.
+
+Note that we also need to update places calling __es_shrink() manually
+to pass reasonable nr_to_scan to have a chance of reclaiming anything and
+not just 1.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/ext4.h | 5 ++-
+ fs/ext4/extents_status.c | 91 ++++++++++++++++++++++++++--------------
+ fs/ext4/super.c | 1 +
+ 3 files changed, 65 insertions(+), 32 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 0813afd6..2893a168 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1020,6 +1020,9 @@ struct ext4_inode_info {
+ struct list_head i_es_list;
+ unsigned int i_es_all_nr; /* protected by i_es_lock */
+ unsigned int i_es_shk_nr; /* protected by i_es_lock */
++ ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for
++ extents to shrink. Protected by
++ i_es_lock */
+
+ /* ialloc */
+ ext4_group_t i_last_alloc_group;
+@@ -1481,7 +1484,7 @@ struct ext4_sb_info {
+
+ /* Reclaim extents from extent status tree */
+ struct shrinker s_es_shrinker;
+- struct list_head s_es_list;
++ struct list_head s_es_list; /* List of inodes with reclaimable extents */
+ long s_es_nr_inode;
+ struct ext4_es_stats s_es_stats;
+ spinlock_t s_es_lock ____cacheline_aligned_in_smp;
+diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
+index edd49793..b78eec2a 100644
+--- a/fs/ext4/extents_status.c
++++ b/fs/ext4/extents_status.c
+@@ -147,8 +147,7 @@ static struct kmem_cache *ext4_es_cachep;
+ static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
+ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t end);
+-static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+- int nr_to_scan);
++static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
+ static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+ struct ext4_inode_info *locked_ei);
+
+@@ -726,7 +725,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ retry:
+ err = __es_insert_extent(inode, &newes);
+ if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
+- 1, EXT4_I(inode)))
++ 128, EXT4_I(inode)))
+ goto retry;
+ if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
+ err = 0;
+@@ -884,7 +883,7 @@ retry:
+ es->es_len = orig_es.es_len;
+ if ((err == -ENOMEM) &&
+ __es_shrink(EXT4_SB(inode->i_sb),
+- 1, EXT4_I(inode)))
++ 128, EXT4_I(inode)))
+ goto retry;
+ goto out;
+ }
+@@ -976,7 +975,7 @@ static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+ ktime_t start_time;
+ u64 scan_time;
+ int nr_to_walk;
+- int ret, nr_shrunk = 0;
++ int nr_shrunk = 0;
+ int retried = 0, nr_skipped = 0;
+
+ es_stats = &sbi->s_es_stats;
+@@ -994,7 +993,7 @@ retry:
+ ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
+ i_es_list);
+ /* Move the inode to the tail */
+- list_move(&ei->i_es_list, sbi->s_es_list.prev);
++ list_move_tail(&ei->i_es_list, &sbi->s_es_list);
+ /*
+ * Normally we try hard to avoid shrinking precached inodes,
+ * but we will as a last resort.
+@@ -1015,12 +1014,10 @@ retry:
+ */
+ spin_unlock(&sbi->s_es_lock);
+
+- ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
++ nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
+ write_unlock(&ei->i_es_lock);
+
+- nr_shrunk += ret;
+- nr_to_scan -= ret;
+- if (nr_to_scan == 0)
++ if (nr_to_scan <= 0)
+ goto out;
+ spin_lock(&sbi->s_es_lock);
+ }
+@@ -1037,7 +1034,7 @@ retry:
+ }
+
+ if (locked_ei && nr_shrunk == 0)
+- nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
++ nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);
+ out:
+ scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+ if (likely(es_stats->es_stats_scan_time))
+@@ -1213,27 +1210,32 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
+ unregister_shrinker(&sbi->s_es_shrinker);
+ }
+
+-static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+- int nr_to_scan)
++/*
++ * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at
++ * most *nr_to_scan extents, update *nr_to_scan accordingly.
++ *
++ * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan.
++ * Increment *nr_shrunk by the number of reclaimed extents. Also update
++ * ei->i_es_shrink_lblk to where we should continue scanning.
++ */
++static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
++ int *nr_to_scan, int *nr_shrunk)
+ {
+ struct inode *inode = &ei->vfs_inode;
+ struct ext4_es_tree *tree = &ei->i_es_tree;
+- struct rb_node *node;
+ struct extent_status *es;
+- int nr_shrunk = 0;
+- static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+- DEFAULT_RATELIMIT_BURST);
+-
+- if (ei->i_es_shk_nr == 0)
+- return 0;
+-
+- if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
+- __ratelimit(&_rs))
+- ext4_warning(inode->i_sb, "forced shrink of precached extents");
++ struct rb_node *node;
+
+- node = rb_first(&tree->root);
+- while (node != NULL) {
+- es = rb_entry(node, struct extent_status, rb_node);
++ es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
++ if (!es)
++ goto out_wrap;
++ node = &es->rb_node;
++ while (*nr_to_scan > 0) {
++ if (es->es_lblk > end) {
++ ei->i_es_shrink_lblk = end + 1;
++ return 0;
++ }
++ (*nr_to_scan)--;
+ node = rb_next(&es->rb_node);
+ /*
+ * We can't reclaim delayed extent from status tree because
+@@ -1242,11 +1244,38 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+ if (!ext4_es_is_delayed(es)) {
+ rb_erase(&es->rb_node, &tree->root);
+ ext4_es_free_extent(inode, es);
+- nr_shrunk++;
+- if (--nr_to_scan == 0)
+- break;
++ (*nr_shrunk)++;
+ }
++ if (!node)
++ goto out_wrap;
++ es = rb_entry(node, struct extent_status, rb_node);
+ }
+- tree->cache_es = NULL;
++ ei->i_es_shrink_lblk = es->es_lblk;
++ return 1;
++out_wrap:
++ ei->i_es_shrink_lblk = 0;
++ return 0;
++}
++
++static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
++{
++ struct inode *inode = &ei->vfs_inode;
++ int nr_shrunk = 0;
++ ext4_lblk_t start = ei->i_es_shrink_lblk;
++ static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
++ DEFAULT_RATELIMIT_BURST);
++
++ if (ei->i_es_shk_nr == 0)
++ return 0;
++
++ if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
++ __ratelimit(&_rs))
++ ext4_warning(inode->i_sb, "forced shrink of precached extents");
++
++ if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
++ start != 0)
++ es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);
++
++ ei->i_es_tree.cache_es = NULL;
+ return nr_shrunk;
+ }
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index 8a81fa73..d9cd4ff9 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -945,6 +945,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
+ INIT_LIST_HEAD(&ei->i_es_list);
+ ei->i_es_all_nr = 0;
+ ei->i_es_shk_nr = 0;
++ ei->i_es_shrink_lblk = 0;
+ ei->i_reserved_data_blocks = 0;
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_allocated_meta_blocks = 0;
+--
+2.24.1
+
--- /dev/null
+From 8d5847463404eb2d6b24f748d521d1930a432da9 Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Tue, 25 Nov 2014 11:49:25 -0500
+Subject: [PATCH 4/7] ext4: move handling of list of shrinkable inodes into
+ extent status code
+
+Currently callers adding extents to the extent status tree were responsible
+for adding the inode to the list of inodes with freeable extents. This
+is error prone and puts list handling in unnecessarily many places.
+
+Just add inode to the list automatically when the first non-delay extent
+is added to the tree and remove inode from the list when the last
+non-delay extent is removed.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/extents.c | 2 --
+ fs/ext4/extents_status.c | 10 ++++++----
+ fs/ext4/extents_status.h | 2 --
+ fs/ext4/inode.c | 2 --
+ fs/ext4/ioctl.c | 2 --
+ fs/ext4/super.c | 1 -
+ 6 files changed, 6 insertions(+), 13 deletions(-)
+
+diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
+index c012dc51..d9d51a5b 100644
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -4689,7 +4689,6 @@ out2:
+
+ trace_ext4_ext_map_blocks_exit(inode, flags, map,
+ err ? err : allocated);
+- ext4_es_list_add(inode);
+ return err ? err : allocated;
+ }
+
+@@ -5263,7 +5262,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ error = ext4_fill_fiemap_extents(inode, start_blk,
+ len_blks, fieinfo);
+ }
+- ext4_es_list_add(inode);
+ return error;
+ }
+
+diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
+index 382a7bf9..edd49793 100644
+--- a/fs/ext4/extents_status.c
++++ b/fs/ext4/extents_status.c
+@@ -298,7 +298,7 @@ out:
+ trace_ext4_es_find_delayed_extent_range_exit(inode, es);
+ }
+
+-void ext4_es_list_add(struct inode *inode)
++static void ext4_es_list_add(struct inode *inode)
+ {
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+@@ -314,7 +314,7 @@ void ext4_es_list_add(struct inode *inode)
+ spin_unlock(&sbi->s_es_lock);
+ }
+
+-void ext4_es_list_del(struct inode *inode)
++static void ext4_es_list_del(struct inode *inode)
+ {
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+@@ -344,7 +344,8 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
+ * We don't count delayed extent because we never try to reclaim them
+ */
+ if (!ext4_es_is_delayed(es)) {
+- EXT4_I(inode)->i_es_shk_nr++;
++ if (!EXT4_I(inode)->i_es_shk_nr++)
++ ext4_es_list_add(inode);
+ percpu_counter_inc(&EXT4_SB(inode->i_sb)->
+ s_es_stats.es_stats_shk_cnt);
+ }
+@@ -363,7 +364,8 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
+ /* Decrease the shrink counter when this es is not delayed */
+ if (!ext4_es_is_delayed(es)) {
+ BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
+- EXT4_I(inode)->i_es_shk_nr--;
++ if (!--EXT4_I(inode)->i_es_shk_nr)
++ ext4_es_list_del(inode);
+ percpu_counter_dec(&EXT4_SB(inode->i_sb)->
+ s_es_stats.es_stats_shk_cnt);
+ }
+diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
+index 0e6a33e8..b0b78b95 100644
+--- a/fs/ext4/extents_status.h
++++ b/fs/ext4/extents_status.h
+@@ -150,7 +150,5 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,
+
+ extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+ extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
+-extern void ext4_es_list_add(struct inode *inode);
+-extern void ext4_es_list_del(struct inode *inode);
+
+ #endif /* _EXT4_EXTENTS_STATUS_H */
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index f6a2764c..9bbdc9e5 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -523,7 +523,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
+
+ /* Lookup extent status tree firstly */
+ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+- ext4_es_list_add(inode);
+ if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
+ map->m_pblk = ext4_es_pblock(&es) +
+ map->m_lblk - es.es_lblk;
+@@ -1519,7 +1518,6 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
+
+ /* Lookup extent status tree firstly */
+ if (ext4_es_lookup_extent(inode, iblock, &es)) {
+- ext4_es_list_add(inode);
+ if (ext4_es_is_hole(&es)) {
+ retval = 0;
+ down_read(&EXT4_I(inode)->i_data_sem);
+diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
+index 122d517c..6a6a9588 100644
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -80,8 +80,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
+ memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
+ ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
+ ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
+- ext4_es_list_del(inode1);
+- ext4_es_list_del(inode2);
+
+ isize = i_size_read(inode1);
+ i_size_write(inode1, i_size_read(inode2));
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index ea2a1026..8a81fa73 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -1033,7 +1033,6 @@ void ext4_clear_inode(struct inode *inode)
+ dquot_drop(inode);
+ ext4_discard_preallocations(inode);
+ ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+- ext4_es_list_del(inode);
+ if (EXT4_I(inode)->jinode) {
+ jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
+ EXT4_I(inode)->jinode);
+--
+2.24.1
+
--- /dev/null
+From fabafc86567c2165c5b2165dcbf835edd6f81e72 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 30 Oct 2014 10:53:16 -0400
+Subject: [PATCH 2/7] ext4: remove extent status procfs files if journal load
+ fails
+
+If we can't load the journal, remove the procfs files for the extent
+status information file to avoid leaking resources.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Cc: stable@vger.kernel.org
+---
+ fs/ext4/super.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index bcdb48cf..95a01d56 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -4326,7 +4326,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
+ !(sb->s_flags & MS_RDONLY))
+ if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
+- goto failed_mount3;
++ goto failed_mount3a;
+
+ ext4_ext_init(sb); /* needed before using extent-mapped journal */
+
+@@ -4338,7 +4338,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+ EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+ err = ext4_load_journal(sb, es, journal_devnum);
+ if (err)
+- goto failed_mount3;
++ goto failed_mount3a;
+ } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
+ EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
+ ext4_msg(sb, KERN_ERR, "required journal recovery "
+@@ -4635,6 +4635,7 @@ failed_mount_wq:
+ jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
+ }
++failed_mount3a:
+ ext4_es_unregister_shrinker(sbi);
+ failed_mount3:
+ del_timer_sync(&sbi->s_err_report);
+--
+2.24.1
+
--- /dev/null
+From f33e0fa5ab6cad962d3b88376f4611b9aba1d030 Mon Sep 17 00:00:00 2001
+From: Wang Shilong <wshilong@ddn.com>
+Date: Thu, 27 Feb 2020 17:08:04 +0800
+Subject: [PATCH 1/7] ext4: track extent status tree shrinker delay statictics
+
+This commit adds some statistics in the extent status tree shrinker. The
+purpose of adding these is that we want to collect more details when we
+encounter a stall caused by the extent status tree shrinker. Here we count
+the following statistics:
+ stats:
+ the number of all objects on all extent status trees
+ the number of reclaimable objects on lru list
+ cache hits/misses
+ the last sorted interval
+ the number of inodes on lru list
+ average:
+ scan time for shrinking some objects
+ the number of shrunk objects
+ maximum:
+ the inode that has max nr. of objects on lru list
+ the maximum scan time for shrinking some objects
+
+The output looks like below:
+ $ cat /proc/fs/ext4/sda1/es_shrinker_info
+ stats:
+ 28228 objects
+ 6341 reclaimable objects
+ 5281/631 cache hits/misses
+ 586 ms last sorted interval
+ 250 inodes on lru list
+ average:
+ 153 us scan time
+ 128 shrunk objects
+ maximum:
+ 255 inode (255 objects, 198 reclaimable)
+ 125723 us max scan time
+
+If the lru list has never been sorted, the following line will not be
+printed:
+ 586ms last sorted interval
+If there is an empty lru list, the following lines also will not be
+printed:
+ 250 inodes on lru list
+ ...
+ maximum:
+ 255 inode (255 objects, 198 reclaimable)
+ 0 us max scan time
+
+Meanwhile in this commit a new trace point is defined to print some
+details in __ext4_es_shrink().
+
+[Shilong removed trace point parts of this patch]
+
+Cc: Andreas Dilger <adilger.kernel@dilger.ca>
+Cc: Jan Kara <jack@suse.cz>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/ext4.h | 4 +-
+ fs/ext4/extents_status.c | 179 +++++++++++++++++++++++++++++++++++++--
+ fs/ext4/extents_status.h | 13 ++-
+ fs/ext4/super.c | 13 +--
+ 4 files changed, 187 insertions(+), 22 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 763276e2..cc5ba587 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1018,6 +1018,7 @@ struct ext4_inode_info {
+ struct ext4_es_tree i_es_tree;
+ rwlock_t i_es_lock;
+ struct list_head i_es_lru;
++ unsigned int i_es_all_nr; /* protected by i_es_lock */
+ unsigned int i_es_lru_nr; /* protected by i_es_lock */
+ unsigned long i_touch_when; /* jiffies of last accessing */
+
+@@ -1482,8 +1483,7 @@ struct ext4_sb_info {
+ /* Reclaim extents from extent status tree */
+ struct shrinker s_es_shrinker;
+ struct list_head s_es_lru;
+- unsigned long s_es_last_sorted;
+- struct percpu_counter s_extent_cache_cnt;
++ struct ext4_es_stats s_es_stats;
+ spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
+
+ /* Ratelimit ext4 messages. */
+diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
+index 3ef7f932..7dfed27b 100644
+--- a/fs/ext4/extents_status.c
++++ b/fs/ext4/extents_status.c
+@@ -11,6 +11,8 @@
+ */
+ #include <linux/rbtree.h>
+ #include <linux/list_sort.h>
++#include <linux/proc_fs.h>
++#include <linux/seq_file.h>
+ #include "ext4.h"
+ #include "extents_status.h"
+
+@@ -313,19 +315,27 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
+ */
+ if (!ext4_es_is_delayed(es)) {
+ EXT4_I(inode)->i_es_lru_nr++;
+- percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
++ percpu_counter_inc(&EXT4_SB(inode->i_sb)->
++ s_es_stats.es_stats_lru_cnt);
+ }
+
++ EXT4_I(inode)->i_es_all_nr++;
++ percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
++
+ return es;
+ }
+
+ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
+ {
++ EXT4_I(inode)->i_es_all_nr--;
++ percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
++
+ /* Decrease the lru counter when this es is not delayed */
+ if (!ext4_es_is_delayed(es)) {
+ BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
+ EXT4_I(inode)->i_es_lru_nr--;
+- percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
++ percpu_counter_dec(&EXT4_SB(inode->i_sb)->
++ s_es_stats.es_stats_lru_cnt);
+ }
+
+ kmem_cache_free(ext4_es_cachep, es);
+@@ -739,6 +749,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+ struct extent_status *es)
+ {
+ struct ext4_es_tree *tree;
++ struct ext4_es_stats *stats;
+ struct extent_status *es1 = NULL;
+ struct rb_node *node;
+ int found = 0;
+@@ -775,11 +786,15 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+ }
+
+ out:
++ stats = &EXT4_SB(inode->i_sb)->s_es_stats;
+ if (found) {
+ BUG_ON(!es1);
+ es->es_lblk = es1->es_lblk;
+ es->es_len = es1->es_len;
+ es->es_pblk = es1->es_pblk;
++ stats->es_stats_cache_hits++;
++ } else {
++ stats->es_stats_cache_misses++;
+ }
+
+ read_unlock(&EXT4_I(inode)->i_es_lock);
+@@ -941,11 +956,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+ struct ext4_inode_info *locked_ei)
+ {
+ struct ext4_inode_info *ei;
++ struct ext4_es_stats *es_stats;
+ struct list_head *cur, *tmp;
+ LIST_HEAD(skipped);
++ ktime_t start_time;
++ u64 scan_time;
+ int ret, nr_shrunk = 0;
+ int retried = 0, skip_precached = 1, nr_skipped = 0;
+
++ es_stats = &sbi->s_es_stats;
++ start_time = ktime_get();
+ spin_lock(&sbi->s_es_lru_lock);
+
+ retry:
+@@ -954,7 +974,8 @@ retry:
+ * If we have already reclaimed all extents from extent
+ * status tree, just stop the loop immediately.
+ */
+- if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
++ if (percpu_counter_read_positive(
++ &es_stats->es_stats_lru_cnt) == 0)
+ break;
+
+ ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
+@@ -964,7 +985,7 @@ retry:
+ * time. Normally we try hard to avoid shrinking
+ * precached inodes, but we will as a last resort.
+ */
+- if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
++ if ((es_stats->es_stats_last_sorted < ei->i_touch_when) ||
+ (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
+ EXT4_STATE_EXT_PRECACHED))) {
+ nr_skipped++;
+@@ -998,7 +1019,7 @@ retry:
+ if ((nr_shrunk == 0) && nr_skipped && !retried) {
+ retried++;
+ list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+- sbi->s_es_last_sorted = jiffies;
++ es_stats->es_stats_last_sorted = jiffies;
+ ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
+ i_es_lru);
+ /*
+@@ -1016,6 +1037,20 @@ retry:
+ if (locked_ei && nr_shrunk == 0)
+ nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
+
++ scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
++ if (likely(es_stats->es_stats_scan_time))
++ es_stats->es_stats_scan_time = (scan_time +
++ es_stats->es_stats_scan_time*3) / 4;
++ else
++ es_stats->es_stats_scan_time = scan_time;
++ if (scan_time > es_stats->es_stats_max_scan_time)
++ es_stats->es_stats_max_scan_time = scan_time;
++ if (likely(es_stats->es_stats_shrunk))
++ es_stats->es_stats_shrunk = (nr_shrunk +
++ es_stats->es_stats_shrunk*3) / 4;
++ else
++ es_stats->es_stats_shrunk = nr_shrunk;
++
+ return nr_shrunk;
+ }
+
+@@ -1026,7 +1061,7 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+ int nr_to_scan = sc->nr_to_scan;
+ int ret, nr_shrunk;
+
+- ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
++ ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
+ trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
+
+ if (!nr_to_scan)
+@@ -1034,23 +1069,149 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+
+ nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
+
+- ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
++ ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
+ trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
+ return ret;
+ }
+
+-void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
++static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
++{
++ return *pos ? NULL : SEQ_START_TOKEN;
++}
++
++static void *
++ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
++{
++ return NULL;
++}
++
++static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
++{
++ struct ext4_sb_info *sbi = seq->private;
++ struct ext4_es_stats *es_stats = &sbi->s_es_stats;
++ struct ext4_inode_info *ei, *max = NULL;
++ unsigned int inode_cnt = 0;
++
++ if (v != SEQ_START_TOKEN)
++ return 0;
++
++ /* here we just find an inode that has the max nr. of objects */
++ spin_lock(&sbi->s_es_lru_lock);
++ list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) {
++ inode_cnt++;
++ if (max && max->i_es_all_nr < ei->i_es_all_nr)
++ max = ei;
++ else if (!max)
++ max = ei;
++ }
++ spin_unlock(&sbi->s_es_lru_lock);
++
++ seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n",
++ percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
++ percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt));
++ seq_printf(seq, " %lu/%lu cache hits/misses\n",
++ es_stats->es_stats_cache_hits,
++ es_stats->es_stats_cache_misses);
++ if (es_stats->es_stats_last_sorted != 0)
++ seq_printf(seq, " %u ms last sorted interval\n",
++ jiffies_to_msecs(jiffies -
++ es_stats->es_stats_last_sorted));
++ if (inode_cnt)
++ seq_printf(seq, " %d inodes on lru list\n", inode_cnt);
++
++ seq_printf(seq, "average:\n %llu us scan time\n",
++ div_u64(es_stats->es_stats_scan_time, 1000));
++ seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk);
++ if (inode_cnt)
++ seq_printf(seq,
++ "maximum:\n %lu inode (%u objects, %u reclaimable)\n"
++ " %llu us max scan time\n",
++ max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr,
++ div_u64(es_stats->es_stats_max_scan_time, 1000));
++
++ return 0;
++}
++
++static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
++{
++}
++
++static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
++ .start = ext4_es_seq_shrinker_info_start,
++ .next = ext4_es_seq_shrinker_info_next,
++ .stop = ext4_es_seq_shrinker_info_stop,
++ .show = ext4_es_seq_shrinker_info_show,
++};
++
++static int
++ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
++{
++ int ret;
++
++ ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
++ if (!ret) {
++ struct seq_file *m = file->private_data;
++ m->private = PDE_DATA(inode);
++ }
++
++ return ret;
++}
++
++static int
++ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
+ {
++ return seq_release(inode, file);
++}
++
++static const struct file_operations ext4_es_seq_shrinker_info_fops = {
++ .owner = THIS_MODULE,
++ .open = ext4_es_seq_shrinker_info_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = ext4_es_seq_shrinker_info_release,
++};
++
++int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
++{
++ int err;
++
+ INIT_LIST_HEAD(&sbi->s_es_lru);
+ spin_lock_init(&sbi->s_es_lru_lock);
+- sbi->s_es_last_sorted = 0;
++ sbi->s_es_stats.es_stats_last_sorted = 0;
++ sbi->s_es_stats.es_stats_shrunk = 0;
++ sbi->s_es_stats.es_stats_cache_hits = 0;
++ sbi->s_es_stats.es_stats_cache_misses = 0;
++ sbi->s_es_stats.es_stats_scan_time = 0;
++ sbi->s_es_stats.es_stats_max_scan_time = 0;
++ err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt,
++ 0, GFP_KERNEL);
++ if (err)
++ return err;
++ err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt,
++ 0, GFP_KERNEL);
++ if (err)
++ goto err;
+ sbi->s_es_shrinker.shrink = ext4_es_shrink;
+ sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
+ register_shrinker(&sbi->s_es_shrinker);
++
++ if (sbi->s_proc)
++ proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
++ &ext4_es_seq_shrinker_info_fops, sbi);
++
++ return 0;
++
++err:
++ percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
++ return err;
+ }
+
+ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
+ {
++ if (sbi->s_proc)
++ remove_proc_entry("es_shrinker_info", sbi->s_proc);
++ percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
++ percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
+ unregister_shrinker(&sbi->s_es_shrinker);
+ }
+
+diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
+index f1b62a41..efd5f970 100644
+--- a/fs/ext4/extents_status.h
++++ b/fs/ext4/extents_status.h
+@@ -64,6 +64,17 @@ struct ext4_es_tree {
+ struct extent_status *cache_es; /* recently accessed extent */
+ };
+
++struct ext4_es_stats {
++ unsigned long es_stats_last_sorted;
++ unsigned long es_stats_shrunk;
++ unsigned long es_stats_cache_hits;
++ unsigned long es_stats_cache_misses;
++ u64 es_stats_scan_time;
++ u64 es_stats_max_scan_time;
++ struct percpu_counter es_stats_all_cnt;
++ struct percpu_counter es_stats_lru_cnt;
++};
++
+ extern int __init ext4_init_es(void);
+ extern void ext4_exit_es(void);
+ extern void ext4_es_init_tree(struct ext4_es_tree *tree);
+@@ -138,7 +149,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,
+ (pb & ~ES_MASK));
+ }
+
+-extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
++extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+ extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
+ extern void ext4_es_lru_add(struct inode *inode);
+ extern void ext4_es_lru_del(struct inode *inode);
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index 18fe358c..bcdb48cf 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -880,7 +880,6 @@ static void ext4_put_super(struct super_block *sb)
+ percpu_counter_destroy(&sbi->s_freeinodes_counter);
+ percpu_counter_destroy(&sbi->s_dirs_counter);
+ percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+- percpu_counter_destroy(&sbi->s_extent_cache_cnt);
+ #ifdef CONFIG_QUOTA
+ for (i = 0; i < EXT4_MAXQUOTAS; i++)
+ kfree(sbi->s_qf_names[i]);
+@@ -944,6 +943,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
+ ext4_es_init_tree(&ei->i_es_tree);
+ rwlock_init(&ei->i_es_lock);
+ INIT_LIST_HEAD(&ei->i_es_lru);
++ ei->i_es_all_nr = 0;
+ ei->i_es_lru_nr = 0;
+ ei->i_touch_when = 0;
+ ei->i_reserved_data_blocks = 0;
+@@ -4289,14 +4289,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+ sbi->s_err_report.function = print_daily_error_info;
+ sbi->s_err_report.data = (unsigned long) sb;
+
+- /* Register extent status tree shrinker */
+- ext4_es_register_shrinker(sbi);
+-
+- err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL);
+- if (err) {
+- ext4_msg(sb, KERN_ERR, "insufficient memory");
++ if (ext4_es_register_shrinker(sbi))
+ goto failed_mount3;
+- }
+
+ sbi->s_stripe = ext4_get_stripe_size(sbi);
+ sbi->s_extent_max_zeroout_kb = 32;
+@@ -4641,10 +4635,9 @@ failed_mount_wq:
+ jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
+ }
+-failed_mount3:
+ ext4_es_unregister_shrinker(sbi);
++failed_mount3:
+ del_timer_sync(&sbi->s_err_report);
+- percpu_counter_destroy(&sbi->s_extent_cache_cnt);
+ if (sbi->s_mmp_tsk)
+ kthread_stop(sbi->s_mmp_tsk);
+ failed_mount2:
+--
+2.24.1
+
rhel7.2/ext4-simple-blockalloc.patch
rhel7/ext4-mballoc-skip-uninit-groups-cr0.patch
rhel7/ext4-mballoc-prefetch.patch
+rhel7.6/ext4-track-extent-status-tree-shrinker-delay-statict.patch
+rhel7.6/ext4-remove-extent-status-procfs-files-if-journal-lo.patch
+rhel7.6/ext4-change-LRU-to-round-robin-in-extent-status-tree.patch
+rhel7.6/ext4-move-handling-of-list-of-shrinkable-inodes-into.patch
+rhel7.6/ext4-limit-number-of-scanned-extents-in-status-tree-.patch
+rhel7.6/ext4-cleanup-flag-definitions-for-extent-status-tree.patch
+rhel7.6/ext4-introduce-aging-to-extent-status-tree.patch
rhel7.2/ext4-simple-blockalloc.patch
rhel7/ext4-mballoc-skip-uninit-groups-cr0.patch
rhel7.7/ext4-mballoc-prefetch.patch
+rhel7.6/ext4-track-extent-status-tree-shrinker-delay-statict.patch
+rhel7.6/ext4-remove-extent-status-procfs-files-if-journal-lo.patch
+rhel7.6/ext4-change-LRU-to-round-robin-in-extent-status-tree.patch
+rhel7.6/ext4-move-handling-of-list-of-shrinkable-inodes-into.patch
+rhel7.6/ext4-limit-number-of-scanned-extents-in-status-tree-.patch
+rhel7.6/ext4-cleanup-flag-definitions-for-extent-status-tree.patch
+rhel7.6/ext4-introduce-aging-to-extent-status-tree.patch