--- /dev/null
+From b72242d714ac3968bbb25867718e731be217e87b Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Tue, 25 Nov 2014 11:51:23 -0500
+Subject: [PATCH 5/7] ext4: limit number of scanned extents in status tree
+ shrinker
+
+Currently we scan extent status trees of inodes until we reclaim nr_to_scan
+extents. This can however require a lot of scanning when there are lots
+of delayed extents (as those cannot be reclaimed).
+
+Change shrinker to work as shrinkers are supposed to and *scan* only
+nr_to_scan extents regardless of how many extents did we actually
+reclaim. We however need to be careful and avoid scanning each status
+tree from the beginning - that could lead to a situation where we would
+not be able to reclaim anything at all when first nr_to_scan extents in
+the tree are always unreclaimable. We remember with each inode offset
+where we stopped scanning and continue from there when we next come
+across the inode.
+
+Note that we also need to update places calling __es_shrink() manually
+to pass reasonable nr_to_scan to have a chance of reclaiming anything and
+not just 1.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/ext4.h | 5 ++-
+ fs/ext4/extents_status.c | 91 ++++++++++++++++++++++++++--------------
+ fs/ext4/super.c | 1 +
+ 3 files changed, 65 insertions(+), 32 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 0813afd6..2893a168 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1020,6 +1020,9 @@ struct ext4_inode_info {
+ struct list_head i_es_list;
+ unsigned int i_es_all_nr; /* protected by i_es_lock */
+ unsigned int i_es_shk_nr; /* protected by i_es_lock */
++ ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for
++ extents to shrink. Protected by
++ i_es_lock */
+
+ /* ialloc */
+ ext4_group_t i_last_alloc_group;
+@@ -1481,7 +1484,7 @@ struct ext4_sb_info {
+
+ /* Reclaim extents from extent status tree */
+ struct shrinker s_es_shrinker;
+- struct list_head s_es_list;
++ struct list_head s_es_list; /* List of inodes with reclaimable extents */
+ long s_es_nr_inode;
+ struct ext4_es_stats s_es_stats;
+ spinlock_t s_es_lock ____cacheline_aligned_in_smp;
+diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
+index edd49793..b78eec2a 100644
+--- a/fs/ext4/extents_status.c
++++ b/fs/ext4/extents_status.c
+@@ -147,8 +147,7 @@ static struct kmem_cache *ext4_es_cachep;
+ static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
+ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t end);
+-static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+- int nr_to_scan);
++static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
+ static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+ struct ext4_inode_info *locked_ei);
+
+@@ -726,7 +725,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ retry:
+ err = __es_insert_extent(inode, &newes);
+ if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
+- 1, EXT4_I(inode)))
++ 128, EXT4_I(inode)))
+ goto retry;
+ if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
+ err = 0;
+@@ -884,7 +883,7 @@ retry:
+ es->es_len = orig_es.es_len;
+ if ((err == -ENOMEM) &&
+ __es_shrink(EXT4_SB(inode->i_sb),
+- 1, EXT4_I(inode)))
++ 128, EXT4_I(inode)))
+ goto retry;
+ goto out;
+ }
+@@ -976,7 +975,7 @@ static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+ ktime_t start_time;
+ u64 scan_time;
+ int nr_to_walk;
+- int ret, nr_shrunk = 0;
++ int nr_shrunk = 0;
+ int retried = 0, nr_skipped = 0;
+
+ es_stats = &sbi->s_es_stats;
+@@ -994,7 +993,7 @@ retry:
+ ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
+ i_es_list);
+ /* Move the inode to the tail */
+- list_move(&ei->i_es_list, sbi->s_es_list.prev);
++ list_move_tail(&ei->i_es_list, &sbi->s_es_list);
+ /*
+ * Normally we try hard to avoid shrinking precached inodes,
+ * but we will as a last resort.
+@@ -1015,12 +1014,10 @@ retry:
+ */
+ spin_unlock(&sbi->s_es_lock);
+
+- ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
++ nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
+ write_unlock(&ei->i_es_lock);
+
+- nr_shrunk += ret;
+- nr_to_scan -= ret;
+- if (nr_to_scan == 0)
++ if (nr_to_scan <= 0)
+ goto out;
+ spin_lock(&sbi->s_es_lock);
+ }
+@@ -1037,7 +1034,7 @@ retry:
+ }
+
+ if (locked_ei && nr_shrunk == 0)
+- nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
++ nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);
+ out:
+ scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+ if (likely(es_stats->es_stats_scan_time))
+@@ -1213,27 +1210,32 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
+ unregister_shrinker(&sbi->s_es_shrinker);
+ }
+
+-static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+- int nr_to_scan)
++/*
++ * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at
++ * most *nr_to_scan extents, update *nr_to_scan accordingly.
++ *
++ * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan.
++ * Increment *nr_shrunk by the number of reclaimed extents. Also update
++ * ei->i_es_shrink_lblk to where we should continue scanning.
++ */
++static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
++ int *nr_to_scan, int *nr_shrunk)
+ {
+ struct inode *inode = &ei->vfs_inode;
+ struct ext4_es_tree *tree = &ei->i_es_tree;
+- struct rb_node *node;
+ struct extent_status *es;
+- int nr_shrunk = 0;
+- static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+- DEFAULT_RATELIMIT_BURST);
+-
+- if (ei->i_es_shk_nr == 0)
+- return 0;
+-
+- if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
+- __ratelimit(&_rs))
+- ext4_warning(inode->i_sb, "forced shrink of precached extents");
++ struct rb_node *node;
+
+- node = rb_first(&tree->root);
+- while (node != NULL) {
+- es = rb_entry(node, struct extent_status, rb_node);
++ es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
++ if (!es)
++ goto out_wrap;
++ node = &es->rb_node;
++ while (*nr_to_scan > 0) {
++ if (es->es_lblk > end) {
++ ei->i_es_shrink_lblk = end + 1;
++ return 0;
++ }
++ (*nr_to_scan)--;
+ node = rb_next(&es->rb_node);
+ /*
+ * We can't reclaim delayed extent from status tree because
+@@ -1242,11 +1244,38 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+ if (!ext4_es_is_delayed(es)) {
+ rb_erase(&es->rb_node, &tree->root);
+ ext4_es_free_extent(inode, es);
+- nr_shrunk++;
+- if (--nr_to_scan == 0)
+- break;
++ (*nr_shrunk)++;
+ }
++ if (!node)
++ goto out_wrap;
++ es = rb_entry(node, struct extent_status, rb_node);
+ }
+- tree->cache_es = NULL;
++ ei->i_es_shrink_lblk = es->es_lblk;
++ return 1;
++out_wrap:
++ ei->i_es_shrink_lblk = 0;
++ return 0;
++}
++
++static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
++{
++ struct inode *inode = &ei->vfs_inode;
++ int nr_shrunk = 0;
++ ext4_lblk_t start = ei->i_es_shrink_lblk;
++ static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
++ DEFAULT_RATELIMIT_BURST);
++
++ if (ei->i_es_shk_nr == 0)
++ return 0;
++
++ if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
++ __ratelimit(&_rs))
++ ext4_warning(inode->i_sb, "forced shrink of precached extents");
++
++ if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
++ start != 0)
++ es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);
++
++ ei->i_es_tree.cache_es = NULL;
+ return nr_shrunk;
+ }
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index 8a81fa73..d9cd4ff9 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -945,6 +945,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
+ INIT_LIST_HEAD(&ei->i_es_list);
+ ei->i_es_all_nr = 0;
+ ei->i_es_shk_nr = 0;
++ ei->i_es_shrink_lblk = 0;
+ ei->i_reserved_data_blocks = 0;
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_allocated_meta_blocks = 0;
+--
+2.24.1
+