Whamcloud - gitweb
LU-13300 ldiskfs: port patches to improve extent status shrink
[fs/lustre-release.git] / ldiskfs / kernel_patches / patches / rhel7.6 / ext4-limit-number-of-scanned-extents-in-status-tree-.patch
diff --git a/ldiskfs/kernel_patches/patches/rhel7.6/ext4-limit-number-of-scanned-extents-in-status-tree-.patch b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-limit-number-of-scanned-extents-in-status-tree-.patch
new file mode 100644 (file)
index 0000000..30d3306
--- /dev/null
@@ -0,0 +1,235 @@
+From b72242d714ac3968bbb25867718e731be217e87b Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Tue, 25 Nov 2014 11:51:23 -0500
+Subject: [PATCH 5/7] ext4: limit number of scanned extents in status tree
+ shrinker
+
+Currently we scan extent status trees of inodes until we reclaim nr_to_scan
+extents. This can however require a lot of scanning when there are lots
+of delayed extents (as those cannot be reclaimed).
+
+Change shrinker to work as shrinkers are supposed to and *scan* only
+nr_to_scan extents regardless of how many extents did we actually
+reclaim. We however need to be careful and avoid scanning each status
+tree from the beginning - that could lead to a situation where we would
+not be able to reclaim anything at all when first nr_to_scan extents in
+the tree are always unreclaimable. We remember with each inode offset
+where we stopped scanning and continue from there when we next come
+across the inode.
+
+Note that we also need to update places calling __es_shrink() manually
+to pass reasonable nr_to_scan to have a chance of reclaiming anything and
+not just 1.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/ext4.h           |  5 ++-
+ fs/ext4/extents_status.c | 91 ++++++++++++++++++++++++++--------------
+ fs/ext4/super.c          |  1 +
+ 3 files changed, 65 insertions(+), 32 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 0813afd6..2893a168 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1020,6 +1020,9 @@ struct ext4_inode_info {
+       struct list_head i_es_list;
+       unsigned int i_es_all_nr;       /* protected by i_es_lock */
+       unsigned int i_es_shk_nr;       /* protected by i_es_lock */
++      ext4_lblk_t i_es_shrink_lblk;   /* Offset where we start searching for
++                                         extents to shrink. Protected by
++                                         i_es_lock  */
+       /* ialloc */
+       ext4_group_t    i_last_alloc_group;
+@@ -1481,7 +1484,7 @@ struct ext4_sb_info {
+       /* Reclaim extents from extent status tree */
+       struct shrinker s_es_shrinker;
+-      struct list_head s_es_list;
++      struct list_head s_es_list;     /* List of inodes with reclaimable extents */
+       long s_es_nr_inode;
+       struct ext4_es_stats s_es_stats;
+       spinlock_t s_es_lock ____cacheline_aligned_in_smp;
+diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
+index edd49793..b78eec2a 100644
+--- a/fs/ext4/extents_status.c
++++ b/fs/ext4/extents_status.c
+@@ -147,8 +147,7 @@ static struct kmem_cache *ext4_es_cachep;
+ static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
+ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+                             ext4_lblk_t end);
+-static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+-                                     int nr_to_scan);
++static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
+ static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+                      struct ext4_inode_info *locked_ei);
+@@ -726,7 +725,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ retry:
+       err = __es_insert_extent(inode, &newes);
+       if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
+-                                        1, EXT4_I(inode)))
++                                        128, EXT4_I(inode)))
+               goto retry;
+       if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
+               err = 0;
+@@ -884,7 +883,7 @@ retry:
+                               es->es_len = orig_es.es_len;
+                               if ((err == -ENOMEM) &&
+                                   __es_shrink(EXT4_SB(inode->i_sb),
+-                                                      1, EXT4_I(inode)))
++                                                      128, EXT4_I(inode)))
+                                       goto retry;
+                               goto out;
+                       }
+@@ -976,7 +975,7 @@ static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+       ktime_t start_time;
+       u64 scan_time;
+       int nr_to_walk;
+-      int ret, nr_shrunk = 0;
++      int nr_shrunk = 0;
+       int retried = 0, nr_skipped = 0;
+       es_stats = &sbi->s_es_stats;
+@@ -994,7 +993,7 @@ retry:
+               ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
+                                     i_es_list);
+               /* Move the inode to the tail */
+-              list_move(&ei->i_es_list, sbi->s_es_list.prev);
++              list_move_tail(&ei->i_es_list, &sbi->s_es_list);
+               /*
+                * Normally we try hard to avoid shrinking precached inodes,
+                * but we will as a last resort.
+@@ -1015,12 +1014,10 @@ retry:
+                */
+               spin_unlock(&sbi->s_es_lock);
+-              ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
++              nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
+               write_unlock(&ei->i_es_lock);
+-              nr_shrunk += ret;
+-              nr_to_scan -= ret;
+-              if (nr_to_scan == 0)
++              if (nr_to_scan <= 0)
+                       goto out;
+               spin_lock(&sbi->s_es_lock);
+       }
+@@ -1037,7 +1034,7 @@ retry:
+       }
+       if (locked_ei && nr_shrunk == 0)
+-              nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
++              nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);
+ out:
+       scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+       if (likely(es_stats->es_stats_scan_time))
+@@ -1213,27 +1210,32 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
+       unregister_shrinker(&sbi->s_es_shrinker);
+ }
+-static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+-                                     int nr_to_scan)
++/*
++ * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at
++ * most *nr_to_scan extents, update *nr_to_scan accordingly.
++ *
++ * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan.
++ * Increment *nr_shrunk by the number of reclaimed extents. Also update
++ * ei->i_es_shrink_lblk to where we should continue scanning.
++ */
++static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
++                               int *nr_to_scan, int *nr_shrunk)
+ {
+       struct inode *inode = &ei->vfs_inode;
+       struct ext4_es_tree *tree = &ei->i_es_tree;
+-      struct rb_node *node;
+       struct extent_status *es;
+-      int nr_shrunk = 0;
+-      static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+-                                    DEFAULT_RATELIMIT_BURST);
+-
+-      if (ei->i_es_shk_nr == 0)
+-              return 0;
+-
+-      if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
+-          __ratelimit(&_rs))
+-              ext4_warning(inode->i_sb, "forced shrink of precached extents");
++      struct rb_node *node;
+-      node = rb_first(&tree->root);
+-      while (node != NULL) {
+-              es = rb_entry(node, struct extent_status, rb_node);
++      es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
++      if (!es)
++              goto out_wrap;
++      node = &es->rb_node;
++      while (*nr_to_scan > 0) {
++              if (es->es_lblk > end) {
++                      ei->i_es_shrink_lblk = end + 1;
++                      return 0;
++              }
++              (*nr_to_scan)--;
+               node = rb_next(&es->rb_node);
+               /*
+                * We can't reclaim delayed extent from status tree because
+@@ -1242,11 +1244,38 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+               if (!ext4_es_is_delayed(es)) {
+                       rb_erase(&es->rb_node, &tree->root);
+                       ext4_es_free_extent(inode, es);
+-                      nr_shrunk++;
+-                      if (--nr_to_scan == 0)
+-                              break;
++                      (*nr_shrunk)++;
+               }
++              if (!node)
++                      goto out_wrap;
++              es = rb_entry(node, struct extent_status, rb_node);
+       }
+-      tree->cache_es = NULL;
++      ei->i_es_shrink_lblk = es->es_lblk;
++      return 1;
++out_wrap:
++      ei->i_es_shrink_lblk = 0;
++      return 0;
++}
++
++static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
++{
++      struct inode *inode = &ei->vfs_inode;
++      int nr_shrunk = 0;
++      ext4_lblk_t start = ei->i_es_shrink_lblk;
++      static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
++                                    DEFAULT_RATELIMIT_BURST);
++
++      if (ei->i_es_shk_nr == 0)
++              return 0;
++
++      if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
++          __ratelimit(&_rs))
++              ext4_warning(inode->i_sb, "forced shrink of precached extents");
++
++      if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
++          start != 0)
++              es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);
++
++      ei->i_es_tree.cache_es = NULL;
+       return nr_shrunk;
+ }
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index 8a81fa73..d9cd4ff9 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -945,6 +945,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
+       INIT_LIST_HEAD(&ei->i_es_list);
+       ei->i_es_all_nr = 0;
+       ei->i_es_shk_nr = 0;
++      ei->i_es_shrink_lblk = 0;
+       ei->i_reserved_data_blocks = 0;
+       ei->i_reserved_meta_blocks = 0;
+       ei->i_allocated_meta_blocks = 0;
+-- 
+2.24.1
+