From cc058b253ac7987a8350880e1c34a39cdf0625e8 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Thu, 27 Feb 2020 17:21:53 +0800 Subject: [PATCH] LU-13300 ldiskfs: port patches to improve extent status shrink We see serious extent status shrink problem on some customer sites, the backtrace could be: NMI watchdog: BUG: soft lockup - CPU#6 stuck for 22s! [kswapd0:106] [] merge+0x62/0xc0 [] ? ldiskfs_init_inode_table+0x410/0x410 [ldiskfs] [] list_sort+0x9b/0x250 [] __ldiskfs_es_shrink+0x1ce/0x2a0 [ldiskfs] [] ldiskfs_es_shrink+0xb4/0x130 [ldiskfs] [] shrink_slab+0x175/0x340 [] ? vmpressure+0x87/0x90 [] balance_pgdat+0x3a8/0x5e0 Backport following Linux upstream commits since v3.18 to RHEL7: Linux-commit: eb68d0e2fc5a4e5c06324ea5f485fccbae626d05 ext4: track extent status tree shrinker delay statictics Linux-commit: 50460fe8c6d1d95b16427936e351f277a1c72d43 ext4: remove extent status procfs files if journal load fails Linux-commit: edaa53cac8fd4b96ed4b8f96c4933158ff2dd337 ext4: change LRU to round-robin in extent status tree shrinker Linux-commit: b0dea4c1651f3cdb6d17604fa473e72cb74cdc6b ext4: move handling of list of shrinkable inodes into extent status code Linux-commit: dd4759255188771e60cf3455982959a1ba04f4eb ext4: limit number of scanned extents in status tree shrinker Linux-commit: 624d0f1dd7c80d2bac4fc3066b2ff3947f890883 ext4: cleanup flag definitions for extent status tree Linux-commit: 2be12de98a1cc21c4de4e2d6fb2bf5aa0a279947 ext4: introduce aging to extent status tree Test-Parameters: fstype=ldiskfs serverdistro=el7.7 Change-Id: Idd97872b1663bc001a63274a430eaade66efd37d Signed-off-by: Wang Shilong Reviewed-on: https://review.whamcloud.com/37749 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Li Dongyang Reviewed-by: Andreas Dilger --- ...-LRU-to-round-robin-in-extent-status-tree.patch | 570 +++++++++++++++++++++ ...p-flag-definitions-for-extent-status-tree.patch | 139 +++++ ...xt4-introduce-aging-to-extent-status-tree.patch | 156 ++++++ 
...number-of-scanned-extents-in-status-tree-.patch | 235 +++++++++ ...andling-of-list-of-shrinkable-inodes-into.patch | 147 ++++++ ...-extent-status-procfs-files-if-journal-lo.patch | 49 ++ ...extent-status-tree-shrinker-delay-statict.patch | 464 +++++++++++++++++ .../series/ldiskfs-3.10-rhel7.6.series | 7 + .../series/ldiskfs-3.10-rhel7.7.series | 7 + 9 files changed, 1774 insertions(+) create mode 100644 ldiskfs/kernel_patches/patches/rhel7.6/ext4-change-LRU-to-round-robin-in-extent-status-tree.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel7.6/ext4-cleanup-flag-definitions-for-extent-status-tree.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel7.6/ext4-introduce-aging-to-extent-status-tree.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel7.6/ext4-limit-number-of-scanned-extents-in-status-tree-.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel7.6/ext4-move-handling-of-list-of-shrinkable-inodes-into.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel7.6/ext4-remove-extent-status-procfs-files-if-journal-lo.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel7.6/ext4-track-extent-status-tree-shrinker-delay-statict.patch diff --git a/ldiskfs/kernel_patches/patches/rhel7.6/ext4-change-LRU-to-round-robin-in-extent-status-tree.patch b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-change-LRU-to-round-robin-in-extent-status-tree.patch new file mode 100644 index 0000000..bc04f8b --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-change-LRU-to-round-robin-in-extent-status-tree.patch @@ -0,0 +1,570 @@ +From 4fac310a77c918d6a235a55cb76cf2f9bb22de71 Mon Sep 17 00:00:00 2001 +From: Zheng Liu +Date: Tue, 25 Nov 2014 11:45:37 -0500 +Subject: [PATCH 3/7] ext4: change LRU to round-robin in extent status tree + shrinker + +In this commit we discard the lru algorithm for inodes with extent +status tree because it takes significant effort to maintain a lru list +in extent status tree shrinker and the shrinker 
can take a long time to +scan this lru list in order to reclaim some objects. + +We replace the lru ordering with a simple round-robin. After that we +never need to keep a lru list. That means that the list needn't be +sorted if the shrinker can not reclaim any objects in the first round. + +Cc: Andreas Dilger +Signed-off-by: Zheng Liu +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext4/ext4.h | 10 +- + fs/ext4/extents.c | 4 +- + fs/ext4/extents_status.c | 221 +++++++++++++++++---------------------- + fs/ext4/extents_status.h | 7 +- + fs/ext4/inode.c | 4 +- + fs/ext4/ioctl.c | 4 +- + fs/ext4/super.c | 7 +- + 7 files changed, 112 insertions(+), 145 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index cc5ba587..0813afd6 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1017,10 +1017,9 @@ struct ext4_inode_info { + /* extents status tree */ + struct ext4_es_tree i_es_tree; + rwlock_t i_es_lock; +- struct list_head i_es_lru; ++ struct list_head i_es_list; + unsigned int i_es_all_nr; /* protected by i_es_lock */ +- unsigned int i_es_lru_nr; /* protected by i_es_lock */ +- unsigned long i_touch_when; /* jiffies of last accessing */ ++ unsigned int i_es_shk_nr; /* protected by i_es_lock */ + + /* ialloc */ + ext4_group_t i_last_alloc_group; +@@ -1482,9 +1481,10 @@ struct ext4_sb_info { + + /* Reclaim extents from extent status tree */ + struct shrinker s_es_shrinker; +- struct list_head s_es_lru; ++ struct list_head s_es_list; ++ long s_es_nr_inode; + struct ext4_es_stats s_es_stats; +- spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; ++ spinlock_t s_es_lock ____cacheline_aligned_in_smp; + + /* Ratelimit ext4 messages. */ + struct ratelimit_state s_err_ratelimit_state; +diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c +index f618d0ba..c012dc51 100644 +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -4689,7 +4689,7 @@ out2: + + trace_ext4_ext_map_blocks_exit(inode, flags, map, + err ? 
err : allocated); +- ext4_es_lru_add(inode); ++ ext4_es_list_add(inode); + return err ? err : allocated; + } + +@@ -5263,7 +5263,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + error = ext4_fill_fiemap_extents(inode, start_blk, + len_blks, fieinfo); + } +- ext4_es_lru_add(inode); ++ ext4_es_list_add(inode); + return error; + } + +diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c +index 7dfed27b..382a7bf9 100644 +--- a/fs/ext4/extents_status.c ++++ b/fs/ext4/extents_status.c +@@ -149,8 +149,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t end); + static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, + int nr_to_scan); +-static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, +- struct ext4_inode_info *locked_ei); ++static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, ++ struct ext4_inode_info *locked_ei); + + int __init ext4_init_es(void) + { +@@ -298,6 +298,36 @@ out: + trace_ext4_es_find_delayed_extent_range_exit(inode, es); + } + ++void ext4_es_list_add(struct inode *inode) ++{ ++ struct ext4_inode_info *ei = EXT4_I(inode); ++ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ++ ++ if (!list_empty(&ei->i_es_list)) ++ return; ++ ++ spin_lock(&sbi->s_es_lock); ++ if (list_empty(&ei->i_es_list)) { ++ list_add_tail(&ei->i_es_list, &sbi->s_es_list); ++ sbi->s_es_nr_inode++; ++ } ++ spin_unlock(&sbi->s_es_lock); ++} ++ ++void ext4_es_list_del(struct inode *inode) ++{ ++ struct ext4_inode_info *ei = EXT4_I(inode); ++ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ++ ++ spin_lock(&sbi->s_es_lock); ++ if (!list_empty(&ei->i_es_list)) { ++ list_del_init(&ei->i_es_list); ++ sbi->s_es_nr_inode--; ++ WARN_ON_ONCE(sbi->s_es_nr_inode < 0); ++ } ++ spin_unlock(&sbi->s_es_lock); ++} ++ + static struct extent_status * + ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, + ext4_fsblk_t pblk) +@@ -314,9 +344,9 @@ 
ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, + * We don't count delayed extent because we never try to reclaim them + */ + if (!ext4_es_is_delayed(es)) { +- EXT4_I(inode)->i_es_lru_nr++; ++ EXT4_I(inode)->i_es_shk_nr++; + percpu_counter_inc(&EXT4_SB(inode->i_sb)-> +- s_es_stats.es_stats_lru_cnt); ++ s_es_stats.es_stats_shk_cnt); + } + + EXT4_I(inode)->i_es_all_nr++; +@@ -330,12 +360,12 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) + EXT4_I(inode)->i_es_all_nr--; + percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); + +- /* Decrease the lru counter when this es is not delayed */ ++ /* Decrease the shrink counter when this es is not delayed */ + if (!ext4_es_is_delayed(es)) { +- BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); +- EXT4_I(inode)->i_es_lru_nr--; ++ BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0); ++ EXT4_I(inode)->i_es_shk_nr--; + percpu_counter_dec(&EXT4_SB(inode->i_sb)-> +- s_es_stats.es_stats_lru_cnt); ++ s_es_stats.es_stats_shk_cnt); + } + + kmem_cache_free(ext4_es_cachep, es); +@@ -693,8 +723,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + goto error; + retry: + err = __es_insert_extent(inode, &newes); +- if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, +- EXT4_I(inode))) ++ if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), ++ 1, EXT4_I(inode))) + goto retry; + if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) + err = 0; +@@ -851,8 +881,8 @@ retry: + es->es_lblk = orig_es.es_lblk; + es->es_len = orig_es.es_len; + if ((err == -ENOMEM) && +- __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, +- EXT4_I(inode))) ++ __es_shrink(EXT4_SB(inode->i_sb), ++ 1, EXT4_I(inode))) + goto retry; + goto out; + } +@@ -924,6 +954,11 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + end = lblk + len - 1; + BUG_ON(end < lblk); + ++ /* ++ * ext4_clear_inode() depends on us taking i_es_lock unconditionally ++ * so that we are sure __es_shrink() 
is done with the inode before it ++ * is reclaimed. ++ */ + write_lock(&EXT4_I(inode)->i_es_lock); + err = __es_remove_extent(inode, lblk, end); + write_unlock(&EXT4_I(inode)->i_es_lock); +@@ -931,112 +966,77 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + return err; + } + +-static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, +- struct list_head *b) +-{ +- struct ext4_inode_info *eia, *eib; +- eia = list_entry(a, struct ext4_inode_info, i_es_lru); +- eib = list_entry(b, struct ext4_inode_info, i_es_lru); +- +- if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && +- !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) +- return 1; +- if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && +- ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) +- return -1; +- if (eia->i_touch_when == eib->i_touch_when) +- return 0; +- if (time_after(eia->i_touch_when, eib->i_touch_when)) +- return 1; +- else +- return -1; +-} +- +-static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, +- struct ext4_inode_info *locked_ei) ++static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, ++ struct ext4_inode_info *locked_ei) + { + struct ext4_inode_info *ei; + struct ext4_es_stats *es_stats; +- struct list_head *cur, *tmp; +- LIST_HEAD(skipped); + ktime_t start_time; + u64 scan_time; ++ int nr_to_walk; + int ret, nr_shrunk = 0; +- int retried = 0, skip_precached = 1, nr_skipped = 0; ++ int retried = 0, nr_skipped = 0; + + es_stats = &sbi->s_es_stats; + start_time = ktime_get(); +- spin_lock(&sbi->s_es_lru_lock); + + retry: +- list_for_each_safe(cur, tmp, &sbi->s_es_lru) { +- /* +- * If we have already reclaimed all extents from extent +- * status tree, just stop the loop immediately. 
+- */ +- if (percpu_counter_read_positive( +- &es_stats->es_stats_lru_cnt) == 0) +- break; +- +- ei = list_entry(cur, struct ext4_inode_info, i_es_lru); ++ spin_lock(&sbi->s_es_lock); ++ nr_to_walk = sbi->s_es_nr_inode; ++ while (nr_to_walk-- > 0) { + ++ if (list_empty(&sbi->s_es_list)) { ++ spin_unlock(&sbi->s_es_lock); ++ goto out; ++ } ++ ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info, ++ i_es_list); ++ /* Move the inode to the tail */ ++ list_move(&ei->i_es_list, sbi->s_es_list.prev); + /* +- * Skip the inode that is newer than the last_sorted +- * time. Normally we try hard to avoid shrinking +- * precached inodes, but we will as a last resort. ++ * Normally we try hard to avoid shrinking precached inodes, ++ * but we will as a last resort. + */ +- if ((es_stats->es_stats_last_sorted < ei->i_touch_when) || +- (skip_precached && ext4_test_inode_state(&ei->vfs_inode, +- EXT4_STATE_EXT_PRECACHED))) { ++ if (!retried && ext4_test_inode_state(&ei->vfs_inode, ++ EXT4_STATE_EXT_PRECACHED)) { + nr_skipped++; +- list_move_tail(cur, &skipped); + continue; + } + +- if (ei->i_es_lru_nr == 0 || ei == locked_ei || +- !write_trylock(&ei->i_es_lock)) +- continue; ++ if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) { ++ nr_skipped++; ++ continue; ++ } ++ /* ++ * Now we hold i_es_lock which protects us from inode reclaim ++ * freeing inode under us ++ */ ++ spin_unlock(&sbi->s_es_lock); + + ret = __es_try_to_reclaim_extents(ei, nr_to_scan); +- if (ei->i_es_lru_nr == 0) +- list_del_init(&ei->i_es_lru); + write_unlock(&ei->i_es_lock); + + nr_shrunk += ret; + nr_to_scan -= ret; + if (nr_to_scan == 0) +- break; ++ goto out; ++ spin_lock(&sbi->s_es_lock); + } + +- /* Move the newer inodes into the tail of the LRU list. */ +- list_splice_tail(&skipped, &sbi->s_es_lru); +- INIT_LIST_HEAD(&skipped); ++ spin_unlock(&sbi->s_es_lock); + + /* + * If we skipped any inodes, and we weren't able to make any +- * forward progress, sort the list and try again. 
++ * forward progress, try again to scan precached inodes. + */ + if ((nr_shrunk == 0) && nr_skipped && !retried) { + retried++; +- list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); +- es_stats->es_stats_last_sorted = jiffies; +- ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, +- i_es_lru); +- /* +- * If there are no non-precached inodes left on the +- * list, start releasing precached extents. +- */ +- if (ext4_test_inode_state(&ei->vfs_inode, +- EXT4_STATE_EXT_PRECACHED)) +- skip_precached = 0; + goto retry; + } + +- spin_unlock(&sbi->s_es_lru_lock); +- + if (locked_ei && nr_shrunk == 0) + nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); +- ++out: + scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + if (likely(es_stats->es_stats_scan_time)) + es_stats->es_stats_scan_time = (scan_time + +@@ -1061,15 +1061,15 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) + int nr_to_scan = sc->nr_to_scan; + int ret, nr_shrunk; + +- ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); ++ ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); + trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret); + + if (!nr_to_scan) + return ret; + +- nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); ++ nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL); + +- ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); ++ ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); + trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); + return ret; + } +@@ -1096,28 +1096,24 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) + return 0; + + /* here we just find an inode that has the max nr. 
of objects */ +- spin_lock(&sbi->s_es_lru_lock); +- list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) { ++ spin_lock(&sbi->s_es_lock); ++ list_for_each_entry(ei, &sbi->s_es_list, i_es_list) { + inode_cnt++; + if (max && max->i_es_all_nr < ei->i_es_all_nr) + max = ei; + else if (!max) + max = ei; + } +- spin_unlock(&sbi->s_es_lru_lock); ++ spin_unlock(&sbi->s_es_lock); + + seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", + percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), +- percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt)); ++ percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt)); + seq_printf(seq, " %lu/%lu cache hits/misses\n", + es_stats->es_stats_cache_hits, + es_stats->es_stats_cache_misses); +- if (es_stats->es_stats_last_sorted != 0) +- seq_printf(seq, " %u ms last sorted interval\n", +- jiffies_to_msecs(jiffies - +- es_stats->es_stats_last_sorted)); + if (inode_cnt) +- seq_printf(seq, " %d inodes on lru list\n", inode_cnt); ++ seq_printf(seq, " %d inodes on list\n", inode_cnt); + + seq_printf(seq, "average:\n %llu us scan time\n", + div_u64(es_stats->es_stats_scan_time, 1000)); +@@ -1126,7 +1122,7 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) + seq_printf(seq, + "maximum:\n %lu inode (%u objects, %u reclaimable)\n" + " %llu us max scan time\n", +- max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr, ++ max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr, + div_u64(es_stats->es_stats_max_scan_time, 1000)); + + return 0; +@@ -1175,9 +1171,9 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) + { + int err; + +- INIT_LIST_HEAD(&sbi->s_es_lru); +- spin_lock_init(&sbi->s_es_lru_lock); +- sbi->s_es_stats.es_stats_last_sorted = 0; ++ INIT_LIST_HEAD(&sbi->s_es_list); ++ sbi->s_es_nr_inode = 0; ++ spin_lock_init(&sbi->s_es_lock); + sbi->s_es_stats.es_stats_shrunk = 0; + sbi->s_es_stats.es_stats_cache_hits = 0; + sbi->s_es_stats.es_stats_cache_misses = 0; +@@ -1187,7 +1183,7 @@ 
int ext4_es_register_shrinker(struct ext4_sb_info *sbi) + 0, GFP_KERNEL); + if (err) + return err; +- err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, ++ err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, + 0, GFP_KERNEL); + if (err) + goto err; +@@ -1211,37 +1207,10 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) + if (sbi->s_proc) + remove_proc_entry("es_shrinker_info", sbi->s_proc); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); +- percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); ++ percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); + unregister_shrinker(&sbi->s_es_shrinker); + } + +-void ext4_es_lru_add(struct inode *inode) +-{ +- struct ext4_inode_info *ei = EXT4_I(inode); +- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); +- +- ei->i_touch_when = jiffies; +- +- if (!list_empty(&ei->i_es_lru)) +- return; +- +- spin_lock(&sbi->s_es_lru_lock); +- if (list_empty(&ei->i_es_lru)) +- list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); +- spin_unlock(&sbi->s_es_lru_lock); +-} +- +-void ext4_es_lru_del(struct inode *inode) +-{ +- struct ext4_inode_info *ei = EXT4_I(inode); +- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); +- +- spin_lock(&sbi->s_es_lru_lock); +- if (!list_empty(&ei->i_es_lru)) +- list_del_init(&ei->i_es_lru); +- spin_unlock(&sbi->s_es_lru_lock); +-} +- + static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, + int nr_to_scan) + { +@@ -1253,7 +1222,7 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + +- if (ei->i_es_lru_nr == 0) ++ if (ei->i_es_shk_nr == 0) + return 0; + + if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && +diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h +index efd5f970..0e6a33e8 100644 +--- a/fs/ext4/extents_status.h ++++ b/fs/ext4/extents_status.h +@@ -65,14 +65,13 @@ struct ext4_es_tree { + }; + + struct ext4_es_stats 
{ +- unsigned long es_stats_last_sorted; + unsigned long es_stats_shrunk; + unsigned long es_stats_cache_hits; + unsigned long es_stats_cache_misses; + u64 es_stats_scan_time; + u64 es_stats_max_scan_time; + struct percpu_counter es_stats_all_cnt; +- struct percpu_counter es_stats_lru_cnt; ++ struct percpu_counter es_stats_shk_cnt; + }; + + extern int __init ext4_init_es(void); +@@ -151,7 +150,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es, + + extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); + extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); +-extern void ext4_es_lru_add(struct inode *inode); +-extern void ext4_es_lru_del(struct inode *inode); ++extern void ext4_es_list_add(struct inode *inode); ++extern void ext4_es_list_del(struct inode *inode); + + #endif /* _EXT4_EXTENTS_STATUS_H */ +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 21db5952..f6a2764c 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -523,7 +523,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, + + /* Lookup extent status tree firstly */ + if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { +- ext4_es_lru_add(inode); ++ ext4_es_list_add(inode); + if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { + map->m_pblk = ext4_es_pblock(&es) + + map->m_lblk - es.es_lblk; +@@ -1519,7 +1519,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, + + /* Lookup extent status tree firstly */ + if (ext4_es_lookup_extent(inode, iblock, &es)) { +- ext4_es_lru_add(inode); ++ ext4_es_list_add(inode); + if (ext4_es_is_hole(&es)) { + retval = 0; + down_read(&EXT4_I(inode)->i_data_sem); +diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c +index 858cf709..122d517c 100644 +--- a/fs/ext4/ioctl.c ++++ b/fs/ext4/ioctl.c +@@ -80,8 +80,8 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) + memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); + ext4_es_remove_extent(inode1, 
0, EXT_MAX_BLOCKS); + ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); +- ext4_es_lru_del(inode1); +- ext4_es_lru_del(inode2); ++ ext4_es_list_del(inode1); ++ ext4_es_list_del(inode2); + + isize = i_size_read(inode1); + i_size_write(inode1, i_size_read(inode2)); +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index 95a01d56..ea2a1026 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -942,10 +942,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) + spin_lock_init(&ei->i_prealloc_lock); + ext4_es_init_tree(&ei->i_es_tree); + rwlock_init(&ei->i_es_lock); +- INIT_LIST_HEAD(&ei->i_es_lru); ++ INIT_LIST_HEAD(&ei->i_es_list); + ei->i_es_all_nr = 0; +- ei->i_es_lru_nr = 0; +- ei->i_touch_when = 0; ++ ei->i_es_shk_nr = 0; + ei->i_reserved_data_blocks = 0; + ei->i_reserved_meta_blocks = 0; + ei->i_allocated_meta_blocks = 0; +@@ -1034,7 +1033,7 @@ void ext4_clear_inode(struct inode *inode) + dquot_drop(inode); + ext4_discard_preallocations(inode); + ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); +- ext4_es_lru_del(inode); ++ ext4_es_list_del(inode); + if (EXT4_I(inode)->jinode) { + jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode); +-- +2.24.1 + diff --git a/ldiskfs/kernel_patches/patches/rhel7.6/ext4-cleanup-flag-definitions-for-extent-status-tree.patch b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-cleanup-flag-definitions-for-extent-status-tree.patch new file mode 100644 index 0000000..4d28b12 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-cleanup-flag-definitions-for-extent-status-tree.patch @@ -0,0 +1,139 @@ +From dd5c7af957dd0b9b3b04ef8aacffd601b46bc26c Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Tue, 25 Nov 2014 11:53:47 -0500 +Subject: [PATCH 6/7] ext4: cleanup flag definitions for extent status tree + +Currently flags for extent status tree are defined twice, once shifted +and once without a being shifted. 
Consolidate these definitions into one +place and make some computations automatic to make adding flags less +error prone. Compiler should be clever enough to figure out these are +constants and generate the same code. + +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext4/extents_status.c | 2 ++ + fs/ext4/extents_status.h | 58 ++++++++++++++++++---------------------- + 2 files changed, 28 insertions(+), 32 deletions(-) + +diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c +index b78eec2a..a29708c0 100644 +--- a/fs/ext4/extents_status.c ++++ b/fs/ext4/extents_status.c +@@ -1170,6 +1170,8 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) + { + int err; + ++ /* Make sure we have enough bits for physical block number */ ++ BUILD_BUG_ON(ES_SHIFT < 48); + INIT_LIST_HEAD(&sbi->s_es_list); + sbi->s_es_nr_inode = 0; + spin_lock_init(&sbi->s_es_lock); +diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h +index b0b78b95..e86b1f34 100644 +--- a/fs/ext4/extents_status.h ++++ b/fs/ext4/extents_status.h +@@ -29,25 +29,21 @@ + /* + * These flags live in the high bits of extent_status.es_pblk + */ +-#define ES_SHIFT 60 +- +-#define EXTENT_STATUS_WRITTEN (1 << 3) +-#define EXTENT_STATUS_UNWRITTEN (1 << 2) +-#define EXTENT_STATUS_DELAYED (1 << 1) +-#define EXTENT_STATUS_HOLE (1 << 0) +- +-#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \ +- EXTENT_STATUS_UNWRITTEN | \ +- EXTENT_STATUS_DELAYED | \ +- EXTENT_STATUS_HOLE) ++enum { ++ ES_WRITTEN_B, ++ ES_UNWRITTEN_B, ++ ES_DELAYED_B, ++ ES_HOLE_B, ++ ES_FLAGS ++}; + +-#define ES_WRITTEN (1ULL << 63) +-#define ES_UNWRITTEN (1ULL << 62) +-#define ES_DELAYED (1ULL << 61) +-#define ES_HOLE (1ULL << 60) ++#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) ++#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) + +-#define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \ +- ES_DELAYED | ES_HOLE) ++#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) ++#define EXTENT_STATUS_UNWRITTEN (1 << 
ES_UNWRITTEN_B) ++#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) ++#define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) + + struct ext4_sb_info; + struct ext4_extent; +@@ -92,29 +88,29 @@ extern void ext4_es_find_delayed_extent_range(struct inode *inode, + extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es); + ++static inline unsigned int ext4_es_status(struct extent_status *es) ++{ ++ return es->es_pblk >> ES_SHIFT; ++} ++ + static inline int ext4_es_is_written(struct extent_status *es) + { +- return (es->es_pblk & ES_WRITTEN) != 0; ++ return (ext4_es_status(es) & EXTENT_STATUS_WRITTEN) != 0; + } + + static inline int ext4_es_is_unwritten(struct extent_status *es) + { +- return (es->es_pblk & ES_UNWRITTEN) != 0; ++ return (ext4_es_status(es) & EXTENT_STATUS_UNWRITTEN) != 0; + } + + static inline int ext4_es_is_delayed(struct extent_status *es) + { +- return (es->es_pblk & ES_DELAYED) != 0; ++ return (ext4_es_status(es) & EXTENT_STATUS_DELAYED) != 0; + } + + static inline int ext4_es_is_hole(struct extent_status *es) + { +- return (es->es_pblk & ES_HOLE) != 0; +-} +- +-static inline unsigned int ext4_es_status(struct extent_status *es) +-{ +- return es->es_pblk >> ES_SHIFT; ++ return (ext4_es_status(es) & EXTENT_STATUS_HOLE) != 0; + } + + static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) +@@ -134,18 +130,16 @@ static inline void ext4_es_store_pblock(struct extent_status *es, + static inline void ext4_es_store_status(struct extent_status *es, + unsigned int status) + { +- es->es_pblk = (((ext4_fsblk_t) +- (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | +- (es->es_pblk & ~ES_MASK)); ++ es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | ++ (es->es_pblk & ~ES_MASK); + } + + static inline void ext4_es_store_pblock_status(struct extent_status *es, + ext4_fsblk_t pb, + unsigned int status) + { +- es->es_pblk = (((ext4_fsblk_t) +- (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | +- (pb & ~ES_MASK)); ++ 
es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | ++ (pb & ~ES_MASK); + } + + extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); +-- +2.24.1 + diff --git a/ldiskfs/kernel_patches/patches/rhel7.6/ext4-introduce-aging-to-extent-status-tree.patch b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-introduce-aging-to-extent-status-tree.patch new file mode 100644 index 0000000..ddbe7e7 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-introduce-aging-to-extent-status-tree.patch @@ -0,0 +1,156 @@ +From 1da6da1563df986dd35080d7edcf59b739696c40 Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Tue, 25 Nov 2014 11:55:24 -0500 +Subject: [PATCH 7/7] ext4: introduce aging to extent status tree + +Introduce a simple aging to extent status tree. Each extent has a +REFERENCED bit which gets set when the extent is used. Shrinker then +skips entries with referenced bit set and clears the bit. Thus +frequently used extents have higher chances of staying in memory. + +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext4/extents_status.c | 22 +++++++++++++++++----- + fs/ext4/extents_status.h | 35 +++++++++++++++++++++++++++++++---- + 2 files changed, 48 insertions(+), 9 deletions(-) + +diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c +index a29708c0..0305f308 100644 +--- a/fs/ext4/extents_status.c ++++ b/fs/ext4/extents_status.c +@@ -382,7 +382,7 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) + static int ext4_es_can_be_merged(struct extent_status *es1, + struct extent_status *es2) + { +- if (ext4_es_status(es1) != ext4_es_status(es2)) ++ if (ext4_es_type(es1) != ext4_es_type(es2)) + return 0; + + if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) { +@@ -425,6 +425,8 @@ ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es) + es1 = rb_entry(node, struct extent_status, rb_node); + if (ext4_es_can_be_merged(es1, es)) { + es1->es_len += es->es_len; ++ if 
(ext4_es_is_referenced(es)) ++ ext4_es_set_referenced(es1); + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + es = es1; +@@ -447,6 +449,8 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) + es1 = rb_entry(node, struct extent_status, rb_node); + if (ext4_es_can_be_merged(es, es1)) { + es->es_len += es1->es_len; ++ if (ext4_es_is_referenced(es1)) ++ ext4_es_set_referenced(es); + rb_erase(node, &tree->root); + ext4_es_free_extent(inode, es1); + } +@@ -823,6 +827,8 @@ out: + es->es_lblk = es1->es_lblk; + es->es_len = es1->es_len; + es->es_pblk = es1->es_pblk; ++ if (!ext4_es_is_referenced(es)) ++ ext4_es_set_referenced(es); + stats->es_stats_cache_hits++; + } else { + stats->es_stats_cache_misses++; +@@ -1243,11 +1249,17 @@ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end, + * We can't reclaim delayed extent from status tree because + * fiemap, bigallic, and seek_data/hole need to use it. + */ +- if (!ext4_es_is_delayed(es)) { +- rb_erase(&es->rb_node, &tree->root); +- ext4_es_free_extent(inode, es); +- (*nr_shrunk)++; ++ if (ext4_es_is_delayed(es)) ++ goto next; ++ if (ext4_es_is_referenced(es)) { ++ ext4_es_clear_referenced(es); ++ goto next; + } ++ ++ rb_erase(&es->rb_node, &tree->root); ++ ext4_es_free_extent(inode, es); ++ (*nr_shrunk)++; ++next: + if (!node) + goto out_wrap; + es = rb_entry(node, struct extent_status, rb_node); +diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h +index e86b1f34..691b5261 100644 +--- a/fs/ext4/extents_status.h ++++ b/fs/ext4/extents_status.h +@@ -34,6 +34,7 @@ enum { + ES_UNWRITTEN_B, + ES_DELAYED_B, + ES_HOLE_B, ++ ES_REFERENCED_B, + ES_FLAGS + }; + +@@ -44,6 +45,12 @@ enum { + #define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) + #define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) + #define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) ++#define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B) ++ ++#define ES_TYPE_MASK 
((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ ++ EXTENT_STATUS_UNWRITTEN | \ ++ EXTENT_STATUS_DELAYED | \ ++ EXTENT_STATUS_HOLE) << ES_SHIFT) + + struct ext4_sb_info; + struct ext4_extent; +@@ -93,24 +100,44 @@ static inline unsigned int ext4_es_status(struct extent_status *es) + return es->es_pblk >> ES_SHIFT; + } + ++static inline unsigned int ext4_es_type(struct extent_status *es) ++{ ++ return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT; ++} ++ + static inline int ext4_es_is_written(struct extent_status *es) + { +- return (ext4_es_status(es) & EXTENT_STATUS_WRITTEN) != 0; ++ return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0; + } + + static inline int ext4_es_is_unwritten(struct extent_status *es) + { +- return (ext4_es_status(es) & EXTENT_STATUS_UNWRITTEN) != 0; ++ return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0; + } + + static inline int ext4_es_is_delayed(struct extent_status *es) + { +- return (ext4_es_status(es) & EXTENT_STATUS_DELAYED) != 0; ++ return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; + } + + static inline int ext4_es_is_hole(struct extent_status *es) + { +- return (ext4_es_status(es) & EXTENT_STATUS_HOLE) != 0; ++ return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; ++} ++ ++static inline void ext4_es_set_referenced(struct extent_status *es) ++{ ++ es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; ++} ++ ++static inline void ext4_es_clear_referenced(struct extent_status *es) ++{ ++ es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT); ++} ++ ++static inline int ext4_es_is_referenced(struct extent_status *es) ++{ ++ return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0; + } + + static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) +-- +2.24.1 + diff --git a/ldiskfs/kernel_patches/patches/rhel7.6/ext4-limit-number-of-scanned-extents-in-status-tree-.patch b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-limit-number-of-scanned-extents-in-status-tree-.patch new file mode 100644 index 
0000000..30d3306 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-limit-number-of-scanned-extents-in-status-tree-.patch @@ -0,0 +1,235 @@ +From b72242d714ac3968bbb25867718e731be217e87b Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Tue, 25 Nov 2014 11:51:23 -0500 +Subject: [PATCH 5/7] ext4: limit number of scanned extents in status tree + shrinker + +Currently we scan extent status trees of inodes until we reclaim nr_to_scan +extents. This can however require a lot of scanning when there are lots +of delayed extents (as those cannot be reclaimed). + +Change shrinker to work as shrinkers are supposed to and *scan* only +nr_to_scan extents regardless of how many extents did we actually +reclaim. We however need to be careful and avoid scanning each status +tree from the beginning - that could lead to a situation where we would +not be able to reclaim anything at all when first nr_to_scan extents in +the tree are always unreclaimable. We remember with each inode offset +where we stopped scanning and continue from there when we next come +across the inode. + +Note that we also need to update places calling __es_shrink() manually +to pass reasonable nr_to_scan to have a chance of reclaiming anything and +not just 1. + +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext4/ext4.h | 5 ++- + fs/ext4/extents_status.c | 91 ++++++++++++++++++++++++++-------------- + fs/ext4/super.c | 1 + + 3 files changed, 65 insertions(+), 32 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 0813afd6..2893a168 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1020,6 +1020,9 @@ struct ext4_inode_info { + struct list_head i_es_list; + unsigned int i_es_all_nr; /* protected by i_es_lock */ + unsigned int i_es_shk_nr; /* protected by i_es_lock */ ++ ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for ++ extents to shrink. 
Protected by ++ i_es_lock */ + + /* ialloc */ + ext4_group_t i_last_alloc_group; +@@ -1481,7 +1484,7 @@ struct ext4_sb_info { + + /* Reclaim extents from extent status tree */ + struct shrinker s_es_shrinker; +- struct list_head s_es_list; ++ struct list_head s_es_list; /* List of inodes with reclaimable extents */ + long s_es_nr_inode; + struct ext4_es_stats s_es_stats; + spinlock_t s_es_lock ____cacheline_aligned_in_smp; +diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c +index edd49793..b78eec2a 100644 +--- a/fs/ext4/extents_status.c ++++ b/fs/ext4/extents_status.c +@@ -147,8 +147,7 @@ static struct kmem_cache *ext4_es_cachep; + static int __es_insert_extent(struct inode *inode, struct extent_status *newes); + static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t end); +-static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, +- int nr_to_scan); ++static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); + static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei); + +@@ -726,7 +725,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + retry: + err = __es_insert_extent(inode, &newes); + if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), +- 1, EXT4_I(inode))) ++ 128, EXT4_I(inode))) + goto retry; + if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) + err = 0; +@@ -884,7 +883,7 @@ retry: + es->es_len = orig_es.es_len; + if ((err == -ENOMEM) && + __es_shrink(EXT4_SB(inode->i_sb), +- 1, EXT4_I(inode))) ++ 128, EXT4_I(inode))) + goto retry; + goto out; + } +@@ -976,7 +975,7 @@ static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + ktime_t start_time; + u64 scan_time; + int nr_to_walk; +- int ret, nr_shrunk = 0; ++ int nr_shrunk = 0; + int retried = 0, nr_skipped = 0; + + es_stats = &sbi->s_es_stats; +@@ -994,7 +993,7 @@ retry: + ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info, + i_es_list); + /* Move the 
inode to the tail */ +- list_move(&ei->i_es_list, sbi->s_es_list.prev); ++ list_move_tail(&ei->i_es_list, &sbi->s_es_list); + /* + * Normally we try hard to avoid shrinking precached inodes, + * but we will as a last resort. +@@ -1015,12 +1014,10 @@ retry: + */ + spin_unlock(&sbi->s_es_lock); + +- ret = __es_try_to_reclaim_extents(ei, nr_to_scan); ++ nr_shrunk += es_reclaim_extents(ei, &nr_to_scan); + write_unlock(&ei->i_es_lock); + +- nr_shrunk += ret; +- nr_to_scan -= ret; +- if (nr_to_scan == 0) ++ if (nr_to_scan <= 0) + goto out; + spin_lock(&sbi->s_es_lock); + } +@@ -1037,7 +1034,7 @@ retry: + } + + if (locked_ei && nr_shrunk == 0) +- nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); ++ nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan); + out: + scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + if (likely(es_stats->es_stats_scan_time)) +@@ -1213,27 +1210,32 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) + unregister_shrinker(&sbi->s_es_shrinker); + } + +-static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, +- int nr_to_scan) ++/* ++ * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at ++ * most *nr_to_scan extents, update *nr_to_scan accordingly. ++ * ++ * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan. ++ * Increment *nr_shrunk by the number of reclaimed extents. Also update ++ * ei->i_es_shrink_lblk to where we should continue scanning. 
++ */ ++static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end, ++ int *nr_to_scan, int *nr_shrunk) + { + struct inode *inode = &ei->vfs_inode; + struct ext4_es_tree *tree = &ei->i_es_tree; +- struct rb_node *node; + struct extent_status *es; +- int nr_shrunk = 0; +- static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, +- DEFAULT_RATELIMIT_BURST); +- +- if (ei->i_es_shk_nr == 0) +- return 0; +- +- if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && +- __ratelimit(&_rs)) +- ext4_warning(inode->i_sb, "forced shrink of precached extents"); ++ struct rb_node *node; + +- node = rb_first(&tree->root); +- while (node != NULL) { +- es = rb_entry(node, struct extent_status, rb_node); ++ es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk); ++ if (!es) ++ goto out_wrap; ++ node = &es->rb_node; ++ while (*nr_to_scan > 0) { ++ if (es->es_lblk > end) { ++ ei->i_es_shrink_lblk = end + 1; ++ return 0; ++ } ++ (*nr_to_scan)--; + node = rb_next(&es->rb_node); + /* + * We can't reclaim delayed extent from status tree because +@@ -1242,11 +1244,38 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, + if (!ext4_es_is_delayed(es)) { + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); +- nr_shrunk++; +- if (--nr_to_scan == 0) +- break; ++ (*nr_shrunk)++; + } ++ if (!node) ++ goto out_wrap; ++ es = rb_entry(node, struct extent_status, rb_node); + } +- tree->cache_es = NULL; ++ ei->i_es_shrink_lblk = es->es_lblk; ++ return 1; ++out_wrap: ++ ei->i_es_shrink_lblk = 0; ++ return 0; ++} ++ ++static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan) ++{ ++ struct inode *inode = &ei->vfs_inode; ++ int nr_shrunk = 0; ++ ext4_lblk_t start = ei->i_es_shrink_lblk; ++ static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, ++ DEFAULT_RATELIMIT_BURST); ++ ++ if (ei->i_es_shk_nr == 0) ++ return 0; ++ ++ if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && ++ __ratelimit(&_rs)) ++ 
ext4_warning(inode->i_sb, "forced shrink of precached extents"); ++ ++ if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) && ++ start != 0) ++ es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk); ++ ++ ei->i_es_tree.cache_es = NULL; + return nr_shrunk; + } +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index 8a81fa73..d9cd4ff9 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -945,6 +945,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) + INIT_LIST_HEAD(&ei->i_es_list); + ei->i_es_all_nr = 0; + ei->i_es_shk_nr = 0; ++ ei->i_es_shrink_lblk = 0; + ei->i_reserved_data_blocks = 0; + ei->i_reserved_meta_blocks = 0; + ei->i_allocated_meta_blocks = 0; +-- +2.24.1 + diff --git a/ldiskfs/kernel_patches/patches/rhel7.6/ext4-move-handling-of-list-of-shrinkable-inodes-into.patch b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-move-handling-of-list-of-shrinkable-inodes-into.patch new file mode 100644 index 0000000..6418198 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-move-handling-of-list-of-shrinkable-inodes-into.patch @@ -0,0 +1,147 @@ +From 8d5847463404eb2d6b24f748d521d1930a432da9 Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Tue, 25 Nov 2014 11:49:25 -0500 +Subject: [PATCH 4/7] ext4: move handling of list of shrinkable inodes into + extent status code + +Currently callers adding extents to extent status tree were responsible +for adding the inode to the list of inodes with freeable extents. This +is error prone and puts list handling in unnecessarily many places. + +Just add inode to the list automatically when the first non-delay extent +is added to the tree and remove inode from the list when the last +non-delay extent is removed. 
+ +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext4/extents.c | 2 -- + fs/ext4/extents_status.c | 10 ++++++---- + fs/ext4/extents_status.h | 2 -- + fs/ext4/inode.c | 2 -- + fs/ext4/ioctl.c | 2 -- + fs/ext4/super.c | 1 - + 6 files changed, 6 insertions(+), 13 deletions(-) + +diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c +index c012dc51..d9d51a5b 100644 +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -4689,7 +4689,6 @@ out2: + + trace_ext4_ext_map_blocks_exit(inode, flags, map, + err ? err : allocated); +- ext4_es_list_add(inode); + return err ? err : allocated; + } + +@@ -5263,7 +5262,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + error = ext4_fill_fiemap_extents(inode, start_blk, + len_blks, fieinfo); + } +- ext4_es_list_add(inode); + return error; + } + +diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c +index 382a7bf9..edd49793 100644 +--- a/fs/ext4/extents_status.c ++++ b/fs/ext4/extents_status.c +@@ -298,7 +298,7 @@ out: + trace_ext4_es_find_delayed_extent_range_exit(inode, es); + } + +-void ext4_es_list_add(struct inode *inode) ++static void ext4_es_list_add(struct inode *inode) + { + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); +@@ -314,7 +314,7 @@ void ext4_es_list_add(struct inode *inode) + spin_unlock(&sbi->s_es_lock); + } + +-void ext4_es_list_del(struct inode *inode) ++static void ext4_es_list_del(struct inode *inode) + { + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); +@@ -344,7 +344,8 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, + * We don't count delayed extent because we never try to reclaim them + */ + if (!ext4_es_is_delayed(es)) { +- EXT4_I(inode)->i_es_shk_nr++; ++ if (!EXT4_I(inode)->i_es_shk_nr++) ++ ext4_es_list_add(inode); + percpu_counter_inc(&EXT4_SB(inode->i_sb)-> + s_es_stats.es_stats_shk_cnt); + } +@@ -363,7 +364,8 @@ static 
void ext4_es_free_extent(struct inode *inode, struct extent_status *es) + /* Decrease the shrink counter when this es is not delayed */ + if (!ext4_es_is_delayed(es)) { + BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0); +- EXT4_I(inode)->i_es_shk_nr--; ++ if (!--EXT4_I(inode)->i_es_shk_nr) ++ ext4_es_list_del(inode); + percpu_counter_dec(&EXT4_SB(inode->i_sb)-> + s_es_stats.es_stats_shk_cnt); + } +diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h +index 0e6a33e8..b0b78b95 100644 +--- a/fs/ext4/extents_status.h ++++ b/fs/ext4/extents_status.h +@@ -150,7 +150,5 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es, + + extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); + extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); +-extern void ext4_es_list_add(struct inode *inode); +-extern void ext4_es_list_del(struct inode *inode); + + #endif /* _EXT4_EXTENTS_STATUS_H */ +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index f6a2764c..9bbdc9e5 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -523,7 +523,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, + + /* Lookup extent status tree firstly */ + if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { +- ext4_es_list_add(inode); + if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { + map->m_pblk = ext4_es_pblock(&es) + + map->m_lblk - es.es_lblk; +@@ -1519,7 +1518,6 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, + + /* Lookup extent status tree firstly */ + if (ext4_es_lookup_extent(inode, iblock, &es)) { +- ext4_es_list_add(inode); + if (ext4_es_is_hole(&es)) { + retval = 0; + down_read(&EXT4_I(inode)->i_data_sem); +diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c +index 122d517c..6a6a9588 100644 +--- a/fs/ext4/ioctl.c ++++ b/fs/ext4/ioctl.c +@@ -80,8 +80,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) + memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); + 
ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); + ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); +- ext4_es_list_del(inode1); +- ext4_es_list_del(inode2); + + isize = i_size_read(inode1); + i_size_write(inode1, i_size_read(inode2)); +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index ea2a1026..8a81fa73 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -1033,7 +1033,6 @@ void ext4_clear_inode(struct inode *inode) + dquot_drop(inode); + ext4_discard_preallocations(inode); + ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); +- ext4_es_list_del(inode); + if (EXT4_I(inode)->jinode) { + jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode); +-- +2.24.1 + diff --git a/ldiskfs/kernel_patches/patches/rhel7.6/ext4-remove-extent-status-procfs-files-if-journal-lo.patch b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-remove-extent-status-procfs-files-if-journal-lo.patch new file mode 100644 index 0000000..de226af --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-remove-extent-status-procfs-files-if-journal-lo.patch @@ -0,0 +1,49 @@ +From fabafc86567c2165c5b2165dcbf835edd6f81e72 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 30 Oct 2014 10:53:16 -0400 +Subject: [PATCH 2/7] ext4: remove extent status procfs files if journal load + fails + +If we can't load the journal, remove the procfs files for the extent +status information file to avoid leaking resources. + +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Theodore Ts'o +Cc: stable@vger.kernel.org +--- + fs/ext4/super.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index bcdb48cf..95a01d56 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -4326,7 +4326,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && + !(sb->s_flags & MS_RDONLY)) + if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) +- goto failed_mount3; ++ goto failed_mount3a; + + ext4_ext_init(sb); /* needed before using extent-mapped journal */ + +@@ -4338,7 +4338,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { + err = ext4_load_journal(sb, es, journal_devnum); + if (err) +- goto failed_mount3; ++ goto failed_mount3a; + } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && + EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { + ext4_msg(sb, KERN_ERR, "required journal recovery " +@@ -4635,6 +4635,7 @@ failed_mount_wq: + jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + } ++failed_mount3a: + ext4_es_unregister_shrinker(sbi); + failed_mount3: + del_timer_sync(&sbi->s_err_report); +-- +2.24.1 + diff --git a/ldiskfs/kernel_patches/patches/rhel7.6/ext4-track-extent-status-tree-shrinker-delay-statict.patch b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-track-extent-status-tree-shrinker-delay-statict.patch new file mode 100644 index 0000000..9cb4d18 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel7.6/ext4-track-extent-status-tree-shrinker-delay-statict.patch @@ -0,0 +1,464 @@ +From f33e0fa5ab6cad962d3b88376f4611b9aba1d030 Mon Sep 17 00:00:00 2001 +From: Wang Shilong +Date: Thu, 27 Feb 2020 17:08:04 +0800 +Subject: [PATCH 1/7] ext4: track extent status tree shrinker delay statictics + +This commit adds some statictics in extent 
status tree shrinker. The +purpose to add these is that we want to collect more details when we +encounter a stall caused by extent status tree shrinker. Here we count +the following statistics: + stats: + the number of all objects on all extent status trees + the number of reclaimable objects on lru list + cache hits/misses + the last sorted interval + the number of inodes on lru list + average: + scan time for shrinking some objects + the number of shrunk objects + maximum: + the inode that has max nr. of objects on lru list + the maximum scan time for shrinking some objects + +The output looks like below: + $ cat /proc/fs/ext4/sda1/es_shrinker_info + stats: + 28228 objects + 6341 reclaimable objects + 5281/631 cache hits/misses + 586 ms last sorted interval + 250 inodes on lru list + average: + 153 us scan time + 128 shrunk objects + maximum: + 255 inode (255 objects, 198 reclaimable) + 125723 us max scan time + +If the lru list has never been sorted, the following line will not be +printed: + 586 ms last sorted interval +If there is an empty lru list, the following lines also will not be +printed: + 250 inodes on lru list + ... + maximum: + 255 inode (255 objects, 198 reclaimable) + 0 us max scan time + +Meanwhile in this commit a new trace point is defined to print some +details in __ext4_es_shrink(). 
+ +[Shilong remove trace point parts of this patch] + +Cc: Andreas Dilger +Cc: Jan Kara +Reviewed-by: Jan Kara +Signed-off-by: Zheng Liu +Signed-off-by: Theodore Ts'o +--- + fs/ext4/ext4.h | 4 +- + fs/ext4/extents_status.c | 179 +++++++++++++++++++++++++++++++++++++-- + fs/ext4/extents_status.h | 13 ++- + fs/ext4/super.c | 13 +-- + 4 files changed, 187 insertions(+), 22 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 763276e2..cc5ba587 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1018,6 +1018,7 @@ struct ext4_inode_info { + struct ext4_es_tree i_es_tree; + rwlock_t i_es_lock; + struct list_head i_es_lru; ++ unsigned int i_es_all_nr; /* protected by i_es_lock */ + unsigned int i_es_lru_nr; /* protected by i_es_lock */ + unsigned long i_touch_when; /* jiffies of last accessing */ + +@@ -1482,8 +1483,7 @@ struct ext4_sb_info { + /* Reclaim extents from extent status tree */ + struct shrinker s_es_shrinker; + struct list_head s_es_lru; +- unsigned long s_es_last_sorted; +- struct percpu_counter s_extent_cache_cnt; ++ struct ext4_es_stats s_es_stats; + spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; + + /* Ratelimit ext4 messages. 
*/ +diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c +index 3ef7f932..7dfed27b 100644 +--- a/fs/ext4/extents_status.c ++++ b/fs/ext4/extents_status.c +@@ -11,6 +11,8 @@ + */ + #include + #include ++#include ++#include + #include "ext4.h" + #include "extents_status.h" + +@@ -313,19 +315,27 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, + */ + if (!ext4_es_is_delayed(es)) { + EXT4_I(inode)->i_es_lru_nr++; +- percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); ++ percpu_counter_inc(&EXT4_SB(inode->i_sb)-> ++ s_es_stats.es_stats_lru_cnt); + } + ++ EXT4_I(inode)->i_es_all_nr++; ++ percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); ++ + return es; + } + + static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) + { ++ EXT4_I(inode)->i_es_all_nr--; ++ percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); ++ + /* Decrease the lru counter when this es is not delayed */ + if (!ext4_es_is_delayed(es)) { + BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); + EXT4_I(inode)->i_es_lru_nr--; +- percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); ++ percpu_counter_dec(&EXT4_SB(inode->i_sb)-> ++ s_es_stats.es_stats_lru_cnt); + } + + kmem_cache_free(ext4_es_cachep, es); +@@ -739,6 +749,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es) + { + struct ext4_es_tree *tree; ++ struct ext4_es_stats *stats; + struct extent_status *es1 = NULL; + struct rb_node *node; + int found = 0; +@@ -775,11 +786,15 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + } + + out: ++ stats = &EXT4_SB(inode->i_sb)->s_es_stats; + if (found) { + BUG_ON(!es1); + es->es_lblk = es1->es_lblk; + es->es_len = es1->es_len; + es->es_pblk = es1->es_pblk; ++ stats->es_stats_cache_hits++; ++ } else { ++ stats->es_stats_cache_misses++; + } + + read_unlock(&EXT4_I(inode)->i_es_lock); +@@ -941,11 +956,16 @@ static int __ext4_es_shrink(struct 
ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei) + { + struct ext4_inode_info *ei; ++ struct ext4_es_stats *es_stats; + struct list_head *cur, *tmp; + LIST_HEAD(skipped); ++ ktime_t start_time; ++ u64 scan_time; + int ret, nr_shrunk = 0; + int retried = 0, skip_precached = 1, nr_skipped = 0; + ++ es_stats = &sbi->s_es_stats; ++ start_time = ktime_get(); + spin_lock(&sbi->s_es_lru_lock); + + retry: +@@ -954,7 +974,8 @@ retry: + * If we have already reclaimed all extents from extent + * status tree, just stop the loop immediately. + */ +- if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0) ++ if (percpu_counter_read_positive( ++ &es_stats->es_stats_lru_cnt) == 0) + break; + + ei = list_entry(cur, struct ext4_inode_info, i_es_lru); +@@ -964,7 +985,7 @@ retry: + * time. Normally we try hard to avoid shrinking + * precached inodes, but we will as a last resort. + */ +- if ((sbi->s_es_last_sorted < ei->i_touch_when) || ++ if ((es_stats->es_stats_last_sorted < ei->i_touch_when) || + (skip_precached && ext4_test_inode_state(&ei->vfs_inode, + EXT4_STATE_EXT_PRECACHED))) { + nr_skipped++; +@@ -998,7 +1019,7 @@ retry: + if ((nr_shrunk == 0) && nr_skipped && !retried) { + retried++; + list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); +- sbi->s_es_last_sorted = jiffies; ++ es_stats->es_stats_last_sorted = jiffies; + ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, + i_es_lru); + /* +@@ -1016,6 +1037,20 @@ retry: + if (locked_ei && nr_shrunk == 0) + nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); + ++ scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); ++ if (likely(es_stats->es_stats_scan_time)) ++ es_stats->es_stats_scan_time = (scan_time + ++ es_stats->es_stats_scan_time*3) / 4; ++ else ++ es_stats->es_stats_scan_time = scan_time; ++ if (scan_time > es_stats->es_stats_max_scan_time) ++ es_stats->es_stats_max_scan_time = scan_time; ++ if (likely(es_stats->es_stats_shrunk)) ++ 
es_stats->es_stats_shrunk = (nr_shrunk + ++ es_stats->es_stats_shrunk*3) / 4; ++ else ++ es_stats->es_stats_shrunk = nr_shrunk; ++ + return nr_shrunk; + } + +@@ -1026,7 +1061,7 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) + int nr_to_scan = sc->nr_to_scan; + int ret, nr_shrunk; + +- ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); ++ ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); + trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret); + + if (!nr_to_scan) +@@ -1034,23 +1069,149 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) + + nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); + +- ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); ++ ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); + trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); + return ret; + } + +-void ext4_es_register_shrinker(struct ext4_sb_info *sbi) ++static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos) ++{ ++ return *pos ? NULL : SEQ_START_TOKEN; ++} ++ ++static void * ++ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ return NULL; ++} ++ ++static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) ++{ ++ struct ext4_sb_info *sbi = seq->private; ++ struct ext4_es_stats *es_stats = &sbi->s_es_stats; ++ struct ext4_inode_info *ei, *max = NULL; ++ unsigned int inode_cnt = 0; ++ ++ if (v != SEQ_START_TOKEN) ++ return 0; ++ ++ /* here we just find an inode that has the max nr. 
of objects */ ++ spin_lock(&sbi->s_es_lru_lock); ++ list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) { ++ inode_cnt++; ++ if (max && max->i_es_all_nr < ei->i_es_all_nr) ++ max = ei; ++ else if (!max) ++ max = ei; ++ } ++ spin_unlock(&sbi->s_es_lru_lock); ++ ++ seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", ++ percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), ++ percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt)); ++ seq_printf(seq, " %lu/%lu cache hits/misses\n", ++ es_stats->es_stats_cache_hits, ++ es_stats->es_stats_cache_misses); ++ if (es_stats->es_stats_last_sorted != 0) ++ seq_printf(seq, " %u ms last sorted interval\n", ++ jiffies_to_msecs(jiffies - ++ es_stats->es_stats_last_sorted)); ++ if (inode_cnt) ++ seq_printf(seq, " %d inodes on lru list\n", inode_cnt); ++ ++ seq_printf(seq, "average:\n %llu us scan time\n", ++ div_u64(es_stats->es_stats_scan_time, 1000)); ++ seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk); ++ if (inode_cnt) ++ seq_printf(seq, ++ "maximum:\n %lu inode (%u objects, %u reclaimable)\n" ++ " %llu us max scan time\n", ++ max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr, ++ div_u64(es_stats->es_stats_max_scan_time, 1000)); ++ ++ return 0; ++} ++ ++static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static const struct seq_operations ext4_es_seq_shrinker_info_ops = { ++ .start = ext4_es_seq_shrinker_info_start, ++ .next = ext4_es_seq_shrinker_info_next, ++ .stop = ext4_es_seq_shrinker_info_stop, ++ .show = ext4_es_seq_shrinker_info_show, ++}; ++ ++static int ++ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file) ++{ ++ int ret; ++ ++ ret = seq_open(file, &ext4_es_seq_shrinker_info_ops); ++ if (!ret) { ++ struct seq_file *m = file->private_data; ++ m->private = PDE_DATA(inode); ++ } ++ ++ return ret; ++} ++ ++static int ++ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file) + { ++ return 
seq_release(inode, file); ++} ++ ++static const struct file_operations ext4_es_seq_shrinker_info_fops = { ++ .owner = THIS_MODULE, ++ .open = ext4_es_seq_shrinker_info_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = ext4_es_seq_shrinker_info_release, ++}; ++ ++int ext4_es_register_shrinker(struct ext4_sb_info *sbi) ++{ ++ int err; ++ + INIT_LIST_HEAD(&sbi->s_es_lru); + spin_lock_init(&sbi->s_es_lru_lock); +- sbi->s_es_last_sorted = 0; ++ sbi->s_es_stats.es_stats_last_sorted = 0; ++ sbi->s_es_stats.es_stats_shrunk = 0; ++ sbi->s_es_stats.es_stats_cache_hits = 0; ++ sbi->s_es_stats.es_stats_cache_misses = 0; ++ sbi->s_es_stats.es_stats_scan_time = 0; ++ sbi->s_es_stats.es_stats_max_scan_time = 0; ++ err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, ++ 0, GFP_KERNEL); ++ if (err) ++ return err; ++ err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, ++ 0, GFP_KERNEL); ++ if (err) ++ goto err; + sbi->s_es_shrinker.shrink = ext4_es_shrink; + sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; + register_shrinker(&sbi->s_es_shrinker); ++ ++ if (sbi->s_proc) ++ proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc, ++ &ext4_es_seq_shrinker_info_fops, sbi); ++ ++ return 0; ++ ++err: ++ percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); ++ return err; + } + + void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) + { ++ if (sbi->s_proc) ++ remove_proc_entry("es_shrinker_info", sbi->s_proc); ++ percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); ++ percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); + unregister_shrinker(&sbi->s_es_shrinker); + } + +diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h +index f1b62a41..efd5f970 100644 +--- a/fs/ext4/extents_status.h ++++ b/fs/ext4/extents_status.h +@@ -64,6 +64,17 @@ struct ext4_es_tree { + struct extent_status *cache_es; /* recently accessed extent */ + }; + ++struct ext4_es_stats { ++ unsigned long es_stats_last_sorted; ++ unsigned long es_stats_shrunk; ++ 
unsigned long es_stats_cache_hits; ++ unsigned long es_stats_cache_misses; ++ u64 es_stats_scan_time; ++ u64 es_stats_max_scan_time; ++ struct percpu_counter es_stats_all_cnt; ++ struct percpu_counter es_stats_lru_cnt; ++}; ++ + extern int __init ext4_init_es(void); + extern void ext4_exit_es(void); + extern void ext4_es_init_tree(struct ext4_es_tree *tree); +@@ -138,7 +149,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es, + (pb & ~ES_MASK)); + } + +-extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); ++extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); + extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); + extern void ext4_es_lru_add(struct inode *inode); + extern void ext4_es_lru_del(struct inode *inode); +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index 18fe358c..bcdb48cf 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -880,7 +880,6 @@ static void ext4_put_super(struct super_block *sb) + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); +- percpu_counter_destroy(&sbi->s_extent_cache_cnt); + #ifdef CONFIG_QUOTA + for (i = 0; i < EXT4_MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +@@ -944,6 +943,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) + ext4_es_init_tree(&ei->i_es_tree); + rwlock_init(&ei->i_es_lock); + INIT_LIST_HEAD(&ei->i_es_lru); ++ ei->i_es_all_nr = 0; + ei->i_es_lru_nr = 0; + ei->i_touch_when = 0; + ei->i_reserved_data_blocks = 0; +@@ -4289,14 +4289,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) + sbi->s_err_report.function = print_daily_error_info; + sbi->s_err_report.data = (unsigned long) sb; + +- /* Register extent status tree shrinker */ +- ext4_es_register_shrinker(sbi); +- +- err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL); +- if (err) { +- ext4_msg(sb, KERN_ERR, "insufficient memory"); 
++ if (ext4_es_register_shrinker(sbi)) + goto failed_mount3; +- } + + sbi->s_stripe = ext4_get_stripe_size(sbi); + sbi->s_extent_max_zeroout_kb = 32; +@@ -4641,10 +4635,9 @@ failed_mount_wq: + jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + } +-failed_mount3: + ext4_es_unregister_shrinker(sbi); ++failed_mount3: + del_timer_sync(&sbi->s_err_report); +- percpu_counter_destroy(&sbi->s_extent_cache_cnt); + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); + failed_mount2: +-- +2.24.1 + diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.6.series b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.6.series index 45cd224..97651e9 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.6.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.6.series @@ -40,3 +40,10 @@ rhel7/ext4-optimize-ext4_find_delalloc_range-in-nodelalloc.patch rhel7.2/ext4-simple-blockalloc.patch rhel7/ext4-mballoc-skip-uninit-groups-cr0.patch rhel7/ext4-mballoc-prefetch.patch +rhel7.6/ext4-track-extent-status-tree-shrinker-delay-statict.patch +rhel7.6/ext4-remove-extent-status-procfs-files-if-journal-lo.patch +rhel7.6/ext4-change-LRU-to-round-robin-in-extent-status-tree.patch +rhel7.6/ext4-move-handling-of-list-of-shrinkable-inodes-into.patch +rhel7.6/ext4-limit-number-of-scanned-extents-in-status-tree-.patch +rhel7.6/ext4-cleanup-flag-definitions-for-extent-status-tree.patch +rhel7.6/ext4-introduce-aging-to-extent-status-tree.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.7.series b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.7.series index 0dc2498..70220a2 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.7.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.7.series @@ -40,3 +40,10 @@ rhel7.7/ext4-fix-project-with-unpatched-kernel.patch rhel7.2/ext4-simple-blockalloc.patch rhel7/ext4-mballoc-skip-uninit-groups-cr0.patch rhel7.7/ext4-mballoc-prefetch.patch 
+rhel7.6/ext4-track-extent-status-tree-shrinker-delay-statict.patch +rhel7.6/ext4-remove-extent-status-procfs-files-if-journal-lo.patch +rhel7.6/ext4-change-LRU-to-round-robin-in-extent-status-tree.patch +rhel7.6/ext4-move-handling-of-list-of-shrinkable-inodes-into.patch +rhel7.6/ext4-limit-number-of-scanned-extents-in-status-tree-.patch +rhel7.6/ext4-cleanup-flag-definitions-for-extent-status-tree.patch +rhel7.6/ext4-introduce-aging-to-extent-status-tree.patch -- 1.8.3.1