From 4fac310a77c918d6a235a55cb76cf2f9bb22de71 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Tue, 25 Nov 2014 11:45:37 -0500 Subject: [PATCH 3/7] ext4: change LRU to round-robin in extent status tree shrinker In this commit we discard the lru algorithm for inodes with extent status tree because it takes significant effort to maintain a lru list in extent status tree shrinker and the shrinker can take a long time to scan this lru list in order to reclaim some objects. We replace the lru ordering with a simple round-robin. After that we never need to keep a lru list. That means that the list needn't be sorted if the shrinker can not reclaim any objects in the first round. Cc: Andreas Dilger Signed-off-by: Zheng Liu Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 10 +- fs/ext4/extents.c | 4 +- fs/ext4/extents_status.c | 221 +++++++++++++++++---------------------- fs/ext4/extents_status.h | 7 +- fs/ext4/inode.c | 4 +- fs/ext4/ioctl.c | 4 +- fs/ext4/super.c | 7 +- 7 files changed, 112 insertions(+), 145 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cc5ba587..0813afd6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1017,10 +1017,9 @@ struct ext4_inode_info { /* extents status tree */ struct ext4_es_tree i_es_tree; rwlock_t i_es_lock; - struct list_head i_es_lru; + struct list_head i_es_list; unsigned int i_es_all_nr; /* protected by i_es_lock */ - unsigned int i_es_lru_nr; /* protected by i_es_lock */ - unsigned long i_touch_when; /* jiffies of last accessing */ + unsigned int i_es_shk_nr; /* protected by i_es_lock */ /* ialloc */ ext4_group_t i_last_alloc_group; @@ -1482,9 +1481,10 @@ struct ext4_sb_info { /* Reclaim extents from extent status tree */ struct shrinker s_es_shrinker; - struct list_head s_es_lru; + struct list_head s_es_list; + long s_es_nr_inode; struct ext4_es_stats s_es_stats; - spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; + spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Ratelimit ext4 messages. */ struct ratelimit_state s_err_ratelimit_state; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f618d0ba..c012dc51 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4689,7 +4689,7 @@ out2: trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); - ext4_es_lru_add(inode); + ext4_es_list_add(inode); return err ? err : allocated; } @@ -5263,7 +5263,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, error = ext4_fill_fiemap_extents(inode, start_blk, len_blks, fieinfo); } - ext4_es_lru_add(inode); + ext4_es_list_add(inode); return error; } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 7dfed27b..382a7bf9 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -149,8 +149,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end); static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, int nr_to_scan); -static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, - struct ext4_inode_info *locked_ei); +static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei); int __init ext4_init_es(void) { @@ -298,6 +298,36 @@ out: trace_ext4_es_find_delayed_extent_range_exit(inode, es); } +void ext4_es_list_add(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (!list_empty(&ei->i_es_list)) + return; + + spin_lock(&sbi->s_es_lock); + if (list_empty(&ei->i_es_list)) { + list_add_tail(&ei->i_es_list, &sbi->s_es_list); + sbi->s_es_nr_inode++; + } + spin_unlock(&sbi->s_es_lock); +} + +void ext4_es_list_del(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + spin_lock(&sbi->s_es_lock); + if (!list_empty(&ei->i_es_list)) { + list_del_init(&ei->i_es_list); + sbi->s_es_nr_inode--; + WARN_ON_ONCE(sbi->s_es_nr_inode < 0); + } + spin_unlock(&sbi->s_es_lock); +} + static struct extent_status * ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk) @@ -314,9 +344,9 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, * We don't count delayed extent because we never try to reclaim them */ if (!ext4_es_is_delayed(es)) { - EXT4_I(inode)->i_es_lru_nr++; + EXT4_I(inode)->i_es_shk_nr++; percpu_counter_inc(&EXT4_SB(inode->i_sb)-> - s_es_stats.es_stats_lru_cnt); + s_es_stats.es_stats_shk_cnt); } EXT4_I(inode)->i_es_all_nr++; @@ -330,12 +360,12 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) EXT4_I(inode)->i_es_all_nr--; percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); - /* Decrease the lru counter when this es is not delayed */ + /* Decrease the shrink counter when this es is not delayed */ if (!ext4_es_is_delayed(es)) { - BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); - EXT4_I(inode)->i_es_lru_nr--; + BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0); + EXT4_I(inode)->i_es_shk_nr--; percpu_counter_dec(&EXT4_SB(inode->i_sb)-> - s_es_stats.es_stats_lru_cnt); + s_es_stats.es_stats_shk_cnt); } kmem_cache_free(ext4_es_cachep, es); @@ -693,8 +723,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, goto error; retry: err = __es_insert_extent(inode, &newes); - if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, - EXT4_I(inode))) + if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), + 1, EXT4_I(inode))) goto retry; if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) err = 0; @@ -851,8 +881,8 @@ retry: es->es_lblk = orig_es.es_lblk; es->es_len = orig_es.es_len; if ((err == -ENOMEM) && - __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, - EXT4_I(inode))) + __es_shrink(EXT4_SB(inode->i_sb), + 1, EXT4_I(inode))) goto retry; goto out; } @@ -924,6 +954,11 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, end = lblk + len - 1; BUG_ON(end < lblk); + /* + * ext4_clear_inode() depends on us taking i_es_lock unconditionally + * so that we are sure __es_shrink() is done with the inode before it + * is reclaimed. + */ write_lock(&EXT4_I(inode)->i_es_lock); err = __es_remove_extent(inode, lblk, end); write_unlock(&EXT4_I(inode)->i_es_lock); @@ -931,112 +966,77 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, return err; } -static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, - struct list_head *b) -{ - struct ext4_inode_info *eia, *eib; - eia = list_entry(a, struct ext4_inode_info, i_es_lru); - eib = list_entry(b, struct ext4_inode_info, i_es_lru); - - if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && - !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) - return 1; - if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && - ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) - return -1; - if (eia->i_touch_when == eib->i_touch_when) - return 0; - if (time_after(eia->i_touch_when, eib->i_touch_when)) - return 1; - else - return -1; -} - -static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, - struct ext4_inode_info *locked_ei) +static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei) { struct ext4_inode_info *ei; struct ext4_es_stats *es_stats; - struct list_head *cur, *tmp; - LIST_HEAD(skipped); ktime_t start_time; u64 scan_time; + int nr_to_walk; int ret, nr_shrunk = 0; - int retried = 0, skip_precached = 1, nr_skipped = 0; + int retried = 0, nr_skipped = 0; es_stats = &sbi->s_es_stats; start_time = ktime_get(); - spin_lock(&sbi->s_es_lru_lock); retry: - list_for_each_safe(cur, tmp, &sbi->s_es_lru) { - /* - * If we have already reclaimed all extents from extent - * status tree, just stop the loop immediately. - */ - if (percpu_counter_read_positive( - &es_stats->es_stats_lru_cnt) == 0) - break; - - ei = list_entry(cur, struct ext4_inode_info, i_es_lru); + spin_lock(&sbi->s_es_lock); + nr_to_walk = sbi->s_es_nr_inode; + while (nr_to_walk-- > 0) { + if (list_empty(&sbi->s_es_list)) { + spin_unlock(&sbi->s_es_lock); + goto out; + } + ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info, + i_es_list); + /* Move the inode to the tail */ + list_move(&ei->i_es_list, sbi->s_es_list.prev); /* - * Skip the inode that is newer than the last_sorted - * time. Normally we try hard to avoid shrinking - * precached inodes, but we will as a last resort. + * Normally we try hard to avoid shrinking precached inodes, + * but we will as a last resort. */ - if ((es_stats->es_stats_last_sorted < ei->i_touch_when) || - (skip_precached && ext4_test_inode_state(&ei->vfs_inode, - EXT4_STATE_EXT_PRECACHED))) { + if (!retried && ext4_test_inode_state(&ei->vfs_inode, + EXT4_STATE_EXT_PRECACHED)) { nr_skipped++; - list_move_tail(cur, &skipped); continue; } - if (ei->i_es_lru_nr == 0 || ei == locked_ei || - !write_trylock(&ei->i_es_lock)) - continue; + if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) { + nr_skipped++; + continue; + } + /* + * Now we hold i_es_lock which protects us from inode reclaim + * freeing inode under us + */ + spin_unlock(&sbi->s_es_lock); ret = __es_try_to_reclaim_extents(ei, nr_to_scan); - if (ei->i_es_lru_nr == 0) - list_del_init(&ei->i_es_lru); write_unlock(&ei->i_es_lock); nr_shrunk += ret; nr_to_scan -= ret; if (nr_to_scan == 0) - break; + goto out; + spin_lock(&sbi->s_es_lock); } - /* Move the newer inodes into the tail of the LRU list. */ - list_splice_tail(&skipped, &sbi->s_es_lru); - INIT_LIST_HEAD(&skipped); + spin_unlock(&sbi->s_es_lock); /* * If we skipped any inodes, and we weren't able to make any - * forward progress, sort the list and try again. + * forward progress, try again to scan precached inodes. */ if ((nr_shrunk == 0) && nr_skipped && !retried) { retried++; - list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); - es_stats->es_stats_last_sorted = jiffies; - ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, - i_es_lru); - /* - * If there are no non-precached inodes left on the - * list, start releasing precached extents. - */ - if (ext4_test_inode_state(&ei->vfs_inode, - EXT4_STATE_EXT_PRECACHED)) - skip_precached = 0; goto retry; } - spin_unlock(&sbi->s_es_lru_lock); - if (locked_ei && nr_shrunk == 0) nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); - +out: scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); if (likely(es_stats->es_stats_scan_time)) es_stats->es_stats_scan_time = (scan_time + @@ -1061,15 +1061,15 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk; - ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); + ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret); if (!nr_to_scan) return ret; - nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); + nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL); - ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); + ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); return ret; } @@ -1096,28 +1096,24 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) return 0; /* here we just find an inode that has the max nr. of objects */ - spin_lock(&sbi->s_es_lru_lock); - list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) { + spin_lock(&sbi->s_es_lock); + list_for_each_entry(ei, &sbi->s_es_list, i_es_list) { inode_cnt++; if (max && max->i_es_all_nr < ei->i_es_all_nr) max = ei; else if (!max) max = ei; } - spin_unlock(&sbi->s_es_lru_lock); + spin_unlock(&sbi->s_es_lock); seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), - percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt)); + percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt)); seq_printf(seq, " %lu/%lu cache hits/misses\n", es_stats->es_stats_cache_hits, es_stats->es_stats_cache_misses); - if (es_stats->es_stats_last_sorted != 0) - seq_printf(seq, " %u ms last sorted interval\n", - jiffies_to_msecs(jiffies - - es_stats->es_stats_last_sorted)); if (inode_cnt) - seq_printf(seq, " %d inodes on lru list\n", inode_cnt); + seq_printf(seq, " %d inodes on list\n", inode_cnt); seq_printf(seq, "average:\n %llu us scan time\n", div_u64(es_stats->es_stats_scan_time, 1000)); @@ -1126,7 +1122,7 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) seq_printf(seq, "maximum:\n %lu inode (%u objects, %u reclaimable)\n" " %llu us max scan time\n", - max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr, + max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr, div_u64(es_stats->es_stats_max_scan_time, 1000)); return 0; @@ -1175,9 +1171,9 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) { int err; - INIT_LIST_HEAD(&sbi->s_es_lru); - spin_lock_init(&sbi->s_es_lru_lock); - sbi->s_es_stats.es_stats_last_sorted = 0; + INIT_LIST_HEAD(&sbi->s_es_list); + sbi->s_es_nr_inode = 0; + spin_lock_init(&sbi->s_es_lock); sbi->s_es_stats.es_stats_shrunk = 0; sbi->s_es_stats.es_stats_cache_hits = 0; sbi->s_es_stats.es_stats_cache_misses = 0; @@ -1187,7 +1183,7 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) 0, GFP_KERNEL); if (err) return err; - err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, + err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL); if (err) goto err; @@ -1211,37 +1207,10 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) if (sbi->s_proc) remove_proc_entry("es_shrinker_info", sbi->s_proc); percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); - percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); unregister_shrinker(&sbi->s_es_shrinker); } -void ext4_es_lru_add(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - - ei->i_touch_when = jiffies; - - if (!list_empty(&ei->i_es_lru)) - return; - - spin_lock(&sbi->s_es_lru_lock); - if (list_empty(&ei->i_es_lru)) - list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); - spin_unlock(&sbi->s_es_lru_lock); -} - -void ext4_es_lru_del(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - - spin_lock(&sbi->s_es_lru_lock); - if (!list_empty(&ei->i_es_lru)) - list_del_init(&ei->i_es_lru); - spin_unlock(&sbi->s_es_lru_lock); -} - static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, int nr_to_scan) { @@ -1253,7 +1222,7 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - if (ei->i_es_lru_nr == 0) + if (ei->i_es_shk_nr == 0) return 0; if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index efd5f970..0e6a33e8 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -65,14 +65,13 @@ struct ext4_es_tree { }; struct ext4_es_stats { - unsigned long es_stats_last_sorted; unsigned long es_stats_shrunk; unsigned long es_stats_cache_hits; unsigned long es_stats_cache_misses; u64 es_stats_scan_time; u64 es_stats_max_scan_time; struct percpu_counter es_stats_all_cnt; - struct percpu_counter es_stats_lru_cnt; + struct percpu_counter es_stats_shk_cnt; }; extern int __init ext4_init_es(void); @@ -151,7 +150,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es, extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); -extern void ext4_es_lru_add(struct inode *inode); -extern void ext4_es_lru_del(struct inode *inode); +extern void ext4_es_list_add(struct inode *inode); +extern void ext4_es_list_del(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 21db5952..f6a2764c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -523,7 +523,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { - ext4_es_lru_add(inode); + ext4_es_list_add(inode); if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -1519,7 +1519,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, iblock, &es)) { - ext4_es_lru_add(inode); + ext4_es_list_add(inode); if (ext4_es_is_hole(&es)) { retval = 0; down_read(&EXT4_I(inode)->i_data_sem); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 858cf709..122d517c 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -80,8 +80,8 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); - ext4_es_lru_del(inode1); - ext4_es_lru_del(inode2); + ext4_es_list_del(inode1); + ext4_es_list_del(inode2); isize = i_size_read(inode1); i_size_write(inode1, i_size_read(inode2)); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 95a01d56..ea2a1026 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -942,10 +942,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) spin_lock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); - INIT_LIST_HEAD(&ei->i_es_lru); + INIT_LIST_HEAD(&ei->i_es_list); ei->i_es_all_nr = 0; - ei->i_es_lru_nr = 0; - ei->i_touch_when = 0; + ei->i_es_shk_nr = 0; ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; @@ -1034,7 +1033,7 @@ void ext4_clear_inode(struct inode *inode) dquot_drop(inode); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); - ext4_es_lru_del(inode); + ext4_es_list_del(inode); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); -- 2.24.1