From 9e7aa04af2591ae0ee942e0d72ac8a296a59b798 Mon Sep 17 00:00:00 2001 From: Shaun Tancheff Date: Wed, 15 Jan 2020 06:22:13 -0600 Subject: [PATCH] LU-13141 ldiskfs: block alloc performance patch Add block alloc performance patch to CentOS 7.7, 8.0 and Ubuntu 19.04 5.0 kernel. Cray-bug-id: LUS-8402 Signed-off-by: Shaun Tancheff Change-Id: Ifeb78839e5dbe8731bbb5532906708b97d4d9d33 Reviewed-on: https://review.whamcloud.com/37250 Tested-by: jenkins Reviewed-by: Artem Blagodarenko Reviewed-by: Petros Koutoupis Tested-by: Maloo Reviewed-by: Li Dongyang Reviewed-by: Oleg Drokin --- .../patches/rhel8/ext4-simple-blockalloc.patch | 345 +++++++++++++++++++++ .../series/ldiskfs-3.10-rhel7.7.series | 1 + .../series/ldiskfs-4.18-rhel8.series | 1 + .../series/ldiskfs-5.0.0-13-ubuntu19.series | 1 + 4 files changed, 348 insertions(+) create mode 100644 ldiskfs/kernel_patches/patches/rhel8/ext4-simple-blockalloc.patch diff --git a/ldiskfs/kernel_patches/patches/rhel8/ext4-simple-blockalloc.patch b/ldiskfs/kernel_patches/patches/rhel8/ext4-simple-blockalloc.patch new file mode 100644 index 0000000..1a3ae06 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel8/ext4-simple-blockalloc.patch @@ -0,0 +1,345 @@ +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 3b9ec24..64dc5fd 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1450,6 +1450,9 @@ struct ext4_sb_info { + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; ++ ext4_fsblk_t s_mb_c1_blocks; ++ ext4_fsblk_t s_mb_c2_blocks; ++ ext4_fsblk_t s_mb_c3_blocks; + unsigned long *s_mb_prealloc_table; + unsigned int s_mb_group_prealloc; + unsigned int s_max_dir_size_kb; +@@ -1466,6 +1469,9 @@ struct ext4_sb_info { + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ ++ /* cX loop didn't find blocks */ ++ atomic64_t s_bal_cX_failed[3]; ++ atomic64_t s_bal_cX_skipped[3]; + spinlock_t s_bal_lock; + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; +@@ -2563,6 +2569,7 @@ extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); + /* mballoc.c */ + extern const struct file_operations ext4_seq_prealloc_table_fops; + extern const struct seq_operations ext4_mb_seq_groups_ops; ++extern const struct file_operations ext4_mb_seq_alloc_fops; + extern const struct file_operations ext4_seq_mb_last_group_fops; + extern int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v); + extern long ext4_mb_stats; +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index 15c962f..7870406 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -2104,6 +2104,20 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, + return 0; + } + ++static u64 available_blocks_count(struct ext4_sb_info *sbi) ++{ ++ ext4_fsblk_t resv_blocks; ++ u64 bfree; ++ struct ext4_super_block *es = sbi->s_es; ++ ++ resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); ++ bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - ++ percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); ++ ++ bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); ++ return bfree - (ext4_r_blocks_count(es) + resv_blocks); ++} ++ + static noinline_for_stack int + ext4_mb_regular_allocator(struct ext4_allocation_context *ac) + { +@@ -2113,6 +2127,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) + struct ext4_sb_info *sbi; + struct super_block *sb; + struct ext4_buddy e4b; ++ ext4_fsblk_t avail_blocks; + + sb = ac->ac_sb; + sbi = EXT4_SB(sb); +@@ -2165,6 +2180,21 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) + + /* Let's just scan groups to find more-less suitable blocks */ + cr = ac->ac_2order ? 0 : 1; ++ ++ /* Choose what loop to pass based on disk fullness */ ++ avail_blocks = available_blocks_count(sbi) ; ++ ++ if (avail_blocks < sbi->s_mb_c3_blocks) { ++ cr = 3; ++ atomic64_inc(&sbi->s_bal_cX_skipped[2]); ++ } else if(avail_blocks < sbi->s_mb_c2_blocks) { ++ cr = 2; ++ atomic64_inc(&sbi->s_bal_cX_skipped[1]); ++ } else if(avail_blocks < sbi->s_mb_c1_blocks) { ++ cr = 1; ++ atomic64_inc(&sbi->s_bal_cX_skipped[0]); ++ } ++ + /* + * cr == 0 try to get exact allocation, + * cr == 3 try to get anything +@@ -2230,6 +2260,9 @@ repeat: + if (ac->ac_status != AC_STATUS_CONTINUE) + break; + } ++ /* Processed all groups and haven't found blocks */ ++ if (i == ngroups) ++ atomic64_inc(&sbi->s_bal_cX_failed[cr]); + } + + if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && +@@ -2510,6 +2543,93 @@ const struct file_operations ext4_seq_mb_last_group_fops = { + .write = ext4_mb_last_group_write, + }; + ++static int mb_seq_alloc_show(struct seq_file *seq, void *v) ++{ ++ struct super_block *sb = seq->private; ++ struct ext4_sb_info *sbi = EXT4_SB(sb); ++ ++ seq_printf(seq, "mballoc:\n"); ++ seq_printf(seq, "\tblocks: %u\n", atomic_read(&sbi->s_bal_allocated)); ++ seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); ++ seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); ++ ++ seq_printf(seq, "\textents_scanned: %u\n", ++ atomic_read(&sbi->s_bal_ex_scanned)); ++ seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); ++ seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); ++ seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); ++ seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); ++ ++ seq_printf(seq, "\tuseless_c1_loops: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_failed[0])); ++ seq_printf(seq, "\tuseless_c2_loops: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_failed[1])); ++ seq_printf(seq, "\tuseless_c3_loops: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_failed[2])); ++ seq_printf(seq, "\tskipped_c1_loops: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_skipped[0])); ++ seq_printf(seq, "\tskipped_c2_loops: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_skipped[1])); ++ seq_printf(seq, "\tskipped_c3_loops: %llu\n", ++ atomic64_read(&sbi->s_bal_cX_skipped[2])); ++ seq_printf(seq, "\tbuddies_generated: %lu\n", ++ sbi->s_mb_buddies_generated); ++ seq_printf(seq, "\tbuddies_time_used: %llu\n", sbi->s_mb_generation_time); ++ seq_printf(seq, "\tpreallocated: %u\n", ++ atomic_read(&sbi->s_mb_preallocated)); ++ seq_printf(seq, "\tdiscarded: %u\n", ++ atomic_read(&sbi->s_mb_discarded)); ++ return 0; ++} ++ ++static ssize_t mb_seq_alloc_write(struct file *file, ++ const char __user *buf, ++ size_t cnt, loff_t *pos) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file))); ++ ++ atomic_set(&sbi->s_bal_allocated, 0), ++ atomic_set(&sbi->s_bal_reqs, 0), ++ atomic_set(&sbi->s_bal_success, 0); ++ ++ atomic_set(&sbi->s_bal_ex_scanned, 0), ++ atomic_set(&sbi->s_bal_goals, 0), ++ atomic_set(&sbi->s_bal_2orders, 0), ++ atomic_set(&sbi->s_bal_breaks, 0), ++ atomic_set(&sbi->s_mb_lost_chunks, 0); ++ ++ atomic64_set(&sbi->s_bal_cX_failed[0], 0), ++ atomic64_set(&sbi->s_bal_cX_failed[1], 0), ++ atomic64_set(&sbi->s_bal_cX_failed[2], 0); ++ ++ atomic64_set(&sbi->s_bal_cX_skipped[0], 0), ++ atomic64_set(&sbi->s_bal_cX_skipped[1], 0), ++ atomic64_set(&sbi->s_bal_cX_skipped[2], 0); ++ ++ ++ sbi->s_mb_buddies_generated = 0; ++ sbi->s_mb_generation_time = 0; ++ ++ atomic_set(&sbi->s_mb_preallocated, 0), ++ atomic_set(&sbi->s_mb_discarded, 0); ++ ++ return cnt; ++} ++ ++static int mb_seq_alloc_open(struct inode *inode, struct file *file) ++{ ++ return single_open(file, mb_seq_alloc_show, PDE_DATA(inode)); ++} ++ ++const struct file_operations ext4_mb_seq_alloc_fops = { ++ .owner = THIS_MODULE, ++ .open = mb_seq_alloc_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = single_release, ++ .write = mb_seq_alloc_write, ++}; ++ + int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v) + { + struct ext4_sb_info *sbi = EXT4_SB(m->private); +@@ -2734,6 +2854,7 @@ static int ext4_groupinfo_create_slab(size_t size) + return 0; + } + ++#define THRESHOLD_BLOCKS(ts) (ext4_blocks_count(sbi->s_es) / 100 * ts) + int ext4_mb_init(struct super_block *sb) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); +@@ -2787,6 +2908,9 @@ int ext4_mb_init(struct super_block *sb) + sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; + sbi->s_mb_stats = MB_DEFAULT_STATS; + sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; ++ sbi->s_mb_c1_blocks = THRESHOLD_BLOCKS(MB_DEFAULT_C1_THRESHOLD); ++ sbi->s_mb_c2_blocks = THRESHOLD_BLOCKS(MB_DEFAULT_C2_THRESHOLD); ++ sbi->s_mb_c3_blocks = THRESHOLD_BLOCKS(MB_DEFAULT_C3_THRESHOLD); + /* + * The default group preallocation is 512, which for 4k block + * sizes translates to 2 megabytes. However for bigalloc file +@@ -2922,6 +3046,16 @@ int ext4_mb_release(struct super_block *sb) + atomic_read(&sbi->s_bal_allocated), + atomic_read(&sbi->s_bal_reqs), + atomic_read(&sbi->s_bal_success)); ++ ext4_msg(sb, KERN_INFO, ++ "mballoc: (%llu, %llu, %llu) useless c(0,1,2) loops", ++ atomic64_read(&sbi->s_bal_cX_failed[0]), ++ atomic64_read(&sbi->s_bal_cX_failed[1]), ++ atomic64_read(&sbi->s_bal_cX_failed[2])); ++ ext4_msg(sb, KERN_INFO, ++ "mballoc: (%llu, %llu, %llu) skipped c(0,1,2) loops", ++ atomic64_read(&sbi->s_bal_cX_skipped[0]), ++ atomic64_read(&sbi->s_bal_cX_skipped[1]), ++ atomic64_read(&sbi->s_bal_cX_skipped[2])); + ext4_msg(sb, KERN_INFO, + "mballoc: %u extents scanned, %u goal hits, " + "%u 2^N hits, %u breaks, %u lost", +diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h +index e00c3b7..d02daaf 100644 +--- a/fs/ext4/mballoc.h ++++ b/fs/ext4/mballoc.h +@@ -72,6 +72,9 @@ do { \ + * for which requests use 2^N search using buddies + */ + #define MB_DEFAULT_ORDER2_REQS 8 ++#define MB_DEFAULT_C1_THRESHOLD 25 ++#define MB_DEFAULT_C2_THRESHOLD 15 ++#define MB_DEFAULT_C3_THRESHOLD 5 + + /* + * default group prealloc size 512 blocks +diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c +index 417b33a..f49821e 100644 +--- a/fs/ext4/sysfs.c ++++ b/fs/ext4/sysfs.c +@@ -20,6 +20,9 @@ + typedef enum { + attr_noop, + attr_delayed_allocation_blocks, ++ attr_mb_c1_threshold, ++ attr_mb_c2_threshold, ++ attr_mb_c3_threshold, + attr_session_write_kbytes, + attr_lifetime_write_kbytes, + attr_reserved_clusters, +@@ -134,6 +137,32 @@ static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf) + task_pid_vnr(sbi->s_journal->j_task)); + } + ++#define THRESHOLD_PERCENT(ts) (ts * 100 / ext4_blocks_count(sbi->s_es)) ++ ++static int save_threshold_percent(struct ext4_sb_info *sbi, const char *buf, ++ ext4_fsblk_t *blocks) ++{ ++ unsigned long long val; ++ ++ int ret; ++ ++ ret = kstrtoull(skip_spaces(buf), 0, &val); ++ if (ret || val > 100) ++ return -EINVAL; ++ ++ *blocks = val * ext4_blocks_count(sbi->s_es) / 100; ++ return 0; ++} ++ ++static ssize_t mb_threshold_store(struct ext4_sb_info *sbi, ++ const char *buf, size_t count, ++ ext4_fsblk_t *blocks) ++{ ++ int ret = save_threshold_percent(sbi, buf, blocks); ++ ++ return ret ?: count; ++} ++ + #define EXT4_ATTR(_name,_mode,_id) \ + static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ +@@ -176,6 +205,9 @@ EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444); + EXT4_ATTR_FUNC(session_write_kbytes, 0444); + EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444); + EXT4_ATTR_FUNC(reserved_clusters, 0644); ++EXT4_ATTR_FUNC(mb_c1_threshold, 0644); ++EXT4_ATTR_FUNC(mb_c2_threshold, 0644); ++EXT4_ATTR_FUNC(mb_c3_threshold, 0644); + + EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, + ext4_sb_info, s_inode_readahead_blks); +@@ -211,6 +243,9 @@ static struct attribute *ext4_attrs[] = { + ATTR_LIST(session_write_kbytes), + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(reserved_clusters), ++ ATTR_LIST(mb_c1_threshold), ++ ATTR_LIST(mb_c2_threshold), ++ ATTR_LIST(mb_c3_threshold), + ATTR_LIST(inode_readahead_blks), + ATTR_LIST(inode_goal), + ATTR_LIST(max_dir_size), +@@ -294,6 +329,15 @@ static ssize_t ext4_attr_show(struct kobject *kobj, + return snprintf(buf, PAGE_SIZE, "%llu\n", + (s64) EXT4_C2B(sbi, + percpu_counter_sum(&sbi->s_dirtyclusters_counter))); ++ case attr_mb_c1_threshold: ++ return scnprintf(buf, PAGE_SIZE, "%llu\n", ++ THRESHOLD_PERCENT(sbi->s_mb_c1_blocks)); ++ case attr_mb_c2_threshold: ++ return scnprintf(buf, PAGE_SIZE, "%llu\n", ++ THRESHOLD_PERCENT(sbi->s_mb_c2_blocks)); ++ case attr_mb_c3_threshold: ++ return scnprintf(buf, PAGE_SIZE, "%llu\n", ++ THRESHOLD_PERCENT(sbi->s_mb_c3_blocks)); + case attr_session_write_kbytes: + return session_write_kbytes_show(sbi, buf); + case attr_lifetime_write_kbytes: +@@ -363,6 +407,12 @@ static ssize_t ext4_attr_store(struct kobject *kobj, + return inode_readahead_blks_store(sbi, buf, len); + case attr_trigger_test_error: + return trigger_test_error(sbi, buf, len); ++ case attr_mb_c1_threshold: ++ return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c1_blocks); ++ case attr_mb_c2_threshold: ++ return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c2_blocks); ++ case attr_mb_c3_threshold: ++ return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c3_blocks); + } + return 0; + } +@@ -425,6 +475,8 @@ int ext4_register_sysfs(struct super_block *sb) + &ext4_seq_mb_last_group_fops, sb); + proc_create_single_data("mb_last_start", S_IRUGO, sbi->s_proc, + ext4_mb_seq_last_start_seq_show, sb); ++ proc_create_data("mb_alloc_stats", S_IFREG | S_IRUGO | S_IWUSR, ++ sbi->s_proc, &ext4_mb_seq_alloc_fops, sb); + } + return 0; + } diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.7.series b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.7.series index 0f62125..689b1ed 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.7.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.7.series @@ -38,3 +38,4 @@ rhel7/ext4-include-terminating-u32-in-size-of-xattr-entries-when-expanding-inode rhel7.2/ext4-export-mb-stream-allocator-variables.patch rhel7/ext4-optimize-ext4_find_delalloc_range-in-nodelalloc.patch rhel7.7/ext4-fix-project-with-unpatched-kernel.patch +rhel7.2/ext4-simple-blockalloc.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.series b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.series index 843a6d7..f476de4 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.series @@ -22,3 +22,4 @@ rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch rhel7/ext4-export-orphan-add.patch ubuntu18/ext4-include-terminating-u32-in-size-of-xattr-entries-when-expanding-inodes.patch rhel8/ext4-export-mb-stream-allocator-variables.patch +rhel8/ext4-simple-blockalloc.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-5.0.0-13-ubuntu19.series b/ldiskfs/kernel_patches/series/ldiskfs-5.0.0-13-ubuntu19.series index 27f79ab..011a1ca 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-5.0.0-13-ubuntu19.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-5.0.0-13-ubuntu19.series @@ -22,3 +22,4 @@ rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch rhel7/ext4-export-orphan-add.patch rhel8/ext4-export-mb-stream-allocator-variables.patch ubuntu19/ext4-iget-with-flags.patch +rhel8/ext4-simple-blockalloc.patch -- 1.8.3.1