From 95f8ae5677491508ae7182b4f61ead3d413434ae Mon Sep 17 00:00:00 2001 From: Artem Blagodarenko Date: Thu, 6 Jun 2019 16:50:11 +0300 Subject: [PATCH] LU-12103 ldiskfs: don't search large block range if disk full Block allocator tries to find: 1) group with the same range as required 2) group with the same average range as required 3) group with required amount of space 4) any group For quite full disk step 1 is failed with higth probability, but takes a lot of time. Skip 1st step if disk space < 25% Skip 2d step if disk space < 15% Skip 3d step if disk space < 5% Also check if group has any free space on step 4. This three thresholds can be adjusted through added interface. Variables added which counts unsuccessfull group processing loops. This can show allocator effectiveness in different circumstances. This statistics output through mb_alloc file. This file is useful to track allocator activity. Signed-off-by: Artem Blagodarenko Change-Id: I18c7147e32951c49e12a2444803aa2995bb4ae2d Cray-bug-id: LUS-6746 Reviewed-on: https://review.whamcloud.com/35180 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Wang Shilong Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- .../patches/rhel7.2/ext4-simple-blockalloc.patch | 348 +++++++++++++++++++++ .../series/ldiskfs-3.10-rhel7.6.series | 1 + 2 files changed, 349 insertions(+) create mode 100644 ldiskfs/kernel_patches/patches/rhel7.2/ext4-simple-blockalloc.patch diff --git a/ldiskfs/kernel_patches/patches/rhel7.2/ext4-simple-blockalloc.patch b/ldiskfs/kernel_patches/patches/rhel7.2/ext4-simple-blockalloc.patch new file mode 100644 index 0000000..f7a11a6 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel7.2/ext4-simple-blockalloc.patch @@ -0,0 +1,348 @@ +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -2078,6 +2078,21 @@ static int ext4_mb_good_group(struct ext + return 0; + } + ++static u64 available_blocks_count(struct ext4_sb_info *sbi) ++{ ++ ext4_fsblk_t resv_blocks; ++ u64 bfree; ++ struct ext4_super_block *es = sbi->s_es; ++ ++ resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); ++ bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - ++ percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); ++ ++ bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); ++ return bfree - (ext4_r_blocks_count(es) + resv_blocks); ++} ++ ++ + static noinline_for_stack int + ext4_mb_regular_allocator(struct ext4_allocation_context *ac) + { +@@ -2087,6 +2102,7 @@ ext4_mb_regular_allocator(struct ext4_al + struct ext4_sb_info *sbi; + struct super_block *sb; + struct ext4_buddy e4b; ++ ext4_fsblk_t avail_blocks; + + sb = ac->ac_sb; + sbi = EXT4_SB(sb); +@@ -2136,6 +2152,21 @@ ext4_mb_regular_allocator(struct ext4_al + + /* Let's just scan groups to find more-less suitable blocks */ + cr = ac->ac_2order ? 0 : 1; ++ ++ /* Choose what loop to pass based on disk fullness */ ++ avail_blocks = available_blocks_count(sbi) ; ++ ++ if (avail_blocks < sbi->s_mb_c3_blocks) { ++ cr = 3; ++ atomic64_inc(&sbi->s_bal_cX_skipped[2]); ++ } else if(avail_blocks < sbi->s_mb_c2_blocks) { ++ cr = 2; ++ atomic64_inc(&sbi->s_bal_cX_skipped[1]); ++ } else if(avail_blocks < sbi->s_mb_c1_blocks) { ++ cr = 1; ++ atomic64_inc(&sbi->s_bal_cX_skipped[0]); ++ } ++ + /* + * cr == 0 try to get exact allocation, + * cr == 3 try to get anything +@@ -2193,6 +2224,9 @@ repeat: + if (ac->ac_status != AC_STATUS_CONTINUE) + break; + } ++ /* Processed all groups and haven't found blocks */ ++ if (i == ngroups) ++ atomic64_inc(&sbi->s_bal_cX_failed[cr]); + } + + if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && +@@ -2316,6 +2350,93 @@ static const struct seq_operations ext4_ + .show = ext4_mb_seq_groups_show, + }; + ++static int mb_seq_alloc_show(struct seq_file *seq, void *v) ++{ ++ struct super_block *sb = seq->private; ++ struct ext4_sb_info *sbi = EXT4_SB(sb); ++ ++ seq_printf(seq, "mballoc:\n"); ++ seq_printf(seq, "\tblocks: %u\n", atomic_read(&sbi->s_bal_allocated)); ++ seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); ++ seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); ++ ++ seq_printf(seq, "\textents_scanned: %u\n", ++ atomic_read(&sbi->s_bal_ex_scanned)); ++ seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); ++ seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); ++ seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); ++ seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); ++ ++ seq_printf(seq, "\tuseless_c1_loops: %lu\n", ++ atomic64_read(&sbi->s_bal_cX_failed[0])); ++ seq_printf(seq, "\tuseless_c2_loops: %lu\n", ++ atomic64_read(&sbi->s_bal_cX_failed[1])); ++ seq_printf(seq, "\tuseless_c3_loops: %lu\n", ++ atomic64_read(&sbi->s_bal_cX_failed[2])); ++ seq_printf(seq, "\tskipped_c1_loops: %lu\n", ++ atomic64_read(&sbi->s_bal_cX_skipped[0])); ++ seq_printf(seq, "\tskipped_c2_loops: %lu\n", ++ atomic64_read(&sbi->s_bal_cX_skipped[1])); ++ seq_printf(seq, "\tskipped_c3_loops: %lu\n", ++ atomic64_read(&sbi->s_bal_cX_skipped[2])); ++ seq_printf(seq, "\tbuddies_generated: %lu\n", ++ sbi->s_mb_buddies_generated); ++ seq_printf(seq, "\tbuddies_time_used: %llu\n", sbi->s_mb_generation_time); ++ seq_printf(seq, "\tpreallocated: %u\n", ++ atomic_read(&sbi->s_mb_preallocated)); ++ seq_printf(seq, "\tdiscarded: %u\n", ++ atomic_read(&sbi->s_mb_discarded)); ++ return 0; ++} ++ ++static ssize_t mb_seq_alloc_write(struct file *file, ++ const char __user *buf, ++ size_t cnt, loff_t *pos) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file))); ++ ++ atomic_set(&sbi->s_bal_allocated, 0), ++ atomic_set(&sbi->s_bal_reqs, 0), ++ atomic_set(&sbi->s_bal_success, 0); ++ ++ atomic_set(&sbi->s_bal_ex_scanned, 0), ++ atomic_set(&sbi->s_bal_goals, 0), ++ atomic_set(&sbi->s_bal_2orders, 0), ++ atomic_set(&sbi->s_bal_breaks, 0), ++ atomic_set(&sbi->s_mb_lost_chunks, 0); ++ ++ atomic64_set(&sbi->s_bal_cX_failed[0], 0), ++ atomic64_set(&sbi->s_bal_cX_failed[1], 0), ++ atomic64_set(&sbi->s_bal_cX_failed[2], 0); ++ ++ atomic64_set(&sbi->s_bal_cX_skipped[0], 0), ++ atomic64_set(&sbi->s_bal_cX_skipped[1], 0), ++ atomic64_set(&sbi->s_bal_cX_skipped[2], 0); ++ ++ ++ sbi->s_mb_buddies_generated = 0; ++ sbi->s_mb_generation_time = 0; ++ ++ atomic_set(&sbi->s_mb_preallocated, 0), ++ atomic_set(&sbi->s_mb_discarded, 0); ++ ++ return cnt; ++} ++ ++static int mb_seq_alloc_open(struct inode *inode, struct file *file) ++{ ++ return single_open(file, mb_seq_alloc_show, PDE_DATA(inode)); ++} ++ ++static const struct file_operations ext4_mb_seq_alloc_fops = { ++ .owner = THIS_MODULE, ++ .open = mb_seq_alloc_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = single_release, ++ .write = mb_seq_alloc_write, ++}; ++ + #define EXT4_MB_PREALLOC_TABLE "prealloc_table" + + static int ext4_mb_check_and_update_prealloc(struct ext4_sb_info *sbi, +@@ -2730,6 +2851,7 @@ static int ext4_groupinfo_create_slab(si + return 0; + } + ++#define THRESHOLD_BLOCKS(ts) (ext4_blocks_count(sbi->s_es) / 100 * ts) + int ext4_mb_init(struct super_block *sb) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); +@@ -2781,6 +2903,9 @@ int ext4_mb_init(struct super_block *sb) + sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; + sbi->s_mb_stats = MB_DEFAULT_STATS; + sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; ++ sbi->s_mb_c1_blocks = THRESHOLD_BLOCKS(MB_DEFAULT_C1_THRESHOLD); ++ sbi->s_mb_c2_blocks = THRESHOLD_BLOCKS(MB_DEFAULT_C2_THRESHOLD); ++ sbi->s_mb_c3_blocks = THRESHOLD_BLOCKS(MB_DEFAULT_C3_THRESHOLD); + /* + * The default group preallocation is 512, which for 4k block + * sizes translates to 2 megabytes. However for bigalloc file +@@ -2853,6 +2978,8 @@ int ext4_mb_init(struct super_block *sb) + proc_create_data(EXT4_MB_PREALLOC_TABLE, S_IFREG | S_IRUGO | + S_IWUSR, sbi->s_proc, + &ext4_mb_prealloc_seq_fops, sb); ++ proc_create_data("mb_alloc_stats", S_IFREG | S_IRUGO | S_IWUSR, ++ sbi->s_proc, &ext4_mb_seq_alloc_fops, sb); + proc_create_data("mb_last_group", S_IFREG | S_IRUGO | + S_IWUSR, sbi->s_proc, + &ext4_mb_seq_last_group_fops, sb); +@@ -2906,6 +3033,7 @@ int ext4_mb_release(struct super_block * + remove_proc_entry("mb_last_group", sbi->s_proc); + remove_proc_entry("mb_last_start", sbi->s_proc); + remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_proc); ++ remove_proc_entry("mb_alloc_stats", sbi->s_proc); + } + + if (sbi->s_group_info) { +@@ -2936,6 +3064,16 @@ int ext4_mb_release(struct super_block * + atomic_read(&sbi->s_bal_reqs), + atomic_read(&sbi->s_bal_success)); + ext4_msg(sb, KERN_INFO, ++ "mballoc: (%lu, %lu, %lu) useless c(0,1,2) loops", ++ atomic64_read(&sbi->s_bal_cX_failed[0]), ++ atomic64_read(&sbi->s_bal_cX_failed[1]), ++ atomic64_read(&sbi->s_bal_cX_failed[2])); ++ ext4_msg(sb, KERN_INFO, ++ "mballoc: (%lu, %lu, %lu) skipped c(0,1,2) loops", ++ atomic64_read(&sbi->s_bal_cX_skipped[0]), ++ atomic64_read(&sbi->s_bal_cX_skipped[1]), ++ atomic64_read(&sbi->s_bal_cX_skipped[2])); ++ ext4_msg(sb, KERN_INFO, + "mballoc: %u extents scanned, %u goal hits, " + "%u 2^N hits, %u breaks, %u lost", + atomic_read(&sbi->s_bal_ex_scanned), +Index: linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h ++++ linux-stage/fs/ext4/ext4.h +@@ -1409,6 +1409,9 @@ struct ext4_sb_info { + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; ++ ext4_fsblk_t s_mb_c1_blocks; ++ ext4_fsblk_t s_mb_c2_blocks; ++ ext4_fsblk_t s_mb_c3_blocks; + unsigned long *s_mb_prealloc_table; + unsigned int s_mb_group_prealloc; + unsigned int s_max_dir_size_kb; +@@ -1425,6 +1428,9 @@ struct ext4_sb_info { + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ ++ /* cX loop didn't find blocks */ ++ atomic64_t s_bal_cX_failed[3]; ++ atomic64_t s_bal_cX_skipped[3]; + spinlock_t s_bal_lock; + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c ++++ linux-stage/fs/ext4/super.c +@@ -2734,6 +2734,73 @@ static ssize_t sbi_deprecated_show(struc + return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val); + } + ++static int save_threshold(struct ext4_sb_info *sbi, const char *buf, ++ ext4_fsblk_t *blocks) { ++ unsigned long long val; ++ ++ if (!parse_strtoull(buf, 100, &val)) { ++ *blocks = val * ext4_blocks_count(sbi->s_es) / 100; ++ return 0; ++ } ++ ++ return -EINVAL; ++} ++ ++#define THRESHOLD_PERCENT(ts) (ts * 100 / ext4_blocks_count(sbi->s_es)) ++static ssize_t mb_c1_threshold_store(struct ext4_attr *a, ++ struct ext4_sb_info *sbi, ++ const char *buf, size_t count) ++{ ++ int ret; ++ ++ ret = save_threshold(sbi, buf, &sbi->s_mb_c1_blocks); ++ ++ return ret ? ret : count; ++} ++ ++static ssize_t mb_c1_threshold_show(struct ext4_attr *a, ++ struct ext4_sb_info *sbi, char *buf) ++{ ++ return snprintf(buf, PAGE_SIZE, "%llu\n", ++ THRESHOLD_PERCENT(sbi->s_mb_c1_blocks)); ++} ++ ++static ssize_t mb_c2_threshold_store(struct ext4_attr *a, ++ struct ext4_sb_info *sbi, ++ const char *buf, size_t count) ++{ ++ int ret; ++ ++ ret = save_threshold(sbi, buf, &sbi->s_mb_c2_blocks); ++ return ret ? ret : count; ++} ++ ++static ssize_t mb_c2_threshold_show(struct ext4_attr *a, ++ struct ext4_sb_info *sbi, char *buf) ++{ ++ return snprintf(buf, PAGE_SIZE, "%llu\n", ++ THRESHOLD_PERCENT(sbi->s_mb_c2_blocks)); ++} ++ ++static ssize_t mb_c3_threshold_store(struct ext4_attr *a, ++ struct ext4_sb_info *sbi, ++ const char *buf, size_t count) ++{ ++ int ret; ++ ++ ret = save_threshold(sbi, buf, &sbi->s_mb_c3_blocks); ++ ++ return ret ? ret : count; ++} ++ ++static ssize_t mb_c3_threshold_show(struct ext4_attr *a, ++ struct ext4_sb_info *sbi, char *buf) ++{ ++ return snprintf(buf, PAGE_SIZE, "%llu\n", ++ THRESHOLD_PERCENT(sbi->s_mb_c3_blocks)); ++} ++ ++ + #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ + static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ +@@ -2790,6 +2857,9 @@ EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats + EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); ++EXT4_RW_ATTR(mb_c1_threshold); ++EXT4_RW_ATTR(mb_c2_threshold); ++EXT4_RW_ATTR(mb_c3_threshold); + EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_small_req); + EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req); + EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); +@@ -2820,6 +2890,9 @@ static struct attribute *ext4_attrs[] = + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), + ATTR_LIST(mb_order2_req), ++ ATTR_LIST(mb_c1_threshold), ++ ATTR_LIST(mb_c2_threshold), ++ ATTR_LIST(mb_c3_threshold), + ATTR_LIST(mb_small_req), + ATTR_LIST(mb_large_req), + ATTR_LIST(mb_group_prealloc), +Index: linux-stage/fs/ext4/mballoc.h +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.h ++++ linux-stage/fs/ext4/mballoc.h +@@ -84,6 +84,9 @@ extern ushort ext4_mballoc_debug; + * for which requests use 2^N search using buddies + */ + #define MB_DEFAULT_ORDER2_REQS 8 ++#define MB_DEFAULT_C1_THRESHOLD 25 ++#define MB_DEFAULT_C2_THRESHOLD 15 ++#define MB_DEFAULT_C3_THRESHOLD 5 + + /* + * default group prealloc size 512 blocks diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.6.series b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.6.series index c89262a..d10b17f 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.6.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.6.series @@ -38,3 +38,4 @@ rhel7/ext4-mmp-dont-mark-bh-dirty.patch rhel7/ext4-include-terminating-u32-in-size-of-xattr-entries-when-expanding-inodes.patch rhel7.2/ext4-export-mb-stream-allocator-variables.patch rhel7/ext4-optimize-ext4_find_delalloc_range-in-nodelalloc.patch +rhel7.2/ext4-simple-blockalloc.patch -- 1.8.3.1