X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=ldiskfs%2Fkernel_patches%2Fpatches%2Flinux-5.8%2Fext4-prealloc.patch;fp=ldiskfs%2Fkernel_patches%2Fpatches%2Flinux-5.8%2Fext4-prealloc.patch;h=6aa6261b294afbe18a996363f2571985ed98d1a3;hb=4c591a8d8ed60a3ec2effbc08503bf02f5069192;hp=0000000000000000000000000000000000000000;hpb=f9c06dc07cc6c2125a014f472bba8f6022b660ac;p=fs%2Flustre-release.git diff --git a/ldiskfs/kernel_patches/patches/linux-5.8/ext4-prealloc.patch b/ldiskfs/kernel_patches/patches/linux-5.8/ext4-prealloc.patch new file mode 100644 index 0000000..6aa6261 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-5.8/ext4-prealloc.patch @@ -0,0 +1,385 @@ +--- + fs/ext4/ext4.h | 7 + + fs/ext4/inode.c | 3 + fs/ext4/mballoc.c | 215 +++++++++++++++++++++++++++++++++++++++++------------- + fs/ext4/sysfs.c | 8 +- + 4 files changed, 180 insertions(+), 53 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1241,6 +1241,8 @@ extern void ext4_set_bits(void *bm, int + /* Metadata checksum algorithm codes */ + #define EXT4_CRC32C_CHKSUM 1 + ++#define EXT4_MAX_PREALLOC_TABLE 64 ++ + /* + * Structure of the super block + */ +@@ -1497,11 +1499,13 @@ struct ext4_sb_info { + + /* tunables */ + unsigned long s_stripe; +- unsigned int s_mb_stream_request; ++ unsigned long s_mb_small_req; ++ unsigned long s_mb_large_req; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; ++ unsigned long *s_mb_prealloc_table; + unsigned int s_mb_group_prealloc; + unsigned int s_max_dir_size_kb; + /* where last allocation was done - for stream allocation */ +@@ -2645,6 +2649,7 @@ extern int ext4_init_inode_table(struct + extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); + + /* mballoc.c */ ++extern const struct proc_ops ext4_seq_prealloc_table_fops; + extern const struct seq_operations ext4_mb_seq_groups_ops; + extern long ext4_mb_stats; + extern long ext4_mb_max_to_scan; +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -2698,6 +2698,9 @@ static int ext4_writepages(struct addres + PAGE_SIZE >> inode->i_blkbits); + } + ++ if (wbc->nr_to_write < sbi->s_mb_small_req) ++ wbc->nr_to_write = sbi->s_mb_small_req; ++ + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -2455,6 +2455,99 @@ const struct seq_operations ext4_mb_seq_ + .show = ext4_mb_seq_groups_show, + }; + ++static int ext4_mb_check_and_update_prealloc(struct ext4_sb_info *sbi, ++ char *str, size_t cnt, ++ int update) ++{ ++ unsigned long value; ++ unsigned long prev = 0; ++ char *cur; ++ char *next; ++ char *end; ++ int num = 0; ++ ++ cur = str; ++ end = str + cnt; ++ while (cur < end) { ++ while ((cur < end) && (*cur == ' ')) cur++; ++ value = simple_strtol(cur, &next, 0); ++ if (value == 0) ++ break; ++ if (cur == next) ++ return -EINVAL; ++ ++ cur = next; ++ ++ if (value > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group)) ++ return -EINVAL; ++ ++ /* they should add values in order */ ++ if (value <= prev) ++ return -EINVAL; ++ ++ if (update) ++ sbi->s_mb_prealloc_table[num] = value; ++ ++ prev = value; ++ num++; ++ } ++ ++ if (num > EXT4_MAX_PREALLOC_TABLE - 1) ++ return -EOVERFLOW; ++ ++ if (update) ++ sbi->s_mb_prealloc_table[num] = 0; ++ ++ return 0; ++} ++ ++static ssize_t ext4_mb_prealloc_table_proc_write(struct file *file, ++ const char __user *buf, ++ size_t cnt, loff_t *pos) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file))); ++ char str[128]; ++ int rc; ++ ++ if (cnt >= sizeof(str)) ++ return -EINVAL; ++ if (copy_from_user(str, buf, cnt)) ++ return -EFAULT; ++ ++ rc = ext4_mb_check_and_update_prealloc(sbi, str, cnt, 0); ++ if (rc) ++ return rc; ++ ++ rc = ext4_mb_check_and_update_prealloc(sbi, str, cnt, 1); ++ return rc ? rc : cnt; ++} ++ ++static int mb_prealloc_table_seq_show(struct seq_file *m, void *v) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(m->private); ++ int i; ++ ++ for (i = 0; i < EXT4_MAX_PREALLOC_TABLE && ++ sbi->s_mb_prealloc_table[i] != 0; i++) ++ seq_printf(m, "%ld ", sbi->s_mb_prealloc_table[i]); ++ seq_printf(m, "\n"); ++ ++ return 0; ++} ++ ++static int mb_prealloc_table_seq_open(struct inode *inode, struct file *file) ++{ ++ return single_open(file, mb_prealloc_table_seq_show, PDE_DATA(inode)); ++} ++ ++const struct proc_ops ext4_seq_prealloc_table_fops = { ++ .proc_open = mb_prealloc_table_seq_open, ++ .proc_read = seq_read, ++ .proc_lseek = seq_lseek, ++ .proc_release = single_release, ++ .proc_write = ext4_mb_prealloc_table_proc_write, ++}; ++ + static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) + { + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; +@@ -2685,7 +2778,7 @@ static int ext4_groupinfo_create_slab(si + int ext4_mb_init(struct super_block *sb) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); +- unsigned i, j; ++ unsigned i, j, k, l; + unsigned offset, offset_incr; + unsigned max; + int ret; +@@ -2734,7 +2827,6 @@ int ext4_mb_init(struct super_block *sb) + sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; + sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; + sbi->s_mb_stats = MB_DEFAULT_STATS; +- sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; + sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; + /* + * The default group preallocation is 512, which for 4k block +@@ -2758,9 +2850,29 @@ int ext4_mb_init(struct super_block *sb) + * RAID stripe size so that preallocations don't fragment + * the stripes. + */ +- if (sbi->s_stripe > 1) { +- sbi->s_mb_group_prealloc = roundup( +- sbi->s_mb_group_prealloc, sbi->s_stripe); ++ ++ /* Allocate table once */ ++ sbi->s_mb_prealloc_table = kzalloc( ++ EXT4_MAX_PREALLOC_TABLE * sizeof(unsigned long), GFP_NOFS); ++ if (sbi->s_mb_prealloc_table == NULL) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ if (sbi->s_stripe == 0) { ++ for (k = 0, l = 4; k <= 9; ++k, l *= 2) ++ sbi->s_mb_prealloc_table[k] = l; ++ ++ sbi->s_mb_small_req = 256; ++ sbi->s_mb_large_req = 1024; ++ sbi->s_mb_group_prealloc = 512; ++ } else { ++ for (k = 0, l = sbi->s_stripe; k <= 2; ++k, l *= 2) ++ sbi->s_mb_prealloc_table[k] = l; ++ ++ sbi->s_mb_small_req = sbi->s_stripe; ++ sbi->s_mb_large_req = sbi->s_stripe * 8; ++ sbi->s_mb_group_prealloc = sbi->s_stripe * 4; + } + + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); +@@ -2788,6 +2900,7 @@ out_free_locality_groups: + free_percpu(sbi->s_locality_groups); + sbi->s_locality_groups = NULL; + out: ++ kfree(sbi->s_mb_prealloc_table); + kfree(sbi->s_mb_offsets); + sbi->s_mb_offsets = NULL; + kfree(sbi->s_mb_maxs); +@@ -3057,7 +3170,6 @@ ext4_mb_mark_diskspace_used(struct ext4_ + int err, len; + + BUG_ON(ac->ac_status != AC_STATUS_FOUND); +- BUG_ON(ac->ac_b_ex.fe_len <= 0); + + sb = ac->ac_sb; + sbi = EXT4_SB(sb); +@@ -3187,13 +3299,14 @@ ext4_mb_normalize_request(struct ext4_al + struct ext4_allocation_request *ar) + { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); +- int bsbits, max; ++ int bsbits, i, wind; + ext4_lblk_t end; +- loff_t size, start_off; ++ loff_t size; + loff_t orig_size __maybe_unused; + ext4_lblk_t start; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_prealloc_space *pa; ++ unsigned long value, last_non_zero; + + /* do normalize only data requests, metadata requests + do not need preallocation */ +@@ -3222,51 +3335,46 @@ ext4_mb_normalize_request(struct ext4_al + size = size << bsbits; + if (size < i_size_read(ac->ac_inode)) + size = i_size_read(ac->ac_inode); +- orig_size = size; ++ size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits; ++ ++ start = wind = 0; ++ value = last_non_zero = 0; + +- /* max size of free chunks */ +- max = 2 << bsbits; ++ /* let's choose preallocation window depending on file size */ ++ for (i = 0; i < EXT4_MAX_PREALLOC_TABLE; i++) { ++ value = sbi->s_mb_prealloc_table[i]; ++ if (value == 0) ++ break; ++ else ++ last_non_zero = value; + +-#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ +- (req <= (size) || max <= (chunk_size)) ++ if (size <= value) { ++ wind = value; ++ break; ++ } ++ } + +- /* first, try to predict filesize */ +- /* XXX: should this table be tunable? */ +- start_off = 0; +- if (size <= 16 * 1024) { +- size = 16 * 1024; +- } else if (size <= 32 * 1024) { +- size = 32 * 1024; +- } else if (size <= 64 * 1024) { +- size = 64 * 1024; +- } else if (size <= 128 * 1024) { +- size = 128 * 1024; +- } else if (size <= 256 * 1024) { +- size = 256 * 1024; +- } else if (size <= 512 * 1024) { +- size = 512 * 1024; +- } else if (size <= 1024 * 1024) { +- size = 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (21 - bsbits)) << 21; +- size = 2 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (22 - bsbits)) << 22; +- size = 4 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, +- (8<<20)>>bsbits, max, 8 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (23 - bsbits)) << 23; +- size = 8 * 1024 * 1024; ++ if (wind == 0) { ++ if (last_non_zero != 0) { ++ __u64 tstart, tend; ++ /* file is quite large, we now preallocate with ++ * the biggest configured window with regart to ++ * logical offset */ ++ wind = last_non_zero; ++ tstart = ac->ac_o_ex.fe_logical; ++ do_div(tstart, wind); ++ start = tstart * wind; ++ tend = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len - 1; ++ do_div(tend, wind); ++ tend = tend * wind + wind; ++ size = tend - start; ++ } + } else { +- start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; +- size = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb), +- ac->ac_o_ex.fe_len) << bsbits; ++ size = wind; + } +- size = size >> bsbits; +- start = start_off >> bsbits; ++ ++ ++ orig_size = size; + + /* don't cover already allocated blocks in selected range */ + if (ar->pleft && start <= ar->lleft) { +@@ -3348,7 +3456,6 @@ ext4_mb_normalize_request(struct ext4_al + (unsigned long) ac->ac_o_ex.fe_logical); + BUG(); + } +- BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + + /* now prepare goal request */ + +@@ -4341,11 +4448,19 @@ static void ext4_mb_group_or_file(struct + + /* don't use group allocation for large files */ + size = max(size, isize); +- if (size > sbi->s_mb_stream_request) { ++ if ((ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) || ++ (size >= sbi->s_mb_large_req)) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; + } + ++ /* ++ * request is so large that we don't care about ++ * streaming - it overweights any possible seek ++ */ ++ if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req) ++ return; ++ + BUG_ON(ac->ac_lg != NULL); + /* + * locality group prealloc space are per cpu. The reason for having +--- a/fs/ext4/sysfs.c ++++ b/fs/ext4/sysfs.c +@@ -213,7 +213,8 @@ EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats + EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); +-EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); ++EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_small_req); ++EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req); + EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); + EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); + EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); +@@ -255,7 +256,8 @@ static struct attribute *ext4_attrs[] = + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), + ATTR_LIST(mb_order2_req), +- ATTR_LIST(mb_stream_req), ++ ATTR_LIST(mb_small_req), ++ ATTR_LIST(mb_large_req), + ATTR_LIST(mb_group_prealloc), + ATTR_LIST(max_writeback_mb_bump), + ATTR_LIST(extent_max_zeroout_kb), +@@ -510,6 +512,8 @@ int ext4_register_sysfs(struct super_blo + sb); + proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_ops, sb); ++ proc_create_data("prealloc_table", S_IRUGO, sbi->s_proc, ++ &ext4_seq_prealloc_table_fops, sb); + } + return 0; + }