From: Wang Shilong
Date: Sun, 28 Jun 2020 08:35:07 +0000 (+0800)
Subject: LU-13669 llite: try to improve mmap performance
X-Git-Tag: 2.13.57~16
X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=0c5ad4b6df5bf35b291842fc6d42c2720246a026;p=fs%2Flustre-release.git

LU-13669 llite: try to improve mmap performance

We have observed slow mmap read performance for some applications.
The problem is that the access pattern is neither sequential nor
strided, but may still be adjacent within a small range before
seeking to a random position.

So the pattern could be something like this:

[1M data] [hole] [0.5M data] [hole] [0.7M data] [1M data]

Every time an application reads mmap data, it may read not only a
single 4KB page, but also a cluster of nearby pages within a range
(e.g. 1MB) of the first page after a cache miss.

The readahead engine is modified to track the range size of a
cluster of mmap reads, so that after a seek and/or cache miss, the
range size is used to efficiently prefetch multiple pages in a
single RPC rather than many small RPCs.

Benchmark:

fio --name=randread --directory=/ai400/fio --rw=randread \
    --ioengine=mmap --bs=128K --numjobs=32 --filesize=200G \
    --filename=randread --time_based --status-interval=10s \
    --runtime=30s --allow_file_create=1 --group_reporting \
    --disable_lat=1 --disable_clat=1 --disable_slat=1 \
    --disk_util=0 --aux-path=/tmp --randrepeat=0 \
    --unique_filename=0 --fallocate=0

               |  master   |  patched  |  speedup  |
---------------+-----------+-----------+-----------+
page_fault_avg |   512usec |    52usec |     9.75x |
page_fault_max | 37698usec |  6543usec |     5.76x |

Change-Id: I86436cbd027ec6df5094599e54b4acfd0e018930
Signed-off-by: Wang Shilong
Reviewed-on: https://review.whamcloud.com/38916
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Andreas Dilger
Reviewed-by: Yingjin Qian
Reviewed-by: Oleg Drokin
---

diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h
index a0e3036..1a061b0 100644
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -518,6 +518,12 @@ static inline struct pcc_inode *ll_i2pcci(struct inode *inode)
 /* default read-ahead full files smaller than limit on the second read */
 #define SBI_DEFAULT_READ_AHEAD_WHOLE_MAX	MiB_TO_PAGES(2UL)
 
+/* default range pages */
+#define SBI_DEFAULT_RA_RANGE_PAGES		MiB_TO_PAGES(1ULL)
+
+/* Min range pages */
+#define RA_MIN_MMAP_RANGE_PAGES			16UL
+
 enum ra_stat {
 	RA_STAT_HIT = 0,
 	RA_STAT_MISS,
@@ -534,6 +540,7 @@ enum ra_stat {
 	RA_STAT_FAILED_REACH_END,
 	RA_STAT_ASYNC,
 	RA_STAT_FAILED_FAST_READ,
+	RA_STAT_MMAP_RANGE_READ,
 	_NR_RA_STAT,
 };
 
@@ -541,6 +548,7 @@ struct ll_ra_info {
 	atomic_t ra_cur_pages;
 	unsigned long ra_max_pages;
 	unsigned long ra_max_pages_per_file;
+	unsigned long ra_range_pages;
 	unsigned long ra_max_read_ahead_whole_pages;
 	struct workqueue_struct *ll_readahead_wq;
 	/*
@@ -810,6 +818,16 @@ struct ll_readahead_state {
 	 */
 	pgoff_t ras_window_start_idx;
 	pgoff_t ras_window_pages;
+
+	/* Page index where min range read starts */
+	pgoff_t ras_range_min_start_idx;
+	/* Page index where mmap range read ends */
+	pgoff_t ras_range_max_end_idx;
+	/* number of mmap pages where last time detected */
+	pgoff_t ras_last_range_pages;
+	/* number of mmap range requests */
+	pgoff_t ras_range_requests;
+
 	/*
 	 * Optimal RPC size in pages.
 	 * It decides how many pages will be sent for each read-ahead.
diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c
index 3744aa2..066f6e3 100644
--- a/lustre/llite/llite_lib.c
+++ b/lustre/llite/llite_lib.c
@@ -136,6 +136,7 @@ static struct ll_sb_info *ll_init_sbi(void)
 					   SBI_DEFAULT_READ_AHEAD_PER_FILE_MAX);
 	sbi->ll_ra_info.ra_async_pages_per_file_threshold =
 		sbi->ll_ra_info.ra_max_pages_per_file;
+	sbi->ll_ra_info.ra_range_pages = SBI_DEFAULT_RA_RANGE_PAGES;
 	sbi->ll_ra_info.ra_max_read_ahead_whole_pages = -1;
 	atomic_set(&sbi->ll_ra_info.ra_async_inflight, 0);
 
diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c
index f6a7699..cf753a5 100644
--- a/lustre/llite/lproc_llite.c
+++ b/lustre/llite/lproc_llite.c
@@ -1159,6 +1159,51 @@ read_ahead_async_file_threshold_mb_store(struct kobject *kobj,
 }
 LUSTRE_RW_ATTR(read_ahead_async_file_threshold_mb);
 
+static ssize_t read_ahead_range_kb_show(struct kobject *kobj,
+					struct attribute *attr, char *buf)
+{
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+
+	return snprintf(buf, PAGE_SIZE, "%lu\n",
+			sbi->ll_ra_info.ra_range_pages << (PAGE_SHIFT - 10));
+}
+
+static ssize_t
+read_ahead_range_kb_store(struct kobject *kobj,
+			  struct attribute *attr,
+			  const char *buffer, size_t count)
+{
+	unsigned long pages_number;
+	unsigned long max_ra_per_file;
+	u64 val;
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+	int rc;
+
+	rc = sysfs_memparse(buffer, count, &val, "KiB");
+	if (rc < 0)
+		return rc;
+
+	pages_number = val >> PAGE_SHIFT;
+	/* Disable mmap range read */
+	if (pages_number == 0)
+		goto out;
+
+	max_ra_per_file = sbi->ll_ra_info.ra_max_pages_per_file;
+	if (pages_number > max_ra_per_file ||
+	    pages_number < RA_MIN_MMAP_RANGE_PAGES)
+		return -ERANGE;
+
+out:
+	spin_lock(&sbi->ll_lock);
+	sbi->ll_ra_info.ra_range_pages = pages_number;
+	spin_unlock(&sbi->ll_lock);
+
+	return count;
+}
+LUSTRE_RW_ATTR(read_ahead_range_kb);
+
 static ssize_t fast_read_show(struct kobject *kobj,
 			      struct attribute *attr,
 			      char *buf)
@@ -1493,6 +1538,7 @@ static struct attribute *llite_attrs[] = {
 	&lustre_attr_max_read_ahead_whole_mb.attr,
 	&lustre_attr_max_read_ahead_async_active.attr,
 	&lustre_attr_read_ahead_async_file_threshold_mb.attr,
+	&lustre_attr_read_ahead_range_kb.attr,
 	&lustre_attr_stats_track_pid.attr,
 	&lustre_attr_stats_track_ppid.attr,
 	&lustre_attr_stats_track_gid.attr,
@@ -1605,6 +1651,7 @@ static const char *ra_stat_string[] = {
 	[RA_STAT_FAILED_REACH_END] = "failed to reach end",
 	[RA_STAT_ASYNC] = "async readahead",
 	[RA_STAT_FAILED_FAST_READ] = "failed to fast read",
+	[RA_STAT_MMAP_RANGE_READ] = "mmap range read",
 };
 
 int ll_debugfs_register_super(struct super_block *sb, const char *name)
diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c
index 0cc784d..daebffa 100644
--- a/lustre/llite/rw.c
+++ b/lustre/llite/rw.c
@@ -386,7 +386,7 @@ static bool ras_inside_ra_window(pgoff_t idx, struct ra_io_arg *ria)
 static unsigned long
 ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
 		    struct cl_page_list *queue, struct ll_readahead_state *ras,
-		    struct ra_io_arg *ria, pgoff_t *ra_end)
+		    struct ra_io_arg *ria, pgoff_t *ra_end, pgoff_t skip_index)
 {
 	struct cl_read_ahead ra = { 0 };
 	/* busy page count is per stride */
@@ -399,6 +399,8 @@ ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
 	for (page_idx = ria->ria_start_idx;
 	     page_idx <= ria->ria_end_idx && ria->ria_reserved > 0;
 	     page_idx++) {
+		if (skip_index && page_idx == skip_index)
+			continue;
 		if (ras_inside_ra_window(page_idx, ria)) {
 			if (ra.cra_end_idx == 0 || ra.cra_end_idx < page_idx) {
 				pgoff_t end_idx;
@@ -442,10 +444,12 @@ ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
 				if (ras->ras_rpc_pages != ra.cra_rpc_pages &&
 				    ra.cra_rpc_pages > 0)
 					ras->ras_rpc_pages = ra.cra_rpc_pages;
-				/* trim it to align with optimal RPC size */
-				end_idx = ras_align(ras, ria->ria_end_idx + 1);
-				if (end_idx > 0 && !ria->ria_eof)
-					ria->ria_end_idx = end_idx - 1;
+				if (!skip_index) {
+					/* trim it to align with optimal RPC size */
+					end_idx = ras_align(ras, ria->ria_end_idx + 1);
+					if (end_idx > 0 && !ria->ria_eof)
+						ria->ria_end_idx = end_idx - 1;
+				}
 				if (ria->ria_end_idx < ria->ria_end_idx_min)
 					ria->ria_end_idx = ria->ria_end_idx_min;
 			}
@@ -638,7 +642,7 @@ static void ll_readahead_handle_work(struct work_struct *wq)
 	cl_2queue_init(queue);
 
 	rc = ll_read_ahead_pages(env, io, &queue->c2_qin, ras, ria,
-				 &ra_end_idx);
+				 &ra_end_idx, 0);
 	if (ria->ria_reserved != 0)
 		ll_ra_count_put(ll_i2sbi(inode), ria->ria_reserved);
 	if (queue->c2_qin.pl_nr > 0) {
@@ -676,7 +680,7 @@ out_free_work:
 static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 			struct cl_page_list *queue,
 			struct ll_readahead_state *ras, bool hit,
-			struct file *file)
+			struct file *file, pgoff_t skip_index)
 {
 	struct vvp_io *vio = vvp_env_io(env);
 	struct ll_thread_info *lti = ll_env_info(env);
@@ -720,6 +724,9 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	if (ras->ras_window_pages > 0)
 		end_idx = ras->ras_window_start_idx + ras->ras_window_pages - 1;
 
+	if (skip_index)
+		end_idx = start_idx + ras->ras_window_pages - 1;
+
 	/* Enlarge the RA window to encompass the full read */
 	if (vio->vui_ra_valid &&
 	    end_idx < vio->vui_ra_start_idx + vio->vui_ra_pages - 1)
@@ -772,6 +779,10 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 			ria->ria_start_idx;
 	}
 
+	/* don't over reserved for mmap range read */
+	if (skip_index)
+		pages_min = 0;
+
 	ria->ria_reserved = ll_ra_count_get(ll_i2sbi(inode), ria, pages,
 					    pages_min);
 	if (ria->ria_reserved < pages)
@@ -782,8 +793,8 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	       atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages),
 	       ll_i2sbi(inode)->ll_ra_info.ra_max_pages);
 
-	ret = ll_read_ahead_pages(env, io, queue, ras, ria, &ra_end_idx);
-
+	ret = ll_read_ahead_pages(env, io, queue, ras, ria, &ra_end_idx,
+				  skip_index);
 	if (ria->ria_reserved != 0)
 		ll_ra_count_put(ll_i2sbi(inode), ria->ria_reserved);
 
@@ -879,6 +890,10 @@ void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
 	ras_reset(ras, 0);
 	ras->ras_last_read_end_bytes = 0;
 	ras->ras_requests = 0;
+	ras->ras_range_min_start_idx = 0;
+	ras->ras_range_max_end_idx = 0;
+	ras->ras_range_requests = 0;
+	ras->ras_last_range_pages = 0;
 }
 
 /*
@@ -1027,6 +1042,73 @@ static inline bool is_loose_seq_read(struct ll_readahead_state *ras, loff_t pos)
 			     8UL << PAGE_SHIFT, 8UL << PAGE_SHIFT);
 }
 
+static inline bool is_loose_mmap_read(struct ll_sb_info *sbi,
+				      struct ll_readahead_state *ras,
+				      unsigned long pos)
+{
+	unsigned long range_pages = sbi->ll_ra_info.ra_range_pages;
+
+	return pos_in_window(pos, ras->ras_last_read_end_bytes,
+			     range_pages << PAGE_SHIFT,
+			     range_pages << PAGE_SHIFT);
+}
+
+/**
+ * We have observed slow mmap read performance for some
+ * applications. The problem is if access pattern is neither
+ * sequential nor stride, but could be still adjacent in a
+ * small range and then seek a random position.
+ *
+ * So the pattern could be something like this:
+ *
+ * [1M data] [hole] [0.5M data] [hole] [0.7M data] [1M data]
+ *
+ *
+ * Every time an application reads mmap data, it may not only
+ * read a single 4KB page, but also a cluster of nearby pages in
+ * a range (e.g. 1MB) of the first page after a cache miss.
+ *
+ * The readahead engine is modified to track the range size of
+ * a cluster of mmap reads, so that after a seek and/or cache miss,
+ * the range size is used to efficiently prefetch multiple pages
+ * in a single RPC rather than many small RPCs.
+ */
+static void ras_detect_cluster_range(struct ll_readahead_state *ras,
+				     struct ll_sb_info *sbi,
+				     unsigned long pos, unsigned long count)
+{
+	pgoff_t last_pages, pages;
+	pgoff_t end_idx = (pos + count - 1) >> PAGE_SHIFT;
+
+	last_pages = ras->ras_range_max_end_idx -
+			ras->ras_range_min_start_idx + 1;
+	/* First time come here */
+	if (!ras->ras_range_max_end_idx)
+		goto out;
+
+	/* Random or Stride read */
+	if (!is_loose_mmap_read(sbi, ras, pos))
+		goto out;
+
+	ras->ras_range_requests++;
+	if (ras->ras_range_max_end_idx < end_idx)
+		ras->ras_range_max_end_idx = end_idx;
+
+	if (ras->ras_range_min_start_idx > (pos >> PAGE_SHIFT))
+		ras->ras_range_min_start_idx = pos >> PAGE_SHIFT;
+
+	/* Out of range, consider it as random or stride */
+	pages = ras->ras_range_max_end_idx -
+			ras->ras_range_min_start_idx + 1;
+	if (pages <= sbi->ll_ra_info.ra_range_pages)
+		return;
+out:
+	ras->ras_last_range_pages = last_pages;
+	ras->ras_range_requests = 0;
+	ras->ras_range_min_start_idx = pos >> PAGE_SHIFT;
+	ras->ras_range_max_end_idx = end_idx;
+}
+
 static void ras_detect_read_pattern(struct ll_readahead_state *ras,
 				    struct ll_sb_info *sbi,
 				    loff_t pos, size_t count, bool mmap)
@@ -1075,8 +1157,12 @@ static void ras_detect_read_pattern(struct ll_readahead_state *ras,
 	ras->ras_consecutive_bytes += count;
 	if (mmap) {
 		pgoff_t idx = ras->ras_consecutive_bytes >> PAGE_SHIFT;
+		unsigned long ra_range_pages =
+			max_t(unsigned long, RA_MIN_MMAP_RANGE_PAGES,
+			      sbi->ll_ra_info.ra_range_pages);
 
-		if ((idx >= 4 && (idx & 3UL) == 0) || stride_detect)
+		if ((idx >= ra_range_pages &&
+		     idx % ra_range_pages == 0) || stride_detect)
 			ras->ras_need_increase_window = true;
 	} else if ((ras->ras_consecutive_requests > 1 || stride_detect)) {
 		ras->ras_need_increase_window = true;
@@ -1185,10 +1271,36 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
 	if (ras->ras_no_miss_check)
 		GOTO(out_unlock, 0);
 
-	if (flags & LL_RAS_MMAP)
+	if (flags & LL_RAS_MMAP) {
+		unsigned long ra_pages;
+
+		ras_detect_cluster_range(ras, sbi, index << PAGE_SHIFT,
+					 PAGE_SIZE);
 		ras_detect_read_pattern(ras, sbi, (loff_t)index << PAGE_SHIFT,
 					PAGE_SIZE, true);
 
+		/* we did not detect anything but we could prefetch */
+		if (!ras->ras_need_increase_window &&
+		    ras->ras_window_pages <= sbi->ll_ra_info.ra_range_pages &&
+		    ras->ras_range_requests >= 2) {
+			if (!hit) {
+				ra_pages = max_t(unsigned long,
+					RA_MIN_MMAP_RANGE_PAGES,
+					ras->ras_last_range_pages);
+				if (index < ra_pages / 2)
+					index = 0;
+				else
+					index -= ra_pages / 2;
+				ras->ras_window_pages = ra_pages;
+				ll_ra_stats_inc_sbi(sbi,
+					RA_STAT_MMAP_RANGE_READ);
+			} else {
+				ras->ras_window_pages = 0;
+			}
+			goto skip;
+		}
+	}
+
 	if (!hit && ras->ras_window_pages &&
 	    index < ras->ras_next_readahead_idx &&
 	    pos_in_window(index, ras->ras_window_start_idx, 0,
@@ -1227,6 +1339,8 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
 			GOTO(out_unlock, 0);
 		}
 	}
+
+skip:
 	ras_set_start(ras, index);
 
 	if (stride_io_mode(ras)) {
@@ -1495,8 +1609,12 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 	io_end_index = cl_index(io->ci_obj, io->u.ci_rw.crw_pos +
 				io->u.ci_rw.crw_count - 1);
 	if (ll_readahead_enabled(sbi) && ras) {
+		pgoff_t skip_index = 0;
+
+		if (ras->ras_next_readahead_idx < vvp_index(vpg))
+			skip_index = vvp_index(vpg);
 		rc2 = ll_readahead(env, io, &queue->c2_qin, ras,
-				   uptodate, file);
+				   uptodate, file, skip_index);
 		CDEBUG(D_READA, DFID " %d pages read ahead at %lu\n",
 		       PFID(ll_inode2fid(inode)), rc2, vvp_index(vpg));
 	} else if (vvp_index(vpg) == io_start_index &&
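
For readers who want to reproduce the targeted access pattern outside of fio, the sketch
below is a minimal userspace illustration (not part of the patch): it mmaps a file on a
Lustre mount, then repeatedly seeks to a random 1MB-aligned range and touches the pages
inside that range. Without range readahead each first touch after a seek faults in a
single page and issues a small RPC; with this change the detected range size is prefetched
in larger RPCs. The file path, file size, and loop counts are hypothetical example values.

/*
 * Illustrative sketch only: generate the "cluster read inside a range,
 * then seek" mmap pattern described in the commit message.
 * The path "/mnt/lustre/randread" and the iteration count are assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/mnt/lustre/randread";	/* hypothetical file */
	const size_t range = 1UL << 20;			/* 1MB cluster size */
	unsigned long sum = 0;
	struct stat st;
	int fd;

	fd = open(path, O_RDONLY);
	if (fd < 0 || fstat(fd, &st) < 0 || (size_t)st.st_size < range) {
		perror(path);
		return 1;
	}

	char *map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	srandom(42);
	for (int i = 0; i < 1024; i++) {
		/* seek: pick a random 1MB-aligned offset in the file */
		size_t start = (random() % (st.st_size / range)) * range;

		/* cluster read: touch every page inside the range; each
		 * first touch takes a page fault on the Lustre client */
		for (size_t off = 0; off < range; off += 4096)
			sum += (unsigned char)map[start + off];
	}

	munmap(map, st.st_size);
	close(fd);
	printf("checksum %lu\n", sum);
	return 0;
}

The tracked range size should be tunable through the read_ahead_range_kb attribute added
by this patch (for example via lctl set_param on the llite parameters), with 0 disabling
mmap range read as handled in read_ahead_range_kb_store(); the exact parameter path on a
given system is an assumption here and should be checked under the mounted client.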