From 7542820698696ed5853ded30c9bf7fd5a78f0937 Mon Sep 17 00:00:00 2001
From: Wang Shilong
Date: Sat, 9 Jan 2021 18:28:43 +0800
Subject: [PATCH] LU-13669 llite: make readahead aware of hints

Calling madvise(MADV_SEQUENTIAL) and madvise(MADV_RANDOM) sets the
VM_SEQ_READ and VM_RAND_READ hints in vma->vm_flags.  These should be
used to guide the Lustre readahead for better performance.

Disable the kernel readahead for mmap() pages and use the llite
readahead instead.

There was also a bug in ll_fault0() that could set both VM_SEQ_READ
and VM_RAND_READ at the same time, which confused the detection of
the VM_SEQ_READ case, since VM_RAND_READ was checked first.

This changes the mmap readahead from submitting mostly 4KB RPCs to a
large number of 1MB RPCs for the application profiled:

llite.*.read_ahead_stats      before    patched
------------------------      ------    -------
hits                            2408     135924   samples [pages]
misses                         34160       2384   samples [pages]

osc.*.rpc_stats           read before       read patched
---------------          -------------     --------------
pages per rpc            rpcs  %  cum%     rpcs  %  cum%
1:                       6542 95    95      351 55    55
2:                        224  3    99       76 12    67
4:                         32  0    99       28  4    72
8:                          2  0    99        9  1    73
16:                        25  0    99       32  5    78
32:                         0  0    99        8  1    80
64:                         0  0    99        5  0    80
128:                        0  0    99       15  2    83
256:                        2  0    99      102 16    99
512:                        0  0    99        0  0    99
1024:                       1  0   100        3  0   100

The readahead hit rate improved from 6% to 98%, 4KB RPCs dropped from
95% to 55%, and 1MB+ RPCs increased from 0% to 16% (79% of all pages).

Add debug to ll_file_mmap(), ll_fault() and ll_fault_io_init() to
allow tracing the VMA state in these functions for future IO
optimizations.

Fixes: 62ef9c949753 ("add 2.6.27 kernel support")
Signed-off-by: Wang Shilong
Change-Id: I4bbb028db05b21ae01dafe6a7bea398e9b74d8a4
Reviewed-on: https://review.whamcloud.com/41228
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Wang Shilong
Reviewed-by: Oleg Drokin
---
 lustre/include/cl_object.h | 10 +++++++-
 lustre/llite/file.c        |  2 ++
 lustre/llite/llite_mmap.c  | 59 +++++++++++++++++++++++-----------------------
 lustre/llite/rw.c          | 20 ++++++++++++----
 4 files changed, 57 insertions(+), 34 deletions(-)

diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h
index b1bcaaf..05c89eb 100644
--- a/lustre/include/cl_object.h
+++ b/lustre/include/cl_object.h
@@ -1964,7 +1964,15 @@ struct cl_io {
 	 * the read IO will check to-be-read OSCs' status, and make fast-switch
 	 * another mirror if some of the OSTs are not healthy.
 	 */
-	ci_tried_all_mirrors:1;
+	ci_tried_all_mirrors:1,
+	/**
+	 * Random read hints, readahead will be disabled.
+	 */
+	ci_rand_read:1,
+	/**
+	 * Sequential read hints.
+	 */
+	ci_seq_read:1;
 	/**
 	 * Bypass quota check
 	 */
diff --git a/lustre/llite/file.c b/lustre/llite/file.c
index 2e78b2d..2e0ca9e 100644
--- a/lustre/llite/file.c
+++ b/lustre/llite/file.c
@@ -730,6 +730,8 @@ static int ll_local_open(struct file *file, struct lookup_intent *it,
 	file->private_data = fd;
 	ll_readahead_init(inode, &fd->fd_ras);
 	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
+	/* turn off the kernel's read-ahead */
+	file->f_ra.ra_pages = 0;
 
 	/* ll_cl_context initialize */
 	rwlock_init(&fd->fd_lock);
diff --git a/lustre/llite/llite_mmap.c b/lustre/llite/llite_mmap.c
index 242f4f4..8ac6698 100644
--- a/lustre/llite/llite_mmap.c
+++ b/lustre/llite/llite_mmap.c
@@ -76,13 +76,11 @@ struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
  * \param env - corespondent lu_env to processing
  * \param vma - virtual memory area addressed to page fault
  * \param index - page index corespondent to fault.
- * \parm ra_flags - vma readahead flags.
  *
  * \return error codes from cl_io_init.
  */
 static struct cl_io *
-ll_fault_io_init(struct lu_env *env, struct vm_area_struct *vma,
-                 pgoff_t index, unsigned long *ra_flags)
+ll_fault_io_init(struct lu_env *env, struct vm_area_struct *vma, pgoff_t index)
 {
 	struct file *file = vma->vm_file;
 	struct inode *inode = file_inode(file);
@@ -91,30 +89,27 @@ ll_fault_io_init(struct lu_env *env, struct vm_area_struct *vma,
 	int rc;
 	ENTRY;
 
-        if (ll_file_nolock(file))
-                RETURN(ERR_PTR(-EOPNOTSUPP));
+	if (ll_file_nolock(file))
+		RETURN(ERR_PTR(-EOPNOTSUPP));
 
 restart:
 	io = vvp_env_thread_io(env);
-        io->ci_obj = ll_i2info(inode)->lli_clob;
-        LASSERT(io->ci_obj != NULL);
-
-        fio = &io->u.ci_fault;
-        fio->ft_index = index;
-        fio->ft_executable = vma->vm_flags&VM_EXEC;
-
-        /*
-         * disable VM_SEQ_READ and use VM_RAND_READ to make sure that
-         * the kernel will not read other pages not covered by ldlm in
-         * filemap_nopage. we do our readahead in ll_readpage.
-         */
-        if (ra_flags != NULL)
-                *ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ);
-        vma->vm_flags &= ~VM_SEQ_READ;
-        vma->vm_flags |= VM_RAND_READ;
-
-        CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags,
-               fio->ft_index, fio->ft_executable);
+	io->ci_obj = ll_i2info(inode)->lli_clob;
+	LASSERT(io->ci_obj != NULL);
+
+	fio = &io->u.ci_fault;
+	fio->ft_index = index;
+	fio->ft_executable = vma->vm_flags & VM_EXEC;
+
+	CDEBUG(D_MMAP,
+	       DFID": vma=%p start=%#lx end=%#lx vm_flags=%#lx idx=%lu\n",
+	       PFID(&ll_i2info(inode)->lli_fid), vma, vma->vm_start,
+	       vma->vm_end, vma->vm_flags, fio->ft_index);
+
+	if (vma->vm_flags & VM_SEQ_READ)
+		io->ci_seq_read = 1;
+	else if (vma->vm_flags & VM_RAND_READ)
+		io->ci_rand_read = 1;
 
 	rc = cl_io_init(env, io, CIT_FAULT, io->ci_obj);
 	if (rc == 0) {
@@ -158,7 +153,7 @@ static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage,
 	if (IS_ERR(env))
 		RETURN(PTR_ERR(env));
 
-	io = ll_fault_io_init(env, vma, vmpage->index, NULL);
+	io = ll_fault_io_init(env, vma, vmpage->index);
 	if (IS_ERR(io))
 		GOTO(out, result = PTR_ERR(io));
 
@@ -268,7 +263,6 @@ static vm_fault_t ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct cl_io *io;
 	struct vvp_io *vio = NULL;
 	struct page *vmpage;
-	unsigned long ra_flags;
 	int result = 0;
 	int fault_ret = 0;
 	__u16 refcheck;
@@ -302,7 +296,7 @@ static vm_fault_t ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf)
 		fault_ret = 0;
 	}
 
-	io = ll_fault_io_init(env, vma, vmf->pgoff, &ra_flags);
+	io = ll_fault_io_init(env, vma, vmf->pgoff);
 	if (IS_ERR(io))
 		GOTO(out, result = PTR_ERR(io));
 
@@ -335,8 +329,6 @@ static vm_fault_t ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf)
 	}
 	cl_io_fini(env, io);
-	vma->vm_flags |= ra_flags;
-
 
 out:
 	cl_env_put(env, &refcheck);
 	if (result != 0 && !(fault_ret & VM_FAULT_RETRY))
@@ -365,6 +357,10 @@ static vm_fault_t ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (cached)
 		goto out;
 
+	CDEBUG(D_MMAP, DFID": vma=%p start=%#lx end=%#lx vm_flags=%#lx\n",
+	       PFID(&ll_i2info(file_inode(vma->vm_file))->lli_fid),
+	       vma, vma->vm_start, vma->vm_end, vma->vm_flags);
+
 	/* Only SIGKILL and SIGTERM is allowed for fault/nopage/mkwrite
 	 * so that it can be killed by admin but not cause segfault by
 	 * other signals.
@@ -375,6 +371,7 @@ static vm_fault_t ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	/* make sure offset is not a negative number */
 	if (vmf->pgoff > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
 		return VM_FAULT_SIGBUS;
+
 restart:
 	result = ll_fault0(vma, vmf);
 	if (vmf->page &&
@@ -547,6 +544,10 @@ int ll_file_mmap(struct file *file, struct vm_area_struct * vma)
 	int rc;
 
 	ENTRY;
+	CDEBUG(D_VFSTRACE | D_MMAP,
+	       "VFS_Op: fid="DFID" vma=%p start=%#lx end=%#lx vm_flags=%#lx\n",
+	       PFID(&ll_i2info(inode)->lli_fid),
+	       vma, vma->vm_start, vma->vm_end, vma->vm_flags);
 
 	if (ll_file_nolock(file))
 		RETURN(-EOPNOTSUPP);
diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c
index daebffa..9202524 100644
--- a/lustre/llite/rw.c
+++ b/lustre/llite/rw.c
@@ -1249,7 +1249,7 @@ static bool index_in_stride_window(struct ll_readahead_state *ras,
  */
 static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
 		       struct ll_readahead_state *ras, pgoff_t index,
-		       enum ras_update_flags flags)
+		       enum ras_update_flags flags, struct cl_io *io)
 {
 	struct ll_ra_info *ra = &sbi->ll_ra_info;
 	bool hit = flags & LL_RAS_HIT;
@@ -1271,6 +1271,18 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
 	if (ras->ras_no_miss_check)
 		GOTO(out_unlock, 0);
 
+	if (io && io->ci_rand_read)
+		GOTO(out_unlock, 0);
+
+	if (io && io->ci_seq_read) {
+		if (!hit) {
+			/* to avoid many small read RPC here */
+			ras->ras_window_pages = sbi->ll_ra_info.ra_range_pages;
+			ll_ra_stats_inc_sbi(sbi, RA_STAT_MMAP_RANGE_READ);
+		}
+		goto skip;
+	}
+
 	if (flags & LL_RAS_MMAP) {
 		unsigned long ra_pages;
 
@@ -1589,7 +1601,7 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 			flags |= LL_RAS_HIT;
 		if (!vio->vui_ra_valid)
 			flags |= LL_RAS_MMAP;
-		ras_update(sbi, inode, ras, vvp_index(vpg), flags);
+		ras_update(sbi, inode, ras, vvp_index(vpg), flags, io);
 	}
 
 	cl_2queue_init(queue);
@@ -1608,7 +1620,7 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 	io_start_index = cl_index(io->ci_obj, io->u.ci_rw.crw_pos);
 	io_end_index = cl_index(io->ci_obj, io->u.ci_rw.crw_pos +
 				io->u.ci_rw.crw_count - 1);
-	if (ll_readahead_enabled(sbi) && ras) {
+	if (ll_readahead_enabled(sbi) && ras && !io->ci_rand_read) {
 		pgoff_t skip_index = 0;
 
 		if (ras->ras_next_readahead_idx < vvp_index(vpg))
@@ -1794,7 +1806,7 @@ int ll_readpage(struct file *file, struct page *vmpage)
 		/* For fast read, it updates read ahead state only
 		 * if the page is hit in cache because non cache page
 		 * case will be handled by slow read later. */
-		ras_update(sbi, inode, ras, vvp_index(vpg), flags);
+		ras_update(sbi, inode, ras, vvp_index(vpg), flags, io);
 		/* avoid duplicate ras_update() call */
 		vpg->vpg_ra_updated = 1;
 
-- 
1.8.3.1
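
Editor's note: as a reference for how applications trigger the hints this patch consumes,
here is a minimal userspace sketch (not part of the patch).  madvise(MADV_SEQUENTIAL)
sets VM_SEQ_READ on the mapping, which the patched ll_fault_io_init() translates into
io->ci_seq_read; MADV_RANDOM would set VM_RAND_READ and disable llite readahead instead.
The file path is illustrative and the 4096-byte stride assumes 4KB pages.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	/* illustrative path on a Lustre client mount */
	const char *path = argc > 1 ? argv[1] : "/mnt/lustre/datafile";
	unsigned long sum = 0;
	unsigned char *buf;
	struct stat st;
	int fd;

	fd = open(path, O_RDONLY);
	if (fd < 0 || fstat(fd, &st) < 0) {
		perror(path);
		return 1;
	}

	buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* declare a front-to-back access pattern: sets VM_SEQ_READ */
	if (madvise(buf, st.st_size, MADV_SEQUENTIAL) < 0)
		perror("madvise");

	/* fault in every page; reads now take the sequential readahead path */
	for (off_t off = 0; off < st.st_size; off += 4096)
		sum += buf[off];

	printf("read %lld bytes, checksum %lu\n", (long long)st.st_size, sum);
	munmap(buf, st.st_size);
	close(fd);
	return 0;
}

With a mapping like this, the effect should show up in the same counters quoted in the
commit message, llite.*.read_ahead_stats and osc.*.rpc_stats, as more readahead hits
and larger pages-per-RPC buckets.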