From 5b911e03261c3de6b0c2934c86dd191f01af4f2f Mon Sep 17 00:00:00 2001 From: Bobi Jam Date: Wed, 21 Sep 2022 00:27:04 +0800 Subject: [PATCH] LU-16160 llite: clear stale page's uptodate bit With the truncate_inode_page()->do_invalidatepage()->ll_invalidatepage() call path, before the vmpage is deleted from the page cache, the page could possibly be picked up by ll_read_ahead_page()->grab_cache_page_nowait(). If ll_invalidatepage()->cl_page_delete() does not clear the vmpage's uptodate bit, the read ahead could pick it up and wrongly think it is already uptodate. In ll_fault()->vvp_io_fault_start()->vvp_io_kernel_fault(), the filemap_fault() will call ll_readpage() to read the vmpage and wait for the unlock of the vmpage; when ll_readpage() successfully reads the vmpage and then unlocks it, memory pressure or truncate can get in and delete the cl_page, after which filemap_fault() finds that the vmpage is not uptodate and VM_FAULT_SIGBUS gets returned. To fix this situation, this patch makes vvp_io_kernel_fault() restart filemap_fault() to get an uptodate vmpage again. 
Test-Parameters: testlist=sanityn env=ONLY="16f",ONLY_REPEAT=50 Test-Parameters: testlist=sanityn env=ONLY="16g",ONLY_REPEAT=50 Test-Parameters: testlist=sanityn env=ONLY="16f 16g",ONLY_REPEAT=50 Signed-off-by: Bobi Jam Change-Id: I369e1362ffb071ec0a4de3cd5bad27a87cff5e05 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/48607 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- lustre/include/cl_object.h | 15 +++- lustre/llite/rw.c | 10 ++- lustre/llite/vvp_io.c | 119 ++++++++++++++++++++++++++++--- lustre/llite/vvp_page.c | 5 ++ lustre/obdclass/cl_page.c | 40 ++++++++--- lustre/tests/rw_seq_cst_vs_drop_caches.c | 85 ++++++++++++++++++---- lustre/tests/sanityn.sh | 33 ++++++++- 7 files changed, 270 insertions(+), 37 deletions(-) diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h index b7de8eb..672004a 100644 --- a/lustre/include/cl_object.h +++ b/lustre/include/cl_object.h @@ -767,7 +767,15 @@ struct cl_page { enum cl_page_type cp_type:CP_TYPE_BITS; unsigned cp_defer_uptodate:1, cp_ra_updated:1, - cp_ra_used:1; + cp_ra_used:1, + /* fault page read grab extra referece */ + cp_fault_ref:1, + /** + * if fault page got delete before returned to + * filemap_fault(), defer the vmpage detach/put + * until filemap_fault() has been handled. + */ + cp_defer_detach:1; /* which slab kmem index this memory allocated from */ short int cp_kmem_index; @@ -2389,6 +2397,11 @@ static inline int cl_io_is_mkwrite(const struct cl_io *io) return io->ci_type == CIT_FAULT && io->u.ci_fault.ft_mkwrite; } +static inline int cl_io_is_pagefault(const struct cl_io *io) +{ + return io->ci_type == CIT_FAULT && !io->u.ci_fault.ft_mkwrite; +} + /** * True, iff \a io is a truncate(2). 
*/ diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index 66496e6..0f76a88 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -1943,7 +1943,15 @@ int ll_readpage(struct file *file, struct page *vmpage) unlock_page(vmpage); result = 0; } - cl_page_put(env, page); + if (cl_io_is_pagefault(io) && result == 0) { + /** + * page fault, retain the cl_page reference until + * vvp_io_kernel_fault() release it. + */ + page->cp_fault_ref = 1; + } else { + cl_page_put(env, page); + } } else { unlock_page(vmpage); result = PTR_ERR(page); diff --git a/lustre/llite/vvp_io.c b/lustre/llite/vvp_io.c index eda72e8..699e9a6 100644 --- a/lustre/llite/vvp_io.c +++ b/lustre/llite/vvp_io.c @@ -1386,14 +1386,42 @@ static void vvp_io_rw_end(const struct lu_env *env, trunc_sem_up_read(&lli->lli_trunc_sem); } -static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) +static void detach_and_deref_page(struct cl_page *clp, struct page *vmpage) +{ + if (!clp->cp_defer_detach) + return; + + /** + * cl_page_delete0() took a vmpage reference, but not unlink the vmpage + * from its cl_page. 
+ */ + clp->cp_defer_detach = 0; + ClearPagePrivate(vmpage); + vmpage->private = 0; + + put_page(vmpage); + atomic_dec(&clp->cp_ref); +} + +static int vvp_io_kernel_fault(const struct lu_env *env, + struct vvp_fault_io *cfio) { struct vm_fault *vmf = cfio->ft_vmf; + struct file *vmff = cfio->ft_vma->vm_file; + struct address_space *mapping = vmff->f_mapping; + struct inode *inode = mapping->host; + struct page *vmpage = NULL; + struct cl_page *clp = NULL; + int rc = 0; + ENTRY; + ll_inode_size_lock(inode); +retry: cfio->ft_flags = ll_filemap_fault(cfio->ft_vma, vmf); cfio->ft_flags_valid = 1; if (vmf->page) { + /* success, vmpage is locked */ LL_CDEBUG_PAGE(D_PAGE, vmf->page, "got addr %p type NOPAGE\n", get_vmf_address(vmf)); if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) { @@ -1403,25 +1431,100 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) cfio->ft_vmpage = vmf->page; - return 0; + /** + * ll_filemap_fault()->ll_readpage() could get an extra cl_page + * reference. So we have to get the cl_page's to check its + * cp_fault_ref and drop the reference later. + */ + clp = cl_vmpage_page(vmf->page, NULL); + + GOTO(unlock, rc = 0); + } + + /* filemap_fault() fails, vmpage is not locked */ + if (clp == NULL) { + vmpage = find_get_page(mapping, vmf->pgoff); + if (vmpage) { + lock_page(vmpage); + clp = cl_vmpage_page(vmpage, NULL); + unlock_page(vmpage); + } } if (cfio->ft_flags & VM_FAULT_SIGBUS) { + pgoff_t max_idx; + + /** + * ll_filemap_fault()->ll_readpage() could fill vmpage + * correctly, and unlock the vmpage, while memory pressure or + * truncate could detach cl_page from vmpage, and kernel + * filemap_fault() will wait_on_page_locked(vmpage) and find + * out that the vmpage has been cleared its uptodate bit, + * so it returns VM_FAULT_SIGBUS. + * + * In this case, we'd retry the filemap_fault()->ll_readpage() + * to rebuild the cl_page and fill vmpage with uptodated data. 
+ */ + if (likely(vmpage)) { + bool need_retry = false; + + if (clp) { + if (clp->cp_defer_detach) { + detach_and_deref_page(clp, vmpage); + /** + * check i_size to make sure it's not + * over EOF, we don't want to call + * filemap_fault() repeatedly since it + * returns VM_FAULT_SIGBUS without even + * trying if vmf->pgoff is over EOF. + */ + max_idx = DIV_ROUND_UP( + i_size_read(inode), PAGE_SIZE); + if (vmf->pgoff < max_idx) + need_retry = true; + } + if (clp->cp_fault_ref) { + clp->cp_fault_ref = 0; + /* ref not released in ll_readpage() */ + cl_page_put(env, clp); + } + if (need_retry) + goto retry; + } + } + CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", get_vmf_address(vmf)); - return -EFAULT; + GOTO(unlock, rc = -EFAULT); } if (cfio->ft_flags & VM_FAULT_OOM) { CDEBUG(D_PAGE, "got addr %p - OOM\n", get_vmf_address(vmf)); - return -ENOMEM; + GOTO(unlock, rc = -ENOMEM); } if (cfio->ft_flags & VM_FAULT_RETRY) - return -EAGAIN; + GOTO(unlock, rc = -EAGAIN); CERROR("unknown error in page fault %d\n", cfio->ft_flags); - - return -EINVAL; + rc = -EINVAL; +unlock: + ll_inode_size_unlock(inode); + if (clp) { + if (clp->cp_defer_detach && vmpage) + detach_and_deref_page(clp, vmpage); + + /* additional cl_page ref has been taken in ll_readpage() */ + if (clp->cp_fault_ref) { + clp->cp_fault_ref = 0; + /* ref not released in ll_readpage() */ + cl_page_put(env, clp); + } + /* ref taken in this function */ + cl_page_put(env, clp); + } + if (vmpage) + put_page(vmpage); + return rc; } static void mkwrite_commit_callback(const struct lu_env *env, struct cl_io *io, @@ -1462,7 +1565,7 @@ static int vvp_io_fault_start(const struct lu_env *env, LASSERT(cfio->ft_vmpage != NULL); lock_page(cfio->ft_vmpage); } else { - result = vvp_io_kernel_fault(cfio); + result = vvp_io_kernel_fault(env, cfio); if (result != 0) RETURN(result); } diff --git a/lustre/llite/vvp_page.c b/lustre/llite/vvp_page.c index 31edb33..f3840c4 100644 --- a/lustre/llite/vvp_page.c +++ b/lustre/llite/vvp_page.c @@ 
-108,6 +108,11 @@ static void vvp_page_completion_read(const struct lu_env *env, ll_ra_count_put(ll_i2sbi(inode), 1); if (ioret == 0) { + /** + * cp_defer_uptodate is used for readahead page, and the + * vmpage Uptodate bit is deferred to set in ll_readpage/ + * ll_io_read_page. + */ if (!cp->cp_defer_uptodate) SetPageUptodate(vmpage); } else if (cp->cp_defer_uptodate) { diff --git a/lustre/obdclass/cl_page.c b/lustre/obdclass/cl_page.c index 29a5bc1..91a879f 100644 --- a/lustre/obdclass/cl_page.c +++ b/lustre/obdclass/cl_page.c @@ -828,18 +828,36 @@ static void __cl_page_delete(const struct lu_env *env, struct cl_page *cp) LASSERT(PageLocked(vmpage)); LASSERT((struct cl_page *)vmpage->private == cp); - /* Drop the reference count held in vvp_page_init */ - refc = atomic_dec_return(&cp->cp_ref); - LASSERTF(refc >= 1, "page = %p, refc = %d\n", cp, refc); - - ClearPagePrivate(vmpage); - vmpage->private = 0; - - /* - * The reference from vmpage to cl_page is removed, - * but the reference back is still here. It is removed - * later in cl_page_free(). + /** + * clear vmpage uptodate bit, since ll_read_ahead_pages()-> + * ll_read_ahead_page() could pick up this stale vmpage and + * take it as uptodated. + */ + ClearPageUptodate(vmpage); + /** + * vvp_io_kernel_fault()->ll_readpage() set cp_fault_ref + * and need it to check cl_page to retry the page fault read. */ + if (cp->cp_fault_ref) { + cp->cp_defer_detach = 1; + /** + * get a vmpage reference, so that filemap_fault() + * won't free it from pagecache. + */ + get_page(vmpage); + } else { + /* Drop the reference count held in vvp_page_init */ + refc = atomic_dec_return(&cp->cp_ref); + LASSERTF(refc >= 1, "page = %p, refc = %d\n", cp, refc); + ClearPagePrivate(vmpage); + vmpage->private = 0; + + /* + * The reference from vmpage to cl_page is removed, + * but the reference back is still here. It is removed + * later in cl_page_free(). 
+ */ + } } EXIT; diff --git a/lustre/tests/rw_seq_cst_vs_drop_caches.c b/lustre/tests/rw_seq_cst_vs_drop_caches.c index 5f988bb..3cb9aba 100644 --- a/lustre/tests/rw_seq_cst_vs_drop_caches.c +++ b/lustre/tests/rw_seq_cst_vs_drop_caches.c @@ -11,7 +11,7 @@ #include /* - * Usage: rw_seq_cst_vs_drop_caches /mnt/lustre/file0 /mnt/lustre2/file0 + * Usage: rw_seq_cst_vs_drop_caches [-m] /mnt/lustre/file0 /mnt/lustre2/file0 * Race reads of the same file on two client mounts vs writes and drop * caches to detect sequential consistency violations. Run @@ -19,6 +19,15 @@ * which case the wait status ($?) will be 134. */ +int mmap_mode; /* -m flag */ + +void usage(void) +{ + fprintf(stdout, + "%s: rw_seq_cst_vs_drop_caches [-m] /mnt/lustre/file0 /mnt/lustre2/file0\n" +" -m : use mmap to read/write file\n", __func__); +} + #define handle_error(msg) \ do { perror(msg); exit(EXIT_FAILURE); } while (0) @@ -28,23 +37,39 @@ static int fd[2] = { -1, -1 }; */ static uint64_t u, u_max = UINT64_MAX / 2; static uint64_t v[2]; +char *ptr; static void *access_thread_start(void *unused) { + char *ptr2 = NULL; ssize_t rc; int i; + if (mmap_mode) { + ptr2 = mmap(NULL, sizeof(v[1]), PROT_READ, + MAP_PRIVATE | MAP_POPULATE, fd[1], 0); + if (ptr2 == MAP_FAILED) + handle_error("mmap"); + } + do { for (i = 0; i < 2; i++) { - rc = pread(fd[i], &v[i], sizeof(v[i]), 0); - if (rc < 0 || rc != sizeof(v[i])) - handle_error("pread"); + if (mmap_mode) { + memcpy(&v[i], i == 0 ? 
ptr : ptr2, + sizeof(v[i])); + } else { + rc = pread(fd[i], &v[i], sizeof(v[i]), 0); + if (rc < 0 || rc != sizeof(v[i])) + handle_error("pread"); + } } } while (v[0] <= v[1]); fprintf(stderr, "error: u = %"PRIu64", v = %"PRIu64", %"PRIu64"\n", u, v[0], v[1]); + if (mmap_mode) + munmap(ptr2, sizeof(v[i])); abort(); } @@ -56,12 +81,26 @@ int main(int argc, char *argv[]) pthread_t access_thread; struct stat st[2]; ssize_t rc; - int i; + int i, ch; setvbuf(stderr, stderr_buf, _IOLBF, sizeof(stderr_buf)); - if (argc != 3) { - fprintf(stderr, "Usage: %s /mnt/lustre/file0 /mnt/lustre2/file0\n", argv[0]); + while ((ch = getopt(argc, argv, "m")) >= 0) { + switch (ch) { + case 'm': + mmap_mode = 1; + break; + default: + usage(); + exit(EXIT_FAILURE); + } + } + + argc -= optind; + argv += optind; + + if (argc != 2) { + usage(); exit(EXIT_FAILURE); } @@ -69,7 +108,7 @@ int main(int argc, char *argv[]) assert(!(drop_caches_fd < 0)); for (i = 0; i < 2; i++) { - fd[i] = open(argv[i + 1], O_RDWR|O_CREAT|O_TRUNC, 0666); + fd[i] = open(argv[i], O_RDWR|O_CREAT|O_TRUNC, 0666); if (fd[i] < 0) handle_error("open"); @@ -86,18 +125,33 @@ int main(int argc, char *argv[]) exit(EXIT_FAILURE); } - rc = pwrite(fd[0], &u, sizeof(u), 0); - if (rc < 0 || rc != sizeof(u)) - handle_error("pwrite"); + if (mmap_mode) { + if (ftruncate(fd[0], sizeof(u)) < 0) + handle_error("ftruncate"); + + ptr = mmap(NULL, sizeof(u), PROT_READ|PROT_WRITE, MAP_SHARED, + fd[0], 0); + if (ptr == MAP_FAILED) + handle_error("mmap"); + memcpy(ptr, &u, sizeof(u)); + } else { + rc = pwrite(fd[0], &u, sizeof(u), 0); + if (rc < 0 || rc != sizeof(u)) + handle_error("pwrite"); + } rc = pthread_create(&access_thread, NULL, &access_thread_start, NULL); if (rc != 0) handle_error("pthread_create"); for (u = 1; u <= u_max; u++) { - rc = pwrite(fd[0], &u, sizeof(u), 0); - if (rc < 0 || rc != sizeof(u)) - handle_error("pwrite"); + if (mmap_mode) { + memcpy(ptr, &u, sizeof(u)); + } else { + rc = pwrite(fd[0], &u, sizeof(u), 0); + if (rc 
< 0 || rc != sizeof(u)) + handle_error("pwrite"); + } rc = write(drop_caches_fd, "3\n", 2); if (rc < 0 || rc != 2) @@ -112,5 +166,8 @@ int main(int argc, char *argv[]) if (rc != 0) handle_error("pthread_join"); + if (mmap_mode) + munmap(ptr, sizeof(u)); + return 0; } diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh index ae75881..25158d0 100755 --- a/lustre/tests/sanityn.sh +++ b/lustre/tests/sanityn.sh @@ -18,8 +18,8 @@ init_test_env $@ init_logging ALWAYS_EXCEPT="$SANITYN_EXCEPT " -# bug number for skipped test: LU-7105 LU-14541 -ALWAYS_EXCEPT+=" 28 16f" +# bug number for skipped test: LU-7105 +ALWAYS_EXCEPT+=" 28 " # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! if [ $mds1_FSTYPE = "zfs" ]; then @@ -586,6 +586,35 @@ test_16f() { # LU-14541 } run_test 16f "rw sequential consistency vs drop_caches" +test_16g() { + local file1=$DIR1/$tfile + local file2=$DIR2/$tfile + local duration=20 + local status + + timeout --preserve-status --signal=USR1 $duration \ + rw_seq_cst_vs_drop_caches -m $file1 $file2 + status=$? + + case $((status & 0x7f)) in + 0) + echo OK # Computers must be fast now. + ;; + 6) # SIGABRT + error "sequential consistency violation detected" + ;; + 10) # SIGUSR1 + echo TIMEOUT # This is fine. + ;; + *) + error "strange status '$status'" + ;; + esac + + rm -f $file1 +} +run_test 16g "mmap rw sequential consistency vs drop_caches" + test_16h() { local tf=$DIR/$tdir/$tfile local tf2=$DIR2/$tdir/$tfile -- 1.8.3.1