From: Andrew Perepechko Date: Fri, 3 Mar 2023 22:11:50 +0000 (-0500) Subject: LU-16160 llite: SIGBUS is possible on a race with page reclaim X-Git-Tag: 2.15.3-RC1~42 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=b0a6d4d08e19d06661deabdb7278f07662d8b6e8;p=fs%2Flustre-release.git LU-16160 llite: SIGBUS is possible on a race with page reclaim We can restart fault handling if page truncation happens in parallel with the fault handler. Lustre-commit: I6e60783e3334f87e799dc8b0e6e63d0bb358a236 Lustre-change: https://review.whamcloud.com/49647 Also included sanityn test from: LU-16160 llite: clear stale page's uptodate bit 5b911e03261c3de6b0c2934c86dd191f01af4f2f https://review.whamcloud.com/48607 Change-Id: I6e60783e3334f87e799dc8b0e6e63d0bb358a236 Signed-off-by: Andrew Perepechko Signed-off-by: Patrick Farrell Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/50202 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/lustre_compat.h b/lustre/include/lustre_compat.h index 1157ed4..4a9aa4b 100644 --- a/lustre/include/lustre_compat.h +++ b/lustre/include/lustre_compat.h @@ -367,9 +367,9 @@ static inline void __user *get_vmf_address(struct vm_fault *vmf) } #ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY -# define ll_filemap_fault(vma, vmf) filemap_fault(vmf) +# define __ll_filemap_fault(vma, vmf) filemap_fault(vmf) #else -# define ll_filemap_fault(vma, vmf) filemap_fault(vma, vmf) +# define __ll_filemap_fault(vma, vmf) filemap_fault(vma, vmf) #endif #ifndef HAVE_CURRENT_TIME diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index 5a7da7e..ab5311a 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -271,6 +272,7 @@ struct ll_inode_info { struct mutex lli_xattrs_enq_lock; struct list_head lli_xattrs; /* ll_xattr_entry->xe_list */ struct list_head lli_lccs; /* list of ll_cl_context */ + seqlock_t lli_page_inv_lock; }; #ifndef HAVE_USER_NAMESPACE_ARG @@ -1799,4 +1801,6 @@ int ll_manage_foreign(struct inode *inode, struct lustre_md *lmd); bool ll_foreign_is_openable(struct dentry *dentry, unsigned int flags); bool ll_foreign_is_removable(struct dentry *dentry, bool unset); +int ll_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf); + #endif /* LLITE_INTERNAL_H */ diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 6c6bee6..ab68075 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -1241,6 +1241,7 @@ void ll_lli_init(struct ll_inode_info *lli) memset(lli->lli_jobid, 0, sizeof(lli->lli_jobid)); /* ll_cl_context initialize */ INIT_LIST_HEAD(&lli->lli_lccs); + seqlock_init(&lli->lli_page_inv_lock); } #define MAX_STRING_SIZE 128 diff --git a/lustre/llite/llite_mmap.c b/lustre/llite/llite_mmap.c index d976c25..a68915e 100644 --- a/lustre/llite/llite_mmap.c +++ b/lustre/llite/llite_mmap.c @@ -250,6 +250,25 @@ static inline int to_fault_error(int result) return result; } +int ll_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + int ret; + unsigned int seq; + + /* this seqlock lets us notice if a page has been deleted on this inode + * during the fault process, allowing us to catch an erroneous SIGBUS + * See LU-16160 + */ + do { + seq = read_seqbegin(&ll_i2info(inode)->lli_page_inv_lock); + ret = __ll_filemap_fault(vma, vmf); + } while (read_seqretry(&ll_i2info(inode)->lli_page_inv_lock, seq) && + (ret & VM_FAULT_SIGBUS)); + + return ret; +} + /** * Lustre implementation of a vm_operations_struct::fault() method, called by * VM to server page fault (both in kernel and user space). diff --git a/lustre/llite/vvp_page.c b/lustre/llite/vvp_page.c index af4ff2e..5ee33e5 100644 --- a/lustre/llite/vvp_page.c +++ b/lustre/llite/vvp_page.c @@ -153,29 +153,6 @@ static void vvp_page_discard(const struct lu_env *env, generic_error_remove_page(vmpage->mapping, vmpage); } -static void vvp_page_delete(const struct lu_env *env, - const struct cl_page_slice *slice) -{ - struct page *vmpage = cl2vm_page(slice); - struct cl_page *page = slice->cpl_page; - int refc; - - LASSERT(PageLocked(vmpage)); - LASSERT((struct cl_page *)vmpage->private == page); - - - /* Drop the reference count held in vvp_page_init */ - refc = atomic_dec_return(&page->cp_ref); - LASSERTF(refc >= 1, "page = %p, refc = %d\n", page, refc); - - ClearPagePrivate(vmpage); - vmpage->private = 0; - /* - * Reference from vmpage to cl_page is removed, but the reference back - * is still here. It is removed later in vvp_page_fini(). - */ -} - static void vvp_page_export(const struct lu_env *env, const struct cl_page_slice *slice, int uptodate) @@ -224,6 +201,41 @@ static int vvp_page_prep_write(const struct lu_env *env, return 0; } +static void vvp_page_delete(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct cl_page *cp = slice->cpl_page; + + if (cp->cp_type == CPT_CACHEABLE) { + struct page *vmpage = cp->cp_vmpage; + struct inode *inode = vmpage->mapping->host; + + LASSERT(PageLocked(vmpage)); + LASSERT((struct cl_page *)vmpage->private == cp); + + /* Drop the reference count held in vvp_page_init */ + atomic_dec(&cp->cp_ref); + + ClearPagePrivate(vmpage); + vmpage->private = 0; + + /* clearpageuptodate prevents the page being read by the + * kernel after it has been deleted from Lustre, which avoids + * potential stale data reads. The seqlock allows us to see + * that a page was potentially deleted and catch the resulting + * SIGBUS - see ll_filemap_fault() (LU-16160) */ + write_seqlock(&ll_i2info(inode)->lli_page_inv_lock); + ClearPageUptodate(vmpage); + write_sequnlock(&ll_i2info(inode)->lli_page_inv_lock); + + /* + * The reference from vmpage to cl_page is removed, + * but the reference back is still here. It is removed + * later in cl_page_free(). + */ + } +} + /** * Handles page transfer errors at VM level. * diff --git a/lustre/tests/rw_seq_cst_vs_drop_caches.c b/lustre/tests/rw_seq_cst_vs_drop_caches.c index 5f988bb..3cb9aba 100644 --- a/lustre/tests/rw_seq_cst_vs_drop_caches.c +++ b/lustre/tests/rw_seq_cst_vs_drop_caches.c @@ -11,7 +11,7 @@ #include /* - * Usage: rw_seq_cst_vs_drop_caches /mnt/lustre/file0 /mnt/lustre2/file0 + * Usage: rw_seq_cst_vs_drop_caches [-m] /mnt/lustre/file0 /mnt/lustre2/file0 * Race reads of the same file on two client mounts vs writes and drop * caches to detect sequential consistency violations. Run @@ -19,6 +19,15 @@ * which case the wait status ($?) will be 134. */ +int mmap_mode; /* -m flag */ + +void usage(void) +{ + fprintf(stdout, + "%s: rw_seq_cst_vs_drop_caches [-m] /mnt/lustre/file0 /mnt/lustre2/file0\n" +" -m : use mmap to read/write file\n", __func__); +} + #define handle_error(msg) \ do { perror(msg); exit(EXIT_FAILURE); } while (0) @@ -28,23 +37,39 @@ static int fd[2] = { -1, -1 }; */ static uint64_t u, u_max = UINT64_MAX / 2; static uint64_t v[2]; +char *ptr; static void *access_thread_start(void *unused) { + char *ptr2 = NULL; ssize_t rc; int i; + if (mmap_mode) { + ptr2 = mmap(NULL, sizeof(v[1]), PROT_READ, + MAP_PRIVATE | MAP_POPULATE, fd[1], 0); + if (ptr2 == MAP_FAILED) + handle_error("mmap"); + } + do { for (i = 0; i < 2; i++) { - rc = pread(fd[i], &v[i], sizeof(v[i]), 0); - if (rc < 0 || rc != sizeof(v[i])) - handle_error("pread"); + if (mmap_mode) { + memcpy(&v[i], i == 0 ? ptr : ptr2, + sizeof(v[i])); + } else { + rc = pread(fd[i], &v[i], sizeof(v[i]), 0); + if (rc < 0 || rc != sizeof(v[i])) + handle_error("pread"); + } } } while (v[0] <= v[1]); fprintf(stderr, "error: u = %"PRIu64", v = %"PRIu64", %"PRIu64"\n", u, v[0], v[1]); + if (mmap_mode) + munmap(ptr2, sizeof(v[i])); abort(); } @@ -56,12 +81,26 @@ int main(int argc, char *argv[]) pthread_t access_thread; struct stat st[2]; ssize_t rc; - int i; + int i, ch; setvbuf(stderr, stderr_buf, _IOLBF, sizeof(stderr_buf)); - if (argc != 3) { - fprintf(stderr, "Usage: %s /mnt/lustre/file0 /mnt/lustre2/file0\n", argv[0]); + while ((ch = getopt(argc, argv, "m")) >= 0) { + switch (ch) { + case 'm': + mmap_mode = 1; + break; + default: + usage(); + exit(EXIT_FAILURE); + } + } + + argc -= optind; + argv += optind; + + if (argc != 2) { + usage(); exit(EXIT_FAILURE); } @@ -69,7 +108,7 @@ int main(int argc, char *argv[]) assert(!(drop_caches_fd < 0)); for (i = 0; i < 2; i++) { - fd[i] = open(argv[i + 1], O_RDWR|O_CREAT|O_TRUNC, 0666); + fd[i] = open(argv[i], O_RDWR|O_CREAT|O_TRUNC, 0666); if (fd[i] < 0) handle_error("open"); @@ -86,18 +125,33 @@ int main(int argc, char *argv[]) exit(EXIT_FAILURE); } - rc = pwrite(fd[0], &u, sizeof(u), 0); - if (rc < 0 || rc != sizeof(u)) - handle_error("pwrite"); + if (mmap_mode) { + if (ftruncate(fd[0], sizeof(u)) < 0) + handle_error("ftruncate"); + + ptr = mmap(NULL, sizeof(u), PROT_READ|PROT_WRITE, MAP_SHARED, + fd[0], 0); + if (ptr == MAP_FAILED) + handle_error("mmap"); + memcpy(ptr, &u, sizeof(u)); + } else { + rc = pwrite(fd[0], &u, sizeof(u), 0); + if (rc < 0 || rc != sizeof(u)) + handle_error("pwrite"); + } rc = pthread_create(&access_thread, NULL, &access_thread_start, NULL); if (rc != 0) handle_error("pthread_create"); for (u = 1; u <= u_max; u++) { - rc = pwrite(fd[0], &u, sizeof(u), 0); - if (rc < 0 || rc != sizeof(u)) - handle_error("pwrite"); + if (mmap_mode) { + memcpy(ptr, &u, sizeof(u)); + } else { + rc = pwrite(fd[0], &u, sizeof(u), 0); + if (rc < 0 || rc != sizeof(u)) + handle_error("pwrite"); + } rc = write(drop_caches_fd, "3\n", 2); if (rc < 0 || rc != 2) @@ -112,5 +166,8 @@ int main(int argc, char *argv[]) if (rc != 0) handle_error("pthread_join"); + if (mmap_mode) + munmap(ptr, sizeof(u)); + return 0; } diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh index b70d809..11f597c 100755 --- a/lustre/tests/sanityn.sh +++ b/lustre/tests/sanityn.sh @@ -18,8 +18,8 @@ init_test_env $@ init_logging ALWAYS_EXCEPT="$SANITYN_EXCEPT " -# bug number for skipped test: LU-7105 LU-14541 -ALWAYS_EXCEPT+=" 28 16f" +# bug number for skipped test: LU-7105 +ALWAYS_EXCEPT+=" 28 " # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! # skip tests for PPC until they are fixed @@ -564,6 +564,35 @@ test_16f() { # LU-14541 } run_test 16f "rw sequential consistency vs drop_caches" +test_16g() { + local file1=$DIR1/$tfile + local file2=$DIR2/$tfile + local duration=20 + local status + + timeout --preserve-status --signal=USR1 $duration \ + rw_seq_cst_vs_drop_caches -m $file1 $file2 + status=$? + + case $((status & 0x7f)) in + 0) + echo OK # Computers must be fast now. + ;; + 6) # SIGABRT + error "sequential consistency violation detected" + ;; + 10) # SIGUSR1 + echo TIMEOUT # This is fine. + ;; + *) + error "strange status '$status'" + ;; + esac + + rm -f $file1 +} +run_test 16g "mmap rw sequential consistency vs drop_caches" + test_17() { # bug 3513, 3667 remote_ost_nodsh && skip "remote OST with nodsh" && return