Whamcloud - gitweb
LU-16160 llite: SIGBUS is possible on a race with page reclaim 02/50202/2
authorAndrew Perepechko <andrew.perepechko@hpe.com>
Fri, 3 Mar 2023 22:11:50 +0000 (17:11 -0500)
committerOleg Drokin <green@whamcloud.com>
Tue, 11 Apr 2023 00:06:24 +0000 (00:06 +0000)
We can restart fault handling if page truncation happens
in parallel with the fault handler.

Lustre-commit: I6e60783e3334f87e799dc8b0e6e63d0bb358a236
Lustre-change: https://review.whamcloud.com/49647

Also included sanityn test from:
LU-16160 llite: clear stale page's uptodate bit
5b911e03261c3de6b0c2934c86dd191f01af4f2f
https://review.whamcloud.com/48607

Change-Id: I6e60783e3334f87e799dc8b0e6e63d0bb358a236
Signed-off-by: Andrew Perepechko <andrew.perepechko@hpe.com>
Signed-off-by: Patrick Farrell <farr0186@gmail.com>
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/50202
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/lustre_compat.h
lustre/llite/llite_internal.h
lustre/llite/llite_lib.c
lustre/llite/llite_mmap.c
lustre/llite/vvp_page.c
lustre/tests/rw_seq_cst_vs_drop_caches.c
lustre/tests/sanityn.sh

index 1157ed4..4a9aa4b 100644 (file)
@@ -367,9 +367,9 @@ static inline void __user *get_vmf_address(struct vm_fault *vmf)
 }
 
 #ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY
-# define ll_filemap_fault(vma, vmf) filemap_fault(vmf)
+# define __ll_filemap_fault(vma, vmf) filemap_fault(vmf)
 #else
-# define ll_filemap_fault(vma, vmf) filemap_fault(vma, vmf)
+# define __ll_filemap_fault(vma, vmf) filemap_fault(vma, vmf)
 #endif
 
 #ifndef HAVE_CURRENT_TIME
index 5a7da7e..ab5311a 100644 (file)
@@ -43,6 +43,7 @@
 #include <linux/compat.h>
 #include <linux/aio.h>
 #include <linux/parser.h>
+#include <linux/seqlock.h>
 #include <lustre_compat.h>
 #include <lustre_crypto.h>
 #include <range_lock.h>
@@ -271,6 +272,7 @@ struct ll_inode_info {
        struct mutex                    lli_xattrs_enq_lock;
        struct list_head                lli_xattrs; /* ll_xattr_entry->xe_list */
        struct list_head                lli_lccs; /* list of ll_cl_context */
+       seqlock_t                       lli_page_inv_lock;
 };
 
 #ifndef HAVE_USER_NAMESPACE_ARG
@@ -1799,4 +1801,6 @@ int ll_manage_foreign(struct inode *inode, struct lustre_md *lmd);
 bool ll_foreign_is_openable(struct dentry *dentry, unsigned int flags);
 bool ll_foreign_is_removable(struct dentry *dentry, bool unset);
 
+int ll_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
+
 #endif /* LLITE_INTERNAL_H */
index 6c6bee6..ab68075 100644 (file)
@@ -1241,6 +1241,7 @@ void ll_lli_init(struct ll_inode_info *lli)
        memset(lli->lli_jobid, 0, sizeof(lli->lli_jobid));
        /* ll_cl_context initialize */
        INIT_LIST_HEAD(&lli->lli_lccs);
+       seqlock_init(&lli->lli_page_inv_lock);
 }
 
 #define MAX_STRING_SIZE 128
index d976c25..a68915e 100644 (file)
@@ -250,6 +250,25 @@ static inline int to_fault_error(int result)
        return result;
 }
 
+int ll_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct inode *inode = file_inode(vma->vm_file);
+       int ret;
+       unsigned int seq;
+
+       /* this seqlock lets us notice if a page has been deleted on this inode
+        * during the fault process, allowing us to catch an erroneous SIGBUS
+        * See LU-16160
+        */
+       do {
+               seq = read_seqbegin(&ll_i2info(inode)->lli_page_inv_lock);
+               ret = __ll_filemap_fault(vma, vmf);
+       } while (read_seqretry(&ll_i2info(inode)->lli_page_inv_lock, seq) &&
+                (ret & VM_FAULT_SIGBUS));
+
+       return ret;
+}
+
 /**
  * Lustre implementation of a vm_operations_struct::fault() method, called by
  * VM to server page fault (both in kernel and user space).
index af4ff2e..5ee33e5 100644 (file)
@@ -153,29 +153,6 @@ static void vvp_page_discard(const struct lu_env *env,
        generic_error_remove_page(vmpage->mapping, vmpage);
 }
 
-static void vvp_page_delete(const struct lu_env *env,
-                           const struct cl_page_slice *slice)
-{
-       struct page      *vmpage = cl2vm_page(slice);
-       struct cl_page   *page   = slice->cpl_page;
-       int refc;
-
-       LASSERT(PageLocked(vmpage));
-       LASSERT((struct cl_page *)vmpage->private == page);
-
-
-       /* Drop the reference count held in vvp_page_init */
-       refc = atomic_dec_return(&page->cp_ref);
-       LASSERTF(refc >= 1, "page = %p, refc = %d\n", page, refc);
-
-       ClearPagePrivate(vmpage);
-       vmpage->private = 0;
-       /*
-        * Reference from vmpage to cl_page is removed, but the reference back
-        * is still here. It is removed later in vvp_page_fini().
-        */
-}
-
 static void vvp_page_export(const struct lu_env *env,
                            const struct cl_page_slice *slice,
                            int uptodate)
@@ -224,6 +201,41 @@ static int vvp_page_prep_write(const struct lu_env *env,
        return 0;
 }
 
+static void vvp_page_delete(const struct lu_env *env,
+                           const struct cl_page_slice *slice)
+{
+       struct cl_page *cp = slice->cpl_page;
+
+       if (cp->cp_type == CPT_CACHEABLE) {
+               struct page *vmpage = cp->cp_vmpage;
+               struct inode *inode = vmpage->mapping->host;
+
+               LASSERT(PageLocked(vmpage));
+               LASSERT((struct cl_page *)vmpage->private == cp);
+
+               /* Drop the reference count held in vvp_page_init */
+               atomic_dec(&cp->cp_ref);
+
+               ClearPagePrivate(vmpage);
+               vmpage->private = 0;
+
+               /* clearpageuptodate prevents the page being read by the
+                * kernel after it has been deleted from Lustre, which avoids
+                * potential stale data reads.  The seqlock allows us to see
+                * that a page was potentially deleted and catch the resulting
+                * SIGBUS - see ll_filemap_fault() (LU-16160) */
+               write_seqlock(&ll_i2info(inode)->lli_page_inv_lock);
+               ClearPageUptodate(vmpage);
+               write_sequnlock(&ll_i2info(inode)->lli_page_inv_lock);
+
+               /*
+                * The reference from vmpage to cl_page is removed,
+                * but the reference back is still here. It is removed
+                * later in cl_page_free().
+                */
+       }
+}
+
 /**
  * Handles page transfer errors at VM level.
  *
index 5f988bb..3cb9aba 100644 (file)
@@ -11,7 +11,7 @@
 #include <pthread.h>
 
 /*
- * Usage: rw_seq_cst_vs_drop_caches /mnt/lustre/file0 /mnt/lustre2/file0
+ * Usage: rw_seq_cst_vs_drop_caches [-m] /mnt/lustre/file0 /mnt/lustre2/file0
 
  * Race reads of the same file on two client mounts vs writes and drop
  * caches to detect sequential consistency violations. Run
  * which case the wait status ($?) will be 134.
 */
 
+int mmap_mode;         /* -m flag */
+
+void usage(void)
+{
+       fprintf(stdout,
+               "%s: rw_seq_cst_vs_drop_caches [-m] /mnt/lustre/file0 /mnt/lustre2/file0\n"
+"      -m : use mmap to read/write file\n", __func__);
+}
+
 #define handle_error(msg)      \
        do { perror(msg); exit(EXIT_FAILURE); } while (0)
 
@@ -28,23 +37,39 @@ static int fd[2] = { -1, -1 };
  */
 static uint64_t u, u_max = UINT64_MAX / 2;
 static uint64_t v[2];
+char *ptr;
 
 static void *access_thread_start(void *unused)
 {
+       char *ptr2 = NULL;
        ssize_t rc;
        int i;
 
+       if (mmap_mode) {
+               ptr2 = mmap(NULL, sizeof(v[1]), PROT_READ,
+                           MAP_PRIVATE | MAP_POPULATE, fd[1], 0);
+               if (ptr2 == MAP_FAILED)
+                       handle_error("mmap");
+       }
+
        do {
                for (i = 0; i < 2; i++) {
-                       rc = pread(fd[i], &v[i], sizeof(v[i]), 0);
-                       if (rc < 0 || rc != sizeof(v[i]))
-                               handle_error("pread");
+                       if (mmap_mode) {
+                               memcpy(&v[i], i == 0 ? ptr : ptr2,
+                                      sizeof(v[i]));
+                       } else {
+                               rc = pread(fd[i], &v[i], sizeof(v[i]), 0);
+                               if (rc < 0 || rc != sizeof(v[i]))
+                                       handle_error("pread");
+                       }
                }
        } while (v[0] <= v[1]);
 
        fprintf(stderr, "error: u = %"PRIu64", v = %"PRIu64", %"PRIu64"\n",
                u, v[0], v[1]);
 
+       if (mmap_mode)
+               munmap(ptr2, sizeof(v[i]));
        abort();
 }
 
@@ -56,12 +81,26 @@ int main(int argc, char *argv[])
        pthread_t access_thread;
        struct stat st[2];
        ssize_t rc;
-       int i;
+       int i, ch;
 
        setvbuf(stderr, stderr_buf, _IOLBF, sizeof(stderr_buf));
 
-       if (argc != 3) {
-               fprintf(stderr, "Usage: %s /mnt/lustre/file0 /mnt/lustre2/file0\n", argv[0]);
+       while ((ch = getopt(argc, argv, "m")) >= 0) {
+               switch (ch) {
+               case 'm':
+                       mmap_mode = 1;
+                       break;
+               default:
+                       usage();
+                       exit(EXIT_FAILURE);
+               }
+       }
+
+       argc -= optind;
+       argv += optind;
+
+       if (argc != 2) {
+               usage();
                exit(EXIT_FAILURE);
        }
 
@@ -69,7 +108,7 @@ int main(int argc, char *argv[])
        assert(!(drop_caches_fd < 0));
 
        for (i = 0; i < 2; i++) {
-               fd[i] = open(argv[i + 1], O_RDWR|O_CREAT|O_TRUNC, 0666);
+               fd[i] = open(argv[i], O_RDWR|O_CREAT|O_TRUNC, 0666);
                if (fd[i] < 0)
                        handle_error("open");
 
@@ -86,18 +125,33 @@ int main(int argc, char *argv[])
                exit(EXIT_FAILURE);
        }
 
-       rc = pwrite(fd[0], &u, sizeof(u), 0);
-       if (rc < 0 || rc != sizeof(u))
-               handle_error("pwrite");
+       if (mmap_mode) {
+               if (ftruncate(fd[0], sizeof(u)) < 0)
+                       handle_error("ftruncate");
+
+               ptr = mmap(NULL, sizeof(u), PROT_READ|PROT_WRITE, MAP_SHARED,
+                          fd[0], 0);
+               if (ptr == MAP_FAILED)
+                       handle_error("mmap");
+               memcpy(ptr, &u, sizeof(u));
+       } else {
+               rc = pwrite(fd[0], &u, sizeof(u), 0);
+               if (rc < 0 || rc != sizeof(u))
+                       handle_error("pwrite");
+       }
 
        rc = pthread_create(&access_thread, NULL, &access_thread_start, NULL);
        if (rc != 0)
                handle_error("pthread_create");
 
        for (u = 1; u <= u_max; u++) {
-               rc = pwrite(fd[0], &u, sizeof(u), 0);
-               if (rc < 0 || rc != sizeof(u))
-                       handle_error("pwrite");
+               if (mmap_mode) {
+                       memcpy(ptr, &u, sizeof(u));
+               } else {
+                       rc = pwrite(fd[0], &u, sizeof(u), 0);
+                       if (rc < 0 || rc != sizeof(u))
+                               handle_error("pwrite");
+               }
 
                rc = write(drop_caches_fd, "3\n", 2);
                if (rc < 0 || rc != 2)
@@ -112,5 +166,8 @@ int main(int argc, char *argv[])
        if (rc != 0)
                handle_error("pthread_join");
 
+       if (mmap_mode)
+               munmap(ptr, sizeof(u));
+
        return 0;
 }
index b70d809..11f597c 100755 (executable)
@@ -18,8 +18,8 @@ init_test_env $@
 init_logging
 
 ALWAYS_EXCEPT="$SANITYN_EXCEPT "
-# bug number for skipped test:  LU-7105 LU-14541
-ALWAYS_EXCEPT+="                28      16f"
+# bug number for skipped test:  LU-7105
+ALWAYS_EXCEPT+="                28 "
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
 
 # skip tests for PPC until they are fixed
@@ -564,6 +564,35 @@ test_16f() { # LU-14541
 }
 run_test 16f "rw sequential consistency vs drop_caches"
 
+test_16g() {
+       local file1=$DIR1/$tfile
+       local file2=$DIR2/$tfile
+       local duration=20
+       local status
+
+       timeout --preserve-status --signal=USR1 $duration \
+               rw_seq_cst_vs_drop_caches -m $file1 $file2
+       status=$?
+
+       case $((status & 0x7f)) in
+               0)
+                       echo OK # Computers must be fast now.
+                       ;;
+               6) # SIGABRT
+                       error "sequential consistency violation detected"
+                       ;;
+               10) # SIGUSR1
+                       echo TIMEOUT # This is fine.
+                       ;;
+               *)
+                       error "strange status '$status'"
+                       ;;
+       esac
+
+       rm -f $file1
+}
+run_test 16g "mmap rw sequential consistency vs drop_caches"
+
 test_17() { # bug 3513, 3667
        remote_ost_nodsh && skip "remote OST with nodsh" && return