Whamcloud - gitweb
LU-16160 llite: SIGBUS is possible on a race with page reclaim
authorAndrew Perepechko <andrew.perepechko@hpe.com>
Sun, 15 Jan 2023 16:55:58 +0000 (11:55 -0500)
committerAndreas Dilger <adilger@whamcloud.com>
Wed, 1 Feb 2023 18:41:10 +0000 (18:41 +0000)
We can restart fault handling if page truncation happens
in parallel with the fault handler.

Lustre-change: https://review.whamcloud.com/49647
Lustre-commit: b4da788a819f82d35b685d6ee7f02809c05ca005

Include updates to rw_seq_cst_vs_drop_caches.c from 5b911e0326
to add the '-m' option to test mmap IO operations in sanityn/16g.

Change-Id: I6e60783e3334f87e799dc8b0e6e63d0bb358a236
Signed-off-by: Andrew Perepechko <andrew.perepechko@hpe.com>
Signed-off-by: Patrick Farrell <farr0186@gmail.com>
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/49831
Reviewed-by: Patrick Farrell <pfarrell@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lustre/include/lustre_compat.h
lustre/llite/llite_internal.h
lustre/llite/llite_lib.c
lustre/llite/llite_mmap.c
lustre/llite/vvp_page.c
lustre/tests/rw_seq_cst_vs_drop_caches.c
lustre/tests/sanityn.sh

index 4691c9c..577a81d 100644 (file)
@@ -369,9 +369,9 @@ static inline void __user *get_vmf_address(struct vm_fault *vmf)
 }
 
 #ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY
-# define ll_filemap_fault(vma, vmf) filemap_fault(vmf)
+# define __ll_filemap_fault(vma, vmf) filemap_fault(vmf)
 #else
-# define ll_filemap_fault(vma, vmf) filemap_fault(vma, vmf)
+# define __ll_filemap_fault(vma, vmf) filemap_fault(vma, vmf)
 #endif
 
 #ifndef HAVE_CURRENT_TIME
index bf59e21..7a07618 100644 (file)
@@ -44,6 +44,7 @@
 #include <lustre_intent.h>
 #include <linux/compat.h>
 #include <linux/aio.h>
+#include <linux/seqlock.h>
 #include <lustre_compat.h>
 #include <lustre_crypto.h>
 #include <range_lock.h>
@@ -287,6 +288,7 @@ struct ll_inode_info {
        struct mutex                    lli_xattrs_enq_lock;
        struct list_head                lli_xattrs; /* ll_xattr_entry->xe_list */
        struct list_head                lli_lccs; /* list of ll_cl_context */
+       seqlock_t                       lli_page_inv_lock;
 };
 
 #ifndef HAVE_USER_NAMESPACE_ARG
@@ -1876,4 +1878,6 @@ static inline char *xattr_for_enc(struct inode *inode)
 extern const struct llcrypt_operations lustre_cryptops;
 #endif
 
+int ll_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
+
 #endif /* LLITE_INTERNAL_H */
index a55bf9a..0f98dc2 100644 (file)
@@ -1134,6 +1134,7 @@ void ll_lli_init(struct ll_inode_info *lli)
        memset(lli->lli_jobid, 0, sizeof(lli->lli_jobid));
        /* ll_cl_context initialize */
        INIT_LIST_HEAD(&lli->lli_lccs);
+       seqlock_init(&lli->lli_page_inv_lock);
 }
 
 #define MAX_STRING_SIZE 128
index 7242108..d195f38 100644 (file)
@@ -250,6 +250,25 @@ static inline int to_fault_error(int result)
        return result;
 }
 
+int ll_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct inode *inode = file_inode(vma->vm_file);
+       int ret;
+       unsigned int seq;
+
+       /* this seqlock lets us notice if a page has been deleted on this inode
+        * during the fault process, allowing us to catch an erroneous SIGBUS
+        * See LU-16160
+        */
+       do {
+               seq = read_seqbegin(&ll_i2info(inode)->lli_page_inv_lock);
+               ret = __ll_filemap_fault(vma, vmf);
+       } while (read_seqretry(&ll_i2info(inode)->lli_page_inv_lock, seq) &&
+                (ret & VM_FAULT_SIGBUS));
+
+       return ret;
+}
+
 /**
  * Lustre implementation of a vm_operations_struct::fault() method, called by
  * VM to server page fault (both in kernel and user space).
index 7028526..59388b3 100644 (file)
@@ -157,8 +157,9 @@ static void vvp_page_discard(const struct lu_env *env,
 static void vvp_page_delete(const struct lu_env *env,
                            const struct cl_page_slice *slice)
 {
-       struct page      *vmpage = cl2vm_page(slice);
-       struct cl_page   *page   = slice->cpl_page;
+       struct page *vmpage = cl2vm_page(slice);
+       struct cl_page *page = slice->cpl_page;
+       struct inode *inode = vmpage->mapping->host;
        int refc;
 
        LASSERT(PageLocked(vmpage));
@@ -171,9 +172,20 @@ static void vvp_page_delete(const struct lu_env *env,
 
        ClearPagePrivate(vmpage);
        vmpage->private = 0;
+
+       /* ClearPageUptodate() prevents the page being read by the
+        * kernel after it has been deleted from Lustre, which avoids
+        * potential stale data reads.  The seqlock allows us to see
+        * that a page was potentially deleted and catch the resulting
+        * SIGBUS - see ll_filemap_fault() (LU-16160) */
+       write_seqlock(&ll_i2info(inode)->lli_page_inv_lock);
+       ClearPageUptodate(vmpage);
+       write_sequnlock(&ll_i2info(inode)->lli_page_inv_lock);
+
        /*
-        * Reference from vmpage to cl_page is removed, but the reference back
-        * is still here. It is removed later in vvp_page_fini().
+        * The reference from vmpage to cl_page is removed,
+        * but the reference back is still here. It is removed
+        * later in cl_page_free().
         */
 }
 
index 5f988bb..3cb9aba 100644 (file)
@@ -11,7 +11,7 @@
 #include <pthread.h>
 
 /*
- * Usage: rw_seq_cst_vs_drop_caches /mnt/lustre/file0 /mnt/lustre2/file0
+ * Usage: rw_seq_cst_vs_drop_caches [-m] /mnt/lustre/file0 /mnt/lustre2/file0
 
  * Race reads of the same file on two client mounts vs writes and drop
  * caches to detect sequential consistency violations. Run
  * which case the wait status ($?) will be 134.
 */
 
+int mmap_mode;         /* -m flag */
+
+void usage(void)
+{
+       fprintf(stdout,
+               "%s: rw_seq_cst_vs_drop_caches [-m] /mnt/lustre/file0 /mnt/lustre2/file0\n"
+"      -m : use mmap to read/write file\n", __func__);
+}
+
 #define handle_error(msg)      \
        do { perror(msg); exit(EXIT_FAILURE); } while (0)
 
@@ -28,23 +37,39 @@ static int fd[2] = { -1, -1 };
  */
 static uint64_t u, u_max = UINT64_MAX / 2;
 static uint64_t v[2];
+char *ptr;
 
 static void *access_thread_start(void *unused)
 {
+       char *ptr2 = NULL;
        ssize_t rc;
        int i;
 
+       if (mmap_mode) {
+               ptr2 = mmap(NULL, sizeof(v[1]), PROT_READ,
+                           MAP_PRIVATE | MAP_POPULATE, fd[1], 0);
+               if (ptr2 == MAP_FAILED)
+                       handle_error("mmap");
+       }
+
        do {
                for (i = 0; i < 2; i++) {
-                       rc = pread(fd[i], &v[i], sizeof(v[i]), 0);
-                       if (rc < 0 || rc != sizeof(v[i]))
-                               handle_error("pread");
+                       if (mmap_mode) {
+                               memcpy(&v[i], i == 0 ? ptr : ptr2,
+                                      sizeof(v[i]));
+                       } else {
+                               rc = pread(fd[i], &v[i], sizeof(v[i]), 0);
+                               if (rc < 0 || rc != sizeof(v[i]))
+                                       handle_error("pread");
+                       }
                }
        } while (v[0] <= v[1]);
 
        fprintf(stderr, "error: u = %"PRIu64", v = %"PRIu64", %"PRIu64"\n",
                u, v[0], v[1]);
 
+       if (mmap_mode)
+               munmap(ptr2, sizeof(v[i]));
        abort();
 }
 
@@ -56,12 +81,26 @@ int main(int argc, char *argv[])
        pthread_t access_thread;
        struct stat st[2];
        ssize_t rc;
-       int i;
+       int i, ch;
 
        setvbuf(stderr, stderr_buf, _IOLBF, sizeof(stderr_buf));
 
-       if (argc != 3) {
-               fprintf(stderr, "Usage: %s /mnt/lustre/file0 /mnt/lustre2/file0\n", argv[0]);
+       while ((ch = getopt(argc, argv, "m")) >= 0) {
+               switch (ch) {
+               case 'm':
+                       mmap_mode = 1;
+                       break;
+               default:
+                       usage();
+                       exit(EXIT_FAILURE);
+               }
+       }
+
+       argc -= optind;
+       argv += optind;
+
+       if (argc != 2) {
+               usage();
                exit(EXIT_FAILURE);
        }
 
@@ -69,7 +108,7 @@ int main(int argc, char *argv[])
        assert(!(drop_caches_fd < 0));
 
        for (i = 0; i < 2; i++) {
-               fd[i] = open(argv[i + 1], O_RDWR|O_CREAT|O_TRUNC, 0666);
+               fd[i] = open(argv[i], O_RDWR|O_CREAT|O_TRUNC, 0666);
                if (fd[i] < 0)
                        handle_error("open");
 
@@ -86,18 +125,33 @@ int main(int argc, char *argv[])
                exit(EXIT_FAILURE);
        }
 
-       rc = pwrite(fd[0], &u, sizeof(u), 0);
-       if (rc < 0 || rc != sizeof(u))
-               handle_error("pwrite");
+       if (mmap_mode) {
+               if (ftruncate(fd[0], sizeof(u)) < 0)
+                       handle_error("ftruncate");
+
+               ptr = mmap(NULL, sizeof(u), PROT_READ|PROT_WRITE, MAP_SHARED,
+                          fd[0], 0);
+               if (ptr == MAP_FAILED)
+                       handle_error("mmap");
+               memcpy(ptr, &u, sizeof(u));
+       } else {
+               rc = pwrite(fd[0], &u, sizeof(u), 0);
+               if (rc < 0 || rc != sizeof(u))
+                       handle_error("pwrite");
+       }
 
        rc = pthread_create(&access_thread, NULL, &access_thread_start, NULL);
        if (rc != 0)
                handle_error("pthread_create");
 
        for (u = 1; u <= u_max; u++) {
-               rc = pwrite(fd[0], &u, sizeof(u), 0);
-               if (rc < 0 || rc != sizeof(u))
-                       handle_error("pwrite");
+               if (mmap_mode) {
+                       memcpy(ptr, &u, sizeof(u));
+               } else {
+                       rc = pwrite(fd[0], &u, sizeof(u), 0);
+                       if (rc < 0 || rc != sizeof(u))
+                               handle_error("pwrite");
+               }
 
                rc = write(drop_caches_fd, "3\n", 2);
                if (rc < 0 || rc != 2)
@@ -112,5 +166,8 @@ int main(int argc, char *argv[])
        if (rc != 0)
                handle_error("pthread_join");
 
+       if (mmap_mode)
+               munmap(ptr, sizeof(u));
+
        return 0;
 }
index 324482d..8a634b7 100755 (executable)
@@ -18,8 +18,8 @@ init_test_env $@
 init_logging
 
 ALWAYS_EXCEPT="$SANITYN_EXCEPT "
-# bug number for skipped test:  LU-7105 LU-14541
-ALWAYS_EXCEPT+="                28      16f"
+# bug number for skipped test:  LU-7105
+ALWAYS_EXCEPT+="                28     "
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
 
 if [ $mds1_FSTYPE = "zfs" ]; then
@@ -579,6 +579,35 @@ test_16f() { # LU-14541
 }
 run_test 16f "rw sequential consistency vs drop_caches"
 
+test_16g() {
+       local file1=$DIR1/$tfile
+       local file2=$DIR2/$tfile
+       local duration=20
+       local status
+
+       timeout --preserve-status --signal=USR1 $duration \
+               rw_seq_cst_vs_drop_caches -m $file1 $file2
+       status=$?
+
+       case $((status & 0x7f)) in
+               0)
+                       echo OK # Computers must be fast now.
+                       ;;
+               6) # SIGABRT
+                       error "sequential consistency violation detected"
+                       ;;
+               10) # SIGUSR1
+                       echo TIMEOUT # This is fine.
+                       ;;
+               *)
+                       error "strange status '$status'"
+                       ;;
+       esac
+
+       rm -f $file1
+}
+run_test 16g "mmap rw sequential consistency vs drop_caches"
+
 test_16h() {
        local tf=$DIR/$tdir/$tfile
        local tf2=$DIR2/$tdir/$tfile