Whamcloud - gitweb
LU-16160 llite: clear stale page's uptodate bit 07/48607/18
authorBobi Jam <bobijam@whamcloud.com>
Tue, 20 Sep 2022 16:27:04 +0000 (00:27 +0800)
committerOleg Drokin <green@whamcloud.com>
Wed, 2 Nov 2022 07:09:29 +0000 (07:09 +0000)
In the truncate_inode_page()->do_invalidatepage()->ll_invalidatepage()
call path, before the vmpage is deleted from the page cache it could
be picked up by ll_read_ahead_page()->grab_cache_page_nowait().

If ll_invalidatepage()->cl_page_delete() does not clear the vmpage's
uptodate bit, readahead could pick it up and wrongly assume it is
already up to date.

In ll_fault()->vvp_io_fault_start()->vvp_io_kernel_fault(),
filemap_fault() calls ll_readpage() to read the vmpage and waits for
the vmpage to be unlocked. After ll_readpage() successfully reads the
vmpage and unlocks it, memory pressure or truncate can get in and
delete the cl_page; afterward filemap_fault() finds that the vmpage is
not uptodate and returns VM_FAULT_SIGBUS. To fix this situation, this
patch makes vvp_io_kernel_fault() restart filemap_fault() to get an
up-to-date vmpage again.

Test-Parameters: testlist=sanityn env=ONLY="16f",ONLY_REPEAT=50
Test-Parameters: testlist=sanityn env=ONLY="16g",ONLY_REPEAT=50
Test-Parameters: testlist=sanityn env=ONLY="16f 16g",ONLY_REPEAT=50
Signed-off-by: Bobi Jam <bobijam@whamcloud.com>
Change-Id: I369e1362ffb071ec0a4de3cd5bad27a87cff5e05
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/48607
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/cl_object.h
lustre/llite/rw.c
lustre/llite/vvp_io.c
lustre/llite/vvp_page.c
lustre/obdclass/cl_page.c
lustre/tests/rw_seq_cst_vs_drop_caches.c
lustre/tests/sanityn.sh

index b7de8eb..672004a 100644 (file)
@@ -767,7 +767,15 @@ struct cl_page {
        enum cl_page_type       cp_type:CP_TYPE_BITS;
        unsigned                cp_defer_uptodate:1,
                                cp_ra_updated:1,
-                               cp_ra_used:1;
+                               cp_ra_used:1,
+                               /* fault page read grabs an extra reference */
+                               cp_fault_ref:1,
+                               /**
+                                * if the fault page gets deleted before being
+                                * returned to filemap_fault(), defer the vmpage
+                                * detach/put until filemap_fault() has been handled.
+                                */
+                               cp_defer_detach:1;
        /* which slab kmem index this memory allocated from */
        short int               cp_kmem_index;
 
@@ -2389,6 +2397,11 @@ static inline int cl_io_is_mkwrite(const struct cl_io *io)
        return io->ci_type == CIT_FAULT && io->u.ci_fault.ft_mkwrite;
 }
 
+static inline int cl_io_is_pagefault(const struct cl_io *io)
+{
+       return io->ci_type == CIT_FAULT && !io->u.ci_fault.ft_mkwrite;
+}
+
 /**
  * True, iff \a io is a truncate(2).
  */
index 66496e6..0f76a88 100644 (file)
@@ -1943,7 +1943,15 @@ int ll_readpage(struct file *file, struct page *vmpage)
                        unlock_page(vmpage);
                        result = 0;
                }
-               cl_page_put(env, page);
+               if (cl_io_is_pagefault(io) && result == 0) {
+                       /**
+                        * page fault, retain the cl_page reference until
+                        * vvp_io_kernel_fault() release it.
+                        */
+                       page->cp_fault_ref = 1;
+               } else {
+                       cl_page_put(env, page);
+               }
        } else {
                unlock_page(vmpage);
                result = PTR_ERR(page);
index eda72e8..699e9a6 100644 (file)
@@ -1386,14 +1386,42 @@ static void vvp_io_rw_end(const struct lu_env *env,
        trunc_sem_up_read(&lli->lli_trunc_sem);
 }
 
-static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
+static void detach_and_deref_page(struct cl_page *clp, struct page *vmpage)
+{
+       if (!clp->cp_defer_detach)
+               return;
+
+       /**
+        * cl_page_delete0() took a vmpage reference, but did not unlink the
+        * vmpage from its cl_page.
+        */
+       clp->cp_defer_detach = 0;
+       ClearPagePrivate(vmpage);
+       vmpage->private = 0;
+
+       put_page(vmpage);
+       atomic_dec(&clp->cp_ref);
+}
+
+static int vvp_io_kernel_fault(const struct lu_env *env,
+                              struct vvp_fault_io *cfio)
 {
        struct vm_fault *vmf = cfio->ft_vmf;
+       struct file *vmff = cfio->ft_vma->vm_file;
+       struct address_space *mapping = vmff->f_mapping;
+       struct inode *inode = mapping->host;
+       struct page *vmpage = NULL;
+       struct cl_page *clp = NULL;
+       int rc = 0;
+       ENTRY;
 
+       ll_inode_size_lock(inode);
+retry:
        cfio->ft_flags = ll_filemap_fault(cfio->ft_vma, vmf);
        cfio->ft_flags_valid = 1;
 
        if (vmf->page) {
+               /* success, vmpage is locked */
                LL_CDEBUG_PAGE(D_PAGE, vmf->page, "got addr %p type NOPAGE\n",
                               get_vmf_address(vmf));
                if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) {
@@ -1403,25 +1431,100 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
 
                cfio->ft_vmpage = vmf->page;
 
-               return 0;
+               /**
+                * ll_filemap_fault()->ll_readpage() could get an extra cl_page
+                * reference. So we have to get the cl_page to check its
+                * cp_fault_ref and drop the reference later.
+                */
+               clp = cl_vmpage_page(vmf->page, NULL);
+
+               GOTO(unlock, rc = 0);
+       }
+
+       /* filemap_fault() fails, vmpage is not locked */
+       if (clp == NULL) {
+               vmpage = find_get_page(mapping, vmf->pgoff);
+               if (vmpage) {
+                       lock_page(vmpage);
+                       clp = cl_vmpage_page(vmpage, NULL);
+                       unlock_page(vmpage);
+               }
        }
 
        if (cfio->ft_flags & VM_FAULT_SIGBUS) {
+               pgoff_t max_idx;
+
+               /**
+                * ll_filemap_fault()->ll_readpage() could fill vmpage
+                * correctly, and unlock the vmpage, while memory pressure or
+                * truncate could detach cl_page from vmpage, and kernel
+                * filemap_fault() will wait_on_page_locked(vmpage) and find
+                * out that the vmpage has had its uptodate bit cleared,
+                * so it returns VM_FAULT_SIGBUS.
+                *
+                * In this case, we'd retry the filemap_fault()->ll_readpage()
+                * to rebuild the cl_page and fill vmpage with up-to-date data.
+                */
+               if (likely(vmpage)) {
+                       bool need_retry = false;
+
+                       if (clp) {
+                               if (clp->cp_defer_detach) {
+                                       detach_and_deref_page(clp, vmpage);
+                                       /**
+                                        * check i_size to make sure it's not
+                                        * over EOF, we don't want to call
+                                        * filemap_fault() repeatedly since it
+                                        * returns VM_FAULT_SIGBUS without even
+                                        * trying if vmf->pgoff is over EOF.
+                                        */
+                                       max_idx = DIV_ROUND_UP(
+                                               i_size_read(inode), PAGE_SIZE);
+                                       if (vmf->pgoff < max_idx)
+                                               need_retry = true;
+                               }
+                               if (clp->cp_fault_ref) {
+                                       clp->cp_fault_ref = 0;
+                                       /* ref not released in ll_readpage() */
+                                       cl_page_put(env, clp);
+                               }
+                               if (need_retry)
+                                       goto retry;
+                       }
+               }
+
                CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", get_vmf_address(vmf));
-               return -EFAULT;
+               GOTO(unlock, rc = -EFAULT);
        }
 
        if (cfio->ft_flags & VM_FAULT_OOM) {
                CDEBUG(D_PAGE, "got addr %p - OOM\n", get_vmf_address(vmf));
-               return -ENOMEM;
+               GOTO(unlock, rc = -ENOMEM);
        }
 
        if (cfio->ft_flags & VM_FAULT_RETRY)
-               return -EAGAIN;
+               GOTO(unlock, rc = -EAGAIN);
 
        CERROR("unknown error in page fault %d\n", cfio->ft_flags);
-
-       return -EINVAL;
+       rc = -EINVAL;
+unlock:
+       ll_inode_size_unlock(inode);
+       if (clp) {
+               if (clp->cp_defer_detach && vmpage)
+                       detach_and_deref_page(clp, vmpage);
+
+               /* additional cl_page ref has been taken in ll_readpage() */
+               if (clp->cp_fault_ref) {
+                       clp->cp_fault_ref = 0;
+                       /* ref not released in ll_readpage() */
+                       cl_page_put(env, clp);
+               }
+               /* ref taken in this function */
+               cl_page_put(env, clp);
+       }
+       if (vmpage)
+               put_page(vmpage);
+       return rc;
 }
 
 static void mkwrite_commit_callback(const struct lu_env *env, struct cl_io *io,
@@ -1462,7 +1565,7 @@ static int vvp_io_fault_start(const struct lu_env *env,
                LASSERT(cfio->ft_vmpage != NULL);
                lock_page(cfio->ft_vmpage);
        } else {
-               result = vvp_io_kernel_fault(cfio);
+               result = vvp_io_kernel_fault(env, cfio);
                if (result != 0)
                        RETURN(result);
        }
index 31edb33..f3840c4 100644 (file)
@@ -108,6 +108,11 @@ static void vvp_page_completion_read(const struct lu_env *env,
                ll_ra_count_put(ll_i2sbi(inode), 1);
 
        if (ioret == 0)  {
+               /**
+                * cp_defer_uptodate is used for readahead pages; setting the
+                * vmpage Uptodate bit is deferred to ll_readpage/
+                * ll_io_read_page.
+                */
                if (!cp->cp_defer_uptodate)
                        SetPageUptodate(vmpage);
        } else if (cp->cp_defer_uptodate) {
index 29a5bc1..91a879f 100644 (file)
@@ -828,18 +828,36 @@ static void __cl_page_delete(const struct lu_env *env, struct cl_page *cp)
                LASSERT(PageLocked(vmpage));
                LASSERT((struct cl_page *)vmpage->private == cp);
 
-               /* Drop the reference count held in vvp_page_init */
-               refc = atomic_dec_return(&cp->cp_ref);
-               LASSERTF(refc >= 1, "page = %p, refc = %d\n", cp, refc);
-
-               ClearPagePrivate(vmpage);
-               vmpage->private = 0;
-
-               /*
-                * The reference from vmpage to cl_page is removed,
-                * but the reference back is still here. It is removed
-                * later in cl_page_free().
+               /**
+                * clear vmpage uptodate bit, since ll_read_ahead_pages()->
+                * ll_read_ahead_page() could pick up this stale vmpage and
+                * take it as uptodated.
+                */
+               ClearPageUptodate(vmpage);
+               /**
+                * vvp_io_kernel_fault()->ll_readpage() sets cp_fault_ref and
+                * needs the cl_page to retry the page fault read.
                 */
+               if (cp->cp_fault_ref) {
+                       cp->cp_defer_detach = 1;
+                       /**
+                        * get a vmpage reference, so that filemap_fault()
+                        * won't free it from pagecache.
+                        */
+                       get_page(vmpage);
+               } else {
+                       /* Drop the reference count held in vvp_page_init */
+                       refc = atomic_dec_return(&cp->cp_ref);
+                       LASSERTF(refc >= 1, "page = %p, refc = %d\n", cp, refc);
+                       ClearPagePrivate(vmpage);
+                       vmpage->private = 0;
+
+                       /*
+                        * The reference from vmpage to cl_page is removed,
+                        * but the reference back is still here. It is removed
+                        * later in cl_page_free().
+                        */
+               }
        }
 
        EXIT;
index 5f988bb..3cb9aba 100644 (file)
@@ -11,7 +11,7 @@
 #include <pthread.h>
 
 /*
- * Usage: rw_seq_cst_vs_drop_caches /mnt/lustre/file0 /mnt/lustre2/file0
+ * Usage: rw_seq_cst_vs_drop_caches [-m] /mnt/lustre/file0 /mnt/lustre2/file0
 
  * Race reads of the same file on two client mounts vs writes and drop
  * caches to detect sequential consistency violations. Run
  * which case the wait status ($?) will be 134.
 */
 
+int mmap_mode;         /* -m flag */
+
+void usage(void)
+{
+       fprintf(stdout,
+               "%s: rw_seq_cst_vs_drop_caches [-m] /mnt/lustre/file0 /mnt/lustre2/file0\n"
+"      -m : use mmap to read/write file\n", __func__);
+}
+
 #define handle_error(msg)      \
        do { perror(msg); exit(EXIT_FAILURE); } while (0)
 
@@ -28,23 +37,39 @@ static int fd[2] = { -1, -1 };
  */
 static uint64_t u, u_max = UINT64_MAX / 2;
 static uint64_t v[2];
+char *ptr;
 
 static void *access_thread_start(void *unused)
 {
+       char *ptr2 = NULL;
        ssize_t rc;
        int i;
 
+       if (mmap_mode) {
+               ptr2 = mmap(NULL, sizeof(v[1]), PROT_READ,
+                           MAP_PRIVATE | MAP_POPULATE, fd[1], 0);
+               if (ptr2 == MAP_FAILED)
+                       handle_error("mmap");
+       }
+
        do {
                for (i = 0; i < 2; i++) {
-                       rc = pread(fd[i], &v[i], sizeof(v[i]), 0);
-                       if (rc < 0 || rc != sizeof(v[i]))
-                               handle_error("pread");
+                       if (mmap_mode) {
+                               memcpy(&v[i], i == 0 ? ptr : ptr2,
+                                      sizeof(v[i]));
+                       } else {
+                               rc = pread(fd[i], &v[i], sizeof(v[i]), 0);
+                               if (rc < 0 || rc != sizeof(v[i]))
+                                       handle_error("pread");
+                       }
                }
        } while (v[0] <= v[1]);
 
        fprintf(stderr, "error: u = %"PRIu64", v = %"PRIu64", %"PRIu64"\n",
                u, v[0], v[1]);
 
+       if (mmap_mode)
+               munmap(ptr2, sizeof(v[i]));
        abort();
 }
 
@@ -56,12 +81,26 @@ int main(int argc, char *argv[])
        pthread_t access_thread;
        struct stat st[2];
        ssize_t rc;
-       int i;
+       int i, ch;
 
        setvbuf(stderr, stderr_buf, _IOLBF, sizeof(stderr_buf));
 
-       if (argc != 3) {
-               fprintf(stderr, "Usage: %s /mnt/lustre/file0 /mnt/lustre2/file0\n", argv[0]);
+       while ((ch = getopt(argc, argv, "m")) >= 0) {
+               switch (ch) {
+               case 'm':
+                       mmap_mode = 1;
+                       break;
+               default:
+                       usage();
+                       exit(EXIT_FAILURE);
+               }
+       }
+
+       argc -= optind;
+       argv += optind;
+
+       if (argc != 2) {
+               usage();
                exit(EXIT_FAILURE);
        }
 
@@ -69,7 +108,7 @@ int main(int argc, char *argv[])
        assert(!(drop_caches_fd < 0));
 
        for (i = 0; i < 2; i++) {
-               fd[i] = open(argv[i + 1], O_RDWR|O_CREAT|O_TRUNC, 0666);
+               fd[i] = open(argv[i], O_RDWR|O_CREAT|O_TRUNC, 0666);
                if (fd[i] < 0)
                        handle_error("open");
 
@@ -86,18 +125,33 @@ int main(int argc, char *argv[])
                exit(EXIT_FAILURE);
        }
 
-       rc = pwrite(fd[0], &u, sizeof(u), 0);
-       if (rc < 0 || rc != sizeof(u))
-               handle_error("pwrite");
+       if (mmap_mode) {
+               if (ftruncate(fd[0], sizeof(u)) < 0)
+                       handle_error("ftruncate");
+
+               ptr = mmap(NULL, sizeof(u), PROT_READ|PROT_WRITE, MAP_SHARED,
+                          fd[0], 0);
+               if (ptr == MAP_FAILED)
+                       handle_error("mmap");
+               memcpy(ptr, &u, sizeof(u));
+       } else {
+               rc = pwrite(fd[0], &u, sizeof(u), 0);
+               if (rc < 0 || rc != sizeof(u))
+                       handle_error("pwrite");
+       }
 
        rc = pthread_create(&access_thread, NULL, &access_thread_start, NULL);
        if (rc != 0)
                handle_error("pthread_create");
 
        for (u = 1; u <= u_max; u++) {
-               rc = pwrite(fd[0], &u, sizeof(u), 0);
-               if (rc < 0 || rc != sizeof(u))
-                       handle_error("pwrite");
+               if (mmap_mode) {
+                       memcpy(ptr, &u, sizeof(u));
+               } else {
+                       rc = pwrite(fd[0], &u, sizeof(u), 0);
+                       if (rc < 0 || rc != sizeof(u))
+                               handle_error("pwrite");
+               }
 
                rc = write(drop_caches_fd, "3\n", 2);
                if (rc < 0 || rc != 2)
@@ -112,5 +166,8 @@ int main(int argc, char *argv[])
        if (rc != 0)
                handle_error("pthread_join");
 
+       if (mmap_mode)
+               munmap(ptr, sizeof(u));
+
        return 0;
 }
index ae75881..25158d0 100755 (executable)
@@ -18,8 +18,8 @@ init_test_env $@
 init_logging
 
 ALWAYS_EXCEPT="$SANITYN_EXCEPT "
-# bug number for skipped test:  LU-7105 LU-14541
-ALWAYS_EXCEPT+="                28      16f"
+# bug number for skipped test:  LU-7105
+ALWAYS_EXCEPT+="                28 "
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
 
 if [ $mds1_FSTYPE = "zfs" ]; then
@@ -586,6 +586,35 @@ test_16f() { # LU-14541
 }
 run_test 16f "rw sequential consistency vs drop_caches"
 
+test_16g() {
+       local file1=$DIR1/$tfile
+       local file2=$DIR2/$tfile
+       local duration=20
+       local status
+
+       timeout --preserve-status --signal=USR1 $duration \
+               rw_seq_cst_vs_drop_caches -m $file1 $file2
+       status=$?
+
+       case $((status & 0x7f)) in
+               0)
+                       echo OK # Computers must be fast now.
+                       ;;
+               6) # SIGABRT
+                       error "sequential consistency violation detected"
+                       ;;
+               10) # SIGUSR1
+                       echo TIMEOUT # This is fine.
+                       ;;
+               *)
+                       error "strange status '$status'"
+                       ;;
+       esac
+
+       rm -f $file1
+}
+run_test 16g "mmap rw sequential consistency vs drop_caches"
+
 test_16h() {
        local tf=$DIR/$tdir/$tfile
        local tf2=$DIR2/$tdir/$tfile