Whamcloud - gitweb
LU-16649 llite: EIO is possible on a race with page reclaim 00/50600/3
authorPatrick Farrell <pfarrell@whamcloud.com>
Tue, 9 May 2023 18:48:03 +0000 (14:48 -0400)
committerOleg Drokin <green@whamcloud.com>
Sat, 20 May 2023 07:11:51 +0000 (07:11 +0000)
We must clear the 'uptodate' page flag when we delete a
page from Lustre, or stale reads can occur.  However,
generic_file_buffered_read requires any pages returned from
readpage() be uptodate.

So, we must retry reading if page truncation happens in
parallel with the read.

This implements the same fix as:
https://review.whamcloud.com/49647
b4da788a819f82d35b685d6ee7f02809c05ca005

did for the mmap path.

Lustre-change: https://review.whamcloud.com/50344
Lustre-commit: 1d98e5c32b41e19bb1247958e666bb66e69dbc4c

Signed-off-by: Patrick Farrell <pfarrell@whamcloud.com>
Change-Id: Iae0d1eb343f25a0176135347e54c309056c2613a
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/50600
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Qian Yingjin <qian@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/llite/file.c
lustre/llite/rw.c
lustre/llite/rw26.c
lustre/llite/vvp_io.c
lustre/tests/Makefile.am
lustre/tests/fadvise_dontneed_helper.c [new file with mode: 0644]
lustre/tests/sanityn.sh

index b2aa840..cca40a6 100644 (file)
@@ -610,6 +610,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_LLITE_PAGE_ALLOC                  0x1418
 #define OBD_FAIL_LLITE_OPEN_DELAY                  0x1419
 #define OBD_FAIL_LLITE_XATTR_PAUSE                 0x1420
+#define OBD_FAIL_LLITE_READPAGE_PAUSE2             0x1424
 
 #define OBD_FAIL_FID_INDIR     0x1501
 #define OBD_FAIL_FID_INLMA     0x1502
index 17dcab5..219931e 100644 (file)
@@ -1920,9 +1920,14 @@ ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
        result = generic_file_read_iter(iocb, iter);
 
        /* If the first page is not in cache, generic_file_aio_read() will be
-        * returned with -ENODATA.
-        * See corresponding code in ll_readpage(). */
-       if (result == -ENODATA)
+        * returned with -ENODATA.  Fall back to full read path.
+        * See corresponding code in ll_readpage().
+        *
+        * if we raced with page deletion, we might get EIO.  Rather than add
+        * locking to the fast path for this rare case, fall back to the full
+        * read path.  (See vvp_io_read_start() for rest of handling.
+        */
+       if (result == -ENODATA || result == -EIO)
                result = 0;
 
        if (result > 0) {
index d3d809c..4e30df5 100644 (file)
@@ -1992,5 +1992,13 @@ out:
        if (ra.cra_release != NULL)
                cl_read_ahead_release(env, &ra);
 
+       /* this delay gives time for the actual read of the page to finish and
+        * unlock the page in vvp_page_completion_read before we return to our
+        * caller and the caller tries to use the page, allowing us to test
+        * races with the page being unlocked after readpage() but before it's
+        * used by the caller
+        */
+       OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_READPAGE_PAUSE2, cfs_fail_val);
+
        RETURN(result);
 }
index 5e0a900..608d277 100644 (file)
@@ -172,21 +172,23 @@ static int ll_releasepage(struct page *vmpage, RELEASEPAGE_ARG_TYPE gfp_mask)
        struct address_space    *mapping;
        int result = 0;
 
+       ENTRY;
+
        LASSERT(PageLocked(vmpage));
        if (PageWriteback(vmpage) || PageDirty(vmpage))
-               return 0;
+               RETURN(0);
 
        mapping = vmpage->mapping;
        if (mapping == NULL)
-               return 1;
+               RETURN(1);
 
        obj = ll_i2info(mapping->host)->lli_clob;
        if (obj == NULL)
-               return 1;
+               RETURN(1);
 
        page = cl_vmpage_page(vmpage, obj);
        if (page == NULL)
-               return 1;
+               RETURN(1);
 
        env = cl_env_percpu_get();
        LASSERT(!IS_ERR(env));
@@ -213,7 +215,7 @@ static int ll_releasepage(struct page *vmpage, RELEASEPAGE_ARG_TYPE gfp_mask)
        cl_page_put(env, page);
 
        cl_env_percpu_put(env);
-       return result;
+       RETURN(result);
 }
 
 static ssize_t ll_get_user_pages(int rw, struct iov_iter *iter,
index 066f400..4dcec30 100644 (file)
@@ -812,8 +812,10 @@ static int vvp_io_read_start(const struct lu_env *env,
        size_t cnt = io->u.ci_rd.rd.crw_count;
        size_t tot = vio->vui_tot_count;
        struct ll_cl_context *lcc;
+       unsigned int seq;
        int exceed = 0;
        int result;
+       int total_bytes_read = 0;
        struct iov_iter iter;
        pgoff_t page_offset;
 
@@ -882,13 +884,29 @@ static int vvp_io_read_start(const struct lu_env *env,
        lcc->lcc_end_index = DIV_ROUND_UP(pos + iter.count, PAGE_SIZE);
        CDEBUG(D_VFSTRACE, "count:%ld iocb pos:%lld\n", iter.count, pos);
 
-       result = generic_file_read_iter(vio->vui_iocb, &iter);
+       /* this seqlock lets us notice if a page has been deleted on this inode
+        * during the fault process, allowing us to catch an erroneous short
+        * read or EIO
+        * See LU-16160
+        */
+       do {
+               seq = read_seqbegin(&ll_i2info(inode)->lli_page_inv_lock);
+               result = generic_file_read_iter(vio->vui_iocb, &iter);
+               if (result >= 0) {
+                       io->ci_nob += result;
+                       total_bytes_read += result;
+               }
+       /* if we got a short read or -EIO and we raced with page invalidation,
+        * retry
+        */
+       } while (read_seqretry(&ll_i2info(inode)->lli_page_inv_lock, seq) &&
+                ((result >= 0 && iov_iter_count(&iter) > 0)
+                 || result == -EIO));
 
 out:
        if (result >= 0) {
-               if (result < cnt)
+               if (total_bytes_read < cnt)
                        io->ci_continue = 0;
-               io->ci_nob += result;
                result = 0;
        } else if (result == -EIOCBQUEUED) {
                io->ci_nob += vio->u.readwrite.vui_read;
index 0e6f42e..28dcbf1 100644 (file)
@@ -83,7 +83,7 @@ THETESTS += swap_lock_test lockahead_test mirror_io mmap_mknod_test
 THETESTS += create_foreign_file parse_foreign_file
 THETESTS += create_foreign_dir parse_foreign_dir
 THETESTS += check_fallocate splice-test lseek_test expand_truncate_test
-THETESTS += foreign_symlink_striping lov_getstripe_old
+THETESTS += foreign_symlink_striping lov_getstripe_old fadvise_dontneed_helper
 
 if LIBAIO
 THETESTS += aiocp
diff --git a/lustre/tests/fadvise_dontneed_helper.c b/lustre/tests/fadvise_dontneed_helper.c
new file mode 100644 (file)
index 0000000..fbcf41b
--- /dev/null
@@ -0,0 +1,33 @@
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+int main(int argc, char *argv[])
+{
+       int fd;
+       int rc;
+
+       if (argc != 2) {
+               fprintf(stderr, "usage: %s file\n", argv[0]);
+               return 1;
+       }
+
+       fd = open(argv[1], O_RDWR, 0);
+       if (fd <= 0) {
+               fprintf(stderr,
+                       "open failed on %s, error: %s\n",
+                       argv[1], strerror(errno));
+               return errno;
+       }
+
+       rc = posix_fadvise(fd, 0, 1024 * 1024, POSIX_FADV_DONTNEED);
+       if (rc) {
+               fprintf(stderr,
+                       "fadvise FADV_DONTNEED failed on %s, error: %s\n",
+                       argv[1], strerror(errno));
+               return errno;
+       }
+       return 0;
+}
index 874496c..17a1231 100755 (executable)
@@ -5160,6 +5160,37 @@ test_94() {
 }
 run_test 94 "signal vs CP callback race"
 
+test_95b() {
+       local file=$DIR/$tfile
+       local file2=$DIR2/$tfile
+       local fast_read_save
+       local pid
+
+       fast_read_save=$($LCTL get_param -n llite.*.fast_read | head -n 1)
+       [ -z "$fast_read_save" ] && skip "no fast read support"
+
+       stack_trap "$LCTL set_param llite.*.fast_read=$fast_read_save" EXIT
+       $LCTL set_param llite.*.fast_read=0
+
+       $LFS setstripe -c $OSTCOUNT $file || error "failed to setstripe $file"
+       dd if=/dev/zero of=$file bs=$((PAGE_SIZE * 3)) count=1 ||
+               error "failed to write $file"
+
+       # This does the read from the second mount, so this flushes the pages
+       # the first mount and creates new ones on the second mount
+       # OBD_FAIL_LLITE_READPAGE_PAUSE2        0x1424
+       $LCTL set_param fail_loc=0x80001424 fail_val=5
+       $MULTIOP $file2 or${PAGE_SIZE}c &
+       pid=$!
+
+       sleep 2
+       fadvise_dontneed_helper $file2
+       $LCTL set_param fail_loc=0
+       sleep 4
+       wait $pid || error "failed to read file"
+}
+run_test 95b "Check readpage() on a page that is no longer uptodate"
+
 # Data-on-MDT tests
 test_100a() {
        skip "Reserved for glimpse-ahead" && return