From 1d98e5c32b41e19bb1247958e666bb66e69dbc4c Mon Sep 17 00:00:00 2001 From: Patrick Farrell Date: Mon, 20 Mar 2023 17:21:32 -0400 Subject: [PATCH] LU-16649 llite: EIO is possible on a race with page reclaim We must clear the 'uptodate' page flag when we delete a page from Lustre, or stale reads can occur. However, generic_file_buffered_read requires any pages returned from readpage() be uptodate. So, we must retry reading if page truncation happens in parallel with the read. This implements the same fix as: https://review.whamcloud.com/49647 b4da788a819f82d35b685d6ee7f02809c05ca005 did for the mmap path. Signed-off-by: Patrick Farrell Change-Id: Iae0d1eb343f25a0176135347e54c309056c2613a Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/50344 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andrew Perepechko Reviewed-by: Qian Yingjin Reviewed-by: Alexander Zarochentsev Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 1 + lustre/llite/file.c | 11 ++++++++--- lustre/llite/rw.c | 8 ++++++++ lustre/llite/rw26.c | 12 +++++++----- lustre/llite/vvp_io.c | 24 ++++++++++++++++++++--- lustre/tests/Makefile.am | 1 + lustre/tests/fadvise_dontneed_helper.c | 33 ++++++++++++++++++++++++++++++++ lustre/tests/sanityn.sh | 35 ++++++++++++++++++++++++++++++++-- 8 files changed, 112 insertions(+), 13 deletions(-) create mode 100644 lustre/tests/fadvise_dontneed_helper.c diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index d190c1a..8b8c4f9 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -625,6 +625,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LLITE_PAGE_INVALIDATE_PAUSE 0x1421 #define OBD_FAIL_LLITE_READPAGE_PAUSE 0x1422 #define OBD_FAIL_LLITE_PANIC_ON_ESTALE 0x1423 +#define OBD_FAIL_LLITE_READPAGE_PAUSE2 0x1424 #define OBD_FAIL_FID_INDIR 0x1501 #define OBD_FAIL_FID_INLMA 0x1502 diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 1c80d3d..845907b 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -1990,9 +1990,14 @@ ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter) result = generic_file_read_iter(iocb, iter); /* If the first page is not in cache, generic_file_aio_read() will be - * returned with -ENODATA. - * See corresponding code in ll_readpage(). */ - if (result == -ENODATA) + * returned with -ENODATA. Fall back to full read path. + * See corresponding code in ll_readpage(). + * + * if we raced with page deletion, we might get EIO. Rather than add + * locking to the fast path for this rare case, fall back to the full + * read path. (See vvp_io_read_start() for rest of handling. + */ + if (result == -ENODATA || result == -EIO) result = 0; if (result > 0) { diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index e135b4b..a562fd7 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -2042,6 +2042,14 @@ out: if (ra.cra_release != NULL) cl_read_ahead_release(env, &ra); + /* this delay gives time for the actual read of the page to finish and + * unlock the page in vvp_page_completion_read before we return to our + * caller and the caller tries to use the page, allowing us to test + * races with the page being unlocked after readpage() but before it's + * used by the caller + */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_READPAGE_PAUSE2, cfs_fail_val); + RETURN(result); } diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c index 1a39142..9c26704 100644 --- a/lustre/llite/rw26.c +++ b/lustre/llite/rw26.c @@ -174,21 +174,23 @@ static bool do_release_page(struct page *vmpage, gfp_t wait) struct lu_env *env; int result = 0; + ENTRY; + LASSERT(PageLocked(vmpage)); if (PageWriteback(vmpage) || PageDirty(vmpage)) - return 0; + RETURN(0); mapping = vmpage->mapping; if (mapping == NULL) - return 1; + RETURN(1); obj = ll_i2info(mapping->host)->lli_clob; if (obj == NULL) - return 1; + RETURN(1); page = cl_vmpage_page(vmpage, obj); if (page == NULL) - return 1; + RETURN(1); env = cl_env_percpu_get(); LASSERT(!IS_ERR(env)); @@ -215,7 +217,7 @@ static bool do_release_page(struct page *vmpage, gfp_t wait) cl_page_put(env, page); cl_env_percpu_put(env); - return result; + RETURN(result); } #ifdef HAVE_AOPS_RELEASE_FOLIO diff --git a/lustre/llite/vvp_io.c b/lustre/llite/vvp_io.c index a8c85da..9f2715b 100644 --- a/lustre/llite/vvp_io.c +++ b/lustre/llite/vvp_io.c @@ -826,8 +826,10 @@ static int vvp_io_read_start(const struct lu_env *env, size_t cnt = io->u.ci_rd.rd.crw_count; size_t tot = vio->vui_tot_count; struct ll_cl_context *lcc; + unsigned int seq; int exceed = 0; int result; + int total_bytes_read = 0; struct iov_iter iter; pgoff_t page_offset; @@ -896,13 +898,29 @@ static int vvp_io_read_start(const struct lu_env *env, lcc->lcc_end_index = DIV_ROUND_UP(pos + iter.count, PAGE_SIZE); CDEBUG(D_VFSTRACE, "count:%ld iocb pos:%lld\n", iter.count, pos); - result = generic_file_read_iter(vio->vui_iocb, &iter); + /* this seqlock lets us notice if a page has been deleted on this inode + * during the fault process, allowing us to catch an erroneous short + * read or EIO + * See LU-16160 + */ + do { + seq = read_seqbegin(&ll_i2info(inode)->lli_page_inv_lock); + result = generic_file_read_iter(vio->vui_iocb, &iter); + if (result >= 0) { + io->ci_nob += result; + total_bytes_read += result; + } + /* if we got a short read or -EIO and we raced with page invalidation, + * retry + */ + } while (read_seqretry(&ll_i2info(inode)->lli_page_inv_lock, seq) && + ((result >= 0 && iov_iter_count(&iter) > 0) + || result == -EIO)); out: if (result >= 0) { - if (result < cnt) + if (total_bytes_read < cnt) io->ci_continue = 0; - io->ci_nob += result; result = 0; } else if (result == -EIOCBQUEUED) { io->ci_nob += vio->u.readwrite.vui_read; diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am index 330b403..5a88853 100644 --- a/lustre/tests/Makefile.am +++ b/lustre/tests/Makefile.am @@ -83,6 +83,7 @@ THETESTS += create_foreign_file parse_foreign_file THETESTS += create_foreign_dir parse_foreign_dir THETESTS += check_fallocate splice-test lseek_test expand_truncate_test THETESTS += foreign_symlink_striping lov_getstripe_old io_uring_probe +THETESTS += fadvise_dontneed_helper if LIBAIO THETESTS += aiocp diff --git a/lustre/tests/fadvise_dontneed_helper.c b/lustre/tests/fadvise_dontneed_helper.c new file mode 100644 index 0000000..fbcf41b --- /dev/null +++ b/lustre/tests/fadvise_dontneed_helper.c @@ -0,0 +1,33 @@ +#include +#include +#include +#include +#include + +int main(int argc, char *argv[]) +{ + int fd; + int rc; + + if (argc != 2) { + fprintf(stderr, "usage: %s file\n", argv[0]); + return 1; + } + + fd = open(argv[1], O_RDWR, 0); + if (fd <= 0) { + fprintf(stderr, + "open failed on %s, error: %s\n", + argv[1], strerror(errno)); + return errno; + } + + rc = posix_fadvise(fd, 0, 1024 * 1024, POSIX_FADV_DONTNEED); + if (rc) { + fprintf(stderr, + "fadvise FADV_DONTNEED failed on %s, error: %s\n", + argv[1], strerror(errno)); + return errno; + } + return 0; +} diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh index 288020d..0fe043a 100755 --- a/lustre/tests/sanityn.sh +++ b/lustre/tests/sanityn.sh @@ -5358,7 +5358,7 @@ test_94() { } run_test 94 "signal vs CP callback race" -test_95() { +test_95a() { local file=$DIR/$tfile local file2=$DIR2/$tfile local fast_read_save @@ -5388,7 +5388,38 @@ test_95() { kill -USR1 $pid && wait $pid || error "wait for PID $pid failed" } -run_test 95 "Check readpage() on a page that was removed from page cache" +run_test 95a "Check readpage() on a page that was removed from page cache" + +test_95b() { + local file=$DIR/$tfile + local file2=$DIR2/$tfile + local fast_read_save + local pid + + fast_read_save=$($LCTL get_param -n llite.*.fast_read | head -n 1) + [ -z "$fast_read_save" ] && skip "no fast read support" + + stack_trap "$LCTL set_param llite.*.fast_read=$fast_read_save" EXIT + $LCTL set_param llite.*.fast_read=0 + + $LFS setstripe -c $OSTCOUNT $file || error "failed to setstripe $file" + dd if=/dev/zero of=$file bs=$((PAGE_SIZE * 3)) count=1 || + error "failed to write $file" + + # This does the read from the second mount, so this flushes the pages + # the first mount and creates new ones on the second mount + # OBD_FAIL_LLITE_READPAGE_PAUSE2 0x1424 + $LCTL set_param fail_loc=0x80001424 fail_val=5 + $MULTIOP $file2 or${PAGE_SIZE}c & + pid=$! + + sleep 2 + fadvise_dontneed_helper $file2 + $LCTL set_param fail_loc=0 + sleep 4 + wait $pid || error "failed to read file" +} +run_test 95b "Check readpage() on a page that is no longer uptodate" # Data-on-MDT tests test_100a() { -- 1.8.3.1