The newer kernel (such as Ubuntu 2204) introduces a new member:
invalidate_lock in the structure @address_space.
The filesystem must exclusively acquire invalidate_lock before
invalidating page cache in truncate / hole punch (and thus calling
into ->invalidatepage) to block races between page cache
invalidation and page cache filling functions (fault, read, ...)
However, the current Lustre client does not hold this lock when
removing pages from the page cache due to the revocation of the
extent DLM lock protecting them.
If a client has two overlapped PR DLM extent locks, i.e:
- L1 = <PR, [1M, 4M - 1]
- L2 = <PR, [3M, 5M - 1]
A reader process holds L1 and reads data in range [3M, 4M - 1].
L2 is being revoked due to conflicting access.
Then the page read-in by the reader may be invalidated and deleted
from page cache by the revocation of L2 (in lock blocking AST).
The older kernel will check each page after read whether it was
invalidated and deleted from page cache. If so, it will retry the
page read.
In the newer kernel, it removes this check and retry.
Instead, it introduces a new rw_semaphore in the address_space -
invalidate_lock - where the shared lock is held to protect adding
pages to the page cache for page faults / reads / readahead, and
the exclusive lock is held to protect invalidating pages and
removing them from the page cache for truncate / hole punch.
Thus, this patch holds the exclusive invalidate_lock in newer
kernels when removing pages from the page cache due to the
revocation of an extent DLM lock protecting them. Otherwise, it
would result in -EIO errors or partial reads in the newly added
test case sanity/833.
Lustre-change: https://review.whamcloud.com/50371
Lustre-commit: bba59b1287c9cd8c30a85fafb4fd5788452bd05c
Test-parameters: clientdistro=ubuntu2204 testlist=sanity env=ONLY=833,ONLY_REPEAT=10
Change-Id: If3a27002b89636b9fd4d7b5ea50afa9aeac5d121
Signed-off-by: Qian Yingjin <qian@ddn.com>
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/50353
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
]) # LC_HAVE_GET_ACL_RCU_ARG
#
+# LC_HAVE_INVALIDATE_LOCK
+#
+# Kernel version v5.15-rc1 commit 730633f0b7f951726e87f912a6323641f674ae34
+# mm: Protect operations adding pages to page cache with invalidate_lock
+#
+# Probe for the mapping->invalidate_lock API by compiling a call to
+# filemap_invalidate_lock(); -Werror makes an implicit-declaration a
+# hard failure so the check cannot false-positive on older kernels.
+AC_DEFUN([LC_HAVE_INVALIDATE_LOCK], [
+tmp_flags="$EXTRA_KCFLAGS"
+EXTRA_KCFLAGS="-Werror"
+LB_CHECK_COMPILE([if address_space has invalidate_lock member],
+address_space_invalidate_lock, [
+	#include <linux/fs.h>
+	],[
+	struct address_space *mapping = NULL;
+
+	filemap_invalidate_lock(mapping);
+	],[
+	AC_DEFINE(HAVE_INVALIDATE_LOCK, 1,
+		[address_space invalidate_lock member exists])
+	])
+EXTRA_KCFLAGS="$tmp_flags"
+]) # LC_HAVE_INVALIDATE_LOCK
+
+#
# LC_HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG
#
# Linux v5.15-rc1-20-g15bf32398ad4
# 5.15
LC_HAVE_GET_ACL_RCU_ARG
+ LC_HAVE_INVALIDATE_LOCK
# 5.16
LC_HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG
* write is probably size expanding and doesn't have to be chunk
* aligned
*/
- ci_size_extend_compression:1;
+ ci_size_extend_compression:1,
+ /*
+ * The filesystem must exclusively acquire invalidate_lock before
+ * invalidating page cache in truncate / hole punch / DLM extent
+ * lock blocking AST path (and thus calling into ->invalidatepage)
+ * to block races between page cache invalidation and page cache
+ * filling functions (fault, read, ...)
+ */
+ ci_invalidate_page_cache:1;
+
/**
* How many times the read has retried before this one.
* Set by the top level and consumed by the LOV.
CLOBINVRNT(env, obj, vvp_object_invariant(obj));
CDEBUG(D_VFSTRACE, DFID" ignore/verify layout %d/%d, layout version %d "
- "need write layout %d, restore needed %d\n",
+ "need write layout %d, restore needed %d, invalidate_lock %d\n",
PFID(lu_object_fid(&obj->co_lu)),
io->ci_ignore_layout, io->ci_verify_layout,
vio->vui_layout_gen, io->ci_need_write_intent,
- io->ci_restore_needed);
+ io->ci_restore_needed, io->ci_invalidate_page_cache);
+
+#ifdef HAVE_INVALIDATE_LOCK
+ if (io->ci_invalidate_page_cache) {
+ filemap_invalidate_unlock(inode->i_mapping);
+ io->ci_invalidate_page_cache = 0;
+ }
+#endif /* HAVE_INVALIDATE_LOCK */
if (io->ci_restore_needed) {
/* file was detected release, we need to restore it
PFID(lu_object_fid(&obj->co_lu)), result);
}
+#ifdef HAVE_INVALIDATE_LOCK
+ if (io->ci_invalidate_page_cache)
+ filemap_invalidate_lock(inode->i_mapping);
+#endif /* HAVE_INVALIDATE_LOCK */
+
io->ci_result = result < 0 ? result : 0;
RETURN(result);
}
io->ci_obj = cl_object_top(osc2cl(osc));
io->ci_ignore_layout = 1;
+ io->ci_invalidate_page_cache = 1;
io->u.ci_misc.lm_next_rpc_time = ktime_get_seconds() +
5 * obd_timeout / 16;
result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
}
run_test 832 "lfs rm_entry"
+test_833() {
+	local file=$DIR/$tfile
+
+	# Remove the data file AND the stop-flag file on exit; the
+	# original trap leaked sanity.833.lck, which would also make
+	# repeat runs (ONLY_REPEAT) exit their loops immediately.
+	stack_trap "rm -f $file $DIR/sanity.833.lck" EXIT
+	dd if=/dev/zero of=$file bs=1M count=50 || error "Write $file failed"
+
+	local wpid
+	local rpid
+	local rpid2
+
+	# Three concurrent workers loop until the .lck flag file appears,
+	# mixing buffered writes with buffered and direct reads to race
+	# DLM extent lock revocation against page cache filling.
+
+	# Buffered I/O write
+	(
+		while [ ! -e $DIR/sanity.833.lck ]; do
+			dd if=/dev/zero of=$file bs=1M count=50 conv=notrunc ||
+				error "failed to write $file"
+			sleep 0.$((RANDOM % 4 + 1))
+		done
+	)&
+	wpid=$!
+
+	# Buffered I/O read
+	(
+		while [ ! -e $DIR/sanity.833.lck ]; do
+			dd if=$file of=/dev/null bs=1M count=50 ||
+				error "failed to read $file"
+			sleep 0.$((RANDOM % 4 + 1))
+		done
+	)&
+	rpid=$!
+
+	# Direct I/O read
+	(
+		while [ ! -e $DIR/sanity.833.lck ]; do
+			dd if=$file of=/dev/null bs=1M count=50 iflag=direct ||
+				error "failed to read $file in direct I/O mode"
+			sleep 0.$((RANDOM % 4 + 1))
+		done
+	)&
+	rpid2=$!
+
+	# Let the workers race for a while, then signal them to stop.
+	sleep 30
+	touch $DIR/sanity.833.lck
+	wait $wpid || error "$?: buffered write failed"
+	wait $rpid || error "$?: buffered read failed"
+	wait $rpid2 || error "$?: direct read failed"
+}
+run_test 833 "Mixed buffered/direct read and write should not return -EIO"
+
#
# tests that do cleanup/setup should be run at the end
#