Whamcloud - gitweb
LU-17364 llite: don't use stale page.
authorAlexey Lyashkov <alexey.lyashkov@hpe.com>
Fri, 12 Jan 2024 18:55:55 +0000 (13:55 -0500)
committerAndreas Dilger <adilger@whamcloud.com>
Thu, 18 Jan 2024 09:24:36 +0000 (09:24 +0000)
Using a stale page for write might confuse the read path,
which expects every IO page to have the PG_uptodate flag set,
and this caused a panic when the page was removed from IO.

Lustre-Change: https://review.whamcloud.com/53550
Lustre-Commit: TBD (from f7b42523e669d3653ca7c442fe82afde618bbdd5)

Test-Parameters: testlist=sanityn env=SLOW=yes,ONLY=16k,ONLY_REPEAT=10
Test-Parameters: testlist=sanityn env=SLOW=yes,ONLY=16k,ONLY_REPEAT=10
Test-Parameters: testlist=sanityn env=SLOW=yes,ONLY=16k,ONLY_REPEAT=10
Test-Parameters: testlist=sanityn env=SLOW=yes,ONLY=16k,ONLY_REPEAT=10
Test-Parameters: testlist=sanityn env=SLOW=yes,ONLY=16k,ONLY_REPEAT=10
Test-Parameters: testlist=sanityn env=SLOW=yes,ONLY=16k,ONLY_REPEAT=10
Signed-off-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
Change-Id: Ia01129ceaecf53d8d9f301c26cd2d65122f6a267
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/53666
Reviewed-by: Patrick Farrell <pfarrell@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lustre/llite/vvp_io.c
lustre/obdclass/cl_page.c
lustre/tests/sanityn.sh

index ed543b0..94436cb 100644 (file)
@@ -1564,7 +1564,17 @@ static int vvp_io_fault_start(const struct lu_env *env,
        /* must return locked page */
        if (fio->ft_mkwrite) {
                LASSERT(cfio->ft_vmpage != NULL);
-               lock_page(cfio->ft_vmpage);
+               vmpage = cfio->ft_vmpage;
+               lock_page(vmpage);
+               /**
+                * page was truncated and lock was cancelled, return ENODATA
+                * so that VM_FAULT_NOPAGE will be returned to handle_mm_fault()
+                * XXX: cannot return VM_FAULT_RETRY to vfs since we cannot
+                * release mmap_lock and VM_FAULT_RETRY implies that the
+                * mmap_lock is released.
+                */
+               if (!PageUptodate(vmpage))
+                       GOTO(out, result = -ENODATA);
        } else {
                result = vvp_io_kernel_fault(cfio);
                if (result != 0)
index 85319f6..b36ec85 100644 (file)
@@ -1087,6 +1087,7 @@ int cl_page_make_ready(const struct lu_env *env, struct cl_page *cl_page,
                RETURN(-EINVAL);
 
        lock_page(vmpage);
+       PASSERT(env, cl_page, PageUptodate(vmpage));
        cl_page_slice_for_each(cl_page, slice, i) {
                if (slice->cpl_ops->io[crt].cpo_make_ready != NULL)
                        result = (*slice->cpl_ops->io[crt].cpo_make_ready)(env, slice);
index 8c74126..9091c52 100755 (executable)
@@ -698,6 +698,38 @@ test_16i() {
 }
 run_test 16i "read after truncate file"
 
+test_16k() {
+       local fsxN=${FSX_NP:-5}
+       local fsxNops=${FSX_NOPS:-10000}
+       local fsxNparams=${FSXPARAMS_16k:-""}
+       local dropsleep=${DROP_SLEEP:-3}
+       local dpipd
+       local -a fsxpids
+       local cmd
+
+       [ "$SLOW" = "no" ] && fsxNops=1000
+
+       while true; do
+               echo 3 > /proc/sys/vm/drop_caches
+               sleep $dropsleep
+       done &
+       dpipd=$!
+       stack_trap "kill -9 $dpipd"
+
+       for ((i = 1; i <= fsxN; i++)); do
+               cmd="$FSX $fsxNparams -N $fsxNops $DIR/fsxfile.${i} -l $((64 * 1024 * 1024))"
+               echo "+ $cmd"
+               eval $cmd &
+               fsxpids[$i]=$!
+       done
+       for ((i = 1; i <= fsxN; i++)); do
+               wait ${fsxpids[$i]} && echo "+ fsx $i done: rc=$?" ||
+                       error "- fsx $i FAILURE! rc=$?"
+               date
+       done
+}
+run_test 16k "Parallel FSX and drop caches should not panic"
+
 test_17() { # bug 3513, 3667
        remote_ost_nodsh && skip "remote OST with nodsh" && return