Whamcloud - gitweb
LU-16935 llite: avoid hopeless i/o repeats 05/51505/10
authorVladimir Saveliev <vladimir.saveliev@hpe.com>
Wed, 23 Aug 2023 14:19:38 +0000 (17:19 +0300)
committerOleg Drokin <green@whamcloud.com>
Sat, 23 Mar 2024 05:51:46 +0000 (05:51 +0000)
On SLES12SP5 kernels (4.12.14_122.147, 4.12.14-122.162) a race between
ll_filemap_fault and ll_imp_inval may lead to the livelock:

  - ll_filemap_fault loops endlessly as filemap_fault()->readpage()
    returns VM_FAULT_SIGBUS (it is unable to send read rpc as import
    is invalid) and as ll_page_inv_lock gets incremented within
    cl_page_discard()->..->vvp_page_delete() called after readpage
    failure.

  - ll_imp_inval stucks in
    obd_import_event(IMP_EVENT_INVALIDATE)->..->osc_object_invalidate
    (before recovery) waiting for completion of i/o ll_filemap_fault
    can not complete.

@ll_page_inv_lock is used to check the page being read by kernel
after it has been deleted from Lustre, which avoids potential
stale data reads. This seqlock allows us to see that a page was
potentially deleted, catch it in this case and repeat the I/O in
ll_filemap_fault() or vvp_io_read_start().

To avoid endless I/O repeat wrongly, in this patch we only increse
@ll_page_inv_lock for the page in PageUptodate state when delete
the page in vvp_page_delete(). The page that not in PageUptodate
state is usually deleted due to the error that does not require
retry.
By this way, ll_filemap_fault() and vvp_io_read_start() will not loop
endless for those errors that does not need to repeat I/O as the
seqlock @ll_page_inv_lock does not have any change.

Test to illustrate the issus is added.

sanity.sh tests are to test i/o error handling.

cl_io_loop(): avoid restart if ci_tried_all_mirrors flag is set.

HPE-bug-id: LUS-11686
Signed-off-by: Vladimir Saveliev <vladimir.saveliev@hpe.com>
Signed-off-by: Qian Yingjin <qian@ddn.com>
Change-Id: I3b62bc95db01bf11f6098011bf29e4064c7e201e
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51505
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Andrew Perepechko <andrew.perepechko@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/llite/vvp_io.c
lustre/llite/vvp_page.c
lustre/obdclass/cl_io.c
lustre/tests/recovery-small.sh
lustre/tests/sanity.sh

index 028f14b..4d88f21 100644 (file)
@@ -641,6 +641,7 @@ extern bool obd_enable_health_write;
 #define OBD_FAIL_LOV_INVALID_OSTIDX                0x1428
 #define OBD_FAIL_LLITE_DELAY_TRUNCATE              0x1430
 #define OBD_FAIL_LLITE_READ_PAUSE                  0x1431
+#define OBD_FAIL_LLITE_FAULT_PAUSE                 0x1432
 
 #define OBD_FAIL_FID_INDIR     0x1501
 #define OBD_FAIL_FID_INLMA     0x1502
index 62e9903..0fa77d0 100644 (file)
@@ -1510,6 +1510,8 @@ static int vvp_io_fault_start(const struct lu_env *env,
        if (result != 0)
                RETURN(result);
 
+       CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_FAULT_PAUSE, cfs_fail_val);
+
        /* must return locked page */
        if (fio->ft_mkwrite) {
                LASSERT(cfio->ft_vmpage != NULL);
index 07d3906..a900bbb 100644 (file)
@@ -85,11 +85,13 @@ static void vvp_page_delete(const struct lu_env *env,
                 * kernel after it has been deleted from Lustre, which avoids
                 * potential stale data reads.  The seqlock allows us to see
                 * that a page was potentially deleted and catch the resulting
-                * SIGBUS - see ll_filemap_fault() (LU-16160) */
-               write_seqlock(&ll_i2info(inode)->lli_page_inv_lock);
-               ClearPageUptodate(vmpage);
-               write_sequnlock(&ll_i2info(inode)->lli_page_inv_lock);
-
+                * SIGBUS - see ll_filemap_fault() (LU-16160)
+                */
+               if (PageUptodate(vmpage)) {
+                       write_seqlock(&ll_i2info(inode)->lli_page_inv_lock);
+                       ClearPageUptodate(vmpage);
+                       write_sequnlock(&ll_i2info(inode)->lli_page_inv_lock);
+               }
                /*
                 * The reference from vmpage to cl_page is removed,
                 * but the reference back is still here. It is removed
index ef674ae..b686e6e 100644 (file)
@@ -783,8 +783,12 @@ int cl_io_loop(const struct lu_env *env, struct cl_io *io)
                result = rc;
 
        if (result == -EAGAIN && io->ci_ndelay && !io->ci_iocb_nowait) {
-               io->ci_need_restart = 1;
-               result = 0;
+               if (!io->ci_tried_all_mirrors) {
+                       io->ci_need_restart = 1;
+                       result = 0;
+               } else {
+                       result = -EIO;
+               }
        }
 
        if (result == 0)
index 736ddd5..72ebcc1 100755 (executable)
@@ -3603,6 +3603,23 @@ test_156()
 }
 run_test 156 "tot_granted miscount after client eviction"
 
+test_157()
+{
+       $LFS setstripe -i 0 -c 1 $DIR/$tfile || error "setstripe failed"
+       dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1
+       cancel_lru_locks osc
+
+#define OBD_FAIL_LLITE_FAULT_PAUSE                 0x1432
+       $LCTL set_param fail_loc=0x80001432 fail_val=3
+       $MULTIOP $DIR/$tfile soO_RDWR:MRUc &
+       MULTIPID=$!
+       sleep 1
+       ost_evict_client
+       wait $MULTIPID
+       [[ $? == 135 ]] || error "multiop failed with not SIGBUS"
+}
+run_test 157 "eviction during mmaped i/o"
+
 complete_test $SECONDS
 check_and_cleanup_lustre
 exit_status
index 9a886fe..291e38f 100755 (executable)
@@ -28003,6 +28003,38 @@ test_398q()
 }
 run_test 398q "race dio with buffered i/o"
 
+test_398r() {
+       $LFS setstripe -i 0 -c 1 $DIR/$tfile || error "setstripe failed"
+       echo "hello, world" > $DIR/$tfile
+
+       cancel_lru_locks osc
+
+#define OBD_FAIL_OST_BRW_READ_BULK       0x20f
+       do_facet ost1 $LCTL set_param fail_loc=0x20f
+       cat $DIR/$tfile > /dev/null && error "cat should fail"
+       return 0
+}
+run_test 398r "i/o error on file read"
+
+test_398s() {
+       [[ $OSTCOUNT -ge 2 && "$ost1_HOST" = "$ost2_HOST" ]] ||
+               skip "remote OST"
+
+       $LFS mirror create -N -i 0 -c 1 -N -i 1 -c 1 $DIR/$tfile ||
+               error "mirror create failed"
+
+       echo "hello, world" > $DIR/$tfile
+       $LFS mirror resync $DIR/$tfile || error "mirror resync failed"
+
+       cancel_lru_locks osc
+
+#define OBD_FAIL_OST_BRW_READ_BULK       0x20f
+       do_facet ost1 $LCTL set_param fail_loc=0x20f
+       cat $DIR/$tfile > /dev/null && error "cat should fail"
+       return 0
+}
+run_test 398s "i/o error on mirror file read"
+
 test_fake_rw() {
        local read_write=$1
        if [ "$read_write" = "write" ]; then