Whamcloud - gitweb
LU-13805 llite: Fix return for non-queued aio 15/49915/17
authorPatrick Farrell <pfarrell@whamcloud.com>
Fri, 31 Mar 2023 20:34:33 +0000 (16:34 -0400)
committerOleg Drokin <green@whamcloud.com>
Wed, 31 May 2023 19:02:14 +0000 (19:02 +0000)
If an AIO fails or is completed synchronously (even
partially), the VFS will handle calling the completion
callback to finish the AIO, and so Lustre needs to return
the number of bytes successfully completed to the VFS.

This fixes a bug where if an AIO was racing with buffered
I/O, the AIO would fall back to buffered I/O, causing it to
complete before returning to the VFS rather than being
queued.  In this case, Lustre would return 0 the VFS, and
the VFS would complete the AIO and report 0 bytes moved.

This fixes the logic for this.

Signed-off-by: Patrick Farrell <pfarrell@whamcloud.com>
Change-Id: I9306402201e2962bbff04a4264c37bd0f1eca7b7
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49915
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Zhenyu Xu <bobijam@hotmail.com>
Reviewed-by: Qian Yingjin <qian@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/llite/file.c
lustre/llite/rw26.c
lustre/tests/sanity.sh

index 174da97..22e4eac 100644 (file)
@@ -1832,21 +1832,12 @@ restart:
                        range_locked = false;
        }
 
-       /*
-        * In order to move forward AIO, ci_nob was increased,
-        * but that doesn't mean io have been finished, it just
-        * means io have been submited, we will always return
-        * EIOCBQUEUED to the caller, So we could only return
-        * number of bytes in non-AIO case.
-        */
        if (io->ci_nob > 0) {
-               if (!is_aio) {
-                       if (rc2 == 0) {
-                               result += io->ci_nob;
-                               *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
-                       } else if (rc2) {
-                               result = 0;
-                       }
+               if (rc2 == 0) {
+                       result += io->ci_nob;
+                       *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
+               } else if (rc2) {
+                       result = 0;
                }
                count -= io->ci_nob;
 
@@ -1886,22 +1877,36 @@ out:
        }
 
        if (io->ci_dio_aio) {
+               /* set the number of bytes successfully moved in the aio */
+               if (result > 0)
+                       io->ci_dio_aio->cda_bytes = result;
                /*
                 * VFS will call aio_complete() if no -EIOCBQUEUED
                 * is returned for AIO, so we can not call aio_complete()
-                * in our end_io().
+                * in our end_io().  (cda_no_aio_complete is always set for
+                * normal DIO.)
                 *
-                * NB: This is safe because the atomic_dec_and_lock  in
-                * cl_sync_io_init has implicit memory barriers, so this will
-                * be seen by whichever thread completes the DIO/AIO, even if
-                * it's not this one
+                * NB: Setting cda_no_aio_complete like this is safe because
+                * the atomic_dec_and_lock in cl_sync_io_note has implicit
+                * memory barriers, so this will be seen by whichever thread
+                * completes the DIO/AIO, even if it's not this one.
                 */
-               if (rc != -EIOCBQUEUED)
+               if (is_aio && rc != -EIOCBQUEUED)
                        io->ci_dio_aio->cda_no_aio_complete = 1;
+               /* if an aio enqueued successfully (-EIOCBQUEUED), then Lustre
+                * will call aio_complete rather than the vfs, so we return 0
+                * to tell the VFS we're handling it
+                */
+               else if (is_aio) /* rc == -EIOCBQUEUED */
+                       result = 0;
                /**
-                * Drop one extra reference so that end_io() could be
-                * called for this IO context, we could call it after
-                * we make sure all AIO requests have been proceed.
+                * Drop the reference held by the llite layer on this top level
+                * IO context.
+                *
+                * For DIO, this frees it here, since IO is complete, and for
+                * AIO, we will call aio_complete() (and then free this top
+                * level context) once all the outstanding chunks of this AIO
+                * have completed.
                 */
                cl_sync_io_note(env, &io->ci_dio_aio->cda_sync,
                                rc == -EIOCBQUEUED ? 0 : rc);
index 3fd25e4..16acd8c 100644 (file)
@@ -590,7 +590,6 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw)
        }
 
 out:
-       ll_dio_aio->cda_bytes += tot_bytes;
 
        if (rw == WRITE)
                vio->u.readwrite.vui_written += tot_bytes;
index a54ff02..623fee7 100755 (executable)
@@ -26183,6 +26183,54 @@ test_398o() {
 }
 run_test 398o "right kms with DIO"
 
+test_398p()
+{
+       (( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs"
+       which aiocp || skip_env "no aiocp installed"
+
+       local stripe_size=$((1024 * 1024)) #1 MiB
+       # Max i/o below is ~ 4 * stripe_size, so this gives ~5 i/os
+       local file_size=$((25 * stripe_size))
+
+       $LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.1
+       stack_trap "rm -f $DIR/$tfile*"
+       # Just a bit bigger than the largest size in the test set below
+       dd if=/dev/urandom bs=$file_size count=1 of=$DIR/$tfile.1 ||
+               error "buffered i/o to create file failed"
+
+       for bs in $PAGE_SIZE $((PAGE_SIZE * 4)) $stripe_size \
+               $((stripe_size * 4)); do
+
+               $LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.2
+
+               echo "bs: $bs, file_size $file_size"
+               aiocp -a $PAGE_SIZE -b $bs -s $file_size -f O_DIRECT \
+                       $DIR/$tfile.1 $DIR/$tfile.2 &
+               pid_dio1=$!
+               # Buffered I/O with similar but not the same block size
+               dd if=$DIR/$tfile.1 bs=$((bs * 2)) of=$DIR/$tfile.2 \
+                       conv=notrunc &
+               pid_bio2=$!
+               wait $pid_dio1
+               rc1=$?
+               wait $pid_bio2
+               rc2=$?
+               if (( rc1 != 0 )); then
+                       error "aio copy 1 w/bsize $bs failed: $rc1"
+               fi
+               if (( rc2 != 0 )); then
+                       error "buffered copy 2 w/bsize $bs failed: $rc2"
+               fi
+
+               $CHECKSTAT -t file -s $file_size $DIR/$tfile.2 ||
+                       error "size incorrect"
+               cmp --verbose $DIR/$tfile.1 $DIR/$tfile.2 ||
+                       error "files differ, bsize $bs"
+               rm -f $DIR/$tfile.2
+       done
+}
+run_test 398p "race aio with buffered i/o"
+
 test_fake_rw() {
        local read_write=$1
        if [ "$read_write" = "write" ]; then