From ff35d160dd3437dab2e08b52546e4dad76d01e01 Mon Sep 17 00:00:00 2001 From: Patrick Farrell Date: Thu, 11 May 2023 16:55:37 -0400 Subject: [PATCH] LU-13805 llite: wait for partially successful aio For various reasons (notably conflicting buffered IO), we may need to fall back from DIO to buffered IO. This also affects AIO, and if it happens, we will sometimes submit only part of an AIO with the AIO path, completing the rest with the buffered path. Userspace doesn't expect this, expecting us to either do all or none of the IO with AIO, so it doesn't wait for completion in this case. To meet this expectation, we must recognize this case and wait for AIO to complete before returning to userspace. Signed-off-by: Patrick Farrell Change-Id: Iac7abac3bd01f027c353120483932a62c6475277 --- lustre/include/obd_support.h | 1 + lustre/llite/file.c | 35 +++++++++++++++++++++++++++++------ lustre/llite/rw26.c | 4 ++++ lustre/obdclass/cl_io.c | 3 ++- 4 files changed, 36 insertions(+), 7 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index bb075f3..c8dbe99 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -629,6 +629,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LLITE_READPAGE_PAUSE 0x1422 #define OBD_FAIL_LLITE_PANIC_ON_ESTALE 0x1423 #define OBD_FAIL_LLITE_READPAGE_PAUSE2 0x1424 +#define OBD_FAIL_LLITE_DIO_FALLBACK 0x1425 #define OBD_FAIL_FID_INDIR 0x1501 #define OBD_FAIL_FID_INLMA 0x1502 diff --git a/lustre/llite/file.c b/lustre/llite/file.c index bcaa524..2abc902 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -1890,6 +1890,9 @@ out: } if (io->ci_dio_aio) { + struct cl_sync_io *anchor = &io->ci_dio_aio->cda_sync; + bool creator_free; + /* set the number of bytes successfully moved in the aio */ if (result > 0) io->ci_dio_aio->cda_bytes = result; @@ -1899,13 +1902,27 @@ out: * in our end_io(). (cda_no_aio_complete is always set for * normal DIO.) * + * Returning !EIOCBQUEUED indicates we didn't do the IO + * successfully as pure AIO, but we may have done some of the + * IO as AIO. If we return to userspace with an rc other than + * EIOCBQUEUED, userspace assumes we completed all IO and + * continues without waiting, so we must wait for all IO to + * complete. + * * NB: Setting cda_no_aio_complete like this is safe because * the atomic_dec_and_lock in cl_sync_io_note has implicit * memory barriers, so this will be seen by whichever thread * completes the DIO/AIO, even if it's not this one. */ - if (is_aio && rc != -EIOCBQUEUED) + if (is_aio && rc != -EIOCBQUEUED) { io->ci_dio_aio->cda_no_aio_complete = 1; + LASSERT(ergo(!io->ci_dio_aio->cda_creator_free, is_aio)); + io->ci_dio_aio->cda_creator_free = 1; + + rc2 = cl_sync_io_wait_recycle(env, anchor, 0, 0); + if (rc2 < 0) + rc = rc2; + } /* if an aio enqueued successfully (-EIOCBQUEUED), then Lustre * will call aio_complete rather than the vfs, so we return 0 * to tell the VFS we're handling it @@ -1919,12 +1936,18 @@ out: * For DIO, this frees it here, since IO is complete, and for * AIO, we will call aio_complete() (and then free this top * level context) once all the outstanding chunks of this AIO - * have completed. + * have completed. NB: for AIO which is only partially done as + * AIO, we also complete it here. See call to + * cl_sync_io_wait_recycle just above. + */ + /* grab creator_free value before doing final cl_sync_io_note, + * since the ci_dio_aio may be freed after that */ - cl_sync_io_note(env, &io->ci_dio_aio->cda_sync, - rc == -EIOCBQUEUED ? 0 : rc); - if (!is_aio) { - LASSERT(io->ci_dio_aio->cda_creator_free); + creator_free = io->ci_dio_aio->cda_creator_free; + cl_sync_io_note(env, anchor, rc == -EIOCBQUEUED ? 0 : rc); + if (creator_free) { + /* aio is only freed here if it wasn't 100% successful*/ + LASSERT(!is_aio || rc != -EIOCBQUEUED); cl_dio_aio_free(env, io->ci_dio_aio); io->ci_dio_aio = NULL; } diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c index dfca570..3e675a9 100644 --- a/lustre/llite/rw26.c +++ b/lustre/llite/rw26.c @@ -515,6 +515,10 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw) (count >> PAGE_SHIFT) + !!(count & ~PAGE_MASK), MAX_DIO_SIZE >> PAGE_SHIFT, unaligned ? ", unaligned" : ""); + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_DIO_FALLBACK) && + cfs_fail_val == file_offset) + RETURN(0); + /* Check EOF by ourselves */ if (rw == READ && file_offset >= i_size_read(inode)) RETURN(0); diff --git a/lustre/obdclass/cl_io.c b/lustre/obdclass/cl_io.c index b32740b..5687715 100644 --- a/lustre/obdclass/cl_io.c +++ b/lustre/obdclass/cl_io.c @@ -1686,7 +1686,8 @@ EXPORT_SYMBOL(cl_sync_io_note); /* this function waits for completion of outstanding io and then re-initializes * the anchor used to track it. This is used to wait to complete DIO before - * returning to userspace, and is never called for true AIO + * returning to userspace; it is called for AIO in certain fallback cases where + * we must complete all AIO before returning to userspace */ int cl_sync_io_wait_recycle(const struct lu_env *env, struct cl_sync_io *anchor, long timeout, int ioret) -- 1.8.3.1