From 7194eb6431d2ef7245ef3b13394b60e220145187 Mon Sep 17 00:00:00 2001 From: Patrick Farrell Date: Thu, 27 Jul 2023 14:36:30 -0400 Subject: [PATCH] LU-13805 clio: bounce buffer for unaligned DIO Direct I/O must normally be page aligned both in terms of I/O size and memory alignment. This is because the I/O must be page aligned before being written to disk. This is also true for buffered I/O, but the I/O is aligned using the page cache. In recent versions of Lustre, direct I/O is significantly faster than buffered I/O, due to lower overhead for page management. Thus it is desirable to be able to do more I/O as direct I/O. This patch allows unaligned direct I/O by creating a buffer inside the kernel and aligning the I/O by copying in to this aligned buffer. Because the main cost of buffered I/O is locking in the page cache rather than memcopy(), this is still significantly faster than buffered I/O (though slower than normal direct I/O). This will eventually allow us to convert buffered I/O to direct I/O when doing so would increase performance. Here are some comparative benchmarks using IOR, all single process. UDIO is unaligned DIO. io size 1M 4M 16M 64M ---------------------------------------------------------- BIO Write | 1502 MiB/s | 1382 MiB/s | 1683 MiB/s | 1677 MiB/s BIO Read | 2169 MiB/s | 1902 MiB/s | 2131 MiB.s | 1955 MiB/s DIO Write | 1010 MiB/s | 2778 MiB/s | 5905 MiB/s | 7917 MiB/s DIO Read | 893 MiB/s | 2657 MiB/s | 4724 MiB/s | 7579 MiB/s UDIO Write | 848 MiB/s | 1666 MiB/s | 2117 MiB/s | 2243 MiB/s UDIO Read | 933 MIB/s | 2412 MiB/s | 3690 MiB/s | 5370 MiB/s Unaligned DIO offers benefits vs buffered write and buffered read, but is of course slower than DIO. Notice on this node the best case DIO performance is ~8 GiB/s. On a node with 12 GiB/s best case DIO, best case UDIO read is 8 GiB/s and best case UDIO write is 2.5 GiB/s. This is because UDIO read is fully parallel, UDIO write is not. Signed-off-by: Patrick Farrell Change-Id: I7eeebf9a608f006c8095b95f0677adb99f19d640 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/45616 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Qian Yingjin Reviewed-by: Oleg Drokin --- lustre/autoconf/lustre-core.m4 | 49 +++++++ lustre/include/cl_object.h | 25 +++- lustre/llite/rw26.c | 96 +++++++++--- lustre/obdclass/cl_io.c | 321 ++++++++++++++++++++++++++++++++++++++--- lustre/osd-zfs/osd_io.c | 1 - lustre/tests/sanity.sh | 167 +++++++++++++++++++-- lustre/tests/sanityn.sh | 11 +- 7 files changed, 606 insertions(+), 64 deletions(-) diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 index d72b428..55dcc3a 100644 --- a/lustre/autoconf/lustre-core.m4 +++ b/lustre/autoconf/lustre-core.m4 @@ -3182,6 +3182,26 @@ fileattr_set, [ EXTRA_KCFLAGS="$tmp_flags" ]) # LC_HAVE_FILEATTR_GET +# LC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC +# +# Kernel 5.13 commit f0b65f39ac505e8f1dcdaa165aa7b8c0bd6fd454 +# iov_iter: replace iov_iter_copy_from_user_atomic() with iterator-advancing variant +# +AC_DEFUN([LC_SRC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC], [ + LB2_LINUX_TEST_SRC([copy_page_from_iter_atomic], [ + #include + ],[ + copy_page_from_iter_atomic(NULL, 0, 0, NULL); + ]) +]) +AC_DEFUN([LC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC], [ + AC_MSG_CHECKING([if have copy_page_from_iter_atomic]) + LB2_LINUX_TEST_RESULT([copy_page_from_iter_atomic], [ + AC_DEFINE(HAVE_COPY_PAGE_FROM_ITER_ATOMIC, 1, + ['copy_page_from_iter_atomic' exists]) + ]) +]) # LC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC + # # LC_HAVE_GET_ACL_RCU_ARG # @@ -3205,6 +3225,26 @@ AC_DEFUN([LC_HAVE_GET_ACL_RCU_ARG], [ ]) ]) # LC_HAVE_GET_ACL_RCU_ARG +# LC_HAVE_FAULT_IN_IOV_ITER_READABLE +# +# Kernel 5.15 commit a6294593e8a1290091d0b078d5d33da5e0cd3dfe +# iov_iter: Turn iov_iter_fault_in_readable into fault_in_iov_iter_readable +# +AC_DEFUN([LC_SRC_HAVE_FAULT_IN_IOV_ITER_READABLE], [ + LB2_LINUX_TEST_SRC([fault_in_iov_iter_readable], [ + #include + ],[ + fault_in_iov_iter_readable(NULL, 0); + ]) +]) +AC_DEFUN([LC_HAVE_FAULT_IN_IOV_ITER_READABLE], [ + AC_MSG_CHECKING([if have fault_in_iov_iter_readable]) + LB2_LINUX_TEST_RESULT([fault_in_iov_iter_readable], [ + AC_DEFINE(HAVE_FAULT_IN_IOV_ITER_READABLE, 1, + ['fault_in_iov_iter_readable' exists]) + ]) +]) # LC_HAVE_FAULT_IN_IOV_ITER_READABLE + # # LC_HAVE_INVALIDATE_LOCK # @@ -4129,8 +4169,12 @@ AC_DEFUN([LC_PROG_LINUX_SRC], [ # 5.12 LC_SRC_HAVE_USER_NAMESPACE_ARG + # 5.13 + LC_SRC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC + # 5.15 LC_SRC_HAVE_GET_ACL_RCU_ARG + LC_SRC_HAVE_FAULT_IN_IOV_ITER_READABLE # 5.16 LC_SRC_HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG @@ -4398,10 +4442,15 @@ AC_DEFUN([LC_PROG_LINUX_RESULTS], [ # 5.13 LC_HAVE_FILEATTR_GET + LC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC + + # 5.15 + LC_HAVE_GET_ACL_RCU_ARG # 5.15 LC_HAVE_GET_ACL_RCU_ARG LC_HAVE_INVALIDATE_LOCK + LC_HAVE_FAULT_IN_IOV_ITER_READABLE # 5.16 LC_HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h index 4946a15..6689a30 100644 --- a/lustre/include/cl_object.h +++ b/lustre/include/cl_object.h @@ -1942,7 +1942,12 @@ struct cl_io { * to userspace, only the RPCs are submitted async, then waited for at * the llite layer before returning. */ - ci_parallel_dio:1; + ci_parallel_dio:1, + /** + * this DIO is at least partly unaligned, and so the unaligned DIO + * path is being used for this entire IO + */ + ci_unaligned_dio:1; /** * Bypass quota check */ @@ -2521,7 +2526,7 @@ struct cl_dio_aio *cl_dio_aio_alloc(struct kiocb *iocb, struct cl_object *obj, bool is_aio); struct cl_sub_dio *cl_sub_dio_alloc(struct cl_dio_aio *ll_aio, struct iov_iter *iter, bool write, - bool sync); + bool unaligned, bool sync); void cl_dio_aio_free(const struct lu_env *env, struct cl_dio_aio *aio); void cl_sub_dio_free(struct cl_sub_dio *sdio); static inline void cl_sync_io_init(struct cl_sync_io *anchor, int nr) @@ -2553,14 +2558,14 @@ struct cl_sync_io { /** direct IO pages */ struct ll_dio_pages { /* - * page array to be written. we don't support - * partial pages except the last one. + * page array for RDMA - for aligned i/o, this is the user provided + * pages, but for unaligned i/o, this is the internal buffer */ - struct page **ldp_pages; + struct page **ldp_pages; /** # of pages in the array. */ - size_t ldp_count; + size_t ldp_count; /* the file offset of the first page. */ - loff_t ldp_file_offset; + loff_t ldp_file_offset; }; /* Top level struct used for AIO and DIO */ @@ -2585,7 +2590,8 @@ struct cl_sub_dio { struct ll_dio_pages csd_dio_pages; struct iov_iter csd_iter; unsigned csd_creator_free:1, - csd_write:1; + csd_write:1, + csd_unaligned:1; }; #if defined(HAVE_DIRECTIO_ITER) || defined(HAVE_IOV_ITER_RW) || \ defined(HAVE_DIRECTIO_2ARGS) @@ -2593,6 +2599,9 @@ struct cl_sub_dio { #endif void ll_release_user_pages(struct page **pages, int npages); +int ll_allocate_dio_buffer(struct ll_dio_pages *pvec, size_t io_size); +void ll_free_dio_buffer(struct ll_dio_pages *pvec); +ssize_t ll_dio_user_copy(struct cl_sub_dio *sdio, struct iov_iter *write_iov); #ifndef HAVE_KTHREAD_USE_MM #define kthread_use_mm(mm) use_mm(mm) diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c index ed009f8..8f9511d 100644 --- a/lustre/llite/rw26.c +++ b/lustre/llite/rw26.c @@ -492,26 +492,39 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw) ssize_t tot_bytes = 0, result = 0; loff_t file_offset = iocb->ki_pos; bool sync_submit = false; + bool unaligned = false; struct vvp_io *vio; ssize_t rc2; + ENTRY; + /* Check EOF by ourselves */ if (rw == READ && file_offset >= i_size_read(inode)) - return 0; + RETURN(0); - /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */ if (file_offset & ~PAGE_MASK) - RETURN(-EINVAL); + unaligned = true; - CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), size=%zd (max %lu), " - "offset=%lld=%llx, pages %zd (max %lu)\n", - PFID(ll_inode2fid(inode)), inode, count, MAX_DIO_SIZE, - file_offset, file_offset, count >> PAGE_SHIFT, - MAX_DIO_SIZE >> PAGE_SHIFT); + if (count & ~PAGE_MASK) + unaligned = true; /* Check that all user buffers are aligned as well */ if (ll_iov_iter_alignment(iter) & ~PAGE_MASK) - RETURN(-EINVAL); + unaligned = true; + + CDEBUG(D_VFSTRACE, + "VFS Op:inode="DFID"(%p), size=%zd (max %lu), offset=%lld=%llx, pages %zd (max %lu)%s\n", + PFID(ll_inode2fid(inode)), inode, count, MAX_DIO_SIZE, + file_offset, file_offset, + (count >> PAGE_SHIFT) + !!(count & ~PAGE_MASK), + MAX_DIO_SIZE >> PAGE_SHIFT, unaligned ? ", unaligned" : ""); + + /* the requirement to not return EIOCBQUEUED for pipes (see bottom of + * this function) plays havoc with the unaligned I/O lifecycle, so + * don't allow unaligned I/O on pipes + */ + if (unaligned && iov_iter_is_pipe(iter)) + RETURN(0); lcc = ll_cl_find(inode); if (lcc == NULL) @@ -523,6 +536,15 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw) io = lcc->lcc_io; LASSERT(io != NULL); + /* if one part of an I/O is unaligned, just handle all of it that way - + * otherwise we create significant complexities with managing the iovec + * in different ways, etc, all for very marginal benefits + */ + if (unaligned) + io->ci_unaligned_dio = true; + if (io->ci_unaligned_dio) + unaligned = true; + ll_dio_aio = io->ci_dio_aio; LASSERT(ll_dio_aio); LASSERT(ll_dio_aio->cda_iocb == iocb); @@ -557,14 +579,28 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw) * (either in this function or from a ptlrpcd daemon) */ sdio = cl_sub_dio_alloc(ll_dio_aio, iter, rw == WRITE, - sync_submit); + unaligned, sync_submit); if (!sdio) GOTO(out, result = -ENOMEM); pvec = &sdio->csd_dio_pages; pvec->ldp_file_offset = file_offset; - result = ll_get_user_pages(rw, iter, pvec, count); + if (!unaligned) { + result = ll_get_user_pages(rw, iter, pvec, count); + /* ll_get_user_pages returns bytes in the IO or error*/ + count = result; + } else { + /* same calculation used in ll_get_user_pages */ + count = min_t(size_t, count, iter->iov->iov_len); + result = ll_allocate_dio_buffer(pvec, count); + /* allocate_dio_buffer returns number of pages or + * error, so do not set count = result + */ + } + + /* now we have the actual count, so store it in the sdio */ + sdio->csd_bytes = count; if (unlikely(result <= 0)) { cl_sync_io_note(env, &sdio->csd_sync, result); @@ -574,10 +610,33 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw) } GOTO(out, result); } - count = result; - result = ll_direct_rw_pages(env, io, count, - rw, inode, sdio); + if (unaligned && rw == WRITE) { + result = ll_dio_user_copy(sdio, iter); + if (unlikely(result <= 0)) { + cl_sync_io_note(env, &sdio->csd_sync, result); + if (sync_submit) { + cl_sub_dio_free(sdio); + LASSERT(sdio->csd_creator_free); + } + GOTO(out, result); + } + } + + result = ll_direct_rw_pages(env, io, count, rw, inode, sdio); + /* if the i/o was unsuccessful, we zero the number of bytes to + * copy back. Note that partial I/O completion isn't possible + * here - I/O either completes or fails. So there's no need to + * handle short I/O here by changing 'count' with the result + * from ll_direct_rw_pages. + * + * This must be done before we release the reference + * immediately below, because releasing the reference allows + * i/o completion (and copyback to userspace, if unaligned) to + * start. + */ + if (result != 0) + sdio->csd_bytes = 0; /* We've submitted pages and can now remove the extra * reference for that */ @@ -595,12 +654,15 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw) GOTO(out, result); iov_iter_advance(iter, count); + tot_bytes += count; file_offset += count; + CDEBUG(D_VFSTRACE, + "result %zd tot_bytes %zd count %zd file_offset %lld\n", + result, tot_bytes, count, file_offset); } out: - if (rw == WRITE) vio->u.readwrite.vui_written += tot_bytes; else @@ -612,10 +674,10 @@ out: if (result == 0 && !iov_iter_is_pipe(iter)) result = -EIOCBQUEUED; - return result; + RETURN(result); } -#if defined(HAVE_DIO_ITER) +#ifdef HAVE_DIO_ITER static ssize_t ll_direct_IO( #ifndef HAVE_IOV_ITER_RW int rw, diff --git a/lustre/obdclass/cl_io.c b/lustre/obdclass/cl_io.c index 2984c43..0cf0943 100644 --- a/lustre/obdclass/cl_io.c +++ b/lustre/obdclass/cl_io.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -1206,8 +1207,34 @@ static void cl_sub_dio_end(const struct lu_env *env, struct cl_sync_io *anchor) cl_page_list_del(env, &sdio->csd_pages, page); } - ll_release_user_pages(sdio->csd_dio_pages.ldp_pages, - sdio->csd_dio_pages.ldp_count); + if (sdio->csd_unaligned) { + /* save the iovec pointer before it's modified by + * ll_dio_user_copy + */ + struct iovec *tmp = (struct iovec *) sdio->csd_iter.iov; + + CDEBUG(D_VFSTRACE, + "finishing unaligned dio %s aio->cda_bytes %ld\n", + sdio->csd_write ? "write" : "read", sdio->csd_bytes); + /* read copies *from* the kernel buffer *to* userspace + * here at the end, write copies *to* the kernel + * buffer from userspace at the start + */ + if (!sdio->csd_write && sdio->csd_bytes > 0) + ret = ll_dio_user_copy(sdio, NULL); + ll_free_dio_buffer(&sdio->csd_dio_pages); + /* handle the freeing here rather than in cl_sub_dio_free + * because we have the unmodified iovec pointer + */ + OBD_FREE_PTR(tmp); + sdio->csd_iter.iov = NULL; + } else { + /* unaligned DIO does not get user pages, so it doesn't have to + * release them, but aligned I/O must + */ + ll_release_user_pages(sdio->csd_dio_pages.ldp_pages, + sdio->csd_dio_pages.ldp_count); + } cl_sync_io_note(env, &sdio->csd_ll_aio->cda_sync, ret); EXIT; @@ -1246,7 +1273,7 @@ EXPORT_SYMBOL(cl_dio_aio_alloc); struct cl_sub_dio *cl_sub_dio_alloc(struct cl_dio_aio *ll_aio, struct iov_iter *iter, bool write, - bool sync) + bool unaligned, bool sync) { struct cl_sub_dio *sdio; @@ -1261,27 +1288,33 @@ struct cl_sub_dio *cl_sub_dio_alloc(struct cl_dio_aio *ll_aio, cl_page_list_init(&sdio->csd_pages); sdio->csd_ll_aio = ll_aio; - atomic_add(1, &ll_aio->cda_sync.csi_sync_nr); sdio->csd_creator_free = sync; sdio->csd_write = write; + sdio->csd_unaligned = unaligned; - /* we need to make a copy of the user iovec at this point in - * time so we: - * A) have the correct state of the iovec for this chunk of I/O - * B) have a chunk-local copy; some of the things we want to - * do to the iovec modify it, so to process each chunk from a - * separate thread requires a local copy of the iovec - */ - memcpy(&sdio->csd_iter, iter, - sizeof(struct iov_iter)); - OBD_ALLOC_PTR(sdio->csd_iter.iov); - if (sdio->csd_iter.iov == NULL) { - cl_sub_dio_free(sdio); - sdio = NULL; - goto out; + atomic_add(1, &ll_aio->cda_sync.csi_sync_nr); + + if (unaligned) { + /* we need to make a copy of the user iovec at this + * point in time, in order to: + * + * A) have the correct state of the iovec for this + * chunk of I/O, ie, the main iovec is altered as we do + * I/O and this chunk needs the current state + * B) have a chunk-local copy; doing the IO later + * modifies the iovec, so to process each chunk from a + * separate thread requires a local copy of the iovec + */ + memcpy(&sdio->csd_iter, iter, sizeof(struct iov_iter)); + OBD_ALLOC_PTR(sdio->csd_iter.iov); + if (sdio->csd_iter.iov == NULL) { + cl_sub_dio_free(sdio); + sdio = NULL; + goto out; + } + memcpy((void *) sdio->csd_iter.iov, iter->iov, + sizeof(struct iovec)); } - memcpy((void *) sdio->csd_iter.iov, iter->iov, - sizeof(struct iovec)); } out: return sdio; @@ -1302,16 +1335,96 @@ EXPORT_SYMBOL(cl_dio_aio_free); void cl_sub_dio_free(struct cl_sub_dio *sdio) { if (sdio) { - void *tmp = (void *) sdio->csd_iter.iov; + void *tmp = (void *)sdio->csd_iter.iov; - if (tmp) - OBD_FREE(tmp, sizeof(struct iovec)); + if (tmp) { + LASSERT(sdio->csd_unaligned); + OBD_FREE_PTR(tmp); + } OBD_SLAB_FREE_PTR(sdio, cl_sub_dio_kmem); } } EXPORT_SYMBOL(cl_sub_dio_free); /* + * For unaligned DIO. + * + * Allocate the internal buffer from/to which we will perform DIO. This takes + * the user I/O parameters and allocates an internal buffer large enough to + * hold it. The pages in this buffer are aligned with pages in the file (ie, + * they have a 1-to-1 mapping with file pages). + */ +int ll_allocate_dio_buffer(struct ll_dio_pages *pvec, size_t io_size) +{ + struct page *new_page; + size_t pg_offset; + int result = 0; + ssize_t i; + + ENTRY; + + /* page level offset in the file where the I/O starts */ + pg_offset = pvec->ldp_file_offset & ~PAGE_MASK; + /* this adds 1 for the first page and removes the bytes in it from the + * io_size, making the rest of the calculation aligned + */ + if (pg_offset) { + pvec->ldp_count++; + io_size -= min_t(size_t, PAGE_SIZE - pg_offset, io_size); + } + + /* calculate pages for the rest of the buffer */ + pvec->ldp_count += (io_size + PAGE_SIZE - 1) >> PAGE_SHIFT; + +#ifdef HAVE_DIO_ITER + pvec->ldp_pages = kvzalloc(pvec->ldp_count * sizeof(struct page *), + GFP_NOFS); +#else + OBD_ALLOC_PTR_ARRAY_LARGE(pvec->ldp_pages, pvec->ldp_count); +#endif + if (pvec->ldp_pages == NULL) + RETURN(-ENOMEM); + + for (i = 0; i < pvec->ldp_count; i++) { + new_page = alloc_page(GFP_NOFS); + if (!new_page) { + result = -ENOMEM; + pvec->ldp_count = i; + goto out; + } + pvec->ldp_pages[i] = new_page; + } + WARN_ON(i != pvec->ldp_count); + +out: + if (result) { + if (pvec->ldp_pages) + ll_free_dio_buffer(pvec); + } + + if (result == 0) + result = pvec->ldp_count; + + RETURN(result); +} +EXPORT_SYMBOL(ll_allocate_dio_buffer); + +void ll_free_dio_buffer(struct ll_dio_pages *pvec) +{ + int i; + + for (i = 0; i < pvec->ldp_count; i++) + __free_page(pvec->ldp_pages[i]); + +#ifdef HAVE_DIO_ITER + kfree(pvec->ldp_pages); +#else + OBD_FREE_PTR_ARRAY_LARGE(pvec->ldp_pages, pvec->ldp_count); +#endif +} +EXPORT_SYMBOL(ll_free_dio_buffer); + +/* * ll_release_user_pages - tear down page struct array * @pages: array of page struct pointers underlying target buffer */ @@ -1338,6 +1451,168 @@ void ll_release_user_pages(struct page **pages, int npages) } EXPORT_SYMBOL(ll_release_user_pages); +#ifdef HAVE_FAULT_IN_IOV_ITER_READABLE +#define ll_iov_iter_fault_in_readable(iov, bytes) \ + fault_in_iov_iter_readable(iov, bytes) +#else +#define ll_iov_iter_fault_in_readable(iov, bytes) \ + iov_iter_fault_in_readable(iov, bytes) +#endif + +#ifndef HAVE_KTHREAD_USE_MM +#define kthread_use_mm(mm) use_mm(mm) +#define kthread_unuse_mm(mm) unuse_mm(mm) +#endif + +/* copy IO data to/from internal buffer and userspace iovec */ +ssize_t ll_dio_user_copy(struct cl_sub_dio *sdio, struct iov_iter *write_iov) +{ + struct iov_iter *iter = write_iov ? write_iov : &sdio->csd_iter; + struct ll_dio_pages *pvec = &sdio->csd_dio_pages; + struct mm_struct *mm = sdio->csd_ll_aio->cda_mm; + loff_t pos = pvec->ldp_file_offset; + size_t count = sdio->csd_bytes; + size_t original_count = count; + int short_copies = 0; + bool mm_used = false; + int status = 0; + int i = 0; + int rw; + + ENTRY; + + LASSERT(sdio->csd_unaligned); + + if (sdio->csd_write) + rw = WRITE; + else + rw = READ; + + /* if there's no mm, io is being done from a kernel thread, so there's + * no need to transition to its mm context anyway. + * + * Also, if mm == current->mm, that means this is being handled in the + * thread which created it, and not in a separate kthread - so it is + * unnecessary (and incorrect) to do a use_mm here + */ + if (mm && mm != current->mm) { + kthread_use_mm(mm); + mm_used = true; + } + + /* fault in the entire userspace iovec */ + if (rw == WRITE) { + if (unlikely(ll_iov_iter_fault_in_readable(iter, count))) + GOTO(out, status = -EFAULT); + } + + /* modeled on kernel generic_file_buffered_read/write() + * + * note we only have one 'chunk' of i/o here, so we do not copy the + * whole iovec here (except when the chunk is the whole iovec) so we + * use the count of bytes in the chunk, csd_bytes, instead of looking + * at the iovec + */ + while (true) { + struct page *page = pvec->ldp_pages[i]; + unsigned long offset; /* offset into kernel buffer page */ + size_t copied; /* bytes successfully copied */ + size_t bytes; /* bytes to copy for this page */ + + LASSERT(i < pvec->ldp_count); + + offset = pos & ~PAGE_MASK; + bytes = min_t(unsigned long, PAGE_SIZE - offset, + count); + + CDEBUG(D_VFSTRACE, + "count %zd, offset %lu, pos %lld, ldp_count %lu\n", + count, offset, pos, pvec->ldp_count); + + if (fatal_signal_pending(current)) { + status = -EINTR; + break; + } + + /* write requires a few extra steps */ + if (rw == WRITE) { + /* like btrfs, we do not have a mapping since this isn't + * a page cache page, so we must do this flush + * unconditionally + * + * NB: This is a noop on x86 but active on other + * architectures + */ + flush_dcache_page(page); + +#ifndef HAVE_COPY_PAGE_FROM_ITER_ATOMIC + copied = iov_iter_copy_from_user_atomic(page, iter, + offset, bytes); + iov_iter_advance(iter, copied); +#else + copied = copy_page_from_iter_atomic(page, offset, bytes, + iter); +#endif + + } else /* READ */ { + copied = copy_page_to_iter(page, offset, bytes, iter); + } + + pos += copied; + count -= copied; + + if (unlikely(copied < bytes)) { + short_copies++; + + CDEBUG(D_VFSTRACE, + "short copy - copied only %zd of %lu, short %d times\n", + copied, bytes, short_copies); + /* copies will very rarely be interrupted, but we + * should retry in those cases, since the other option + * is giving an IO error and this can occur in normal + * operation such as with racing unaligned AIOs + * + * but of course we should not retry indefinitely + */ + if (short_copies > 2) { + CERROR("Unaligned DIO copy repeatedly short, count %zd, offset %lu, bytes %lu, copied %zd, pos %lld\n", + count, offset, bytes, copied, pos); + + status = -EFAULT; + break; + } + + continue; + } + + if (count == 0) + break; + + i++; + } + +out: + /* if we complete successfully, we should reach all of the pages */ + LASSERTF(ergo(status == 0, i == pvec->ldp_count - 1), + "status: %d, i: %d, pvec->ldp_count %zu, count %zu\n", + status, i, pvec->ldp_count, count); + + if (write_iov && status == 0) { + /* The copy function we use modifies the count in the iovec, + * but that's actually the job of the caller, so we return the + * iovec to the original count + */ + iov_iter_reexpand(iter, original_count); + } + + if (mm_used) + kthread_unuse_mm(mm); + + /* the total bytes copied, or status */ + RETURN(original_count - count ? original_count - count : status); +} +EXPORT_SYMBOL(ll_dio_user_copy); + /** * Indicate that transfer of a single page completed. */ diff --git a/lustre/osd-zfs/osd_io.c b/lustre/osd-zfs/osd_io.c index 0c90712..0507212 100644 --- a/lustre/osd-zfs/osd_io.c +++ b/lustre/osd-zfs/osd_io.c @@ -424,7 +424,6 @@ static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt, static inline struct page *kmem_to_page(void *addr) { - LASSERT(!((unsigned long)addr & ~PAGE_MASK)); if (is_vmalloc_addr(addr)) return vmalloc_to_page(addr); else diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 74b0a06..8a20214 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -13562,6 +13562,8 @@ run_test 119c "Testing for direct read hitting hole" test_119e() { + (( $MDS1_VERSION >= $(version_code 2.15.58) )) || + skip "Need server version at least 2.15.58" (( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs" local stripe_size=$((1024 * 1024)) #1 MiB @@ -13576,14 +13578,27 @@ test_119e() dd if=/dev/urandom bs=$file_size count=1 of=$DIR/$tfile.1 || error "buffered i/o to create file failed" + # trivial test of unaligned DIO + dd if=$DIR/$tfile.1 bs=4095 of=$DIR/$tfile.2 count=4 \ + iflag=direct oflag=direct || + error "trivial unaligned dio failed" + + # Clean up before next part of test + rm -f $DIR/$tfile.2 + if zfs_or_rotational; then # DIO on ZFS can take up to 2 seconds per IO # rotational is better, but still slow. # Limit testing on those media to larger sizes - bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size" + bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \ + $((stripe_size + 1024))" else - bsizes="$PAGE_SIZE $((PAGE_SIZE * 4)) $stripe_size \ - $((stripe_size * 4))" + bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \ + $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \ + $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \ + $((stripe_size - 1)) $stripe_size \ + $((stripe_size + 1)) $((stripe_size * 3/2)) \ + $((stripe_size * 4)) $((stripe_size * 4 + 1))" fi for bs in $bsizes; do @@ -13624,10 +13639,15 @@ test_119f() # DIO on ZFS can take up to 2 seconds per IO # rotational is better, but still slow. # Limit testing on those media to larger sizes - bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size" + bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \ + $((stripe_size + 1024))" else - bsizes="$PAGE_SIZE $((PAGE_SIZE * 4)) $stripe_size \ - $((stripe_size * 4))" + bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \ + $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \ + $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \ + $((stripe_size - 1)) $stripe_size \ + $((stripe_size + 1)) $((stripe_size * 3/2)) \ + $((stripe_size * 4)) $((stripe_size * 4 + 1))" fi for bs in $bsizes; do @@ -13684,10 +13704,15 @@ test_119g() # DIO on ZFS can take up to 2 seconds per IO # rotational is better, but still slow. # Limit testing on those media to larger sizes - bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size" + bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \ + $((stripe_size + 1024))" else - bsizes="$PAGE_SIZE $((PAGE_SIZE * 4)) $stripe_size \ - $((stripe_size * 4))" + bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \ + $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \ + $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \ + $((stripe_size - 1)) $stripe_size \ + $((stripe_size + 1)) $((stripe_size * 3/2)) \ + $((stripe_size * 4)) $((stripe_size * 4 + 1))" fi for bs in $bsizes; do @@ -13719,6 +13744,123 @@ test_119g() } run_test 119g "dio vs buffered I/O race" +test_119h() +{ + (( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs" + + local stripe_size=$((1024 * 1024)) #1 MiB + # Max i/o below is ~ 4 * stripe_size, so this gives ~5 i/os + local file_size=$((25 * stripe_size)) + local bsizes + + stack_trap "rm -f $DIR/$tfile.*" + + if zfs_or_rotational; then + # DIO on ZFS can take up to 2 seconds per IO + # rotational is better, but still slow. + # Limit testing on those media to larger sizes + bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \ + $((stripe_size + 1024))" + else + bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \ + $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \ + $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \ + $((stripe_size - 1)) $stripe_size \ + $((stripe_size + 1)) $((stripe_size * 3/2)) \ + $((stripe_size * 4)) $((stripe_size * 4 + 1))" + fi + + for bs in $bsizes; do + $LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.1 + $LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.2 + echo "unaligned writes of blocksize: $bs" + # Write a file with unaligned DIO and regular DIO, and compare + # them + # with 'u', multiop randomly unaligns the io from the buffer + $MULTIOP $DIR/$tfile.1 \ + oO_CREAT:O_RDWR:O_DIRECT:wu${bs}wu${bs}wu${bs}wu${bs}wu${bs} || + error "multiop memory unaligned write failed, $bs" + $MULTIOP $DIR/$tfile.2 \ + oO_CREAT:O_RDWR:O_DIRECT:w${bs}w${bs}w${bs}w${bs}w${bs} || + error "multiop memory aligned write failed, $bs" + + cmp --verbose $DIR/$tfile.1 $DIR/$tfile.2 || + error "files differ, bsize $bs" + rm -f $DIR/$tfile.* + done + + $LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.1 + dd if=/dev/zero bs=$((stripe_size * 5)) of=$DIR/$tfile.1 count=5 || + error "dd to create source file for read failed" + + # Just a few quick tests to make sure unaligned DIO reads don't crash + for bs in $bsizes; do + + echo "unaligned reads of blocksize: $bs" + # with 'u', multiop randomly unaligns the io from the buffer + $MULTIOP $DIR/$tfile.1 \ + oO_CREAT:O_RDWR:O_DIRECT:ru${bs}ru${bs}ru${bs}ru${bs}ru${bs} || + error "multiop memory unaligned read failed, $bs" + + done + rm -f $DIR/$tfile* +} +run_test 119h "basic tests of memory unaligned dio" + +# aiocp with the '-a' option makes testing memory unaligned aio trivial +test_119i() +{ + (( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs" + which aiocp || skip_env "no aiocp installed" + + local stripe_size=$((1024 * 1024)) #1 MiB + # Max i/o below is ~ 4 * stripe_size, so this gives ~5 i/os + local file_size=$((25 * stripe_size)) + local bsizes + + $LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.1 + stack_trap "rm -f $DIR/$tfile.*" + + # Just a bit bigger than the largest size in the test set below + dd if=/dev/urandom bs=$file_size count=1 of=$DIR/$tfile.1 || + error "buffered i/o to create file failed" + + if zfs_or_rotational; then + # DIO on ZFS can take up to 2 seconds per IO + # rotational is better, but still slow. + # Limit testing on those media to larger sizes + bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \ + $((stripe_size + 1024))" + else + bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \ + $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \ + $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \ + $((stripe_size - 1)) $stripe_size \ + $((stripe_size + 1)) $((stripe_size * 3/2)) \ + $((stripe_size * 4)) $((stripe_size * 4 + 1))" + fi + + # Do page aligned and NOT page aligned AIO + for align in 8 512 $((PAGE_SIZE)); do + # Deliberately includes a few aligned sizes + for bs in $bsizes; do + $LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.2 + + echo "bs: $bs, align: $align, file_size $file_size" + aiocp -a $align -b $bs -s $file_size -f O_DIRECT \ + $DIR/$tfile.1 $DIR/$tfile.2 || + error "unaligned aio failed, bs: $bs, align: $align" + + $CHECKSTAT -t file -s $file_size $DIR/$tfile.2 || + error "size incorrect" + cmp --verbose $DIR/$tfile.1 $DIR/$tfile.2 || + error "files differ" + rm -f $DIR/$tfile.2 + done + done +} +run_test 119i "test unaligned aio at varying sizes" + test_120a() { [ $PARALLEL == "yes" ] && skip "skip parallel run" remote_mds_nodsh && skip "remote MDS with nodsh" @@ -26552,9 +26694,10 @@ test_398d() { # LU-13846 diff $DIR/$tfile $aio_file || error "file diff after aiocp" - # make sure we don't crash and fail properly - aiocp -a 512 -b 64M -s 64M -f O_DIRECT $DIR/$tfile $aio_file && - error "aio not aligned with PAGE SIZE should fail" + # test memory unaligned aio + aiocp -a 512 -b 64M -s 64M -f O_DIRECT $DIR/$tfile $aio_file || + error "unaligned aio failed" + diff $DIR/$tfile $aio_file || error "file diff after aiocp" rm -f $DIR/$tfile $aio_file } diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh index f7230e4..bfc9dd6 100755 --- a/lustre/tests/sanityn.sh +++ b/lustre/tests/sanityn.sh @@ -728,10 +728,15 @@ test_16j() # DIO on ZFS can take up to 2 seconds per IO # rotational is better, but still slow. # Limit testing on those media to larger sizes - bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size" + bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \ + $((stripe_size + 1024))" else - bsizes="$PAGE_SIZE $((PAGE_SIZE * 4)) $stripe_size \ - $((stripe_size * 4))" + bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \ + $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \ + $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \ + $((stripe_size - 1)) $stripe_size \ + $((stripe_size + 1)) $((stripe_size * 3/2)) \ + $((stripe_size * 4)) $((stripe_size * 4 + 1))" fi # 1 process (BIO or DIO) on each client -- 1.8.3.1