Whamcloud - gitweb
LU-13805 clio: bounce buffer for unaligned DIO 16/45616/131
authorPatrick Farrell <pfarrell@whamcloud.com>
Thu, 27 Jul 2023 18:36:30 +0000 (14:36 -0400)
committerOleg Drokin <green@whamcloud.com>
Thu, 28 Sep 2023 07:58:59 +0000 (07:58 +0000)
Direct I/O must normally be page aligned both in terms of
I/O size and memory alignment.  This is because the I/O
must be page aligned before being written to disk.  This
is also true for buffered I/O, but the I/O is aligned
using the page cache.

In recent versions of Lustre, direct I/O is significantly
faster than buffered I/O, due to lower overhead for page
management.  Thus it is desirable to be able to do more
I/O as direct I/O.

This patch allows unaligned direct I/O by creating a buffer
inside the kernel and aligning the I/O by copying in to
this aligned buffer.  Because the main cost of buffered I/O
is locking in the page cache rather than memcopy(), this is
still significantly faster than buffered I/O (though slower
than normal direct I/O).

This will eventually allow us to convert buffered I/O to
direct I/O when doing so would increase performance.

Here are some comparative benchmarks using IOR, all single
process.

UDIO is unaligned DIO.

io size   1M           4M          16M          64M
----------------------------------------------------------
BIO Write  | 1502 MiB/s | 1382 MiB/s | 1683 MiB/s | 1677 MiB/s
BIO Read   | 2169 MiB/s | 1902 MiB/s | 2131 MiB.s | 1955 MiB/s
DIO Write  | 1010 MiB/s | 2778 MiB/s | 5905 MiB/s | 7917 MiB/s
DIO Read   | 893 MiB/s  | 2657 MiB/s | 4724 MiB/s | 7579 MiB/s
UDIO Write | 848 MiB/s  | 1666 MiB/s | 2117 MiB/s | 2243 MiB/s
UDIO Read  | 933 MIB/s  | 2412 MiB/s | 3690 MiB/s | 5370 MiB/s

Unaligned DIO offers benefits vs buffered write and
buffered read, but is of course slower than DIO.

Notice on this node the best case DIO performance is
~8 GiB/s.  On a node with 12 GiB/s best case DIO, best case
UDIO read is 8 GiB/s and best case UDIO write is 2.5 GiB/s.

This is because UDIO read is fully parallel, UDIO write is
not.

Signed-off-by: Patrick Farrell <pfarrell@whamcloud.com>
Change-Id: I7eeebf9a608f006c8095b95f0677adb99f19d640
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/45616
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Qian Yingjin <qian@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/autoconf/lustre-core.m4
lustre/include/cl_object.h
lustre/llite/rw26.c
lustre/obdclass/cl_io.c
lustre/osd-zfs/osd_io.c
lustre/tests/sanity.sh
lustre/tests/sanityn.sh

index d72b428..55dcc3a 100644 (file)
@@ -3182,6 +3182,26 @@ fileattr_set, [
 EXTRA_KCFLAGS="$tmp_flags"
 ]) # LC_HAVE_FILEATTR_GET
 
+# LC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC
+#
+# Kernel 5.13 commit f0b65f39ac505e8f1dcdaa165aa7b8c0bd6fd454
+# iov_iter: replace iov_iter_copy_from_user_atomic() with iterator-advancing variant
+#
+AC_DEFUN([LC_SRC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC], [
+       LB2_LINUX_TEST_SRC([copy_page_from_iter_atomic], [
+               #include <linux/uio.h>
+       ],[
+               copy_page_from_iter_atomic(NULL, 0, 0, NULL);
+       ])
+])
+AC_DEFUN([LC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC], [
+       AC_MSG_CHECKING([if have copy_page_from_iter_atomic])
+       LB2_LINUX_TEST_RESULT([copy_page_from_iter_atomic], [
+               AC_DEFINE(HAVE_COPY_PAGE_FROM_ITER_ATOMIC, 1,
+                       ['copy_page_from_iter_atomic' exists])
+       ])
+]) # LC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC
+
 #
 # LC_HAVE_GET_ACL_RCU_ARG
 #
@@ -3205,6 +3225,26 @@ AC_DEFUN([LC_HAVE_GET_ACL_RCU_ARG], [
        ])
 ]) # LC_HAVE_GET_ACL_RCU_ARG
 
+# LC_HAVE_FAULT_IN_IOV_ITER_READABLE
+#
+# Kernel 5.15 commit a6294593e8a1290091d0b078d5d33da5e0cd3dfe
+# iov_iter: Turn iov_iter_fault_in_readable into fault_in_iov_iter_readable
+#
+AC_DEFUN([LC_SRC_HAVE_FAULT_IN_IOV_ITER_READABLE], [
+       LB2_LINUX_TEST_SRC([fault_in_iov_iter_readable], [
+               #include <linux/uio.h>
+       ],[
+               fault_in_iov_iter_readable(NULL, 0);
+       ])
+])
+AC_DEFUN([LC_HAVE_FAULT_IN_IOV_ITER_READABLE], [
+       AC_MSG_CHECKING([if have fault_in_iov_iter_readable])
+       LB2_LINUX_TEST_RESULT([fault_in_iov_iter_readable], [
+               AC_DEFINE(HAVE_FAULT_IN_IOV_ITER_READABLE, 1,
+                       ['fault_in_iov_iter_readable' exists])
+       ])
+]) # LC_HAVE_FAULT_IN_IOV_ITER_READABLE
+
 #
 # LC_HAVE_INVALIDATE_LOCK
 #
@@ -4129,8 +4169,12 @@ AC_DEFUN([LC_PROG_LINUX_SRC], [
        # 5.12
        LC_SRC_HAVE_USER_NAMESPACE_ARG
 
+       # 5.13
+       LC_SRC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC
+
        # 5.15
        LC_SRC_HAVE_GET_ACL_RCU_ARG
+       LC_SRC_HAVE_FAULT_IN_IOV_ITER_READABLE
 
        # 5.16
        LC_SRC_HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG
@@ -4398,10 +4442,15 @@ AC_DEFUN([LC_PROG_LINUX_RESULTS], [
 
        # 5.13
        LC_HAVE_FILEATTR_GET
+       LC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC
+
+        # 5.15
+        LC_HAVE_GET_ACL_RCU_ARG
 
        # 5.15
        LC_HAVE_GET_ACL_RCU_ARG
        LC_HAVE_INVALIDATE_LOCK
+       LC_HAVE_FAULT_IN_IOV_ITER_READABLE
 
        # 5.16
        LC_HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG
index 4946a15..6689a30 100644 (file)
@@ -1942,7 +1942,12 @@ struct cl_io {
         * to userspace, only the RPCs are submitted async, then waited for at
         * the llite layer before returning.
         */
-                            ci_parallel_dio:1;
+                            ci_parallel_dio:1,
+       /**
+        * this DIO is at least partly unaligned, and so the unaligned DIO
+        * path is being used for this entire IO
+        */
+                            ci_unaligned_dio:1;
        /**
         * Bypass quota check
         */
@@ -2521,7 +2526,7 @@ struct cl_dio_aio *cl_dio_aio_alloc(struct kiocb *iocb, struct cl_object *obj,
                                    bool is_aio);
 struct cl_sub_dio *cl_sub_dio_alloc(struct cl_dio_aio *ll_aio,
                                    struct iov_iter *iter, bool write,
-                                   bool sync);
+                                   bool unaligned, bool sync);
 void cl_dio_aio_free(const struct lu_env *env, struct cl_dio_aio *aio);
 void cl_sub_dio_free(struct cl_sub_dio *sdio);
 static inline void cl_sync_io_init(struct cl_sync_io *anchor, int nr)
@@ -2553,14 +2558,14 @@ struct cl_sync_io {
 /** direct IO pages */
 struct ll_dio_pages {
        /*
-        * page array to be written. we don't support
-        * partial pages except the last one.
+        * page array for RDMA - for aligned i/o, this is the user provided
+        * pages, but for unaligned i/o, this is the internal buffer
         */
-       struct page             **ldp_pages;
+       struct page             **ldp_pages;
        /** # of pages in the array. */
-       size_t                  ldp_count;
+       size_t                  ldp_count;
        /* the file offset of the first page. */
-       loff_t                  ldp_file_offset;
+       loff_t                  ldp_file_offset;
 };
 
 /* Top level struct used for AIO and DIO */
@@ -2585,7 +2590,8 @@ struct cl_sub_dio {
        struct ll_dio_pages     csd_dio_pages;
        struct iov_iter         csd_iter;
        unsigned                csd_creator_free:1,
-                               csd_write:1;
+                               csd_write:1,
+                               csd_unaligned:1;
 };
 #if defined(HAVE_DIRECTIO_ITER) || defined(HAVE_IOV_ITER_RW) || \
        defined(HAVE_DIRECTIO_2ARGS)
@@ -2593,6 +2599,9 @@ struct cl_sub_dio {
 #endif
 
 void ll_release_user_pages(struct page **pages, int npages);
+int ll_allocate_dio_buffer(struct ll_dio_pages *pvec, size_t io_size);
+void ll_free_dio_buffer(struct ll_dio_pages *pvec);
+ssize_t ll_dio_user_copy(struct cl_sub_dio *sdio, struct iov_iter *write_iov);
 
 #ifndef HAVE_KTHREAD_USE_MM
 #define kthread_use_mm(mm) use_mm(mm)
index ed009f8..8f9511d 100644 (file)
@@ -492,26 +492,39 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw)
        ssize_t tot_bytes = 0, result = 0;
        loff_t file_offset = iocb->ki_pos;
        bool sync_submit = false;
+       bool unaligned = false;
        struct vvp_io *vio;
        ssize_t rc2;
 
+       ENTRY;
+
        /* Check EOF by ourselves */
        if (rw == READ && file_offset >= i_size_read(inode))
-               return 0;
+               RETURN(0);
 
-       /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */
        if (file_offset & ~PAGE_MASK)
-               RETURN(-EINVAL);
+               unaligned = true;
 
-       CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), size=%zd (max %lu), "
-              "offset=%lld=%llx, pages %zd (max %lu)\n",
-              PFID(ll_inode2fid(inode)), inode, count, MAX_DIO_SIZE,
-              file_offset, file_offset, count >> PAGE_SHIFT,
-              MAX_DIO_SIZE >> PAGE_SHIFT);
+       if (count & ~PAGE_MASK)
+               unaligned = true;
 
        /* Check that all user buffers are aligned as well */
        if (ll_iov_iter_alignment(iter) & ~PAGE_MASK)
-               RETURN(-EINVAL);
+               unaligned = true;
+
+       CDEBUG(D_VFSTRACE,
+              "VFS Op:inode="DFID"(%p), size=%zd (max %lu), offset=%lld=%llx, pages %zd (max %lu)%s\n",
+              PFID(ll_inode2fid(inode)), inode, count, MAX_DIO_SIZE,
+              file_offset, file_offset,
+              (count >> PAGE_SHIFT) + !!(count & ~PAGE_MASK),
+              MAX_DIO_SIZE >> PAGE_SHIFT, unaligned ? ", unaligned" : "");
+
+       /* the requirement to not return EIOCBQUEUED for pipes (see bottom of
+        * this function) plays havoc with the unaligned I/O lifecycle, so
+        * don't allow unaligned I/O on pipes
+        */
+       if (unaligned && iov_iter_is_pipe(iter))
+               RETURN(0);
 
        lcc = ll_cl_find(inode);
        if (lcc == NULL)
@@ -523,6 +536,15 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw)
        io = lcc->lcc_io;
        LASSERT(io != NULL);
 
+       /* if one part of an I/O is unaligned, just handle all of it that way -
+        * otherwise we create significant complexities with managing the iovec
+        * in different ways, etc, all for very marginal benefits
+        */
+       if (unaligned)
+               io->ci_unaligned_dio = true;
+       if (io->ci_unaligned_dio)
+               unaligned = true;
+
        ll_dio_aio = io->ci_dio_aio;
        LASSERT(ll_dio_aio);
        LASSERT(ll_dio_aio->cda_iocb == iocb);
@@ -557,14 +579,28 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw)
                 * (either in this function or from a ptlrpcd daemon)
                 */
                sdio = cl_sub_dio_alloc(ll_dio_aio, iter, rw == WRITE,
-                                       sync_submit);
+                                       unaligned, sync_submit);
                if (!sdio)
                        GOTO(out, result = -ENOMEM);
 
                pvec = &sdio->csd_dio_pages;
                pvec->ldp_file_offset = file_offset;
 
-               result = ll_get_user_pages(rw, iter, pvec, count);
+               if (!unaligned) {
+                       result = ll_get_user_pages(rw, iter, pvec, count);
+                       /* ll_get_user_pages returns bytes in the IO or error*/
+                       count = result;
+               } else {
+                       /* same calculation used in ll_get_user_pages */
+                       count = min_t(size_t, count, iter->iov->iov_len);
+                       result = ll_allocate_dio_buffer(pvec, count);
+                       /* allocate_dio_buffer returns number of pages or
+                        * error, so do not set count = result
+                        */
+               }
+
+               /* now we have the actual count, so store it in the sdio */
+               sdio->csd_bytes = count;
 
                if (unlikely(result <= 0)) {
                        cl_sync_io_note(env, &sdio->csd_sync, result);
@@ -574,10 +610,33 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw)
                        }
                        GOTO(out, result);
                }
-               count = result;
 
-               result = ll_direct_rw_pages(env, io, count,
-                                           rw, inode, sdio);
+               if (unaligned && rw == WRITE) {
+                       result = ll_dio_user_copy(sdio, iter);
+                       if (unlikely(result <= 0)) {
+                               cl_sync_io_note(env, &sdio->csd_sync, result);
+                               if (sync_submit) {
+                                       cl_sub_dio_free(sdio);
+                                       LASSERT(sdio->csd_creator_free);
+                               }
+                               GOTO(out, result);
+                       }
+               }
+
+               result = ll_direct_rw_pages(env, io, count, rw, inode, sdio);
+               /* if the i/o was unsuccessful, we zero the number of bytes to
+                * copy back.  Note that partial I/O completion isn't possible
+                * here - I/O either completes or fails.  So there's no need to
+                * handle short I/O here by changing 'count' with the result
+                * from ll_direct_rw_pages.
+                *
+                * This must be done before we release the reference
+                * immediately below, because releasing the reference allows
+                * i/o completion (and copyback to userspace, if unaligned) to
+                * start.
+                */
+               if (result != 0)
+                       sdio->csd_bytes = 0;
                /* We've submitted pages and can now remove the extra
                 * reference for that
                 */
@@ -595,12 +654,15 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw)
                        GOTO(out, result);
 
                iov_iter_advance(iter, count);
+
                tot_bytes += count;
                file_offset += count;
+               CDEBUG(D_VFSTRACE,
+                      "result %zd tot_bytes %zd count %zd file_offset %lld\n",
+                      result, tot_bytes, count, file_offset);
        }
 
 out:
-
        if (rw == WRITE)
                vio->u.readwrite.vui_written += tot_bytes;
        else
@@ -612,10 +674,10 @@ out:
        if (result == 0 && !iov_iter_is_pipe(iter))
                result = -EIOCBQUEUED;
 
-       return result;
+       RETURN(result);
 }
 
-#if defined(HAVE_DIO_ITER)
+#ifdef HAVE_DIO_ITER
 static ssize_t ll_direct_IO(
 #ifndef HAVE_IOV_ITER_RW
             int rw,
index 2984c43..0cf0943 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/sched.h>
 #include <linux/list.h>
 #include <linux/list_sort.h>
+#include <linux/mmu_context.h>
 #include <obd_class.h>
 #include <obd_support.h>
 #include <lustre_fid.h>
@@ -1206,8 +1207,34 @@ static void cl_sub_dio_end(const struct lu_env *env, struct cl_sync_io *anchor)
                cl_page_list_del(env, &sdio->csd_pages, page);
        }
 
-       ll_release_user_pages(sdio->csd_dio_pages.ldp_pages,
-                             sdio->csd_dio_pages.ldp_count);
+       if (sdio->csd_unaligned) {
+               /* save the iovec pointer before it's modified by
+                * ll_dio_user_copy
+                */
+               struct iovec *tmp = (struct iovec *) sdio->csd_iter.iov;
+
+               CDEBUG(D_VFSTRACE,
+                      "finishing unaligned dio %s aio->cda_bytes %ld\n",
+                      sdio->csd_write ? "write" : "read", sdio->csd_bytes);
+               /* read copies *from* the kernel buffer *to* userspace
+                * here at the end, write copies *to* the kernel
+                * buffer from userspace at the start
+                */
+               if (!sdio->csd_write && sdio->csd_bytes > 0)
+                       ret = ll_dio_user_copy(sdio, NULL);
+               ll_free_dio_buffer(&sdio->csd_dio_pages);
+               /* handle the freeing here rather than in cl_sub_dio_free
+                * because we have the unmodified iovec pointer
+                */
+               OBD_FREE_PTR(tmp);
+               sdio->csd_iter.iov = NULL;
+       } else {
+               /* unaligned DIO does not get user pages, so it doesn't have to
+                * release them, but aligned I/O must
+                */
+               ll_release_user_pages(sdio->csd_dio_pages.ldp_pages,
+                                     sdio->csd_dio_pages.ldp_count);
+       }
        cl_sync_io_note(env, &sdio->csd_ll_aio->cda_sync, ret);
 
        EXIT;
@@ -1246,7 +1273,7 @@ EXPORT_SYMBOL(cl_dio_aio_alloc);
 
 struct cl_sub_dio *cl_sub_dio_alloc(struct cl_dio_aio *ll_aio,
                                    struct iov_iter *iter, bool write,
-                                   bool sync)
+                                   bool unaligned, bool sync)
 {
        struct cl_sub_dio *sdio;
 
@@ -1261,27 +1288,33 @@ struct cl_sub_dio *cl_sub_dio_alloc(struct cl_dio_aio *ll_aio,
                cl_page_list_init(&sdio->csd_pages);
 
                sdio->csd_ll_aio = ll_aio;
-               atomic_add(1,  &ll_aio->cda_sync.csi_sync_nr);
                sdio->csd_creator_free = sync;
                sdio->csd_write = write;
+               sdio->csd_unaligned = unaligned;
 
-               /* we need to make a copy of the user iovec at this point in
-                * time so we:
-                * A) have the correct state of the iovec for this chunk of I/O
-                * B) have a chunk-local copy; some of the things we want to
-                * do to the iovec modify it, so to process each chunk from a
-                * separate thread requires a local copy of the iovec
-                */
-               memcpy(&sdio->csd_iter, iter,
-                      sizeof(struct iov_iter));
-               OBD_ALLOC_PTR(sdio->csd_iter.iov);
-               if (sdio->csd_iter.iov == NULL) {
-                       cl_sub_dio_free(sdio);
-                       sdio = NULL;
-                       goto out;
+               atomic_add(1,  &ll_aio->cda_sync.csi_sync_nr);
+
+               if (unaligned) {
+                       /* we need to make a copy of the user iovec at this
+                        * point in time, in order to:
+                        *
+                        * A) have the correct state of the iovec for this
+                        * chunk of I/O, ie, the main iovec is altered as we do
+                        * I/O and this chunk needs the current state
+                        * B) have a chunk-local copy; doing the IO later
+                        * modifies the iovec, so to process each chunk from a
+                        * separate thread requires a local copy of the iovec
+                        */
+                       memcpy(&sdio->csd_iter, iter, sizeof(struct iov_iter));
+                       OBD_ALLOC_PTR(sdio->csd_iter.iov);
+                       if (sdio->csd_iter.iov == NULL) {
+                               cl_sub_dio_free(sdio);
+                               sdio = NULL;
+                               goto out;
+                       }
+                       memcpy((void *) sdio->csd_iter.iov, iter->iov,
+                              sizeof(struct iovec));
                }
-               memcpy((void *) sdio->csd_iter.iov, iter->iov,
-                      sizeof(struct iovec));
        }
 out:
        return sdio;
@@ -1302,16 +1335,96 @@ EXPORT_SYMBOL(cl_dio_aio_free);
 void cl_sub_dio_free(struct cl_sub_dio *sdio)
 {
        if (sdio) {
-               void *tmp = (void *) sdio->csd_iter.iov;
+               void *tmp = (void *)sdio->csd_iter.iov;
 
-               if (tmp)
-                       OBD_FREE(tmp, sizeof(struct iovec));
+               if (tmp) {
+                       LASSERT(sdio->csd_unaligned);
+                       OBD_FREE_PTR(tmp);
+               }
                OBD_SLAB_FREE_PTR(sdio, cl_sub_dio_kmem);
        }
 }
 EXPORT_SYMBOL(cl_sub_dio_free);
 
 /*
+ * For unaligned DIO.
+ *
+ * Allocate the internal buffer from/to which we will perform DIO.  This takes
+ * the user I/O parameters and allocates an internal buffer large enough to
+ * hold it.  The pages in this buffer are aligned with pages in the file (ie,
+ * they have a 1-to-1 mapping with file pages).
+ */
+int ll_allocate_dio_buffer(struct ll_dio_pages *pvec, size_t io_size)
+{
+       struct page *new_page;
+       size_t pg_offset;
+       int result = 0;
+       ssize_t i;
+
+       ENTRY;
+
+       /* page level offset in the file where the I/O starts */
+       pg_offset = pvec->ldp_file_offset & ~PAGE_MASK;
+       /* this adds 1 for the first page and removes the bytes in it from the
+        * io_size, making the rest of the calculation aligned
+        */
+       if (pg_offset) {
+               pvec->ldp_count++;
+               io_size -= min_t(size_t, PAGE_SIZE - pg_offset, io_size);
+       }
+
+       /* calculate pages for the rest of the buffer */
+       pvec->ldp_count += (io_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+#ifdef HAVE_DIO_ITER
+       pvec->ldp_pages = kvzalloc(pvec->ldp_count * sizeof(struct page *),
+                                   GFP_NOFS);
+#else
+       OBD_ALLOC_PTR_ARRAY_LARGE(pvec->ldp_pages, pvec->ldp_count);
+#endif
+       if (pvec->ldp_pages == NULL)
+               RETURN(-ENOMEM);
+
+       for (i = 0; i < pvec->ldp_count; i++) {
+               new_page = alloc_page(GFP_NOFS);
+               if (!new_page) {
+                       result = -ENOMEM;
+                       pvec->ldp_count = i;
+                       goto out;
+               }
+               pvec->ldp_pages[i] = new_page;
+       }
+       WARN_ON(i != pvec->ldp_count);
+
+out:
+       if (result) {
+               if (pvec->ldp_pages)
+                       ll_free_dio_buffer(pvec);
+       }
+
+       if (result == 0)
+               result = pvec->ldp_count;
+
+       RETURN(result);
+}
+EXPORT_SYMBOL(ll_allocate_dio_buffer);
+
+void ll_free_dio_buffer(struct ll_dio_pages *pvec)
+{
+       int i;
+
+       for (i = 0; i < pvec->ldp_count; i++)
+               __free_page(pvec->ldp_pages[i]);
+
+#ifdef HAVE_DIO_ITER
+       kfree(pvec->ldp_pages);
+#else
+       OBD_FREE_PTR_ARRAY_LARGE(pvec->ldp_pages, pvec->ldp_count);
+#endif
+}
+EXPORT_SYMBOL(ll_free_dio_buffer);
+
+/*
  * ll_release_user_pages - tear down page struct array
  * @pages: array of page struct pointers underlying target buffer
  */
@@ -1338,6 +1451,168 @@ void ll_release_user_pages(struct page **pages, int npages)
 }
 EXPORT_SYMBOL(ll_release_user_pages);
 
+#ifdef HAVE_FAULT_IN_IOV_ITER_READABLE
+#define ll_iov_iter_fault_in_readable(iov, bytes) \
+       fault_in_iov_iter_readable(iov, bytes)
+#else
+#define ll_iov_iter_fault_in_readable(iov, bytes) \
+       iov_iter_fault_in_readable(iov, bytes)
+#endif
+
+#ifndef HAVE_KTHREAD_USE_MM
+#define kthread_use_mm(mm) use_mm(mm)
+#define kthread_unuse_mm(mm) unuse_mm(mm)
+#endif
+
+/* copy IO data to/from internal buffer and userspace iovec */
+ssize_t ll_dio_user_copy(struct cl_sub_dio *sdio, struct iov_iter *write_iov)
+{
+       struct iov_iter *iter = write_iov ? write_iov : &sdio->csd_iter;
+       struct ll_dio_pages *pvec = &sdio->csd_dio_pages;
+       struct mm_struct *mm = sdio->csd_ll_aio->cda_mm;
+       loff_t pos = pvec->ldp_file_offset;
+       size_t count = sdio->csd_bytes;
+       size_t original_count = count;
+       int short_copies = 0;
+       bool mm_used = false;
+       int status = 0;
+       int i = 0;
+       int rw;
+
+       ENTRY;
+
+       LASSERT(sdio->csd_unaligned);
+
+       if (sdio->csd_write)
+               rw = WRITE;
+       else
+               rw = READ;
+
+       /* if there's no mm, io is being done from a kernel thread, so there's
+        * no need to transition to its mm context anyway.
+        *
+        * Also, if mm == current->mm, that means this is being handled in the
+        * thread which created it, and not in a separate kthread - so it is
+        * unnecessary (and incorrect) to do a use_mm here
+        */
+       if (mm && mm != current->mm) {
+               kthread_use_mm(mm);
+               mm_used = true;
+       }
+
+       /* fault in the entire userspace iovec */
+       if (rw == WRITE) {
+               if (unlikely(ll_iov_iter_fault_in_readable(iter, count)))
+                       GOTO(out, status = -EFAULT);
+       }
+
+       /* modeled on kernel generic_file_buffered_read/write()
+        *
+        * note we only have one 'chunk' of i/o here, so we do not copy the
+        * whole iovec here (except when the chunk is the whole iovec) so we
+        * use the count of bytes in the chunk, csd_bytes, instead of looking
+        * at the iovec
+        */
+       while (true) {
+               struct page *page = pvec->ldp_pages[i];
+               unsigned long offset; /* offset into kernel buffer page */
+               size_t copied; /* bytes successfully copied */
+               size_t bytes; /* bytes to copy for this page */
+
+               LASSERT(i < pvec->ldp_count);
+
+               offset = pos & ~PAGE_MASK;
+               bytes = min_t(unsigned long, PAGE_SIZE - offset,
+                             count);
+
+               CDEBUG(D_VFSTRACE,
+                      "count %zd, offset %lu, pos %lld, ldp_count %lu\n",
+                      count, offset, pos, pvec->ldp_count);
+
+               if (fatal_signal_pending(current)) {
+                       status = -EINTR;
+                       break;
+               }
+
+               /* write requires a few extra steps */
+               if (rw == WRITE) {
+                       /* like btrfs, we do not have a mapping since this isn't
+                        * a page cache page, so we must do this flush
+                        * unconditionally
+                        *
+                        * NB: This is a noop on x86 but active on other
+                        * architectures
+                        */
+                       flush_dcache_page(page);
+
+#ifndef HAVE_COPY_PAGE_FROM_ITER_ATOMIC
+                       copied = iov_iter_copy_from_user_atomic(page, iter,
+                                                               offset, bytes);
+                       iov_iter_advance(iter, copied);
+#else
+                       copied = copy_page_from_iter_atomic(page, offset, bytes,
+                                                           iter);
+#endif
+
+               } else /* READ */ {
+                       copied = copy_page_to_iter(page, offset, bytes, iter);
+               }
+
+               pos += copied;
+               count -= copied;
+
+               if (unlikely(copied < bytes)) {
+                       short_copies++;
+
+                       CDEBUG(D_VFSTRACE,
+                              "short copy - copied only %zd of %lu, short %d times\n",
+                              copied, bytes, short_copies);
+                       /* copies will very rarely be interrupted, but we
+                        * should retry in those cases, since the other option
+                        * is giving an IO error and this can occur in normal
+                        * operation such as with racing unaligned AIOs
+                        *
+                        * but of course we should not retry indefinitely
+                        */
+                       if (short_copies > 2) {
+                               CERROR("Unaligned DIO copy repeatedly short, count %zd, offset %lu, bytes %lu, copied %zd, pos %lld\n",
+                               count, offset, bytes, copied, pos);
+
+                               status = -EFAULT;
+                               break;
+                       }
+
+                       continue;
+               }
+
+               if (count == 0)
+                       break;
+
+               i++;
+       }
+
+out:
+       /* if we complete successfully, we should reach all of the pages */
+       LASSERTF(ergo(status == 0, i == pvec->ldp_count - 1),
+                "status: %d, i: %d, pvec->ldp_count %zu, count %zu\n",
+                 status, i, pvec->ldp_count, count);
+
+       if (write_iov && status == 0) {
+               /* The copy function we use modifies the count in the iovec,
+                * but that's actually the job of the caller, so we return the
+                * iovec to the original count
+                */
+               iov_iter_reexpand(iter, original_count);
+       }
+
+       if (mm_used)
+               kthread_unuse_mm(mm);
+
+       /* the total bytes copied, or status */
+       RETURN(original_count - count ? original_count - count : status);
+}
+EXPORT_SYMBOL(ll_dio_user_copy);
+
 /**
  * Indicate that transfer of a single page completed.
  */
index 0c90712..0507212 100644 (file)
@@ -424,7 +424,6 @@ static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt,
 
 static inline struct page *kmem_to_page(void *addr)
 {
-       LASSERT(!((unsigned long)addr & ~PAGE_MASK));
        if (is_vmalloc_addr(addr))
                return vmalloc_to_page(addr);
        else
index 74b0a06..8a20214 100755 (executable)
@@ -13562,6 +13562,8 @@ run_test 119c "Testing for direct read hitting hole"
 
 test_119e()
 {
+       (( $MDS1_VERSION >= $(version_code 2.15.58) )) ||
+               skip "Need server version at least 2.15.58"
        (( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs"
 
        local stripe_size=$((1024 * 1024)) #1 MiB
@@ -13576,14 +13578,27 @@ test_119e()
        dd if=/dev/urandom bs=$file_size count=1 of=$DIR/$tfile.1 ||
                error "buffered i/o to create file failed"
 
+       # trivial test of unaligned DIO
+       dd if=$DIR/$tfile.1 bs=4095 of=$DIR/$tfile.2 count=4 \
+               iflag=direct oflag=direct ||
+               error "trivial unaligned dio failed"
+
+       # Clean up before next part of test
+       rm -f $DIR/$tfile.2
+
        if zfs_or_rotational; then
                # DIO on ZFS can take up to 2 seconds per IO
                # rotational is better, but still slow.
                # Limit testing on those media to larger sizes
-               bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size"
+               bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+                       $((stripe_size + 1024))"
        else
-               bsizes="$PAGE_SIZE $((PAGE_SIZE * 4)) $stripe_size \
-                       $((stripe_size * 4))"
+               bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+                       $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+                       $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
+                       $((stripe_size - 1)) $stripe_size \
+                       $((stripe_size + 1)) $((stripe_size * 3/2)) \
+                       $((stripe_size * 4)) $((stripe_size * 4 + 1))"
        fi
 
        for bs in $bsizes; do
@@ -13624,10 +13639,15 @@ test_119f()
                # DIO on ZFS can take up to 2 seconds per IO
                # rotational is better, but still slow.
                # Limit testing on those media to larger sizes
-               bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size"
+               bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+                       $((stripe_size + 1024))"
        else
-               bsizes="$PAGE_SIZE $((PAGE_SIZE * 4)) $stripe_size \
-                       $((stripe_size * 4))"
+               bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+                       $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+                       $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
+                       $((stripe_size - 1)) $stripe_size \
+                       $((stripe_size + 1)) $((stripe_size * 3/2)) \
+                       $((stripe_size * 4)) $((stripe_size * 4 + 1))"
        fi
 
        for bs in $bsizes; do
@@ -13684,10 +13704,15 @@ test_119g()
                # DIO on ZFS can take up to 2 seconds per IO
                # rotational is better, but still slow.
                # Limit testing on those media to larger sizes
-               bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size"
+               bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+                       $((stripe_size + 1024))"
        else
-               bsizes="$PAGE_SIZE $((PAGE_SIZE * 4)) $stripe_size \
-                       $((stripe_size * 4))"
+               bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+                       $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+                       $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
+                       $((stripe_size - 1)) $stripe_size \
+                       $((stripe_size + 1)) $((stripe_size * 3/2)) \
+                       $((stripe_size * 4)) $((stripe_size * 4 + 1))"
        fi
 
        for bs in $bsizes; do
@@ -13719,6 +13744,123 @@ test_119g()
 }
 run_test 119g "dio vs buffered I/O race"
 
+test_119h()
+{
+       (( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs"
+
+       local stripe_size=$((1024 * 1024)) #1 MiB
+       # Max i/o below is ~ 4 * stripe_size, so this gives ~5 i/os
+       local file_size=$((25 * stripe_size))
+       local bsizes
+
+       stack_trap "rm -f $DIR/$tfile.*"
+
+       if zfs_or_rotational; then
+               # DIO on ZFS can take up to 2 seconds per IO
+               # rotational is better, but still slow.
+               # Limit testing on those media to larger sizes
+               bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+                       $((stripe_size + 1024))"
+       else
+               bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+                       $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+                       $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
+                       $((stripe_size - 1)) $stripe_size \
+                       $((stripe_size + 1)) $((stripe_size * 3/2)) \
+                       $((stripe_size * 4)) $((stripe_size * 4 + 1))"
+       fi
+
+       for bs in $bsizes; do
+               $LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.1
+               $LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.2
+               echo "unaligned writes of blocksize: $bs"
+               # Write a file with unaligned DIO and regular DIO, and compare
+               # them
+               # with 'u', multiop randomly unaligns the io from the buffer
+               $MULTIOP $DIR/$tfile.1 \
+               oO_CREAT:O_RDWR:O_DIRECT:wu${bs}wu${bs}wu${bs}wu${bs}wu${bs} ||
+                       error "multiop memory unaligned write failed, $bs"
+               $MULTIOP $DIR/$tfile.2 \
+               oO_CREAT:O_RDWR:O_DIRECT:w${bs}w${bs}w${bs}w${bs}w${bs} ||
+                       error "multiop memory aligned write failed, $bs"
+
+               cmp --verbose $DIR/$tfile.1 $DIR/$tfile.2 ||
+                       error "files differ, bsize $bs"
+               rm -f $DIR/$tfile.*
+       done
+
+       $LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.1
+       dd if=/dev/zero bs=$((stripe_size * 5)) of=$DIR/$tfile.1 count=5 ||
+               error "dd to create source file for read failed"
+
+       # Just a few quick tests to make sure unaligned DIO reads don't crash
+       for bs in $bsizes; do
+
+               echo "unaligned reads of blocksize: $bs"
+               # with 'u', multiop randomly unaligns the io from the buffer
+               $MULTIOP $DIR/$tfile.1 \
+               oO_CREAT:O_RDWR:O_DIRECT:ru${bs}ru${bs}ru${bs}ru${bs}ru${bs} ||
+                       error "multiop memory unaligned read failed, $bs"
+
+       done
+       rm -f $DIR/$tfile*
+}
+run_test 119h "basic tests of memory unaligned dio"
+
+# aiocp with the '-a' option makes testing memory unaligned aio trivial
+test_119i()
+{
+       (( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs"
+       which aiocp || skip_env "no aiocp installed"
+
+       local stripe_size=$((1024 * 1024)) #1 MiB
+       # Max i/o below is ~ 4 * stripe_size, so this gives ~5 i/os
+       local file_size=$((25 * stripe_size))
+       local bsizes
+
+       $LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.1
+       stack_trap "rm -f $DIR/$tfile.*"
+
+       # Just a bit bigger than the largest size in the test set below
+       dd if=/dev/urandom bs=$file_size count=1 of=$DIR/$tfile.1 ||
+               error "buffered i/o to create file failed"
+
+       if zfs_or_rotational; then
+               # DIO on ZFS can take up to 2 seconds per IO
+               # rotational is better, but still slow.
+               # Limit testing on those media to larger sizes
+               bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+                       $((stripe_size + 1024))"
+       else
+               bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+                       $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+                       $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
+                       $((stripe_size - 1)) $stripe_size \
+                       $((stripe_size + 1)) $((stripe_size * 3/2)) \
+                       $((stripe_size * 4)) $((stripe_size * 4 + 1))"
+       fi
+
+       # Do page aligned and NOT page aligned AIO
+       for align in 8 512 $((PAGE_SIZE)); do
+       # Deliberately includes a few aligned sizes
+       for bs in $bsizes; do
+               $LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.2
+
+               echo "bs: $bs, align: $align, file_size $file_size"
+               aiocp -a $align -b $bs -s $file_size -f O_DIRECT \
+                       $DIR/$tfile.1 $DIR/$tfile.2 ||
+                       error "unaligned aio failed, bs: $bs, align: $align"
+
+               $CHECKSTAT -t file -s $file_size $DIR/$tfile.2 ||
+                       error "size incorrect"
+               cmp --verbose $DIR/$tfile.1 $DIR/$tfile.2 ||
+                       error "files differ"
+               rm -f $DIR/$tfile.2
+       done
+       done
+}
+run_test 119i "test unaligned aio at varying sizes"
+
 test_120a() {
        [ $PARALLEL == "yes" ] && skip "skip parallel run"
        remote_mds_nodsh && skip "remote MDS with nodsh"
@@ -26552,9 +26694,10 @@ test_398d() { #  LU-13846
 
        diff $DIR/$tfile $aio_file || error "file diff after aiocp"
 
-       # make sure we don't crash and fail properly
-       aiocp -a 512 -b 64M -s 64M -f O_DIRECT $DIR/$tfile $aio_file &&
-               error "aio not aligned with PAGE SIZE should fail"
+       # test memory unaligned aio
+       aiocp -a 512 -b 64M -s 64M -f O_DIRECT $DIR/$tfile $aio_file ||
+               error "unaligned aio failed"
+       diff $DIR/$tfile $aio_file || error "file diff after aiocp"
 
        rm -f $DIR/$tfile $aio_file
 }
index f7230e4..bfc9dd6 100755 (executable)
@@ -728,10 +728,15 @@ test_16j()
                # DIO on ZFS can take up to 2 seconds per IO
                # rotational is better, but still slow.
                # Limit testing on those media to larger sizes
-               bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size"
+               bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+                       $((stripe_size + 1024))"
        else
-               bsizes="$PAGE_SIZE $((PAGE_SIZE * 4)) $stripe_size \
-                       $((stripe_size * 4))"
+               bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+                       $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+                       $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
+                       $((stripe_size - 1)) $stripe_size \
+                       $((stripe_size + 1)) $((stripe_size * 3/2)) \
+                       $((stripe_size * 4)) $((stripe_size * 4 + 1))"
        fi
 
        # 1 process (BIO or DIO) on each client