From 7194eb6431d2ef7245ef3b13394b60e220145187 Mon Sep 17 00:00:00 2001
From: Patrick Farrell <pfarrell@whamcloud.com>
Date: Thu, 27 Jul 2023 14:36:30 -0400
Subject: [PATCH] LU-13805 clio: bounce buffer for unaligned DIO

Direct I/O must normally be page aligned both in terms of
I/O size and memory alignment.  This is because the I/O
must be page aligned before being written to disk.  This
is also true for buffered I/O, but the I/O is aligned
using the page cache.

In recent versions of Lustre, direct I/O is significantly
faster than buffered I/O, due to lower overhead for page
management.  Thus it is desirable to be able to do more
I/O as direct I/O.

This patch allows unaligned direct I/O by creating a buffer
inside the kernel and aligning the I/O by copying in to
this aligned buffer.  Because the main cost of buffered I/O
is locking in the page cache rather than memcopy(), this is
still significantly faster than buffered I/O (though slower
than normal direct I/O).

This will eventually allow us to convert buffered I/O to
direct I/O when doing so would increase performance.

Here are some comparative benchmarks using IOR, all single
process.

UDIO is unaligned DIO.

io size  	1M           4M          16M          64M
----------------------------------------------------------
BIO Write  | 1502 MiB/s | 1382 MiB/s | 1683 MiB/s | 1677 MiB/s
BIO Read   | 2169 MiB/s | 1902 MiB/s | 2131 MiB.s | 1955 MiB/s
DIO Write  | 1010 MiB/s | 2778 MiB/s | 5905 MiB/s | 7917 MiB/s
DIO Read   | 893 MiB/s  | 2657 MiB/s | 4724 MiB/s | 7579 MiB/s
UDIO Write | 848 MiB/s  | 1666 MiB/s | 2117 MiB/s | 2243 MiB/s
UDIO Read  | 933 MIB/s  | 2412 MiB/s | 3690 MiB/s | 5370 MiB/s

Unaligned DIO offers benefits vs buffered write and
buffered read, but is of course slower than DIO.

Notice on this node the best case DIO performance is
~8 GiB/s.  On a node with 12 GiB/s best case DIO, best case
UDIO read is 8 GiB/s and best case UDIO write is 2.5 GiB/s.

This is because UDIO read is fully parallel, UDIO write is
not.

Signed-off-by: Patrick Farrell <pfarrell@whamcloud.com>
Change-Id: I7eeebf9a608f006c8095b95f0677adb99f19d640
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/45616
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Qian Yingjin <qian@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
---
 lustre/autoconf/lustre-core.m4 |  49 +++++++
 lustre/include/cl_object.h     |  25 +++-
 lustre/llite/rw26.c            |  96 +++++++++---
 lustre/obdclass/cl_io.c        | 321 ++++++++++++++++++++++++++++++++++++++---
 lustre/osd-zfs/osd_io.c        |   1 -
 lustre/tests/sanity.sh         | 167 +++++++++++++++++++--
 lustre/tests/sanityn.sh        |  11 +-
 7 files changed, 606 insertions(+), 64 deletions(-)

diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4
index d72b428..55dcc3a 100644
--- a/lustre/autoconf/lustre-core.m4
+++ b/lustre/autoconf/lustre-core.m4
@@ -3182,6 +3182,26 @@ fileattr_set, [
 EXTRA_KCFLAGS="$tmp_flags"
 ]) # LC_HAVE_FILEATTR_GET
 
+# LC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC
+#
+# Kernel 5.13 commit f0b65f39ac505e8f1dcdaa165aa7b8c0bd6fd454
+# iov_iter: replace iov_iter_copy_from_user_atomic() with iterator-advancing variant
+#
+AC_DEFUN([LC_SRC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC], [
+	LB2_LINUX_TEST_SRC([copy_page_from_iter_atomic], [
+		#include <linux/uio.h>
+	],[
+		copy_page_from_iter_atomic(NULL, 0, 0, NULL);
+	])
+])
+AC_DEFUN([LC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC], [
+	AC_MSG_CHECKING([if have copy_page_from_iter_atomic])
+	LB2_LINUX_TEST_RESULT([copy_page_from_iter_atomic], [
+		AC_DEFINE(HAVE_COPY_PAGE_FROM_ITER_ATOMIC, 1,
+			['copy_page_from_iter_atomic' exists])
+	])
+]) # LC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC
+
 #
 # LC_HAVE_GET_ACL_RCU_ARG
 #
@@ -3205,6 +3225,26 @@ AC_DEFUN([LC_HAVE_GET_ACL_RCU_ARG], [
 	])
 ]) # LC_HAVE_GET_ACL_RCU_ARG
 
+# LC_HAVE_FAULT_IN_IOV_ITER_READABLE
+#
+# Kernel 5.15 commit a6294593e8a1290091d0b078d5d33da5e0cd3dfe
+# iov_iter: Turn iov_iter_fault_in_readable into fault_in_iov_iter_readable
+#
+AC_DEFUN([LC_SRC_HAVE_FAULT_IN_IOV_ITER_READABLE], [
+	LB2_LINUX_TEST_SRC([fault_in_iov_iter_readable], [
+		#include <linux/uio.h>
+	],[
+		fault_in_iov_iter_readable(NULL, 0);
+	])
+])
+AC_DEFUN([LC_HAVE_FAULT_IN_IOV_ITER_READABLE], [
+	AC_MSG_CHECKING([if have fault_in_iov_iter_readable])
+	LB2_LINUX_TEST_RESULT([fault_in_iov_iter_readable], [
+		AC_DEFINE(HAVE_FAULT_IN_IOV_ITER_READABLE, 1,
+			['fault_in_iov_iter_readable' exists])
+	])
+]) # LC_HAVE_FAULT_IN_IOV_ITER_READABLE
+
 #
 # LC_HAVE_INVALIDATE_LOCK
 #
@@ -4129,8 +4169,12 @@ AC_DEFUN([LC_PROG_LINUX_SRC], [
 	# 5.12
 	LC_SRC_HAVE_USER_NAMESPACE_ARG
 
+	# 5.13
+	LC_SRC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC
+
 	# 5.15
 	LC_SRC_HAVE_GET_ACL_RCU_ARG
+	LC_SRC_HAVE_FAULT_IN_IOV_ITER_READABLE
 
 	# 5.16
 	LC_SRC_HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG
@@ -4398,10 +4442,15 @@ AC_DEFUN([LC_PROG_LINUX_RESULTS], [
 
 	# 5.13
 	LC_HAVE_FILEATTR_GET
+	LC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC
+
+        # 5.15
+        LC_HAVE_GET_ACL_RCU_ARG
 
 	# 5.15
 	LC_HAVE_GET_ACL_RCU_ARG
 	LC_HAVE_INVALIDATE_LOCK
+	LC_HAVE_FAULT_IN_IOV_ITER_READABLE
 
 	# 5.16
 	LC_HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG
diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h
index 4946a15..6689a30 100644
--- a/lustre/include/cl_object.h
+++ b/lustre/include/cl_object.h
@@ -1942,7 +1942,12 @@ struct cl_io {
 	 * to userspace, only the RPCs are submitted async, then waited for at
 	 * the llite layer before returning.
 	 */
-			     ci_parallel_dio:1;
+			     ci_parallel_dio:1,
+	/**
+	 * this DIO is at least partly unaligned, and so the unaligned DIO
+	 * path is being used for this entire IO
+	 */
+			     ci_unaligned_dio:1;
 	/**
 	 * Bypass quota check
 	 */
@@ -2521,7 +2526,7 @@ struct cl_dio_aio *cl_dio_aio_alloc(struct kiocb *iocb, struct cl_object *obj,
 				    bool is_aio);
 struct cl_sub_dio *cl_sub_dio_alloc(struct cl_dio_aio *ll_aio,
 				    struct iov_iter *iter, bool write,
-				    bool sync);
+				    bool unaligned, bool sync);
 void cl_dio_aio_free(const struct lu_env *env, struct cl_dio_aio *aio);
 void cl_sub_dio_free(struct cl_sub_dio *sdio);
 static inline void cl_sync_io_init(struct cl_sync_io *anchor, int nr)
@@ -2553,14 +2558,14 @@ struct cl_sync_io {
 /** direct IO pages */
 struct ll_dio_pages {
 	/*
-	 * page array to be written. we don't support
-	 * partial pages except the last one.
+	 * page array for RDMA - for aligned i/o, this is the user provided
+	 * pages, but for unaligned i/o, this is the internal buffer
 	 */
-	struct page             **ldp_pages;
+	struct page		**ldp_pages;
 	/** # of pages in the array. */
-	size_t                  ldp_count;
+	size_t			ldp_count;
 	/* the file offset of the first page. */
-	loff_t                  ldp_file_offset;
+	loff_t			ldp_file_offset;
 };
 
 /* Top level struct used for AIO and DIO */
@@ -2585,7 +2590,8 @@ struct cl_sub_dio {
 	struct ll_dio_pages	csd_dio_pages;
 	struct iov_iter		csd_iter;
 	unsigned		csd_creator_free:1,
-				csd_write:1;
+				csd_write:1,
+				csd_unaligned:1;
 };
 #if defined(HAVE_DIRECTIO_ITER) || defined(HAVE_IOV_ITER_RW) || \
 	defined(HAVE_DIRECTIO_2ARGS)
@@ -2593,6 +2599,9 @@ struct cl_sub_dio {
 #endif
 
 void ll_release_user_pages(struct page **pages, int npages);
+int ll_allocate_dio_buffer(struct ll_dio_pages *pvec, size_t io_size);
+void ll_free_dio_buffer(struct ll_dio_pages *pvec);
+ssize_t ll_dio_user_copy(struct cl_sub_dio *sdio, struct iov_iter *write_iov);
 
 #ifndef HAVE_KTHREAD_USE_MM
 #define kthread_use_mm(mm) use_mm(mm)
diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c
index ed009f8..8f9511d 100644
--- a/lustre/llite/rw26.c
+++ b/lustre/llite/rw26.c
@@ -492,26 +492,39 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw)
 	ssize_t tot_bytes = 0, result = 0;
 	loff_t file_offset = iocb->ki_pos;
 	bool sync_submit = false;
+	bool unaligned = false;
 	struct vvp_io *vio;
 	ssize_t rc2;
 
+	ENTRY;
+
 	/* Check EOF by ourselves */
 	if (rw == READ && file_offset >= i_size_read(inode))
-		return 0;
+		RETURN(0);
 
-	/* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */
 	if (file_offset & ~PAGE_MASK)
-		RETURN(-EINVAL);
+		unaligned = true;
 
-	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), size=%zd (max %lu), "
-	       "offset=%lld=%llx, pages %zd (max %lu)\n",
-	       PFID(ll_inode2fid(inode)), inode, count, MAX_DIO_SIZE,
-	       file_offset, file_offset, count >> PAGE_SHIFT,
-	       MAX_DIO_SIZE >> PAGE_SHIFT);
+	if (count & ~PAGE_MASK)
+		unaligned = true;
 
 	/* Check that all user buffers are aligned as well */
 	if (ll_iov_iter_alignment(iter) & ~PAGE_MASK)
-		RETURN(-EINVAL);
+		unaligned = true;
+
+	CDEBUG(D_VFSTRACE,
+	       "VFS Op:inode="DFID"(%p), size=%zd (max %lu), offset=%lld=%llx, pages %zd (max %lu)%s\n",
+	       PFID(ll_inode2fid(inode)), inode, count, MAX_DIO_SIZE,
+	       file_offset, file_offset,
+	       (count >> PAGE_SHIFT) + !!(count & ~PAGE_MASK),
+	       MAX_DIO_SIZE >> PAGE_SHIFT, unaligned ? ", unaligned" : "");
+
+	/* the requirement to not return EIOCBQUEUED for pipes (see bottom of
+	 * this function) plays havoc with the unaligned I/O lifecycle, so
+	 * don't allow unaligned I/O on pipes
+	 */
+	if (unaligned && iov_iter_is_pipe(iter))
+		RETURN(0);
 
 	lcc = ll_cl_find(inode);
 	if (lcc == NULL)
@@ -523,6 +536,15 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw)
 	io = lcc->lcc_io;
 	LASSERT(io != NULL);
 
+	/* if one part of an I/O is unaligned, just handle all of it that way -
+	 * otherwise we create significant complexities with managing the iovec
+	 * in different ways, etc, all for very marginal benefits
+	 */
+	if (unaligned)
+		io->ci_unaligned_dio = true;
+	if (io->ci_unaligned_dio)
+		unaligned = true;
+
 	ll_dio_aio = io->ci_dio_aio;
 	LASSERT(ll_dio_aio);
 	LASSERT(ll_dio_aio->cda_iocb == iocb);
@@ -557,14 +579,28 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw)
 		 * (either in this function or from a ptlrpcd daemon)
 		 */
 		sdio = cl_sub_dio_alloc(ll_dio_aio, iter, rw == WRITE,
-					sync_submit);
+					unaligned, sync_submit);
 		if (!sdio)
 			GOTO(out, result = -ENOMEM);
 
 		pvec = &sdio->csd_dio_pages;
 		pvec->ldp_file_offset = file_offset;
 
-		result = ll_get_user_pages(rw, iter, pvec, count);
+		if (!unaligned) {
+			result = ll_get_user_pages(rw, iter, pvec, count);
+			/* ll_get_user_pages returns bytes in the IO or error*/
+			count = result;
+		} else {
+			/* same calculation used in ll_get_user_pages */
+			count = min_t(size_t, count, iter->iov->iov_len);
+			result = ll_allocate_dio_buffer(pvec, count);
+			/* allocate_dio_buffer returns number of pages or
+			 * error, so do not set count = result
+			 */
+		}
+
+		/* now we have the actual count, so store it in the sdio */
+		sdio->csd_bytes = count;
 
 		if (unlikely(result <= 0)) {
 			cl_sync_io_note(env, &sdio->csd_sync, result);
@@ -574,10 +610,33 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw)
 			}
 			GOTO(out, result);
 		}
-		count = result;
 
-		result = ll_direct_rw_pages(env, io, count,
-					    rw, inode, sdio);
+		if (unaligned && rw == WRITE) {
+			result = ll_dio_user_copy(sdio, iter);
+			if (unlikely(result <= 0)) {
+				cl_sync_io_note(env, &sdio->csd_sync, result);
+				if (sync_submit) {
+					cl_sub_dio_free(sdio);
+					LASSERT(sdio->csd_creator_free);
+				}
+				GOTO(out, result);
+			}
+		}
+
+		result = ll_direct_rw_pages(env, io, count, rw, inode, sdio);
+		/* if the i/o was unsuccessful, we zero the number of bytes to
+		 * copy back.  Note that partial I/O completion isn't possible
+		 * here - I/O either completes or fails.  So there's no need to
+		 * handle short I/O here by changing 'count' with the result
+		 * from ll_direct_rw_pages.
+		 *
+		 * This must be done before we release the reference
+		 * immediately below, because releasing the reference allows
+		 * i/o completion (and copyback to userspace, if unaligned) to
+		 * start.
+		 */
+		if (result != 0)
+			sdio->csd_bytes = 0;
 		/* We've submitted pages and can now remove the extra
 		 * reference for that
 		 */
@@ -595,12 +654,15 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw)
 			GOTO(out, result);
 
 		iov_iter_advance(iter, count);
+
 		tot_bytes += count;
 		file_offset += count;
+		CDEBUG(D_VFSTRACE,
+		       "result %zd tot_bytes %zd count %zd file_offset %lld\n",
+		       result, tot_bytes, count, file_offset);
 	}
 
 out:
-
 	if (rw == WRITE)
 		vio->u.readwrite.vui_written += tot_bytes;
 	else
@@ -612,10 +674,10 @@ out:
 	if (result == 0 && !iov_iter_is_pipe(iter))
 		result = -EIOCBQUEUED;
 
-	return result;
+	RETURN(result);
 }
 
-#if defined(HAVE_DIO_ITER)
+#ifdef HAVE_DIO_ITER
 static ssize_t ll_direct_IO(
 #ifndef HAVE_IOV_ITER_RW
 	     int rw,
diff --git a/lustre/obdclass/cl_io.c b/lustre/obdclass/cl_io.c
index 2984c43..0cf0943 100644
--- a/lustre/obdclass/cl_io.c
+++ b/lustre/obdclass/cl_io.c
@@ -23,6 +23,7 @@
 #include <linux/sched.h>
 #include <linux/list.h>
 #include <linux/list_sort.h>
+#include <linux/mmu_context.h>
 #include <obd_class.h>
 #include <obd_support.h>
 #include <lustre_fid.h>
@@ -1206,8 +1207,34 @@ static void cl_sub_dio_end(const struct lu_env *env, struct cl_sync_io *anchor)
 		cl_page_list_del(env, &sdio->csd_pages, page);
 	}
 
-	ll_release_user_pages(sdio->csd_dio_pages.ldp_pages,
-			      sdio->csd_dio_pages.ldp_count);
+	if (sdio->csd_unaligned) {
+		/* save the iovec pointer before it's modified by
+		 * ll_dio_user_copy
+		 */
+		struct iovec *tmp = (struct iovec *) sdio->csd_iter.iov;
+
+		CDEBUG(D_VFSTRACE,
+		       "finishing unaligned dio %s aio->cda_bytes %ld\n",
+		       sdio->csd_write ? "write" : "read", sdio->csd_bytes);
+		/* read copies *from* the kernel buffer *to* userspace
+		 * here at the end, write copies *to* the kernel
+		 * buffer from userspace at the start
+		 */
+		if (!sdio->csd_write && sdio->csd_bytes > 0)
+			ret = ll_dio_user_copy(sdio, NULL);
+		ll_free_dio_buffer(&sdio->csd_dio_pages);
+		/* handle the freeing here rather than in cl_sub_dio_free
+		 * because we have the unmodified iovec pointer
+		 */
+		OBD_FREE_PTR(tmp);
+		sdio->csd_iter.iov = NULL;
+	} else {
+		/* unaligned DIO does not get user pages, so it doesn't have to
+		 * release them, but aligned I/O must
+		 */
+		ll_release_user_pages(sdio->csd_dio_pages.ldp_pages,
+				      sdio->csd_dio_pages.ldp_count);
+	}
 	cl_sync_io_note(env, &sdio->csd_ll_aio->cda_sync, ret);
 
 	EXIT;
@@ -1246,7 +1273,7 @@ EXPORT_SYMBOL(cl_dio_aio_alloc);
 
 struct cl_sub_dio *cl_sub_dio_alloc(struct cl_dio_aio *ll_aio,
 				    struct iov_iter *iter, bool write,
-				    bool sync)
+				    bool unaligned, bool sync)
 {
 	struct cl_sub_dio *sdio;
 
@@ -1261,27 +1288,33 @@ struct cl_sub_dio *cl_sub_dio_alloc(struct cl_dio_aio *ll_aio,
 		cl_page_list_init(&sdio->csd_pages);
 
 		sdio->csd_ll_aio = ll_aio;
-		atomic_add(1,  &ll_aio->cda_sync.csi_sync_nr);
 		sdio->csd_creator_free = sync;
 		sdio->csd_write = write;
+		sdio->csd_unaligned = unaligned;
 
-		/* we need to make a copy of the user iovec at this point in
-		 * time so we:
-		 * A) have the correct state of the iovec for this chunk of I/O
-		 * B) have a chunk-local copy; some of the things we want to
-		 * do to the iovec modify it, so to process each chunk from a
-		 * separate thread requires a local copy of the iovec
-		 */
-		memcpy(&sdio->csd_iter, iter,
-		       sizeof(struct iov_iter));
-		OBD_ALLOC_PTR(sdio->csd_iter.iov);
-		if (sdio->csd_iter.iov == NULL) {
-			cl_sub_dio_free(sdio);
-			sdio = NULL;
-			goto out;
+		atomic_add(1,  &ll_aio->cda_sync.csi_sync_nr);
+
+		if (unaligned) {
+			/* we need to make a copy of the user iovec at this
+			 * point in time, in order to:
+			 *
+			 * A) have the correct state of the iovec for this
+			 * chunk of I/O, ie, the main iovec is altered as we do
+			 * I/O and this chunk needs the current state
+			 * B) have a chunk-local copy; doing the IO later
+			 * modifies the iovec, so to process each chunk from a
+			 * separate thread requires a local copy of the iovec
+			 */
+			memcpy(&sdio->csd_iter, iter, sizeof(struct iov_iter));
+			OBD_ALLOC_PTR(sdio->csd_iter.iov);
+			if (sdio->csd_iter.iov == NULL) {
+				cl_sub_dio_free(sdio);
+				sdio = NULL;
+				goto out;
+			}
+			memcpy((void *) sdio->csd_iter.iov, iter->iov,
+			       sizeof(struct iovec));
 		}
-		memcpy((void *) sdio->csd_iter.iov, iter->iov,
-		       sizeof(struct iovec));
 	}
 out:
 	return sdio;
@@ -1302,16 +1335,96 @@ EXPORT_SYMBOL(cl_dio_aio_free);
 void cl_sub_dio_free(struct cl_sub_dio *sdio)
 {
 	if (sdio) {
-		void *tmp = (void *) sdio->csd_iter.iov;
+		void *tmp = (void *)sdio->csd_iter.iov;
 
-		if (tmp)
-			OBD_FREE(tmp, sizeof(struct iovec));
+		if (tmp) {
+			LASSERT(sdio->csd_unaligned);
+			OBD_FREE_PTR(tmp);
+		}
 		OBD_SLAB_FREE_PTR(sdio, cl_sub_dio_kmem);
 	}
 }
 EXPORT_SYMBOL(cl_sub_dio_free);
 
 /*
+ * For unaligned DIO.
+ *
+ * Allocate the internal buffer from/to which we will perform DIO.  This takes
+ * the user I/O parameters and allocates an internal buffer large enough to
+ * hold it.  The pages in this buffer are aligned with pages in the file (ie,
+ * they have a 1-to-1 mapping with file pages).
+ */
+int ll_allocate_dio_buffer(struct ll_dio_pages *pvec, size_t io_size)
+{
+	struct page *new_page;
+	size_t pg_offset;
+	int result = 0;
+	ssize_t i;
+
+	ENTRY;
+
+	/* page level offset in the file where the I/O starts */
+	pg_offset = pvec->ldp_file_offset & ~PAGE_MASK;
+	/* this adds 1 for the first page and removes the bytes in it from the
+	 * io_size, making the rest of the calculation aligned
+	 */
+	if (pg_offset) {
+		pvec->ldp_count++;
+		io_size -= min_t(size_t, PAGE_SIZE - pg_offset, io_size);
+	}
+
+	/* calculate pages for the rest of the buffer */
+	pvec->ldp_count += (io_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+#ifdef HAVE_DIO_ITER
+	pvec->ldp_pages = kvzalloc(pvec->ldp_count * sizeof(struct page *),
+				    GFP_NOFS);
+#else
+	OBD_ALLOC_PTR_ARRAY_LARGE(pvec->ldp_pages, pvec->ldp_count);
+#endif
+	if (pvec->ldp_pages == NULL)
+		RETURN(-ENOMEM);
+
+	for (i = 0; i < pvec->ldp_count; i++) {
+		new_page = alloc_page(GFP_NOFS);
+		if (!new_page) {
+			result = -ENOMEM;
+			pvec->ldp_count = i;
+			goto out;
+		}
+		pvec->ldp_pages[i] = new_page;
+	}
+	WARN_ON(i != pvec->ldp_count);
+
+out:
+	if (result) {
+		if (pvec->ldp_pages)
+			ll_free_dio_buffer(pvec);
+	}
+
+	if (result == 0)
+		result = pvec->ldp_count;
+
+	RETURN(result);
+}
+EXPORT_SYMBOL(ll_allocate_dio_buffer);
+
+void ll_free_dio_buffer(struct ll_dio_pages *pvec)
+{
+	int i;
+
+	for (i = 0; i < pvec->ldp_count; i++)
+		__free_page(pvec->ldp_pages[i]);
+
+#ifdef HAVE_DIO_ITER
+	kfree(pvec->ldp_pages);
+#else
+	OBD_FREE_PTR_ARRAY_LARGE(pvec->ldp_pages, pvec->ldp_count);
+#endif
+}
+EXPORT_SYMBOL(ll_free_dio_buffer);
+
+/*
  * ll_release_user_pages - tear down page struct array
  * @pages: array of page struct pointers underlying target buffer
  */
@@ -1338,6 +1451,168 @@ void ll_release_user_pages(struct page **pages, int npages)
 }
 EXPORT_SYMBOL(ll_release_user_pages);
 
+#ifdef HAVE_FAULT_IN_IOV_ITER_READABLE
+#define ll_iov_iter_fault_in_readable(iov, bytes) \
+	fault_in_iov_iter_readable(iov, bytes)
+#else
+#define ll_iov_iter_fault_in_readable(iov, bytes) \
+	iov_iter_fault_in_readable(iov, bytes)
+#endif
+
+#ifndef HAVE_KTHREAD_USE_MM
+#define kthread_use_mm(mm) use_mm(mm)
+#define kthread_unuse_mm(mm) unuse_mm(mm)
+#endif
+
+/* copy IO data to/from internal buffer and userspace iovec */
+ssize_t ll_dio_user_copy(struct cl_sub_dio *sdio, struct iov_iter *write_iov)
+{
+	struct iov_iter *iter = write_iov ? write_iov : &sdio->csd_iter;
+	struct ll_dio_pages *pvec = &sdio->csd_dio_pages;
+	struct mm_struct *mm = sdio->csd_ll_aio->cda_mm;
+	loff_t pos = pvec->ldp_file_offset;
+	size_t count = sdio->csd_bytes;
+	size_t original_count = count;
+	int short_copies = 0;
+	bool mm_used = false;
+	int status = 0;
+	int i = 0;
+	int rw;
+
+	ENTRY;
+
+	LASSERT(sdio->csd_unaligned);
+
+	if (sdio->csd_write)
+		rw = WRITE;
+	else
+		rw = READ;
+
+	/* if there's no mm, io is being done from a kernel thread, so there's
+	 * no need to transition to its mm context anyway.
+	 *
+	 * Also, if mm == current->mm, that means this is being handled in the
+	 * thread which created it, and not in a separate kthread - so it is
+	 * unnecessary (and incorrect) to do a use_mm here
+	 */
+	if (mm && mm != current->mm) {
+		kthread_use_mm(mm);
+		mm_used = true;
+	}
+
+	/* fault in the entire userspace iovec */
+	if (rw == WRITE) {
+		if (unlikely(ll_iov_iter_fault_in_readable(iter, count)))
+			GOTO(out, status = -EFAULT);
+	}
+
+	/* modeled on kernel generic_file_buffered_read/write()
+	 *
+	 * note we only have one 'chunk' of i/o here, so we do not copy the
+	 * whole iovec here (except when the chunk is the whole iovec) so we
+	 * use the count of bytes in the chunk, csd_bytes, instead of looking
+	 * at the iovec
+	 */
+	while (true) {
+		struct page *page = pvec->ldp_pages[i];
+		unsigned long offset; /* offset into kernel buffer page */
+		size_t copied; /* bytes successfully copied */
+		size_t bytes; /* bytes to copy for this page */
+
+		LASSERT(i < pvec->ldp_count);
+
+		offset = pos & ~PAGE_MASK;
+		bytes = min_t(unsigned long, PAGE_SIZE - offset,
+			      count);
+
+		CDEBUG(D_VFSTRACE,
+		       "count %zd, offset %lu, pos %lld, ldp_count %lu\n",
+		       count, offset, pos, pvec->ldp_count);
+
+		if (fatal_signal_pending(current)) {
+			status = -EINTR;
+			break;
+		}
+
+		/* write requires a few extra steps */
+		if (rw == WRITE) {
+			/* like btrfs, we do not have a mapping since this isn't
+			 * a page cache page, so we must do this flush
+			 * unconditionally
+			 *
+			 * NB: This is a noop on x86 but active on other
+			 * architectures
+			 */
+			flush_dcache_page(page);
+
+#ifndef HAVE_COPY_PAGE_FROM_ITER_ATOMIC
+			copied = iov_iter_copy_from_user_atomic(page, iter,
+								offset, bytes);
+			iov_iter_advance(iter, copied);
+#else
+			copied = copy_page_from_iter_atomic(page, offset, bytes,
+							    iter);
+#endif
+
+		} else /* READ */ {
+			copied = copy_page_to_iter(page, offset, bytes, iter);
+		}
+
+		pos += copied;
+		count -= copied;
+
+		if (unlikely(copied < bytes)) {
+			short_copies++;
+
+			CDEBUG(D_VFSTRACE,
+			       "short copy - copied only %zd of %lu, short %d times\n",
+			       copied, bytes, short_copies);
+			/* copies will very rarely be interrupted, but we
+			 * should retry in those cases, since the other option
+			 * is giving an IO error and this can occur in normal
+			 * operation such as with racing unaligned AIOs
+			 *
+			 * but of course we should not retry indefinitely
+			 */
+			if (short_copies > 2) {
+				CERROR("Unaligned DIO copy repeatedly short, count %zd, offset %lu, bytes %lu, copied %zd, pos %lld\n",
+				count, offset, bytes, copied, pos);
+
+				status = -EFAULT;
+				break;
+			}
+
+			continue;
+		}
+
+		if (count == 0)
+			break;
+
+		i++;
+	}
+
+out:
+	/* if we complete successfully, we should reach all of the pages */
+	LASSERTF(ergo(status == 0, i == pvec->ldp_count - 1),
+		 "status: %d, i: %d, pvec->ldp_count %zu, count %zu\n",
+		  status, i, pvec->ldp_count, count);
+
+	if (write_iov && status == 0) {
+		/* The copy function we use modifies the count in the iovec,
+		 * but that's actually the job of the caller, so we return the
+		 * iovec to the original count
+		 */
+		iov_iter_reexpand(iter, original_count);
+	}
+
+	if (mm_used)
+		kthread_unuse_mm(mm);
+
+	/* the total bytes copied, or status */
+	RETURN(original_count - count ? original_count - count : status);
+}
+EXPORT_SYMBOL(ll_dio_user_copy);
+
 /**
  * Indicate that transfer of a single page completed.
  */
diff --git a/lustre/osd-zfs/osd_io.c b/lustre/osd-zfs/osd_io.c
index 0c90712..0507212 100644
--- a/lustre/osd-zfs/osd_io.c
+++ b/lustre/osd-zfs/osd_io.c
@@ -424,7 +424,6 @@ static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt,
 
 static inline struct page *kmem_to_page(void *addr)
 {
-	LASSERT(!((unsigned long)addr & ~PAGE_MASK));
 	if (is_vmalloc_addr(addr))
 		return vmalloc_to_page(addr);
 	else
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index 74b0a06..8a20214 100755
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -13562,6 +13562,8 @@ run_test 119c "Testing for direct read hitting hole"
 
 test_119e()
 {
+	(( $MDS1_VERSION >= $(version_code 2.15.58) )) ||
+		skip "Need server version at least 2.15.58"
 	(( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs"
 
 	local stripe_size=$((1024 * 1024)) #1 MiB
@@ -13576,14 +13578,27 @@ test_119e()
 	dd if=/dev/urandom bs=$file_size count=1 of=$DIR/$tfile.1 ||
 		error "buffered i/o to create file failed"
 
+	# trivial test of unaligned DIO
+	dd if=$DIR/$tfile.1 bs=4095 of=$DIR/$tfile.2 count=4 \
+		iflag=direct oflag=direct ||
+		error "trivial unaligned dio failed"
+
+	# Clean up before next part of test
+	rm -f $DIR/$tfile.2
+
 	if zfs_or_rotational; then
 		# DIO on ZFS can take up to 2 seconds per IO
 		# rotational is better, but still slow.
 		# Limit testing on those media to larger sizes
-		bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size"
+		bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+			$((stripe_size + 1024))"
 	else
-		bsizes="$PAGE_SIZE $((PAGE_SIZE * 4)) $stripe_size \
-			$((stripe_size * 4))"
+		bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+			$((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+			$((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
+			$((stripe_size - 1)) $stripe_size \
+			$((stripe_size + 1)) $((stripe_size * 3/2)) \
+			$((stripe_size * 4)) $((stripe_size * 4 + 1))"
 	fi
 
 	for bs in $bsizes; do
@@ -13624,10 +13639,15 @@ test_119f()
 		# DIO on ZFS can take up to 2 seconds per IO
 		# rotational is better, but still slow.
 		# Limit testing on those media to larger sizes
-		bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size"
+		bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+			$((stripe_size + 1024))"
 	else
-		bsizes="$PAGE_SIZE $((PAGE_SIZE * 4)) $stripe_size \
-			$((stripe_size * 4))"
+		bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+			$((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+			$((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
+			$((stripe_size - 1)) $stripe_size \
+			$((stripe_size + 1)) $((stripe_size * 3/2)) \
+			$((stripe_size * 4)) $((stripe_size * 4 + 1))"
 	fi
 
 	for bs in $bsizes; do
@@ -13684,10 +13704,15 @@ test_119g()
 		# DIO on ZFS can take up to 2 seconds per IO
 		# rotational is better, but still slow.
 		# Limit testing on those media to larger sizes
-		bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size"
+		bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+			$((stripe_size + 1024))"
 	else
-		bsizes="$PAGE_SIZE $((PAGE_SIZE * 4)) $stripe_size \
-			$((stripe_size * 4))"
+		bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+			$((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+			$((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
+			$((stripe_size - 1)) $stripe_size \
+			$((stripe_size + 1)) $((stripe_size * 3/2)) \
+			$((stripe_size * 4)) $((stripe_size * 4 + 1))"
 	fi
 
 	for bs in $bsizes; do
@@ -13719,6 +13744,123 @@ test_119g()
 }
 run_test 119g "dio vs buffered I/O race"
 
+test_119h()
+{
+	(( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs"
+
+	local stripe_size=$((1024 * 1024)) #1 MiB
+	# Max i/o below is ~ 4 * stripe_size, so this gives ~5 i/os
+	local file_size=$((25 * stripe_size))
+	local bsizes
+
+	stack_trap "rm -f $DIR/$tfile.*"
+
+	if zfs_or_rotational; then
+		# DIO on ZFS can take up to 2 seconds per IO
+		# rotational is better, but still slow.
+		# Limit testing on those media to larger sizes
+		bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+			$((stripe_size + 1024))"
+	else
+		bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+			$((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+			$((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
+			$((stripe_size - 1)) $stripe_size \
+			$((stripe_size + 1)) $((stripe_size * 3/2)) \
+			$((stripe_size * 4)) $((stripe_size * 4 + 1))"
+	fi
+
+	for bs in $bsizes; do
+		$LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.1
+		$LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.2
+		echo "unaligned writes of blocksize: $bs"
+		# Write a file with unaligned DIO and regular DIO, and compare
+		# them
+		# with 'u', multiop randomly unaligns the io from the buffer
+		$MULTIOP $DIR/$tfile.1 \
+		oO_CREAT:O_RDWR:O_DIRECT:wu${bs}wu${bs}wu${bs}wu${bs}wu${bs} ||
+			error "multiop memory unaligned write failed, $bs"
+		$MULTIOP $DIR/$tfile.2 \
+		oO_CREAT:O_RDWR:O_DIRECT:w${bs}w${bs}w${bs}w${bs}w${bs} ||
+			error "multiop memory aligned write failed, $bs"
+
+		cmp --verbose $DIR/$tfile.1 $DIR/$tfile.2 ||
+			error "files differ, bsize $bs"
+		rm -f $DIR/$tfile.*
+	done
+
+	$LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.1
+	dd if=/dev/zero bs=$((stripe_size * 5)) of=$DIR/$tfile.1 count=5 ||
+		error "dd to create source file for read failed"
+
+	# Just a few quick tests to make sure unaligned DIO reads don't crash
+	for bs in $bsizes; do
+
+		echo "unaligned reads of blocksize: $bs"
+		# with 'u', multiop randomly unaligns the io from the buffer
+		$MULTIOP $DIR/$tfile.1 \
+		oO_CREAT:O_RDWR:O_DIRECT:ru${bs}ru${bs}ru${bs}ru${bs}ru${bs} ||
+			error "multiop memory unaligned read failed, $bs"
+
+	done
+	rm -f $DIR/$tfile*
+}
+run_test 119h "basic tests of memory unaligned dio"
+
+# aiocp with the '-a' option makes testing memory unaligned aio trivial
+test_119i()
+{
+	(( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs"
+	which aiocp || skip_env "no aiocp installed"
+
+	local stripe_size=$((1024 * 1024)) #1 MiB
+	# Max i/o below is ~ 4 * stripe_size, so this gives ~5 i/os
+	local file_size=$((25 * stripe_size))
+	local bsizes
+
+	$LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.1
+	stack_trap "rm -f $DIR/$tfile.*"
+
+	# Just a bit bigger than the largest size in the test set below
+	dd if=/dev/urandom bs=$file_size count=1 of=$DIR/$tfile.1 ||
+		error "buffered i/o to create file failed"
+
+	if zfs_or_rotational; then
+		# DIO on ZFS can take up to 2 seconds per IO
+		# rotational is better, but still slow.
+		# Limit testing on those media to larger sizes
+		bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+			$((stripe_size + 1024))"
+	else
+		bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+			$((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+			$((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
+			$((stripe_size - 1)) $stripe_size \
+			$((stripe_size + 1)) $((stripe_size * 3/2)) \
+			$((stripe_size * 4)) $((stripe_size * 4 + 1))"
+	fi
+
+	# Do page aligned and NOT page aligned AIO
+	for align in 8 512 $((PAGE_SIZE)); do
+	# Deliberately includes a few aligned sizes
+	for bs in $bsizes; do
+		$LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.2
+
+		echo "bs: $bs, align: $align, file_size $file_size"
+		aiocp -a $align -b $bs -s $file_size -f O_DIRECT \
+			$DIR/$tfile.1 $DIR/$tfile.2 ||
+			error "unaligned aio failed, bs: $bs, align: $align"
+
+		$CHECKSTAT -t file -s $file_size $DIR/$tfile.2 ||
+			error "size incorrect"
+		cmp --verbose $DIR/$tfile.1 $DIR/$tfile.2 ||
+			error "files differ"
+		rm -f $DIR/$tfile.2
+	done
+	done
+}
+run_test 119i "test unaligned aio at varying sizes"
+
 test_120a() {
 	[ $PARALLEL == "yes" ] && skip "skip parallel run"
 	remote_mds_nodsh && skip "remote MDS with nodsh"
@@ -26552,9 +26694,10 @@ test_398d() { #  LU-13846
 
 	diff $DIR/$tfile $aio_file || error "file diff after aiocp"
 
-	# make sure we don't crash and fail properly
-	aiocp -a 512 -b 64M -s 64M -f O_DIRECT $DIR/$tfile $aio_file &&
-		error "aio not aligned with PAGE SIZE should fail"
+	# test memory unaligned aio
+	aiocp -a 512 -b 64M -s 64M -f O_DIRECT $DIR/$tfile $aio_file ||
+		error "unaligned aio failed"
+	diff $DIR/$tfile $aio_file || error "file diff after aiocp"
 
 	rm -f $DIR/$tfile $aio_file
 }
diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh
index f7230e4..bfc9dd6 100755
--- a/lustre/tests/sanityn.sh
+++ b/lustre/tests/sanityn.sh
@@ -728,10 +728,15 @@ test_16j()
 		# DIO on ZFS can take up to 2 seconds per IO
 		# rotational is better, but still slow.
 		# Limit testing on those media to larger sizes
-		bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size"
+		bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+			$((stripe_size + 1024))"
 	else
-		bsizes="$PAGE_SIZE $((PAGE_SIZE * 4)) $stripe_size \
-			$((stripe_size * 4))"
+		bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+			$((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+			$((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
+			$((stripe_size - 1)) $stripe_size \
+			$((stripe_size + 1)) $((stripe_size * 3/2)) \
+			$((stripe_size * 4)) $((stripe_size * 4 + 1))"
 	fi
 
 	# 1 process (BIO or DIO) on each client
-- 
1.8.3.1