From 1fa633c2031c3e34072d5486aeedfbf222091ac7 Mon Sep 17 00:00:00 2001
From: Patrick Farrell
Date: Wed, 24 Apr 2024 18:26:04 -0400
Subject: [PATCH] LU-17478 clio: parallelize unaligned DIO write copy

The data copying for unaligned/hybrid IO reads is already parallel,
because it is done by the ptlrpcd threads at the end of the IO.  The
copying for writes is not - it is done by the submitting thread during
IO submission.  This has a huge performance impact, limiting writes to
around 3.0 GiB/s when reads run at 12 GiB/s.

With the iov_iter issue fixed, we can move this copying out of IO
submission and do it during RPC generation instead, where it is
parallelized across the ptlrpcd threads.

With this and the patch to use page pools for buffer allocation
(https://review.whamcloud.com/53670), the maximum performance of
hybrid IO is basically the same as that of DIO, at least at current
speeds.  This means hybrid reads and writes at 20 GiB/s with current
master plus this patch and the pool patch.

Note this requires a funny workaround: if a user thread calls fsync
while a DIO write is in progress, the user thread can pick that write
up at the RPC layer and become responsible for writing it out, even
though the write isn't in cache.  (This can happen because the write
is still waiting to be picked up by a ptlrpcd thread.)  If that DIO
write is unaligned, the user thread is unable to do the memory copy,
since it cannot attach to the submitting thread's address space.  It's
not an option to have the thread ignore a ready RPC, so instead we
spawn a kthread to handle this case.  This only occurs when DIO races
with fsync, so performance doesn't really matter and this cost is
acceptable.

Signed-off-by: Patrick Farrell
Change-Id: Ic8209e1fda97cda83e5b87baba48d15dd4dcc15f
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53844
Reviewed-by: Andreas Dilger
Reviewed-by: Shaun Tancheff
Reviewed-by: Oleg Drokin
Tested-by: jenkins
Tested-by: Maloo
---
 lustre/include/cl_object.h  |  4 +++-
 lustre/include/lustre_osc.h |  2 ++
 lustre/llite/rw26.c         | 12 ------------
 lustre/obdclass/cl_io.c     | 58 +++++++++++++++++++++++++++++++++++++++++++-
 lustre/obdclass/jobid.c     |  4 ++--
 lustre/osc/osc_cache.c      | 11 +++++++++++
 lustre/osc/osc_request.c    | 22 ++++++++++++++++++
 7 files changed, 97 insertions(+), 16 deletions(-)

diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h
index 45f0551..0c6c81a 100644
--- a/lustre/include/cl_object.h
+++ b/lustre/include/cl_object.h
@@ -2552,9 +2552,11 @@ struct cl_sub_dio {
 	struct ll_dio_pages	csd_dio_pages;
 	struct iov_iter		csd_iter;
 	struct cl_iter_dup	csd_dup;
+	spinlock_t		csd_lock;
 	unsigned		csd_creator_free:1,
 				csd_write:1,
-				csd_unaligned:1;
+				csd_unaligned:1,
+				csd_write_copied:1;
 };
 
 static inline u64 cl_io_nob_aligned(u64 off, u32 nob, u32 pgsz)
diff --git a/lustre/include/lustre_osc.h b/lustre/include/lustre_osc.h
index d1a45d8..4713551 100644
--- a/lustre/include/lustre_osc.h
+++ b/lustre/include/lustre_osc.h
@@ -287,6 +287,7 @@ struct osc_object {
 	struct list_head	oo_hp_exts;	/* list of hp extents */
 	struct list_head	oo_urgent_exts;	/* list of writeback extents */
 	struct list_head	oo_full_exts;
+	struct list_head	oo_dio_exts;
 
 	struct list_head	oo_reading_exts;
 
@@ -963,6 +964,7 @@ struct osc_extent {
 	unsigned int		oe_nr_pages;
 	/** list of pending oap pages.  Pages in this list are NOT sorted. */
 	struct list_head	oe_pages;
+	struct cl_sub_dio	*oe_csd;
 	/** start and end index of this extent, include start and end
 	 * themselves.  Page offset here is the page index of osc_pages.
 	 * oe_start is used as keyword for red-black tree.
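The csd_lock/csd_write_copied pair added above is a "copy exactly once"
guard: several threads can pick up extents backed by the same sdio while
building RPCs, so the flag is peeked at locklessly and then re-tested
under the lock before the copy is done.  A minimal standalone sketch of
that pattern follows (illustrative names, not Lustre code; the guarded
work here is a plain kernel-memory copy, which is safe under a spinlock):

#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/string.h>

struct once_copy {
	spinlock_t	lock;		/* serializes the flag update */
	bool		copied;		/* set exactly once */
	void		*dst, *src;
	size_t		len;
};

static void copy_exactly_once(struct once_copy *oc)
{
	if (oc->copied)			/* unlocked peek, an optimization */
		return;

	spin_lock(&oc->lock);
	if (!oc->copied) {		/* re-test under the lock */
		memcpy(oc->dst, oc->src, oc->len);
		oc->copied = true;
	}
	spin_unlock(&oc->lock);
}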
diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c
index bffe8be..5a5f6fc 100644
--- a/lustre/llite/rw26.c
+++ b/lustre/llite/rw26.c
@@ -639,18 +639,6 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw)
 		GOTO(out, result);
 	}
 
-	if (unaligned && rw == WRITE) {
-		result = ll_dio_user_copy(sdio);
-		if (unlikely(result <= 0)) {
-			cl_sync_io_note(env, &sdio->csd_sync, result);
-			if (sync_submit) {
-				LASSERT(sdio->csd_creator_free);
-				cl_sub_dio_free(sdio);
-			}
-			GOTO(out, result);
-		}
-	}
-
 	result = ll_direct_rw_pages(env, io, count, rw, inode, sdio);
 	/* if the i/o was unsuccessful, we zero the number of bytes to
 	 * copy back.  Note that partial I/O completion isn't possible
diff --git a/lustre/obdclass/cl_io.c b/lustre/obdclass/cl_io.c
index 7c9672b..3c28112 100644
--- a/lustre/obdclass/cl_io.c
+++ b/lustre/obdclass/cl_io.c
@@ -1291,6 +1291,7 @@ struct cl_sub_dio *cl_sub_dio_alloc(struct cl_dio_aio *ll_aio,
 	sdio->csd_creator_free = sync;
 	sdio->csd_write = write;
 	sdio->csd_unaligned = unaligned;
+	spin_lock_init(&sdio->csd_lock);
 
 	atomic_add(1, &ll_aio->cda_sync.csi_sync_nr);
 
@@ -1464,7 +1465,7 @@ EXPORT_SYMBOL(ll_release_user_pages);
 #endif
 
 /* copy IO data to/from internal buffer and userspace iovec */
-ssize_t ll_dio_user_copy(struct cl_sub_dio *sdio)
+ssize_t __ll_dio_user_copy(struct cl_sub_dio *sdio)
 {
 	struct iov_iter *iter = &sdio->csd_iter;
 	struct ll_dio_pages *pvec = &sdio->csd_dio_pages;
@@ -1493,8 +1494,13 @@ ssize_t ll_dio_user_copy(struct cl_sub_dio *sdio)
 	 * Also, if mm == current->mm, that means this is being handled in the
 	 * thread which created it, and not in a separate kthread - so it is
 	 * unnecessary (and incorrect) to do a use_mm here
+	 *
+	 * assert that if we have an mm and it's not ours, we're doing this
+	 * copying from a kernel thread - otherwise kthread_use_mm will happily
+	 * trash memory and crash later
 	 */
 	if (mm && mm != current->mm) {
+		LASSERT(current->flags & PF_KTHREAD);
 		kthread_use_mm(mm);
 		mm_used = true;
 	}
@@ -1602,6 +1608,56 @@ out:
 	/* the total bytes copied, or status */
 	RETURN(original_count - count ? original_count - count : status);
 }
+
+struct dio_user_copy_data {
+	struct cl_sub_dio *ducd_sdio;
+	struct completion ducd_completion;
+	ssize_t ducd_result;
+};
+
+int ll_dio_user_copy_helper(void *data)
+{
+	struct dio_user_copy_data *ducd = data;
+	struct cl_sub_dio *sdio = ducd->ducd_sdio;
+
+	ducd->ducd_result = __ll_dio_user_copy(sdio);
+	complete(&ducd->ducd_completion);
+
+	return 0;
+}
+
+ssize_t ll_dio_user_copy(struct cl_sub_dio *sdio)
+{
+	struct dio_user_copy_data ducd;
+	struct task_struct *kthread;
+
+	/* normal case - copy is being done by ptlrpcd */
+	if (current->flags & PF_KTHREAD ||
+	    /* for non-parallel DIO, the submitting thread does the copy */
+	    sdio->csd_ll_aio->cda_mm == current->mm)
+		return __ll_dio_user_copy(sdio);
+
+	/* this is a slightly unfortunate workaround; when doing an fsync, a
+	 * user thread may pick up a DIO extent which is about to be written
+	 * out.  we can't just ignore these, but we also can't handle them from
+	 * the user thread, since user threads can't do data copying from
+	 * another thread's memory.
+	 *
+	 * so we spawn a kthread to handle this case.
+	 * this will be rare and is not a 'hot path', so the performance
+	 * cost doesn't matter
+	 */
+	init_completion(&ducd.ducd_completion);
+	ducd.ducd_sdio = sdio;
+
+	kthread = kthread_run(ll_dio_user_copy_helper, &ducd,
+			      "ll_ucp_%u", current->pid);
+	if (IS_ERR_OR_NULL(kthread))
+		return PTR_ERR(kthread);
+	wait_for_completion(&ducd.ducd_completion);
+
+	return ducd.ducd_result;
+}
 EXPORT_SYMBOL(ll_dio_user_copy);
 
 /**
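The LASSERT(current->flags & PF_KTHREAD) above exists because
kthread_use_mm() may only be called from a true kernel thread: it makes
the borrowed mm the thread's current address space, so that
copy_from_user()/copy_to_user() resolve user pointers against the
submitting process's memory.  A hypothetical sketch of that borrowing
step (illustrative names and context struct, not Lustre code; API names
are the post-5.8 kthread_use_mm/kthread_unuse_mm variants):

#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/uaccess.h>
#include <linux/mm_types.h>

struct copy_ctx {
	struct mm_struct	*mm;		/* submitter's mm */
	void __user		*ubuf;		/* source in user space */
	void			*kbuf;		/* destination in kernel space */
	size_t			len;
	unsigned long		uncopied;	/* bytes NOT copied, 0 = success */
	struct completion	done;
};

static int borrow_mm_copy(void *data)
{
	struct copy_ctx *ctx = data;

	kthread_use_mm(ctx->mm);	/* legal only with PF_KTHREAD set */
	ctx->uncopied = copy_from_user(ctx->kbuf, ctx->ubuf, ctx->len);
	kthread_unuse_mm(ctx->mm);	/* always drop the borrowed mm */
	complete(&ctx->done);

	return 0;
}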
diff --git a/lustre/obdclass/jobid.c b/lustre/obdclass/jobid.c
index 78d8ea7..0d7e8a6 100644
--- a/lustre/obdclass/jobid.c
+++ b/lustre/obdclass/jobid.c
@@ -517,7 +517,7 @@ static bool jobid_name_is_valid(char *jobid)
 	const char *const lustre_reserved[] = { "ll_ping", "ptlrpc",
 						"ldlm", "ll_sa", "kworker",
 						"kswapd", "writeback", "irq",
-						"ksoftirq", NULL };
+						"ksoftirq", "ll_ucp", NULL };
 	int i;
 
 	if (jobid[0] == '\0')
@@ -669,7 +669,7 @@ out:
 static int jobid_print_current_comm(char *jobid, ssize_t joblen)
 {
 	const char *const names[] = {"kworker", "kswapd", "ll_sa", "ll_agl",
-				     "ldlm_bl", NULL};
+				     "ldlm_bl", "ll_ucp", NULL};
 	int i;
 
 	if (current->flags & PF_KTHREAD) {
diff --git a/lustre/osc/osc_cache.c b/lustre/osc/osc_cache.c
index b9e10d1..c4c13f4 100644
--- a/lustre/osc/osc_cache.c
+++ b/lustre/osc/osc_cache.c
@@ -2585,6 +2585,17 @@ int osc_queue_sync_pages(const struct lu_env *env, struct cl_io *io,
 	ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK);
 	ext->oe_ndelay = !!(brw_flags & OBD_BRW_NDELAY);
 	ext->oe_dio = !!(brw_flags & OBD_BRW_NOCACHE);
+	if (ext->oe_dio) {
+		struct cl_sync_io *anchor;
+		struct cl_page *clpage;
+
+		oap = list_first_entry(list, struct osc_async_page,
+				       oap_pending_item);
+		clpage = oap2cl_page(oap);
+		LASSERT(clpage->cp_type == CPT_TRANSIENT);
+		anchor = clpage->cp_sync_io;
+		ext->oe_csd = anchor->csi_dio_aio;
+	}
 	oscl = oio->oi_write_osclock ? : oio->oi_read_osclock;
 	if (oscl && oscl->ols_dlmlock != NULL) {
 		ext->oe_dlmlock = LDLM_LOCK_GET(oscl->ols_dlmlock);
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c
index aafb9c6..c016380 100644
--- a/lustre/osc/osc_request.c
+++ b/lustre/osc/osc_request.c
@@ -2762,6 +2762,8 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
 
 	/* add pages into rpc_list to build BRW rpc */
 	list_for_each_entry(ext, ext_list, oe_link) {
+		struct cl_sub_dio *sdio = ext->oe_csd;
+
 		LASSERT(ext->oe_state == OES_RPC);
 		mem_tight |= ext->oe_memalloc;
 		grant += ext->oe_grants;
@@ -2769,6 +2771,26 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
 		layout_version = max(layout_version, ext->oe_layout_version);
 		if (obj == NULL)
 			obj = ext->oe_obj;
+
+		/* for unaligned writes, we do the data copying here */
+		if (sdio && sdio->csd_unaligned && sdio->csd_write &&
+		    !sdio->csd_write_copied) {
+			/* note a single sdio can correspond to multiple RPCs,
+			 * so we use this lock to ensure the data copy is only
+			 * done once (an sdio can also correspond to multiple
+			 * extents, which is also handled by this)
+			 */
+			spin_lock(&sdio->csd_lock);
+			if (!sdio->csd_write_copied) {
+				rc = ll_dio_user_copy(sdio);
+				if (rc <= 0) {
+					spin_unlock(&sdio->csd_lock);
+					GOTO(out, rc);
+				}
+				sdio->csd_write_copied = true;
+			}
+			spin_unlock(&sdio->csd_lock);
+		}
 	}
 
 	soft_sync = osc_over_unstable_soft_limit(cli);
-- 
1.8.3.1
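For reference, the spawn-and-wait shape ll_dio_user_copy() uses for the
fsync race, reduced to its skeleton (illustrative names, not Lustre
code).  The context may live on the caller's stack only because the
caller blocks on the completion until the helper is finished with it:

#include <linux/types.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/err.h>

struct sync_work {
	struct completion	done;
	ssize_t			result;
};

static int sync_worker(void *data)
{
	struct sync_work *w = data;

	w->result = 0;			/* the real work would go here */
	complete(&w->done);		/* caller may reuse the stack now */

	return 0;
}

static ssize_t run_in_kthread(void)
{
	struct sync_work w;
	struct task_struct *task;

	init_completion(&w.done);
	task = kthread_run(sync_worker, &w, "sync_worker");
	if (IS_ERR(task))
		return PTR_ERR(task);	/* helper never started */

	wait_for_completion(&w.done);	/* helper owns &w until here */
	return w.result;
}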