From 2cf3b1e972b1d5e37478f1739b3dfde381b6bfc4 Mon Sep 17 00:00:00 2001
From: Patrick Farrell <pfarrell@whamcloud.com>
Date: Thu, 2 Nov 2023 17:23:01 -0400
Subject: [PATCH] EX-7601 tgt: reorder tgt_brw_write decls

Reorder the declarations in tgt_brw_write.

This patch also serves as the series head for implementing
read-modify-write support for compressed chunks.

The process for read-modify-write is similar to that used
for unaligned reads.

At a high level, read-modify-write means we must read up,
decompress, then recompress and write back the data.  This
only applies when we're actually doing read-modify-write.

To know when to do this, we rely partly on the client.  If
the client is able to compress a chunk, either because it is
a complete chunk, or because the start is chunk aligned and
the write is past EOF, we know there is no read-modify-write
required.  Either there is no existing data (write past EOF)
or the data will be fully replaced.

So, when we see a write which is not fully chunk aligned and
not already compressed, we will do a read-modify-write.

For this, we round the IO lnbs and associated locking to
cover complete chunks, then we do a read of the unaligned
chunks.

i.e., if we have a write which goes from 63 KiB to 257 KiB
with a chunk size of 64 KiB, we will read 0-64 KiB and
256-320 KiB, and decompress those chunks into the buffer.
64 KiB to 256 KiB is *NOT* read, because those are complete
chunks.

We then set up a transfer mapping - identical to the process
for unaligned reads - so the client data is written into
the correct lnbs.

Now we have a set of chunk aligned lnbs which contain data
updated with the client write.  In the initial version, we
write these to disk uncompressed.  This is sufficient for
correct operation, but it does mean a read-modify-write leaves
those chunks uncompressed on disk.

There is code for recompression, but it is not working 100%
yet, and there are some complexities around managing holes
and EOF which still need to be resolved.

TBD if this will make our initial release - I am hopeful but
not sure yet.

Test-Parameters: trivial
Signed-off-by: Patrick Farrell <pfarrell@whamcloud.com>
Change-Id: Ia24583d4221f498928e99afa8c289b70e4d25f5b
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/52959
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Artem Blagodarenko <ablagodarenko@ddn.com>
---
 lustre/obdclass/lustre_compr.c | 107 ++++++++++++++++++++++++-----------------
 lustre/target/tgt_handler.c    |  27 ++++++-----
 2 files changed, 80 insertions(+), 54 deletions(-)

diff --git a/lustre/obdclass/lustre_compr.c b/lustre/obdclass/lustre_compr.c
index bab0322..0fe598f 100644
--- a/lustre/obdclass/lustre_compr.c
+++ b/lustre/obdclass/lustre_compr.c
@@ -26,58 +26,79 @@
  */
 
 /*
- * When using compression, the client attempts to send chunk
- * aligned reads, but sometimes it can't, and the client will
- * send a read to the server which is not chunk aligned.
+ * Whenever possible, the client handles compression and decompression, but
+ * there are two cases where the server must assist.
  *
- * In this case, the server must read the full chunk,
- * decompress it, and provide the requested data to the client.
+ * 1. Unaligned reads.  The client attempts to always send chunk aligned reads,
+ * but sometimes it must send an unaligned read to the server.
+ * 2. Writes into existing compressed chunks.  If a write does not cover an
+ * entire compressed chunk, we must do a chunk level read-modify-write to
+ * complete the write.  We do this on the server.
  *
- * The server receives a set of remote niobufs describing IO
- * from the client.  Each remote niobuf (rnb) describes a range
- * of data the client wants to do IO to.
+ * Both of these are types of unaligned IO, in that the IO doesn't match up
+ * 100% to compression chunks.
  *
- * These are translated to a set of local niobufs on the
- * server, which we then use to do the read.  For compression,
- * the server has to read complete chunks on unalinged reads.
+ * In both cases, the server must read the necessary chunks from disk,
+ * decompress them, then do the transfer (either to the client for reads or
+ * from the client on writes).  In the case of writes, the server must then
+ * write the complete chunk to disk.  (The server will eventually recompress
+ * the data, but this isn't finished yet.)
  *
- * So we walk these remote niobufs and identify unaligned read
- * requests (in ofd_preprw_read), then round them to chunk
- * size. The server then reads the chunk rounded read request
- * from storage.
+ * The server receives a set of remote niobufs describing the IO from the
+ * client.  Each remote niobuf (rnb) describes a range of data the client
+ * wants to do IO to.
  *
- * The local niobufs now contain a set of complete compressed
- * chunks, ie, the raw data from disk.  We need to decompress
- * the chunks where the client is doing an unaligned read, but
- * leave the other chunks compressed (because the client will
- * uncompress them).
+ * These are translated to a set of local niobufs on the server, which are used
+ * to do the server side IO.  With compression, we must always read or write
+ * complete chunks.
  *
- * So, in obd_decompress_read, we use the remote niobuf to
- * identify unaligned reads from the client.  We then walk the
- * local niobufs, identify the chunks which match the unaligned
- * reads from the client, and decompress them 'in place'.
- * The decompression uses temporary buffers, but the
- * decompressed data is placed back in the local niobuf.
- * (If the data is uncompressed on disk, we of course do not
- * decompress it.  This happens for incompressible data.)
+ * So we walk these remote niobufs and identify unaligned IO requests (in
+ * ofd_preprw_read/write), then round them to chunk size. The server then
+ * reads the necessary data from storage - for reads, this is the entire range;
+ * for writes, this is just the chunks which have unaligned IO.
  *
- * Now the local niobuf contains some raw chunks and some
- * chunks which have been decompressed.  This is *more* data
- * than the client asked for.  Normally, the server local
- * niobuf contains exactly what the client asked for, so the
- * server checksums and sends the entire local niobuf.  But
- * because we read complete chunks, the local niobuf contains
- * more data than the client requested.
+ * The local niobufs now contain a set of complete chunks with the raw data
+ * from disk.  We need to decompress the chunks for unaligned IO, but leave the
+ * other chunks unmodified.  (For writes, those chunks were not read from disk;
+ * for reads, they will be decompressed by the client.)
  *
- * This means we need to identify the subset of the local
- * niobuf which the client actually wants to read and present
- * that to the client.
+ * So, in obd_compression, we use the remote niobufs to identify unaligned
+ * accesses from the client.  We then walk the local niobufs, identify the
+ * chunks which match the unaligned IO from the client, and decompress them
+ * 'in place'.
  *
- * In order to do that, we walk the local niobuf and use the
- * remote niobufs (the description of the pages the client
- * needs) and create a special tx niobuf which points to only
- * the pages the client wants (io_lnb_to_tx_lnb).  Then we use
- * this tx niobuf for checksum and transfer to the client.
+ * The decompression uses temporary buffers, but the decompressed data is
+ * placed back in the local niobuf.  (If the data is uncompressed on disk, we
+ * of course do not decompress it.  This happens for incompressible data.)
+ *
+ * Now the local niobuf is ready for transfer - either to be sent to the client
+ * or to be updated by data from the client.  For reads, the aligned portion of
+ * the IO contains raw data from disk for the client to decompress.  For
+ * writes, the aligned portion is empty (the client will place data there).
+ * For both reads and writes, the portions of the niobuf which correspond to
+ * unaligned IO contain decompressed data.
+ *
+ * However, the local niobuf does not match the range requested by the client:
+ * because of chunk rounding, it's larger than the client asked for.  Normally
+ * the local niobuf contains exactly what was asked for, so we checksum and
+ * transfer the whole thing.  In this case, we can't.
+ *
+ * This means we need to identify the subset of the local niobuf where the
+ * client transfer (read from or write to) will occur and present that to the
+ * client.
+ *
+ * In order to do that, we walk the local niobuf and use the remote niobufs
+ * (the description of the pages the client needs) and create a special tx
+ * niobuf which points to only the pages the client wants (io_lnb_to_tx_lnb).
+ * Then we use this tx niobuf for checksum and transfer to/from the client.
+ *
+ * For reads, we're done.  For writes, we then write all of the data out to
+ * disk, including complete chunks for the unaligned areas.
+ *
+ * In the initial version, we write this to disk uncompressed.  This is
+ * sufficient for correctness, but not ideal since it decompresses those areas
+ * of the file.  The code for re-compression is not working 100% yet.  This
+ * will be updated when that code is in and working.
  */
 
 #define DEBUG_SUBSYSTEM S_SEC
diff --git a/lustre/target/tgt_handler.c b/lustre/target/tgt_handler.c
index 76d5e2b..fdf34e0 100644
--- a/lustre/target/tgt_handler.c
+++ b/lustre/target/tgt_handler.c
@@ -2802,29 +2802,34 @@ static void tgt_warn_on_cksum(struct ptlrpc_request *req,
 int tgt_brw_write(struct tgt_session_info *tsi)
 {
 	struct ptlrpc_request	*req = tgt_ses_req(tsi);
-	struct ptlrpc_bulk_desc	*desc = NULL;
+	struct tgt_thread_big_cache *tbc = req->rq_svc_thread->t_data;
 	struct obd_export	*exp = req->rq_export;
+	struct ptlrpc_bulk_desc	*desc = NULL;
+	struct lustre_handle	 lockh = {0};
 	struct niobuf_remote	*remote_nb;
 	struct niobuf_local	*local_nb;
+	struct ost_body		*repbody;
+	struct ost_body		*body;
 	struct obd_ioobj	*ioo;
-	struct ost_body		*body, *repbody;
-	struct lustre_handle	 lockh = {0};
-	__u32			*rcs;
-	int			 objcount, niocount, npages;
-	int			 rc = 0;
-	int			 i, j;
-	enum cksum_types cksum_type = OBD_CKSUM_CRC32;
-	bool			 no_reply = false, mmap;
-	struct tgt_thread_big_cache *tbc = req->rq_svc_thread->t_data;
-	bool wait_sync = false;
 	const char *obd_name = exp->exp_obd->obd_name;
+	enum cksum_types cksum_type = OBD_CKSUM_CRC32;
 	/* '1' for consistency with code that checks !mpflag to restore */
 	unsigned int mpflags = 1;
 	struct ost_layout_compr *olc;
 	enum ll_compr_type type;
+	bool wait_sync = false;
+	bool no_reply = false;
 	int chunk_size = 0;
 	ktime_t kstart;
+	int objcount;
+	int niocount;
 	int nob = 0;
+	int npages;
+	int rc = 0;
+	__u32 *rcs;
+	bool mmap;
+	int i;
+	int j;
 
 	ENTRY;
 
-- 
1.8.3.1