EX-8993 ofd: do not write 'hole' pages on compression
author	Patrick Farrell <pfarrell@whamcloud.com>
	Tue, 13 Feb 2024 16:45:17 +0000 (11:45 -0500)
committer	Andreas Dilger <adilger@whamcloud.com>
	Sat, 24 Feb 2024 03:44:50 +0000 (03:44 +0000)
When doing an unaligned read-modify-write to a compressed
file, the IO region covered by the write lnb must be rounded
out to chunk boundaries so the existing compressed data can
be read up for modification.
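
For example (a sketch using the chunk_round() helper declared
in lustre/include/obd.h below, and assuming it rounds the
start down and the end up to chunk boundaries), a 16K write
at offset 32K in a file with 128K chunks is expanded to the
whole containing chunk:

	__u64 start = 32 * 1024;	/* 32768 */
	__u64 end = 48 * 1024;		/* 49152, exclusive */

	/* expand [32K, 48K) to its containing chunk: [0, 128K) */
	chunk_round(&start, &end, 128 * 1024);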

In some cases this rounding leaves pages in the write lnb
which contain no data.  These pages must not be written out:
doing so wastes space and can result in an incorrect file
size.
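
The fix is to flag such pages instead of writing them,
roughly (simplified from the ofd_compress.c hunk below):

	/* the read-up put no data in this page: it is a hole */
	if (write && lnbs[i].lnb_rc == 0)
		lnbs[i].lnb_hole = true;	/* skip at commit */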

In most cases the file size is protected by the client
sending it with the write, but if the client does not
compress a particular write, it does not send the size and
the server does not use it.  We could resolve this by having
the client always send the size and the server always use
it, but it is better to make server writes 'hole' aware,
since that also improves space usage.  (Hole awareness will
also be required for the server to do recompression on
read-modify-write; otherwise no space is gained.)

Signed-off-by: Patrick Farrell <pfarrell@whamcloud.com>
Change-Id: I66169e205fe4691ed03b2c9b3005ffc4ecd3213d
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/53595
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Artem Blagodarenko <ablagodarenko@ddn.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lustre/include/obd.h
lustre/ofd/ofd_compress.c
lustre/ofd/ofd_io.c
lustre/osd-ldiskfs/osd_io.c
lustre/target/tgt_handler.c
lustre/tests/sanity-compr.sh

diff --git a/lustre/include/obd.h b/lustre/include/obd.h
index 8031bf8..9b80c01 100644
@@ -477,6 +477,10 @@ struct niobuf_local {
        __u16           lnb_guard_disk:1;
        /* separate unlock for read path to allow shared access */
        __u16           lnb_locked:1;
+       /* this lnb represents a hole in the file and should not be written out
+        * or transferred to the client
+        */
+       __u16           lnb_hole:1;
 };
 
 void chunk_round(__u64 *buf_start, __u64 *buf_end, int chunk_size);
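
The new lnb_hole bit travels with the niobuf_local, so any
layer that walks the lnb array can test it.  A minimal
consumer sketch (the real checks are in the osd-ldiskfs and
target hunks below):

	/* neither allocate blocks nor submit IO for hole pages */
	if (lnb[i].lnb_hole)
		continue;
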
diff --git a/lustre/ofd/ofd_compress.c b/lustre/ofd/ofd_compress.c
index d5200a9..260207d 100644
@@ -37,7 +37,7 @@ static int decompress_chunk_in_lnb(const char *obd_name, struct lu_fid *fid,
                                   struct niobuf_local *lnbs, int lnb_start,
                                   void **bounce_src, void **bounce_dst,
                                   enum ll_compr_type type, int lvl,
-                                  int chunk_size)
+                                  int chunk_size, bool write)
 {
        struct ll_compr_hdr *llch = NULL;
        struct crypto_comp *cc = NULL;
@@ -148,11 +148,15 @@ out:
         */
        for(i = lnb_start; i < lnb_start + pages_in_chunk; i++) {
                /* if there's no data in this page, we must clear it */
-               if (lnbs[i].lnb_rc == 0) {
+               if (!write && lnbs[i].lnb_rc == 0) {
                        CDEBUG(D_SEC, "%s: no data, clearing: page  %d, "DFID" at %llu\n",
                               obd_name, i, PFID(fid), lnbs[i].lnb_file_offset);
                        memset(kmap(lnbs[i].lnb_page), 0, PAGE_SIZE);
                        kunmap(lnbs[i].lnb_page);
+               } else if (write && lnbs[i].lnb_rc == 0) {
+                       lnbs[i].lnb_hole = true;
+                       CDEBUG(D_SEC, "no data in page %d at %llu, marking hole\n",
+                              i, lnbs[i].lnb_file_offset);
                }
        }
        if (cc)
@@ -216,7 +220,8 @@ int decompress_rnb(const char *obd_name, struct lu_fid *fid,
                        GOTO(out, rc = -EINVAL);
 
                rc = decompress_chunk_in_lnb(obd_name, fid, lnbs, i, bounce_src,
-                                            bounce_dst, type, lvl, chunk_size);
+                                            bounce_dst, type, lvl, chunk_size,
+                                            write);
                if (rc)
                        GOTO(out, rc);
                i += pages_per_chunk;
@@ -255,7 +260,8 @@ int decompress_rnb(const char *obd_name, struct lu_fid *fid,
                        GOTO(out, rc = -EINVAL);
 
                rc = decompress_chunk_in_lnb(obd_name, fid, lnbs, i, bounce_src,
-                                            bounce_dst, type, lvl, chunk_size);
+                                            bounce_dst, type, lvl, chunk_size,
+                                            write);
                if (rc)
                        GOTO(out, rc);
                i += pages_per_chunk;
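
Note the read/write split above: on the read path an empty
page must be zeroed, because its contents will be transferred
to the client, while on the write path the page is only
flagged so the OSD can skip it at commit time.
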
diff --git a/lustre/ofd/ofd_io.c b/lustre/ofd/ofd_io.c
index 0e426a4..33321be 100644
@@ -723,8 +723,9 @@ static int ofd_preprw_read(const struct lu_env *env, struct obd_export *exp,
                                     */
                                    (buf_start == orig_start && orig_end >= la->la_size)) {
                                        CDEBUG(D_SEC,
-                                              "rnb %d from %llu to %llu (chunk rounded: %llu to %llu) is past eof\n",
-                                              i, orig_start, orig_end, buf_start, buf_end);
+                                              "rnb %d from %llu to %llu (chunk rounded: %llu to %llu) is past eof (%llu)\n",
+                                              i, orig_start, orig_end,
+                                              buf_start, buf_end, la->la_size);
                                        buf_start = orig_start;
                                        buf_end = orig_end;
                                        eof_rnb = i;
@@ -1166,6 +1167,7 @@ static int ofd_preprw_write(const struct lu_env *env, struct obd_export *exp,
                                CDEBUG(D_SEC, "read_lnb %d is write_lnb %d (offset %llu), lnb_rc %d\n",
                                       i, j, write_lnb[j].lnb_file_offset, read_lnb[i].lnb_rc);
                                write_lnb[j] = read_lnb[i];
+                               CDEBUG(D_SEC, "write lnb %d at %llu hole %d\n", j, write_lnb[j].lnb_file_offset, write_lnb[j].lnb_hole);
                                break;
                        }
                }
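
Since "write_lnb[j] = read_lnb[i]" is a struct copy, the
lnb_hole flag set during decompression is carried into the
write lnb, which is what the added debug message verifies.
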
diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c
index 837ab29..166f641 100644
@@ -690,6 +690,7 @@ static int osd_init_lnbs(loff_t offset, ssize_t len, int *nrpages,
                lnb->lnb_guard_rpc = 0;
                lnb->lnb_guard_disk = 0;
                lnb->lnb_locked = 0;
+               lnb->lnb_hole = 0;
 
                LASSERTF(plen <= len, "plen %u, len %lld\n", plen,
                         (long long) len);
@@ -1656,9 +1657,15 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
                        lnb[i].lnb_rc = 0;
                }
 
-               if (lnb[i].lnb_rc) { /* ENOSPC, network RPC error, etc. */
-                       CDEBUG(D_INODE, "Skipping [%d] == %d\n", i,
-                              lnb[i].lnb_rc);
+               /* ENOSPC, network RPC error, hole, etc. */
+               if (lnb[i].lnb_rc || lnb[i].lnb_hole) {
+                       CDEBUG(D_INODE, "Skipping [%d] at %llu == %d%s\n", i,
+                              lnb[i].lnb_file_offset, lnb[i].lnb_rc,
+                              /* lnb_hole indicates this is a hole due to
+                               * compression rounding, so we do not write this
+                               * page to disk
+                               */
+                              lnb[i].lnb_hole ? " because no data in page" : "");
                        LASSERT(lnb[i].lnb_page);
                        generic_error_remove_page(inode->i_mapping,
                                                  lnb[i].lnb_page);
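
Hole pages thus take the same skip path as pages that failed
with an error (e.g. ENOSPC): the page is dropped from the
page cache with generic_error_remove_page() and never
submitted, so it can neither consume blocks nor wrongly
extend the file size.
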
diff --git a/lustre/target/tgt_handler.c b/lustre/target/tgt_handler.c
index 4b70cce..69dc7b0 100644
@@ -2372,6 +2372,17 @@ int io_lnb_to_tx_lnb(struct niobuf_local *io_lnb, struct niobuf_local *tx_lnb,
                CDEBUG(D_SEC, "nio %d npages_remote %d, lnb_index %d\n", i,
                       npages_remote, lnb_index);
                for (j = 0; j < npages_remote; j++) {
+                       /* we are transferring to this page, so if it had no
+                        * data before (if it was a hole), it will now have data
+                        * (this only occurs on writes)
+                        */
+                       if (io_lnb[lnb_index + j].lnb_hole == true) {
+                               CDEBUG(D_SEC,
+                                      "write to hole, so clearing hole in %d at %llu\n",
+                                      lnb_index + j,
+                                      io_lnb[lnb_index + j].lnb_file_offset);
+                               io_lnb[lnb_index + j].lnb_hole = false;
+                       }
                        /* point the tx_ln member at the io_lnb member */
                        tx_lnb[tx_lnb_pg] = io_lnb[lnb_index + j];
 
diff --git a/lustre/tests/sanity-compr.sh b/lustre/tests/sanity-compr.sh
index 36d9823..d6ced48 100644
@@ -701,6 +701,7 @@ test_1005() {
                skip "Need OST >= 2.14.0-ddn127-8-g46708e4636 for partial compr"
 
        local tf=$DIR/$tfile
+       local tf_copy=$TMP/$tfile.copy
        # We read from $tfile to this
        # Larger than arm page size
        local chunksize=128
@@ -744,22 +745,23 @@ test_1005() {
        # chunk, just 4K at the beginning (this won't be compressed, but it's
        # still a valid test)
        dd if=$source bs=4K of=$tf count=1 || error "(0) dd failed"
-       dd if=$source bs=4K of=$tf.2 count=1 || error "(1) dd failed"
+       dd if=$source bs=4K of=$tf_copy count=1 || error "(1) dd failed"
 
-       flush_and_compare $tf $tf.2 "(2)"
+       flush_and_compare $tf $tf_copy "(2)"
 
        # Now we write adjacent in this first chunk, confirming updating it works
+       # Write 4K twice starting at an offset of 4K
        dd if=$source bs=4K of=$tf conv=notrunc count=2 seek=1 skip=1 || error "(3) dd failed"
-       dd if=$source bs=4K of=$tf.2 conv=notrunc count=2 seek=1 skip=1 || error "(4) dd failed"
+       dd if=$source bs=4K of=$tf_copy conv=notrunc count=2 seek=1 skip=1 || error "(4) dd failed"
 
-       flush_and_compare $tf $tf.2 "(5)"
+       flush_and_compare $tf $tf_copy "(5)"
 
        # Now we jump a bit further on and do a write
+       # Write 8K at an offset of 32K
        dd if=$source bs=8K of=$tf conv=notrunc count=1 seek=4 skip=4 || error "(6) dd failed"
-       dd if=$source bs=8K of=$tf.2 conv=notrunc count=1 seek=4 skip=4 || error "(7) dd failed"
-       sync
+       dd if=$source bs=8K of=$tf_copy conv=notrunc count=1 seek=4 skip=4 || error "(7) dd failed"
 
-       flush_and_compare $tf $tf.2 "(8)"
+       flush_and_compare $tf $tf_copy "(8)"
 
        # None of the above was compressed, because the first write was too
        # small and the later writes weren't chunk aligned.  Confirm the server
@@ -768,89 +770,98 @@ test_1005() {
 
        # OK, now we truncate the file back to zero and start with a compressed
        # chunk at the beginning.
-       # 16K - this will be compressed
+       # Write 16K at 0 - this will be compressed
        dd if=$source bs=16K of=$tf count=1 || error "(9) dd failed"
-       dd if=$source bs=16K of=$tf.2 count=1 || error "(10) dd failed"
+       dd if=$source bs=16K of=$tf_copy count=1 || error "(10) dd failed"
 
-       flush_and_compare $tf $tf.2 "(11)"
+       flush_and_compare $tf $tf_copy "(11)"
 
        # Now we do the same size of write slightly further down
-       # 16K, in to an existing chunk - leads to read-modify-write
+       # 16K into an existing chunk - read-modify-write
+       # Seek to 32K and write 16K
        dd if=$source bs=16K of=$tf count=1 seek=2 skip=2 conv=notrunc || error "(12) dd failed"
-       dd if=$source bs=16K of=$tf.2 count=1 seek=2 skip=2 conv=notrunc || error "(13) dd failed"
+       dd if=$source bs=16K of=$tf_copy count=1 seek=2 skip=2 conv=notrunc || error "(13) dd failed"
 
-       flush_and_compare $tf $tf.2 "(14)"
+       flush_and_compare $tf $tf_copy "(14)"
 
        # OK, now we're going to create a complete compressed chunk further
        # along in the file
+       # Seek to 128K and write 128K
        dd if=$source bs=128K count=1 skip=1 seek=1 conv=notrunc of=$tf ||
                error "(15) dd failed"
-       dd if=$source bs=128K count=1 skip=1 seek=1 conv=notrunc of=$tf.2 ||
+       dd if=$source bs=128K count=1 skip=1 seek=1 conv=notrunc of=$tf_copy ||
                error "(16) dd failed"
 
-       flush_and_compare $tf $tf.2 "(17)"
+       flush_and_compare $tf $tf_copy "(17)"
 
        # Now we're going to write *into* the middle of this compressed chunk
+       # Seek to 160K and write 32K
        dd if=$source bs=32K count=1 skip=5 seek=5 conv=notrunc of=$tf ||
                error "(18) dd failed"
-       dd if=$source bs=32K count=1 skip=5 seek=5 conv=notrunc of=$tf.2 ||
+       dd if=$source bs=32K count=1 skip=5 seek=5 conv=notrunc of=$tf_copy ||
                error "(19) dd failed"
 
-       flush_and_compare $tf $tf.2 "(20)"
+       flush_and_compare $tf $tf_copy "(20)"
 
        # OK, now we're going to add another incomplete chunk after those
        # two - starting in the third chunk, ie, offset of 256K
        # This will be compressed
+       # Seek to 256K and write 64K
        dd if=$source bs=64K count=1 skip=4 seek=4 conv=notrunc of=$tf ||
                error "(21) dd failed"
-       dd if=$source bs=64K count=1 skip=4 seek=4 conv=notrunc of=$tf.2 ||
+       dd if=$source bs=64K count=1 skip=4 seek=4 conv=notrunc of=$tf_copy ||
                error "(22) dd failed"
 
-       flush_and_compare $tf $tf.2 "(23)"
+       flush_and_compare $tf $tf_copy "(23)"
 
        # Now we're going to write the second 'half' of this chunk
+       # Seek to 320K and write 64K
        dd if=$source bs=64K count=1 skip=5 seek=5 conv=notrunc of=$tf ||
                error "(24) dd failed"
-       dd if=$source bs=64K count=1 skip=5 seek=5 conv=notrunc of=$tf.2 ||
+       dd if=$source bs=64K count=1 skip=5 seek=5 conv=notrunc of=$tf_copy ||
                error "(25) dd failed"
 
-       flush_and_compare $tf $tf.2 "(26)"
+       flush_and_compare $tf $tf_copy "(26)"
 
        # And now we're going to add a chunk that doesn't start at offset 0,
        # so it won't be compressed.  This is the fourth chunk, so starts at
        # 384K.  So we'll start at 448K
+       # Seek to 448K and write 64K
        dd if=$source bs=64K count=1 skip=7 seek=7 conv=notrunc of=$tf ||
                error "(27) dd failed"
-       dd if=$source bs=64K count=1 skip=7 seek=7 conv=notrunc of=$tf.2 ||
+       dd if=$source bs=64K count=1 skip=7 seek=7 conv=notrunc of=$tf_copy ||
                error "(28) dd failed"
 
-       flush_and_compare $tf $tf.2 "(29)"
+       flush_and_compare $tf $tf_copy "(29)"
 
        # Now we'll write a few KiB in to the middle of the first part of this
        # chunk - starting at 400K
+       # Seek to 400K and write 16K
        dd if=$source bs=8K count=2 skip=50 seek=50 conv=notrunc of=$tf ||
                error "(30) dd failed"
-       dd if=$source bs=8K count=2 skip=50 seek=50 conv=notrunc of=$tf.2 ||
+       dd if=$source bs=8K count=2 skip=50 seek=50 conv=notrunc of=$tf_copy ||
                error "(31) dd failed"
 
-       flush_and_compare $tf $tf.2 "(32)"
+       flush_and_compare $tf $tf_copy "(32)"
 
        # Now let's skip an entire chunk and do a full chunk at 640K
+       # Seek to 640K and write 128K
        dd if=$source bs=128K count=1 skip=5 seek=5 conv=notrunc of=$tf ||
                error "(33) dd failed"
-       dd if=$source bs=128K count=1 skip=5 seek=5 conv=notrunc of=$tf.2 ||
+       dd if=$source bs=128K count=1 skip=5 seek=5 conv=notrunc of=$tf_copy ||
                error "(34) dd failed"
 
-       flush_and_compare $tf $tf.2 "(35)"
+       flush_and_compare $tf $tf_copy "(35)"
 
        # Then one more partial, but compressible, chunk after that at
        # 768K
+       # Seek to 768K and write 64K
        dd if=$source bs=64K count=1 skip=12 seek=12 conv=notrunc of=$tf ||
                error "(36) dd failed"
-       dd if=$source bs=64K count=1 skip=12 seek=12 conv=notrunc of=$tf.2 ||
+       dd if=$source bs=64K count=1 skip=12 seek=12 conv=notrunc of=$tf_copy ||
                error "(37) dd failed"
 
-       flush_and_compare $tf $tf.2 "(38)"
+       flush_and_compare $tf $tf_copy "(38)"
 
        # OK, now we have this complex file, let's test reading it at a
        # number of block sizes to give us unusual offsets