Whamcloud - gitweb
LU-10026 osd-ldiskfs: use preallocation for dense writes
authorAlex Zhuravlev <bzzz@whamcloud.com>
Tue, 27 Jun 2023 06:59:31 +0000 (09:59 +0300)
committerAndreas Dilger <adilger@whamcloud.com>
Thu, 28 Sep 2023 08:42:05 +0000 (08:42 +0000)
Use the inode's preallocation chunks as a per-inode group preallocation:
just grab the very first available blocks from the window.

Lustre-change: https://review.whamcloud.com//50171
Lustre-commit: TBD (from 986340bcdfa572a1f6bab34014e0474c89f47691)

Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Change-Id: I9d36701f569f4c6305bc46f3373bfc054fcd61a9
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/51468
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Zhenyu Xu <bobijam@hotmail.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-dense.patch [new file with mode: 0644]
ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series
lustre/include/obd_support.h
lustre/osc/osc_request.c
lustre/osd-ldiskfs/osd_handler.c
lustre/osd-ldiskfs/osd_internal.h
lustre/osd-ldiskfs/osd_io.c
lustre/osd-ldiskfs/osd_lproc.c
lustre/tests/sanity-compr.sh

diff --git a/ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-dense.patch b/ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-dense.patch
new file mode 100644 (file)
index 0000000..e9bd5e6
--- /dev/null
@@ -0,0 +1,78 @@
+--- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/ext4.h
++++ linux-4.18.0-80.1.2.el8_0/fs/ext4/ext4.h
+@@ -151,6 +151,7 @@ enum SHIFT_DIRECTION {
+ #define EXT4_MB_CR0_OPTIMIZED         0x8000
+ /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */
+ #define EXT4_MB_CR1_OPTIMIZED         0x00010000
++#define EXT4_MB_VERY_DENSE            0x00080000
+
+ struct ext4_allocation_request {
+       /* target inode for block we're allocating */
+@@ -627,6 +628,7 @@ enum {
+       /* Caller will submit data before dropping transaction handle. This
+        * allows jbd2 to avoid submitting data before commit. */
+ #define EXT4_GET_BLOCKS_IO_SUBMIT             0x0400
++#define EXT4_GET_BLOCKS_VERY_DENSE            0x0800
+ /*
+  * The bit position of these flags must not overlap with any of the
+--- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/extents.c
++++ linux-4.18.0-80.1.2.el8_0/fs/ext4/extents.c
+@@ -4484,6 +4467,8 @@ int ext4_ext_map_blocks(handle_t *han
+               ar.flags = 0;
+       if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
+               ar.flags |= EXT4_MB_HINT_NOPREALLOC;
++      if (flags & EXT4_GET_BLOCKS_VERY_DENSE)
++              ar.flags |= EXT4_MB_VERY_DENSE;
+       if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+               ar.flags |= EXT4_MB_DELALLOC_RESERVED;
+       if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+--- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/mballoc.c
++++ linux-4.18.0-80.1.2.el8_0/fs/ext4/mballoc.c
+@@ -4267,6 +4291,21 @@ ext4_mb_use_inode_pa(struct ext4
+       ext4_fsblk_t end;
+       int len;
+
++      if (ac->ac_flags & EXT4_MB_VERY_DENSE) {
++              unsigned int len = ac->ac_o_ex.fe_len;
++              ext4_get_group_no_and_offset(ac->ac_sb,
++                                      pa->pa_pstart,
++                                      &ac->ac_b_ex.fe_group,
++                                      &ac->ac_b_ex.fe_start);
++              ac->ac_b_ex.fe_len = len;
++              pa->pa_pstart += len;
++              pa->pa_free -= len;
++              pa->pa_len -= len;
++              ac->ac_status = AC_STATUS_FOUND;
++              ac->ac_pa = pa;
++              return;
++      }
++
+       /* found preallocated blocks, use them */
+       start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
+       end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
+@@ -4367,6 +4380,24 @@ ext4_mb_use_preallocated(struct ext4
+       if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+               return false;
++      if (ac->ac_flags & EXT4_MB_VERY_DENSE) {
++              unsigned int len = ac->ac_o_ex.fe_len;
++              rcu_read_lock();
++              list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
++                      spin_lock(&pa->pa_lock);
++                      if (pa->pa_deleted == 0 && len <= pa->pa_free) {
++                              atomic_inc(&pa->pa_count);
++                              ext4_mb_use_inode_pa(ac, pa);
++                              spin_unlock(&pa->pa_lock);
++                              break;
++                      }
++                      spin_unlock(&pa->pa_lock);
++              }
++              rcu_read_unlock();
++              if (ac->ac_status == AC_STATUS_FOUND)
++                      return true;
++      }
++
+       /* first, try per-file preallocation */
+       rcu_read_lock();
+       list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
index 33a2226..fffffc5 100644 (file)
@@ -38,3 +38,4 @@ rhel8.4/ext4-optimize-find_delayed_extent.patch
 rhel8/ext4-limit-per-inode-preallocation-list.patch
 rhel8/ext4-mballoc-improve.patch
 rhel8/ext4-mballoc-for-hybrid.patch
+rhel8/ext4-mballoc-dense.patch
index 76bca28..9cd87d8 100644 (file)
@@ -437,6 +437,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OSC_DELAY_CANCEL        0x416
 #define OBD_FAIL_OSC_SLOW_PAGE_EVICT    0x417
 #define OBD_FAIL_OSC_WRONG_COMP_ALG     0x418
+#define OBD_FAIL_OSC_MARK_COMPRESSED    0x419
 
 #define OBD_FAIL_PTLRPC                  0x500
 #define OBD_FAIL_PTLRPC_ACK              0x501
index 12af7cf..1f20e23 100644 (file)
@@ -1906,6 +1906,8 @@ no_bulk:
                        niobuf->rnb_len    = pg->count;
                        niobuf->rnb_flags  = pg->flag;
                 }
+               if (CFS_FAIL_CHECK(OBD_FAIL_OSC_MARK_COMPRESSED))
+                       niobuf->rnb_flags |= OBD_BRW_COMPRESSED;
                 pg_prev = pg;
         }
 
index 3d62d12..caa0193 100644 (file)
@@ -8337,6 +8337,7 @@ static int osd_mount(const struct lu_env *env,
                o->od_nonrotational = 1;
                osd_tune_nonrot(o, true);
        }
+       o->od_extents_dense = 1;
 
        GOTO(out, rc = 0);
 
index 72c1d72..b7cf181 100644 (file)
@@ -293,7 +293,8 @@ struct osd_device {
                                  od_writethrough_cache_enable_set:1,
                                  od_nonrotational:1,
                                  od_nonrotational_set:1,
-                                 od_enable_projid_xattr:1;
+                                 od_enable_projid_xattr:1,
+                                 od_extents_dense:1;
 
        __s64                     od_auto_scrub_interval;
        __u32                     od_dirent_journal;
index d6e2a1f..9ba4b10 100644 (file)
@@ -1092,12 +1092,15 @@ static int osd_ldiskfs_map_inode_pages(struct inode *inode,
        sector_t *blocks = iobuf->dr_blocks;
        struct niobuf_local *lnb1, *lnb2;
        loff_t size1, size2;
+       bool compressed;
 
        max_page_index = inode->i_sb->s_maxbytes >> PAGE_SHIFT;
 
        CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n",
                inode->i_ino, pages, (*page)->index);
 
+       compressed = iobuf->dr_lnbs[0]->lnb_flags & OBD_BRW_COMPRESSED;
+
        if (create) {
                create = LDISKFS_GET_BLOCKS_CREATE;
                handle = ldiskfs_journal_current_handle();
@@ -1169,6 +1172,10 @@ cont_map:
                        else
                                oh->oh_declared_ext--;
                }
+#ifdef LDISKFS_GET_BLOCKS_VERY_DENSE
+               if (osd->od_extents_dense && compressed)
+                       create |= LDISKFS_GET_BLOCKS_VERY_DENSE;
+#endif
                rc = ldiskfs_map_blocks(handle, inode, &map, create);
                if (rc >= 0) {
                        int c = 0;
index bfd4cb2..9e89129 100644 (file)
@@ -901,6 +901,35 @@ ssize_t index_backup_store(struct kobject *kobj, struct attribute *attr,
 }
 LUSTRE_RW_ATTR(index_backup);
 
+#ifdef LDISKFS_GET_BLOCKS_VERY_DENSE
+/*
+ * sysfs "extents_dense" show handler.
+ *
+ * Prints the current value of the OSD device's od_extents_dense flag,
+ * followed by a newline, into @buf.
+ *
+ * Returns the number of characters written to @buf.
+ */
+static ssize_t extents_dense_show(struct kobject *kobj, struct attribute *attr,
+                                 char *buf)
+{
+       struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj);
+       struct osd_device *osd = osd_dt_dev(dt);
+
+       /* bound the write explicitly instead of relying on the short output */
+       return scnprintf(buf, PAGE_SIZE, "%d\n", osd->od_extents_dense);
+}
+
+/*
+ * sysfs "extents_dense" store handler.
+ *
+ * Parses @buffer as a boolean with kstrtobool() and stores the result in
+ * the OSD device's od_extents_dense flag.
+ *
+ * Returns @count on success.  If parsing fails, the non-zero kstrtobool()
+ * result is returned and the flag is left unchanged.
+ */
+static ssize_t extents_dense_store(struct kobject *kobj, struct attribute *attr,
+                                  const char *buffer, size_t count)
+{
+       struct dt_device *dt_dev = container_of(kobj, struct dt_device, dd_kobj);
+       struct osd_device *osd_dev = osd_dt_dev(dt_dev);
+       bool enable;
+       int err = kstrtobool(buffer, &enable);
+
+       if (err)
+               return err;
+
+       osd_dev->od_extents_dense = enable;
+       return count;
+}
+LUSTRE_RW_ATTR(extents_dense);
+#endif
+
 struct ldebugfs_vars ldebugfs_osd_obd_vars[] = {
        { .name =       "oi_scrub",
          .fops =       &ldiskfs_osd_oi_scrub_fops      },
@@ -928,6 +957,9 @@ static struct attribute *ldiskfs_attrs[] = {
        &lustre_attr_full_scrub_ratio.attr,
        &lustre_attr_full_scrub_threshold_rate.attr,
        &lustre_attr_extent_bytes_allocation.attr,
+#ifdef LDISKFS_GET_BLOCKS_VERY_DENSE
+       &lustre_attr_extents_dense.attr,
+#endif
        NULL,
 };
 
index 2c654b5..8ee26c7 100644 (file)
@@ -59,6 +59,84 @@ test_sanityn()
 }
 run_test sanityn "Run sanityn with PFL layout"
 
+test_1000() {
+       local blocks=128
+       local dense=$(do_facet ost1 lctl get_param -n \
+                             osd*.*OST0000*.extents_dense)
+       [[ -n $dense ]] || skip "no dense writes supported"
+
+       local osts=$(comma_list $(osts_nodes))
+       do_nodes $osts $LCTL set_param osd*.*.extents_dense=0 ||
+               error "cannot enable dense extent allocation"
+       stack_trap "do_nodes $osts $LCTL set_param osd*.*.extents_dense=$dense"
+
+       local tf=$DIR/$tfile
+       stack_trap "rm -f $tf"
+       log "create file with dense=0"
+
+       $LFS setstripe -c 1 -i 0 $tf
+       for ((i=0; i<$blocks; i++)); do
+               dd if=/dev/zero of=$tf bs=32k seek=$((i*2)) count=1 \
+                       oflag=direct >&/dev/null conv=notrunc ||
+                               error "can't dd (sparse)"
+       done
+       filefrag -sv $tf
+       local nonr=0
+       while read EX LS LE PS PE LEN DEV FLAGS; do
+               [[ "$EX" == "ext:" || "$EX" =~ "File" ]] && continue
+               [[ "$EX" == "0:" ]] && PREV=${PE%:} && ((nonr+=1)) && continue
+               (( ${PS%%.*} == PREV + 1 )) || ((nonr+=1))
+               PREV=${PE%:}
+       done < <(filefrag -v $tf)
+       (( nonr > 0 )) || error "no extents?"
+       rm -f $tf
+       wait_delete_completed
+
+       do_nodes $osts $LCTL set_param osd*.*.extents_dense=1 ||
+               error "cannot enable dense extent allocation"
+       #define OBD_FAIL_OSC_MARK_COMPRESSED    0x419
+       $LCTL set_param fail_loc=0x419
+       log "create file with dense=1"
+
+       $LFS setstripe -c 1 -i 0 $tf
+       for ((i=0; i<$blocks; i++)); do
+               dd if=/dev/zero of=$tf bs=32k seek=$((i*2)) count=1 \
+                       oflag=direct conv=notrunc >&/dev/null ||
+                               error "can't dd (dense)"
+       done
+       filefrag -sv $tf
+       local nr=0
+       while read EX LS LE PS PE LEN DEV FLAGS; do
+               [[ "$EX" == "ext:" || "$EX" =~ "File" ]] && continue
+               [[ "$EX" == "0:" ]] && PREV=${PE%:} && ((nr+=1)) && continue
+               (( ${PS%%.*} == PREV + 1 )) || ((nr+=1))
+               PREV=${PE%:}
+       done < <(filefrag -v $tf)
+       (( nr > 0 )) || error "no extents?"
+
+       echo "dense ($nr) should have fewer extents ($nonr)"
+       (( (nonr / nr) > 3 )) ||
+               error "dense ($nr) should have less extents ($nonr)"
+       $LCTL set_param fail_loc=0
+
+       local tmpfile=$(mktemp)
+       stack_trap "rm -f $tmpfile"
+       echo "generate temp file $tmpfile"
+       dd if=/dev/urandom of=$tmpfile bs=32k count=$((blocks*2)) iflag=fullblock ||
+               error "can't generate temporary file"
+       dd if=$tmpfile of=$tf bs=32k conv=notrunc
+       cancel_lru_locks osc
+
+       stop ost1 || error "(2) Fail to stop ost1"
+       run_e2fsck $(facet_host ost1) $(ostdevname 1) "-y" ||
+               error "(3) Fail to run e2fsck error"
+       start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
+               error "(4) Fail to start ost1"
+
+       cmp $tmpfile $tf || error "data mismatch"
+}
+run_test 1000 "compressed vs uncompressed allocation"
+
 complete_test $SECONDS
 check_and_cleanup_lustre
 declare -a logs=($ONLY)