Whamcloud - gitweb
LU-10026 osd-ldiskfs: use preallocation for dense writes 71/50171/48
author Alex Zhuravlev <bzzz@whamcloud.com>
Wed, 1 Mar 2023 18:28:25 +0000 (21:28 +0300)
committer Oleg Drokin <green@whamcloud.com>
Mon, 4 Mar 2024 20:01:44 +0000 (20:01 +0000)
Use the inode's preallocation chunks as a per-inode group preallocation:
just grab the very first available blocks from the window.

Test-Parameters: env=ONLY=1000,ONLY_REPEAT=11 testlist=sanity-compr
Test-Parameters: env=ONLY=fsx,ONLY_REPEAT=11 testlist=sanity-compr
Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Change-Id: I9d36701f569f4c6305bc46f3373bfc054fcd61a9
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/50171
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Artem Blagodarenko <ablagodarenko@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-dense.patch [new file with mode: 0644]
ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series
lustre/include/obd_support.h
lustre/osc/osc_request.c
lustre/osd-ldiskfs/osd_internal.h
lustre/osd-ldiskfs/osd_io.c
lustre/osd-ldiskfs/osd_lproc.c
lustre/tests/sanity-benchmark.sh
lustre/tests/sanity-compr.sh

diff --git a/ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-dense.patch b/ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-dense.patch
new file mode 100644 (file)
index 0000000..a3d17ff
--- /dev/null
@@ -0,0 +1,127 @@
+--- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/mballoc.h
++++ linux-4.18.0-80.1.2.el8_0/fs/ext4/mballoc.h
+@@ -131,6 +131,8 @@ enum SHIFT_DIRECTION {
+       ext4_lblk_t             pa_lstart;      /* log. block */
+       ext4_grpblk_t           pa_len;         /* len of preallocated chunk */
+       ext4_grpblk_t           pa_free;        /* how many blocks are free */
++      ext4_grpblk_t           pa_group;
++      unsigned short          pa_regular;
+       unsigned short          pa_type;        /* pa type. inode or group */
+       unsigned short          pa_error;
+       spinlock_t              *pa_obj_lock;
+@@ -167,7 +167,7 @@ struct ext4_allocation_request {
+       __u16 ac_found;
+       __u16 ac_tail;
+       __u16 ac_buddy;
+-      __u16 ac_flags;         /* allocation hints */
++      __u32 ac_flags;         /* allocation hints */
+       __u8 ac_status;
+       __u8 ac_criteria;
+       __u8 ac_2order;         /* if request is to allocate 2^N blocks and
+--- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/ext4.h
++++ linux-4.18.0-80.1.2.el8_0/fs/ext4/ext4.h
+@@ -151,6 +151,7 @@ enum SHIFT_DIRECTION {
+ #define EXT4_MB_USE_RESERVED          0x2000
+ /* Do strict check for free blocks while retrying block allocation */
+ #define EXT4_MB_STRICT_CHECK          0x4000
++#define EXT4_MB_VERY_DENSE            0x80000
+
+ struct ext4_allocation_request {
+       /* target inode for block we're allocating */
+@@ -627,6 +628,7 @@ enum {
+       /* Caller will submit data before dropping transaction handle. This
+        * allows jbd2 to avoid submitting data before commit. */
+ #define EXT4_GET_BLOCKS_IO_SUBMIT             0x0400
++#define EXT4_GET_BLOCKS_VERY_DENSE            0x08000
+ /*
+  * The bit position of these flags must not overlap with any of the
+--- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/extents.c
++++ linux-4.18.0-80.1.2.el8_0/fs/ext4/extents.c
+@@ -4484,6 +4467,8 @@ int ext4_ext_map_blocks(handle_t *han
+               ar.flags = 0;
+       if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
+               ar.flags |= EXT4_MB_HINT_NOPREALLOC;
++      if (flags & EXT4_GET_BLOCKS_VERY_DENSE)
++              ar.flags |= EXT4_MB_VERY_DENSE;
+       if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+               ar.flags |= EXT4_MB_DELALLOC_RESERVED;
+       if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+--- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/mballoc.c
++++ linux-4.18.0-80.1.2.el8_0/fs/ext4/mballoc.c
+@@ -4267,6 +4291,25 @@ ext4_mb_use_inode_pa(struct ext4
+       ext4_fsblk_t end;
+       int len;
+
++      if (ac->ac_flags & EXT4_MB_VERY_DENSE && !pa->pa_regular) {
++              unsigned int len = ac->ac_o_ex.fe_len;
++              if (len > pa->pa_free)
++                      len = pa->pa_free;
++              ext4_get_group_no_and_offset(ac->ac_sb,
++                                      pa->pa_pstart,
++                                      &ac->ac_b_ex.fe_group,
++                                      &ac->ac_b_ex.fe_start);
++              ac->ac_b_ex.fe_len = len;
++              pa->pa_lstart += len;
++              pa->pa_pstart += len;
++              pa->pa_free -= len;
++              pa->pa_len -= len;
++              ac->ac_status = AC_STATUS_FOUND;
++              ac->ac_pa = pa;
++              return;
++      }
++
++      pa->pa_regular = 1;
+       /* found preallocated blocks, use them */
+       start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
+       end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
+@@ -4367,6 +4380,23 @@ ext4_mb_use_preallocated(struct ext4
+       if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+               return false;
++      if (ac->ac_flags & EXT4_MB_VERY_DENSE) {
++              rcu_read_lock();
++              list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
++                      spin_lock(&pa->pa_lock);
++                      if (!pa->pa_deleted && pa->pa_free && !pa->pa_regular) {
++                              atomic_inc(&pa->pa_count);
++                              ext4_mb_use_inode_pa(ac, pa);
++                              spin_unlock(&pa->pa_lock);
++                              break;
++                      }
++                      spin_unlock(&pa->pa_lock);
++              }
++              rcu_read_unlock();
++              if (ac->ac_status == AC_STATUS_FOUND)
++                      return true;
++      }
++
+       /* first, try per-file preallocation */
+       rcu_read_lock();
+       list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
+@@ -4833,7 +4833,7 @@ ext4_mb_put_pa(struct ext4
+       if (pa->pa_type == MB_GROUP_PA)
+               grp_blk--;
+
+-      grp = ext4_get_group_number(sb, grp_blk);
++      grp = pa->pa_group;
+
+       /*
+        * possible race:
+@@ -4894,6 +4894,8 @@ ext4_mb_new_inode_pa(struct ext4
+       pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+       pa->pa_len = ac->ac_b_ex.fe_len;
+       pa->pa_free = pa->pa_len;
++      pa->pa_group = ac->ac_b_ex.fe_group;
++      pa->pa_regular = 0;
+       spin_lock_init(&pa->pa_lock);
+       INIT_LIST_HEAD(&pa->pa_inode_list);
+       INIT_LIST_HEAD(&pa->pa_group_list);
+@@ -5004,6 +5005,7 @@ ext4_mb_new_group_pa(struct ext4
+       pa->pa_lstart = pa->pa_pstart;
+       pa->pa_len = ac->ac_b_ex.fe_len;
+       pa->pa_free = pa->pa_len;
++      pa->pa_group = ac->ac_b_ex.fe_group;
+       spin_lock_init(&pa->pa_lock);
+       INIT_LIST_HEAD(&pa->pa_inode_list);
+       INIT_LIST_HEAD(&pa->pa_group_list);
index cb5726a..48aabda 100644 (file)
@@ -39,3 +39,4 @@ rhel8/ext4-old_ea_inodes_handling_fix.patch
 rhel8.4/ext4-optimize-find_delayed_extent.patch
 rhel8/ext4-encdata.patch
 rhel8/ext4-race-in-ext4-destroy-inode.patch
+rhel8/ext4-mballoc-dense.patch
index d7579eb..f770359 100644 (file)
@@ -443,6 +443,7 @@ extern bool obd_enable_health_write;
 #define OBD_FAIL_OSC_NO_SIZE_DATA        0x415
 #define OBD_FAIL_OSC_DELAY_CANCEL        0x416
 #define OBD_FAIL_OSC_SLOW_PAGE_EVICT    0x417
+#define OBD_FAIL_OSC_MARK_COMPRESSED    0x419
 
 #define OBD_FAIL_PTLRPC                  0x500
 #define OBD_FAIL_PTLRPC_ACK              0x501
@@ -766,6 +767,7 @@ extern bool obd_enable_health_write;
 #define OBD_FAIL_BARRIER_FAILURE               0x2203
 
 #define OBD_FAIL_OSD_FAIL_AT_TRUNCATE          0x2301
+#define OBD_FAIL_OSD_MARK_COMPRESSED           0x2302
 
 /* continuation of MDS related constants */
 #define OBD_FAIL_MDS_PAUSE_CREATE_AFTER_LOOKUP 0x2401
index 15382ec..3afa502 100644 (file)
@@ -1864,6 +1864,8 @@ no_bulk:
                        niobuf->rnb_flags  = pg->bp_flag;
                }
                pg_prev = pg;
+               if (CFS_FAIL_CHECK(OBD_FAIL_OSC_MARK_COMPRESSED))
+                       niobuf->rnb_flags |= OBD_BRW_COMPRESSED;
        }
 
        LASSERTF((void *)(niobuf - niocount) ==
index 330286e..7ef6fe5 100644 (file)
@@ -288,7 +288,8 @@ struct osd_device {
                                  od_read_cache:1,
                                  od_writethrough_cache:1,
                                  od_nonrotational:1,
-                                 od_enable_projid_xattr:1;
+                                 od_enable_projid_xattr:1,
+                                 od_extents_dense:1;
 
 
        __u32                     od_dirent_journal;
index 9133823..7b253bd 100644 (file)
@@ -915,7 +915,8 @@ static void osd_decay_extent_bytes(struct osd_device *osd,
 static int osd_ldiskfs_map_inode_pages(struct inode *inode,
                                       struct osd_iobuf *iobuf,
                                       struct osd_device *osd,
-                                      int create, __u64 user_size,
+                                      const int create,
+                                      __u64 user_size,
                                       int check_credits,
                                       struct thandle *thandle)
 {
@@ -932,14 +933,19 @@ static int osd_ldiskfs_map_inode_pages(struct inode *inode,
        sector_t *blocks = iobuf->dr_blocks;
        struct niobuf_local *lnb1, *lnb2;
        loff_t size1, size2;
+       bool compressed = false;
+       int flags = 0;
 
        max_page_index = inode->i_sb->s_maxbytes >> PAGE_SHIFT;
 
        CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n",
                inode->i_ino, pages, (*lnbs)->lnb_page->index);
 
+       if (osd->od_extents_dense)
+               compressed = iobuf->dr_lnbs[0]->lnb_flags & OBD_BRW_COMPRESSED;
+
        if (create) {
-               create = LDISKFS_GET_BLOCKS_CREATE;
+               flags = LDISKFS_GET_BLOCKS_CREATE;
                handle = ldiskfs_journal_current_handle();
                LASSERT(handle != NULL);
                rc = osd_attach_jinode(inode);
@@ -1031,9 +1037,17 @@ cont_map:
                        else
                                oh->oh_declared_ext--;
                }
+#ifdef LDISKFS_GET_BLOCKS_VERY_DENSE
+               if (osd->od_extents_dense) {
+                       if (CFS_FAIL_CHECK(OBD_FAIL_OSD_MARK_COMPRESSED))
+                               flags |= LDISKFS_GET_BLOCKS_VERY_DENSE;
+                       if (compressed)
+                               flags |= LDISKFS_GET_BLOCKS_VERY_DENSE;
+               }
+#endif
 
                time = ktime_get();
-               rc = ldiskfs_map_blocks(handle, inode, &map, create);
+               rc = ldiskfs_map_blocks(handle, inode, &map, flags);
                time = ktime_sub(ktime_get(), time);
 
                if (rc >= 0) {
index 21f3b8f..9e6cbf9 100644 (file)
@@ -805,6 +805,57 @@ ssize_t index_backup_store(struct kobject *kobj, struct attribute *attr,
 }
 LUSTRE_RW_ATTR(index_backup);
 
+#ifdef LDISKFS_GET_BLOCKS_VERY_DENSE
+/**
+ * Show whether dense extent allocation is enabled on this OSD.
+ *
+ * \param[in] kobj     kobject embedded in the dt_device (dd_kobj)
+ * \param[in] attr     sysfs attribute being read (unused)
+ * \param[out] buf     output buffer, at most PAGE_SIZE bytes are written
+ *
+ * \retval             number of bytes stored in \a buf
+ */
+static ssize_t extents_dense_show(struct kobject *kobj, struct attribute *attr,
+                       char *buf)
+{
+       struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj);
+       struct osd_device *osd = osd_dt_dev(dt);
+
+       /* scnprintf() returns the bytes actually stored rather than the
+        * would-be length, which is what a sysfs show method must return */
+       return scnprintf(buf, PAGE_SIZE, "%d\n", osd->od_extents_dense);
+}
+
+/**
+ * Enable or disable dense extent allocation on this OSD.
+ *
+ * \param[in] kobj     kobject embedded in the dt_device (dd_kobj)
+ * \param[in] attr     sysfs attribute being written (unused)
+ * \param[in] buffer   user input, parsed as a boolean by kstrtobool()
+ * \param[in] count    length of \a buffer
+ *
+ * \retval             \a count on success
+ * \retval             non-zero value returned by kstrtobool() on bad input
+ */
+static ssize_t extents_dense_store(struct kobject *kobj, struct attribute *attr,
+                        const char *buffer, size_t count)
+{
+       struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj);
+       struct osd_device *osd = osd_dt_dev(dt);
+       bool extents_dense;
+       int rc;
+
+       rc = kstrtobool(buffer, &extents_dense);
+       if (rc != 0)
+               return rc;
+
+       osd->od_extents_dense = extents_dense;
+
+       return count;
+}
+LUSTRE_RW_ATTR(extents_dense);
+#endif
+
 struct ldebugfs_vars ldebugfs_osd_obd_vars[] = {
        { .name =       "oi_scrub",
          .fops =       &ldiskfs_osd_oi_scrub_fops      },
@@ -832,6 +883,9 @@ static struct attribute *ldiskfs_attrs[] = {
        &lustre_attr_full_scrub_ratio.attr,
        &lustre_attr_full_scrub_threshold_rate.attr,
        &lustre_attr_extent_bytes_allocation.attr,
+#ifdef LDISKFS_GET_BLOCKS_VERY_DENSE
+       &lustre_attr_extents_dense.attr,
+#endif
        NULL,
 };
 
index 891c327..c0b3aaf 100644 (file)
@@ -178,6 +178,7 @@ test_iozone() {
 run_test iozone "iozone"
 
 test_fsx() {
+        local fsx_layout="${fsx_STRIPEPARAMS:--c -1}"
        local testfile=$DIR/f0.fsxfile
        FSX_SIZE=$SIZE
        FSX_COUNT=1000
@@ -189,7 +190,8 @@ test_fsx() {
        $DEBUG_OFF
        FSX_SEED=${FSX_SEED:-$RANDOM}
        rm -f $testfile
-       $LFS setstripe -c -1 $testfile
+       $LFS setstripe $fsx_layout $testfile ||
+               error "'setstripe $fsx_layout $testfile' failed"
        CMD="$FSX -c 50 -p 1000 -S $FSX_SEED -P $TMP -l $FSX_SIZE \
             -N $((FSX_COUNT * 100)) $FSXOPT $testfile"
        echo "Using: $CMD"
index 4164df4..a211636 100644 (file)
@@ -54,6 +54,120 @@ test_sanityn()
 }
 run_test sanityn "Run sanityn with PFL layout"
 
+# Write every other 32k chunk of a single-stripe file with direct I/O,
+# first with extents_dense=0 and then with extents_dense=1 while the
+# client fail_loc 0x419 marks the writes as compressed. The integer
+# ratio of discontiguous extents (sparse / dense) must exceed 3.
+# Afterwards overwrite the file with random data, run e2fsck on ost1
+# and verify the data is unchanged once the OST is started again.
+test_1000() {
+       local blocks=128
+       local dense=$(do_facet ost1 lctl get_param -n \
+                             osd*.*OST0000*.extents_dense)
+       [[ -n $dense ]] || skip "no dense writes supported"
+
+       local osts=$(comma_list $(osts_nodes))
+       # baseline run: dense allocation switched off on all OST nodes
+       do_nodes $osts $LCTL set_param osd*.*.extents_dense=0 ||
+               error "cannot disable dense extent allocation"
+       stack_trap "do_nodes $osts $LCTL set_param osd*.*.extents_dense=$dense"
+
+       local tf=$DIR/$tfile
+       stack_trap "rm -f $tf"
+       log "create file with dense=0"
+
+       $LFS setstripe -c 1 -i 0 $tf
+       for ((i=0; i<$blocks; i++)); do
+               dd if=/dev/zero of=$tf bs=32k seek=$((i*2)) count=1 \
+                       oflag=direct >&/dev/null conv=notrunc ||
+                               error "can't dd (sparse)"
+       done
+       filefrag -sv $tf
+       local nonr=0
+       # count extents whose physical start is not the previous physical
+       # end + 1 (field positions assume the "filefrag -v" table layout)
+       while read EX LS LE PS PE LEN DEV FLAGS; do
+               [[ "$EX" == "ext:" || "$EX" =~ "File" ]] && continue
+               [[ "$EX" == "0:" ]] && PREV=${PE%:} && ((nonr+=1)) && continue
+               (( ${PS%%.*} == PREV + 1 )) || ((nonr+=1))
+               PREV=${PE%:}
+       done < <(filefrag -v $tf)
+       (( nonr > 0 )) || error "no extents?"
+       rm -f $tf
+       wait_delete_completed
+
+       # dense run: same write pattern with dense allocation switched on
+       do_nodes $osts $LCTL set_param osd*.*.extents_dense=1 ||
+               error "cannot enable dense extent allocation"
+       #define OBD_FAIL_OSC_MARK_COMPRESSED    0x419
+       $LCTL set_param fail_loc=0x419
+       log "create file with dense=1"
+
+       $LFS setstripe -c 1 -i 0 $tf
+       for ((i=0; i<$blocks; i++)); do
+               dd if=/dev/zero of=$tf bs=32k seek=$((i*2)) count=1 \
+                       oflag=direct conv=notrunc >&/dev/null ||
+                               error "can't dd (dense)"
+       done
+       filefrag -sv $tf
+       local nr=0
+       while read EX LS LE PS PE LEN DEV FLAGS; do
+               [[ "$EX" == "ext:" || "$EX" =~ "File" ]] && continue
+               [[ "$EX" == "0:" ]] && PREV=${PE%:} && ((nr+=1)) && continue
+               (( ${PS%%.*} == PREV + 1 )) || ((nr+=1))
+               PREV=${PE%:}
+       done < <(filefrag -v $tf)
+       (( nr > 0 )) || error "no extents?"
+
+       echo "dense ($nr) should have fewer extents ($nonr)"
+       (( (nonr / nr) > 3 )) ||
+               error "dense ($nr) should have less extents ($nonr)"
+       $LCTL set_param fail_loc=0
+
+       # data integrity check across e2fsck and an OST restart
+       local tmpfile=$(mktemp)
+       stack_trap "rm -f $tmpfile"
+       echo "generate temp file $tmpfile"
+       dd if=/dev/urandom of=$tmpfile bs=32k count=$((blocks*2)) iflag=fullblock ||
+               error "can't generate temporary file"
+       dd if=$tmpfile of=$tf bs=32k conv=notrunc
+       cancel_lru_locks osc
+
+       stop ost1 || error "(2) Fail to stop ost1"
+       run_e2fsck $(facet_host ost1) $(ostdevname 1) "-y" ||
+               error "(3) Fail to run e2fsck error"
+       start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
+               error "(4) Fail to start ost1"
+
+       cmp $tmpfile $tf || error "data mismatch"
+}
+run_test 1000 "compressed vs uncompressed allocation"
+
+# Run the sanity-benchmark.sh fsx test with extents_dense=1 and
+# fail_loc=0x2302 (OBD_FAIL_OSD_MARK_COMPRESSED) set on all OST nodes;
+# cleanup for both settings is registered with stack_trap.
+test_fsx() {
+       [[ "$ost1_FSTYPE" == "ldiskfs" ]] || skip "need ldiskfs backend"
+       local osts=$(comma_list $(osts_nodes))
+
+       local dense=$(do_facet ost1 lctl get_param -n \
+                             osd*.*OST0000*.extents_dense)
+       [[ -n $dense ]] || skip "no dense writes supported"
+       do_nodes $osts $LCTL set_param osd*.*.extents_dense=1 ||
+               error "cannot enable dense extent allocation"
+       stack_trap "do_nodes $osts $LCTL set_param osd*.*.extents_dense=$dense"
+
+#define OBD_FAIL_OSD_MARK_COMPRESSED           0x2302
+       do_nodes $osts $LCTL set_param fail_loc=0x2302 ||
+               error "cannot force dense writes"
+       stack_trap "do_nodes $osts $LCTL set_param fail_loc=0"
+
+       fsx_STRIPEPARAMS="-E eof -c -1" ONLY=fsx FSX_COUNT=2500 SLOW=yes bash sanity-benchmark.sh
+
+       $DEBUG_ON
+}
+run_test fsx "verify dense writes with fsx on ldiskfs"
+
 complete_test $SECONDS
 check_and_cleanup_lustre
 declare -a logs=($ONLY)