From: Alex Zhuravlev Date: Tue, 27 Jun 2023 06:59:31 +0000 (+0300) Subject: LU-10026 osd-ldiskfs: use preallocation for dense writes X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=f36eda6a1e;p=fs%2Flustre-release.git LU-10026 osd-ldiskfs: use preallocation for dense writes Use the inode's preallocation chunks as a per-inode group preallocation: just grab the very first available blocks from the window. Lustre-change: https://review.whamcloud.com//50171 Lustre-commit: TBD (from 986340bcdfa572a1f6bab34014e0474c89f47691) Signed-off-by: Alex Zhuravlev Change-Id: I9d36701f569f4c6305bc46f3373bfc054fcd61a9 Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/51468 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Zhenyu Xu Reviewed-by: Andreas Dilger --- diff --git a/ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-dense.patch b/ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-dense.patch new file mode 100644 index 0000000..e9bd5e6 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-dense.patch @@ -0,0 +1,78 @@ +--- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/ext4.h ++++ linux-4.18.0-80.1.2.el8_0/fs/ext4/ext4.h +@@ -151,6 +151,7 @@ enum SHIFT_DIRECTION { + #define EXT4_MB_CR0_OPTIMIZED 0x8000 + /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ + #define EXT4_MB_CR1_OPTIMIZED 0x00010000 ++#define EXT4_MB_VERY_DENSE 0x00080000 + + struct ext4_allocation_request { + /* target inode for block we're allocating */ +@@ -627,6 +628,7 @@ enum { + /* Caller will submit data before dropping transaction handle. This + * allows jbd2 to avoid submitting data before commit. 
*/ + #define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 ++#define EXT4_GET_BLOCKS_VERY_DENSE 0x0800 + + /* + * The bit position of these flags must not overlap with any of the +--- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/extents.c ++++ linux-4.18.0-80.1.2.el8_0/fs/ext4/extents.c +@@ -4484,6 +4467,8 @@ int ext4_ext_map_blocks(handle_t *han + ar.flags = 0; + if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) + ar.flags |= EXT4_MB_HINT_NOPREALLOC; ++ if (flags & EXT4_GET_BLOCKS_VERY_DENSE) ++ ar.flags |= EXT4_MB_VERY_DENSE; + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ar.flags |= EXT4_MB_DELALLOC_RESERVED; + if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) +--- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/mballoc.c ++++ linux-4.18.0-80.1.2.el8_0/fs/ext4/mballoc.c +@@ -4267,6 +4291,21 @@ ext4_mb_use_inode_pa(struct ext4 + ext4_fsblk_t end; + int len; + ++ if (ac->ac_flags & EXT4_MB_VERY_DENSE) { ++ unsigned int len = ac->ac_o_ex.fe_len; ++ ext4_get_group_no_and_offset(ac->ac_sb, ++ pa->pa_pstart, ++ &ac->ac_b_ex.fe_group, ++ &ac->ac_b_ex.fe_start); ++ ac->ac_b_ex.fe_len = len; ++ pa->pa_pstart += len; ++ pa->pa_free -= len; ++ pa->pa_len -= len; ++ ac->ac_status = AC_STATUS_FOUND; ++ ac->ac_pa = pa; ++ return; ++ } ++ + /* found preallocated blocks, use them */ + start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); + end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), +@@ -4367,6 +4380,24 @@ ext4_mb_use_preallocated(struct ext4 + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return false; + ++ if (ac->ac_flags & EXT4_MB_VERY_DENSE) { ++ unsigned int len = ac->ac_o_ex.fe_len; ++ rcu_read_lock(); ++ list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { ++ spin_lock(&pa->pa_lock); ++ if (pa->pa_deleted == 0 && len <= pa->pa_free) { ++ atomic_inc(&pa->pa_count); ++ ext4_mb_use_inode_pa(ac, pa); ++ spin_unlock(&pa->pa_lock); ++ break; ++ } ++ spin_unlock(&pa->pa_lock); ++ } ++ rcu_read_unlock(); ++ if (ac->ac_status == AC_STATUS_FOUND) ++ return true; ++ } ++ + /* first, try 
per-file preallocation */ + rcu_read_lock(); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series index 33a2226..fffffc5 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series @@ -38,3 +38,4 @@ rhel8.4/ext4-optimize-find_delayed_extent.patch rhel8/ext4-limit-per-inode-preallocation-list.patch rhel8/ext4-mballoc-improve.patch rhel8/ext4-mballoc-for-hybrid.patch +rhel8/ext4-mballoc-dense.patch diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 76bca28..9cd87d8 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -437,6 +437,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OSC_DELAY_CANCEL 0x416 #define OBD_FAIL_OSC_SLOW_PAGE_EVICT 0x417 #define OBD_FAIL_OSC_WRONG_COMP_ALG 0x418 +#define OBD_FAIL_OSC_MARK_COMPRESSED 0x419 #define OBD_FAIL_PTLRPC 0x500 #define OBD_FAIL_PTLRPC_ACK 0x501 diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 12af7cf..1f20e23 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -1906,6 +1906,8 @@ no_bulk: niobuf->rnb_len = pg->count; niobuf->rnb_flags = pg->flag; } + if (CFS_FAIL_CHECK(OBD_FAIL_OSC_MARK_COMPRESSED)) + niobuf->rnb_flags |= OBD_BRW_COMPRESSED; pg_prev = pg; } diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index 3d62d12..caa0193 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -8337,6 +8337,7 @@ static int osd_mount(const struct lu_env *env, o->od_nonrotational = 1; osd_tune_nonrot(o, true); } + o->od_extents_dense = 1; GOTO(out, rc = 0); diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h index 72c1d72..b7cf181 100644 --- a/lustre/osd-ldiskfs/osd_internal.h +++ b/lustre/osd-ldiskfs/osd_internal.h @@ -293,7 
+293,8 @@ struct osd_device { od_writethrough_cache_enable_set:1, od_nonrotational:1, od_nonrotational_set:1, - od_enable_projid_xattr:1; + od_enable_projid_xattr:1, + od_extents_dense:1; __s64 od_auto_scrub_interval; __u32 od_dirent_journal; diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index d6e2a1f..9ba4b10 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -1092,12 +1092,15 @@ static int osd_ldiskfs_map_inode_pages(struct inode *inode, sector_t *blocks = iobuf->dr_blocks; struct niobuf_local *lnb1, *lnb2; loff_t size1, size2; + bool compressed; max_page_index = inode->i_sb->s_maxbytes >> PAGE_SHIFT; CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n", inode->i_ino, pages, (*page)->index); + compressed = iobuf->dr_lnbs[0]->lnb_flags & OBD_BRW_COMPRESSED; + if (create) { create = LDISKFS_GET_BLOCKS_CREATE; handle = ldiskfs_journal_current_handle(); @@ -1169,6 +1172,10 @@ cont_map: else oh->oh_declared_ext--; } +#ifdef LDISKFS_GET_BLOCKS_VERY_DENSE + if (osd->od_extents_dense && compressed) + create |= LDISKFS_GET_BLOCKS_VERY_DENSE; +#endif rc = ldiskfs_map_blocks(handle, inode, &map, create); if (rc >= 0) { int c = 0; diff --git a/lustre/osd-ldiskfs/osd_lproc.c b/lustre/osd-ldiskfs/osd_lproc.c index bfd4cb2..9e89129 100644 --- a/lustre/osd-ldiskfs/osd_lproc.c +++ b/lustre/osd-ldiskfs/osd_lproc.c @@ -901,6 +901,35 @@ ssize_t index_backup_store(struct kobject *kobj, struct attribute *attr, } LUSTRE_RW_ATTR(index_backup); +#ifdef LDISKFS_GET_BLOCKS_VERY_DENSE +static ssize_t extents_dense_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj); + struct osd_device *osd = osd_dt_dev(dt); + + return sprintf(buf, "%d\n", osd->od_extents_dense); +} + +static ssize_t extents_dense_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, 
dd_kobj); + struct osd_device *osd = osd_dt_dev(dt); + bool extents_dense; + int rc; + + rc = kstrtobool(buffer, &extents_dense); + if (rc != 0) + return rc; + + osd->od_extents_dense = extents_dense; + + return count; +} +LUSTRE_RW_ATTR(extents_dense); +#endif + struct ldebugfs_vars ldebugfs_osd_obd_vars[] = { { .name = "oi_scrub", .fops = &ldiskfs_osd_oi_scrub_fops }, @@ -928,6 +957,9 @@ static struct attribute *ldiskfs_attrs[] = { &lustre_attr_full_scrub_ratio.attr, &lustre_attr_full_scrub_threshold_rate.attr, &lustre_attr_extent_bytes_allocation.attr, +#ifdef LDISKFS_GET_BLOCKS_VERY_DENSE + &lustre_attr_extents_dense.attr, +#endif NULL, }; diff --git a/lustre/tests/sanity-compr.sh b/lustre/tests/sanity-compr.sh index 2c654b5..8ee26c7 100644 --- a/lustre/tests/sanity-compr.sh +++ b/lustre/tests/sanity-compr.sh @@ -59,6 +59,84 @@ test_sanityn() } run_test sanityn "Run sanityn with PFL layout" +test_1000() { + local blocks=128 + local dense=$(do_facet ost1 lctl get_param -n \ + osd*.*OST0000*.extents_dense) + [[ -n $dense ]] || skip "no dense writes supported" + + local osts=$(comma_list $(osts_nodes)) + do_nodes $osts $LCTL set_param osd*.*.extents_dense=0 || + error "cannot enable dense extent allocation" + stack_trap "do_nodes $osts $LCTL set_param osd*.*.extents_dense=$dense" + + local tf=$DIR/$tfile + stack_trap "rm -f $tf" + log "create file with dense=0" + + $LFS setstripe -c 1 -i 0 $tf + for ((i=0; i<$blocks; i++)); do + dd if=/dev/zero of=$tf bs=32k seek=$((i*2)) count=1 \ + oflag=direct >&/dev/null conv=notrunc || + error "can't dd (sparse)" + done + filefrag -sv $tf + local nonr=0 + while read EX LS LE PS PE LEN DEV FLAGS; do + [[ "$EX" == "ext:" || "$EX" =~ "File" ]] && continue + [[ "$EX" == "0:" ]] && PREV=${PE%:} && ((nonr+=1)) && continue + (( ${PS%%.*} == PREV + 1 )) || ((nonr+=1)) + PREV=${PE%:} + done < <(filefrag -v $tf) + (( nonr > 0 )) || error "no extents?" 
+ rm -f $tf + wait_delete_completed + + do_nodes $osts $LCTL set_param osd*.*.extents_dense=1 || + error "cannot enable dense extent allocation" + #define OBD_FAIL_OSC_MARK_COMPRESSED 0x419 + $LCTL set_param fail_loc=0x419 + log "create file with dense=1" + + $LFS setstripe -c 1 -i 0 $tf + for ((i=0; i<$blocks; i++)); do + dd if=/dev/zero of=$tf bs=32k seek=$((i*2)) count=1 \ + oflag=direct conv=notrunc >&/dev/null || + error "can't dd (dense)" + done + filefrag -sv $tf + local nr=0 + while read EX LS LE PS PE LEN DEV FLAGS; do + [[ "$EX" == "ext:" || "$EX" =~ "File" ]] && continue + [[ "$EX" == "0:" ]] && PREV=${PE%:} && ((nr+=1)) && continue + (( ${PS%%.*} == PREV + 1 )) || ((nr+=1)) + PREV=${PE%:} + done < <(filefrag -v $tf) + (( nr > 0 )) || error "no extents?" + + echo "dense ($nr) should have fewer extents ($nonr)" + (( (nonr / nr) > 3 )) || + error "dense ($nr) should have less extents ($nonr)" + $LCTL set_param fail_loc=0 + + local tmpfile=$(mktemp) + stack_trap "rm -f $tmpfile" + echo "generate temp file $tmpfile" + dd if=/dev/urandom of=$tmpfile bs=32k count=$((blocks*2)) iflag=fullblock || + error "can't generate temporary file" + dd if=$tmpfile of=$tf bs=32k conv=notrunc + cancel_lru_locks osc + + stop ost1 || error "(2) Fail to stop ost1" + run_e2fsck $(facet_host ost1) $(ostdevname 1) "-y" || + error "(3) Fail to run e2fsck error" + start ost1 $(ostdevname 1) $OST_MOUNT_OPTS || + error "(4) Fail to start ost1" + + cmp $tmpfile $tf || error "data mismatch" +} +run_test 1000 "compressed vs uncompressed allocation" + complete_test $SECONDS check_and_cleanup_lustre declare -a logs=($ONLY)