From 686dee707f8728aa8ba27bcd4cee69f8fbf7b278 Mon Sep 17 00:00:00 2001
From: Alex Zhuravlev
Date: Wed, 1 Mar 2023 21:28:25 +0300
Subject: [PATCH] LU-10026 osd-ldiskfs: use preallocation for dense writes

use inode's preallocation chunks as per-inode group
preallocation: just grab the very first available blocks
from the window.

Test-Parameters: env=ONLY=1000,ONLY_REPEAT=11 testlist=sanity-compr
Test-Parameters: env=ONLY=fsx,ONLY_REPEAT=11 testlist=sanity-compr
Signed-off-by: Alex Zhuravlev
Change-Id: I9d36701f569f4c6305bc46f3373bfc054fcd61a9
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/50171
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Andreas Dilger
Reviewed-by: Artem Blagodarenko
Reviewed-by: Oleg Drokin
---
Review notes (this section is ignored by git-am):
 - Line structure was restored from a whitespace-collapsed copy. The line
   layout matches every hunk header and the diffstat, but tab indentation
   and blank context lines are reconstructed; verify them against the
   original commit (or apply with whitespace-insensitive matching).
 - sanity-compr.sh test_1000: the error message for the extents_dense=0
   step now says "cannot disable" (it used to say "cannot enable"). The
   blob id on that file's "index" line was not recomputed.

 .../patches/rhel8/ext4-mballoc-dense.patch         | 127 +++++++++++++++++++++
 .../series/ldiskfs-4.18-rhel8.8.series             |   1 +
 lustre/include/obd_support.h                       |   2 +
 lustre/osc/osc_request.c                           |   2 +
 lustre/osd-ldiskfs/osd_internal.h                  |   3 +-
 lustre/osd-ldiskfs/osd_io.c                        |  20 +++-
 lustre/osd-ldiskfs/osd_lproc.c                     |  32 ++++++
 lustre/tests/sanity-benchmark.sh                   |   4 +-
 lustre/tests/sanity-compr.sh                       | 100 ++++++++++++++++
 9 files changed, 286 insertions(+), 5 deletions(-)
 create mode 100644 ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-dense.patch

diff --git a/ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-dense.patch b/ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-dense.patch
new file mode 100644
index 0000000..a3d17ff
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-dense.patch
@@ -0,0 +1,127 @@
+--- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/mballoc.h
++++ linux-4.18.0-80.1.2.el8_0/fs/ext4/mballoc.h
+@@ -131,6 +131,8 @@ enum SHIFT_DIRECTION {
+ 	ext4_lblk_t		pa_lstart;	/* log. block */
+ 	ext4_grpblk_t		pa_len;		/* len of preallocated chunk */
+ 	ext4_grpblk_t		pa_free;	/* how many blocks are free */
++	ext4_grpblk_t		pa_group;
++	unsigned short		pa_regular;
+ 	unsigned short		pa_type;	/* pa type. inode or group */
+ 	unsigned short		pa_error;
+ 	spinlock_t		*pa_obj_lock;
+@@ -167,7 +167,7 @@ struct ext4_allocation_request {
+ 	__u16 ac_found;
+ 	__u16 ac_tail;
+ 	__u16 ac_buddy;
+-	__u16 ac_flags;		/* allocation hints */
++	__u32 ac_flags;		/* allocation hints */
+ 	__u8 ac_status;
+ 	__u8 ac_criteria;
+ 	__u8 ac_2order;		/* if request is to allocate 2^N blocks and
+--- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/ext4.h
++++ linux-4.18.0-80.1.2.el8_0/fs/ext4/ext4.h
+@@ -151,6 +151,7 @@ enum SHIFT_DIRECTION {
+ #define EXT4_MB_USE_RESERVED		0x2000
+ /* Do strict check for free blocks while retrying block allocation */
+ #define EXT4_MB_STRICT_CHECK		0x4000
++#define EXT4_MB_VERY_DENSE		0x80000
+ 
+ struct ext4_allocation_request {
+ 	/* target inode for block we're allocating */
+@@ -627,6 +628,7 @@ enum {
+ 	/* Caller will submit data before dropping transaction handle. This
+ 	 * allows jbd2 to avoid submitting data before commit. */
+ #define EXT4_GET_BLOCKS_IO_SUBMIT		0x0400
++#define EXT4_GET_BLOCKS_VERY_DENSE		0x08000
+ 
+ /*
+  * The bit position of these flags must not overlap with any of the
+--- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/extents.c
++++ linux-4.18.0-80.1.2.el8_0/fs/ext4/extents.c
+@@ -4484,6 +4467,8 @@ int ext4_ext_map_blocks(handle_t *han
+ 		ar.flags = 0;
+ 	if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
+ 		ar.flags |= EXT4_MB_HINT_NOPREALLOC;
++	if (flags & EXT4_GET_BLOCKS_VERY_DENSE)
++		ar.flags |= EXT4_MB_VERY_DENSE;
+ 	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ 		ar.flags |= EXT4_MB_DELALLOC_RESERVED;
+ 	if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+--- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/mballoc.c
++++ linux-4.18.0-80.1.2.el8_0/fs/ext4/mballoc.c
+@@ -4267,6 +4291,25 @@ ext4_mb_use_inode_pa(struct ext4
+ 	ext4_fsblk_t end;
+ 	int len;
+ 
++	if (ac->ac_flags & EXT4_MB_VERY_DENSE && !pa->pa_regular) {
++		unsigned int len = ac->ac_o_ex.fe_len;
++		if (len > pa->pa_free)
++			len = pa->pa_free;
++		ext4_get_group_no_and_offset(ac->ac_sb,
++					pa->pa_pstart,
++					&ac->ac_b_ex.fe_group,
++					&ac->ac_b_ex.fe_start);
++		ac->ac_b_ex.fe_len = len;
++		pa->pa_lstart += len;
++		pa->pa_pstart += len;
++		pa->pa_free -= len;
++		pa->pa_len -= len;
++		ac->ac_status = AC_STATUS_FOUND;
++		ac->ac_pa = pa;
++		return;
++	}
++
++	pa->pa_regular = 1;
+ 	/* found preallocated blocks, use them */
+ 	start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
+ 	end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
+@@ -4367,6 +4380,23 @@ ext4_mb_use_preallocated(struct ext4
+ 	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+ 		return false;
+ 
++	if (ac->ac_flags & EXT4_MB_VERY_DENSE) {
++		rcu_read_lock();
++		list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
++			spin_lock(&pa->pa_lock);
++			if (!pa->pa_deleted && pa->pa_free && !pa->pa_regular) {
++				atomic_inc(&pa->pa_count);
++				ext4_mb_use_inode_pa(ac, pa);
++				spin_unlock(&pa->pa_lock);
++				break;
++			}
++			spin_unlock(&pa->pa_lock);
++		}
++		rcu_read_unlock();
++		if (ac->ac_status == AC_STATUS_FOUND)
++			return true;
++	}
++
+ 	/* first, try per-file preallocation */
+ 	rcu_read_lock();
+ 	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
+@@ -4833,7 +4833,7 @@ ext4_mb_put_pa(struct ext4
+ 	if (pa->pa_type == MB_GROUP_PA)
+ 		grp_blk--;
+ 
+-	grp = ext4_get_group_number(sb, grp_blk);
++	grp = pa->pa_group;
+ 
+ 	/*
+ 	 * possible race:
+@@ -4894,6 +4894,8 @@ ext4_mb_new_inode_pa(struct ext4
+ 	pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+ 	pa->pa_len = ac->ac_b_ex.fe_len;
+ 	pa->pa_free = pa->pa_len;
++	pa->pa_group = ac->ac_b_ex.fe_group;
++	pa->pa_regular = 0;
+ 	spin_lock_init(&pa->pa_lock);
+ 	INIT_LIST_HEAD(&pa->pa_inode_list);
+ 	INIT_LIST_HEAD(&pa->pa_group_list);
+@@ -5004,6 +5005,7 @@ ext4_mb_new_group_pa(struct ext4
+ 	pa->pa_lstart = pa->pa_pstart;
+ 	pa->pa_len = ac->ac_b_ex.fe_len;
+ 	pa->pa_free = pa->pa_len;
++	pa->pa_group = ac->ac_b_ex.fe_group;
+ 	spin_lock_init(&pa->pa_lock);
+ 	INIT_LIST_HEAD(&pa->pa_inode_list);
+ 	INIT_LIST_HEAD(&pa->pa_group_list);
diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series
index cb5726a..48aabda 100644
--- a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series
+++ b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series
@@ -39,3 +39,4 @@ rhel8/ext4-old_ea_inodes_handling_fix.patch
 rhel8.4/ext4-optimize-find_delayed_extent.patch
 rhel8/ext4-encdata.patch
 rhel8/ext4-race-in-ext4-destroy-inode.patch
+rhel8/ext4-mballoc-dense.patch
diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h
index d7579eb..f770359 100644
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -443,6 +443,7 @@ extern bool obd_enable_health_write;
 #define OBD_FAIL_OSC_NO_SIZE_DATA			0x415
 #define OBD_FAIL_OSC_DELAY_CANCEL			0x416
 #define OBD_FAIL_OSC_SLOW_PAGE_EVICT			0x417
+#define OBD_FAIL_OSC_MARK_COMPRESSED			0x419
 
 #define OBD_FAIL_PTLRPC					0x500
 #define OBD_FAIL_PTLRPC_ACK				0x501
@@ -766,6 +767,7 @@ extern bool obd_enable_health_write;
 #define OBD_FAIL_BARRIER_FAILURE			0x2203
 
 #define OBD_FAIL_OSD_FAIL_AT_TRUNCATE			0x2301
+#define OBD_FAIL_OSD_MARK_COMPRESSED			0x2302
 
 /* continuation of MDS related constants */
 #define OBD_FAIL_MDS_PAUSE_CREATE_AFTER_LOOKUP		0x2401
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c
index 15382ec..3afa502 100644
--- a/lustre/osc/osc_request.c
+++ b/lustre/osc/osc_request.c
@@ -1864,6 +1864,8 @@ no_bulk:
 			niobuf->rnb_flags = pg->bp_flag;
 		}
 		pg_prev = pg;
+		if (CFS_FAIL_CHECK(OBD_FAIL_OSC_MARK_COMPRESSED))
+			niobuf->rnb_flags |= OBD_BRW_COMPRESSED;
 	}
 
 	LASSERTF((void *)(niobuf - niocount) ==
diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h
index 330286e..7ef6fe5 100644
--- a/lustre/osd-ldiskfs/osd_internal.h
+++ b/lustre/osd-ldiskfs/osd_internal.h
@@ -288,7 +288,8 @@ struct osd_device {
 				  od_read_cache:1,
 				  od_writethrough_cache:1,
 				  od_nonrotational:1,
-				  od_enable_projid_xattr:1;
+				  od_enable_projid_xattr:1,
+				  od_extents_dense:1;
 
 	__u32			  od_dirent_journal;
 
diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c
index 9133823..7b253bd 100644
--- a/lustre/osd-ldiskfs/osd_io.c
+++ b/lustre/osd-ldiskfs/osd_io.c
@@ -915,7 +915,8 @@ static void osd_decay_extent_bytes(struct osd_device *osd,
 static int osd_ldiskfs_map_inode_pages(struct inode *inode,
 				       struct osd_iobuf *iobuf,
 				       struct osd_device *osd,
-				       int create, __u64 user_size,
+				       const int create,
+				       __u64 user_size,
 				       int check_credits,
 				       struct thandle *thandle)
 {
@@ -932,14 +933,19 @@ static int osd_ldiskfs_map_inode_pages(struct inode *inode,
 	sector_t *blocks = iobuf->dr_blocks;
 	struct niobuf_local *lnb1, *lnb2;
 	loff_t size1, size2;
+	bool compressed = false;
+	int flags = 0;
 
 	max_page_index = inode->i_sb->s_maxbytes >> PAGE_SHIFT;
 
 	CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n",
 		inode->i_ino, pages, (*lnbs)->lnb_page->index);
 
+	if (osd->od_extents_dense)
+		compressed = iobuf->dr_lnbs[0]->lnb_flags & OBD_BRW_COMPRESSED;
+
 	if (create) {
-		create = LDISKFS_GET_BLOCKS_CREATE;
+		flags = LDISKFS_GET_BLOCKS_CREATE;
 		handle = ldiskfs_journal_current_handle();
 		LASSERT(handle != NULL);
 		rc = osd_attach_jinode(inode);
@@ -1031,9 +1037,17 @@ cont_map:
 			else
 				oh->oh_declared_ext--;
 		}
+#ifdef LDISKFS_GET_BLOCKS_VERY_DENSE
+		if (osd->od_extents_dense) {
+			if (CFS_FAIL_CHECK(OBD_FAIL_OSD_MARK_COMPRESSED))
+				flags |= LDISKFS_GET_BLOCKS_VERY_DENSE;
+			if (compressed)
+				flags |= LDISKFS_GET_BLOCKS_VERY_DENSE;
+		}
+#endif
 
 		time = ktime_get();
-		rc = ldiskfs_map_blocks(handle, inode, &map, create);
+		rc = ldiskfs_map_blocks(handle, inode, &map, flags);
 		time = ktime_sub(ktime_get(), time);
 
 		if (rc >= 0) {
diff --git a/lustre/osd-ldiskfs/osd_lproc.c b/lustre/osd-ldiskfs/osd_lproc.c
index 21f3b8f..9e6cbf9 100644
--- a/lustre/osd-ldiskfs/osd_lproc.c
+++ b/lustre/osd-ldiskfs/osd_lproc.c
@@ -805,6 +805,35 @@ ssize_t index_backup_store(struct kobject *kobj, struct attribute *attr,
 }
 LUSTRE_RW_ATTR(index_backup);
 
+#ifdef LDISKFS_GET_BLOCKS_VERY_DENSE
+static ssize_t extents_dense_show(struct kobject *kobj, struct attribute *attr,
+				  char *buf)
+{
+	struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj);
+	struct osd_device *osd = osd_dt_dev(dt);
+
+	return snprintf(buf, PAGE_SIZE, "%d\n", osd->od_extents_dense);
+}
+
+static ssize_t extents_dense_store(struct kobject *kobj, struct attribute *attr,
+				   const char *buffer, size_t count)
+{
+	struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj);
+	struct osd_device *osd = osd_dt_dev(dt);
+	bool extents_dense;
+	int rc;
+
+	rc = kstrtobool(buffer, &extents_dense);
+	if (rc != 0)
+		return rc;
+
+	osd->od_extents_dense = extents_dense;
+
+	return count;
+}
+LUSTRE_RW_ATTR(extents_dense);
+#endif
+
 struct ldebugfs_vars ldebugfs_osd_obd_vars[] = {
 	{ .name	=	"oi_scrub",
 	  .fops	=	&ldiskfs_osd_oi_scrub_fops	},
@@ -832,6 +861,9 @@ static struct attribute *ldiskfs_attrs[] = {
 	&lustre_attr_full_scrub_ratio.attr,
 	&lustre_attr_full_scrub_threshold_rate.attr,
 	&lustre_attr_extent_bytes_allocation.attr,
+#ifdef LDISKFS_GET_BLOCKS_VERY_DENSE
+	&lustre_attr_extents_dense.attr,
+#endif
 	NULL,
 };
 
diff --git a/lustre/tests/sanity-benchmark.sh b/lustre/tests/sanity-benchmark.sh
index 891c327..c0b3aaf 100644
--- a/lustre/tests/sanity-benchmark.sh
+++ b/lustre/tests/sanity-benchmark.sh
@@ -178,6 +178,7 @@ test_iozone() {
 run_test iozone "iozone"
 
 test_fsx() {
+	local fsx_layout="${fsx_STRIPEPARAMS:--c -1}"
 	local testfile=$DIR/f0.fsxfile
 	FSX_SIZE=$SIZE
 	FSX_COUNT=1000
@@ -189,7 +190,8 @@ test_fsx() {
 	$DEBUG_OFF
 	FSX_SEED=${FSX_SEED:-$RANDOM}
 	rm -f $testfile
-	$LFS setstripe -c -1 $testfile
+	$LFS setstripe $fsx_layout $testfile ||
+		error "'setstripe $fsx_layout $testfile' failed"
 	CMD="$FSX -c 50 -p 1000 -S $FSX_SEED -P $TMP -l $FSX_SIZE \
 		-N $((FSX_COUNT * 100)) $FSXOPT $testfile"
 	echo "Using: $CMD"
diff --git a/lustre/tests/sanity-compr.sh b/lustre/tests/sanity-compr.sh
index 4164df4..a211636 100644
--- a/lustre/tests/sanity-compr.sh
+++ b/lustre/tests/sanity-compr.sh
@@ -54,6 +54,106 @@ test_sanityn()
 }
 run_test sanityn "Run sanityn with PFL layout"
 
+test_1000() {
+	local blocks=128
+	local dense=$(do_facet ost1 lctl get_param -n \
+		osd*.*OST0000*.extents_dense)
+	[[ -n $dense ]] || skip "no dense writes supported"
+
+	local osts=$(comma_list $(osts_nodes))
+	do_nodes $osts $LCTL set_param osd*.*.extents_dense=0 ||
+		error "cannot disable dense extent allocation"
+	stack_trap "do_nodes $osts $LCTL set_param osd*.*.extents_dense=$dense"
+
+	local tf=$DIR/$tfile
+	stack_trap "rm -f $tf"
+	log "create file with dense=0"
+
+	$LFS setstripe -c 1 -i 0 $tf
+	for ((i=0; i<$blocks; i++)); do
+		dd if=/dev/zero of=$tf bs=32k seek=$((i*2)) count=1 \
+			oflag=direct >&/dev/null conv=notrunc ||
+			error "can't dd (sparse)"
+	done
+	filefrag -sv $tf
+	local nonr=0
+	while read EX LS LE PS PE LEN DEV FLAGS; do
+		[[ "$EX" == "ext:" || "$EX" =~ "File" ]] && continue
+		[[ "$EX" == "0:" ]] && PREV=${PE%:} && ((nonr+=1)) && continue
+		(( ${PS%%.*} == PREV + 1 )) || ((nonr+=1))
+		PREV=${PE%:}
+	done < <(filefrag -v $tf)
+	(( nonr > 0 )) || error "no extents?"
+	rm -f $tf
+	wait_delete_completed
+
+	do_nodes $osts $LCTL set_param osd*.*.extents_dense=1 ||
+		error "cannot enable dense extent allocation"
+	#define OBD_FAIL_OSC_MARK_COMPRESSED	0x419
+	$LCTL set_param fail_loc=0x419
+	log "create file with dense=1"
+
+	$LFS setstripe -c 1 -i 0 $tf
+	for ((i=0; i<$blocks; i++)); do
+		dd if=/dev/zero of=$tf bs=32k seek=$((i*2)) count=1 \
+			oflag=direct conv=notrunc >&/dev/null ||
+			error "can't dd (dense)"
+	done
+	filefrag -sv $tf
+	local nr=0
+	while read EX LS LE PS PE LEN DEV FLAGS; do
+		[[ "$EX" == "ext:" || "$EX" =~ "File" ]] && continue
+		[[ "$EX" == "0:" ]] && PREV=${PE%:} && ((nr+=1)) && continue
+		(( ${PS%%.*} == PREV + 1 )) || ((nr+=1))
+		PREV=${PE%:}
+	done < <(filefrag -v $tf)
+	(( nr > 0 )) || error "no extents?"
+
+	echo "dense ($nr) should have fewer extents ($nonr)"
+	(( (nonr / nr) > 3 )) ||
+		error "dense ($nr) should have less extents ($nonr)"
+	$LCTL set_param fail_loc=0
+
+	local tmpfile=$(mktemp)
+	stack_trap "rm -f $tmpfile"
+	echo "generate temp file $tmpfile"
+	dd if=/dev/urandom of=$tmpfile bs=32k count=$((blocks*2)) iflag=fullblock ||
+		error "can't generate temporary file"
+	dd if=$tmpfile of=$tf bs=32k conv=notrunc
+	cancel_lru_locks osc
+
+	stop ost1 || error "(2) Fail to stop ost1"
+	run_e2fsck $(facet_host ost1) $(ostdevname 1) "-y" ||
+		error "(3) Fail to run e2fsck error"
+	start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
+		error "(4) Fail to start ost1"
+
+	cmp $tmpfile $tf || error "data mismatch"
+}
+run_test 1000 "compressed vs uncompressed allocation"
+
+test_fsx() {
+	[[ "$ost1_FSTYPE" == "ldiskfs" ]] || skip "need ldiskfs backend"
+	local osts=$(comma_list $(osts_nodes))
+
+	local dense=$(do_facet ost1 lctl get_param -n \
+		osd*.*OST0000*.extents_dense)
+	[[ -n $dense ]] || skip "no dense writes supported"
+	do_nodes $osts $LCTL set_param osd*.*.extents_dense=1 ||
+		error "cannot enable dense extent allocation"
+	stack_trap "do_nodes $osts $LCTL set_param osd*.*.extents_dense=$dense"
+
+#define OBD_FAIL_OSD_MARK_COMPRESSED	0x2302
+	do_nodes $osts $LCTL set_param fail_loc=0x2302 ||
+		error "cannot force dense writes"
+	stack_trap "do_nodes $osts $LCTL set_param fail_loc=0"
+
+	fsx_STRIPEPARAMS="-E eof -c -1" ONLY=fsx FSX_COUNT=2500 SLOW=yes bash sanity-benchmark.sh
+
+	$DEBUG_ON
+}
+run_test fsx "verify dense writes with fsx on ldiskfs"
+
 complete_test $SECONDS
 check_and_cleanup_lustre
 declare -a logs=($ONLY)
-- 
1.8.3.1