--- /dev/null
+--- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/mballoc.h
++++ linux-4.18.0-80.1.2.el8_0/fs/ext4/mballoc.h
+@@ -131,6 +131,8 @@ enum SHIFT_DIRECTION {
+ ext4_lblk_t pa_lstart; /* log. block */
+ ext4_grpblk_t pa_len; /* len of preallocated chunk */
+ ext4_grpblk_t pa_free; /* how many blocks are free */
++ ext4_grpblk_t pa_group; /* group number, copied from ac_b_ex.fe_group when the pa is created */
++ unsigned short pa_regular; /* 0 on a new inode pa; set to 1 once used by the regular (non-dense) path */
+ unsigned short pa_type; /* pa type. inode or group */
+ unsigned short pa_error;
+ spinlock_t *pa_obj_lock;
+@@ -167,7 +167,7 @@ struct ext4_allocation_request {
+ __u16 ac_found;
+ __u16 ac_tail;
+ __u16 ac_buddy;
+- __u16 ac_flags; /* allocation hints */
++ __u32 ac_flags; /* allocation hints */
+ __u8 ac_status;
+ __u8 ac_criteria;
+ __u8 ac_2order; /* if request is to allocate 2^N blocks and
+--- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/ext4.h
++++ linux-4.18.0-80.1.2.el8_0/fs/ext4/ext4.h
+@@ -151,6 +151,7 @@ enum SHIFT_DIRECTION {
+ #define EXT4_MB_USE_RESERVED 0x2000
+ /* Do strict check for free blocks while retrying block allocation */
+ #define EXT4_MB_STRICT_CHECK 0x4000
++#define EXT4_MB_VERY_DENSE 0x80000
+
+ struct ext4_allocation_request {
+ /* target inode for block we're allocating */
+@@ -627,6 +628,7 @@ enum {
+ /* Caller will submit data before dropping transaction handle. This
+ * allows jbd2 to avoid submitting data before commit. */
+ #define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400
++#define EXT4_GET_BLOCKS_VERY_DENSE 0x08000
+
+ /*
+ * The bit position of these flags must not overlap with any of the
+--- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/extents.c
++++ linux-4.18.0-80.1.2.el8_0/fs/ext4/extents.c
+@@ -4484,6 +4467,8 @@ int ext4_ext_map_blocks(handle_t *han
+ ar.flags = 0;
+ if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
+ ar.flags |= EXT4_MB_HINT_NOPREALLOC;
++ if (flags & EXT4_GET_BLOCKS_VERY_DENSE)
++ ar.flags |= EXT4_MB_VERY_DENSE;
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ ar.flags |= EXT4_MB_DELALLOC_RESERVED;
+ if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+--- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/mballoc.c
++++ linux-4.18.0-80.1.2.el8_0/fs/ext4/mballoc.c
+@@ -4267,6 +4291,25 @@ ext4_mb_use_inode_pa(struct ext4
+ ext4_fsblk_t end;
+ int len;
+
++ if (ac->ac_flags & EXT4_MB_VERY_DENSE && !pa->pa_regular) {
++ unsigned int len = ac->ac_o_ex.fe_len;
++ if (len > pa->pa_free)
++ len = pa->pa_free;
++ ext4_get_group_no_and_offset(ac->ac_sb,
++ pa->pa_pstart,
++ &ac->ac_b_ex.fe_group,
++ &ac->ac_b_ex.fe_start);
++ ac->ac_b_ex.fe_len = len;
++ pa->pa_lstart += len;
++ pa->pa_pstart += len;
++ pa->pa_free -= len;
++ pa->pa_len -= len;
++ ac->ac_status = AC_STATUS_FOUND;
++ ac->ac_pa = pa;
++ return;
++ }
++
++ pa->pa_regular = 1;
+ /* found preallocated blocks, use them */
+ start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
+ end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
+@@ -4367,6 +4380,23 @@ ext4_mb_use_preallocated(struct ext4
+ if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+ return false;
+
++ if (ac->ac_flags & EXT4_MB_VERY_DENSE) {
++ rcu_read_lock();
++ list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
++ spin_lock(&pa->pa_lock);
++ if (!pa->pa_deleted && pa->pa_free && !pa->pa_regular) {
++ atomic_inc(&pa->pa_count);
++ ext4_mb_use_inode_pa(ac, pa);
++ spin_unlock(&pa->pa_lock);
++ break;
++ }
++ spin_unlock(&pa->pa_lock);
++ }
++ rcu_read_unlock();
++ if (ac->ac_status == AC_STATUS_FOUND)
++ return true;
++ }
++
+ /* first, try per-file preallocation */
+ rcu_read_lock();
+ list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
+@@ -4833,7 +4833,7 @@ ext4_mb_put_pa(struct ext4
+ if (pa->pa_type == MB_GROUP_PA)
+ grp_blk--;
+
+- grp = ext4_get_group_number(sb, grp_blk);
++ grp = pa->pa_group;
+
+ /*
+ * possible race:
+@@ -4894,6 +4894,8 @@ ext4_mb_new_inode_pa(struct ext4
+ pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+ pa->pa_len = ac->ac_b_ex.fe_len;
+ pa->pa_free = pa->pa_len;
++ pa->pa_group = ac->ac_b_ex.fe_group;
++ pa->pa_regular = 0;
+ spin_lock_init(&pa->pa_lock);
+ INIT_LIST_HEAD(&pa->pa_inode_list);
+ INIT_LIST_HEAD(&pa->pa_group_list);
+@@ -5004,6 +5005,7 @@ ext4_mb_new_group_pa(struct ext4
+ pa->pa_lstart = pa->pa_pstart;
+ pa->pa_len = ac->ac_b_ex.fe_len;
+ pa->pa_free = pa->pa_len;
++ pa->pa_group = ac->ac_b_ex.fe_group;
+ spin_lock_init(&pa->pa_lock);
+ INIT_LIST_HEAD(&pa->pa_inode_list);
+ INIT_LIST_HEAD(&pa->pa_group_list);
rhel8.4/ext4-optimize-find_delayed_extent.patch
rhel8/ext4-encdata.patch
rhel8/ext4-race-in-ext4-destroy-inode.patch
+rhel8/ext4-mballoc-dense.patch
#define OBD_FAIL_OSC_NO_SIZE_DATA 0x415
#define OBD_FAIL_OSC_DELAY_CANCEL 0x416
#define OBD_FAIL_OSC_SLOW_PAGE_EVICT 0x417
+#define OBD_FAIL_OSC_MARK_COMPRESSED 0x419
#define OBD_FAIL_PTLRPC 0x500
#define OBD_FAIL_PTLRPC_ACK 0x501
#define OBD_FAIL_BARRIER_FAILURE 0x2203
#define OBD_FAIL_OSD_FAIL_AT_TRUNCATE 0x2301
+#define OBD_FAIL_OSD_MARK_COMPRESSED 0x2302
/* continuation of MDS related constants */
#define OBD_FAIL_MDS_PAUSE_CREATE_AFTER_LOOKUP 0x2401
niobuf->rnb_flags = pg->bp_flag;
}
pg_prev = pg;
+ if (CFS_FAIL_CHECK(OBD_FAIL_OSC_MARK_COMPRESSED))
+ niobuf->rnb_flags |= OBD_BRW_COMPRESSED;
}
LASSERTF((void *)(niobuf - niocount) ==
od_read_cache:1,
od_writethrough_cache:1,
od_nonrotational:1,
- od_enable_projid_xattr:1;
+ od_enable_projid_xattr:1,
+ od_extents_dense:1;
__u32 od_dirent_journal;
static int osd_ldiskfs_map_inode_pages(struct inode *inode,
struct osd_iobuf *iobuf,
struct osd_device *osd,
- int create, __u64 user_size,
+ const int create,
+ __u64 user_size,
int check_credits,
struct thandle *thandle)
{
sector_t *blocks = iobuf->dr_blocks;
struct niobuf_local *lnb1, *lnb2;
loff_t size1, size2;
+ bool compressed = false;
+ int flags = 0;
max_page_index = inode->i_sb->s_maxbytes >> PAGE_SHIFT;
CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n",
inode->i_ino, pages, (*lnbs)->lnb_page->index);
+ if (osd->od_extents_dense)
+ compressed = iobuf->dr_lnbs[0]->lnb_flags & OBD_BRW_COMPRESSED;
+
if (create) {
- create = LDISKFS_GET_BLOCKS_CREATE;
+ flags = LDISKFS_GET_BLOCKS_CREATE;
handle = ldiskfs_journal_current_handle();
LASSERT(handle != NULL);
rc = osd_attach_jinode(inode);
else
oh->oh_declared_ext--;
}
+#ifdef LDISKFS_GET_BLOCKS_VERY_DENSE
+ if (osd->od_extents_dense) {
+ if (CFS_FAIL_CHECK(OBD_FAIL_OSD_MARK_COMPRESSED))
+ flags |= LDISKFS_GET_BLOCKS_VERY_DENSE;
+ if (compressed)
+ flags |= LDISKFS_GET_BLOCKS_VERY_DENSE;
+ }
+#endif
time = ktime_get();
- rc = ldiskfs_map_blocks(handle, inode, &map, create);
+ rc = ldiskfs_map_blocks(handle, inode, &map, flags);
time = ktime_sub(ktime_get(), time);
if (rc >= 0) {
}
LUSTRE_RW_ATTR(index_backup);
+#ifdef LDISKFS_GET_BLOCKS_VERY_DENSE
+/*
+ * "extents_dense" attribute read handler.
+ *
+ * Resolves the osd_device that embeds @kobj and prints its
+ * od_extents_dense bit into @buf as a decimal number followed by a newline.
+ *
+ * Returns the value returned by snprintf().
+ */
+static ssize_t extents_dense_show(struct kobject *kobj, struct attribute *attr,
+				  char *buf)
+{
+	struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj);
+	struct osd_device *dev = osd_dt_dev(dt);
+	const int dense = dev->od_extents_dense;
+
+	return snprintf(buf, PAGE_SIZE, "%d\n", dense);
+}
+
+/*
+ * "extents_dense" attribute write handler.
+ *
+ * Parses @buffer as a boolean with kstrtobool() and stores the result in
+ * the od_extents_dense bit of the osd_device that embeds @kobj.
+ *
+ * Returns @count on success, or the kstrtobool() error code when @buffer
+ * cannot be parsed (the bit is left unchanged in that case).
+ */
+static ssize_t extents_dense_store(struct kobject *kobj, struct attribute *attr,
+				   const char *buffer, size_t count)
+{
+	struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj);
+	struct osd_device *dev = osd_dt_dev(dt);
+	bool enable;
+	int err = kstrtobool(buffer, &enable);
+
+	if (err)
+		return err;
+
+	dev->od_extents_dense = enable;
+	return count;
+}
+LUSTRE_RW_ATTR(extents_dense);
+#endif
+
struct ldebugfs_vars ldebugfs_osd_obd_vars[] = {
{ .name = "oi_scrub",
.fops = &ldiskfs_osd_oi_scrub_fops },
&lustre_attr_full_scrub_ratio.attr,
&lustre_attr_full_scrub_threshold_rate.attr,
&lustre_attr_extent_bytes_allocation.attr,
+#ifdef LDISKFS_GET_BLOCKS_VERY_DENSE
+ &lustre_attr_extents_dense.attr,
+#endif
NULL,
};
run_test iozone "iozone"
test_fsx() {
+ local fsx_layout="${fsx_STRIPEPARAMS:--c -1}"
local testfile=$DIR/f0.fsxfile
FSX_SIZE=$SIZE
FSX_COUNT=1000
$DEBUG_OFF
FSX_SEED=${FSX_SEED:-$RANDOM}
rm -f $testfile
- $LFS setstripe -c -1 $testfile
+ $LFS setstripe $fsx_layout $testfile ||
+ error "'setstripe $fsx_layout $testfile' failed"
CMD="$FSX -c 50 -p 1000 -S $FSX_SEED -P $TMP -l $FSX_SIZE \
-N $((FSX_COUNT * 100)) $FSXOPT $testfile"
echo "Using: $CMD"
}
run_test sanityn "Run sanityn with PFL layout"
+# Write the same sparse, direct-I/O pattern to a single-stripe file on OST0
+# twice: once with extents_dense=0 and once with extents_dense=1 plus
+# fail_loc 0x419 set on the client. The dense run must produce more than
+# three times fewer discontiguous extents. Afterwards the file is rewritten
+# with random data, ost1 is stopped, checked with e2fsck and restarted, and
+# the file content is compared against the source data.
+test_1000() {
+	local blocks=128
+	local dense=$(do_facet ost1 lctl get_param -n \
+			osd*.*OST0000*.extents_dense)
+	[[ -n $dense ]] || skip "no dense writes supported"
+
+	# keep the loop counter and the 'read' fields out of the global scope
+	local i PREV EX LS LE PS PE LEN DEV FLAGS
+
+	local osts=$(comma_list $(osts_nodes))
+	do_nodes $osts $LCTL set_param osd*.*.extents_dense=0 ||
+		error "cannot disable dense extent allocation"
+	stack_trap "do_nodes $osts $LCTL set_param osd*.*.extents_dense=$dense"
+
+	local tf=$DIR/$tfile
+	stack_trap "rm -f $tf"
+	log "create file with dense=0"
+
+	$LFS setstripe -c 1 -i 0 $tf
+	for ((i=0; i<$blocks; i++)); do
+		dd if=/dev/zero of=$tf bs=32k seek=$((i*2)) count=1 \
+			oflag=direct >&/dev/null conv=notrunc ||
+			error "can't dd (sparse)"
+	done
+	filefrag -sv $tf
+	# count the first extent plus every extent whose physical start does
+	# not immediately follow the physical end of the previous extent
+	local nonr=0
+	while read EX LS LE PS PE LEN DEV FLAGS; do
+		[[ "$EX" == "ext:" || "$EX" =~ "File" ]] && continue
+		[[ "$EX" == "0:" ]] && PREV=${PE%:} && ((nonr+=1)) && continue
+		(( ${PS%%.*} == PREV + 1 )) || ((nonr+=1))
+		PREV=${PE%:}
+	done < <(filefrag -v $tf)
+	(( nonr > 0 )) || error "no extents?"
+	rm -f $tf
+	wait_delete_completed
+
+	do_nodes $osts $LCTL set_param osd*.*.extents_dense=1 ||
+		error "cannot enable dense extent allocation"
+	# make sure fail_loc is cleared even if an error aborts the test
+	stack_trap "$LCTL set_param fail_loc=0"
+	#define OBD_FAIL_OSC_MARK_COMPRESSED 0x419
+	$LCTL set_param fail_loc=0x419
+	log "create file with dense=1"
+
+	$LFS setstripe -c 1 -i 0 $tf
+	for ((i=0; i<$blocks; i++)); do
+		dd if=/dev/zero of=$tf bs=32k seek=$((i*2)) count=1 \
+			oflag=direct conv=notrunc >&/dev/null ||
+			error "can't dd (dense)"
+	done
+	filefrag -sv $tf
+	# same counting as above, for the dense run
+	local nr=0
+	while read EX LS LE PS PE LEN DEV FLAGS; do
+		[[ "$EX" == "ext:" || "$EX" =~ "File" ]] && continue
+		[[ "$EX" == "0:" ]] && PREV=${PE%:} && ((nr+=1)) && continue
+		(( ${PS%%.*} == PREV + 1 )) || ((nr+=1))
+		PREV=${PE%:}
+	done < <(filefrag -v $tf)
+	(( nr > 0 )) || error "no extents?"
+
+	echo "dense ($nr) should have fewer extents ($nonr)"
+	(( (nonr / nr) > 3 )) ||
+		error "dense ($nr) should have less extents ($nonr)"
+	$LCTL set_param fail_loc=0
+
+	local tmpfile=$(mktemp)
+	stack_trap "rm -f $tmpfile"
+	echo "generate temp file $tmpfile"
+	dd if=/dev/urandom of=$tmpfile bs=32k count=$((blocks*2)) iflag=fullblock ||
+		error "can't generate temporary file"
+	# a failed rewrite would otherwise only show up later as a data mismatch
+	dd if=$tmpfile of=$tf bs=32k conv=notrunc ||
+		error "can't rewrite $tf"
+	cancel_lru_locks osc
+
+	stop ost1 || error "(2) Fail to stop ost1"
+	run_e2fsck $(facet_host ost1) $(ostdevname 1) "-y" ||
+		error "(3) Fail to run e2fsck error"
+	start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
+		error "(4) Fail to start ost1"
+
+	cmp $tmpfile $tf || error "data mismatch"
+}
+run_test 1000 "compressed vs uncompressed allocation"
+
+test_fsx() {
+ [[ "$ost1_FSTYPE" == "ldiskfs" ]] || skip "need ldiskfs backend"
+ local osts=$(comma_list $(osts_nodes))
+
+ local dense=$(do_facet ost1 lctl get_param -n \
+ osd*.*OST0000*.extents_dense)
+ [[ -n $dense ]] || skip "no dense writes supported"
+ do_nodes $osts $LCTL set_param osd*.*.extents_dense=1 ||
+ error "cannot enable dense extent allocation"
+ stack_trap "do_nodes $osts $LCTL set_param osd*.*.extents_dense=$dense"
+
+#define OBD_FAIL_OSD_MARK_COMPRESSED 0x2302
+ do_nodes $osts $LCTL set_param fail_loc=0x2302 ||
+ error "cannot force dense writes"
+ stack_trap "do_nodes $osts $LCTL set_param fail_loc=0"
+
+ fsx_STRIPEPARAMS="-E eof -c -1" ONLY=fsx FSX_COUNT=2500 SLOW=yes bash sanity-benchmark.sh
+
+ $DEBUG_ON
+}
+run_test fsx "verify dense writes with fsx on ldiskfs"
+
complete_test $SECONDS
check_and_cleanup_lustre
declare -a logs=($ONLY)