--- /dev/null
+--- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/ext4.h
++++ linux-4.18.0-80.1.2.el8_0/fs/ext4/ext4.h
+@@ -151,6 +151,7 @@ enum SHIFT_DIRECTION {
+ #define EXT4_MB_CR0_OPTIMIZED 0x8000
+ /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */
+ #define EXT4_MB_CR1_OPTIMIZED 0x00010000
++#define EXT4_MB_VERY_DENSE 0x00080000
+
+ struct ext4_allocation_request {
+ /* target inode for block we're allocating */
+@@ -627,6 +628,7 @@ enum {
+ /* Caller will submit data before dropping transaction handle. This
+ * allows jbd2 to avoid submitting data before commit. */
+ #define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400
++#define EXT4_GET_BLOCKS_VERY_DENSE 0x0800
+
+ /*
+ * The bit position of these flags must not overlap with any of the
+--- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/extents.c
++++ linux-4.18.0-80.1.2.el8_0/fs/ext4/extents.c
+@@ -4484,6 +4467,8 @@ int ext4_ext_map_blocks(handle_t *han
+ ar.flags = 0;
+ if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
+ ar.flags |= EXT4_MB_HINT_NOPREALLOC;
++ if (flags & EXT4_GET_BLOCKS_VERY_DENSE)
++ ar.flags |= EXT4_MB_VERY_DENSE;
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ ar.flags |= EXT4_MB_DELALLOC_RESERVED;
+ if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+--- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/mballoc.c
++++ linux-4.18.0-80.1.2.el8_0/fs/ext4/mballoc.c
+@@ -4267,6 +4291,21 @@ ext4_mb_use_inode_pa(struct ext4
+ ext4_fsblk_t end;
+ int len;
+
++ if (ac->ac_flags & EXT4_MB_VERY_DENSE) {
++ unsigned int len = ac->ac_o_ex.fe_len;
++ ext4_get_group_no_and_offset(ac->ac_sb,
++ pa->pa_pstart,
++ &ac->ac_b_ex.fe_group,
++ &ac->ac_b_ex.fe_start);
++ ac->ac_b_ex.fe_len = len;
++ pa->pa_pstart += len;
++ pa->pa_free -= len;
++ pa->pa_len -= len;
++ ac->ac_status = AC_STATUS_FOUND;
++ ac->ac_pa = pa;
++ return;
++ }
++
+ /* found preallocated blocks, use them */
+ start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
+ end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
+@@ -4367,6 +4380,24 @@ ext4_mb_use_preallocated(struct ext4
+ if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+ return false;
+
++ if (ac->ac_flags & EXT4_MB_VERY_DENSE) {
++ unsigned int len = ac->ac_o_ex.fe_len;
++ rcu_read_lock();
++ list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
++ spin_lock(&pa->pa_lock);
++ if (pa->pa_deleted == 0 && len <= pa->pa_free) {
++ atomic_inc(&pa->pa_count);
++ ext4_mb_use_inode_pa(ac, pa);
++ spin_unlock(&pa->pa_lock);
++ break;
++ }
++ spin_unlock(&pa->pa_lock);
++ }
++ rcu_read_unlock();
++ if (ac->ac_status == AC_STATUS_FOUND)
++ return true;
++ }
++
+ /* first, try per-file preallocation */
+ rcu_read_lock();
+ list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
rhel8/ext4-limit-per-inode-preallocation-list.patch
rhel8/ext4-mballoc-improve.patch
rhel8/ext4-mballoc-for-hybrid.patch
+rhel8/ext4-mballoc-dense.patch
#define OBD_FAIL_OSC_DELAY_CANCEL 0x416
#define OBD_FAIL_OSC_SLOW_PAGE_EVICT 0x417
#define OBD_FAIL_OSC_WRONG_COMP_ALG 0x418
+#define OBD_FAIL_OSC_MARK_COMPRESSED 0x419
#define OBD_FAIL_PTLRPC 0x500
#define OBD_FAIL_PTLRPC_ACK 0x501
niobuf->rnb_len = pg->count;
niobuf->rnb_flags = pg->flag;
}
+ if (CFS_FAIL_CHECK(OBD_FAIL_OSC_MARK_COMPRESSED))
+ niobuf->rnb_flags |= OBD_BRW_COMPRESSED;
pg_prev = pg;
}
o->od_nonrotational = 1;
osd_tune_nonrot(o, true);
}
+ o->od_extents_dense = 1;
GOTO(out, rc = 0);
od_writethrough_cache_enable_set:1,
od_nonrotational:1,
od_nonrotational_set:1,
- od_enable_projid_xattr:1;
+ od_enable_projid_xattr:1,
+ od_extents_dense:1;
__s64 od_auto_scrub_interval;
__u32 od_dirent_journal;
sector_t *blocks = iobuf->dr_blocks;
struct niobuf_local *lnb1, *lnb2;
loff_t size1, size2;
+ bool compressed;
max_page_index = inode->i_sb->s_maxbytes >> PAGE_SHIFT;
CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n",
inode->i_ino, pages, (*page)->index);
+ compressed = iobuf->dr_lnbs[0]->lnb_flags & OBD_BRW_COMPRESSED;
+
if (create) {
create = LDISKFS_GET_BLOCKS_CREATE;
handle = ldiskfs_journal_current_handle();
else
oh->oh_declared_ext--;
}
+#ifdef LDISKFS_GET_BLOCKS_VERY_DENSE
+ if (osd->od_extents_dense && compressed)
+ create |= LDISKFS_GET_BLOCKS_VERY_DENSE;
+#endif
rc = ldiskfs_map_blocks(handle, inode, &map, create);
if (rc >= 0) {
int c = 0;
}
LUSTRE_RW_ATTR(index_backup);
+#ifdef LDISKFS_GET_BLOCKS_VERY_DENSE
+/*
+ * sysfs "extents_dense" read handler.
+ *
+ * Prints the od_extents_dense flag of the OSD device that embeds @kobj
+ * as a decimal number followed by a newline.
+ *
+ * Returns the number of bytes placed in @buf.
+ */
+static ssize_t extents_dense_show(struct kobject *kobj, struct attribute *attr,
+				  char *buf)
+{
+	struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj);
+	struct osd_device *osd = osd_dt_dev(dt);
+
+	/* bound the output explicitly rather than using a raw sprintf() */
+	return scnprintf(buf, PAGE_SIZE, "%d\n", osd->od_extents_dense);
+}
+
+/*
+ * sysfs "extents_dense" write handler.
+ *
+ * Parses @buffer with kstrtobool() and stores the result in the
+ * od_extents_dense flag of the OSD device that embeds @kobj.
+ *
+ * Returns @count on success, or the kstrtobool() error code when the
+ * input cannot be parsed (the flag is left untouched in that case).
+ */
+static ssize_t extents_dense_store(struct kobject *kobj, struct attribute *attr,
+				   const char *buffer, size_t count)
+{
+	struct dt_device *dt_dev = container_of(kobj, struct dt_device,
+						dd_kobj);
+	struct osd_device *osd_dev = osd_dt_dev(dt_dev);
+	bool enable;
+	int err = kstrtobool(buffer, &enable);
+
+	if (err != 0)
+		return err;
+
+	osd_dev->od_extents_dense = enable;
+	return count;
+}
+LUSTRE_RW_ATTR(extents_dense);
+#endif
+
struct ldebugfs_vars ldebugfs_osd_obd_vars[] = {
{ .name = "oi_scrub",
.fops = &ldiskfs_osd_oi_scrub_fops },
&lustre_attr_full_scrub_ratio.attr,
&lustre_attr_full_scrub_threshold_rate.attr,
&lustre_attr_extent_bytes_allocation.attr,
+#ifdef LDISKFS_GET_BLOCKS_VERY_DENSE
+ &lustre_attr_extents_dense.attr,
+#endif
NULL,
};
}
run_test sanityn "Run sanityn with PFL layout"
+# Compare extent fragmentation of a sparse direct-IO write pattern with
+# extents_dense=0 against the same pattern written with extents_dense=1
+# while OBD_FAIL_OSC_MARK_COMPRESSED is set, then overwrite the file with
+# random data, run e2fsck on ost1 and verify the data.
+test_1000() {
+	local blocks=128
+	local dense=$(do_facet ost1 lctl get_param -n \
+			osd*.*OST0000*.extents_dense)
+	[[ -n $dense ]] || skip "no dense writes supported"
+
+	# loop/parse state is local so it cannot leak into other tests
+	local i prev
+	local ext lstart lend pstart pend len dev flags
+
+	local osts=$(comma_list $(osts_nodes))
+	do_nodes $osts $LCTL set_param osd*.*.extents_dense=0 ||
+		error "cannot disable dense extent allocation"
+	stack_trap "do_nodes $osts $LCTL set_param osd*.*.extents_dense=$dense"
+
+	local tf=$DIR/$tfile
+	stack_trap "rm -f $tf"
+	log "create file with dense=0"
+
+	$LFS setstripe -c 1 -i 0 $tf
+	for ((i=0; i<$blocks; i++)); do
+		dd if=/dev/zero of=$tf bs=32k seek=$((i*2)) count=1 \
+			oflag=direct >&/dev/null conv=notrunc ||
+			error "can't dd (sparse)"
+	done
+	filefrag -sv $tf
+	# count extents that do not physically follow the previous one
+	local nonr=0
+	while read ext lstart lend pstart pend len dev flags; do
+		[[ "$ext" == "ext:" || "$ext" =~ "File" ]] && continue
+		[[ "$ext" == "0:" ]] && prev=${pend%:} && ((nonr+=1)) && continue
+		(( ${pstart%%.*} == prev + 1 )) || ((nonr+=1))
+		prev=${pend%:}
+	done < <(filefrag -v $tf)
+	(( nonr > 0 )) || error "no extents?"
+	rm -f $tf
+	wait_delete_completed
+
+	do_nodes $osts $LCTL set_param osd*.*.extents_dense=1 ||
+		error "cannot enable dense extent allocation"
+	# make sure fail_loc is cleared even if the test errors out below
+	stack_trap "$LCTL set_param fail_loc=0"
+	#define OBD_FAIL_OSC_MARK_COMPRESSED	0x419
+	$LCTL set_param fail_loc=0x419
+	log "create file with dense=1"
+
+	$LFS setstripe -c 1 -i 0 $tf
+	for ((i=0; i<$blocks; i++)); do
+		dd if=/dev/zero of=$tf bs=32k seek=$((i*2)) count=1 \
+			oflag=direct conv=notrunc >&/dev/null ||
+			error "can't dd (dense)"
+	done
+	filefrag -sv $tf
+	local nr=0
+	while read ext lstart lend pstart pend len dev flags; do
+		[[ "$ext" == "ext:" || "$ext" =~ "File" ]] && continue
+		[[ "$ext" == "0:" ]] && prev=${pend%:} && ((nr+=1)) && continue
+		(( ${pstart%%.*} == prev + 1 )) || ((nr+=1))
+		prev=${pend%:}
+	done < <(filefrag -v $tf)
+	(( nr > 0 )) || error "no extents?"
+
+	echo "dense ($nr) should have fewer extents ($nonr)"
+	(( (nonr / nr) > 3 )) ||
+		error "dense ($nr) should have less extents ($nonr)"
+	$LCTL set_param fail_loc=0
+
+	local tmpfile=$(mktemp)
+	stack_trap "rm -f $tmpfile"
+	echo "generate temp file $tmpfile"
+	dd if=/dev/urandom of=$tmpfile bs=32k count=$((blocks*2)) iflag=fullblock ||
+		error "can't generate temporary file"
+	# this write produces the data verified by cmp below, so it must succeed
+	dd if=$tmpfile of=$tf bs=32k conv=notrunc ||
+		error "can't overwrite $tf with random data"
+	cancel_lru_locks osc
+
+	stop ost1 || error "(2) Fail to stop ost1"
+	run_e2fsck $(facet_host ost1) $(ostdevname 1) "-y" ||
+		error "(3) Fail to run e2fsck error"
+	start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
+		error "(4) Fail to start ost1"
+
+	cmp $tmpfile $tf || error "data mismatch"
+}
+run_test 1000 "compressed vs uncompressed allocation"
+
complete_test $SECONDS
check_and_cleanup_lustre
declare -a logs=($ONLY)