From 452f102a581f2a8ef8396bf0ba5584d61512a267 Mon Sep 17 00:00:00 2001
From: Bobi Jam
Date: Mon, 10 Jul 2023 19:40:34 +0800
Subject: [PATCH] LU-16750 ldiskfs: optimize metadata allocation for hybrid LUNs

With LVM it is possible to create an LV with SSD storage at the
beginning of the LV and HDD storage at the end of the LV, and use that
to separate ext4 metadata allocations (that need small random IOs)
from data allocations (that are better suited for large sequential
IOs) depending on the type of underlying storage. Between 0.5-1.0% of
the filesystem capacity would need to be high-IOPS storage in order to
hold all of the internal metadata.

This would improve performance for inode and other metadata access,
such as ls, find, and e2fsck, and in general improve the latency of
file access, modification, truncate, unlink, transaction commit, etc.

This patch splits the largest-free-order group lists and the
average-fragment-size group lists into two sets, adding separate lists
for the IOPS/fast storage groups, and makes the cr 0 / cr 1 group
scanning for metadata block allocation search in the following order:

if (allocating metadata blocks)
    if (cr == 0)
        try to find a group in the largest-free-order IOPS group lists
    if (cr == 1, or no group was found in the largest-free-order IOPS
        group lists)
        try to find a group in the average-fragment-size IOPS group
        lists
    if (both of the above failed)
        fall through to the normal group lists as before
if (allocating data blocks)
    try to find a group in the normal group lists as before
    if (no group was found in the normal group lists)
        try to find a group in the IOPS groups

Data (non-metadata) block allocation therefore only falls back to the
IOPS groups once the non-IOPS groups are used up.

An option is added to mke2fs to mark which blocks are in the IOPS
region of the storage at format time:

    -E iops=0-1024G,4096-8192G

so that the ext4 mballoc code can then use the EXT4_BG_IOPS flag in
the group descriptors to decide from which groups to allocate dynamic
filesystem metadata.

--
v2->v3: add a sysfs mb_enable_iops attribute to enable/disable this
        feature
v1->v2: for metadata block allocation, search the IOPS lists and then
        the normal lists; for data block allocation, search the normal
        lists and then the IOPS lists.

        Also try to create new inodes from IOPS groups first.

Signed-off-by: Bobi Jam
Change-Id: Ice2d25b8db19f67e70690f9ccebc419f253b12bd
---
 .../patches/rhel8/ext4-mballoc-for-hybrid.patch    | 636 +++++++++++++++++++++
 .../series/ldiskfs-4.18-rhel8.8.series             |   1 +
 2 files changed, 637 insertions(+)
 create mode 100644 ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-for-hybrid.patch

diff --git a/ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-for-hybrid.patch b/ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-for-hybrid.patch
new file mode 100644
index 0000000..c2dc996
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/rhel8/ext4-mballoc-for-hybrid.patch
@@ -0,0 +1,636 @@
+With LVM it is possible to create an LV with SSD storage at the
+beginning of the LV and HDD storage at the end of the LV, and use that
+to separate ext4 metadata allocations (that need small random IOs)
+from data allocations (that are better suited for large sequential
+IOs) depending on the type of underlying storage. Between 0.5-1.0% of
+the filesystem capacity would need to be high-IOPS storage in order to
+hold all of the internal metadata.
+
+This would improve performance for inode and other metadata access,
+such as ls, find, e2fsck, and in general improve file access latency,
+modification, truncate, unlink, transaction commit, etc.
+ +This patch split largest free order group lists and average fragment +size lists into other two lists for IOPS/fast storage groups, and +cr 0 / cr 1 group scanning for metadata block allocation in following +order: + +if (allocate metadata blocks) + if (cr ==0) try to find group in largest free order IOPS group + list + if (cr ==1 or failed to find group in largest free order IOPS + group) + try to find group in fragment size IOPS group list + if (above two find failed) + fall through normal group lists as before +if (allocate data blocks) + try to find group in normal group lists as before + if (failed to find group in normal group) + try to find group in IOPS groups + +Non-metadata block allocation does not allocate from the IOPS groups +if non-IOPS groups are not used up. + +Add for mke2fs an option to mark which blocks are in the IOPS region +of storage at format time: + +-E iops=0-1024G,4096-8192G + +so the ext4 mballoc code can then use the EXT4_BG_IOPS flag in the +group descriptors to decide which groups to allocate dynamic +filesystem metadata. + +v2->v3: add sysfs mb_enable_iops to disable/enable this feature +v1->v2: for metadata block allocation, search in IOPS list then normal + list; for data block allocation, search in normal list then + IOPS list. + + Try to create new inode from IOPS group. + +Index: linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h ++++ linux-stage/fs/ext4/ext4.h +@@ -361,6 +361,7 @@ struct flex_groups { + #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ + #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ + #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ ++#define EXT4_BG_IOPS 0x0010 /* In IOPS/fast storage */ + + /* + * Macro-instructions used to manage group descriptors +@@ -1128,6 +1129,8 @@ struct ext4_inode_info { + #define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ + #define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + ++#define EXT2_FLAGS_HAS_IOPS 0x0080 /* has IOPS storage */ ++ + /* + * Mount flags set via mount options or defaults + */ +@@ -1487,8 +1490,12 @@ struct ext4_sb_info { + after commit completed */ + struct list_head *s_mb_avg_fragment_size; + rwlock_t *s_mb_avg_fragment_size_locks; ++ struct list_head *s_avg_fragment_size_list_iops; /* avg_frament_size for IOPS groups */ ++ rwlock_t *s_avg_fragment_size_locks_iops; + struct list_head *s_mb_largest_free_orders; + rwlock_t *s_mb_largest_free_orders_locks; ++ struct list_head *s_largest_free_orders_list_iops; /* largest_free_orders for IOPS grps */ ++ rwlock_t *s_largest_free_orders_locks_iops; + + /* tunables */ + unsigned long s_stripe; +@@ -1512,6 +1519,7 @@ struct ext4_sb_info { + unsigned long s_mb_last_start; + unsigned int s_mb_prefetch; + unsigned int s_mb_prefetch_limit; ++ unsigned int s_mb_enable_iops; + + /* stats for buddy allocator */ + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ +@@ -2733,6 +2741,7 @@ extern int ext4_group_add_blocks(handle_ + ext4_fsblk_t block, unsigned long count); + extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); + extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); ++extern void ext4_mb_disable_iops(struct ext4_sb_info *sbi); + + /* inode.c */ + #define HAVE_LDISKFS_INFO_JINODE +@@ -3234,6 +3243,7 @@ struct ext4_group_info { + #define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ + (1 << 
EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) + #define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4 ++#define EXT4_GROUP_INFO_IOPS_BIT 5 + + #define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +@@ -3252,6 +3262,10 @@ struct ext4_group_info { + (test_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state))) + #define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \ + (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state))) ++#define EXT4_MB_GRP_TEST_IOPS(grp) \ ++ (test_bit(EXT4_GROUP_INFO_IOPS_BIT, &((grp)->bb_state))) ++#define EXT4_MB_GRP_SET_IOPS(grp) \ ++ (set_bit(EXT4_GROUP_INFO_IOPS_BIT, &((grp)->bb_state))) + + #define EXT4_MAX_CONTENTION 8 + #define EXT4_CONTENTION_THRESHOLD 2 +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -828,6 +828,8 @@ static void + mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); ++ rwlock_t *afs_locks; ++ struct list_head *afs_list; + int new_order; + + if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) +@@ -838,20 +840,23 @@ mb_update_avg_fragment_size(struct super + if (new_order == grp->bb_avg_fragment_size_order) + return; + ++ if (sbi->s_mb_enable_iops && EXT4_MB_GRP_TEST_IOPS(grp)) { ++ afs_locks = sbi->s_avg_fragment_size_locks_iops; ++ afs_list = sbi->s_avg_fragment_size_list_iops; ++ } else { ++ afs_locks = sbi->s_mb_avg_fragment_size_locks; ++ afs_list = sbi->s_mb_avg_fragment_size; ++ } ++ + if (grp->bb_avg_fragment_size_order != -1) { +- write_lock(&sbi->s_mb_avg_fragment_size_locks[ +- grp->bb_avg_fragment_size_order]); ++ write_lock(&afs_locks[grp->bb_avg_fragment_size_order]); + list_del(&grp->bb_avg_fragment_size_node); +- write_unlock(&sbi->s_mb_avg_fragment_size_locks[ +- grp->bb_avg_fragment_size_order]); ++ write_unlock(&afs_locks[grp->bb_avg_fragment_size_order]); + } + grp->bb_avg_fragment_size_order = new_order; +- write_lock(&sbi->s_mb_avg_fragment_size_locks[ +- grp->bb_avg_fragment_size_order]); +- list_add_tail(&grp->bb_avg_fragment_size_node, +- &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]); +- write_unlock(&sbi->s_mb_avg_fragment_size_locks[ +- grp->bb_avg_fragment_size_order]); ++ write_lock(&afs_locks[new_order]); ++ list_add_tail(&grp->bb_avg_fragment_size_node, &afs_list[new_order]); ++ write_unlock(&afs_locks[new_order]); + } + + /* +@@ -986,6 +991,95 @@ inc_and_return: + return group + 1 >= ngroups ? 
0 : group + 1; + } + ++static bool ext4_mb_choose_next_iops_group_cr0( ++ struct ext4_allocation_context *ac, ext4_group_t *group) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ++ struct ext4_group_info *iter, *grp; ++ int i; ++ ++ if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED)) ++ atomic_inc(&sbi->s_bal_cr0_bad_suggestions); ++ ++ grp = NULL; ++ for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { ++ if (list_empty(&sbi->s_largest_free_orders_list_iops[i])) ++ continue; ++ read_lock(&sbi->s_largest_free_orders_locks_iops[i]); ++ if (list_empty(&sbi->s_largest_free_orders_list_iops[i])) { ++ read_unlock(&sbi->s_largest_free_orders_locks_iops[i]); ++ continue; ++ } ++ grp = NULL; ++ list_for_each_entry(iter, ++ &sbi->s_largest_free_orders_list_iops[i], ++ bb_largest_free_order_node) { ++ if (sbi->s_mb_stats) ++ atomic64_inc(&sbi->s_bal_cX_groups_considered[0]); ++ if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) { ++ grp = iter; ++ break; ++ } ++ } ++ read_unlock(&sbi->s_largest_free_orders_locks_iops[i]); ++ if (grp) ++ break; ++ } ++ ++ if (grp) { ++ *group = grp->bb_group; ++ ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; ++ return true; ++ } ++ ++ return false; ++} ++ ++static bool ext4_mb_choose_next_iops_group_cr1( ++ struct ext4_allocation_context *ac, ext4_group_t *group) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ++ struct ext4_group_info *grp = NULL, *iter; ++ int i; ++ ++ if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { ++ if (sbi->s_mb_stats) ++ atomic_inc(&sbi->s_bal_cr1_bad_suggestions); ++ } ++ ++ for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); ++ i < MB_NUM_ORDERS(ac->ac_sb); i++) { ++ if (list_empty(&sbi->s_avg_fragment_size_list_iops[i])) ++ continue; ++ read_lock(&sbi->s_avg_fragment_size_locks_iops[i]); ++ if (list_empty(&sbi->s_avg_fragment_size_list_iops[i])) { ++ read_unlock(&sbi->s_avg_fragment_size_locks_iops[i]); ++ continue; ++ } ++ list_for_each_entry(iter, ++ &sbi->s_avg_fragment_size_list_iops[i], ++ bb_avg_fragment_size_node) { ++ if (sbi->s_mb_stats) ++ atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); ++ if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) { ++ grp = iter; ++ break; ++ } ++ } ++ read_unlock(&sbi->s_avg_fragment_size_locks_iops[i]); ++ if (grp) ++ break; ++ } ++ ++ if (grp) { ++ *group = grp->bb_group; ++ ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; ++ return true; ++ } ++ ++ return false; ++} ++ + /* + * ext4_mb_choose_next_group: choose next group for allocation. + * +@@ -1002,6 +1096,12 @@ inc_and_return: + static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) + { ++ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ++ bool alloc_metadata = ac->ac_flags & EXT4_MB_HINT_METADATA; ++ bool has_iops = sbi->s_mb_enable_iops && ++ ac->ac_sb->s_flags & EXT2_FLAGS_HAS_IOPS; ++ bool ret = false; ++ + *new_cr = ac->ac_criteria; + + if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) { +@@ -1009,12 +1109,36 @@ static void ext4_mb_choose_next_group(st + return; + } + ++ if (has_iops && alloc_metadata) { ++ if (*new_cr == 0) ++ ret = ext4_mb_choose_next_iops_group_cr0(ac, group); ++ if (!ret && *new_cr < 2) ++ ret = ext4_mb_choose_next_iops_group_cr1(ac, group); ++ if (ret) ++ return; ++ /* ++ * Cannot get metadata group from IOPS storage, fall through ++ * to slow storage. 
++ */ ++ cond_resched(); ++ } ++ + if (*new_cr == 0) { + ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); + } else if (*new_cr == 1) { + ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); + } else { + /* ++ * Cannot get data group from slow storage, try IOPS storage ++ */ ++ if (has_iops && !alloc_metadata && *new_cr == 2) { ++ if (ac->ac_2order) ++ ret = ext4_mb_choose_next_iops_group_cr0(ac, ++ group); ++ if (!ret) ++ ext4_mb_choose_next_iops_group_cr1(ac, group); ++ } ++ /* + * TODO: For CR=2, we can arrange groups in an rb tree sorted by + * bb_free. But until that happens, we should never come here. + */ +@@ -1030,6 +1154,8 @@ static void + mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); ++ rwlock_t *lfo_locks; ++ struct list_head *lfo_list; + int i; + + for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) +@@ -1042,21 +1168,24 @@ mb_set_largest_free_order(struct super_b + return; + } + ++ if (sbi->s_mb_enable_iops && EXT4_MB_GRP_TEST_IOPS(grp)) { ++ lfo_locks = sbi->s_largest_free_orders_locks_iops; ++ lfo_list = sbi->s_largest_free_orders_list_iops; ++ } else { ++ lfo_locks = sbi->s_mb_largest_free_orders_locks; ++ lfo_list = sbi->s_mb_largest_free_orders; ++ } ++ + if (grp->bb_largest_free_order >= 0) { +- write_lock(&sbi->s_mb_largest_free_orders_locks[ +- grp->bb_largest_free_order]); ++ write_lock(&lfo_locks[grp->bb_largest_free_order]); + list_del_init(&grp->bb_largest_free_order_node); +- write_unlock(&sbi->s_mb_largest_free_orders_locks[ +- grp->bb_largest_free_order]); ++ write_unlock(&lfo_locks[grp->bb_largest_free_order]); + } + grp->bb_largest_free_order = i; + if (grp->bb_largest_free_order >= 0 && grp->bb_free) { +- write_lock(&sbi->s_mb_largest_free_orders_locks[ +- grp->bb_largest_free_order]); +- list_add_tail(&grp->bb_largest_free_order_node, +- &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]); +- write_unlock(&sbi->s_mb_largest_free_orders_locks[ +- grp->bb_largest_free_order]); ++ write_lock(&lfo_locks[i]); ++ list_add_tail(&grp->bb_largest_free_order_node, &lfo_list[i]); ++ write_unlock(&lfo_locks[i]); + } + } + +@@ -2209,6 +2338,16 @@ int ext4_mb_find_by_goal(struct ext4_all + return 0; + } + ++ if (sbi->s_mb_enable_iops && ac->ac_sb->s_flags & EXT2_FLAGS_HAS_IOPS) { ++ if ((ac->ac_flags & EXT4_MB_HINT_METADATA && ++ !EXT4_MB_GRP_TEST_IOPS(e4b->bd_info)) || ++ (!(ac->ac_flags & EXT4_MB_HINT_METADATA) && ++ EXT4_MB_GRP_TEST_IOPS(e4b->bd_info))) { ++ ext4_mb_unload_buddy(e4b); ++ return 0; ++ } ++ } ++ + ext4_lock_group(ac->ac_sb, group); + max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, + ac->ac_g_ex.fe_len, &ex); +@@ -3251,6 +3390,8 @@ int ext4_mb_add_groupinfo(struct super_b + INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + init_rwsem(&meta_group_info[i]->alloc_sem); + meta_group_info[i]->bb_free_root = RB_ROOT; ++ if (desc->bg_flags & EXT4_BG_IOPS) ++ EXT4_MB_GRP_SET_IOPS(meta_group_info[i]); + INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); + INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node); + meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ +@@ -3464,6 +3605,24 @@ int ext4_mb_init(struct super_block *sb) + INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]); + rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]); + } ++ sbi->s_avg_fragment_size_list_iops = ++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), ++ GFP_KERNEL); ++ if (!sbi->s_avg_fragment_size_list_iops) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ 
sbi->s_avg_fragment_size_locks_iops = ++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), ++ GFP_KERNEL); ++ if (!sbi->s_avg_fragment_size_locks_iops) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ for (i = 0; i < MB_NUM_ORDERS(sb); i++) { ++ INIT_LIST_HEAD(&sbi->s_avg_fragment_size_list_iops[i]); ++ rwlock_init(&sbi->s_avg_fragment_size_locks_iops[i]); ++ } + sbi->s_mb_largest_free_orders = + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), + GFP_KERNEL); +@@ -3482,6 +3641,24 @@ int ext4_mb_init(struct super_block *sb) + INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); + rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); + } ++ sbi->s_largest_free_orders_list_iops = ++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), ++ GFP_KERNEL); ++ if (!sbi->s_largest_free_orders_list_iops) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ sbi->s_largest_free_orders_locks_iops = ++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), ++ GFP_KERNEL); ++ if (!sbi->s_largest_free_orders_locks_iops) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ for (i = 0; i < MB_NUM_ORDERS(sb); i++) { ++ INIT_LIST_HEAD(&sbi->s_largest_free_orders_list_iops[i]); ++ rwlock_init(&sbi->s_largest_free_orders_locks_iops[i]); ++ } + + spin_lock_init(&sbi->s_md_lock); + sbi->s_mb_free_pending = 0; +@@ -3566,6 +3743,8 @@ int ext4_mb_init(struct super_block *sb) + if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb)) + sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb); + ++ sbi->s_mb_enable_iops = 1; ++ + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); + if (sbi->s_locality_groups == NULL) { + ret = -ENOMEM; +@@ -3597,8 +3776,12 @@ out_free_locality_groups: + out: + kfree(sbi->s_mb_avg_fragment_size); + kfree(sbi->s_mb_avg_fragment_size_locks); ++ kfree(sbi->s_avg_fragment_size_list_iops); ++ kfree(sbi->s_avg_fragment_size_locks_iops); + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); ++ kfree(sbi->s_largest_free_orders_list_iops); ++ kfree(sbi->s_largest_free_orders_locks_iops); + kfree(sbi->s_mb_prealloc_table); + kfree(sbi->s_mb_offsets); + sbi->s_mb_offsets = NULL; +@@ -3658,8 +3841,12 @@ int ext4_mb_release(struct super_block * + } + kfree(sbi->s_mb_avg_fragment_size); + kfree(sbi->s_mb_avg_fragment_size_locks); ++ kfree(sbi->s_avg_fragment_size_list_iops); ++ kfree(sbi->s_avg_fragment_size_locks_iops); + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); ++ kfree(sbi->s_largest_free_orders_list_iops); ++ kfree(sbi->s_largest_free_orders_locks_iops); + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); + iput(sbi->s_buddy_cache); +@@ -6485,3 +6672,46 @@ out_unload: + + return error; + } ++ ++void ext4_mb_disable_iops(struct ext4_sb_info *sbi) ++{ ++ struct super_block *sb = sbi->s_sb; ++ struct ext4_group_info *iter; ++ int i; ++ ++ for (i = 0; i < MB_NUM_ORDERS(sb); i++) { ++ if (list_empty(&sbi->s_largest_free_orders_list_iops[i])) ++ continue; ++ write_lock(&sbi->s_largest_free_orders_locks_iops[i]); ++ if (list_empty(&sbi->s_largest_free_orders_list_iops[i])) { ++ write_unlock(&sbi->s_largest_free_orders_locks_iops[i]); ++ continue; ++ } ++ write_lock(&sbi->s_mb_largest_free_orders_locks[i]); ++ list_for_each_entry(iter, ++ &sbi->s_largest_free_orders_list_iops[i], ++ bb_largest_free_order_node) ++ list_move(&iter->bb_largest_free_order_node, ++ &sbi->s_mb_largest_free_orders[i]); ++ ++ write_unlock(&sbi->s_mb_largest_free_orders_locks[i]); ++ write_unlock(&sbi->s_largest_free_orders_locks_iops[i]); ++ } ++ for (i = 0; i < 
MB_NUM_ORDERS(sb); i++) { ++ if (list_empty(&sbi->s_avg_fragment_size_list_iops[i])) ++ continue; ++ write_lock(&sbi->s_avg_fragment_size_locks_iops[i]); ++ if (list_empty(&sbi->s_avg_fragment_size_list_iops[i])) { ++ write_unlock(&sbi->s_avg_fragment_size_locks_iops[i]); ++ continue; ++ } ++ write_lock(&sbi->s_mb_avg_fragment_size_locks[i]); ++ list_for_each_entry(iter, ++ &sbi->s_avg_fragment_size_list_iops[i], ++ bb_avg_fragment_size_node) ++ list_move(&iter->bb_avg_fragment_size_node, ++ &sbi->s_mb_avg_fragment_size[i]); ++ write_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); ++ write_unlock(&sbi->s_avg_fragment_size_locks_iops[i]); ++ } ++} +Index: linux-stage/fs/ext4/balloc.c +=================================================================== +--- linux-stage.orig/fs/ext4/balloc.c ++++ linux-stage/fs/ext4/balloc.c +@@ -676,7 +676,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle + ar.inode = inode; + ar.goal = goal; + ar.len = count ? *count : 1; +- ar.flags = flags; ++ ar.flags = flags | EXT4_MB_HINT_METADATA; + + ret = ext4_mb_new_blocks(handle, &ar, errp); + if (count) +Index: linux-stage/fs/ext4/ialloc.c +=================================================================== +--- linux-stage.orig/fs/ext4/ialloc.c ++++ linux-stage/fs/ext4/ialloc.c +@@ -778,6 +778,7 @@ struct inode *__ext4_new_inode(handle_t + ext4_group_t flex_group; + struct ext4_group_info *grp; + int encrypt = 0; ++ bool try_iops = false; + + /* Cannot create files in a deleted directory */ + if (!dir || !dir->i_nlink) +@@ -900,7 +901,13 @@ got_group: + * Normally we will only go through one pass of this loop, + * unless we get unlucky and it turns out the group we selected + * had its last inode grabbed by someone else. ++ * ++ * We'd try to get IOPS group for inode unless it has been used up. 
+ */ ++ if (sbi->s_mb_enable_iops && sb->s_flags & EXT2_FLAGS_HAS_IOPS) ++ try_iops = true; ++ ++repeat: + for (i = 0; i < ngroups; i++, ino = 0) { + err = -EIO; + +@@ -919,6 +926,9 @@ got_group: + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + goto next_group; + ++ if (try_iops && !EXT4_MB_GRP_TEST_IOPS(grp)) ++ goto next_group; ++ + brelse(inode_bitmap_bh); + inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); + /* Skip groups with suspicious inode tables */ +@@ -983,6 +993,11 @@ next_group: + if (++group == ngroups) + group = 0; + } ++ if (try_iops) { ++ try_iops = false; ++ goto repeat; ++ } ++ + err = -ENOSPC; + goto out; + +Index: linux-stage/fs/ext4/extents.c +=================================================================== +--- linux-stage.orig/fs/ext4/extents.c ++++ linux-stage/fs/ext4/extents.c +@@ -4635,7 +4635,7 @@ int ext4_ext_map_blocks(handle_t *handle + ar.flags = EXT4_MB_HINT_DATA; + else + /* disable in-core preallocation for non-regular files */ +- ar.flags = 0; ++ ar.flags = EXT4_MB_HINT_METADATA; + if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) + ar.flags |= EXT4_MB_HINT_NOPREALLOC; + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) +Index: linux-stage/fs/ext4/sysfs.c +=================================================================== +--- linux-stage.orig/fs/ext4/sysfs.c ++++ linux-stage/fs/ext4/sysfs.c +@@ -237,6 +237,7 @@ EXT4_ATTR(last_error_time, 0444, last_er + EXT4_ATTR(journal_task, 0444, journal_task); + EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch); + EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit); ++EXT4_RW_ATTR_SBI_UI(mb_enable_iops, s_mb_enable_iops); + + static unsigned int old_bump_val = 128; + EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); +@@ -277,6 +278,7 @@ static struct attribute *ext4_attrs[] = + ATTR_LIST(journal_task), + ATTR_LIST(mb_prefetch), + ATTR_LIST(mb_prefetch_limit), ++ ATTR_LIST(mb_enable_iops), + NULL, + }; + +@@ -288,6 +290,7 @@ EXT4_ATTR_FEATURE(meta_bg_resize); + EXT4_ATTR_FEATURE(encryption); + #endif + EXT4_ATTR_FEATURE(metadata_csum_seed); ++EXT4_ATTR_FEATURE(iops); + + static struct attribute *ext4_feat_attrs[] = { + ATTR_LIST(lazy_itable_init), +@@ -297,6 +300,7 @@ static struct attribute *ext4_feat_attrs + ATTR_LIST(encryption), + #endif + ATTR_LIST(metadata_csum_seed), ++ ATTR_LIST(iops), + NULL, + }; + +@@ -408,6 +412,12 @@ static ssize_t ext4_attr_store(struct ko + *((__le32 *) ptr) = cpu_to_le32(t); + else + *((unsigned int *) ptr) = t; ++ ++ if (a->attr_ptr == ptr_ext4_sb_info_offset && ++ a->u.offset == offsetof(struct ext4_sb_info, ++ s_mb_enable_iops) && ++ t == 0) ++ ext4_mb_disable_iops(sbi); + return len; + case attr_inode_readahead: + return inode_readahead_blks_store(sbi, buf, len); diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series index c4f4ad6..d6daa1b 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series @@ -38,3 +38,4 @@ rhel8/ext4-old_ea_inodes_handling_fix.patch rhel8.4/ext4-optimize-find_delayed_extent.patch rhel8/ext4-encdata.patch rhel8/ext4-mballoc-improve.patch +rhel8/ext4-mballoc-for-hybrid.patch -- 1.8.3.1
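A minimal usage sketch for the feature, assuming a hypothetical hybrid
LV at /dev/mapper/hybrid_lv, an mke2fs built with the companion
-E iops support described above, and the sysfs root of an ldiskfs
build (/sys/fs/ldiskfs/<dev>/; a stock ext4 build would expose the
same attributes under /sys/fs/ext4/<dev>/):

    # mark the SSD extents of the LV as the IOPS region at format
    # time, using the range syntax from the commit message
    mke2fs -t ext4 -E iops=0-1024G,4096-8192G /dev/mapper/hybrid_lv

    # check that the running module advertises the iops feature
    test -e /sys/fs/ldiskfs/features/iops && echo "iops supported"

    # runtime toggle added in v3: writing 0 disables IOPS-aware
    # allocation and moves the IOPS groups back onto the normal lists
    echo 0 > /sys/fs/ldiskfs/<dev>/mb_enable_iops

Writing 1 to mb_enable_iops re-enables IOPS-aware allocation, but an
IOPS group only moves back onto the IOPS lists the next time
mb_set_largest_free_order() or mb_update_avg_fragment_size()
repositions it.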