From 1a0f7f0b9c13ef0aa86e125f350b6733bff8db3c Mon Sep 17 00:00:00 2001
-From: Shaun Tancheff <stancheff@cray.com>
+From: Liang Zhen <liang.zhen@intel.com>
Date: Wed, 15 Jan 2020 07:35:13 -0600
-Subject: [PATCH] Single directory performance is a critical for HPC workloads.
+Subject: [PATCH] LU-50 ldiskfs: parallel directory operations for ext4
+
In a typical use case an application creates a separate output file for each
node and task in a job. As nodes and tasks increase, hundreds of thousands of
files may be created in a single directory within a short window of time.
This patch contains:
- pdirops support for ldiskfs
- integrate with osd-ldiskfs
+Signed-off-by: Liang Zhen <liang.zhen@intel.com>
+Change-Id: I269c0e3112e68f3acd79e860dab052a68c7d7aaa
---
fs/ext4/Makefile | 1 +
fs/ext4/ext4.h | 78 ++++
create mode 100644 fs/ext4/htree_lock.c
create mode 100644 include/linux/htree_lock.h
-diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
-index b17ddc2..45a68cb 100644
---- a/fs/ext4/Makefile
-+++ b/fs/ext4/Makefile
-@@ -7,6 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
-
- ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
- extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \
-+ htree_lock.o \
- indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \
- mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \
- super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o
-diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
-index 78893a6..72c355d 100644
---- a/fs/ext4/ext4.h
-+++ b/fs/ext4/ext4.h
+Index: linux-stage/fs/ext4/ext4.h
+===================================================================
+--- linux-stage.orig/fs/ext4/ext4.h
++++ linux-stage/fs/ext4/ext4.h
@@ -29,6 +29,7 @@
#include <linux/timer.h>
#include <linux/version.h>
/*
* i_block_group is the number of the block group which contains
* this file's inode. Constant across the lifetime of the inode,
-@@ -2181,6 +2185,72 @@ struct dx_hash_info
+@@ -2207,6 +2211,72 @@ struct dx_hash_info
*/
#define HASH_NB_ALWAYS 1
struct ext4_filename {
const struct qstr *usr_fname;
struct fscrypt_str disk_name;
-@@ -2548,8 +2618,16 @@ void ext4_insert_dentry(struct inode *inode,
+@@ -2574,12 +2644,21 @@ void ext4_insert_dentry(struct inode *in
struct ext4_filename *fname, void *data);
static inline void ext4_update_dx_flag(struct inode *inode)
{
+ * the htree-locking.
+ * If we really want to support this operation in the future,
+ * we need to exclusively lock the directory at here which will
-+ * increase complexity of code */
++ * increase complexity of code
++ */
+#if 0
- if (!ext4_has_feature_dir_index(inode->i_sb))
+ if (!ext4_has_feature_dir_index(inode->i_sb) &&
+ ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
+ /* ext4_iget() should have caught this... */
+ WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb));
ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+ }
+#endif
}
static const unsigned char ext4_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
-index 91525f7..9c57749 100644
---- a/fs/ext4/namei.c
-+++ b/fs/ext4/namei.c
-@@ -55,6 +55,7 @@ struct buffer_head *ext4_append(handle_t *handle,
+Index: linux-stage/fs/ext4/Makefile
+===================================================================
+--- linux-stage.orig/fs/ext4/Makefile
++++ linux-stage/fs/ext4/Makefile
+@@ -7,6 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
+
+ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
+ extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \
++ htree_lock.o \
+ indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \
+ mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \
+ super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o
+Index: linux-stage/fs/ext4/namei.c
+===================================================================
+--- linux-stage.orig/fs/ext4/namei.c
++++ linux-stage/fs/ext4/namei.c
+@@ -55,6 +55,7 @@ struct buffer_head *ext4_append(handle_t
ext4_lblk_t *block)
{
struct buffer_head *bh;
int err;
if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
-@@ -62,15 +63,22 @@ struct buffer_head *ext4_append(handle_t *handle,
+@@ -62,15 +63,22 @@ struct buffer_head *ext4_append(handle_t
EXT4_SB(inode->i_sb)->s_max_dir_size_kb)))
return ERR_PTR(-ENOSPC);
if (err) {
brelse(bh);
ext4_std_error(inode->i_sb, err);
-@@ -264,7 +272,8 @@ static unsigned dx_node_limit(struct inode *dir);
+@@ -264,7 +272,8 @@ static unsigned dx_node_limit(struct ino
static struct dx_frame *dx_probe(struct ext4_filename *fname,
struct inode *dir,
struct dx_hash_info *hinfo,
static void dx_release(struct dx_frame *frames);
static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
unsigned blocksize, struct dx_hash_info *hinfo,
-@@ -278,12 +287,13 @@ static void dx_insert_block(struct dx_frame *frame,
+@@ -278,12 +287,13 @@ static void dx_insert_block(struct dx_fr
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
struct dx_frame *frame,
struct dx_frame *frames,
/* checksumming functions */
void ext4_initialize_dirent_tail(struct buffer_head *bh,
-@@ -748,6 +758,227 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
+@@ -748,6 +758,227 @@ struct stats dx_show_entries(struct dx_h
}
#endif /* DX_DEBUG */
/*
* Probe for a directory leaf block to search.
*
-@@ -759,10 +990,11 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
+@@ -759,10 +990,11 @@ struct stats dx_show_entries(struct dx_h
*/
static struct dx_frame *
dx_probe(struct ext4_filename *fname, struct inode *dir,
struct dx_root_info *info;
struct dx_frame *frame = frame_in;
struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
-@@ -824,8 +1056,15 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
+@@ -824,8 +1056,15 @@ dx_probe(struct ext4_filename *fname, st
dxtrace(printk("Look up %x", hash));
while (1) {
ext4_warning_inode(dir,
"dx entry: count %u beyond limit %u",
count, dx_get_limit(entries));
-@@ -864,8 +1103,70 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
+@@ -864,8 +1103,70 @@ dx_probe(struct ext4_filename *fname, st
dx_get_block(at)));
frame->entries = entries;
frame->at = at;
frame++;
frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
if (IS_ERR(frame->bh)) {
-@@ -934,7 +1235,7 @@ static void dx_release(struct dx_frame *frames)
+@@ -934,7 +1235,7 @@ static void dx_release(struct dx_frame *
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
struct dx_frame *frame,
struct dx_frame *frames,
{
struct dx_frame *p;
struct buffer_head *bh;
-@@ -949,12 +1250,22 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+@@ -949,12 +1250,22 @@ static int ext4_htree_next_block(struct
* this loop, num_frames indicates the number of interior
* nodes need to be read.
*/
+ ext4_htree_de_unlock(lck);
while (1) {
-- if (++(p->at) < p->entries + dx_get_count(p->entries))
-- break;
+ if (num_frames > 0 || ext4_htree_dx_locked(lck)) {
+ /* num_frames > 0 :
+ * DX block
+ * ext4_htree_dx_locked:
+ * frame->at is reliable pointer returned by dx_probe,
+ * otherwise dx_probe already knew no collision */
-+ if (++(p->at) < p->entries + dx_get_count(p->entries))
-+ break;
+ if (++(p->at) < p->entries + dx_get_count(p->entries))
+ break;
+ }
if (p == frames)
return 0;
p--;
}
-@@ -977,6 +1288,13 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+@@ -977,6 +1288,13 @@ static int ext4_htree_next_block(struct
* block so no check is necessary
*/
while (num_frames--) {
bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX);
if (IS_ERR(bh))
return PTR_ERR(bh);
-@@ -985,6 +1303,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+@@ -985,6 +1303,7 @@ static int ext4_htree_next_block(struct
p->bh = bh;
p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
}
return 1;
}
-@@ -1132,10 +1451,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+@@ -1132,10 +1451,10 @@ int ext4_htree_fill_tree(struct file *di
}
hinfo.hash = start_hash;
hinfo.minor_hash = 0;
/* Add '.' and '..' from the htree header */
if (!start_hash && !start_minor_hash) {
de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
-@@ -1175,7 +1494,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+@@ -1175,7 +1494,7 @@ int ext4_htree_fill_tree(struct file *di
count += ret;
hashval = ~0;
ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
*next_hash = hashval;
if (ret < 0) {
err = ret;
-@@ -1451,7 +1770,7 @@ static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
+@@ -1451,7 +1770,7 @@ static int is_dx_internal_node(struct in
static struct buffer_head *__ext4_find_entry(struct inode *dir,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **res_dir,
{
struct super_block *sb;
struct buffer_head *bh_use[NAMEI_RA_SIZE];
-@@ -1493,7 +1812,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
+@@ -1493,7 +1812,7 @@ static struct buffer_head *__ext4_find_e
goto restart;
}
if (is_dx(dir)) {
/*
* On success, or if the error was file not found,
* return. Otherwise, fall back to doing a search the
-@@ -1503,6 +1822,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
+@@ -1503,6 +1822,7 @@ static struct buffer_head *__ext4_find_e
goto cleanup_and_exit;
dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
"falling back\n"));
ret = NULL;
}
nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
-@@ -1590,10 +1910,10 @@ cleanup_and_exit:
+@@ -1591,10 +1911,10 @@ cleanup_and_exit:
return ret;
}
{
int err;
struct ext4_filename fname;
-@@ -1605,12 +1925,14 @@ static struct buffer_head *ext4_find_entry(struct inode *dir,
+@@ -1606,12 +1926,14 @@ static struct buffer_head *ext4_find_ent
if (err)
return ERR_PTR(err);
static struct buffer_head *ext4_lookup_entry(struct inode *dir,
struct dentry *dentry,
struct ext4_dir_entry_2 **res_dir)
-@@ -1625,7 +1947,7 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir,
+@@ -1626,7 +1948,7 @@ static struct buffer_head *ext4_lookup_e
if (err)
return ERR_PTR(err);
ext4_fname_free_filename(&fname);
return bh;
-@@ -1633,7 +1955,8 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir,
+@@ -1634,7 +1956,8 @@ static struct buffer_head *ext4_lookup_e
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_filename *fname,
{
struct super_block * sb = dir->i_sb;
struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
-@@ -1644,7 +1967,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+@@ -1645,7 +1968,7 @@ static struct buffer_head * ext4_dx_find
#ifdef CONFIG_FS_ENCRYPTION
*res_dir = NULL;
#endif
if (IS_ERR(frame))
return (struct buffer_head *) frame;
do {
-@@ -1666,7 +1989,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+@@ -1667,7 +1990,7 @@ static struct buffer_head * ext4_dx_find
/* Check to see if we should continue to search */
retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame,
if (retval < 0) {
ext4_warning_inode(dir,
"error %d reading directory index block",
-@@ -1846,8 +2169,9 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
+@@ -1847,8 +2170,9 @@ static struct ext4_dir_entry_2* dx_pack_
* Returns pointer to de in block into which the new entry will be inserted.
*/
static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
{
unsigned blocksize = dir->i_sb->s_blocksize;
unsigned count, continued;
-@@ -1908,8 +2232,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
+@@ -1919,8 +2243,14 @@ static struct ext4_dir_entry_2 *do_split
hash2, split, count-split));
/* Fancy dance to stay within two buffers */
de = dx_pack_dirents(data1, blocksize);
de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
(char *) de,
-@@ -1927,12 +2257,21 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
+@@ -1938,12 +2268,21 @@ static struct ext4_dir_entry_2 *do_split
dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2,
blocksize, 1));
- if (hinfo->hash >= hash2) {
- swap(*bh, bh2);
- de = de2;
+- }
+ ext4_htree_spin_lock(lck, frame > frames ? (frame - 1)->at : NULL,
+ frame->at); /* notify block is being split */
+ if (hinfo->hash < hash2) {
-+ dx_insert_block(frame, hash2 + continued, newblock);
+ dx_insert_block(frame, hash2 + continued, newblock);
+
+ } else {
+ /* switch block number */
+ dx_get_block(frame->at));
+ dx_set_block(frame->at, newblock);
+ (frame->at)++;
- }
-- dx_insert_block(frame, hash2 + continued, newblock);
++ }
+ ext4_htree_spin_unlock(lck);
+ ext4_htree_dx_unlock(lck);
+
err = ext4_handle_dirty_dirblock(handle, dir, bh2);
if (err)
goto journal_error;
-@@ -2202,7 +2541,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
+@@ -2213,7 +2552,7 @@ static int make_indexed_dir(handle_t *ha
if (retval)
goto out_frames;
if (IS_ERR(de)) {
retval = PTR_ERR(de);
goto out_frames;
-@@ -2312,8 +2651,8 @@ out:
+@@ -2323,8 +2662,8 @@ out:
* may not sleep between calling this and putting something into
* the entry, as someone else might have used it while you slept.
*/
{
struct inode *dir = d_inode(dentry->d_parent);
struct buffer_head *bh = NULL;
-@@ -2361,9 +2700,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+@@ -2375,9 +2714,10 @@ static int ext4_add_entry(handle_t *hand
if (dentry->d_name.len == 2 &&
memcmp(dentry->d_name.name, "..", 2) == 0)
return ext4_update_dotdot(handle, dentry, inode);
if (!retval || (retval != ERR_BAD_DX_DIR))
goto out;
+ ext4_htree_safe_relock(lck);
- ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
- dx_fallback++;
- ext4_mark_inode_dirty(handle, dir);
-@@ -2417,12 +2757,14 @@ out:
+ /* Can we just ignore htree data? */
+ if (ext4_has_metadata_csum(sb)) {
+ EXT4_ERROR_INODE(dir,
+@@ -2438,12 +2778,14 @@ out:
ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
return retval;
}
{
struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct dx_entry *entries, *at;
-@@ -2434,7 +2776,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+@@ -2455,7 +2797,7 @@ static int ext4_dx_add_entry(handle_t *h
again:
restart = 0;
if (IS_ERR(frame))
return PTR_ERR(frame);
entries = frame->entries;
-@@ -2469,6 +2811,12 @@ again:
+@@ -2490,6 +2832,12 @@ again:
struct dx_node *node2;
struct buffer_head *bh2;
while (frame > frames) {
if (dx_get_count((frame - 1)->entries) <
dx_get_limit((frame - 1)->entries)) {
-@@ -2571,8 +2919,32 @@ again:
+@@ -2591,8 +2939,32 @@ again:
restart = 1;
goto journal_error;
}
if (IS_ERR(de)) {
err = PTR_ERR(de);
goto cleanup;
-@@ -2583,6 +2955,8 @@ again:
+@@ -2603,6 +2975,8 @@ again:
journal_error:
ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */
cleanup:
brelse(bh);
dx_release(frames);
/* @restart is true means htree-path has been changed, we need to
-diff --git a/fs/ext4/super.c b/fs/ext4/super.c
-index 0fcc33b..3cc0306 100644
---- a/fs/ext4/super.c
-+++ b/fs/ext4/super.c
-@@ -1076,6 +1076,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
+Index: linux-stage/fs/ext4/super.c
+===================================================================
+--- linux-stage.orig/fs/ext4/super.c
++++ linux-stage/fs/ext4/super.c
+@@ -1086,6 +1086,7 @@ static struct inode *ext4_alloc_inode(st
inode_set_iversion(&ei->vfs_inode, 1);
spin_lock_init(&ei->i_raw_lock);
--- /dev/null
+Index: linux-stage/fs/ext4/ext4.h
+===================================================================
+--- linux-stage.orig/fs/ext4/ext4.h
++++ linux-stage/fs/ext4/ext4.h
+@@ -1494,6 +1494,9 @@ struct ext4_sb_info {
+ unsigned int s_mb_min_to_scan;
+ unsigned int s_mb_stats;
+ unsigned int s_mb_order2_reqs;
++ ext4_fsblk_t s_mb_c1_blocks;
++ ext4_fsblk_t s_mb_c2_blocks;
++ ext4_fsblk_t s_mb_c3_blocks;
+ unsigned long *s_mb_prealloc_table;
+ unsigned int s_mb_group_prealloc;
+ unsigned int s_max_dir_size_kb;
+@@ -1510,6 +1513,9 @@ struct ext4_sb_info {
+ atomic_t s_bal_goals; /* goal hits */
+ atomic_t s_bal_breaks; /* too long searches */
+ atomic_t s_bal_2orders; /* 2^order hits */
++ /* cX loop didn't find blocks */
++ atomic64_t s_bal_cX_failed[3];
++ atomic64_t s_bal_cX_skipped[3];
+ spinlock_t s_bal_lock;
+ unsigned long s_mb_buddies_generated;
+ unsigned long long s_mb_generation_time;
+@@ -2723,6 +2729,9 @@ ext4_read_inode_bitmap(struct super_bloc
+ /* mballoc.c */
+ extern const struct file_operations ext4_seq_prealloc_table_fops;
+ extern const struct seq_operations ext4_mb_seq_groups_ops;
++extern const struct file_operations ext4_mb_seq_alloc_fops;
++extern int save_threshold_percent(struct ext4_sb_info *sbi, const char *buf,
++ ext4_fsblk_t *blocks);
+ extern const struct file_operations ext4_seq_mb_last_group_fops;
+ extern int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v);
+ extern long ext4_mb_stats;
+Index: linux-stage/fs/ext4/mballoc.c
+===================================================================
+--- linux-stage.orig/fs/ext4/mballoc.c
++++ linux-stage/fs/ext4/mballoc.c
+@@ -2114,6 +2114,20 @@ static int ext4_mb_good_group(struct ext
+ return 0;
+ }
+
++static u64 available_blocks_count(struct ext4_sb_info *sbi)
++{
++ ext4_fsblk_t resv_blocks;
++ u64 bfree;
++ struct ext4_super_block *es = sbi->s_es;
++
++ resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
++ bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
++ percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
++
++ bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
++ return bfree - (ext4_r_blocks_count(es) + resv_blocks);
++}
++
+ static noinline_for_stack int
+ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
+ {
+@@ -2123,6 +2137,7 @@ ext4_mb_regular_allocator(struct ext4_al
+ struct ext4_sb_info *sbi;
+ struct super_block *sb;
+ struct ext4_buddy e4b;
++ ext4_fsblk_t avail_blocks;
+
+ sb = ac->ac_sb;
+ sbi = EXT4_SB(sb);
+@@ -2175,6 +2190,21 @@ ext4_mb_regular_allocator(struct ext4_al
+
+ /* Let's just scan groups to find more-less suitable blocks */
+ cr = ac->ac_2order ? 0 : 1;
++
++ /* Choose what loop to pass based on disk fullness */
++	avail_blocks = available_blocks_count(sbi);
++
++ if (avail_blocks < sbi->s_mb_c3_blocks) {
++ cr = 3;
++ atomic64_inc(&sbi->s_bal_cX_skipped[2]);
++	} else if (avail_blocks < sbi->s_mb_c2_blocks) {
++ cr = 2;
++ atomic64_inc(&sbi->s_bal_cX_skipped[1]);
++	} else if (avail_blocks < sbi->s_mb_c1_blocks) {
++ cr = 1;
++ atomic64_inc(&sbi->s_bal_cX_skipped[0]);
++ }
++
+ /*
+ * cr == 0 try to get exact allocation,
+ * cr == 3 try to get anything
+@@ -2240,6 +2270,9 @@ repeat:
+ if (ac->ac_status != AC_STATUS_CONTINUE)
+ break;
+ }
++ /* Processed all groups and haven't found blocks */
++ if (i == ngroups)
++ atomic64_inc(&sbi->s_bal_cX_failed[cr]);
+ }
+
+ if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
+@@ -2520,6 +2553,93 @@ const struct file_operations ext4_seq_mb
+ .write = ext4_mb_last_group_write,
+ };
+
++static int mb_seq_alloc_show(struct seq_file *seq, void *v)
++{
++ struct super_block *sb = seq->private;
++ struct ext4_sb_info *sbi = EXT4_SB(sb);
++
++ seq_printf(seq, "mballoc:\n");
++ seq_printf(seq, "\tblocks: %u\n", atomic_read(&sbi->s_bal_allocated));
++ seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
++ seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
++
++ seq_printf(seq, "\textents_scanned: %u\n",
++ atomic_read(&sbi->s_bal_ex_scanned));
++ seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
++ seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
++ seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
++ seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
++
++ seq_printf(seq, "\tuseless_c1_loops: %llu\n",
++ (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[0]));
++ seq_printf(seq, "\tuseless_c2_loops: %llu\n",
++ (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[1]));
++ seq_printf(seq, "\tuseless_c3_loops: %llu\n",
++ (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[2]));
++ seq_printf(seq, "\tskipped_c1_loops: %llu\n",
++ (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[0]));
++ seq_printf(seq, "\tskipped_c2_loops: %llu\n",
++ (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[1]));
++ seq_printf(seq, "\tskipped_c3_loops: %llu\n",
++ (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[2]));
++ seq_printf(seq, "\tbuddies_generated: %lu\n",
++ sbi->s_mb_buddies_generated);
++ seq_printf(seq, "\tbuddies_time_used: %llu\n", sbi->s_mb_generation_time);
++ seq_printf(seq, "\tpreallocated: %u\n",
++ atomic_read(&sbi->s_mb_preallocated));
++ seq_printf(seq, "\tdiscarded: %u\n",
++ atomic_read(&sbi->s_mb_discarded));
++ return 0;
++}
++
++static ssize_t mb_seq_alloc_write(struct file *file,
++ const char __user *buf,
++ size_t cnt, loff_t *pos)
++{
++ struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file)));
++
++	atomic_set(&sbi->s_bal_allocated, 0);
++	atomic_set(&sbi->s_bal_reqs, 0);
++	atomic_set(&sbi->s_bal_success, 0);
++
++	atomic_set(&sbi->s_bal_ex_scanned, 0);
++	atomic_set(&sbi->s_bal_goals, 0);
++	atomic_set(&sbi->s_bal_2orders, 0);
++	atomic_set(&sbi->s_bal_breaks, 0);
++	atomic_set(&sbi->s_mb_lost_chunks, 0);
++
++	atomic64_set(&sbi->s_bal_cX_failed[0], 0);
++	atomic64_set(&sbi->s_bal_cX_failed[1], 0);
++	atomic64_set(&sbi->s_bal_cX_failed[2], 0);
++
++	atomic64_set(&sbi->s_bal_cX_skipped[0], 0);
++	atomic64_set(&sbi->s_bal_cX_skipped[1], 0);
++	atomic64_set(&sbi->s_bal_cX_skipped[2], 0);
++
++
++ sbi->s_mb_buddies_generated = 0;
++ sbi->s_mb_generation_time = 0;
++
++ atomic_set(&sbi->s_mb_preallocated, 0),
++ atomic_set(&sbi->s_mb_discarded, 0);
++
++ return cnt;
++}
++
++static int mb_seq_alloc_open(struct inode *inode, struct file *file)
++{
++ return single_open(file, mb_seq_alloc_show, PDE_DATA(inode));
++}
++
++const struct file_operations ext4_mb_seq_alloc_fops = {
++ .owner = THIS_MODULE,
++ .open = mb_seq_alloc_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = single_release,
++ .write = mb_seq_alloc_write,
++};
++
+ int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v)
+ {
+ struct ext4_sb_info *sbi = EXT4_SB(m->private);
+@@ -2759,6 +2879,8 @@ static int ext4_groupinfo_create_slab(si
+ return 0;
+ }
+
++#define THRESHOLD_BLOCKS(sbi, percent) \
++ (ext4_blocks_count((sbi)->s_es) / 100 * (percent))
+ int ext4_mb_init(struct super_block *sb)
+ {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+@@ -2812,6 +2934,15 @@ int ext4_mb_init(struct super_block *sb)
+ sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
+ sbi->s_mb_stats = MB_DEFAULT_STATS;
+ sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
++ if (!sbi->s_mb_c1_blocks)
++ sbi->s_mb_c1_blocks =
++ THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C1_THRESHOLD);
++ if (!sbi->s_mb_c2_blocks)
++ sbi->s_mb_c2_blocks =
++ THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C2_THRESHOLD);
++ if (!sbi->s_mb_c3_blocks)
++ sbi->s_mb_c3_blocks =
++ THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C3_THRESHOLD);
+ /*
+ * The default group preallocation is 512, which for 4k block
+ * sizes translates to 2 megabytes. However for bigalloc file
+@@ -2951,6 +3082,16 @@ int ext4_mb_release(struct super_block *
+ atomic_read(&sbi->s_bal_reqs),
+ atomic_read(&sbi->s_bal_success));
+ ext4_msg(sb, KERN_INFO,
++ "mballoc: (%llu, %llu, %llu) useless c(0,1,2) loops",
++ (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[0]),
++ (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[1]),
++ (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[2]));
++ ext4_msg(sb, KERN_INFO,
++ "mballoc: (%llu, %llu, %llu) skipped c(0,1,2) loops",
++ (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[0]),
++ (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[1]),
++ (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[2]));
++ ext4_msg(sb, KERN_INFO,
+ "mballoc: %u extents scanned, %u goal hits, "
+ "%u 2^N hits, %u breaks, %u lost",
+ atomic_read(&sbi->s_bal_ex_scanned),
+Index: linux-stage/fs/ext4/mballoc.h
+===================================================================
+--- linux-stage.orig/fs/ext4/mballoc.h
++++ linux-stage/fs/ext4/mballoc.h
+@@ -72,6 +72,9 @@ do { \
+ * for which requests use 2^N search using buddies
+ */
+ #define MB_DEFAULT_ORDER2_REQS 8
++#define MB_DEFAULT_C1_THRESHOLD 25
++#define MB_DEFAULT_C2_THRESHOLD 15
++#define MB_DEFAULT_C3_THRESHOLD 5
+
+ /*
+ * default group prealloc size 512 blocks
+Index: linux-stage/fs/ext4/super.c
+===================================================================
+--- linux-stage.orig/fs/ext4/super.c
++++ linux-stage/fs/ext4/super.c
+@@ -1468,6 +1468,7 @@ enum {
+ Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
+ Opt_inode_readahead_blks, Opt_journal_ioprio,
+ Opt_dioread_nolock, Opt_dioread_lock,
++ Opt_mb_c1_threshold, Opt_mb_c2_threshold, Opt_mb_c3_threshold,
+ Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
+ Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
+ };
+@@ -1554,6 +1555,9 @@ static const match_table_t tokens = {
+ {Opt_init_itable, "init_itable"},
+ {Opt_noinit_itable, "noinit_itable"},
+ {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
++ {Opt_mb_c1_threshold, "mb_c1_threshold=%s"},
++ {Opt_mb_c2_threshold, "mb_c2_threshold=%s"},
++ {Opt_mb_c3_threshold, "mb_c3_threshold=%s"},
+ {Opt_test_dummy_encryption, "test_dummy_encryption"},
+ {Opt_nombcache, "nombcache"},
+ {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
+@@ -1766,6 +1770,9 @@ static const struct mount_opts {
+ {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
+ {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
+ {Opt_max_dir_size_kb, 0, MOPT_GTE0},
++ {Opt_mb_c1_threshold, 0, MOPT_STRING},
++ {Opt_mb_c2_threshold, 0, MOPT_STRING},
++ {Opt_mb_c3_threshold, 0, MOPT_STRING},
+ {Opt_test_dummy_encryption, 0, MOPT_GTE0},
+ {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
+ {Opt_err, 0, 0}
+@@ -1929,6 +1936,12 @@ static int handle_mount_opt(struct super
+ sbi->s_max_dir_size_kb = arg;
+ /* reset s_warning_dir_size and make it re-calculated */
+ sbi->s_warning_dir_size = 0;
++ } else if (token == Opt_mb_c1_threshold) {
++ save_threshold_percent(sbi, args[0].from, &sbi->s_mb_c1_blocks);
++ } else if (token == Opt_mb_c2_threshold) {
++ save_threshold_percent(sbi, args[0].from, &sbi->s_mb_c2_blocks);
++ } else if (token == Opt_mb_c3_threshold) {
++ save_threshold_percent(sbi, args[0].from, &sbi->s_mb_c3_blocks);
+ } else if (token == Opt_stripe) {
+ sbi->s_stripe = arg;
+ } else if (token == Opt_resuid) {
+Index: linux-stage/fs/ext4/sysfs.c
+===================================================================
+--- linux-stage.orig/fs/ext4/sysfs.c
++++ linux-stage/fs/ext4/sysfs.c
+@@ -20,6 +20,9 @@
+ typedef enum {
+ attr_noop,
+ attr_delayed_allocation_blocks,
++ attr_mb_c1_threshold,
++ attr_mb_c2_threshold,
++ attr_mb_c3_threshold,
+ attr_session_write_kbytes,
+ attr_lifetime_write_kbytes,
+ attr_reserved_clusters,
+@@ -135,6 +138,32 @@ static ssize_t journal_task_show(struct
+ task_pid_vnr(sbi->s_journal->j_task));
+ }
+
++int save_threshold_percent(struct ext4_sb_info *sbi, const char *buf,
++ ext4_fsblk_t *blocks)
++{
++ unsigned long long val;
++
++ int ret;
++
++ ret = kstrtoull(skip_spaces(buf), 0, &val);
++ if (ret || val > 100)
++ return -EINVAL;
++
++ *blocks = val * ext4_blocks_count(sbi->s_es) / 100;
++ return 0;
++}
++
++#define THRESHOLD_PERCENT(sbi, blocks) \
++ (((blocks) - 1) * 100 / ext4_blocks_count((sbi)->s_es) + 1)
++static ssize_t mb_threshold_store(struct ext4_sb_info *sbi,
++ const char *buf, size_t count,
++ ext4_fsblk_t *blocks)
++{
++ int ret = save_threshold_percent(sbi, buf, blocks);
++
++ return ret ?: count;
++}
++
+ #define EXT4_ATTR(_name,_mode,_id) \
+ static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+@@ -178,6 +207,9 @@ EXT4_ATTR_FUNC(session_write_kbytes, 044
+ EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
+ EXT4_ATTR_FUNC(reserved_clusters, 0644);
+ EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444);
++EXT4_ATTR_FUNC(mb_c1_threshold, 0644);
++EXT4_ATTR_FUNC(mb_c2_threshold, 0644);
++EXT4_ATTR_FUNC(mb_c3_threshold, 0644);
+
+ EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
+ ext4_sb_info, s_inode_readahead_blks);
+@@ -214,6 +246,9 @@ static struct attribute *ext4_attrs[] =
+ ATTR_LIST(lifetime_write_kbytes),
+ ATTR_LIST(reserved_clusters),
+ ATTR_LIST(sra_exceeded_retry_limit),
++ ATTR_LIST(mb_c1_threshold),
++ ATTR_LIST(mb_c2_threshold),
++ ATTR_LIST(mb_c3_threshold),
+ ATTR_LIST(inode_readahead_blks),
+ ATTR_LIST(inode_goal),
+ ATTR_LIST(max_dir_size),
+@@ -311,6 +346,15 @@ static ssize_t ext4_attr_show(struct kob
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (s64) EXT4_C2B(sbi,
+ percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
++ case attr_mb_c1_threshold:
++ return scnprintf(buf, PAGE_SIZE, "%llu\n",
++ THRESHOLD_PERCENT(sbi, sbi->s_mb_c1_blocks));
++ case attr_mb_c2_threshold:
++ return scnprintf(buf, PAGE_SIZE, "%llu\n",
++ THRESHOLD_PERCENT(sbi, sbi->s_mb_c2_blocks));
++ case attr_mb_c3_threshold:
++ return scnprintf(buf, PAGE_SIZE, "%llu\n",
++ THRESHOLD_PERCENT(sbi, sbi->s_mb_c3_blocks));
+ case attr_session_write_kbytes:
+ return session_write_kbytes_show(sbi, buf);
+ case attr_lifetime_write_kbytes:
+@@ -384,6 +428,12 @@ static ssize_t ext4_attr_store(struct ko
+ return inode_readahead_blks_store(sbi, buf, len);
+ case attr_trigger_test_error:
+ return trigger_test_error(sbi, buf, len);
++ case attr_mb_c1_threshold:
++ return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c1_blocks);
++ case attr_mb_c2_threshold:
++ return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c2_blocks);
++ case attr_mb_c3_threshold:
++ return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c3_blocks);
+ }
+ return 0;
+ }
+@@ -446,6 +496,8 @@ int ext4_register_sysfs(struct super_blo
+ &ext4_seq_mb_last_group_fops, sb);
+ proc_create_single_data("mb_last_start", S_IRUGO, sbi->s_proc,
+ ext4_mb_seq_last_start_seq_show, sb);
++ proc_create_data("mb_alloc_stats", S_IFREG | S_IRUGO | S_IWUSR,
++ sbi->s_proc, &ext4_mb_seq_alloc_fops, sb);
+ }
+ return 0;
+ }