+++ /dev/null
-Index: linux-4.18.0-32.el8/fs/ext4/extents.c
-===================================================================
---- linux-4.18.0-32.el8/fs.orig/ext4/extents.c 2020-11-27 10:01:19.149710442 +0300
-+++ linux-4.18.0-32.el8/fs/ext4/extents.c 2020-11-27 10:01:25.462844639 +0300
-@@ -3431,8 +3431,8 @@ static int ext4_ext_convert_to_initia
- ex = path[depth].p_ext;
- ee_block = le32_to_cpu(ex->ee_block);
- ee_len = ext4_ext_get_actual_len(ex);
-- zero_ex1.ee_len = 0;
-- zero_ex2.ee_len = 0;
-+ memset(&zero_ex1, 0, sizeof(zero_ex1));
-+ memset(&zero_ex2, 0, sizeof(zero_ex2));
-
- trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
-
+++ /dev/null
-Index: linux-4.18.0-80.1.2.el8_0/fs/ext4/ext4.h
-===================================================================
---- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/ext4.h
-+++ linux-4.18.0-80.1.2.el8_0/fs/ext4/ext4.h
-@@ -1591,6 +1591,8 @@ static inline void ext4_clear_state_flag
- */
- #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
-
-+#define JOURNAL_START_HAS_3ARGS 1
-+
- /*
- * Codes for operating systems
- */
-@@ -1805,7 +1807,21 @@ static inline bool ext4_has_unknown_ext#
-
- EXTN_FEATURE_FUNCS(2)
- EXTN_FEATURE_FUNCS(3)
--EXTN_FEATURE_FUNCS(4)
-+static inline bool ext4_has_unknown_ext4_compat_features(struct super_block *sb)
-+{
-+ return ((EXT4_SB(sb)->s_es->s_feature_compat &
-+ cpu_to_le32(~EXT4_FEATURE_COMPAT_SUPP)) != 0);
-+}
-+static inline bool ext4_has_unknown_ext4_ro_compat_features(struct super_block *sb)
-+{
-+ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat &
-+ cpu_to_le32(~EXT4_FEATURE_RO_COMPAT_SUPP)) != 0);
-+}
-+static inline bool ext4_has_unknown_ext4_incompat_features(struct super_block *sb)
-+{
-+ return ((EXT4_SB(sb)->s_es->s_feature_incompat &
-+ cpu_to_le32(~EXT4_FEATURE_INCOMPAT_SUPP)) != 0);
-+}
-
- static inline bool ext4_has_compat_features(struct super_block *sb)
- {
-@@ -3111,6 +3127,13 @@ struct ext4_extent;
-
- extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
- extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
-+extern struct buffer_head *ext4_read_inode_bitmap(struct super_block *sb,
-+ ext4_group_t block_group);
-+extern void ext4_inc_count(handle_t *handle, struct inode *inode);
-+extern void ext4_dec_count(handle_t *handle, struct inode *inode);
-+extern struct buffer_head *ext4_append(handle_t *handle,
-+ struct inode *inode,
-+ ext4_lblk_t *block);
- extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
- extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
- struct ext4_map_blocks *map, int flags);
-Index: linux-4.18.0-80.1.2.el8_0/fs/ext4/ialloc.c
-===================================================================
---- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/ialloc.c
-+++ linux-4.18.0-80.1.2.el8_0/fs/ext4/ialloc.c
-@@ -114,7 +114,7 @@ verified:
- *
- * Return buffer_head of bitmap on success or NULL.
- */
--static struct buffer_head *
-+struct buffer_head *
- ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
- {
- struct ext4_group_desc *desc;
-@@ -211,6 +211,7 @@ out:
- put_bh(bh);
- return ERR_PTR(err);
- }
-+EXPORT_SYMBOL(ext4_read_inode_bitmap);
-
- /*
- * NOTE! When we get the inode, we're the only people
-Index: linux-4.18.0-147.0.3.el8/fs/ext4/inode.c
-===================================================================
---- linux-4.18.0-147.0.3.el8.orig/fs/ext4/inode.c
-+++ linux-4.18.0-147.0.3.el8/fs/ext4/inode.c
-@@ -6284,3 +6284,19 @@ vm_fault_t ext4_filemap_fault(struct vm_
-
- return ret;
- }
-+EXPORT_SYMBOL(ext4_map_blocks);
-+EXPORT_SYMBOL(ext4_truncate);
-+EXPORT_SYMBOL(__ext4_iget);
-+EXPORT_SYMBOL(ext4_bread);
-+EXPORT_SYMBOL(ext4_itable_unused_count);
-+EXPORT_SYMBOL(ext4_force_commit);
-+EXPORT_SYMBOL(ext4_mark_inode_dirty);
-+EXPORT_SYMBOL(ext4_get_group_desc);
-+EXPORT_SYMBOL(__ext4_journal_get_write_access);
-+EXPORT_SYMBOL(__ext4_journal_start_sb);
-+EXPORT_SYMBOL(__ext4_journal_stop);
-+EXPORT_SYMBOL(__ext4_handle_dirty_metadata);
-+EXPORT_SYMBOL(__ext4_std_error);
-+EXPORT_SYMBOL(ext4fs_dirhash);
-+EXPORT_SYMBOL(ext4_get_inode_loc);
-+EXPORT_SYMBOL(ext4_chunk_trans_blocks);
-Index: linux-4.18.0-80.1.2.el8_0/fs/ext4/namei.c
-===================================================================
---- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/namei.c
-+++ linux-4.18.0-80.1.2.el8_0/fs/ext4/namei.c
-@@ -49,7 +49,7 @@
- #define NAMEI_RA_BLOCKS 4
- #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-
--static struct buffer_head *ext4_append(handle_t *handle,
-+struct buffer_head *ext4_append(handle_t *handle,
- struct inode *inode,
- ext4_lblk_t *block)
- {
-@@ -160,6 +160,7 @@ static struct buffer_head *__ext4_read_d
- }
- return bh;
- }
-+EXPORT_SYMBOL(ext4_append);
-
- #ifndef assert
- #define assert(test) J_ASSERT(test)
-@@ -2415,23 +2416,25 @@ EXPORT_SYMBOL(ext4_delete_entry);
- * for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set
- * on regular files) and to avoid creating huge/slow non-HTREE directories.
- */
--static void ext4_inc_count(handle_t *handle, struct inode *inode)
-+void ext4_inc_count(handle_t *handle, struct inode *inode)
- {
- inc_nlink(inode);
- if (is_dx(inode) &&
- (inode->i_nlink > EXT4_LINK_MAX || inode->i_nlink == 2))
- set_nlink(inode, 1);
- }
-+EXPORT_SYMBOL(ext4_inc_count);
-
- /*
- * If a directory had nlink == 1, then we should let it be 1. This indicates
- * directory has >EXT4_LINK_MAX subdirs.
- */
--static void ext4_dec_count(handle_t *handle, struct inode *inode)
-+void ext4_dec_count(handle_t *handle, struct inode *inode)
- {
- if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
- drop_nlink(inode);
- }
-+EXPORT_SYMBOL(ext4_dec_count);
-
-
- static int ext4_add_nondir(handle_t *handle,
-Index: linux-4.18.0-147.0.3.el8/fs/ext4/super.c
-===================================================================
---- linux-4.18.0-147.0.3.el8.orig/fs/ext4/super.c
-+++ linux-4.18.0-147.0.3.el8/fs/ext4/super.c
-@@ -364,12 +364,12 @@ static void __save_error_info(struct sup
- return;
- es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
- ext4_update_tstamp(es, s_last_error_time);
-- strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
-+ strlcpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
- es->s_last_error_line = cpu_to_le32(line);
- if (!es->s_first_error_time) {
- es->s_first_error_time = es->s_last_error_time;
- es->s_first_error_time_hi = es->s_last_error_time_hi;
-- strncpy(es->s_first_error_func, func,
-+ strlcpy(es->s_first_error_func, func,
- sizeof(es->s_first_error_func));
- es->s_first_error_line = cpu_to_le32(line);
- es->s_first_error_ino = es->s_last_error_ino;
-@@ -6029,16 +6029,12 @@ static int __init ext4_init_fs(void)
- err = init_inodecache();
- if (err)
- goto out1;
-- register_as_ext3();
-- register_as_ext2();
- err = register_filesystem(&ext4_fs_type);
- if (err)
- goto out;
-
- return 0;
- out:
-- unregister_as_ext2();
-- unregister_as_ext3();
- destroy_inodecache();
- out1:
- ext4_exit_mballoc();
-@@ -6059,8 +6055,6 @@ out6:
- static void __exit ext4_exit_fs(void)
- {
- ext4_destroy_lazyinit_thread();
-- unregister_as_ext2();
-- unregister_as_ext3();
- unregister_filesystem(&ext4_fs_type);
- destroy_inodecache();
- ext4_exit_mballoc();
+++ /dev/null
-Single directory performance is critical for HPC workloads. In a
-typical use case an application creates a separate output file for
-each node and task in a job. As nodes and tasks increase, hundreds
-of thousands of files may be created in a single directory within
-a short window of time.
-Today, both filename lookup and file system modifying operations
-(such as create and unlink) are protected with a single lock for
-an entire ldiskfs directory. The PDO project will remove this
-bottleneck by introducing a parallel locking mechanism for entire
-ldiskfs directories. This work will enable multiple application
-threads to look up, create and unlink entries in parallel.
-
-This patch contains:
- - pdirops support for ldiskfs
- - integration with osd-ldiskfs
-
-Index: linux-4.18.0-80.1.2.el8_0/fs/ext4/Makefile
-===================================================================
---- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/Makefile
-+++ linux-4.18.0-80.1.2.el8_0/fs/ext4/Makefile
-@@ -7,6 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
-
- ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
- extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \
-+ htree_lock.o \
- indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \
- mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \
- super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o
-Index: linux-4.18.0-240.1.1.el8/fs/ext4/ext4.h
-===================================================================
---- linux-4.18.0-240.1.1.el8.orig/fs/ext4/ext4.h
-+++ linux-4.18.0-240.1.1.el8/fs/ext4/ext4.h
-@@ -29,6 +29,7 @@
- #include <linux/timer.h>
- #include <linux/version.h>
- #include <linux/wait.h>
-+#include <linux/htree_lock.h>
- #include <linux/sched/signal.h>
- #include <linux/blockgroup_lock.h>
- #include <linux/percpu_counter.h>
-@@ -946,6 +947,9 @@ struct ext4_inode_info {
- __u32 i_dtime;
- ext4_fsblk_t i_file_acl;
-
-+ /* following fields for parallel directory operations -bzzz */
-+ struct semaphore i_append_sem;
-+
- /*
- * i_block_group is the number of the block group which contains
- * this file's inode. Constant across the lifetime of the inode,
-@@ -2185,6 +2189,72 @@ struct dx_hash_info
- */
- #define HASH_NB_ALWAYS 1
-
-+/* assume name-hash is protected by upper layer */
-+#define EXT4_HTREE_LOCK_HASH 0
-+
-+enum ext4_pdo_lk_types {
-+#if EXT4_HTREE_LOCK_HASH
-+ EXT4_LK_HASH,
-+#endif
-+ EXT4_LK_DX, /* index block */
-+ EXT4_LK_DE, /* directory entry block */
-+ EXT4_LK_SPIN, /* spinlock */
-+ EXT4_LK_MAX,
-+};
-+
-+/* read-only bit */
-+#define EXT4_LB_RO(b) (1 << (b))
-+/* read + write, high bits for writer */
-+#define EXT4_LB_RW(b) ((1 << (b)) | (1 << (EXT4_LK_MAX + (b))))
-+
-+enum ext4_pdo_lock_bits {
-+ /* DX lock bits */
-+ EXT4_LB_DX_RO = EXT4_LB_RO(EXT4_LK_DX),
-+ EXT4_LB_DX = EXT4_LB_RW(EXT4_LK_DX),
-+ /* DE lock bits */
-+ EXT4_LB_DE_RO = EXT4_LB_RO(EXT4_LK_DE),
-+ EXT4_LB_DE = EXT4_LB_RW(EXT4_LK_DE),
-+ /* DX spinlock bits */
-+ EXT4_LB_SPIN_RO = EXT4_LB_RO(EXT4_LK_SPIN),
-+ EXT4_LB_SPIN = EXT4_LB_RW(EXT4_LK_SPIN),
-+ /* accurate searching */
-+ EXT4_LB_EXACT = EXT4_LB_RO(EXT4_LK_MAX << 1),
-+};
-+
-+enum ext4_pdo_lock_opc {
-+ /* external */
-+ EXT4_HLOCK_READDIR = (EXT4_LB_DE_RO | EXT4_LB_DX_RO),
-+ EXT4_HLOCK_LOOKUP = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO |
-+ EXT4_LB_EXACT),
-+ EXT4_HLOCK_DEL = (EXT4_LB_DE | EXT4_LB_SPIN_RO |
-+ EXT4_LB_EXACT),
-+ EXT4_HLOCK_ADD = (EXT4_LB_DE | EXT4_LB_SPIN_RO),
-+
-+ /* internal */
-+ EXT4_HLOCK_LOOKUP_SAFE = (EXT4_LB_DE_RO | EXT4_LB_DX_RO |
-+ EXT4_LB_EXACT),
-+ EXT4_HLOCK_DEL_SAFE = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT),
-+ EXT4_HLOCK_SPLIT = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN),
-+};
-+
-+extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits);
-+#define ext4_htree_lock_head_free(lhead) htree_lock_head_free(lhead)
-+
-+extern struct htree_lock *ext4_htree_lock_alloc(void);
-+#define ext4_htree_lock_free(lck) htree_lock_free(lck)
-+
-+extern void ext4_htree_lock(struct htree_lock *lck,
-+ struct htree_lock_head *lhead,
-+ struct inode *dir, unsigned flags);
-+#define ext4_htree_unlock(lck) htree_unlock(lck)
-+
-+extern struct buffer_head *__ext4_find_entry(struct inode *dir,
-+ const struct qstr *d_name,
-+ struct ext4_dir_entry_2 **res_dir,
-+ int *inlined, struct htree_lock *lck);
-+extern int __ext4_add_entry(handle_t *handle, struct dentry *dentry,
-+ struct inode *inode, struct htree_lock *lck);
-+
- struct ext4_filename {
- const struct qstr *usr_fname;
- struct fscrypt_str disk_name;
-@@ -2487,11 +2557,19 @@ void ext4_insert_dentry(struct inode *in
- struct ext4_filename *fname, void *data);
- static inline void ext4_update_dx_flag(struct inode *inode)
- {
-+ /* Disabled for ldiskfs, because going from a DX directory to
-+ * a non-DX directory while it is in use will completely break
-+ * the htree locking.
-+ * If we really want to support this operation in the future,
-+ * we need to exclusively lock the directory here, which will
-+ * increase the complexity of the code. */
-+#if 0
- if (!ext4_has_feature_dir_index(inode->i_sb)) {
- /* ext4_iget() should have caught this... */
- WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb));
- ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
- }
-+#endif
- }
- static const unsigned char ext4_filetype_table[] = {
- DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-Index: linux-4.18.0-240.1.1.el8/fs/ext4/namei.c
-===================================================================
---- linux-4.18.0-240.1.1.el8.orig/fs/ext4/namei.c
-+++ linux-4.18.0-240.1.1.el8/fs/ext4/namei.c
-@@ -54,6 +54,7 @@ struct buffer_head *ext4_append(handle_t
- ext4_lblk_t *block)
- {
- struct buffer_head *bh;
-+ struct ext4_inode_info *ei = EXT4_I(inode);
- int err;
-
- if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
-@@ -61,15 +62,22 @@ struct buffer_head *ext4_append(handle_t
- EXT4_SB(inode->i_sb)->s_max_dir_size_kb)))
- return ERR_PTR(-ENOSPC);
-
-+ /* with parallel dir operations all appends
-+ * have to be serialized -bzzz */
-+ down(&ei->i_append_sem);
-+
- *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
-
- bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE);
-- if (IS_ERR(bh))
-+ if (IS_ERR(bh)) {
-+ up(&ei->i_append_sem);
- return bh;
-+ }
- inode->i_size += inode->i_sb->s_blocksize;
- EXT4_I(inode)->i_disksize = inode->i_size;
- BUFFER_TRACE(bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh);
-+ up(&ei->i_append_sem);
- if (err) {
- brelse(bh);
- ext4_std_error(inode->i_sb, err);
-@@ -252,7 +260,8 @@ static unsigned dx_node_limit(struct ino
- static struct dx_frame *dx_probe(struct ext4_filename *fname,
- struct inode *dir,
- struct dx_hash_info *hinfo,
-- struct dx_frame *frame);
-+ struct dx_frame *frame,
-+ struct htree_lock *lck);
- static void dx_release(struct dx_frame *frames);
- static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
- unsigned blocksize, struct dx_hash_info *hinfo,
-@@ -266,12 +275,13 @@ static void dx_insert_block(struct dx_fr
- static int ext4_htree_next_block(struct inode *dir, __u32 hash,
- struct dx_frame *frame,
- struct dx_frame *frames,
-- __u32 *start_hash);
-+ __u32 *start_hash, struct htree_lock *lck);
- static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
- struct ext4_filename *fname,
-- struct ext4_dir_entry_2 **res_dir);
-+ struct ext4_dir_entry_2 **res_dir, struct htree_lock *lck);
- static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
-- struct inode *dir, struct inode *inode);
-+ struct inode *dir, struct inode *inode,
-+ struct htree_lock *lck);
-
- /* checksumming functions */
- void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
-@@ -735,6 +745,227 @@ struct stats dx_show_entries(struct dx_h
- }
- #endif /* DX_DEBUG */
-
-+/* private data for htree_lock */
-+struct ext4_dir_lock_data {
-+ unsigned ld_flags; /* bits-map for lock types */
-+ unsigned ld_count; /* # entries of the last DX block */
-+ struct dx_entry ld_at_entry; /* copy of leaf dx_entry */
-+ struct dx_entry *ld_at; /* position of leaf dx_entry */
-+};
-+
-+#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private)
-+#define ext4_find_entry(dir, name, dirent, inline) \
-+ __ext4_find_entry(dir, name, dirent, inline, NULL)
-+#define ext4_add_entry(handle, dentry, inode) \
-+ __ext4_add_entry(handle, dentry, inode, NULL)
-+
-+/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */
-+#define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32)
-+
-+static void ext4_htree_event_cb(void *target, void *event)
-+{
-+ u64 *block = (u64 *)target;
-+
-+ if (*block == dx_get_block((struct dx_entry *)event))
-+ *block = EXT4_HTREE_NODE_CHANGED;
-+}
-+
-+struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits)
-+{
-+ struct htree_lock_head *lhead;
-+
-+ lhead = htree_lock_head_alloc(EXT4_LK_MAX, hbits, 0);
-+ if (lhead != NULL) {
-+ htree_lock_event_attach(lhead, EXT4_LK_SPIN, HTREE_EVENT_WR,
-+ ext4_htree_event_cb);
-+ }
-+ return lhead;
-+}
-+EXPORT_SYMBOL(ext4_htree_lock_head_alloc);
-+
-+struct htree_lock *ext4_htree_lock_alloc(void)
-+{
-+ return htree_lock_alloc(EXT4_LK_MAX,
-+ sizeof(struct ext4_dir_lock_data));
-+}
-+EXPORT_SYMBOL(ext4_htree_lock_alloc);
-+
-+static htree_lock_mode_t ext4_htree_mode(unsigned flags)
-+{
-+ switch (flags) {
-+ default: /* 0 or unknown flags require EX lock */
-+ return HTREE_LOCK_EX;
-+ case EXT4_HLOCK_READDIR:
-+ return HTREE_LOCK_PR;
-+ case EXT4_HLOCK_LOOKUP:
-+ return HTREE_LOCK_CR;
-+ case EXT4_HLOCK_DEL:
-+ case EXT4_HLOCK_ADD:
-+ return HTREE_LOCK_CW;
-+ }
-+}
-+
-+/* return PR for read-only operations, otherwise return EX */
-+static inline htree_lock_mode_t ext4_htree_safe_mode(unsigned flags)
-+{
-+ int writer = (flags & EXT4_LB_DE) == EXT4_LB_DE;
-+
-+ /* 0 requires EX lock */
-+ return (flags == 0 || writer) ? HTREE_LOCK_EX : HTREE_LOCK_PR;
-+}
-+
-+static int ext4_htree_safe_locked(struct htree_lock *lck)
-+{
-+ int writer;
-+
-+ if (lck == NULL || lck->lk_mode == HTREE_LOCK_EX)
-+ return 1;
-+
-+ writer = (ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_DE) ==
-+ EXT4_LB_DE;
-+ if (writer) /* all readers & writers are excluded? */
-+ return lck->lk_mode == HTREE_LOCK_EX;
-+
-+ /* all writers are excluded? */
-+ return lck->lk_mode == HTREE_LOCK_PR ||
-+ lck->lk_mode == HTREE_LOCK_PW ||
-+ lck->lk_mode == HTREE_LOCK_EX;
-+}
-+
-+/* Relock htree_lock in EX mode if it's a change operation, otherwise
-+ * relock it in PR mode. It's a no-op if PDO is disabled. */
-+static void ext4_htree_safe_relock(struct htree_lock *lck)
-+{
-+ if (!ext4_htree_safe_locked(lck)) {
-+ unsigned flags = ext4_htree_lock_data(lck)->ld_flags;
-+
-+ htree_change_lock(lck, ext4_htree_safe_mode(flags));
-+ }
-+}
-+
-+void ext4_htree_lock(struct htree_lock *lck, struct htree_lock_head *lhead,
-+ struct inode *dir, unsigned flags)
-+{
-+ htree_lock_mode_t mode = is_dx(dir) ? ext4_htree_mode(flags) :
-+ ext4_htree_safe_mode(flags);
-+
-+ ext4_htree_lock_data(lck)->ld_flags = flags;
-+ htree_lock(lck, lhead, mode);
-+ if (!is_dx(dir))
-+ ext4_htree_safe_relock(lck); /* make sure it's safe locked */
-+}
-+EXPORT_SYMBOL(ext4_htree_lock);
-+
-+static int ext4_htree_node_lock(struct htree_lock *lck, struct dx_entry *at,
-+ unsigned lmask, int wait, void *ev)
-+{
-+ u32 key = (at == NULL) ? 0 : dx_get_block(at);
-+ u32 mode;
-+
-+ /* NOOP if htree is well protected or caller doesn't require the lock */
-+ if (ext4_htree_safe_locked(lck) ||
-+ !(ext4_htree_lock_data(lck)->ld_flags & lmask))
-+ return 1;
-+
-+ mode = (ext4_htree_lock_data(lck)->ld_flags & lmask) == lmask ?
-+ HTREE_LOCK_PW : HTREE_LOCK_PR;
-+ while (1) {
-+ if (htree_node_lock_try(lck, mode, key, ffz(~lmask), wait, ev))
-+ return 1;
-+ if (!(lmask & EXT4_LB_SPIN)) /* not a spinlock */
-+ return 0;
-+ cpu_relax(); /* spin until granted */
-+ }
-+}
-+
-+static int ext4_htree_node_locked(struct htree_lock *lck, unsigned lmask)
-+{
-+ return ext4_htree_safe_locked(lck) ||
-+ htree_node_is_granted(lck, ffz(~lmask));
-+}
-+
-+static void ext4_htree_node_unlock(struct htree_lock *lck,
-+ unsigned lmask, void *buf)
-+{
-+ /* NB: it's safe to call multiple times, or even when it's not locked */
-+ if (!ext4_htree_safe_locked(lck) &&
-+ htree_node_is_granted(lck, ffz(~lmask)))
-+ htree_node_unlock(lck, ffz(~lmask), buf);
-+}
-+
-+#define ext4_htree_dx_lock(lck, key) \
-+ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 1, NULL)
-+#define ext4_htree_dx_lock_try(lck, key) \
-+ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 0, NULL)
-+#define ext4_htree_dx_unlock(lck) \
-+ ext4_htree_node_unlock(lck, EXT4_LB_DX, NULL)
-+#define ext4_htree_dx_locked(lck) \
-+ ext4_htree_node_locked(lck, EXT4_LB_DX)
-+
-+static void ext4_htree_dx_need_lock(struct htree_lock *lck)
-+{
-+ struct ext4_dir_lock_data *ld;
-+
-+ if (ext4_htree_safe_locked(lck))
-+ return;
-+
-+ ld = ext4_htree_lock_data(lck);
-+ switch (ld->ld_flags) {
-+ default:
-+ return;
-+ case EXT4_HLOCK_LOOKUP:
-+ ld->ld_flags = EXT4_HLOCK_LOOKUP_SAFE;
-+ return;
-+ case EXT4_HLOCK_DEL:
-+ ld->ld_flags = EXT4_HLOCK_DEL_SAFE;
-+ return;
-+ case EXT4_HLOCK_ADD:
-+ ld->ld_flags = EXT4_HLOCK_SPLIT;
-+ return;
-+ }
-+}
-+
-+#define ext4_htree_de_lock(lck, key) \
-+ ext4_htree_node_lock(lck, key, EXT4_LB_DE, 1, NULL)
-+#define ext4_htree_de_unlock(lck) \
-+ ext4_htree_node_unlock(lck, EXT4_LB_DE, NULL)
-+
-+#define ext4_htree_spin_lock(lck, key, event) \
-+ ext4_htree_node_lock(lck, key, EXT4_LB_SPIN, 0, event)
-+#define ext4_htree_spin_unlock(lck) \
-+ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, NULL)
-+#define ext4_htree_spin_unlock_listen(lck, p) \
-+ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, p)
-+
-+static void ext4_htree_spin_stop_listen(struct htree_lock *lck)
-+{
-+ if (!ext4_htree_safe_locked(lck) &&
-+ htree_node_is_listening(lck, ffz(~EXT4_LB_SPIN)))
-+ htree_node_stop_listen(lck, ffz(~EXT4_LB_SPIN));
-+}
-+
-+enum {
-+ DX_HASH_COL_IGNORE, /* ignore collision while probing frames */
-+ DX_HASH_COL_YES, /* there is collision and it does matter */
-+ DX_HASH_COL_NO, /* there is no collision */
-+};
-+
-+static int dx_probe_hash_collision(struct htree_lock *lck,
-+ struct dx_entry *entries,
-+ struct dx_entry *at, u32 hash)
-+{
-+ if (!(lck && ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_EXACT)) {
-+ return DX_HASH_COL_IGNORE; /* don't care about collision */
-+
-+ } else if (at == entries + dx_get_count(entries) - 1) {
-+ return DX_HASH_COL_IGNORE; /* not in any leaf of this DX */
-+
-+ } else { /* hash collision? */
-+ return ((dx_get_hash(at + 1) & ~1) == hash) ?
-+ DX_HASH_COL_YES : DX_HASH_COL_NO;
-+ }
-+}
-+
- /*
- * Probe for a directory leaf block to search.
- *
-@@ -746,10 +977,11 @@ struct stats dx_show_entries(struct dx_h
- */
- static struct dx_frame *
- dx_probe(struct ext4_filename *fname, struct inode *dir,
-- struct dx_hash_info *hinfo, struct dx_frame *frame_in)
-+ struct dx_hash_info *hinfo, struct dx_frame *frame_in,
-+ struct htree_lock *lck)
- {
- unsigned count, indirect;
-- struct dx_entry *at, *entries, *p, *q, *m;
-+ struct dx_entry *at, *entries, *p, *q, *m, *dx = NULL;
- struct dx_root_info *info;
- struct dx_frame *frame = frame_in;
- struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
-@@ -811,8 +1043,15 @@ dx_probe(struct ext4_filename *fname, st
-
- dxtrace(printk("Look up %x", hash));
- while (1) {
-+ if (indirect == 0) { /* the last index level */
-+ /* NB: ext4_htree_dx_lock() could be noop if
-+ * DX-lock flag is not set for current operation */
-+ ext4_htree_dx_lock(lck, dx);
-+ ext4_htree_spin_lock(lck, dx, NULL);
-+ }
- count = dx_get_count(entries);
-- if (!count || count > dx_get_limit(entries)) {
-+ if (count == 0 || count > dx_get_limit(entries)) {
-+ ext4_htree_spin_unlock(lck); /* release spin */
- ext4_warning_inode(dir,
- "dx entry: count %u beyond limit %u",
- count, dx_get_limit(entries));
-@@ -851,8 +1090,70 @@ dx_probe(struct ext4_filename *fname, st
- dx_get_block(at)));
- frame->entries = entries;
- frame->at = at;
-- if (!indirect--)
-+
-+ if (indirect == 0) { /* the last index level */
-+ struct ext4_dir_lock_data *ld;
-+ u64 myblock;
-+
-+ /* By default we only lock DE-block, however, we will
-+ * also lock the last level DX-block if:
-+ * a) there is hash collision
-+ * we will set DX-lock flag (a few lines below)
-+ * and redo to lock DX-block
-+ * see detail in dx_probe_hash_collision()
-+ * b) it's a retry from splitting
-+ * we need to lock the last level DX-block so nobody
-+ * else can split any leaf blocks under the same
-+ * DX-block, see detail in ext4_dx_add_entry()
-+ */
-+ if (ext4_htree_dx_locked(lck)) {
-+ /* DX-block is locked, just lock DE-block
-+ * and return */
-+ ext4_htree_spin_unlock(lck);
-+ if (!ext4_htree_safe_locked(lck))
-+ ext4_htree_de_lock(lck, frame->at);
-+ return frame;
-+ }
-+ /* it's pdirop and no DX lock */
-+ if (dx_probe_hash_collision(lck, entries, at, hash) ==
-+ DX_HASH_COL_YES) {
-+ /* found hash collision, set DX-lock flag
-+ * and retry to obtain DX-lock */
-+ ext4_htree_spin_unlock(lck);
-+ ext4_htree_dx_need_lock(lck);
-+ continue;
-+ }
-+ ld = ext4_htree_lock_data(lck);
-+ /* because I don't lock DX, @at can't be trusted
-+ * after I release the spinlock, so I have to save it */
-+ ld->ld_at = at;
-+ ld->ld_at_entry = *at;
-+ ld->ld_count = dx_get_count(entries);
-+
-+ frame->at = &ld->ld_at_entry;
-+ myblock = dx_get_block(at);
-+
-+ /* NB: ordering locking */
-+ ext4_htree_spin_unlock_listen(lck, &myblock);
-+ /* other thread can split this DE-block because:
-+ * a) I don't have lock for the DE-block yet
-+ * b) I released spinlock on DX-block
-+ * if it happened I can detect it by listening
-+ * splitting event on this DE-block */
-+ ext4_htree_de_lock(lck, frame->at);
-+ ext4_htree_spin_stop_listen(lck);
-+
-+ if (myblock == EXT4_HTREE_NODE_CHANGED) {
-+ /* someone split this DE-block before
-+ * I locked it, I need to retry and lock
-+ * valid DE-block */
-+ ext4_htree_de_unlock(lck);
-+ continue;
-+ }
- return frame;
-+ }
-+ dx = at;
-+ indirect--;
- frame++;
- frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
- if (IS_ERR(frame->bh)) {
-@@ -921,7 +1222,7 @@ static void dx_release(struct dx_frame *
- static int ext4_htree_next_block(struct inode *dir, __u32 hash,
- struct dx_frame *frame,
- struct dx_frame *frames,
-- __u32 *start_hash)
-+ __u32 *start_hash, struct htree_lock *lck)
- {
- struct dx_frame *p;
- struct buffer_head *bh;
-@@ -936,12 +1237,22 @@ static int ext4_htree_next_block(struct
- * this loop, num_frames indicates the number of interior
- * nodes need to be read.
- */
-+ ext4_htree_de_unlock(lck);
- while (1) {
-- if (++(p->at) < p->entries + dx_get_count(p->entries))
-- break;
-+ if (num_frames > 0 || ext4_htree_dx_locked(lck)) {
-+ /* num_frames > 0 :
-+ * DX block
-+ * ext4_htree_dx_locked:
-+ * frame->at is reliable pointer returned by dx_probe,
-+ * otherwise dx_probe already knew no collision */
-+ if (++(p->at) < p->entries + dx_get_count(p->entries))
-+ break;
-+ }
- if (p == frames)
- return 0;
- num_frames++;
-+ if (num_frames == 1)
-+ ext4_htree_dx_unlock(lck);
- p--;
- }
-
-@@ -964,6 +1275,13 @@ static int ext4_htree_next_block(struct
- * block so no check is necessary
- */
- while (num_frames--) {
-+ if (num_frames == 0) {
-+ /* it's not always necessary, we just don't want to
-+ * detect hash collision again */
-+ ext4_htree_dx_need_lock(lck);
-+ ext4_htree_dx_lock(lck, p->at);
-+ }
-+
- bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX);
- if (IS_ERR(bh))
- return PTR_ERR(bh);
-@@ -972,6 +1290,7 @@ static int ext4_htree_next_block(struct
- p->bh = bh;
- p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
- }
-+ ext4_htree_de_lock(lck, p->at);
- return 1;
- }
-
-@@ -1119,10 +1438,10 @@ int ext4_htree_fill_tree(struct file *di
- }
- hinfo.hash = start_hash;
- hinfo.minor_hash = 0;
-- frame = dx_probe(NULL, dir, &hinfo, frames);
-+ /* assume it's PR locked */
-+ frame = dx_probe(NULL, dir, &hinfo, frames, NULL);
- if (IS_ERR(frame))
- return PTR_ERR(frame);
--
- /* Add '.' and '..' from the htree header */
- if (!start_hash && !start_minor_hash) {
- de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
-@@ -1162,7 +1481,7 @@ int ext4_htree_fill_tree(struct file *di
- count += ret;
- hashval = ~0;
- ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
-- frame, frames, &hashval);
-+ frame, frames, &hashval, NULL);
- *next_hash = hashval;
- if (ret < 0) {
- err = ret;
-@@ -1354,10 +1673,10 @@ static int is_dx_internal_node(struct in
- * The returned buffer_head has ->b_count elevated. The caller is expected
- * to brelse() it when appropriate.
- */
--static struct buffer_head * ext4_find_entry (struct inode *dir,
-+struct buffer_head *__ext4_find_entry(struct inode *dir,
- const struct qstr *d_name,
- struct ext4_dir_entry_2 **res_dir,
-- int *inlined)
-+ int *inlined, struct htree_lock *lck)
- {
- struct super_block *sb;
- struct buffer_head *bh_use[NAMEI_RA_SIZE];
-@@ -1406,7 +1725,7 @@ static struct buffer_head * ext4_find_en
- goto restart;
- }
- if (is_dx(dir)) {
-- ret = ext4_dx_find_entry(dir, &fname, res_dir);
-+ ret = ext4_dx_find_entry(dir, &fname, res_dir, lck);
- /*
- * On success, or if the error was file not found,
- * return. Otherwise, fall back to doing a search the
-@@ -1416,6 +1735,7 @@ static struct buffer_head * ext4_find_en
- goto cleanup_and_exit;
- dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
- "falling back\n"));
-+ ext4_htree_safe_relock(lck);
- ret = NULL;
- }
- nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
-@@ -1507,10 +1827,12 @@ cleanup_and_exit:
- ext4_fname_free_filename(&fname);
- return ret;
- }
-+EXPORT_SYMBOL(__ext4_find_entry);
-
- static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
- struct ext4_filename *fname,
-- struct ext4_dir_entry_2 **res_dir)
-+ struct ext4_dir_entry_2 **res_dir,
-+ struct htree_lock *lck)
- {
- struct super_block * sb = dir->i_sb;
- struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
-@@ -1521,7 +1843,7 @@ static struct buffer_head * ext4_dx_find
- #ifdef CONFIG_EXT4_FS_ENCRYPTION
- *res_dir = NULL;
- #endif
-- frame = dx_probe(fname, dir, NULL, frames);
-+ frame = dx_probe(fname, dir, NULL, frames, lck);
- if (IS_ERR(frame))
- return (struct buffer_head *) frame;
- do {
-@@ -1543,7 +1865,7 @@ static struct buffer_head * ext4_dx_find
-
- /* Check to see if we should continue to search */
- retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame,
-- frames, NULL);
-+ frames, NULL, lck);
- if (retval < 0) {
- ext4_warning_inode(dir,
- "error %d reading directory index block",
-@@ -1718,8 +2040,9 @@ static struct ext4_dir_entry_2* dx_pack_
- * Returns pointer to de in block into which the new entry will be inserted.
- */
- static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
-- struct buffer_head **bh,struct dx_frame *frame,
-- struct dx_hash_info *hinfo)
-+ struct buffer_head **bh, struct dx_frame *frames,
-+ struct dx_frame *frame, struct dx_hash_info *hinfo,
-+ struct htree_lock *lck)
- {
- unsigned blocksize = dir->i_sb->s_blocksize;
- unsigned count, continued;
-@@ -1781,8 +2104,14 @@ static struct ext4_dir_entry_2 *do_split
- hash2, split, count-split));
-
- /* Fancy dance to stay within two buffers */
-- de2 = dx_move_dirents(data1, data2, map + split, count - split,
-- blocksize);
-+ if (hinfo->hash < hash2) {
-+ de2 = dx_move_dirents(data1, data2, map + split,
-+ count - split, blocksize);
-+ } else {
-+ /* make sure we will add entry to the same block which
-+ * we have already locked */
-+ de2 = dx_move_dirents(data1, data2, map, split, blocksize);
-+ }
- de = dx_pack_dirents(data1, blocksize);
- de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
- (char *) de,
-@@ -1803,12 +2132,21 @@ static struct ext4_dir_entry_2 *do_split
- dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2,
- blocksize, 1));
-
-- /* Which block gets the new entry? */
-- if (hinfo->hash >= hash2) {
-- swap(*bh, bh2);
-- de = de2;
-+ ext4_htree_spin_lock(lck, frame > frames ? (frame - 1)->at : NULL,
-+ frame->at); /* notify block is being split */
-+ if (hinfo->hash < hash2) {
-+ dx_insert_block(frame, hash2 + continued, newblock);
-+
-+ } else {
-+ /* switch block number */
-+ dx_insert_block(frame, hash2 + continued,
-+ dx_get_block(frame->at));
-+ dx_set_block(frame->at, newblock);
-+ (frame->at)++;
- }
-- dx_insert_block(frame, hash2 + continued, newblock);
-+ ext4_htree_spin_unlock(lck);
-+ ext4_htree_dx_unlock(lck);
-+
- err = ext4_handle_dirty_dirent_node(handle, dir, bh2);
- if (err)
- goto journal_error;
-@@ -2082,7 +2420,7 @@ static int make_indexed_dir(handle_t *ha
- if (retval)
- goto out_frames;
-
-- de = do_split(handle,dir, &bh2, frame, &fname->hinfo);
-+ de = do_split(handle, dir, &bh2, frames, frame, &fname->hinfo, NULL);
- if (IS_ERR(de)) {
- retval = PTR_ERR(de);
- goto out_frames;
-@@ -2192,8 +2530,8 @@ out:
- * may not sleep between calling this and putting something into
- * the entry, as someone else might have used it while you slept.
- */
--static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-- struct inode *inode)
-+int __ext4_add_entry(handle_t *handle, struct dentry *dentry,
-+ struct inode *inode, struct htree_lock *lck)
- {
- struct inode *dir = d_inode(dentry->d_parent);
- struct buffer_head *bh = NULL;
-@@ -2234,9 +2572,10 @@ static int ext4_add_entry(handle_t *hand
- if (dentry->d_name.len == 2 &&
- memcmp(dentry->d_name.name, "..", 2) == 0)
- return ext4_update_dotdot(handle, dentry, inode);
-- retval = ext4_dx_add_entry(handle, &fname, dir, inode);
-+ retval = ext4_dx_add_entry(handle, &fname, dir, inode, lck);
- if (!retval || (retval != ERR_BAD_DX_DIR))
- goto out;
-+ ext4_htree_safe_relock(lck);
- /* Can we just ignore htree data? */
- if (ext4_has_metadata_csum(sb)) {
- EXT4_ERROR_INODE(dir,
-@@ -2293,12 +2632,14 @@ out:
- ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
- return retval;
- }
-+EXPORT_SYMBOL(__ext4_add_entry);
-
- /*
- * Returns 0 for success, or a negative error value
- */
- static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
-- struct inode *dir, struct inode *inode)
-+ struct inode *dir, struct inode *inode,
-+ struct htree_lock *lck)
- {
- struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
- struct dx_entry *entries, *at;
-@@ -2310,7 +2651,7 @@ static int ext4_dx_add_entry(handle_t *h
-
- again:
- restart = 0;
-- frame = dx_probe(fname, dir, NULL, frames);
-+ frame = dx_probe(fname, dir, NULL, frames, lck);
- if (IS_ERR(frame))
- return PTR_ERR(frame);
- entries = frame->entries;
-@@ -2345,6 +2686,11 @@ again:
- struct dx_node *node2;
- struct buffer_head *bh2;
-
-+ if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */
-+ ext4_htree_safe_relock(lck);
-+ restart = 1;
-+ goto cleanup;
-+ }
- while (frame > frames) {
- if (dx_get_count((frame - 1)->entries) <
- dx_get_limit((frame - 1)->entries)) {
-@@ -2447,8 +2793,32 @@ again:
- restart = 1;
- goto journal_error;
- }
-+ } else if (!ext4_htree_dx_locked(lck)) {
-+ struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck);
-+
-+ /* not well protected, require DX lock */
-+ ext4_htree_dx_need_lock(lck);
-+ at = frame > frames ? (frame - 1)->at : NULL;
-+
-+ /* NB: no risk of deadlock because it's just a try.
-+ *
-+ * NB: we check ld_count twice: the first time before
-+ * taking the DX lock, the second time after holding the DX lock.
-+ *
-+ * NB: We never free blocks for directory so far, which
-+ * means the value returned by dx_get_count() should be equal to
-+ * ld->ld_count if nobody split any DE-block under @at,
-+ * and ld->ld_at still points to valid dx_entry. */
-+ if ((ld->ld_count != dx_get_count(entries)) ||
-+ !ext4_htree_dx_lock_try(lck, at) ||
-+ (ld->ld_count != dx_get_count(entries))) {
-+ restart = 1;
-+ goto cleanup;
-+ }
-+ /* OK, I've got DX lock and nothing changed */
-+ frame->at = ld->ld_at;
- }
-- de = do_split(handle, dir, &bh, frame, &fname->hinfo);
-+ de = do_split(handle, dir, &bh, frames, frame, &fname->hinfo, lck);
- if (IS_ERR(de)) {
- err = PTR_ERR(de);
- goto cleanup;
-@@ -2459,6 +2829,8 @@ again:
- journal_error:
- ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */
- cleanup:
-+ ext4_htree_dx_unlock(lck);
-+ ext4_htree_de_unlock(lck);
- brelse(bh);
- dx_release(frames);
- /* @restart is true means htree-path has been changed, we need to
-Index: linux-4.18.0-80.1.2.el8_0/fs/ext4/super.c
-===================================================================
---- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/super.c
-+++ linux-4.18.0-80.1.2.el8_0/fs/ext4/super.c
-@@ -1009,6 +1009,7 @@ static struct inode *ext4_alloc_inode(st
-
- inode_set_iversion(&ei->vfs_inode, 1);
- spin_lock_init(&ei->i_raw_lock);
-+ sema_init(&ei->i_append_sem, 1);
- INIT_LIST_HEAD(&ei->i_prealloc_list);
- spin_lock_init(&ei->i_prealloc_lock);
- ext4_es_init_tree(&ei->i_es_tree);
+++ /dev/null
-diff -wur a/fs/ext4/ext4.h b/fs/ext4/ext4.h
---- a/fs/ext4/ext4.h
-+++ b/fs/ext4/ext4.h
-@@ -1154,6 +1154,9 @@ struct ext4_inode_info {
- #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
- #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */
-
-+/* we know this is a Lustre mount thanks to the DIRDATA flag */
-+#define IS_LUSTRE_MOUNT(sb) test_opt((sb), DIRDATA)
-+
- /*
- * Mount flags set either automatically (could not be set by mount option)
- * based on per file system feature or property or in special cases such as
-diff -wur a/fs/ext4/inode.c b/fs/ext4/inode.c
---- a/fs/ext4/inode.c
-+++ b/fs/ext4/inode.c
-@@ -4766,8 +4766,9 @@ void ext4_set_inode_flags(struct inod
- new_fl |= S_DIRSYNC;
- if (ext4_should_use_dax(inode))
- new_fl |= S_DAX;
-- if (flags & EXT4_ENCRYPT_FL)
-+ if (flags & EXT4_ENCRYPT_FL &&
-+ unlikely(!IS_LUSTRE_MOUNT(inode->i_sb)))
- new_fl |= S_ENCRYPTED;
- inode_set_flags(inode, new_fl,
- S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX|
- S_ENCRYPTED);
-@@ -5753,8 +5757,9 @@ int ext4_getattr(const struct path *p
- stat->attributes |= STATX_ATTR_APPEND;
- if (flags & EXT4_COMPR_FL)
- stat->attributes |= STATX_ATTR_COMPRESSED;
-- if (flags & EXT4_ENCRYPT_FL)
-+ if (flags & EXT4_ENCRYPT_FL &&
-+ unlikely(!IS_LUSTRE_MOUNT(inode->i_sb)))
- stat->attributes |= STATX_ATTR_ENCRYPTED;
- if (flags & EXT4_IMMUTABLE_FL)
- stat->attributes |= STATX_ATTR_IMMUTABLE;
- if (flags & EXT4_NODUMP_FL)
-diff -wur a/fs/ext4/xattr.c b/fs/ext4/xattr.c
---- a/fs/ext4/xattr.c
-+++ b/fs/ext4/xattr.c
-@@ -654,6 +654,7 @@
- up_read(&EXT4_I(inode)->xattr_sem);
- return error;
- }
-+EXPORT_SYMBOL(ext4_xattr_get);
-
- static int
- ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
-@@ -2413,13 +2415,20 @@
- ext4_handle_sync(handle);
- }
-
-+ if (!error && name_index == EXT4_XATTR_INDEX_ENCRYPTION &&
-+ strcmp(name, "c") == 0) {
-+ EXT4_I(inode)->i_flags |= EXT4_ENCRYPT_FL;
-+ mark_inode_dirty(inode);
-+ }
-+
- cleanup:
- brelse(is.iloc.bh);
- brelse(bs.bh);
- ext4_write_unlock_xattr(inode, &no_expand);
- return error;
- }
-+EXPORT_SYMBOL(ext4_xattr_set_handle);
-
- int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
- bool is_create, int *credits)
-
+++ /dev/null
-diff -wur /dev/null b/fs/ext4/critical_encode.h
---- /dev/null
-+++ b/fs/ext4/critical_encode.h
-@@ -0,0 +1,166 @@
-+/*
-+ * critical_encode.h
-+ *
-+ * Copyright (c) 2022 Whamcloud
-+ */
-+
-+#ifndef _CRITICAL_ENCODE_H
-+#define _CRITICAL_ENCODE_H
-+
-+#include <linux/ctype.h>
-+
-+/* Encoding/decoding routines inspired from yEnc principles.
-+ * We just take care of a few critical characters:
-+ * NULL, LF, CR, /, DEL and =.
-+ * If such a char is found, it is replaced with '=' followed by
-+ * the char value + 64.
-+ * All other chars are left untouched.
-+ * Efficiency of this encoding depends on the occurrences of the
-+ * critical chars, but statistically on binary data it can be much higher
-+ * than base64 for instance.
-+ */
-+static inline int critical_encode(const u8 *src, int len, char *dst)
-+{
-+ u8 *p = (u8 *)src, *q = dst;
-+
-+ while (p - src < len) {
-+ /* escape NULL, LF, CR, /, DEL and = */
-+ if (unlikely(*p == 0x0 || *p == 0xA || *p == 0xD ||
-+ *p == '/' || *p == 0x7F || *p == '=')) {
-+ *(q++) = '=';
-+ *(q++) = *(p++) + 64;
-+ } else {
-+ *(q++) = *(p++);
-+ }
-+ }
-+
-+ return (char *)q - dst;
-+}
-+
-+/* returns the number of chars encoding would produce */
-+static inline int critical_chars(const u8 *src, int len)
-+{
-+ u8 *p = (u8 *)src;
-+ int newlen = len;
-+
-+ while (p - src < len) {
-+ /* NULL, LF, CR, /, DEL and = cost an additional '=' */
-+ if (unlikely(*p == 0x0 || *p == 0xA || *p == 0xD ||
-+ *p == '/' || *p == 0x7F || *p == '='))
-+ newlen++;
-+ p++;
-+ }
-+
-+ return newlen;
-+}
-+
-+/* decoding routine - returns the number of chars in output */
-+static inline int critical_decode(const u8 *src, int len, char *dst)
-+{
-+ u8 *p = (u8 *)src, *q = dst;
-+
-+ while (p - src < len) {
-+ if (unlikely(*p == '=')) {
-+ *(q++) = *(++p) - 64;
-+ p++;
-+ } else {
-+ *(q++) = *(p++);
-+ }
-+ }
-+
-+ return (char *)q - dst;
-+}
-+
-+#define fscrypt_get_encryption_info(inode) \
-+ (unlikely(!IS_LUSTRE_MOUNT(inode->i_sb)) ? 0 : -EOPNOTSUPP)
-+
-+static inline int ext4_has_permitted_context(struct inode *parent,
-+ struct inode *child)
-+{
-+ if (unlikely(!IS_LUSTRE_MOUNT(parent->i_sb)))
-+ return 1;
-+ return fscrypt_has_permitted_context(parent, child);
-+}
-+
-+static inline int ext4_prepare_lookup(struct inode *dir,
-+ struct dentry *dentry,
-+ unsigned int flags)
-+{
-+ if (unlikely(!IS_LUSTRE_MOUNT(dir->i_sb)))
-+ return 0;
-+ return fscrypt_prepare_lookup(dir, dentry, flags);
-+}
-+
-+static inline int ext4_fname_alloc_buffer(const struct inode *inode,
-+ u32 max_encrypted_len,
-+ struct fscrypt_str *crypto_str)
-+{
-+ crypto_str->name = kmalloc(max_encrypted_len + 1, GFP_NOFS);
-+ if (!crypto_str->name)
-+ return -ENOMEM;
-+ crypto_str->len = max_encrypted_len;
-+ return 0;
-+}
-+
-+static inline void ext4_fname_free_buffer(struct fscrypt_str *crypto_str)
-+{
-+ if (!crypto_str)
-+ return;
-+ kfree(crypto_str->name);
-+ crypto_str->name = NULL;
-+}
-+
-+static inline int ext4_fname_disk_to_usr(struct inode *inode,
-+ u32 hash, u32 minor_hash,
-+ const struct fscrypt_str *iname,
-+ struct fscrypt_str *oname)
-+{
-+ int presented_len;
-+
-+ presented_len = critical_encode(iname->name, iname->len, oname->name);
-+ if (presented_len > NAME_MAX) {
-+ /* truncate at NAME_MAX,
-+ * or NAME_MAX-1 if name ends with '=' to avoid decoding issue
-+ */
-+ presented_len = NAME_MAX;
-+ if (oname->name[presented_len - 1] == '=')
-+ presented_len--;
-+ oname->len = presented_len;
-+ }
-+ oname->name[presented_len] = '\0';
-+
-+ return 0;
-+}
-+
-+static inline int ext4_setup_filename(struct inode *dir,
-+ const struct qstr *iname,
-+ int lookup,
-+ struct ext4_filename *fname)
-+{
-+ fname->usr_fname = iname;
-+
-+ if (lookup && IS_ENCRYPTED(dir) &&
-+ unlikely(!IS_LUSTRE_MOUNT(dir->i_sb) &&
-+ strnchr(iname->name, iname->len, '='))) {
-+ /* Only proceed to critical decode if
-+ * iname contains escape char '='.
-+ */
-+ int len = iname->len;
-+ char *buf;
-+
-+ buf = kmalloc(len, GFP_NOFS);
-+ if (!buf)
-+ return -ENOMEM;
-+
-+ len = critical_decode(iname->name, len, buf);
-+ fname->disk_name.name = (unsigned char *)buf;
-+ fname->disk_name.len = len;
-+ return 0;
-+ }
-+
-+ fname->disk_name.name = (unsigned char *) iname->name;
-+ fname->disk_name.len = iname->len;
-+ return 0;
-+}
-+
-+#endif /* _CRITICAL_ENCODE_H */
-diff -wur a/fs/ext4/dir.c b/fs/ext4/dir.c
---- a/fs/ext4/dir.c
-+++ b/fs/ext4/dir.c
-@@ -28,6 +28,7 @@
- #include <linux/iversion.h>
- #include "ext4.h"
- #include "xattr.h"
-+#include "critical_encode.h"
-
- static int ext4_dx_readdir(struct file *, struct dir_context *);
-
-@@ -144,7 +145,8 @@ static int ext4_readdir(struct file *
- return err;
- }
-
-- if (IS_ENCRYPTED(inode)) {
-+ /* disable decryption of filename, present only escaped name */
-+ if (0 && IS_ENCRYPTED(inode)) {
- err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr);
- if (err < 0)
- return err;
-@@ -258,22 +259,33 @@ static int ext4_readdir(struct file *
- get_dtype(sb, de->file_type)))
- goto done;
- } else {
-- int save_len = fstr.len;
- struct fscrypt_str de_name =
- FSTR_INIT(de->name,
- de->name_len);
-+ int presented_len;
-
- /* Directory is encrypted */
-- err = fscrypt_fname_disk_to_usr(inode,
-+ presented_len = critical_chars(de->name,
-+ de->name_len);
-+ err = ext4_fname_alloc_buffer(inode,
-+ presented_len,
-+ &fstr);
-+ if (err)
-+ goto errout;
-+
-+ err = ext4_fname_disk_to_usr(inode,
- 0, 0, &de_name, &fstr);
- de_name = fstr;
-- fstr.len = save_len;
-- if (err)
-+ if (err) {
-+ ext4_fname_free_buffer(&fstr);
- goto errout;
-- if (!dir_emit(ctx,
-+ }
-+ err = dir_emit(ctx,
- de_name.name, de_name.len,
- le32_to_cpu(de->inode),
-- get_dtype(sb, de->file_type)))
-+ get_dtype(sb, de->file_type));
-+ ext4_fname_free_buffer(&fstr);
-+ if (!err)
- goto done;
- }
- }
-diff -wur a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
---- a/fs/ext4/ialloc.c
-+++ b/fs/ext4/ialloc.c
-@@ -30,6 +30,7 @@
- #include "ext4_jbd2.h"
- #include "xattr.h"
- #include "acl.h"
-+#include "critical_encode.h"
-
- #include <trace/events/ext4.h>
-
-diff -wur a/fs/ext4/namei.c b/fs/ext4/namei.c
---- a/fs/ext4/namei.c
-+++ b/fs/ext4/namei.c
-@@ -40,6 +40,7 @@
-
- #include "xattr.h"
- #include "acl.h"
-+#include "critical_encode.h"
-
- #include <trace/events/ext4.h>
- /*
-@@ -1368,22 +1369,31 @@ static int htree_dirblock_to_tree(struct
- hinfo->hash, hinfo->minor_hash, de,
- &tmp_str);
- } else {
-- int save_len = fname_crypto_str.len;
- struct fscrypt_str de_name = FSTR_INIT(de->name,
- de->name_len);
-+ int presented_len;
-
- /* Directory is encrypted */
-- err = fscrypt_fname_disk_to_usr(dir, hinfo->hash,
-+ presented_len = critical_chars(de->name, de->name_len);
-+ err = ext4_fname_alloc_buffer(dir, presented_len,
-+ &fname_crypto_str);
-+ if (err) {
-+ count = err;
-+ goto errout;
-+ }
-+
-+ err = ext4_fname_disk_to_usr(dir, hinfo->hash,
- hinfo->minor_hash, &de_name,
- &fname_crypto_str);
- if (err) {
-+ ext4_fname_free_buffer(&fname_crypto_str);
- count = err;
- goto errout;
- }
- err = ext4_htree_store_dirent(dir_file,
- hinfo->hash, hinfo->minor_hash, de,
- &fname_crypto_str);
-- fname_crypto_str.len = save_len;
-+ ext4_fname_free_buffer(&fname_crypto_str);
- }
- if (err != 0) {
- count = err;
-@@ -1614,7 +1614,7 @@ static void dx_insert_block(struct dx_fr
- * Return: %true if the directory entry matches, otherwise %false.
- */
- static inline bool ext4_match(const struct ext4_filename *fname,
-- const struct ext4_dir_entry_2 *de)
-+ const struct ext4_dir_entry_2 *de, int denamelen)
- {
- struct fscrypt_name f;
-
-@@ -1626,7 +1626,7 @@ static inline bool ext4_match(const s
- #ifdef CONFIG_EXT4_FS_ENCRYPTION
- f.crypto_buf = fname->crypto_buf;
- #endif
-- return fscrypt_match_name(&f, de->name, de->name_len);
-+ return fscrypt_match_name(&f, de->name, denamelen);
- }
-
- /*
-@@ -1637,16 +1637,30 @@ int ext4_search_dir(struct buffer_hea
- unsigned int offset, struct ext4_dir_entry_2 **res_dir)
- {
- struct ext4_dir_entry_2 * de;
-+ bool probablytrunc;
- char * dlimit;
-- int de_len;
-+ int de_len, denamelen;
-
- de = (struct ext4_dir_entry_2 *)search_buf;
- dlimit = search_buf + buf_size;
-+ /* fname is probably truncated if it is the decoded representation of
-+ * an encrypted filename not aligned on a 32-byte boundary
-+ */
-+ probablytrunc = !IS_LUSTRE_MOUNT(dir->i_sb) && IS_ENCRYPTED(dir) &&
-+ fname->disk_name.len & 31;
- while ((char *) de < dlimit) {
- /* this code is executed quadratically often */
- /* do minimal checking `by hand' */
-+ denamelen = de->name_len;
-+ if (unlikely(probablytrunc) &&
-+ de->name_len > fname->disk_name.len)
-+ /* Adjust name len to look for a partial match.
-+ * Since it is binary encrypted names, there
-+ * should not be any collision between names.
-+ */
-+ denamelen = fname->disk_name.len;
- if ((char *) de + de->name_len <= dlimit &&
-- ext4_match(fname, de)) {
-+ ext4_match(fname, de, denamelen)) {
- /* found a match - just to be sure, do
- * a full check */
- if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
-@@ -1707,7 +1717,7 @@ struct buffer_head *__ext4_find_entry
- if (namelen > EXT4_NAME_LEN)
- return NULL;
-
-- retval = ext4_fname_setup_filename(dir, d_name, 1, &fname);
-+ retval = ext4_setup_filename(dir, d_name, 1, &fname);
- if (retval == -ENOENT)
- return NULL;
- if (retval)
-@@ -1834,7 +1844,8 @@ cleanup_and_exit:
- /* Clean up the read-ahead blocks */
- for (; ra_ptr < ra_max; ra_ptr++)
- brelse(bh_use[ra_ptr]);
-- ext4_fname_free_filename(&fname);
-+ if (fname.disk_name.name != d_name->name)
-+ kfree(fname.disk_name.name);
- return ret;
- }
- EXPORT_SYMBOL(__ext4_find_entry);
-@@ -1900,7 +1911,7 @@ static struct dentry *ext4_lookup(str
- struct buffer_head *bh;
- int err;
-
-- err = fscrypt_prepare_lookup(dir, dentry, flags);
-+ err = ext4_prepare_lookup(dir, dentry, flags);
- if (err)
- return ERR_PTR(err);
-
-@@ -1957,7 +1957,7 @@ static struct dentry *ext4_lookup(struct
- }
- if (!IS_ERR(inode) && IS_ENCRYPTED(dir) &&
- (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
-- !fscrypt_has_permitted_context(dir, inode)) {
-+ !ext4_has_permitted_context(dir, inode)) {
- ext4_warning(inode->i_sb,
- "Inconsistent encryption contexts: %lu/%lu",
- dir->i_ino, inode->i_ino);
-@@ -2206,7 +2221,7 @@ int ext4_find_dest_de(struct inode *d
- if (ext4_check_dir_entry(dir, NULL, de, bh,
- buf, buf_size, offset))
- return -EFSCORRUPTED;
-- if (ext4_match(fname, de))
-+ if (ext4_match(fname, de, de->name_len))
- return -EEXIST;
- nlen = EXT4_DIR_ENTRY_LEN(de);
- rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+++ /dev/null
---- linux-4.18/fs/ext4/balloc.c 2019-11-28 14:55:26.506546036 +0300
-+++ linux-4.18/fs/ext4/balloc.c 2019-12-02 11:21:50.565975537 +0300
-@@ -404,7 +404,8 @@ verified:
- * Return buffer_head on success or NULL in case of failure.
- */
- struct buffer_head *
--ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
-+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
-+ int ignore_locked)
- {
- struct ext4_group_desc *desc;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
-@@ -435,6 +436,13 @@ ext4_read_block_bitmap_nowait(struct
- if (bitmap_uptodate(bh))
- goto verify;
-
-+ if (ignore_locked && buffer_locked(bh)) {
-+ /* buffer under IO already, do not wait
-+ * if called for prefetching */
-+ put_bh(bh);
-+ return NULL;
-+ }
-+
- lock_buffer(bh);
- if (bitmap_uptodate(bh)) {
- unlock_buffer(bh);
-@@ -524,7 +532,7 @@ ext4_read_block_bitmap(struct super_b
- struct buffer_head *bh;
- int err;
-
-- bh = ext4_read_block_bitmap_nowait(sb, block_group);
-+ bh = ext4_read_block_bitmap_nowait(sb, block_group, 0);
- if (IS_ERR(bh))
- return bh;
- err = ext4_wait_block_bitmap(sb, block_group, bh);
---- linux-4.18/fs/ext4/ext4.h 2019-11-28 14:55:26.470545343 +0300
-+++ linux-4.18/fs/ext4/ext4.h 2019-12-02 11:21:40.795779972 +0300
-@@ -1446,6 +1446,8 @@ struct ext4_sb_info {
- /* where last allocation was done - for stream allocation */
- unsigned long s_mb_last_group;
- unsigned long s_mb_last_start;
-+ unsigned int s_mb_prefetch;
-+ unsigned int s_mb_prefetch_limit;
-
- /* stats for buddy allocator */
- atomic_t s_bal_reqs; /* number of reqs with len > 1 */
-@@ -2401,7 +2403,8 @@ extern struct ext4_group_desc * ldisk
- extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
-
- extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
-- ext4_group_t block_group);
-+ ext4_group_t block_group,
-+ int ignore_locked);
- extern int ext4_wait_block_bitmap(struct super_block *sb,
- ext4_group_t block_group,
- struct buffer_head *bh);
-@@ -3047,6 +3051,7 @@ struct ext4_group_info {
- #define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3
- #define EXT4_GROUP_INFO_IBITMAP_CORRUPT \
- (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
-+#define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4
-
- #define EXT4_MB_GRP_NEED_INIT(grp) \
- (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
-@@ -3065,6 +3070,10 @@ struct ext4_group_info {
- (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
- #define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \
- (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
-+#define EXT4_MB_GRP_TEST(grp) \
-+ (test_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))
-+#define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \
-+ (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))
-
- #define EXT4_MAX_CONTENTION 8
- #define EXT4_CONTENTION_THRESHOLD 2
---- linux-4.18/fs/ext4/mballoc.c 2019-11-28 14:55:26.500545920 +0300
-+++ linux-4.18/fs/ext4/mballoc.c 2019-12-02 11:21:46.656897291 +0300
-@@ -868,7 +868,7 @@ static int ext4_mb_init_cache(struct
- bh[i] = NULL;
- continue;
- }
-- bh[i] = ext4_read_block_bitmap_nowait(sb, group);
-+ bh[i] = ext4_read_block_bitmap_nowait(sb, group, 0);
- if (IS_ERR(bh[i])) {
- err = PTR_ERR(bh[i]);
- bh[i] = NULL;
-@@ -2104,6 +2112,92 @@ static int ext4_mb_good_group(struct
- return 0;
- }
-
-+/*
-+ * each allocation context (i.e. a thread doing allocation) has its own
-+ * sliding prefetch window of @s_mb_prefetch size which starts at the
-+ * very first goal and moves ahead of scanning.
-+ * a side effect is that subsequent allocations will likely find
-+ * the bitmaps in cache or at least in-flight.
-+ */
-+static void
-+ext4_mb_prefetch(struct ext4_allocation_context *ac,
-+ ext4_group_t start)
-+{
-+ struct super_block *sb = ac->ac_sb;
-+ ext4_group_t ngroups = ext4_get_groups_count(sb);
-+ struct ext4_sb_info *sbi = EXT4_SB(sb);
-+ struct ext4_group_info *grp;
-+ ext4_group_t group = start;
-+ struct buffer_head *bh;
-+ int nr;
-+
-+ /* limit prefetching at cr=0, otherwise mballoc can
-+ * spend a lot of time loading imperfect groups */
-+ if (ac->ac_criteria < 2 && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit)
-+ return;
-+
-+ /* batch prefetching to get few READs in flight */
-+ nr = ac->ac_prefetch - group;
-+ if (ac->ac_prefetch < group)
-+ /* wrapped to the first groups */
-+ nr += ngroups;
-+ if (nr > 0)
-+ return;
-+ BUG_ON(nr < 0);
-+
-+ nr = sbi->s_mb_prefetch;
-+ if (ext4_has_feature_flex_bg(sb)) {
-+ /* align to flex_bg to get more bitmaps with a single IO */
-+ nr = (group / sbi->s_mb_prefetch) * sbi->s_mb_prefetch;
-+ nr = nr + sbi->s_mb_prefetch - group;
-+ }
-+ while (nr-- > 0) {
-+ grp = ext4_get_group_info(sb, group);
-+ /* prevent expensive getblk() on groups w/ IO in progress */
-+ if (EXT4_MB_GRP_TEST(grp) || EXT4_MB_GRP_TEST_AND_SET_READ(grp))
-+ goto next;
-+
-+ /* ignore empty groups - those will be skipped
-+ * during the scanning as well */
-+ if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
-+ bh = ext4_read_block_bitmap_nowait(sb, group, 1);
-+ if (bh && !IS_ERR(bh)) {
-+ if (!buffer_uptodate(bh))
-+ ac->ac_prefetch_ios++;
-+ brelse(bh);
-+ }
-+ }
-+next:
-+ if (++group >= ngroups)
-+ group = 0;
-+ }
-+ ac->ac_prefetch = group;
-+}
-+
-+static void
-+ext4_mb_prefetch_fini(struct ext4_allocation_context *ac)
-+{
-+ struct ext4_group_info *grp;
-+ ext4_group_t group;
-+ int nr, rc;
-+
-+ /* initialize last window of prefetched groups */
-+ nr = ac->ac_prefetch_ios;
-+ if (nr > EXT4_SB(ac->ac_sb)->s_mb_prefetch)
-+ nr = EXT4_SB(ac->ac_sb)->s_mb_prefetch;
-+ group = ac->ac_prefetch;
-+ while (nr-- > 0) {
-+ grp = ext4_get_group_info(ac->ac_sb, group);
-+ if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
-+ rc = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
-+ if (rc)
-+ break;
-+ }
-+ if (group-- == 0)
-+ group = ext4_get_groups_count(ac->ac_sb) - 1;
-+ }
-+}
-+
- static noinline_for_stack int
- ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
- {
-@@ -2176,6 +2264,7 @@ repeat:
- * from the goal value specified
- */
- group = ac->ac_g_ex.fe_group;
-+ ac->ac_prefetch = group;
-
- for (i = 0; i < ngroups; group++, i++) {
- int ret = 0;
-@@ -2188,6 +2277,8 @@ repeat:
- if (group >= ngroups)
- group = 0;
-
-+ ext4_mb_prefetch(ac, group);
-+
- /* This now checks without needing the buddy page */
- ret = ext4_mb_good_group(ac, group, cr);
- if (ret <= 0) {
-@@ -2260,6 +2351,8 @@ repeat:
- out:
- if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
- err = first_err;
-+ /* use prefetched bitmaps to init buddy so that read info is not lost */
-+ ext4_mb_prefetch_fini(ac);
- return err;
- }
-
-@@ -2832,6 +2925,24 @@ int ext4_mb_init(struct super_block *
- sbi->s_mb_large_req = sbi->s_stripe * 8;
- sbi->s_mb_group_prealloc = sbi->s_stripe * 4;
- }
-+ if (ext4_has_feature_flex_bg(sb)) {
-+ /* a single flex group is supposed to be read by a single IO */
-+ sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
-+ sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
-+ } else {
-+ sbi->s_mb_prefetch = 32;
-+ }
-+ if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
-+ sbi->s_mb_prefetch = ext4_get_groups_count(sb);
-+ /* how many real IOs to prefetch within a single allocation at cr=0
-+ * given cr=0 is a CPU-related optimization we shouldn't try to
-+ * load too many groups, at some point we should start to use what
-+ * we've got in memory.
-+ * with an average random access time 5ms, it'd take a second to get
-+ * 200 groups (* N with flex_bg), so let's make this limit 4 */
-+ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
-+ if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
-+ sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
-
- sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
- if (sbi->s_locality_groups == NULL) {
---- linux-4.18/fs/ext4/mballoc.h 2019-11-28 14:55:26.471545362 +0300
-+++ linux-4.18/fs/ext4/mballoc.h 2019-12-02 11:21:57.028104886 +0300
-@@ -177,6 +177,8 @@ struct ext4_allocation_context {
- struct page *ac_buddy_page;
- struct ext4_prealloc_space *ac_pa;
- struct ext4_locality_group *ac_lg;
-+ ext4_group_t ac_prefetch;
-+ int ac_prefetch_ios; /* number of initialied prefetch IO */
- };
-
- #define AC_STATUS_CONTINUE 1
---- linux-4.18/fs/ext4/sysfs.c 2019-11-28 14:55:26.502545959 +0300
-+++ linux-4.18/fs/ext4/sysfs.c 2019-11-28 20:07:48.104558177 +0300
-@@ -190,6 +190,8 @@ EXT4_RW_ATTR_SBI_UI(msg_ratelimit_bur
- EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
- EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
- EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
-+EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
-+EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
-
- static unsigned int old_bump_val = 128;
- EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
-@@ -223,6 +224,8 @@ static struct attribute *ext4_attrs[]
- ATTR_LIST(errors_count),
- ATTR_LIST(first_error_time),
- ATTR_LIST(last_error_time),
-+ ATTR_LIST(mb_prefetch),
-+ ATTR_LIST(mb_prefetch_limit),
- NULL,
- };
-
+++ /dev/null
-Index: linux-4.18.0-80.1.2.el8_0/fs/ext4/ext4.h
-===================================================================
---- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/ext4.h
-+++ linux-4.18.0-80.1.2.el8_0/fs/ext4/ext4.h
-@@ -1591,6 +1591,8 @@ static inline void ext4_clear_state_flag
- */
- #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
-
-+#define JOURNAL_START_HAS_3ARGS 1
-+
- /*
- * Codes for operating systems
- */
-@@ -1805,7 +1807,21 @@ static inline bool ext4_has_unknown_ext#
-
- EXTN_FEATURE_FUNCS(2)
- EXTN_FEATURE_FUNCS(3)
--EXTN_FEATURE_FUNCS(4)
-+static inline bool ext4_has_unknown_ext4_compat_features(struct super_block *sb)
-+{
-+ return ((EXT4_SB(sb)->s_es->s_feature_compat &
-+ cpu_to_le32(~EXT4_FEATURE_COMPAT_SUPP)) != 0);
-+}
-+static inline bool ext4_has_unknown_ext4_ro_compat_features(struct super_block *sb)
-+{
-+ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat &
-+ cpu_to_le32(~EXT4_FEATURE_RO_COMPAT_SUPP)) != 0);
-+}
-+static inline bool ext4_has_unknown_ext4_incompat_features(struct super_block *sb)
-+{
-+ return ((EXT4_SB(sb)->s_es->s_feature_incompat &
-+ cpu_to_le32(~EXT4_FEATURE_INCOMPAT_SUPP)) != 0);
-+}
-
- static inline bool ext4_has_compat_features(struct super_block *sb)
- {
-@@ -3111,6 +3127,13 @@ struct ext4_extent;
-
- extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
- extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
-+extern struct buffer_head *ext4_read_inode_bitmap(struct super_block *sb,
-+ ext4_group_t block_group);
-+extern void ext4_inc_count(handle_t *handle, struct inode *inode);
-+extern void ext4_dec_count(handle_t *handle, struct inode *inode);
-+extern struct buffer_head *ext4_append(handle_t *handle,
-+ struct inode *inode,
-+ ext4_lblk_t *block);
- extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
- extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
- struct ext4_map_blocks *map, int flags);
-Index: linux-4.18.0-80.1.2.el8_0/fs/ext4/ialloc.c
-===================================================================
---- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/ialloc.c
-+++ linux-4.18.0-80.1.2.el8_0/fs/ext4/ialloc.c
-@@ -114,7 +114,7 @@ verified:
- *
- * Return buffer_head of bitmap on success or NULL.
- */
--static struct buffer_head *
-+struct buffer_head *
- ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
- {
- struct ext4_group_desc *desc;
-@@ -211,6 +211,7 @@ out:
- put_bh(bh);
- return ERR_PTR(err);
- }
-+EXPORT_SYMBOL(ext4_read_inode_bitmap);
-
- /*
- * NOTE! When we get the inode, we're the only people
-Index: linux-4.18.0-80.1.2.el8_0/fs/ext4/inode.c
-===================================================================
---- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/inode.c
-+++ linux-4.18.0-80.1.2.el8_0/fs/ext4/inode.c
-@@ -6267,3 +6267,19 @@ int ext4_filemap_fault(struct vm_fault *
-
- return err;
- }
-+EXPORT_SYMBOL(ext4_map_blocks);
-+EXPORT_SYMBOL(ext4_truncate);
-+EXPORT_SYMBOL(ext4_iget);
-+EXPORT_SYMBOL(ext4_bread);
-+EXPORT_SYMBOL(ext4_itable_unused_count);
-+EXPORT_SYMBOL(ext4_force_commit);
-+EXPORT_SYMBOL(ext4_mark_inode_dirty);
-+EXPORT_SYMBOL(ext4_get_group_desc);
-+EXPORT_SYMBOL(__ext4_journal_get_write_access);
-+EXPORT_SYMBOL(__ext4_journal_start_sb);
-+EXPORT_SYMBOL(__ext4_journal_stop);
-+EXPORT_SYMBOL(__ext4_handle_dirty_metadata);
-+EXPORT_SYMBOL(__ext4_std_error);
-+EXPORT_SYMBOL(ext4fs_dirhash);
-+EXPORT_SYMBOL(ext4_get_inode_loc);
-+EXPORT_SYMBOL(ext4_chunk_trans_blocks);
-Index: linux-4.18.0-80.1.2.el8_0/fs/ext4/namei.c
-===================================================================
---- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/namei.c
-+++ linux-4.18.0-80.1.2.el8_0/fs/ext4/namei.c
-@@ -49,7 +49,7 @@
- #define NAMEI_RA_BLOCKS 4
- #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-
--static struct buffer_head *ext4_append(handle_t *handle,
-+struct buffer_head *ext4_append(handle_t *handle,
- struct inode *inode,
- ext4_lblk_t *block)
- {
-@@ -160,6 +160,7 @@ static struct buffer_head *__ext4_read_d
- }
- return bh;
- }
-+EXPORT_SYMBOL(ext4_append);
-
- #ifndef assert
- #define assert(test) J_ASSERT(test)
-@@ -2415,23 +2416,25 @@ EXPORT_SYMBOL(ext4_delete_entry);
- * for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set
- * on regular files) and to avoid creating huge/slow non-HTREE directories.
- */
--static void ext4_inc_count(handle_t *handle, struct inode *inode)
-+void ext4_inc_count(handle_t *handle, struct inode *inode)
- {
- inc_nlink(inode);
- if (is_dx(inode) &&
- (inode->i_nlink > EXT4_LINK_MAX || inode->i_nlink == 2))
- set_nlink(inode, 1);
- }
-+EXPORT_SYMBOL(ext4_inc_count);
-
- /*
- * If a directory had nlink == 1, then we should let it be 1. This indicates
- * directory has >EXT4_LINK_MAX subdirs.
- */
--static void ext4_dec_count(handle_t *handle, struct inode *inode)
-+void ext4_dec_count(handle_t *handle, struct inode *inode)
- {
- if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
- drop_nlink(inode);
- }
-+EXPORT_SYMBOL(ext4_dec_count);
-
-
- static int ext4_add_nondir(handle_t *handle,
-Index: linux-4.18.0-80.1.2.el8_0/fs/ext4/super.c
-===================================================================
---- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/super.c
-+++ linux-4.18.0-80.1.2.el8_0/fs/ext4/super.c
-@@ -323,11 +323,11 @@ static void __save_error_info(struct sup
- return;
- es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
- es->s_last_error_time = cpu_to_le32(get_seconds());
-- strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
-+ strlcpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
- es->s_last_error_line = cpu_to_le32(line);
- if (!es->s_first_error_time) {
- es->s_first_error_time = es->s_last_error_time;
-- strncpy(es->s_first_error_func, func,
-+ strlcpy(es->s_first_error_func, func,
- sizeof(es->s_first_error_func));
- es->s_first_error_line = cpu_to_le32(line);
- es->s_first_error_ino = es->s_last_error_ino;
-@@ -5957,16 +5957,12 @@ static int __init ext4_init_fs(void)
- err = init_inodecache();
- if (err)
- goto out1;
-- register_as_ext3();
-- register_as_ext2();
- err = register_filesystem(&ext4_fs_type);
- if (err)
- goto out;
-
- return 0;
- out:
-- unregister_as_ext2();
-- unregister_as_ext3();
- destroy_inodecache();
- out1:
- ext4_exit_mballoc();
-@@ -5985,8 +5981,6 @@ out5:
- static void __exit ext4_exit_fs(void)
- {
- ext4_destroy_lazyinit_thread();
-- unregister_as_ext2();
-- unregister_as_ext3();
- unregister_filesystem(&ext4_fs_type);
- destroy_inodecache();
- ext4_exit_mballoc();
+++ /dev/null
-rhel8.1/ext4-inode-version.patch
-suse15/ext4-lookup-dotdot.patch
-suse15/ext4-print-inum-in-htree-warning.patch
-rhel8/ext4-prealloc.patch
-ubuntu18/ext4-osd-iop-common.patch
-rhel8.1/ext4-misc.patch
-rhel8/ext4-mballoc-extra-checks.patch
-ubuntu18/ext4-hash-indexed-dir-dotdot-update.patch
-rhel8.1/ext4-kill-dx-root.patch
-rhel7.6/ext4-mballoc-pa-free-mismatch.patch
-ubuntu18/ext4-data-in-dirent.patch
-rhel8/ext4-nocmtime.patch
-base/ext4-htree-lock.patch
-rhel8/ext4-pdirop.patch
-rhel8/ext4-max-dir-size.patch
-rhel8/ext4-corrupted-inode-block-bitmaps-handling-patches.patch
-ubuntu18/ext4-give-warning-with-dir-htree-growing.patch
-ubuntu18/ext4-jcb-optimization.patch
-ubuntu18/ext4-attach-jinode-in-writepages.patch
-rhel8/ext4-dont-check-before-replay.patch
-rhel7.6/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch
-rhel7.6/ext4-export-orphan-add.patch
-rhel8/ext4-export-mb-stream-allocator-variables.patch
-rhel8/ext4-simple-blockalloc.patch
-rhel8/ext4-mballoc-skip-uninit-groups-cr0.patch
-rhel8.1/ext4-mballoc-prefetch.patch
-rhel8/ext4-xattr-disable-credits-check.patch
-base/ext4-no-max-dir-size-limit-for-iam-objects.patch
-rhel8/ext4-ialloc-uid-gid-and-pass-owner-down.patch
-base/ext4-projid-xattrs.patch
-rhel8/ext4-enc-flag.patch
-base/ext4-delayed-iput.patch
-rhel8/ext4-add-periodic-superblock-update.patch
-rhel8/ext4-filename-encode.patch
-rhel8/ext4-encdata.patch
+++ /dev/null
-rhel8.1/ext4-inode-version.patch
-suse15/ext4-lookup-dotdot.patch
-suse15/ext4-print-inum-in-htree-warning.patch
-rhel8/ext4-prealloc.patch
-ubuntu18/ext4-osd-iop-common.patch
-rhel8.1/ext4-misc.patch
-rhel8/ext4-mballoc-extra-checks.patch
-ubuntu18/ext4-hash-indexed-dir-dotdot-update.patch
-rhel8.1/ext4-kill-dx-root.patch
-rhel7.6/ext4-mballoc-pa-free-mismatch.patch
-ubuntu18/ext4-data-in-dirent.patch
-rhel8/ext4-nocmtime.patch
-base/ext4-htree-lock.patch
-rhel8/ext4-pdirop.patch
-rhel8/ext4-max-dir-size.patch
-rhel8/ext4-corrupted-inode-block-bitmaps-handling-patches.patch
-ubuntu18/ext4-give-warning-with-dir-htree-growing.patch
-ubuntu18/ext4-jcb-optimization.patch
-rhel8.2/ext4-attach-jinode-in-writepages.patch
-rhel8/ext4-dont-check-before-replay.patch
-rhel7.6/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch
-rhel7.6/ext4-export-orphan-add.patch
-rhel8/ext4-export-mb-stream-allocator-variables.patch
-rhel8/ext4-simple-blockalloc.patch
-rhel8/ext4-mballoc-skip-uninit-groups-cr0.patch
-rhel8.1/ext4-mballoc-prefetch.patch
-rhel8/ext4-xattr-disable-credits-check.patch
-base/ext4-no-max-dir-size-limit-for-iam-objects.patch
-rhel8/ext4-ialloc-uid-gid-and-pass-owner-down.patch
-base/ext4-projid-xattrs.patch
-rhel8/ext4-enc-flag.patch
-base/ext4-delayed-iput.patch
-rhel8/ext4-add-periodic-superblock-update.patch
-rhel8/ext4-filename-encode.patch
-rhel8/ext4-encdata.patch
-rhel8.5/ext4-limit-per-inode-preallocation-list.patch
+++ /dev/null
-rhel8.1/ext4-inode-version.patch
-suse15/ext4-lookup-dotdot.patch
-suse15/ext4-print-inum-in-htree-warning.patch
-rhel8/ext4-prealloc.patch
-ubuntu18/ext4-osd-iop-common.patch
-rhel8.3/ext4-misc.patch
-rhel8.3/ext4-mballoc-extra-checks.patch
-ubuntu18/ext4-hash-indexed-dir-dotdot-update.patch
-rhel8.1/ext4-kill-dx-root.patch
-rhel7.6/ext4-mballoc-pa-free-mismatch.patch
-linux-5.4/ext4-data-in-dirent.patch
-rhel8/ext4-nocmtime.patch
-base/ext4-htree-lock.patch
-rhel8.3/ext4-pdirop.patch
-rhel8/ext4-max-dir-size.patch
-rhel8.3/ext4-corrupted-inode-block-bitmaps-handling-patches.patch
-ubuntu18/ext4-give-warning-with-dir-htree-growing.patch
-ubuntu18/ext4-jcb-optimization.patch
-rhel8.2/ext4-attach-jinode-in-writepages.patch
-rhel8/ext4-dont-check-before-replay.patch
-rhel7.6/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch
-rhel7.6/ext4-export-orphan-add.patch
-rhel8/ext4-export-mb-stream-allocator-variables.patch
-rhel8/ext4-simple-blockalloc.patch
-rhel8/ext4-mballoc-skip-uninit-groups-cr0.patch
-rhel8.1/ext4-mballoc-prefetch.patch
-rhel8.3/ext4-xattr-disable-credits-check.patch
-base/ext4-no-max-dir-size-limit-for-iam-objects.patch
-rhel8/ext4-ialloc-uid-gid-and-pass-owner-down.patch
-base/ext4-projid-xattrs.patch
-rhel8/ext4-enc-flag.patch
-base/ext4-delayed-iput.patch
-rhel8/ext4-add-periodic-superblock-update.patch
-rhel8/ext4-filename-encode.patch
-rhel8/ext4-encdata.patch
-rhel8.5/ext4-limit-per-inode-preallocation-list.patch
+++ /dev/null
-rhel8/ext4-inode-version.patch
-suse15/ext4-lookup-dotdot.patch
-suse15/ext4-print-inum-in-htree-warning.patch
-rhel8/ext4-prealloc.patch
-ubuntu18/ext4-osd-iop-common.patch
-rhel8/ext4-misc.patch
-rhel8/ext4-mballoc-extra-checks.patch
-ubuntu18/ext4-hash-indexed-dir-dotdot-update.patch
-ubuntu18/ext4-kill-dx-root.patch
-rhel7.6/ext4-mballoc-pa-free-mismatch.patch
-ubuntu18/ext4-data-in-dirent.patch
-rhel8/ext4-nocmtime.patch
-base/ext4-htree-lock.patch
-rhel8/ext4-pdirop.patch
-rhel8/ext4-max-dir-size.patch
-rhel8/ext4-corrupted-inode-block-bitmaps-handling-patches.patch
-ubuntu18/ext4-give-warning-with-dir-htree-growing.patch
-ubuntu18/ext4-jcb-optimization.patch
-ubuntu18/ext4-attach-jinode-in-writepages.patch
-rhel8/ext4-dont-check-before-replay.patch
-rhel7.6/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch
-rhel7.6/ext4-export-orphan-add.patch
-ubuntu18/ext4-include-terminating-u32-in-size-of-xattr-entries-when-expanding-inodes.patch
-rhel8/ext4-export-mb-stream-allocator-variables.patch
-rhel8/ext4-simple-blockalloc.patch
-rhel8/ext4-mballoc-skip-uninit-groups-cr0.patch
-rhel8/ext4-mballoc-prefetch.patch
-rhel8/ext4-xattr-disable-credits-check.patch
-base/ext4-no-max-dir-size-limit-for-iam-objects.patch
-base/ext4-reset-exts-for-gcc10.patch
-rhel8/ext4-ialloc-uid-gid-and-pass-owner-down.patch
-ubuntu18/ext4-projid-xattrs.patch
-rhel8/ext4-enc-flag.patch
-base/ext4-delayed-iput.patch
-rhel8/ext4-add-periodic-superblock-update.patch
-rhel8/ext4-filename-encode.patch
-rhel8/ext4-encdata.patch