AS_IF([test -z "$LDISKFS_SERIES"], [
AS_IF([test x$RHEL_KERNEL = xyes], [
case $RHEL_RELEASE_NO in
+ 74) LDISKFS_SERIES="3.10-rhel7.4.series" ;;
73) LDISKFS_SERIES="3.10-rhel7.3.series" ;;
72) LDISKFS_SERIES="3.10-rhel7.2.series" ;;
71) LDISKFS_SERIES="3.10-rhel7.series" ;;
3.12-sles12 | 4.4-sles12)
CANONICAL_TARGET="sles12"
;;
- 3.10-rhel7)
+ 3.10-rhel7*)
CANONICAL_TARGET="rhel7"
;;
2.6-rhel6*)
--- /dev/null
+Index: linux-stage/fs/ext4/inode.c
+===================================================================
+--- linux-stage.orig/fs/ext4/inode.c
++++ linux-stage/fs/ext4/inode.c
+@@ -745,6 +745,9 @@ out_sem:
+ !(flags & EXT4_GET_BLOCKS_ZERO) &&
+ !IS_NOQUOTA(inode) &&
+ ext4_should_order_data(inode)) {
++ ret = ext4_inode_attach_jinode(inode);
++ if (ret)
++ return ret;
+ ret = ext4_jbd2_file_inode(handle, inode);
+ if (ret)
+ return ret;
+@@ -2503,6 +2506,9 @@ static int ext4_writepages(struct addres
+ mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
+ }
+
++ ret = ext4_inode_attach_jinode(inode);
++ if (ret)
++ goto out_writepages;
+ mpd.inode = inode;
+ mpd.wbc = wbc;
+ ext4_io_submit_init(&mpd.io_submit, wbc);
+@@ -3837,6 +3843,7 @@ int ext4_inode_attach_jinode(struct inod
+ jbd2_free_inode(jinode);
+ return 0;
+ }
++EXPORT_SYMBOL(ext4_inode_attach_jinode);
+
+ /*
+ * ext4_truncate()
+Index: linux-stage/fs/ext4/ext4.h
+===================================================================
+--- linux-stage.orig/fs/ext4/ext4.h
++++ linux-stage/fs/ext4/ext4.h
+@@ -2379,6 +2379,7 @@ extern int ext4_group_add_blocks(handle_
+ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
+
+ /* inode.c */
++#define HAVE_LDISKFS_INFO_JINODE
+ struct buffer_head *ext4_getblk(handle_t *, struct inode *,
+ ext4_lblk_t, int, int *);
+ struct buffer_head *ext4_bread(handle_t *, struct inode *,
--- /dev/null
+When ldiskfs runs in failover mode with a read-only disk, part of
+the allocation updates can be lost and ldiskfs may then fail to
+mount because the group descriptors are left in an inconsistent
+state. Move the group-descriptor check to after journal replay, so
+the descriptors are only validated once the replayed updates have
+been applied.
+
+Index: linux-stage/fs/ext4/super.c
+===================================================================
+--- linux-stage.orig/fs/ext4/super.c
++++ linux-stage/fs/ext4/super.c
+@@ -3980,10 +3980,6 @@
+ goto failed_mount2;
+ }
+ }
+- if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
+- ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
+- goto failed_mount2;
+- }
+
+ sbi->s_gdb_count = db_count;
+ get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+@@ -4104,6 +4100,12 @@
+ sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
+
+ no_journal:
++
++ if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
++ ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
++ goto failed_mount_wq;
++ }
++
+ /*
+ * Get the # of file system overhead blocks from the
+ * superblock if present.
--- /dev/null
+From e3014d14a81edde488d9a6758eea8afc41752d2d Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Mon, 29 Aug 2016 15:38:11 -0400
+Subject: [PATCH] ext4: fixup free space calculations when expanding inodes
+
+Conditions checking whether there is enough free space in an xattr block
+and when xattr is large enough to make enough space in the inode forgot
+to account for the fact that inode need not be completely filled up with
+xattrs. Thus we could move unnecessarily many xattrs out of inode or
+even falsely claim there is not enough space to expand the inode. We
+also forgot to update the amount of free space in xattr block when moving
+more xattrs and thus could decide to move too big xattr resulting in
+unexpected failure.
+
+Fix these problems by properly updating free space in the inode and
+xattr block as we move xattrs. To simplify the math, avoid shifting
+xattrs after removing each one xattr and instead just shift xattrs only
+once there is enough free space in the inode.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/xattr.c | 58 ++++++++++++++++++++++++---------------------------------
+ 1 file changed, 24 insertions(+), 34 deletions(-)
+
+diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
+index 2eb935c..22d2ebc 100644
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -1350,7 +1350,8 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+ struct ext4_xattr_ibody_find *is = NULL;
+ struct ext4_xattr_block_find *bs = NULL;
+ char *buffer = NULL, *b_entry_name = NULL;
+- size_t min_offs, free;
++ size_t min_offs;
++ size_t ifree, bfree;
+ int total_ino;
+ void *base, *start, *end;
+ int error = 0, tried_min_extra_isize = 0;
+@@ -1385,17 +1386,9 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+ if (error)
+ goto cleanup;
+
+- free = ext4_xattr_free_space(last, &min_offs, base, &total_ino);
+- if (free >= isize_diff) {
+- entry = IFIRST(header);
+- ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize
+- - new_extra_isize, (void *)raw_inode +
+- EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize,
+- (void *)header, total_ino,
+- inode->i_sb->s_blocksize);
+- EXT4_I(inode)->i_extra_isize = new_extra_isize;
+- goto out;
+- }
++ ifree = ext4_xattr_free_space(last, &min_offs, base, &total_ino);
++ if (ifree >= isize_diff)
++ goto shift;
+
+ /*
+ * Enough free space isn't available in the inode, check if
+@@ -1416,8 +1409,8 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+ first = BFIRST(bh);
+ end = bh->b_data + bh->b_size;
+ min_offs = end - base;
+- free = ext4_xattr_free_space(first, &min_offs, base, NULL);
+- if (free < isize_diff) {
++ bfree = ext4_xattr_free_space(first, &min_offs, base, NULL);
++ if (bfree + ifree < isize_diff) {
+ if (!tried_min_extra_isize && s_min_extra_isize) {
+ tried_min_extra_isize++;
+ new_extra_isize = s_min_extra_isize;
+@@ -1428,10 +1421,10 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+ goto cleanup;
+ }
+ } else {
+- free = inode->i_sb->s_blocksize;
++ bfree = inode->i_sb->s_blocksize;
+ }
+
+- while (isize_diff > 0) {
++ while (isize_diff > ifree) {
+ size_t offs, size, entry_size;
+ struct ext4_xattr_entry *small_entry = NULL;
+ struct ext4_xattr_info i = {
+@@ -1439,7 +1432,6 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+ .value_len = 0,
+ };
+ unsigned int total_size; /* EA entry size + value size */
+- unsigned int shift_bytes; /* No. of bytes to shift EAs by? */
+ unsigned int min_total_size = ~0U;
+
+ is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
+@@ -1461,8 +1453,9 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+ total_size =
+ EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) +
+ EXT4_XATTR_LEN(last->e_name_len);
+- if (total_size <= free && total_size < min_total_size) {
+- if (total_size < isize_diff) {
++ if (total_size <= bfree &&
++ total_size < min_total_size) {
++ if (total_size + ifree < isize_diff) {
+ small_entry = last;
+ } else {
+ entry = last;
+@@ -1491,6 +1484,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+ offs = le16_to_cpu(entry->e_value_offs);
+ size = le32_to_cpu(entry->e_value_size);
+ entry_size = EXT4_XATTR_LEN(entry->e_name_len);
++ total_size = entry_size + EXT4_XATTR_SIZE(size);
+ i.name_index = entry->e_name_index,
+ buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS);
+ b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS);
+@@ -1518,21 +1512,8 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+ if (error)
+ goto cleanup;
+ total_ino -= entry_size;
+-
+- entry = IFIRST(header);
+- if (entry_size + EXT4_XATTR_SIZE(size) >= isize_diff)
+- shift_bytes = isize_diff;
+- else
+- shift_bytes = entry_size + EXT4_XATTR_SIZE(size);
+- /* Adjust the offsets and shift the remaining entries ahead */
+- ext4_xattr_shift_entries(entry, -shift_bytes,
+- (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE +
+- EXT4_I(inode)->i_extra_isize + shift_bytes,
+- (void *)header, total_ino, inode->i_sb->s_blocksize);
+-
+- isize_diff -= shift_bytes;
+- EXT4_I(inode)->i_extra_isize += shift_bytes;
+- header = IHDR(inode, raw_inode);
++ ifree += total_size;
++ bfree -= total_size;
+
+ i.name = b_entry_name;
+ i.value = buffer;
+@@ -1553,6 +1534,15 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+ kfree(is);
+ kfree(bs);
+ }
++
++shift:
++ /* Adjust the offsets and shift the remaining entries ahead */
++ entry = IFIRST(header);
++ ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize
++ - new_extra_isize, (void *)raw_inode +
++ EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize,
++ (void *)header, total_ino, inode->i_sb->s_blocksize);
++ EXT4_I(inode)->i_extra_isize = new_extra_isize;
+ brelse(bh);
+ out:
+ ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
+--
+2.9.3
+
+From 94405713889d4a9d341b4ad92956e4e2ec8ec2c2 Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Mon, 29 Aug 2016 15:41:11 -0400
+Subject: [PATCH] ext4: replace bogus assertion in ext4_xattr_shift_entries()
+
+We were checking whether computed offsets do not exceed end of block in
+ext4_xattr_shift_entries(). However this does not make sense since we
+always only decrease offsets. So replace that assertion with a check
+whether we really decrease xattrs value offsets.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/xattr.c | 9 +++++----
+ 1 file changed, 5 insertions(+), 4 deletions(-)
+
+diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
+index 1447860..82b025c 100644
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -1319,18 +1319,19 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
+ */
+ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
+ int value_offs_shift, void *to,
+- void *from, size_t n, int blocksize)
++ void *from, size_t n)
+ {
+ struct ext4_xattr_entry *last = entry;
+ int new_offs;
+
++ /* We always shift xattr headers further thus offsets get lower */
++ BUG_ON(value_offs_shift > 0);
++
+ /* Adjust the value offsets of the entries */
+ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+ if (!last->e_value_inum && last->e_value_size) {
+ new_offs = le16_to_cpu(last->e_value_offs) +
+ value_offs_shift;
+- BUG_ON(new_offs + le32_to_cpu(last->e_value_size)
+- > blocksize);
+ last->e_value_offs = cpu_to_le16(new_offs);
+ }
+ }
+@@ -1542,7 +1543,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+ ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize
+ - new_extra_isize, (void *)raw_inode +
+ EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize,
+- (void *)header, total_ino, inode->i_sb->s_blocksize);
++ (void *)header, total_ino);
+ EXT4_I(inode)->i_extra_isize = new_extra_isize;
+ brelse(bh);
+ out:
+--
+2.9.3
+
+From 887a9730614727c4fff7cb756711b190593fc1df Mon Sep 17 00:00:00 2001
+From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Date: Sun, 21 May 2017 22:36:23 -0400
+Subject: [PATCH] ext4: keep existing extra fields when inode expands
+
+ext4_expand_extra_isize() should clear only space between old and new
+size.
+
+Fixes: 6dd4ee7cab7e # v2.6.23
+Cc: stable@vger.kernel.org
+Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/inode.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 1bd0bfa..7cd99de 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -5637,8 +5637,9 @@ static int ext4_expand_extra_isize(struct inode *inode,
+ /* No extended attributes present */
+ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
+ header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+- memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
+- new_extra_isize);
++ memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE +
++ EXT4_I(inode)->i_extra_isize, 0,
++ new_extra_isize - EXT4_I(inode)->i_extra_isize);
+ EXT4_I(inode)->i_extra_isize = new_extra_isize;
+ return 0;
+ }
+--
+2.9.3
+
--- /dev/null
+This INCOMPAT_LARGEDIR feature allows larger directories
+to be created in ldiskfs, both with directory sizes over
+2GB and with a maximum htree depth of 3 instead of the
+current limit of 2. These features are needed in order
+to exceed the current limit of approximately 10M entries
+in a single directory.
+
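+As a rough, hedged illustration of where these limits come from
+(assuming 4KB blocks; exact numbers depend on filename length):
+
+  dx entries per interior index block  ~ 4096 / 8    ~ 500
+  leaf blocks addressable at depth 2   ~ 500 * 500   ~ 250,000
+  entries per 4KB leaf block           ~ a few dozen for long names
+  total at depth 2                     ~ the ~10M quoted above
+
+A third htree level multiplies the addressable leaf blocks by
+another factor of ~500, moving the practical limit into the
+billions of entries.
+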
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h
+===================================================================
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/ext4.h
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h
+@@ -1585,7 +1585,8 @@ static inline void ext4_clear_state_flag
+ EXT4_FEATURE_INCOMPAT_EA_INODE| \
+ EXT4_FEATURE_INCOMPAT_MMP | \
+ EXT4_FEATURE_INCOMPAT_DIRDATA| \
+- EXT4_FEATURE_INCOMPAT_INLINE_DATA)
++ EXT4_FEATURE_INCOMPAT_INLINE_DATA| \
++ EXT4_FEATURE_INCOMPAT_LARGEDIR)
+ #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
+@@ -1999,6 +2000,9 @@ struct mmpd_data {
+ # define NORET_TYPE /**/
+ # define ATTRIB_NORET __attribute__((noreturn))
+ # define NORET_AND noreturn,
++/* htree levels for ext4 */
++#define EXT4_HTREE_LEVEL_COMPAT 2
++#define EXT4_HTREE_LEVEL 3
+
+ struct ext4_xattr_ino_array {
+ unsigned int xia_count; /* # of used item in the array */
+@@ -2472,13 +2476,16 @@ static inline void ext4_r_blocks_count_s
+ es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
+ }
+
+-static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
++static inline loff_t ext4_isize(struct super_block *sb,
++ struct ext4_inode *raw_inode)
+ {
+- if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
++ if (S_ISREG(le16_to_cpu(raw_inode->i_mode)) ||
++ (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) &&
++ S_ISDIR(le16_to_cpu(raw_inode->i_mode))))
+ return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
+ le32_to_cpu(raw_inode->i_size_lo);
+- else
+- return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
++
++ return (loff_t)le32_to_cpu(raw_inode->i_size_lo);
+ }
+
+ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c
+===================================================================
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/namei.c
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c
+@@ -513,7 +513,14 @@ struct dx_root_info * dx_get_dx_info(str
+
+ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
+ {
+- return le32_to_cpu(entry->block) & 0x00ffffff;
++ return le32_to_cpu(entry->block) & 0x0fffffff;
++}
++
++static inline int
++ext4_dir_htree_level(struct super_block *sb)
++{
++ return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
++ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
+ }
+
+ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
+@@ -681,7 +688,7 @@ dx_probe(const struct qstr *d_name, stru
+ struct dx_frame *frame = frame_in;
+ u32 hash;
+
+- frame->bh = NULL;
++ memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
+ bh = ext4_read_dirblock(dir, 0, INDEX);
+ if (IS_ERR(bh)) {
+ *err = PTR_ERR(bh);
+@@ -714,10 +721,15 @@ dx_probe(const struct qstr *d_name, stru
+ }
+
+ indirect = info->indirect_levels;
+- if (indirect > 1) {
+- ext4_warning(dir->i_sb,
+- "inode #%lu: unimplemented hash depth %u",
+- dir->i_ino, info->indirect_levels);
++ if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
++ ext4_warning(dir->i_sb,
++ "inode #%lu: comm %s: htree depth %#06x exceed max depth %u",
++ dir->i_ino, current->comm, indirect,
++ ext4_dir_htree_level(dir->i_sb));
++ if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
++ ext4_warning(dir->i_sb, "Enable large directory "
++ "feature to access it");
++ }
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
+@@ -812,13 +826,18 @@ fail:
+ static void dx_release (struct dx_frame *frames)
+ {
+ struct dx_root_info *info;
++ int i;
++
+ if (frames[0].bh == NULL)
+ return;
+
+ info = dx_get_dx_info((struct ext4_dir_entry_2 *)frames[0].bh->b_data);
+- if (info->indirect_levels)
+- brelse(frames[1].bh);
+- brelse(frames[0].bh);
++ for (i = 0; i <= info->indirect_levels; i++) {
++ if (frames[i].bh == NULL)
++ break;
++ brelse(frames[i].bh);
++ frames[i].bh = NULL;
++ }
+ }
+
+ /*
+@@ -960,7 +979,7 @@ int ext4_htree_fill_tree(struct file *di
+ {
+ struct dx_hash_info hinfo;
+ struct ext4_dir_entry_2 *de;
+- struct dx_frame frames[2], *frame;
++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+ struct inode *dir;
+ ext4_lblk_t block;
+ int count = 0;
+@@ -1376,7 +1395,7 @@ static struct buffer_head * ext4_dx_find
+ {
+ struct super_block * sb = dir->i_sb;
+ struct dx_hash_info hinfo;
+- struct dx_frame frames[2], *frame;
++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+ struct buffer_head *bh;
+ ext4_lblk_t block;
+ int retval;
+@@ -1832,7 +1851,7 @@ static int make_indexed_dir(handle_t *ha
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ struct buffer_head *bh2;
+- struct dx_frame frames[2], *frame;
++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+ struct dx_entry *entries;
+ struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
+ struct ext4_dir_entry_tail *t;
+@@ -2117,15 +2136,18 @@ static int ext4_add_entry(handle_t *hand
+ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+ {
+- struct dx_frame frames[2], *frame;
++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+ struct dx_entry *entries, *at;
+ struct dx_hash_info hinfo;
+ struct buffer_head *bh;
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct super_block *sb = dir->i_sb;
+ struct ext4_dir_entry_2 *de;
++ int restart;
+ int err;
+
++again:
++ restart = 0;
+ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
+ if (!frame)
+ return err;
+@@ -2138,33 +2160,48 @@ static int ext4_dx_add_entry(handle_t *h
+ goto cleanup;
+ }
+
+- BUFFER_TRACE(bh, "get_write_access");
+- err = ext4_journal_get_write_access(handle, bh);
+- if (err)
+- goto journal_error;
+-
+ err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+ if (err != -ENOSPC)
+ goto cleanup;
+
++ err = 0;
+ /* Block full, should compress but for now just split */
+ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
+ dx_get_count(entries), dx_get_limit(entries)));
+ /* Need to split index? */
+ if (dx_get_count(entries) == dx_get_limit(entries)) {
+ ext4_lblk_t newblock;
+- unsigned icount = dx_get_count(entries);
+- int levels = frame - frames;
++ int levels = frame - frames + 1;
++ unsigned icount;
++ int add_level = 1;
+ struct dx_entry *entries2;
+ struct dx_node *node2;
+ struct buffer_head *bh2;
+
+- if (levels && (dx_get_count(frames->entries) ==
+- dx_get_limit(frames->entries))) {
+- ext4_warning(sb, "Directory index full!");
++ while (frame > frames) {
++ if (dx_get_count((frame - 1)->entries) <
++ dx_get_limit((frame - 1)->entries)) {
++ add_level = 0;
++ break;
++ }
++ frame--; /* split higher index block */
++ at = frame->at;
++ entries = frame->entries;
++ restart = 1;
++ }
++ if (add_level && levels == ext4_dir_htree_level(sb)) {
++ ext4_warning(sb, "inode %lu: comm %s: index %u: reach max htree level %u",
++ dir->i_ino, current->comm, levels,
++ ext4_dir_htree_level(sb));
++ if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
++ ext4_warning(sb, "Large directory feature is"
++ "not enabled on this "
++ "filesystem");
++ }
+ err = -ENOSPC;
+ goto cleanup;
+ }
++ icount = dx_get_count(entries);
+ bh2 = ext4_append(handle, dir, &newblock);
+ if (IS_ERR(bh2)) {
+ err = PTR_ERR(bh2);
+@@ -2179,7 +2216,7 @@ static int ext4_dx_add_entry(handle_t *h
+ err = ext4_journal_get_write_access(handle, frame->bh);
+ if (err)
+ goto journal_error;
+- if (levels) {
++ if (!add_level) {
+ unsigned icount1 = icount/2, icount2 = icount - icount1;
+ unsigned hash2 = dx_get_hash(entries + icount1);
+ dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
+@@ -2187,7 +2224,7 @@ static int ext4_dx_add_entry(handle_t *h
+
+ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
+ err = ext4_journal_get_write_access(handle,
+- frames[0].bh);
++ (frame - 1)->bh);
+ if (err)
+ goto journal_error;
+
+@@ -2203,19 +2240,25 @@ static int ext4_dx_add_entry(handle_t *h
+ frame->entries = entries = entries2;
+ swap(frame->bh, bh2);
+ }
+- dx_insert_block(frames + 0, hash2, newblock);
+- dxtrace(dx_show_index("node", frames[1].entries));
++ dx_insert_block(frame - 1, hash2, newblock);
++ dxtrace(dx_show_index("node", frame->entries));
+ dxtrace(dx_show_index("node",
+- ((struct dx_node *) bh2->b_data)->entries));
++ ((struct dx_node *)bh2->b_data)->entries));
+ err = ext4_handle_dirty_dx_node(handle, dir, bh2);
+ if (err)
+ goto journal_error;
+ brelse (bh2);
++ ext4_handle_dirty_dirent_node(handle, dir,
++ (frame - 1)->bh);
++ if (restart) {
++ ext4_handle_dirty_dirent_node(handle, dir,
++ frame->bh);
++ goto cleanup;
++ }
+ } else {
+ struct dx_root_info *info;
+- dxtrace(printk(KERN_DEBUG
+- "Creating second level index...\n"));
+- memcpy((char *) entries2, (char *) entries,
++
++ memcpy((char *)entries2, (char *)entries,
+ icount * sizeof(struct dx_entry));
+ dx_set_limit(entries2, dx_node_limit(dir));
+
+@@ -2224,21 +2267,14 @@ static int ext4_dx_add_entry(handle_t *h
+ dx_set_block(entries + 0, newblock);
+ info = dx_get_dx_info((struct ext4_dir_entry_2*)
+ frames[0].bh->b_data);
+- info->indirect_levels = 1;
+-
+- /* Add new access path frame */
+- frame = frames + 1;
+- frame->at = at = at - entries + entries2;
+- frame->entries = entries = entries2;
+- frame->bh = bh2;
+- err = ext4_journal_get_write_access(handle,
+- frame->bh);
+- if (err)
+- goto journal_error;
+- }
+- err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
+- if (err) {
+- ext4_std_error(inode->i_sb, err);
++ info->indirect_levels += 1;
++ dxtrace(printk(KERN_DEBUG
++ "Creating %d level index...\n",
++ info->indirect_levels));
++ ext4_handle_dirty_dirent_node(handle, dir, frame->bh);
++ ext4_handle_dirty_dirent_node(handle, dir, bh2);
++ brelse(bh2);
++ restart = 1;
+ goto cleanup;
+ }
+ }
+@@ -2253,6 +2289,10 @@ journal_error:
+ cleanup:
+ brelse(bh);
+ dx_release(frames);
++ /* @restart is true means htree-path has been changed, we need to
++ * repeat dx_probe() to find out valid htree-path */
++ if (restart && err == 0)
++ goto again;
+ return err;
+ }
+
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/inode.c
+===================================================================
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/inode.c
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/inode.c
+@@ -4056,12 +4056,12 @@ struct inode *ext4_iget(struct super_blo
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
+ ei->i_file_acl |=
+ ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
+- inode->i_size = ext4_isize(raw_inode);
++ inode->i_size = ext4_isize(sb, raw_inode);
+ if ((size = i_size_read(inode)) < 0) {
+ EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
+ ei->i_disksize = inode->i_size;
+ #ifdef CONFIG_QUOTA
+ ei->i_reserved_quota = 0;
+@@ -4306,7 +4306,7 @@ static int ext4_do_update_inode(handle_t
+ raw_inode->i_file_acl_high =
+ cpu_to_le16(ei->i_file_acl >> 32);
+ raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
+- if (ei->i_disksize != ext4_isize(raw_inode)) {
++ if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
+ ext4_isize_set(raw_inode, ei->i_disksize);
+ need_datasync = 1;
+ }
--- /dev/null
+Single directory performance is critical for HPC workloads. In a
+typical use case an application creates a separate output file for
+each node and task in a job. As nodes and tasks increase, hundreds
+of thousands of files may be created in a single directory within
+a short window of time.
+Today, both filename lookups and file system modifying operations
+(such as create and unlink) are protected by a single lock for
+an entire ldiskfs directory. The PDO project removes this
+bottleneck by introducing a parallel locking mechanism for
+ldiskfs directories, enabling multiple application threads to
+look up, create and unlink entries in parallel (see the usage
+sketch below).
+
+This patch contains:
+ - pdirops support for ldiskfs
+ - integration with osd-ldiskfs
+
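+For illustration only (not part of the kernel patch below), here is a
+minimal, hedged sketch of how a caller is expected to use the
+htree_lock API introduced by this patch. The dir_lhead variable and
+the pdo_lookup_sketch() wrapper are hypothetical; the htree_lock_*
+calls and the EXT4_LK_*/HTREE_LOCK_* constants are the ones added
+below.
+
+	/* assumes <linux/htree_lock.h> and the ext4 pdo definitions below */
+
+	/* one lock head per directory (resource), allocated once, e.g.
+	 * dir_lhead = htree_lock_head_alloc(EXT4_LK_MAX, HTREE_HBITS_DEF, 0);
+	 */
+	static struct htree_lock_head *dir_lhead;
+
+	static int pdo_lookup_sketch(u32 name_hash)
+	{
+		/* per-thread lock handle */
+		struct htree_lock *lck = htree_lock_alloc(EXT4_LK_MAX, 0);
+
+		if (lck == NULL)
+			return -ENOMEM;
+
+		/* shared (CR) lock on the directory as a whole ... */
+		htree_lock(lck, dir_lhead, HTREE_LOCK_CR);
+		/* ... then a PR child-lock keyed by the name hash of the
+		 * directory-entry block being read */
+		htree_node_lock(lck, HTREE_LOCK_PR, name_hash, EXT4_LK_DE);
+
+		/* ... read the directory entry block here ... */
+
+		htree_node_unlock(lck, EXT4_LK_DE, NULL);
+		htree_unlock(lck);
+		htree_lock_free(lck);
+		return 0;
+	}
+
+A create or unlink would instead take HTREE_LOCK_CW on the directory
+and PW child-locks on the affected index/entry blocks, so operations
+on other hash keys can proceed concurrently.
+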
+Index: linux-3.10.0-229.1.2.fc21.x86_64/include/linux/htree_lock.h
+===================================================================
+--- /dev/null
++++ linux-3.10.0-229.1.2.fc21.x86_64/include/linux/htree_lock.h
+@@ -0,0 +1,187 @@
++/*
++ * include/linux/htree_lock.h
++ *
++ * Copyright (c) 2011, 2012, Intel Corporation.
++ *
++ * Author: Liang Zhen <liang@whamcloud.com>
++ */
++
++/*
++ * htree lock
++ *
++ * htree_lock is an advanced lock, it can support five lock modes (concept is
++ * taken from DLM) and it's a sleeping lock.
++ *
++ * most common use case is:
++ * - create a htree_lock_head for data
++ * - each thread (contender) creates its own htree_lock
++ * - contender needs to call htree_lock(lock_node, mode) to protect data and
++ * call htree_unlock to release lock
++ *
++ * There is also a more advanced use-case: a user can take a PW/PR lock on a
++ * particular key, which is mostly used while the user is holding a shared
++ * lock on the htree (CW, CR)
++ *
++ * htree_lock(lock_node, HTREE_LOCK_CR); lock the htree with CR
++ * htree_node_lock(lock_node, HTREE_LOCK_PR, key...); lock @key with PR
++ * ...
++ * htree_node_unlock(lock_node); unlock the key
++ *
++ * Another tip: we can have N levels of such keys; all we need to do is
++ * specify N levels while creating the htree_lock_head, then we can
++ * lock/unlock a specific level by:
++ * htree_node_lock(lock_node, mode1, key1, level1...);
++ * do something;
++ * htree_node_lock(lock_node, mode1, key2, level2...);
++ * do something;
++ * htree_node_unlock(lock_node, level2);
++ * htree_node_unlock(lock_node, level1);
++ *
++ * NB: for multi-level, should be careful about locking order to avoid deadlock
++ */
++
++#ifndef _LINUX_HTREE_LOCK_H
++#define _LINUX_HTREE_LOCK_H
++
++#include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/sched.h>
++
++/*
++ * Lock Modes
++ * more details can be found here:
++ * http://en.wikipedia.org/wiki/Distributed_lock_manager
++ */
++typedef enum {
++ HTREE_LOCK_EX = 0, /* exclusive lock: incompatible with all others */
++ HTREE_LOCK_PW, /* protected write: allows only CR users */
++ HTREE_LOCK_PR, /* protected read: allow PR, CR users */
++ HTREE_LOCK_CW, /* concurrent write: allow CR, CW users */
++ HTREE_LOCK_CR, /* concurrent read: allow all but EX users */
++ HTREE_LOCK_MAX, /* number of lock modes */
++} htree_lock_mode_t;
++
++#define HTREE_LOCK_NL HTREE_LOCK_MAX
++#define HTREE_LOCK_INVAL 0xdead10c
++
++enum {
++ HTREE_HBITS_MIN = 2,
++ HTREE_HBITS_DEF = 14,
++ HTREE_HBITS_MAX = 32,
++};
++
++enum {
++ HTREE_EVENT_DISABLE = (0),
++ HTREE_EVENT_RD = (1 << HTREE_LOCK_PR),
++ HTREE_EVENT_WR = (1 << HTREE_LOCK_PW),
++ HTREE_EVENT_RDWR = (HTREE_EVENT_RD | HTREE_EVENT_WR),
++};
++
++struct htree_lock;
++
++typedef void (*htree_event_cb_t)(void *target, void *event);
++
++struct htree_lock_child {
++ struct list_head lc_list; /* granted list */
++ htree_event_cb_t lc_callback; /* event callback */
++ unsigned lc_events; /* event types */
++};
++
++struct htree_lock_head {
++ unsigned long lh_lock; /* bits lock */
++ /* blocked lock list (htree_lock) */
++ struct list_head lh_blocked_list;
++ /* # key levels */
++ u16 lh_depth;
++ /* hash bits for key and limit number of locks */
++ u16 lh_hbits;
++ /* counters for blocked locks */
++ u16 lh_nblocked[HTREE_LOCK_MAX];
++ /* counters for granted locks */
++ u16 lh_ngranted[HTREE_LOCK_MAX];
++ /* private data */
++ void *lh_private;
++ /* array of children locks */
++ struct htree_lock_child lh_children[0];
++};
++
++/* htree_lock_node_t is child-lock for a specific key (ln_value) */
++struct htree_lock_node {
++ htree_lock_mode_t ln_mode;
++ /* major hash key */
++ u16 ln_major_key;
++ /* minor hash key */
++ u16 ln_minor_key;
++ struct list_head ln_major_list;
++ struct list_head ln_minor_list;
++ /* alive list, all locks (granted, blocked, listening) are on it */
++ struct list_head ln_alive_list;
++ /* blocked list */
++ struct list_head ln_blocked_list;
++ /* granted list */
++ struct list_head ln_granted_list;
++ void *ln_ev_target;
++};
++
++struct htree_lock {
++ struct task_struct *lk_task;
++ struct htree_lock_head *lk_head;
++ void *lk_private;
++ unsigned lk_depth;
++ htree_lock_mode_t lk_mode;
++ struct list_head lk_blocked_list;
++ struct htree_lock_node lk_nodes[0];
++};
++
++/* create a lock head, which stands for a resource */
++struct htree_lock_head *htree_lock_head_alloc(unsigned depth,
++ unsigned hbits, unsigned priv);
++/* free a lock head */
++void htree_lock_head_free(struct htree_lock_head *lhead);
++/* register event callback for child lock at level @depth */
++void htree_lock_event_attach(struct htree_lock_head *lhead, unsigned depth,
++ unsigned events, htree_event_cb_t callback);
++/* create a lock handle, which stands for a thread */
++struct htree_lock *htree_lock_alloc(unsigned depth, unsigned pbytes);
++/* free a lock handle */
++void htree_lock_free(struct htree_lock *lck);
++/* lock htree, when @wait is false, 0 is returned if the lock can't
++ * be granted immediately */
++int htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead,
++ htree_lock_mode_t mode, int wait);
++/* unlock htree */
++void htree_unlock(struct htree_lock *lck);
++/* unlock and relock htree with @new_mode */
++int htree_change_lock_try(struct htree_lock *lck,
++ htree_lock_mode_t new_mode, int wait);
++void htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode);
++/* require child lock (key) of htree at level @dep, @event will be sent to all
++ * listeners on this @key while lock being granted */
++int htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode,
++ u32 key, unsigned dep, int wait, void *event);
++/* release child lock at level @dep, this lock will listen on its key
++ * if @event isn't NULL, event_cb will be called against @lck while granting
++ * any other lock at level @dep with the same key */
++void htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event);
++/* stop listening on child lock at level @dep */
++void htree_node_stop_listen(struct htree_lock *lck, unsigned dep);
++/* for debug */
++void htree_lock_stat_print(int depth);
++void htree_lock_stat_reset(void);
++
++#define htree_lock(lck, lh, mode) htree_lock_try(lck, lh, mode, 1)
++#define htree_change_lock(lck, mode) htree_change_lock_try(lck, mode, 1)
++
++#define htree_lock_mode(lck) ((lck)->lk_mode)
++
++#define htree_node_lock(lck, mode, key, dep) \
++ htree_node_lock_try(lck, mode, key, dep, 1, NULL)
++/* this is only safe in thread context of lock owner */
++#define htree_node_is_granted(lck, dep) \
++ ((lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_INVAL && \
++ (lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_NL)
++/* this is only safe in thread context of lock owner */
++#define htree_node_is_listening(lck, dep) \
++ ((lck)->lk_nodes[dep].ln_mode == HTREE_LOCK_NL)
++
++#endif
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/htree_lock.c
+===================================================================
+--- /dev/null
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/htree_lock.c
+@@ -0,0 +1,880 @@
++/*
++ * fs/ext4/htree_lock.c
++ *
++ * Copyright (c) 2011, 2012, Intel Corporation.
++ *
++ * Author: Liang Zhen <liang@whamcloud.com>
++ */
++#include <linux/jbd2.h>
++#include <linux/hash.h>
++#include <linux/module.h>
++#include <linux/htree_lock.h>
++
++enum {
++ HTREE_LOCK_BIT_EX = (1 << HTREE_LOCK_EX),
++ HTREE_LOCK_BIT_PW = (1 << HTREE_LOCK_PW),
++ HTREE_LOCK_BIT_PR = (1 << HTREE_LOCK_PR),
++ HTREE_LOCK_BIT_CW = (1 << HTREE_LOCK_CW),
++ HTREE_LOCK_BIT_CR = (1 << HTREE_LOCK_CR),
++};
++
++enum {
++ HTREE_LOCK_COMPAT_EX = 0,
++ HTREE_LOCK_COMPAT_PW = HTREE_LOCK_COMPAT_EX | HTREE_LOCK_BIT_CR,
++ HTREE_LOCK_COMPAT_PR = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_PR,
++ HTREE_LOCK_COMPAT_CW = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_CW,
++ HTREE_LOCK_COMPAT_CR = HTREE_LOCK_COMPAT_CW | HTREE_LOCK_BIT_PR |
++ HTREE_LOCK_BIT_PW,
++};
++
++static int htree_lock_compat[] = {
++ [HTREE_LOCK_EX] HTREE_LOCK_COMPAT_EX,
++ [HTREE_LOCK_PW] HTREE_LOCK_COMPAT_PW,
++ [HTREE_LOCK_PR] HTREE_LOCK_COMPAT_PR,
++ [HTREE_LOCK_CW] HTREE_LOCK_COMPAT_CW,
++ [HTREE_LOCK_CR] HTREE_LOCK_COMPAT_CR,
++};
++
++/* max allowed htree-lock depth.
++ * We only need depth=3 for ext4 although user can have higher value. */
++#define HTREE_LOCK_DEP_MAX 16
++
++#ifdef HTREE_LOCK_DEBUG
++
++static char *hl_name[] = {
++ [HTREE_LOCK_EX] "EX",
++ [HTREE_LOCK_PW] "PW",
++ [HTREE_LOCK_PR] "PR",
++ [HTREE_LOCK_CW] "CW",
++ [HTREE_LOCK_CR] "CR",
++};
++
++/* lock stats */
++struct htree_lock_node_stats {
++ unsigned long long blocked[HTREE_LOCK_MAX];
++ unsigned long long granted[HTREE_LOCK_MAX];
++ unsigned long long retried[HTREE_LOCK_MAX];
++ unsigned long long events;
++};
++
++struct htree_lock_stats {
++ struct htree_lock_node_stats nodes[HTREE_LOCK_DEP_MAX];
++ unsigned long long granted[HTREE_LOCK_MAX];
++ unsigned long long blocked[HTREE_LOCK_MAX];
++};
++
++static struct htree_lock_stats hl_stats;
++
++void htree_lock_stat_reset(void)
++{
++ memset(&hl_stats, 0, sizeof(hl_stats));
++}
++
++void htree_lock_stat_print(int depth)
++{
++ int i;
++ int j;
++
++ printk(KERN_DEBUG "HTREE LOCK STATS:\n");
++ for (i = 0; i < HTREE_LOCK_MAX; i++) {
++ printk(KERN_DEBUG "[%s]: G [%10llu], B [%10llu]\n",
++ hl_name[i], hl_stats.granted[i], hl_stats.blocked[i]);
++ }
++ for (i = 0; i < depth; i++) {
++ printk(KERN_DEBUG "HTREE CHILD [%d] STATS:\n", i);
++ for (j = 0; j < HTREE_LOCK_MAX; j++) {
++ printk(KERN_DEBUG
++ "[%s]: G [%10llu], B [%10llu], R [%10llu]\n",
++ hl_name[j], hl_stats.nodes[i].granted[j],
++ hl_stats.nodes[i].blocked[j],
++ hl_stats.nodes[i].retried[j]);
++ }
++ }
++}
++
++#define lk_grant_inc(m) do { hl_stats.granted[m]++; } while (0)
++#define lk_block_inc(m) do { hl_stats.blocked[m]++; } while (0)
++#define ln_grant_inc(d, m) do { hl_stats.nodes[d].granted[m]++; } while (0)
++#define ln_block_inc(d, m) do { hl_stats.nodes[d].blocked[m]++; } while (0)
++#define ln_retry_inc(d, m) do { hl_stats.nodes[d].retried[m]++; } while (0)
++#define ln_event_inc(d) do { hl_stats.nodes[d].events++; } while (0)
++
++#else /* !DEBUG */
++
++void htree_lock_stat_reset(void) {}
++void htree_lock_stat_print(int depth) {}
++
++#define lk_grant_inc(m) do {} while (0)
++#define lk_block_inc(m) do {} while (0)
++#define ln_grant_inc(d, m) do {} while (0)
++#define ln_block_inc(d, m) do {} while (0)
++#define ln_retry_inc(d, m) do {} while (0)
++#define ln_event_inc(d) do {} while (0)
++
++#endif /* DEBUG */
++
++EXPORT_SYMBOL(htree_lock_stat_reset);
++EXPORT_SYMBOL(htree_lock_stat_print);
++
++#define HTREE_DEP_ROOT (-1)
++
++#define htree_spin_lock(lhead, dep) \
++ bit_spin_lock((dep) + 1, &(lhead)->lh_lock)
++#define htree_spin_unlock(lhead, dep) \
++ bit_spin_unlock((dep) + 1, &(lhead)->lh_lock)
++
++#define htree_key_event_ignore(child, ln) \
++ (!((child)->lc_events & (1 << (ln)->ln_mode)))
++
++static int
++htree_key_list_empty(struct htree_lock_node *ln)
++{
++ return list_empty(&ln->ln_major_list) && list_empty(&ln->ln_minor_list);
++}
++
++static void
++htree_key_list_del_init(struct htree_lock_node *ln)
++{
++ struct htree_lock_node *tmp = NULL;
++
++ if (!list_empty(&ln->ln_minor_list)) {
++ tmp = list_entry(ln->ln_minor_list.next,
++ struct htree_lock_node, ln_minor_list);
++ list_del_init(&ln->ln_minor_list);
++ }
++
++ if (list_empty(&ln->ln_major_list))
++ return;
++
++ if (tmp == NULL) { /* not on minor key list */
++ list_del_init(&ln->ln_major_list);
++ } else {
++ BUG_ON(!list_empty(&tmp->ln_major_list));
++ list_replace_init(&ln->ln_major_list, &tmp->ln_major_list);
++ }
++}
++
++static void
++htree_key_list_replace_init(struct htree_lock_node *old,
++ struct htree_lock_node *new)
++{
++ if (!list_empty(&old->ln_major_list))
++ list_replace_init(&old->ln_major_list, &new->ln_major_list);
++
++ if (!list_empty(&old->ln_minor_list))
++ list_replace_init(&old->ln_minor_list, &new->ln_minor_list);
++}
++
++static void
++htree_key_event_enqueue(struct htree_lock_child *child,
++ struct htree_lock_node *ln, int dep, void *event)
++{
++ struct htree_lock_node *tmp;
++
++ /* NB: ALWAYS called holding lhead::lh_lock(dep) */
++ BUG_ON(ln->ln_mode == HTREE_LOCK_NL);
++ if (event == NULL || htree_key_event_ignore(child, ln))
++ return;
++
++ /* shouldn't be a very long list */
++ list_for_each_entry(tmp, &ln->ln_alive_list, ln_alive_list) {
++ if (tmp->ln_mode == HTREE_LOCK_NL) {
++ ln_event_inc(dep);
++ if (child->lc_callback != NULL)
++ child->lc_callback(tmp->ln_ev_target, event);
++ }
++ }
++}
++
++static int
++htree_node_lock_enqueue(struct htree_lock *newlk, struct htree_lock *curlk,
++ unsigned dep, int wait, void *event)
++{
++ struct htree_lock_child *child = &newlk->lk_head->lh_children[dep];
++ struct htree_lock_node *newln = &newlk->lk_nodes[dep];
++ struct htree_lock_node *curln = &curlk->lk_nodes[dep];
++
++ /* NB: ALWAYS called holding lhead::lh_lock(dep) */
++ /* NB: we only expect PR/PW lock modes here, only these two modes are
++ * allowed for htree_node_lock(asserted in htree_node_lock_internal),
++ * NL is only used for listener, user can't directly require NL mode */
++ if ((curln->ln_mode == HTREE_LOCK_NL) ||
++ (curln->ln_mode != HTREE_LOCK_PW &&
++ newln->ln_mode != HTREE_LOCK_PW)) {
++ /* no conflict, attach it on granted list of @curlk */
++ if (curln->ln_mode != HTREE_LOCK_NL) {
++ list_add(&newln->ln_granted_list,
++ &curln->ln_granted_list);
++ } else {
++ /* replace key owner */
++ htree_key_list_replace_init(curln, newln);
++ }
++
++ list_add(&newln->ln_alive_list, &curln->ln_alive_list);
++ htree_key_event_enqueue(child, newln, dep, event);
++ ln_grant_inc(dep, newln->ln_mode);
++ return 1; /* still hold lh_lock */
++ }
++
++ if (!wait) { /* can't grant and don't want to wait */
++ ln_retry_inc(dep, newln->ln_mode);
++ newln->ln_mode = HTREE_LOCK_INVAL;
++ return -1; /* don't wait and just return -1 */
++ }
++
++ newlk->lk_task = current;
++ set_current_state(TASK_UNINTERRUPTIBLE);
++ /* conflict, attach it on blocked list of curlk */
++ list_add_tail(&newln->ln_blocked_list, &curln->ln_blocked_list);
++ list_add(&newln->ln_alive_list, &curln->ln_alive_list);
++ ln_block_inc(dep, newln->ln_mode);
++
++ htree_spin_unlock(newlk->lk_head, dep);
++ /* wait to be given the lock */
++ if (newlk->lk_task != NULL)
++ schedule();
++ /* granted, no doubt, wake up will set me RUNNING */
++ if (event == NULL || htree_key_event_ignore(child, newln))
++ return 0; /* granted without lh_lock */
++
++ htree_spin_lock(newlk->lk_head, dep);
++ htree_key_event_enqueue(child, newln, dep, event);
++ return 1; /* still hold lh_lock */
++}
++
++/*
++ * get PR/PW access to particular tree-node according to @dep and @key,
++ * it will return -1 if @wait is false and can't immediately grant this lock.
++ * All listeners(HTREE_LOCK_NL) on @dep and with the same @key will get
++ * @event if it's not NULL.
++ * NB: ALWAYS called holding lhead::lh_lock
++ */
++static int
++htree_node_lock_internal(struct htree_lock_head *lhead, struct htree_lock *lck,
++ htree_lock_mode_t mode, u32 key, unsigned dep,
++ int wait, void *event)
++{
++ LIST_HEAD(list);
++ struct htree_lock *tmp;
++ struct htree_lock *tmp2;
++ u16 major;
++ u16 minor;
++ u8 reverse;
++ u8 ma_bits;
++ u8 mi_bits;
++
++ BUG_ON(mode != HTREE_LOCK_PW && mode != HTREE_LOCK_PR);
++ BUG_ON(htree_node_is_granted(lck, dep));
++
++ key = hash_long(key, lhead->lh_hbits);
++
++ mi_bits = lhead->lh_hbits >> 1;
++ ma_bits = lhead->lh_hbits - mi_bits;
++
++ lck->lk_nodes[dep].ln_major_key = major = key & ((1U << ma_bits) - 1);
++ lck->lk_nodes[dep].ln_minor_key = minor = key >> ma_bits;
++ lck->lk_nodes[dep].ln_mode = mode;
++
++ /*
++ * The major key list is an ordered list, so searches are started
++ * at the end of the list that is numerically closer to major_key,
++ * so at most half of the list will be walked (for well-distributed
++ * keys). The list traversal aborts early if the expected key
++ * location is passed.
++ */
++ reverse = (major >= (1 << (ma_bits - 1)));
++
++ if (reverse) {
++ list_for_each_entry_reverse(tmp,
++ &lhead->lh_children[dep].lc_list,
++ lk_nodes[dep].ln_major_list) {
++ if (tmp->lk_nodes[dep].ln_major_key == major) {
++ goto search_minor;
++
++ } else if (tmp->lk_nodes[dep].ln_major_key < major) {
++ /* attach _after_ @tmp */
++ list_add(&lck->lk_nodes[dep].ln_major_list,
++ &tmp->lk_nodes[dep].ln_major_list);
++ goto out_grant_major;
++ }
++ }
++
++ list_add(&lck->lk_nodes[dep].ln_major_list,
++ &lhead->lh_children[dep].lc_list);
++ goto out_grant_major;
++
++ } else {
++ list_for_each_entry(tmp, &lhead->lh_children[dep].lc_list,
++ lk_nodes[dep].ln_major_list) {
++ if (tmp->lk_nodes[dep].ln_major_key == major) {
++ goto search_minor;
++
++ } else if (tmp->lk_nodes[dep].ln_major_key > major) {
++ /* insert _before_ @tmp */
++ list_add_tail(&lck->lk_nodes[dep].ln_major_list,
++ &tmp->lk_nodes[dep].ln_major_list);
++ goto out_grant_major;
++ }
++ }
++
++ list_add_tail(&lck->lk_nodes[dep].ln_major_list,
++ &lhead->lh_children[dep].lc_list);
++ goto out_grant_major;
++ }
++
++ search_minor:
++ /*
++ * NB: minor_key list doesn't have a "head", @list is just a
++ * temporary stub for helping list searching, make sure it's removed
++ * after searching.
++ * minor_key list is an ordered list too.
++ */
++ list_add_tail(&list, &tmp->lk_nodes[dep].ln_minor_list);
++
++ reverse = (minor >= (1 << (mi_bits - 1)));
++
++ if (reverse) {
++ list_for_each_entry_reverse(tmp2, &list,
++ lk_nodes[dep].ln_minor_list) {
++ if (tmp2->lk_nodes[dep].ln_minor_key == minor) {
++ goto out_enqueue;
++
++ } else if (tmp2->lk_nodes[dep].ln_minor_key < minor) {
++ /* attach _after_ @tmp2 */
++ list_add(&lck->lk_nodes[dep].ln_minor_list,
++ &tmp2->lk_nodes[dep].ln_minor_list);
++ goto out_grant_minor;
++ }
++ }
++
++ list_add(&lck->lk_nodes[dep].ln_minor_list, &list);
++
++ } else {
++ list_for_each_entry(tmp2, &list,
++ lk_nodes[dep].ln_minor_list) {
++ if (tmp2->lk_nodes[dep].ln_minor_key == minor) {
++ goto out_enqueue;
++
++ } else if (tmp2->lk_nodes[dep].ln_minor_key > minor) {
++ /* insert _before_ @tmp2 */
++ list_add_tail(&lck->lk_nodes[dep].ln_minor_list,
++ &tmp2->lk_nodes[dep].ln_minor_list);
++ goto out_grant_minor;
++ }
++ }
++
++ list_add_tail(&lck->lk_nodes[dep].ln_minor_list, &list);
++ }
++
++ out_grant_minor:
++ if (list.next == &lck->lk_nodes[dep].ln_minor_list) {
++ /* new lock @lck is the first one on minor_key list, which
++ * means it has the smallest minor_key and it should
++ * replace @tmp as minor_key owner */
++ list_replace_init(&tmp->lk_nodes[dep].ln_major_list,
++ &lck->lk_nodes[dep].ln_major_list);
++ }
++ /* remove the temporary head */
++ list_del(&list);
++
++ out_grant_major:
++ ln_grant_inc(dep, lck->lk_nodes[dep].ln_mode);
++ return 1; /* granted with holding lh_lock */
++
++ out_enqueue:
++ list_del(&list); /* remove temprary head */
++ return htree_node_lock_enqueue(lck, tmp2, dep, wait, event);
++}
++
++/*
++ * release the key of @lck at level @dep, and grant any blocked locks.
++ * caller will still listen on @key if @event is not NULL, which means
++ * caller can see a event (by event_cb) while granting any lock with
++ * the same key at level @dep.
++ * NB: ALWAYS called holding lhead::lh_lock
++ * NB: listener will not block anyone because listening mode is HTREE_LOCK_NL
++ */
++static void
++htree_node_unlock_internal(struct htree_lock_head *lhead,
++ struct htree_lock *curlk, unsigned dep, void *event)
++{
++ struct htree_lock_node *curln = &curlk->lk_nodes[dep];
++ struct htree_lock *grtlk = NULL;
++ struct htree_lock_node *grtln;
++ struct htree_lock *poslk;
++ struct htree_lock *tmplk;
++
++ if (!htree_node_is_granted(curlk, dep))
++ return;
++
++ if (!list_empty(&curln->ln_granted_list)) {
++ /* there is another granted lock */
++ grtlk = list_entry(curln->ln_granted_list.next,
++ struct htree_lock,
++ lk_nodes[dep].ln_granted_list);
++ list_del_init(&curln->ln_granted_list);
++ }
++
++ if (grtlk == NULL && !list_empty(&curln->ln_blocked_list)) {
++ /*
++ * @curlk is the only granted lock, so we confirmed:
++ * a) curln is key owner (attached on major/minor_list),
++ * so if there is any blocked lock, it should be attached
++ * on curln->ln_blocked_list
++ * b) we always can grant the first blocked lock
++ */
++ grtlk = list_entry(curln->ln_blocked_list.next,
++ struct htree_lock,
++ lk_nodes[dep].ln_blocked_list);
++ BUG_ON(grtlk->lk_task == NULL);
++ wake_up_process(grtlk->lk_task);
++ }
++
++ if (event != NULL &&
++ lhead->lh_children[dep].lc_events != HTREE_EVENT_DISABLE) {
++ curln->ln_ev_target = event;
++ curln->ln_mode = HTREE_LOCK_NL; /* listen! */
++ } else {
++ curln->ln_mode = HTREE_LOCK_INVAL;
++ }
++
++ if (grtlk == NULL) { /* I must be the only one locking this key */
++ struct htree_lock_node *tmpln;
++
++ BUG_ON(htree_key_list_empty(curln));
++
++ if (curln->ln_mode == HTREE_LOCK_NL) /* listening */
++ return;
++
++ /* not listening */
++ if (list_empty(&curln->ln_alive_list)) { /* no more listener */
++ htree_key_list_del_init(curln);
++ return;
++ }
++
++ tmpln = list_entry(curln->ln_alive_list.next,
++ struct htree_lock_node, ln_alive_list);
++
++ BUG_ON(tmpln->ln_mode != HTREE_LOCK_NL);
++
++ htree_key_list_replace_init(curln, tmpln);
++ list_del_init(&curln->ln_alive_list);
++
++ return;
++ }
++
++ /* have a granted lock */
++ grtln = &grtlk->lk_nodes[dep];
++ if (!list_empty(&curln->ln_blocked_list)) {
++ /* only key owner can be on both lists */
++ BUG_ON(htree_key_list_empty(curln));
++
++ if (list_empty(&grtln->ln_blocked_list)) {
++ list_add(&grtln->ln_blocked_list,
++ &curln->ln_blocked_list);
++ }
++ list_del_init(&curln->ln_blocked_list);
++ }
++ /*
++ * NB: this is the tricky part:
++ * We have only two modes for child-lock (PR and PW), also,
++ * only owner of the key (attached on major/minor_list) can be on
++ * both blocked_list and granted_list, so @grtlk must be one
++ * of these two cases:
++ *
++ * a) @grtlk is taken from granted_list, which means we've granted
++ * more than one lock so @grtlk has to be PR, the first blocked
++ * lock must be PW and we can't grant it at all.
++ * So even @grtlk is not owner of the key (empty blocked_list),
++ * we don't care because we can't grant any lock.
++ * b) we just grant a new lock which is taken from head of blocked
++ * list, and it should be the first granted lock, and it should
++ * be the first one linked on blocked_list.
++ *
++ * Either way, we can get correct result by iterating blocked_list
++ * of @grtlk, and don't have to bother on how to find out
++ * owner of current key.
++ */
++ list_for_each_entry_safe(poslk, tmplk, &grtln->ln_blocked_list,
++ lk_nodes[dep].ln_blocked_list) {
++ if (grtlk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW ||
++ poslk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW)
++ break;
++ /* grant all readers */
++ list_del_init(&poslk->lk_nodes[dep].ln_blocked_list);
++ list_add(&poslk->lk_nodes[dep].ln_granted_list,
++ &grtln->ln_granted_list);
++
++ BUG_ON(poslk->lk_task == NULL);
++ wake_up_process(poslk->lk_task);
++ }
++
++ /* if @curln is the owner of this key, replace it with @grtln */
++ if (!htree_key_list_empty(curln))
++ htree_key_list_replace_init(curln, grtln);
++
++ if (curln->ln_mode == HTREE_LOCK_INVAL)
++ list_del_init(&curln->ln_alive_list);
++}
++
++/*
++ * it's just wrapper of htree_node_lock_internal, it returns 1 on granted
++ * and 0 only if @wait is false and can't grant it immediately
++ */
++int
++htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode,
++ u32 key, unsigned dep, int wait, void *event)
++{
++ struct htree_lock_head *lhead = lck->lk_head;
++ int rc;
++
++ BUG_ON(dep >= lck->lk_depth);
++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
++
++ htree_spin_lock(lhead, dep);
++ rc = htree_node_lock_internal(lhead, lck, mode, key, dep, wait, event);
++ if (rc != 0)
++ htree_spin_unlock(lhead, dep);
++ return rc >= 0;
++}
++EXPORT_SYMBOL(htree_node_lock_try);
++
++/* it's wrapper of htree_node_unlock_internal */
++void
++htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event)
++{
++ struct htree_lock_head *lhead = lck->lk_head;
++
++ BUG_ON(dep >= lck->lk_depth);
++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
++
++ htree_spin_lock(lhead, dep);
++ htree_node_unlock_internal(lhead, lck, dep, event);
++ htree_spin_unlock(lhead, dep);
++}
++EXPORT_SYMBOL(htree_node_unlock);
++
++/* stop listening on child-lock level @dep */
++void
++htree_node_stop_listen(struct htree_lock *lck, unsigned dep)
++{
++ struct htree_lock_node *ln = &lck->lk_nodes[dep];
++ struct htree_lock_node *tmp;
++
++ BUG_ON(htree_node_is_granted(lck, dep));
++ BUG_ON(!list_empty(&ln->ln_blocked_list));
++ BUG_ON(!list_empty(&ln->ln_granted_list));
++
++ if (!htree_node_is_listening(lck, dep))
++ return;
++
++ htree_spin_lock(lck->lk_head, dep);
++ ln->ln_mode = HTREE_LOCK_INVAL;
++ ln->ln_ev_target = NULL;
++
++ if (htree_key_list_empty(ln)) { /* not owner */
++ list_del_init(&ln->ln_alive_list);
++ goto out;
++ }
++
++ /* I'm the owner... */
++ if (list_empty(&ln->ln_alive_list)) { /* no more listener */
++ htree_key_list_del_init(ln);
++ goto out;
++ }
++
++ tmp = list_entry(ln->ln_alive_list.next,
++ struct htree_lock_node, ln_alive_list);
++
++ BUG_ON(tmp->ln_mode != HTREE_LOCK_NL);
++ htree_key_list_replace_init(ln, tmp);
++ list_del_init(&ln->ln_alive_list);
++ out:
++ htree_spin_unlock(lck->lk_head, dep);
++}
++EXPORT_SYMBOL(htree_node_stop_listen);
++
++/* release all child-locks if we have any */
++static void
++htree_node_release_all(struct htree_lock *lck)
++{
++ int i;
++
++ for (i = 0; i < lck->lk_depth; i++) {
++ if (htree_node_is_granted(lck, i))
++ htree_node_unlock(lck, i, NULL);
++ else if (htree_node_is_listening(lck, i))
++ htree_node_stop_listen(lck, i);
++ }
++}
++
++/*
++ * obtain htree lock, it could be blocked inside if there's conflict
++ * with any granted or blocked lock and @wait is true.
++ * NB: ALWAYS called holding lhead::lh_lock
++ */
++static int
++htree_lock_internal(struct htree_lock *lck, int wait)
++{
++ struct htree_lock_head *lhead = lck->lk_head;
++ int granted = 0;
++ int blocked = 0;
++ int i;
++
++ for (i = 0; i < HTREE_LOCK_MAX; i++) {
++ if (lhead->lh_ngranted[i] != 0)
++ granted |= 1 << i;
++ if (lhead->lh_nblocked[i] != 0)
++ blocked |= 1 << i;
++ }
++ if ((htree_lock_compat[lck->lk_mode] & granted) != granted ||
++ (htree_lock_compat[lck->lk_mode] & blocked) != blocked) {
++ /* will block the current lock even if it just conflicts with any
++ * other blocked lock, so a lock like EX wouldn't starve */
++ if (!wait)
++ return -1;
++ lhead->lh_nblocked[lck->lk_mode]++;
++ lk_block_inc(lck->lk_mode);
++
++ lck->lk_task = current;
++ list_add_tail(&lck->lk_blocked_list, &lhead->lh_blocked_list);
++
++ set_current_state(TASK_UNINTERRUPTIBLE);
++ htree_spin_unlock(lhead, HTREE_DEP_ROOT);
++ /* wait to be given the lock */
++ if (lck->lk_task != NULL)
++ schedule();
++ /* granted, no doubt. wake up will set me RUNNING */
++ return 0; /* without lh_lock */
++ }
++ lhead->lh_ngranted[lck->lk_mode]++;
++ lk_grant_inc(lck->lk_mode);
++ return 1;
++}
++
++/* release htree lock. NB: ALWAYS called holding lhead::lh_lock */
++static void
++htree_unlock_internal(struct htree_lock *lck)
++{
++ struct htree_lock_head *lhead = lck->lk_head;
++ struct htree_lock *tmp;
++ struct htree_lock *tmp2;
++ int granted = 0;
++ int i;
++
++ BUG_ON(lhead->lh_ngranted[lck->lk_mode] == 0);
++
++ lhead->lh_ngranted[lck->lk_mode]--;
++ lck->lk_mode = HTREE_LOCK_INVAL;
++
++ for (i = 0; i < HTREE_LOCK_MAX; i++) {
++ if (lhead->lh_ngranted[i] != 0)
++ granted |= 1 << i;
++ }
++ list_for_each_entry_safe(tmp, tmp2,
++ &lhead->lh_blocked_list, lk_blocked_list) {
++ /* conflict with any granted lock? */
++ if ((htree_lock_compat[tmp->lk_mode] & granted) != granted)
++ break;
++
++ list_del_init(&tmp->lk_blocked_list);
++
++ BUG_ON(lhead->lh_nblocked[tmp->lk_mode] == 0);
++
++ lhead->lh_nblocked[tmp->lk_mode]--;
++ lhead->lh_ngranted[tmp->lk_mode]++;
++ granted |= 1 << tmp->lk_mode;
++
++ BUG_ON(tmp->lk_task == NULL);
++ wake_up_process(tmp->lk_task);
++ }
++}
++
++/* it's wrapper of htree_lock_internal and exported interface.
++ * It always returns 1 with the lock granted if @wait is true; it can return 0
++ * if @wait is false and the locking request can't be granted immediately */
++int
++htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead,
++ htree_lock_mode_t mode, int wait)
++{
++ int rc;
++
++ BUG_ON(lck->lk_depth > lhead->lh_depth);
++ BUG_ON(lck->lk_head != NULL);
++ BUG_ON(lck->lk_task != NULL);
++
++ lck->lk_head = lhead;
++ lck->lk_mode = mode;
++
++ htree_spin_lock(lhead, HTREE_DEP_ROOT);
++ rc = htree_lock_internal(lck, wait);
++ if (rc != 0)
++ htree_spin_unlock(lhead, HTREE_DEP_ROOT);
++ return rc >= 0;
++}
++EXPORT_SYMBOL(htree_lock_try);
++
++/* it's wrapper of htree_unlock_internal and exported interface.
++ * It will release all htree_node_locks and htree_lock */
++void
++htree_unlock(struct htree_lock *lck)
++{
++ BUG_ON(lck->lk_head == NULL);
++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
++
++ htree_node_release_all(lck);
++
++ htree_spin_lock(lck->lk_head, HTREE_DEP_ROOT);
++ htree_unlock_internal(lck);
++ htree_spin_unlock(lck->lk_head, HTREE_DEP_ROOT);
++ lck->lk_head = NULL;
++ lck->lk_task = NULL;
++}
++EXPORT_SYMBOL(htree_unlock);
++
++/* change lock mode */
++void
++htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode)
++{
++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
++ lck->lk_mode = mode;
++}
++EXPORT_SYMBOL(htree_change_mode);
++
++/* release htree lock, and lock it again with new mode.
++ * This function will first release all htree_node_locks and htree_lock,
++ * then try to gain htree_lock with new @mode.
++ * It always returns 1 with the lock granted if @wait is true; it can return 0
++ * if @wait is false and the locking request can't be granted immediately */
++int
++htree_change_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, int wait)
++{
++ struct htree_lock_head *lhead = lck->lk_head;
++ int rc;
++
++ BUG_ON(lhead == NULL);
++ BUG_ON(lck->lk_mode == mode);
++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL || mode == HTREE_LOCK_INVAL);
++
++ htree_node_release_all(lck);
++
++ htree_spin_lock(lhead, HTREE_DEP_ROOT);
++ htree_unlock_internal(lck);
++ lck->lk_mode = mode;
++ rc = htree_lock_internal(lck, wait);
++ if (rc != 0)
++ htree_spin_unlock(lhead, HTREE_DEP_ROOT);
++ return rc >= 0;
++}
++EXPORT_SYMBOL(htree_change_lock_try);
++
++/* create a htree_lock head with @depth levels (number of child-locks),
++ * it is a per-resource structure */
++struct htree_lock_head *
++htree_lock_head_alloc(unsigned depth, unsigned hbits, unsigned priv)
++{
++ struct htree_lock_head *lhead;
++ int i;
++
++ if (depth > HTREE_LOCK_DEP_MAX) {
++ printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
++ depth, HTREE_LOCK_DEP_MAX);
++ return NULL;
++ }
++
++ lhead = kzalloc(offsetof(struct htree_lock_head,
++ lh_children[depth]) + priv, GFP_NOFS);
++ if (lhead == NULL)
++ return NULL;
++
++ if (hbits < HTREE_HBITS_MIN)
++ lhead->lh_hbits = HTREE_HBITS_MIN;
++ else if (hbits > HTREE_HBITS_MAX)
++ lhead->lh_hbits = HTREE_HBITS_MAX;
++
++ lhead->lh_lock = 0;
++ lhead->lh_depth = depth;
++ INIT_LIST_HEAD(&lhead->lh_blocked_list);
++ if (priv > 0) {
++ lhead->lh_private = (void *)lhead +
++ offsetof(struct htree_lock_head, lh_children[depth]);
++ }
++
++ for (i = 0; i < depth; i++) {
++ INIT_LIST_HEAD(&lhead->lh_children[i].lc_list);
++ lhead->lh_children[i].lc_events = HTREE_EVENT_DISABLE;
++ }
++ return lhead;
++}
++EXPORT_SYMBOL(htree_lock_head_alloc);
++
++/* free the htree_lock head */
++void
++htree_lock_head_free(struct htree_lock_head *lhead)
++{
++ int i;
++
++ BUG_ON(!list_empty(&lhead->lh_blocked_list));
++ for (i = 0; i < lhead->lh_depth; i++)
++ BUG_ON(!list_empty(&lhead->lh_children[i].lc_list));
++ kfree(lhead);
++}
++EXPORT_SYMBOL(htree_lock_head_free);
++
++/* register event callback for @events of child-lock at level @dep */
++void
++htree_lock_event_attach(struct htree_lock_head *lhead, unsigned dep,
++ unsigned events, htree_event_cb_t callback)
++{
++ BUG_ON(lhead->lh_depth <= dep);
++ lhead->lh_children[dep].lc_events = events;
++ lhead->lh_children[dep].lc_callback = callback;
++}
++EXPORT_SYMBOL(htree_lock_event_attach);
++
++/* allocate a htree_lock, which is a per-thread structure; @pbytes is some
++ * extra bytes of private data for the caller */
++struct htree_lock *
++htree_lock_alloc(unsigned depth, unsigned pbytes)
++{
++ struct htree_lock *lck;
++ int i = offsetof(struct htree_lock, lk_nodes[depth]);
++
++ if (depth > HTREE_LOCK_DEP_MAX) {
++ printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
++ depth, HTREE_LOCK_DEP_MAX);
++ return NULL;
++ }
++ lck = kzalloc(i + pbytes, GFP_NOFS);
++ if (lck == NULL)
++ return NULL;
++
++ if (pbytes != 0)
++ lck->lk_private = (void *)lck + i;
++ lck->lk_mode = HTREE_LOCK_INVAL;
++ lck->lk_depth = depth;
++ INIT_LIST_HEAD(&lck->lk_blocked_list);
++
++ for (i = 0; i < depth; i++) {
++ struct htree_lock_node *node = &lck->lk_nodes[i];
++
++ node->ln_mode = HTREE_LOCK_INVAL;
++ INIT_LIST_HEAD(&node->ln_major_list);
++ INIT_LIST_HEAD(&node->ln_minor_list);
++ INIT_LIST_HEAD(&node->ln_alive_list);
++ INIT_LIST_HEAD(&node->ln_blocked_list);
++ INIT_LIST_HEAD(&node->ln_granted_list);
++ }
++
++ return lck;
++}
++EXPORT_SYMBOL(htree_lock_alloc);
++
++/* free htree_lock node */
++void
++htree_lock_free(struct htree_lock *lck)
++{
++ BUG_ON(lck->lk_mode != HTREE_LOCK_INVAL);
++ kfree(lck);
++}
++EXPORT_SYMBOL(htree_lock_free);
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/Makefile
+===================================================================
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/Makefile
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/Makefile
+@@ -6,6 +6,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
+
+ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
+ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
++ htree_lock.o \
+ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+ mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
+ xattr_trusted.o inline.o
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h
+===================================================================
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/ext4.h
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h
+@@ -27,6 +27,7 @@
+ #include <linux/mutex.h>
+ #include <linux/timer.h>
+ #include <linux/wait.h>
++#include <linux/htree_lock.h>
+ #include <linux/blockgroup_lock.h>
+ #include <linux/percpu_counter.h>
+ #include <linux/ratelimit.h>
+@@ -821,6 +822,9 @@ struct ext4_inode_info {
+ __u32 i_dtime;
+ ext4_fsblk_t i_file_acl;
+
++ /* following fields for parallel directory operations -bzzz */
++ struct semaphore i_append_sem;
++
+ /*
+ * i_block_group is the number of the block group which contains
+ * this file's inode. Constant across the lifetime of the inode,
+@@ -1846,6 +1850,71 @@ struct dx_hash_info
+ */
+ #define HASH_NB_ALWAYS 1
+
++/* assume name-hash is protected by upper layer */
++#define EXT4_HTREE_LOCK_HASH 0
++
++enum ext4_pdo_lk_types {
++#if EXT4_HTREE_LOCK_HASH
++ EXT4_LK_HASH,
++#endif
++ EXT4_LK_DX, /* index block */
++ EXT4_LK_DE, /* directory entry block */
++ EXT4_LK_SPIN, /* spinlock */
++ EXT4_LK_MAX,
++};
++
++/* read-only bit */
++#define EXT4_LB_RO(b) (1 << (b))
++/* read + write, high bits for writer */
++#define EXT4_LB_RW(b) ((1 << (b)) | (1 << (EXT4_LK_MAX + (b))))
++
++enum ext4_pdo_lock_bits {
++ /* DX lock bits */
++ EXT4_LB_DX_RO = EXT4_LB_RO(EXT4_LK_DX),
++ EXT4_LB_DX = EXT4_LB_RW(EXT4_LK_DX),
++ /* DE lock bits */
++ EXT4_LB_DE_RO = EXT4_LB_RO(EXT4_LK_DE),
++ EXT4_LB_DE = EXT4_LB_RW(EXT4_LK_DE),
++ /* DX spinlock bits */
++ EXT4_LB_SPIN_RO = EXT4_LB_RO(EXT4_LK_SPIN),
++ EXT4_LB_SPIN = EXT4_LB_RW(EXT4_LK_SPIN),
++ /* accurate searching */
++ EXT4_LB_EXACT = EXT4_LB_RO(EXT4_LK_MAX << 1),
++};
++
++enum ext4_pdo_lock_opc {
++ /* external */
++ EXT4_HLOCK_READDIR = (EXT4_LB_DE_RO | EXT4_LB_DX_RO),
++ EXT4_HLOCK_LOOKUP = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO |
++ EXT4_LB_EXACT),
++ EXT4_HLOCK_DEL = (EXT4_LB_DE | EXT4_LB_SPIN_RO |
++ EXT4_LB_EXACT),
++ EXT4_HLOCK_ADD = (EXT4_LB_DE | EXT4_LB_SPIN_RO),
++
++ /* internal */
++ EXT4_HLOCK_LOOKUP_SAFE = (EXT4_LB_DE_RO | EXT4_LB_DX_RO |
++ EXT4_LB_EXACT),
++ EXT4_HLOCK_DEL_SAFE = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT),
++ EXT4_HLOCK_SPLIT = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN),
++};
++
++extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits);
++#define ext4_htree_lock_head_free(lhead) htree_lock_head_free(lhead)
++
++extern struct htree_lock *ext4_htree_lock_alloc(void);
++#define ext4_htree_lock_free(lck) htree_lock_free(lck)
++
++extern void ext4_htree_lock(struct htree_lock *lck,
++ struct htree_lock_head *lhead,
++ struct inode *dir, unsigned flags);
++#define ext4_htree_unlock(lck) htree_unlock(lck)
++
++extern struct buffer_head *__ext4_find_entry(struct inode *dir,
++ const struct qstr *d_name,
++ struct ext4_dir_entry_2 **res_dir,
++ int *inlined, struct htree_lock *lck);
++extern int __ext4_add_entry(handle_t *handle, struct dentry *dentry,
++ struct inode *inode, struct htree_lock *lck);
+
+ /*
+ * Describe an inode's exact location on disk and in memory
+@@ -2088,9 +2157,17 @@ void ext4_insert_dentry(struct inode *in
+ const char *name, int namelen, void *data);
+ static inline void ext4_update_dx_flag(struct inode *inode)
+ {
++ /* Disable it for ldiskfs, because going from a DX directory to
++ * a non-DX directory while it is in use will completely break
++ * the htree-locking.
++ * If we really want to support this operation in the future,
++ * we need to exclusively lock the directory here, which will
++ * increase the complexity of the code */
++#if 0
+ if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_COMPAT_DIR_INDEX))
+ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
++#endif
+ }
+ static unsigned char ext4_filetype_table[] = {
+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c
+===================================================================
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/namei.c
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c
+@@ -53,6 +53,7 @@ struct buffer_head *ext4_append(handle_t
+ ext4_lblk_t *block)
+ {
+ struct buffer_head *bh;
++ struct ext4_inode_info *ei = EXT4_I(inode);
+ int err = 0;
+
+ if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
+@@ -60,15 +61,22 @@ struct buffer_head *ext4_append(handle_t
+ EXT4_SB(inode->i_sb)->s_max_dir_size_kb)))
+ return ERR_PTR(-ENOSPC);
+
++ /* with parallel dir operations all appends
++ * have to be serialized -bzzz */
++ down(&ei->i_append_sem);
++
+ *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+
+ bh = ext4_bread(handle, inode, *block, 1, &err);
+- if (!bh)
++ if (!bh) {
++ up(&ei->i_append_sem);
+ return ERR_PTR(err);
++ }
+ inode->i_size += inode->i_sb->s_blocksize;
+ EXT4_I(inode)->i_disksize = inode->i_size;
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, bh);
++ up(&ei->i_append_sem);
+ if (err) {
+ brelse(bh);
+ ext4_std_error(inode->i_sb, err);
+@@ -246,7 +254,7 @@ static struct dx_frame *dx_probe(const s
+ struct inode *dir,
+ struct dx_hash_info *hinfo,
+ struct dx_frame *frame,
+- int *err);
++ struct htree_lock *lck, int *err);
+ static void dx_release(struct dx_frame *frames);
+ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
+ struct dx_hash_info *hinfo, struct dx_map_entry map[]);
+@@ -259,13 +267,13 @@ static void dx_insert_block(struct dx_fr
+ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+ struct dx_frame *frame,
+ struct dx_frame *frames,
+- __u32 *start_hash);
++ __u32 *start_hash, struct htree_lock *lck);
+ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+ const struct qstr *d_name,
+ struct ext4_dir_entry_2 **res_dir,
+- int *err);
++ struct htree_lock *lck, int *err);
+ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+- struct inode *inode);
++ struct inode *inode, struct htree_lock *lck);
+
+ /* checksumming functions */
+ void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
+@@ -668,6 +676,227 @@ struct stats dx_show_entries(struct dx_h
+ }
+ #endif /* DX_DEBUG */
+
++/* private data for htree_lock */
++struct ext4_dir_lock_data {
++ unsigned ld_flags; /* bits-map for lock types */
++ unsigned ld_count; /* # entries of the last DX block */
++ struct dx_entry ld_at_entry; /* copy of leaf dx_entry */
++ struct dx_entry *ld_at; /* position of leaf dx_entry */
++};
++
++#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private)
++#define ext4_find_entry(dir, name, dirent, inline) \
++ __ext4_find_entry(dir, name, dirent, inline, NULL)
++#define ext4_add_entry(handle, dentry, inode) \
++ __ext4_add_entry(handle, dentry, inode, NULL)
++
++/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */
++#define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32)
++
++static void ext4_htree_event_cb(void *target, void *event)
++{
++ u64 *block = (u64 *)target;
++
++ if (*block == dx_get_block((struct dx_entry *)event))
++ *block = EXT4_HTREE_NODE_CHANGED;
++}
++
++struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits)
++{
++ struct htree_lock_head *lhead;
++
++ lhead = htree_lock_head_alloc(EXT4_LK_MAX, hbits, 0);
++ if (lhead != NULL) {
++ htree_lock_event_attach(lhead, EXT4_LK_SPIN, HTREE_EVENT_WR,
++ ext4_htree_event_cb);
++ }
++ return lhead;
++}
++EXPORT_SYMBOL(ext4_htree_lock_head_alloc);
++
++struct htree_lock *ext4_htree_lock_alloc(void)
++{
++ return htree_lock_alloc(EXT4_LK_MAX,
++ sizeof(struct ext4_dir_lock_data));
++}
++EXPORT_SYMBOL(ext4_htree_lock_alloc);
++
++static htree_lock_mode_t ext4_htree_mode(unsigned flags)
++{
++ switch (flags) {
++ default: /* 0 or unknown flags require EX lock */
++ return HTREE_LOCK_EX;
++ case EXT4_HLOCK_READDIR:
++ return HTREE_LOCK_PR;
++ case EXT4_HLOCK_LOOKUP:
++ return HTREE_LOCK_CR;
++ case EXT4_HLOCK_DEL:
++ case EXT4_HLOCK_ADD:
++ return HTREE_LOCK_CW;
++ }
++}
++
++/* return PR for read-only operations, otherwise return EX */
++static inline htree_lock_mode_t ext4_htree_safe_mode(unsigned flags)
++{
++ int writer = (flags & EXT4_LB_DE) == EXT4_LB_DE;
++
++ /* 0 requires EX lock */
++ return (flags == 0 || writer) ? HTREE_LOCK_EX : HTREE_LOCK_PR;
++}
++
++static int ext4_htree_safe_locked(struct htree_lock *lck)
++{
++ int writer;
++
++ if (lck == NULL || lck->lk_mode == HTREE_LOCK_EX)
++ return 1;
++
++ writer = (ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_DE) ==
++ EXT4_LB_DE;
++ if (writer) /* all readers & writers are excluded? */
++ return lck->lk_mode == HTREE_LOCK_EX;
++
++ /* all writers are excluded? */
++ return lck->lk_mode == HTREE_LOCK_PR ||
++ lck->lk_mode == HTREE_LOCK_PW ||
++ lck->lk_mode == HTREE_LOCK_EX;
++}
++
++/* relock htree_lock with EX mode if it's a change operation, otherwise
++ * relock it with PR mode. It's a noop if PDO is disabled. */
++static void ext4_htree_safe_relock(struct htree_lock *lck)
++{
++ if (!ext4_htree_safe_locked(lck)) {
++ unsigned flags = ext4_htree_lock_data(lck)->ld_flags;
++
++ htree_change_lock(lck, ext4_htree_safe_mode(flags));
++ }
++}
++
++void ext4_htree_lock(struct htree_lock *lck, struct htree_lock_head *lhead,
++ struct inode *dir, unsigned flags)
++{
++ htree_lock_mode_t mode = is_dx(dir) ? ext4_htree_mode(flags) :
++ ext4_htree_safe_mode(flags);
++
++ ext4_htree_lock_data(lck)->ld_flags = flags;
++ htree_lock(lck, lhead, mode);
++ if (!is_dx(dir))
++ ext4_htree_safe_relock(lck); /* make sure it's safe locked */
++}
++EXPORT_SYMBOL(ext4_htree_lock);
++
++static int ext4_htree_node_lock(struct htree_lock *lck, struct dx_entry *at,
++ unsigned lmask, int wait, void *ev)
++{
++ u32 key = (at == NULL) ? 0 : dx_get_block(at);
++ u32 mode;
++
++ /* NOOP if htree is well protected or caller doesn't require the lock */
++ if (ext4_htree_safe_locked(lck) ||
++ !(ext4_htree_lock_data(lck)->ld_flags & lmask))
++ return 1;
++
++ mode = (ext4_htree_lock_data(lck)->ld_flags & lmask) == lmask ?
++ HTREE_LOCK_PW : HTREE_LOCK_PR;
++ while (1) {
++ if (htree_node_lock_try(lck, mode, key, ffz(~lmask), wait, ev))
++ return 1;
++ if (!(lmask & EXT4_LB_SPIN)) /* not a spinlock */
++ return 0;
++ cpu_relax(); /* spin until granted */
++ }
++}
++
++static int ext4_htree_node_locked(struct htree_lock *lck, unsigned lmask)
++{
++ return ext4_htree_safe_locked(lck) ||
++ htree_node_is_granted(lck, ffz(~lmask));
++}
++
++static void ext4_htree_node_unlock(struct htree_lock *lck,
++ unsigned lmask, void *buf)
++{
++ /* NB: it's safe to call multiple times, even if it's not locked */
++ if (!ext4_htree_safe_locked(lck) &&
++ htree_node_is_granted(lck, ffz(~lmask)))
++ htree_node_unlock(lck, ffz(~lmask), buf);
++}
++
++#define ext4_htree_dx_lock(lck, key) \
++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 1, NULL)
++#define ext4_htree_dx_lock_try(lck, key) \
++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 0, NULL)
++#define ext4_htree_dx_unlock(lck) \
++ ext4_htree_node_unlock(lck, EXT4_LB_DX, NULL)
++#define ext4_htree_dx_locked(lck) \
++ ext4_htree_node_locked(lck, EXT4_LB_DX)
++
++static void ext4_htree_dx_need_lock(struct htree_lock *lck)
++{
++ struct ext4_dir_lock_data *ld;
++
++ if (ext4_htree_safe_locked(lck))
++ return;
++
++ ld = ext4_htree_lock_data(lck);
++ switch (ld->ld_flags) {
++ default:
++ return;
++ case EXT4_HLOCK_LOOKUP:
++ ld->ld_flags = EXT4_HLOCK_LOOKUP_SAFE;
++ return;
++ case EXT4_HLOCK_DEL:
++ ld->ld_flags = EXT4_HLOCK_DEL_SAFE;
++ return;
++ case EXT4_HLOCK_ADD:
++ ld->ld_flags = EXT4_HLOCK_SPLIT;
++ return;
++ }
++}
++
++#define ext4_htree_de_lock(lck, key) \
++ ext4_htree_node_lock(lck, key, EXT4_LB_DE, 1, NULL)
++#define ext4_htree_de_unlock(lck) \
++ ext4_htree_node_unlock(lck, EXT4_LB_DE, NULL)
++
++#define ext4_htree_spin_lock(lck, key, event) \
++ ext4_htree_node_lock(lck, key, EXT4_LB_SPIN, 0, event)
++#define ext4_htree_spin_unlock(lck) \
++ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, NULL)
++#define ext4_htree_spin_unlock_listen(lck, p) \
++ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, p)
++
++static void ext4_htree_spin_stop_listen(struct htree_lock *lck)
++{
++ if (!ext4_htree_safe_locked(lck) &&
++ htree_node_is_listening(lck, ffz(~EXT4_LB_SPIN)))
++ htree_node_stop_listen(lck, ffz(~EXT4_LB_SPIN));
++}
++
++enum {
++ DX_HASH_COL_IGNORE, /* ignore collision while probing frames */
++ DX_HASH_COL_YES, /* there is collision and it does matter */
++ DX_HASH_COL_NO, /* there is no collision */
++};
++
++static int dx_probe_hash_collision(struct htree_lock *lck,
++ struct dx_entry *entries,
++ struct dx_entry *at, u32 hash)
++{
++ if (!(ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_EXACT)) {
++ return DX_HASH_COL_IGNORE; /* don't care about collision */
++
++ } else if (at == entries + dx_get_count(entries) - 1) {
++ return DX_HASH_COL_IGNORE; /* not in any leaf of this DX */
++
++ } else { /* hash collision? */
++ return ((dx_get_hash(at + 1) & ~1) == hash) ?
++ DX_HASH_COL_YES : DX_HASH_COL_NO;
++ }
++}
++
+ /*
+ * Probe for a directory leaf block to search.
+ *
+@@ -679,10 +908,11 @@ struct stats dx_show_entries(struct dx_h
+ */
+ static struct dx_frame *
+ dx_probe(const struct qstr *d_name, struct inode *dir,
+- struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
++ struct dx_hash_info *hinfo, struct dx_frame *frame_in,
++ struct htree_lock *lck, int *err)
+ {
+ unsigned count, indirect;
+- struct dx_entry *at, *entries, *p, *q, *m;
++ struct dx_entry *at, *entries, *p, *q, *m, *dx = NULL;
+ struct dx_root_info *info;
+ struct buffer_head *bh;
+ struct dx_frame *frame = frame_in;
+@@ -750,8 +980,15 @@ dx_probe(const struct qstr *d_name, stru
+ dxtrace(printk("Look up %x", hash));
+ while (1)
+ {
++ if (indirect == 0) { /* the last index level */
++ /* NB: ext4_htree_dx_lock() could be a noop if the
++ * DX-lock flag is not set for the current operation */
++ ext4_htree_dx_lock(lck, dx);
++ ext4_htree_spin_lock(lck, dx, NULL);
++ }
+ count = dx_get_count(entries);
+- if (!count || count > dx_get_limit(entries)) {
++ if (count == 0 || count > dx_get_limit(entries)) {
++ ext4_htree_spin_unlock(lck); /* release spin */
+ ext4_warning(dir->i_sb,
+ "dx entry: no count or count > limit");
+ brelse(bh);
+@@ -792,7 +1029,70 @@ dx_probe(const struct qstr *d_name, stru
+ frame->bh = bh;
+ frame->entries = entries;
+ frame->at = at;
+- if (!indirect--) return frame;
++
++ if (indirect == 0) { /* the last index level */
++ struct ext4_dir_lock_data *ld;
++ u64 myblock;
++
++ /* By default we only lock DE-block, however, we will
++ * also lock the last level DX-block if:
++ * a) there is a hash collision
++ * we will set the DX-lock flag (a few lines below)
++ * and redo the probe to take the DX-lock,
++ * see details in dx_probe_hash_collision()
++ * b) it's a retry from splitting
++ * we need to lock the last level DX-block so nobody
++ * else can split any leaf blocks under the same
++ * DX-block, see details in ext4_dx_add_entry()
++ */
++ if (ext4_htree_dx_locked(lck)) {
++ /* DX-block is locked, just lock DE-block
++ * and return */
++ ext4_htree_spin_unlock(lck);
++ if (!ext4_htree_safe_locked(lck))
++ ext4_htree_de_lock(lck, frame->at);
++ return frame;
++ }
++ /* it's pdirop and no DX lock */
++ if (dx_probe_hash_collision(lck, entries, at, hash) ==
++ DX_HASH_COL_YES) {
++ /* found hash collision, set DX-lock flag
++ * and retry to obtain the DX-lock */
++ ext4_htree_spin_unlock(lck);
++ ext4_htree_dx_need_lock(lck);
++ continue;
++ }
++ ld = ext4_htree_lock_data(lck);
++ /* because I don't lock DX, @at can't be trusted
++ * after I release the spinlock, so I have to save it */
++ ld->ld_at = at;
++ ld->ld_at_entry = *at;
++ ld->ld_count = dx_get_count(entries);
++
++ frame->at = &ld->ld_at_entry;
++ myblock = dx_get_block(at);
++
++ /* NB: ordering locking */
++ ext4_htree_spin_unlock_listen(lck, &myblock);
++ /* another thread can split this DE-block because:
++ * a) I don't have the lock for the DE-block yet
++ * b) I released the spinlock on the DX-block
++ * if that happens I can detect it by listening for the
++ * splitting event on this DE-block */
++ ext4_htree_de_lock(lck, frame->at);
++ ext4_htree_spin_stop_listen(lck);
++
++ if (myblock == EXT4_HTREE_NODE_CHANGED) {
++ /* someone split this DE-block before
++ * I locked it; I need to retry and lock
++ * the valid DE-block */
++ ext4_htree_de_unlock(lck);
++ continue;
++ }
++ return frame;
++ }
++ dx = at;
++ indirect--;
+ bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
+ if (IS_ERR(bh)) {
+ *err = PTR_ERR(bh);
+@@ -860,7 +1160,7 @@ static void dx_release (struct dx_frame
+ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+ struct dx_frame *frame,
+ struct dx_frame *frames,
+- __u32 *start_hash)
++ __u32 *start_hash, struct htree_lock *lck)
+ {
+ struct dx_frame *p;
+ struct buffer_head *bh;
+@@ -875,12 +1175,22 @@ static int ext4_htree_next_block(struct
+ * this loop, num_frames indicates the number of interior
+ * nodes need to be read.
+ */
++ ext4_htree_de_unlock(lck);
+ while (1) {
+- if (++(p->at) < p->entries + dx_get_count(p->entries))
+- break;
++ if (num_frames > 0 || ext4_htree_dx_locked(lck)) {
++ /* num_frames > 0 :
++ * DX block
++ * ext4_htree_dx_locked:
++ * frame->at is a reliable pointer returned by dx_probe,
++ * otherwise dx_probe already knew there was no collision */
++ if (++(p->at) < p->entries + dx_get_count(p->entries))
++ break;
++ }
+ if (p == frames)
+ return 0;
+ num_frames++;
++ if (num_frames == 1)
++ ext4_htree_dx_unlock(lck);
+ p--;
+ }
+
+@@ -903,6 +1213,13 @@ static int ext4_htree_next_block(struct
+ * block so no check is necessary
+ */
+ while (num_frames--) {
++ if (num_frames == 0) {
++ /* it's not always necessary, we just don't want to
++ * detect hash collision again */
++ ext4_htree_dx_need_lock(lck);
++ ext4_htree_dx_lock(lck, p->at);
++ }
++
+ bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+@@ -911,6 +1228,7 @@ static int ext4_htree_next_block(struct
+ p->bh = bh;
+ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
+ }
++ ext4_htree_de_lock(lck, p->at);
+ return 1;
+ }
+
+@@ -1013,10 +1331,10 @@ int ext4_htree_fill_tree(struct file *di
+ }
+ hinfo.hash = start_hash;
+ hinfo.minor_hash = 0;
+- frame = dx_probe(NULL, dir, &hinfo, frames, &err);
++ /* assume it's PR locked */
++ frame = dx_probe(NULL, dir, &hinfo, frames, NULL, &err);
+ if (!frame)
+ return err;
+-
+ /* Add '.' and '..' from the htree header */
+ if (!start_hash && !start_minor_hash) {
+ de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
+@@ -1043,7 +1361,7 @@ int ext4_htree_fill_tree(struct file *di
+ count += ret;
+ hashval = ~0;
+ ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
+- frame, frames, &hashval);
++ frame, frames, &hashval, NULL);
+ *next_hash = hashval;
+ if (ret < 0) {
+ err = ret;
+@@ -1236,10 +1554,10 @@ static int is_dx_internal_node(struct in
+ * The returned buffer_head has ->b_count elevated. The caller is expected
+ * to brelse() it when appropriate.
+ */
+-static struct buffer_head * ext4_find_entry (struct inode *dir,
++struct buffer_head *__ext4_find_entry(struct inode *dir,
+ const struct qstr *d_name,
+ struct ext4_dir_entry_2 **res_dir,
+- int *inlined)
++ int *inlined, struct htree_lock *lck)
+ {
+ struct super_block *sb;
+ struct buffer_head *bh_use[NAMEI_RA_SIZE];
+@@ -1283,7 +1601,7 @@ static struct buffer_head * ext4_find_en
+ goto restart;
+ }
+ if (is_dx(dir)) {
+- bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
++ bh = ext4_dx_find_entry(dir, d_name, res_dir, lck, &err);
+ /*
+ * On success, or if the error was file not found,
+ * return. Otherwise, fall back to doing a search the
+@@ -1297,6 +1615,7 @@ static struct buffer_head * ext4_find_en
+ return bh;
+ dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
+ "falling back\n"));
++ ext4_htree_safe_relock(lck);
+ }
+ nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
+ start = EXT4_I(dir)->i_dir_start_lookup;
+@@ -1389,9 +1708,12 @@ cleanup_and_exit:
+ brelse(bh_use[ra_ptr]);
+ return ret;
+ }
++EXPORT_SYMBOL(__ext4_find_entry);
+
+-static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
+- struct ext4_dir_entry_2 **res_dir, int *err)
++static struct buffer_head *ext4_dx_find_entry(struct inode *dir,
++ const struct qstr *d_name,
++ struct ext4_dir_entry_2 **res_dir,
++ struct htree_lock *lck, int *err)
+ {
+ struct super_block * sb = dir->i_sb;
+ struct dx_hash_info hinfo;
+@@ -1400,7 +1722,7 @@ static struct buffer_head * ext4_dx_find
+ ext4_lblk_t block;
+ int retval;
+
+- if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
++ if (!(frame = dx_probe(d_name, dir, &hinfo, frames, lck, err)))
+ return NULL;
+ do {
+ block = dx_get_block(frame->at);
+@@ -1424,7 +1746,7 @@ static struct buffer_head * ext4_dx_find
+
+ /* Check to see if we should continue to search */
+ retval = ext4_htree_next_block(dir, hinfo.hash, frame,
+- frames, NULL);
++ frames, NULL, lck);
+ if (retval < 0) {
+ ext4_warning(sb,
+ "error reading index page in directory #%lu",
+@@ -1583,8 +1905,9 @@ static struct ext4_dir_entry_2* dx_pack_
+ * Returns pointer to de in block into which the new entry will be inserted.
+ */
+ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
+- struct buffer_head **bh,struct dx_frame *frame,
+- struct dx_hash_info *hinfo, int *error)
++ struct buffer_head **bh, struct dx_frame *frames,
++ struct dx_frame *frame, struct dx_hash_info *hinfo,
++ struct htree_lock *lck, int *error)
+ {
+ unsigned blocksize = dir->i_sb->s_blocksize;
+ unsigned count, continued;
+@@ -1647,7 +1970,14 @@ static struct ext4_dir_entry_2 *do_split
+ hash2, split, count-split));
+
+ /* Fancy dance to stay within two buffers */
+- de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
++ if (hinfo->hash < hash2) {
++ de2 = dx_move_dirents(data1, data2, map + split,
++ count - split, blocksize);
++ } else {
++ /* make sure we will add the entry to the same block that
++ * we have already locked */
++ de2 = dx_move_dirents(data1, data2, map, split, blocksize);
++ }
+ de = dx_pack_dirents(data1, blocksize);
+ de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
+ (char *) de,
+@@ -1666,13 +1996,21 @@ static struct ext4_dir_entry_2 *do_split
+ dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
+ dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
+
+- /* Which block gets the new entry? */
+- if (hinfo->hash >= hash2)
+- {
+- swap(*bh, bh2);
+- de = de2;
++ ext4_htree_spin_lock(lck, frame > frames ? (frame - 1)->at : NULL,
++ frame->at); /* notify block is being split */
++ if (hinfo->hash < hash2) {
++ dx_insert_block(frame, hash2 + continued, newblock);
++
++ } else {
++ /* switch block number */
++ dx_insert_block(frame, hash2 + continued,
++ dx_get_block(frame->at));
++ dx_set_block(frame->at, newblock);
++ (frame->at)++;
+ }
+- dx_insert_block(frame, hash2 + continued, newblock);
++ ext4_htree_spin_unlock(lck);
++ ext4_htree_dx_unlock(lck);
++
+ err = ext4_handle_dirty_dirent_node(handle, dir, bh2);
+ if (err)
+ goto journal_error;
+@@ -1945,7 +2283,7 @@ static int make_indexed_dir(handle_t *ha
+ if (retval)
+ goto out_frames;
+
+- de = do_split(handle,dir, &bh2, frame, &hinfo, &retval);
++ de = do_split(handle, dir, &bh2, frames, frame, &hinfo, NULL, &retval);
+ if (!de) {
+ goto out_frames;
+ }
+@@ -2051,8 +2389,8 @@ out:
+ * may not sleep between calling this and putting something into
+ * the entry, as someone else might have used it while you slept.
+ */
+-static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+- struct inode *inode)
++int __ext4_add_entry(handle_t *handle, struct dentry *dentry,
++ struct inode *inode, struct htree_lock *lck)
+ {
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct buffer_head *bh = NULL;
+@@ -2087,9 +2425,10 @@ static int ext4_add_entry(handle_t *hand
+ if (dentry->d_name.len == 2 &&
+ memcmp(dentry->d_name.name, "..", 2) == 0)
+ return ext4_update_dotdot(handle, dentry, inode);
+- retval = ext4_dx_add_entry(handle, dentry, inode);
++ retval = ext4_dx_add_entry(handle, dentry, inode, lck);
+ if (!retval || (retval != ERR_BAD_DX_DIR))
+ goto out;
++ ext4_htree_safe_relock(lck);
+ ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
+ dx_fallback++;
+ ext4_mark_inode_dirty(handle, dir);
+@@ -2129,12 +2468,13 @@ static int ext4_add_entry(handle_t *hand
+ ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
+ return retval;
+ }
++EXPORT_SYMBOL(__ext4_add_entry);
+
+ /*
+ * Returns 0 for success, or a negative error value
+ */
+ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+- struct inode *inode)
++ struct inode *inode, struct htree_lock *lck)
+ {
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+ struct dx_entry *entries, *at;
+@@ -2148,7 +2488,7 @@ static int ext4_dx_add_entry(handle_t *h
+
+ again:
+ restart = 0;
+- frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
++ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, lck, &err);
+ if (!frame)
+ return err;
+ entries = frame->entries;
+@@ -2178,6 +2518,11 @@ again:
+ struct dx_node *node2;
+ struct buffer_head *bh2;
+
++ if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */
++ ext4_htree_safe_relock(lck);
++ restart = 1;
++ goto cleanup;
++ }
+ while (frame > frames) {
+ if (dx_get_count((frame - 1)->entries) <
+ dx_get_limit((frame - 1)->entries)) {
+@@ -2277,16 +2622,43 @@ again:
+ restart = 1;
+ goto cleanup;
+ }
++ } else if (!ext4_htree_dx_locked(lck)) {
++ struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck);
++
++ /* not well protected, require DX lock */
++ ext4_htree_dx_need_lock(lck);
++ at = frame > frames ? (frame - 1)->at : NULL;
++
++ /* NB: no risk of deadlock because it's just a try.
++ *
++ * NB: we check ld_count twice, the first time before
++ * taking the DX lock, the second time after holding the DX lock.
++ *
++ * NB: we never free directory blocks so far, which means
++ * the value returned by dx_get_count() should equal
++ * ld->ld_count if nobody split any DE-block under @at,
++ * and ld->ld_at still points to a valid dx_entry. */
++ if ((ld->ld_count != dx_get_count(entries)) ||
++ !ext4_htree_dx_lock_try(lck, at) ||
++ (ld->ld_count != dx_get_count(entries))) {
++ restart = 1;
++ goto cleanup;
++ }
++ /* OK, I've got DX lock and nothing changed */
++ frame->at = ld->ld_at;
+ }
+- de = do_split(handle, dir, &bh, frame, &hinfo, &err);
++ de = do_split(handle, dir, &bh, frames, frame, &hinfo, lck, &err);
+ if (!de)
+ goto cleanup;
++
+ err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ goto cleanup;
+
+ journal_error:
+ ext4_std_error(dir->i_sb, err);
+ cleanup:
++ ext4_htree_dx_unlock(lck);
++ ext4_htree_de_unlock(lck);
+ brelse(bh);
+ dx_release(frames);
+ /* @restart is true means htree-path has been changed, we need to
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/super.c
+===================================================================
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/super.c
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/super.c
+@@ -875,6 +875,7 @@ static struct inode *ext4_alloc_inode(st
+
+ ei->vfs_inode.i_version = 1;
+ spin_lock_init(&ei->i_raw_lock);
++ sema_init(&ei->i_append_sem, 1);
+ INIT_LIST_HEAD(&ei->i_prealloc_list);
+ spin_lock_init(&ei->i_prealloc_lock);
+ ext4_es_init_tree(&ei->i_es_tree);
--- /dev/null
+Index: linux-3.10.0-514.16.1.el7.x86_64/fs/ext4/ext4.h
+===================================================================
+--- linux-3.10.0-514.16.1.el7.x86_64.orig/fs/ext4/ext4.h
++++ linux-3.10.0-514.16.1.el7.x86_64/fs/ext4/ext4.h
+@@ -1270,11 +1270,14 @@ struct ext4_sb_info {
+
+ /* tunables */
+ unsigned long s_stripe;
+- unsigned int s_mb_stream_request;
++ unsigned long s_mb_small_req;
++ unsigned long s_mb_large_req;
+ unsigned int s_mb_max_to_scan;
+ unsigned int s_mb_min_to_scan;
+ unsigned int s_mb_stats;
+ unsigned int s_mb_order2_reqs;
++ unsigned long *s_mb_prealloc_table;
++ unsigned long s_mb_prealloc_table_size;
+ unsigned int s_mb_group_prealloc;
+ unsigned int s_max_dir_size_kb;
+ /* where last allocation was done - for stream allocation */
+Index: linux-3.10.0-514.16.1.el7.x86_64/fs/ext4/mballoc.c
+===================================================================
+--- linux-3.10.0-514.16.1.el7.x86_64.orig/fs/ext4/mballoc.c
++++ linux-3.10.0-514.16.1.el7.x86_64/fs/ext4/mballoc.c
+@@ -1862,6 +1862,26 @@ int ext4_mb_find_by_goal(struct ext4_all
+ return 0;
+ }
+
++static int ext4_mb_prealloc_table_add(struct ext4_sb_info *sbi, int value)
++{
++ int i;
++
++ if (value > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group))
++ return -1;
++
++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) {
++ if (sbi->s_mb_prealloc_table[i] == 0) {
++ sbi->s_mb_prealloc_table[i] = value;
++ return 0;
++ }
++
++ /* values must be added in increasing order */
++ if (value <= sbi->s_mb_prealloc_table[i])
++ return -1;
++ }
++ return -1;
++}
++
+ /*
+ * The routine scans buddy structures (not bitmap!) from given order
+ * to max order and tries to find big enough chunk to satisfy the req
+@@ -2301,6 +2321,93 @@ static const struct seq_operations ext4_
+ .show = ext4_mb_seq_groups_show,
+ };
+
++#define EXT4_MB_PREALLOC_TABLE "prealloc_table"
++
++static ssize_t ext4_mb_prealloc_table_proc_write(struct file *file,
++ const char __user *buf,
++ size_t cnt, loff_t *pos)
++{
++ struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file)));
++ unsigned long value;
++ unsigned long prev = 0;
++ char str[128];
++ char *cur;
++ char *end;
++ unsigned long *new_table;
++ int num = 0;
++ int i = 0;
++
++ if (cnt >= sizeof(str))
++ return -EINVAL;
++ if (copy_from_user(str, buf, cnt))
++ return -EFAULT;
++
++ num = 0;
++ cur = str;
++ end = str + cnt;
++ while (cur < end) {
++ int rc;
++ while ((cur < end) && (*cur == ' '))
++ cur++;
++ rc = kstrtol(cur, 0, &value);
++ if (rc != 0)
++ return -EINVAL;
++ if (value == 0)
++ break;
++ if (value <= prev)
++ return -EINVAL;
++ prev = value;
++ num++;
++ }
++
++ new_table = kmalloc(num * sizeof(*new_table), GFP_KERNEL);
++ if (new_table == NULL)
++ return -ENOMEM;
++ kfree(sbi->s_mb_prealloc_table);
++ memset(new_table, 0, num * sizeof(*new_table));
++ sbi->s_mb_prealloc_table = new_table;
++ sbi->s_mb_prealloc_table_size = num;
++ cur = str;
++ end = str + cnt;
++ while (cur < end && i < num) {
++ while (cur < end && *cur == ' ')
++ cur++;
++ value = simple_strtol(cur, &cur, 0);
++ if (ext4_mb_prealloc_table_add(sbi, value) == 0)
++ ++i;
++ }
++ if (i != num)
++ sbi->s_mb_prealloc_table_size = i;
++
++ return cnt;
++}
++
++static int mb_prealloc_table_seq_show(struct seq_file *m, void *v)
++{
++ struct ext4_sb_info *sbi = EXT4_SB(m->private);
++ int i;
++
++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++)
++ seq_printf(m, "%ld ", sbi->s_mb_prealloc_table[i]);
++ seq_printf(m, "\n");
++
++ return 0;
++}
++
++static int mb_prealloc_table_seq_open(struct inode *inode, struct file *file)
++{
++ return single_open(file, mb_prealloc_table_seq_show, PDE_DATA(inode));
++}
++
++static const struct file_operations ext4_mb_prealloc_seq_fops = {
++ .owner = THIS_MODULE,
++ .open = mb_prealloc_table_seq_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = single_release,
++ .write = ext4_mb_prealloc_table_proc_write,
++};
++
+ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
+ {
+ struct super_block *sb = PDE_DATA(inode);
+@@ -2550,7 +2657,7 @@ static int ext4_groupinfo_create_slab(si
+ int ext4_mb_init(struct super_block *sb)
+ {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+- unsigned i, j;
++ unsigned i, j, k, l;
+ unsigned offset, offset_incr;
+ unsigned max;
+ int ret;
+@@ -2595,7 +2702,6 @@ int ext4_mb_init(struct super_block *sb)
+ sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
+ sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
+ sbi->s_mb_stats = MB_DEFAULT_STATS;
+- sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
+ sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+ /*
+ * The default group preallocation is 512, which for 4k block
+@@ -2619,9 +2725,47 @@ int ext4_mb_init(struct super_block *sb)
+ * RAID stripe size so that preallocations don't fragment
+ * the stripes.
+ */
+- if (sbi->s_stripe > 1) {
+- sbi->s_mb_group_prealloc = roundup(
+- sbi->s_mb_group_prealloc, sbi->s_stripe);
++
++ if (sbi->s_stripe == 0) {
++ sbi->s_mb_prealloc_table_size = 10;
++ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long);
++ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS);
++ if (sbi->s_mb_prealloc_table == NULL) {
++ ret = -ENOMEM;
++ goto out;
++ }
++ memset(sbi->s_mb_prealloc_table, 0, i);
++
++ for (k = 0, l = 4; k <= 9; ++k, l *= 2) {
++ if (ext4_mb_prealloc_table_add(sbi, l) < 0) {
++ sbi->s_mb_prealloc_table_size = k;
++ break;
++ }
++ }
++
++ sbi->s_mb_small_req = 256;
++ sbi->s_mb_large_req = 1024;
++ sbi->s_mb_group_prealloc = 512;
++ } else {
++ sbi->s_mb_prealloc_table_size = 3;
++ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long);
++ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS);
++ if (sbi->s_mb_prealloc_table == NULL) {
++ ret = -ENOMEM;
++ goto out;
++ }
++ memset(sbi->s_mb_prealloc_table, 0, i);
++
++ for (k = 0, l = sbi->s_stripe; k <= 2; ++k, l *= 2) {
++ if (ext4_mb_prealloc_table_add(sbi, l) < 0) {
++ sbi->s_mb_prealloc_table_size = k;
++ break;
++ }
++ }
++
++ sbi->s_mb_small_req = sbi->s_stripe;
++ sbi->s_mb_large_req = sbi->s_stripe * 8;
++ sbi->s_mb_group_prealloc = sbi->s_stripe * 4;
+ }
+
+ sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
+@@ -2643,9 +2787,13 @@ int ext4_mb_init(struct super_block *sb)
+ if (ret != 0)
+ goto out_free_locality_groups;
+
+- if (sbi->s_proc)
++ if (sbi->s_proc) {
+ proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
+ &ext4_mb_seq_groups_fops, sb);
++ proc_create_data(EXT4_MB_PREALLOC_TABLE, S_IFREG | S_IRUGO |
++ S_IWUSR, sbi->s_proc,
++ &ext4_mb_prealloc_seq_fops, sb);
++ }
+
+ return 0;
+
+@@ -2653,6 +2801,7 @@ out_free_locality_groups:
+ free_percpu(sbi->s_locality_groups);
+ sbi->s_locality_groups = NULL;
+ out:
++ kfree(sbi->s_mb_prealloc_table);
+ kfree(sbi->s_mb_offsets);
+ sbi->s_mb_offsets = NULL;
+ kfree(sbi->s_mb_maxs);
+@@ -2687,8 +2836,10 @@ int ext4_mb_release(struct super_block *
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
+
+- if (sbi->s_proc)
++ if (sbi->s_proc) {
+ remove_proc_entry("mb_groups", sbi->s_proc);
++ remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_proc);
++ }
+
+ if (sbi->s_group_info) {
+ for (i = 0; i < ngroups; i++) {
+@@ -3000,9 +3151,9 @@ ext4_mb_normalize_request(struct ext4_al
+ struct ext4_allocation_request *ar)
+ {
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+- int bsbits, max;
++ int bsbits, i, wind;
+ ext4_lblk_t end;
+- loff_t size, start_off;
++ loff_t size;
+ loff_t orig_size __maybe_unused;
+ ext4_lblk_t start;
+ struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
+@@ -3035,51 +3186,34 @@ ext4_mb_normalize_request(struct ext4_al
+ size = size << bsbits;
+ if (size < i_size_read(ac->ac_inode))
+ size = i_size_read(ac->ac_inode);
+- orig_size = size;
++ size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits;
+
+- /* max size of free chunks */
+- max = 2 << bsbits;
++ start = wind = 0;
+
+-#define NRL_CHECK_SIZE(req, size, max, chunk_size) \
+- (req <= (size) || max <= (chunk_size))
++ /* let's choose preallocation window depending on file size */
++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) {
++ if (size <= sbi->s_mb_prealloc_table[i]) {
++ wind = sbi->s_mb_prealloc_table[i];
++ break;
++ }
++ }
++ size = wind;
+
+- /* first, try to predict filesize */
+- /* XXX: should this table be tunable? */
+- start_off = 0;
+- if (size <= 16 * 1024) {
+- size = 16 * 1024;
+- } else if (size <= 32 * 1024) {
+- size = 32 * 1024;
+- } else if (size <= 64 * 1024) {
+- size = 64 * 1024;
+- } else if (size <= 128 * 1024) {
+- size = 128 * 1024;
+- } else if (size <= 256 * 1024) {
+- size = 256 * 1024;
+- } else if (size <= 512 * 1024) {
+- size = 512 * 1024;
+- } else if (size <= 1024 * 1024) {
+- size = 1024 * 1024;
+- } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
+- start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+- (21 - bsbits)) << 21;
+- size = 2 * 1024 * 1024;
+- } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
+- start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+- (22 - bsbits)) << 22;
+- size = 4 * 1024 * 1024;
+- } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
+- (8<<20)>>bsbits, max, 8 * 1024)) {
+- start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+- (23 - bsbits)) << 23;
+- size = 8 * 1024 * 1024;
+- } else {
+- start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
+- size = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
+- ac->ac_o_ex.fe_len) << bsbits;
++ if (wind == 0) {
++ __u64 tstart, tend;
++ /* file is quite large, we now preallocate with
++ * the biggest configured window with regard to
++ * logical offset */
++ wind = sbi->s_mb_prealloc_table[i - 1];
++ tstart = ac->ac_o_ex.fe_logical;
++ do_div(tstart, wind);
++ start = tstart * wind;
++ tend = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len - 1;
++ do_div(tend, wind);
++ tend = tend * wind + wind;
++ size = tend - start;
+ }
+- size = size >> bsbits;
+- start = start_off >> bsbits;
++ orig_size = size;
+
+ /* don't cover already allocated blocks in selected range */
+ if (ar->pleft && start <= ar->lleft) {
+@@ -3154,7 +3288,6 @@ ext4_mb_normalize_request(struct ext4_al
+ (unsigned long) ac->ac_o_ex.fe_logical);
+ BUG();
+ }
+- BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+
+ /* now prepare goal request */
+
+@@ -4119,11 +4252,19 @@ static void ext4_mb_group_or_file(struct
+
+ /* don't use group allocation for large files */
+ size = max(size, isize);
+- if (size > sbi->s_mb_stream_request) {
++ if ((ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) ||
++ (size >= sbi->s_mb_large_req)) {
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+ return;
+ }
+
++ /*
++ * the request is so large that we don't care about
++ * streaming - it outweighs any possible seek
++ */
++ if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req)
++ return;
++
+ BUG_ON(ac->ac_lg != NULL);
+ /*
+ * locality group prealloc space are per cpu. The reason for having
+Index: linux-3.10.0-514.16.1.el7.x86_64/fs/ext4/super.c
+===================================================================
+--- linux-3.10.0-514.16.1.el7.x86_64.orig/fs/ext4/super.c
++++ linux-3.10.0-514.16.1.el7.x86_64/fs/ext4/super.c
+@@ -2672,7 +2672,8 @@ EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats
+ EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
+ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
+ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
+-EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
++EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_small_req);
++EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req);
+ EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+ EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
+ EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
+@@ -2698,7 +2699,8 @@ static struct attribute *ext4_attrs[] =
+ ATTR_LIST(mb_max_to_scan),
+ ATTR_LIST(mb_min_to_scan),
+ ATTR_LIST(mb_order2_req),
+- ATTR_LIST(mb_stream_req),
++ ATTR_LIST(mb_small_req),
++ ATTR_LIST(mb_large_req),
+ ATTR_LIST(mb_group_prealloc),
+ ATTR_LIST(max_writeback_mb_bump),
+ ATTR_LIST(extent_max_zeroout_kb),
+Index: linux-3.10.0-514.16.1.el7.x86_64/fs/ext4/inode.c
+===================================================================
+--- linux-3.10.0-514.16.1.el7.x86_64.orig/fs/ext4/inode.c
++++ linux-3.10.0-514.16.1.el7.x86_64/fs/ext4/inode.c
+@@ -2399,6 +2399,9 @@ static int ext4_writepages(struct addres
+ ext4_journal_stop(handle);
+ }
+
++ if (wbc->nr_to_write < sbi->s_mb_small_req)
++ wbc->nr_to_write = sbi->s_mb_small_req;
++
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+
--- /dev/null
+From a521100231f816f8cdd9c8e77da14ff1e42c2b17 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Thu, 4 Sep 2014 18:06:25 -0400
+Subject: [PATCH] ext4: pass allocation_request struct to
+ ext4_(alloc,splice)_branch
+
+Instead of initializing the allocation_request structure in
+ext4_alloc_branch(), set it up in ext4_ind_map_blocks(), and then pass
+it to ext4_alloc_branch() and ext4_splice_branch().
+
+This allows ext4_ind_map_blocks to pass flags in the allocation
+request structure without having to add Yet Another argument to
+ext4_alloc_branch().
+
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Reviewed-by: Jan Kara <jack@suse.cz>
+---
+ fs/ext4/indirect.c | 82 +++++++++++++++++++++++++-----------------------------
+ 1 file changed, 38 insertions(+), 44 deletions(-)
+
+diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
+index e75f840..69af0cd 100644
+--- a/fs/ext4/indirect.c
++++ b/fs/ext4/indirect.c
+@@ -318,34 +318,22 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
+ * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
+ * as described above and return 0.
+ */
+-static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
+- ext4_lblk_t iblock, int indirect_blks,
+- int *blks, ext4_fsblk_t goal,
+- ext4_lblk_t *offsets, Indirect *branch)
++static int ext4_alloc_branch(handle_t *handle,
++ struct ext4_allocation_request *ar,
++ int indirect_blks, ext4_lblk_t *offsets,
++ Indirect *branch)
+ {
+- struct ext4_allocation_request ar;
+ struct buffer_head * bh;
+ ext4_fsblk_t b, new_blocks[4];
+ __le32 *p;
+ int i, j, err, len = 1;
+
+- /*
+- * Set up for the direct block allocation
+- */
+- memset(&ar, 0, sizeof(ar));
+- ar.inode = inode;
+- ar.len = *blks;
+- ar.logical = iblock;
+- if (S_ISREG(inode->i_mode))
+- ar.flags = EXT4_MB_HINT_DATA;
+-
+ for (i = 0; i <= indirect_blks; i++) {
+ if (i == indirect_blks) {
+- ar.goal = goal;
+- new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err);
++ new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err);
+ } else
+- goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode,
+- goal, 0, NULL, &err);
++ ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle,
++ ar->inode, ar->goal, 0, NULL, &err);
+ if (err) {
+ i--;
+ goto failed;
+@@ -354,7 +342,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
+ if (i == 0)
+ continue;
+
+- bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]);
++ bh = branch[i].bh = sb_getblk(ar->inode->i_sb, new_blocks[i-1]);
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
+ goto failed;
+@@ -372,7 +360,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
+ b = new_blocks[i];
+
+ if (i == indirect_blks)
+- len = ar.len;
++ len = ar->len;
+ for (j = 0; j < len; j++)
+ *p++ = cpu_to_le32(b++);
+
+@@ -381,11 +369,10 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
+ unlock_buffer(bh);
+
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+- err = ext4_handle_dirty_metadata(handle, inode, bh);
++ err = ext4_handle_dirty_metadata(handle, ar->inode, bh);
+ if (err)
+ goto failed;
+ }
+- *blks = ar.len;
+ return 0;
+ failed:
+ for (; i >= 0; i--) {
+@@ -396,10 +383,10 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
+ * existing before ext4_alloc_branch() was called.
+ */
+ if (i > 0 && i != indirect_blks && branch[i].bh)
+- ext4_forget(handle, 1, inode, branch[i].bh,
++ ext4_forget(handle, 1, ar->inode, branch[i].bh,
+ branch[i].bh->b_blocknr);
+- ext4_free_blocks(handle, inode, NULL, new_blocks[i],
+- (i == indirect_blks) ? ar.len : 1, 0);
++ ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i],
++ (i == indirect_blks) ? ar->len : 1, 0);
+ }
+ return err;
+ }
+@@ -419,9 +406,9 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
+ * inode (->i_blocks, etc.). In case of success we end up with the full
+ * chain to new block and return 0.
+ */
+-static int ext4_splice_branch(handle_t *handle, struct inode *inode,
+- ext4_lblk_t block, Indirect *where, int num,
+- int blks)
++static int ext4_splice_branch(handle_t *handle,
++ struct ext4_allocation_request *ar,
++ Indirect *where, int num)
+ {
+ int i;
+ int err = 0;
+@@ -446,9 +433,9 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
+ * Update the host buffer_head or inode to point to more just allocated
+ * direct blocks blocks
+ */
+- if (num == 0 && blks > 1) {
++ if (num == 0 && ar->len > 1) {
+ current_block = le32_to_cpu(where->key) + 1;
+- for (i = 1; i < blks; i++)
++ for (i = 1; i < ar->len; i++)
+ *(where->p + i) = cpu_to_le32(current_block++);
+ }
+
+@@ -465,14 +452,14 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
+ */
+ jbd_debug(5, "splicing indirect only\n");
+ BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
+- err = ext4_handle_dirty_metadata(handle, inode, where->bh);
++ err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh);
+ if (err)
+ goto err_out;
+ } else {
+ /*
+ * OK, we spliced it into the inode itself on a direct block.
+ */
+- ext4_mark_inode_dirty(handle, inode);
++ ext4_mark_inode_dirty(handle, ar->inode);
+ jbd_debug(5, "splicing direct\n");
+ }
+ return err;
+@@ -484,11 +471,11 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
+ * need to revoke the block, which is why we don't
+ * need to set EXT4_FREE_BLOCKS_METADATA.
+ */
+- ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
++ ext4_free_blocks(handle, ar->inode, where[i].bh, 0, 1,
+ EXT4_FREE_BLOCKS_FORGET);
+ }
+- ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
+- blks, 0);
++ ext4_free_blocks(handle, ar->inode, NULL, le32_to_cpu(where[num].key),
++ ar->len, 0);
+
+ return err;
+ }
+@@ -525,11 +512,11 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map,
+ int flags)
+ {
++ struct ext4_allocation_request ar;
+ int err = -EIO;
+ ext4_lblk_t offsets[4];
+ Indirect chain[4];
+ Indirect *partial;
+- ext4_fsblk_t goal;
+ int indirect_blks;
+ int blocks_to_boundary = 0;
+ int depth;
+@@ -579,7 +566,14 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
+ return -ENOSPC;
+ }
+
+- goal = ext4_find_goal(inode, map->m_lblk, partial);
++ /* Set up for the direct block allocation */
++ memset(&ar, 0, sizeof(ar));
++ ar.inode = inode;
++ ar.logical = map->m_lblk;
++ if (S_ISREG(inode->i_mode))
++ ar.flags = EXT4_MB_HINT_DATA;
++
++ ar.goal = ext4_find_goal(inode, map->m_lblk, partial);
+
+ /* the number of blocks need to allocate for [d,t]indirect blocks */
+ indirect_blks = (chain + depth) - partial - 1;
+@@ -588,13 +582,13 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
+ * Next look up the indirect map to count the totoal number of
+ * direct blocks to allocate for this branch.
+ */
+- count = ext4_blks_to_allocate(partial, indirect_blks,
+- map->m_len, blocks_to_boundary);
++ ar.len = ext4_blks_to_allocate(partial, indirect_blks,
++ map->m_len, blocks_to_boundary);
++
+ /*
+ * Block out ext4_truncate while we alter the tree
+ */
+- err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
+- &count, goal,
++ err = ext4_alloc_branch(handle, &ar, indirect_blks,
+ offsets + (partial - chain), partial);
+
+ /*
+@@ -605,14 +599,14 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
+ * may need to return -EAGAIN upwards in the worst case. --sct
+ */
+ if (!err)
+- err = ext4_splice_branch(handle, inode, map->m_lblk,
+- partial, indirect_blks, count);
++ err = ext4_splice_branch(handle, &ar, partial, indirect_blks);
+ if (err)
+ goto cleanup;
+
+ map->m_flags |= EXT4_MAP_NEW;
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
++ count = ar.len;
+ got_it:
+ map->m_flags |= EXT4_MAP_MAPPED;
+ map->m_pblk = le32_to_cpu(chain[depth-1].key);
+--
+2.7.4
+
+From e3cf5d5d9a86df1c5e413bdd3725c25a16ff854c Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Thu, 4 Sep 2014 18:07:25 -0400
+Subject: [PATCH] ext4: prepare to drop EXT4_STATE_DELALLOC_RESERVED
+
+The EXT4_STATE_DELALLOC_RESERVED flag was originally implemented
+because it was too hard to make sure the mballoc and get_block flags
+could be reliably passed down through all of the codepaths that end up
+calling ext4_mb_new_blocks().
+
+Since then, we have mb_flags passed down through most of the code
+paths, so getting rid of EXT4_STATE_DELALLOC_RESERVED isn't as tricky
+as it used to be.
+
+This commit plumbs in the last of what is required, and then adds a
+WARN_ON check to make sure we haven't missed anything. If this passes
+a full regression test run, we can then drop
+EXT4_STATE_DELALLOC_RESERVED.
+
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Reviewed-by: Jan Kara <jack@suse.cz>
+---
+ fs/ext4/balloc.c | 3 +--
+ fs/ext4/extents.c | 6 +++++-
+ fs/ext4/indirect.c | 6 +++++-
+ fs/ext4/mballoc.c | 10 ++++++----
+ 5 files changed, 17 insertions(+), 14 deletions(-)
+
+diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
+index 581ef40..d70f154 100644
+--- a/fs/ext4/balloc.c
++++ b/fs/ext4/balloc.c
+@@ -636,8 +636,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
+ * Account for the allocated meta blocks. We will never
+ * fail EDQUOT for metdata, but we do account for it.
+ */
+- if (!(*errp) &&
+- ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
++ if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) {
+ dquot_alloc_block_nofail(inode,
+ EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
+ }
+diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
+index 3ac1686..8170b32 100644
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -1933,6 +1933,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
+ ext4_lblk_t next;
+ int mb_flags = 0, unwritten;
+
++ if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
++ mb_flags |= EXT4_MB_DELALLOC_RESERVED;
+ if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
+ EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
+ return -EIO;
+@@ -2054,7 +2056,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
+ * We're gonna add a new leaf in the tree.
+ */
+ if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+- mb_flags = EXT4_MB_USE_RESERVED;
++ mb_flags |= EXT4_MB_USE_RESERVED;
+ err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
+ ppath, newext);
+ if (err)
+@@ -4438,6 +4440,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
+ ar.flags = 0;
+ if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
+ ar.flags |= EXT4_MB_HINT_NOPREALLOC;
++ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
++ ar.flags |= EXT4_MB_DELALLOC_RESERVED;
+ newblock = ext4_mb_new_blocks(handle, &ar, &err);
+ if (!newblock)
+ goto out2;
+diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
+index 69af0cd..36b3696 100644
+--- a/fs/ext4/indirect.c
++++ b/fs/ext4/indirect.c
+@@ -333,7 +333,9 @@ static int ext4_alloc_branch(handle_t *handle,
+ new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err);
+ } else
+ ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle,
+- ar->inode, ar->goal, 0, NULL, &err);
++ ar->inode, ar->goal,
++ ar->flags & EXT4_MB_DELALLOC_RESERVED,
++ NULL, &err);
+ if (err) {
+ i--;
+ goto failed;
+@@ -572,6 +574,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
+ ar.logical = map->m_lblk;
+ if (S_ISREG(inode->i_mode))
+ ar.flags = EXT4_MB_HINT_DATA;
++ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
++ ar.flags |= EXT4_MB_DELALLOC_RESERVED;
+
+ ar.goal = ext4_find_goal(inode, map->m_lblk, partial);
+
+diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
+index 8b0f9ef..15dffda 100644
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -4415,9 +4415,12 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
+ * EDQUOT check, as blocks and quotas have been already
+ * reserved when data being copied into pagecache.
+ */
+- if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
++ if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) {
++ WARN_ON((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0);
+ ar->flags |= EXT4_MB_DELALLOC_RESERVED;
+- else {
++ }
++
++ if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
+ /* Without delayed allocation we need to verify
+ * there is enough free blocks to do block allocation
+ * and verify allocation doesn't exceed the quota limits.
+@@ -4528,8 +4531,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
+ if (inquota && ar->len < inquota)
+ dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
+ if (!ar->len) {
+- if (!ext4_test_inode_state(ar->inode,
+- EXT4_STATE_DELALLOC_RESERVED))
++ if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
+ /* release all the reserved blocks if non delalloc */
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ reserv_clstrs);
+--
+2.7.4
--- /dev/null
+rhel7/ext4-inode-version.patch
+rhel7/ext4-lookup-dotdot.patch
+rhel6.3/ext4-print-inum-in-htree-warning.patch
+rhel7.4/ext4-prealloc.patch
+rhel7/ext4-mballoc-extra-checks.patch
+rhel7/ext4-misc.patch
+rhel7/ext4-osd-iop-common.patch
+rhel7/ext4-hash-indexed-dir-dotdot-update.patch
+rhel7/ext4-kill-dx-root.patch
+rhel7/ext4-mballoc-pa-free-mismatch.patch
+rhel7.3/ext4-data-in-dirent.patch
+rhel7.2/ext4-large-eas.patch
+rhel7.3/ext4-disable-mb-cache.patch
+rhel7/ext4-nocmtime.patch
+rhel7.4/ext4-large-dir.patch
+rhel7.4/ext4-pdirop.patch
+rhel7/ext4-max-dir-size.patch
+rhel7/ext4-remove-truncate-warning.patch
+rhel7.3/ext4-corrupted-inode-block-bitmaps-handling-patches.patch
+rhel7/ext4-give-warning-with-dir-htree-growing.patch
+rhel7/ext4-mmp-brelse.patch
+rhel7/ext4-jcb-optimization.patch
+rhel7/ext4_s_max_ext_tree_depth.patch
+rhel7.4/ext4-remove-i_data_sem-from-xattr.patch
+rhel7/ext4-projid-ignore-maxquotas.patch
+rhel7/ext4-projid-feature-support.patch
+rhel7/ext4-projid-quotas.patch
+rhel7/ext4-projid-xfs-ioctls.patch
+rhel7.4/ext4-fix-xattr-shifting-when-expanding-inodes.patch
+rhel7.4/ext4-attach-jinode-in-writepages.patch
+rhel6.3/ext4-dont-check-in-ro.patch
+rhel7.4/ext4-dont-check-before-replay.patch
2.6.32-573.26.1.el6 (RHEL6.7)
2.6.32-642.15.1.el6 (RHEL6.8)
2.6.32-696.6.3.el6 (RHEL6.9)
- 3.10.0-514.26.2.el7 (RHEL7.3)
+ 3.10.0-693.el7 (RHEL7.4)
3.0.101-0.47.71 (SLES11 SP3)
3.0.101-107 (SLES11 SP4)
3.12.74-60.64.40 (SLES12 SP1)
2.6.32-573.26.1.el6 (RHEL6.7)
2.6.32-642.15.1.el6 (RHEL6.8)
2.6.32-696.6.3.el6 (RHEL6.9)
- 3.10.0-514.26.2.el7 (RHEL7.3)
+ 3.10.0-693.el7 (RHEL7.4)
3.0.101-0.47.71 (SLES11 SP3)
3.0.101-107 (SLES11 SP4)
3.12.74-60.64.40 (SLES12 SP1)
CONFIG_STACKTRACE_SUPPORT=y
CONFIG_HAVE_LATENCYTOP_SUPPORT=y
CONFIG_MMU=y
+CONFIG_ARCH_MMAP_RND_BITS_MIN=28
+CONFIG_ARCH_MMAP_RND_BITS_MAX=32
+CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8
+CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16
CONFIG_NEED_DMA_MAP_STATE=y
CONFIG_NEED_SG_DMA_LENGTH=y
CONFIG_GENERIC_ISA_DMA=y
CONFIG_SLUB=y
CONFIG_PROFILING=y
CONFIG_TRACEPOINTS=y
+CONFIG_CRASH_CORE=y
+CONFIG_KEXEC_CORE=y
CONFIG_OPROFILE=m
CONFIG_OPROFILE_EVENT_MULTIPLEX=y
CONFIG_HAVE_OPROFILE=y
CONFIG_HAVE_ARCH_HUGE_VMAP=y
CONFIG_MODULES_USE_ELF_RELA=y
CONFIG_HAVE_STACK_VALIDATION=y
+CONFIG_ARCH_HAS_ELF_RANDOMIZE=y
+CONFIG_HAVE_ARCH_MMAP_RND_BITS=y
+CONFIG_ARCH_MMAP_RND_BITS=28
+CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y
+CONFIG_ARCH_MMAP_RND_COMPAT_BITS=8
CONFIG_OLD_SIGSUSPEND3=y
CONFIG_COMPAT_OLD_SIGACTION=y
CONFIG_PREEMPT_NOTIFIERS=y
CONFIG_PADATA=y
CONFIG_ASN1=y
-CONFIG_UNINLINE_SPIN_UNLOCK=y
CONFIG_INLINE_SPIN_UNLOCK_IRQ=y
CONFIG_INLINE_READ_UNLOCK=y
CONFIG_INLINE_READ_UNLOCK_IRQ=y
CONFIG_INLINE_WRITE_UNLOCK=y
CONFIG_INLINE_WRITE_UNLOCK_IRQ=y
CONFIG_MUTEX_SPIN_ON_OWNER=y
+CONFIG_RWSEM_SPIN_ON_OWNER=y
+CONFIG_LOCK_SPIN_ON_OWNER=y
+CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y
+CONFIG_QUEUED_SPINLOCKS=y
+CONFIG_ARCH_USE_QUEUED_RWLOCKS=y
+CONFIG_QUEUED_RWLOCKS=y
CONFIG_FREEZER=y
#
CONFIG_SMP=y
CONFIG_X86_X2APIC=y
CONFIG_X86_MPPARSE=y
+CONFIG_INTEL_RDT_A=y
CONFIG_X86_EXTENDED_PLATFORM=y
# CONFIG_X86_NUMACHIP is not set
# CONFIG_X86_VSMP is not set
CONFIG_X86_UV=y
CONFIG_X86_INTEL_LPSS=y
+CONFIG_X86_AMD_PLATFORM_DEVICE=y
CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y
CONFIG_SCHED_OMIT_FRAME_POINTER=y
CONFIG_HYPERVISOR_GUEST=y
CONFIG_PARAVIRT=y
# CONFIG_PARAVIRT_DEBUG is not set
CONFIG_PARAVIRT_SPINLOCKS=y
+# CONFIG_QUEUED_LOCK_STAT is not set
CONFIG_XEN=y
# CONFIG_XEN_DOM0 is not set
# CONFIG_XEN_PRIVILEGED_GUEST is not set
CONFIG_X86_MCE_THRESHOLD=y
CONFIG_X86_MCE_INJECT=m
CONFIG_X86_THERMAL_VECTOR=y
+
+#
+# Performance monitoring
+#
+CONFIG_PERF_EVENTS_INTEL_UNCORE=y
+CONFIG_PERF_EVENTS_INTEL_RAPL=y
CONFIG_I8K=m
CONFIG_MICROCODE=y
CONFIG_MICROCODE_INTEL=y
CONFIG_MIGRATION=y
CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y
CONFIG_HMM=y
-CONFIG_HMM_MIGRATE=y
CONFIG_HMM_MIRROR=y
CONFIG_PHYS_ADDR_T_64BIT=y
CONFIG_ZONE_DMA_FLAG=1
CONFIG_KEXEC_JUMP=y
CONFIG_PHYSICAL_START=0x1000000
CONFIG_RELOCATABLE=y
-CONFIG_PHYSICAL_ALIGN=0x1000000
+CONFIG_RANDOMIZE_BASE=y
+CONFIG_X86_NEED_RELOCS=y
+CONFIG_PHYSICAL_ALIGN=0x200000
+CONFIG_RANDOMIZE_MEMORY=y
+CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING=0xa
CONFIG_HOTPLUG_CPU=y
CONFIG_BOOTPARAM_HOTPLUG_CPU0=y
# CONFIG_DEBUG_HOTPLUG_CPU0 is not set
# CONFIG_COMPAT_VDSO is not set
# CONFIG_CMDLINE_BOOL is not set
+CONFIG_ARCH_HAS_ADD_PAGES=y
CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y
CONFIG_USE_PERCPU_NUMA_NODE_ID=y
# CONFIG_PM_WAKELOCKS is not set
CONFIG_PM_RUNTIME=y
CONFIG_PM=y
-# CONFIG_PM_DEBUG is not set
+CONFIG_PM_DEBUG=y
+CONFIG_PM_ADVANCED_DEBUG=y
+# CONFIG_PM_TEST_SUSPEND is not set
+CONFIG_PM_SLEEP_DEBUG=y
+CONFIG_PM_TRACE=y
+CONFIG_PM_TRACE_RTC=y
CONFIG_PM_CLK=y
# CONFIG_WQ_POWER_EFFICIENT_DEFAULT is not set
CONFIG_ACPI=y
#
CONFIG_BINFMT_ELF=y
CONFIG_COMPAT_BINFMT_ELF=y
-CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE=y
CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
CONFIG_BINFMT_SCRIPT=y
# CONFIG_HAVE_AOUT is not set
CONFIG_NF_CONNTRACK_TIMEOUT=y
CONFIG_NF_CONNTRACK_TIMESTAMP=y
CONFIG_NF_CONNTRACK_LABELS=y
-CONFIG_NF_CT_PROTO_DCCP=m
+CONFIG_NF_CT_PROTO_DCCP=y
CONFIG_NF_CT_PROTO_GRE=m
-CONFIG_NF_CT_PROTO_SCTP=m
-CONFIG_NF_CT_PROTO_UDPLITE=m
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
CONFIG_NETFILTER_NETLINK_QUEUE_CT=y
CONFIG_NF_NAT=m
CONFIG_NF_NAT_NEEDED=y
-CONFIG_NF_NAT_PROTO_DCCP=m
-CONFIG_NF_NAT_PROTO_UDPLITE=m
-CONFIG_NF_NAT_PROTO_SCTP=m
+CONFIG_NF_NAT_PROTO_DCCP=y
+CONFIG_NF_NAT_PROTO_UDPLITE=y
+CONFIG_NF_NAT_PROTO_SCTP=y
CONFIG_NF_NAT_AMANDA=m
CONFIG_NF_NAT_FTP=m
CONFIG_NF_NAT_IRC=m
CONFIG_IP6_NF_SECURITY=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
-# CONFIG_IP6_NF_TARGET_NPT is not set
+CONFIG_IP6_NF_TARGET_NPT=m
CONFIG_NF_TABLES_BRIDGE=m
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
-# CONFIG_NF_LOG_BRIDGE is not set
+CONFIG_NF_LOG_BRIDGE=m
CONFIG_BRIDGE_NF_EBTABLES=m
CONFIG_BRIDGE_EBT_BROUTE=m
CONFIG_BRIDGE_EBT_T_FILTER=m
CONFIG_NET_CLS_FLOW=m
CONFIG_NET_CLS_CGROUP=y
CONFIG_NET_CLS_BPF=m
+CONFIG_NET_CLS_FLOWER=m
+CONFIG_NET_CLS_MATCHALL=m
CONFIG_NET_EMATCH=y
CONFIG_NET_EMATCH_STACK=32
CONFIG_NET_EMATCH_CMP=m
CONFIG_NET_ACT_SIMP=m
CONFIG_NET_ACT_SKBEDIT=m
CONFIG_NET_ACT_CSUM=m
+CONFIG_NET_ACT_VLAN=m
+CONFIG_NET_ACT_TUNNEL_KEY=m
CONFIG_NET_CLS_IND=y
CONFIG_NET_SCH_FIFO=y
CONFIG_DCB=y
CONFIG_OPENVSWITCH_GENEVE=m
CONFIG_VSOCKETS=m
CONFIG_VMWARE_VMCI_VSOCKETS=m
+CONFIG_VIRTIO_VSOCKETS=m
+CONFIG_VIRTIO_VSOCKETS_COMMON=m
CONFIG_NETLINK_MMAP=y
CONFIG_NETLINK_DIAG=m
CONFIG_NET_MPLS_GSO=m
+CONFIG_NET_SWITCHDEV=y
CONFIG_RPS=y
CONFIG_RFS_ACCEL=y
CONFIG_XPS=y
# CONFIG_BT_HCIUART_BCM is not set
# CONFIG_BT_HCIUART_QCA is not set
# CONFIG_BT_HCIUART_AG6XX is not set
+# CONFIG_BT_HCIUART_MRVL is not set
CONFIG_BT_HCIBCM203X=m
CONFIG_BT_HCIBPA10X=m
CONFIG_BT_HCIBFUSB=m
CONFIG_LWTUNNEL=y
CONFIG_DST_CACHE=y
CONFIG_NET_DEVLINK=m
+CONFIG_MAY_USE_DEVLINK=m
CONFIG_HAVE_BPF_JIT=y
#
# CONFIG_BLK_DEV_HD is not set
CONFIG_BLK_DEV_RBD=m
# CONFIG_BLK_DEV_RSXX is not set
+CONFIG_NVME_CORE=m
CONFIG_BLK_DEV_NVME=m
+CONFIG_BLK_DEV_NVME_SCSI=y
+CONFIG_NVME_FABRICS=m
+CONFIG_NVME_RDMA=m
+CONFIG_NVME_FC=m
+CONFIG_NVME_TARGET=m
+CONFIG_NVME_TARGET_LOOP=m
+CONFIG_NVME_TARGET_RDMA=m
+CONFIG_NVME_TARGET_FC=m
+CONFIG_NVME_TARGET_FCLOOP=m
#
# Misc devices
CONFIG_SCSI_MULTI_LUN=y
CONFIG_SCSI_CONSTANTS=y
CONFIG_SCSI_LOGGING=y
-CONFIG_SCSI_MAX_SG_SEGMENTS=128
CONFIG_SCSI_SCAN_ASYNC=y
#
CONFIG_SCSI_QLA_FC=m
# CONFIG_TCM_QLA2XXX is not set
CONFIG_SCSI_QLA_ISCSI=m
+CONFIG_QEDI=m
+CONFIG_QEDF=m
CONFIG_SCSI_LPFC=m
# CONFIG_SCSI_LPFC_DEBUG_FS is not set
# CONFIG_SCSI_DC395x is not set
# CONFIG_DM_MQ_DEFAULT is not set
CONFIG_DM_DEBUG=y
CONFIG_DM_BUFIO=m
-# CONFIG_DM_DEBUG_BLOCK_STACK_TRACING is not set
+# CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING is not set
CONFIG_DM_BIO_PRISON=m
CONFIG_DM_PERSISTENT_DATA=m
CONFIG_DM_CRYPT=m
CONFIG_DM_THIN_PROVISIONING=m
CONFIG_DM_CACHE=m
CONFIG_DM_CACHE_SMQ=m
-CONFIG_DM_CACHE_CLEANER=m
CONFIG_DM_ERA=m
CONFIG_DM_MIRROR=m
CONFIG_DM_LOG_USERSPACE=m
#
CONFIG_VHOST_NET=m
# CONFIG_VHOST_SCSI is not set
+CONFIG_VHOST_VSOCK=m
CONFIG_VHOST_RING=m
CONFIG_VHOST=m
# CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set
# CONFIG_NET_VENDOR_3COM is not set
# CONFIG_NET_VENDOR_ADAPTEC is not set
# CONFIG_NET_VENDOR_ALTEON is not set
-# CONFIG_NET_VENDOR_AMD is not set
+CONFIG_NET_VENDOR_AMAZON=y
+CONFIG_ENA_ETHERNET=m
+CONFIG_NET_VENDOR_AMD=y
+CONFIG_AMD8111_ETH=m
+CONFIG_PCNET32=m
+# CONFIG_AMD_XGBE is not set
+# CONFIG_AMD_XGBE_HAVE_ECC is not set
+CONFIG_NET_VENDOR_AQUANTIA=y
+CONFIG_AQTION=m
CONFIG_NET_VENDOR_ATHEROS=y
CONFIG_ATL2=m
CONFIG_ATL1=m
CONFIG_TIGON3=m
CONFIG_BNX2X=m
CONFIG_BNX2X_SRIOV=y
-# CONFIG_BNX2X_VXLAN is not set
-# CONFIG_BNX2X_GENEVE is not set
CONFIG_BNXT=m
CONFIG_BNXT_SRIOV=y
+CONFIG_BNXT_DCB=y
CONFIG_NET_VENDOR_BROCADE=y
CONFIG_BNA=m
CONFIG_NET_CALXEDA_XGMAC=m
CONFIG_CHELSIO_T3=m
CONFIG_CHELSIO_T4=m
# CONFIG_CHELSIO_T4_DCB is not set
-CONFIG_CHELSIO_T4_UWIRE=y
CONFIG_CHELSIO_T4VF=m
+CONFIG_CHELSIO_LIB=m
CONFIG_NET_VENDOR_CISCO=y
CONFIG_ENIC=m
CONFIG_DNET=m
CONFIG_NET_VENDOR_EMULEX=y
CONFIG_BE2NET=m
CONFIG_BE2NET_HWMON=y
-CONFIG_BE2NET_VXLAN=y
# CONFIG_NET_VENDOR_EXAR is not set
# CONFIG_NET_VENDOR_HP is not set
CONFIG_NET_VENDOR_INTEL=y
CONFIG_IGBVF=m
# CONFIG_IXGB is not set
CONFIG_IXGBE=m
-# CONFIG_IXGBE_VXLAN is not set
CONFIG_IXGBE_HWMON=y
CONFIG_IXGBE_DCA=y
CONFIG_IXGBE_DCB=y
CONFIG_NET_VENDOR_MELLANOX=y
CONFIG_MLX4_EN=m
CONFIG_MLX4_EN_DCB=y
-CONFIG_MLX4_EN_VXLAN=y
CONFIG_MLX4_CORE=m
CONFIG_MLX4_DEBUG=y
CONFIG_MLX5_CORE=m
CONFIG_MLX5_CORE_EN=y
CONFIG_MLX5_CORE_EN_DCB=y
-CONFIG_MLX5_CORE_EN_VXLAN=y
+CONFIG_MLXSW_CORE=m
+CONFIG_MLXSW_CORE_HWMON=y
+CONFIG_MLXSW_CORE_THERMAL=y
+CONFIG_MLXSW_PCI=m
+CONFIG_MLXSW_I2C=m
+CONFIG_MLXSW_SWITCHIB=m
+CONFIG_MLXSW_SWITCHX2=m
+CONFIG_MLXSW_SPECTRUM=m
+CONFIG_MLXSW_SPECTRUM_DCB=y
+CONFIG_MLXSW_MINIMAL=m
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_MICROCHIP is not set
CONFIG_NET_VENDOR_MYRI=y
CONFIG_MYRI10GE_DCA=y
# CONFIG_FEALNX is not set
# CONFIG_NET_VENDOR_NATSEMI is not set
+CONFIG_NET_VENDOR_NETRONOME=y
+CONFIG_NFP=m
+# CONFIG_NFP_DEBUG is not set
# CONFIG_NET_VENDOR_NVIDIA is not set
CONFIG_NET_VENDOR_OKI=y
CONFIG_PCH_GBE=m
CONFIG_QLGE=m
CONFIG_NETXEN_NIC=m
CONFIG_QED=m
+CONFIG_QED_LL2=y
CONFIG_QED_SRIOV=y
CONFIG_QEDE=m
# CONFIG_QEDE_VXLAN is not set
# CONFIG_QEDE_GENEVE is not set
+CONFIG_QED_RDMA=y
+CONFIG_QED_ISCSI=y
+CONFIG_QED_FCOE=y
CONFIG_NET_VENDOR_REALTEK=y
# CONFIG_ATP is not set
CONFIG_8139CP=m
# CONFIG_8139_OLD_RX_RESET is not set
CONFIG_R8169=m
# CONFIG_NET_VENDOR_RDC is not set
+CONFIG_NET_VENDOR_ROCKER=y
+CONFIG_ROCKER=m
# CONFIG_NET_VENDOR_SEEQ is not set
# CONFIG_NET_VENDOR_SILAN is not set
# CONFIG_NET_VENDOR_SIS is not set
+CONFIG_NET_VENDOR_SOLARFLARE=y
CONFIG_SFC=m
CONFIG_SFC_MTD=y
CONFIG_SFC_MCDI_MON=y
CONFIG_SFC_SRIOV=y
CONFIG_SFC_MCDI_LOGGING=y
+CONFIG_SFC_FALCON=m
+CONFIG_SFC_FALCON_MTD=y
CONFIG_NET_VENDOR_SMSC=y
CONFIG_EPIC100=m
CONFIG_SMSC9420=m
# CONFIG_ATH5K_PCI is not set
CONFIG_ATH9K_HW=m
CONFIG_ATH9K_COMMON=m
+CONFIG_ATH9K_COMMON_DEBUG=y
CONFIG_ATH9K_BTCOEX_SUPPORT=y
CONFIG_ATH9K=m
CONFIG_ATH9K_PCI=y
CONFIG_ATH9K_PCOEM=y
CONFIG_ATH9K_HTC=m
# CONFIG_ATH9K_HTC_DEBUGFS is not set
-CONFIG_ATH9K_HWRNG=y
+# CONFIG_ATH9K_HWRNG is not set
CONFIG_CARL9170=m
CONFIG_CARL9170_LEDS=y
# CONFIG_CARL9170_DEBUGFS is not set
CONFIG_IWLMVM=m
CONFIG_IWLWIFI_OPMODE_MODULAR=y
# CONFIG_IWLWIFI_BCAST_FILTERING is not set
-# CONFIG_IWLWIFI_PCIE_RTPM is not set
#
# Debugging Options
# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
-# CONFIG_INPUT_JOYDEV is not set
+CONFIG_INPUT_JOYDEV=m
CONFIG_INPUT_EVDEV=y
# CONFIG_INPUT_EVBUG is not set
# CONFIG_IPMI_PANIC_EVENT is not set
CONFIG_IPMI_DEVICE_INTERFACE=m
CONFIG_IPMI_SI=m
-# CONFIG_IPMI_SI_PROBE_DEFAULTS is not set
CONFIG_IPMI_SSIF=m
CONFIG_IPMI_WATCHDOG=m
CONFIG_IPMI_POWEROFF=m
CONFIG_HANGCHECK_TIMER=m
CONFIG_UV_MMTIMER=m
CONFIG_TCG_TPM=y
+CONFIG_TCG_TIS_CORE=y
CONFIG_TCG_TIS=y
+# CONFIG_TCG_TIS_SPI is not set
CONFIG_TCG_TIS_I2C_ATMEL=m
CONFIG_TCG_TIS_I2C_INFINEON=m
CONFIG_TCG_TIS_I2C_NUVOTON=m
CONFIG_TCG_INFINEON=m
# CONFIG_TCG_XEN is not set
CONFIG_TCG_CRB=m
+# CONFIG_TCG_VTPM_PROXY is not set
CONFIG_TCG_TIS_ST33ZP24=m
CONFIG_TCG_TIS_ST33ZP24_I2C=m
+# CONFIG_TCG_TIS_ST33ZP24_SPI is not set
CONFIG_TELCLOCK=m
CONFIG_DEVPORT=y
CONFIG_HMC_DRV=m
CONFIG_PTP_1588_CLOCK=m
CONFIG_DP83640_PHY=m
CONFIG_PTP_1588_CLOCK_PCH=m
+CONFIG_PTP_1588_CLOCK_KVM=m
CONFIG_PINCTRL=y
#
# Pin controllers
#
# CONFIG_PINMUX is not set
-# CONFIG_PINCONF is not set
+CONFIG_PINCONF=y
+CONFIG_GENERIC_PINCONF=y
# CONFIG_DEBUG_PINCTRL is not set
+CONFIG_PINCTRL_AMD=m
# CONFIG_PINCTRL_EXYNOS5440 is not set
CONFIG_PINCTRL_BAYTRAIL=y
+# CONFIG_PINCTRL_SUNRISEPOINT is not set
CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y
CONFIG_GPIOLIB=y
CONFIG_GPIO_DEVRES=y
CONFIG_GPIO_ACPI=y
+CONFIG_GPIOLIB_IRQCHIP=y
# CONFIG_DEBUG_GPIO is not set
CONFIG_GPIO_SYSFS=y
+CONFIG_GPIO_AMDPT=m
#
# Memory mapped GPIO drivers:
CONFIG_SENSORS_IBMAEM=m
CONFIG_SENSORS_IBMPEX=m
CONFIG_SENSORS_IT87=m
-# CONFIG_SENSORS_JC42 is not set
+CONFIG_SENSORS_JC42=m
CONFIG_SENSORS_LINEAGE=m
CONFIG_SENSORS_LM63=m
# CONFIG_SENSORS_LM70 is not set
CONFIG_W83977F_WDT=m
CONFIG_MACHZ_WDT=m
# CONFIG_SBC_EPX_C3_WATCHDOG is not set
+CONFIG_INTEL_MEI_WDT=m
CONFIG_XEN_WDT=m
#
CONFIG_VGA_ARB_MAX_GPUS=64
CONFIG_VGA_SWITCHEROO=y
CONFIG_DRM=m
-CONFIG_DRM_MIPI_DSI=y
-# CONFIG_DRM_DP_AUX_CHARDEV is not set
+CONFIG_DRM_DP_AUX_CHARDEV=y
CONFIG_DRM_KMS_HELPER=m
CONFIG_DRM_KMS_FB_HELPER=y
CONFIG_DRM_FBDEV_EMULATION=y
CONFIG_DRM_I2C_CH7006=m
CONFIG_DRM_I2C_SIL164=m
# CONFIG_DRM_I2C_NXP_TDA998X is not set
-# CONFIG_DRM_TDFX is not set
-# CONFIG_DRM_R128 is not set
CONFIG_DRM_RADEON=m
# CONFIG_DRM_RADEON_USERPTR is not set
CONFIG_DRM_AMDGPU=m
+# CONFIG_DRM_AMDGPU_SI is not set
# CONFIG_DRM_AMDGPU_CIK is not set
# CONFIG_DRM_AMDGPU_USERPTR is not set
-CONFIG_DRM_AMD_POWERPLAY=y
+# CONFIG_DRM_AMDGPU_GART_DEBUGFS is not set
#
# ACP (Audio CoProcessor) Configuration
CONFIG_NOUVEAU_DEBUG=5
CONFIG_NOUVEAU_DEBUG_DEFAULT=3
CONFIG_DRM_NOUVEAU_BACKLIGHT=y
-# CONFIG_DRM_I810 is not set
CONFIG_DRM_I915=m
-# CONFIG_DRM_I915_PRELIMINARY_HW_SUPPORT is not set
+# CONFIG_DRM_I915_ALPHA_SUPPORT is not set
+CONFIG_DRM_I915_CAPTURE_ERROR=y
+CONFIG_DRM_I915_COMPRESS_ERROR=y
CONFIG_DRM_I915_USERPTR=y
-# CONFIG_DRM_MGA is not set
-# CONFIG_DRM_SIS is not set
-# CONFIG_DRM_VIA is not set
-# CONFIG_DRM_SAVAGE is not set
+CONFIG_DRM_I915_GVT=y
+CONFIG_DRM_I915_GVT_KVMGT=m
CONFIG_DRM_VMWGFX=m
CONFIG_DRM_VMWGFX_FBCON=y
CONFIG_DRM_GMA500=m
CONFIG_DRM_QXL=m
CONFIG_DRM_BOCHS=m
CONFIG_DRM_VIRTIO_GPU=m
-CONFIG_DRM_PANEL=y
-
-#
-# Display Panels
-#
CONFIG_HSA_AMD=m
+# CONFIG_DRM_LEGACY is not set
# CONFIG_VGASTATE is not set
CONFIG_VIDEO_OUTPUT_CONTROL=m
CONFIG_HDMI=y
CONFIG_SND_SOC_INTEL_SST=m
CONFIG_SND_SOC_INTEL_SST_ACPI=m
CONFIG_SND_SOC_INTEL_SST_MATCH=m
+# CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_MACH is not set
+# CONFIG_SND_SOC_INTEL_BXT_RT298_MACH is not set
CONFIG_SND_SOC_INTEL_BYTCR_RT5640_MACH=m
CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=m
CONFIG_SND_SOC_INTEL_CHT_BSW_RT5672_MACH=m
# CONFIG_USB_OTG is not set
# CONFIG_USB_OTG_WHITELIST is not set
# CONFIG_USB_OTG_FSM is not set
-# CONFIG_USB_ULPI_BUS is not set
+CONFIG_USB_LEDS_TRIGGER_USBPORT=m
CONFIG_USB_MON=y
CONFIG_USB_WUSB=m
CONFIG_USB_WUSB_CBAF=m
#
CONFIG_USB_MDC800=m
CONFIG_USB_MICROTEK=m
+CONFIG_USBIP_CORE=m
+# CONFIG_USBIP_VHCI_HCD is not set
+# CONFIG_USBIP_HOST is not set
+# CONFIG_USBIP_DEBUG is not set
# CONFIG_USB_DWC3 is not set
# CONFIG_USB_CHIPIDEA is not set
# CONFIG_USB_RIO500 is not set
CONFIG_USB_LEGOTOWER=m
CONFIG_USB_LCD=m
-CONFIG_USB_LED=m
# CONFIG_USB_CYPRESS_CY7C63 is not set
# CONFIG_USB_CYTHERM is not set
CONFIG_USB_IDMOUSE=m
# CONFIG_USB_YUREX is not set
CONFIG_USB_EZUSB_FX2=m
CONFIG_USB_HSIC_USB3503=m
+# CONFIG_USB_HSIC_USB4604 is not set
# CONFIG_USB_LINK_LAYER_TEST is not set
# CONFIG_USB_CHAOSKEY is not set
+# CONFIG_UCSI is not set
CONFIG_USB_ATM=m
CONFIG_USB_SPEEDTOUCH=m
CONFIG_USB_CXACRU=m
# CONFIG_USB_PHY is not set
# CONFIG_USB_GADGET is not set
# CONFIG_USB_LED_TRIG is not set
+# CONFIG_USB_ULPI_BUS is not set
CONFIG_UWB=m
CONFIG_UWB_HWA=m
CONFIG_UWB_WHCI=m
CONFIG_UWB_I1480U=m
CONFIG_MMC=m
# CONFIG_MMC_DEBUG is not set
-
-#
-# MMC/SD/SDIO Card Drivers
-#
CONFIG_MMC_BLOCK=m
CONFIG_MMC_BLOCK_MINORS=8
CONFIG_MMC_BLOCK_BOUNCE=y
CONFIG_INFINIBAND_ISER=m
CONFIG_INFINIBAND_ISERT=m
CONFIG_INFINIBAND_RDMAVT=m
+CONFIG_RDMA_RXE=m
CONFIG_INFINIBAND_HFI1=m
# CONFIG_HFI1_DEBUG_SDMA_ORDER is not set
CONFIG_HFI1_VERBS_31BIT_PSN=y
# CONFIG_SDMA_VERBOSITY is not set
+CONFIG_INFINIBAND_QEDR=m
CONFIG_EDAC=y
CONFIG_EDAC_LEGACY_SYSFS=y
# CONFIG_EDAC_DEBUG is not set
CONFIG_EDAC_I5100=m
CONFIG_EDAC_I7300=m
CONFIG_EDAC_SBRIDGE=m
+CONFIG_EDAC_SKX=m
CONFIG_RTC_LIB=y
CONFIG_RTC_CLASS=y
CONFIG_RTC_HCTOSYS=y
CONFIG_NET_DMA_RH_KABI=y
CONFIG_ASYNC_TX_DMA=y
# CONFIG_DMATEST is not set
+
+#
+# DMABUF options
+#
+CONFIG_SYNC_FILE=y
+# CONFIG_SW_SYNC is not set
CONFIG_DCA=m
CONFIG_AUXDISPLAY=y
CONFIG_KS0108=m
CONFIG_UIO_SERCOS3=m
CONFIG_UIO_PCI_GENERIC=m
# CONFIG_UIO_NETX is not set
+CONFIG_UIO_HV_GENERIC=m
CONFIG_VFIO_IOMMU_TYPE1=m
CONFIG_VFIO=m
CONFIG_VFIO_NOIOMMU=y
# CONFIG_VFIO_PCI_VGA is not set
CONFIG_VFIO_PCI_MMAP=y
CONFIG_VFIO_PCI_INTX=y
+CONFIG_VFIO_MDEV=m
+CONFIG_VFIO_MDEV_DEVICE=m
CONFIG_IRQ_BYPASS_MANAGER=m
# CONFIG_VIRT_DRIVERS is not set
CONFIG_VIRTIO=m
CONFIG_STAGING=y
# CONFIG_ET131X is not set
# CONFIG_SLICOSS is not set
-# CONFIG_USBIP_CORE is not set
# CONFIG_W35UND is not set
# CONFIG_PRISM2_USB is not set
# CONFIG_ECHO is not set
CONFIG_ACERHDF=m
CONFIG_ASUS_LAPTOP=m
CONFIG_CHROMEOS_LAPTOP=m
+CONFIG_DELL_SMBIOS=m
CONFIG_DELL_LAPTOP=m
CONFIG_DELL_WMI=m
CONFIG_DELL_WMI_AIO=m
+CONFIG_DELL_SMO8800=m
+CONFIG_DELL_RBTN=m
CONFIG_FUJITSU_LAPTOP=m
# CONFIG_FUJITSU_LAPTOP_DEBUG is not set
CONFIG_FUJITSU_TABLET=m
CONFIG_ACPI_TOSHIBA=m
CONFIG_TOSHIBA_BT_RFKILL=m
CONFIG_ACPI_CMPC=m
+CONFIG_INTEL_HID_EVENT=m
CONFIG_INTEL_IPS=m
# CONFIG_IBM_RTL is not set
# CONFIG_XO15_EBOOK is not set
# CONFIG_MEMORY is not set
# CONFIG_IIO is not set
CONFIG_NTB=m
+CONFIG_NTB_AMD=m
+# CONFIG_NTB_INTEL is not set
+# CONFIG_NTB_PINGPONG is not set
+# CONFIG_NTB_TOOL is not set
+CONFIG_NTB_PERF=m
+CONFIG_NTB_TRANSPORT=m
# CONFIG_VME_BUS is not set
CONFIG_PWM=y
CONFIG_PWM_SYSFS=y
CONFIG_ND_PFN=m
CONFIG_NVDIMM_PFN=y
CONFIG_NVDIMM_DAX=y
+CONFIG_DEV_DAX=m
+CONFIG_DEV_DAX_PMEM=m
+CONFIG_NR_DEV_DAX=32768
#
# Firmware Drivers
# EFI (Extensible Firmware Interface) Support
#
CONFIG_EFI_VARS=y
+CONFIG_EFI_ESRT=y
CONFIG_EFI_VARS_PSTORE=y
CONFIG_EFI_VARS_PSTORE_DEFAULT_DISABLE=y
CONFIG_EFI_RUNTIME_MAP=y
CONFIG_PROC_VMCORE=y
CONFIG_PROC_SYSCTL=y
CONFIG_PROC_PAGE_MONITOR=y
+CONFIG_KERNFS=y
CONFIG_SYSFS=y
CONFIG_TMPFS=y
CONFIG_TMPFS_POSIX_ACL=y
CONFIG_FUNCTION_GRAPH_TRACER=y
# CONFIG_IRQSOFF_TRACER is not set
CONFIG_SCHED_TRACER=y
+CONFIG_HWLAT_TRACER=y
CONFIG_FTRACE_SYSCALLS=y
CONFIG_TRACER_SNAPSHOT=y
# CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP is not set
# CONFIG_RBTREE_TEST is not set
# CONFIG_INTERVAL_TREE_TEST is not set
# CONFIG_TEST_RHASHTABLE is not set
+# CONFIG_TEST_PARMAN is not set
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
# CONFIG_FIREWIRE_OHCI_REMOTE_DMA is not set
CONFIG_BUILD_DOCSRC=y
CONFIG_SECURITYFS=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_NETWORK_XFRM=y
-# CONFIG_SECURITY_PATH is not set
+CONFIG_SECURITY_PATH=y
CONFIG_SECURITY_SECURELEVEL=y
CONFIG_INTEL_TXT=y
CONFIG_LSM_MMAP_MIN_ADDR=65535
# CONFIG_SECURITY_SMACK is not set
# CONFIG_SECURITY_TOMOYO is not set
# CONFIG_SECURITY_APPARMOR is not set
-# CONFIG_SECURITY_YAMA is not set
+CONFIG_SECURITY_YAMA=y
+CONFIG_SECURITY_YAMA_STACKED=y
CONFIG_INTEGRITY=y
CONFIG_INTEGRITY_SIGNATURE=y
CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y
CONFIG_EVM=y
CONFIG_EVM_HMAC_VERSION=2
CONFIG_DEFAULT_SECURITY_SELINUX=y
+# CONFIG_DEFAULT_SECURITY_YAMA is not set
# CONFIG_DEFAULT_SECURITY_DAC is not set
CONFIG_DEFAULT_SECURITY="selinux"
CONFIG_XOR_BLOCKS=m
CONFIG_CRYPTO_PCOMP2=y
CONFIG_CRYPTO_AKCIPHER2=y
CONFIG_CRYPTO_AKCIPHER=m
-# CONFIG_CRYPTO_RSA is not set
+CONFIG_CRYPTO_KPP2=y
+CONFIG_CRYPTO_KPP=m
+CONFIG_CRYPTO_RSA=m
+CONFIG_CRYPTO_DH=m
+# CONFIG_CRYPTO_ECDH is not set
CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_MANAGER2=y
CONFIG_CRYPTO_USER=m
CONFIG_CRYPTO_SHA256_SSSE3=y
CONFIG_CRYPTO_SHA512_SSSE3=m
CONFIG_CRYPTO_SHA1_MB=m
+CONFIG_CRYPTO_SHA256_MB=m
+CONFIG_CRYPTO_SHA512_MB=m
CONFIG_CRYPTO_SHA256=y
CONFIG_CRYPTO_SHA512=m
CONFIG_CRYPTO_TGR192=m
CONFIG_CRYPTO_DRBG_HASH=y
CONFIG_CRYPTO_DRBG_CTR=y
CONFIG_CRYPTO_DRBG=m
+CONFIG_CRYPTO_JITTERENTROPY=m
CONFIG_CRYPTO_USER_API=y
CONFIG_CRYPTO_USER_API_HASH=y
CONFIG_CRYPTO_USER_API_SKCIPHER=y
+CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_HASH_INFO=y
CONFIG_CRYPTO_HW=y
CONFIG_CRYPTO_DEV_PADLOCK=m
CONFIG_CRYPTO_DEV_PADLOCK_AES=m
CONFIG_CRYPTO_DEV_PADLOCK_SHA=m
+CONFIG_CRYPTO_DEV_CCP=y
+CONFIG_CRYPTO_DEV_CCP_DD=m
CONFIG_CRYPTO_DEV_QAT=m
CONFIG_CRYPTO_DEV_QAT_DH895xCC=m
CONFIG_CRYPTO_DEV_QAT_C3XXX=m
CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m
CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m
CONFIG_CRYPTO_DEV_QAT_C62XVF=m
+CONFIG_CRYPTO_DEV_CHELSIO=m
CONFIG_ASYMMETRIC_KEY_TYPE=y
CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y
CONFIG_PUBLIC_KEY_ALGO_RSA=y
CONFIG_CRC8=m
# CONFIG_RANDOM32_SELFTEST is not set
CONFIG_ZLIB_INFLATE=y
-CONFIG_ZLIB_DEFLATE=m
+CONFIG_ZLIB_DEFLATE=y
CONFIG_LZO_COMPRESS=y
CONFIG_LZO_DECOMPRESS=y
CONFIG_XZ_DEC=y
CONFIG_TEXTSEARCH_BM=m
CONFIG_TEXTSEARCH_FSM=m
CONFIG_INTERVAL_TREE=y
-CONFIG_GENERIC_PAGE_TABLE=y
CONFIG_ASSOCIATIVE_ARRAY=y
CONFIG_HAS_IOMEM=y
CONFIG_HAS_IOPORT=y
CONFIG_SIGNATURE=y
CONFIG_OID_REGISTRY=y
CONFIG_UCS2_STRING=y
+CONFIG_SG_POOL=y
CONFIG_ARCH_HAS_PMEM_API=y
CONFIG_ARCH_HAS_MMIO_FLUSH=y
+CONFIG_PARMAN=m
CONFIG_RH_KABI_SIZE_ALIGN_CHECKS=y
--- /dev/null
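+Make the default block layer limits tunable: expose default_max_sectors
+and default_max_segments as module parameters in place of the built-in
+BLK_DEF_MAX_SECTORS and BLK_MAX_SEGMENTS constants, and raise the mpt
+fusion CONFIG_FUSION_MAX_SGE maximum from 128 to 256, so larger I/O
+sizes can be configured.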
+Index: linux-3.10.0-495.el7.x86_64/block/blk-settings.c
+===================================================================
+--- linux-3.10.0-495.el7.x86_64.orig/block/blk-settings.c
++++ linux-3.10.0-495.el7.x86_64/block/blk-settings.c
+@@ -19,6 +19,12 @@ EXPORT_SYMBOL(blk_max_low_pfn);
+
+ unsigned long blk_max_pfn;
+
++int default_max_sectors = BLK_DEF_MAX_SECTORS;
++module_param(default_max_sectors, int, 0);
++
++int default_max_segments = BLK_MAX_SEGMENTS;
++module_param(default_max_segments, int, 0);
++
+ /**
+ * blk_queue_prep_rq - set a prepare_request function for queue
+ * @q: queue
+@@ -108,7 +114,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
+ */
+ void blk_set_default_limits(struct queue_limits *lim)
+ {
+- lim->max_segments = BLK_MAX_SEGMENTS;
++ lim->max_segments = default_max_segments;
+ lim->max_integrity_segments = 0;
+ lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
+ if (lim->limits_aux)
+@@ -268,7 +274,7 @@ void blk_limits_max_hw_sectors(struct qu
+
+ limits->max_hw_sectors = max_hw_sectors;
+ max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors);
+- max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS);
++ max_sectors = min_t(unsigned int, max_sectors, default_max_sectors);
+ limits->max_sectors = max_sectors;
+ }
+ EXPORT_SYMBOL(blk_limits_max_hw_sectors);
+Index: linux-3.10.0-495.el7.x86_64/drivers/message/fusion/Kconfig
+===================================================================
+--- linux-3.10.0-495.el7.x86_64.orig/drivers/message/fusion/Kconfig
++++ linux-3.10.0-495.el7.x86_64/drivers/message/fusion/Kconfig
+@@ -61,9 +61,9 @@ config FUSION_SAS
+ LSISAS1078
+
+ config FUSION_MAX_SGE
+- int "Maximum number of scatter gather entries (16 - 128)"
+- default "128"
+- range 16 128
++ int "Maximum number of scatter gather entries (16 - 256)"
++ default "256"
++ range 16 256
+ help
+ This option allows you to specify the maximum number of scatter-
+ gather entries per I/O. The driver default is 128, which matches
+Index: linux-3.10.0-495.el7.x86_64/drivers/message/fusion/mptbase.h
+===================================================================
+--- linux-3.10.0-495.el7.x86_64.orig/drivers/message/fusion/mptbase.h
++++ linux-3.10.0-495.el7.x86_64/drivers/message/fusion/mptbase.h
+@@ -166,10 +166,10 @@
+ * Set the MAX_SGE value based on user input.
+ */
+ #ifdef CONFIG_FUSION_MAX_SGE
+-#if CONFIG_FUSION_MAX_SGE < 16
++#if CONFIG_FUSION_MAX_SGE < 16
+ #define MPT_SCSI_SG_DEPTH 16
+-#elif CONFIG_FUSION_MAX_SGE > 128
+-#define MPT_SCSI_SG_DEPTH 128
++#elif CONFIG_FUSION_MAX_SGE > 256
++#define MPT_SCSI_SG_DEPTH 256
+ #else
+ #define MPT_SCSI_SG_DEPTH CONFIG_FUSION_MAX_SGE
+ #endif
--- /dev/null
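+Have raid5 service REQ_SYNC reads from disk rather than from the stripe
+cache by clearing R5_UPTODATE for such bios, and wake the raid5d thread
+once a REQ_SYNC request is submitted, so MMP (multi-mount protection)
+reads see current on-disk data and are handled promptly.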
+--- linux-3.10.0-685.el7.x86_64/drivers/md/raid5.c.orig 2017-06-28 14:06:00.627299582 -0700
++++ linux-3.10.0-685.el7.x86_64/drivers/md/raid5.c 2017-06-28 14:08:01.564618793 -0700
+@@ -3090,6 +3090,8 @@ static int add_stripe_bio(struct stripe_
+ bi->bi_next = *bip;
+ *bip = bi;
+ raid5_inc_bi_active_stripes(bi);
++ if ((bi->bi_rw & REQ_SYNC) && !forwrite)
++ clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); /* force to read from disk. */
+
+ if (forwrite) {
+ /* check if page is covered */
+@@ -5538,6 +5540,9 @@ static bool raid5_make_request(struct md
+ bi, 0);
+ bio_endio(bi, 0);
+ }
++
++ if (bi->bi_rw & REQ_SYNC)
++ md_wakeup_thread(mddev->thread);
+ return true;
+ }
+
-raid5-mmp-unplug-dev-3.7.patch
+raid5-mmp-unplug-dev-3.8.patch
dev_read_only-3.7.patch
-blkdev_tunables-3.8.patch
+blkdev_tunables-3.9.patch
vfs-project-quotas-rhel7.patch
lnxmaj="3.10.0"
-lnxrel="514.26.2.el7"
+lnxrel="693.el7"
KERNEL_SRPM=kernel-${lnxmaj}-${lnxrel}.src.rpm
SERIES=3.10-rhel7.series
2.6-rhel6.series 2.6.32-573.26.1.el6 (RHEL 6.7)
2.6-rhel6.8.series 2.6.32-642.15.1.el6 (RHEL 6.8)
2.6-rhel6.8.series 2.6.32-696.6.3.el6 (RHEL 6.9)
-3.10-rhel7.series 3.10.0-514.26.2.el7 (RHEL 7.3)
+3.10-rhel7.series 3.10.0-693.el7 (RHEL 7.4)
3.0-sles11sp3.series 3.0.101-0.47.71 (SLES11 SP3)
3.0-sles11sp3.series 3.0.101-107 (SLES11 SP4)
3.12-sles12.series 3.12.74-60.64.40 (SLES12 SP1)