From: Bob Glossman
Date: Tue, 1 Aug 2017 14:32:02 +0000 (-0700)
Subject: LU-9816 kernel: kernel upgrade RHEL7.4 [3.10.0-693.el7]
X-Git-Tag: 2.10.52~8
X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=fd8ef48571a2752f6c6a4c47127178a765d5b328

LU-9816 kernel: kernel upgrade RHEL7.4 [3.10.0-693.el7]

With this mod we switch our supported el7 version to RHEL 7.4

Signed-off-by: Bob Glossman
Change-Id: Ib0bcb3547fcf220e1b4665229930c6ff28c6906c
Reviewed-on: https://review.whamcloud.com/28301
Tested-by: Jenkins
Reviewed-by: Yang Sheng
Tested-by: Maloo
Reviewed-by: Minh Diep
Reviewed-by: Oleg Drokin
---

diff --git a/config/lustre-build-ldiskfs.m4 b/config/lustre-build-ldiskfs.m4
index 83b4cbc..8cd0b9a 100644
--- a/config/lustre-build-ldiskfs.m4
+++ b/config/lustre-build-ldiskfs.m4
@@ -13,6 +13,7 @@ esac
 AS_IF([test -z "$LDISKFS_SERIES"], [
 AS_IF([test x$RHEL_KERNEL = xyes], [
 	case $RHEL_RELEASE_NO in
+	74)	LDISKFS_SERIES="3.10-rhel7.4.series"	;;
 	73)	LDISKFS_SERIES="3.10-rhel7.3.series"	;;
 	72)	LDISKFS_SERIES="3.10-rhel7.2.series"	;;
 	71)	LDISKFS_SERIES="3.10-rhel7.series"	;;
diff --git a/contrib/lbuild/lbuild b/contrib/lbuild/lbuild
index d1067f8..99dd601 100755
--- a/contrib/lbuild/lbuild
+++ b/contrib/lbuild/lbuild
@@ -324,7 +324,7 @@ check_options() {
 		3.12-sles12 | 4.4-sles12)
 			CANONICAL_TARGET="sles12"
 			;;
-		3.10-rhel7)
+		3.10-rhel7*)
 			CANONICAL_TARGET="rhel7"
 			;;
 		2.6-rhel6*)
diff --git a/ldiskfs/kernel_patches/patches/rhel7.4/ext4-attach-jinode-in-writepages.patch b/ldiskfs/kernel_patches/patches/rhel7.4/ext4-attach-jinode-in-writepages.patch
new file mode 100644
index 0000000..7ec7112
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/rhel7.4/ext4-attach-jinode-in-writepages.patch
@@ -0,0 +1,44 @@
+Index: linux-stage/fs/ext4/inode.c
+===================================================================
+--- linux-stage.orig/fs/ext4/inode.c
++++ linux-stage/fs/ext4/inode.c
+@@ -745,6 +745,9 @@ out_sem:
+ 		    !(flags & EXT4_GET_BLOCKS_ZERO) &&
+ 		    !IS_NOQUOTA(inode) &&
+ 		    ext4_should_order_data(inode)) {
++			ret = ext4_inode_attach_jinode(inode);
++			if (ret)
++				return ret;
+ 			ret = ext4_jbd2_file_inode(handle, inode);
+ 			if (ret)
+ 				return ret;
+@@ -2503,6 +2506,9 @@ static int ext4_writepages(struct addres
+ 		mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
+ 	}
+
++	ret = ext4_inode_attach_jinode(inode);
++	if (ret)
++		goto out_writepages;
+ 	mpd.inode = inode;
+ 	mpd.wbc = wbc;
+ 	ext4_io_submit_init(&mpd.io_submit, wbc);
+@@ -3837,6 +3843,7 @@ int ext4_inode_attach_jinode(struct inod
+ 	jbd2_free_inode(jinode);
+ 	return 0;
+ }
++EXPORT_SYMBOL(ext4_inode_attach_jinode);
+
+ /*
+  * ext4_truncate()
+Index: linux-stage/fs/ext4/ext4.h
+===================================================================
+--- linux-stage.orig/fs/ext4/ext4.h
++++ linux-stage/fs/ext4/ext4.h
+@@ -2379,6 +2379,7 @@ extern int ext4_group_add_blocks(handle_
+ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
+
+ /* inode.c */
++#define HAVE_LDISKFS_INFO_JINODE
+ struct buffer_head *ext4_getblk(handle_t *, struct inode *,
+ 					ext4_lblk_t, int, int *);
+ struct buffer_head *ext4_bread(handle_t *, struct inode *,
diff --git a/ldiskfs/kernel_patches/patches/rhel7.4/ext4-dont-check-before-replay.patch b/ldiskfs/kernel_patches/patches/rhel7.4/ext4-dont-check-before-replay.patch
new file mode 100644
index 0000000..e1d41d7
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/rhel7.4/ext4-dont-check-before-replay.patch
@@ -0,0 +1,33 @@
+Index: linux-stage/fs/ext4/super.c
+When ldiskfs runs in failover
+mode with a read-only disk,
+some allocation updates are lost and ldiskfs may fail
+while mounting; this is due to an inconsistent state of the
+group descriptor. The group-descriptor check is therefore
+moved to after journal replay.
+===================================================================
+--- linux-stage/fs/ext4/super.c	2016-11-06 15:15:30.892386878 +0530
++++ linux-stage.orig.1/fs/ext4/super.c	2016-11-08 10:56:45.579892189 +0530
+@@ -3980,10 +3980,6 @@
+ 			goto failed_mount2;
+ 		}
+ 	}
+-	if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
+-		ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
+-		goto failed_mount2;
+-	}
+
+ 	sbi->s_gdb_count = db_count;
+ 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+@@ -4104,6 +4100,12 @@
+ 	sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
+
+ no_journal:
++
++	if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
++		ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
++		goto failed_mount_wq;
++	}
++
+ 	/*
+ 	 * Get the # of file system overhead blocks from the
+ 	 * superblock if present.
diff --git a/ldiskfs/kernel_patches/patches/rhel7.4/ext4-fix-xattr-shifting-when-expanding-inodes.patch b/ldiskfs/kernel_patches/patches/rhel7.4/ext4-fix-xattr-shifting-when-expanding-inodes.patch
new file mode 100644
index 0000000..f0b155f
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/rhel7.4/ext4-fix-xattr-shifting-when-expanding-inodes.patch
@@ -0,0 +1,245 @@
+From e3014d14a81edde488d9a6758eea8afc41752d2d Mon Sep 17 00:00:00 2001
+From: Jan Kara
+Date: Mon, 29 Aug 2016 15:38:11 -0400
+Subject: [PATCH] ext4: fixup free space calculations when expanding inodes
+
+Conditions checking whether there is enough free space in an xattr block
+and when xattr is large enough to make enough space in the inode forgot
+to account for the fact that inode need not be completely filled up with
+xattrs. Thus we could move unnecessarily many xattrs out of inode or
+even falsely claim there is not enough space to expand the inode. We
+also forgot to update the amount of free space in xattr block when moving
+more xattrs and thus could decide to move too big xattr resulting in
+unexpected failure.
+
+Fix these problems by properly updating free space in the inode and
+xattr block as we move xattrs. To simplify the math, avoid shifting
+xattrs after removing each one xattr and instead just shift xattrs only
+once there is enough free space in the inode.
+ +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext4/xattr.c | 58 ++++++++++++++++++++++++--------------------------------- + 1 file changed, 24 insertions(+), 34 deletions(-) + +diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c +index 2eb935c..22d2ebc 100644 +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -1350,7 +1350,8 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_xattr_ibody_find *is = NULL; + struct ext4_xattr_block_find *bs = NULL; + char *buffer = NULL, *b_entry_name = NULL; +- size_t min_offs, free; ++ size_t min_offs; ++ size_t ifree, bfree; + int total_ino; + void *base, *start, *end; + int error = 0, tried_min_extra_isize = 0; +@@ -1385,17 +1386,9 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + if (error) + goto cleanup; + +- free = ext4_xattr_free_space(last, &min_offs, base, &total_ino); +- if (free >= isize_diff) { +- entry = IFIRST(header); +- ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize +- - new_extra_isize, (void *)raw_inode + +- EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, +- (void *)header, total_ino, +- inode->i_sb->s_blocksize); +- EXT4_I(inode)->i_extra_isize = new_extra_isize; +- goto out; +- } ++ ifree = ext4_xattr_free_space(last, &min_offs, base, &total_ino); ++ if (ifree >= isize_diff) ++ goto shift; + + /* + * Enough free space isn't available in the inode, check if +@@ -1416,8 +1409,8 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + first = BFIRST(bh); + end = bh->b_data + bh->b_size; + min_offs = end - base; +- free = ext4_xattr_free_space(first, &min_offs, base, NULL); +- if (free < isize_diff) { ++ bfree = ext4_xattr_free_space(first, &min_offs, base, NULL); ++ if (bfree + ifree < isize_diff) { + if (!tried_min_extra_isize && s_min_extra_isize) { + tried_min_extra_isize++; + new_extra_isize = s_min_extra_isize; +@@ -1428,10 +1421,10 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + goto cleanup; + } + } else { +- free = inode->i_sb->s_blocksize; ++ bfree = inode->i_sb->s_blocksize; + } + +- while (isize_diff > 0) { ++ while (isize_diff > ifree) { + size_t offs, size, entry_size; + struct ext4_xattr_entry *small_entry = NULL; + struct ext4_xattr_info i = { +@@ -1439,7 +1432,6 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + .value_len = 0, + }; + unsigned int total_size; /* EA entry size + value size */ +- unsigned int shift_bytes; /* No. of bytes to shift EAs by? 
*/ + unsigned int min_total_size = ~0U; + + is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); +@@ -1461,8 +1453,9 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + total_size = + EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + + EXT4_XATTR_LEN(last->e_name_len); +- if (total_size <= free && total_size < min_total_size) { +- if (total_size < isize_diff) { ++ if (total_size <= bfree && ++ total_size < min_total_size) { ++ if (total_size + ifree < isize_diff) { + small_entry = last; + } else { + entry = last; +@@ -1491,6 +1484,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + offs = le16_to_cpu(entry->e_value_offs); + size = le32_to_cpu(entry->e_value_size); + entry_size = EXT4_XATTR_LEN(entry->e_name_len); ++ total_size = entry_size + EXT4_XATTR_SIZE(size); + i.name_index = entry->e_name_index, + buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS); + b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); +@@ -1518,21 +1512,8 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + if (error) + goto cleanup; + total_ino -= entry_size; +- +- entry = IFIRST(header); +- if (entry_size + EXT4_XATTR_SIZE(size) >= isize_diff) +- shift_bytes = isize_diff; +- else +- shift_bytes = entry_size + EXT4_XATTR_SIZE(size); +- /* Adjust the offsets and shift the remaining entries ahead */ +- ext4_xattr_shift_entries(entry, -shift_bytes, +- (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + +- EXT4_I(inode)->i_extra_isize + shift_bytes, +- (void *)header, total_ino, inode->i_sb->s_blocksize); +- +- isize_diff -= shift_bytes; +- EXT4_I(inode)->i_extra_isize += shift_bytes; +- header = IHDR(inode, raw_inode); ++ ifree += total_size; ++ bfree -= total_size; + + i.name = b_entry_name; + i.value = buffer; +@@ -1553,6 +1534,15 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + kfree(is); + kfree(bs); + } ++ ++shift: ++ /* Adjust the offsets and shift the remaining entries ahead */ ++ entry = IFIRST(header); ++ ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize ++ - new_extra_isize, (void *)raw_inode + ++ EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, ++ (void *)header, total_ino, inode->i_sb->s_blocksize); ++ EXT4_I(inode)->i_extra_isize = new_extra_isize; + brelse(bh); + out: + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); +-- +2.9.3 + +From 94405713889d4a9d341b4ad92956e4e2ec8ec2c2 Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Mon, 29 Aug 2016 15:41:11 -0400 +Subject: [PATCH] ext4: replace bogus assertion in ext4_xattr_shift_entries() + +We were checking whether computed offsets do not exceed end of block in +ext4_xattr_shift_entries(). However this does not make sense since we +always only decrease offsets. So replace that assertion with a check +whether we really decrease xattrs value offsets. 
+
+Signed-off-by: Jan Kara
+Signed-off-by: Theodore Ts'o
+---
+ fs/ext4/xattr.c | 9 +++++----
+ 1 file changed, 5 insertions(+), 4 deletions(-)
+
+diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
+index 1447860..82b025c 100644
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -1319,18 +1319,19 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
+  */
+ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
+ 				     int value_offs_shift, void *to,
+-				     void *from, size_t n, int blocksize)
++				     void *from, size_t n)
+ {
+ 	struct ext4_xattr_entry *last = entry;
+ 	int new_offs;
+
++	/* We always shift xattr headers further thus offsets get lower */
++	BUG_ON(value_offs_shift > 0);
++
+ 	/* Adjust the value offsets of the entries */
+ 	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+ 		if (!last->e_value_inum && last->e_value_size) {
+ 			new_offs = le16_to_cpu(last->e_value_offs) +
+ 							value_offs_shift;
+-			BUG_ON(new_offs + le32_to_cpu(last->e_value_size)
+-			       > blocksize);
+ 			last->e_value_offs = cpu_to_le16(new_offs);
+ 		}
+ 	}
+@@ -1542,7 +1543,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+ 	ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize
+ 			- new_extra_isize, (void *)raw_inode +
+ 			EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize,
+-			(void *)header, total_ino, inode->i_sb->s_blocksize);
++			(void *)header, total_ino);
+ 	EXT4_I(inode)->i_extra_isize = new_extra_isize;
+ 	brelse(bh);
+ out:
+--
+2.9.3
+
+From 887a9730614727c4fff7cb756711b190593fc1df Mon Sep 17 00:00:00 2001
+From: Konstantin Khlebnikov
+Date: Sun, 21 May 2017 22:36:23 -0400
+Subject: [PATCH] ext4: keep existing extra fields when inode expands
+
+ext4_expand_extra_isize() should clear only space between old and new
+size.
+
+Fixes: 6dd4ee7cab7e # v2.6.23
+Cc: stable@vger.kernel.org
+Signed-off-by: Konstantin Khlebnikov
+Signed-off-by: Theodore Ts'o
+---
+ fs/ext4/inode.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 1bd0bfa..7cd99de 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -5637,8 +5637,9 @@ static int ext4_expand_extra_isize(struct inode *inode,
+ 	/* No extended attributes present */
+ 	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
+ 	    header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+-		memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
+-			new_extra_isize);
++		memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE +
++		       EXT4_I(inode)->i_extra_isize, 0,
++		       new_extra_isize - EXT4_I(inode)->i_extra_isize);
+ 		EXT4_I(inode)->i_extra_isize = new_extra_isize;
+ 		return 0;
+ 	}
+--
+2.9.3
+
diff --git a/ldiskfs/kernel_patches/patches/rhel7.4/ext4-large-dir.patch b/ldiskfs/kernel_patches/patches/rhel7.4/ext4-large-dir.patch
new file mode 100644
index 0000000..4f5fcb4
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/rhel7.4/ext4-large-dir.patch
@@ -0,0 +1,347 @@
+This INCOMPAT_LARGEDIR feature allows larger directories
+to be created in ldiskfs, both with directory sizes over
+2GB and a maximum htree depth of 3 instead of the
+current limit of 2. These features are needed in order
+to exceed the current limit of approximately 10M entries
+in a single directory.
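+
+For rough scale (a back-of-the-envelope estimate, assuming 4 KiB
+blocks, 8-byte dx_entry index entries, and directory entries of
+roughly 100 bytes, i.e. ~40 entries per leaf block): each index
+block holds about 4096 / 8 ~= 500 entries, so a 2-level htree
+addresses about 500 * 500 = 250K leaf blocks, or about
+250K * 40 ~= 10M entries; a third htree level multiplies that
+capacity by another factor of ~500.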
+ +Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h +=================================================================== +--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/ext4.h ++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h +@@ -1585,7 +1585,8 @@ static inline void ext4_clear_state_flag + EXT4_FEATURE_INCOMPAT_EA_INODE| \ + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_DIRDATA| \ +- EXT4_FEATURE_INCOMPAT_INLINE_DATA) ++ EXT4_FEATURE_INCOMPAT_INLINE_DATA| \ ++ EXT4_FEATURE_INCOMPAT_LARGEDIR) + #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ +@@ -1999,6 +2000,9 @@ struct mmpd_data { + # define NORET_TYPE /**/ + # define ATTRIB_NORET __attribute__((noreturn)) + # define NORET_AND noreturn, ++/* htree levels for ext4 */ ++#define EXT4_HTREE_LEVEL_COMPAT 2 ++#define EXT4_HTREE_LEVEL 3 + + struct ext4_xattr_ino_array { + unsigned int xia_count; /* # of used item in the array */ +@@ -2472,13 +2476,16 @@ static inline void ext4_r_blocks_count_s + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); + } + +-static inline loff_t ext4_isize(struct ext4_inode *raw_inode) ++static inline loff_t ext4_isize(struct super_block *sb, ++ struct ext4_inode *raw_inode) + { +- if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) ++ if (S_ISREG(le16_to_cpu(raw_inode->i_mode)) || ++ (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) && ++ S_ISDIR(le16_to_cpu(raw_inode->i_mode)))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); +- else +- return (loff_t) le32_to_cpu(raw_inode->i_size_lo); ++ ++ return (loff_t)le32_to_cpu(raw_inode->i_size_lo); + } + + static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c +=================================================================== +--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/namei.c ++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c +@@ -513,7 +513,14 @@ struct dx_root_info * dx_get_dx_info(str + + static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) + { +- return le32_to_cpu(entry->block) & 0x00ffffff; ++ return le32_to_cpu(entry->block) & 0x0fffffff; ++} ++ ++static inline int ++ext4_dir_htree_level(struct super_block *sb) ++{ ++ return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ? 
++ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; + } + + static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) +@@ -681,7 +688,7 @@ dx_probe(const struct qstr *d_name, stru + struct dx_frame *frame = frame_in; + u32 hash; + +- frame->bh = NULL; ++ memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); + bh = ext4_read_dirblock(dir, 0, INDEX); + if (IS_ERR(bh)) { + *err = PTR_ERR(bh); +@@ -714,10 +721,15 @@ dx_probe(const struct qstr *d_name, stru + } + + indirect = info->indirect_levels; +- if (indirect > 1) { +- ext4_warning(dir->i_sb, +- "inode #%lu: unimplemented hash depth %u", +- dir->i_ino, info->indirect_levels); ++ if (indirect >= ext4_dir_htree_level(dir->i_sb)) { ++ ext4_warning(dir->i_sb, ++ "inode #%lu: comm %s: htree depth %#06x exceed max depth %u", ++ dir->i_ino, current->comm, indirect, ++ ext4_dir_htree_level(dir->i_sb)); ++ if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { ++ ext4_warning(dir->i_sb, "Enable large directory " ++ "feature to access it"); ++ } + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; +@@ -812,13 +826,18 @@ fail: + static void dx_release (struct dx_frame *frames) + { + struct dx_root_info *info; ++ int i; ++ + if (frames[0].bh == NULL) + return; + + info = dx_get_dx_info((struct ext4_dir_entry_2 *)frames[0].bh->b_data); +- if (info->indirect_levels) +- brelse(frames[1].bh); +- brelse(frames[0].bh); ++ for (i = 0; i <= info->indirect_levels; i++) { ++ if (frames[i].bh == NULL) ++ break; ++ brelse(frames[i].bh); ++ frames[i].bh = NULL; ++ } + } + + /* +@@ -960,7 +979,7 @@ int ext4_htree_fill_tree(struct file *di + { + struct dx_hash_info hinfo; + struct ext4_dir_entry_2 *de; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct inode *dir; + ext4_lblk_t block; + int count = 0; +@@ -1376,7 +1395,7 @@ static struct buffer_head * ext4_dx_find + { + struct super_block * sb = dir->i_sb; + struct dx_hash_info hinfo; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct buffer_head *bh; + ext4_lblk_t block; + int retval; +@@ -1832,7 +1851,7 @@ static int make_indexed_dir(handle_t *ha + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct buffer_head *bh2; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries; + struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de; + struct ext4_dir_entry_tail *t; +@@ -2117,15 +2136,18 @@ static int ext4_add_entry(handle_t *hand + static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) + { +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries, *at; + struct dx_hash_info hinfo; + struct buffer_head *bh; + struct inode *dir = dentry->d_parent->d_inode; + struct super_block *sb = dir->i_sb; + struct ext4_dir_entry_2 *de; ++ int restart; + int err; + ++again: ++ restart = 0; + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); + if (!frame) + return err; +@@ -2138,33 +2160,48 @@ static int ext4_dx_add_entry(handle_t *h + goto cleanup; + } + +- BUFFER_TRACE(bh, "get_write_access"); +- err = ext4_journal_get_write_access(handle, bh); +- if (err) +- goto journal_error; +- + err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (err != -ENOSPC) + goto cleanup; + ++ err = 0; + /* Block full, should compress but for now just split */ + dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", + 
dx_get_count(entries), dx_get_limit(entries))); + /* Need to split index? */ + if (dx_get_count(entries) == dx_get_limit(entries)) { + ext4_lblk_t newblock; +- unsigned icount = dx_get_count(entries); +- int levels = frame - frames; ++ int levels = frame - frames + 1; ++ unsigned icount; ++ int add_level = 1; + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; + +- if (levels && (dx_get_count(frames->entries) == +- dx_get_limit(frames->entries))) { +- ext4_warning(sb, "Directory index full!"); ++ while (frame > frames) { ++ if (dx_get_count((frame - 1)->entries) < ++ dx_get_limit((frame - 1)->entries)) { ++ add_level = 0; ++ break; ++ } ++ frame--; /* split higher index block */ ++ at = frame->at; ++ entries = frame->entries; ++ restart = 1; ++ } ++ if (add_level && levels == ext4_dir_htree_level(sb)) { ++ ext4_warning(sb, "inode %lu: comm %s: index %u: reach max htree level %u", ++ dir->i_ino, current->comm, levels, ++ ext4_dir_htree_level(sb)); ++ if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { ++ ext4_warning(sb, "Large directory feature is" ++ "not enabled on this " ++ "filesystem"); ++ } + err = -ENOSPC; + goto cleanup; + } ++ icount = dx_get_count(entries); + bh2 = ext4_append(handle, dir, &newblock); + if (IS_ERR(bh2)) { + err = PTR_ERR(bh2); +@@ -2179,7 +2216,7 @@ static int ext4_dx_add_entry(handle_t *h + err = ext4_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; +- if (levels) { ++ if (!add_level) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); + dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", +@@ -2187,7 +2224,7 @@ static int ext4_dx_add_entry(handle_t *h + + BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ + err = ext4_journal_get_write_access(handle, +- frames[0].bh); ++ (frame - 1)->bh); + if (err) + goto journal_error; + +@@ -2203,19 +2240,25 @@ static int ext4_dx_add_entry(handle_t *h + frame->entries = entries = entries2; + swap(frame->bh, bh2); + } +- dx_insert_block(frames + 0, hash2, newblock); +- dxtrace(dx_show_index("node", frames[1].entries)); ++ dx_insert_block(frame - 1, hash2, newblock); ++ dxtrace(dx_show_index("node", frame->entries)); + dxtrace(dx_show_index("node", +- ((struct dx_node *) bh2->b_data)->entries)); ++ ((struct dx_node *)bh2->b_data)->entries)); + err = ext4_handle_dirty_dx_node(handle, dir, bh2); + if (err) + goto journal_error; + brelse (bh2); ++ ext4_handle_dirty_dirent_node(handle, dir, ++ (frame - 1)->bh); ++ if (restart) { ++ ext4_handle_dirty_dirent_node(handle, dir, ++ frame->bh); ++ goto cleanup; ++ } + } else { + struct dx_root_info *info; +- dxtrace(printk(KERN_DEBUG +- "Creating second level index...\n")); +- memcpy((char *) entries2, (char *) entries, ++ ++ memcpy((char *)entries2, (char *)entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); + +@@ -2224,21 +2267,14 @@ static int ext4_dx_add_entry(handle_t *h + dx_set_block(entries + 0, newblock); + info = dx_get_dx_info((struct ext4_dir_entry_2*) + frames[0].bh->b_data); +- info->indirect_levels = 1; +- +- /* Add new access path frame */ +- frame = frames + 1; +- frame->at = at = at - entries + entries2; +- frame->entries = entries = entries2; +- frame->bh = bh2; +- err = ext4_journal_get_write_access(handle, +- frame->bh); +- if (err) +- goto journal_error; +- } +- err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh); +- if (err) { +- ext4_std_error(inode->i_sb, err); ++ info->indirect_levels += 1; ++ 
dxtrace(printk(KERN_DEBUG
++			"Creating %d level index...\n",
++			info->indirect_levels));
++		ext4_handle_dirty_dirent_node(handle, dir, frame->bh);
++		ext4_handle_dirty_dirent_node(handle, dir, bh2);
++		brelse(bh2);
++		restart = 1;
+ 		goto cleanup;
+ 		}
+ 	}
+@@ -2253,6 +2289,10 @@ journal_error:
+ cleanup:
+ 	brelse(bh);
+ 	dx_release(frames);
++	/* @restart is true means htree-path has been changed, we need to
++	 * repeat dx_probe() to find out valid htree-path */
++	if (restart && err == 0)
++		goto again;
+ 	return err;
+ }
+
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/inode.c
+===================================================================
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/inode.c
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/inode.c
+@@ -4056,12 +4056,12 @@ struct inode *ext4_iget(struct super_blo
+ 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
+ 		ei->i_file_acl |=
+ 			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
+-	inode->i_size = ext4_isize(raw_inode);
++	inode->i_size = ext4_isize(sb, raw_inode);
+ 	if ((size = i_size_read(inode)) < 0) {
+ 		EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
+ 		ret = -EFSCORRUPTED;
+ 		goto bad_inode;
+ 	}
+ 	ei->i_disksize = inode->i_size;
+ #ifdef CONFIG_QUOTA
+ 	ei->i_reserved_quota = 0;
+@@ -4306,7 +4306,7 @@ static int ext4_do_update_inode(handle_t
+ 		raw_inode->i_file_acl_high =
+ 			cpu_to_le16(ei->i_file_acl >> 32);
+ 	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
+-	if (ei->i_disksize != ext4_isize(raw_inode)) {
++	if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
+ 		ext4_isize_set(raw_inode, ei->i_disksize);
+ 		need_datasync = 1;
+ 	}
diff --git a/ldiskfs/kernel_patches/patches/rhel7.4/ext4-pdirop.patch b/ldiskfs/kernel_patches/patches/rhel7.4/ext4-pdirop.patch
new file mode 100644
index 0000000..53428bd
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/rhel7.4/ext4-pdirop.patch
@@ -0,0 +1,1932 @@
+Single directory performance is critical for HPC workloads. In a
+typical use case, an application creates a separate output file for
+each node and task in a job. As nodes and tasks increase, hundreds
+of thousands of files may be created in a single directory within
+a short window of time.
+Today, both filename lookup and file system modifying operations
+(such as create and unlink) are protected with a single lock for
+an entire ldiskfs directory. The PDO project will remove this
+bottleneck by introducing a parallel locking mechanism for entire
+ldiskfs directories. This work will enable multiple application
+threads to lookup, create and unlink in parallel.
+
+This patch contains:
+ - pdirops support for ldiskfs
+ - integrate with osd-ldiskfs
+
+Index: linux-3.10.0-229.1.2.fc21.x86_64/include/linux/htree_lock.h
+===================================================================
+--- /dev/null
++++ linux-3.10.0-229.1.2.fc21.x86_64/include/linux/htree_lock.h
+@@ -0,0 +1,187 @@
++/*
++ * include/linux/htree_lock.h
++ *
++ * Copyright (c) 2011, 2012, Intel Corporation.
++ *
++ * Author: Liang Zhen
++ */
++
++/*
++ * htree lock
++ *
++ * htree_lock is an advanced lock, it can support five lock modes (concept is
++ * taken from DLM) and it's a sleeping lock.
++ * ++ * most common use case is: ++ * - create a htree_lock_head for data ++ * - each thread (contender) creates it's own htree_lock ++ * - contender needs to call htree_lock(lock_node, mode) to protect data and ++ * call htree_unlock to release lock ++ * ++ * Also, there is advanced use-case which is more complex, user can have ++ * PW/PR lock on particular key, it's mostly used while user holding shared ++ * lock on the htree (CW, CR) ++ * ++ * htree_lock(lock_node, HTREE_LOCK_CR); lock the htree with CR ++ * htree_node_lock(lock_node, HTREE_LOCK_PR, key...); lock @key with PR ++ * ... ++ * htree_node_unlock(lock_node);; unlock the key ++ * ++ * Another tip is, we can have N-levels of this kind of keys, all we need to ++ * do is specifying N-levels while creating htree_lock_head, then we can ++ * lock/unlock a specific level by: ++ * htree_node_lock(lock_node, mode1, key1, level1...); ++ * do something; ++ * htree_node_lock(lock_node, mode1, key2, level2...); ++ * do something; ++ * htree_node_unlock(lock_node, level2); ++ * htree_node_unlock(lock_node, level1); ++ * ++ * NB: for multi-level, should be careful about locking order to avoid deadlock ++ */ ++ ++#ifndef _LINUX_HTREE_LOCK_H ++#define _LINUX_HTREE_LOCK_H ++ ++#include ++#include ++#include ++ ++/* ++ * Lock Modes ++ * more details can be found here: ++ * http://en.wikipedia.org/wiki/Distributed_lock_manager ++ */ ++typedef enum { ++ HTREE_LOCK_EX = 0, /* exclusive lock: incompatible with all others */ ++ HTREE_LOCK_PW, /* protected write: allows only CR users */ ++ HTREE_LOCK_PR, /* protected read: allow PR, CR users */ ++ HTREE_LOCK_CW, /* concurrent write: allow CR, CW users */ ++ HTREE_LOCK_CR, /* concurrent read: allow all but EX users */ ++ HTREE_LOCK_MAX, /* number of lock modes */ ++} htree_lock_mode_t; ++ ++#define HTREE_LOCK_NL HTREE_LOCK_MAX ++#define HTREE_LOCK_INVAL 0xdead10c ++ ++enum { ++ HTREE_HBITS_MIN = 2, ++ HTREE_HBITS_DEF = 14, ++ HTREE_HBITS_MAX = 32, ++}; ++ ++enum { ++ HTREE_EVENT_DISABLE = (0), ++ HTREE_EVENT_RD = (1 << HTREE_LOCK_PR), ++ HTREE_EVENT_WR = (1 << HTREE_LOCK_PW), ++ HTREE_EVENT_RDWR = (HTREE_EVENT_RD | HTREE_EVENT_WR), ++}; ++ ++struct htree_lock; ++ ++typedef void (*htree_event_cb_t)(void *target, void *event); ++ ++struct htree_lock_child { ++ struct list_head lc_list; /* granted list */ ++ htree_event_cb_t lc_callback; /* event callback */ ++ unsigned lc_events; /* event types */ ++}; ++ ++struct htree_lock_head { ++ unsigned long lh_lock; /* bits lock */ ++ /* blocked lock list (htree_lock) */ ++ struct list_head lh_blocked_list; ++ /* # key levels */ ++ u16 lh_depth; ++ /* hash bits for key and limit number of locks */ ++ u16 lh_hbits; ++ /* counters for blocked locks */ ++ u16 lh_nblocked[HTREE_LOCK_MAX]; ++ /* counters for granted locks */ ++ u16 lh_ngranted[HTREE_LOCK_MAX]; ++ /* private data */ ++ void *lh_private; ++ /* array of children locks */ ++ struct htree_lock_child lh_children[0]; ++}; ++ ++/* htree_lock_node_t is child-lock for a specific key (ln_value) */ ++struct htree_lock_node { ++ htree_lock_mode_t ln_mode; ++ /* major hash key */ ++ u16 ln_major_key; ++ /* minor hash key */ ++ u16 ln_minor_key; ++ struct list_head ln_major_list; ++ struct list_head ln_minor_list; ++ /* alive list, all locks (granted, blocked, listening) are on it */ ++ struct list_head ln_alive_list; ++ /* blocked list */ ++ struct list_head ln_blocked_list; ++ /* granted list */ ++ struct list_head ln_granted_list; ++ void *ln_ev_target; ++}; ++ ++struct htree_lock { ++ struct task_struct 
*lk_task; ++ struct htree_lock_head *lk_head; ++ void *lk_private; ++ unsigned lk_depth; ++ htree_lock_mode_t lk_mode; ++ struct list_head lk_blocked_list; ++ struct htree_lock_node lk_nodes[0]; ++}; ++ ++/* create a lock head, which stands for a resource */ ++struct htree_lock_head *htree_lock_head_alloc(unsigned depth, ++ unsigned hbits, unsigned priv); ++/* free a lock head */ ++void htree_lock_head_free(struct htree_lock_head *lhead); ++/* register event callback for child lock at level @depth */ ++void htree_lock_event_attach(struct htree_lock_head *lhead, unsigned depth, ++ unsigned events, htree_event_cb_t callback); ++/* create a lock handle, which stands for a thread */ ++struct htree_lock *htree_lock_alloc(unsigned depth, unsigned pbytes); ++/* free a lock handle */ ++void htree_lock_free(struct htree_lock *lck); ++/* lock htree, when @wait is true, 0 is returned if the lock can't ++ * be granted immediately */ ++int htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead, ++ htree_lock_mode_t mode, int wait); ++/* unlock htree */ ++void htree_unlock(struct htree_lock *lck); ++/* unlock and relock htree with @new_mode */ ++int htree_change_lock_try(struct htree_lock *lck, ++ htree_lock_mode_t new_mode, int wait); ++void htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode); ++/* require child lock (key) of htree at level @dep, @event will be sent to all ++ * listeners on this @key while lock being granted */ ++int htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, ++ u32 key, unsigned dep, int wait, void *event); ++/* release child lock at level @dep, this lock will listen on it's key ++ * if @event isn't NULL, event_cb will be called against @lck while granting ++ * any other lock at level @dep with the same key */ ++void htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event); ++/* stop listening on child lock at level @dep */ ++void htree_node_stop_listen(struct htree_lock *lck, unsigned dep); ++/* for debug */ ++void htree_lock_stat_print(int depth); ++void htree_lock_stat_reset(void); ++ ++#define htree_lock(lck, lh, mode) htree_lock_try(lck, lh, mode, 1) ++#define htree_change_lock(lck, mode) htree_change_lock_try(lck, mode, 1) ++ ++#define htree_lock_mode(lck) ((lck)->lk_mode) ++ ++#define htree_node_lock(lck, mode, key, dep) \ ++ htree_node_lock_try(lck, mode, key, dep, 1, NULL) ++/* this is only safe in thread context of lock owner */ ++#define htree_node_is_granted(lck, dep) \ ++ ((lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_INVAL && \ ++ (lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_NL) ++/* this is only safe in thread context of lock owner */ ++#define htree_node_is_listening(lck, dep) \ ++ ((lck)->lk_nodes[dep].ln_mode == HTREE_LOCK_NL) ++ ++#endif +Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/htree_lock.c +=================================================================== +--- /dev/null ++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/htree_lock.c +@@ -0,0 +1,880 @@ ++/* ++ * fs/ext4/htree_lock.c ++ * ++ * Copyright (c) 2011, 2012, Intel Corporation. 
++ * ++ * Author: Liang Zhen ++ */ ++#include ++#include ++#include ++#include ++ ++enum { ++ HTREE_LOCK_BIT_EX = (1 << HTREE_LOCK_EX), ++ HTREE_LOCK_BIT_PW = (1 << HTREE_LOCK_PW), ++ HTREE_LOCK_BIT_PR = (1 << HTREE_LOCK_PR), ++ HTREE_LOCK_BIT_CW = (1 << HTREE_LOCK_CW), ++ HTREE_LOCK_BIT_CR = (1 << HTREE_LOCK_CR), ++}; ++ ++enum { ++ HTREE_LOCK_COMPAT_EX = 0, ++ HTREE_LOCK_COMPAT_PW = HTREE_LOCK_COMPAT_EX | HTREE_LOCK_BIT_CR, ++ HTREE_LOCK_COMPAT_PR = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_PR, ++ HTREE_LOCK_COMPAT_CW = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_CW, ++ HTREE_LOCK_COMPAT_CR = HTREE_LOCK_COMPAT_CW | HTREE_LOCK_BIT_PR | ++ HTREE_LOCK_BIT_PW, ++}; ++ ++static int htree_lock_compat[] = { ++ [HTREE_LOCK_EX] HTREE_LOCK_COMPAT_EX, ++ [HTREE_LOCK_PW] HTREE_LOCK_COMPAT_PW, ++ [HTREE_LOCK_PR] HTREE_LOCK_COMPAT_PR, ++ [HTREE_LOCK_CW] HTREE_LOCK_COMPAT_CW, ++ [HTREE_LOCK_CR] HTREE_LOCK_COMPAT_CR, ++}; ++ ++/* max allowed htree-lock depth. ++ * We only need depth=3 for ext4 although user can have higher value. */ ++#define HTREE_LOCK_DEP_MAX 16 ++ ++#ifdef HTREE_LOCK_DEBUG ++ ++static char *hl_name[] = { ++ [HTREE_LOCK_EX] "EX", ++ [HTREE_LOCK_PW] "PW", ++ [HTREE_LOCK_PR] "PR", ++ [HTREE_LOCK_CW] "CW", ++ [HTREE_LOCK_CR] "CR", ++}; ++ ++/* lock stats */ ++struct htree_lock_node_stats { ++ unsigned long long blocked[HTREE_LOCK_MAX]; ++ unsigned long long granted[HTREE_LOCK_MAX]; ++ unsigned long long retried[HTREE_LOCK_MAX]; ++ unsigned long long events; ++}; ++ ++struct htree_lock_stats { ++ struct htree_lock_node_stats nodes[HTREE_LOCK_DEP_MAX]; ++ unsigned long long granted[HTREE_LOCK_MAX]; ++ unsigned long long blocked[HTREE_LOCK_MAX]; ++}; ++ ++static struct htree_lock_stats hl_stats; ++ ++void htree_lock_stat_reset(void) ++{ ++ memset(&hl_stats, 0, sizeof(hl_stats)); ++} ++ ++void htree_lock_stat_print(int depth) ++{ ++ int i; ++ int j; ++ ++ printk(KERN_DEBUG "HTREE LOCK STATS:\n"); ++ for (i = 0; i < HTREE_LOCK_MAX; i++) { ++ printk(KERN_DEBUG "[%s]: G [%10llu], B [%10llu]\n", ++ hl_name[i], hl_stats.granted[i], hl_stats.blocked[i]); ++ } ++ for (i = 0; i < depth; i++) { ++ printk(KERN_DEBUG "HTREE CHILD [%d] STATS:\n", i); ++ for (j = 0; j < HTREE_LOCK_MAX; j++) { ++ printk(KERN_DEBUG ++ "[%s]: G [%10llu], B [%10llu], R [%10llu]\n", ++ hl_name[j], hl_stats.nodes[i].granted[j], ++ hl_stats.nodes[i].blocked[j], ++ hl_stats.nodes[i].retried[j]); ++ } ++ } ++} ++ ++#define lk_grant_inc(m) do { hl_stats.granted[m]++; } while (0) ++#define lk_block_inc(m) do { hl_stats.blocked[m]++; } while (0) ++#define ln_grant_inc(d, m) do { hl_stats.nodes[d].granted[m]++; } while (0) ++#define ln_block_inc(d, m) do { hl_stats.nodes[d].blocked[m]++; } while (0) ++#define ln_retry_inc(d, m) do { hl_stats.nodes[d].retried[m]++; } while (0) ++#define ln_event_inc(d) do { hl_stats.nodes[d].events++; } while (0) ++ ++#else /* !DEBUG */ ++ ++void htree_lock_stat_reset(void) {} ++void htree_lock_stat_print(int depth) {} ++ ++#define lk_grant_inc(m) do {} while (0) ++#define lk_block_inc(m) do {} while (0) ++#define ln_grant_inc(d, m) do {} while (0) ++#define ln_block_inc(d, m) do {} while (0) ++#define ln_retry_inc(d, m) do {} while (0) ++#define ln_event_inc(d) do {} while (0) ++ ++#endif /* DEBUG */ ++ ++EXPORT_SYMBOL(htree_lock_stat_reset); ++EXPORT_SYMBOL(htree_lock_stat_print); ++ ++#define HTREE_DEP_ROOT (-1) ++ ++#define htree_spin_lock(lhead, dep) \ ++ bit_spin_lock((dep) + 1, &(lhead)->lh_lock) ++#define htree_spin_unlock(lhead, dep) \ ++ bit_spin_unlock((dep) + 1, &(lhead)->lh_lock) ++ ++#define 
htree_key_event_ignore(child, ln) \ ++ (!((child)->lc_events & (1 << (ln)->ln_mode))) ++ ++static int ++htree_key_list_empty(struct htree_lock_node *ln) ++{ ++ return list_empty(&ln->ln_major_list) && list_empty(&ln->ln_minor_list); ++} ++ ++static void ++htree_key_list_del_init(struct htree_lock_node *ln) ++{ ++ struct htree_lock_node *tmp = NULL; ++ ++ if (!list_empty(&ln->ln_minor_list)) { ++ tmp = list_entry(ln->ln_minor_list.next, ++ struct htree_lock_node, ln_minor_list); ++ list_del_init(&ln->ln_minor_list); ++ } ++ ++ if (list_empty(&ln->ln_major_list)) ++ return; ++ ++ if (tmp == NULL) { /* not on minor key list */ ++ list_del_init(&ln->ln_major_list); ++ } else { ++ BUG_ON(!list_empty(&tmp->ln_major_list)); ++ list_replace_init(&ln->ln_major_list, &tmp->ln_major_list); ++ } ++} ++ ++static void ++htree_key_list_replace_init(struct htree_lock_node *old, ++ struct htree_lock_node *new) ++{ ++ if (!list_empty(&old->ln_major_list)) ++ list_replace_init(&old->ln_major_list, &new->ln_major_list); ++ ++ if (!list_empty(&old->ln_minor_list)) ++ list_replace_init(&old->ln_minor_list, &new->ln_minor_list); ++} ++ ++static void ++htree_key_event_enqueue(struct htree_lock_child *child, ++ struct htree_lock_node *ln, int dep, void *event) ++{ ++ struct htree_lock_node *tmp; ++ ++ /* NB: ALWAYS called holding lhead::lh_lock(dep) */ ++ BUG_ON(ln->ln_mode == HTREE_LOCK_NL); ++ if (event == NULL || htree_key_event_ignore(child, ln)) ++ return; ++ ++ /* shouldn't be a very long list */ ++ list_for_each_entry(tmp, &ln->ln_alive_list, ln_alive_list) { ++ if (tmp->ln_mode == HTREE_LOCK_NL) { ++ ln_event_inc(dep); ++ if (child->lc_callback != NULL) ++ child->lc_callback(tmp->ln_ev_target, event); ++ } ++ } ++} ++ ++static int ++htree_node_lock_enqueue(struct htree_lock *newlk, struct htree_lock *curlk, ++ unsigned dep, int wait, void *event) ++{ ++ struct htree_lock_child *child = &newlk->lk_head->lh_children[dep]; ++ struct htree_lock_node *newln = &newlk->lk_nodes[dep]; ++ struct htree_lock_node *curln = &curlk->lk_nodes[dep]; ++ ++ /* NB: ALWAYS called holding lhead::lh_lock(dep) */ ++ /* NB: we only expect PR/PW lock mode at here, only these two modes are ++ * allowed for htree_node_lock(asserted in htree_node_lock_internal), ++ * NL is only used for listener, user can't directly require NL mode */ ++ if ((curln->ln_mode == HTREE_LOCK_NL) || ++ (curln->ln_mode != HTREE_LOCK_PW && ++ newln->ln_mode != HTREE_LOCK_PW)) { ++ /* no conflict, attach it on granted list of @curlk */ ++ if (curln->ln_mode != HTREE_LOCK_NL) { ++ list_add(&newln->ln_granted_list, ++ &curln->ln_granted_list); ++ } else { ++ /* replace key owner */ ++ htree_key_list_replace_init(curln, newln); ++ } ++ ++ list_add(&newln->ln_alive_list, &curln->ln_alive_list); ++ htree_key_event_enqueue(child, newln, dep, event); ++ ln_grant_inc(dep, newln->ln_mode); ++ return 1; /* still hold lh_lock */ ++ } ++ ++ if (!wait) { /* can't grant and don't want to wait */ ++ ln_retry_inc(dep, newln->ln_mode); ++ newln->ln_mode = HTREE_LOCK_INVAL; ++ return -1; /* don't wait and just return -1 */ ++ } ++ ++ newlk->lk_task = current; ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ /* conflict, attach it on blocked list of curlk */ ++ list_add_tail(&newln->ln_blocked_list, &curln->ln_blocked_list); ++ list_add(&newln->ln_alive_list, &curln->ln_alive_list); ++ ln_block_inc(dep, newln->ln_mode); ++ ++ htree_spin_unlock(newlk->lk_head, dep); ++ /* wait to be given the lock */ ++ if (newlk->lk_task != NULL) ++ schedule(); ++ /* granted, no doubt, wake up 
will set me RUNNING */ ++ if (event == NULL || htree_key_event_ignore(child, newln)) ++ return 0; /* granted without lh_lock */ ++ ++ htree_spin_lock(newlk->lk_head, dep); ++ htree_key_event_enqueue(child, newln, dep, event); ++ return 1; /* still hold lh_lock */ ++} ++ ++/* ++ * get PR/PW access to particular tree-node according to @dep and @key, ++ * it will return -1 if @wait is false and can't immediately grant this lock. ++ * All listeners(HTREE_LOCK_NL) on @dep and with the same @key will get ++ * @event if it's not NULL. ++ * NB: ALWAYS called holding lhead::lh_lock ++ */ ++static int ++htree_node_lock_internal(struct htree_lock_head *lhead, struct htree_lock *lck, ++ htree_lock_mode_t mode, u32 key, unsigned dep, ++ int wait, void *event) ++{ ++ LIST_HEAD(list); ++ struct htree_lock *tmp; ++ struct htree_lock *tmp2; ++ u16 major; ++ u16 minor; ++ u8 reverse; ++ u8 ma_bits; ++ u8 mi_bits; ++ ++ BUG_ON(mode != HTREE_LOCK_PW && mode != HTREE_LOCK_PR); ++ BUG_ON(htree_node_is_granted(lck, dep)); ++ ++ key = hash_long(key, lhead->lh_hbits); ++ ++ mi_bits = lhead->lh_hbits >> 1; ++ ma_bits = lhead->lh_hbits - mi_bits; ++ ++ lck->lk_nodes[dep].ln_major_key = major = key & ((1U << ma_bits) - 1); ++ lck->lk_nodes[dep].ln_minor_key = minor = key >> ma_bits; ++ lck->lk_nodes[dep].ln_mode = mode; ++ ++ /* ++ * The major key list is an ordered list, so searches are started ++ * at the end of the list that is numerically closer to major_key, ++ * so at most half of the list will be walked (for well-distributed ++ * keys). The list traversal aborts early if the expected key ++ * location is passed. ++ */ ++ reverse = (major >= (1 << (ma_bits - 1))); ++ ++ if (reverse) { ++ list_for_each_entry_reverse(tmp, ++ &lhead->lh_children[dep].lc_list, ++ lk_nodes[dep].ln_major_list) { ++ if (tmp->lk_nodes[dep].ln_major_key == major) { ++ goto search_minor; ++ ++ } else if (tmp->lk_nodes[dep].ln_major_key < major) { ++ /* attach _after_ @tmp */ ++ list_add(&lck->lk_nodes[dep].ln_major_list, ++ &tmp->lk_nodes[dep].ln_major_list); ++ goto out_grant_major; ++ } ++ } ++ ++ list_add(&lck->lk_nodes[dep].ln_major_list, ++ &lhead->lh_children[dep].lc_list); ++ goto out_grant_major; ++ ++ } else { ++ list_for_each_entry(tmp, &lhead->lh_children[dep].lc_list, ++ lk_nodes[dep].ln_major_list) { ++ if (tmp->lk_nodes[dep].ln_major_key == major) { ++ goto search_minor; ++ ++ } else if (tmp->lk_nodes[dep].ln_major_key > major) { ++ /* insert _before_ @tmp */ ++ list_add_tail(&lck->lk_nodes[dep].ln_major_list, ++ &tmp->lk_nodes[dep].ln_major_list); ++ goto out_grant_major; ++ } ++ } ++ ++ list_add_tail(&lck->lk_nodes[dep].ln_major_list, ++ &lhead->lh_children[dep].lc_list); ++ goto out_grant_major; ++ } ++ ++ search_minor: ++ /* ++ * NB: minor_key list doesn't have a "head", @list is just a ++ * temporary stub for helping list searching, make sure it's removed ++ * after searching. ++ * minor_key list is an ordered list too. 
++ */ ++ list_add_tail(&list, &tmp->lk_nodes[dep].ln_minor_list); ++ ++ reverse = (minor >= (1 << (mi_bits - 1))); ++ ++ if (reverse) { ++ list_for_each_entry_reverse(tmp2, &list, ++ lk_nodes[dep].ln_minor_list) { ++ if (tmp2->lk_nodes[dep].ln_minor_key == minor) { ++ goto out_enqueue; ++ ++ } else if (tmp2->lk_nodes[dep].ln_minor_key < minor) { ++ /* attach _after_ @tmp2 */ ++ list_add(&lck->lk_nodes[dep].ln_minor_list, ++ &tmp2->lk_nodes[dep].ln_minor_list); ++ goto out_grant_minor; ++ } ++ } ++ ++ list_add(&lck->lk_nodes[dep].ln_minor_list, &list); ++ ++ } else { ++ list_for_each_entry(tmp2, &list, ++ lk_nodes[dep].ln_minor_list) { ++ if (tmp2->lk_nodes[dep].ln_minor_key == minor) { ++ goto out_enqueue; ++ ++ } else if (tmp2->lk_nodes[dep].ln_minor_key > minor) { ++ /* insert _before_ @tmp2 */ ++ list_add_tail(&lck->lk_nodes[dep].ln_minor_list, ++ &tmp2->lk_nodes[dep].ln_minor_list); ++ goto out_grant_minor; ++ } ++ } ++ ++ list_add_tail(&lck->lk_nodes[dep].ln_minor_list, &list); ++ } ++ ++ out_grant_minor: ++ if (list.next == &lck->lk_nodes[dep].ln_minor_list) { ++ /* new lock @lck is the first one on minor_key list, which ++ * means it has the smallest minor_key and it should ++ * replace @tmp as minor_key owner */ ++ list_replace_init(&tmp->lk_nodes[dep].ln_major_list, ++ &lck->lk_nodes[dep].ln_major_list); ++ } ++ /* remove the temporary head */ ++ list_del(&list); ++ ++ out_grant_major: ++ ln_grant_inc(dep, lck->lk_nodes[dep].ln_mode); ++ return 1; /* granted with holding lh_lock */ ++ ++ out_enqueue: ++ list_del(&list); /* remove temprary head */ ++ return htree_node_lock_enqueue(lck, tmp2, dep, wait, event); ++} ++ ++/* ++ * release the key of @lck at level @dep, and grant any blocked locks. ++ * caller will still listen on @key if @event is not NULL, which means ++ * caller can see a event (by event_cb) while granting any lock with ++ * the same key at level @dep. ++ * NB: ALWAYS called holding lhead::lh_lock ++ * NB: listener will not block anyone because listening mode is HTREE_LOCK_NL ++ */ ++static void ++htree_node_unlock_internal(struct htree_lock_head *lhead, ++ struct htree_lock *curlk, unsigned dep, void *event) ++{ ++ struct htree_lock_node *curln = &curlk->lk_nodes[dep]; ++ struct htree_lock *grtlk = NULL; ++ struct htree_lock_node *grtln; ++ struct htree_lock *poslk; ++ struct htree_lock *tmplk; ++ ++ if (!htree_node_is_granted(curlk, dep)) ++ return; ++ ++ if (!list_empty(&curln->ln_granted_list)) { ++ /* there is another granted lock */ ++ grtlk = list_entry(curln->ln_granted_list.next, ++ struct htree_lock, ++ lk_nodes[dep].ln_granted_list); ++ list_del_init(&curln->ln_granted_list); ++ } ++ ++ if (grtlk == NULL && !list_empty(&curln->ln_blocked_list)) { ++ /* ++ * @curlk is the only granted lock, so we confirmed: ++ * a) curln is key owner (attached on major/minor_list), ++ * so if there is any blocked lock, it should be attached ++ * on curln->ln_blocked_list ++ * b) we always can grant the first blocked lock ++ */ ++ grtlk = list_entry(curln->ln_blocked_list.next, ++ struct htree_lock, ++ lk_nodes[dep].ln_blocked_list); ++ BUG_ON(grtlk->lk_task == NULL); ++ wake_up_process(grtlk->lk_task); ++ } ++ ++ if (event != NULL && ++ lhead->lh_children[dep].lc_events != HTREE_EVENT_DISABLE) { ++ curln->ln_ev_target = event; ++ curln->ln_mode = HTREE_LOCK_NL; /* listen! 
*/ ++ } else { ++ curln->ln_mode = HTREE_LOCK_INVAL; ++ } ++ ++ if (grtlk == NULL) { /* I must be the only one locking this key */ ++ struct htree_lock_node *tmpln; ++ ++ BUG_ON(htree_key_list_empty(curln)); ++ ++ if (curln->ln_mode == HTREE_LOCK_NL) /* listening */ ++ return; ++ ++ /* not listening */ ++ if (list_empty(&curln->ln_alive_list)) { /* no more listener */ ++ htree_key_list_del_init(curln); ++ return; ++ } ++ ++ tmpln = list_entry(curln->ln_alive_list.next, ++ struct htree_lock_node, ln_alive_list); ++ ++ BUG_ON(tmpln->ln_mode != HTREE_LOCK_NL); ++ ++ htree_key_list_replace_init(curln, tmpln); ++ list_del_init(&curln->ln_alive_list); ++ ++ return; ++ } ++ ++ /* have a granted lock */ ++ grtln = &grtlk->lk_nodes[dep]; ++ if (!list_empty(&curln->ln_blocked_list)) { ++ /* only key owner can be on both lists */ ++ BUG_ON(htree_key_list_empty(curln)); ++ ++ if (list_empty(&grtln->ln_blocked_list)) { ++ list_add(&grtln->ln_blocked_list, ++ &curln->ln_blocked_list); ++ } ++ list_del_init(&curln->ln_blocked_list); ++ } ++ /* ++ * NB: this is the tricky part: ++ * We have only two modes for child-lock (PR and PW), also, ++ * only owner of the key (attached on major/minor_list) can be on ++ * both blocked_list and granted_list, so @grtlk must be one ++ * of these two cases: ++ * ++ * a) @grtlk is taken from granted_list, which means we've granted ++ * more than one lock so @grtlk has to be PR, the first blocked ++ * lock must be PW and we can't grant it at all. ++ * So even @grtlk is not owner of the key (empty blocked_list), ++ * we don't care because we can't grant any lock. ++ * b) we just grant a new lock which is taken from head of blocked ++ * list, and it should be the first granted lock, and it should ++ * be the first one linked on blocked_list. ++ * ++ * Either way, we can get correct result by iterating blocked_list ++ * of @grtlk, and don't have to bother on how to find out ++ * owner of current key. 
++ */ ++ list_for_each_entry_safe(poslk, tmplk, &grtln->ln_blocked_list, ++ lk_nodes[dep].ln_blocked_list) { ++ if (grtlk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW || ++ poslk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW) ++ break; ++ /* grant all readers */ ++ list_del_init(&poslk->lk_nodes[dep].ln_blocked_list); ++ list_add(&poslk->lk_nodes[dep].ln_granted_list, ++ &grtln->ln_granted_list); ++ ++ BUG_ON(poslk->lk_task == NULL); ++ wake_up_process(poslk->lk_task); ++ } ++ ++ /* if @curln is the owner of this key, replace it with @grtln */ ++ if (!htree_key_list_empty(curln)) ++ htree_key_list_replace_init(curln, grtln); ++ ++ if (curln->ln_mode == HTREE_LOCK_INVAL) ++ list_del_init(&curln->ln_alive_list); ++} ++ ++/* ++ * it's just wrapper of htree_node_lock_internal, it returns 1 on granted ++ * and 0 only if @wait is false and can't grant it immediately ++ */ ++int ++htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, ++ u32 key, unsigned dep, int wait, void *event) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ int rc; ++ ++ BUG_ON(dep >= lck->lk_depth); ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); ++ ++ htree_spin_lock(lhead, dep); ++ rc = htree_node_lock_internal(lhead, lck, mode, key, dep, wait, event); ++ if (rc != 0) ++ htree_spin_unlock(lhead, dep); ++ return rc >= 0; ++} ++EXPORT_SYMBOL(htree_node_lock_try); ++ ++/* it's wrapper of htree_node_unlock_internal */ ++void ++htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ ++ BUG_ON(dep >= lck->lk_depth); ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); ++ ++ htree_spin_lock(lhead, dep); ++ htree_node_unlock_internal(lhead, lck, dep, event); ++ htree_spin_unlock(lhead, dep); ++} ++EXPORT_SYMBOL(htree_node_unlock); ++ ++/* stop listening on child-lock level @dep */ ++void ++htree_node_stop_listen(struct htree_lock *lck, unsigned dep) ++{ ++ struct htree_lock_node *ln = &lck->lk_nodes[dep]; ++ struct htree_lock_node *tmp; ++ ++ BUG_ON(htree_node_is_granted(lck, dep)); ++ BUG_ON(!list_empty(&ln->ln_blocked_list)); ++ BUG_ON(!list_empty(&ln->ln_granted_list)); ++ ++ if (!htree_node_is_listening(lck, dep)) ++ return; ++ ++ htree_spin_lock(lck->lk_head, dep); ++ ln->ln_mode = HTREE_LOCK_INVAL; ++ ln->ln_ev_target = NULL; ++ ++ if (htree_key_list_empty(ln)) { /* not owner */ ++ list_del_init(&ln->ln_alive_list); ++ goto out; ++ } ++ ++ /* I'm the owner... */ ++ if (list_empty(&ln->ln_alive_list)) { /* no more listener */ ++ htree_key_list_del_init(ln); ++ goto out; ++ } ++ ++ tmp = list_entry(ln->ln_alive_list.next, ++ struct htree_lock_node, ln_alive_list); ++ ++ BUG_ON(tmp->ln_mode != HTREE_LOCK_NL); ++ htree_key_list_replace_init(ln, tmp); ++ list_del_init(&ln->ln_alive_list); ++ out: ++ htree_spin_unlock(lck->lk_head, dep); ++} ++EXPORT_SYMBOL(htree_node_stop_listen); ++ ++/* release all child-locks if we have any */ ++static void ++htree_node_release_all(struct htree_lock *lck) ++{ ++ int i; ++ ++ for (i = 0; i < lck->lk_depth; i++) { ++ if (htree_node_is_granted(lck, i)) ++ htree_node_unlock(lck, i, NULL); ++ else if (htree_node_is_listening(lck, i)) ++ htree_node_stop_listen(lck, i); ++ } ++} ++ ++/* ++ * obtain htree lock, it could be blocked inside if there's conflict ++ * with any granted or blocked lock and @wait is true. 
++ * NB: ALWAYS called holding lhead::lh_lock ++ */ ++static int ++htree_lock_internal(struct htree_lock *lck, int wait) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ int granted = 0; ++ int blocked = 0; ++ int i; ++ ++ for (i = 0; i < HTREE_LOCK_MAX; i++) { ++ if (lhead->lh_ngranted[i] != 0) ++ granted |= 1 << i; ++ if (lhead->lh_nblocked[i] != 0) ++ blocked |= 1 << i; ++ } ++ if ((htree_lock_compat[lck->lk_mode] & granted) != granted || ++ (htree_lock_compat[lck->lk_mode] & blocked) != blocked) { ++ /* will block current lock even it just conflicts with any ++ * other blocked lock, so lock like EX wouldn't starve */ ++ if (!wait) ++ return -1; ++ lhead->lh_nblocked[lck->lk_mode]++; ++ lk_block_inc(lck->lk_mode); ++ ++ lck->lk_task = current; ++ list_add_tail(&lck->lk_blocked_list, &lhead->lh_blocked_list); ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ htree_spin_unlock(lhead, HTREE_DEP_ROOT); ++ /* wait to be given the lock */ ++ if (lck->lk_task != NULL) ++ schedule(); ++ /* granted, no doubt. wake up will set me RUNNING */ ++ return 0; /* without lh_lock */ ++ } ++ lhead->lh_ngranted[lck->lk_mode]++; ++ lk_grant_inc(lck->lk_mode); ++ return 1; ++} ++ ++/* release htree lock. NB: ALWAYS called holding lhead::lh_lock */ ++static void ++htree_unlock_internal(struct htree_lock *lck) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ struct htree_lock *tmp; ++ struct htree_lock *tmp2; ++ int granted = 0; ++ int i; ++ ++ BUG_ON(lhead->lh_ngranted[lck->lk_mode] == 0); ++ ++ lhead->lh_ngranted[lck->lk_mode]--; ++ lck->lk_mode = HTREE_LOCK_INVAL; ++ ++ for (i = 0; i < HTREE_LOCK_MAX; i++) { ++ if (lhead->lh_ngranted[i] != 0) ++ granted |= 1 << i; ++ } ++ list_for_each_entry_safe(tmp, tmp2, ++ &lhead->lh_blocked_list, lk_blocked_list) { ++ /* conflict with any granted lock? */ ++ if ((htree_lock_compat[tmp->lk_mode] & granted) != granted) ++ break; ++ ++ list_del_init(&tmp->lk_blocked_list); ++ ++ BUG_ON(lhead->lh_nblocked[tmp->lk_mode] == 0); ++ ++ lhead->lh_nblocked[tmp->lk_mode]--; ++ lhead->lh_ngranted[tmp->lk_mode]++; ++ granted |= 1 << tmp->lk_mode; ++ ++ BUG_ON(tmp->lk_task == NULL); ++ wake_up_process(tmp->lk_task); ++ } ++} ++ ++/* it's wrapper of htree_lock_internal and exported interface. ++ * It always return 1 with granted lock if @wait is true, it can return 0 ++ * if @wait is false and locking request can't be granted immediately */ ++int ++htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead, ++ htree_lock_mode_t mode, int wait) ++{ ++ int rc; ++ ++ BUG_ON(lck->lk_depth > lhead->lh_depth); ++ BUG_ON(lck->lk_head != NULL); ++ BUG_ON(lck->lk_task != NULL); ++ ++ lck->lk_head = lhead; ++ lck->lk_mode = mode; ++ ++ htree_spin_lock(lhead, HTREE_DEP_ROOT); ++ rc = htree_lock_internal(lck, wait); ++ if (rc != 0) ++ htree_spin_unlock(lhead, HTREE_DEP_ROOT); ++ return rc >= 0; ++} ++EXPORT_SYMBOL(htree_lock_try); ++ ++/* it's wrapper of htree_unlock_internal and exported interface. 
++ * It will release all htree_node_locks and htree_lock */ ++void ++htree_unlock(struct htree_lock *lck) ++{ ++ BUG_ON(lck->lk_head == NULL); ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); ++ ++ htree_node_release_all(lck); ++ ++ htree_spin_lock(lck->lk_head, HTREE_DEP_ROOT); ++ htree_unlock_internal(lck); ++ htree_spin_unlock(lck->lk_head, HTREE_DEP_ROOT); ++ lck->lk_head = NULL; ++ lck->lk_task = NULL; ++} ++EXPORT_SYMBOL(htree_unlock); ++ ++/* change lock mode */ ++void ++htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode) ++{ ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); ++ lck->lk_mode = mode; ++} ++EXPORT_SYMBOL(htree_change_mode); ++ ++/* release htree lock, and lock it again with new mode. ++ * This function will first release all htree_node_locks and htree_lock, ++ * then try to gain htree_lock with new @mode. ++ * It always return 1 with granted lock if @wait is true, it can return 0 ++ * if @wait is false and locking request can't be granted immediately */ ++int ++htree_change_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, int wait) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ int rc; ++ ++ BUG_ON(lhead == NULL); ++ BUG_ON(lck->lk_mode == mode); ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL || mode == HTREE_LOCK_INVAL); ++ ++ htree_node_release_all(lck); ++ ++ htree_spin_lock(lhead, HTREE_DEP_ROOT); ++ htree_unlock_internal(lck); ++ lck->lk_mode = mode; ++ rc = htree_lock_internal(lck, wait); ++ if (rc != 0) ++ htree_spin_unlock(lhead, HTREE_DEP_ROOT); ++ return rc >= 0; ++} ++EXPORT_SYMBOL(htree_change_lock_try); ++ ++/* create a htree_lock head with @depth levels (number of child-locks), ++ * it is a per resoruce structure */ ++struct htree_lock_head * ++htree_lock_head_alloc(unsigned depth, unsigned hbits, unsigned priv) ++{ ++ struct htree_lock_head *lhead; ++ int i; ++ ++ if (depth > HTREE_LOCK_DEP_MAX) { ++ printk(KERN_ERR "%d is larger than max htree_lock depth %d\n", ++ depth, HTREE_LOCK_DEP_MAX); ++ return NULL; ++ } ++ ++ lhead = kzalloc(offsetof(struct htree_lock_head, ++ lh_children[depth]) + priv, GFP_NOFS); ++ if (lhead == NULL) ++ return NULL; ++ ++ if (hbits < HTREE_HBITS_MIN) ++ lhead->lh_hbits = HTREE_HBITS_MIN; ++ else if (hbits > HTREE_HBITS_MAX) ++ lhead->lh_hbits = HTREE_HBITS_MAX; ++ ++ lhead->lh_lock = 0; ++ lhead->lh_depth = depth; ++ INIT_LIST_HEAD(&lhead->lh_blocked_list); ++ if (priv > 0) { ++ lhead->lh_private = (void *)lhead + ++ offsetof(struct htree_lock_head, lh_children[depth]); ++ } ++ ++ for (i = 0; i < depth; i++) { ++ INIT_LIST_HEAD(&lhead->lh_children[i].lc_list); ++ lhead->lh_children[i].lc_events = HTREE_EVENT_DISABLE; ++ } ++ return lhead; ++} ++EXPORT_SYMBOL(htree_lock_head_alloc); ++ ++/* free the htree_lock head */ ++void ++htree_lock_head_free(struct htree_lock_head *lhead) ++{ ++ int i; ++ ++ BUG_ON(!list_empty(&lhead->lh_blocked_list)); ++ for (i = 0; i < lhead->lh_depth; i++) ++ BUG_ON(!list_empty(&lhead->lh_children[i].lc_list)); ++ kfree(lhead); ++} ++EXPORT_SYMBOL(htree_lock_head_free); ++ ++/* register event callback for @events of child-lock at level @dep */ ++void ++htree_lock_event_attach(struct htree_lock_head *lhead, unsigned dep, ++ unsigned events, htree_event_cb_t callback) ++{ ++ BUG_ON(lhead->lh_depth <= dep); ++ lhead->lh_children[dep].lc_events = events; ++ lhead->lh_children[dep].lc_callback = callback; ++} ++EXPORT_SYMBOL(htree_lock_event_attach); ++ ++/* allocate a htree_lock, which is per-thread structure, @pbytes is some ++ * extra-bytes as private data for caller */ ++struct 
htree_lock * ++htree_lock_alloc(unsigned depth, unsigned pbytes) ++{ ++ struct htree_lock *lck; ++ int i = offsetof(struct htree_lock, lk_nodes[depth]); ++ ++ if (depth > HTREE_LOCK_DEP_MAX) { ++ printk(KERN_ERR "%d is larger than max htree_lock depth %d\n", ++ depth, HTREE_LOCK_DEP_MAX); ++ return NULL; ++ } ++ lck = kzalloc(i + pbytes, GFP_NOFS); ++ if (lck == NULL) ++ return NULL; ++ ++ if (pbytes != 0) ++ lck->lk_private = (void *)lck + i; ++ lck->lk_mode = HTREE_LOCK_INVAL; ++ lck->lk_depth = depth; ++ INIT_LIST_HEAD(&lck->lk_blocked_list); ++ ++ for (i = 0; i < depth; i++) { ++ struct htree_lock_node *node = &lck->lk_nodes[i]; ++ ++ node->ln_mode = HTREE_LOCK_INVAL; ++ INIT_LIST_HEAD(&node->ln_major_list); ++ INIT_LIST_HEAD(&node->ln_minor_list); ++ INIT_LIST_HEAD(&node->ln_alive_list); ++ INIT_LIST_HEAD(&node->ln_blocked_list); ++ INIT_LIST_HEAD(&node->ln_granted_list); ++ } ++ ++ return lck; ++} ++EXPORT_SYMBOL(htree_lock_alloc); ++ ++/* free htree_lock node */ ++void ++htree_lock_free(struct htree_lock *lck) ++{ ++ BUG_ON(lck->lk_mode != HTREE_LOCK_INVAL); ++ kfree(lck); ++} ++EXPORT_SYMBOL(htree_lock_free); +Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/Makefile +=================================================================== +--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/Makefile ++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/Makefile +@@ -6,6 +6,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o + + ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ++ htree_lock.o \ + ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ + mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ + xattr_trusted.o inline.o +Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h +=================================================================== +--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/ext4.h ++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -821,6 +822,9 @@ struct ext4_inode_info { + __u32 i_dtime; + ext4_fsblk_t i_file_acl; + ++ /* following fields for parallel directory operations -bzzz */ ++ struct semaphore i_append_sem; ++ + /* + * i_block_group is the number of the block group which contains + * this file's inode. 
Constant across the lifetime of the inode, +@@ -1846,6 +1850,71 @@ struct dx_hash_info + */ + #define HASH_NB_ALWAYS 1 + ++/* assume name-hash is protected by upper layer */ ++#define EXT4_HTREE_LOCK_HASH 0 ++ ++enum ext4_pdo_lk_types { ++#if EXT4_HTREE_LOCK_HASH ++ EXT4_LK_HASH, ++#endif ++ EXT4_LK_DX, /* index block */ ++ EXT4_LK_DE, /* directory entry block */ ++ EXT4_LK_SPIN, /* spinlock */ ++ EXT4_LK_MAX, ++}; ++ ++/* read-only bit */ ++#define EXT4_LB_RO(b) (1 << (b)) ++/* read + write, high bits for writer */ ++#define EXT4_LB_RW(b) ((1 << (b)) | (1 << (EXT4_LK_MAX + (b)))) ++ ++enum ext4_pdo_lock_bits { ++ /* DX lock bits */ ++ EXT4_LB_DX_RO = EXT4_LB_RO(EXT4_LK_DX), ++ EXT4_LB_DX = EXT4_LB_RW(EXT4_LK_DX), ++ /* DE lock bits */ ++ EXT4_LB_DE_RO = EXT4_LB_RO(EXT4_LK_DE), ++ EXT4_LB_DE = EXT4_LB_RW(EXT4_LK_DE), ++ /* DX spinlock bits */ ++ EXT4_LB_SPIN_RO = EXT4_LB_RO(EXT4_LK_SPIN), ++ EXT4_LB_SPIN = EXT4_LB_RW(EXT4_LK_SPIN), ++ /* accurate searching */ ++ EXT4_LB_EXACT = EXT4_LB_RO(EXT4_LK_MAX << 1), ++}; ++ ++enum ext4_pdo_lock_opc { ++ /* external */ ++ EXT4_HLOCK_READDIR = (EXT4_LB_DE_RO | EXT4_LB_DX_RO), ++ EXT4_HLOCK_LOOKUP = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_DEL = (EXT4_LB_DE | EXT4_LB_SPIN_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_ADD = (EXT4_LB_DE | EXT4_LB_SPIN_RO), ++ ++ /* internal */ ++ EXT4_HLOCK_LOOKUP_SAFE = (EXT4_LB_DE_RO | EXT4_LB_DX_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_DEL_SAFE = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT), ++ EXT4_HLOCK_SPLIT = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN), ++}; ++ ++extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits); ++#define ext4_htree_lock_head_free(lhead) htree_lock_head_free(lhead) ++ ++extern struct htree_lock *ext4_htree_lock_alloc(void); ++#define ext4_htree_lock_free(lck) htree_lock_free(lck) ++ ++extern void ext4_htree_lock(struct htree_lock *lck, ++ struct htree_lock_head *lhead, ++ struct inode *dir, unsigned flags); ++#define ext4_htree_unlock(lck) htree_unlock(lck) ++ ++extern struct buffer_head *__ext4_find_entry(struct inode *dir, ++ const struct qstr *d_name, ++ struct ext4_dir_entry_2 **res_dir, ++ int *inlined, struct htree_lock *lck); ++extern int __ext4_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct htree_lock *lck); + + /* + * Describe an inode's exact location on disk and in memory +@@ -2088,9 +2157,17 @@ void ext4_insert_dentry(struct inode *in + const char *name, int namelen, void *data); + static inline void ext4_update_dx_flag(struct inode *inode) + { ++ /* Disable it for ldiskfs, because going from a DX directory to ++ * a non-DX directory while it is in use will completely break ++ * the htree-locking. 
++ * If we really want to support this operation in the future, ++ * we need to exclusively lock the directory at here which will ++ * increase complexity of code */ ++#if 0 + if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX)) + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); ++#endif + } + static unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c +=================================================================== +--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/namei.c ++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c +@@ -53,6 +53,7 @@ struct buffer_head *ext4_append(handle_t + ext4_lblk_t *block) + { + struct buffer_head *bh; ++ struct ext4_inode_info *ei = EXT4_I(inode); + int err = 0; + + if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && +@@ -60,15 +61,22 @@ struct buffer_head *ext4_append(handle_t + EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) + return ERR_PTR(-ENOSPC); + ++ /* with parallel dir operations all appends ++ * have to be serialized -bzzz */ ++ down(&ei->i_append_sem); ++ + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + + bh = ext4_bread(handle, inode, *block, 1, &err); +- if (!bh) ++ if (!bh) { ++ up(&ei->i_append_sem); + return ERR_PTR(err); ++ } + inode->i_size += inode->i_sb->s_blocksize; + EXT4_I(inode)->i_disksize = inode->i_size; + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); ++ up(&ei->i_append_sem); + if (err) { + brelse(bh); + ext4_std_error(inode->i_sb, err); +@@ -246,7 +254,7 @@ static struct dx_frame *dx_probe(const s + struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame, +- int *err); ++ struct htree_lock *lck, int *err); + static void dx_release(struct dx_frame *frames); + static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); +@@ -259,13 +267,13 @@ static void dx_insert_block(struct dx_fr + static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, +- __u32 *start_hash); ++ __u32 *start_hash, struct htree_lock *lck); + static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, +- int *err); ++ struct htree_lock *lck, int *err); + static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode); ++ struct inode *inode, struct htree_lock *lck); + + /* checksumming functions */ + void initialize_dirent_tail(struct ext4_dir_entry_tail *t, +@@ -668,6 +676,227 @@ struct stats dx_show_entries(struct dx_h + } + #endif /* DX_DEBUG */ + ++/* private data for htree_lock */ ++struct ext4_dir_lock_data { ++ unsigned ld_flags; /* bits-map for lock types */ ++ unsigned ld_count; /* # entries of the last DX block */ ++ struct dx_entry ld_at_entry; /* copy of leaf dx_entry */ ++ struct dx_entry *ld_at; /* position of leaf dx_entry */ ++}; ++ ++#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private) ++#define ext4_find_entry(dir, name, dirent, inline) \ ++ __ext4_find_entry(dir, name, dirent, inline, NULL) ++#define ext4_add_entry(handle, dentry, inode) \ ++ __ext4_add_entry(handle, dentry, inode, NULL) ++ ++/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */ ++#define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32) ++ ++static void ext4_htree_event_cb(void *target, void 
*event) ++{ ++ u64 *block = (u64 *)target; ++ ++ if (*block == dx_get_block((struct dx_entry *)event)) ++ *block = EXT4_HTREE_NODE_CHANGED; ++} ++ ++struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits) ++{ ++ struct htree_lock_head *lhead; ++ ++ lhead = htree_lock_head_alloc(EXT4_LK_MAX, hbits, 0); ++ if (lhead != NULL) { ++ htree_lock_event_attach(lhead, EXT4_LK_SPIN, HTREE_EVENT_WR, ++ ext4_htree_event_cb); ++ } ++ return lhead; ++} ++EXPORT_SYMBOL(ext4_htree_lock_head_alloc); ++ ++struct htree_lock *ext4_htree_lock_alloc(void) ++{ ++ return htree_lock_alloc(EXT4_LK_MAX, ++ sizeof(struct ext4_dir_lock_data)); ++} ++EXPORT_SYMBOL(ext4_htree_lock_alloc); ++ ++static htree_lock_mode_t ext4_htree_mode(unsigned flags) ++{ ++ switch (flags) { ++ default: /* 0 or unknown flags require EX lock */ ++ return HTREE_LOCK_EX; ++ case EXT4_HLOCK_READDIR: ++ return HTREE_LOCK_PR; ++ case EXT4_HLOCK_LOOKUP: ++ return HTREE_LOCK_CR; ++ case EXT4_HLOCK_DEL: ++ case EXT4_HLOCK_ADD: ++ return HTREE_LOCK_CW; ++ } ++} ++ ++/* return PR for read-only operations, otherwise return EX */ ++static inline htree_lock_mode_t ext4_htree_safe_mode(unsigned flags) ++{ ++ int writer = (flags & EXT4_LB_DE) == EXT4_LB_DE; ++ ++ /* 0 requires EX lock */ ++ return (flags == 0 || writer) ? HTREE_LOCK_EX : HTREE_LOCK_PR; ++} ++ ++static int ext4_htree_safe_locked(struct htree_lock *lck) ++{ ++ int writer; ++ ++ if (lck == NULL || lck->lk_mode == HTREE_LOCK_EX) ++ return 1; ++ ++ writer = (ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_DE) == ++ EXT4_LB_DE; ++ if (writer) /* all readers & writers are excluded? */ ++ return lck->lk_mode == HTREE_LOCK_EX; ++ ++ /* all writers are excluded? */ ++ return lck->lk_mode == HTREE_LOCK_PR || ++ lck->lk_mode == HTREE_LOCK_PW || ++ lck->lk_mode == HTREE_LOCK_EX; ++} ++ ++/* relock htree_lock with EX mode if it's change operation, otherwise ++ * relock it with PR mode. It's noop if PDO is disabled. */ ++static void ext4_htree_safe_relock(struct htree_lock *lck) ++{ ++ if (!ext4_htree_safe_locked(lck)) { ++ unsigned flags = ext4_htree_lock_data(lck)->ld_flags; ++ ++ htree_change_lock(lck, ext4_htree_safe_mode(flags)); ++ } ++} ++ ++void ext4_htree_lock(struct htree_lock *lck, struct htree_lock_head *lhead, ++ struct inode *dir, unsigned flags) ++{ ++ htree_lock_mode_t mode = is_dx(dir) ? ext4_htree_mode(flags) : ++ ext4_htree_safe_mode(flags); ++ ++ ext4_htree_lock_data(lck)->ld_flags = flags; ++ htree_lock(lck, lhead, mode); ++ if (!is_dx(dir)) ++ ext4_htree_safe_relock(lck); /* make sure it's safe locked */ ++} ++EXPORT_SYMBOL(ext4_htree_lock); ++ ++static int ext4_htree_node_lock(struct htree_lock *lck, struct dx_entry *at, ++ unsigned lmask, int wait, void *ev) ++{ ++ u32 key = (at == NULL) ? 0 : dx_get_block(at); ++ u32 mode; ++ ++ /* NOOP if htree is well protected or caller doesn't require the lock */ ++ if (ext4_htree_safe_locked(lck) || ++ !(ext4_htree_lock_data(lck)->ld_flags & lmask)) ++ return 1; ++ ++ mode = (ext4_htree_lock_data(lck)->ld_flags & lmask) == lmask ? 
++ HTREE_LOCK_PW : HTREE_LOCK_PR; ++ while (1) { ++ if (htree_node_lock_try(lck, mode, key, ffz(~lmask), wait, ev)) ++ return 1; ++ if (!(lmask & EXT4_LB_SPIN)) /* not a spinlock */ ++ return 0; ++ cpu_relax(); /* spin until granted */ ++ } ++} ++ ++static int ext4_htree_node_locked(struct htree_lock *lck, unsigned lmask) ++{ ++ return ext4_htree_safe_locked(lck) || ++ htree_node_is_granted(lck, ffz(~lmask)); ++} ++ ++static void ext4_htree_node_unlock(struct htree_lock *lck, ++ unsigned lmask, void *buf) ++{ ++ /* NB: it's safe to call mutiple times or even it's not locked */ ++ if (!ext4_htree_safe_locked(lck) && ++ htree_node_is_granted(lck, ffz(~lmask))) ++ htree_node_unlock(lck, ffz(~lmask), buf); ++} ++ ++#define ext4_htree_dx_lock(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 1, NULL) ++#define ext4_htree_dx_lock_try(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 0, NULL) ++#define ext4_htree_dx_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_DX, NULL) ++#define ext4_htree_dx_locked(lck) \ ++ ext4_htree_node_locked(lck, EXT4_LB_DX) ++ ++static void ext4_htree_dx_need_lock(struct htree_lock *lck) ++{ ++ struct ext4_dir_lock_data *ld; ++ ++ if (ext4_htree_safe_locked(lck)) ++ return; ++ ++ ld = ext4_htree_lock_data(lck); ++ switch (ld->ld_flags) { ++ default: ++ return; ++ case EXT4_HLOCK_LOOKUP: ++ ld->ld_flags = EXT4_HLOCK_LOOKUP_SAFE; ++ return; ++ case EXT4_HLOCK_DEL: ++ ld->ld_flags = EXT4_HLOCK_DEL_SAFE; ++ return; ++ case EXT4_HLOCK_ADD: ++ ld->ld_flags = EXT4_HLOCK_SPLIT; ++ return; ++ } ++} ++ ++#define ext4_htree_de_lock(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DE, 1, NULL) ++#define ext4_htree_de_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_DE, NULL) ++ ++#define ext4_htree_spin_lock(lck, key, event) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_SPIN, 0, event) ++#define ext4_htree_spin_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, NULL) ++#define ext4_htree_spin_unlock_listen(lck, p) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, p) ++ ++static void ext4_htree_spin_stop_listen(struct htree_lock *lck) ++{ ++ if (!ext4_htree_safe_locked(lck) && ++ htree_node_is_listening(lck, ffz(~EXT4_LB_SPIN))) ++ htree_node_stop_listen(lck, ffz(~EXT4_LB_SPIN)); ++} ++ ++enum { ++ DX_HASH_COL_IGNORE, /* ignore collision while probing frames */ ++ DX_HASH_COL_YES, /* there is collision and it does matter */ ++ DX_HASH_COL_NO, /* there is no collision */ ++}; ++ ++static int dx_probe_hash_collision(struct htree_lock *lck, ++ struct dx_entry *entries, ++ struct dx_entry *at, u32 hash) ++{ ++ if (!(ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_EXACT)) { ++ return DX_HASH_COL_IGNORE; /* don't care about collision */ ++ ++ } else if (at == entries + dx_get_count(entries) - 1) { ++ return DX_HASH_COL_IGNORE; /* not in any leaf of this DX */ ++ ++ } else { /* hash collision? */ ++ return ((dx_get_hash(at + 1) & ~1) == hash) ? ++ DX_HASH_COL_YES : DX_HASH_COL_NO; ++ } ++} ++ + /* + * Probe for a directory leaf block to search. 
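[Review note: dx_probe_hash_collision() above only matters for EXACT lookups; otherwise collisions are ignored. The point is that htree index entry i covers the hash range [hash(i), hash(i+1)) and bit 0 of a stored hash marks a continuation block, so the next entry's hash is masked before comparing. A small self-contained model, with made-up hash values:]

#include <stdio.h>
#include <stdint.h>

/* does the entry after @at possibly hold more entries with @hash? */
static int may_collide(const uint32_t *hashes, int count, int at, uint32_t hash)
{
        if (at == count - 1)
                return 0;       /* last entry: a collision can't spill out */
        return (hashes[at + 1] & ~1u) == hash;
}

int main(void)
{
        uint32_t hashes[] = { 0x100, 0x2000, 0x2001 }; /* 0x2001: continuation */

        printf("%d\n", may_collide(hashes, 3, 1, 0x2000)); /* 1: same hash spills */
        printf("%d\n", may_collide(hashes, 3, 0, 0x0100)); /* 0: no spill */
        return 0;
}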
+ * +@@ -679,10 +908,11 @@ struct stats dx_show_entries(struct dx_h + */ + static struct dx_frame * + dx_probe(const struct qstr *d_name, struct inode *dir, +- struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) ++ struct dx_hash_info *hinfo, struct dx_frame *frame_in, ++ struct htree_lock *lck, int *err) + { + unsigned count, indirect; +- struct dx_entry *at, *entries, *p, *q, *m; ++ struct dx_entry *at, *entries, *p, *q, *m, *dx = NULL; + struct dx_root_info *info; + struct buffer_head *bh; + struct dx_frame *frame = frame_in; +@@ -750,8 +980,15 @@ dx_probe(const struct qstr *d_name, stru + dxtrace(printk("Look up %x", hash)); + while (1) + { ++ if (indirect == 0) { /* the last index level */ ++ /* NB: ext4_htree_dx_lock() could be noop if ++ * DX-lock flag is not set for current operation */ ++ ext4_htree_dx_lock(lck, dx); ++ ext4_htree_spin_lock(lck, dx, NULL); ++ } + count = dx_get_count(entries); +- if (!count || count > dx_get_limit(entries)) { ++ if (count == 0 || count > dx_get_limit(entries)) { ++ ext4_htree_spin_unlock(lck); /* release spin */ + ext4_warning(dir->i_sb, + "dx entry: no count or count > limit"); + brelse(bh); +@@ -792,7 +1029,70 @@ dx_probe(const struct qstr *d_name, stru + frame->bh = bh; + frame->entries = entries; + frame->at = at; +- if (!indirect--) return frame; ++ ++ if (indirect == 0) { /* the last index level */ ++ struct ext4_dir_lock_data *ld; ++ u64 myblock; ++ ++ /* By default we only lock DE-block, however, we will ++ * also lock the last level DX-block if: ++ * a) there is hash collision ++ * we will set DX-lock flag (a few lines below) ++ * and redo to lock DX-block ++ * see detail in dx_probe_hash_collision() ++ * b) it's a retry from splitting ++ * we need to lock the last level DX-block so nobody ++ * else can split any leaf blocks under the same ++ * DX-block, see detail in ext4_dx_add_entry() ++ */ ++ if (ext4_htree_dx_locked(lck)) { ++ /* DX-block is locked, just lock DE-block ++ * and return */ ++ ext4_htree_spin_unlock(lck); ++ if (!ext4_htree_safe_locked(lck)) ++ ext4_htree_de_lock(lck, frame->at); ++ return frame; ++ } ++ /* it's pdirop and no DX lock */ ++ if (dx_probe_hash_collision(lck, entries, at, hash) == ++ DX_HASH_COL_YES) { ++ /* found hash collision, set DX-lock flag ++ * and retry to abtain DX-lock */ ++ ext4_htree_spin_unlock(lck); ++ ext4_htree_dx_need_lock(lck); ++ continue; ++ } ++ ld = ext4_htree_lock_data(lck); ++ /* because I don't lock DX, so @at can't be trusted ++ * after I release spinlock so I have to save it */ ++ ld->ld_at = at; ++ ld->ld_at_entry = *at; ++ ld->ld_count = dx_get_count(entries); ++ ++ frame->at = &ld->ld_at_entry; ++ myblock = dx_get_block(at); ++ ++ /* NB: ordering locking */ ++ ext4_htree_spin_unlock_listen(lck, &myblock); ++ /* other thread can split this DE-block because: ++ * a) I don't have lock for the DE-block yet ++ * b) I released spinlock on DX-block ++ * if it happened I can detect it by listening ++ * splitting event on this DE-block */ ++ ext4_htree_de_lock(lck, frame->at); ++ ext4_htree_spin_stop_listen(lck); ++ ++ if (myblock == EXT4_HTREE_NODE_CHANGED) { ++ /* someone split this DE-block before ++ * I locked it, I need to retry and lock ++ * valid DE-block */ ++ ext4_htree_de_unlock(lck); ++ continue; ++ } ++ return frame; ++ } ++ dx = at; ++ indirect--; + bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); + if (IS_ERR(bh)) { + *err = PTR_ERR(bh); +@@ -860,7 +1160,7 @@ static void dx_release (struct dx_frame + static int ext4_htree_next_block(struct inode *dir, 
__u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, +- __u32 *start_hash) ++ __u32 *start_hash, struct htree_lock *lck) + { + struct dx_frame *p; + struct buffer_head *bh; +@@ -875,12 +1175,22 @@ static int ext4_htree_next_block(struct + * this loop, num_frames indicates the number of interior + * nodes need to be read. + */ ++ ext4_htree_de_unlock(lck); + while (1) { +- if (++(p->at) < p->entries + dx_get_count(p->entries)) +- break; ++ if (num_frames > 0 || ext4_htree_dx_locked(lck)) { ++ /* num_frames > 0 : ++ * DX block ++ * ext4_htree_dx_locked: ++ * frame->at is reliable pointer returned by dx_probe, ++ * otherwise dx_probe already knew no collision */ ++ if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ break; ++ } + if (p == frames) + return 0; + num_frames++; ++ if (num_frames == 1) ++ ext4_htree_dx_unlock(lck); + p--; + } + +@@ -903,6 +1213,13 @@ static int ext4_htree_next_block(struct + * block so no check is necessary + */ + while (num_frames--) { ++ if (num_frames == 0) { ++ /* it's not always necessary, we just don't want to ++ * detect hash collision again */ ++ ext4_htree_dx_need_lock(lck); ++ ext4_htree_dx_lock(lck, p->at); ++ } ++ + bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX); + if (IS_ERR(bh)) + return PTR_ERR(bh); +@@ -911,6 +1228,7 @@ static int ext4_htree_next_block(struct + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; + } ++ ext4_htree_de_lock(lck, p->at); + return 1; + } + +@@ -1013,10 +1331,10 @@ int ext4_htree_fill_tree(struct file *di + } + hinfo.hash = start_hash; + hinfo.minor_hash = 0; +- frame = dx_probe(NULL, dir, &hinfo, frames, &err); ++ /* assume it's PR locked */ ++ frame = dx_probe(NULL, dir, &hinfo, frames, NULL, &err); + if (!frame) + return err; +- + /* Add '.' and '..' from the htree header */ + if (!start_hash && !start_minor_hash) { + de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; +@@ -1043,7 +1361,7 @@ int ext4_htree_fill_tree(struct file *di + count += ret; + hashval = ~0; + ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, +- frame, frames, &hashval); ++ frame, frames, &hashval, NULL); + *next_hash = hashval; + if (ret < 0) { + err = ret; +@@ -1236,10 +1554,10 @@ static int is_dx_internal_node(struct in + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ +-static struct buffer_head * ext4_find_entry (struct inode *dir, ++struct buffer_head *__ext4_find_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, +- int *inlined) ++ int *inlined, struct htree_lock *lck) + { + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; +@@ -1283,7 +1601,7 @@ static struct buffer_head * ext4_find_en + goto restart; + } + if (is_dx(dir)) { +- bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); ++ bh = ext4_dx_find_entry(dir, d_name, res_dir, lck, &err); + /* + * On success, or if the error was file not found, + * return. 
Otherwise, fall back to doing a search the +@@ -1297,6 +1615,7 @@ static struct buffer_head * ext4_find_en + return bh; + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); ++ ext4_htree_safe_relock(lck); + } + nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + start = EXT4_I(dir)->i_dir_start_lookup; +@@ -1389,9 +1708,12 @@ cleanup_and_exit: + brelse(bh_use[ra_ptr]); + return ret; + } ++EXPORT_SYMBOL(__ext4_find_entry); + +-static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, +- struct ext4_dir_entry_2 **res_dir, int *err) ++static struct buffer_head *ext4_dx_find_entry(struct inode *dir, ++ const struct qstr *d_name, ++ struct ext4_dir_entry_2 **res_dir, ++ struct htree_lock *lck, int *err) + { + struct super_block * sb = dir->i_sb; + struct dx_hash_info hinfo; +@@ -1400,7 +1722,7 @@ static struct buffer_head * ext4_dx_find + ext4_lblk_t block; + int retval; + +- if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) ++ if (!(frame = dx_probe(d_name, dir, &hinfo, frames, lck, err))) + return NULL; + do { + block = dx_get_block(frame->at); +@@ -1424,7 +1746,7 @@ static struct buffer_head * ext4_dx_find + + /* Check to see if we should continue to search */ + retval = ext4_htree_next_block(dir, hinfo.hash, frame, +- frames, NULL); ++ frames, NULL, lck); + if (retval < 0) { + ext4_warning(sb, + "error reading index page in directory #%lu", +@@ -1583,8 +1905,9 @@ static struct ext4_dir_entry_2* dx_pack_ + * Returns pointer to de in block into which the new entry will be inserted. + */ + static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, +- struct buffer_head **bh,struct dx_frame *frame, +- struct dx_hash_info *hinfo, int *error) ++ struct buffer_head **bh, struct dx_frame *frames, ++ struct dx_frame *frame, struct dx_hash_info *hinfo, ++ struct htree_lock *lck, int *error) + { + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; +@@ -1647,7 +1970,14 @@ static struct ext4_dir_entry_2 *do_split + hash2, split, count-split)); + + /* Fancy dance to stay within two buffers */ +- de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); ++ if (hinfo->hash < hash2) { ++ de2 = dx_move_dirents(data1, data2, map + split, ++ count - split, blocksize); ++ } else { ++ /* make sure we will add entry to the same block which ++ * we have already locked */ ++ de2 = dx_move_dirents(data1, data2, map, split, blocksize); ++ } + de = dx_pack_dirents(data1, blocksize); + de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - + (char *) de, +@@ -1666,13 +1996,21 @@ static struct ext4_dir_entry_2 *do_split + dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); + dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); + +- /* Which block gets the new entry? */ +- if (hinfo->hash >= hash2) +- { +- swap(*bh, bh2); +- de = de2; ++ ext4_htree_spin_lock(lck, frame > frames ? 
(frame - 1)->at : NULL, ++ frame->at); /* notify block is being split */ ++ if (hinfo->hash < hash2) { ++ dx_insert_block(frame, hash2 + continued, newblock); ++ ++ } else { ++ /* switch block number */ ++ dx_insert_block(frame, hash2 + continued, ++ dx_get_block(frame->at)); ++ dx_set_block(frame->at, newblock); ++ (frame->at)++; + } +- dx_insert_block(frame, hash2 + continued, newblock); ++ ext4_htree_spin_unlock(lck); ++ ext4_htree_dx_unlock(lck); ++ + err = ext4_handle_dirty_dirent_node(handle, dir, bh2); + if (err) + goto journal_error; +@@ -1945,7 +2283,7 @@ static int make_indexed_dir(handle_t *ha + if (retval) + goto out_frames; + +- de = do_split(handle,dir, &bh2, frame, &hinfo, &retval); ++ de = do_split(handle, dir, &bh2, frames, frame, &hinfo, NULL, &retval); + if (!de) { + goto out_frames; + } +@@ -2051,8 +2389,8 @@ out: + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. + */ +-static int ext4_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) ++int __ext4_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct htree_lock *lck) + { + struct inode *dir = dentry->d_parent->d_inode; + struct buffer_head *bh = NULL; +@@ -2087,9 +2425,10 @@ static int ext4_add_entry(handle_t *hand + if (dentry->d_name.len == 2 && + memcmp(dentry->d_name.name, "..", 2) == 0) + return ext4_update_dotdot(handle, dentry, inode); +- retval = ext4_dx_add_entry(handle, dentry, inode); ++ retval = ext4_dx_add_entry(handle, dentry, inode, lck); + if (!retval || (retval != ERR_BAD_DX_DIR)) + goto out; ++ ext4_htree_safe_relock(lck); + ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); + dx_fallback++; + ext4_mark_inode_dirty(handle, dir); +@@ -2129,12 +2468,13 @@ static int ext4_add_entry(handle_t *hand + ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); + return retval; + } ++EXPORT_SYMBOL(__ext4_add_entry); + + /* + * Returns 0 for success, or a negative error value + */ + static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) ++ struct inode *inode, struct htree_lock *lck) + { + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries, *at; +@@ -2148,7 +2488,7 @@ static int ext4_dx_add_entry(handle_t *h + + again: + restart = 0; +- frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); ++ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, lck, &err); + if (!frame) + return err; + entries = frame->entries; +@@ -2178,6 +2518,11 @@ again: + struct dx_node *node2; + struct buffer_head *bh2; + ++ if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */ ++ ext4_htree_safe_relock(lck); ++ restart = 1; ++ goto cleanup; ++ } + while (frame > frames) { + if (dx_get_count((frame - 1)->entries) < + dx_get_limit((frame - 1)->entries)) { +@@ -2277,16 +2622,43 @@ again: + restart = 1; + goto cleanup; + } ++ } else if (!ext4_htree_dx_locked(lck)) { ++ struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck); ++ ++ /* not well protected, require DX lock */ ++ ext4_htree_dx_need_lock(lck); ++ at = frame > frames ? (frame - 1)->at : NULL; ++ ++ /* NB: no risk of deadlock because it's just a try. ++ * ++ * NB: we check ld_count for twice, the first time before ++ * having DX lock, the second time after holding DX lock. 
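[Review note: the double ld_count check described just above is a check-trylock-recheck pattern. It is sound only because the count is monotonic (directory blocks are never freed), so "unchanged on both sides of a successful trylock" implies the saved dx_entry is still valid. A compilable sketch of the shape of that test, with stub lock primitives standing in for ext4_htree_dx_lock_try() and dx_get_count():]

#include <stdbool.h>
#include <stdio.h>

static int dx_count = 4;        /* stands in for dx_get_count(entries) */
static bool dx_locked;

static bool trylock(void)
{
        if (dx_locked)
                return false;
        dx_locked = true;
        return true;
}

static bool revalidate_and_lock(int saved_count)
{
        if (saved_count != dx_count ||  /* changed while unlocked */
            !trylock() ||               /* blocking here could deadlock */
            saved_count != dx_count)    /* changed while acquiring */
                return false;           /* caller restarts from dx_probe() */
        return true;
}

int main(void)
{
        int saved = dx_count;

        dx_count++;     /* simulate a concurrent leaf split */
        printf("%d\n", revalidate_and_lock(saved));     /* 0: restart */
        return 0;
}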
++ * ++ * NB: We never free blocks for directory so far, which ++ * means value returned by dx_get_count() should equal to ++ * ld->ld_count if nobody split any DE-block under @at, ++ * and ld->ld_at still points to valid dx_entry. */ ++ if ((ld->ld_count != dx_get_count(entries)) || ++ !ext4_htree_dx_lock_try(lck, at) || ++ (ld->ld_count != dx_get_count(entries))) { ++ restart = 1; ++ goto cleanup; ++ } ++ /* OK, I've got DX lock and nothing changed */ ++ frame->at = ld->ld_at; + } +- de = do_split(handle, dir, &bh, frame, &hinfo, &err); ++ de = do_split(handle, dir, &bh, frames, frame, &hinfo, lck, &err); + if (!de) + goto cleanup; ++ + err = add_dirent_to_buf(handle, dentry, inode, de, bh); + goto cleanup; + + journal_error: + ext4_std_error(dir->i_sb, err); + cleanup: ++ ext4_htree_dx_unlock(lck); ++ ext4_htree_de_unlock(lck); + brelse(bh); + dx_release(frames); + /* @restart is true means htree-path has been changed, we need to +Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/super.c +=================================================================== +--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/super.c ++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/super.c +@@ -875,6 +875,7 @@ static struct inode *ext4_alloc_inode(st + + ei->vfs_inode.i_version = 1; + spin_lock_init(&ei->i_raw_lock); ++ sema_init(&ei->i_append_sem, 1); + INIT_LIST_HEAD(&ei->i_prealloc_list); + spin_lock_init(&ei->i_prealloc_lock); + ext4_es_init_tree(&ei->i_es_tree); diff --git a/ldiskfs/kernel_patches/patches/rhel7.4/ext4-prealloc.patch b/ldiskfs/kernel_patches/patches/rhel7.4/ext4-prealloc.patch new file mode 100644 index 0000000..65a5d4e --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel7.4/ext4-prealloc.patch @@ -0,0 +1,403 @@ +Index: linux-3.10.0-514.16.1.el7.x86_64/fs/ext4/ext4.h +=================================================================== +--- linux-3.10.0-514.16.1.el7.x86_64.orig/fs/ext4/ext4.h ++++ linux-3.10.0-514.16.1.el7.x86_64/fs/ext4/ext4.h +@@ -1270,11 +1270,14 @@ struct ext4_sb_info { + + /* tunables */ + unsigned long s_stripe; +- unsigned int s_mb_stream_request; ++ unsigned long s_mb_small_req; ++ unsigned long s_mb_large_req; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; ++ unsigned long *s_mb_prealloc_table; ++ unsigned long s_mb_prealloc_table_size; + unsigned int s_mb_group_prealloc; + unsigned int s_max_dir_size_kb; + /* where last allocation was done - for stream allocation */ +Index: linux-3.10.0-514.16.1.el7.x86_64/fs/ext4/mballoc.c +=================================================================== +--- linux-3.10.0-514.16.1.el7.x86_64.orig/fs/ext4/mballoc.c ++++ linux-3.10.0-514.16.1.el7.x86_64/fs/ext4/mballoc.c +@@ -1862,6 +1862,26 @@ int ext4_mb_find_by_goal(struct ext4_all + return 0; + } + ++static int ext4_mb_prealloc_table_add(struct ext4_sb_info *sbi, int value) ++{ ++ int i; ++ ++ if (value > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group)) ++ return -1; ++ ++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) { ++ if (sbi->s_mb_prealloc_table[i] == 0) { ++ sbi->s_mb_prealloc_table[i] = value; ++ return 0; ++ } ++ ++ /* they should add values in order */ ++ if (value <= sbi->s_mb_prealloc_table[i]) ++ return -1; ++ } ++ return -1; ++} ++ + /* + * The routine scans buddy structures (not bitmap!) 
from given order + * to max order and tries to find big enough chunk to satisfy the req +@@ -2301,6 +2321,93 @@ static const struct seq_operations ext4_ + .show = ext4_mb_seq_groups_show, + }; + ++#define EXT4_MB_PREALLOC_TABLE "prealloc_table" ++ ++static ssize_t ext4_mb_prealloc_table_proc_write(struct file *file, ++ const char __user *buf, ++ size_t cnt, loff_t *pos) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file))); ++ unsigned long value; ++ unsigned long prev = 0; ++ char str[128]; ++ char *cur; ++ char *end; ++ unsigned long *new_table; ++ int num = 0; ++ int i = 0; ++ ++ if (cnt >= sizeof(str)) ++ return -EINVAL; ++ if (copy_from_user(str, buf, cnt)) ++ return -EFAULT; ++ ++ num = 0; ++ cur = str; ++ end = str + cnt; ++ while (cur < end) { ++ int rc; ++ while ((cur < end) && (*cur == ' ')) ++ cur++; ++ rc = kstrtol(cur, 0, &value); ++ if (rc != 0) ++ return -EINVAL; ++ if (value == 0) ++ break; ++ if (value <= prev) ++ return -EINVAL; ++ prev = value; ++ num++; ++ } ++ ++ new_table = kmalloc(num * sizeof(*new_table), GFP_KERNEL); ++ if (new_table == NULL) ++ return -ENOMEM; ++ kfree(sbi->s_mb_prealloc_table); ++ memset(new_table, 0, num * sizeof(*new_table)); ++ sbi->s_mb_prealloc_table = new_table; ++ sbi->s_mb_prealloc_table_size = num; ++ cur = str; ++ end = str + cnt; ++ while (cur < end && i < num) { ++ while (cur < end && *cur == ' ') ++ cur++; ++ value = simple_strtol(cur, &cur, 0); ++ if (ext4_mb_prealloc_table_add(sbi, value) == 0) ++ ++i; ++ } ++ if (i != num) ++ sbi->s_mb_prealloc_table_size = i; ++ ++ return cnt; ++} ++ ++static int mb_prealloc_table_seq_show(struct seq_file *m, void *v) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(m->private); ++ int i; ++ ++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) ++ seq_printf(m, "%ld ", sbi->s_mb_prealloc_table[i]); ++ seq_printf(m, "\n"); ++ ++ return 0; ++} ++ ++static int mb_prealloc_table_seq_open(struct inode *inode, struct file *file) ++{ ++ return single_open(file, mb_prealloc_table_seq_show, PDE_DATA(inode)); ++} ++ ++static const struct file_operations ext4_mb_prealloc_seq_fops = { ++ .owner = THIS_MODULE, ++ .open = mb_prealloc_table_seq_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = single_release, ++ .write = ext4_mb_prealloc_table_proc_write, ++}; ++ + static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) + { + struct super_block *sb = PDE_DATA(inode); +@@ -2550,7 +2657,7 @@ static int ext4_groupinfo_create_slab(si + int ext4_mb_init(struct super_block *sb) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); +- unsigned i, j; ++ unsigned i, j, k, l; + unsigned offset, offset_incr; + unsigned max; + int ret; +@@ -2595,7 +2702,6 @@ int ext4_mb_init(struct super_block *sb) + sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; + sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; + sbi->s_mb_stats = MB_DEFAULT_STATS; +- sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; + sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; + /* + * The default group preallocation is 512, which for 4k block +@@ -2619,9 +2725,47 @@ int ext4_mb_init(struct super_block *sb) + * RAID stripe size so that preallocations don't fragment + * the stripes. 
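[Review note: the defaults built just below follow two shapes: with no stripe configured, ten doubling windows starting at 4 blocks (4..2048); with a stripe, three doubling windows starting at the stripe size. The table can then be replaced at runtime through the prealloc_table proc file added above, with values required to be strictly increasing. A sketch computing the same defaults:]

#include <stdio.h>

int main(void)
{
        unsigned long table[10];
        unsigned long stripe = 0;       /* set to e.g. 32 for the striped case */
        int n = stripe ? 3 : 10;
        unsigned long v = stripe ? stripe : 4;
        int i;

        for (i = 0; i < n; i++, v *= 2)
                table[i] = v;           /* mirrors ext4_mb_prealloc_table_add() */
        for (i = 0; i < n; i++)
                printf("%lu ", table[i]);       /* 4 8 16 ... 2048 */
        printf("\n");
        return 0;
}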
+ */ +- if (sbi->s_stripe > 1) { +- sbi->s_mb_group_prealloc = roundup( +- sbi->s_mb_group_prealloc, sbi->s_stripe); ++ ++ if (sbi->s_stripe == 0) { ++ sbi->s_mb_prealloc_table_size = 10; ++ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long); ++ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS); ++ if (sbi->s_mb_prealloc_table == NULL) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ memset(sbi->s_mb_prealloc_table, 0, i); ++ ++ for (k = 0, l = 4; k <= 9; ++k, l *= 2) { ++ if (ext4_mb_prealloc_table_add(sbi, l) < 0) { ++ sbi->s_mb_prealloc_table_size = k; ++ break; ++ } ++ } ++ ++ sbi->s_mb_small_req = 256; ++ sbi->s_mb_large_req = 1024; ++ sbi->s_mb_group_prealloc = 512; ++ } else { ++ sbi->s_mb_prealloc_table_size = 3; ++ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long); ++ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS); ++ if (sbi->s_mb_prealloc_table == NULL) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ memset(sbi->s_mb_prealloc_table, 0, i); ++ ++ for (k = 0, l = sbi->s_stripe; k <= 2; ++k, l *= 2) { ++ if (ext4_mb_prealloc_table_add(sbi, l) < 0) { ++ sbi->s_mb_prealloc_table_size = k; ++ break; ++ } ++ } ++ ++ sbi->s_mb_small_req = sbi->s_stripe; ++ sbi->s_mb_large_req = sbi->s_stripe * 8; ++ sbi->s_mb_group_prealloc = sbi->s_stripe * 4; + } + + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); +@@ -2643,9 +2787,13 @@ int ext4_mb_init(struct super_block *sb) + if (ret != 0) + goto out_free_locality_groups; + +- if (sbi->s_proc) ++ if (sbi->s_proc) { + proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_fops, sb); ++ proc_create_data(EXT4_MB_PREALLOC_TABLE, S_IFREG | S_IRUGO | ++ S_IWUSR, sbi->s_proc, ++ &ext4_mb_prealloc_seq_fops, sb); ++ } + + return 0; + +@@ -2653,6 +2801,7 @@ out_free_locality_groups: + free_percpu(sbi->s_locality_groups); + sbi->s_locality_groups = NULL; + out: ++ kfree(sbi->s_mb_prealloc_table); + kfree(sbi->s_mb_offsets); + sbi->s_mb_offsets = NULL; + kfree(sbi->s_mb_maxs); +@@ -2687,8 +2836,10 @@ int ext4_mb_release(struct super_block * + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + +- if (sbi->s_proc) ++ if (sbi->s_proc) { + remove_proc_entry("mb_groups", sbi->s_proc); ++ remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_proc); ++ } + + if (sbi->s_group_info) { + for (i = 0; i < ngroups; i++) { +@@ -3000,9 +3151,9 @@ ext4_mb_normalize_request(struct ext4_al + struct ext4_allocation_request *ar) + { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); +- int bsbits, max; ++ int bsbits, i, wind; + ext4_lblk_t end; +- loff_t size, start_off; ++ loff_t size; + loff_t orig_size __maybe_unused; + ext4_lblk_t start; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); +@@ -3035,51 +3186,34 @@ ext4_mb_normalize_request(struct ext4_al + size = size << bsbits; + if (size < i_size_read(ac->ac_inode)) + size = i_size_read(ac->ac_inode); +- orig_size = size; ++ size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits; + +- /* max size of free chunks */ +- max = 2 << bsbits; ++ start = wind = 0; + +-#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ +- (req <= (size) || max <= (chunk_size)) ++ /* let's choose preallocation window depending on file size */ ++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) { ++ if (size <= sbi->s_mb_prealloc_table[i]) { ++ wind = sbi->s_mb_prealloc_table[i]; ++ break; ++ } ++ } ++ size = wind; + +- /* first, try to predict filesize */ +- /* XXX: should this table be tunable? 
*/ +- start_off = 0; +- if (size <= 16 * 1024) { +- size = 16 * 1024; +- } else if (size <= 32 * 1024) { +- size = 32 * 1024; +- } else if (size <= 64 * 1024) { +- size = 64 * 1024; +- } else if (size <= 128 * 1024) { +- size = 128 * 1024; +- } else if (size <= 256 * 1024) { +- size = 256 * 1024; +- } else if (size <= 512 * 1024) { +- size = 512 * 1024; +- } else if (size <= 1024 * 1024) { +- size = 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (21 - bsbits)) << 21; +- size = 2 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (22 - bsbits)) << 22; +- size = 4 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, +- (8<<20)>>bsbits, max, 8 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (23 - bsbits)) << 23; +- size = 8 * 1024 * 1024; +- } else { +- start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; +- size = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb), +- ac->ac_o_ex.fe_len) << bsbits; ++ if (wind == 0) { ++ __u64 tstart, tend; ++ /* file is quite large, we now preallocate with ++ * the biggest configured window with regart to ++ * logical offset */ ++ wind = sbi->s_mb_prealloc_table[i - 1]; ++ tstart = ac->ac_o_ex.fe_logical; ++ do_div(tstart, wind); ++ start = tstart * wind; ++ tend = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len - 1; ++ do_div(tend, wind); ++ tend = tend * wind + wind; ++ size = tend - start; + } +- size = size >> bsbits; +- start = start_off >> bsbits; ++ orig_size = size; + + /* don't cover already allocated blocks in selected range */ + if (ar->pleft && start <= ar->lleft) { +@@ -3154,7 +3288,6 @@ ext4_mb_normalize_request(struct ext4_al + (unsigned long) ac->ac_o_ex.fe_logical); + BUG(); + } +- BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + + /* now prepare goal request */ + +@@ -4119,11 +4252,19 @@ static void ext4_mb_group_or_file(struct + + /* don't use group allocation for large files */ + size = max(size, isize); +- if (size > sbi->s_mb_stream_request) { ++ if ((ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) || ++ (size >= sbi->s_mb_large_req)) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; + } + ++ /* ++ * request is so large that we don't care about ++ * streaming - it overweights any possible seek ++ */ ++ if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req) ++ return; ++ + BUG_ON(ac->ac_lg != NULL); + /* + * locality group prealloc space are per cpu. 
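[Review note: with mb_stream_request gone, normalization picks the smallest table window that covers the request (falling back to the largest window, aligned by logical offset, for huge files), and allocation routing is driven by the two new tunables: a request of at least mb_small_req blocks, or a file of at least mb_large_req blocks, takes the stream (inode-PA) path instead of the per-CPU locality group. A toy model of the routing decision, using the non-striped defaults from this patch:]

#include <stdio.h>

static const unsigned long mb_small_req = 256;  /* blocks */
static const unsigned long mb_large_req = 1024;

static const char *alloc_path(unsigned long req_len, unsigned long file_size)
{
        if (req_len >= mb_small_req || file_size >= mb_large_req)
                return "stream (inode PA)";
        return "locality group (per-CPU PA)";
}

int main(void)
{
        printf("%s\n", alloc_path(8, 64));      /* small file: group PA */
        printf("%s\n", alloc_path(512, 64));    /* large request: stream */
        return 0;
}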
The reason for having +Index: linux-3.10.0-514.16.1.el7.x86_64/fs/ext4/super.c +=================================================================== +--- linux-3.10.0-514.16.1.el7.x86_64.orig/fs/ext4/super.c ++++ linux-3.10.0-514.16.1.el7.x86_64/fs/ext4/super.c +@@ -2672,7 +2672,8 @@ EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats + EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); +-EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); ++EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_small_req); ++EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req); + EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); + EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128); + EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); +@@ -2698,7 +2699,8 @@ static struct attribute *ext4_attrs[] = + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), + ATTR_LIST(mb_order2_req), +- ATTR_LIST(mb_stream_req), ++ ATTR_LIST(mb_small_req), ++ ATTR_LIST(mb_large_req), + ATTR_LIST(mb_group_prealloc), + ATTR_LIST(max_writeback_mb_bump), + ATTR_LIST(extent_max_zeroout_kb), +Index: linux-3.10.0-514.16.1.el7.x86_64/fs/ext4/inode.c +=================================================================== +--- linux-3.10.0-514.16.1.el7.x86_64.orig/fs/ext4/inode.c ++++ linux-3.10.0-514.16.1.el7.x86_64/fs/ext4/inode.c +@@ -2399,6 +2399,9 @@ static int ext4_writepages(struct addres + ext4_journal_stop(handle); + } + ++ if (wbc->nr_to_write < sbi->s_mb_small_req) ++ wbc->nr_to_write = sbi->s_mb_small_req; ++ + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + diff --git a/ldiskfs/kernel_patches/patches/rhel7.4/ext4-remove-i_data_sem-from-xattr.patch b/ldiskfs/kernel_patches/patches/rhel7.4/ext4-remove-i_data_sem-from-xattr.patch new file mode 100644 index 0000000..0a84192 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel7.4/ext4-remove-i_data_sem-from-xattr.patch @@ -0,0 +1,363 @@ +From a521100231f816f8cdd9c8e77da14ff1e42c2b17 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Thu, 4 Sep 2014 18:06:25 -0400 +Subject: [PATCH] ext4: pass allocation_request struct to + ext4_(alloc,splice)_branch + +Instead of initializing the allocation_request structure in +ext4_alloc_branch(), set it up in ext4_ind_map_blocks(), and then pass +it to ext4_alloc_branch() and ext4_splice_branch(). + +This allows ext4_ind_map_blocks to pass flags in the allocation +request structure without having to add Yet Another argument to +ext4_alloc_branch(). + +Signed-off-by: Theodore Ts'o +Reviewed-by: Jan Kara +--- + fs/ext4/indirect.c | 82 +++++++++++++++++++++++++----------------------------- + 1 file changed, 38 insertions(+), 44 deletions(-) + +diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c +index e75f840..69af0cd 100644 +--- a/fs/ext4/indirect.c ++++ b/fs/ext4/indirect.c +@@ -318,34 +318,22 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, + * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. 
+ */ +-static int ext4_alloc_branch(handle_t *handle, struct inode *inode, +- ext4_lblk_t iblock, int indirect_blks, +- int *blks, ext4_fsblk_t goal, +- ext4_lblk_t *offsets, Indirect *branch) ++static int ext4_alloc_branch(handle_t *handle, ++ struct ext4_allocation_request *ar, ++ int indirect_blks, ext4_lblk_t *offsets, ++ Indirect *branch) + { +- struct ext4_allocation_request ar; + struct buffer_head * bh; + ext4_fsblk_t b, new_blocks[4]; + __le32 *p; + int i, j, err, len = 1; + +- /* +- * Set up for the direct block allocation +- */ +- memset(&ar, 0, sizeof(ar)); +- ar.inode = inode; +- ar.len = *blks; +- ar.logical = iblock; +- if (S_ISREG(inode->i_mode)) +- ar.flags = EXT4_MB_HINT_DATA; +- + for (i = 0; i <= indirect_blks; i++) { + if (i == indirect_blks) { +- ar.goal = goal; +- new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err); ++ new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err); + } else +- goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode, +- goal, 0, NULL, &err); ++ ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle, ++ ar->inode, ar->goal, 0, NULL, &err); + if (err) { + i--; + goto failed; +@@ -354,7 +342,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, + if (i == 0) + continue; + +- bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]); ++ bh = branch[i].bh = sb_getblk(ar->inode->i_sb, new_blocks[i-1]); + if (unlikely(!bh)) { + err = -ENOMEM; + goto failed; +@@ -372,7 +360,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, + b = new_blocks[i]; + + if (i == indirect_blks) +- len = ar.len; ++ len = ar->len; + for (j = 0; j < len; j++) + *p++ = cpu_to_le32(b++); + +@@ -381,11 +369,10 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); +- err = ext4_handle_dirty_metadata(handle, inode, bh); ++ err = ext4_handle_dirty_metadata(handle, ar->inode, bh); + if (err) + goto failed; + } +- *blks = ar.len; + return 0; + failed: + for (; i >= 0; i--) { +@@ -396,10 +383,10 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, + * existing before ext4_alloc_branch() was called. + */ + if (i > 0 && i != indirect_blks && branch[i].bh) +- ext4_forget(handle, 1, inode, branch[i].bh, ++ ext4_forget(handle, 1, ar->inode, branch[i].bh, + branch[i].bh->b_blocknr); +- ext4_free_blocks(handle, inode, NULL, new_blocks[i], +- (i == indirect_blks) ? ar.len : 1, 0); ++ ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i], ++ (i == indirect_blks) ? ar->len : 1, 0); + } + return err; + } +@@ -419,9 +406,9 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. 
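[Review note: the upstream refactor quoted here is purely mechanical: the caller builds one ext4_allocation_request and the callees read and update it, so the later patch can add flag bits without growing every signature. A pared-down sketch of that shape; "alloc_req", "alloc_branch" and "map_blocks" are invented stand-ins for the ext4 names:]

#include <string.h>

struct alloc_req {
        unsigned long goal;
        unsigned long logical;
        unsigned long len;      /* in/out: requested, then allocated */
        unsigned int  flags;    /* room for e.g. a DELALLOC flag later */
};

static int alloc_branch(struct alloc_req *ar)
{
        /* would allocate ar->len blocks near ar->goal, honouring
         * ar->flags, and write the result back into ar->len */
        return ar->len != 0 ? 0 : -1;
}

static int map_blocks(unsigned long lblk, unsigned long len)
{
        struct alloc_req ar;

        memset(&ar, 0, sizeof(ar));     /* set up once, in the caller */
        ar.logical = lblk;
        ar.len = len;
        ar.goal = 1024;                 /* ext4_find_goal() in the real code */
        return alloc_branch(&ar);
}

int main(void)
{
        return map_blocks(0, 8);
}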
+ */ +-static int ext4_splice_branch(handle_t *handle, struct inode *inode, +- ext4_lblk_t block, Indirect *where, int num, +- int blks) ++static int ext4_splice_branch(handle_t *handle, ++ struct ext4_allocation_request *ar, ++ Indirect *where, int num) + { + int i; + int err = 0; +@@ -446,9 +433,9 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ +- if (num == 0 && blks > 1) { ++ if (num == 0 && ar->len > 1) { + current_block = le32_to_cpu(where->key) + 1; +- for (i = 1; i < blks; i++) ++ for (i = 1; i < ar->len; i++) + *(where->p + i) = cpu_to_le32(current_block++); + } + +@@ -465,14 +452,14 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, + */ + jbd_debug(5, "splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); +- err = ext4_handle_dirty_metadata(handle, inode, where->bh); ++ err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + */ +- ext4_mark_inode_dirty(handle, inode); ++ ext4_mark_inode_dirty(handle, ar->inode); + jbd_debug(5, "splicing direct\n"); + } + return err; +@@ -484,11 +471,11 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ +- ext4_free_blocks(handle, inode, where[i].bh, 0, 1, ++ ext4_free_blocks(handle, ar->inode, where[i].bh, 0, 1, + EXT4_FREE_BLOCKS_FORGET); + } +- ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), +- blks, 0); ++ ext4_free_blocks(handle, ar->inode, NULL, le32_to_cpu(where[num].key), ++ ar->len, 0); + + return err; + } +@@ -525,11 +512,11 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + int flags) + { ++ struct ext4_allocation_request ar; + int err = -EIO; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; +- ext4_fsblk_t goal; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; +@@ -579,7 +566,14 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + return -ENOSPC; + } + +- goal = ext4_find_goal(inode, map->m_lblk, partial); ++ /* Set up for the direct block allocation */ ++ memset(&ar, 0, sizeof(ar)); ++ ar.inode = inode; ++ ar.logical = map->m_lblk; ++ if (S_ISREG(inode->i_mode)) ++ ar.flags = EXT4_MB_HINT_DATA; ++ ++ ar.goal = ext4_find_goal(inode, map->m_lblk, partial); + + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; +@@ -588,13 +582,13 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ +- count = ext4_blks_to_allocate(partial, indirect_blks, +- map->m_len, blocks_to_boundary); ++ ar.len = ext4_blks_to_allocate(partial, indirect_blks, ++ map->m_len, blocks_to_boundary); ++ + /* + * Block out ext4_truncate while we alter the tree + */ +- err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, +- &count, goal, ++ err = ext4_alloc_branch(handle, &ar, indirect_blks, + offsets + (partial - chain), partial); + + /* +@@ -605,14 +599,14 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + * may need to return -EAGAIN upwards in the worst case. 
--sct + */ + if (!err) +- err = ext4_splice_branch(handle, inode, map->m_lblk, +- partial, indirect_blks, count); ++ err = ext4_splice_branch(handle, &ar, partial, indirect_blks); + if (err) + goto cleanup; + + map->m_flags |= EXT4_MAP_NEW; + + ext4_update_inode_fsync_trans(handle, inode, 1); ++ count = ar.len; + got_it: + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = le32_to_cpu(chain[depth-1].key); +-- +2.7.4 + +From e3cf5d5d9a86df1c5e413bdd3725c25a16ff854c Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Thu, 4 Sep 2014 18:07:25 -0400 +Subject: [PATCH] ext4: prepare to drop EXT4_STATE_DELALLOC_RESERVED + +The EXT4_STATE_DELALLOC_RESERVED flag was originally implemented +because it was too hard to make sure the mballoc and get_block flags +could be reliably passed down through all of the codepaths that end up +calling ext4_mb_new_blocks(). + +Since then, we have mb_flags passed down through most of the code +paths, so getting rid of EXT4_STATE_DELALLOC_RESERVED isn't as tricky +as it used to. + +This commit plumbs in the last of what is required, and then adds a +WARN_ON check to make sure we haven't missed anything. If this passes +a full regression test run, we can then drop +EXT4_STATE_DELALLOC_RESERVED. + +Signed-off-by: Theodore Ts'o +Reviewed-by: Jan Kara +--- + fs/ext4/balloc.c | 3 +-- + fs/ext4/extents.c | 6 +++++- + fs/ext4/indirect.c | 6 +++++- + fs/ext4/mballoc.c | 10 ++++++---- + 5 files changed, 17 insertions(+), 14 deletions(-) + +diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c +index 581ef40..d70f154 100644 +--- a/fs/ext4/balloc.c ++++ b/fs/ext4/balloc.c +@@ -636,8 +636,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + * Account for the allocated meta blocks. We will never + * fail EDQUOT for metdata, but we do account for it. + */ +- if (!(*errp) && +- ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) { ++ if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { + dquot_alloc_block_nofail(inode, + EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); + } +diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c +index 3ac1686..8170b32 100644 +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -1933,6 +1933,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, + ext4_lblk_t next; + int mb_flags = 0, unwritten; + ++ if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ++ mb_flags |= EXT4_MB_DELALLOC_RESERVED; + if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { + EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); + return -EIO; +@@ -2054,7 +2056,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, + * We're gonna add a new leaf in the tree. 
+ */ + if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) +- mb_flags = EXT4_MB_USE_RESERVED; ++ mb_flags |= EXT4_MB_USE_RESERVED; + err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, + ppath, newext); + if (err) +@@ -4438,6 +4440,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + ar.flags = 0; + if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) + ar.flags |= EXT4_MB_HINT_NOPREALLOC; ++ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ++ ar.flags |= EXT4_MB_DELALLOC_RESERVED; + newblock = ext4_mb_new_blocks(handle, &ar, &err); + if (!newblock) + goto out2; +diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c +index 69af0cd..36b3696 100644 +--- a/fs/ext4/indirect.c ++++ b/fs/ext4/indirect.c +@@ -333,7 +333,9 @@ static int ext4_alloc_branch(handle_t *handle, + new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err); + } else + ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle, +- ar->inode, ar->goal, 0, NULL, &err); ++ ar->inode, ar->goal, ++ ar->flags & EXT4_MB_DELALLOC_RESERVED, ++ NULL, &err); + if (err) { + i--; + goto failed; +@@ -572,6 +574,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + ar.logical = map->m_lblk; + if (S_ISREG(inode->i_mode)) + ar.flags = EXT4_MB_HINT_DATA; ++ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ++ ar.flags |= EXT4_MB_DELALLOC_RESERVED; + + ar.goal = ext4_find_goal(inode, map->m_lblk, partial); + +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index 8b0f9ef..15dffda 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -4415,9 +4415,12 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, + * EDQUOT check, as blocks and quotas have been already + * reserved when data being copied into pagecache. + */ +- if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) ++ if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) { ++ WARN_ON((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0); + ar->flags |= EXT4_MB_DELALLOC_RESERVED; +- else { ++ } ++ ++ if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { + /* Without delayed allocation we need to verify + * there is enough free blocks to do block allocation + * and verify allocation doesn't exceed the quota limits. 
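[Review note: the second upstream commit quoted here replaces the per-inode EXT4_STATE_DELALLOC_RESERVED test with an explicit EXT4_MB_DELALLOC_RESERVED bit carried in the request, each allocation path translating its get_blocks flag; the WARN_ON bridges the transition by flagging callers that still rely on the inode state. A compilable model of the plumbing; the flag values below are illustrative, not the kernel's:]

#include <stdio.h>

#define GB_DELALLOC_RESERVE  0x1u   /* get_blocks-level flag */
#define MB_DELALLOC_RESERVED 0x2u   /* mballoc-level flag derived from it */

static unsigned to_mb_flags(unsigned gb_flags)
{
        unsigned mb_flags = 0;

        if (gb_flags & GB_DELALLOC_RESERVE)
                mb_flags |= MB_DELALLOC_RESERVED;
        return mb_flags;
}

int main(void)
{
        unsigned mb = to_mb_flags(GB_DELALLOC_RESERVE);

        if (mb & MB_DELALLOC_RESERVED)
                printf("quota reserved at write time; skip EDQUOT check\n");
        else
                printf("WARN: caller did not pass the reserve flag\n");
        return 0;
}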
+@@ -4528,8 +4531,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
+ 	if (inquota && ar->len < inquota)
+ 		dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
+ 	if (!ar->len) {
+-		if (!ext4_test_inode_state(ar->inode,
+-				EXT4_STATE_DELALLOC_RESERVED))
++		if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
+ 			/* release all the reserved blocks if non delalloc */
+ 			percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ 					   reserv_clstrs);
+--
+2.7.4
diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.4.series b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.4.series
new file mode 100644
index 0000000..6cb4f04
--- /dev/null
+++ b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.4.series
@@ -0,0 +1,32 @@
+rhel7/ext4-inode-version.patch
+rhel7/ext4-lookup-dotdot.patch
+rhel6.3/ext4-print-inum-in-htree-warning.patch
+rhel7.4/ext4-prealloc.patch
+rhel7/ext4-mballoc-extra-checks.patch
+rhel7/ext4-misc.patch
+rhel7/ext4-osd-iop-common.patch
+rhel7/ext4-hash-indexed-dir-dotdot-update.patch
+rhel7/ext4-kill-dx-root.patch
+rhel7/ext4-mballoc-pa-free-mismatch.patch
+rhel7.3/ext4-data-in-dirent.patch
+rhel7.2/ext4-large-eas.patch
+rhel7.3/ext4-disable-mb-cache.patch
+rhel7/ext4-nocmtime.patch
+rhel7.4/ext4-large-dir.patch
+rhel7.4/ext4-pdirop.patch
+rhel7/ext4-max-dir-size.patch
+rhel7/ext4-remove-truncate-warning.patch
+rhel7.3/ext4-corrupted-inode-block-bitmaps-handling-patches.patch
+rhel7/ext4-give-warning-with-dir-htree-growing.patch
+rhel7/ext4-mmp-brelse.patch
+rhel7/ext4-jcb-optimization.patch
+rhel7/ext4_s_max_ext_tree_depth.patch
+rhel7.4/ext4-remove-i_data_sem-from-xattr.patch
+rhel7/ext4-projid-ignore-maxquotas.patch
+rhel7/ext4-projid-feature-support.patch
+rhel7/ext4-projid-quotas.patch
+rhel7/ext4-projid-xfs-ioctls.patch
+rhel7.4/ext4-fix-xattr-shifting-when-expanding-inodes.patch
+rhel7.4/ext4-attach-jinode-in-writepages.patch
+rhel6.3/ext4-dont-check-in-ro.patch
+rhel7.4/ext4-dont-check-before-replay.patch
diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 8c70f3f..eee3e89 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -8,7 +8,7 @@ TBD Intel Corporation
 	2.6.32-573.26.1.el6 (RHEL6.7)
 	2.6.32-642.15.1.el6 (RHEL6.8)
 	2.6.32-696.6.3.el6 (RHEL6.9)
-	3.10.0-514.26.2.el7 (RHEL7.3)
+	3.10.0-693.el7 (RHEL7.4)
 	3.0.101-0.47.71 (SLES11 SP3)
 	3.0.101-107 (SLES11 SP4)
 	3.12.74-60.64.40 (SLES12 SP1)
@@ -20,7 +20,7 @@ TBD Intel Corporation
 	2.6.32-573.26.1.el6 (RHEL6.7)
 	2.6.32-642.15.1.el6 (RHEL6.8)
 	2.6.32-696.6.3.el6 (RHEL6.9)
-	3.10.0-514.26.2.el7 (RHEL7.3)
+	3.10.0-693.el7 (RHEL7.4)
 	3.0.101-0.47.71 (SLES11 SP3)
 	3.0.101-107 (SLES11 SP4)
 	3.12.74-60.64.40 (SLES12 SP1)
diff --git a/lustre/kernel_patches/kernel_configs/kernel-3.10.0-3.10-rhel7-x86_64.config b/lustre/kernel_patches/kernel_configs/kernel-3.10.0-3.10-rhel7-x86_64.config
index a57f8da..b52cbbf 100644
--- a/lustre/kernel_patches/kernel_configs/kernel-3.10.0-3.10-rhel7-x86_64.config
+++ b/lustre/kernel_patches/kernel_configs/kernel-3.10.0-3.10-rhel7-x86_64.config
@@ -12,6 +12,10 @@ CONFIG_LOCKDEP_SUPPORT=y
 CONFIG_STACKTRACE_SUPPORT=y
 CONFIG_HAVE_LATENCYTOP_SUPPORT=y
 CONFIG_MMU=y
+CONFIG_ARCH_MMAP_RND_BITS_MIN=28
+CONFIG_ARCH_MMAP_RND_BITS_MAX=32
+CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8
+CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16
 CONFIG_NEED_DMA_MAP_STATE=y
 CONFIG_NEED_SG_DMA_LENGTH=y
 CONFIG_GENERIC_ISA_DMA=y
@@ -229,6 +233,8 @@ CONFIG_SLUB_DEBUG=y
 CONFIG_SLUB=y
 CONFIG_PROFILING=y
 CONFIG_TRACEPOINTS=y
+CONFIG_CRASH_CORE=y
+CONFIG_KEXEC_CORE=y
 CONFIG_OPROFILE=m
 CONFIG_OPROFILE_EVENT_MULTIPLEX=y
 CONFIG_HAVE_OPROFILE=y
@@ -282,6 +288,11 @@ CONFIG_HAVE_ARCH_SOFT_DIRTY=y
 CONFIG_HAVE_ARCH_HUGE_VMAP=y
 CONFIG_MODULES_USE_ELF_RELA=y
 CONFIG_HAVE_STACK_VALIDATION=y
+CONFIG_ARCH_HAS_ELF_RANDOMIZE=y
+CONFIG_HAVE_ARCH_MMAP_RND_BITS=y
+CONFIG_ARCH_MMAP_RND_BITS=28
+CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y
+CONFIG_ARCH_MMAP_RND_COMPAT_BITS=8
 CONFIG_OLD_SIGSUSPEND3=y
 CONFIG_COMPAT_OLD_SIGACTION=y
@@ -355,13 +366,18 @@ CONFIG_DEFAULT_IOSCHED="deadline"
 CONFIG_PREEMPT_NOTIFIERS=y
 CONFIG_PADATA=y
 CONFIG_ASN1=y
-CONFIG_UNINLINE_SPIN_UNLOCK=y
 CONFIG_INLINE_SPIN_UNLOCK_IRQ=y
 CONFIG_INLINE_READ_UNLOCK=y
 CONFIG_INLINE_READ_UNLOCK_IRQ=y
 CONFIG_INLINE_WRITE_UNLOCK=y
 CONFIG_INLINE_WRITE_UNLOCK_IRQ=y
 CONFIG_MUTEX_SPIN_ON_OWNER=y
+CONFIG_RWSEM_SPIN_ON_OWNER=y
+CONFIG_LOCK_SPIN_ON_OWNER=y
+CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y
+CONFIG_QUEUED_SPINLOCKS=y
+CONFIG_ARCH_USE_QUEUED_RWLOCKS=y
+CONFIG_QUEUED_RWLOCKS=y
 CONFIG_FREEZER=y
 #
@@ -371,17 +387,20 @@ CONFIG_ZONE_DMA=y
 CONFIG_SMP=y
 CONFIG_X86_X2APIC=y
 CONFIG_X86_MPPARSE=y
+CONFIG_INTEL_RDT_A=y
 CONFIG_X86_EXTENDED_PLATFORM=y
 # CONFIG_X86_NUMACHIP is not set
 # CONFIG_X86_VSMP is not set
 CONFIG_X86_UV=y
 CONFIG_X86_INTEL_LPSS=y
+CONFIG_X86_AMD_PLATFORM_DEVICE=y
 CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y
 CONFIG_SCHED_OMIT_FRAME_POINTER=y
 CONFIG_HYPERVISOR_GUEST=y
 CONFIG_PARAVIRT=y
 # CONFIG_PARAVIRT_DEBUG is not set
 CONFIG_PARAVIRT_SPINLOCKS=y
+# CONFIG_QUEUED_LOCK_STAT is not set
 CONFIG_XEN=y
 # CONFIG_XEN_DOM0 is not set
 # CONFIG_XEN_PRIVILEGED_GUEST is not set
@@ -433,6 +452,12 @@ CONFIG_X86_MCE_AMD=y
 CONFIG_X86_MCE_THRESHOLD=y
 CONFIG_X86_MCE_INJECT=m
 CONFIG_X86_THERMAL_VECTOR=y
+
+#
+# Performance monitoring
+#
+CONFIG_PERF_EVENTS_INTEL_UNCORE=y
+CONFIG_PERF_EVENTS_INTEL_RAPL=y
 CONFIG_I8K=m
 CONFIG_MICROCODE=y
 CONFIG_MICROCODE_INTEL=y
@@ -483,7 +508,6 @@ CONFIG_COMPACTION=y
 CONFIG_MIGRATION=y
 CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y
 CONFIG_HMM=y
-CONFIG_HMM_MIGRATE=y
 CONFIG_HMM_MIRROR=y
 CONFIG_PHYS_ADDR_T_64BIT=y
 CONFIG_ZONE_DMA_FLAG=1
@@ -545,12 +569,17 @@ CONFIG_CRASH_DUMP=y
 CONFIG_KEXEC_JUMP=y
 CONFIG_PHYSICAL_START=0x1000000
 CONFIG_RELOCATABLE=y
-CONFIG_PHYSICAL_ALIGN=0x1000000
+CONFIG_RANDOMIZE_BASE=y
+CONFIG_X86_NEED_RELOCS=y
+CONFIG_PHYSICAL_ALIGN=0x200000
+CONFIG_RANDOMIZE_MEMORY=y
+CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING=0xa
 CONFIG_HOTPLUG_CPU=y
 CONFIG_BOOTPARAM_HOTPLUG_CPU0=y
 # CONFIG_DEBUG_HOTPLUG_CPU0 is not set
 # CONFIG_COMPAT_VDSO is not set
 # CONFIG_CMDLINE_BOOL is not set
+CONFIG_ARCH_HAS_ADD_PAGES=y
 CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
 CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y
 CONFIG_USE_PERCPU_NUMA_NODE_ID=y
@@ -570,7 +599,12 @@ CONFIG_PM_SLEEP_SMP=y
 # CONFIG_PM_WAKELOCKS is not set
 CONFIG_PM_RUNTIME=y
 CONFIG_PM=y
-# CONFIG_PM_DEBUG is not set
+CONFIG_PM_DEBUG=y
+CONFIG_PM_ADVANCED_DEBUG=y
+# CONFIG_PM_TEST_SUSPEND is not set
+CONFIG_PM_SLEEP_DEBUG=y
+CONFIG_PM_TRACE=y
+CONFIG_PM_TRACE_RTC=y
 CONFIG_PM_CLK=y
 # CONFIG_WQ_POWER_EFFICIENT_DEFAULT is not set
 CONFIG_ACPI=y
@@ -719,7 +753,6 @@ CONFIG_HOTPLUG_PCI_SHPC=m
 #
 CONFIG_BINFMT_ELF=y
 CONFIG_COMPAT_BINFMT_ELF=y
-CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE=y
 CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
 CONFIG_BINFMT_SCRIPT=y
 # CONFIG_HAVE_AOUT is not set
@@ -858,10 +891,10 @@ CONFIG_NF_CONNTRACK_EVENTS=y
 CONFIG_NF_CONNTRACK_TIMEOUT=y
 CONFIG_NF_CONNTRACK_TIMESTAMP=y
 CONFIG_NF_CONNTRACK_LABELS=y
-CONFIG_NF_CT_PROTO_DCCP=m
+CONFIG_NF_CT_PROTO_DCCP=y
 CONFIG_NF_CT_PROTO_GRE=m
-CONFIG_NF_CT_PROTO_SCTP=m
-CONFIG_NF_CT_PROTO_UDPLITE=m
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
 CONFIG_NF_CONNTRACK_H323=m
@@ -879,9 +912,9 @@ CONFIG_NF_CT_NETLINK_HELPER=m
 CONFIG_NETFILTER_NETLINK_QUEUE_CT=y
 CONFIG_NF_NAT=m
 CONFIG_NF_NAT_NEEDED=y
-CONFIG_NF_NAT_PROTO_DCCP=m
-CONFIG_NF_NAT_PROTO_UDPLITE=m
-CONFIG_NF_NAT_PROTO_SCTP=m
+CONFIG_NF_NAT_PROTO_DCCP=y
+CONFIG_NF_NAT_PROTO_UDPLITE=y
+CONFIG_NF_NAT_PROTO_SCTP=y
 CONFIG_NF_NAT_AMANDA=m
 CONFIG_NF_NAT_FTP=m
 CONFIG_NF_NAT_IRC=m
@@ -1128,11 +1161,11 @@ CONFIG_IP6_NF_RAW=m
 CONFIG_IP6_NF_SECURITY=m
 CONFIG_IP6_NF_NAT=m
 CONFIG_IP6_NF_TARGET_MASQUERADE=m
-# CONFIG_IP6_NF_TARGET_NPT is not set
+CONFIG_IP6_NF_TARGET_NPT=m
 CONFIG_NF_TABLES_BRIDGE=m
 CONFIG_NFT_BRIDGE_META=m
 CONFIG_NFT_BRIDGE_REJECT=m
-# CONFIG_NF_LOG_BRIDGE is not set
+CONFIG_NF_LOG_BRIDGE=m
 CONFIG_BRIDGE_NF_EBTABLES=m
 CONFIG_BRIDGE_EBT_BROUTE=m
 CONFIG_BRIDGE_EBT_T_FILTER=m
@@ -1278,6 +1311,8 @@ CONFIG_NET_CLS_RSVP6=m
 CONFIG_NET_CLS_FLOW=m
 CONFIG_NET_CLS_CGROUP=y
 CONFIG_NET_CLS_BPF=m
+CONFIG_NET_CLS_FLOWER=m
+CONFIG_NET_CLS_MATCHALL=m
 CONFIG_NET_EMATCH=y
 CONFIG_NET_EMATCH_STACK=32
 CONFIG_NET_EMATCH_CMP=m
@@ -1298,6 +1333,8 @@ CONFIG_NET_ACT_PEDIT=m
 CONFIG_NET_ACT_SIMP=m
 CONFIG_NET_ACT_SKBEDIT=m
 CONFIG_NET_ACT_CSUM=m
+CONFIG_NET_ACT_VLAN=m
+CONFIG_NET_ACT_TUNNEL_KEY=m
 CONFIG_NET_CLS_IND=y
 CONFIG_NET_SCH_FIFO=y
 CONFIG_DCB=y
@@ -1309,9 +1346,12 @@ CONFIG_OPENVSWITCH_VXLAN=m
 CONFIG_OPENVSWITCH_GENEVE=m
 CONFIG_VSOCKETS=m
 CONFIG_VMWARE_VMCI_VSOCKETS=m
+CONFIG_VIRTIO_VSOCKETS=m
+CONFIG_VIRTIO_VSOCKETS_COMMON=m
 CONFIG_NETLINK_MMAP=y
 CONFIG_NETLINK_DIAG=m
 CONFIG_NET_MPLS_GSO=m
+CONFIG_NET_SWITCHDEV=y
 CONFIG_RPS=y
 CONFIG_RFS_ACCEL=y
 CONFIG_XPS=y
@@ -1404,6 +1444,7 @@ CONFIG_BT_HCIUART_3WIRE=y
 # CONFIG_BT_HCIUART_BCM is not set
 # CONFIG_BT_HCIUART_QCA is not set
 # CONFIG_BT_HCIUART_AG6XX is not set
+# CONFIG_BT_HCIUART_MRVL is not set
 CONFIG_BT_HCIBCM203X=m
 CONFIG_BT_HCIBPA10X=m
 CONFIG_BT_HCIBFUSB=m
@@ -1455,6 +1496,7 @@ CONFIG_CEPH_LIB_USE_DNS_RESOLVER=y
 CONFIG_LWTUNNEL=y
 CONFIG_DST_CACHE=y
 CONFIG_NET_DEVLINK=m
+CONFIG_MAY_USE_DEVLINK=m
 CONFIG_HAVE_BPF_JIT=y
 #
@@ -1612,7 +1654,17 @@ CONFIG_VIRTIO_BLK=m
 # CONFIG_BLK_DEV_HD is not set
 CONFIG_BLK_DEV_RBD=m
 # CONFIG_BLK_DEV_RSXX is not set
+CONFIG_NVME_CORE=m
 CONFIG_BLK_DEV_NVME=m
+CONFIG_BLK_DEV_NVME_SCSI=y
+CONFIG_NVME_FABRICS=m
+CONFIG_NVME_RDMA=m
+CONFIG_NVME_FC=m
+CONFIG_NVME_TARGET=m
+CONFIG_NVME_TARGET_LOOP=m
+CONFIG_NVME_TARGET_RDMA=m
+CONFIG_NVME_TARGET_FC=m
+CONFIG_NVME_TARGET_FCLOOP=m
 #
 # Misc devices
 #
@@ -1712,7 +1764,6 @@ CONFIG_SCSI_ENCLOSURE=m
 CONFIG_SCSI_MULTI_LUN=y
 CONFIG_SCSI_CONSTANTS=y
 CONFIG_SCSI_LOGGING=y
-CONFIG_SCSI_MAX_SG_SEGMENTS=128
 CONFIG_SCSI_SCAN_ASYNC=y
 #
@@ -1794,6 +1845,8 @@ CONFIG_SCSI_STEX=m
 CONFIG_SCSI_QLA_FC=m
 # CONFIG_TCM_QLA2XXX is not set
 CONFIG_SCSI_QLA_ISCSI=m
+CONFIG_QEDI=m
+CONFIG_QEDF=m
 CONFIG_SCSI_LPFC=m
 # CONFIG_SCSI_LPFC_DEBUG_FS is not set
 # CONFIG_SCSI_DC395x is not set
@@ -1928,7 +1981,7 @@ CONFIG_BLK_DEV_DM=m
 # CONFIG_DM_MQ_DEFAULT is not set
 CONFIG_DM_DEBUG=y
 CONFIG_DM_BUFIO=m
-# CONFIG_DM_DEBUG_BLOCK_STACK_TRACING is not set
+# CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING is not set
 CONFIG_DM_BIO_PRISON=m
 CONFIG_DM_PERSISTENT_DATA=m
 CONFIG_DM_CRYPT=m
@@ -1936,7 +1989,6 @@ CONFIG_DM_SNAPSHOT=m
 CONFIG_DM_THIN_PROVISIONING=m
 CONFIG_DM_CACHE=m
 CONFIG_DM_CACHE_SMQ=m
-CONFIG_DM_CACHE_CLEANER=m
 CONFIG_DM_ERA=m
 CONFIG_DM_MIRROR=m
 CONFIG_DM_LOG_USERSPACE=m
@@ -2016,6 +2068,7 @@ CONFIG_NLMON=m
 #
 CONFIG_VHOST_NET=m
 # CONFIG_VHOST_SCSI is not set
+CONFIG_VHOST_VSOCK=m
 CONFIG_VHOST_RING=m
 CONFIG_VHOST=m
 # CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set
@@ -2033,7 +2086,15 @@ CONFIG_MDIO=m
 # CONFIG_NET_VENDOR_3COM is not set
 # CONFIG_NET_VENDOR_ADAPTEC is not set
 # CONFIG_NET_VENDOR_ALTEON is not set
-# CONFIG_NET_VENDOR_AMD is not set
+CONFIG_NET_VENDOR_AMAZON=y
+CONFIG_ENA_ETHERNET=m
+CONFIG_NET_VENDOR_AMD=y
+CONFIG_AMD8111_ETH=m
+CONFIG_PCNET32=m
+# CONFIG_AMD_XGBE is not set
+# CONFIG_AMD_XGBE_HAVE_ECC is not set
+CONFIG_NET_VENDOR_AQUANTIA=y
+CONFIG_AQTION=m
 CONFIG_NET_VENDOR_ATHEROS=y
 CONFIG_ATL2=m
 CONFIG_ATL1=m
@@ -2053,10 +2114,9 @@ CONFIG_CNIC=m
 CONFIG_TIGON3=m
 CONFIG_BNX2X=m
 CONFIG_BNX2X_SRIOV=y
-# CONFIG_BNX2X_VXLAN is not set
-# CONFIG_BNX2X_GENEVE is not set
 CONFIG_BNXT=m
 CONFIG_BNXT_SRIOV=y
+CONFIG_BNXT_DCB=y
 CONFIG_NET_VENDOR_BROCADE=y
 CONFIG_BNA=m
 CONFIG_NET_CALXEDA_XGMAC=m
@@ -2065,8 +2125,8 @@ CONFIG_NET_VENDOR_CHELSIO=y
 CONFIG_CHELSIO_T3=m
 CONFIG_CHELSIO_T4=m
 # CONFIG_CHELSIO_T4_DCB is not set
-CONFIG_CHELSIO_T4_UWIRE=y
 CONFIG_CHELSIO_T4VF=m
+CONFIG_CHELSIO_LIB=m
 CONFIG_NET_VENDOR_CISCO=y
 CONFIG_ENIC=m
 CONFIG_DNET=m
@@ -2087,7 +2147,6 @@ CONFIG_PCMCIA_XIRCOM=m
 CONFIG_NET_VENDOR_EMULEX=y
 CONFIG_BE2NET=m
 CONFIG_BE2NET_HWMON=y
-CONFIG_BE2NET_VXLAN=y
 # CONFIG_NET_VENDOR_EXAR is not set
 # CONFIG_NET_VENDOR_HP is not set
 CONFIG_NET_VENDOR_INTEL=y
@@ -2101,7 +2160,6 @@ CONFIG_IGB_DCA=y
 CONFIG_IGBVF=m
 # CONFIG_IXGB is not set
 CONFIG_IXGBE=m
-# CONFIG_IXGBE_VXLAN is not set
 CONFIG_IXGBE_HWMON=y
 CONFIG_IXGBE_DCA=y
 CONFIG_IXGBE_DCB=y
@@ -2126,13 +2184,21 @@ CONFIG_SKY2=m
 CONFIG_NET_VENDOR_MELLANOX=y
 CONFIG_MLX4_EN=m
 CONFIG_MLX4_EN_DCB=y
-CONFIG_MLX4_EN_VXLAN=y
 CONFIG_MLX4_CORE=m
 CONFIG_MLX4_DEBUG=y
 CONFIG_MLX5_CORE=m
 CONFIG_MLX5_CORE_EN=y
 CONFIG_MLX5_CORE_EN_DCB=y
-CONFIG_MLX5_CORE_EN_VXLAN=y
+CONFIG_MLXSW_CORE=m
+CONFIG_MLXSW_CORE_HWMON=y
+CONFIG_MLXSW_CORE_THERMAL=y
+CONFIG_MLXSW_PCI=m
+CONFIG_MLXSW_I2C=m
+CONFIG_MLXSW_SWITCHIB=m
+CONFIG_MLXSW_SWITCHX2=m
+CONFIG_MLXSW_SPECTRUM=m
+CONFIG_MLXSW_SPECTRUM_DCB=y
+CONFIG_MLXSW_MINIMAL=m
 # CONFIG_NET_VENDOR_MICREL is not set
 # CONFIG_NET_VENDOR_MICROCHIP is not set
 CONFIG_NET_VENDOR_MYRI=y
@@ -2140,6 +2206,9 @@ CONFIG_MYRI10GE=m
 CONFIG_MYRI10GE_DCA=y
 # CONFIG_FEALNX is not set
 # CONFIG_NET_VENDOR_NATSEMI is not set
+CONFIG_NET_VENDOR_NETRONOME=y
+CONFIG_NFP=m
+# CONFIG_NFP_DEBUG is not set
 # CONFIG_NET_VENDOR_NVIDIA is not set
 CONFIG_NET_VENDOR_OKI=y
 CONFIG_PCH_GBE=m
@@ -2157,10 +2226,14 @@ CONFIG_QLCNIC_HWMON=y
 CONFIG_QLGE=m
 CONFIG_NETXEN_NIC=m
 CONFIG_QED=m
+CONFIG_QED_LL2=y
 CONFIG_QED_SRIOV=y
 CONFIG_QEDE=m
 # CONFIG_QEDE_VXLAN is not set
 # CONFIG_QEDE_GENEVE is not set
+CONFIG_QED_RDMA=y
+CONFIG_QED_ISCSI=y
+CONFIG_QED_FCOE=y
 CONFIG_NET_VENDOR_REALTEK=y
 # CONFIG_ATP is not set
 CONFIG_8139CP=m
@@ -2171,14 +2244,19 @@ CONFIG_8139TOO_8129=y
 # CONFIG_8139_OLD_RX_RESET is not set
 CONFIG_R8169=m
 # CONFIG_NET_VENDOR_RDC is not set
+CONFIG_NET_VENDOR_ROCKER=y
+CONFIG_ROCKER=m
 # CONFIG_NET_VENDOR_SEEQ is not set
 # CONFIG_NET_VENDOR_SILAN is not set
 # CONFIG_NET_VENDOR_SIS is not set
+CONFIG_NET_VENDOR_SOLARFLARE=y
 CONFIG_SFC=m
 CONFIG_SFC_MTD=y
 CONFIG_SFC_MCDI_MON=y
 CONFIG_SFC_SRIOV=y
 CONFIG_SFC_MCDI_LOGGING=y
+CONFIG_SFC_FALCON=m
+CONFIG_SFC_FALCON_MTD=y
 CONFIG_NET_VENDOR_SMSC=y
 CONFIG_EPIC100=m
 CONFIG_SMSC9420=m
@@ -2296,6 +2374,7 @@ CONFIG_WLAN_VENDOR_ATH=y
 # CONFIG_ATH5K_PCI is not set
 CONFIG_ATH9K_HW=m
 CONFIG_ATH9K_COMMON=m
+CONFIG_ATH9K_COMMON_DEBUG=y
 CONFIG_ATH9K_BTCOEX_SUPPORT=y
 CONFIG_ATH9K=m
 CONFIG_ATH9K_PCI=y
@@ -2309,7 +2388,7 @@ CONFIG_ATH9K_RFKILL=y
 CONFIG_ATH9K_PCOEM=y
 CONFIG_ATH9K_HTC=m
 # CONFIG_ATH9K_HTC_DEBUGFS is not set
-CONFIG_ATH9K_HWRNG=y
+# CONFIG_ATH9K_HWRNG is not set
 CONFIG_CARL9170=m
 CONFIG_CARL9170_LEDS=y
 # CONFIG_CARL9170_DEBUGFS is not set
@@ -2357,7 +2436,6 @@ CONFIG_IWLDVM=m
 CONFIG_IWLMVM=m
 CONFIG_IWLWIFI_OPMODE_MODULAR=y
 # CONFIG_IWLWIFI_BCAST_FILTERING is not set
-# CONFIG_IWLWIFI_PCIE_RTPM is not set
 #
 # Debugging Options
 #
@@ -2597,7 +2675,7 @@ CONFIG_INPUT_MOUSEDEV=y
 # CONFIG_INPUT_MOUSEDEV_PSAUX is not set
 CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
 CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
-# CONFIG_INPUT_JOYDEV is not set
+CONFIG_INPUT_JOYDEV=m
 CONFIG_INPUT_EVDEV=y
 # CONFIG_INPUT_EVBUG is not set
@@ -2817,7 +2895,6 @@ CONFIG_IPMI_HANDLER=m
 # CONFIG_IPMI_PANIC_EVENT is not set
 CONFIG_IPMI_DEVICE_INTERFACE=m
 CONFIG_IPMI_SI=m
-# CONFIG_IPMI_SI_PROBE_DEFAULTS is not set
 CONFIG_IPMI_SSIF=m
 CONFIG_IPMI_WATCHDOG=m
 CONFIG_IPMI_POWEROFF=m
@@ -2842,7 +2919,9 @@ CONFIG_HPET_MMAP=y
 CONFIG_HANGCHECK_TIMER=m
 CONFIG_UV_MMTIMER=m
 CONFIG_TCG_TPM=y
+CONFIG_TCG_TIS_CORE=y
 CONFIG_TCG_TIS=y
+# CONFIG_TCG_TIS_SPI is not set
 CONFIG_TCG_TIS_I2C_ATMEL=m
 CONFIG_TCG_TIS_I2C_INFINEON=m
 CONFIG_TCG_TIS_I2C_NUVOTON=m
@@ -2851,8 +2930,10 @@ CONFIG_TCG_ATMEL=m
 CONFIG_TCG_INFINEON=m
 # CONFIG_TCG_XEN is not set
 CONFIG_TCG_CRB=m
+# CONFIG_TCG_VTPM_PROXY is not set
 CONFIG_TCG_TIS_ST33ZP24=m
 CONFIG_TCG_TIS_ST33ZP24_I2C=m
+# CONFIG_TCG_TIS_ST33ZP24_SPI is not set
 CONFIG_TELCLOCK=m
 CONFIG_DEVPORT=y
 CONFIG_HMC_DRV=m
@@ -2986,22 +3067,28 @@ CONFIG_PPS_CLIENT_GPIO=m
 CONFIG_PTP_1588_CLOCK=m
 CONFIG_DP83640_PHY=m
 CONFIG_PTP_1588_CLOCK_PCH=m
+CONFIG_PTP_1588_CLOCK_KVM=m
 CONFIG_PINCTRL=y
 #
 # Pin controllers
 #
 # CONFIG_PINMUX is not set
-# CONFIG_PINCONF is not set
+CONFIG_PINCONF=y
+CONFIG_GENERIC_PINCONF=y
 # CONFIG_DEBUG_PINCTRL is not set
+CONFIG_PINCTRL_AMD=m
 # CONFIG_PINCTRL_EXYNOS5440 is not set
 CONFIG_PINCTRL_BAYTRAIL=y
+# CONFIG_PINCTRL_SUNRISEPOINT is not set
 CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y
 CONFIG_GPIOLIB=y
 CONFIG_GPIO_DEVRES=y
 CONFIG_GPIO_ACPI=y
+CONFIG_GPIOLIB_IRQCHIP=y
 # CONFIG_DEBUG_GPIO is not set
 CONFIG_GPIO_SYSFS=y
+CONFIG_GPIO_AMDPT=m
 #
 # Memory mapped GPIO drivers:
@@ -3128,7 +3215,7 @@ CONFIG_SENSORS_CORETEMP=m
 CONFIG_SENSORS_IBMAEM=m
 CONFIG_SENSORS_IBMPEX=m
 CONFIG_SENSORS_IT87=m
-# CONFIG_SENSORS_JC42 is not set
+CONFIG_SENSORS_JC42=m
 CONFIG_SENSORS_LINEAGE=m
 CONFIG_SENSORS_LM63=m
 # CONFIG_SENSORS_LM70 is not set
@@ -3278,6 +3365,7 @@ CONFIG_W83877F_WDT=m
 CONFIG_W83977F_WDT=m
 CONFIG_MACHZ_WDT=m
 # CONFIG_SBC_EPX_C3_WATCHDOG is not set
+CONFIG_INTEL_MEI_WDT=m
 CONFIG_XEN_WDT=m
 #
@@ -3901,8 +3989,7 @@ CONFIG_VGA_ARB=y
 CONFIG_VGA_ARB_MAX_GPUS=64
 CONFIG_VGA_SWITCHEROO=y
 CONFIG_DRM=m
-CONFIG_DRM_MIPI_DSI=y
-# CONFIG_DRM_DP_AUX_CHARDEV is not set
+CONFIG_DRM_DP_AUX_CHARDEV=y
 CONFIG_DRM_KMS_HELPER=m
 CONFIG_DRM_KMS_FB_HELPER=y
 CONFIG_DRM_FBDEV_EMULATION=y
@@ -3916,14 +4003,13 @@ CONFIG_DRM_TTM=m
 CONFIG_DRM_I2C_CH7006=m
 CONFIG_DRM_I2C_SIL164=m
 # CONFIG_DRM_I2C_NXP_TDA998X is not set
-# CONFIG_DRM_TDFX is not set
-# CONFIG_DRM_R128 is not set
 CONFIG_DRM_RADEON=m
 # CONFIG_DRM_RADEON_USERPTR is not set
 CONFIG_DRM_AMDGPU=m
+# CONFIG_DRM_AMDGPU_SI is not set
 # CONFIG_DRM_AMDGPU_CIK is not set
 # CONFIG_DRM_AMDGPU_USERPTR is not set
-CONFIG_DRM_AMD_POWERPLAY=y
+# CONFIG_DRM_AMDGPU_GART_DEBUGFS is not set
 #
 # ACP (Audio CoProcessor) Configuration
 #
@@ -3933,14 +4019,13 @@ CONFIG_DRM_NOUVEAU=m
 CONFIG_NOUVEAU_DEBUG=5
 CONFIG_NOUVEAU_DEBUG_DEFAULT=3
 CONFIG_DRM_NOUVEAU_BACKLIGHT=y
-# CONFIG_DRM_I810 is not set
 CONFIG_DRM_I915=m
-# CONFIG_DRM_I915_PRELIMINARY_HW_SUPPORT is not set
+# CONFIG_DRM_I915_ALPHA_SUPPORT is not set
+CONFIG_DRM_I915_CAPTURE_ERROR=y
+CONFIG_DRM_I915_COMPRESS_ERROR=y
 CONFIG_DRM_I915_USERPTR=y
-# CONFIG_DRM_MGA is not set
-# CONFIG_DRM_SIS is not set
-# CONFIG_DRM_VIA is not set
-# CONFIG_DRM_SAVAGE is not set
+CONFIG_DRM_I915_GVT=y
+CONFIG_DRM_I915_GVT_KVMGT=m
 CONFIG_DRM_VMWGFX=m
 CONFIG_DRM_VMWGFX_FBCON=y
 CONFIG_DRM_GMA500=m
@@ -3953,12 +4038,8 @@ CONFIG_DRM_CIRRUS_QEMU=m
 CONFIG_DRM_QXL=m
 CONFIG_DRM_BOCHS=m
 CONFIG_DRM_VIRTIO_GPU=m
-CONFIG_DRM_PANEL=y
-
-#
-# Display Panels
-#
 CONFIG_HSA_AMD=m
+# CONFIG_DRM_LEGACY is not set
 # CONFIG_VGASTATE is not set
 CONFIG_VIDEO_OUTPUT_CONTROL=m
 CONFIG_HDMI=y
@@ -4259,6 +4340,8 @@ CONFIG_SND_SST_IPC_ACPI=m
 CONFIG_SND_SOC_INTEL_SST=m
 CONFIG_SND_SOC_INTEL_SST_ACPI=m
 CONFIG_SND_SOC_INTEL_SST_MATCH=m
+# CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_MACH is not set
+# CONFIG_SND_SOC_INTEL_BXT_RT298_MACH is not set
 CONFIG_SND_SOC_INTEL_BYTCR_RT5640_MACH=m
 CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=m
 CONFIG_SND_SOC_INTEL_CHT_BSW_RT5672_MACH=m
@@ -4405,7 +4488,7 @@ CONFIG_USB_DEFAULT_PERSIST=y
 # CONFIG_USB_OTG is not set
 # CONFIG_USB_OTG_WHITELIST is not set
 # CONFIG_USB_OTG_FSM is not set
-# CONFIG_USB_ULPI_BUS is not set
+CONFIG_USB_LEDS_TRIGGER_USBPORT=m
 CONFIG_USB_MON=y
 CONFIG_USB_WUSB=m
 CONFIG_USB_WUSB_CBAF=m
@@ -4480,6 +4563,10 @@ CONFIG_USB_UAS=m
 #
 CONFIG_USB_MDC800=m
 CONFIG_USB_MICROTEK=m
+CONFIG_USBIP_CORE=m
+# CONFIG_USBIP_VHCI_HCD is not set
+# CONFIG_USBIP_HOST is not set
+# CONFIG_USBIP_DEBUG is not set
 # CONFIG_USB_DWC3 is not set
 # CONFIG_USB_CHIPIDEA is not set
@@ -4553,7 +4640,6 @@ CONFIG_USB_SEVSEG=m
 # CONFIG_USB_RIO500 is not set
 CONFIG_USB_LEGOTOWER=m
 CONFIG_USB_LCD=m
-CONFIG_USB_LED=m
 # CONFIG_USB_CYPRESS_CY7C63 is not set
 # CONFIG_USB_CYTHERM is not set
 CONFIG_USB_IDMOUSE=m
@@ -4570,8 +4656,10 @@ CONFIG_USB_ISIGHTFW=m
 # CONFIG_USB_YUREX is not set
 CONFIG_USB_EZUSB_FX2=m
 CONFIG_USB_HSIC_USB3503=m
+# CONFIG_USB_HSIC_USB4604 is not set
 # CONFIG_USB_LINK_LAYER_TEST is not set
 # CONFIG_USB_CHAOSKEY is not set
+# CONFIG_UCSI is not set
 CONFIG_USB_ATM=m
 CONFIG_USB_SPEEDTOUCH=m
 CONFIG_USB_CXACRU=m
@@ -4580,16 +4668,13 @@ CONFIG_USB_XUSBATM=m
 # CONFIG_USB_PHY is not set
 # CONFIG_USB_GADGET is not set
 # CONFIG_USB_LED_TRIG is not set
+# CONFIG_USB_ULPI_BUS is not set
 CONFIG_UWB=m
 CONFIG_UWB_HWA=m
 CONFIG_UWB_WHCI=m
 CONFIG_UWB_I1480U=m
 CONFIG_MMC=m
 # CONFIG_MMC_DEBUG is not set
-
-#
-# MMC/SD/SDIO Card Drivers
-#
 CONFIG_MMC_BLOCK=m
 CONFIG_MMC_BLOCK_MINORS=8
 CONFIG_MMC_BLOCK_BOUNCE=y
@@ -4712,10 +4797,12 @@ CONFIG_INFINIBAND_SRPT=m
 CONFIG_INFINIBAND_ISER=m
 CONFIG_INFINIBAND_ISERT=m
 CONFIG_INFINIBAND_RDMAVT=m
+CONFIG_RDMA_RXE=m
 CONFIG_INFINIBAND_HFI1=m
 # CONFIG_HFI1_DEBUG_SDMA_ORDER is not set
 CONFIG_HFI1_VERBS_31BIT_PSN=y
 # CONFIG_SDMA_VERBOSITY is not set
+CONFIG_INFINIBAND_QEDR=m
 CONFIG_EDAC=y
 CONFIG_EDAC_LEGACY_SYSFS=y
 # CONFIG_EDAC_DEBUG is not set
@@ -4736,6 +4823,7 @@ CONFIG_EDAC_I5000=m
 CONFIG_EDAC_I5100=m
 CONFIG_EDAC_I7300=m
 CONFIG_EDAC_SBRIDGE=m
+CONFIG_EDAC_SKX=m
 CONFIG_RTC_LIB=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_HCTOSYS=y
@@ -4840,6 +4928,12 @@ CONFIG_DMA_ACPI=y
 CONFIG_NET_DMA_RH_KABI=y
 CONFIG_ASYNC_TX_DMA=y
 # CONFIG_DMATEST is not set
+
+#
+# DMABUF options
+#
+CONFIG_SYNC_FILE=y
+# CONFIG_SW_SYNC is not set
 CONFIG_DCA=m
 CONFIG_AUXDISPLAY=y
 CONFIG_KS0108=m
@@ -4856,6 +4950,7 @@ CONFIG_UIO_AEC=m
 CONFIG_UIO_SERCOS3=m
 CONFIG_UIO_PCI_GENERIC=m
 # CONFIG_UIO_NETX is not set
+CONFIG_UIO_HV_GENERIC=m
 CONFIG_VFIO_IOMMU_TYPE1=m
 CONFIG_VFIO=m
 CONFIG_VFIO_NOIOMMU=y
@@ -4863,6 +4958,8 @@ CONFIG_VFIO_PCI=m
 # CONFIG_VFIO_PCI_VGA is not set
 CONFIG_VFIO_PCI_MMAP=y
 CONFIG_VFIO_PCI_INTX=y
+CONFIG_VFIO_MDEV=m
+CONFIG_VFIO_MDEV_DEVICE=m
 CONFIG_IRQ_BYPASS_MANAGER=m
 # CONFIG_VIRT_DRIVERS is not set
 CONFIG_VIRTIO=m
@@ -4904,7 +5001,6 @@ CONFIG_XEN_HAVE_PVMMU=y
 CONFIG_STAGING=y
 # CONFIG_ET131X is not set
 # CONFIG_SLICOSS is not set
-# CONFIG_USBIP_CORE is not set
 # CONFIG_W35UND is not set
 # CONFIG_PRISM2_USB is not set
 # CONFIG_ECHO is not set
@@ -4964,9 +5060,12 @@ CONFIG_ACER_WMI=m
 CONFIG_ACERHDF=m
 CONFIG_ASUS_LAPTOP=m
 CONFIG_CHROMEOS_LAPTOP=m
+CONFIG_DELL_SMBIOS=m
 CONFIG_DELL_LAPTOP=m
 CONFIG_DELL_WMI=m
 CONFIG_DELL_WMI_AIO=m
+CONFIG_DELL_SMO8800=m
+CONFIG_DELL_RBTN=m
 CONFIG_FUJITSU_LAPTOP=m
 # CONFIG_FUJITSU_LAPTOP_DEBUG is not set
 CONFIG_FUJITSU_TABLET=m
@@ -4999,6 +5098,7 @@ CONFIG_TOPSTAR_LAPTOP=m
 CONFIG_ACPI_TOSHIBA=m
 CONFIG_TOSHIBA_BT_RFKILL=m
 CONFIG_ACPI_CMPC=m
+CONFIG_INTEL_HID_EVENT=m
 CONFIG_INTEL_IPS=m
 # CONFIG_IBM_RTL is not set
 # CONFIG_XO15_EBOOK is not set
@@ -5049,6 +5149,12 @@ CONFIG_IRQ_REMAP=y
 # CONFIG_MEMORY is not set
 # CONFIG_IIO is not set
 CONFIG_NTB=m
+CONFIG_NTB_AMD=m
+# CONFIG_NTB_INTEL is not set
+# CONFIG_NTB_PINGPONG is not set
+# CONFIG_NTB_TOOL is not set
+CONFIG_NTB_PERF=m
+CONFIG_NTB_TRANSPORT=m
 # CONFIG_VME_BUS is not set
 CONFIG_PWM=y
 CONFIG_PWM_SYSFS=y
@@ -5065,6 +5171,9 @@ CONFIG_BTT=y
 CONFIG_ND_PFN=m
 CONFIG_NVDIMM_PFN=y
 CONFIG_NVDIMM_DAX=y
+CONFIG_DEV_DAX=m
+CONFIG_DEV_DAX_PMEM=m
+CONFIG_NR_DEV_DAX=32768
 #
 # Firmware Drivers
@@ -5085,6 +5194,7 @@ CONFIG_ISCSI_IBFT=m
 # EFI (Extensible Firmware Interface) Support
 #
 CONFIG_EFI_VARS=y
+CONFIG_EFI_ESRT=y
 CONFIG_EFI_VARS_PSTORE=y
 CONFIG_EFI_VARS_PSTORE_DEFAULT_DISABLE=y
 CONFIG_EFI_RUNTIME_MAP=y
@@ -5185,6 +5295,7 @@ CONFIG_PROC_KCORE=y
 CONFIG_PROC_VMCORE=y
 CONFIG_PROC_SYSCTL=y
 CONFIG_PROC_PAGE_MONITOR=y
+CONFIG_KERNFS=y
 CONFIG_SYSFS=y
 CONFIG_TMPFS=y
 CONFIG_TMPFS_POSIX_ACL=y
@@ -5460,6 +5571,7 @@ CONFIG_FUNCTION_TRACER=y
 CONFIG_FUNCTION_GRAPH_TRACER=y
 # CONFIG_IRQSOFF_TRACER is not set
 CONFIG_SCHED_TRACER=y
+CONFIG_HWLAT_TRACER=y
 CONFIG_FTRACE_SYSCALLS=y
 CONFIG_TRACER_SNAPSHOT=y
 # CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP is not set
@@ -5482,6 +5594,7 @@ CONFIG_RING_BUFFER_BENCHMARK=m
 # CONFIG_RBTREE_TEST is not set
 # CONFIG_INTERVAL_TREE_TEST is not set
 # CONFIG_TEST_RHASHTABLE is not set
+# CONFIG_TEST_PARMAN is not set
 CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
 # CONFIG_FIREWIRE_OHCI_REMOTE_DMA is not set
 CONFIG_BUILD_DOCSRC=y
@@ -5545,7 +5658,7 @@ CONFIG_SECURITY=y
 CONFIG_SECURITYFS=y
 CONFIG_SECURITY_NETWORK=y
 CONFIG_SECURITY_NETWORK_XFRM=y
-# CONFIG_SECURITY_PATH is not set
+CONFIG_SECURITY_PATH=y
 CONFIG_SECURITY_SECURELEVEL=y
 CONFIG_INTEL_TXT=y
 CONFIG_LSM_MMAP_MIN_ADDR=65535
@@ -5560,7 +5673,8 @@ CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
 # CONFIG_SECURITY_SMACK is not set
 # CONFIG_SECURITY_TOMOYO is not set
 # CONFIG_SECURITY_APPARMOR is not set
-# CONFIG_SECURITY_YAMA is not set
+CONFIG_SECURITY_YAMA=y
+CONFIG_SECURITY_YAMA_STACKED=y
 CONFIG_INTEGRITY=y
 CONFIG_INTEGRITY_SIGNATURE=y
 CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y
@@ -5573,6 +5687,7 @@ CONFIG_IMA_TRUSTED_KEYRING=y
 CONFIG_EVM=y
 CONFIG_EVM_HMAC_VERSION=2
 CONFIG_DEFAULT_SECURITY_SELINUX=y
+# CONFIG_DEFAULT_SECURITY_YAMA is not set
 # CONFIG_DEFAULT_SECURITY_DAC is not set
 CONFIG_DEFAULT_SECURITY="selinux"
 CONFIG_XOR_BLOCKS=m
@@ -5601,7 +5716,11 @@ CONFIG_CRYPTO_PCOMP=m
 CONFIG_CRYPTO_PCOMP2=y
 CONFIG_CRYPTO_AKCIPHER2=y
 CONFIG_CRYPTO_AKCIPHER=m
-# CONFIG_CRYPTO_RSA is not set
+CONFIG_CRYPTO_KPP2=y
+CONFIG_CRYPTO_KPP=m
+CONFIG_CRYPTO_RSA=m
+CONFIG_CRYPTO_DH=m
+# CONFIG_CRYPTO_ECDH is not set
 CONFIG_CRYPTO_MANAGER=y
 CONFIG_CRYPTO_MANAGER2=y
 CONFIG_CRYPTO_USER=m
@@ -5665,6 +5784,8 @@ CONFIG_CRYPTO_SHA1_SSSE3=y
 CONFIG_CRYPTO_SHA256_SSSE3=y
 CONFIG_CRYPTO_SHA512_SSSE3=m
 CONFIG_CRYPTO_SHA1_MB=m
+CONFIG_CRYPTO_SHA256_MB=m
+CONFIG_CRYPTO_SHA512_MB=m
 CONFIG_CRYPTO_SHA256=y
 CONFIG_CRYPTO_SHA512=m
 CONFIG_CRYPTO_TGR192=m
@@ -5725,14 +5846,18 @@ CONFIG_CRYPTO_DRBG_HMAC=y
 CONFIG_CRYPTO_DRBG_HASH=y
 CONFIG_CRYPTO_DRBG_CTR=y
 CONFIG_CRYPTO_DRBG=m
+CONFIG_CRYPTO_JITTERENTROPY=m
 CONFIG_CRYPTO_USER_API=y
 CONFIG_CRYPTO_USER_API_HASH=y
 CONFIG_CRYPTO_USER_API_SKCIPHER=y
+CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_HASH_INFO=y
 CONFIG_CRYPTO_HW=y
 CONFIG_CRYPTO_DEV_PADLOCK=m
 CONFIG_CRYPTO_DEV_PADLOCK_AES=m
 CONFIG_CRYPTO_DEV_PADLOCK_SHA=m
+CONFIG_CRYPTO_DEV_CCP=y
+CONFIG_CRYPTO_DEV_CCP_DD=m
 CONFIG_CRYPTO_DEV_QAT=m
 CONFIG_CRYPTO_DEV_QAT_DH895xCC=m
 CONFIG_CRYPTO_DEV_QAT_C3XXX=m
@@ -5740,6 +5865,7 @@ CONFIG_CRYPTO_DEV_QAT_C62X=m
 CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m
 CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m
 CONFIG_CRYPTO_DEV_QAT_C62XVF=m
+CONFIG_CRYPTO_DEV_CHELSIO=m
 CONFIG_ASYMMETRIC_KEY_TYPE=y
 CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y
 CONFIG_PUBLIC_KEY_ALGO_RSA=y
@@ -5798,7 +5924,7 @@ CONFIG_LIBCRC32C=m
 CONFIG_CRC8=m
 # CONFIG_RANDOM32_SELFTEST is not set
 CONFIG_ZLIB_INFLATE=y
-CONFIG_ZLIB_DEFLATE=m
+CONFIG_ZLIB_DEFLATE=y
 CONFIG_LZO_COMPRESS=y
 CONFIG_LZO_DECOMPRESS=y
 CONFIG_XZ_DEC=y
@@ -5824,7 +5950,6 @@ CONFIG_TEXTSEARCH_KMP=m
 CONFIG_TEXTSEARCH_BM=m
 CONFIG_TEXTSEARCH_FSM=m
 CONFIG_INTERVAL_TREE=y
-CONFIG_GENERIC_PAGE_TABLE=y
 CONFIG_ASSOCIATIVE_ARRAY=y
 CONFIG_HAS_IOMEM=y
 CONFIG_HAS_IOPORT=y
@@ -5844,6 +5969,8 @@ CONFIG_MPILIB=y
 CONFIG_SIGNATURE=y
 CONFIG_OID_REGISTRY=y
 CONFIG_UCS2_STRING=y
+CONFIG_SG_POOL=y
 CONFIG_ARCH_HAS_PMEM_API=y
 CONFIG_ARCH_HAS_MMIO_FLUSH=y
+CONFIG_PARMAN=m
 CONFIG_RH_KABI_SIZE_ALIGN_CHECKS=y
diff --git a/lustre/kernel_patches/patches/blkdev_tunables-3.9.patch b/lustre/kernel_patches/patches/blkdev_tunables-3.9.patch
new file mode 100644
index 0000000..786c0df
--- /dev/null
+++ b/lustre/kernel_patches/patches/blkdev_tunables-3.9.patch
@@ -0,0 +1,70 @@
+Index: linux-3.10.0-495.el7.x86_64/block/blk-settings.c
+===================================================================
+--- linux-3.10.0-495.el7.x86_64.orig/block/blk-settings.c
++++ linux-3.10.0-495.el7.x86_64/block/blk-settings.c
+@@ -19,6 +19,12 @@ EXPORT_SYMBOL(blk_max_low_pfn);
+ 
+ unsigned long blk_max_pfn;
+ 
++int default_max_sectors = BLK_DEF_MAX_SECTORS;
++module_param(default_max_sectors, int, 0);
++
++int default_max_segments = BLK_MAX_SEGMENTS;
++module_param(default_max_segments, int, 0);
++
+ /**
+  * blk_queue_prep_rq - set a prepare_request function for queue
+  * @q: queue
+@@ -108,7 +114,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
+  */
+ void blk_set_default_limits(struct queue_limits *lim)
+ {
+-	lim->max_segments = BLK_MAX_SEGMENTS;
++	lim->max_segments = default_max_segments;
+ 	lim->max_integrity_segments = 0;
+ 	lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
+ 	if (lim->limits_aux)
+@@ -268,7 +274,7 @@ void blk_limits_max_hw_sectors(struct qu
+ 
+ 	limits->max_hw_sectors = max_hw_sectors;
+ 	max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors);
+-	max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS);
++	max_sectors = min_t(unsigned int, max_sectors, default_max_sectors);
+ 	limits->max_sectors = max_sectors;
+ }
+ EXPORT_SYMBOL(blk_limits_max_hw_sectors);
+Index: linux-3.10.0-495.el7.x86_64/drivers/message/fusion/Kconfig
+===================================================================
+--- linux-3.10.0-495.el7.x86_64.orig/drivers/message/fusion/Kconfig
++++ linux-3.10.0-495.el7.x86_64/drivers/message/fusion/Kconfig
+@@ -61,9 +61,9 @@ config FUSION_SAS
+ 	  LSISAS1078
+ 
+ config FUSION_MAX_SGE
+-	int "Maximum number of scatter gather entries (16 - 128)"
+-	default "128"
+-	range 16 128
++	int "Maximum number of scatter gather entries (16 - 256)"
++	default "256"
++	range 16 256
+ 	help
+ 	  This option allows you to specify the maximum number of scatter-
+ 	  gather entries per I/O. The driver default is 128, which matches
+Index: linux-3.10.0-495.el7.x86_64/drivers/message/fusion/mptbase.h
+===================================================================
+--- linux-3.10.0-495.el7.x86_64.orig/drivers/message/fusion/mptbase.h
++++ linux-3.10.0-495.el7.x86_64/drivers/message/fusion/mptbase.h
+@@ -166,10 +166,10 @@
+  * Set the MAX_SGE value based on user input.
+  */
+ #ifdef CONFIG_FUSION_MAX_SGE
+-#if CONFIG_FUSION_MAX_SGE < 16
++#if CONFIG_FUSION_MAX_SGE < 16
+ #define MPT_SCSI_SG_DEPTH 16
+-#elif CONFIG_FUSION_MAX_SGE > 128
+-#define MPT_SCSI_SG_DEPTH 128
++#elif CONFIG_FUSION_MAX_SGE > 256
++#define MPT_SCSI_SG_DEPTH 256
+ #else
+ #define MPT_SCSI_SG_DEPTH CONFIG_FUSION_MAX_SGE
+ #endif
diff --git a/lustre/kernel_patches/patches/raid5-mmp-unplug-dev-3.8.patch b/lustre/kernel_patches/patches/raid5-mmp-unplug-dev-3.8.patch
new file mode 100644
index 0000000..35f71de
--- /dev/null
+++ b/lustre/kernel_patches/patches/raid5-mmp-unplug-dev-3.8.patch
@@ -0,0 +1,21 @@
+--- linux-3.10.0-685.el7.x86_64/drivers/md/raid5.c.orig	2017-06-28 14:06:00.627299582 -0700
++++ linux-3.10.0-685.el7.x86_64/drivers/md/raid5.c	2017-06-28 14:08:01.564618793 -0700
+@@ -3090,6 +3090,8 @@ static int add_stripe_bio(struct stripe_
+ 	bi->bi_next = *bip;
+ 	*bip = bi;
+ 	raid5_inc_bi_active_stripes(bi);
++	if ((bi->bi_rw & REQ_SYNC) && !forwrite)
++		clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); /* force to read from disk. */
+ 
+ 	if (forwrite) {
+ 		/* check if page is covered */
+@@ -5538,6 +5540,9 @@ static bool raid5_make_request(struct md
+ 						bi, 0);
+ 		bio_endio(bi, 0);
+ 	}
++
++	if (bi->bi_rw & REQ_SYNC)
++		md_wakeup_thread(mddev->thread);
+ 	return true;
+ }
+ 
diff --git a/lustre/kernel_patches/series/3.10-rhel7.series b/lustre/kernel_patches/series/3.10-rhel7.series
index 24abd03..093f78d 100644
--- a/lustre/kernel_patches/series/3.10-rhel7.series
+++ b/lustre/kernel_patches/series/3.10-rhel7.series
@@ -1,4 +1,4 @@
-raid5-mmp-unplug-dev-3.7.patch
+raid5-mmp-unplug-dev-3.8.patch
 dev_read_only-3.7.patch
-blkdev_tunables-3.8.patch
+blkdev_tunables-3.9.patch
 vfs-project-quotas-rhel7.patch
diff --git a/lustre/kernel_patches/targets/3.10-rhel7.target.in b/lustre/kernel_patches/targets/3.10-rhel7.target.in
index 90c7701..b4f583b 100644
--- a/lustre/kernel_patches/targets/3.10-rhel7.target.in
+++ b/lustre/kernel_patches/targets/3.10-rhel7.target.in
@@ -1,5 +1,5 @@
 lnxmaj="3.10.0"
-lnxrel="514.26.2.el7"
+lnxrel="693.el7"
 
 KERNEL_SRPM=kernel-${lnxmaj}-${lnxrel}.src.rpm
 SERIES=3.10-rhel7.series
diff --git a/lustre/kernel_patches/which_patch b/lustre/kernel_patches/which_patch
index 07918d4..2fc9af1 100644
--- a/lustre/kernel_patches/which_patch
+++ b/lustre/kernel_patches/which_patch
@@ -18,7 +18,7 @@ PATCH SERIES FOR SERVER KERNELS:
 2.6-rhel6.series        2.6.32-573.26.1.el6 (RHEL 6.7)
 2.6-rhel6.8.series      2.6.32-642.15.1.el6 (RHEL 6.8)
 2.6-rhel6.8.series      2.6.32-696.6.3.el6 (RHEL 6.9)
-3.10-rhel7.series       3.10.0-514.26.2.el7 (RHEL 7.3)
+3.10-rhel7.series       3.10.0-693.el7 (RHEL 7.4)
 3.0-sles11sp3.series    3.0.101-0.47.71 (SLES11 SP3)
 3.0-sles11sp3.series    3.0.101-107 (SLES11 SP4)
 3.12-sles12.series      3.12.74-60.64.40 (SLES12 SP1)
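
Editor's note, not part of the commit: blkdev_tunables-3.9.patch above turns two block-layer
compile-time constants (BLK_DEF_MAX_SECTORS and BLK_MAX_SEGMENTS) into module parameters so a
Lustre server can raise the per-request I/O size without rebuilding the kernel. The following
minimal sketch demonstrates the same module_param() + min_t() clamping pattern in isolation.
Everything in it is hypothetical and invented for illustration: the module name tunables_demo,
the parameter demo_max_sectors, and both numeric values.

/*
 * tunables_demo.c - minimal sketch of the module_param() + min_t()
 * clamping pattern used by blkdev_tunables-3.9.patch. Hypothetical
 * example for illustration only; not part of the Lustre tree.
 */
#include <linux/module.h>
#include <linux/kernel.h>

/* Stand-in for BLK_DEF_MAX_SECTORS; overridable at load time. */
static int demo_max_sectors = 1024;
module_param(demo_max_sectors, int, 0444);
MODULE_PARM_DESC(demo_max_sectors, "default cap applied to max_sectors");

static int __init tunables_demo_init(void)
{
	/* Pretend the HBA advertises a larger hardware limit. */
	unsigned int hw_max_sectors = 2048;
	unsigned int max_sectors;

	/* Same clamp as the patched blk_limits_max_hw_sectors(). */
	max_sectors = min_t(unsigned int, hw_max_sectors, demo_max_sectors);
	pr_info("tunables_demo: max_sectors clamped to %u\n", max_sectors);
	return 0;
}

static void __exit tunables_demo_exit(void)
{
}

module_init(tunables_demo_init);
module_exit(tunables_demo_exit);
MODULE_LICENSE("GPL");

Built out of tree, this loads as "insmod tunables_demo.ko demo_max_sectors=4096". One caveat:
block/blk-settings.c is built into the kernel rather than a loadable module, so the real
default_max_sectors/default_max_segments parameters added by the patch are set on the kernel
boot command line rather than at module load time.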