From: Bob Glossman
Date: Thu, 19 Nov 2015 17:28:04 +0000 (-0800)
Subject: LU-7451 kernel: kernel upgrade RHEL7.2 [3.10.0-327.3.1.el7]
X-Git-Tag: 2.7.65~40
X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=b8f8a5b8f0fe889b61acafb48a756cb3872b8598;hp=ef46d32bc5aa713ab55179bd836d750def0022d7

LU-7451 kernel: kernel upgrade RHEL7.2 [3.10.0-327.3.1.el7]

With this mod we switch our supported el7 version to RHEL 7.2.
The original-release RHEL 7.2 kernel version is 3.10.0-327.el7;
the current RHEL 7.2 kernel update version is 3.10.0-327.3.1.el7.

Test-Parameters: clientdistro=el7 mdsdistro=el7 ossdistro=el7 \
  mdsfilesystemtype=ldiskfs mdtfilesystemtype=ldiskfs \
  ostfilesystemtype=ldiskfs testgroup=review-ldiskfs
Signed-off-by: Bob Glossman
Change-Id: Idcfbee3875e7114962c92401f782bb035a6e3221
Reviewed-on: http://review.whamcloud.com/17305
Tested-by: Jenkins
Tested-by: Maloo
Reviewed-by: Dmitry Eremin
Reviewed-by: Yang Sheng
Reviewed-by: Alex Zhuravlev
Reviewed-by: Oleg Drokin
---

diff --git a/config/lustre-build-ldiskfs.m4 b/config/lustre-build-ldiskfs.m4
index c815b84..29d08d1 100644
--- a/config/lustre-build-ldiskfs.m4
+++ b/config/lustre-build-ldiskfs.m4
@@ -6,6 +6,7 @@ LDISKFS_SERIES=
 AC_MSG_CHECKING([which ldiskfs series to use])
 AS_IF([test x$RHEL_KERNEL = xyes], [
 case $RHEL_RELEASE_NO in
+ 72) LDISKFS_SERIES="3.10-rhel7.2.series" ;;
 71) LDISKFS_SERIES="3.10-rhel7.series" ;;
 67) LDISKFS_SERIES="2.6-rhel6.7.series" ;;
 66) LDISKFS_SERIES="2.6-rhel6.6.series" ;;
diff --git a/ldiskfs/kernel_patches/patches/rhel7.2/ext4-large-eas.patch b/ldiskfs/kernel_patches/patches/rhel7.2/ext4-large-eas.patch
new file mode 100644
index 0000000..03bdf17
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/rhel7.2/ext4-large-eas.patch
@@ -0,0 +1,1060 @@
+This patch implements large EA support in ext4. If the size of
+an EA value is larger than the blocksize, the EA value is not
+saved in the external EA block; instead it is saved in an
+external EA inode. The patch therefore also supports a larger
+number of EAs.
+
+Index: linux-stage/fs/ext4/ext4.h
+===================================================================
+--- linux-stage.orig/fs/ext4/ext4.h
++++ linux-stage/fs/ext4/ext4.h
+@@ -1579,6 +1579,7 @@ static inline void ext4_clear_state_flag
+ EXT4_FEATURE_INCOMPAT_EXTENTS| \
+ EXT4_FEATURE_INCOMPAT_64BIT| \
+ EXT4_FEATURE_INCOMPAT_FLEX_BG| \
++ EXT4_FEATURE_INCOMPAT_EA_INODE| \
+ EXT4_FEATURE_INCOMPAT_MMP | \
+ EXT4_FEATURE_INCOMPAT_DIRDATA| \
+ EXT4_FEATURE_INCOMPAT_INLINE_DATA)
+@@ -1979,6 +1980,12 @@ struct mmpd_data {
+ #define EXT4_MMP_MAX_CHECK_INTERVAL 300UL
+
+ /*
++ * Maximum size of an xattr value for FEATURE_INCOMPAT_EA_INODE: 1MB.
++ * This limit is arbitrary, but is reasonable for the xattr API.
++ */ ++#define EXT4_XATTR_MAX_LARGE_EA_SIZE (1024 * 1024) ++ ++/* + * Function prototypes + */ + +@@ -1990,6 +1997,10 @@ struct mmpd_data { + # define ATTRIB_NORET __attribute__((noreturn)) + # define NORET_AND noreturn, + ++struct ext4_xattr_ino_array { ++ unsigned int xia_count; /* # of used item in the array */ ++ unsigned int xia_inodes[0]; ++}; + /* bitmap.c */ + extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); + void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, +@@ -2194,6 +2205,7 @@ extern void ext4_set_inode_flags(struct + extern void ext4_get_inode_flags(struct ext4_inode_info *); + extern int ext4_alloc_da_blocks(struct inode *inode); + extern void ext4_set_aops(struct inode *inode); ++extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int chunk); + extern int ext4_writepage_trans_blocks(struct inode *); + extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); + extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, +Index: linux-stage/fs/ext4/inode.c +=================================================================== +--- linux-stage.orig/fs/ext4/inode.c ++++ linux-stage/fs/ext4/inode.c +@@ -134,8 +134,6 @@ static void ext4_invalidatepage(struct p + unsigned int length); + static int __ext4_journalled_writepage(struct page *page, unsigned int len); + static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); +-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, +- int pextents); + + /* + * Test whether an inode is a fast symlink. +@@ -184,6 +182,8 @@ void ext4_evict_inode(struct inode *inod + { + handle_t *handle; + int err; ++ int extra_credits = 3; ++ struct ext4_xattr_ino_array *lea_ino_array = NULL; + + trace_ext4_evict_inode(inode); + +@@ -236,8 +236,8 @@ void ext4_evict_inode(struct inode *inod + * protection against it + */ + sb_start_intwrite(inode->i_sb); +- handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, +- ext4_blocks_for_truncate(inode)+3); ++ ++ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, extra_credits); + if (IS_ERR(handle)) { + ext4_std_error(inode->i_sb, PTR_ERR(handle)); + /* +@@ -249,9 +249,36 @@ void ext4_evict_inode(struct inode *inod + sb_end_intwrite(inode->i_sb); + goto no_delete; + } +- + if (IS_SYNC(inode)) + ext4_handle_sync(handle); ++ ++ /* ++ * Delete xattr inode before deleting the main inode. 
++ */ ++ err = ext4_xattr_delete_inode(handle, inode, &lea_ino_array); ++ if (err) { ++ ext4_warning(inode->i_sb, ++ "couldn't delete inode's xattr (err %d)", err); ++ goto stop_handle; ++ } ++ ++ if (!IS_NOQUOTA(inode)) ++ extra_credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb); ++ ++ if (!ext4_handle_has_enough_credits(handle, ++ ext4_blocks_for_truncate(inode) + extra_credits)) { ++ err = ext4_journal_extend(handle, ++ ext4_blocks_for_truncate(inode) + extra_credits); ++ if (err > 0) ++ err = ext4_journal_restart(handle, ++ ext4_blocks_for_truncate(inode) + extra_credits); ++ if (err != 0) { ++ ext4_warning(inode->i_sb, ++ "couldn't extend journal (err %d)", err); ++ goto stop_handle; ++ } ++ } ++ + inode->i_size = 0; + err = ext4_mark_inode_dirty(handle, inode); + if (err) { +@@ -306,8 +333,12 @@ void ext4_evict_inode(struct inode *inod + ext4_clear_inode(inode); + else + ext4_free_inode(handle, inode); ++ + ext4_journal_stop(handle); + sb_end_intwrite(inode->i_sb); ++ ++ if (lea_ino_array != NULL) ++ ext4_xattr_inode_array_free(inode, lea_ino_array); + return; + no_delete: + ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ +@@ -4681,7 +4712,7 @@ static int ext4_index_trans_blocks(struc + * + * Also account for superblock, inode, quota and xattr blocks + */ +-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, ++int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents) + { + ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); +Index: linux-stage/fs/ext4/xattr.c +=================================================================== +--- linux-stage.orig/fs/ext4/xattr.c ++++ linux-stage/fs/ext4/xattr.c +@@ -201,6 +201,7 @@ ext4_xattr_check_names(struct ext4_xattr + + while (!IS_LAST_ENTRY(entry)) { + if (entry->e_value_size != 0 && ++ entry->e_value_inum == 0 && + (value_start + le16_to_cpu(entry->e_value_offs) < + (void *)e + sizeof(__u32) || + value_start + le16_to_cpu(entry->e_value_offs) + +@@ -233,19 +234,26 @@ ext4_xattr_check_block(struct inode *ino + } + + static inline int +-ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size) ++ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size, ++ struct inode *inode) + { + size_t value_size = le32_to_cpu(entry->e_value_size); + +- if (entry->e_value_block != 0 || value_size > size || ++ if (!entry->e_value_inum && + le16_to_cpu(entry->e_value_offs) + value_size > size) ++ return -EIO; ++ if (entry->e_value_inum && ++ (le32_to_cpu(entry->e_value_inum) < EXT4_FIRST_INO(inode->i_sb) || ++ le32_to_cpu(entry->e_value_inum) > ++ le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_inodes_count))) + return -EIO; + return 0; + } + + static int + ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, +- const char *name, size_t size, int sorted) ++ const char *name, size_t size, int sorted, ++ struct inode *inode) + { + struct ext4_xattr_entry *entry; + size_t name_len; +@@ -265,11 +273,104 @@ ext4_xattr_find_entry(struct ext4_xattr_ + break; + } + *pentry = entry; +- if (!cmp && ext4_xattr_check_entry(entry, size)) ++ if (!cmp && ext4_xattr_check_entry(entry, size, inode)) + return -EIO; + return cmp ? -ENODATA : 0; + } + ++/* ++ * Read the EA value from an inode. 
++ */ ++static int ++ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t *size) ++{ ++ unsigned long block = 0; ++ struct buffer_head *bh = NULL; ++ int err, blocksize; ++ size_t csize, ret_size = 0; ++ ++ if (*size == 0) ++ return 0; ++ ++ blocksize = ea_inode->i_sb->s_blocksize; ++ ++ while (ret_size < *size) { ++ csize = (*size - ret_size) > blocksize ? blocksize : ++ *size - ret_size; ++ bh = ext4_bread(NULL, ea_inode, block, 0, &err); ++ if (!bh) { ++ *size = ret_size; ++ return err; ++ } ++ memcpy(buf, bh->b_data, csize); ++ brelse(bh); ++ ++ buf += csize; ++ block += 1; ++ ret_size += csize; ++ } ++ ++ *size = ret_size; ++ ++ return err; ++} ++ ++struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, int *err) ++{ ++ struct inode *ea_inode = NULL; ++ ++ ea_inode = ext4_iget(parent->i_sb, ea_ino); ++ if (IS_ERR(ea_inode) || is_bad_inode(ea_inode)) { ++ int rc = IS_ERR(ea_inode) ? PTR_ERR(ea_inode) : 0; ++ ext4_error(parent->i_sb, "error while reading EA inode %lu " ++ "/ %d %d", ea_ino, rc, is_bad_inode(ea_inode)); ++ *err = rc != 0 ? rc : -EIO; ++ return NULL; ++ } ++ ++ if (ea_inode->i_xattr_inode_parent != parent->i_ino || ++ ea_inode->i_generation != parent->i_generation) { ++ ext4_error(parent->i_sb, "Backpointer from EA inode %lu " ++ "to parent invalid.", ea_ino); ++ *err = -EINVAL; ++ goto error; ++ } ++ ++ if (!(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL)) { ++ ext4_error(parent->i_sb, "EA inode %lu does not have " ++ "EXT4_EA_INODE_FL flag set.\n", ea_ino); ++ *err = -EINVAL; ++ goto error; ++ } ++ ++ *err = 0; ++ return ea_inode; ++ ++error: ++ iput(ea_inode); ++ return NULL; ++} ++ ++/* ++ * Read the value from the EA inode. ++ */ ++static int ++ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer, ++ size_t *size) ++{ ++ struct inode *ea_inode = NULL; ++ int err; ++ ++ ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err); ++ if (err) ++ return err; ++ ++ err = ext4_xattr_inode_read(ea_inode, buffer, size); ++ iput(ea_inode); ++ ++ return err; ++} ++ + static int + ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +@@ -301,7 +401,8 @@ bad_block: + } + ext4_xattr_cache_insert(bh); + entry = BFIRST(bh); +- error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); ++ error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1, ++ inode); + if (error == -EIO) + goto bad_block; + if (error) +@@ -311,8 +412,16 @@ bad_block: + error = -ERANGE; + if (size > buffer_size) + goto cleanup; +- memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), +- size); ++ if (entry->e_value_inum) { ++ error = ext4_xattr_inode_get(inode, ++ le32_to_cpu(entry->e_value_inum), ++ buffer, &size); ++ if (error) ++ goto cleanup; ++ } else { ++ memcpy(buffer, bh->b_data + ++ le16_to_cpu(entry->e_value_offs), size); ++ } + } + error = size; + +@@ -346,7 +455,7 @@ ext4_xattr_ibody_get(struct inode *inode + if (error) + goto cleanup; + error = ext4_xattr_find_entry(&entry, name_index, name, +- end - (void *)entry, 0); ++ end - (void *)entry, 0, inode); + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); +@@ -354,8 +463,16 @@ ext4_xattr_ibody_get(struct inode *inode + error = -ERANGE; + if (size > buffer_size) + goto cleanup; +- memcpy(buffer, (void *)IFIRST(header) + +- le16_to_cpu(entry->e_value_offs), size); ++ if (entry->e_value_inum) { ++ error = ext4_xattr_inode_get(inode, ++ le32_to_cpu(entry->e_value_inum), ++ buffer, &size); 
++ if (error) ++ goto cleanup; ++ } else { ++ memcpy(buffer, (void *)IFIRST(header) + ++ le16_to_cpu(entry->e_value_offs), size); ++ } + } + error = size; + +@@ -600,7 +717,7 @@ static size_t ext4_xattr_free_space(stru + size_t *min_offs, void *base, int *total) + { + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { +- if (!last->e_value_block && last->e_value_size) { ++ if (!last->e_value_inum && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < *min_offs) + *min_offs = offs; +@@ -611,16 +728,174 @@ static size_t ext4_xattr_free_space(stru + return (*min_offs - ((void *)last - base) - sizeof(__u32)); + } + ++/* ++ * Write the value of the EA in an inode. ++ */ + static int +-ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) ++ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode, ++ const void *buf, int bufsize) ++{ ++ struct buffer_head *bh = NULL; ++ unsigned long block = 0; ++ unsigned blocksize = ea_inode->i_sb->s_blocksize; ++ unsigned max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits; ++ int csize, wsize = 0; ++ int ret = 0; ++ int retries = 0; ++ ++retry: ++ while (ret >= 0 && ret < max_blocks) { ++ struct ext4_map_blocks map; ++ map.m_lblk = block += ret; ++ map.m_len = max_blocks -= ret; ++ ++ ret = ext4_map_blocks(handle, ea_inode, &map, ++ EXT4_GET_BLOCKS_CREATE); ++ if (ret <= 0) { ++ ext4_mark_inode_dirty(handle, ea_inode); ++ if (ret == -ENOSPC && ++ ext4_should_retry_alloc(ea_inode->i_sb, &retries)) { ++ ret = 0; ++ goto retry; ++ } ++ break; ++ } ++ } ++ ++ if (ret < 0) ++ return ret; ++ ++ block = 0; ++ while (wsize < bufsize) { ++ if (bh != NULL) ++ brelse(bh); ++ csize = (bufsize - wsize) > blocksize ? blocksize : ++ bufsize - wsize; ++ bh = ext4_getblk(handle, ea_inode, block, 0, &ret); ++ if (!bh) ++ goto out; ++ ret = ext4_journal_get_write_access(handle, bh); ++ if (ret) ++ goto out; ++ ++ memcpy(bh->b_data, buf, csize); ++ set_buffer_uptodate(bh); ++ ext4_handle_dirty_metadata(handle, ea_inode, bh); ++ ++ buf += csize; ++ wsize += csize; ++ block += 1; ++ } ++ ++ mutex_lock(&ea_inode->i_mutex); ++ i_size_write(ea_inode, wsize); ++ ext4_update_i_disksize(ea_inode, wsize); ++ mutex_unlock(&ea_inode->i_mutex); ++ ++ ext4_mark_inode_dirty(handle, ea_inode); ++ ++out: ++ brelse(bh); ++ ++ return ret; ++} ++ ++/* ++ * Create an inode to store the value of a large EA. ++ */ ++static struct inode * ++ext4_xattr_inode_create(handle_t *handle, struct inode *inode) ++{ ++ struct inode *ea_inode = NULL; ++ ++ /* ++ * Let the next inode be the goal, so we try and allocate the EA inode ++ * in the same group, or nearby one. ++ */ ++ ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, ++ S_IFREG|0600, NULL, inode->i_ino + 1, NULL); ++ ++ if (!IS_ERR(ea_inode)) { ++ ea_inode->i_op = &ext4_file_inode_operations; ++ ea_inode->i_fop = &ext4_file_operations; ++ ext4_set_aops(ea_inode); ++ ea_inode->i_generation = inode->i_generation; ++ EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL; ++ ++ /* ++ * A back-pointer from EA inode to parent inode will be useful ++ * for e2fsck. ++ */ ++ ea_inode->i_xattr_inode_parent = inode->i_ino; ++ unlock_new_inode(ea_inode); ++ } ++ ++ return ea_inode; ++} ++ ++/* ++ * Unlink the inode storing the value of the EA. 
++ */ ++int ++ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino) ++{ ++ struct inode *ea_inode = NULL; ++ int err; ++ ++ ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err); ++ if (err) ++ return err; ++ ++ clear_nlink(ea_inode); ++ iput(ea_inode); ++ ++ return 0; ++} ++ ++/* ++ * Add value of the EA in an inode. ++ */ ++static int ++ext4_xattr_inode_set(handle_t *handle, struct inode *inode, unsigned long *ea_ino, ++ const void *value, size_t value_len) ++{ ++ struct inode *ea_inode = NULL; ++ int err; ++ ++ /* Create an inode for the EA value */ ++ ea_inode = ext4_xattr_inode_create(handle, inode); ++ if (IS_ERR(ea_inode)) ++ return -1; ++ ++ err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); ++ if (err) ++ clear_nlink(ea_inode); ++ else ++ *ea_ino = ea_inode->i_ino; ++ ++ iput(ea_inode); ++ ++ return err; ++} ++ ++static int ++ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s, ++ handle_t *handle, struct inode *inode) + { + struct ext4_xattr_entry *last; + size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); ++ int in_inode = i->in_inode; ++ ++ if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, ++ EXT4_FEATURE_INCOMPAT_EA_INODE) && ++ (EXT4_XATTR_SIZE(i->value_len) > ++ EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize))) ++ in_inode = 1; + + /* Compute min_offs and last. */ + last = s->first; + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { +- if (!last->e_value_block && last->e_value_size) { ++ if (!last->e_value_inum && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < min_offs) + min_offs = offs; +@@ -628,15 +903,21 @@ ext4_xattr_set_entry(struct ext4_xattr_i + } + free = min_offs - ((void *)last - s->base) - sizeof(__u32); + if (!s->not_found) { +- if (!s->here->e_value_block && s->here->e_value_size) { ++ if (!in_inode && ++ !s->here->e_value_inum && s->here->e_value_size) { + size_t size = le32_to_cpu(s->here->e_value_size); + free += EXT4_XATTR_SIZE(size); + } + free += EXT4_XATTR_LEN(name_len); + } + if (i->value) { +- if (free < EXT4_XATTR_LEN(name_len) + +- EXT4_XATTR_SIZE(i->value_len)) ++ size_t value_len = EXT4_XATTR_SIZE(i->value_len); ++ ++ if (in_inode) ++ value_len = 0; ++ ++ if (free < value_len || ++ free < EXT4_XATTR_LEN(name_len) + value_len) + return -ENOSPC; + } + +@@ -651,7 +931,8 @@ ext4_xattr_set_entry(struct ext4_xattr_i + s->here->e_name_len = name_len; + memcpy(s->here->e_name, i->name, name_len); + } else { +- if (!s->here->e_value_block && s->here->e_value_size) { ++ if (!s->here->e_value_inum && s->here->e_value_size && ++ s->here->e_value_offs > 0) { + void *first_val = s->base + min_offs; + size_t offs = le16_to_cpu(s->here->e_value_offs); + void *val = s->base + offs; +@@ -685,13 +966,18 @@ ext4_xattr_set_entry(struct ext4_xattr_i + last = s->first; + while (!IS_LAST_ENTRY(last)) { + size_t o = le16_to_cpu(last->e_value_offs); +- if (!last->e_value_block && ++ if (!last->e_value_inum && + last->e_value_size && o < offs) + last->e_value_offs = + cpu_to_le16(o + size); + last = EXT4_XATTR_NEXT(last); + } + } ++ if (s->here->e_value_inum) { ++ ext4_xattr_inode_unlink(inode, ++ le32_to_cpu(s->here->e_value_inum)); ++ s->here->e_value_inum = 0; ++ } + if (!i->value) { + /* Remove the old name. */ + size_t size = EXT4_XATTR_LEN(name_len); +@@ -705,10 +990,17 @@ ext4_xattr_set_entry(struct ext4_xattr_i + if (i->value) { + /* Insert the new value. 
*/ + s->here->e_value_size = cpu_to_le32(i->value_len); +- if (i->value_len) { ++ if (in_inode) { ++ unsigned long ea_ino = le32_to_cpu(s->here->e_value_inum); ++ ext4_xattr_inode_set(handle, inode, &ea_ino, i->value, ++ i->value_len); ++ s->here->e_value_inum = cpu_to_le32(ea_ino); ++ s->here->e_value_offs = 0; ++ } else if (i->value_len) { + size_t size = EXT4_XATTR_SIZE(i->value_len); + void *val = s->base + min_offs - size; + s->here->e_value_offs = cpu_to_le16(min_offs - size); ++ s->here->e_value_inum = 0; + if (i->value == EXT4_ZERO_XATTR_VALUE) { + memset(val, 0, size); + } else { +@@ -758,7 +1050,7 @@ ext4_xattr_block_find(struct inode *inod + bs->s.end = bs->bh->b_data + bs->bh->b_size; + bs->s.here = bs->s.first; + error = ext4_xattr_find_entry(&bs->s.here, i->name_index, +- i->name, bs->bh->b_size, 1); ++ i->name, bs->bh->b_size, 1, inode); + if (error && error != -ENODATA) + goto cleanup; + bs->s.not_found = error; +@@ -782,8 +1074,6 @@ ext4_xattr_block_set(handle_t *handle, s + + #define header(x) ((struct ext4_xattr_header *)(x)) + +- if (i->value && i->value_len > sb->s_blocksize) +- return -ENOSPC; + if (s->base) { + ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, + bs->bh->b_blocknr); +@@ -799,7 +1089,7 @@ ext4_xattr_block_set(handle_t *handle, s + ce = NULL; + } + ea_bdebug(bs->bh, "modifying in-place"); +- error = ext4_xattr_set_entry(i, s); ++ error = ext4_xattr_set_entry(i, s, handle, inode); + if (!error) { + if (!IS_LAST_ENTRY(s->first)) + ext4_xattr_rehash(header(s->base), +@@ -850,7 +1140,7 @@ ext4_xattr_block_set(handle_t *handle, s + s->end = s->base + sb->s_blocksize; + } + +- error = ext4_xattr_set_entry(i, s); ++ error = ext4_xattr_set_entry(i, s, handle, inode); + if (error == -EIO) + goto bad_block; + if (error) +@@ -1000,7 +1290,7 @@ int ext4_xattr_ibody_find(struct inode * + /* Find the named attribute. 
*/ + error = ext4_xattr_find_entry(&is->s.here, i->name_index, + i->name, is->s.end - +- (void *)is->s.base, 0); ++ (void *)is->s.base, 0, inode); + if (error && error != -ENODATA) + return error; + is->s.not_found = error; +@@ -1018,7 +1308,7 @@ int ext4_xattr_ibody_inline_set(handle_t + + if (EXT4_I(inode)->i_extra_isize == 0) + return -ENOSPC; +- error = ext4_xattr_set_entry(i, s); ++ error = ext4_xattr_set_entry(i, s, handle, inode); + if (error) { + if (error == -ENOSPC && + ext4_has_inline_data(inode)) { +@@ -1030,7 +1320,7 @@ int ext4_xattr_ibody_inline_set(handle_t + error = ext4_xattr_ibody_find(inode, i, is); + if (error) + return error; +- error = ext4_xattr_set_entry(i, s); ++ error = ext4_xattr_set_entry(i, s, handle, inode); + } + if (error) + return error; +@@ -1056,7 +1346,7 @@ static int ext4_xattr_ibody_set(handle_t + + if (EXT4_I(inode)->i_extra_isize == 0) + return -ENOSPC; +- error = ext4_xattr_set_entry(i, s); ++ error = ext4_xattr_set_entry(i, s, handle, inode); + if (error) + return error; + header = IHDR(inode, ext4_raw_inode(&is->iloc)); +@@ -1092,7 +1382,7 @@ ext4_xattr_set_handle(handle_t *handle, + .name = name, + .value = value, + .value_len = value_len, +- ++ .in_inode = 0, + }; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, +@@ -1157,6 +1447,15 @@ ext4_xattr_set_handle(handle_t *handle, + goto cleanup; + } + error = ext4_xattr_block_set(handle, inode, &i, &bs); ++ if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, ++ EXT4_FEATURE_INCOMPAT_EA_INODE) && ++ error == -ENOSPC) { ++ /* xattr not fit to block, store at external ++ * inode */ ++ i.in_inode = 1; ++ error = ext4_xattr_ibody_set(handle, inode, ++ &i, &is); ++ } + if (error) + goto cleanup; + if (!is.s.not_found) { +@@ -1203,9 +1502,22 @@ ext4_xattr_set(struct inode *inode, int + const void *value, size_t value_len, int flags) + { + handle_t *handle; ++ struct super_block *sb = inode->i_sb; + int error, retries = 0; + int credits = ext4_jbd2_credits_xattr(inode); + ++ if ((value_len >= EXT4_XATTR_MIN_LARGE_EA_SIZE(sb->s_blocksize)) && ++ EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EA_INODE)) { ++ int nrblocks = (value_len + sb->s_blocksize - 1) >> ++ sb->s_blocksize_bits; ++ ++ /* For new inode */ ++ credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3; ++ ++ /* For data blocks of EA inode */ ++ credits += ext4_meta_trans_blocks(inode, nrblocks, 0); ++ } ++ + retry: + handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); + if (IS_ERR(handle)) { +@@ -1217,7 +1529,7 @@ retry: + value, value_len, flags); + error2 = ext4_journal_stop(handle); + if (error == -ENOSPC && +- ext4_should_retry_alloc(inode->i_sb, &retries)) ++ ext4_should_retry_alloc(sb, &retries)) + goto retry; + if (error == 0) + error = error2; +@@ -1239,7 +1551,7 @@ static void ext4_xattr_shift_entries(str + + /* Adjust the value offsets of the entries */ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { +- if (!last->e_value_block && last->e_value_size) { ++ if (!last->e_value_inum && last->e_value_size) { + new_offs = le16_to_cpu(last->e_value_offs) + + value_offs_shift; + BUG_ON(new_offs + le32_to_cpu(last->e_value_size) +@@ -1477,21 +1789,135 @@ cleanup: + } + + ++#define EIA_INCR 16 /* must be 2^n */ ++#define EIA_MASK (EIA_INCR - 1) ++/* Add the large xattr @ino into @lea_ino_array for later deletion. ++ * If @lea_ino_array is new or full it will be grown and the old ++ * contents copied over. 
++ */ ++static int ++ext4_expand_ino_array(struct ext4_xattr_ino_array **lea_ino_array, __u32 ino) ++{ ++ if (*lea_ino_array == NULL) { ++ /* ++ * Start with 15 inodes, so it fits into a power-of-two size. ++ * If *lea_ino_array is NULL, this is essentially offsetof() ++ */ ++ (*lea_ino_array) = ++ kmalloc(offsetof(struct ext4_xattr_ino_array, ++ xia_inodes[EIA_MASK]), ++ GFP_NOFS); ++ if (*lea_ino_array == NULL) ++ return -ENOMEM; ++ (*lea_ino_array)->xia_count = 0; ++ } else if (((*lea_ino_array)->xia_count & EIA_MASK) == EIA_MASK) { ++ /* expand the array once all 15 + n * 16 slots are full */ ++ struct ext4_xattr_ino_array *new_array = NULL; ++ int count = (*lea_ino_array)->xia_count; ++ ++ /* if new_array is NULL, this is essentially offsetof() */ ++ new_array = kmalloc( ++ offsetof(struct ext4_xattr_ino_array, ++ xia_inodes[count + EIA_INCR]), ++ GFP_NOFS); ++ if (new_array == NULL) ++ return -ENOMEM; ++ memcpy(new_array, *lea_ino_array, ++ offsetof(struct ext4_xattr_ino_array, ++ xia_inodes[count])); ++ kfree(*lea_ino_array); ++ *lea_ino_array = new_array; ++ } ++ (*lea_ino_array)->xia_inodes[(*lea_ino_array)->xia_count++] = ino; ++ return 0; ++} ++ ++/** ++ * Add xattr inode to orphan list ++ */ ++static int ++ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, ++ int credits, struct ext4_xattr_ino_array *lea_ino_array) ++{ ++ struct inode *ea_inode = NULL; ++ int idx = 0, error = 0; ++ ++ if (lea_ino_array == NULL) ++ return 0; ++ ++ for (; idx < lea_ino_array->xia_count; ++idx) { ++ if (!ext4_handle_has_enough_credits(handle, credits)) { ++ error = ext4_journal_extend(handle, credits); ++ if (error > 0) ++ error = ext4_journal_restart(handle, credits); ++ ++ if (error != 0) { ++ ext4_warning(inode->i_sb, ++ "couldn't extend journal " ++ "(err %d)", error); ++ return error; ++ } ++ } ++ ea_inode = ext4_xattr_inode_iget(inode, ++ lea_ino_array->xia_inodes[idx], &error); ++ if (error) ++ continue; ++ ext4_orphan_add(handle, ea_inode); ++ /* the inode's i_count will be released by caller */ ++ } ++ ++ return 0; ++} + + /* + * ext4_xattr_delete_inode() + * +- * Free extended attribute resources associated with this inode. This ++ * Free extended attribute resources associated with this inode. Traverse ++ * all entries and unlink any xattr inodes associated with this inode. This + * is called immediately before an inode is freed. We have exclusive +- * access to the inode. ++ * access to the inode. If an orphan inode is deleted it will also delete any ++ * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget() ++ * to ensure they belong to the parent inode and were not deleted already. 
+ */ +-void +-ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) ++int ++ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, ++ struct ext4_xattr_ino_array **lea_ino_array) + { + struct buffer_head *bh = NULL; ++ struct ext4_xattr_ibody_header *header; ++ struct ext4_inode *raw_inode; ++ struct ext4_iloc iloc; ++ struct ext4_xattr_entry *entry; ++ int credits = 3, error = 0; + +- if (!EXT4_I(inode)->i_file_acl) ++ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) ++ goto delete_external_ea; ++ ++ error = ext4_get_inode_loc(inode, &iloc); ++ if (error) + goto cleanup; ++ raw_inode = ext4_raw_inode(&iloc); ++ header = IHDR(inode, raw_inode); ++ for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); ++ entry = EXT4_XATTR_NEXT(entry)) { ++ if (!entry->e_value_inum) ++ continue; ++ if (ext4_expand_ino_array(lea_ino_array, ++ entry->e_value_inum) != 0) { ++ brelse(iloc.bh); ++ goto cleanup; ++ } ++ entry->e_value_inum = 0; ++ } ++ brelse(iloc.bh); ++ ++delete_external_ea: ++ if (!EXT4_I(inode)->i_file_acl) { ++ /* add xattr inode to orphan list */ ++ ext4_xattr_inode_orphan_add(handle, inode, credits, ++ *lea_ino_array); ++ goto cleanup; ++ } + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) { + EXT4_ERROR_INODE(inode, "block %llu read error", +@@ -1504,11 +1930,69 @@ ext4_xattr_delete_inode(handle_t *handle + EXT4_I(inode)->i_file_acl); + goto cleanup; + } ++ ++ for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT4_XATTR_NEXT(entry)) { ++ if (!entry->e_value_inum) ++ continue; ++ if (ext4_expand_ino_array(lea_ino_array, ++ entry->e_value_inum) != 0) ++ goto cleanup; ++ entry->e_value_inum = 0; ++ } ++ ++ /* add xattr inode to orphan list */ ++ error = ext4_xattr_inode_orphan_add(handle, inode, credits, ++ *lea_ino_array); ++ if (error != 0) ++ goto cleanup; ++ ++ if (!IS_NOQUOTA(inode)) ++ credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb); ++ ++ if (!ext4_handle_has_enough_credits(handle, credits)) { ++ error = ext4_journal_extend(handle, credits); ++ if (error > 0) ++ error = ext4_journal_restart(handle, credits); ++ if (error != 0) { ++ ext4_warning(inode->i_sb, ++ "couldn't extend journal (err %d)", error); ++ goto cleanup; ++ } ++ } ++ + ext4_xattr_release_block(handle, inode, bh); + EXT4_I(inode)->i_file_acl = 0; + + cleanup: + brelse(bh); ++ ++ return error; ++} ++ ++void ++ext4_xattr_inode_array_free(struct inode *inode, ++ struct ext4_xattr_ino_array *lea_ino_array) ++{ ++ struct inode *ea_inode = NULL; ++ int idx = 0; ++ int err; ++ ++ if (lea_ino_array == NULL) ++ return; ++ ++ for (; idx < lea_ino_array->xia_count; ++idx) { ++ ea_inode = ext4_xattr_inode_iget(inode, ++ lea_ino_array->xia_inodes[idx], &err); ++ if (err) ++ continue; ++ /* for inode's i_count get from ext4_xattr_delete_inode */ ++ if (!list_empty(&EXT4_I(ea_inode)->i_orphan)) ++ iput(ea_inode); ++ clear_nlink(ea_inode); ++ iput(ea_inode); ++ } ++ kfree(lea_ino_array); + } + + /* +@@ -1578,10 +2062,9 @@ ext4_xattr_cmp(struct ext4_xattr_header + entry1->e_name_index != entry2->e_name_index || + entry1->e_name_len != entry2->e_name_len || + entry1->e_value_size != entry2->e_value_size || ++ entry1->e_value_inum != entry2->e_value_inum || + memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) + return 1; +- if (entry1->e_value_block != 0 || entry2->e_value_block != 0) +- return -EIO; + if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), + (char *)header2 + le16_to_cpu(entry2->e_value_offs), + le32_to_cpu(entry1->e_value_size))) +@@ -1665,7 +2148,7 @@ 
static inline void ext4_xattr_hash_entry + *name++; + } + +- if (entry->e_value_block == 0 && entry->e_value_size != 0) { ++ if (!entry->e_value_inum && entry->e_value_size) { + __le32 *value = (__le32 *)((char *)header + + le16_to_cpu(entry->e_value_offs)); + for (n = (le32_to_cpu(entry->e_value_size) + +Index: linux-stage/fs/ext4/xattr.h +=================================================================== +--- linux-stage.orig/fs/ext4/xattr.h ++++ linux-stage/fs/ext4/xattr.h +@@ -42,7 +42,7 @@ struct ext4_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_offs; /* offset in disk block of value */ +- __le32 e_value_block; /* disk block attribute is stored on (n/i) */ ++ __le32 e_value_inum; /* inode in which the value is stored */ + __le32 e_value_size; /* size of attribute value */ + __le32 e_hash; /* hash value of name and value */ + char e_name[0]; /* attribute name */ +@@ -67,6 +67,15 @@ struct ext4_xattr_entry { + EXT4_I(inode)->i_extra_isize)) + #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) + ++#define i_xattr_inode_parent i_mtime.tv_sec ++ ++/* ++ * The minimum size of EA value when you start storing it in an external inode ++ * size of block - size of header - size of 1 entry - 4 null bytes ++*/ ++#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b) \ ++ ((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4) ++ + #define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) + #define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) + #define BFIRST(bh) ENTRY(BHDR(bh)+1) +@@ -75,10 +84,11 @@ struct ext4_xattr_entry { + #define EXT4_ZERO_XATTR_VALUE ((void *)-1) + + struct ext4_xattr_info { +- int name_index; + const char *name; + const void *value; + size_t value_len; ++ int name_index; ++ int in_inode; + }; + + struct ext4_xattr_search { +@@ -106,7 +116,13 @@ extern int ext4_xattr_get(struct inode * + extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); + extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); + +-extern void ext4_xattr_delete_inode(handle_t *, struct inode *); ++extern struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, ++ int *err); ++extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino); ++extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, ++ struct ext4_xattr_ino_array **array); ++extern void ext4_xattr_inode_array_free(struct inode *inode, ++ struct ext4_xattr_ino_array *array); + extern void ext4_xattr_put_super(struct super_block *); + + extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, +Index: linux-stage/fs/ext4/ialloc.c +=================================================================== +--- linux-stage.orig/fs/ext4/ialloc.c ++++ linux-stage/fs/ext4/ialloc.c +@@ -269,7 +269,6 @@ void ext4_free_inode(handle_t *handle, s + * as writing the quota to disk may need the lock as well. + */ + dquot_initialize(inode); +- ext4_xattr_delete_inode(handle, inode); + dquot_free_inode(inode); + dquot_drop(inode); + +Index: linux-stage/fs/ext4/inline.c +=================================================================== +--- linux-stage.orig/fs/ext4/inline.c ++++ linux-stage/fs/ext4/inline.c +@@ -59,7 +59,7 @@ static int get_max_inline_xattr_value_si + + /* Compute min_offs. 
*/
+ for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
+- if (!entry->e_value_block && entry->e_value_size) {
++ if (!entry->e_value_inum && entry->e_value_size) {
+ size_t offs = le16_to_cpu(entry->e_value_offs);
+ if (offs < min_offs)
+ min_offs = offs;
diff --git a/ldiskfs/kernel_patches/patches/rhel7.2/ext4-pdirop.patch b/ldiskfs/kernel_patches/patches/rhel7.2/ext4-pdirop.patch
new file mode 100644
index 0000000..21991ab
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/rhel7.2/ext4-pdirop.patch
@@ -0,0 +1,1932 @@
+Single directory performance is critical for HPC workloads. In a
+typical use case an application creates a separate output file for
+each node and task in a job. As nodes and tasks increase, hundreds
+of thousands of files may be created in a single directory within
+a short window of time.
+Today, both filename lookup and filesystem-modifying operations
+(such as create and unlink) are protected by a single lock for
+an entire ldiskfs directory. The PDO project removes this
+bottleneck by introducing a parallel locking mechanism for entire
+ldiskfs directories. This work enables multiple application
+threads to look up, create and unlink in parallel.
+
+This patch contains:
+ - pdirops support for ldiskfs
+ - integration with osd-ldiskfs
+
+Index: linux-3.10.0-229.1.2.fc21.x86_64/include/linux/htree_lock.h
+===================================================================
+--- /dev/null
++++ linux-3.10.0-229.1.2.fc21.x86_64/include/linux/htree_lock.h
+@@ -0,0 +1,187 @@
++/*
++ * include/linux/htree_lock.h
++ *
++ * Copyright (c) 2011, 2012, Intel Corporation.
++ *
++ * Author: Liang Zhen
++ */
++
++/*
++ * htree lock
++ *
++ * htree_lock is an advanced lock; it supports five lock modes (a concept
++ * taken from DLM) and it is a sleeping lock.
++ *
++ * The most common use case is:
++ * - create a htree_lock_head for the data
++ * - each thread (contender) creates its own htree_lock
++ * - a contender calls htree_lock(lock_node, mode) to protect the data and
++ * calls htree_unlock to release the lock
++ *
++ * There is also a more complex, advanced use case: a user can hold a
++ * PW/PR lock on a particular key; this is mostly used while holding a
++ * shared lock on the htree (CW, CR)
++ *
++ * htree_lock(lock_node, HTREE_LOCK_CR); lock the htree with CR
++ * htree_node_lock(lock_node, HTREE_LOCK_PR, key...); lock @key with PR
++ * ...
++ * htree_node_unlock(lock_node); unlock the key
++ *
++ * Another tip: we can have N levels of this kind of key; all we need to
++ * do is specify N levels when creating the htree_lock_head, then we can
++ * lock/unlock a specific level by:
++ * htree_node_lock(lock_node, mode1, key1, level1...);
++ * do something;
++ * htree_node_lock(lock_node, mode1, key2, level2...);
++ * do something;
++ * htree_node_unlock(lock_node, level2);
++ * htree_node_unlock(lock_node, level1);
++ *
++ * NB: for multi-level locking, be careful about lock ordering to avoid deadlock
++ */
++
++#ifndef _LINUX_HTREE_LOCK_H
++#define _LINUX_HTREE_LOCK_H
++
++#include
++#include
++#include
++
++/*
++ * Lock Modes
++ * more details can be found here:
++ * http://en.wikipedia.org/wiki/Distributed_lock_manager
++ */
++typedef enum {
++ HTREE_LOCK_EX = 0, /* exclusive lock: incompatible with all others */
++ HTREE_LOCK_PW, /* protected write: allows only CR users */
++ HTREE_LOCK_PR, /* protected read: allow PR, CR users */
++ HTREE_LOCK_CW, /* concurrent write: allow CR, CW users */
++ HTREE_LOCK_CR, /* concurrent read: allow all but EX users */
++ HTREE_LOCK_MAX, /* number of lock modes */
++} htree_lock_mode_t;
++
++#define HTREE_LOCK_NL HTREE_LOCK_MAX
++#define HTREE_LOCK_INVAL 0xdead10c
++
++enum {
++ HTREE_HBITS_MIN = 2,
++ HTREE_HBITS_DEF = 14,
++ HTREE_HBITS_MAX = 32,
++};
++
++enum {
++ HTREE_EVENT_DISABLE = (0),
++ HTREE_EVENT_RD = (1 << HTREE_LOCK_PR),
++ HTREE_EVENT_WR = (1 << HTREE_LOCK_PW),
++ HTREE_EVENT_RDWR = (HTREE_EVENT_RD | HTREE_EVENT_WR),
++};
++
++struct htree_lock;
++
++typedef void (*htree_event_cb_t)(void *target, void *event);
++
++struct htree_lock_child {
++ struct list_head lc_list; /* granted list */
++ htree_event_cb_t lc_callback; /* event callback */
++ unsigned lc_events; /* event types */
++};
++
++struct htree_lock_head {
++ unsigned long lh_lock; /* bits lock */
++ /* blocked lock list (htree_lock) */
++ struct list_head lh_blocked_list;
++ /* # key levels */
++ u16 lh_depth;
++ /* hash bits for key and limit number of locks */
++ u16 lh_hbits;
++ /* counters for blocked locks */
++ u16 lh_nblocked[HTREE_LOCK_MAX];
++ /* counters for granted locks */
++ u16 lh_ngranted[HTREE_LOCK_MAX];
++ /* private data */
++ void *lh_private;
++ /* array of children locks */
++ struct htree_lock_child lh_children[0];
++};
++
++/* htree_lock_node_t is child-lock for a specific key (ln_value) */
++struct htree_lock_node {
++ htree_lock_mode_t ln_mode;
++ /* major hash key */
++ u16 ln_major_key;
++ /* minor hash key */
++ u16 ln_minor_key;
++ struct list_head ln_major_list;
++ struct list_head ln_minor_list;
++ /* alive list, all locks (granted, blocked, listening) are on it */
++ struct list_head ln_alive_list;
++ /* blocked list */
++ struct list_head ln_blocked_list;
++ /* granted list */
++ struct list_head ln_granted_list;
++ void *ln_ev_target;
++};
++
++struct htree_lock {
++ struct task_struct *lk_task;
++ struct htree_lock_head *lk_head;
++ void *lk_private;
++ unsigned lk_depth;
++ htree_lock_mode_t lk_mode;
++ struct list_head lk_blocked_list;
++ struct htree_lock_node lk_nodes[0];
++};
++
++/* create a lock head, which stands for a resource */
++struct htree_lock_head *htree_lock_head_alloc(unsigned depth,
++ unsigned hbits, unsigned priv);
++/* free a lock head */
++void htree_lock_head_free(struct htree_lock_head *lhead);
++/* register event callback for child lock at level @depth */
++void htree_lock_event_attach(struct htree_lock_head *lhead, unsigned depth,
++ unsigned events,
htree_event_cb_t callback); ++/* create a lock handle, which stands for a thread */ ++struct htree_lock *htree_lock_alloc(unsigned depth, unsigned pbytes); ++/* free a lock handle */ ++void htree_lock_free(struct htree_lock *lck); ++/* lock htree, when @wait is true, 0 is returned if the lock can't ++ * be granted immediately */ ++int htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead, ++ htree_lock_mode_t mode, int wait); ++/* unlock htree */ ++void htree_unlock(struct htree_lock *lck); ++/* unlock and relock htree with @new_mode */ ++int htree_change_lock_try(struct htree_lock *lck, ++ htree_lock_mode_t new_mode, int wait); ++void htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode); ++/* require child lock (key) of htree at level @dep, @event will be sent to all ++ * listeners on this @key while lock being granted */ ++int htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, ++ u32 key, unsigned dep, int wait, void *event); ++/* release child lock at level @dep, this lock will listen on it's key ++ * if @event isn't NULL, event_cb will be called against @lck while granting ++ * any other lock at level @dep with the same key */ ++void htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event); ++/* stop listening on child lock at level @dep */ ++void htree_node_stop_listen(struct htree_lock *lck, unsigned dep); ++/* for debug */ ++void htree_lock_stat_print(int depth); ++void htree_lock_stat_reset(void); ++ ++#define htree_lock(lck, lh, mode) htree_lock_try(lck, lh, mode, 1) ++#define htree_change_lock(lck, mode) htree_change_lock_try(lck, mode, 1) ++ ++#define htree_lock_mode(lck) ((lck)->lk_mode) ++ ++#define htree_node_lock(lck, mode, key, dep) \ ++ htree_node_lock_try(lck, mode, key, dep, 1, NULL) ++/* this is only safe in thread context of lock owner */ ++#define htree_node_is_granted(lck, dep) \ ++ ((lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_INVAL && \ ++ (lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_NL) ++/* this is only safe in thread context of lock owner */ ++#define htree_node_is_listening(lck, dep) \ ++ ((lck)->lk_nodes[dep].ln_mode == HTREE_LOCK_NL) ++ ++#endif +Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/htree_lock.c +=================================================================== +--- /dev/null ++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/htree_lock.c +@@ -0,0 +1,880 @@ ++/* ++ * fs/ext4/htree_lock.c ++ * ++ * Copyright (c) 2011, 2012, Intel Corporation. ++ * ++ * Author: Liang Zhen ++ */ ++#include ++#include ++#include ++#include ++ ++enum { ++ HTREE_LOCK_BIT_EX = (1 << HTREE_LOCK_EX), ++ HTREE_LOCK_BIT_PW = (1 << HTREE_LOCK_PW), ++ HTREE_LOCK_BIT_PR = (1 << HTREE_LOCK_PR), ++ HTREE_LOCK_BIT_CW = (1 << HTREE_LOCK_CW), ++ HTREE_LOCK_BIT_CR = (1 << HTREE_LOCK_CR), ++}; ++ ++enum { ++ HTREE_LOCK_COMPAT_EX = 0, ++ HTREE_LOCK_COMPAT_PW = HTREE_LOCK_COMPAT_EX | HTREE_LOCK_BIT_CR, ++ HTREE_LOCK_COMPAT_PR = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_PR, ++ HTREE_LOCK_COMPAT_CW = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_CW, ++ HTREE_LOCK_COMPAT_CR = HTREE_LOCK_COMPAT_CW | HTREE_LOCK_BIT_PR | ++ HTREE_LOCK_BIT_PW, ++}; ++ ++static int htree_lock_compat[] = { ++ [HTREE_LOCK_EX] HTREE_LOCK_COMPAT_EX, ++ [HTREE_LOCK_PW] HTREE_LOCK_COMPAT_PW, ++ [HTREE_LOCK_PR] HTREE_LOCK_COMPAT_PR, ++ [HTREE_LOCK_CW] HTREE_LOCK_COMPAT_CW, ++ [HTREE_LOCK_CR] HTREE_LOCK_COMPAT_CR, ++}; ++ ++/* max allowed htree-lock depth. ++ * We only need depth=3 for ext4 although user can have higher value. 
*/ ++#define HTREE_LOCK_DEP_MAX 16 ++ ++#ifdef HTREE_LOCK_DEBUG ++ ++static char *hl_name[] = { ++ [HTREE_LOCK_EX] "EX", ++ [HTREE_LOCK_PW] "PW", ++ [HTREE_LOCK_PR] "PR", ++ [HTREE_LOCK_CW] "CW", ++ [HTREE_LOCK_CR] "CR", ++}; ++ ++/* lock stats */ ++struct htree_lock_node_stats { ++ unsigned long long blocked[HTREE_LOCK_MAX]; ++ unsigned long long granted[HTREE_LOCK_MAX]; ++ unsigned long long retried[HTREE_LOCK_MAX]; ++ unsigned long long events; ++}; ++ ++struct htree_lock_stats { ++ struct htree_lock_node_stats nodes[HTREE_LOCK_DEP_MAX]; ++ unsigned long long granted[HTREE_LOCK_MAX]; ++ unsigned long long blocked[HTREE_LOCK_MAX]; ++}; ++ ++static struct htree_lock_stats hl_stats; ++ ++void htree_lock_stat_reset(void) ++{ ++ memset(&hl_stats, 0, sizeof(hl_stats)); ++} ++ ++void htree_lock_stat_print(int depth) ++{ ++ int i; ++ int j; ++ ++ printk(KERN_DEBUG "HTREE LOCK STATS:\n"); ++ for (i = 0; i < HTREE_LOCK_MAX; i++) { ++ printk(KERN_DEBUG "[%s]: G [%10llu], B [%10llu]\n", ++ hl_name[i], hl_stats.granted[i], hl_stats.blocked[i]); ++ } ++ for (i = 0; i < depth; i++) { ++ printk(KERN_DEBUG "HTREE CHILD [%d] STATS:\n", i); ++ for (j = 0; j < HTREE_LOCK_MAX; j++) { ++ printk(KERN_DEBUG ++ "[%s]: G [%10llu], B [%10llu], R [%10llu]\n", ++ hl_name[j], hl_stats.nodes[i].granted[j], ++ hl_stats.nodes[i].blocked[j], ++ hl_stats.nodes[i].retried[j]); ++ } ++ } ++} ++ ++#define lk_grant_inc(m) do { hl_stats.granted[m]++; } while (0) ++#define lk_block_inc(m) do { hl_stats.blocked[m]++; } while (0) ++#define ln_grant_inc(d, m) do { hl_stats.nodes[d].granted[m]++; } while (0) ++#define ln_block_inc(d, m) do { hl_stats.nodes[d].blocked[m]++; } while (0) ++#define ln_retry_inc(d, m) do { hl_stats.nodes[d].retried[m]++; } while (0) ++#define ln_event_inc(d) do { hl_stats.nodes[d].events++; } while (0) ++ ++#else /* !DEBUG */ ++ ++void htree_lock_stat_reset(void) {} ++void htree_lock_stat_print(int depth) {} ++ ++#define lk_grant_inc(m) do {} while (0) ++#define lk_block_inc(m) do {} while (0) ++#define ln_grant_inc(d, m) do {} while (0) ++#define ln_block_inc(d, m) do {} while (0) ++#define ln_retry_inc(d, m) do {} while (0) ++#define ln_event_inc(d) do {} while (0) ++ ++#endif /* DEBUG */ ++ ++EXPORT_SYMBOL(htree_lock_stat_reset); ++EXPORT_SYMBOL(htree_lock_stat_print); ++ ++#define HTREE_DEP_ROOT (-1) ++ ++#define htree_spin_lock(lhead, dep) \ ++ bit_spin_lock((dep) + 1, &(lhead)->lh_lock) ++#define htree_spin_unlock(lhead, dep) \ ++ bit_spin_unlock((dep) + 1, &(lhead)->lh_lock) ++ ++#define htree_key_event_ignore(child, ln) \ ++ (!((child)->lc_events & (1 << (ln)->ln_mode))) ++ ++static int ++htree_key_list_empty(struct htree_lock_node *ln) ++{ ++ return list_empty(&ln->ln_major_list) && list_empty(&ln->ln_minor_list); ++} ++ ++static void ++htree_key_list_del_init(struct htree_lock_node *ln) ++{ ++ struct htree_lock_node *tmp = NULL; ++ ++ if (!list_empty(&ln->ln_minor_list)) { ++ tmp = list_entry(ln->ln_minor_list.next, ++ struct htree_lock_node, ln_minor_list); ++ list_del_init(&ln->ln_minor_list); ++ } ++ ++ if (list_empty(&ln->ln_major_list)) ++ return; ++ ++ if (tmp == NULL) { /* not on minor key list */ ++ list_del_init(&ln->ln_major_list); ++ } else { ++ BUG_ON(!list_empty(&tmp->ln_major_list)); ++ list_replace_init(&ln->ln_major_list, &tmp->ln_major_list); ++ } ++} ++ ++static void ++htree_key_list_replace_init(struct htree_lock_node *old, ++ struct htree_lock_node *new) ++{ ++ if (!list_empty(&old->ln_major_list)) ++ list_replace_init(&old->ln_major_list, &new->ln_major_list); ++ ++ if 
(!list_empty(&old->ln_minor_list)) ++ list_replace_init(&old->ln_minor_list, &new->ln_minor_list); ++} ++ ++static void ++htree_key_event_enqueue(struct htree_lock_child *child, ++ struct htree_lock_node *ln, int dep, void *event) ++{ ++ struct htree_lock_node *tmp; ++ ++ /* NB: ALWAYS called holding lhead::lh_lock(dep) */ ++ BUG_ON(ln->ln_mode == HTREE_LOCK_NL); ++ if (event == NULL || htree_key_event_ignore(child, ln)) ++ return; ++ ++ /* shouldn't be a very long list */ ++ list_for_each_entry(tmp, &ln->ln_alive_list, ln_alive_list) { ++ if (tmp->ln_mode == HTREE_LOCK_NL) { ++ ln_event_inc(dep); ++ if (child->lc_callback != NULL) ++ child->lc_callback(tmp->ln_ev_target, event); ++ } ++ } ++} ++ ++static int ++htree_node_lock_enqueue(struct htree_lock *newlk, struct htree_lock *curlk, ++ unsigned dep, int wait, void *event) ++{ ++ struct htree_lock_child *child = &newlk->lk_head->lh_children[dep]; ++ struct htree_lock_node *newln = &newlk->lk_nodes[dep]; ++ struct htree_lock_node *curln = &curlk->lk_nodes[dep]; ++ ++ /* NB: ALWAYS called holding lhead::lh_lock(dep) */ ++ /* NB: we only expect PR/PW lock mode at here, only these two modes are ++ * allowed for htree_node_lock(asserted in htree_node_lock_internal), ++ * NL is only used for listener, user can't directly require NL mode */ ++ if ((curln->ln_mode == HTREE_LOCK_NL) || ++ (curln->ln_mode != HTREE_LOCK_PW && ++ newln->ln_mode != HTREE_LOCK_PW)) { ++ /* no conflict, attach it on granted list of @curlk */ ++ if (curln->ln_mode != HTREE_LOCK_NL) { ++ list_add(&newln->ln_granted_list, ++ &curln->ln_granted_list); ++ } else { ++ /* replace key owner */ ++ htree_key_list_replace_init(curln, newln); ++ } ++ ++ list_add(&newln->ln_alive_list, &curln->ln_alive_list); ++ htree_key_event_enqueue(child, newln, dep, event); ++ ln_grant_inc(dep, newln->ln_mode); ++ return 1; /* still hold lh_lock */ ++ } ++ ++ if (!wait) { /* can't grant and don't want to wait */ ++ ln_retry_inc(dep, newln->ln_mode); ++ newln->ln_mode = HTREE_LOCK_INVAL; ++ return -1; /* don't wait and just return -1 */ ++ } ++ ++ newlk->lk_task = current; ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ /* conflict, attach it on blocked list of curlk */ ++ list_add_tail(&newln->ln_blocked_list, &curln->ln_blocked_list); ++ list_add(&newln->ln_alive_list, &curln->ln_alive_list); ++ ln_block_inc(dep, newln->ln_mode); ++ ++ htree_spin_unlock(newlk->lk_head, dep); ++ /* wait to be given the lock */ ++ if (newlk->lk_task != NULL) ++ schedule(); ++ /* granted, no doubt, wake up will set me RUNNING */ ++ if (event == NULL || htree_key_event_ignore(child, newln)) ++ return 0; /* granted without lh_lock */ ++ ++ htree_spin_lock(newlk->lk_head, dep); ++ htree_key_event_enqueue(child, newln, dep, event); ++ return 1; /* still hold lh_lock */ ++} ++ ++/* ++ * get PR/PW access to particular tree-node according to @dep and @key, ++ * it will return -1 if @wait is false and can't immediately grant this lock. ++ * All listeners(HTREE_LOCK_NL) on @dep and with the same @key will get ++ * @event if it's not NULL. 
++ * NB: ALWAYS called holding lhead::lh_lock ++ */ ++static int ++htree_node_lock_internal(struct htree_lock_head *lhead, struct htree_lock *lck, ++ htree_lock_mode_t mode, u32 key, unsigned dep, ++ int wait, void *event) ++{ ++ LIST_HEAD(list); ++ struct htree_lock *tmp; ++ struct htree_lock *tmp2; ++ u16 major; ++ u16 minor; ++ u8 reverse; ++ u8 ma_bits; ++ u8 mi_bits; ++ ++ BUG_ON(mode != HTREE_LOCK_PW && mode != HTREE_LOCK_PR); ++ BUG_ON(htree_node_is_granted(lck, dep)); ++ ++ key = hash_long(key, lhead->lh_hbits); ++ ++ mi_bits = lhead->lh_hbits >> 1; ++ ma_bits = lhead->lh_hbits - mi_bits; ++ ++ lck->lk_nodes[dep].ln_major_key = major = key & ((1U << ma_bits) - 1); ++ lck->lk_nodes[dep].ln_minor_key = minor = key >> ma_bits; ++ lck->lk_nodes[dep].ln_mode = mode; ++ ++ /* ++ * The major key list is an ordered list, so searches are started ++ * at the end of the list that is numerically closer to major_key, ++ * so at most half of the list will be walked (for well-distributed ++ * keys). The list traversal aborts early if the expected key ++ * location is passed. ++ */ ++ reverse = (major >= (1 << (ma_bits - 1))); ++ ++ if (reverse) { ++ list_for_each_entry_reverse(tmp, ++ &lhead->lh_children[dep].lc_list, ++ lk_nodes[dep].ln_major_list) { ++ if (tmp->lk_nodes[dep].ln_major_key == major) { ++ goto search_minor; ++ ++ } else if (tmp->lk_nodes[dep].ln_major_key < major) { ++ /* attach _after_ @tmp */ ++ list_add(&lck->lk_nodes[dep].ln_major_list, ++ &tmp->lk_nodes[dep].ln_major_list); ++ goto out_grant_major; ++ } ++ } ++ ++ list_add(&lck->lk_nodes[dep].ln_major_list, ++ &lhead->lh_children[dep].lc_list); ++ goto out_grant_major; ++ ++ } else { ++ list_for_each_entry(tmp, &lhead->lh_children[dep].lc_list, ++ lk_nodes[dep].ln_major_list) { ++ if (tmp->lk_nodes[dep].ln_major_key == major) { ++ goto search_minor; ++ ++ } else if (tmp->lk_nodes[dep].ln_major_key > major) { ++ /* insert _before_ @tmp */ ++ list_add_tail(&lck->lk_nodes[dep].ln_major_list, ++ &tmp->lk_nodes[dep].ln_major_list); ++ goto out_grant_major; ++ } ++ } ++ ++ list_add_tail(&lck->lk_nodes[dep].ln_major_list, ++ &lhead->lh_children[dep].lc_list); ++ goto out_grant_major; ++ } ++ ++ search_minor: ++ /* ++ * NB: minor_key list doesn't have a "head", @list is just a ++ * temporary stub for helping list searching, make sure it's removed ++ * after searching. ++ * minor_key list is an ordered list too. 
++ */ ++ list_add_tail(&list, &tmp->lk_nodes[dep].ln_minor_list); ++ ++ reverse = (minor >= (1 << (mi_bits - 1))); ++ ++ if (reverse) { ++ list_for_each_entry_reverse(tmp2, &list, ++ lk_nodes[dep].ln_minor_list) { ++ if (tmp2->lk_nodes[dep].ln_minor_key == minor) { ++ goto out_enqueue; ++ ++ } else if (tmp2->lk_nodes[dep].ln_minor_key < minor) { ++ /* attach _after_ @tmp2 */ ++ list_add(&lck->lk_nodes[dep].ln_minor_list, ++ &tmp2->lk_nodes[dep].ln_minor_list); ++ goto out_grant_minor; ++ } ++ } ++ ++ list_add(&lck->lk_nodes[dep].ln_minor_list, &list); ++ ++ } else { ++ list_for_each_entry(tmp2, &list, ++ lk_nodes[dep].ln_minor_list) { ++ if (tmp2->lk_nodes[dep].ln_minor_key == minor) { ++ goto out_enqueue; ++ ++ } else if (tmp2->lk_nodes[dep].ln_minor_key > minor) { ++ /* insert _before_ @tmp2 */ ++ list_add_tail(&lck->lk_nodes[dep].ln_minor_list, ++ &tmp2->lk_nodes[dep].ln_minor_list); ++ goto out_grant_minor; ++ } ++ } ++ ++ list_add_tail(&lck->lk_nodes[dep].ln_minor_list, &list); ++ } ++ ++ out_grant_minor: ++ if (list.next == &lck->lk_nodes[dep].ln_minor_list) { ++ /* new lock @lck is the first one on minor_key list, which ++ * means it has the smallest minor_key and it should ++ * replace @tmp as minor_key owner */ ++ list_replace_init(&tmp->lk_nodes[dep].ln_major_list, ++ &lck->lk_nodes[dep].ln_major_list); ++ } ++ /* remove the temporary head */ ++ list_del(&list); ++ ++ out_grant_major: ++ ln_grant_inc(dep, lck->lk_nodes[dep].ln_mode); ++ return 1; /* granted with holding lh_lock */ ++ ++ out_enqueue: ++ list_del(&list); /* remove temprary head */ ++ return htree_node_lock_enqueue(lck, tmp2, dep, wait, event); ++} ++ ++/* ++ * release the key of @lck at level @dep, and grant any blocked locks. ++ * caller will still listen on @key if @event is not NULL, which means ++ * caller can see a event (by event_cb) while granting any lock with ++ * the same key at level @dep. ++ * NB: ALWAYS called holding lhead::lh_lock ++ * NB: listener will not block anyone because listening mode is HTREE_LOCK_NL ++ */ ++static void ++htree_node_unlock_internal(struct htree_lock_head *lhead, ++ struct htree_lock *curlk, unsigned dep, void *event) ++{ ++ struct htree_lock_node *curln = &curlk->lk_nodes[dep]; ++ struct htree_lock *grtlk = NULL; ++ struct htree_lock_node *grtln; ++ struct htree_lock *poslk; ++ struct htree_lock *tmplk; ++ ++ if (!htree_node_is_granted(curlk, dep)) ++ return; ++ ++ if (!list_empty(&curln->ln_granted_list)) { ++ /* there is another granted lock */ ++ grtlk = list_entry(curln->ln_granted_list.next, ++ struct htree_lock, ++ lk_nodes[dep].ln_granted_list); ++ list_del_init(&curln->ln_granted_list); ++ } ++ ++ if (grtlk == NULL && !list_empty(&curln->ln_blocked_list)) { ++ /* ++ * @curlk is the only granted lock, so we confirmed: ++ * a) curln is key owner (attached on major/minor_list), ++ * so if there is any blocked lock, it should be attached ++ * on curln->ln_blocked_list ++ * b) we always can grant the first blocked lock ++ */ ++ grtlk = list_entry(curln->ln_blocked_list.next, ++ struct htree_lock, ++ lk_nodes[dep].ln_blocked_list); ++ BUG_ON(grtlk->lk_task == NULL); ++ wake_up_process(grtlk->lk_task); ++ } ++ ++ if (event != NULL && ++ lhead->lh_children[dep].lc_events != HTREE_EVENT_DISABLE) { ++ curln->ln_ev_target = event; ++ curln->ln_mode = HTREE_LOCK_NL; /* listen! 
*/ ++ } else { ++ curln->ln_mode = HTREE_LOCK_INVAL; ++ } ++ ++ if (grtlk == NULL) { /* I must be the only one locking this key */ ++ struct htree_lock_node *tmpln; ++ ++ BUG_ON(htree_key_list_empty(curln)); ++ ++ if (curln->ln_mode == HTREE_LOCK_NL) /* listening */ ++ return; ++ ++ /* not listening */ ++ if (list_empty(&curln->ln_alive_list)) { /* no more listener */ ++ htree_key_list_del_init(curln); ++ return; ++ } ++ ++ tmpln = list_entry(curln->ln_alive_list.next, ++ struct htree_lock_node, ln_alive_list); ++ ++ BUG_ON(tmpln->ln_mode != HTREE_LOCK_NL); ++ ++ htree_key_list_replace_init(curln, tmpln); ++ list_del_init(&curln->ln_alive_list); ++ ++ return; ++ } ++ ++ /* have a granted lock */ ++ grtln = &grtlk->lk_nodes[dep]; ++ if (!list_empty(&curln->ln_blocked_list)) { ++ /* only key owner can be on both lists */ ++ BUG_ON(htree_key_list_empty(curln)); ++ ++ if (list_empty(&grtln->ln_blocked_list)) { ++ list_add(&grtln->ln_blocked_list, ++ &curln->ln_blocked_list); ++ } ++ list_del_init(&curln->ln_blocked_list); ++ } ++ /* ++ * NB: this is the tricky part: ++ * We have only two modes for child-lock (PR and PW), also, ++ * only owner of the key (attached on major/minor_list) can be on ++ * both blocked_list and granted_list, so @grtlk must be one ++ * of these two cases: ++ * ++ * a) @grtlk is taken from granted_list, which means we've granted ++ * more than one lock so @grtlk has to be PR, the first blocked ++ * lock must be PW and we can't grant it at all. ++ * So even @grtlk is not owner of the key (empty blocked_list), ++ * we don't care because we can't grant any lock. ++ * b) we just grant a new lock which is taken from head of blocked ++ * list, and it should be the first granted lock, and it should ++ * be the first one linked on blocked_list. ++ * ++ * Either way, we can get correct result by iterating blocked_list ++ * of @grtlk, and don't have to bother on how to find out ++ * owner of current key. 
++ */ ++ list_for_each_entry_safe(poslk, tmplk, &grtln->ln_blocked_list, ++ lk_nodes[dep].ln_blocked_list) { ++ if (grtlk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW || ++ poslk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW) ++ break; ++ /* grant all readers */ ++ list_del_init(&poslk->lk_nodes[dep].ln_blocked_list); ++ list_add(&poslk->lk_nodes[dep].ln_granted_list, ++ &grtln->ln_granted_list); ++ ++ BUG_ON(poslk->lk_task == NULL); ++ wake_up_process(poslk->lk_task); ++ } ++ ++ /* if @curln is the owner of this key, replace it with @grtln */ ++ if (!htree_key_list_empty(curln)) ++ htree_key_list_replace_init(curln, grtln); ++ ++ if (curln->ln_mode == HTREE_LOCK_INVAL) ++ list_del_init(&curln->ln_alive_list); ++} ++ ++/* ++ * it's just wrapper of htree_node_lock_internal, it returns 1 on granted ++ * and 0 only if @wait is false and can't grant it immediately ++ */ ++int ++htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, ++ u32 key, unsigned dep, int wait, void *event) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ int rc; ++ ++ BUG_ON(dep >= lck->lk_depth); ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); ++ ++ htree_spin_lock(lhead, dep); ++ rc = htree_node_lock_internal(lhead, lck, mode, key, dep, wait, event); ++ if (rc != 0) ++ htree_spin_unlock(lhead, dep); ++ return rc >= 0; ++} ++EXPORT_SYMBOL(htree_node_lock_try); ++ ++/* it's wrapper of htree_node_unlock_internal */ ++void ++htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ ++ BUG_ON(dep >= lck->lk_depth); ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); ++ ++ htree_spin_lock(lhead, dep); ++ htree_node_unlock_internal(lhead, lck, dep, event); ++ htree_spin_unlock(lhead, dep); ++} ++EXPORT_SYMBOL(htree_node_unlock); ++ ++/* stop listening on child-lock level @dep */ ++void ++htree_node_stop_listen(struct htree_lock *lck, unsigned dep) ++{ ++ struct htree_lock_node *ln = &lck->lk_nodes[dep]; ++ struct htree_lock_node *tmp; ++ ++ BUG_ON(htree_node_is_granted(lck, dep)); ++ BUG_ON(!list_empty(&ln->ln_blocked_list)); ++ BUG_ON(!list_empty(&ln->ln_granted_list)); ++ ++ if (!htree_node_is_listening(lck, dep)) ++ return; ++ ++ htree_spin_lock(lck->lk_head, dep); ++ ln->ln_mode = HTREE_LOCK_INVAL; ++ ln->ln_ev_target = NULL; ++ ++ if (htree_key_list_empty(ln)) { /* not owner */ ++ list_del_init(&ln->ln_alive_list); ++ goto out; ++ } ++ ++ /* I'm the owner... */ ++ if (list_empty(&ln->ln_alive_list)) { /* no more listener */ ++ htree_key_list_del_init(ln); ++ goto out; ++ } ++ ++ tmp = list_entry(ln->ln_alive_list.next, ++ struct htree_lock_node, ln_alive_list); ++ ++ BUG_ON(tmp->ln_mode != HTREE_LOCK_NL); ++ htree_key_list_replace_init(ln, tmp); ++ list_del_init(&ln->ln_alive_list); ++ out: ++ htree_spin_unlock(lck->lk_head, dep); ++} ++EXPORT_SYMBOL(htree_node_stop_listen); ++ ++/* release all child-locks if we have any */ ++static void ++htree_node_release_all(struct htree_lock *lck) ++{ ++ int i; ++ ++ for (i = 0; i < lck->lk_depth; i++) { ++ if (htree_node_is_granted(lck, i)) ++ htree_node_unlock(lck, i, NULL); ++ else if (htree_node_is_listening(lck, i)) ++ htree_node_stop_listen(lck, i); ++ } ++} ++ ++/* ++ * obtain htree lock, it could be blocked inside if there's conflict ++ * with any granted or blocked lock and @wait is true. 
++ * NB: ALWAYS called holding lhead::lh_lock
++ */
++static int
++htree_lock_internal(struct htree_lock *lck, int wait)
++{
++	struct htree_lock_head *lhead = lck->lk_head;
++	int granted = 0;
++	int blocked = 0;
++	int i;
++
++	for (i = 0; i < HTREE_LOCK_MAX; i++) {
++		if (lhead->lh_ngranted[i] != 0)
++			granted |= 1 << i;
++		if (lhead->lh_nblocked[i] != 0)
++			blocked |= 1 << i;
++	}
++	if ((htree_lock_compat[lck->lk_mode] & granted) != granted ||
++	    (htree_lock_compat[lck->lk_mode] & blocked) != blocked) {
++		/* block the current lock even if it only conflicts with
++		 * another blocked lock, so locks like EX won't starve */
++		if (!wait)
++			return -1;
++		lhead->lh_nblocked[lck->lk_mode]++;
++		lk_block_inc(lck->lk_mode);
++
++		lck->lk_task = current;
++		list_add_tail(&lck->lk_blocked_list, &lhead->lh_blocked_list);
++
++		set_current_state(TASK_UNINTERRUPTIBLE);
++		htree_spin_unlock(lhead, HTREE_DEP_ROOT);
++		/* wait to be given the lock */
++		if (lck->lk_task != NULL)
++			schedule();
++		/* granted, no doubt. wake up will set me RUNNING */
++		return 0; /* without lh_lock */
++	}
++	lhead->lh_ngranted[lck->lk_mode]++;
++	lk_grant_inc(lck->lk_mode);
++	return 1;
++}
++
++/* release htree lock. NB: ALWAYS called holding lhead::lh_lock */
++static void
++htree_unlock_internal(struct htree_lock *lck)
++{
++	struct htree_lock_head *lhead = lck->lk_head;
++	struct htree_lock *tmp;
++	struct htree_lock *tmp2;
++	int granted = 0;
++	int i;
++
++	BUG_ON(lhead->lh_ngranted[lck->lk_mode] == 0);
++
++	lhead->lh_ngranted[lck->lk_mode]--;
++	lck->lk_mode = HTREE_LOCK_INVAL;
++
++	for (i = 0; i < HTREE_LOCK_MAX; i++) {
++		if (lhead->lh_ngranted[i] != 0)
++			granted |= 1 << i;
++	}
++	list_for_each_entry_safe(tmp, tmp2,
++				 &lhead->lh_blocked_list, lk_blocked_list) {
++		/* conflict with any granted lock? */
++		if ((htree_lock_compat[tmp->lk_mode] & granted) != granted)
++			break;
++
++		list_del_init(&tmp->lk_blocked_list);
++
++		BUG_ON(lhead->lh_nblocked[tmp->lk_mode] == 0);
++
++		lhead->lh_nblocked[tmp->lk_mode]--;
++		lhead->lh_ngranted[tmp->lk_mode]++;
++		granted |= 1 << tmp->lk_mode;
++
++		BUG_ON(tmp->lk_task == NULL);
++		wake_up_process(tmp->lk_task);
++	}
++}
++
++/* it's a wrapper of htree_lock_internal and the exported interface.
++ * It always returns 1 with the lock granted if @wait is true; it can return
++ * 0 if @wait is false and the locking request can't be granted immediately */
++int
++htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead,
++	       htree_lock_mode_t mode, int wait)
++{
++	int rc;
++
++	BUG_ON(lck->lk_depth > lhead->lh_depth);
++	BUG_ON(lck->lk_head != NULL);
++	BUG_ON(lck->lk_task != NULL);
++
++	lck->lk_head = lhead;
++	lck->lk_mode = mode;
++
++	htree_spin_lock(lhead, HTREE_DEP_ROOT);
++	rc = htree_lock_internal(lck, wait);
++	if (rc != 0)
++		htree_spin_unlock(lhead, HTREE_DEP_ROOT);
++	return rc >= 0;
++}
++EXPORT_SYMBOL(htree_lock_try);
++
++/* it's a wrapper of htree_unlock_internal and the exported interface.
++ * It will release all htree_node_locks and the htree_lock itself */
++void
++htree_unlock(struct htree_lock *lck)
++{
++	BUG_ON(lck->lk_head == NULL);
++	BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
++
++	htree_node_release_all(lck);
++
++	htree_spin_lock(lck->lk_head, HTREE_DEP_ROOT);
++	htree_unlock_internal(lck);
++	htree_spin_unlock(lck->lk_head, HTREE_DEP_ROOT);
++	lck->lk_head = NULL;
++	lck->lk_task = NULL;
++}
++EXPORT_SYMBOL(htree_unlock);
++
++/* change lock mode */
++void
++htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode)
++{
++	BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
++	lck->lk_mode = mode;
++}
++EXPORT_SYMBOL(htree_change_mode);
++
++/* release the htree lock, then lock it again with a new mode.
++ * This function will first release all htree_node_locks and the htree_lock,
++ * then try to gain the htree_lock with the new @mode.
++ * It always returns 1 with the lock granted if @wait is true; it can return
++ * 0 if @wait is false and the locking request can't be granted immediately */
++int
++htree_change_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, int wait)
++{
++	struct htree_lock_head *lhead = lck->lk_head;
++	int rc;
++
++	BUG_ON(lhead == NULL);
++	BUG_ON(lck->lk_mode == mode);
++	BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL || mode == HTREE_LOCK_INVAL);
++
++	htree_node_release_all(lck);
++
++	htree_spin_lock(lhead, HTREE_DEP_ROOT);
++	htree_unlock_internal(lck);
++	lck->lk_mode = mode;
++	rc = htree_lock_internal(lck, wait);
++	if (rc != 0)
++		htree_spin_unlock(lhead, HTREE_DEP_ROOT);
++	return rc >= 0;
++}
++EXPORT_SYMBOL(htree_change_lock_try);
++
++/* create an htree_lock head with @depth levels (number of child-locks),
++ * it is a per-resource structure */
++struct htree_lock_head *
++htree_lock_head_alloc(unsigned depth, unsigned hbits, unsigned priv)
++{
++	struct htree_lock_head *lhead;
++	int i;
++
++	if (depth > HTREE_LOCK_DEP_MAX) {
++		printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
++		       depth, HTREE_LOCK_DEP_MAX);
++		return NULL;
++	}
++
++	lhead = kzalloc(offsetof(struct htree_lock_head,
++				 lh_children[depth]) + priv, GFP_NOFS);
++	if (lhead == NULL)
++		return NULL;
++
++	if (hbits < HTREE_HBITS_MIN)
++		lhead->lh_hbits = HTREE_HBITS_MIN;
++	else if (hbits > HTREE_HBITS_MAX)
++		lhead->lh_hbits = HTREE_HBITS_MAX;
++
++	lhead->lh_lock = 0;
++	lhead->lh_depth = depth;
++	INIT_LIST_HEAD(&lhead->lh_blocked_list);
++	if (priv > 0) {
++		lhead->lh_private = (void *)lhead +
++			offsetof(struct htree_lock_head, lh_children[depth]);
++	}
++
++	for (i = 0; i < depth; i++) {
++		INIT_LIST_HEAD(&lhead->lh_children[i].lc_list);
++		lhead->lh_children[i].lc_events = HTREE_EVENT_DISABLE;
++	}
++	return lhead;
++}
++EXPORT_SYMBOL(htree_lock_head_alloc);
++
++/* free the htree_lock head */
++void
++htree_lock_head_free(struct htree_lock_head *lhead)
++{
++	int i;
++
++	BUG_ON(!list_empty(&lhead->lh_blocked_list));
++	for (i = 0; i < lhead->lh_depth; i++)
++		BUG_ON(!list_empty(&lhead->lh_children[i].lc_list));
++	kfree(lhead);
++}
++EXPORT_SYMBOL(htree_lock_head_free);
++
++/* register event callback for @events of child-lock at level @dep */
++void
++htree_lock_event_attach(struct htree_lock_head *lhead, unsigned dep,
++			unsigned events, htree_event_cb_t callback)
++{
++	BUG_ON(lhead->lh_depth <= dep);
++	lhead->lh_children[dep].lc_events = events;
++	lhead->lh_children[dep].lc_callback = callback;
++}
++EXPORT_SYMBOL(htree_lock_event_attach);
++
++/* allocate an htree_lock, which is a per-thread structure; @pbytes is
++ * extra bytes of private data for the caller */
++struct 
htree_lock * ++htree_lock_alloc(unsigned depth, unsigned pbytes) ++{ ++ struct htree_lock *lck; ++ int i = offsetof(struct htree_lock, lk_nodes[depth]); ++ ++ if (depth > HTREE_LOCK_DEP_MAX) { ++ printk(KERN_ERR "%d is larger than max htree_lock depth %d\n", ++ depth, HTREE_LOCK_DEP_MAX); ++ return NULL; ++ } ++ lck = kzalloc(i + pbytes, GFP_NOFS); ++ if (lck == NULL) ++ return NULL; ++ ++ if (pbytes != 0) ++ lck->lk_private = (void *)lck + i; ++ lck->lk_mode = HTREE_LOCK_INVAL; ++ lck->lk_depth = depth; ++ INIT_LIST_HEAD(&lck->lk_blocked_list); ++ ++ for (i = 0; i < depth; i++) { ++ struct htree_lock_node *node = &lck->lk_nodes[i]; ++ ++ node->ln_mode = HTREE_LOCK_INVAL; ++ INIT_LIST_HEAD(&node->ln_major_list); ++ INIT_LIST_HEAD(&node->ln_minor_list); ++ INIT_LIST_HEAD(&node->ln_alive_list); ++ INIT_LIST_HEAD(&node->ln_blocked_list); ++ INIT_LIST_HEAD(&node->ln_granted_list); ++ } ++ ++ return lck; ++} ++EXPORT_SYMBOL(htree_lock_alloc); ++ ++/* free htree_lock node */ ++void ++htree_lock_free(struct htree_lock *lck) ++{ ++ BUG_ON(lck->lk_mode != HTREE_LOCK_INVAL); ++ kfree(lck); ++} ++EXPORT_SYMBOL(htree_lock_free); +Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/Makefile +=================================================================== +--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/Makefile ++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/Makefile +@@ -6,6 +6,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o + + ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ++ htree_lock.o \ + ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ + mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ + xattr_trusted.o inline.o +Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h +=================================================================== +--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/ext4.h ++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -821,6 +822,9 @@ struct ext4_inode_info { + __u32 i_dtime; + ext4_fsblk_t i_file_acl; + ++ /* following fields for parallel directory operations -bzzz */ ++ struct semaphore i_append_sem; ++ + /* + * i_block_group is the number of the block group which contains + * this file's inode. 
Constant across the lifetime of the inode, +@@ -1846,6 +1850,71 @@ struct dx_hash_info + */ + #define HASH_NB_ALWAYS 1 + ++/* assume name-hash is protected by upper layer */ ++#define EXT4_HTREE_LOCK_HASH 0 ++ ++enum ext4_pdo_lk_types { ++#if EXT4_HTREE_LOCK_HASH ++ EXT4_LK_HASH, ++#endif ++ EXT4_LK_DX, /* index block */ ++ EXT4_LK_DE, /* directory entry block */ ++ EXT4_LK_SPIN, /* spinlock */ ++ EXT4_LK_MAX, ++}; ++ ++/* read-only bit */ ++#define EXT4_LB_RO(b) (1 << (b)) ++/* read + write, high bits for writer */ ++#define EXT4_LB_RW(b) ((1 << (b)) | (1 << (EXT4_LK_MAX + (b)))) ++ ++enum ext4_pdo_lock_bits { ++ /* DX lock bits */ ++ EXT4_LB_DX_RO = EXT4_LB_RO(EXT4_LK_DX), ++ EXT4_LB_DX = EXT4_LB_RW(EXT4_LK_DX), ++ /* DE lock bits */ ++ EXT4_LB_DE_RO = EXT4_LB_RO(EXT4_LK_DE), ++ EXT4_LB_DE = EXT4_LB_RW(EXT4_LK_DE), ++ /* DX spinlock bits */ ++ EXT4_LB_SPIN_RO = EXT4_LB_RO(EXT4_LK_SPIN), ++ EXT4_LB_SPIN = EXT4_LB_RW(EXT4_LK_SPIN), ++ /* accurate searching */ ++ EXT4_LB_EXACT = EXT4_LB_RO(EXT4_LK_MAX << 1), ++}; ++ ++enum ext4_pdo_lock_opc { ++ /* external */ ++ EXT4_HLOCK_READDIR = (EXT4_LB_DE_RO | EXT4_LB_DX_RO), ++ EXT4_HLOCK_LOOKUP = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_DEL = (EXT4_LB_DE | EXT4_LB_SPIN_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_ADD = (EXT4_LB_DE | EXT4_LB_SPIN_RO), ++ ++ /* internal */ ++ EXT4_HLOCK_LOOKUP_SAFE = (EXT4_LB_DE_RO | EXT4_LB_DX_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_DEL_SAFE = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT), ++ EXT4_HLOCK_SPLIT = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN), ++}; ++ ++extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits); ++#define ext4_htree_lock_head_free(lhead) htree_lock_head_free(lhead) ++ ++extern struct htree_lock *ext4_htree_lock_alloc(void); ++#define ext4_htree_lock_free(lck) htree_lock_free(lck) ++ ++extern void ext4_htree_lock(struct htree_lock *lck, ++ struct htree_lock_head *lhead, ++ struct inode *dir, unsigned flags); ++#define ext4_htree_unlock(lck) htree_unlock(lck) ++ ++extern struct buffer_head *__ext4_find_entry(struct inode *dir, ++ const struct qstr *d_name, ++ struct ext4_dir_entry_2 **res_dir, ++ int *inlined, struct htree_lock *lck); ++extern int __ext4_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct htree_lock *lck); + + /* + * Describe an inode's exact location on disk and in memory +@@ -2088,9 +2157,17 @@ void ext4_insert_dentry(struct inode *in + const char *name, int namelen, void *data); + static inline void ext4_update_dx_flag(struct inode *inode) + { ++ /* Disable it for ldiskfs, because going from a DX directory to ++ * a non-DX directory while it is in use will completely break ++ * the htree-locking. 
++ * If we really want to support this operation in the future, ++ * we need to exclusively lock the directory at here which will ++ * increase complexity of code */ ++#if 0 + if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX)) + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); ++#endif + } + static unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c +=================================================================== +--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/namei.c ++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c +@@ -53,6 +53,7 @@ struct buffer_head *ext4_append(handle_t + ext4_lblk_t *block) + { + struct buffer_head *bh; ++ struct ext4_inode_info *ei = EXT4_I(inode); + int err = 0; + + if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && +@@ -60,15 +61,22 @@ struct buffer_head *ext4_append(handle_t + EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) + return ERR_PTR(-ENOSPC); + ++ /* with parallel dir operations all appends ++ * have to be serialized -bzzz */ ++ down(&ei->i_append_sem); ++ + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + + bh = ext4_bread(handle, inode, *block, 1, &err); +- if (!bh) ++ if (!bh) { ++ up(&ei->i_append_sem); + return ERR_PTR(err); ++ } + inode->i_size += inode->i_sb->s_blocksize; + EXT4_I(inode)->i_disksize = inode->i_size; + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); ++ up(&ei->i_append_sem); + if (err) { + brelse(bh); + ext4_std_error(inode->i_sb, err); +@@ -246,7 +254,7 @@ static struct dx_frame *dx_probe(const s + struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame, +- int *err); ++ struct htree_lock *lck, int *err); + static void dx_release(struct dx_frame *frames); + static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); +@@ -259,13 +267,13 @@ static void dx_insert_block(struct dx_fr + static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, +- __u32 *start_hash); ++ __u32 *start_hash, struct htree_lock *lck); + static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, +- int *err); ++ struct htree_lock *lck, int *err); + static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode); ++ struct inode *inode, struct htree_lock *lck); + + /* checksumming functions */ + void initialize_dirent_tail(struct ext4_dir_entry_tail *t, +@@ -668,6 +676,227 @@ struct stats dx_show_entries(struct dx_h + } + #endif /* DX_DEBUG */ + ++/* private data for htree_lock */ ++struct ext4_dir_lock_data { ++ unsigned ld_flags; /* bits-map for lock types */ ++ unsigned ld_count; /* # entries of the last DX block */ ++ struct dx_entry ld_at_entry; /* copy of leaf dx_entry */ ++ struct dx_entry *ld_at; /* position of leaf dx_entry */ ++}; ++ ++#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private) ++#define ext4_find_entry(dir, name, dirent, inline) \ ++ __ext4_find_entry(dir, name, dirent, inline, NULL) ++#define ext4_add_entry(handle, dentry, inode) \ ++ __ext4_add_entry(handle, dentry, inode, NULL) ++ ++/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */ ++#define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32) ++ ++static void ext4_htree_event_cb(void *target, void 
*event) ++{ ++ u64 *block = (u64 *)target; ++ ++ if (*block == dx_get_block((struct dx_entry *)event)) ++ *block = EXT4_HTREE_NODE_CHANGED; ++} ++ ++struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits) ++{ ++ struct htree_lock_head *lhead; ++ ++ lhead = htree_lock_head_alloc(EXT4_LK_MAX, hbits, 0); ++ if (lhead != NULL) { ++ htree_lock_event_attach(lhead, EXT4_LK_SPIN, HTREE_EVENT_WR, ++ ext4_htree_event_cb); ++ } ++ return lhead; ++} ++EXPORT_SYMBOL(ext4_htree_lock_head_alloc); ++ ++struct htree_lock *ext4_htree_lock_alloc(void) ++{ ++ return htree_lock_alloc(EXT4_LK_MAX, ++ sizeof(struct ext4_dir_lock_data)); ++} ++EXPORT_SYMBOL(ext4_htree_lock_alloc); ++ ++static htree_lock_mode_t ext4_htree_mode(unsigned flags) ++{ ++ switch (flags) { ++ default: /* 0 or unknown flags require EX lock */ ++ return HTREE_LOCK_EX; ++ case EXT4_HLOCK_READDIR: ++ return HTREE_LOCK_PR; ++ case EXT4_HLOCK_LOOKUP: ++ return HTREE_LOCK_CR; ++ case EXT4_HLOCK_DEL: ++ case EXT4_HLOCK_ADD: ++ return HTREE_LOCK_CW; ++ } ++} ++ ++/* return PR for read-only operations, otherwise return EX */ ++static inline htree_lock_mode_t ext4_htree_safe_mode(unsigned flags) ++{ ++ int writer = (flags & EXT4_LB_DE) == EXT4_LB_DE; ++ ++ /* 0 requires EX lock */ ++ return (flags == 0 || writer) ? HTREE_LOCK_EX : HTREE_LOCK_PR; ++} ++ ++static int ext4_htree_safe_locked(struct htree_lock *lck) ++{ ++ int writer; ++ ++ if (lck == NULL || lck->lk_mode == HTREE_LOCK_EX) ++ return 1; ++ ++ writer = (ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_DE) == ++ EXT4_LB_DE; ++ if (writer) /* all readers & writers are excluded? */ ++ return lck->lk_mode == HTREE_LOCK_EX; ++ ++ /* all writers are excluded? */ ++ return lck->lk_mode == HTREE_LOCK_PR || ++ lck->lk_mode == HTREE_LOCK_PW || ++ lck->lk_mode == HTREE_LOCK_EX; ++} ++ ++/* relock htree_lock with EX mode if it's change operation, otherwise ++ * relock it with PR mode. It's noop if PDO is disabled. */ ++static void ext4_htree_safe_relock(struct htree_lock *lck) ++{ ++ if (!ext4_htree_safe_locked(lck)) { ++ unsigned flags = ext4_htree_lock_data(lck)->ld_flags; ++ ++ htree_change_lock(lck, ext4_htree_safe_mode(flags)); ++ } ++} ++ ++void ext4_htree_lock(struct htree_lock *lck, struct htree_lock_head *lhead, ++ struct inode *dir, unsigned flags) ++{ ++ htree_lock_mode_t mode = is_dx(dir) ? ext4_htree_mode(flags) : ++ ext4_htree_safe_mode(flags); ++ ++ ext4_htree_lock_data(lck)->ld_flags = flags; ++ htree_lock(lck, lhead, mode); ++ if (!is_dx(dir)) ++ ext4_htree_safe_relock(lck); /* make sure it's safe locked */ ++} ++EXPORT_SYMBOL(ext4_htree_lock); ++ ++static int ext4_htree_node_lock(struct htree_lock *lck, struct dx_entry *at, ++ unsigned lmask, int wait, void *ev) ++{ ++ u32 key = (at == NULL) ? 0 : dx_get_block(at); ++ u32 mode; ++ ++ /* NOOP if htree is well protected or caller doesn't require the lock */ ++ if (ext4_htree_safe_locked(lck) || ++ !(ext4_htree_lock_data(lck)->ld_flags & lmask)) ++ return 1; ++ ++ mode = (ext4_htree_lock_data(lck)->ld_flags & lmask) == lmask ? 
++		HTREE_LOCK_PW : HTREE_LOCK_PR;
++	while (1) {
++		if (htree_node_lock_try(lck, mode, key, ffz(~lmask), wait, ev))
++			return 1;
++		if (!(lmask & EXT4_LB_SPIN)) /* not a spinlock */
++			return 0;
++		cpu_relax(); /* spin until granted */
++	}
++}
++
++static int ext4_htree_node_locked(struct htree_lock *lck, unsigned lmask)
++{
++	return ext4_htree_safe_locked(lck) ||
++	       htree_node_is_granted(lck, ffz(~lmask));
++}
++
++static void ext4_htree_node_unlock(struct htree_lock *lck,
++				   unsigned lmask, void *buf)
++{
++	/* NB: it's safe to call multiple times, even if it's not locked */
++	if (!ext4_htree_safe_locked(lck) &&
++	    htree_node_is_granted(lck, ffz(~lmask)))
++		htree_node_unlock(lck, ffz(~lmask), buf);
++}
++
++#define ext4_htree_dx_lock(lck, key) \
++	ext4_htree_node_lock(lck, key, EXT4_LB_DX, 1, NULL)
++#define ext4_htree_dx_lock_try(lck, key) \
++	ext4_htree_node_lock(lck, key, EXT4_LB_DX, 0, NULL)
++#define ext4_htree_dx_unlock(lck) \
++	ext4_htree_node_unlock(lck, EXT4_LB_DX, NULL)
++#define ext4_htree_dx_locked(lck) \
++	ext4_htree_node_locked(lck, EXT4_LB_DX)
++
++static void ext4_htree_dx_need_lock(struct htree_lock *lck)
++{
++	struct ext4_dir_lock_data *ld;
++
++	if (ext4_htree_safe_locked(lck))
++		return;
++
++	ld = ext4_htree_lock_data(lck);
++	switch (ld->ld_flags) {
++	default:
++		return;
++	case EXT4_HLOCK_LOOKUP:
++		ld->ld_flags = EXT4_HLOCK_LOOKUP_SAFE;
++		return;
++	case EXT4_HLOCK_DEL:
++		ld->ld_flags = EXT4_HLOCK_DEL_SAFE;
++		return;
++	case EXT4_HLOCK_ADD:
++		ld->ld_flags = EXT4_HLOCK_SPLIT;
++		return;
++	}
++}
++
++#define ext4_htree_de_lock(lck, key) \
++	ext4_htree_node_lock(lck, key, EXT4_LB_DE, 1, NULL)
++#define ext4_htree_de_unlock(lck) \
++	ext4_htree_node_unlock(lck, EXT4_LB_DE, NULL)
++
++#define ext4_htree_spin_lock(lck, key, event) \
++	ext4_htree_node_lock(lck, key, EXT4_LB_SPIN, 0, event)
++#define ext4_htree_spin_unlock(lck) \
++	ext4_htree_node_unlock(lck, EXT4_LB_SPIN, NULL)
++#define ext4_htree_spin_unlock_listen(lck, p) \
++	ext4_htree_node_unlock(lck, EXT4_LB_SPIN, p)
++
++static void ext4_htree_spin_stop_listen(struct htree_lock *lck)
++{
++	if (!ext4_htree_safe_locked(lck) &&
++	    htree_node_is_listening(lck, ffz(~EXT4_LB_SPIN)))
++		htree_node_stop_listen(lck, ffz(~EXT4_LB_SPIN));
++}
++
++enum {
++	DX_HASH_COL_IGNORE,	/* ignore collision while probing frames */
++	DX_HASH_COL_YES,	/* there is collision and it does matter */
++	DX_HASH_COL_NO,		/* there is no collision */
++};
++
++static int dx_probe_hash_collision(struct htree_lock *lck,
++				   struct dx_entry *entries,
++				   struct dx_entry *at, u32 hash)
++{
++	if (!(ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_EXACT)) {
++		return DX_HASH_COL_IGNORE; /* don't care about collision */
++
++	} else if (at == entries + dx_get_count(entries) - 1) {
++		return DX_HASH_COL_IGNORE; /* not in any leaf of this DX */
++
++	} else { /* hash collision? */
++		return ((dx_get_hash(at + 1) & ~1) == hash) ?
++		       DX_HASH_COL_YES : DX_HASH_COL_NO;
++	}
++}
++
+ /*
+ * Probe for a directory leaf block to search.
+ * +@@ -679,10 +908,11 @@ struct stats dx_show_entries(struct dx_h + */ + static struct dx_frame * + dx_probe(const struct qstr *d_name, struct inode *dir, +- struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) ++ struct dx_hash_info *hinfo, struct dx_frame *frame_in, ++ struct htree_lock *lck, int *err) + { + unsigned count, indirect; +- struct dx_entry *at, *entries, *p, *q, *m; ++ struct dx_entry *at, *entries, *p, *q, *m, *dx = NULL; + struct dx_root_info *info; + struct buffer_head *bh; + struct dx_frame *frame = frame_in; +@@ -750,8 +980,15 @@ dx_probe(const struct qstr *d_name, stru + dxtrace(printk("Look up %x", hash)); + while (1) + { ++ if (indirect == 0) { /* the last index level */ ++ /* NB: ext4_htree_dx_lock() could be noop if ++ * DX-lock flag is not set for current operation */ ++ ext4_htree_dx_lock(lck, dx); ++ ext4_htree_spin_lock(lck, dx, NULL); ++ } + count = dx_get_count(entries); +- if (!count || count > dx_get_limit(entries)) { ++ if (count == 0 || count > dx_get_limit(entries)) { ++ ext4_htree_spin_unlock(lck); /* release spin */ + ext4_warning(dir->i_sb, + "dx entry: no count or count > limit"); + brelse(bh); +@@ -792,7 +1029,70 @@ dx_probe(const struct qstr *d_name, stru + frame->bh = bh; + frame->entries = entries; + frame->at = at; +- if (!indirect--) return frame; ++ ++ if (indirect == 0) { /* the last index level */ ++ struct ext4_dir_lock_data *ld; ++ u64 myblock; ++ ++ /* By default we only lock DE-block, however, we will ++ * also lock the last level DX-block if: ++ * a) there is hash collision ++ * we will set DX-lock flag (a few lines below) ++ * and redo to lock DX-block ++ * see detail in dx_probe_hash_collision() ++ * b) it's a retry from splitting ++ * we need to lock the last level DX-block so nobody ++ * else can split any leaf blocks under the same ++ * DX-block, see detail in ext4_dx_add_entry() ++ */ ++ if (ext4_htree_dx_locked(lck)) { ++ /* DX-block is locked, just lock DE-block ++ * and return */ ++ ext4_htree_spin_unlock(lck); ++ if (!ext4_htree_safe_locked(lck)) ++ ext4_htree_de_lock(lck, frame->at); ++ return frame; ++ } ++ /* it's pdirop and no DX lock */ ++ if (dx_probe_hash_collision(lck, entries, at, hash) == ++ DX_HASH_COL_YES) { ++ /* found hash collision, set DX-lock flag ++ * and retry to abtain DX-lock */ ++ ext4_htree_spin_unlock(lck); ++ ext4_htree_dx_need_lock(lck); ++ continue; ++ } ++ ld = ext4_htree_lock_data(lck); ++ /* because I don't lock DX, so @at can't be trusted ++ * after I release spinlock so I have to save it */ ++ ld->ld_at = at; ++ ld->ld_at_entry = *at; ++ ld->ld_count = dx_get_count(entries); ++ ++ frame->at = &ld->ld_at_entry; ++ myblock = dx_get_block(at); ++ ++ /* NB: ordering locking */ ++ ext4_htree_spin_unlock_listen(lck, &myblock); ++ /* other thread can split this DE-block because: ++ * a) I don't have lock for the DE-block yet ++ * b) I released spinlock on DX-block ++ * if it happened I can detect it by listening ++ * splitting event on this DE-block */ ++ ext4_htree_de_lock(lck, frame->at); ++ ext4_htree_spin_stop_listen(lck); ++ ++ if (myblock == EXT4_HTREE_NODE_CHANGED) { ++ /* someone split this DE-block before ++ * I locked it, I need to retry and lock ++ * valid DE-block */ ++ ext4_htree_de_unlock(lck); ++ continue; ++ } ++ return frame; ++ } ++ dx = at; ++ indirect--; + bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); + if (IS_ERR(bh)) { + *err = PTR_ERR(bh); +@@ -860,7 +1160,7 @@ static void dx_release (struct dx_frame + static int ext4_htree_next_block(struct inode *dir, 
__u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, +- __u32 *start_hash) ++ __u32 *start_hash, struct htree_lock *lck) + { + struct dx_frame *p; + struct buffer_head *bh; +@@ -875,12 +1175,22 @@ static int ext4_htree_next_block(struct + * this loop, num_frames indicates the number of interior + * nodes need to be read. + */ ++ ext4_htree_de_unlock(lck); + while (1) { +- if (++(p->at) < p->entries + dx_get_count(p->entries)) +- break; ++ if (num_frames > 0 || ext4_htree_dx_locked(lck)) { ++ /* num_frames > 0 : ++ * DX block ++ * ext4_htree_dx_locked: ++ * frame->at is reliable pointer returned by dx_probe, ++ * otherwise dx_probe already knew no collision */ ++ if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ break; ++ } + if (p == frames) + return 0; + num_frames++; ++ if (num_frames == 1) ++ ext4_htree_dx_unlock(lck); + p--; + } + +@@ -903,6 +1213,13 @@ static int ext4_htree_next_block(struct + * block so no check is necessary + */ + while (num_frames--) { ++ if (num_frames == 0) { ++ /* it's not always necessary, we just don't want to ++ * detect hash collision again */ ++ ext4_htree_dx_need_lock(lck); ++ ext4_htree_dx_lock(lck, p->at); ++ } ++ + bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX); + if (IS_ERR(bh)) + return PTR_ERR(bh); +@@ -911,6 +1228,7 @@ static int ext4_htree_next_block(struct + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; + } ++ ext4_htree_de_lock(lck, p->at); + return 1; + } + +@@ -1013,10 +1331,10 @@ int ext4_htree_fill_tree(struct file *di + } + hinfo.hash = start_hash; + hinfo.minor_hash = 0; +- frame = dx_probe(NULL, dir, &hinfo, frames, &err); ++ /* assume it's PR locked */ ++ frame = dx_probe(NULL, dir, &hinfo, frames, NULL, &err); + if (!frame) + return err; +- + /* Add '.' and '..' from the htree header */ + if (!start_hash && !start_minor_hash) { + de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; +@@ -1043,7 +1361,7 @@ int ext4_htree_fill_tree(struct file *di + count += ret; + hashval = ~0; + ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, +- frame, frames, &hashval); ++ frame, frames, &hashval, NULL); + *next_hash = hashval; + if (ret < 0) { + err = ret; +@@ -1236,10 +1554,10 @@ static int is_dx_internal_node(struct in + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ +-static struct buffer_head * ext4_find_entry (struct inode *dir, ++struct buffer_head *__ext4_find_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, +- int *inlined) ++ int *inlined, struct htree_lock *lck) + { + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; +@@ -1283,7 +1601,7 @@ static struct buffer_head * ext4_find_en + goto restart; + } + if (is_dx(dir)) { +- bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); ++ bh = ext4_dx_find_entry(dir, d_name, res_dir, lck, &err); + /* + * On success, or if the error was file not found, + * return. 
Otherwise, fall back to doing a search the +@@ -1297,6 +1615,7 @@ static struct buffer_head * ext4_find_en + return bh; + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); ++ ext4_htree_safe_relock(lck); + } + nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + start = EXT4_I(dir)->i_dir_start_lookup; +@@ -1389,9 +1708,12 @@ cleanup_and_exit: + brelse(bh_use[ra_ptr]); + return ret; + } ++EXPORT_SYMBOL(__ext4_find_entry); + +-static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, +- struct ext4_dir_entry_2 **res_dir, int *err) ++static struct buffer_head *ext4_dx_find_entry(struct inode *dir, ++ const struct qstr *d_name, ++ struct ext4_dir_entry_2 **res_dir, ++ struct htree_lock *lck, int *err) + { + struct super_block * sb = dir->i_sb; + struct dx_hash_info hinfo; +@@ -1400,7 +1722,7 @@ static struct buffer_head * ext4_dx_find + ext4_lblk_t block; + int retval; + +- if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) ++ if (!(frame = dx_probe(d_name, dir, &hinfo, frames, lck, err))) + return NULL; + do { + block = dx_get_block(frame->at); +@@ -1424,7 +1746,7 @@ static struct buffer_head * ext4_dx_find + + /* Check to see if we should continue to search */ + retval = ext4_htree_next_block(dir, hinfo.hash, frame, +- frames, NULL); ++ frames, NULL, lck); + if (retval < 0) { + ext4_warning(sb, + "error reading index page in directory #%lu", +@@ -1583,8 +1905,9 @@ static struct ext4_dir_entry_2* dx_pack_ + * Returns pointer to de in block into which the new entry will be inserted. + */ + static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, +- struct buffer_head **bh,struct dx_frame *frame, +- struct dx_hash_info *hinfo, int *error) ++ struct buffer_head **bh, struct dx_frame *frames, ++ struct dx_frame *frame, struct dx_hash_info *hinfo, ++ struct htree_lock *lck, int *error) + { + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; +@@ -1647,7 +1970,14 @@ static struct ext4_dir_entry_2 *do_split + hash2, split, count-split)); + + /* Fancy dance to stay within two buffers */ +- de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); ++ if (hinfo->hash < hash2) { ++ de2 = dx_move_dirents(data1, data2, map + split, ++ count - split, blocksize); ++ } else { ++ /* make sure we will add entry to the same block which ++ * we have already locked */ ++ de2 = dx_move_dirents(data1, data2, map, split, blocksize); ++ } + de = dx_pack_dirents(data1, blocksize); + de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - + (char *) de, +@@ -1666,13 +1996,21 @@ static struct ext4_dir_entry_2 *do_split + dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); + dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); + +- /* Which block gets the new entry? */ +- if (hinfo->hash >= hash2) +- { +- swap(*bh, bh2); +- de = de2; ++ ext4_htree_spin_lock(lck, frame > frames ? 
(frame - 1)->at : NULL, ++ frame->at); /* notify block is being split */ ++ if (hinfo->hash < hash2) { ++ dx_insert_block(frame, hash2 + continued, newblock); ++ ++ } else { ++ /* switch block number */ ++ dx_insert_block(frame, hash2 + continued, ++ dx_get_block(frame->at)); ++ dx_set_block(frame->at, newblock); ++ (frame->at)++; + } +- dx_insert_block(frame, hash2 + continued, newblock); ++ ext4_htree_spin_unlock(lck); ++ ext4_htree_dx_unlock(lck); ++ + err = ext4_handle_dirty_dirent_node(handle, dir, bh2); + if (err) + goto journal_error; +@@ -1945,7 +2283,7 @@ static int make_indexed_dir(handle_t *ha + ext4_handle_dirty_dx_node(handle, dir, frame->bh); + ext4_handle_dirty_dirent_node(handle, dir, bh); + +- de = do_split(handle,dir, &bh, frame, &hinfo, &retval); ++ de = do_split(handle, dir, &bh, frames, frame, &hinfo, NULL, &retval); + if (!de) { + /* + * Even if the block split failed, we have to properly write +@@ -2051,8 +2389,8 @@ out: + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. + */ +-static int ext4_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) ++int __ext4_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct htree_lock *lck) + { + struct inode *dir = dentry->d_parent->d_inode; + struct buffer_head *bh = NULL; +@@ -2087,9 +2425,10 @@ static int ext4_add_entry(handle_t *hand + if (dentry->d_name.len == 2 && + memcmp(dentry->d_name.name, "..", 2) == 0) + return ext4_update_dotdot(handle, dentry, inode); +- retval = ext4_dx_add_entry(handle, dentry, inode); ++ retval = ext4_dx_add_entry(handle, dentry, inode, lck); + if (!retval || (retval != ERR_BAD_DX_DIR)) + goto out; ++ ext4_htree_safe_relock(lck); + ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); + dx_fallback++; + ext4_mark_inode_dirty(handle, dir); +@@ -2129,12 +2468,13 @@ static int ext4_add_entry(handle_t *hand + ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); + return retval; + } ++EXPORT_SYMBOL(__ext4_add_entry); + + /* + * Returns 0 for success, or a negative error value + */ + static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) ++ struct inode *inode, struct htree_lock *lck) + { + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries, *at; +@@ -2148,7 +2488,7 @@ static int ext4_dx_add_entry(handle_t *h + + again: + restart = 0; +- frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); ++ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, lck, &err); + if (!frame) + return err; + entries = frame->entries; +@@ -2178,6 +2518,11 @@ again: + struct dx_node *node2; + struct buffer_head *bh2; + ++ if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */ ++ ext4_htree_safe_relock(lck); ++ restart = 1; ++ goto cleanup; ++ } + while (frame > frames) { + if (dx_get_count((frame - 1)->entries) < + dx_get_limit((frame - 1)->entries)) { +@@ -2277,16 +2622,43 @@ again: + restart = 1; + goto cleanup; + } ++ } else if (!ext4_htree_dx_locked(lck)) { ++ struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck); ++ ++ /* not well protected, require DX lock */ ++ ext4_htree_dx_need_lock(lck); ++ at = frame > frames ? (frame - 1)->at : NULL; ++ ++ /* NB: no risk of deadlock because it's just a try. ++ * ++ * NB: we check ld_count for twice, the first time before ++ * having DX lock, the second time after holding DX lock. 
++ * ++ * NB: We never free blocks for directory so far, which ++ * means value returned by dx_get_count() should equal to ++ * ld->ld_count if nobody split any DE-block under @at, ++ * and ld->ld_at still points to valid dx_entry. */ ++ if ((ld->ld_count != dx_get_count(entries)) || ++ !ext4_htree_dx_lock_try(lck, at) || ++ (ld->ld_count != dx_get_count(entries))) { ++ restart = 1; ++ goto cleanup; ++ } ++ /* OK, I've got DX lock and nothing changed */ ++ frame->at = ld->ld_at; + } +- de = do_split(handle, dir, &bh, frame, &hinfo, &err); ++ de = do_split(handle, dir, &bh, frames, frame, &hinfo, lck, &err); + if (!de) + goto cleanup; ++ + err = add_dirent_to_buf(handle, dentry, inode, de, bh); + goto cleanup; + + journal_error: + ext4_std_error(dir->i_sb, err); + cleanup: ++ ext4_htree_dx_unlock(lck); ++ ext4_htree_de_unlock(lck); + brelse(bh); + dx_release(frames); + /* @restart is true means htree-path has been changed, we need to +Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/super.c +=================================================================== +--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/super.c ++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/super.c +@@ -875,6 +875,7 @@ static struct inode *ext4_alloc_inode(st + + ei->vfs_inode.i_version = 1; + spin_lock_init(&ei->i_raw_lock); ++ sema_init(&ei->i_append_sem, 1); + INIT_LIST_HEAD(&ei->i_prealloc_list); + spin_lock_init(&ei->i_prealloc_lock); + ext4_es_init_tree(&ei->i_es_tree); diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.2.series b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.2.series new file mode 100644 index 0000000..51fdeba --- /dev/null +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.2.series @@ -0,0 +1,19 @@ +rhel7/ext4-inode-version.patch +rhel7/ext4-lookup-dotdot.patch +rhel6.3/ext4-print-inum-in-htree-warning.patch +rhel7/ext4-prealloc.patch +rhel7/ext4-mballoc-extra-checks.patch +rhel7/ext4-misc.patch +rhel7/ext4-osd-iop-common.patch +rhel7/ext4-hash-indexed-dir-dotdot-update.patch +rhel7/ext4-kill-dx-root.patch +rhel7/ext4-mballoc-pa-free-mismatch.patch +rhel7/ext4-data-in-dirent.patch +rhel7.2/ext4-large-eas.patch +rhel7/ext4-disable-mb-cache.patch +rhel7/ext4-nocmtime.patch +rhel7/ext4-large-dir.patch +rhel7.2/ext4-pdirop.patch +rhel7/ext4-max-dir-size.patch +rhel7/ext4-remove-truncate-warning.patch +rhel7/ext4-corrupted-inode-block-bitmaps-handling-patches.patch diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 46324a3..086591a 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -6,7 +6,7 @@ TBD Intel Corporation 2.6.32-431.29.2.el6 (RHEL6.5) 2.6.32-504.30.3.el6 (RHEL6.6) 2.6.32-573.8.1.el6 (RHEL6.7) - 3.10.0-229.20.1.el7 (RHEL7.1) + 3.10.0-327.3.1.el7 (RHEL7.2) 3.0.101-0.47.67 (SLES11 SP3) 3.0.101-65 (SLES11 SP4) vanilla linux 4.2.1 (ZFS only) @@ -14,7 +14,7 @@ TBD Intel Corporation 2.6.32-431.29.2.el6 (RHEL6.5) 2.6.32-504.30.3.el6 (RHEL6.6) 2.6.32-573.8.1.el6 (RHEL6.7) - 3.10.0-229.20.1.el7 (RHEL7.1) + 3.10.0-327.3.1.el7 (RHEL7.2) 3.0.101-0.47.67 (SLES11 SP3) 3.0.101-65 (SLES11 SP4) 3.12.39-47 (SLES12) diff --git a/lustre/kernel_patches/kernel_configs/kernel-3.10.0-3.10-rhel7-x86_64.config b/lustre/kernel_patches/kernel_configs/kernel-3.10.0-3.10-rhel7-x86_64.config index 9f6e7fd..822315d 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-3.10.0-3.10-rhel7-x86_64.config +++ b/lustre/kernel_patches/kernel_configs/kernel-3.10.0-3.10-rhel7-x86_64.config @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. 
-# Linux/x86 3.10.0 Kernel Configuration +# Linux/x86_64 3.10.0 Kernel Configuration # CONFIG_64BIT=y CONFIG_X86_64=y @@ -78,6 +78,7 @@ CONFIG_HAVE_GENERIC_HARDIRQS=y CONFIG_GENERIC_HARDIRQS=y CONFIG_GENERIC_IRQ_PROBE=y CONFIG_GENERIC_IRQ_SHOW=y +CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ=y CONFIG_GENERIC_PENDING_IRQ=y CONFIG_IRQ_DOMAIN=y # CONFIG_IRQ_DOMAIN_DEBUG is not set @@ -162,7 +163,7 @@ CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_BLK_CGROUP=y # CONFIG_DEBUG_BLK_CGROUP is not set -# CONFIG_CHECKPOINT_RESTORE is not set +CONFIG_CHECKPOINT_RESTORE=y CONFIG_NAMESPACES=y CONFIG_UTS_NS=y CONFIG_IPC_NS=y @@ -204,6 +205,7 @@ CONFIG_TIMERFD=y CONFIG_EVENTFD=y CONFIG_SHMEM=y CONFIG_AIO=y +CONFIG_USERFAULTFD=y CONFIG_PCI_QUIRKS=y # CONFIG_EMBEDDED is not set CONFIG_HAVE_PERF_EVENTS=y @@ -244,6 +246,7 @@ CONFIG_HAVE_DMA_ATTRS=y CONFIG_USE_GENERIC_SMP_HELPERS=y CONFIG_GENERIC_SMP_IDLE_THREAD=y CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y +CONFIG_HAVE_CLK=y CONFIG_HAVE_DMA_API_DEBUG=y CONFIG_HAVE_HW_BREAKPOINT=y CONFIG_HAVE_MIXED_BREAKPOINTS_REGS=y @@ -362,7 +365,7 @@ CONFIG_X86_EXTENDED_PLATFORM=y # CONFIG_X86_NUMACHIP is not set # CONFIG_X86_VSMP is not set CONFIG_X86_UV=y -# CONFIG_X86_INTEL_LPSS is not set +CONFIG_X86_INTEL_LPSS=y CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y CONFIG_SCHED_OMIT_FRAME_POINTER=y CONFIG_HYPERVISOR_GUEST=y @@ -489,8 +492,11 @@ CONFIG_CLEANCACHE=y CONFIG_FRONTSWAP=y CONFIG_CMA=y # CONFIG_CMA_DEBUG is not set -CONFIG_ZBUD=y CONFIG_ZSWAP=y +CONFIG_ZPOOL=y +CONFIG_ZBUD=y +CONFIG_ZSMALLOC=y +# CONFIG_PGTABLE_MAPPING is not set CONFIG_X86_CHECK_BIOS_CORRUPTION=y # CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK is not set CONFIG_X86_RESERVE_LOW=64 @@ -547,6 +553,7 @@ CONFIG_PM_SLEEP_SMP=y CONFIG_PM_RUNTIME=y CONFIG_PM=y # CONFIG_PM_DEBUG is not set +CONFIG_PM_CLK=y CONFIG_ACPI=y CONFIG_ACPI_SLEEP=y CONFIG_ACPI_PROCFS=y @@ -559,7 +566,6 @@ CONFIG_ACPI_BUTTON=y CONFIG_ACPI_VIDEO=m CONFIG_ACPI_FAN=y CONFIG_ACPI_DOCK=y -CONFIG_ACPI_I2C=m CONFIG_ACPI_PROCESSOR=y CONFIG_ACPI_IPMI=m CONFIG_ACPI_HOTPLUG_CPU=y @@ -568,7 +574,6 @@ CONFIG_ACPI_THERMAL=y CONFIG_ACPI_NUMA=y # CONFIG_ACPI_CUSTOM_DSDT is not set CONFIG_ACPI_INITRD_TABLE_OVERRIDE=y -CONFIG_ACPI_BLACKLIST_YEAR=0 # CONFIG_ACPI_DEBUG is not set CONFIG_ACPI_PCI_SLOT=y CONFIG_X86_PM_TIMER=y @@ -745,6 +750,10 @@ CONFIG_IP_PIMSM_V1=y CONFIG_IP_PIMSM_V2=y CONFIG_SYN_COOKIES=y CONFIG_NET_IPVTI=m +CONFIG_NET_UDP_TUNNEL=m +# CONFIG_NET_FOU is not set +# CONFIG_NET_FOU_IP_TUNNELS is not set +CONFIG_GENEVE=m CONFIG_INET_AH=m CONFIG_INET_ESP=m CONFIG_INET_IPCOMP=m @@ -770,6 +779,7 @@ CONFIG_TCP_CONG_LP=m CONFIG_TCP_CONG_VENO=m CONFIG_TCP_CONG_YEAH=m CONFIG_TCP_CONG_ILLINOIS=m +CONFIG_TCP_CONG_DCTCP=m CONFIG_DEFAULT_CUBIC=y # CONFIG_DEFAULT_RENO is not set CONFIG_DEFAULT_TCP_CONG="cubic" @@ -788,11 +798,12 @@ CONFIG_INET6_XFRM_MODE_TRANSPORT=m CONFIG_INET6_XFRM_MODE_TUNNEL=m CONFIG_INET6_XFRM_MODE_BEET=m CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m +CONFIG_IPV6_VTI=m CONFIG_IPV6_SIT=m CONFIG_IPV6_SIT_6RD=y CONFIG_IPV6_NDISC_NODETYPE=y CONFIG_IPV6_TUNNEL=m -# CONFIG_IPV6_GRE is not set +CONFIG_IPV6_GRE=m CONFIG_IPV6_MULTIPLE_TABLES=y # CONFIG_IPV6_SUBTREES is not set CONFIG_IPV6_MROUTE=y @@ -1191,6 +1202,7 @@ CONFIG_NET_SCH_CHOKE=m CONFIG_NET_SCH_QFQ=m CONFIG_NET_SCH_CODEL=m CONFIG_NET_SCH_FQ_CODEL=m +CONFIG_NET_SCH_FQ=m CONFIG_NET_SCH_INGRESS=m CONFIG_NET_SCH_PLUG=m @@ -1235,13 +1247,14 @@ CONFIG_DCB=y CONFIG_DNS_RESOLVER=m # CONFIG_BATMAN_ADV is not set CONFIG_OPENVSWITCH=m -CONFIG_OPENVSWITCH_GRE=y -CONFIG_OPENVSWITCH_VXLAN=y 
+CONFIG_OPENVSWITCH_GRE=m +CONFIG_OPENVSWITCH_VXLAN=m +CONFIG_OPENVSWITCH_GENEVE=m CONFIG_VSOCKETS=m CONFIG_VMWARE_VMCI_VSOCKETS=m CONFIG_NETLINK_MMAP=y CONFIG_NETLINK_DIAG=m -# CONFIG_NET_MPLS_GSO is not set +CONFIG_NET_MPLS_GSO=m CONFIG_RPS=y CONFIG_RFS_ACCEL=y CONFIG_XPS=y @@ -1307,6 +1320,7 @@ CONFIG_MAC80211=m CONFIG_MAC80211_HAS_RC=y CONFIG_MAC80211_RC_MINSTREL=y CONFIG_MAC80211_RC_MINSTREL_HT=y +# CONFIG_MAC80211_RC_MINSTREL_VHT is not set CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y CONFIG_MAC80211_RC_DEFAULT="minstrel_ht" # CONFIG_MAC80211_MESH is not set @@ -1481,6 +1495,7 @@ CONFIG_BLK_DEV_RBD=m # CONFIG_SENSORS_LIS3LV02D=m # CONFIG_AD525X_DPOT is not set +# CONFIG_ATMEL_PWM is not set # CONFIG_DUMMY_IRQ is not set # CONFIG_IBM_ASM is not set # CONFIG_PHANTOM is not set @@ -1536,6 +1551,7 @@ CONFIG_INTEL_MEI_ME=m # CONFIG_INTEL_MEI_TXE is not set CONFIG_VMWARE_VMCI=m # CONFIG_GENWQE is not set +# CONFIG_CXL_BASE is not set CONFIG_HAVE_IDE=y # CONFIG_IDE is not set @@ -1778,6 +1794,7 @@ CONFIG_MD_FAULTY=m # CONFIG_BCACHE is not set CONFIG_BLK_DEV_DM_BUILTIN=y CONFIG_BLK_DEV_DM=m +# CONFIG_DM_MQ_DEFAULT is not set CONFIG_DM_DEBUG=y CONFIG_DM_BUFIO=m CONFIG_DM_BIO_PRISON=m @@ -1788,6 +1805,7 @@ CONFIG_DM_SNAPSHOT=m CONFIG_DM_THIN_PROVISIONING=m CONFIG_DM_CACHE=m CONFIG_DM_CACHE_MQ=m +CONFIG_DM_CACHE_SMQ=m CONFIG_DM_CACHE_CLEANER=m CONFIG_DM_ERA=m CONFIG_DM_MIRROR=m @@ -1806,6 +1824,7 @@ CONFIG_TARGET_CORE=m CONFIG_TCM_IBLOCK=m CONFIG_TCM_FILEIO=m CONFIG_TCM_PSCSI=m +# CONFIG_TCM_USER is not set CONFIG_LOOPBACK_TARGET=m CONFIG_TCM_FC=m CONFIG_ISCSI_TARGET=m @@ -1849,10 +1868,10 @@ CONFIG_VXLAN=m CONFIG_NETCONSOLE=m CONFIG_NETCONSOLE_DYNAMIC=y CONFIG_NETPOLL=y -CONFIG_NETPOLL_TRAP=y CONFIG_NET_POLL_CONTROLLER=y CONFIG_NTB_NETDEV=m CONFIG_TUN=m +# CONFIG_TUN_VNET_CROSS_LE is not set CONFIG_VETH=m CONFIG_VIRTIO_NET=m CONFIG_NLMON=m @@ -1865,6 +1884,8 @@ CONFIG_NLMON=m CONFIG_VHOST_NET=m # CONFIG_VHOST_SCSI is not set CONFIG_VHOST_RING=m +CONFIG_VHOST=m +# CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set # # Distributed Switch Architecture drivers @@ -1906,6 +1927,7 @@ CONFIG_NET_VENDOR_CHELSIO=y # CONFIG_CHELSIO_T1 is not set CONFIG_CHELSIO_T3=m CONFIG_CHELSIO_T4=m +# CONFIG_CHELSIO_T4_DCB is not set CONFIG_CHELSIO_T4VF=m CONFIG_NET_VENDOR_CISCO=y CONFIG_ENIC=m @@ -1926,6 +1948,7 @@ CONFIG_PCMCIA_XIRCOM=m # CONFIG_NET_VENDOR_DLINK is not set CONFIG_NET_VENDOR_EMULEX=y CONFIG_BE2NET=m +CONFIG_BE2NET_HWMON=y CONFIG_BE2NET_VXLAN=y # CONFIG_NET_VENDOR_EXAR is not set # CONFIG_NET_VENDOR_HP is not set @@ -1944,9 +1967,12 @@ CONFIG_IXGBE_DCA=y CONFIG_IXGBE_DCB=y CONFIG_IXGBEVF=m CONFIG_I40E=m -# CONFIG_I40E_VXLAN is not set +CONFIG_I40E_VXLAN=y CONFIG_I40E_DCB=y +# CONFIG_I40E_FCOE is not set CONFIG_I40EVF=m +CONFIG_FM10K=m +CONFIG_FM10K_VXLAN=y # CONFIG_NET_VENDOR_I825XX is not set CONFIG_IP1000=m CONFIG_JME=m @@ -1964,6 +1990,7 @@ CONFIG_MLX4_EN_VXLAN=y CONFIG_MLX4_CORE=m CONFIG_MLX4_DEBUG=y CONFIG_MLX5_CORE=m +CONFIG_MLX5_CORE_EN=y # CONFIG_NET_VENDOR_MICREL is not set CONFIG_NET_VENDOR_MYRI=y CONFIG_MYRI10GE=m @@ -2003,13 +2030,15 @@ CONFIG_SFC=m CONFIG_SFC_MTD=y CONFIG_SFC_MCDI_MON=y CONFIG_SFC_SRIOV=y +CONFIG_SFC_MCDI_LOGGING=y CONFIG_NET_VENDOR_SMSC=y CONFIG_EPIC100=m CONFIG_SMSC9420=m # CONFIG_NET_VENDOR_STMICRO is not set # CONFIG_NET_VENDOR_SUN is not set # CONFIG_NET_VENDOR_TEHUTI is not set -# CONFIG_NET_VENDOR_TI is not set +CONFIG_NET_VENDOR_TI=y +CONFIG_TLAN=m # CONFIG_NET_VENDOR_VIA is not set # CONFIG_NET_VENDOR_WIZNET is not set # CONFIG_FDDI is not set @@ -2125,8 +2154,11 @@ 
CONFIG_ATH9K_PCI=y CONFIG_ATH9K_AHB=y CONFIG_ATH9K_DEBUGFS=y # CONFIG_ATH9K_STATION_STATISTICS is not set +# CONFIG_ATH9K_DYNACK is not set CONFIG_ATH9K_WOW=y CONFIG_ATH9K_RFKILL=y +# CONFIG_ATH9K_CHANNEL_CONTEXT is not set +CONFIG_ATH9K_PCOEM=y CONFIG_ATH9K_HTC=m # CONFIG_ATH9K_HTC_DEBUGFS is not set CONFIG_CARL9170=m @@ -2144,8 +2176,10 @@ CONFIG_WIL6210_TRACING=y CONFIG_BRCMUTIL=m CONFIG_BRCMSMAC=m CONFIG_BRCMFMAC=m +CONFIG_BRCMFMAC_PROTO_BCDC=y CONFIG_BRCMFMAC_SDIO=y CONFIG_BRCMFMAC_USB=y +# CONFIG_BRCMFMAC_PCIE is not set # CONFIG_BRCM_TRACING is not set # CONFIG_BRCMDBG is not set # CONFIG_HOSTAP is not set @@ -2157,6 +2191,7 @@ CONFIG_IWLDVM=m CONFIG_IWLMVM=m CONFIG_IWLWIFI_OPMODE_MODULAR=y # CONFIG_IWLWIFI_BCAST_FILTERING is not set +# CONFIG_IWLWIFI_UAPSD is not set # # Debugging Options @@ -2414,6 +2449,7 @@ CONFIG_KEYBOARD_ATKBD=y # CONFIG_KEYBOARD_MPR121 is not set # CONFIG_KEYBOARD_NEWTON is not set # CONFIG_KEYBOARD_OPENCORES is not set +# CONFIG_KEYBOARD_SAMSUNG is not set # CONFIG_KEYBOARD_STOWAWAY is not set # CONFIG_KEYBOARD_SUNKBD is not set # CONFIG_KEYBOARD_XTKBD is not set @@ -2473,6 +2509,7 @@ CONFIG_TOUCHSCREEN_WACOM_I2C=m # CONFIG_TOUCHSCREEN_TOUCHIT213 is not set # CONFIG_TOUCHSCREEN_TSC_SERIO is not set # CONFIG_TOUCHSCREEN_TSC2007 is not set +# CONFIG_TOUCHSCREEN_W90X900 is not set # CONFIG_TOUCHSCREEN_ST1232 is not set # CONFIG_TOUCHSCREEN_TPS6507X is not set CONFIG_INPUT_MISC=y @@ -2593,14 +2630,17 @@ CONFIG_IPMI_HANDLER=m CONFIG_IPMI_DEVICE_INTERFACE=m CONFIG_IPMI_SI=m # CONFIG_IPMI_SI_PROBE_DEFAULTS is not set +CONFIG_IPMI_SSIF=m CONFIG_IPMI_WATCHDOG=m CONFIG_IPMI_POWEROFF=m CONFIG_HW_RANDOM=y CONFIG_HW_RANDOM_TIMERIOMEM=m CONFIG_HW_RANDOM_INTEL=m CONFIG_HW_RANDOM_AMD=m +# CONFIG_HW_RANDOM_ATMEL is not set CONFIG_HW_RANDOM_VIA=m CONFIG_HW_RANDOM_VIRTIO=m +# CONFIG_HW_RANDOM_EXYNOS is not set CONFIG_HW_RANDOM_TPM=m CONFIG_NVRAM=y # CONFIG_R3964 is not set @@ -2615,12 +2655,17 @@ CONFIG_HANGCHECK_TIMER=m CONFIG_UV_MMTIMER=m CONFIG_TCG_TPM=y CONFIG_TCG_TIS=y -# CONFIG_TCG_TIS_I2C_INFINEON is not set +CONFIG_TCG_TIS_I2C_ATMEL=m +CONFIG_TCG_TIS_I2C_INFINEON=m +CONFIG_TCG_TIS_I2C_NUVOTON=m CONFIG_TCG_NSC=m CONFIG_TCG_ATMEL=m CONFIG_TCG_INFINEON=m +# CONFIG_TCG_XEN is not set +CONFIG_TCG_CRB=m CONFIG_TELCLOCK=m CONFIG_DEVPORT=y +CONFIG_HMC_DRV=m CONFIG_I2C=m CONFIG_I2C_BOARDINFO=y CONFIG_I2C_COMPAT=y @@ -2664,6 +2709,8 @@ CONFIG_I2C_SCMI=m # # I2C system bus drivers (mostly embedded / system-on-chip) # +CONFIG_I2C_DESIGNWARE_CORE=m +CONFIG_I2C_DESIGNWARE_PLATFORM=m # CONFIG_I2C_DESIGNWARE_PCI is not set # CONFIG_I2C_EG20T is not set # CONFIG_I2C_INTEL_MID is not set @@ -2970,6 +3017,7 @@ CONFIG_BCMA=m CONFIG_BCMA_HOST_PCI_POSSIBLE=y CONFIG_BCMA_HOST_PCI=y # CONFIG_BCMA_HOST_SOC is not set +CONFIG_BCMA_DRIVER_PCI=y CONFIG_BCMA_DRIVER_GMAC_CMN=y # CONFIG_BCMA_DEBUG is not set @@ -3538,7 +3586,7 @@ CONFIG_VGA_ARB=y CONFIG_VGA_ARB_MAX_GPUS=64 CONFIG_VGA_SWITCHEROO=y CONFIG_DRM=m -CONFIG_DRM_USB=m +CONFIG_DRM_MIPI_DSI=y CONFIG_DRM_KMS_HELPER=m CONFIG_DRM_KMS_FB_HELPER=y CONFIG_DRM_LOAD_EDID_FIRMWARE=y @@ -3547,12 +3595,14 @@ CONFIG_DRM_TTM=m # # I2C encoder or helper chips # +# CONFIG_DRM_I2C_ADV7511 is not set CONFIG_DRM_I2C_CH7006=m CONFIG_DRM_I2C_SIL164=m CONFIG_DRM_I2C_NXP_TDA998X=m # CONFIG_DRM_TDFX is not set # CONFIG_DRM_R128 is not set CONFIG_DRM_RADEON=m +# CONFIG_DRM_RADEON_USERPTR is not set # CONFIG_DRM_RADEON_UMS is not set CONFIG_DRM_NOUVEAU=m CONFIG_NOUVEAU_DEBUG=5 @@ -3578,6 +3628,11 @@ CONFIG_DRM_MGAG200=m CONFIG_DRM_CIRRUS_QEMU=m CONFIG_DRM_QXL=m 
CONFIG_DRM_BOCHS=m +CONFIG_DRM_PANEL=y + +# +# Display Panels +# # CONFIG_VGASTATE is not set CONFIG_VIDEO_OUTPUT_CONTROL=m CONFIG_HDMI=y @@ -3702,11 +3757,11 @@ CONFIG_SND_SEQ_HRTIMER_DEFAULT=y CONFIG_SND_DYNAMIC_MINORS=y CONFIG_SND_MAX_CARDS=32 # CONFIG_SND_SUPPORT_OLD_API is not set +CONFIG_SND_PROC_FS=y CONFIG_SND_VERBOSE_PROCFS=y # CONFIG_SND_VERBOSE_PRINTK is not set # CONFIG_SND_DEBUG is not set CONFIG_SND_VMASTER=y -CONFIG_SND_KCTL_JACK=y CONFIG_SND_DMA_SGBUF=y CONFIG_SND_RAWMIDI_SEQ=m CONFIG_SND_OPL3_LIB_SEQ=m @@ -3807,20 +3862,17 @@ CONFIG_SND_VX222=m # CONFIG_SND_HDA=m CONFIG_SND_HDA_INTEL=m -CONFIG_SND_HDA_DSP_LOADER=y CONFIG_SND_HDA_PREALLOC_SIZE=512 CONFIG_SND_HDA_HWDEP=y CONFIG_SND_HDA_RECONFIG=y CONFIG_SND_HDA_INPUT_BEEP=y CONFIG_SND_HDA_INPUT_BEEP_MODE=0 -CONFIG_SND_HDA_INPUT_JACK=y CONFIG_SND_HDA_PATCH_LOADER=y CONFIG_SND_HDA_CODEC_REALTEK=m CONFIG_SND_HDA_CODEC_ANALOG=m CONFIG_SND_HDA_CODEC_SIGMATEL=m CONFIG_SND_HDA_CODEC_VIA=m CONFIG_SND_HDA_CODEC_HDMI=m -CONFIG_SND_HDA_I915=y CONFIG_SND_HDA_CODEC_CIRRUS=m CONFIG_SND_HDA_CODEC_CONEXANT=m CONFIG_SND_HDA_CODEC_CA0110=m @@ -3830,6 +3882,9 @@ CONFIG_SND_HDA_CODEC_CMEDIA=m CONFIG_SND_HDA_CODEC_SI3054=m CONFIG_SND_HDA_GENERIC=m CONFIG_SND_HDA_POWER_SAVE_DEFAULT=0 +CONFIG_SND_HDA_CORE=m +CONFIG_SND_HDA_DSP_LOADER=y +CONFIG_SND_HDA_I915=y CONFIG_SND_USB=y CONFIG_SND_USB_AUDIO=m CONFIG_SND_USB_UA101=m @@ -3840,6 +3895,10 @@ CONFIG_SND_USB_US122L=m CONFIG_SND_USB_6FIRE=m # CONFIG_SND_USB_HIFACE is not set # CONFIG_SND_BCD2000 is not set +# CONFIG_SND_USB_POD is not set +# CONFIG_SND_USB_PODHD is not set +# CONFIG_SND_USB_TONEPORT is not set +# CONFIG_SND_USB_VARIAX is not set CONFIG_SND_FIREWIRE=y CONFIG_SND_FIREWIRE_LIB=m CONFIG_SND_FIREWIRE_SPEAKERS=m @@ -3919,6 +3978,7 @@ CONFIG_HID_SONY=m CONFIG_HID_SPEEDLINK=m CONFIG_HID_STEELSERIES=m CONFIG_HID_SUNPLUS=m +CONFIG_HID_RMI=m CONFIG_HID_GREENASIA=m # CONFIG_GREENASIA_FF is not set CONFIG_HID_HYPERV_MOUSE=m @@ -3961,6 +4021,7 @@ CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_DEFAULT_PERSIST=y # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_OTG is not set +# CONFIG_USB_OTG_WHITELIST is not set CONFIG_USB_MON=y CONFIG_USB_WUSB=m CONFIG_USB_WUSB_CBAF=m @@ -3971,6 +4032,7 @@ CONFIG_USB_WUSB_CBAF=m # # CONFIG_USB_C67X00_HCD is not set CONFIG_USB_XHCI_HCD=y +CONFIG_USB_XHCI_PCI=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_EHCI_ROOT_HUB_TT=y CONFIG_USB_EHCI_TT_NEWSCHED=y @@ -4121,6 +4183,8 @@ CONFIG_USB_ISIGHTFW=m # CONFIG_USB_YUREX is not set CONFIG_USB_EZUSB_FX2=m CONFIG_USB_HSIC_USB3503=m +# CONFIG_USB_LINK_LAYER_TEST is not set +# CONFIG_USB_CHAOSKEY is not set CONFIG_USB_ATM=m CONFIG_USB_SPEEDTOUCH=m CONFIG_USB_CXACRU=m @@ -4128,6 +4192,7 @@ CONFIG_USB_UEAGLEATM=m CONFIG_USB_XUSBATM=m # CONFIG_USB_PHY is not set # CONFIG_USB_GADGET is not set +# CONFIG_USB_LED_TRIG is not set CONFIG_UWB=m CONFIG_UWB_HWA=m CONFIG_UWB_WHCI=m @@ -4154,6 +4219,8 @@ CONFIG_MMC_SDHCI_PCI=m CONFIG_MMC_RICOH_MMC=y CONFIG_MMC_SDHCI_ACPI=m CONFIG_MMC_SDHCI_PLTFM=m +# CONFIG_MMC_SDHCI_PXAV3 is not set +# CONFIG_MMC_SDHCI_PXAV2 is not set # CONFIG_MMC_WBSD is not set CONFIG_MMC_TIFM_SD=m CONFIG_MMC_CB710=m @@ -4223,6 +4290,7 @@ CONFIG_INFINIBAND=m CONFIG_INFINIBAND_USER_MAD=m CONFIG_INFINIBAND_USER_ACCESS=m CONFIG_INFINIBAND_USER_MEM=y +CONFIG_INFINIBAND_ON_DEMAND_PAGING=y CONFIG_INFINIBAND_ADDR_TRANS=y CONFIG_INFINIBAND_MTHCA=m CONFIG_INFINIBAND_MTHCA_DEBUG=y @@ -4259,6 +4327,7 @@ CONFIG_EDAC_E752X=m CONFIG_EDAC_I82975X=m CONFIG_EDAC_I3000=m CONFIG_EDAC_I3200=m +CONFIG_EDAC_IE31200=m CONFIG_EDAC_X38=m 
CONFIG_EDAC_I5400=m CONFIG_EDAC_I7CORE=m @@ -4385,7 +4454,9 @@ CONFIG_VIRTIO=m # Virtio drivers # CONFIG_VIRTIO_PCI=m +CONFIG_VIRTIO_PCI_LEGACY=y CONFIG_VIRTIO_BALLOON=m +CONFIG_VIRTIO_INPUT=m # CONFIG_VIRTIO_MMIO is not set # @@ -4439,7 +4510,6 @@ CONFIG_R8712U=m # CONFIG_VT6655 is not set # CONFIG_VT6656 is not set # CONFIG_DX_SEP is not set -CONFIG_ZSMALLOC=y CONFIG_ZRAM=m CONFIG_ZRAM_DEBUG=y # CONFIG_FB_SM7XX is not set @@ -4472,6 +4542,16 @@ CONFIG_ZRAM_DEBUG=y CONFIG_FIREWIRE_SERIAL=m # CONFIG_ZCACHE is not set # CONFIG_USB_DWC2 is not set +CONFIG_UNISYSSPAR=y +CONFIG_UNISYS_VISORBUS=m +CONFIG_UNISYS_VISORNIC=m +CONFIG_UNISYS_VISORHBA=m +CONFIG_UNISYS_VISORHID=m +CONFIG_INFINIBAND_HFI1=m +# CONFIG_HFI1_DEBUG_SDMA_ORDER is not set +CONFIG_HFI1_VERBS_31BIT_PSN=y +# CONFIG_CONFIG_SDMA_VERBOSITY is not set +# CONFIG_CONFIG_PRESCAN_RXQ is not set CONFIG_X86_PLATFORM_DEVICES=y CONFIG_ACER_WMI=m CONFIG_ACERHDF=m @@ -4485,6 +4565,7 @@ CONFIG_FUJITSU_LAPTOP=m CONFIG_FUJITSU_TABLET=m CONFIG_AMILO_RFKILL=m CONFIG_HP_ACCEL=m +CONFIG_HP_WIRELESS=m CONFIG_HP_WMI=m CONFIG_MSI_LAPTOP=m CONFIG_PANASONIC_LAPTOP=m @@ -4520,6 +4601,14 @@ CONFIG_INTEL_OAKTRAIL=m CONFIG_SAMSUNG_Q10=m CONFIG_APPLE_GMUX=m CONFIG_PVPANIC=y +CONFIG_CLKDEV_LOOKUP=y +CONFIG_HAVE_CLK_PREPARE=y +CONFIG_COMMON_CLK=y + +# +# Common Clock Framework +# +# CONFIG_COMMON_CLK_DEBUG is not set # # Hardware Spinlock drivers @@ -4568,6 +4657,7 @@ CONFIG_DELL_RBU=m CONFIG_DCDBAS=m CONFIG_DMIID=y CONFIG_DMI_SYSFS=y +CONFIG_DMI_SCAN_MACHINE_NON_EFI_FALLBACK=y CONFIG_ISCSI_IBFT_FIND=y CONFIG_ISCSI_IBFT=m # CONFIG_GOOGLE_FIRMWARE is not set @@ -4732,6 +4822,7 @@ CONFIG_NFS_V4_2=y CONFIG_PNFS_FILE_LAYOUT=m CONFIG_PNFS_BLOCK=m CONFIG_PNFS_OBJLAYOUT=m +CONFIG_PNFS_FLEXFILE_LAYOUT=m CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org" # CONFIG_NFS_V4_1_MIGRATION is not set CONFIG_NFS_V4_SECURITY_LABEL=y @@ -4744,8 +4835,10 @@ CONFIG_NFSD_V2_ACL=y CONFIG_NFSD_V3=y CONFIG_NFSD_V3_ACL=y CONFIG_NFSD_V4=y +# CONFIG_NFSD_PNFS is not set CONFIG_NFSD_V4_SECURITY_LABEL=y # CONFIG_NFSD_FAULT_INJECTION is not set +CONFIG_GRACE_PERIOD=m CONFIG_LOCKD=m CONFIG_LOCKD_V4=y CONFIG_NFS_ACL_SUPPORT=m @@ -5138,8 +5231,8 @@ CONFIG_CRYPTO_RMD160=m CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA1=y -CONFIG_CRYPTO_SHA1_SSSE3=m -CONFIG_CRYPTO_SHA256_SSSE3=m +CONFIG_CRYPTO_SHA1_SSSE3=y +CONFIG_CRYPTO_SHA256_SSSE3=y CONFIG_CRYPTO_SHA512_SSSE3=m CONFIG_CRYPTO_SHA1_MB=m CONFIG_CRYPTO_SHA256=y @@ -5191,6 +5284,7 @@ CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m CONFIG_CRYPTO_DEFLATE=m CONFIG_CRYPTO_ZLIB=m CONFIG_CRYPTO_LZO=y +# CONFIG_CRYPTO_842 is not set # # Random Number Generation @@ -5230,6 +5324,7 @@ CONFIG_KVM_ASYNC_PF=y CONFIG_HAVE_KVM_MSI=y CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y CONFIG_KVM_VFIO=y +CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y CONFIG_VIRTUALIZATION=y CONFIG_KVM=m CONFIG_KVM_INTEL=m @@ -5311,3 +5406,4 @@ CONFIG_MPILIB=y CONFIG_SIGNATURE=y CONFIG_OID_REGISTRY=y CONFIG_UCS2_STRING=y +CONFIG_RH_KABI_SIZE_ALIGN_CHECKS=y diff --git a/lustre/kernel_patches/targets/3.10-rhel7.target.in b/lustre/kernel_patches/targets/3.10-rhel7.target.in index 9175659..50d2903 100644 --- a/lustre/kernel_patches/targets/3.10-rhel7.target.in +++ b/lustre/kernel_patches/targets/3.10-rhel7.target.in @@ -1,5 +1,5 @@ lnxmaj="3.10.0" -lnxrel="229.20.1.el7" +lnxrel="327.3.1.el7" KERNEL_SRPM=kernel-${lnxmaj}-${lnxrel}.src.rpm SERIES=3.10-rhel7.series diff --git a/lustre/kernel_patches/which_patch b/lustre/kernel_patches/which_patch index b9b303a..7e64833 100644 --- 
a/lustre/kernel_patches/which_patch +++ b/lustre/kernel_patches/which_patch @@ -16,7 +16,7 @@ PATCH SERIES FOR SERVER KERNELS: 2.6-rhel6.series 2.6.32-431.29.2.el6 (RHEL 6.5) 2.6-rhel6.series 2.6.32-504.30.3.el6 (RHEL 6.6) 2.6-rhel6.series 2.6.32-573.8.1.el6 (RHEL 6.7) -3.10-rhel7.series 3.10.0-229.20.1.el7 (RHEL 7.1) +3.10-rhel7.series 3.10.0-327.3.1.el7 (RHEL 7.2) 3.0-sles11sp3.series 3.0.101-0.47.67 (SLES11 SP3) 3.0-sles11sp3.series 3.0.101-65 (SLES11 SP4)
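
Note on how the pdirop locking added above is meant to be consumed: the
htree_lock primitives are EXPORT_SYMBOL'ed precisely so an external caller
(presumably Lustre's osd-ldiskfs module, which is not part of this commit)
can take a tree lock before calling __ext4_find_entry() or __ext4_add_entry().
Inside ext4 itself the ext4_find_entry()/ext4_add_entry() macros pass a NULL
htree_lock, so behaviour without a pdirop caller is unchanged. The sketch
below is illustrative only and not part of the commit: it assumes the API
exactly as introduced by ext4-pdirop.patch, the function name
pdirop_lookup_sketch and the hbits value 16 are hypothetical, and a real
caller would keep the htree_lock_head alive for the lifetime of the
directory inode rather than allocating it per call.

/* Hypothetical caller-side sketch (not in this commit): one locked
 * lookup in a directory, using only symbols exported by the patch. */
static struct buffer_head *
pdirop_lookup_sketch(struct inode *dir, const struct qstr *name,
		     struct ext4_dir_entry_2 **de)
{
	struct htree_lock_head *lhead;
	struct htree_lock *lck;
	struct buffer_head *bh = NULL;

	/* one lock head per directory; a real caller caches it in the
	 * inode instead of allocating here (hbits=16 is a placeholder) */
	lhead = ext4_htree_lock_head_alloc(16);
	if (lhead == NULL)
		return NULL;

	lck = ext4_htree_lock_alloc();	/* one lock handle per thread */
	if (lck != NULL) {
		/* EXT4_HLOCK_LOOKUP maps to HTREE_LOCK_CR on the tree,
		 * so lookups and adds under different leaf blocks can run
		 * concurrently; for a non-DX dir ext4_htree_lock() falls
		 * back to the safe PR/EX mode via ext4_htree_safe_relock() */
		ext4_htree_lock(lck, lhead, dir, EXT4_HLOCK_LOOKUP);
		bh = __ext4_find_entry(dir, name, de, NULL, lck);
		ext4_htree_unlock(lck);
		ext4_htree_lock_free(lck);
	}
	ext4_htree_lock_head_free(lhead);
	return bh;
}

The same pattern with EXT4_HLOCK_ADD or EXT4_HLOCK_DEL around the add and
delete paths gives concurrent directory modification; only when a leaf block
actually has to be split does the code escalate to the last-level DX lock
(EXT4_HLOCK_SPLIT), which is the case the ld_count double-check in
ext4_dx_add_entry() guards against.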