From 953e97987426bcc43a74be5b14f3a71319b33825 Mon Sep 17 00:00:00 2001 From: Yang Sheng Date: Mon, 30 Mar 2015 10:40:35 +0800 Subject: [PATCH] LU-6030 ldiskfs: split pdirop patch Split the pdirop patch into two parts: one for N-level htree and largedir, and the other for pdirop and htree-lock. Also do some cleanup work to reduce the patch size. Signed-off-by: Yang Sheng Change-Id: I08b65d9098be95994f44748dbf14afa9f6d5b372 Reviewed-on: http://review.whamcloud.com/14264 Tested-by: Jenkins Reviewed-by: Andreas Dilger Reviewed-by: Bob Glossman Tested-by: Maloo Reviewed-by: Oleg Drokin --- .../patches/rhel6.3/ext4-large-dir.patch | 355 +++ .../patches/rhel6.3/ext4-osd-iop-common.patch | 48 +- .../patches/rhel6.3/ext4-pdirop.patch | 584 +---- .../patches/rhel6.3/ext4-use-correct-inode.patch | 49 + .../patches/rhel7/ext4-large-dir.patch | 342 +++ .../patches/rhel7/ext4-osd-iop-common.patch | 44 +- .../kernel_patches/patches/rhel7/ext4-pdirop.patch | 614 +----- .../patches/sles11sp2/ext4-large-dir.patch | 364 ++++ .../patches/sles11sp2/ext4-osd-iop-common.patch | 48 +- .../patches/sles11sp2/ext4-pdirop.patch | 2273 -------------------- .../series/ldiskfs-2.6-rhel6.6.series | 2 + .../series/ldiskfs-2.6-sles11.series | 1 + .../series/ldiskfs-3.0-sles11.series | 3 +- .../series/ldiskfs-3.0-sles11sp3.series | 3 +- .../series/ldiskfs-3.10-rhel7.series | 1 + lustre/osd-ldiskfs/osd_internal.h | 8 +- 16 files changed, 1328 insertions(+), 3411 deletions(-) create mode 100644 ldiskfs/kernel_patches/patches/rhel6.3/ext4-large-dir.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel6.3/ext4-use-correct-inode.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel7/ext4-large-dir.patch create mode 100644 ldiskfs/kernel_patches/patches/sles11sp2/ext4-large-dir.patch delete mode 100644 ldiskfs/kernel_patches/patches/sles11sp2/ext4-pdirop.patch diff --git a/ldiskfs/kernel_patches/patches/rhel6.3/ext4-large-dir.patch b/ldiskfs/kernel_patches/patches/rhel6.3/ext4-large-dir.patch new 
file mode 100644 index 0000000..cf5f1f1 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel6.3/ext4-large-dir.patch @@ -0,0 +1,355 @@ +This INCOMPAT_LARGEDIR feature allows larger directories +to be created in ldiskfs, both with directory sizes over +2GB and a maximum htree depth of 3 instead of the +current limit of 2. These features are needed in order +to exceed the current limit of approximately 10M entries +in a single directory. + +Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/ext4.h ++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4.h +@@ -1344,6 +1344,7 @@ EXT4_INODE_BIT_FNS(state, state_flags) + #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 + #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 + #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 ++#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 + + #define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ +@@ -1354,7 +1355,8 @@ EXT4_INODE_BIT_FNS(state, state_flags) + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ + EXT4_FEATURE_INCOMPAT_MMP| \ +- EXT4_FEATURE_INCOMPAT_DIRDATA) ++ EXT4_FEATURE_INCOMPAT_DIRDATA| \ ++ EXT4_FEATURE_INCOMPAT_LARGEDIR) + + #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ +@@ -1612,6 +1614,17 @@ ext4_group_first_block_no(struct super_b + */ + #define ERR_BAD_DX_DIR -75000 + ++/* htree levels for ext4 */ ++#define EXT4_HTREE_LEVEL_COMPAT 2 ++#define EXT4_HTREE_LEVEL 3 ++ ++static inline int ++ext4_dir_htree_level(struct super_block *sb) ++{ ++ return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ? 
++ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; ++} ++ + void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); + +@@ -2005,13 +2018,15 @@ static inline void ext4_r_blocks_count_s + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); + } + +-static inline loff_t ext4_isize(struct ext4_inode *raw_inode) ++static inline loff_t ext4_isize(struct super_block *sb, ++ struct ext4_inode *raw_inode) + { +- if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) ++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) || ++ S_ISREG(le16_to_cpu(raw_inode->i_mode))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); +- else +- return (loff_t) le32_to_cpu(raw_inode->i_size_lo); ++ ++ return (loff_t) le32_to_cpu(raw_inode->i_size_lo); + } + + static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/inode.c +=================================================================== +--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/inode.c ++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/inode.c +@@ -5470,7 +5470,7 @@ struct inode *ext4_iget(struct super_blo + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) + ei->i_file_acl |= + ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; +- inode->i_size = ext4_isize(raw_inode); ++ inode->i_size = ext4_isize(sb, raw_inode); + ei->i_disksize = inode->i_size; + #ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/namei.c +=================================================================== +--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/namei.c ++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/namei.c +@@ -225,7 +225,7 @@ struct dx_root_info * dx_get_dx_info(str + + static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) + { +- return le32_to_cpu(entry->block) & 0x00ffffff; ++ return 
le32_to_cpu(entry->block) & 0x0fffffff; + } + + static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) +@@ -388,7 +388,7 @@ dx_probe(const struct qstr *d_name, stru + struct dx_frame *frame = frame_in; + u32 hash; + +- frame->bh = NULL; ++ memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); + if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) + goto fail; + +@@ -418,9 +418,16 @@ dx_probe(const struct qstr *d_name, stru + goto fail; + } + +- if ((indirect = info->indirect_levels) > 1) { +- ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", +- info->indirect_levels); ++ indirect = info->indirect_levels; ++ if (indirect >= ext4_dir_htree_level(dir->i_sb)) { ++ ext4_warning(dir->i_sb, ++ "Directory (ino: %lu) htree depth %#06x exceed " ++ "supported value", dir->i_ino, ++ ext4_dir_htree_level(dir->i_sb)); ++ if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { ++ ext4_warning(dir->i_sb, "Enable large directory " ++ "feature to access it"); ++ } + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; +@@ -512,13 +519,18 @@ fail: + static void dx_release (struct dx_frame *frames) + { + struct dx_root_info *info; ++ int i; ++ + if (frames[0].bh == NULL) + return; + + info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data); +- if (info->indirect_levels) +- brelse(frames[1].bh); +- brelse(frames[0].bh); ++ for (i = 0; i <= info->indirect_levels; i++) { ++ if (frames[i].bh == NULL) ++ break; ++ brelse(frames[i].bh); ++ frames[i].bh = NULL; ++ } + } + + /* +@@ -661,7 +673,7 @@ int ext4_htree_fill_tree(struct file *di + { + struct dx_hash_info hinfo; + struct ext4_dir_entry_2 *de; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct inode *dir; + ext4_lblk_t block; + int count = 0; +@@ -1003,7 +1015,7 @@ static struct buffer_head * ext4_dx_find + struct super_block * sb; + struct dx_hash_info hinfo; + u32 hash; +- struct dx_frame frames[2], *frame; ++ struct dx_frame 
frames[EXT4_HTREE_LEVEL], *frame; + struct ext4_dir_entry_2 *de, *top; + struct buffer_head *bh; + ext4_lblk_t block; +@@ -1443,7 +1455,7 @@ static int add_dirent_to_buf(handle_t *h + */ + dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + ext4_update_dx_flag(dir); +- dir->i_version++; ++ inode_inc_iversion(dir); + ext4_mark_inode_dirty(handle, dir); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, dir, bh); +@@ -1463,7 +1475,7 @@ static int make_indexed_dir(handle_t *ha + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct buffer_head *bh2; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries; + struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de; + char *data1, *top; +@@ -1712,15 +1724,18 @@ static int ext4_add_entry(handle_t *hand + static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) + { +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries, *at; + struct dx_hash_info hinfo; + struct buffer_head *bh; + struct inode *dir = dentry->d_parent->d_inode; + struct super_block *sb = dir->i_sb; + struct ext4_dir_entry_2 *de; ++ int restart; + int err; + ++again: ++ restart = 0; + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); + if (!frame) + return err; +@@ -1730,33 +1745,48 @@ static int ext4_dx_add_entry(handle_t *h + if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + goto cleanup; + +- BUFFER_TRACE(bh, "get_write_access"); +- err = ext4_journal_get_write_access(handle, bh); +- if (err) +- goto journal_error; +- + err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (err != -ENOSPC) + goto cleanup; + ++ err = 0; + /* Block full, should compress but for now just split */ + dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", + dx_get_count(entries), 
dx_get_limit(entries))); + /* Need to split index? */ + if (dx_get_count(entries) == dx_get_limit(entries)) { + ext4_lblk_t newblock; +- unsigned icount = dx_get_count(entries); +- int levels = frame - frames; ++ int levels = frame - frames + 1; ++ unsigned icount; ++ int add_level = 1; + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; + +- if (levels && (dx_get_count(frames->entries) == +- dx_get_limit(frames->entries))) { +- ext4_warning(sb, "Directory index full!"); ++ while (frame > frames) { ++ if (dx_get_count((frame - 1)->entries) < ++ dx_get_limit((frame - 1)->entries)) { ++ add_level = 0; ++ break; ++ } ++ frame--; /* split higher index block */ ++ at = frame->at; ++ entries = frame->entries; ++ restart = 1; ++ } ++ if (add_level && levels == ext4_dir_htree_level(sb)) { ++ ext4_warning(sb, "Directory (ino: %lu) index full, " ++ "reach max htree level :%d", ++ dir->i_ino, levels); ++ if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { ++ ext4_warning(sb, "Large directory feature is" ++ "not enabled on this " ++ "filesystem"); ++ } + err = -ENOSPC; + goto cleanup; + } ++ icount = dx_get_count(entries); + bh2 = ext4_append (handle, dir, &newblock, &err); + if (!(bh2)) + goto cleanup; +@@ -1769,7 +1799,7 @@ static int ext4_dx_add_entry(handle_t *h + err = ext4_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; +- if (levels) { ++ if (!add_level) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); + dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", +@@ -1777,7 +1807,7 @@ static int ext4_dx_add_entry(handle_t *h + + BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ + err = ext4_journal_get_write_access(handle, +- frames[0].bh); ++ (frame - 1)->bh); + if (err) + goto journal_error; + +@@ -1793,18 +1823,24 @@ static int ext4_dx_add_entry(handle_t *h + frame->entries = entries = entries2; + swap(frame->bh, bh2); + } +- 
dx_insert_block(frames + 0, hash2, newblock); +- dxtrace(dx_show_index("node", frames[1].entries)); ++ dx_insert_block((frame - 1), hash2, newblock); ++ dxtrace(dx_show_index("node", frame->entries)); + dxtrace(dx_show_index("node", + ((struct dx_node *) bh2->b_data)->entries)); + err = ext4_handle_dirty_metadata(handle, dir, bh2); + if (err) + goto journal_error; + brelse (bh2); ++ ext4_handle_dirty_metadata(handle, dir, ++ (frame - 1)->bh); ++ if (restart) { ++ ext4_handle_dirty_metadata(handle, dir, ++ frame->bh); ++ goto cleanup; ++ } + } else { + struct dx_root_info * info; +- dxtrace(printk(KERN_DEBUG +- "Creating second level index...\n")); ++ + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); +@@ -1814,19 +1850,16 @@ static int ext4_dx_add_entry(handle_t *h + dx_set_block(entries + 0, newblock); + info = dx_get_dx_info((struct ext4_dir_entry_2*) + frames[0].bh->b_data); +- info->indirect_levels = 1; +- +- /* Add new access path frame */ +- frame = frames + 1; +- frame->at = at = at - entries + entries2; +- frame->entries = entries = entries2; +- frame->bh = bh2; +- err = ext4_journal_get_write_access(handle, +- frame->bh); +- if (err) +- goto journal_error; ++ info->indirect_levels += 1; ++ dxtrace(printk(KERN_DEBUG ++ "Creating %d level index...\n", ++ info->indirect_levels)); ++ ext4_handle_dirty_metadata(handle, dir, frame->bh); ++ ext4_handle_dirty_metadata(handle, dir, bh2); ++ brelse(bh2); ++ restart = 1; ++ goto cleanup; + } +- err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh); + if (err) { + ext4_std_error(inode->i_sb, err); + goto cleanup; +@@ -1840,6 +1873,10 @@ cleanup: + if (bh) + brelse(bh); + dx_release(frames); ++ /* @restart is true means htree-path has been changed, we need to ++ * repeat dx_probe() to find out valid htree-path */ ++ if (restart && err == 0) ++ goto again; + return err; + } + +@@ -1874,7 +1911,7 @@ int ext4_delete_entry(handle_t *handle, 
+ blocksize); + else + de->inode = 0; +- dir->i_version++; ++ inode_inc_iversion(dir); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, dir, bh); + return 0; diff --git a/ldiskfs/kernel_patches/patches/rhel6.3/ext4-osd-iop-common.patch b/ldiskfs/kernel_patches/patches/rhel6.3/ext4-osd-iop-common.patch index 31c68a4..e8718f9 100644 --- a/ldiskfs/kernel_patches/patches/rhel6.3/ext4-osd-iop-common.patch +++ b/ldiskfs/kernel_patches/patches/rhel6.3/ext4-osd-iop-common.patch @@ -1,20 +1,14 @@ --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h -@@ -1778,6 +1778,19 @@ extern int ext4_orphan_add(handle_t *, s +@@ -1778,6 +1778,13 @@ extern int ext4_orphan_add(handle_t *, s extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); +extern struct inode *ext4_create_inode(handle_t *handle, + struct inode * dir, int mode); -+extern int ext4_add_entry(handle_t *handle, struct dentry *dentry, -+ struct inode *inode); +extern int ext4_delete_entry(handle_t *handle, struct inode * dir, + struct ext4_dir_entry_2 * de_del, + struct buffer_head * bh); -+extern struct buffer_head * ext4_find_entry(struct inode *dir, -+ const struct qstr *d_name, -+ struct ext4_dir_entry_2 ** res_dir); -+#define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir) +extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, + struct inode *inode); @@ -30,46 +24,6 @@ #include #include #include -@@ -873,9 +874,9 @@ static inline int search_dirblock(struct - * The returned buffer_head has ->b_count elevated. The caller is expected - * to brelse() it when appropriate. 
- */ --static struct buffer_head * ext4_find_entry (struct inode *dir, -- const struct qstr *d_name, -- struct ext4_dir_entry_2 ** res_dir) -+struct buffer_head * ext4_find_entry(struct inode *dir, -+ const struct qstr *d_name, -+ struct ext4_dir_entry_2 ** res_dir) - { - struct super_block *sb; - struct buffer_head *bh_use[NAMEI_RA_SIZE]; -@@ -981,6 +982,7 @@ cleanup_and_exit: - brelse(bh_use[ra_ptr]); - return ret; - } -+EXPORT_SYMBOL(ext4_find_entry); - - static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, int *err) -@@ -1515,8 +1517,8 @@ static int make_indexed_dir(handle_t *ha - * may not sleep between calling this and putting something into - * the entry, as someone else might have used it while you slept. - */ --static int ext4_add_entry(handle_t *handle, struct dentry *dentry, -- struct inode *inode) -+int ext4_add_entry(handle_t *handle, struct dentry *dentry, -+ struct inode *inode) - { - struct inode *dir = dentry->d_parent->d_inode; - struct buffer_head *bh; -@@ -1565,6 +1567,7 @@ static int ext4_add_entry(handle_t *hand - brelse(bh); - return retval; - } -+EXPORT_SYMBOL(ext4_add_entry); - - /* - * Returns 0 for success, or a negative error value @@ -1704,10 +1707,10 @@ cleanup: * ext4_delete_entry deletes a directory entry by merging it with the * previous entry diff --git a/ldiskfs/kernel_patches/patches/rhel6.3/ext4-pdirop.patch b/ldiskfs/kernel_patches/patches/rhel6.3/ext4-pdirop.patch index c6e93c3..32adfc1 100644 --- a/ldiskfs/kernel_patches/patches/rhel6.3/ext4-pdirop.patch +++ b/ldiskfs/kernel_patches/patches/rhel6.3/ext4-pdirop.patch @@ -1,5 +1,7 @@ ---- /dev/null 2011-12-14 22:16:16.000000000 +0800 -+++ linux-2.6.32-131.6.1-pdo/include/linux/htree_lock.h 2011-12-02 17:09:34.000000000 +0800 +Index: linux-2.6.32-504.3.3.el6.x86_64/include/linux/htree_lock.h +=================================================================== +--- /dev/null ++++ 
linux-2.6.32-504.3.3.el6.x86_64/include/linux/htree_lock.h @@ -0,0 +1,187 @@ +/* + * include/linux/htree_lock.h @@ -188,8 +190,10 @@ + ((lck)->lk_nodes[dep].ln_mode == HTREE_LOCK_NL) + +#endif ---- /dev/null 2011-12-14 22:16:16.000000000 +0800 -+++ linux-2.6.32-131.6.1-pdo/fs/ext4/htree_lock.c 2011-12-14 22:56:28.000000000 +0800 +Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/htree_lock.c +=================================================================== +--- /dev/null ++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/htree_lock.c @@ -0,0 +1,880 @@ +/* + * fs/ext4/htree_lock.c @@ -1071,9 +1075,11 @@ + kfree(lck); +} +EXPORT_SYMBOL(htree_lock_free); ---- linux-2.6.32-131.6.1/fs/ext4/ext4.h 2011-10-06 20:10:49.000000000 +0800 -+++ linux-2.6.32-131.6.1-pdo/fs/ext4/ext4.h 2011-12-08 18:25:00.000000000 +0800 -@@ -28,6 +28,7 @@ +Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/ext4.h ++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4.h +@@ -27,6 +27,7 @@ #include #include #include @@ -1081,39 +1087,10 @@ #include #include #ifdef __KERNEL__ -@@ -1277,6 +1278,7 @@ EXT4_INODE_BIT_FNS(state, state_flags) - #define EXT4_FEATURE_INCOMPAT_MMP 0x0100 - #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 - #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 -+#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 - - #define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR - #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ -@@ -1286,7 +1288,8 @@ EXT4_INODE_BIT_FNS(state, state_flags) - EXT4_FEATURE_INCOMPAT_64BIT| \ - EXT4_FEATURE_INCOMPAT_FLEX_BG| \ - EXT4_FEATURE_INCOMPAT_MMP| \ -- EXT4_FEATURE_INCOMPAT_DIRDATA) -+ EXT4_FEATURE_INCOMPAT_DIRDATA| \ -+ EXT4_FEATURE_INCOMPAT_LARGEDIR) - - #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ -@@ -1536,6 +1539,76 @@ ext4_group_first_block_no(struct 
super_b - */ - #define ERR_BAD_DX_DIR -75000 +@@ -1625,6 +1626,71 @@ ext4_dir_htree_level(struct super_block + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; + } -+/* htree levels for ext4 */ -+#define EXT4_HTREE_LEVEL_COMPAT 2 -+#define EXT4_HTREE_LEVEL 3 -+ -+static inline int -+ext4_dir_htree_level(struct super_block *sb) -+{ -+ return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ? -+ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; -+} -+ +/* assume name-hash is protected by upper layer */ +#define EXT4_HTREE_LOCK_HASH 0 + @@ -1173,51 +1150,19 @@ + struct inode *dir, unsigned flags); +#define ext4_htree_unlock(lck) htree_unlock(lck) + ++extern struct buffer_head * __ext4_find_entry(struct inode *dir, ++ const struct qstr *d_name, ++ struct ext4_dir_entry_2 **res_dir, ++ struct htree_lock *lck); ++extern int __ext4_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct htree_lock *lck); void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); -@@ -1769,14 +1842,16 @@ extern int ext4_htree_fill_tree(struct f - extern struct inode *ext4_create_inode(handle_t *handle, - struct inode * dir, int mode); - extern int ext4_add_entry(handle_t *handle, struct dentry *dentry, -- struct inode *inode); -+ struct inode *inode, struct htree_lock *lck); - extern int ext4_delete_entry(handle_t *handle, struct inode * dir, - struct ext4_dir_entry_2 * de_del, - struct buffer_head * bh); - extern struct buffer_head * ext4_find_entry(struct inode *dir, - const struct qstr *d_name, -- struct ext4_dir_entry_2 ** res_dir); --#define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir) -+ struct ext4_dir_entry_2 **res_dir, -+ struct htree_lock *lck); -+#define ll_ext4_find_entry(inode, dentry, res_dir, lck) \ -+ ext4_find_entry(inode, &(dentry)->d_name, res_dir, lck) - extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, - 
struct inode *inode, const void *, const void *); - extern struct buffer_head *ext4_append(handle_t *handle, -@@ -1893,13 +1968,15 @@ static inline void ext4_r_blocks_count_s - es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); - } - --static inline loff_t ext4_isize(struct ext4_inode *raw_inode) -+static inline loff_t ext4_isize(struct super_block *sb, -+ struct ext4_inode *raw_inode) - { -- if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) -+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) || -+ S_ISREG(le16_to_cpu(raw_inode->i_mode))) - return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | - le32_to_cpu(raw_inode->i_size_lo); -- else -- return (loff_t) le32_to_cpu(raw_inode->i_size_lo); -+ -+ return (loff_t) le32_to_cpu(raw_inode->i_size_lo); - } - - static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) ---- linux-2.6.32-131.6.1/fs/ext4/namei.c 2011-10-06 20:10:49.000000000 +0800 -+++ linux-2.6.32-131.6.1-pdo/fs/ext4/namei.c 2011-12-14 22:55:28.000000000 +0800 +Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/namei.c +=================================================================== +--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/namei.c ++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/namei.c @@ -176,7 +176,7 @@ static struct dx_frame *dx_probe(const s struct inode *dir, struct dx_hash_info *hinfo, @@ -1244,16 +1189,7 @@ /* * p is at least 6 bytes before the end of page -@@ -225,7 +225,7 @@ struct dx_root_info * dx_get_dx_info(str - - static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) - { -- return le32_to_cpu(entry->block) & 0x00ffffff; -+ return le32_to_cpu(entry->block) & 0x0fffffff; - } - - static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) -@@ -368,6 +368,223 @@ struct stats dx_show_entries(struct dx_h +@@ -368,6 +368,225 @@ struct stats dx_show_entries(struct dx_h } #endif /* DX_DEBUG */ @@ -1265,7 +1201,9 @@ + struct dx_entry *ld_at; /* position of leaf dx_entry */ +}; + 
-+#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private) ++#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private) ++#define ext4_find_entry(dir, name, dirent) __ext4_find_entry(dir, name, dirent, NULL) ++#define ext4_add_entry(handle, dentry, inode) __ext4_add_entry(handle, dentry, inode, NULL) + +/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */ +#define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32) @@ -1477,7 +1415,7 @@ /* * Probe for a directory leaf block to search. * -@@ -379,16 +596,17 @@ struct stats dx_show_entries(struct dx_h +@@ -379,10 +598,11 @@ struct stats dx_show_entries(struct dx_h */ static struct dx_frame * dx_probe(const struct qstr *d_name, struct inode *dir, @@ -1491,34 +1429,7 @@ struct dx_root_info * info; struct buffer_head *bh; struct dx_frame *frame = frame_in; - u32 hash; - -- frame->bh = NULL; -+ memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); - if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) - goto fail; - -@@ -418,9 +636,16 @@ dx_probe(const struct qstr *d_name, stru - goto fail; - } - -- if ((indirect = info->indirect_levels) > 1) { -- ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", -- info->indirect_levels); -+ indirect = info->indirect_levels; -+ if (indirect >= ext4_dir_htree_level(dir->i_sb)) { -+ ext4_warning(dir->i_sb, -+ "Directory (ino: %lu) htree depth %#06x exceed " -+ "supported value", dir->i_ino, -+ ext4_dir_htree_level(dir->i_sb)); -+ if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { -+ ext4_warning(dir->i_sb, "Enable large directory " -+ "feature to access it"); -+ } - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail; -@@ -440,8 +665,15 @@ dx_probe(const struct qstr *d_name, stru +@@ -447,8 +667,15 @@ dx_probe(const struct qstr *d_name, stru dxtrace(printk("Look up %x", hash)); while (1) { @@ -1535,7 +1446,7 @@ ext4_warning(dir->i_sb, "dx entry: no count or count > limit"); brelse(bh); -@@ -482,9 +714,73 @@ 
dx_probe(const struct qstr *d_name, stru +@@ -489,9 +716,73 @@ dx_probe(const struct qstr *d_name, stru frame->bh = bh; frame->entries = entries; frame->at = at; @@ -1610,29 +1521,7 @@ at = entries = ((struct dx_node *) bh->b_data)->entries; if (dx_get_limit(entries) != dx_node_limit (dir)) { ext4_warning(dir->i_sb, -@@ -512,13 +808,18 @@ fail: - static void dx_release (struct dx_frame *frames) - { - struct dx_root_info *info; -+ int i; -+ - if (frames[0].bh == NULL) - return; - - info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data); -- if (info->indirect_levels) -- brelse(frames[1].bh); -- brelse(frames[0].bh); -+ for (i = 0; i <= info->indirect_levels; i++) { -+ if (frames[i].bh == NULL) -+ break; -+ brelse(frames[i].bh); -+ frames[i].bh = NULL; -+ } - } - - /* -@@ -541,7 +842,7 @@ static void dx_release (struct dx_frame +@@ -553,7 +844,7 @@ static void dx_release (struct dx_frame static int ext4_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frame, struct dx_frame *frames, @@ -1641,7 +1530,7 @@ { struct dx_frame *p; struct buffer_head *bh; -@@ -556,12 +857,22 @@ static int ext4_htree_next_block(struct +@@ -568,12 +859,22 @@ static int ext4_htree_next_block(struct * this loop, num_frames indicates the number of interior * nodes need to be read. 
*/ @@ -1666,7 +1555,7 @@ p--; } -@@ -584,6 +895,13 @@ static int ext4_htree_next_block(struct +@@ -596,6 +897,13 @@ static int ext4_htree_next_block(struct * block so no check is necessary */ while (num_frames--) { @@ -1680,7 +1569,7 @@ if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), 0, &err))) return err; /* Failure */ -@@ -592,6 +910,7 @@ static int ext4_htree_next_block(struct +@@ -604,6 +912,7 @@ static int ext4_htree_next_block(struct p->bh = bh; p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; } @@ -1688,16 +1577,7 @@ return 1; } -@@ -661,7 +980,7 @@ int ext4_htree_fill_tree(struct file *di - { - struct dx_hash_info hinfo; - struct ext4_dir_entry_2 *de; -- struct dx_frame frames[2], *frame; -+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; - struct inode *dir; - ext4_lblk_t block; - int count = 0; -@@ -684,10 +1003,10 @@ int ext4_htree_fill_tree(struct file *di +@@ -696,10 +1005,10 @@ int ext4_htree_fill_tree(struct file *di } hinfo.hash = start_hash; hinfo.minor_hash = 0; @@ -1710,7 +1590,7 @@ /* Add '.' and '..' from the htree header */ if (!start_hash && !start_minor_hash) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; -@@ -714,7 +1033,7 @@ int ext4_htree_fill_tree(struct file *di +@@ -726,7 +1035,7 @@ int ext4_htree_fill_tree(struct file *di count += ret; hashval = ~0; ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, @@ -1719,7 +1599,7 @@ *next_hash = hashval; if (ret < 0) { err = ret; -@@ -814,9 +1133,17 @@ static void dx_insert_block(struct dx_fr +@@ -826,9 +1135,17 @@ static void dx_insert_block(struct dx_fr static void ext4_update_dx_flag(struct inode *inode) { @@ -1737,19 +1617,20 @@ } /* -@@ -889,8 +1216,9 @@ static inline int search_dirblock(struct +@@ -900,9 +1217,10 @@ static inline int search_dirblock(struct + * The returned buffer_head has ->b_count elevated. The caller is expected * to brelse() it when appropriate. 
*/ - struct buffer_head * ext4_find_entry(struct inode *dir, -- const struct qstr *d_name, -- struct ext4_dir_entry_2 ** res_dir) -+ const struct qstr *d_name, -+ struct ext4_dir_entry_2 **res_dir, -+ struct htree_lock *lck) +-static struct buffer_head * ext4_find_entry (struct inode *dir, ++struct buffer_head * __ext4_find_entry(struct inode *dir, + const struct qstr *d_name, +- struct ext4_dir_entry_2 ** res_dir) ++ struct ext4_dir_entry_2 **res_dir, ++ struct htree_lock *lck) { struct super_block *sb; struct buffer_head *bh_use[NAMEI_RA_SIZE]; -@@ -911,7 +1239,7 @@ struct buffer_head * ext4_find_entry(str +@@ -923,7 +1241,7 @@ static struct buffer_head * ext4_find_en if (namelen > EXT4_NAME_LEN) return NULL; if (is_dx(dir)) { @@ -1758,7 +1639,7 @@ /* * On success, or if the error was file not found, * return. Otherwise, fall back to doing a search the -@@ -921,6 +1249,7 @@ struct buffer_head * ext4_find_entry(str +@@ -933,6 +1251,7 @@ static struct buffer_head * ext4_find_en return bh; dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " "falling back\n")); @@ -1766,9 +1647,11 @@ } nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); start = EXT4_I(dir)->i_dir_start_lookup; -@@ -998,13 +1327,15 @@ cleanup_and_exit: +@@ -1008,9 +1327,12 @@ cleanup_and_exit: + brelse(bh_use[ra_ptr]); + return ret; } - EXPORT_SYMBOL(ext4_find_entry); ++EXPORT_SYMBOL(__ext4_find_entry); -static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, int *err) @@ -1779,13 +1662,7 @@ { struct super_block * sb; struct dx_hash_info hinfo; - u32 hash; -- struct dx_frame frames[2], *frame; -+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; - struct ext4_dir_entry_2 *de, *top; - struct buffer_head *bh; - ext4_lblk_t block; -@@ -1015,13 +1346,16 @@ static struct buffer_head * ext4_dx_find +@@ -1026,13 +1348,16 @@ static struct buffer_head * ext4_dx_find sb = dir->i_sb; /* NFS may look up ".." 
- look at dx_root directory block */ if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ @@ -1803,7 +1680,7 @@ } hash = hinfo.hash; do { -@@ -1050,7 +1384,7 @@ static struct buffer_head * ext4_dx_find +@@ -1061,7 +1386,7 @@ static struct buffer_head * ext4_dx_find brelse(bh); /* Check to see if we should continue to search */ retval = ext4_htree_next_block(dir, hash, frame, @@ -1812,25 +1689,7 @@ if (retval < 0) { ext4_warning(sb, "error reading index page in directory #%lu", -@@ -1076,7 +1410,7 @@ static struct dentry *ext4_lookup(struct - if (dentry->d_name.len > EXT4_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); - -- bh = ext4_find_entry(dir, &dentry->d_name, &de); -+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); - inode = NULL; - if (bh) { - __u32 ino = le32_to_cpu(de->inode); -@@ -1144,7 +1478,7 @@ struct dentry *ext4_get_parent(struct de - struct ext4_dir_entry_2 * de; - struct buffer_head *bh; - -- bh = ext4_find_entry(child->d_inode, &dotdot, &de); -+ bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); - inode = NULL; - if (!bh) - return ERR_PTR(-ENOENT); -@@ -1233,8 +1567,9 @@ static struct ext4_dir_entry_2* dx_pack_ +@@ -1244,8 +1569,9 @@ static struct ext4_dir_entry_2* dx_pack_ * Returns pointer to de in block into which the new entry will be inserted. 
*/ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, @@ -1842,7 +1701,7 @@ { unsigned blocksize = dir->i_sb->s_blocksize; unsigned count, continued; -@@ -1291,7 +1626,14 @@ static struct ext4_dir_entry_2 *do_split +@@ -1302,7 +1628,14 @@ static struct ext4_dir_entry_2 *do_split hash2, split, count-split)); /* Fancy dance to stay within two buffers */ @@ -1858,7 +1717,7 @@ de = dx_pack_dirents(data1, blocksize); de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, blocksize); -@@ -1300,13 +1642,21 @@ static struct ext4_dir_entry_2 *do_split +@@ -1311,13 +1644,21 @@ static struct ext4_dir_entry_2 *do_split dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); @@ -1886,43 +1745,27 @@ err = ext4_handle_dirty_metadata(handle, dir, bh2); if (err) goto journal_error; -@@ -1418,7 +1768,7 @@ static int add_dirent_to_buf(handle_t *h - if (!IS_NOCMTIME(dir)) - dir->i_mtime = dir->i_ctime = ext4_current_time(dir); - ext4_update_dx_flag(dir); -- dir->i_version++; -+ inode_inc_iversion(dir); - ext4_mark_inode_dirty(handle, dir); - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, dir, bh); -@@ -1438,7 +1788,7 @@ static int make_indexed_dir(handle_t *ha - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; - struct buffer_head *bh2; -- struct dx_frame frames[2], *frame; -+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; - struct dx_entry *entries; - struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de; - char *data1, *top; -@@ -1517,7 +1867,7 @@ static int make_indexed_dir(handle_t *ha - ext4_handle_dirty_metadata(handle, dir, frame->bh); - ext4_handle_dirty_metadata(handle, dir, bh); - +@@ -1558,7 +1899,7 @@ static int make_indexed_dir(handle_t *ha + ext4_handle_dirty_metadata(handle, dir, frame->bh); + ext4_handle_dirty_metadata(handle, dir, bh); + - de = 
do_split(handle,dir, &bh, frame, &hinfo, &retval); + de = do_split(handle,dir, &bh, frames, frame, &hinfo, NULL, &retval); - if (!de) { - /* - * Even if the block split failed, we have to properly write -@@ -1616,7 +1966,7 @@ out: + if (!de) { + /* + * Even if the block split failed, we have to properly write +@@ -1664,8 +2005,8 @@ out: + * may not sleep between calling this and putting something into * the entry, as someone else might have used it while you slept. */ - int ext4_add_entry(handle_t *handle, struct dentry *dentry, -- struct inode *inode) -+ struct inode *inode, struct htree_lock *lck) +-static int ext4_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) ++int __ext4_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct htree_lock *lck) { struct inode *dir = dentry->d_parent->d_inode; struct buffer_head *bh; -@@ -1635,9 +1985,10 @@ int ext4_add_entry(handle_t *handle, str +@@ -1684,9 +2025,10 @@ static int ext4_add_entry(handle_t *hand if (dentry->d_name.len == 2 && memcmp(dentry->d_name.name, "..", 2) == 0) return ext4_update_dotdot(handle, dentry, inode); @@ -1934,169 +1777,49 @@ ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); dx_fallback++; ext4_mark_inode_dirty(handle, dir); -@@ -1674,18 +2025,21 @@ EXPORT_SYMBOL(ext4_add_entry); +@@ -1717,12 +2059,13 @@ static int ext4_add_entry(handle_t *hand + brelse(bh); + return retval; + } ++EXPORT_SYMBOL(__ext4_add_entry); + + /* * Returns 0 for success, or a negative error value */ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode) + struct inode *inode, struct htree_lock *lck) { -- struct dx_frame frames[2], *frame; -+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries, *at; - struct dx_hash_info hinfo; - struct buffer_head *bh; - struct inode *dir = dentry->d_parent->d_inode; - struct super_block *sb = dir->i_sb; - struct ext4_dir_entry_2 
*de; -+ int restart; - int err; +@@ -1736,7 +2079,7 @@ static int ext4_dx_add_entry(handle_t *h + again: + restart = 0; - frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); -+again: -+ restart = 0; + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, lck, &err); if (!frame) return err; entries = frame->entries; -@@ -1694,33 +2048,53 @@ static int ext4_dx_add_entry(handle_t *h - if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) - goto cleanup; - -- BUFFER_TRACE(bh, "get_write_access"); -- err = ext4_journal_get_write_access(handle, bh); -- if (err) -- goto journal_error; -- - err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (err != -ENOSPC) - goto cleanup; - -+ err = 0; - /* Block full, should compress but for now just split */ - dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", - dx_get_count(entries), dx_get_limit(entries))); - /* Need to split index? */ - if (dx_get_count(entries) == dx_get_limit(entries)) { - ext4_lblk_t newblock; -- unsigned icount = dx_get_count(entries); -- int levels = frame - frames; -+ int levels = frame - frames + 1; -+ unsigned icount; -+ int add_level = 1; - struct dx_entry *entries2; +@@ -1763,6 +2106,11 @@ again: struct dx_node *node2; struct buffer_head *bh2; -- if (levels && (dx_get_count(frames->entries) == -- dx_get_limit(frames->entries))) { -- ext4_warning(sb, "Directory index full!"); + if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */ + ext4_htree_safe_relock(lck); + restart = 1; + goto cleanup; + } -+ while (frame > frames) { -+ if (dx_get_count((frame - 1)->entries) < -+ dx_get_limit((frame - 1)->entries)) { -+ add_level = 0; -+ break; -+ } -+ frame--; /* split higher index block */ -+ at = frame->at; -+ entries = frame->entries; -+ restart = 1; -+ } -+ if (add_level && levels == ext4_dir_htree_level(sb)) { -+ ext4_warning(sb, "Directory (ino: %lu) index full, " -+ "reach max htree level :%d", -+ dir->i_ino, levels); -+ if (ext4_dir_htree_level(sb) < 
EXT4_HTREE_LEVEL) { -+ ext4_warning(sb, "Large directory feature is" -+ "not enabled on this " -+ "filesystem"); -+ } - err = -ENOSPC; + while (frame > frames) { + if (dx_get_count((frame - 1)->entries) < + dx_get_limit((frame - 1)->entries)) { +@@ -1860,16 +2208,43 @@ again: + restart = 1; goto cleanup; } -+ icount = dx_get_count(entries); - bh2 = ext4_append (handle, dir, &newblock, &err); - if (!(bh2)) - goto cleanup; -@@ -1733,7 +2107,7 @@ static int ext4_dx_add_entry(handle_t *h - err = ext4_journal_get_write_access(handle, frame->bh); - if (err) - goto journal_error; -- if (levels) { -+ if (!add_level) { - unsigned icount1 = icount/2, icount2 = icount - icount1; - unsigned hash2 = dx_get_hash(entries + icount1); - dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", -@@ -1741,7 +2115,7 @@ static int ext4_dx_add_entry(handle_t *h - - BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ - err = ext4_journal_get_write_access(handle, -- frames[0].bh); -+ (frame - 1)->bh); - if (err) - goto journal_error; - -@@ -1757,18 +2131,24 @@ static int ext4_dx_add_entry(handle_t *h - frame->entries = entries = entries2; - swap(frame->bh, bh2); - } -- dx_insert_block(frames + 0, hash2, newblock); -- dxtrace(dx_show_index("node", frames[1].entries)); -+ dx_insert_block((frame - 1), hash2, newblock); -+ dxtrace(dx_show_index("node", frame->entries)); - dxtrace(dx_show_index("node", - ((struct dx_node *) bh2->b_data)->entries)); - err = ext4_handle_dirty_metadata(handle, inode, bh2); - if (err) - goto journal_error; - brelse (bh2); -+ ext4_handle_dirty_metadata(handle, inode, -+ (frame - 1)->bh); -+ if (restart) { -+ ext4_handle_dirty_metadata(handle, inode, -+ frame->bh); -+ goto cleanup; -+ } - } else { - struct dx_root_info * info; -- dxtrace(printk(KERN_DEBUG -- "Creating second level index...\n")); -+ - memcpy((char *) entries2, (char *) entries, - icount * sizeof(struct dx_entry)); - dx_set_limit(entries2, dx_node_limit(dir)); -@@ -1778,32 +2158,60 @@ static int 
ext4_dx_add_entry(handle_t *h - dx_set_block(entries + 0, newblock); - info = dx_get_dx_info((struct ext4_dir_entry_2*) - frames[0].bh->b_data); -- info->indirect_levels = 1; -+ info->indirect_levels += 1; -+ dxtrace(printk(KERN_DEBUG -+ "Creating %d level index...\n", -+ info->indirect_levels)); -+ ext4_handle_dirty_metadata(handle, inode, frame->bh); -+ ext4_handle_dirty_metadata(handle, inode, bh2); -+ brelse(bh2); -+ restart = 1; -+ goto cleanup; -+ } + } else if (!ext4_htree_dx_locked(lck)) { + struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck); - -- /* Add new access path frame */ -- frame = frames + 1; -- frame->at = at = at - entries + entries2; -- frame->entries = entries = entries2; -- frame->bh = bh2; -- err = ext4_journal_get_write_access(handle, -- frame->bh); -- if (err) -- goto journal_error; ++ + /* not well protected, require DX lock */ + ext4_htree_dx_need_lock(lck); + at = frame > frames ? (frame - 1)->at : NULL; @@ -2115,8 +1838,7 @@ + (ld->ld_count != dx_get_count(entries))) { + restart = 1; + goto cleanup; - } -- ext4_handle_dirty_metadata(handle, inode, frames[0].bh); ++ } + /* OK, I've got DX lock and nothing changed */ + frame->at = ld->ld_at; } @@ -2136,123 +1858,15 @@ if (bh) brelse(bh); dx_release(frames); -+ /* @restart is true means htree-path has been changed, we need to -+ * repeat dx_probe() to find out valid htree-path */ -+ if (restart && err == 0) -+ goto again; - return err; - } - -@@ -1838,7 +2246,7 @@ int ext4_delete_entry(handle_t *handle, - blocksize); - else - de->inode = 0; -- dir->i_version++; -+ inode_inc_iversion(dir); - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, dir, bh); - return 0; -@@ -1882,7 +2290,7 @@ static void ext4_dec_count(handle_t *han - static int ext4_add_nondir(handle_t *handle, - struct dentry *dentry, struct inode *inode) - { -- int err = ext4_add_entry(handle, dentry, inode); -+ int err = ext4_add_entry(handle, dentry, inode, NULL); - if (!err) { - 
ext4_mark_inode_dirty(handle, inode); - d_instantiate(dentry, inode); -@@ -2112,7 +2520,7 @@ retry: - goto out_stop; - } - -- err = ext4_add_entry(handle, dentry, inode); -+ err = ext4_add_entry(handle, dentry, inode, NULL); - if (err) - goto out_clear_inode; - ext4_inc_count(handle, dir); -@@ -2381,7 +2789,7 @@ static int ext4_rmdir(struct inode *dir, - return PTR_ERR(handle); - - retval = -ENOENT; -- bh = ext4_find_entry(dir, &dentry->d_name, &de); -+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); - if (!bh) - goto end_rmdir; - -@@ -2443,7 +2851,7 @@ static int ext4_unlink(struct inode *dir - ext4_handle_sync(handle); - - retval = -ENOENT; -- bh = ext4_find_entry(dir, &dentry->d_name, &de); -+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); - if (!bh) - goto end_unlink; - -@@ -2567,7 +2975,7 @@ retry: - ext4_inc_count(handle, inode); - atomic_inc(&inode->i_count); - -- err = ext4_add_entry(handle, dentry, inode); -+ err = ext4_add_entry(handle, dentry, inode, NULL); - if (!err) { - ext4_mark_inode_dirty(handle, inode); - d_instantiate(dentry, inode); -@@ -2612,7 +3020,7 @@ static int ext4_rename(struct inode *old - if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) - ext4_handle_sync(handle); - -- old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); -+ old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL); - /* - * Check for inode number is _not_ due to possible IO errors. 
- * We might rmdir the source, keep it as pwd of some process -@@ -2625,7 +3033,7 @@ static int ext4_rename(struct inode *old - goto end_rename; - - new_inode = new_dentry->d_inode; -- new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de); -+ new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de, NULL); - if (new_bh) { - if (!new_inode) { - brelse(new_bh); -@@ -2651,7 +3059,7 @@ static int ext4_rename(struct inode *old - goto end_rename; - } - if (!new_bh) { -- retval = ext4_add_entry(handle, new_dentry, old_inode); -+ retval = ext4_add_entry(handle, new_dentry, old_inode, NULL); - if (retval) - goto end_rename; - } else { -@@ -2693,7 +3101,8 @@ static int ext4_rename(struct inode *old - struct buffer_head *old_bh2; - struct ext4_dir_entry_2 *old_de2; +Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/Makefile +=================================================================== +--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/Makefile ++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/Makefile +@@ -6,6 +6,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o -- old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2); -+ old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, -+ &old_de2, NULL); - if (old_bh2) { - retval = ext4_delete_entry(handle, old_dir, - old_de2, old_bh2); ---- linux-2.6.32-131.6.1/fs/ext4/inode.c 2011-10-06 20:10:49.000000000 +0800 -+++ linux-2.6.32-131.6.1-pdo/fs/ext4/inode.c 2011-12-01 22:02:11.000000000 +0800 -@@ -5112,7 +5112,7 @@ struct inode *ext4_iget(struct super_blo - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) - ei->i_file_acl |= - ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; -- inode->i_size = ext4_isize(raw_inode); -+ inode->i_size = ext4_isize(sb, raw_inode); - ei->i_disksize = inode->i_size; - #ifdef CONFIG_QUOTA - ei->i_reserved_quota = 0; ---- linux-2.6.32-131.6.1/fs/ext4/Makefile 2011-10-06 20:10:49.000000000 +0800 -+++ linux-2.6.32-131.6.1-pdo/fs/ext4/Makefile 2011-10-06 12:21:30.000000000 
+0800 -@@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ++ htree_lock.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ -- mmp.o -+ htree_lock.o mmp.o + mmp.o - ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/ldiskfs/kernel_patches/patches/rhel6.3/ext4-use-correct-inode.patch b/ldiskfs/kernel_patches/patches/rhel6.3/ext4-use-correct-inode.patch new file mode 100644 index 0000000..5d86d19 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel6.3/ext4-use-correct-inode.patch @@ -0,0 +1,49 @@ +From 5930ea643805feb50a2f8383ae12eb6f10935e49 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Wed, 31 Aug 2011 12:02:51 -0400 +Subject: [PATCH] ext4: call ext4_handle_dirty_metadata with correct inode in + ext4_dx_add_entry + +ext4_dx_add_entry manipulates bh2 and frames[0].bh, which are two buffer_heads +that point to directory blocks assigned to the directory inode. However, the +function calls ext4_handle_dirty_metadata with the inode of the file that's +being added to the directory, not the directory inode itself. Therefore, +correct the code to dirty the directory buffers with the directory inode, not +the file inode. + +Signed-off-by: Darrick J. 
Wong +Signed-off-by: "Theodore Ts'o" +Cc: stable@kernel.org +--- + fs/ext4/namei.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index f0abe43..a067835 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -1585,7 +1585,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, + dxtrace(dx_show_index("node", frames[1].entries)); + dxtrace(dx_show_index("node", + ((struct dx_node *) bh2->b_data)->entries)); +- err = ext4_handle_dirty_metadata(handle, inode, bh2); ++ err = ext4_handle_dirty_metadata(handle, dir, bh2); + if (err) + goto journal_error; + brelse (bh2); +@@ -1611,7 +1611,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, + if (err) + goto journal_error; + } +- ext4_handle_dirty_metadata(handle, inode, frames[0].bh); ++ err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh); ++ if (err) { ++ ext4_std_error(inode->i_sb, err); ++ goto cleanup; ++ } + } + de = do_split(handle, dir, &bh, frame, &hinfo, &err); + if (!de) +-- +2.1.0 + diff --git a/ldiskfs/kernel_patches/patches/rhel7/ext4-large-dir.patch b/ldiskfs/kernel_patches/patches/rhel7/ext4-large-dir.patch new file mode 100644 index 0000000..24ef03e --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel7/ext4-large-dir.patch @@ -0,0 +1,342 @@ +This INCOMPAT_LARGEDIR feature allows larger directories +to be created in ldiskfs, both with directory sizes over +2GB and a maximum htree depth of 3 instead of the +current limit of 2. These features are needed in order +to exceed the current limit of approximately 10M entries +in a single directory.
+ +Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h +=================================================================== +--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/ext4.h ++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h +@@ -1585,7 +1585,8 @@ static inline void ext4_clear_state_flag + EXT4_FEATURE_INCOMPAT_EA_INODE| \ + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_DIRDATA| \ +- EXT4_FEATURE_INCOMPAT_INLINE_DATA) ++ EXT4_FEATURE_INCOMPAT_INLINE_DATA| \ ++ EXT4_FEATURE_INCOMPAT_LARGEDIR) + #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ +@@ -1999,6 +2000,9 @@ struct mmpd_data { + # define NORET_TYPE /**/ + # define ATTRIB_NORET __attribute__((noreturn)) + # define NORET_AND noreturn, ++/* htree levels for ext4 */ ++#define EXT4_HTREE_LEVEL_COMPAT 2 ++#define EXT4_HTREE_LEVEL 3 + + struct ext4_xattr_ino_array { + unsigned int xia_count; /* # of used item in the array */ +@@ -2472,13 +2476,16 @@ static inline void ext4_r_blocks_count_s + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); + } + +-static inline loff_t ext4_isize(struct ext4_inode *raw_inode) ++static inline loff_t ext4_isize(struct super_block *sb, ++ struct ext4_inode *raw_inode) + { +- if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) ++ if (S_ISREG(le16_to_cpu(raw_inode->i_mode)) || ++ (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) && ++ S_ISDIR(le16_to_cpu(raw_inode->i_mode)))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); +- else +- return (loff_t) le32_to_cpu(raw_inode->i_size_lo); ++ ++ return (loff_t)le32_to_cpu(raw_inode->i_size_lo); + } + + static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c +=================================================================== +--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/namei.c 
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c +@@ -513,7 +513,14 @@ struct dx_root_info * dx_get_dx_info(str + + static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) + { +- return le32_to_cpu(entry->block) & 0x00ffffff; ++ return le32_to_cpu(entry->block) & 0x0fffffff; ++} ++ ++static inline int ++ext4_dir_htree_level(struct super_block *sb) ++{ ++ return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ? ++ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; + } + + static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) +@@ -681,7 +688,7 @@ dx_probe(const struct qstr *d_name, stru + struct dx_frame *frame = frame_in; + u32 hash; + +- frame->bh = NULL; ++ memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); + bh = ext4_read_dirblock(dir, 0, INDEX); + if (IS_ERR(bh)) { + *err = PTR_ERR(bh); +@@ -714,9 +721,16 @@ dx_probe(const struct qstr *d_name, stru + goto fail; + } + +- if ((indirect = info->indirect_levels) > 1) { +- ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", +- info->indirect_levels); ++ indirect = info->indirect_levels; ++ if (indirect >= ext4_dir_htree_level(dir->i_sb)) { ++ ext4_warning(dir->i_sb, ++ "inode #%lu: comm %s: htree depth %#06x exceed max depth %u", ++ dir->i_ino, current->comm, indirect, ++ ext4_dir_htree_level(dir->i_sb)); ++ if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { ++ ext4_warning(dir->i_sb, "Enable large directory " ++ "feature to access it"); ++ } + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; +@@ -812,13 +826,18 @@ fail: + static void dx_release (struct dx_frame *frames) + { + struct dx_root_info *info; ++ int i; ++ + if (frames[0].bh == NULL) + return; + + info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data); +- if (info->indirect_levels) +- brelse(frames[1].bh); +- brelse(frames[0].bh); ++ for (i = 0; i <= info->indirect_levels; i++) { ++ if (frames[i].bh == NULL) ++ break; ++ brelse(frames[i].bh); ++ frames[i].bh = NULL; ++ } 
+ } + + /* +@@ -960,7 +979,7 @@ int ext4_htree_fill_tree(struct file *di + { + struct dx_hash_info hinfo; + struct ext4_dir_entry_2 *de; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct inode *dir; + ext4_lblk_t block; + int count = 0; +@@ -1376,7 +1395,7 @@ static struct buffer_head * ext4_dx_find + { + struct super_block * sb = dir->i_sb; + struct dx_hash_info hinfo; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct buffer_head *bh; + ext4_lblk_t block; + int retval; +@@ -1832,7 +1851,7 @@ static int make_indexed_dir(handle_t *ha + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct buffer_head *bh2; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries; + struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de; + struct ext4_dir_entry_tail *t; +@@ -2117,15 +2136,18 @@ static int ext4_add_entry(handle_t *hand + static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) + { +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries, *at; + struct dx_hash_info hinfo; + struct buffer_head *bh; + struct inode *dir = dentry->d_parent->d_inode; + struct super_block *sb = dir->i_sb; + struct ext4_dir_entry_2 *de; ++ int restart; + int err; + ++again: ++ restart = 0; + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); + if (!frame) + return err; +@@ -2138,33 +2160,48 @@ static int ext4_dx_add_entry(handle_t *h + goto cleanup; + } + +- BUFFER_TRACE(bh, "get_write_access"); +- err = ext4_journal_get_write_access(handle, bh); +- if (err) +- goto journal_error; +- + err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (err != -ENOSPC) + goto cleanup; + ++ err = 0; + /* Block full, should compress but for now just split */ + dxtrace(printk(KERN_DEBUG "using %u of %u 
node entries\n", + dx_get_count(entries), dx_get_limit(entries))); + /* Need to split index? */ + if (dx_get_count(entries) == dx_get_limit(entries)) { + ext4_lblk_t newblock; +- unsigned icount = dx_get_count(entries); +- int levels = frame - frames; ++ int levels = frame - frames + 1; ++ unsigned icount; ++ int add_level = 1; + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; + +- if (levels && (dx_get_count(frames->entries) == +- dx_get_limit(frames->entries))) { +- ext4_warning(sb, "Directory index full!"); ++ while (frame > frames) { ++ if (dx_get_count((frame - 1)->entries) < ++ dx_get_limit((frame - 1)->entries)) { ++ add_level = 0; ++ break; ++ } ++ frame--; /* split higher index block */ ++ at = frame->at; ++ entries = frame->entries; ++ restart = 1; ++ } ++ if (add_level && levels == ext4_dir_htree_level(sb)) { ++ ext4_warning(sb, "inode %lu: comm %s: index %u: reach max htree level %u", ++ dir->i_ino, current->comm, levels, ++ ext4_dir_htree_level(sb)); ++ if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { ++ ext4_warning(sb, "Large directory feature is" ++ "not enabled on this " ++ "filesystem"); ++ } + err = -ENOSPC; + goto cleanup; + } ++ icount = dx_get_count(entries); + bh2 = ext4_append(handle, dir, &newblock); + if (IS_ERR(bh2)) { + err = PTR_ERR(bh2); +@@ -2179,7 +2216,7 @@ static int ext4_dx_add_entry(handle_t *h + err = ext4_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; +- if (levels) { ++ if (!add_level) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); + dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", +@@ -2187,7 +2224,7 @@ static int ext4_dx_add_entry(handle_t *h + + BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ + err = ext4_journal_get_write_access(handle, +- frames[0].bh); ++ (frame - 1)->bh); + if (err) + goto journal_error; + +@@ -2203,19 +2240,25 @@ static int ext4_dx_add_entry(handle_t *h + 
frame->entries = entries = entries2; + swap(frame->bh, bh2); + } +- dx_insert_block(frames + 0, hash2, newblock); +- dxtrace(dx_show_index("node", frames[1].entries)); ++ dx_insert_block(frame - 1, hash2, newblock); ++ dxtrace(dx_show_index("node", frame->entries)); + dxtrace(dx_show_index("node", +- ((struct dx_node *) bh2->b_data)->entries)); ++ ((struct dx_node *)bh2->b_data)->entries)); + err = ext4_handle_dirty_dx_node(handle, dir, bh2); + if (err) + goto journal_error; + brelse (bh2); ++ ext4_handle_dirty_dirent_node(handle, dir, ++ (frame - 1)->bh); ++ if (restart) { ++ ext4_handle_dirty_dirent_node(handle, dir, ++ frame->bh); ++ goto cleanup; ++ } + } else { + struct dx_root_info * info; +- dxtrace(printk(KERN_DEBUG +- "Creating second level index...\n")); +- memcpy((char *) entries2, (char *) entries, ++ ++ memcpy((char *)entries2, (char *)entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); + +@@ -2224,21 +2267,14 @@ static int ext4_dx_add_entry(handle_t *h + dx_set_block(entries + 0, newblock); + info = dx_get_dx_info((struct ext4_dir_entry_2*) + frames[0].bh->b_data); +- info->indirect_levels = 1; +- +- /* Add new access path frame */ +- frame = frames + 1; +- frame->at = at = at - entries + entries2; +- frame->entries = entries = entries2; +- frame->bh = bh2; +- err = ext4_journal_get_write_access(handle, +- frame->bh); +- if (err) +- goto journal_error; +- } +- err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh); +- if (err) { +- ext4_std_error(inode->i_sb, err); ++ info->indirect_levels += 1; ++ dxtrace(printk(KERN_DEBUG ++ "Creating %d level index...\n", ++ info->indirect_levels)); ++ ext4_handle_dirty_dirent_node(handle, dir, frame->bh); ++ ext4_handle_dirty_dirent_node(handle, dir, bh2); ++ brelse(bh2); ++ restart = 1; + goto cleanup; + } + } +@@ -2253,6 +2289,10 @@ journal_error: + cleanup: + brelse(bh); + dx_release(frames); ++ /* @restart is true means htree-path has been changed, we need to ++ * 
repeat dx_probe() to find out valid htree-path */ ++ if (restart && err == 0) ++ goto again; + return err; + } + +Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/inode.c +=================================================================== +--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/inode.c ++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/inode.c +@@ -4056,7 +4056,7 @@ struct inode *ext4_iget(struct super_blo + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) + ei->i_file_acl |= + ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; +- inode->i_size = ext4_isize(raw_inode); ++ inode->i_size = ext4_isize(sb, raw_inode); + ei->i_disksize = inode->i_size; + #ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +@@ -4306,7 +4306,7 @@ static int ext4_do_update_inode(handle_t + raw_inode->i_file_acl_high = + cpu_to_le16(ei->i_file_acl >> 32); + raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); +- if (ei->i_disksize != ext4_isize(raw_inode)) { ++ if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) { + ext4_isize_set(raw_inode, ei->i_disksize); + need_datasync = 1; + } diff --git a/ldiskfs/kernel_patches/patches/rhel7/ext4-osd-iop-common.patch b/ldiskfs/kernel_patches/patches/rhel7/ext4-osd-iop-common.patch index d00452f..a351b61 100644 --- a/ldiskfs/kernel_patches/patches/rhel7/ext4-osd-iop-common.patch +++ b/ldiskfs/kernel_patches/patches/rhel7/ext4-osd-iop-common.patch @@ -2,21 +2,15 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h =================================================================== --- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/ext4.h +++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h -@@ -2145,6 +2145,19 @@ extern int ext4_orphan_add(handle_t *, s +@@ -2145,6 +2145,13 @@ extern int ext4_orphan_add(handle_t *, s extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); +extern struct inode 
*ext4_create_inode(handle_t *handle, + struct inode * dir, int mode); -+extern int ext4_add_entry(handle_t *handle, struct dentry *dentry, -+ struct inode *inode); +extern int ext4_delete_entry(handle_t *handle, struct inode * dir, + struct ext4_dir_entry_2 * de_del, + struct buffer_head * bh); -+extern struct buffer_head * ext4_find_entry(struct inode *dir, -+ const struct qstr *d_name, -+ struct ext4_dir_entry_2 ** res_dir, -+ int *inlined); +extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, + struct inode *inode); extern int search_dir(struct buffer_head *bh, @@ -26,42 +20,6 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c =================================================================== --- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/namei.c +++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c -@@ -1211,7 +1211,7 @@ static int is_dx_internal_node(struct in - * The returned buffer_head has ->b_count elevated. The caller is expected - * to brelse() it when appropriate. - */ --static struct buffer_head * ext4_find_entry (struct inode *dir, -+struct buffer_head * ext4_find_entry(struct inode *dir, - const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, - int *inlined) -@@ -1355,6 +1355,7 @@ cleanup_and_exit: - brelse(bh_use[ra_ptr]); - return ret; - } -+EXPORT_SYMBOL(ext4_find_entry); - - static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, int *err) -@@ -1903,8 +1904,8 @@ static int make_indexed_dir(handle_t *ha - * may not sleep between calling this and putting something into - * the entry, as someone else might have used it while you slept. 
- */ --static int ext4_add_entry(handle_t *handle, struct dentry *dentry, -- struct inode *inode) -+int ext4_add_entry(handle_t *handle, struct dentry *dentry, -+ struct inode *inode) - { - struct inode *dir = dentry->d_parent->d_inode; - struct buffer_head *bh; -@@ -1979,6 +1980,7 @@ static int ext4_add_entry(handle_t *hand - ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); - return retval; - } -+EXPORT_SYMBOL(ext4_add_entry); - - /* - * Returns 0 for success, or a negative error value @@ -2165,7 +2167,7 @@ int ext4_generic_delete_entry(handle_t * return -ENOENT; } diff --git a/ldiskfs/kernel_patches/patches/rhel7/ext4-pdirop.patch b/ldiskfs/kernel_patches/patches/rhel7/ext4-pdirop.patch index 7d613aa..3e1b56a 100644 --- a/ldiskfs/kernel_patches/patches/rhel7/ext4-pdirop.patch +++ b/ldiskfs/kernel_patches/patches/rhel7/ext4-pdirop.patch @@ -12,13 +12,12 @@ threads to simultaneously lookup, create and unlink in parallel. This patch contains: - pdirops support for ldiskfs - - N-level htree directory - integrate with osd-ldiskfs -Index: linux-3.10.0-123.13.2.el7.x86_64/include/linux/htree_lock.h +Index: linux-3.10.0-229.1.2.fc21.x86_64/include/linux/htree_lock.h =================================================================== --- /dev/null -+++ linux-3.10.0-123.13.2.el7.x86_64/include/linux/htree_lock.h ++++ linux-3.10.0-229.1.2.fc21.x86_64/include/linux/htree_lock.h @@ -0,0 +1,187 @@ +/* + * include/linux/htree_lock.h @@ -207,10 +206,10 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/include/linux/htree_lock.h + ((lck)->lk_nodes[dep].ln_mode == HTREE_LOCK_NL) + +#endif -Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/htree_lock.c +Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/htree_lock.c =================================================================== --- /dev/null -+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/htree_lock.c ++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/htree_lock.c @@ -0,0 +1,880 @@ +/* + * fs/ext4/htree_lock.c @@ -468,7 +467,7 @@ Index: 
linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/htree_lock.c + htree_lock_mode_t mode, u32 key, unsigned dep, + int wait, void *event) +{ -+ LIST_HEAD (list); ++ LIST_HEAD(list); + struct htree_lock *tmp; + struct htree_lock *tmp2; + u16 major; @@ -1092,10 +1091,22 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/htree_lock.c + kfree(lck); +} +EXPORT_SYMBOL(htree_lock_free); -Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h +Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/Makefile =================================================================== ---- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/ext4.h -+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h +--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/Makefile ++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/Makefile +@@ -6,6 +6,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o + + ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ++ htree_lock.o \ + ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ + mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ + xattr_trusted.o inline.o +Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h +=================================================================== +--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/ext4.h ++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h @@ -27,6 +27,7 @@ #include #include @@ -1104,7 +1115,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h #include #include #include -@@ -810,6 +811,9 @@ struct ext4_inode_info { +@@ -821,6 +822,9 @@ struct ext4_inode_info { __u32 i_dtime; ext4_fsblk_t i_file_acl; @@ -1114,29 +1125,10 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h /* * i_block_group is the number of the block group which contains * this file's inode. 
Constant across the lifetime of the inode, -@@ -1536,6 +1540,7 @@ static inline void ext4_clear_state_flag - EXT4_FEATURE_INCOMPAT_META_BG| \ - EXT4_FEATURE_INCOMPAT_EXTENTS| \ - EXT4_FEATURE_INCOMPAT_64BIT| \ -+ EXT4_FEATURE_INCOMPAT_LARGEDIR|\ - EXT4_FEATURE_INCOMPAT_FLEX_BG| \ - EXT4_FEATURE_INCOMPAT_EA_INODE| \ - EXT4_FEATURE_INCOMPAT_MMP | \ -@@ -1954,6 +1959,76 @@ struct mmpd_data { - # define NORET_TYPE /**/ - # define ATTRIB_NORET __attribute__((noreturn)) - # define NORET_AND noreturn, -+/* htree levels for ext4 */ -+#define EXT4_HTREE_LEVEL_COMPAT 2 -+#define EXT4_HTREE_LEVEL 3 -+ -+static inline int -+ext4_dir_htree_level(struct super_block *sb) -+{ -+ return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ? -+ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; -+} -+ +@@ -1846,6 +1850,71 @@ struct dx_hash_info + */ + #define HASH_NB_ALWAYS 1 + +/* assume name-hash is protected by upper layer */ +#define EXT4_HTREE_LOCK_HASH 0 + @@ -1196,10 +1188,16 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h + struct inode *dir, unsigned flags); +#define ext4_htree_unlock(lck) htree_unlock(lck) + ++extern struct buffer_head *__ext4_find_entry(struct inode *dir, ++ const struct qstr *d_name, ++ struct ext4_dir_entry_2 **res_dir, ++ int *inlined, struct htree_lock *lck); ++extern int __ext4_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct htree_lock *lck); - struct ext4_xattr_ino_array { - unsigned int xia_count; /* # of used item in the array */ -@@ -2050,9 +2125,17 @@ void ext4_insert_dentry(struct inode *in + /* + * Describe an inode's exact location on disk and in memory +@@ -2088,9 +2157,17 @@ void ext4_insert_dentry(struct inode *in const char *name, int namelen, void *data); static inline void ext4_update_dx_flag(struct inode *inode) { @@ -1217,47 +1215,10 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h } static unsigned char ext4_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, 
DT_FIFO, DT_SOCK, DT_LNK -@@ -2212,14 +2295,14 @@ extern int ext4_htree_fill_tree(struct f - extern struct inode *ext4_create_inode(handle_t *handle, - struct inode * dir, int mode); - extern int ext4_add_entry(handle_t *handle, struct dentry *dentry, -- struct inode *inode); -+ struct inode *inode, struct htree_lock *lck); - extern int ext4_delete_entry(handle_t *handle, struct inode * dir, - struct ext4_dir_entry_2 * de_del, - struct buffer_head * bh); - extern struct buffer_head * ext4_find_entry(struct inode *dir, - const struct qstr *d_name, - struct ext4_dir_entry_2 ** res_dir, -- int *inlined); -+ int *inlined, struct htree_lock *lck); - extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, - struct inode *inode, const void *, const void *); - extern int search_dir(struct buffer_head *bh, -@@ -2382,13 +2465,15 @@ static inline void ext4_r_blocks_count_s - es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); - } - --static inline loff_t ext4_isize(struct ext4_inode *raw_inode) -+static inline loff_t ext4_isize(struct super_block *sb, -+ struct ext4_inode *raw_inode) - { -- if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) -+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) || -+ S_ISREG(le16_to_cpu(raw_inode->i_mode))) - return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | - le32_to_cpu(raw_inode->i_size_lo); -- else -- return (loff_t) le32_to_cpu(raw_inode->i_size_lo); -+ -+ return (loff_t) le32_to_cpu(raw_inode->i_size_lo); - } - - static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) -Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c +Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c =================================================================== ---- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/namei.c -+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c +--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/namei.c ++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c @@ -53,6 +53,7 @@ 
struct buffer_head *ext4_append(handle_t ext4_lblk_t *block) { @@ -1284,7 +1245,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c + } inode->i_size += inode->i_sb->s_blocksize; EXT4_I(inode)->i_disksize = inode->i_size; - BUFFER_TRACE(bh, "get_write_access"); + BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); + up(&ei->i_append_sem); if (err) { @@ -1316,16 +1277,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c /* checksumming functions */ void initialize_dirent_tail(struct ext4_dir_entry_tail *t, -@@ -517,7 +525,7 @@ struct dx_root_info * dx_get_dx_info(str - - static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) - { -- return le32_to_cpu(entry->block) & 0x00ffffff; -+ return le32_to_cpu(entry->block) & 0x0fffffff; - } - - static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) -@@ -667,6 +675,223 @@ struct stats dx_show_entries(struct dx_h +@@ -668,6 +676,227 @@ struct stats dx_show_entries(struct dx_h } #endif /* DX_DEBUG */ @@ -1338,6 +1290,10 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c +}; + +#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private) ++#define ext4_find_entry(dir, name, dirent, inline) \ ++ __ext4_find_entry(dir, name, dirent, inline, NULL) ++#define ext4_add_entry(handle, dentry, inode) \ ++ __ext4_add_entry(handle, dentry, inode, NULL) + +/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */ +#define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32) @@ -1549,7 +1505,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c /* * Probe for a directory leaf block to search. 
* -@@ -678,16 +903,17 @@ struct stats dx_show_entries(struct dx_h +@@ -679,10 +908,11 @@ struct stats dx_show_entries(struct dx_h */ static struct dx_frame * dx_probe(const struct qstr *d_name, struct inode *dir, @@ -1563,34 +1519,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c struct dx_root_info * info; struct buffer_head *bh; struct dx_frame *frame = frame_in; - u32 hash; - -- frame->bh = NULL; -+ memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); - bh = ext4_read_dirblock(dir, 0, INDEX); - if (IS_ERR(bh)) { - *err = PTR_ERR(bh); -@@ -720,9 +946,16 @@ dx_probe(const struct qstr *d_name, stru - goto fail; - } - -- if ((indirect = info->indirect_levels) > 1) { -- ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", -- info->indirect_levels); -+ indirect = info->indirect_levels; -+ if (indirect >= ext4_dir_htree_level(dir->i_sb)) { -+ ext4_warning(dir->i_sb, -+ "Directory (ino: %lu) htree depth %#06x exceed " -+ "supported value", dir->i_ino, -+ ext4_dir_htree_level(dir->i_sb)); -+ if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { -+ ext4_warning(dir->i_sb, "Enable large directory " -+ "feature to access it"); -+ } - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail; -@@ -742,8 +975,15 @@ dx_probe(const struct qstr *d_name, stru +@@ -750,8 +980,15 @@ dx_probe(const struct qstr *d_name, stru dxtrace(printk("Look up %x", hash)); while (1) { @@ -1607,7 +1536,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c ext4_warning(dir->i_sb, "dx entry: no count or count > limit"); brelse(bh); -@@ -784,7 +1024,70 @@ dx_probe(const struct qstr *d_name, stru +@@ -792,7 +1029,70 @@ dx_probe(const struct qstr *d_name, stru frame->bh = bh; frame->entries = entries; frame->at = at; @@ -1679,29 +1608,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); if (IS_ERR(bh)) { *err = PTR_ERR(bh); -@@ -818,13 +1121,18 @@ fail: - static void dx_release (struct dx_frame *frames) - { 
- struct dx_root_info *info; -+ int i; -+ - if (frames[0].bh == NULL) - return; - - info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data); -- if (info->indirect_levels) -- brelse(frames[1].bh); -- brelse(frames[0].bh); -+ for (i = 0; i <= info->indirect_levels; i++) { -+ if (frames[i].bh == NULL) -+ break; -+ brelse(frames[i].bh); -+ frames[i].bh = NULL; -+ } - } - - /* -@@ -847,7 +1155,7 @@ static void dx_release (struct dx_frame +@@ -860,7 +1160,7 @@ static void dx_release (struct dx_frame static int ext4_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frame, struct dx_frame *frames, @@ -1710,7 +1617,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c { struct dx_frame *p; struct buffer_head *bh; -@@ -862,12 +1170,22 @@ static int ext4_htree_next_block(struct +@@ -875,12 +1175,22 @@ static int ext4_htree_next_block(struct * this loop, num_frames indicates the number of interior * nodes need to be read. */ @@ -1735,7 +1642,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c p--; } -@@ -890,6 +1208,13 @@ static int ext4_htree_next_block(struct +@@ -903,6 +1213,13 @@ static int ext4_htree_next_block(struct * block so no check is necessary */ while (num_frames--) { @@ -1749,7 +1656,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX); if (IS_ERR(bh)) return PTR_ERR(bh); -@@ -898,6 +1223,7 @@ static int ext4_htree_next_block(struct +@@ -911,6 +1228,7 @@ static int ext4_htree_next_block(struct p->bh = bh; p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; } @@ -1757,16 +1664,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c return 1; } -@@ -966,7 +1292,7 @@ int ext4_htree_fill_tree(struct file *di - { - struct dx_hash_info hinfo; - struct ext4_dir_entry_2 *de; -- struct dx_frame frames[2], *frame; -+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; - struct inode *dir; - ext4_lblk_t block; - int count = 0; -@@ -1000,10 +1326,10 @@ 
int ext4_htree_fill_tree(struct file *di +@@ -1013,10 +1331,10 @@ int ext4_htree_fill_tree(struct file *di } hinfo.hash = start_hash; hinfo.minor_hash = 0; @@ -1779,7 +1677,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c /* Add '.' and '..' from the htree header */ if (!start_hash && !start_minor_hash) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; -@@ -1030,7 +1356,7 @@ int ext4_htree_fill_tree(struct file *di +@@ -1043,7 +1361,7 @@ int ext4_htree_fill_tree(struct file *di count += ret; hashval = ~0; ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, @@ -1788,8 +1686,12 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c *next_hash = hashval; if (ret < 0) { err = ret; -@@ -1226,7 +1552,7 @@ static int is_dx_internal_node(struct in - struct buffer_head * ext4_find_entry(struct inode *dir, +@@ -1236,10 +1554,10 @@ static int is_dx_internal_node(struct in + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ +-static struct buffer_head * ext4_find_entry (struct inode *dir, ++struct buffer_head *__ext4_find_entry(struct inode *dir, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, - int *inlined) @@ -1797,7 +1699,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c { struct super_block *sb; struct buffer_head *bh_use[NAMEI_RA_SIZE]; -@@ -1270,7 +1596,7 @@ struct buffer_head * ext4_find_entry(str +@@ -1283,7 +1601,7 @@ static struct buffer_head * ext4_find_en goto restart; } if (is_dx(dir)) { @@ -1806,7 +1708,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c /* * On success, or if the error was file not found, * return. 
Otherwise, fall back to doing a search the -@@ -1280,6 +1606,7 @@ struct buffer_head * ext4_find_entry(str +@@ -1297,6 +1615,7 @@ static struct buffer_head * ext4_find_en return bh; dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " "falling back\n")); @@ -1814,22 +1716,22 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c } nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); start = EXT4_I(dir)->i_dir_start_lookup; -@@ -1369,17 +1696,19 @@ cleanup_and_exit: +@@ -1389,9 +1708,12 @@ cleanup_and_exit: + brelse(bh_use[ra_ptr]); + return ret; } - EXPORT_SYMBOL(ext4_find_entry); ++EXPORT_SYMBOL(__ext4_find_entry); -static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, int *err) -+static struct buffer_head * ext4_dx_find_entry(struct inode *dir, ++static struct buffer_head *ext4_dx_find_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + struct htree_lock *lck, int *err) { struct super_block * sb = dir->i_sb; struct dx_hash_info hinfo; -- struct dx_frame frames[2], *frame; -+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; - struct buffer_head *bh; +@@ -1400,7 +1722,7 @@ static struct buffer_head * ext4_dx_find ext4_lblk_t block; int retval; @@ -1838,7 +1740,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c return NULL; do { block = dx_get_block(frame->at); -@@ -1403,7 +1732,7 @@ static struct buffer_head * ext4_dx_find +@@ -1424,7 +1746,7 @@ static struct buffer_head * ext4_dx_find /* Check to see if we should continue to search */ retval = ext4_htree_next_block(dir, hinfo.hash, frame, @@ -1847,25 +1749,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c if (retval < 0) { ext4_warning(sb, "error reading index page in directory #%lu", -@@ -1429,7 +1758,7 @@ static struct dentry *ext4_lookup(struct - if (dentry->d_name.len > EXT4_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); - -- bh = ext4_find_entry(dir, &dentry->d_name, 
&de, NULL); -+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL, NULL); - if (IS_ERR(bh)) - return (struct dentry *) bh; - inode = NULL; -@@ -1489,7 +1818,7 @@ struct dentry *ext4_get_parent(struct de - struct ext4_dir_entry_2 * de; - struct buffer_head *bh; - -- bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); -+ bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL, NULL); - if (IS_ERR(bh)) - return (struct dentry *) bh; - if (!bh) -@@ -1559,8 +1888,9 @@ static struct ext4_dir_entry_2* dx_pack_ +@@ -1583,8 +1905,9 @@ static struct ext4_dir_entry_2* dx_pack_ * Returns pointer to de in block into which the new entry will be inserted. */ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, @@ -1877,7 +1761,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c { unsigned blocksize = dir->i_sb->s_blocksize; unsigned count, continued; -@@ -1624,7 +1954,14 @@ static struct ext4_dir_entry_2 *do_split +@@ -1647,7 +1970,14 @@ static struct ext4_dir_entry_2 *do_split hash2, split, count-split)); /* Fancy dance to stay within two buffers */ @@ -1893,7 +1777,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c de = dx_pack_dirents(data1, blocksize); de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - (char *) de, -@@ -1643,13 +1980,21 @@ static struct ext4_dir_entry_2 *do_split +@@ -1666,13 +1996,21 @@ static struct ext4_dir_entry_2 *do_split dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); @@ -1921,43 +1805,27 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c err = ext4_handle_dirty_dirent_node(handle, dir, bh2); if (err) goto journal_error; -@@ -1809,7 +2154,7 @@ static int add_dirent_to_buf(handle_t *h - */ - dir->i_mtime = dir->i_ctime = ext4_current_time(dir); - ext4_update_dx_flag(dir); -- dir->i_version++; -+ inode_inc_iversion(dir); - ext4_mark_inode_dirty(handle, dir); - 
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_dirent_node(handle, dir, bh); -@@ -1829,7 +2174,7 @@ static int make_indexed_dir(handle_t *ha - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; - struct buffer_head *bh2; -- struct dx_frame frames[2], *frame; -+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; - struct dx_entry *entries; - struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de; - struct ext4_dir_entry_tail *t; -@@ -1923,7 +2268,7 @@ static int make_indexed_dir(handle_t *ha +@@ -1945,7 +2283,7 @@ static int make_indexed_dir(handle_t *ha ext4_handle_dirty_dx_node(handle, dir, frame->bh); ext4_handle_dirty_dirent_node(handle, dir, bh); - de = do_split(handle,dir, &bh, frame, &hinfo, &retval); -+ de = do_split(handle,dir, &bh, frames, frame, &hinfo, NULL, &retval); ++ de = do_split(handle, dir, &bh, frames, frame, &hinfo, NULL, &retval); if (!de) { /* * Even if the block split failed, we have to properly write -@@ -2030,7 +2375,7 @@ out: +@@ -2051,8 +2389,8 @@ out: + * may not sleep between calling this and putting something into * the entry, as someone else might have used it while you slept. 
*/ - int ext4_add_entry(handle_t *handle, struct dentry *dentry, -- struct inode *inode) -+ struct inode *inode, struct htree_lock *lck) +-static int ext4_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) ++int __ext4_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct htree_lock *lck) { struct inode *dir = dentry->d_parent->d_inode; struct buffer_head *bh; -@@ -2066,9 +2411,10 @@ int ext4_add_entry(handle_t *handle, str +@@ -2087,9 +2425,10 @@ static int ext4_add_entry(handle_t *hand if (dentry->d_name.len == 2 && memcmp(dentry->d_name.name, "..", 2) == 0) return ext4_update_dotdot(handle, dentry, inode); @@ -1969,169 +1837,49 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); dx_fallback++; ext4_mark_inode_dirty(handle, dir); -@@ -2114,18 +2460,21 @@ EXPORT_SYMBOL(ext4_add_entry); +@@ -2129,12 +2468,13 @@ static int ext4_add_entry(handle_t *hand + ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); + return retval; + } ++EXPORT_SYMBOL(__ext4_add_entry); + + /* * Returns 0 for success, or a negative error value */ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode) + struct inode *inode, struct htree_lock *lck) { -- struct dx_frame frames[2], *frame; -+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries, *at; - struct dx_hash_info hinfo; - struct buffer_head *bh; - struct inode *dir = dentry->d_parent->d_inode; - struct super_block *sb = dir->i_sb; - struct ext4_dir_entry_2 *de; -+ int restart; - int err; +@@ -2148,7 +2488,7 @@ static int ext4_dx_add_entry(handle_t *h + again: + restart = 0; - frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); -+again: -+ restart = 0; + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, lck, &err); if (!frame) return err; entries = frame->entries; -@@ -2137,33 +2486,53 @@ static int 
ext4_dx_add_entry(handle_t *h - goto cleanup; - } - -- BUFFER_TRACE(bh, "get_write_access"); -- err = ext4_journal_get_write_access(handle, bh); -- if (err) -- goto journal_error; -- - err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (err != -ENOSPC) - goto cleanup; - -+ err = 0; - /* Block full, should compress but for now just split */ - dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", - dx_get_count(entries), dx_get_limit(entries))); - /* Need to split index? */ - if (dx_get_count(entries) == dx_get_limit(entries)) { - ext4_lblk_t newblock; -- unsigned icount = dx_get_count(entries); -- int levels = frame - frames; -+ int levels = frame - frames + 1; -+ unsigned icount; -+ int add_level = 1; - struct dx_entry *entries2; +@@ -2178,6 +2518,11 @@ again: struct dx_node *node2; struct buffer_head *bh2; -- if (levels && (dx_get_count(frames->entries) == -- dx_get_limit(frames->entries))) { -- ext4_warning(sb, "Directory index full!"); + if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */ + ext4_htree_safe_relock(lck); + restart = 1; + goto cleanup; + } -+ while (frame > frames) { -+ if (dx_get_count((frame - 1)->entries) < -+ dx_get_limit((frame - 1)->entries)) { -+ add_level = 0; -+ break; -+ } -+ frame--; /* split higher index block */ -+ at = frame->at; -+ entries = frame->entries; -+ restart = 1; -+ } -+ if (add_level && levels == ext4_dir_htree_level(sb)) { -+ ext4_warning(sb, "Directory (ino: %lu) index full, " -+ "reach max htree level :%d", -+ dir->i_ino, levels); -+ if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { -+ ext4_warning(sb, "Large directory feature is" -+ "not enabled on this " -+ "filesystem"); -+ } - err = -ENOSPC; + while (frame > frames) { + if (dx_get_count((frame - 1)->entries) < + dx_get_limit((frame - 1)->entries)) { +@@ -2277,16 +2622,43 @@ again: + restart = 1; goto cleanup; } -+ icount = dx_get_count(entries); - bh2 = ext4_append(handle, dir, &newblock); - if (IS_ERR(bh2)) { - err = PTR_ERR(bh2); -@@ 
-2178,7 +2547,7 @@ static int ext4_dx_add_entry(handle_t *h - err = ext4_journal_get_write_access(handle, frame->bh); - if (err) - goto journal_error; -- if (levels) { -+ if (!add_level) { - unsigned icount1 = icount/2, icount2 = icount - icount1; - unsigned hash2 = dx_get_hash(entries + icount1); - dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", -@@ -2186,7 +2555,7 @@ static int ext4_dx_add_entry(handle_t *h - - BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ - err = ext4_journal_get_write_access(handle, -- frames[0].bh); -+ (frame - 1)->bh); - if (err) - goto journal_error; - -@@ -2202,18 +2571,24 @@ static int ext4_dx_add_entry(handle_t *h - frame->entries = entries = entries2; - swap(frame->bh, bh2); - } -- dx_insert_block(frames + 0, hash2, newblock); -- dxtrace(dx_show_index("node", frames[1].entries)); -+ dx_insert_block((frame - 1), hash2, newblock); -+ dxtrace(dx_show_index("node", frame->entries)); - dxtrace(dx_show_index("node", - ((struct dx_node *) bh2->b_data)->entries)); - err = ext4_handle_dirty_dx_node(handle, dir, bh2); - if (err) - goto journal_error; - brelse (bh2); -+ ext4_handle_dirty_metadata(handle, inode, -+ (frame - 1)->bh); -+ if (restart) { -+ ext4_handle_dirty_metadata(handle, inode, -+ frame->bh); -+ goto cleanup; -+ } - } else { - struct dx_root_info * info; -- dxtrace(printk(KERN_DEBUG -- "Creating second level index...\n")); -+ - memcpy((char *) entries2, (char *) entries, - icount * sizeof(struct dx_entry)); - dx_set_limit(entries2, dx_node_limit(dir)); -@@ -2223,35 +2598,63 @@ static int ext4_dx_add_entry(handle_t *h - dx_set_block(entries + 0, newblock); - info = dx_get_dx_info((struct ext4_dir_entry_2*) - frames[0].bh->b_data); -- info->indirect_levels = 1; -+ info->indirect_levels += 1; -+ dxtrace(printk(KERN_DEBUG -+ "Creating %d level index...\n", -+ info->indirect_levels)); -+ ext4_handle_dirty_metadata(handle, inode, frame->bh); -+ ext4_handle_dirty_metadata(handle, inode, bh2); -+ brelse(bh2); -+ restart = 
1; -+ goto cleanup; -+ } + } else if (!ext4_htree_dx_locked(lck)) { + struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck); - -- /* Add new access path frame */ -- frame = frames + 1; -- frame->at = at = at - entries + entries2; -- frame->entries = entries = entries2; -- frame->bh = bh2; -- err = ext4_journal_get_write_access(handle, -- frame->bh); -- if (err) -- goto journal_error; ++ + /* not well protected, require DX lock */ + ext4_htree_dx_need_lock(lck); + at = frame > frames ? (frame - 1)->at : NULL; @@ -2150,14 +1898,9 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c + (ld->ld_count != dx_get_count(entries))) { + restart = 1; + goto cleanup; - } -- err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh); ++ } + /* OK, I've got DX lock and nothing changed */ + frame->at = ld->ld_at; - if (err) { - ext4_std_error(inode->i_sb, err); - goto cleanup; - } } - de = do_split(handle, dir, &bh, frame, &hinfo, &err); + de = do_split(handle, dir, &bh, frames, frame, &hinfo, lck, &err); @@ -2174,164 +1917,15 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c + ext4_htree_de_unlock(lck); brelse(bh); dx_release(frames); -+ /* @restart is true means htree-path has been changed, we need to -+ * repeat dx_probe() to find out valid htree-path */ -+ if (restart && err == 0) -+ goto again; - return err; - } - -@@ -2288,7 +2691,7 @@ int ext4_generic_delete_entry(handle_t * - blocksize); - else - de->inode = 0; -- dir->i_version++; -+ inode_inc_iversion(dir); - return 0; - } - i += ext4_rec_len_from_disk(de->rec_len, blocksize); -@@ -2373,7 +2776,7 @@ EXPORT_SYMBOL(ext4_dec_count); - static int ext4_add_nondir(handle_t *handle, - struct dentry *dentry, struct inode *inode) - { -- int err = ext4_add_entry(handle, dentry, inode); -+ int err = ext4_add_entry(handle, dentry, inode, NULL); - if (!err) { - ext4_mark_inode_dirty(handle, inode); - unlock_new_inode(inode); -@@ -2641,7 +3044,7 @@ retry: - goto out_clear_inode; - err = ext4_mark_inode_dirty(handle, 
inode); - if (!err) -- err = ext4_add_entry(handle, dentry, inode); -+ err = ext4_add_entry(handle, dentry, inode, NULL); - if (err) { - out_clear_inode: - clear_nlink(inode); -@@ -2907,7 +3310,7 @@ static int ext4_rmdir(struct inode *dir, - dquot_initialize(dentry->d_inode); - - retval = -ENOENT; -- bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); -+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL, NULL); - if (IS_ERR(bh)) - return PTR_ERR(bh); - if (!bh) -@@ -2974,7 +3377,7 @@ static int ext4_unlink(struct inode *dir - dquot_initialize(dentry->d_inode); - - retval = -ENOENT; -- bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); -+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL, NULL); - if (IS_ERR(bh)) - return PTR_ERR(bh); - if (!bh) -@@ -3153,7 +3556,7 @@ retry: - ext4_inc_count(handle, inode); - ihold(inode); - -- err = ext4_add_entry(handle, dentry, inode); -+ err = ext4_add_entry(handle, dentry, inode, NULL); - if (!err) { - ext4_mark_inode_dirty(handle, inode); - d_instantiate(dentry, inode); -@@ -3183,7 +3556,7 @@ retry: - struct buffer_head *bh; - struct ext4_dir_entry_2 *de; - -- bh = ext4_find_entry(dir, d_name, &de, NULL); -+ bh = ext4_find_entry(dir, d_name, &de, NULL, NULL); - if (IS_ERR(bh)) - return PTR_ERR(bh); - if (bh) { -@@ -3230,7 +3633,7 @@ static int ext4_rename(struct inode *old - if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) - ext4_handle_sync(handle); - -- old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); -+ old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL, NULL); - if (IS_ERR(old.bh)) - return PTR_ERR(old.bh); - /* -@@ -3244,7 +3647,7 @@ static int ext4_rename(struct inode *old - - new_inode = new_dentry->d_inode; - new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, -- &new.de, &new.inlined); -+ &new.de, &new.inlined, NULL); - if (IS_ERR(new.bh)) { - if (!new_inode) { - brelse(new_bh); -@@ -3275,7 +3678,7 @@ static int ext4_rename(struct inode *old - goto end_rename; - } 
- if (!new.bh) { -- retval = ext4_add_entry(handle, new.dentry, old.inode); -+ retval = ext4_add_entry(handle, new.dentry, old.inode, NULL); - if (retval) - goto end_rename; - } else { -@@ -3375,7 +3678,7 @@ static int ext4_rename(struct inode *old - dquot_initialize(new.dir); - - old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, -- &old.de, &old.inlined); -+ &old.de, &old.inlined, NULL); - /* - * Check for inode number is _not_ due to possible IO errors. - * We might rmdir the source, keep it as pwd of some process -@@ -3475,7 +3678,7 @@ static int ext4_rename(struct inode *old - goto end_rename; - - new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, -- &new.de, &new.inlined); -+ &new.de, &new.inlined, NULL); - - /* RENAME_EXCHANGE case: old *and* new must both exist */ - if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino) -Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/inode.c -=================================================================== ---- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/inode.c -+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/inode.c -@@ -4264,7 +4264,7 @@ struct inode *ext4_iget(struct super_blo - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) - ei->i_file_acl |= - ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; -- inode->i_size = ext4_isize(raw_inode); -+ inode->i_size = ext4_isize(sb, raw_inode); - ei->i_disksize = inode->i_size; - #ifdef CONFIG_QUOTA - ei->i_reserved_quota = 0; -@@ -4499,7 +4499,7 @@ static int ext4_do_update_inode(handle_t - raw_inode->i_file_acl_high = - cpu_to_le16(ei->i_file_acl >> 32); - raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); -- if (ei->i_disksize != ext4_isize(raw_inode)) { -+ if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) { - ext4_isize_set(raw_inode, ei->i_disksize); - need_datasync = 1; - } -Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/Makefile -=================================================================== ---- 
linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/Makefile -+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/Makefile -@@ -8,7 +8,7 @@ ext4-y := balloc.o bitmap.o dir.o file.o - ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ - ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ -- xattr_trusted.o inline.o -+ xattr_trusted.o inline.o htree_lock.o - - ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o - ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o -Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/super.c + /* @restart is true means htree-path has been changed, we need to +Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/super.c =================================================================== ---- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/super.c -+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/super.c -@@ -872,6 +872,7 @@ static struct inode *ext4_alloc_inode(st +--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/super.c ++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/super.c +@@ -875,6 +875,7 @@ static struct inode *ext4_alloc_inode(st ei->vfs_inode.i_version = 1; - spin_lock_init(&ei->i_raw_lock); + spin_lock_init(&ei->i_raw_lock); + sema_init(&ei->i_append_sem, 1); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-large-dir.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-large-dir.patch new file mode 100644 index 0000000..3ef66c1 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-large-dir.patch @@ -0,0 +1,364 @@ +This INCOMPAT_LARGEDIR feature allows larger directories +to be created in ldiskfs, both with directory sizes over +2GB and a maximum htree depth of 3 instead of the +current limit of 2. These features are needed in order +to exceed the current limit of approximately 10M entries +in a single directory.
+ +Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/ext4.h ++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4.h +@@ -1344,6 +1344,7 @@ EXT4_INODE_BIT_FNS(state, state_flags) + #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 + #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 + #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ ++#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 + + #define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ +@@ -1354,7 +1355,8 @@ EXT4_INODE_BIT_FNS(state, state_flags) + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ + EXT4_FEATURE_INCOMPAT_MMP| \ +- EXT4_FEATURE_INCOMPAT_DIRDATA) ++ EXT4_FEATURE_INCOMPAT_DIRDATA| \ ++ EXT4_FEATURE_INCOMPAT_LARGEDIR) + + #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ +@@ -1612,6 +1614,17 @@ ext4_group_first_block_no(struct super_b + */ + #define ERR_BAD_DX_DIR -75000 + ++/* htree levels for ext4 */ ++#define EXT4_HTREE_LEVEL_COMPAT 2 ++#define EXT4_HTREE_LEVEL 3 ++ ++static inline int ++ext4_dir_htree_level(struct super_block *sb) ++{ ++ return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ? 
++ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; ++} ++ + void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); + +@@ -2005,13 +2018,15 @@ static inline void ext4_r_blocks_count_s + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); + } + +-static inline loff_t ext4_isize(struct ext4_inode *raw_inode) ++static inline loff_t ext4_isize(struct super_block *sb, ++ struct ext4_inode *raw_inode) + { +- if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) ++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) || ++ S_ISREG(le16_to_cpu(raw_inode->i_mode))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); +- else +- return (loff_t) le32_to_cpu(raw_inode->i_size_lo); ++ ++ return (loff_t) le32_to_cpu(raw_inode->i_size_lo); + } + + static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/inode.c +=================================================================== +--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/inode.c ++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/inode.c +@@ -5470,7 +5470,7 @@ struct inode *ext4_iget(struct super_blo + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) + ei->i_file_acl |= + ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; +- inode->i_size = ext4_isize(raw_inode); ++ inode->i_size = ext4_isize(sb, raw_inode); + ei->i_disksize = inode->i_size; + #ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +@@ -5654,7 +5654,7 @@ static int ext4_do_update_inode(handle_t + raw_inode->i_file_acl_high = + cpu_to_le16(ei->i_file_acl >> 32); + raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); +- if (ei->i_disksize != ext4_isize(raw_inode)) { ++ if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) { + ext4_isize_set(raw_inode, ei->i_disksize); + need_datasync = 1; + } +Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/namei.c 
+=================================================================== +--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/namei.c ++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/namei.c +@@ -225,7 +225,7 @@ struct dx_root_info * dx_get_dx_info(str + + static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) + { +- return le32_to_cpu(entry->block) & 0x00ffffff; ++ return le32_to_cpu(entry->block) & 0x0fffffff; + } + + static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) +@@ -388,7 +388,7 @@ dx_probe(const struct qstr *d_name, stru + struct dx_frame *frame = frame_in; + u32 hash; + +- frame->bh = NULL; ++ memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); + if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) + goto fail; + +@@ -418,9 +418,16 @@ dx_probe(const struct qstr *d_name, stru + goto fail; + } + +- if ((indirect = info->indirect_levels) > 1) { +- ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", +- info->indirect_levels); ++ indirect = info->indirect_levels; ++ if (indirect >= ext4_dir_htree_level(dir->i_sb)) { ++ ext4_warning(dir->i_sb, ++ "Directory (ino: %lu) htree depth %#06x exceed " ++ "supported value", dir->i_ino, ++ ext4_dir_htree_level(dir->i_sb)); ++ if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { ++ ext4_warning(dir->i_sb, "Enable large directory " ++ "feature to access it"); ++ } + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; +@@ -512,13 +519,18 @@ fail: + static void dx_release (struct dx_frame *frames) + { + struct dx_root_info *info; ++ int i; ++ + if (frames[0].bh == NULL) + return; + + info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data); +- if (info->indirect_levels) +- brelse(frames[1].bh); +- brelse(frames[0].bh); ++ for (i = 0; i <= info->indirect_levels; i++) { ++ if (frames[i].bh == NULL) ++ break; ++ brelse(frames[i].bh); ++ frames[i].bh = NULL; ++ } + } + + /* +@@ -661,7 +673,7 @@ int ext4_htree_fill_tree(struct file *di + { + struct dx_hash_info hinfo; + struct 
ext4_dir_entry_2 *de; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct inode *dir; + ext4_lblk_t block; + int count = 0; +@@ -1003,7 +1015,7 @@ static struct buffer_head * ext4_dx_find + struct super_block * sb; + struct dx_hash_info hinfo; + u32 hash; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct buffer_head *bh; + ext4_lblk_t block; + struct ext4_dir_entry_2 *de, *top; +@@ -1443,7 +1455,7 @@ static int add_dirent_to_buf(handle_t *h + */ + dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + ext4_update_dx_flag(dir); +- dir->i_version++; ++ inode_inc_iversion(dir); + ext4_mark_inode_dirty(handle, dir); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, dir, bh); +@@ -1463,7 +1475,7 @@ static int make_indexed_dir(handle_t *ha + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct buffer_head *bh2; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries; + struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de; + char *data1, *top; +@@ -1712,15 +1724,18 @@ static int ext4_add_entry(handle_t *hand + static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) + { +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries, *at; + struct dx_hash_info hinfo; + struct buffer_head *bh; + struct inode *dir = dentry->d_parent->d_inode; + struct super_block *sb = dir->i_sb; + struct ext4_dir_entry_2 *de; ++ int restart; + int err; + ++again: ++ restart = 0; + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); + if (!frame) + return err; +@@ -1730,33 +1745,48 @@ static int ext4_dx_add_entry(handle_t *h + if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + goto cleanup; + +- BUFFER_TRACE(bh, "get_write_access"); 
+- err = ext4_journal_get_write_access(handle, bh); +- if (err) +- goto journal_error; +- + err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (err != -ENOSPC) + goto cleanup; + ++ err = 0; + /* Block full, should compress but for now just split */ + dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); + /* Need to split index? */ + if (dx_get_count(entries) == dx_get_limit(entries)) { + ext4_lblk_t newblock; +- unsigned icount = dx_get_count(entries); +- int levels = frame - frames; ++ int levels = frame - frames + 1; ++ unsigned icount; ++ int add_level = 1; + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; + +- if (levels && (dx_get_count(frames->entries) == +- dx_get_limit(frames->entries))) { +- ext4_warning(sb, "Directory index full!"); ++ while (frame > frames) { ++ if (dx_get_count((frame - 1)->entries) < ++ dx_get_limit((frame - 1)->entries)) { ++ add_level = 0; ++ break; ++ } ++ frame--; /* split higher index block */ ++ at = frame->at; ++ entries = frame->entries; ++ restart = 1; ++ } ++ if (add_level && levels == ext4_dir_htree_level(sb)) { ++ ext4_warning(sb, "Directory (ino: %lu) index full, " ++ "reach max htree level :%d", ++ dir->i_ino, levels); ++ if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { ++ ext4_warning(sb, "Large directory feature is " ++ "not enabled on this " ++ "filesystem"); ++ } + err = -ENOSPC; + goto cleanup; + } ++ icount = dx_get_count(entries); + bh2 = ext4_append (handle, dir, &newblock, &err); + if (!(bh2)) + goto cleanup; +@@ -1769,7 +1799,7 @@ static int ext4_dx_add_entry(handle_t *h + err = ext4_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; +- if (levels) { ++ if (!add_level) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); + dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", +@@ -1777,7 +1807,7 @@ static int
ext4_dx_add_entry(handle_t *h + + BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ + err = ext4_journal_get_write_access(handle, +- frames[0].bh); ++ (frame - 1)->bh); + if (err) + goto journal_error; + +@@ -1793,18 +1823,24 @@ static int ext4_dx_add_entry(handle_t *h + frame->entries = entries = entries2; + swap(frame->bh, bh2); + } +- dx_insert_block(frames + 0, hash2, newblock); +- dxtrace(dx_show_index("node", frames[1].entries)); ++ dx_insert_block((frame - 1), hash2, newblock); ++ dxtrace(dx_show_index("node", frame->entries)); + dxtrace(dx_show_index("node", + ((struct dx_node *) bh2->b_data)->entries)); + err = ext4_handle_dirty_metadata(handle, dir, bh2); + if (err) + goto journal_error; + brelse (bh2); ++ ext4_handle_dirty_metadata(handle, dir, ++ (frame - 1)->bh); ++ if (restart) { ++ ext4_handle_dirty_metadata(handle, dir, ++ frame->bh); ++ goto cleanup; ++ } + } else { + struct dx_root_info * info; +- dxtrace(printk(KERN_DEBUG +- "Creating second level index...\n")); ++ + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); +@@ -1814,19 +1850,16 @@ static int ext4_dx_add_entry(handle_t *h + dx_set_block(entries + 0, newblock); + info = dx_get_dx_info((struct ext4_dir_entry_2*) + frames[0].bh->b_data); +- info->indirect_levels = 1; +- +- /* Add new access path frame */ +- frame = frames + 1; +- frame->at = at = at - entries + entries2; +- frame->entries = entries = entries2; +- frame->bh = bh2; +- err = ext4_journal_get_write_access(handle, +- frame->bh); +- if (err) +- goto journal_error; ++ info->indirect_levels += 1; ++ dxtrace(printk(KERN_DEBUG ++ "Creating %d level index...\n", ++ info->indirect_levels)); ++ ext4_handle_dirty_metadata(handle, dir, frame->bh); ++ ext4_handle_dirty_metadata(handle, dir, bh2); ++ brelse(bh2); ++ restart = 1; ++ goto cleanup; + } +- err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh); + if (err) { + 
ext4_std_error(inode->i_sb, err); + goto cleanup; +@@ -1840,6 +1873,10 @@ cleanup: + if (bh) + brelse(bh); + dx_release(frames); ++ /* @restart is true means htree-path has been changed, we need to ++ * repeat dx_probe() to find out valid htree-path */ ++ if (restart && err == 0) ++ goto again; + return err; + } + +@@ -1874,7 +1911,7 @@ int ext4_delete_entry(handle_t *handle, + blocksize); + else + de->inode = 0; +- dir->i_version++; ++ inode_inc_iversion(dir); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, dir, bh); + return 0; diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-osd-iop-common.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-osd-iop-common.patch index a9014f2..0f101ed 100644 --- a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-osd-iop-common.patch +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-osd-iop-common.patch @@ -5,21 +5,15 @@ --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h -@@ -1895,6 +1895,19 @@ extern int ext4_orphan_add(handle_t *, s +@@ -1895,6 +1895,13 @@ extern int ext4_orphan_add(handle_t *, s extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); +extern struct inode *ext4_create_inode(handle_t *handle, + struct inode * dir, int mode); -+extern int ext4_add_entry(handle_t *handle, struct dentry *dentry, -+ struct inode *inode); +extern int ext4_delete_entry(handle_t *handle, struct inode * dir, + struct ext4_dir_entry_2 * de_del, + struct buffer_head * bh); -+extern struct buffer_head * ext4_find_entry(struct inode *dir, -+ const struct qstr *d_name, -+ struct ext4_dir_entry_2 ** res_dir); -+#define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir) +extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, + struct inode *inode); @@ -35,46 +29,6 @@ #include #include #include -@@ -873,9 +874,9 @@ static inline 
int search_dirblock(struct - * The returned buffer_head has ->b_count elevated. The caller is expected - * to brelse() it when appropriate. - */ --static struct buffer_head * ext4_find_entry (struct inode *dir, -- const struct qstr *d_name, -- struct ext4_dir_entry_2 ** res_dir) -+struct buffer_head * ext4_find_entry(struct inode *dir, -+ const struct qstr *d_name, -+ struct ext4_dir_entry_2 ** res_dir) - { - struct super_block *sb; - struct buffer_head *bh_use[NAMEI_RA_SIZE]; -@@ -981,6 +982,7 @@ cleanup_and_exit: - brelse(bh_use[ra_ptr]); - return ret; - } -+EXPORT_SYMBOL(ext4_find_entry); - - static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, int *err) -@@ -1503,8 +1505,8 @@ static int make_indexed_dir(handle_t *ha - * may not sleep between calling this and putting something into - * the entry, as someone else might have used it while you slept. - */ --static int ext4_add_entry(handle_t *handle, struct dentry *dentry, -- struct inode *inode) -+int ext4_add_entry(handle_t *handle, struct dentry *dentry, -+ struct inode *inode) - { - struct inode *dir = dentry->d_parent->d_inode; - struct buffer_head *bh; -@@ -1555,6 +1557,7 @@ static int ext4_add_entry(handle_t *hand - ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); - return retval; - } -+EXPORT_SYMBOL(ext4_add_entry); - - /* - * Returns 0 for success, or a negative error value @@ -1698,10 +1701,10 @@ cleanup: * ext4_delete_entry deletes a directory entry by merging it with the * previous entry diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-pdirop.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-pdirop.patch deleted file mode 100644 index 4d2acff..0000000 --- a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-pdirop.patch +++ /dev/null @@ -1,2273 +0,0 @@ ---- - fs/ext4/Makefile | 2 - fs/ext4/ext4.h | 93 ++++ - fs/ext4/htree_lock.c | 880 +++++++++++++++++++++++++++++++++++++++++++++ - fs/ext4/inode.c | 4 - 
fs/ext4/namei.c | 585 +++++++++++++++++++++++++---- - include/linux/htree_lock.h | 187 +++++++++ - 6 files changed, 1650 insertions(+), 101 deletions(-) - ---- a/fs/ext4/Makefile -+++ b/fs/ext4/Makefile -@@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o - ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ - ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ -- mmp.o -+ htree_lock.o mmp.o - - ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ---- a/fs/ext4/ext4.h -+++ b/fs/ext4/ext4.h -@@ -28,6 +28,7 @@ - #include - #include - #include -+#include - #include - #include - #ifdef __KERNEL__ -@@ -1402,6 +1403,7 @@ static inline void ext4_clear_state_flag - #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 - #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ - #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ -+#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 - - #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR - #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ -@@ -1427,7 +1429,8 @@ static inline void ext4_clear_state_flag - EXT4_FEATURE_INCOMPAT_FLEX_BG| \ - EXT4_FEATURE_INCOMPAT_EA_INODE| \ - EXT4_FEATURE_INCOMPAT_MMP| \ -- EXT4_FEATURE_INCOMPAT_DIRDATA) -+ EXT4_FEATURE_INCOMPAT_DIRDATA| \ -+ EXT4_FEATURE_INCOMPAT_LARGEDIR) - - #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ -@@ -1690,6 +1693,76 @@ ext4_group_first_block_no(struct super_b - */ - #define ERR_BAD_DX_DIR -75000 - -+/* htree levels for ext4 */ -+#define EXT4_HTREE_LEVEL_COMPAT 2 -+#define EXT4_HTREE_LEVEL 3 -+ -+static inline int -+ext4_dir_htree_level(struct super_block *sb) -+{ -+ return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ? 
-+ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; -+} -+ -+/* assume name-hash is protected by upper layer */ -+#define EXT4_HTREE_LOCK_HASH 0 -+ -+enum ext4_pdo_lk_types { -+#if EXT4_HTREE_LOCK_HASH -+ EXT4_LK_HASH, -+#endif -+ EXT4_LK_DX, /* index block */ -+ EXT4_LK_DE, /* directory entry block */ -+ EXT4_LK_SPIN, /* spinlock */ -+ EXT4_LK_MAX, -+}; -+ -+/* read-only bit */ -+#define EXT4_LB_RO(b) (1 << (b)) -+/* read + write, high bits for writer */ -+#define EXT4_LB_RW(b) ((1 << (b)) | (1 << (EXT4_LK_MAX + (b)))) -+ -+enum ext4_pdo_lock_bits { -+ /* DX lock bits */ -+ EXT4_LB_DX_RO = EXT4_LB_RO(EXT4_LK_DX), -+ EXT4_LB_DX = EXT4_LB_RW(EXT4_LK_DX), -+ /* DE lock bits */ -+ EXT4_LB_DE_RO = EXT4_LB_RO(EXT4_LK_DE), -+ EXT4_LB_DE = EXT4_LB_RW(EXT4_LK_DE), -+ /* DX spinlock bits */ -+ EXT4_LB_SPIN_RO = EXT4_LB_RO(EXT4_LK_SPIN), -+ EXT4_LB_SPIN = EXT4_LB_RW(EXT4_LK_SPIN), -+ /* accurate searching */ -+ EXT4_LB_EXACT = EXT4_LB_RO(EXT4_LK_MAX << 1), -+}; -+ -+enum ext4_pdo_lock_opc { -+ /* external */ -+ EXT4_HLOCK_READDIR = (EXT4_LB_DE_RO | EXT4_LB_DX_RO), -+ EXT4_HLOCK_LOOKUP = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO | -+ EXT4_LB_EXACT), -+ EXT4_HLOCK_DEL = (EXT4_LB_DE | EXT4_LB_SPIN_RO | -+ EXT4_LB_EXACT), -+ EXT4_HLOCK_ADD = (EXT4_LB_DE | EXT4_LB_SPIN_RO), -+ -+ /* internal */ -+ EXT4_HLOCK_LOOKUP_SAFE = (EXT4_LB_DE_RO | EXT4_LB_DX_RO | -+ EXT4_LB_EXACT), -+ EXT4_HLOCK_DEL_SAFE = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT), -+ EXT4_HLOCK_SPLIT = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN), -+}; -+ -+extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits); -+#define ext4_htree_lock_head_free(lhead) htree_lock_head_free(lhead) -+ -+extern struct htree_lock *ext4_htree_lock_alloc(void); -+#define ext4_htree_lock_free(lck) htree_lock_free(lck) -+ -+extern void ext4_htree_lock(struct htree_lock *lck, -+ struct htree_lock_head *lhead, -+ struct inode *dir, unsigned flags); -+#define ext4_htree_unlock(lck) htree_unlock(lck) -+ - void 
ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, - ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); - -@@ -1964,14 +2037,16 @@ extern int ext4_htree_fill_tree(struct f - extern struct inode *ext4_create_inode(handle_t *handle, - struct inode * dir, int mode); - extern int ext4_add_entry(handle_t *handle, struct dentry *dentry, -- struct inode *inode); -+ struct inode *inode, struct htree_lock *lck); - extern int ext4_delete_entry(handle_t *handle, struct inode * dir, - struct ext4_dir_entry_2 * de_del, - struct buffer_head * bh); - extern struct buffer_head * ext4_find_entry(struct inode *dir, - const struct qstr *d_name, -- struct ext4_dir_entry_2 ** res_dir); --#define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir) -+ struct ext4_dir_entry_2 **res_dir, -+ struct htree_lock *lck); -+#define ll_ext4_find_entry(inode, dentry, res_dir, lck) \ -+ ext4_find_entry(inode, &(dentry)->d_name, res_dir, lck) - extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, - struct inode *inode, const void *, const void *); - extern struct buffer_head *ext4_append(handle_t *handle, -@@ -2104,13 +2179,15 @@ static inline void ext4_r_blocks_count_s - es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); - } - --static inline loff_t ext4_isize(struct ext4_inode *raw_inode) -+static inline loff_t ext4_isize(struct super_block *sb, -+ struct ext4_inode *raw_inode) - { -- if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) -+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) || -+ S_ISREG(le16_to_cpu(raw_inode->i_mode))) - return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | - le32_to_cpu(raw_inode->i_size_lo); -- else -- return (loff_t) le32_to_cpu(raw_inode->i_size_lo); -+ -+ return (loff_t) le32_to_cpu(raw_inode->i_size_lo); - } - - static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) ---- /dev/null -+++ b/fs/ext4/htree_lock.c -@@ -0,0 +1,880 @@ -+/* -+ * 
fs/ext4/htree_lock.c -+ * -+ * Copyright (c) 2011, 2012, Intel Corporation. -+ * -+ * Author: Liang Zhen -+ */ -+#include -+#include -+#include -+#include -+ -+enum { -+ HTREE_LOCK_BIT_EX = (1 << HTREE_LOCK_EX), -+ HTREE_LOCK_BIT_PW = (1 << HTREE_LOCK_PW), -+ HTREE_LOCK_BIT_PR = (1 << HTREE_LOCK_PR), -+ HTREE_LOCK_BIT_CW = (1 << HTREE_LOCK_CW), -+ HTREE_LOCK_BIT_CR = (1 << HTREE_LOCK_CR), -+}; -+ -+enum { -+ HTREE_LOCK_COMPAT_EX = 0, -+ HTREE_LOCK_COMPAT_PW = HTREE_LOCK_COMPAT_EX | HTREE_LOCK_BIT_CR, -+ HTREE_LOCK_COMPAT_PR = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_PR, -+ HTREE_LOCK_COMPAT_CW = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_CW, -+ HTREE_LOCK_COMPAT_CR = HTREE_LOCK_COMPAT_CW | HTREE_LOCK_BIT_PR | -+ HTREE_LOCK_BIT_PW, -+}; -+ -+static int htree_lock_compat[] = { -+ [HTREE_LOCK_EX] HTREE_LOCK_COMPAT_EX, -+ [HTREE_LOCK_PW] HTREE_LOCK_COMPAT_PW, -+ [HTREE_LOCK_PR] HTREE_LOCK_COMPAT_PR, -+ [HTREE_LOCK_CW] HTREE_LOCK_COMPAT_CW, -+ [HTREE_LOCK_CR] HTREE_LOCK_COMPAT_CR, -+}; -+ -+/* max allowed htree-lock depth. -+ * We only need depth=3 for ext4 although user can have higher value. 
*/ -+#define HTREE_LOCK_DEP_MAX 16 -+ -+#ifdef HTREE_LOCK_DEBUG -+ -+static char *hl_name[] = { -+ [HTREE_LOCK_EX] "EX", -+ [HTREE_LOCK_PW] "PW", -+ [HTREE_LOCK_PR] "PR", -+ [HTREE_LOCK_CW] "CW", -+ [HTREE_LOCK_CR] "CR", -+}; -+ -+/* lock stats */ -+struct htree_lock_node_stats { -+ unsigned long long blocked[HTREE_LOCK_MAX]; -+ unsigned long long granted[HTREE_LOCK_MAX]; -+ unsigned long long retried[HTREE_LOCK_MAX]; -+ unsigned long long events; -+}; -+ -+struct htree_lock_stats { -+ struct htree_lock_node_stats nodes[HTREE_LOCK_DEP_MAX]; -+ unsigned long long granted[HTREE_LOCK_MAX]; -+ unsigned long long blocked[HTREE_LOCK_MAX]; -+}; -+ -+static struct htree_lock_stats hl_stats; -+ -+void htree_lock_stat_reset(void) -+{ -+ memset(&hl_stats, 0, sizeof(hl_stats)); -+} -+ -+void htree_lock_stat_print(int depth) -+{ -+ int i; -+ int j; -+ -+ printk(KERN_DEBUG "HTREE LOCK STATS:\n"); -+ for (i = 0; i < HTREE_LOCK_MAX; i++) { -+ printk(KERN_DEBUG "[%s]: G [%10llu], B [%10llu]\n", -+ hl_name[i], hl_stats.granted[i], hl_stats.blocked[i]); -+ } -+ for (i = 0; i < depth; i++) { -+ printk(KERN_DEBUG "HTREE CHILD [%d] STATS:\n", i); -+ for (j = 0; j < HTREE_LOCK_MAX; j++) { -+ printk(KERN_DEBUG -+ "[%s]: G [%10llu], B [%10llu], R [%10llu]\n", -+ hl_name[j], hl_stats.nodes[i].granted[j], -+ hl_stats.nodes[i].blocked[j], -+ hl_stats.nodes[i].retried[j]); -+ } -+ } -+} -+ -+#define lk_grant_inc(m) do { hl_stats.granted[m]++; } while (0) -+#define lk_block_inc(m) do { hl_stats.blocked[m]++; } while (0) -+#define ln_grant_inc(d, m) do { hl_stats.nodes[d].granted[m]++; } while (0) -+#define ln_block_inc(d, m) do { hl_stats.nodes[d].blocked[m]++; } while (0) -+#define ln_retry_inc(d, m) do { hl_stats.nodes[d].retried[m]++; } while (0) -+#define ln_event_inc(d) do { hl_stats.nodes[d].events++; } while (0) -+ -+#else /* !DEBUG */ -+ -+void htree_lock_stat_reset(void) {} -+void htree_lock_stat_print(int depth) {} -+ -+#define lk_grant_inc(m) do {} while (0) -+#define lk_block_inc(m) 
do {} while (0) -+#define ln_grant_inc(d, m) do {} while (0) -+#define ln_block_inc(d, m) do {} while (0) -+#define ln_retry_inc(d, m) do {} while (0) -+#define ln_event_inc(d) do {} while (0) -+ -+#endif /* DEBUG */ -+ -+EXPORT_SYMBOL(htree_lock_stat_reset); -+EXPORT_SYMBOL(htree_lock_stat_print); -+ -+#define HTREE_DEP_ROOT (-1) -+ -+#define htree_spin_lock(lhead, dep) \ -+ bit_spin_lock((dep) + 1, &(lhead)->lh_lock) -+#define htree_spin_unlock(lhead, dep) \ -+ bit_spin_unlock((dep) + 1, &(lhead)->lh_lock) -+ -+#define htree_key_event_ignore(child, ln) \ -+ (!((child)->lc_events & (1 << (ln)->ln_mode))) -+ -+static int -+htree_key_list_empty(struct htree_lock_node *ln) -+{ -+ return list_empty(&ln->ln_major_list) && list_empty(&ln->ln_minor_list); -+} -+ -+static void -+htree_key_list_del_init(struct htree_lock_node *ln) -+{ -+ struct htree_lock_node *tmp = NULL; -+ -+ if (!list_empty(&ln->ln_minor_list)) { -+ tmp = list_entry(ln->ln_minor_list.next, -+ struct htree_lock_node, ln_minor_list); -+ list_del_init(&ln->ln_minor_list); -+ } -+ -+ if (list_empty(&ln->ln_major_list)) -+ return; -+ -+ if (tmp == NULL) { /* not on minor key list */ -+ list_del_init(&ln->ln_major_list); -+ } else { -+ BUG_ON(!list_empty(&tmp->ln_major_list)); -+ list_replace_init(&ln->ln_major_list, &tmp->ln_major_list); -+ } -+} -+ -+static void -+htree_key_list_replace_init(struct htree_lock_node *old, -+ struct htree_lock_node *new) -+{ -+ if (!list_empty(&old->ln_major_list)) -+ list_replace_init(&old->ln_major_list, &new->ln_major_list); -+ -+ if (!list_empty(&old->ln_minor_list)) -+ list_replace_init(&old->ln_minor_list, &new->ln_minor_list); -+} -+ -+static void -+htree_key_event_enqueue(struct htree_lock_child *child, -+ struct htree_lock_node *ln, int dep, void *event) -+{ -+ struct htree_lock_node *tmp; -+ -+ /* NB: ALWAYS called holding lhead::lh_lock(dep) */ -+ BUG_ON(ln->ln_mode == HTREE_LOCK_NL); -+ if (event == NULL || htree_key_event_ignore(child, ln)) -+ return; -+ -+ /* 
shouldn't be a very long list */ -+ list_for_each_entry(tmp, &ln->ln_alive_list, ln_alive_list) { -+ if (tmp->ln_mode == HTREE_LOCK_NL) { -+ ln_event_inc(dep); -+ if (child->lc_callback != NULL) -+ child->lc_callback(tmp->ln_ev_target, event); -+ } -+ } -+} -+ -+static int -+htree_node_lock_enqueue(struct htree_lock *newlk, struct htree_lock *curlk, -+ unsigned dep, int wait, void *event) -+{ -+ struct htree_lock_child *child = &newlk->lk_head->lh_children[dep]; -+ struct htree_lock_node *newln = &newlk->lk_nodes[dep]; -+ struct htree_lock_node *curln = &curlk->lk_nodes[dep]; -+ -+ /* NB: ALWAYS called holding lhead::lh_lock(dep) */ -+ /* NB: we only expect PR/PW lock mode at here, only these two modes are -+ * allowed for htree_node_lock(asserted in htree_node_lock_internal), -+ * NL is only used for listener, user can't directly require NL mode */ -+ if ((curln->ln_mode == HTREE_LOCK_NL) || -+ (curln->ln_mode != HTREE_LOCK_PW && -+ newln->ln_mode != HTREE_LOCK_PW)) { -+ /* no conflict, attach it on granted list of @curlk */ -+ if (curln->ln_mode != HTREE_LOCK_NL) { -+ list_add(&newln->ln_granted_list, -+ &curln->ln_granted_list); -+ } else { -+ /* replace key owner */ -+ htree_key_list_replace_init(curln, newln); -+ } -+ -+ list_add(&newln->ln_alive_list, &curln->ln_alive_list); -+ htree_key_event_enqueue(child, newln, dep, event); -+ ln_grant_inc(dep, newln->ln_mode); -+ return 1; /* still hold lh_lock */ -+ } -+ -+ if (!wait) { /* can't grant and don't want to wait */ -+ ln_retry_inc(dep, newln->ln_mode); -+ newln->ln_mode = HTREE_LOCK_INVAL; -+ return -1; /* don't wait and just return -1 */ -+ } -+ -+ newlk->lk_task = current; -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ /* conflict, attach it on blocked list of curlk */ -+ list_add_tail(&newln->ln_blocked_list, &curln->ln_blocked_list); -+ list_add(&newln->ln_alive_list, &curln->ln_alive_list); -+ ln_block_inc(dep, newln->ln_mode); -+ -+ htree_spin_unlock(newlk->lk_head, dep); -+ /* wait to be given the 
lock */ -+ if (newlk->lk_task != NULL) -+ schedule(); -+ /* granted, no doubt, wake up will set me RUNNING */ -+ if (event == NULL || htree_key_event_ignore(child, newln)) -+ return 0; /* granted without lh_lock */ -+ -+ htree_spin_lock(newlk->lk_head, dep); -+ htree_key_event_enqueue(child, newln, dep, event); -+ return 1; /* still hold lh_lock */ -+} -+ -+/* -+ * get PR/PW access to particular tree-node according to @dep and @key, -+ * it will return -1 if @wait is false and can't immediately grant this lock. -+ * All listeners(HTREE_LOCK_NL) on @dep and with the same @key will get -+ * @event if it's not NULL. -+ * NB: ALWAYS called holding lhead::lh_lock -+ */ -+static int -+htree_node_lock_internal(struct htree_lock_head *lhead, struct htree_lock *lck, -+ htree_lock_mode_t mode, u32 key, unsigned dep, -+ int wait, void *event) -+{ -+ LIST_HEAD (list); -+ struct htree_lock *tmp; -+ struct htree_lock *tmp2; -+ u16 major; -+ u16 minor; -+ u8 reverse; -+ u8 ma_bits; -+ u8 mi_bits; -+ -+ BUG_ON(mode != HTREE_LOCK_PW && mode != HTREE_LOCK_PR); -+ BUG_ON(htree_node_is_granted(lck, dep)); -+ -+ key = hash_long(key, lhead->lh_hbits); -+ -+ mi_bits = lhead->lh_hbits >> 1; -+ ma_bits = lhead->lh_hbits - mi_bits; -+ -+ lck->lk_nodes[dep].ln_major_key = major = key & ((1U << ma_bits) - 1); -+ lck->lk_nodes[dep].ln_minor_key = minor = key >> ma_bits; -+ lck->lk_nodes[dep].ln_mode = mode; -+ -+ /* -+ * The major key list is an ordered list, so searches are started -+ * at the end of the list that is numerically closer to major_key, -+ * so at most half of the list will be walked (for well-distributed -+ * keys). The list traversal aborts early if the expected key -+ * location is passed. 
-+ */ -+ reverse = (major >= (1 << (ma_bits - 1))); -+ -+ if (reverse) { -+ list_for_each_entry_reverse(tmp, -+ &lhead->lh_children[dep].lc_list, -+ lk_nodes[dep].ln_major_list) { -+ if (tmp->lk_nodes[dep].ln_major_key == major) { -+ goto search_minor; -+ -+ } else if (tmp->lk_nodes[dep].ln_major_key < major) { -+ /* attach _after_ @tmp */ -+ list_add(&lck->lk_nodes[dep].ln_major_list, -+ &tmp->lk_nodes[dep].ln_major_list); -+ goto out_grant_major; -+ } -+ } -+ -+ list_add(&lck->lk_nodes[dep].ln_major_list, -+ &lhead->lh_children[dep].lc_list); -+ goto out_grant_major; -+ -+ } else { -+ list_for_each_entry(tmp, &lhead->lh_children[dep].lc_list, -+ lk_nodes[dep].ln_major_list) { -+ if (tmp->lk_nodes[dep].ln_major_key == major) { -+ goto search_minor; -+ -+ } else if (tmp->lk_nodes[dep].ln_major_key > major) { -+ /* insert _before_ @tmp */ -+ list_add_tail(&lck->lk_nodes[dep].ln_major_list, -+ &tmp->lk_nodes[dep].ln_major_list); -+ goto out_grant_major; -+ } -+ } -+ -+ list_add_tail(&lck->lk_nodes[dep].ln_major_list, -+ &lhead->lh_children[dep].lc_list); -+ goto out_grant_major; -+ } -+ -+ search_minor: -+ /* -+ * NB: minor_key list doesn't have a "head", @list is just a -+ * temporary stub for helping list searching, make sure it's removed -+ * after searching. -+ * minor_key list is an ordered list too. 
-+ */ -+ list_add_tail(&list, &tmp->lk_nodes[dep].ln_minor_list); -+ -+ reverse = (minor >= (1 << (mi_bits - 1))); -+ -+ if (reverse) { -+ list_for_each_entry_reverse(tmp2, &list, -+ lk_nodes[dep].ln_minor_list) { -+ if (tmp2->lk_nodes[dep].ln_minor_key == minor) { -+ goto out_enqueue; -+ -+ } else if (tmp2->lk_nodes[dep].ln_minor_key < minor) { -+ /* attach _after_ @tmp2 */ -+ list_add(&lck->lk_nodes[dep].ln_minor_list, -+ &tmp2->lk_nodes[dep].ln_minor_list); -+ goto out_grant_minor; -+ } -+ } -+ -+ list_add(&lck->lk_nodes[dep].ln_minor_list, &list); -+ -+ } else { -+ list_for_each_entry(tmp2, &list, -+ lk_nodes[dep].ln_minor_list) { -+ if (tmp2->lk_nodes[dep].ln_minor_key == minor) { -+ goto out_enqueue; -+ -+ } else if (tmp2->lk_nodes[dep].ln_minor_key > minor) { -+ /* insert _before_ @tmp2 */ -+ list_add_tail(&lck->lk_nodes[dep].ln_minor_list, -+ &tmp2->lk_nodes[dep].ln_minor_list); -+ goto out_grant_minor; -+ } -+ } -+ -+ list_add_tail(&lck->lk_nodes[dep].ln_minor_list, &list); -+ } -+ -+ out_grant_minor: -+ if (list.next == &lck->lk_nodes[dep].ln_minor_list) { -+ /* new lock @lck is the first one on minor_key list, which -+ * means it has the smallest minor_key and it should -+ * replace @tmp as minor_key owner */ -+ list_replace_init(&tmp->lk_nodes[dep].ln_major_list, -+ &lck->lk_nodes[dep].ln_major_list); -+ } -+ /* remove the temporary head */ -+ list_del(&list); -+ -+ out_grant_major: -+ ln_grant_inc(dep, lck->lk_nodes[dep].ln_mode); -+ return 1; /* granted with holding lh_lock */ -+ -+ out_enqueue: -+ list_del(&list); /* remove temprary head */ -+ return htree_node_lock_enqueue(lck, tmp2, dep, wait, event); -+} -+ -+/* -+ * release the key of @lck at level @dep, and grant any blocked locks. -+ * caller will still listen on @key if @event is not NULL, which means -+ * caller can see a event (by event_cb) while granting any lock with -+ * the same key at level @dep. 
-+ * NB: ALWAYS called holding lhead::lh_lock -+ * NB: listener will not block anyone because listening mode is HTREE_LOCK_NL -+ */ -+static void -+htree_node_unlock_internal(struct htree_lock_head *lhead, -+ struct htree_lock *curlk, unsigned dep, void *event) -+{ -+ struct htree_lock_node *curln = &curlk->lk_nodes[dep]; -+ struct htree_lock *grtlk = NULL; -+ struct htree_lock_node *grtln; -+ struct htree_lock *poslk; -+ struct htree_lock *tmplk; -+ -+ if (!htree_node_is_granted(curlk, dep)) -+ return; -+ -+ if (!list_empty(&curln->ln_granted_list)) { -+ /* there is another granted lock */ -+ grtlk = list_entry(curln->ln_granted_list.next, -+ struct htree_lock, -+ lk_nodes[dep].ln_granted_list); -+ list_del_init(&curln->ln_granted_list); -+ } -+ -+ if (grtlk == NULL && !list_empty(&curln->ln_blocked_list)) { -+ /* -+ * @curlk is the only granted lock, so we confirmed: -+ * a) curln is key owner (attached on major/minor_list), -+ * so if there is any blocked lock, it should be attached -+ * on curln->ln_blocked_list -+ * b) we always can grant the first blocked lock -+ */ -+ grtlk = list_entry(curln->ln_blocked_list.next, -+ struct htree_lock, -+ lk_nodes[dep].ln_blocked_list); -+ BUG_ON(grtlk->lk_task == NULL); -+ wake_up_process(grtlk->lk_task); -+ } -+ -+ if (event != NULL && -+ lhead->lh_children[dep].lc_events != HTREE_EVENT_DISABLE) { -+ curln->ln_ev_target = event; -+ curln->ln_mode = HTREE_LOCK_NL; /* listen! 
*/ -+ } else { -+ curln->ln_mode = HTREE_LOCK_INVAL; -+ } -+ -+ if (grtlk == NULL) { /* I must be the only one locking this key */ -+ struct htree_lock_node *tmpln; -+ -+ BUG_ON(htree_key_list_empty(curln)); -+ -+ if (curln->ln_mode == HTREE_LOCK_NL) /* listening */ -+ return; -+ -+ /* not listening */ -+ if (list_empty(&curln->ln_alive_list)) { /* no more listener */ -+ htree_key_list_del_init(curln); -+ return; -+ } -+ -+ tmpln = list_entry(curln->ln_alive_list.next, -+ struct htree_lock_node, ln_alive_list); -+ -+ BUG_ON(tmpln->ln_mode != HTREE_LOCK_NL); -+ -+ htree_key_list_replace_init(curln, tmpln); -+ list_del_init(&curln->ln_alive_list); -+ -+ return; -+ } -+ -+ /* have a granted lock */ -+ grtln = &grtlk->lk_nodes[dep]; -+ if (!list_empty(&curln->ln_blocked_list)) { -+ /* only key owner can be on both lists */ -+ BUG_ON(htree_key_list_empty(curln)); -+ -+ if (list_empty(&grtln->ln_blocked_list)) { -+ list_add(&grtln->ln_blocked_list, -+ &curln->ln_blocked_list); -+ } -+ list_del_init(&curln->ln_blocked_list); -+ } -+ /* -+ * NB: this is the tricky part: -+ * We have only two modes for child-lock (PR and PW), also, -+ * only owner of the key (attached on major/minor_list) can be on -+ * both blocked_list and granted_list, so @grtlk must be one -+ * of these two cases: -+ * -+ * a) @grtlk is taken from granted_list, which means we've granted -+ * more than one lock so @grtlk has to be PR, the first blocked -+ * lock must be PW and we can't grant it at all. -+ * So even @grtlk is not owner of the key (empty blocked_list), -+ * we don't care because we can't grant any lock. -+ * b) we just grant a new lock which is taken from head of blocked -+ * list, and it should be the first granted lock, and it should -+ * be the first one linked on blocked_list. -+ * -+ * Either way, we can get correct result by iterating blocked_list -+ * of @grtlk, and don't have to bother on how to find out -+ * owner of current key. 
-+ */ -+ list_for_each_entry_safe(poslk, tmplk, &grtln->ln_blocked_list, -+ lk_nodes[dep].ln_blocked_list) { -+ if (grtlk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW || -+ poslk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW) -+ break; -+ /* grant all readers */ -+ list_del_init(&poslk->lk_nodes[dep].ln_blocked_list); -+ list_add(&poslk->lk_nodes[dep].ln_granted_list, -+ &grtln->ln_granted_list); -+ -+ BUG_ON(poslk->lk_task == NULL); -+ wake_up_process(poslk->lk_task); -+ } -+ -+ /* if @curln is the owner of this key, replace it with @grtln */ -+ if (!htree_key_list_empty(curln)) -+ htree_key_list_replace_init(curln, grtln); -+ -+ if (curln->ln_mode == HTREE_LOCK_INVAL) -+ list_del_init(&curln->ln_alive_list); -+} -+ -+/* -+ * it's just wrapper of htree_node_lock_internal, it returns 1 on granted -+ * and 0 only if @wait is false and can't grant it immediately -+ */ -+int -+htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, -+ u32 key, unsigned dep, int wait, void *event) -+{ -+ struct htree_lock_head *lhead = lck->lk_head; -+ int rc; -+ -+ BUG_ON(dep >= lck->lk_depth); -+ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); -+ -+ htree_spin_lock(lhead, dep); -+ rc = htree_node_lock_internal(lhead, lck, mode, key, dep, wait, event); -+ if (rc != 0) -+ htree_spin_unlock(lhead, dep); -+ return rc >= 0; -+} -+EXPORT_SYMBOL(htree_node_lock_try); -+ -+/* it's wrapper of htree_node_unlock_internal */ -+void -+htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event) -+{ -+ struct htree_lock_head *lhead = lck->lk_head; -+ -+ BUG_ON(dep >= lck->lk_depth); -+ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); -+ -+ htree_spin_lock(lhead, dep); -+ htree_node_unlock_internal(lhead, lck, dep, event); -+ htree_spin_unlock(lhead, dep); -+} -+EXPORT_SYMBOL(htree_node_unlock); -+ -+/* stop listening on child-lock level @dep */ -+void -+htree_node_stop_listen(struct htree_lock *lck, unsigned dep) -+{ -+ struct htree_lock_node *ln = &lck->lk_nodes[dep]; -+ struct htree_lock_node *tmp; 
-+ -+ BUG_ON(htree_node_is_granted(lck, dep)); -+ BUG_ON(!list_empty(&ln->ln_blocked_list)); -+ BUG_ON(!list_empty(&ln->ln_granted_list)); -+ -+ if (!htree_node_is_listening(lck, dep)) -+ return; -+ -+ htree_spin_lock(lck->lk_head, dep); -+ ln->ln_mode = HTREE_LOCK_INVAL; -+ ln->ln_ev_target = NULL; -+ -+ if (htree_key_list_empty(ln)) { /* not owner */ -+ list_del_init(&ln->ln_alive_list); -+ goto out; -+ } -+ -+ /* I'm the owner... */ -+ if (list_empty(&ln->ln_alive_list)) { /* no more listener */ -+ htree_key_list_del_init(ln); -+ goto out; -+ } -+ -+ tmp = list_entry(ln->ln_alive_list.next, -+ struct htree_lock_node, ln_alive_list); -+ -+ BUG_ON(tmp->ln_mode != HTREE_LOCK_NL); -+ htree_key_list_replace_init(ln, tmp); -+ list_del_init(&ln->ln_alive_list); -+ out: -+ htree_spin_unlock(lck->lk_head, dep); -+} -+EXPORT_SYMBOL(htree_node_stop_listen); -+ -+/* release all child-locks if we have any */ -+static void -+htree_node_release_all(struct htree_lock *lck) -+{ -+ int i; -+ -+ for (i = 0; i < lck->lk_depth; i++) { -+ if (htree_node_is_granted(lck, i)) -+ htree_node_unlock(lck, i, NULL); -+ else if (htree_node_is_listening(lck, i)) -+ htree_node_stop_listen(lck, i); -+ } -+} -+ -+/* -+ * obtain htree lock, it could be blocked inside if there's conflict -+ * with any granted or blocked lock and @wait is true. 
-+ * NB: ALWAYS called holding lhead::lh_lock -+ */ -+static int -+htree_lock_internal(struct htree_lock *lck, int wait) -+{ -+ struct htree_lock_head *lhead = lck->lk_head; -+ int granted = 0; -+ int blocked = 0; -+ int i; -+ -+ for (i = 0; i < HTREE_LOCK_MAX; i++) { -+ if (lhead->lh_ngranted[i] != 0) -+ granted |= 1 << i; -+ if (lhead->lh_nblocked[i] != 0) -+ blocked |= 1 << i; -+ } -+ if ((htree_lock_compat[lck->lk_mode] & granted) != granted || -+ (htree_lock_compat[lck->lk_mode] & blocked) != blocked) { -+ /* will block current lock even it just conflicts with any -+ * other blocked lock, so lock like EX wouldn't starve */ -+ if (!wait) -+ return -1; -+ lhead->lh_nblocked[lck->lk_mode]++; -+ lk_block_inc(lck->lk_mode); -+ -+ lck->lk_task = current; -+ list_add_tail(&lck->lk_blocked_list, &lhead->lh_blocked_list); -+ -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ htree_spin_unlock(lhead, HTREE_DEP_ROOT); -+ /* wait to be given the lock */ -+ if (lck->lk_task != NULL) -+ schedule(); -+ /* granted, no doubt. wake up will set me RUNNING */ -+ return 0; /* without lh_lock */ -+ } -+ lhead->lh_ngranted[lck->lk_mode]++; -+ lk_grant_inc(lck->lk_mode); -+ return 1; -+} -+ -+/* release htree lock. NB: ALWAYS called holding lhead::lh_lock */ -+static void -+htree_unlock_internal(struct htree_lock *lck) -+{ -+ struct htree_lock_head *lhead = lck->lk_head; -+ struct htree_lock *tmp; -+ struct htree_lock *tmp2; -+ int granted = 0; -+ int i; -+ -+ BUG_ON(lhead->lh_ngranted[lck->lk_mode] == 0); -+ -+ lhead->lh_ngranted[lck->lk_mode]--; -+ lck->lk_mode = HTREE_LOCK_INVAL; -+ -+ for (i = 0; i < HTREE_LOCK_MAX; i++) { -+ if (lhead->lh_ngranted[i] != 0) -+ granted |= 1 << i; -+ } -+ list_for_each_entry_safe(tmp, tmp2, -+ &lhead->lh_blocked_list, lk_blocked_list) { -+ /* conflict with any granted lock? 
*/ -+ if ((htree_lock_compat[tmp->lk_mode] & granted) != granted) -+ break; -+ -+ list_del_init(&tmp->lk_blocked_list); -+ -+ BUG_ON(lhead->lh_nblocked[tmp->lk_mode] == 0); -+ -+ lhead->lh_nblocked[tmp->lk_mode]--; -+ lhead->lh_ngranted[tmp->lk_mode]++; -+ granted |= 1 << tmp->lk_mode; -+ -+ BUG_ON(tmp->lk_task == NULL); -+ wake_up_process(tmp->lk_task); -+ } -+} -+ -+/* it's wrapper of htree_lock_internal and exported interface. -+ * It always return 1 with granted lock if @wait is true, it can return 0 -+ * if @wait is false and locking request can't be granted immediately */ -+int -+htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead, -+ htree_lock_mode_t mode, int wait) -+{ -+ int rc; -+ -+ BUG_ON(lck->lk_depth > lhead->lh_depth); -+ BUG_ON(lck->lk_head != NULL); -+ BUG_ON(lck->lk_task != NULL); -+ -+ lck->lk_head = lhead; -+ lck->lk_mode = mode; -+ -+ htree_spin_lock(lhead, HTREE_DEP_ROOT); -+ rc = htree_lock_internal(lck, wait); -+ if (rc != 0) -+ htree_spin_unlock(lhead, HTREE_DEP_ROOT); -+ return rc >= 0; -+} -+EXPORT_SYMBOL(htree_lock_try); -+ -+/* it's wrapper of htree_unlock_internal and exported interface. -+ * It will release all htree_node_locks and htree_lock */ -+void -+htree_unlock(struct htree_lock *lck) -+{ -+ BUG_ON(lck->lk_head == NULL); -+ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); -+ -+ htree_node_release_all(lck); -+ -+ htree_spin_lock(lck->lk_head, HTREE_DEP_ROOT); -+ htree_unlock_internal(lck); -+ htree_spin_unlock(lck->lk_head, HTREE_DEP_ROOT); -+ lck->lk_head = NULL; -+ lck->lk_task = NULL; -+} -+EXPORT_SYMBOL(htree_unlock); -+ -+/* change lock mode */ -+void -+htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode) -+{ -+ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); -+ lck->lk_mode = mode; -+} -+EXPORT_SYMBOL(htree_change_mode); -+ -+/* release htree lock, and lock it again with new mode. -+ * This function will first release all htree_node_locks and htree_lock, -+ * then try to gain htree_lock with new @mode. 
-+ * It always return 1 with granted lock if @wait is true, it can return 0 -+ * if @wait is false and locking request can't be granted immediately */ -+int -+htree_change_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, int wait) -+{ -+ struct htree_lock_head *lhead = lck->lk_head; -+ int rc; -+ -+ BUG_ON(lhead == NULL); -+ BUG_ON(lck->lk_mode == mode); -+ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL || mode == HTREE_LOCK_INVAL); -+ -+ htree_node_release_all(lck); -+ -+ htree_spin_lock(lhead, HTREE_DEP_ROOT); -+ htree_unlock_internal(lck); -+ lck->lk_mode = mode; -+ rc = htree_lock_internal(lck, wait); -+ if (rc != 0) -+ htree_spin_unlock(lhead, HTREE_DEP_ROOT); -+ return rc >= 0; -+} -+EXPORT_SYMBOL(htree_change_lock_try); -+ -+/* create a htree_lock head with @depth levels (number of child-locks), -+ * it is a per resoruce structure */ -+struct htree_lock_head * -+htree_lock_head_alloc(unsigned depth, unsigned hbits, unsigned priv) -+{ -+ struct htree_lock_head *lhead; -+ int i; -+ -+ if (depth > HTREE_LOCK_DEP_MAX) { -+ printk(KERN_ERR "%d is larger than max htree_lock depth %d\n", -+ depth, HTREE_LOCK_DEP_MAX); -+ return NULL; -+ } -+ -+ lhead = kzalloc(offsetof(struct htree_lock_head, -+ lh_children[depth]) + priv, GFP_NOFS); -+ if (lhead == NULL) -+ return NULL; -+ -+ if (hbits < HTREE_HBITS_MIN) -+ lhead->lh_hbits = HTREE_HBITS_MIN; -+ else if (hbits > HTREE_HBITS_MAX) -+ lhead->lh_hbits = HTREE_HBITS_MAX; -+ -+ lhead->lh_lock = 0; -+ lhead->lh_depth = depth; -+ INIT_LIST_HEAD(&lhead->lh_blocked_list); -+ if (priv > 0) { -+ lhead->lh_private = (void *)lhead + -+ offsetof(struct htree_lock_head, lh_children[depth]); -+ } -+ -+ for (i = 0; i < depth; i++) { -+ INIT_LIST_HEAD(&lhead->lh_children[i].lc_list); -+ lhead->lh_children[i].lc_events = HTREE_EVENT_DISABLE; -+ } -+ return lhead; -+} -+EXPORT_SYMBOL(htree_lock_head_alloc); -+ -+/* free the htree_lock head */ -+void -+htree_lock_head_free(struct htree_lock_head *lhead) -+{ -+ int i; -+ -+ 
BUG_ON(!list_empty(&lhead->lh_blocked_list)); -+ for (i = 0; i < lhead->lh_depth; i++) -+ BUG_ON(!list_empty(&lhead->lh_children[i].lc_list)); -+ kfree(lhead); -+} -+EXPORT_SYMBOL(htree_lock_head_free); -+ -+/* register event callback for @events of child-lock at level @dep */ -+void -+htree_lock_event_attach(struct htree_lock_head *lhead, unsigned dep, -+ unsigned events, htree_event_cb_t callback) -+{ -+ BUG_ON(lhead->lh_depth <= dep); -+ lhead->lh_children[dep].lc_events = events; -+ lhead->lh_children[dep].lc_callback = callback; -+} -+EXPORT_SYMBOL(htree_lock_event_attach); -+ -+/* allocate a htree_lock, which is per-thread structure, @pbytes is some -+ * extra-bytes as private data for caller */ -+struct htree_lock * -+htree_lock_alloc(unsigned depth, unsigned pbytes) -+{ -+ struct htree_lock *lck; -+ int i = offsetof(struct htree_lock, lk_nodes[depth]); -+ -+ if (depth > HTREE_LOCK_DEP_MAX) { -+ printk(KERN_ERR "%d is larger than max htree_lock depth %d\n", -+ depth, HTREE_LOCK_DEP_MAX); -+ return NULL; -+ } -+ lck = kzalloc(i + pbytes, GFP_NOFS); -+ if (lck == NULL) -+ return NULL; -+ -+ if (pbytes != 0) -+ lck->lk_private = (void *)lck + i; -+ lck->lk_mode = HTREE_LOCK_INVAL; -+ lck->lk_depth = depth; -+ INIT_LIST_HEAD(&lck->lk_blocked_list); -+ -+ for (i = 0; i < depth; i++) { -+ struct htree_lock_node *node = &lck->lk_nodes[i]; -+ -+ node->ln_mode = HTREE_LOCK_INVAL; -+ INIT_LIST_HEAD(&node->ln_major_list); -+ INIT_LIST_HEAD(&node->ln_minor_list); -+ INIT_LIST_HEAD(&node->ln_alive_list); -+ INIT_LIST_HEAD(&node->ln_blocked_list); -+ INIT_LIST_HEAD(&node->ln_granted_list); -+ } -+ -+ return lck; -+} -+EXPORT_SYMBOL(htree_lock_alloc); -+ -+/* free htree_lock node */ -+void -+htree_lock_free(struct htree_lock *lck) -+{ -+ BUG_ON(lck->lk_mode != HTREE_LOCK_INVAL); -+ kfree(lck); -+} -+EXPORT_SYMBOL(htree_lock_free); ---- a/fs/ext4/inode.c -+++ b/fs/ext4/inode.c -@@ -4965,7 +4965,7 @@ struct inode *ext4_iget(struct super_blo - if 
(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) - ei->i_file_acl |= - ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; -- inode->i_size = ext4_isize(raw_inode); -+ inode->i_size = ext4_isize(sb, raw_inode); - ei->i_disksize = inode->i_size; - #ifdef CONFIG_QUOTA - ei->i_reserved_quota = 0; -@@ -5205,7 +5205,7 @@ static int ext4_do_update_inode(handle_t - raw_inode->i_file_acl_high = - cpu_to_le16(ei->i_file_acl >> 32); - raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); -- if (ei->i_disksize != ext4_isize(raw_inode)) { -+ if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) { - ext4_isize_set(raw_inode, ei->i_disksize); - need_datasync = 1; - } ---- a/fs/ext4/namei.c -+++ b/fs/ext4/namei.c -@@ -176,7 +176,7 @@ static struct dx_frame *dx_probe(const s - struct inode *dir, - struct dx_hash_info *hinfo, - struct dx_frame *frame, -- int *err); -+ struct htree_lock *lck, int *err); - static void dx_release(struct dx_frame *frames); - static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, - struct dx_hash_info *hinfo, struct dx_map_entry map[]); -@@ -189,13 +189,13 @@ static void dx_insert_block(struct dx_fr - static int ext4_htree_next_block(struct inode *dir, __u32 hash, - struct dx_frame *frame, - struct dx_frame *frames, -- __u32 *start_hash); -+ __u32 *start_hash, struct htree_lock *lck); - static struct buffer_head * ext4_dx_find_entry(struct inode *dir, - const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, -- int *err); -+ struct htree_lock *lck, int *err); - static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, -- struct inode *inode); -+ struct inode *inode, struct htree_lock *lck); - - /* - * p is at least 6 bytes before the end of page -@@ -225,7 +225,7 @@ struct dx_root_info * dx_get_dx_info(str - - static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) - { -- return le32_to_cpu(entry->block) & 0x00ffffff; -+ return le32_to_cpu(entry->block) & 0x0fffffff; - } - - static inline 
void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) -@@ -368,6 +368,223 @@ struct stats dx_show_entries(struct dx_h - } - #endif /* DX_DEBUG */ - -+/* private data for htree_lock */ -+struct ext4_dir_lock_data { -+ unsigned ld_flags; /* bits-map for lock types */ -+ unsigned ld_count; /* # entries of the last DX block */ -+ struct dx_entry ld_at_entry; /* copy of leaf dx_entry */ -+ struct dx_entry *ld_at; /* position of leaf dx_entry */ -+}; -+ -+#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private) -+ -+/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */ -+#define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32) -+ -+static void ext4_htree_event_cb(void *target, void *event) -+{ -+ u64 *block = (u64 *)target; -+ -+ if (*block == dx_get_block((struct dx_entry *)event)) -+ *block = EXT4_HTREE_NODE_CHANGED; -+} -+ -+struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits) -+{ -+ struct htree_lock_head *lhead; -+ -+ lhead = htree_lock_head_alloc(EXT4_LK_MAX, hbits, 0); -+ if (lhead != NULL) { -+ htree_lock_event_attach(lhead, EXT4_LK_SPIN, HTREE_EVENT_WR, -+ ext4_htree_event_cb); -+ } -+ return lhead; -+} -+EXPORT_SYMBOL(ext4_htree_lock_head_alloc); -+ -+struct htree_lock *ext4_htree_lock_alloc(void) -+{ -+ return htree_lock_alloc(EXT4_LK_MAX, -+ sizeof(struct ext4_dir_lock_data)); -+} -+EXPORT_SYMBOL(ext4_htree_lock_alloc); -+ -+static htree_lock_mode_t ext4_htree_mode(unsigned flags) -+{ -+ switch (flags) { -+ default: /* 0 or unknown flags require EX lock */ -+ return HTREE_LOCK_EX; -+ case EXT4_HLOCK_READDIR: -+ return HTREE_LOCK_PR; -+ case EXT4_HLOCK_LOOKUP: -+ return HTREE_LOCK_CR; -+ case EXT4_HLOCK_DEL: -+ case EXT4_HLOCK_ADD: -+ return HTREE_LOCK_CW; -+ } -+} -+ -+/* return PR for read-only operations, otherwise return EX */ -+static inline htree_lock_mode_t ext4_htree_safe_mode(unsigned flags) -+{ -+ int writer = (flags & EXT4_LB_DE) == EXT4_LB_DE; -+ -+ /* 0 requires EX lock */ -+ return (flags 
== 0 || writer) ? HTREE_LOCK_EX : HTREE_LOCK_PR; -+} -+ -+static int ext4_htree_safe_locked(struct htree_lock *lck) -+{ -+ int writer; -+ -+ if (lck == NULL || lck->lk_mode == HTREE_LOCK_EX) -+ return 1; -+ -+ writer = (ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_DE) == -+ EXT4_LB_DE; -+ if (writer) /* all readers & writers are excluded? */ -+ return lck->lk_mode == HTREE_LOCK_EX; -+ -+ /* all writers are excluded? */ -+ return lck->lk_mode == HTREE_LOCK_PR || -+ lck->lk_mode == HTREE_LOCK_PW || -+ lck->lk_mode == HTREE_LOCK_EX; -+} -+ -+/* relock htree_lock with EX mode if it's change operation, otherwise -+ * relock it with PR mode. It's noop if PDO is disabled. */ -+static void ext4_htree_safe_relock(struct htree_lock *lck) -+{ -+ if (!ext4_htree_safe_locked(lck)) { -+ unsigned flags = ext4_htree_lock_data(lck)->ld_flags; -+ -+ htree_change_lock(lck, ext4_htree_safe_mode(flags)); -+ } -+} -+ -+void ext4_htree_lock(struct htree_lock *lck, struct htree_lock_head *lhead, -+ struct inode *dir, unsigned flags) -+{ -+ htree_lock_mode_t mode = is_dx(dir) ? ext4_htree_mode(flags) : -+ ext4_htree_safe_mode(flags); -+ -+ ext4_htree_lock_data(lck)->ld_flags = flags; -+ htree_lock(lck, lhead, mode); -+ if (!is_dx(dir)) -+ ext4_htree_safe_relock(lck); /* make sure it's safe locked */ -+} -+EXPORT_SYMBOL(ext4_htree_lock); -+ -+static int ext4_htree_node_lock(struct htree_lock *lck, struct dx_entry *at, -+ unsigned lmask, int wait, void *ev) -+{ -+ u32 key = (at == NULL) ? 0 : dx_get_block(at); -+ u32 mode; -+ -+ /* NOOP if htree is well protected or caller doesn't require the lock */ -+ if (ext4_htree_safe_locked(lck) || -+ !(ext4_htree_lock_data(lck)->ld_flags & lmask)) -+ return 1; -+ -+ mode = (ext4_htree_lock_data(lck)->ld_flags & lmask) == lmask ? 
-+ HTREE_LOCK_PW : HTREE_LOCK_PR; -+ while (1) { -+ if (htree_node_lock_try(lck, mode, key, ffz(~lmask), wait, ev)) -+ return 1; -+ if (!(lmask & EXT4_LB_SPIN)) /* not a spinlock */ -+ return 0; -+ cpu_relax(); /* spin until granted */ -+ } -+} -+ -+static int ext4_htree_node_locked(struct htree_lock *lck, unsigned lmask) -+{ -+ return ext4_htree_safe_locked(lck) || -+ htree_node_is_granted(lck, ffz(~lmask)); -+} -+ -+static void ext4_htree_node_unlock(struct htree_lock *lck, -+ unsigned lmask, void *buf) -+{ -+ /* NB: it's safe to call mutiple times or even it's not locked */ -+ if (!ext4_htree_safe_locked(lck) && -+ htree_node_is_granted(lck, ffz(~lmask))) -+ htree_node_unlock(lck, ffz(~lmask), buf); -+} -+ -+#define ext4_htree_dx_lock(lck, key) \ -+ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 1, NULL) -+#define ext4_htree_dx_lock_try(lck, key) \ -+ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 0, NULL) -+#define ext4_htree_dx_unlock(lck) \ -+ ext4_htree_node_unlock(lck, EXT4_LB_DX, NULL) -+#define ext4_htree_dx_locked(lck) \ -+ ext4_htree_node_locked(lck, EXT4_LB_DX) -+ -+static void ext4_htree_dx_need_lock(struct htree_lock *lck) -+{ -+ struct ext4_dir_lock_data *ld; -+ -+ if (ext4_htree_safe_locked(lck)) -+ return; -+ -+ ld = ext4_htree_lock_data(lck); -+ switch (ld->ld_flags) { -+ default: -+ return; -+ case EXT4_HLOCK_LOOKUP: -+ ld->ld_flags = EXT4_HLOCK_LOOKUP_SAFE; -+ return; -+ case EXT4_HLOCK_DEL: -+ ld->ld_flags = EXT4_HLOCK_DEL_SAFE; -+ return; -+ case EXT4_HLOCK_ADD: -+ ld->ld_flags = EXT4_HLOCK_SPLIT; -+ return; -+ } -+} -+ -+#define ext4_htree_de_lock(lck, key) \ -+ ext4_htree_node_lock(lck, key, EXT4_LB_DE, 1, NULL) -+#define ext4_htree_de_unlock(lck) \ -+ ext4_htree_node_unlock(lck, EXT4_LB_DE, NULL) -+ -+#define ext4_htree_spin_lock(lck, key, event) \ -+ ext4_htree_node_lock(lck, key, EXT4_LB_SPIN, 0, event) -+#define ext4_htree_spin_unlock(lck) \ -+ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, NULL) -+#define ext4_htree_spin_unlock_listen(lck, p) \ -+ 
ext4_htree_node_unlock(lck, EXT4_LB_SPIN, p) -+ -+static void ext4_htree_spin_stop_listen(struct htree_lock *lck) -+{ -+ if (!ext4_htree_safe_locked(lck) && -+ htree_node_is_listening(lck, ffz(~EXT4_LB_SPIN))) -+ htree_node_stop_listen(lck, ffz(~EXT4_LB_SPIN)); -+} -+ -+enum { -+ DX_HASH_COL_IGNORE, /* ignore collision while probing frames */ -+ DX_HASH_COL_YES, /* there is collision and it does matter */ -+ DX_HASH_COL_NO, /* there is no collision */ -+}; -+ -+static int dx_probe_hash_collision(struct htree_lock *lck, -+ struct dx_entry *entries, -+ struct dx_entry *at, u32 hash) -+{ -+ if (!(ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_EXACT)) { -+ return DX_HASH_COL_IGNORE; /* don't care about collision */ -+ -+ } else if (at == entries + dx_get_count(entries) - 1) { -+ return DX_HASH_COL_IGNORE; /* not in any leaf of this DX */ -+ -+ } else { /* hash collision? */ -+ return ((dx_get_hash(at + 1) & ~1) == hash) ? -+ DX_HASH_COL_YES : DX_HASH_COL_NO; -+ } -+} -+ - /* - * Probe for a directory leaf block to search. 
- * -@@ -379,16 +596,17 @@ struct stats dx_show_entries(struct dx_h - */ - static struct dx_frame * - dx_probe(const struct qstr *d_name, struct inode *dir, -- struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) -+ struct dx_hash_info *hinfo, struct dx_frame *frame_in, -+ struct htree_lock *lck, int *err) - { - unsigned count, indirect; -- struct dx_entry *at, *entries, *p, *q, *m; -+ struct dx_entry *at, *entries, *p, *q, *m, *dx = NULL; - struct dx_root_info * info; - struct buffer_head *bh; - struct dx_frame *frame = frame_in; - u32 hash; - -- frame->bh = NULL; -+ memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); - if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) - goto fail; - -@@ -418,9 +636,16 @@ dx_probe(const struct qstr *d_name, stru - goto fail; - } - -- if ((indirect = info->indirect_levels) > 1) { -- ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", -- info->indirect_levels); -+ indirect = info->indirect_levels; -+ if (indirect >= ext4_dir_htree_level(dir->i_sb)) { -+ ext4_warning(dir->i_sb, -+ "Directory (ino: %lu) htree depth %#06x exceed " -+ "supported value", dir->i_ino, -+ ext4_dir_htree_level(dir->i_sb)); -+ if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { -+ ext4_warning(dir->i_sb, "Enable large directory " -+ "feature to access it"); -+ } - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail; -@@ -440,8 +665,15 @@ dx_probe(const struct qstr *d_name, stru - dxtrace(printk("Look up %x", hash)); - while (1) - { -+ if (indirect == 0) { /* the last index level */ -+ /* NB: ext4_htree_dx_lock() could be noop if -+ * DX-lock flag is not set for current operation */ -+ ext4_htree_dx_lock(lck, dx); -+ ext4_htree_spin_lock(lck, dx, NULL); -+ } - count = dx_get_count(entries); -- if (!count || count > dx_get_limit(entries)) { -+ if (count == 0 || count > dx_get_limit(entries)) { -+ ext4_htree_spin_unlock(lck); /* release spin */ - ext4_warning(dir->i_sb, - "dx entry: no count or count > limit"); - brelse(bh); -@@ 
-482,9 +714,73 @@ dx_probe(const struct qstr *d_name, stru - frame->bh = bh; - frame->entries = entries; - frame->at = at; -- if (!indirect--) return frame; -+ -+ if (indirect == 0) { /* the last index level */ -+ struct ext4_dir_lock_data *ld; -+ u64 myblock; -+ -+ /* By default we only lock DE-block, however, we will -+ * also lock the last level DX-block if: -+ * a) there is hash collision -+ * we will set DX-lock flag (a few lines below) -+ * and redo to lock DX-block -+ * see detail in dx_probe_hash_collision() -+ * b) it's a retry from splitting -+ * we need to lock the last level DX-block so nobody -+ * else can split any leaf blocks under the same -+ * DX-block, see detail in ext4_dx_add_entry() -+ */ -+ if (ext4_htree_dx_locked(lck)) { -+ /* DX-block is locked, just lock DE-block -+ * and return */ -+ ext4_htree_spin_unlock(lck); -+ if (!ext4_htree_safe_locked(lck)) -+ ext4_htree_de_lock(lck, frame->at); -+ return frame; -+ } -+ /* it's pdirop and no DX lock */ -+ if (dx_probe_hash_collision(lck, entries, at, hash) == -+ DX_HASH_COL_YES) { -+ /* found hash collision, set DX-lock flag -+ * and retry to abtain DX-lock */ -+ ext4_htree_spin_unlock(lck); -+ ext4_htree_dx_need_lock(lck); -+ continue; -+ } -+ ld = ext4_htree_lock_data(lck); -+ /* because I don't lock DX, so @at can't be trusted -+ * after I release spinlock so I have to save it */ -+ ld->ld_at = at; -+ ld->ld_at_entry = *at; -+ ld->ld_count = dx_get_count(entries); -+ -+ frame->at = &ld->ld_at_entry; -+ myblock = dx_get_block(at); -+ -+ /* NB: ordering locking */ -+ ext4_htree_spin_unlock_listen(lck, &myblock); -+ /* other thread can split this DE-block because: -+ * a) I don't have lock for the DE-block yet -+ * b) I released spinlock on DX-block -+ * if it happened I can detect it by listening -+ * splitting event on this DE-block */ -+ ext4_htree_de_lock(lck, frame->at); -+ ext4_htree_spin_stop_listen(lck); -+ -+ if (myblock == EXT4_HTREE_NODE_CHANGED) { -+ /* someone split this DE-block 
before -+ * I locked it, I need to retry and lock -+ * valid DE-block */ -+ ext4_htree_de_unlock(lck); -+ continue; -+ } -+ return frame; -+ } -+ dx = at; -+ indirect--; - if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err))) - goto fail2; -+ - at = entries = ((struct dx_node *) bh->b_data)->entries; - if (dx_get_limit(entries) != dx_node_limit (dir)) { - ext4_warning(dir->i_sb, -@@ -512,13 +808,18 @@ fail: - static void dx_release (struct dx_frame *frames) - { - struct dx_root_info *info; -+ int i; -+ - if (frames[0].bh == NULL) - return; - - info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data); -- if (info->indirect_levels) -- brelse(frames[1].bh); -- brelse(frames[0].bh); -+ for (i = 0; i <= info->indirect_levels; i++) { -+ if (frames[i].bh == NULL) -+ break; -+ brelse(frames[i].bh); -+ frames[i].bh = NULL; -+ } - } - - /* -@@ -541,7 +842,7 @@ static void dx_release (struct dx_frame - static int ext4_htree_next_block(struct inode *dir, __u32 hash, - struct dx_frame *frame, - struct dx_frame *frames, -- __u32 *start_hash) -+ __u32 *start_hash, struct htree_lock *lck) - { - struct dx_frame *p; - struct buffer_head *bh; -@@ -556,12 +857,22 @@ static int ext4_htree_next_block(struct - * this loop, num_frames indicates the number of interior - * nodes need to be read. 
- */ -+ ext4_htree_de_unlock(lck); - while (1) { -- if (++(p->at) < p->entries + dx_get_count(p->entries)) -- break; -+ if (num_frames > 0 || ext4_htree_dx_locked(lck)) { -+ /* num_frames > 0 : -+ * DX block -+ * ext4_htree_dx_locked: -+ * frame->at is reliable pointer returned by dx_probe, -+ * otherwise dx_probe already knew no collision */ -+ if (++(p->at) < p->entries + dx_get_count(p->entries)) -+ break; -+ } - if (p == frames) - return 0; - num_frames++; -+ if (num_frames == 1) -+ ext4_htree_dx_unlock(lck); - p--; - } - -@@ -584,6 +895,13 @@ static int ext4_htree_next_block(struct - * block so no check is necessary - */ - while (num_frames--) { -+ if (num_frames == 0) { -+ /* it's not always necessary, we just don't want to -+ * detect hash collision again */ -+ ext4_htree_dx_need_lock(lck); -+ ext4_htree_dx_lock(lck, p->at); -+ } -+ - if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), - 0, &err))) - return err; /* Failure */ -@@ -592,6 +910,7 @@ static int ext4_htree_next_block(struct - p->bh = bh; - p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; - } -+ ext4_htree_de_lock(lck, p->at); - return 1; - } - -@@ -661,7 +980,7 @@ int ext4_htree_fill_tree(struct file *di - { - struct dx_hash_info hinfo; - struct ext4_dir_entry_2 *de; -- struct dx_frame frames[2], *frame; -+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; - struct inode *dir; - ext4_lblk_t block; - int count = 0; -@@ -684,10 +1003,10 @@ int ext4_htree_fill_tree(struct file *di - } - hinfo.hash = start_hash; - hinfo.minor_hash = 0; -- frame = dx_probe(NULL, dir, &hinfo, frames, &err); -+ /* assume it's PR locked */ -+ frame = dx_probe(NULL, dir, &hinfo, frames, NULL, &err); - if (!frame) - return err; -- - /* Add '.' and '..' 
from the htree header */ - if (!start_hash && !start_minor_hash) { - de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; -@@ -714,7 +1033,7 @@ int ext4_htree_fill_tree(struct file *di - count += ret; - hashval = ~0; - ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, -- frame, frames, &hashval); -+ frame, frames, &hashval, NULL); - *next_hash = hashval; - if (ret < 0) { - err = ret; -@@ -814,9 +1133,17 @@ static void dx_insert_block(struct dx_fr - - static void ext4_update_dx_flag(struct inode *inode) - { -+ /* Disable it for ldiskfs, because going from a DX directory to -+ * a non-DX directory while it is in use will completely break -+ * the htree-locking. -+ * If we really want to support this operation in the future, -+ * we need to exclusively lock the directory at here which will -+ * increase complexity of code */ -+#if 0 - if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_COMPAT_DIR_INDEX)) - ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); -+#endif - } - - /* -@@ -888,8 +1215,9 @@ static inline int search_dirblock(struct - * to brelse() it when appropriate. - */ - struct buffer_head * ext4_find_entry(struct inode *dir, -- const struct qstr *d_name, -- struct ext4_dir_entry_2 ** res_dir) -+ const struct qstr *d_name, -+ struct ext4_dir_entry_2 **res_dir, -+ struct htree_lock *lck) - { - struct super_block *sb; - struct buffer_head *bh_use[NAMEI_RA_SIZE]; -@@ -910,7 +1238,7 @@ struct buffer_head * ext4_find_entry(str - if (namelen > EXT4_NAME_LEN) - return NULL; - if (is_dx(dir)) { -- bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); -+ bh = ext4_dx_find_entry(dir, d_name, res_dir, lck, &err); - /* - * On success, or if the error was file not found, - * return. 
Otherwise, fall back to doing a search the -@@ -920,6 +1248,7 @@ struct buffer_head * ext4_find_entry(str - return bh; - dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " - "falling back\n")); -+ ext4_htree_safe_relock(lck); - } - nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); - start = EXT4_I(dir)->i_dir_start_lookup; -@@ -996,13 +1325,15 @@ cleanup_and_exit: - return ret; - } - --static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, -- struct ext4_dir_entry_2 **res_dir, int *err) -+static struct buffer_head * ext4_dx_find_entry(struct inode *dir, -+ const struct qstr *d_name, -+ struct ext4_dir_entry_2 **res_dir, -+ struct htree_lock *lck, int *err) - { - struct super_block * sb; - struct dx_hash_info hinfo; - u32 hash; -- struct dx_frame frames[2], *frame; -+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; - struct buffer_head *bh; - ext4_lblk_t block; - int retval; -@@ -1012,13 +1343,16 @@ static struct buffer_head * ext4_dx_find - sb = dir->i_sb; - /* NFS may look up ".." - look at dx_root directory block */ - if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ -- if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) -+ if (!(frame = dx_probe(d_name, dir, &hinfo, frames, lck, err))) - return NULL; - } else { - frame = frames; - frame->bh = NULL; /* for dx_release() */ - frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ - dx_set_block(frame->at, 0); /* dx_root block is 0 */ -+ /* "." and ".." 
are stored in root DX lock */ -+ ext4_htree_dx_need_lock(lck); -+ ext4_htree_dx_lock(lck, NULL); - } - hash = hinfo.hash; - do { -@@ -1041,7 +1375,7 @@ static struct buffer_head * ext4_dx_find - - /* Check to see if we should continue to search */ - retval = ext4_htree_next_block(dir, hash, frame, -- frames, NULL); -+ frames, NULL, lck); - if (retval < 0) { - ext4_warning(sb, - "error reading index page in directory #%lu", -@@ -1067,7 +1401,7 @@ static struct dentry *ext4_lookup(struct - if (dentry->d_name.len > EXT4_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); - -- bh = ext4_find_entry(dir, &dentry->d_name, &de); -+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); - inode = NULL; - if (bh) { - __u32 ino = le32_to_cpu(de->inode); -@@ -1134,7 +1468,7 @@ struct dentry *ext4_get_parent(struct de - struct ext4_dir_entry_2 * de; - struct buffer_head *bh; - -- bh = ext4_find_entry(child->d_inode, &dotdot, &de); -+ bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); - if (!bh) - return ERR_PTR(-ENOENT); - ino = le32_to_cpu(de->inode); -@@ -1222,8 +1556,9 @@ static struct ext4_dir_entry_2* dx_pack_ - * Returns pointer to de in block into which the new entry will be inserted. 
- */ - static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, -- struct buffer_head **bh,struct dx_frame *frame, -- struct dx_hash_info *hinfo, int *error) -+ struct buffer_head **bh, struct dx_frame *frames, -+ struct dx_frame *frame, struct dx_hash_info *hinfo, -+ struct htree_lock *lck, int *error) - { - unsigned blocksize = dir->i_sb->s_blocksize; - unsigned count, continued; -@@ -1280,7 +1615,14 @@ static struct ext4_dir_entry_2 *do_split - hash2, split, count-split)); - - /* Fancy dance to stay within two buffers */ -- de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); -+ if (hinfo->hash < hash2) { -+ de2 = dx_move_dirents(data1, data2, map + split, -+ count - split, blocksize); -+ } else { -+ /* make sure we will add entry to the same block which -+ * we have already locked */ -+ de2 = dx_move_dirents(data1, data2, map, split, blocksize); -+ } - de = dx_pack_dirents(data1, blocksize); - de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, - blocksize); -@@ -1289,13 +1631,21 @@ static struct ext4_dir_entry_2 *do_split - dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); - dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); - -- /* Which block gets the new entry? */ -- if (hinfo->hash >= hash2) -- { -- swap(*bh, bh2); -- de = de2; -+ ext4_htree_spin_lock(lck, frame > frames ? 
(frame - 1)->at : NULL, -+ frame->at); /* notify block is being split */ -+ if (hinfo->hash < hash2) { -+ dx_insert_block(frame, hash2 + continued, newblock); -+ -+ } else { -+ /* switch block number */ -+ dx_insert_block(frame, hash2 + continued, -+ dx_get_block(frame->at)); -+ dx_set_block(frame->at, newblock); -+ (frame->at)++; - } -- dx_insert_block(frame, hash2 + continued, newblock); -+ ext4_htree_spin_unlock(lck); -+ ext4_htree_dx_unlock(lck); -+ - err = ext4_handle_dirty_metadata(handle, dir, bh2); - if (err) - goto journal_error; -@@ -1406,7 +1756,7 @@ static int add_dirent_to_buf(handle_t *h - if (!IS_NOCMTIME(dir)) - dir->i_mtime = dir->i_ctime = ext4_current_time(dir); - ext4_update_dx_flag(dir); -- dir->i_version++; -+ inode_inc_iversion(dir); - ext4_mark_inode_dirty(handle, dir); - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, dir, bh); -@@ -1426,7 +1776,7 @@ static int make_indexed_dir(handle_t *ha - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; - struct buffer_head *bh2; -- struct dx_frame frames[2], *frame; -+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; - struct dx_entry *entries; - struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de; - char *data1, *top; -@@ -1507,7 +1857,7 @@ static int make_indexed_dir(handle_t *ha - ext4_handle_dirty_metadata(handle, dir, frame->bh); - ext4_handle_dirty_metadata(handle, dir, bh); - -- de = do_split(handle,dir, &bh, frame, &hinfo, &retval); -+ de = do_split(handle,dir, &bh, frames, frame, &hinfo, NULL, &retval); - if (!de) { - /* - * Even if the block split failed, we have to properly write -@@ -1614,7 +1964,7 @@ out: - * the entry, as someone else might have used it while you slept. 
- */ - int ext4_add_entry(handle_t *handle, struct dentry *dentry, -- struct inode *inode) -+ struct inode *inode, struct htree_lock *lck) - { - struct inode *dir = dentry->d_parent->d_inode; - struct buffer_head *bh; -@@ -1633,9 +1983,10 @@ int ext4_add_entry(handle_t *handle, str - if (dentry->d_name.len == 2 && - memcmp(dentry->d_name.name, "..", 2) == 0) - return ext4_update_dotdot(handle, dentry, inode); -- retval = ext4_dx_add_entry(handle, dentry, inode); -+ retval = ext4_dx_add_entry(handle, dentry, inode, lck); - if (!retval || (retval != ERR_BAD_DX_DIR)) - return retval; -+ ext4_htree_safe_relock(lck); - ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); - dx_fallback++; - ext4_mark_inode_dirty(handle, dir); -@@ -1673,18 +2024,21 @@ int ext4_add_entry(handle_t *handle, str - * Returns 0 for success, or a negative error value - */ - static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, -- struct inode *inode) -+ struct inode *inode, struct htree_lock *lck) - { -- struct dx_frame frames[2], *frame; -+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; - struct dx_entry *entries, *at; - struct dx_hash_info hinfo; - struct buffer_head *bh; - struct inode *dir = dentry->d_parent->d_inode; - struct super_block *sb = dir->i_sb; - struct ext4_dir_entry_2 *de; -+ int restart; - int err; - -- frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); -+again: -+ restart = 0; -+ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, lck, &err); - if (!frame) - return err; - entries = frame->entries; -@@ -1693,33 +2047,53 @@ static int ext4_dx_add_entry(handle_t *h - if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) - goto cleanup; - -- BUFFER_TRACE(bh, "get_write_access"); -- err = ext4_journal_get_write_access(handle, bh); -- if (err) -- goto journal_error; -- - err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (err != -ENOSPC) - goto cleanup; - -+ err = 0; - /* Block full, should compress but for now just split */ - 
dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", - dx_get_count(entries), dx_get_limit(entries))); - /* Need to split index? */ - if (dx_get_count(entries) == dx_get_limit(entries)) { - ext4_lblk_t newblock; -- unsigned icount = dx_get_count(entries); -- int levels = frame - frames; -+ int levels = frame - frames + 1; -+ unsigned icount; -+ int add_level = 1; - struct dx_entry *entries2; - struct dx_node *node2; - struct buffer_head *bh2; - -- if (levels && (dx_get_count(frames->entries) == -- dx_get_limit(frames->entries))) { -- ext4_warning(sb, "Directory index full!"); -+ if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */ -+ ext4_htree_safe_relock(lck); -+ restart = 1; -+ goto cleanup; -+ } -+ while (frame > frames) { -+ if (dx_get_count((frame - 1)->entries) < -+ dx_get_limit((frame - 1)->entries)) { -+ add_level = 0; -+ break; -+ } -+ frame--; /* split higher index block */ -+ at = frame->at; -+ entries = frame->entries; -+ restart = 1; -+ } -+ if (add_level && levels == ext4_dir_htree_level(sb)) { -+ ext4_warning(sb, "Directory (ino: %lu) index full, " -+ "reach max htree level :%d", -+ dir->i_ino, levels); -+ if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { -+ ext4_warning(sb, "Large directory feature is" -+ "not enabled on this " -+ "filesystem"); -+ } - err = -ENOSPC; - goto cleanup; - } -+ icount = dx_get_count(entries); - bh2 = ext4_append (handle, dir, &newblock, &err); - if (!(bh2)) - goto cleanup; -@@ -1732,7 +2106,7 @@ static int ext4_dx_add_entry(handle_t *h - err = ext4_journal_get_write_access(handle, frame->bh); - if (err) - goto journal_error; -- if (levels) { -+ if (!add_level) { - unsigned icount1 = icount/2, icount2 = icount - icount1; - unsigned hash2 = dx_get_hash(entries + icount1); - dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", -@@ -1740,7 +2114,7 @@ static int ext4_dx_add_entry(handle_t *h - - BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ - err = ext4_journal_get_write_access(handle, -- 
frames[0].bh); -+ (frame - 1)->bh); - if (err) - goto journal_error; - -@@ -1756,14 +2130,21 @@ static int ext4_dx_add_entry(handle_t *h - frame->entries = entries = entries2; - swap(frame->bh, bh2); - } -- dx_insert_block(frames + 0, hash2, newblock); -- dxtrace(dx_show_index("node", frames[1].entries)); -+ dx_insert_block((frame - 1), hash2, newblock); -+ dxtrace(dx_show_index("node", frame->entries)); - dxtrace(dx_show_index("node", - ((struct dx_node *) bh2->b_data)->entries)); - err = ext4_handle_dirty_metadata(handle, dir, bh2); - if (err) - goto journal_error; - brelse (bh2); -+ ext4_handle_dirty_metadata(handle, inode, -+ (frame - 1)->bh); -+ if (restart) { -+ ext4_handle_dirty_metadata(handle, inode, -+ frame->bh); -+ goto cleanup; -+ } - } else { - struct dx_root_info * info; - dxtrace(printk(KERN_DEBUG -@@ -1777,25 +2158,42 @@ static int ext4_dx_add_entry(handle_t *h - dx_set_block(entries + 0, newblock); - info = dx_get_dx_info((struct ext4_dir_entry_2*) - frames[0].bh->b_data); -- info->indirect_levels = 1; -- -- /* Add new access path frame */ -- frame = frames + 1; -- frame->at = at = at - entries + entries2; -- frame->entries = entries = entries2; -- frame->bh = bh2; -- err = ext4_journal_get_write_access(handle, -- frame->bh); -- if (err) -- goto journal_error; -+ info->indirect_levels += 1; -+ dxtrace(printk(KERN_DEBUG -+ "Creating %d level index...\n", -+ info->indirect_levels)); -+ ext4_handle_dirty_metadata(handle, inode, frame->bh); -+ ext4_handle_dirty_metadata(handle, inode, bh2); -+ brelse(bh2); -+ restart = 1; -+ goto cleanup; - } -- err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh); -- if (err) { -- ext4_std_error(inode->i_sb, err); -+ } else if (!ext4_htree_dx_locked(lck)) { -+ struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck); -+ -+ /* not well protected, require DX lock */ -+ ext4_htree_dx_need_lock(lck); -+ at = frame > frames ? (frame - 1)->at : NULL; -+ -+ /* NB: no risk of deadlock because it's just a try. 
-+ * -+ * NB: we check ld_count for twice, the first time before -+ * having DX lock, the second time after holding DX lock. -+ * -+ * NB: We never free blocks for directory so far, which -+ * means value returned by dx_get_count() should equal to -+ * ld->ld_count if nobody split any DE-block under @at, -+ * and ld->ld_at still points to valid dx_entry. */ -+ if ((ld->ld_count != dx_get_count(entries)) || -+ !ext4_htree_dx_lock_try(lck, at) || -+ (ld->ld_count != dx_get_count(entries))) { -+ restart = 1; - goto cleanup; - } -- } -- de = do_split(handle, dir, &bh, frame, &hinfo, &err); -+ /* OK, I've got DX lock and nothing changed */ -+ frame->at = ld->ld_at; -+ } -+ de = do_split(handle, dir, &bh, frames, frame, &hinfo, lck, &err); - if (!de) - goto cleanup; - err = add_dirent_to_buf(handle, dentry, inode, de, bh); -@@ -1804,9 +2202,15 @@ static int ext4_dx_add_entry(handle_t *h - journal_error: - ext4_std_error(dir->i_sb, err); - cleanup: -+ ext4_htree_dx_unlock(lck); -+ ext4_htree_de_unlock(lck); - if (bh) - brelse(bh); - dx_release(frames); -+ /* @restart is true means htree-path has been changed, we need to -+ * repeat dx_probe() to find out valid htree-path */ -+ if (restart && err == 0) -+ goto again; - return err; - } - -@@ -1845,7 +2249,7 @@ int ext4_delete_entry(handle_t *handle, - blocksize); - else - de->inode = 0; -- dir->i_version++; -+ inode_inc_iversion(dir); - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, dir, bh); - if (unlikely(err)) { -@@ -1892,7 +2296,7 @@ static void ext4_dec_count(handle_t *han - static int ext4_add_nondir(handle_t *handle, - struct dentry *dentry, struct inode *inode) - { -- int err = ext4_add_entry(handle, dentry, inode); -+ int err = ext4_add_entry(handle, dentry, inode, NULL); - if (!err) { - ext4_mark_inode_dirty(handle, inode); - d_instantiate(dentry, inode); -@@ -2122,7 +2526,7 @@ retry: - err = ext4_add_dot_dotdot(handle, dir, inode, NULL, NULL); - if (err) - goto 
out_clear_inode; -- err = ext4_add_entry(handle, dentry, inode); -+ err = ext4_add_entry(handle, dentry, inode, NULL); - if (err) - goto out_clear_inode; - ext4_inc_count(handle, dir); -@@ -2395,7 +2799,7 @@ static int ext4_rmdir(struct inode *dir, - return PTR_ERR(handle); - - retval = -ENOENT; -- bh = ext4_find_entry(dir, &dentry->d_name, &de); -+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); - if (!bh) - goto end_rmdir; - -@@ -2460,7 +2864,7 @@ static int ext4_unlink(struct inode *dir - ext4_handle_sync(handle); - - retval = -ENOENT; -- bh = ext4_find_entry(dir, &dentry->d_name, &de); -+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); - if (!bh) - goto end_unlink; - -@@ -2628,7 +3032,7 @@ retry: - ext4_inc_count(handle, inode); - ihold(inode); - -- err = ext4_add_entry(handle, dentry, inode); -+ err = ext4_add_entry(handle, dentry, inode, NULL); - if (!err) { - ext4_mark_inode_dirty(handle, inode); - d_instantiate(dentry, inode); -@@ -2676,7 +3080,7 @@ static int ext4_rename(struct inode *old - if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) - ext4_handle_sync(handle); - -- old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); -+ old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL); - /* - * Check for inode number is _not_ due to possible IO errors. 
- * We might rmdir the source, keep it as pwd of some process -@@ -2689,7 +3093,7 @@ static int ext4_rename(struct inode *old - goto end_rename; - - new_inode = new_dentry->d_inode; -- new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de); -+ new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de, NULL); - if (new_bh) { - if (!new_inode) { - brelse(new_bh); -@@ -2719,7 +3123,7 @@ static int ext4_rename(struct inode *old - goto end_rename; - } - if (!new_bh) { -- retval = ext4_add_entry(handle, new_dentry, old_inode); -+ retval = ext4_add_entry(handle, new_dentry, old_inode, NULL); - if (retval) - goto end_rename; - } else { -@@ -2767,7 +3171,8 @@ static int ext4_rename(struct inode *old - struct buffer_head *old_bh2; - struct ext4_dir_entry_2 *old_de2; - -- old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2); -+ old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, -+ &old_de2, NULL); - if (old_bh2) { - retval = ext4_delete_entry(handle, old_dir, - old_de2, old_bh2); ---- /dev/null -+++ b/include/linux/htree_lock.h -@@ -0,0 +1,187 @@ -+/* -+ * include/linux/htree_lock.h -+ * -+ * Copyright (c) 2011, 2012, Intel Corporation. -+ * -+ * Author: Liang Zhen -+ */ -+ -+/* -+ * htree lock -+ * -+ * htree_lock is an advanced lock, it can support five lock modes (concept is -+ * taken from DLM) and it's a sleeping lock. -+ * -+ * most common use case is: -+ * - create a htree_lock_head for data -+ * - each thread (contender) creates it's own htree_lock -+ * - contender needs to call htree_lock(lock_node, mode) to protect data and -+ * call htree_unlock to release lock -+ * -+ * Also, there is advanced use-case which is more complex, user can have -+ * PW/PR lock on particular key, it's mostly used while user holding shared -+ * lock on the htree (CW, CR) -+ * -+ * htree_lock(lock_node, HTREE_LOCK_CR); lock the htree with CR -+ * htree_node_lock(lock_node, HTREE_LOCK_PR, key...); lock @key with PR -+ * ... 
-+ * htree_node_unlock(lock_node);; unlock the key -+ * -+ * Another tip is, we can have N-levels of this kind of keys, all we need to -+ * do is specifying N-levels while creating htree_lock_head, then we can -+ * lock/unlock a specific level by: -+ * htree_node_lock(lock_node, mode1, key1, level1...); -+ * do something; -+ * htree_node_lock(lock_node, mode1, key2, level2...); -+ * do something; -+ * htree_node_unlock(lock_node, level2); -+ * htree_node_unlock(lock_node, level1); -+ * -+ * NB: for multi-level, should be careful about locking order to avoid deadlock -+ */ -+ -+#ifndef _LINUX_HTREE_LOCK_H -+#define _LINUX_HTREE_LOCK_H -+ -+#include -+#include -+#include -+ -+/* -+ * Lock Modes -+ * more details can be found here: -+ * http://en.wikipedia.org/wiki/Distributed_lock_manager -+ */ -+typedef enum { -+ HTREE_LOCK_EX = 0, /* exclusive lock: incompatible with all others */ -+ HTREE_LOCK_PW, /* protected write: allows only CR users */ -+ HTREE_LOCK_PR, /* protected read: allow PR, CR users */ -+ HTREE_LOCK_CW, /* concurrent write: allow CR, CW users */ -+ HTREE_LOCK_CR, /* concurrent read: allow all but EX users */ -+ HTREE_LOCK_MAX, /* number of lock modes */ -+} htree_lock_mode_t; -+ -+#define HTREE_LOCK_NL HTREE_LOCK_MAX -+#define HTREE_LOCK_INVAL 0xdead10c -+ -+enum { -+ HTREE_HBITS_MIN = 2, -+ HTREE_HBITS_DEF = 14, -+ HTREE_HBITS_MAX = 32, -+}; -+ -+enum { -+ HTREE_EVENT_DISABLE = (0), -+ HTREE_EVENT_RD = (1 << HTREE_LOCK_PR), -+ HTREE_EVENT_WR = (1 << HTREE_LOCK_PW), -+ HTREE_EVENT_RDWR = (HTREE_EVENT_RD | HTREE_EVENT_WR), -+}; -+ -+struct htree_lock; -+ -+typedef void (*htree_event_cb_t)(void *target, void *event); -+ -+struct htree_lock_child { -+ struct list_head lc_list; /* granted list */ -+ htree_event_cb_t lc_callback; /* event callback */ -+ unsigned lc_events; /* event types */ -+}; -+ -+struct htree_lock_head { -+ unsigned long lh_lock; /* bits lock */ -+ /* blocked lock list (htree_lock) */ -+ struct list_head lh_blocked_list; -+ /* # key 
levels */ -+ u16 lh_depth; -+ /* hash bits for key and limit number of locks */ -+ u16 lh_hbits; -+ /* counters for blocked locks */ -+ u16 lh_nblocked[HTREE_LOCK_MAX]; -+ /* counters for granted locks */ -+ u16 lh_ngranted[HTREE_LOCK_MAX]; -+ /* private data */ -+ void *lh_private; -+ /* array of children locks */ -+ struct htree_lock_child lh_children[0]; -+}; -+ -+/* htree_lock_node_t is child-lock for a specific key (ln_value) */ -+struct htree_lock_node { -+ htree_lock_mode_t ln_mode; -+ /* major hash key */ -+ u16 ln_major_key; -+ /* minor hash key */ -+ u16 ln_minor_key; -+ struct list_head ln_major_list; -+ struct list_head ln_minor_list; -+ /* alive list, all locks (granted, blocked, listening) are on it */ -+ struct list_head ln_alive_list; -+ /* blocked list */ -+ struct list_head ln_blocked_list; -+ /* granted list */ -+ struct list_head ln_granted_list; -+ void *ln_ev_target; -+}; -+ -+struct htree_lock { -+ struct task_struct *lk_task; -+ struct htree_lock_head *lk_head; -+ void *lk_private; -+ unsigned lk_depth; -+ htree_lock_mode_t lk_mode; -+ struct list_head lk_blocked_list; -+ struct htree_lock_node lk_nodes[0]; -+}; -+ -+/* create a lock head, which stands for a resource */ -+struct htree_lock_head *htree_lock_head_alloc(unsigned depth, -+ unsigned hbits, unsigned priv); -+/* free a lock head */ -+void htree_lock_head_free(struct htree_lock_head *lhead); -+/* register event callback for child lock at level @depth */ -+void htree_lock_event_attach(struct htree_lock_head *lhead, unsigned depth, -+ unsigned events, htree_event_cb_t callback); -+/* create a lock handle, which stands for a thread */ -+struct htree_lock *htree_lock_alloc(unsigned depth, unsigned pbytes); -+/* free a lock handle */ -+void htree_lock_free(struct htree_lock *lck); -+/* lock htree, when @wait is true, 0 is returned if the lock can't -+ * be granted immediately */ -+int htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead, -+ htree_lock_mode_t mode, int 
wait); -+/* unlock htree */ -+void htree_unlock(struct htree_lock *lck); -+/* unlock and relock htree with @new_mode */ -+int htree_change_lock_try(struct htree_lock *lck, -+ htree_lock_mode_t new_mode, int wait); -+void htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode); -+/* require child lock (key) of htree at level @dep, @event will be sent to all -+ * listeners on this @key while lock being granted */ -+int htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, -+ u32 key, unsigned dep, int wait, void *event); -+/* release child lock at level @dep, this lock will listen on it's key -+ * if @event isn't NULL, event_cb will be called against @lck while granting -+ * any other lock at level @dep with the same key */ -+void htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event); -+/* stop listening on child lock at level @dep */ -+void htree_node_stop_listen(struct htree_lock *lck, unsigned dep); -+/* for debug */ -+void htree_lock_stat_print(int depth); -+void htree_lock_stat_reset(void); -+ -+#define htree_lock(lck, lh, mode) htree_lock_try(lck, lh, mode, 1) -+#define htree_change_lock(lck, mode) htree_change_lock_try(lck, mode, 1) -+ -+#define htree_lock_mode(lck) ((lck)->lk_mode) -+ -+#define htree_node_lock(lck, mode, key, dep) \ -+ htree_node_lock_try(lck, mode, key, dep, 1, NULL) -+/* this is only safe in thread context of lock owner */ -+#define htree_node_is_granted(lck, dep) \ -+ ((lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_INVAL && \ -+ (lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_NL) -+/* this is only safe in thread context of lock owner */ -+#define htree_node_is_listening(lck, dep) \ -+ ((lck)->lk_nodes[dep].ln_mode == HTREE_LOCK_NL) -+ -+#endif diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.6.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.6.series index 3f4f9a2..841cd67 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.6.series +++ 
b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.6.series @@ -11,6 +11,7 @@ rhel6.3/ext4-inode-version.patch rhel6.3/ext4-lookup-dotdot.patch rhel6.3/ext4-print-inum-in-htree-warning.patch rhel6.4/ext4-prealloc.patch +rhel6.3/ext4-use-correct-inode.patch rhel6.3/ext4-mballoc-extra-checks.patch rhel6.4/ext4-misc.patch rhel6.3/ext4-pdir-fix.patch @@ -28,6 +29,7 @@ rhel6.3/ext4-nocmtime-2.6.patch rhel6.3/ext4-journal-callback.patch rhel6.5/ext4-ext-walk-space.patch rhel6.3/ext4-store-tree-generation-at-find.patch +rhel6.3/ext4-large-dir.patch rhel6.3/ext4-pdirop.patch rhel6.4/ext4-extra-isize.patch rhel6.3/ext4-quota-force-block-alloc-quotaoff.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles11.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles11.series index a69ef69..9bbf666 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles11.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles11.series @@ -30,6 +30,7 @@ rhel6.3/ext4-nocmtime-2.6.patch rhel6.3/ext4-export-64bit-name-hash.patch rhel6.3/ext4-journal-callback.patch rhel6.3/ext4-store-tree-generation-at-find.patch +rhel6.3/ext4-large-dir.patch rhel6.3/ext4-pdirop.patch rhel6.3/ext4-quota-force-block-alloc-quotaoff.patch rhel6.3/ext4-quota-dont-update-cmtime.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11.series b/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11.series index dabcea3..6b6d9d2 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11.series @@ -31,7 +31,8 @@ sles11sp2/ext4-disable-mb-cache.patch rhel6.3/ext4-nocmtime-2.6.patch rhel6.3/ext4-export-64bit-name-hash.patch sles11sp2/ext4-store-tree-generation-at-find.patch -sles11sp2/ext4-pdirop.patch +sles11sp2/ext4-large-dir.patch +rhel6.3/ext4-pdirop.patch rhel6.3/ext4-max-dir-size.patch sles11sp2/ext4-max-dir-size-options.patch rhel6.3/ext4-not-discard-preallocation-umount.patch diff --git 
a/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11sp3.series b/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11sp3.series index 025e41a..4d9d6bf 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11sp3.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11sp3.series @@ -30,7 +30,8 @@ sles11sp2/ext4-large-eas.patch sles11sp2/ext4-disable-mb-cache.patch rhel6.3/ext4-nocmtime-2.6.patch sles11sp2/ext4-store-tree-generation-at-find.patch -sles11sp2/ext4-pdirop.patch +sles11sp2/ext4-large-dir.patch +rhel6.3/ext4-pdirop.patch rhel6.3/ext4-max-dir-size.patch sles11sp2/ext4-max-dir-size-options.patch rhel6.3/ext4-not-discard-preallocation-umount.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.series b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.series index 8437fdb..2ae3b1e 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.series @@ -12,6 +12,7 @@ rhel7/ext4-data-in-dirent.patch rhel7/ext4-large-eas.patch rhel7/ext4-disable-mb-cache.patch rhel7/ext4-nocmtime.patch +rhel7/ext4-large-dir.patch rhel7/ext4-pdirop.patch rhel7/ext4-max-dir-size.patch rhel7/ext4-remove-truncate-warning.patch diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h index 296aa2f..2cc5ca6 100644 --- a/lustre/osd-ldiskfs/osd_internal.h +++ b/lustre/osd-ldiskfs/osd_internal.h @@ -168,7 +168,7 @@ struct osd_mdobj_map { }; #define osd_ldiskfs_add_entry(handle, child, cinode, hlock) \ - ldiskfs_add_entry(handle, child, cinode, hlock) + __ldiskfs_add_entry(handle, child, cinode, hlock) #define OSD_OTABLE_IT_CACHE_SIZE 64 #define OSD_OTABLE_IT_CACHE_MASK (~(OSD_OTABLE_IT_CACHE_SIZE - 1)) @@ -1095,13 +1095,13 @@ static inline unsigned long osd_remote_parent_ino(struct osd_device *dev) return dev->od_mdt_map->omm_remote_parent->d_inode->i_ino; } -#ifdef JOURNAL_START_HAS_3ARGS +#ifdef LDISKFS_HT_MISC # define osd_journal_start_sb(sb, type, nblock) \ 
ldiskfs_journal_start_sb(sb, type, nblock) # define osd_ldiskfs_append(handle, inode, nblock, err) \ ldiskfs_append(handle, inode, nblock) # define osd_ldiskfs_find_entry(dir, name, de, inlined, lock) \ - ldiskfs_find_entry(dir, name, de, inlined, lock) + __ldiskfs_find_entry(dir, name, de, inlined, lock) # define osd_journal_start(inode, type, nblocks) \ ldiskfs_journal_start(inode, type, nblocks) # define osd_transaction_size(dev) \ @@ -1113,7 +1113,7 @@ static inline unsigned long osd_remote_parent_ino(struct osd_device *dev) # define osd_ldiskfs_append(handle, inode, nblock, err) \ ldiskfs_append(handle, inode, nblock, err) # define osd_ldiskfs_find_entry(dir, name, de, inlined, lock) \ - ldiskfs_find_entry(dir, name, de, lock) + __ldiskfs_find_entry(dir, name, de, lock) # define osd_journal_start(inode, type, nblocks) \ ldiskfs_journal_start(inode, nblocks) # define osd_transaction_size(dev) \ -- 1.8.3.1