--- /dev/null
+This INCOMPAT_LARGEDIR feature allows larger directories
+to be created in ldiskfs, both with directory sizes over
+2GB and and a maximum htree depth of 3 instead of the
+current limit of 2. These features are needed in order
+to exceed the current limit of approximately 10M entries
+in a single directory.
+
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/ext4.h
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4.h
+@@ -1344,6 +1344,7 @@ EXT4_INODE_BIT_FNS(state, state_flags)
+ #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200
+ #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400
+ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000
++#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000
+
+ #define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
+@@ -1354,7 +1355,8 @@ EXT4_INODE_BIT_FNS(state, state_flags)
+ EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+ EXT4_FEATURE_INCOMPAT_EA_INODE| \
+ EXT4_FEATURE_INCOMPAT_MMP| \
+- EXT4_FEATURE_INCOMPAT_DIRDATA)
++ EXT4_FEATURE_INCOMPAT_DIRDATA| \
++ EXT4_FEATURE_INCOMPAT_LARGEDIR)
+
+ #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+@@ -1612,6 +1614,17 @@ ext4_group_first_block_no(struct super_b
+ */
+ #define ERR_BAD_DX_DIR -75000
+
++/* htree levels for ext4 */
++#define EXT4_HTREE_LEVEL_COMPAT 2
++#define EXT4_HTREE_LEVEL 3
++
++static inline int
++ext4_dir_htree_level(struct super_block *sb)
++{
++ return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
++ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
++}
++
+ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
+ ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
+
+@@ -2005,13 +2018,15 @@ static inline void ext4_r_blocks_count_s
+ es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
+ }
+
+-static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
++static inline loff_t ext4_isize(struct super_block *sb,
++ struct ext4_inode *raw_inode)
+ {
+- if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ||
++ S_ISREG(le16_to_cpu(raw_inode->i_mode)))
+ return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
+ le32_to_cpu(raw_inode->i_size_lo);
+- else
+- return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
++
++ return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
+ }
+
+ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/inode.c
+===================================================================
+--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/inode.c
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/inode.c
+@@ -5470,7 +5470,7 @@ struct inode *ext4_iget(struct super_blo
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
+ ei->i_file_acl |=
+ ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
+- inode->i_size = ext4_isize(raw_inode);
++ inode->i_size = ext4_isize(sb, raw_inode);
+ ei->i_disksize = inode->i_size;
+ #ifdef CONFIG_QUOTA
+ ei->i_reserved_quota = 0;
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/namei.c
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/namei.c
+@@ -225,7 +225,7 @@ struct dx_root_info * dx_get_dx_info(str
+
+ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
+ {
+- return le32_to_cpu(entry->block) & 0x00ffffff;
++ return le32_to_cpu(entry->block) & 0x0fffffff;
+ }
+
+ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
+@@ -388,7 +388,7 @@ dx_probe(const struct qstr *d_name, stru
+ struct dx_frame *frame = frame_in;
+ u32 hash;
+
+- frame->bh = NULL;
++ memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
+ if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
+ goto fail;
+
+@@ -418,9 +418,16 @@ dx_probe(const struct qstr *d_name, stru
+ goto fail;
+ }
+
+- if ((indirect = info->indirect_levels) > 1) {
+- ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
+- info->indirect_levels);
++ indirect = info->indirect_levels;
++ if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
++ ext4_warning(dir->i_sb,
++ "Directory (ino: %lu) htree depth %#06x exceed "
++ "supported value", dir->i_ino,
++ ext4_dir_htree_level(dir->i_sb));
++ if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
++ ext4_warning(dir->i_sb, "Enable large directory "
++ "feature to access it");
++ }
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
+@@ -512,13 +519,18 @@ fail:
+ static void dx_release (struct dx_frame *frames)
+ {
+ struct dx_root_info *info;
++ int i;
++
+ if (frames[0].bh == NULL)
+ return;
+
+ info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data);
+- if (info->indirect_levels)
+- brelse(frames[1].bh);
+- brelse(frames[0].bh);
++ for (i = 0; i <= info->indirect_levels; i++) {
++ if (frames[i].bh == NULL)
++ break;
++ brelse(frames[i].bh);
++ frames[i].bh = NULL;
++ }
+ }
+
+ /*
+@@ -661,7 +673,7 @@ int ext4_htree_fill_tree(struct file *di
+ {
+ struct dx_hash_info hinfo;
+ struct ext4_dir_entry_2 *de;
+- struct dx_frame frames[2], *frame;
++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+ struct inode *dir;
+ ext4_lblk_t block;
+ int count = 0;
+@@ -1003,7 +1015,7 @@ static struct buffer_head * ext4_dx_find
+ struct super_block * sb;
+ struct dx_hash_info hinfo;
+ u32 hash;
+- struct dx_frame frames[2], *frame;
++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+ struct ext4_dir_entry_2 *de, *top;
+ struct buffer_head *bh;
+ ext4_lblk_t block;
+@@ -1443,7 +1455,7 @@ static int add_dirent_to_buf(handle_t *h
+ */
+ dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+ ext4_update_dx_flag(dir);
+- dir->i_version++;
++ inode_inc_iversion(dir);
+ ext4_mark_inode_dirty(handle, dir);
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, dir, bh);
+@@ -1463,7 +1475,7 @@ static int make_indexed_dir(handle_t *ha
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ struct buffer_head *bh2;
+- struct dx_frame frames[2], *frame;
++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+ struct dx_entry *entries;
+ struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
+ char *data1, *top;
+@@ -1712,15 +1724,18 @@ static int ext4_add_entry(handle_t *hand
+ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+ {
+- struct dx_frame frames[2], *frame;
++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+ struct dx_entry *entries, *at;
+ struct dx_hash_info hinfo;
+ struct buffer_head *bh;
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct super_block *sb = dir->i_sb;
+ struct ext4_dir_entry_2 *de;
++ int restart;
+ int err;
+
++again:
++ restart = 0;
+ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
+ if (!frame)
+ return err;
+@@ -1730,33 +1745,48 @@ static int ext4_dx_add_entry(handle_t *h
+ if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
+ goto cleanup;
+
+- BUFFER_TRACE(bh, "get_write_access");
+- err = ext4_journal_get_write_access(handle, bh);
+- if (err)
+- goto journal_error;
+-
+ err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+ if (err != -ENOSPC)
+ goto cleanup;
+
++ err = 0;
+ /* Block full, should compress but for now just split */
+ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
+ dx_get_count(entries), dx_get_limit(entries)));
+ /* Need to split index? */
+ if (dx_get_count(entries) == dx_get_limit(entries)) {
+ ext4_lblk_t newblock;
+- unsigned icount = dx_get_count(entries);
+- int levels = frame - frames;
++ int levels = frame - frames + 1;
++ unsigned icount;
++ int add_level = 1;
+ struct dx_entry *entries2;
+ struct dx_node *node2;
+ struct buffer_head *bh2;
+
+- if (levels && (dx_get_count(frames->entries) ==
+- dx_get_limit(frames->entries))) {
+- ext4_warning(sb, "Directory index full!");
++ while (frame > frames) {
++ if (dx_get_count((frame - 1)->entries) <
++ dx_get_limit((frame - 1)->entries)) {
++ add_level = 0;
++ break;
++ }
++ frame--; /* split higher index block */
++ at = frame->at;
++ entries = frame->entries;
++ restart = 1;
++ }
++ if (add_level && levels == ext4_dir_htree_level(sb)) {
++ ext4_warning(sb, "Directory (ino: %lu) index full, "
++ "reach max htree level :%d",
++ dir->i_ino, levels);
++ if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
++ ext4_warning(sb, "Large directory feature is"
++ "not enabled on this "
++ "filesystem");
++ }
+ err = -ENOSPC;
+ goto cleanup;
+ }
++ icount = dx_get_count(entries);
+ bh2 = ext4_append (handle, dir, &newblock, &err);
+ if (!(bh2))
+ goto cleanup;
+@@ -1769,7 +1799,7 @@ static int ext4_dx_add_entry(handle_t *h
+ err = ext4_journal_get_write_access(handle, frame->bh);
+ if (err)
+ goto journal_error;
+- if (levels) {
++ if (!add_level) {
+ unsigned icount1 = icount/2, icount2 = icount - icount1;
+ unsigned hash2 = dx_get_hash(entries + icount1);
+ dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
+@@ -1777,7 +1807,7 @@ static int ext4_dx_add_entry(handle_t *h
+
+ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
+ err = ext4_journal_get_write_access(handle,
+- frames[0].bh);
++ (frame - 1)->bh);
+ if (err)
+ goto journal_error;
+
+@@ -1793,18 +1823,24 @@ static int ext4_dx_add_entry(handle_t *h
+ frame->entries = entries = entries2;
+ swap(frame->bh, bh2);
+ }
+- dx_insert_block(frames + 0, hash2, newblock);
+- dxtrace(dx_show_index("node", frames[1].entries));
++ dx_insert_block((frame - 1), hash2, newblock);
++ dxtrace(dx_show_index("node", frame->entries));
+ dxtrace(dx_show_index("node",
+ ((struct dx_node *) bh2->b_data)->entries));
+ err = ext4_handle_dirty_metadata(handle, dir, bh2);
+ if (err)
+ goto journal_error;
+ brelse (bh2);
++ ext4_handle_dirty_metadata(handle, dir,
++ (frame - 1)->bh);
++ if (restart) {
++ ext4_handle_dirty_metadata(handle, dir,
++ frame->bh);
++ goto cleanup;
++ }
+ } else {
+ struct dx_root_info * info;
+- dxtrace(printk(KERN_DEBUG
+- "Creating second level index...\n"));
++
+ memcpy((char *) entries2, (char *) entries,
+ icount * sizeof(struct dx_entry));
+ dx_set_limit(entries2, dx_node_limit(dir));
+@@ -1814,19 +1850,16 @@ static int ext4_dx_add_entry(handle_t *h
+ dx_set_block(entries + 0, newblock);
+ info = dx_get_dx_info((struct ext4_dir_entry_2*)
+ frames[0].bh->b_data);
+- info->indirect_levels = 1;
+-
+- /* Add new access path frame */
+- frame = frames + 1;
+- frame->at = at = at - entries + entries2;
+- frame->entries = entries = entries2;
+- frame->bh = bh2;
+- err = ext4_journal_get_write_access(handle,
+- frame->bh);
+- if (err)
+- goto journal_error;
++ info->indirect_levels += 1;
++ dxtrace(printk(KERN_DEBUG
++ "Creating %d level index...\n",
++ info->indirect_levels));
++ ext4_handle_dirty_metadata(handle, dir, frame->bh);
++ ext4_handle_dirty_metadata(handle, dir, bh2);
++ brelse(bh2);
++ restart = 1;
++ goto cleanup;
+ }
+- err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
+ if (err) {
+ ext4_std_error(inode->i_sb, err);
+ goto cleanup;
+@@ -1840,6 +1873,10 @@ cleanup:
+ if (bh)
+ brelse(bh);
+ dx_release(frames);
++ /* @restart is true means htree-path has been changed, we need to
++ * repeat dx_probe() to find out valid htree-path */
++ if (restart && err == 0)
++ goto again;
+ return err;
+ }
+
+@@ -1874,7 +1911,7 @@ int ext4_delete_entry(handle_t *handle,
+ blocksize);
+ else
+ de->inode = 0;
+- dir->i_version++;
++ inode_inc_iversion(dir);
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ ext4_handle_dirty_metadata(handle, dir, bh);
+ return 0;
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
-@@ -1778,6 +1778,19 @@ extern int ext4_orphan_add(handle_t *, s
+@@ -1778,6 +1778,13 @@ extern int ext4_orphan_add(handle_t *, s
extern int ext4_orphan_del(handle_t *, struct inode *);
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
__u32 start_minor_hash, __u32 *next_hash);
+extern struct inode *ext4_create_inode(handle_t *handle,
+ struct inode * dir, int mode);
-+extern int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-+ struct inode *inode);
+extern int ext4_delete_entry(handle_t *handle, struct inode * dir,
+ struct ext4_dir_entry_2 * de_del,
+ struct buffer_head * bh);
-+extern struct buffer_head * ext4_find_entry(struct inode *dir,
-+ const struct qstr *d_name,
-+ struct ext4_dir_entry_2 ** res_dir);
-+#define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir)
+extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
+ struct inode *inode);
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/jbd2.h>
-@@ -873,9 +874,9 @@ static inline int search_dirblock(struct
- * The returned buffer_head has ->b_count elevated. The caller is expected
- * to brelse() it when appropriate.
- */
--static struct buffer_head * ext4_find_entry (struct inode *dir,
-- const struct qstr *d_name,
-- struct ext4_dir_entry_2 ** res_dir)
-+struct buffer_head * ext4_find_entry(struct inode *dir,
-+ const struct qstr *d_name,
-+ struct ext4_dir_entry_2 ** res_dir)
- {
- struct super_block *sb;
- struct buffer_head *bh_use[NAMEI_RA_SIZE];
-@@ -981,6 +982,7 @@ cleanup_and_exit:
- brelse(bh_use[ra_ptr]);
- return ret;
- }
-+EXPORT_SYMBOL(ext4_find_entry);
-
- static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
- struct ext4_dir_entry_2 **res_dir, int *err)
-@@ -1515,8 +1517,8 @@ static int make_indexed_dir(handle_t *ha
- * may not sleep between calling this and putting something into
- * the entry, as someone else might have used it while you slept.
- */
--static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-- struct inode *inode)
-+int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-+ struct inode *inode)
- {
- struct inode *dir = dentry->d_parent->d_inode;
- struct buffer_head *bh;
-@@ -1565,6 +1567,7 @@ static int ext4_add_entry(handle_t *hand
- brelse(bh);
- return retval;
- }
-+EXPORT_SYMBOL(ext4_add_entry);
-
- /*
- * Returns 0 for success, or a negative error value
@@ -1704,10 +1707,10 @@ cleanup:
* ext4_delete_entry deletes a directory entry by merging it with the
* previous entry
---- /dev/null 2011-12-14 22:16:16.000000000 +0800
-+++ linux-2.6.32-131.6.1-pdo/include/linux/htree_lock.h 2011-12-02 17:09:34.000000000 +0800
+Index: linux-2.6.32-504.3.3.el6.x86_64/include/linux/htree_lock.h
+===================================================================
+--- /dev/null
++++ linux-2.6.32-504.3.3.el6.x86_64/include/linux/htree_lock.h
@@ -0,0 +1,187 @@
+/*
+ * include/linux/htree_lock.h
+ ((lck)->lk_nodes[dep].ln_mode == HTREE_LOCK_NL)
+
+#endif
---- /dev/null 2011-12-14 22:16:16.000000000 +0800
-+++ linux-2.6.32-131.6.1-pdo/fs/ext4/htree_lock.c 2011-12-14 22:56:28.000000000 +0800
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/htree_lock.c
+===================================================================
+--- /dev/null
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/htree_lock.c
@@ -0,0 +1,880 @@
+/*
+ * fs/ext4/htree_lock.c
+ kfree(lck);
+}
+EXPORT_SYMBOL(htree_lock_free);
---- linux-2.6.32-131.6.1/fs/ext4/ext4.h 2011-10-06 20:10:49.000000000 +0800
-+++ linux-2.6.32-131.6.1-pdo/fs/ext4/ext4.h 2011-12-08 18:25:00.000000000 +0800
-@@ -28,6 +28,7 @@
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/ext4.h
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4.h
+@@ -27,6 +27,7 @@
#include <linux/mutex.h>
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
#ifdef __KERNEL__
-@@ -1277,6 +1278,7 @@ EXT4_INODE_BIT_FNS(state, state_flags)
- #define EXT4_FEATURE_INCOMPAT_MMP 0x0100
- #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200
- #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000
-+#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000
-
- #define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
- #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
-@@ -1286,7 +1288,8 @@ EXT4_INODE_BIT_FNS(state, state_flags)
- EXT4_FEATURE_INCOMPAT_64BIT| \
- EXT4_FEATURE_INCOMPAT_FLEX_BG| \
- EXT4_FEATURE_INCOMPAT_MMP| \
-- EXT4_FEATURE_INCOMPAT_DIRDATA)
-+ EXT4_FEATURE_INCOMPAT_DIRDATA| \
-+ EXT4_FEATURE_INCOMPAT_LARGEDIR)
-
- #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
-@@ -1536,6 +1539,76 @@ ext4_group_first_block_no(struct super_b
- */
- #define ERR_BAD_DX_DIR -75000
+@@ -1625,6 +1626,71 @@ ext4_dir_htree_level(struct super_block
+ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
+ }
-+/* htree levels for ext4 */
-+#define EXT4_HTREE_LEVEL_COMPAT 2
-+#define EXT4_HTREE_LEVEL 3
-+
-+static inline int
-+ext4_dir_htree_level(struct super_block *sb)
-+{
-+ return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
-+ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
-+}
-+
+/* assume name-hash is protected by upper layer */
+#define EXT4_HTREE_LOCK_HASH 0
+
+ struct inode *dir, unsigned flags);
+#define ext4_htree_unlock(lck) htree_unlock(lck)
+
++extern struct buffer_head * __ext4_find_entry(struct inode *dir,
++ const struct qstr *d_name,
++ struct ext4_dir_entry_2 **res_dir,
++ struct htree_lock *lck);
++extern int __ext4_add_entry(handle_t *handle, struct dentry *dentry,
++ struct inode *inode, struct htree_lock *lck);
void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
-@@ -1769,14 +1842,16 @@ extern int ext4_htree_fill_tree(struct f
- extern struct inode *ext4_create_inode(handle_t *handle,
- struct inode * dir, int mode);
- extern int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-- struct inode *inode);
-+ struct inode *inode, struct htree_lock *lck);
- extern int ext4_delete_entry(handle_t *handle, struct inode * dir,
- struct ext4_dir_entry_2 * de_del,
- struct buffer_head * bh);
- extern struct buffer_head * ext4_find_entry(struct inode *dir,
- const struct qstr *d_name,
-- struct ext4_dir_entry_2 ** res_dir);
--#define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir)
-+ struct ext4_dir_entry_2 **res_dir,
-+ struct htree_lock *lck);
-+#define ll_ext4_find_entry(inode, dentry, res_dir, lck) \
-+ ext4_find_entry(inode, &(dentry)->d_name, res_dir, lck)
- extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
- struct inode *inode, const void *, const void *);
- extern struct buffer_head *ext4_append(handle_t *handle,
-@@ -1893,13 +1968,15 @@ static inline void ext4_r_blocks_count_s
- es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
- }
-
--static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
-+static inline loff_t ext4_isize(struct super_block *sb,
-+ struct ext4_inode *raw_inode)
- {
-- if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
-+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ||
-+ S_ISREG(le16_to_cpu(raw_inode->i_mode)))
- return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
- le32_to_cpu(raw_inode->i_size_lo);
-- else
-- return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
-+
-+ return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
- }
-
- static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
---- linux-2.6.32-131.6.1/fs/ext4/namei.c 2011-10-06 20:10:49.000000000 +0800
-+++ linux-2.6.32-131.6.1-pdo/fs/ext4/namei.c 2011-12-14 22:55:28.000000000 +0800
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/namei.c
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/namei.c
@@ -176,7 +176,7 @@ static struct dx_frame *dx_probe(const s
struct inode *dir,
struct dx_hash_info *hinfo,
/*
* p is at least 6 bytes before the end of page
-@@ -225,7 +225,7 @@ struct dx_root_info * dx_get_dx_info(str
-
- static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
- {
-- return le32_to_cpu(entry->block) & 0x00ffffff;
-+ return le32_to_cpu(entry->block) & 0x0fffffff;
- }
-
- static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
-@@ -368,6 +368,223 @@ struct stats dx_show_entries(struct dx_h
+@@ -368,6 +368,225 @@ struct stats dx_show_entries(struct dx_h
}
#endif /* DX_DEBUG */
+ struct dx_entry *ld_at; /* position of leaf dx_entry */
+};
+
-+#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private)
++#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private)
++#define ext4_find_entry(dir, name, dirent) __ext4_find_entry(dir, name, dirent, NULL)
++#define ext4_add_entry(handle, dentry, inode) __ext4_add_entry(handle, dentry, inode, NULL)
+
+/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */
+#define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32)
/*
* Probe for a directory leaf block to search.
*
-@@ -379,16 +596,17 @@ struct stats dx_show_entries(struct dx_h
+@@ -379,10 +598,11 @@ struct stats dx_show_entries(struct dx_h
*/
static struct dx_frame *
dx_probe(const struct qstr *d_name, struct inode *dir,
struct dx_root_info * info;
struct buffer_head *bh;
struct dx_frame *frame = frame_in;
- u32 hash;
-
-- frame->bh = NULL;
-+ memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
- if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
- goto fail;
-
-@@ -418,9 +636,16 @@ dx_probe(const struct qstr *d_name, stru
- goto fail;
- }
-
-- if ((indirect = info->indirect_levels) > 1) {
-- ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
-- info->indirect_levels);
-+ indirect = info->indirect_levels;
-+ if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
-+ ext4_warning(dir->i_sb,
-+ "Directory (ino: %lu) htree depth %#06x exceed "
-+ "supported value", dir->i_ino,
-+ ext4_dir_htree_level(dir->i_sb));
-+ if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
-+ ext4_warning(dir->i_sb, "Enable large directory "
-+ "feature to access it");
-+ }
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
- goto fail;
-@@ -440,8 +665,15 @@ dx_probe(const struct qstr *d_name, stru
+@@ -447,8 +667,15 @@ dx_probe(const struct qstr *d_name, stru
dxtrace(printk("Look up %x", hash));
while (1)
{
ext4_warning(dir->i_sb,
"dx entry: no count or count > limit");
brelse(bh);
-@@ -482,9 +714,73 @@ dx_probe(const struct qstr *d_name, stru
+@@ -489,9 +716,73 @@ dx_probe(const struct qstr *d_name, stru
frame->bh = bh;
frame->entries = entries;
frame->at = at;
at = entries = ((struct dx_node *) bh->b_data)->entries;
if (dx_get_limit(entries) != dx_node_limit (dir)) {
ext4_warning(dir->i_sb,
-@@ -512,13 +808,18 @@ fail:
- static void dx_release (struct dx_frame *frames)
- {
- struct dx_root_info *info;
-+ int i;
-+
- if (frames[0].bh == NULL)
- return;
-
- info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data);
-- if (info->indirect_levels)
-- brelse(frames[1].bh);
-- brelse(frames[0].bh);
-+ for (i = 0; i <= info->indirect_levels; i++) {
-+ if (frames[i].bh == NULL)
-+ break;
-+ brelse(frames[i].bh);
-+ frames[i].bh = NULL;
-+ }
- }
-
- /*
-@@ -541,7 +842,7 @@ static void dx_release (struct dx_frame
+@@ -553,7 +844,7 @@ static void dx_release (struct dx_frame
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
struct dx_frame *frame,
struct dx_frame *frames,
{
struct dx_frame *p;
struct buffer_head *bh;
-@@ -556,12 +857,22 @@ static int ext4_htree_next_block(struct
+@@ -568,12 +859,22 @@ static int ext4_htree_next_block(struct
* this loop, num_frames indicates the number of interior
* nodes need to be read.
*/
p--;
}
-@@ -584,6 +895,13 @@ static int ext4_htree_next_block(struct
+@@ -596,6 +897,13 @@ static int ext4_htree_next_block(struct
* block so no check is necessary
*/
while (num_frames--) {
if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
0, &err)))
return err; /* Failure */
-@@ -592,6 +910,7 @@ static int ext4_htree_next_block(struct
+@@ -604,6 +912,7 @@ static int ext4_htree_next_block(struct
p->bh = bh;
p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
}
return 1;
}
-@@ -661,7 +980,7 @@ int ext4_htree_fill_tree(struct file *di
- {
- struct dx_hash_info hinfo;
- struct ext4_dir_entry_2 *de;
-- struct dx_frame frames[2], *frame;
-+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
- struct inode *dir;
- ext4_lblk_t block;
- int count = 0;
-@@ -684,10 +1003,10 @@ int ext4_htree_fill_tree(struct file *di
+@@ -696,10 +1005,10 @@ int ext4_htree_fill_tree(struct file *di
}
hinfo.hash = start_hash;
hinfo.minor_hash = 0;
/* Add '.' and '..' from the htree header */
if (!start_hash && !start_minor_hash) {
de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
-@@ -714,7 +1033,7 @@ int ext4_htree_fill_tree(struct file *di
+@@ -726,7 +1035,7 @@ int ext4_htree_fill_tree(struct file *di
count += ret;
hashval = ~0;
ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
*next_hash = hashval;
if (ret < 0) {
err = ret;
-@@ -814,9 +1133,17 @@ static void dx_insert_block(struct dx_fr
+@@ -826,9 +1135,17 @@ static void dx_insert_block(struct dx_fr
static void ext4_update_dx_flag(struct inode *inode)
{
}
/*
-@@ -889,8 +1216,9 @@ static inline int search_dirblock(struct
+@@ -900,9 +1217,10 @@ static inline int search_dirblock(struct
+ * The returned buffer_head has ->b_count elevated. The caller is expected
* to brelse() it when appropriate.
*/
- struct buffer_head * ext4_find_entry(struct inode *dir,
-- const struct qstr *d_name,
-- struct ext4_dir_entry_2 ** res_dir)
-+ const struct qstr *d_name,
-+ struct ext4_dir_entry_2 **res_dir,
-+ struct htree_lock *lck)
+-static struct buffer_head * ext4_find_entry (struct inode *dir,
++struct buffer_head * __ext4_find_entry(struct inode *dir,
+ const struct qstr *d_name,
+- struct ext4_dir_entry_2 ** res_dir)
++ struct ext4_dir_entry_2 **res_dir,
++ struct htree_lock *lck)
{
struct super_block *sb;
struct buffer_head *bh_use[NAMEI_RA_SIZE];
-@@ -911,7 +1239,7 @@ struct buffer_head * ext4_find_entry(str
+@@ -923,7 +1241,7 @@ static struct buffer_head * ext4_find_en
if (namelen > EXT4_NAME_LEN)
return NULL;
if (is_dx(dir)) {
/*
* On success, or if the error was file not found,
* return. Otherwise, fall back to doing a search the
-@@ -921,6 +1249,7 @@ struct buffer_head * ext4_find_entry(str
+@@ -933,6 +1251,7 @@ static struct buffer_head * ext4_find_en
return bh;
dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
"falling back\n"));
}
nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
start = EXT4_I(dir)->i_dir_start_lookup;
-@@ -998,13 +1327,15 @@ cleanup_and_exit:
+@@ -1008,9 +1327,12 @@ cleanup_and_exit:
+ brelse(bh_use[ra_ptr]);
+ return ret;
}
- EXPORT_SYMBOL(ext4_find_entry);
++EXPORT_SYMBOL(__ext4_find_entry);
-static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
- struct ext4_dir_entry_2 **res_dir, int *err)
{
struct super_block * sb;
struct dx_hash_info hinfo;
- u32 hash;
-- struct dx_frame frames[2], *frame;
-+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
- struct ext4_dir_entry_2 *de, *top;
- struct buffer_head *bh;
- ext4_lblk_t block;
-@@ -1015,13 +1346,16 @@ static struct buffer_head * ext4_dx_find
+@@ -1026,13 +1348,16 @@ static struct buffer_head * ext4_dx_find
sb = dir->i_sb;
/* NFS may look up ".." - look at dx_root directory block */
if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
}
hash = hinfo.hash;
do {
-@@ -1050,7 +1384,7 @@ static struct buffer_head * ext4_dx_find
+@@ -1061,7 +1386,7 @@ static struct buffer_head * ext4_dx_find
brelse(bh);
/* Check to see if we should continue to search */
retval = ext4_htree_next_block(dir, hash, frame,
if (retval < 0) {
ext4_warning(sb,
"error reading index page in directory #%lu",
-@@ -1076,7 +1410,7 @@ static struct dentry *ext4_lookup(struct
- if (dentry->d_name.len > EXT4_NAME_LEN)
- return ERR_PTR(-ENAMETOOLONG);
-
-- bh = ext4_find_entry(dir, &dentry->d_name, &de);
-+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
- inode = NULL;
- if (bh) {
- __u32 ino = le32_to_cpu(de->inode);
-@@ -1144,7 +1478,7 @@ struct dentry *ext4_get_parent(struct de
- struct ext4_dir_entry_2 * de;
- struct buffer_head *bh;
-
-- bh = ext4_find_entry(child->d_inode, &dotdot, &de);
-+ bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
- inode = NULL;
- if (!bh)
- return ERR_PTR(-ENOENT);
-@@ -1233,8 +1567,9 @@ static struct ext4_dir_entry_2* dx_pack_
+@@ -1244,8 +1569,9 @@ static struct ext4_dir_entry_2* dx_pack_
* Returns pointer to de in block into which the new entry will be inserted.
*/
static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
{
unsigned blocksize = dir->i_sb->s_blocksize;
unsigned count, continued;
-@@ -1291,7 +1626,14 @@ static struct ext4_dir_entry_2 *do_split
+@@ -1302,7 +1628,14 @@ static struct ext4_dir_entry_2 *do_split
hash2, split, count-split));
/* Fancy dance to stay within two buffers */
de = dx_pack_dirents(data1, blocksize);
de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
blocksize);
-@@ -1300,13 +1642,21 @@ static struct ext4_dir_entry_2 *do_split
+@@ -1311,13 +1644,21 @@ static struct ext4_dir_entry_2 *do_split
dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
err = ext4_handle_dirty_metadata(handle, dir, bh2);
if (err)
goto journal_error;
-@@ -1418,7 +1768,7 @@ static int add_dirent_to_buf(handle_t *h
- if (!IS_NOCMTIME(dir))
- dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
- ext4_update_dx_flag(dir);
-- dir->i_version++;
-+ inode_inc_iversion(dir);
- ext4_mark_inode_dirty(handle, dir);
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, dir, bh);
-@@ -1438,7 +1788,7 @@ static int make_indexed_dir(handle_t *ha
- const char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
- struct buffer_head *bh2;
-- struct dx_frame frames[2], *frame;
-+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
- struct dx_entry *entries;
- struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
- char *data1, *top;
-@@ -1517,7 +1867,7 @@ static int make_indexed_dir(handle_t *ha
- ext4_handle_dirty_metadata(handle, dir, frame->bh);
- ext4_handle_dirty_metadata(handle, dir, bh);
-
+@@ -1558,7 +1899,7 @@ static int make_indexed_dir(handle_t *ha
+ ext4_handle_dirty_metadata(handle, dir, frame->bh);
+ ext4_handle_dirty_metadata(handle, dir, bh);
+
- de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
+ de = do_split(handle,dir, &bh, frames, frame, &hinfo, NULL, &retval);
- if (!de) {
- /*
- * Even if the block split failed, we have to properly write
-@@ -1616,7 +1966,7 @@ out:
+ if (!de) {
+ /*
+ * Even if the block split failed, we have to properly write
+@@ -1664,8 +2005,8 @@ out:
+ * may not sleep between calling this and putting something into
* the entry, as someone else might have used it while you slept.
*/
- int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-- struct inode *inode)
-+ struct inode *inode, struct htree_lock *lck)
+-static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+- struct inode *inode)
++int __ext4_add_entry(handle_t *handle, struct dentry *dentry,
++ struct inode *inode, struct htree_lock *lck)
{
struct inode *dir = dentry->d_parent->d_inode;
struct buffer_head *bh;
-@@ -1635,9 +1985,10 @@ int ext4_add_entry(handle_t *handle, str
+@@ -1684,9 +2025,10 @@ static int ext4_add_entry(handle_t *hand
if (dentry->d_name.len == 2 &&
memcmp(dentry->d_name.name, "..", 2) == 0)
return ext4_update_dotdot(handle, dentry, inode);
ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
dx_fallback++;
ext4_mark_inode_dirty(handle, dir);
-@@ -1674,18 +2025,21 @@ EXPORT_SYMBOL(ext4_add_entry);
+@@ -1717,12 +2059,13 @@ static int ext4_add_entry(handle_t *hand
+ brelse(bh);
+ return retval;
+ }
++EXPORT_SYMBOL(__ext4_add_entry);
+
+ /*
* Returns 0 for success, or a negative error value
*/
static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
- struct inode *inode)
+ struct inode *inode, struct htree_lock *lck)
{
-- struct dx_frame frames[2], *frame;
-+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct dx_entry *entries, *at;
- struct dx_hash_info hinfo;
- struct buffer_head *bh;
- struct inode *dir = dentry->d_parent->d_inode;
- struct super_block *sb = dir->i_sb;
- struct ext4_dir_entry_2 *de;
-+ int restart;
- int err;
+@@ -1736,7 +2079,7 @@ static int ext4_dx_add_entry(handle_t *h
+ again:
+ restart = 0;
- frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
-+again:
-+ restart = 0;
+ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, lck, &err);
if (!frame)
return err;
entries = frame->entries;
-@@ -1694,33 +2048,53 @@ static int ext4_dx_add_entry(handle_t *h
- if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
- goto cleanup;
-
-- BUFFER_TRACE(bh, "get_write_access");
-- err = ext4_journal_get_write_access(handle, bh);
-- if (err)
-- goto journal_error;
--
- err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
- if (err != -ENOSPC)
- goto cleanup;
-
-+ err = 0;
- /* Block full, should compress but for now just split */
- dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
- dx_get_count(entries), dx_get_limit(entries)));
- /* Need to split index? */
- if (dx_get_count(entries) == dx_get_limit(entries)) {
- ext4_lblk_t newblock;
-- unsigned icount = dx_get_count(entries);
-- int levels = frame - frames;
-+ int levels = frame - frames + 1;
-+ unsigned icount;
-+ int add_level = 1;
- struct dx_entry *entries2;
+@@ -1763,6 +2106,11 @@ again:
struct dx_node *node2;
struct buffer_head *bh2;
-- if (levels && (dx_get_count(frames->entries) ==
-- dx_get_limit(frames->entries))) {
-- ext4_warning(sb, "Directory index full!");
+ if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */
+ ext4_htree_safe_relock(lck);
+ restart = 1;
+ goto cleanup;
+ }
-+ while (frame > frames) {
-+ if (dx_get_count((frame - 1)->entries) <
-+ dx_get_limit((frame - 1)->entries)) {
-+ add_level = 0;
-+ break;
-+ }
-+ frame--; /* split higher index block */
-+ at = frame->at;
-+ entries = frame->entries;
-+ restart = 1;
-+ }
-+ if (add_level && levels == ext4_dir_htree_level(sb)) {
-+ ext4_warning(sb, "Directory (ino: %lu) index full, "
-+ "reach max htree level :%d",
-+ dir->i_ino, levels);
-+ if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
-+ ext4_warning(sb, "Large directory feature is"
-+ "not enabled on this "
-+ "filesystem");
-+ }
- err = -ENOSPC;
+ while (frame > frames) {
+ if (dx_get_count((frame - 1)->entries) <
+ dx_get_limit((frame - 1)->entries)) {
+@@ -1860,16 +2208,43 @@ again:
+ restart = 1;
goto cleanup;
}
-+ icount = dx_get_count(entries);
- bh2 = ext4_append (handle, dir, &newblock, &err);
- if (!(bh2))
- goto cleanup;
-@@ -1733,7 +2107,7 @@ static int ext4_dx_add_entry(handle_t *h
- err = ext4_journal_get_write_access(handle, frame->bh);
- if (err)
- goto journal_error;
-- if (levels) {
-+ if (!add_level) {
- unsigned icount1 = icount/2, icount2 = icount - icount1;
- unsigned hash2 = dx_get_hash(entries + icount1);
- dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
-@@ -1741,7 +2115,7 @@ static int ext4_dx_add_entry(handle_t *h
-
- BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
- err = ext4_journal_get_write_access(handle,
-- frames[0].bh);
-+ (frame - 1)->bh);
- if (err)
- goto journal_error;
-
-@@ -1757,18 +2131,24 @@ static int ext4_dx_add_entry(handle_t *h
- frame->entries = entries = entries2;
- swap(frame->bh, bh2);
- }
-- dx_insert_block(frames + 0, hash2, newblock);
-- dxtrace(dx_show_index("node", frames[1].entries));
-+ dx_insert_block((frame - 1), hash2, newblock);
-+ dxtrace(dx_show_index("node", frame->entries));
- dxtrace(dx_show_index("node",
- ((struct dx_node *) bh2->b_data)->entries));
- err = ext4_handle_dirty_metadata(handle, inode, bh2);
- if (err)
- goto journal_error;
- brelse (bh2);
-+ ext4_handle_dirty_metadata(handle, inode,
-+ (frame - 1)->bh);
-+ if (restart) {
-+ ext4_handle_dirty_metadata(handle, inode,
-+ frame->bh);
-+ goto cleanup;
-+ }
- } else {
- struct dx_root_info * info;
-- dxtrace(printk(KERN_DEBUG
-- "Creating second level index...\n"));
-+
- memcpy((char *) entries2, (char *) entries,
- icount * sizeof(struct dx_entry));
- dx_set_limit(entries2, dx_node_limit(dir));
-@@ -1778,32 +2158,60 @@ static int ext4_dx_add_entry(handle_t *h
- dx_set_block(entries + 0, newblock);
- info = dx_get_dx_info((struct ext4_dir_entry_2*)
- frames[0].bh->b_data);
-- info->indirect_levels = 1;
-+ info->indirect_levels += 1;
-+ dxtrace(printk(KERN_DEBUG
-+ "Creating %d level index...\n",
-+ info->indirect_levels));
-+ ext4_handle_dirty_metadata(handle, inode, frame->bh);
-+ ext4_handle_dirty_metadata(handle, inode, bh2);
-+ brelse(bh2);
-+ restart = 1;
-+ goto cleanup;
-+ }
+ } else if (!ext4_htree_dx_locked(lck)) {
+ struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck);
-
-- /* Add new access path frame */
-- frame = frames + 1;
-- frame->at = at = at - entries + entries2;
-- frame->entries = entries = entries2;
-- frame->bh = bh2;
-- err = ext4_journal_get_write_access(handle,
-- frame->bh);
-- if (err)
-- goto journal_error;
++
+ /* not well protected, require DX lock */
+ ext4_htree_dx_need_lock(lck);
+ at = frame > frames ? (frame - 1)->at : NULL;
+ (ld->ld_count != dx_get_count(entries))) {
+ restart = 1;
+ goto cleanup;
- }
-- ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
++ }
+ /* OK, I've got DX lock and nothing changed */
+ frame->at = ld->ld_at;
}
if (bh)
brelse(bh);
dx_release(frames);
-+ /* @restart is true means htree-path has been changed, we need to
-+ * repeat dx_probe() to find out valid htree-path */
-+ if (restart && err == 0)
-+ goto again;
- return err;
- }
-
-@@ -1838,7 +2246,7 @@ int ext4_delete_entry(handle_t *handle,
- blocksize);
- else
- de->inode = 0;
-- dir->i_version++;
-+ inode_inc_iversion(dir);
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- ext4_handle_dirty_metadata(handle, dir, bh);
- return 0;
-@@ -1882,7 +2290,7 @@ static void ext4_dec_count(handle_t *han
- static int ext4_add_nondir(handle_t *handle,
- struct dentry *dentry, struct inode *inode)
- {
-- int err = ext4_add_entry(handle, dentry, inode);
-+ int err = ext4_add_entry(handle, dentry, inode, NULL);
- if (!err) {
- ext4_mark_inode_dirty(handle, inode);
- d_instantiate(dentry, inode);
-@@ -2112,7 +2520,7 @@ retry:
- goto out_stop;
- }
-
-- err = ext4_add_entry(handle, dentry, inode);
-+ err = ext4_add_entry(handle, dentry, inode, NULL);
- if (err)
- goto out_clear_inode;
- ext4_inc_count(handle, dir);
-@@ -2381,7 +2789,7 @@ static int ext4_rmdir(struct inode *dir,
- return PTR_ERR(handle);
-
- retval = -ENOENT;
-- bh = ext4_find_entry(dir, &dentry->d_name, &de);
-+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
- if (!bh)
- goto end_rmdir;
-
-@@ -2443,7 +2851,7 @@ static int ext4_unlink(struct inode *dir
- ext4_handle_sync(handle);
-
- retval = -ENOENT;
-- bh = ext4_find_entry(dir, &dentry->d_name, &de);
-+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
- if (!bh)
- goto end_unlink;
-
-@@ -2567,7 +2975,7 @@ retry:
- ext4_inc_count(handle, inode);
- atomic_inc(&inode->i_count);
-
-- err = ext4_add_entry(handle, dentry, inode);
-+ err = ext4_add_entry(handle, dentry, inode, NULL);
- if (!err) {
- ext4_mark_inode_dirty(handle, inode);
- d_instantiate(dentry, inode);
-@@ -2612,7 +3020,7 @@ static int ext4_rename(struct inode *old
- if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
- ext4_handle_sync(handle);
-
-- old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
-+ old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL);
- /*
- * Check for inode number is _not_ due to possible IO errors.
- * We might rmdir the source, keep it as pwd of some process
-@@ -2625,7 +3033,7 @@ static int ext4_rename(struct inode *old
- goto end_rename;
-
- new_inode = new_dentry->d_inode;
-- new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
-+ new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de, NULL);
- if (new_bh) {
- if (!new_inode) {
- brelse(new_bh);
-@@ -2651,7 +3059,7 @@ static int ext4_rename(struct inode *old
- goto end_rename;
- }
- if (!new_bh) {
-- retval = ext4_add_entry(handle, new_dentry, old_inode);
-+ retval = ext4_add_entry(handle, new_dentry, old_inode, NULL);
- if (retval)
- goto end_rename;
- } else {
-@@ -2693,7 +3101,8 @@ static int ext4_rename(struct inode *old
- struct buffer_head *old_bh2;
- struct ext4_dir_entry_2 *old_de2;
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/Makefile
+===================================================================
+--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/Makefile
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/Makefile
+@@ -6,6 +6,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
-- old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
-+ old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name,
-+ &old_de2, NULL);
- if (old_bh2) {
- retval = ext4_delete_entry(handle, old_dir,
- old_de2, old_bh2);
---- linux-2.6.32-131.6.1/fs/ext4/inode.c 2011-10-06 20:10:49.000000000 +0800
-+++ linux-2.6.32-131.6.1-pdo/fs/ext4/inode.c 2011-12-01 22:02:11.000000000 +0800
-@@ -5112,7 +5112,7 @@ struct inode *ext4_iget(struct super_blo
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
- ei->i_file_acl |=
- ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
-- inode->i_size = ext4_isize(raw_inode);
-+ inode->i_size = ext4_isize(sb, raw_inode);
- ei->i_disksize = inode->i_size;
- #ifdef CONFIG_QUOTA
- ei->i_reserved_quota = 0;
---- linux-2.6.32-131.6.1/fs/ext4/Makefile 2011-10-06 20:10:49.000000000 +0800
-+++ linux-2.6.32-131.6.1-pdo/fs/ext4/Makefile 2011-10-06 12:21:30.000000000 +0800
-@@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
++ htree_lock.o \
ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
-- mmp.o
-+ htree_lock.o mmp.o
+ mmp.o
- ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
- ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
--- /dev/null
+From 5930ea643805feb50a2f8383ae12eb6f10935e49 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Wed, 31 Aug 2011 12:02:51 -0400
+Subject: [PATCH] ext4: call ext4_handle_dirty_metadata with correct inode in
+ ext4_dx_add_entry
+
+ext4_dx_add_entry manipulates bh2 and frames[0].bh, which are two buffer_heads
+that point to directory blocks assigned to the directory inode. However, the
+function calls ext4_handle_dirty_metadata with the inode of the file that's
+being added to the directory, not the directory inode itself. Therefore,
+correct the code to dirty the directory buffers with the directory inode, not
+the file inode.
+
+Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Cc: stable@kernel.org
+---
+ fs/ext4/namei.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
+index f0abe43..a067835 100644
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -1585,7 +1585,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+ dxtrace(dx_show_index("node", frames[1].entries));
+ dxtrace(dx_show_index("node",
+ ((struct dx_node *) bh2->b_data)->entries));
+- err = ext4_handle_dirty_metadata(handle, inode, bh2);
++ err = ext4_handle_dirty_metadata(handle, dir, bh2);
+ if (err)
+ goto journal_error;
+ brelse (bh2);
+@@ -1611,7 +1611,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+ if (err)
+ goto journal_error;
+ }
+- ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
++ err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
++ if (err) {
++ ext4_std_error(inode->i_sb, err);
++ goto cleanup;
++ }
+ }
+ de = do_split(handle, dir, &bh, frame, &hinfo, &err);
+ if (!de)
+--
+2.1.0
+
--- /dev/null
+This INCOMPAT_LARGEDIR feature allows larger directories
+to be created in ldiskfs, both with directory sizes over
+2GB and and a maximum htree depth of 3 instead of the
+current limit of 2. These features are needed in order
+to exceed the current limit of approximately 10M entries
+in a single directory.
+
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h
+===================================================================
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/ext4.h
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h
+@@ -1585,7 +1585,8 @@ static inline void ext4_clear_state_flag
+ EXT4_FEATURE_INCOMPAT_EA_INODE| \
+ EXT4_FEATURE_INCOMPAT_MMP | \
+ EXT4_FEATURE_INCOMPAT_DIRDATA| \
+- EXT4_FEATURE_INCOMPAT_INLINE_DATA)
++ EXT4_FEATURE_INCOMPAT_INLINE_DATA| \
++ EXT4_FEATURE_INCOMPAT_LARGEDIR)
+ #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
+@@ -1999,6 +2000,9 @@ struct mmpd_data {
+ # define NORET_TYPE /**/
+ # define ATTRIB_NORET __attribute__((noreturn))
+ # define NORET_AND noreturn,
++/* htree levels for ext4 */
++#define EXT4_HTREE_LEVEL_COMPAT 2
++#define EXT4_HTREE_LEVEL 3
+
+ struct ext4_xattr_ino_array {
+ unsigned int xia_count; /* # of used item in the array */
+@@ -2472,13 +2476,16 @@ static inline void ext4_r_blocks_count_s
+ es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
+ }
+
+-static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
++static inline loff_t ext4_isize(struct super_block *sb,
++ struct ext4_inode *raw_inode)
+ {
+- if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
++ if (S_ISREG(le16_to_cpu(raw_inode->i_mode)) ||
++ (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) &&
++ S_ISDIR(le16_to_cpu(raw_inode->i_mode))))
+ return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
+ le32_to_cpu(raw_inode->i_size_lo);
+- else
+- return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
++
++ return (loff_t)le32_to_cpu(raw_inode->i_size_lo);
+ }
+
+ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c
+===================================================================
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/namei.c
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c
+@@ -513,7 +513,14 @@ struct dx_root_info * dx_get_dx_info(str
+
+ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
+ {
+- return le32_to_cpu(entry->block) & 0x00ffffff;
++ return le32_to_cpu(entry->block) & 0x0fffffff;
++}
++
++static inline int
++ext4_dir_htree_level(struct super_block *sb)
++{
++ return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
++ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
+ }
+
+ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
+@@ -681,7 +688,7 @@ dx_probe(const struct qstr *d_name, stru
+ struct dx_frame *frame = frame_in;
+ u32 hash;
+
+- frame->bh = NULL;
++ memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
+ bh = ext4_read_dirblock(dir, 0, INDEX);
+ if (IS_ERR(bh)) {
+ *err = PTR_ERR(bh);
+@@ -714,9 +721,16 @@ dx_probe(const struct qstr *d_name, stru
+ goto fail;
+ }
+
+- if ((indirect = info->indirect_levels) > 1) {
+- ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
+- info->indirect_levels);
++ indirect = info->indirect_levels;
++ if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
++ ext4_warning(dir->i_sb,
++ "inode #%lu: comm %s: htree depth %#06x exceed max depth %u",
++ dir->i_ino, current->comm, indirect,
++ ext4_dir_htree_level(dir->i_sb));
++ if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
++ ext4_warning(dir->i_sb, "Enable large directory "
++ "feature to access it");
++ }
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
+@@ -812,13 +826,18 @@ fail:
+ static void dx_release (struct dx_frame *frames)
+ {
+ struct dx_root_info *info;
++ int i;
++
+ if (frames[0].bh == NULL)
+ return;
+
+ info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data);
+- if (info->indirect_levels)
+- brelse(frames[1].bh);
+- brelse(frames[0].bh);
++ for (i = 0; i <= info->indirect_levels; i++) {
++ if (frames[i].bh == NULL)
++ break;
++ brelse(frames[i].bh);
++ frames[i].bh = NULL;
++ }
+ }
+
+ /*
+@@ -960,7 +979,7 @@ int ext4_htree_fill_tree(struct file *di
+ {
+ struct dx_hash_info hinfo;
+ struct ext4_dir_entry_2 *de;
+- struct dx_frame frames[2], *frame;
++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+ struct inode *dir;
+ ext4_lblk_t block;
+ int count = 0;
+@@ -1376,7 +1395,7 @@ static struct buffer_head * ext4_dx_find
+ {
+ struct super_block * sb = dir->i_sb;
+ struct dx_hash_info hinfo;
+- struct dx_frame frames[2], *frame;
++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+ struct buffer_head *bh;
+ ext4_lblk_t block;
+ int retval;
+@@ -1832,7 +1851,7 @@ static int make_indexed_dir(handle_t *ha
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ struct buffer_head *bh2;
+- struct dx_frame frames[2], *frame;
++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+ struct dx_entry *entries;
+ struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
+ struct ext4_dir_entry_tail *t;
+@@ -2117,15 +2136,18 @@ static int ext4_add_entry(handle_t *hand
+ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+ {
+- struct dx_frame frames[2], *frame;
++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+ struct dx_entry *entries, *at;
+ struct dx_hash_info hinfo;
+ struct buffer_head *bh;
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct super_block *sb = dir->i_sb;
+ struct ext4_dir_entry_2 *de;
++ int restart;
+ int err;
+
++again:
++ restart = 0;
+ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
+ if (!frame)
+ return err;
+@@ -2138,33 +2160,48 @@ static int ext4_dx_add_entry(handle_t *h
+ goto cleanup;
+ }
+
+- BUFFER_TRACE(bh, "get_write_access");
+- err = ext4_journal_get_write_access(handle, bh);
+- if (err)
+- goto journal_error;
+-
+ err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+ if (err != -ENOSPC)
+ goto cleanup;
+
++ err = 0;
+ /* Block full, should compress but for now just split */
+ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
+ dx_get_count(entries), dx_get_limit(entries)));
+ /* Need to split index? */
+ if (dx_get_count(entries) == dx_get_limit(entries)) {
+ ext4_lblk_t newblock;
+- unsigned icount = dx_get_count(entries);
+- int levels = frame - frames;
++ int levels = frame - frames + 1;
++ unsigned icount;
++ int add_level = 1;
+ struct dx_entry *entries2;
+ struct dx_node *node2;
+ struct buffer_head *bh2;
+
+- if (levels && (dx_get_count(frames->entries) ==
+- dx_get_limit(frames->entries))) {
+- ext4_warning(sb, "Directory index full!");
++ while (frame > frames) {
++ if (dx_get_count((frame - 1)->entries) <
++ dx_get_limit((frame - 1)->entries)) {
++ add_level = 0;
++ break;
++ }
++ frame--; /* split higher index block */
++ at = frame->at;
++ entries = frame->entries;
++ restart = 1;
++ }
++ if (add_level && levels == ext4_dir_htree_level(sb)) {
++ ext4_warning(sb, "inode %lu: comm %s: index %u: reach max htree level %u",
++ dir->i_ino, current->comm, levels,
++ ext4_dir_htree_level(sb));
++ if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
++ ext4_warning(sb, "Large directory feature is"
++ "not enabled on this "
++ "filesystem");
++ }
+ err = -ENOSPC;
+ goto cleanup;
+ }
++ icount = dx_get_count(entries);
+ bh2 = ext4_append(handle, dir, &newblock);
+ if (IS_ERR(bh2)) {
+ err = PTR_ERR(bh2);
+@@ -2179,7 +2216,7 @@ static int ext4_dx_add_entry(handle_t *h
+ err = ext4_journal_get_write_access(handle, frame->bh);
+ if (err)
+ goto journal_error;
+- if (levels) {
++ if (!add_level) {
+ unsigned icount1 = icount/2, icount2 = icount - icount1;
+ unsigned hash2 = dx_get_hash(entries + icount1);
+ dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
+@@ -2187,7 +2224,7 @@ static int ext4_dx_add_entry(handle_t *h
+
+ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
+ err = ext4_journal_get_write_access(handle,
+- frames[0].bh);
++ (frame - 1)->bh);
+ if (err)
+ goto journal_error;
+
+@@ -2203,19 +2240,25 @@ static int ext4_dx_add_entry(handle_t *h
+ frame->entries = entries = entries2;
+ swap(frame->bh, bh2);
+ }
+- dx_insert_block(frames + 0, hash2, newblock);
+- dxtrace(dx_show_index("node", frames[1].entries));
++ dx_insert_block(frame - 1, hash2, newblock);
++ dxtrace(dx_show_index("node", frame->entries));
+ dxtrace(dx_show_index("node",
+- ((struct dx_node *) bh2->b_data)->entries));
++ ((struct dx_node *)bh2->b_data)->entries));
+ err = ext4_handle_dirty_dx_node(handle, dir, bh2);
+ if (err)
+ goto journal_error;
+ brelse (bh2);
++ ext4_handle_dirty_dirent_node(handle, dir,
++ (frame - 1)->bh);
++ if (restart) {
++ ext4_handle_dirty_dirent_node(handle, dir,
++ frame->bh);
++ goto cleanup;
++ }
+ } else {
+ struct dx_root_info * info;
+- dxtrace(printk(KERN_DEBUG
+- "Creating second level index...\n"));
+- memcpy((char *) entries2, (char *) entries,
++
++ memcpy((char *)entries2, (char *)entries,
+ icount * sizeof(struct dx_entry));
+ dx_set_limit(entries2, dx_node_limit(dir));
+
+@@ -2224,21 +2267,14 @@ static int ext4_dx_add_entry(handle_t *h
+ dx_set_block(entries + 0, newblock);
+ info = dx_get_dx_info((struct ext4_dir_entry_2*)
+ frames[0].bh->b_data);
+- info->indirect_levels = 1;
+-
+- /* Add new access path frame */
+- frame = frames + 1;
+- frame->at = at = at - entries + entries2;
+- frame->entries = entries = entries2;
+- frame->bh = bh2;
+- err = ext4_journal_get_write_access(handle,
+- frame->bh);
+- if (err)
+- goto journal_error;
+- }
+- err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
+- if (err) {
+- ext4_std_error(inode->i_sb, err);
++ info->indirect_levels += 1;
++ dxtrace(printk(KERN_DEBUG
++ "Creating %d level index...\n",
++ info->indirect_levels));
++ ext4_handle_dirty_dirent_node(handle, dir, frame->bh);
++ ext4_handle_dirty_dirent_node(handle, dir, bh2);
++ brelse(bh2);
++ restart = 1;
+ goto cleanup;
+ }
+ }
+@@ -2253,6 +2289,10 @@ journal_error:
+ cleanup:
+ brelse(bh);
+ dx_release(frames);
++ /* @restart is true means htree-path has been changed, we need to
++ * repeat dx_probe() to find out valid htree-path */
++ if (restart && err == 0)
++ goto again;
+ return err;
+ }
+
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/inode.c
+===================================================================
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/inode.c
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/inode.c
+@@ -4056,7 +4056,7 @@ struct inode *ext4_iget(struct super_blo
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
+ ei->i_file_acl |=
+ ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
+- inode->i_size = ext4_isize(raw_inode);
++ inode->i_size = ext4_isize(sb, raw_inode);
+ ei->i_disksize = inode->i_size;
+ #ifdef CONFIG_QUOTA
+ ei->i_reserved_quota = 0;
+@@ -4306,7 +4306,7 @@ static int ext4_do_update_inode(handle_t
+ raw_inode->i_file_acl_high =
+ cpu_to_le16(ei->i_file_acl >> 32);
+ raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
+- if (ei->i_disksize != ext4_isize(raw_inode)) {
++ if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
+ ext4_isize_set(raw_inode, ei->i_disksize);
+ need_datasync = 1;
+ }
===================================================================
--- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/ext4.h
+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h
-@@ -2145,6 +2145,19 @@ extern int ext4_orphan_add(handle_t *, s
+@@ -2145,6 +2145,13 @@ extern int ext4_orphan_add(handle_t *, s
extern int ext4_orphan_del(handle_t *, struct inode *);
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
__u32 start_minor_hash, __u32 *next_hash);
+extern struct inode *ext4_create_inode(handle_t *handle,
+ struct inode * dir, int mode);
-+extern int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-+ struct inode *inode);
+extern int ext4_delete_entry(handle_t *handle, struct inode * dir,
+ struct ext4_dir_entry_2 * de_del,
+ struct buffer_head * bh);
-+extern struct buffer_head * ext4_find_entry(struct inode *dir,
-+ const struct qstr *d_name,
-+ struct ext4_dir_entry_2 ** res_dir,
-+ int *inlined);
+extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
+ struct inode *inode);
extern int search_dir(struct buffer_head *bh,
===================================================================
--- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/namei.c
+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
-@@ -1211,7 +1211,7 @@ static int is_dx_internal_node(struct in
- * The returned buffer_head has ->b_count elevated. The caller is expected
- * to brelse() it when appropriate.
- */
--static struct buffer_head * ext4_find_entry (struct inode *dir,
-+struct buffer_head * ext4_find_entry(struct inode *dir,
- const struct qstr *d_name,
- struct ext4_dir_entry_2 **res_dir,
- int *inlined)
-@@ -1355,6 +1355,7 @@ cleanup_and_exit:
- brelse(bh_use[ra_ptr]);
- return ret;
- }
-+EXPORT_SYMBOL(ext4_find_entry);
-
- static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
- struct ext4_dir_entry_2 **res_dir, int *err)
-@@ -1903,8 +1904,8 @@ static int make_indexed_dir(handle_t *ha
- * may not sleep between calling this and putting something into
- * the entry, as someone else might have used it while you slept.
- */
--static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-- struct inode *inode)
-+int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-+ struct inode *inode)
- {
- struct inode *dir = dentry->d_parent->d_inode;
- struct buffer_head *bh;
-@@ -1979,6 +1980,7 @@ static int ext4_add_entry(handle_t *hand
- ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
- return retval;
- }
-+EXPORT_SYMBOL(ext4_add_entry);
-
- /*
- * Returns 0 for success, or a negative error value
@@ -2165,7 +2167,7 @@ int ext4_generic_delete_entry(handle_t *
return -ENOENT;
}
This patch contains:
- pdirops support for ldiskfs
- - N-level htree directory
- integrate with osd-ldiskfs
-Index: linux-3.10.0-123.13.2.el7.x86_64/include/linux/htree_lock.h
+Index: linux-3.10.0-229.1.2.fc21.x86_64/include/linux/htree_lock.h
===================================================================
--- /dev/null
-+++ linux-3.10.0-123.13.2.el7.x86_64/include/linux/htree_lock.h
++++ linux-3.10.0-229.1.2.fc21.x86_64/include/linux/htree_lock.h
@@ -0,0 +1,187 @@
+/*
+ * include/linux/htree_lock.h
+ ((lck)->lk_nodes[dep].ln_mode == HTREE_LOCK_NL)
+
+#endif
-Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/htree_lock.c
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/htree_lock.c
===================================================================
--- /dev/null
-+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/htree_lock.c
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/htree_lock.c
@@ -0,0 +1,880 @@
+/*
+ * fs/ext4/htree_lock.c
+ htree_lock_mode_t mode, u32 key, unsigned dep,
+ int wait, void *event)
+{
-+ LIST_HEAD (list);
++ LIST_HEAD(list);
+ struct htree_lock *tmp;
+ struct htree_lock *tmp2;
+ u16 major;
+ kfree(lck);
+}
+EXPORT_SYMBOL(htree_lock_free);
-Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/Makefile
===================================================================
---- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/ext4.h
-+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/Makefile
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/Makefile
+@@ -6,6 +6,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
+
+ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
+ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
++ htree_lock.o \
+ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+ mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
+ xattr_trusted.o inline.o
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h
+===================================================================
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/ext4.h
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h
@@ -27,6 +27,7 @@
#include <linux/mutex.h>
#include <linux/timer.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
#include <linux/ratelimit.h>
-@@ -810,6 +811,9 @@ struct ext4_inode_info {
+@@ -821,6 +822,9 @@ struct ext4_inode_info {
__u32 i_dtime;
ext4_fsblk_t i_file_acl;
/*
* i_block_group is the number of the block group which contains
* this file's inode. Constant across the lifetime of the inode,
-@@ -1536,6 +1540,7 @@ static inline void ext4_clear_state_flag
- EXT4_FEATURE_INCOMPAT_META_BG| \
- EXT4_FEATURE_INCOMPAT_EXTENTS| \
- EXT4_FEATURE_INCOMPAT_64BIT| \
-+ EXT4_FEATURE_INCOMPAT_LARGEDIR|\
- EXT4_FEATURE_INCOMPAT_FLEX_BG| \
- EXT4_FEATURE_INCOMPAT_EA_INODE| \
- EXT4_FEATURE_INCOMPAT_MMP | \
-@@ -1954,6 +1959,76 @@ struct mmpd_data {
- # define NORET_TYPE /**/
- # define ATTRIB_NORET __attribute__((noreturn))
- # define NORET_AND noreturn,
-+/* htree levels for ext4 */
-+#define EXT4_HTREE_LEVEL_COMPAT 2
-+#define EXT4_HTREE_LEVEL 3
-+
-+static inline int
-+ext4_dir_htree_level(struct super_block *sb)
-+{
-+ return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
-+ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
-+}
-+
+@@ -1846,6 +1850,71 @@ struct dx_hash_info
+ */
+ #define HASH_NB_ALWAYS 1
+
+/* assume name-hash is protected by upper layer */
+#define EXT4_HTREE_LOCK_HASH 0
+
+ struct inode *dir, unsigned flags);
+#define ext4_htree_unlock(lck) htree_unlock(lck)
+
++extern struct buffer_head *__ext4_find_entry(struct inode *dir,
++ const struct qstr *d_name,
++ struct ext4_dir_entry_2 **res_dir,
++ int *inlined, struct htree_lock *lck);
++extern int __ext4_add_entry(handle_t *handle, struct dentry *dentry,
++ struct inode *inode, struct htree_lock *lck);
- struct ext4_xattr_ino_array {
- unsigned int xia_count; /* # of used item in the array */
-@@ -2050,9 +2125,17 @@ void ext4_insert_dentry(struct inode *in
+ /*
+ * Describe an inode's exact location on disk and in memory
+@@ -2088,9 +2157,17 @@ void ext4_insert_dentry(struct inode *in
const char *name, int namelen, void *data);
static inline void ext4_update_dx_flag(struct inode *inode)
{
}
static unsigned char ext4_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-@@ -2212,14 +2295,14 @@ extern int ext4_htree_fill_tree(struct f
- extern struct inode *ext4_create_inode(handle_t *handle,
- struct inode * dir, int mode);
- extern int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-- struct inode *inode);
-+ struct inode *inode, struct htree_lock *lck);
- extern int ext4_delete_entry(handle_t *handle, struct inode * dir,
- struct ext4_dir_entry_2 * de_del,
- struct buffer_head * bh);
- extern struct buffer_head * ext4_find_entry(struct inode *dir,
- const struct qstr *d_name,
- struct ext4_dir_entry_2 ** res_dir,
-- int *inlined);
-+ int *inlined, struct htree_lock *lck);
- extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
- struct inode *inode, const void *, const void *);
- extern int search_dir(struct buffer_head *bh,
-@@ -2382,13 +2465,15 @@ static inline void ext4_r_blocks_count_s
- es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
- }
-
--static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
-+static inline loff_t ext4_isize(struct super_block *sb,
-+ struct ext4_inode *raw_inode)
- {
-- if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
-+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ||
-+ S_ISREG(le16_to_cpu(raw_inode->i_mode)))
- return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
- le32_to_cpu(raw_inode->i_size_lo);
-- else
-- return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
-+
-+ return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
- }
-
- static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
-Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c
===================================================================
---- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/namei.c
-+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/namei.c
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c
@@ -53,6 +53,7 @@ struct buffer_head *ext4_append(handle_t
ext4_lblk_t *block)
{
+ }
inode->i_size += inode->i_sb->s_blocksize;
EXT4_I(inode)->i_disksize = inode->i_size;
- BUFFER_TRACE(bh, "get_write_access");
+ BUFFER_TRACE(bh, "get_write_access");
err = ext4_journal_get_write_access(handle, bh);
+ up(&ei->i_append_sem);
if (err) {
/* checksumming functions */
void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
-@@ -517,7 +525,7 @@ struct dx_root_info * dx_get_dx_info(str
-
- static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
- {
-- return le32_to_cpu(entry->block) & 0x00ffffff;
-+ return le32_to_cpu(entry->block) & 0x0fffffff;
- }
-
- static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
-@@ -667,6 +675,223 @@ struct stats dx_show_entries(struct dx_h
+@@ -668,6 +676,227 @@ struct stats dx_show_entries(struct dx_h
}
#endif /* DX_DEBUG */
+};
+
+#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private)
++#define ext4_find_entry(dir, name, dirent, inline) \
++ __ext4_find_entry(dir, name, dirent, inline, NULL)
++#define ext4_add_entry(handle, dentry, inode) \
++ __ext4_add_entry(handle, dentry, inode, NULL)
+
+/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */
+#define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32)
/*
* Probe for a directory leaf block to search.
*
-@@ -678,16 +903,17 @@ struct stats dx_show_entries(struct dx_h
+@@ -679,10 +908,11 @@ struct stats dx_show_entries(struct dx_h
*/
static struct dx_frame *
dx_probe(const struct qstr *d_name, struct inode *dir,
struct dx_root_info * info;
struct buffer_head *bh;
struct dx_frame *frame = frame_in;
- u32 hash;
-
-- frame->bh = NULL;
-+ memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
- bh = ext4_read_dirblock(dir, 0, INDEX);
- if (IS_ERR(bh)) {
- *err = PTR_ERR(bh);
-@@ -720,9 +946,16 @@ dx_probe(const struct qstr *d_name, stru
- goto fail;
- }
-
-- if ((indirect = info->indirect_levels) > 1) {
-- ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
-- info->indirect_levels);
-+ indirect = info->indirect_levels;
-+ if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
-+ ext4_warning(dir->i_sb,
-+ "Directory (ino: %lu) htree depth %#06x exceed "
-+ "supported value", dir->i_ino,
-+ ext4_dir_htree_level(dir->i_sb));
-+ if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
-+ ext4_warning(dir->i_sb, "Enable large directory "
-+ "feature to access it");
-+ }
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
- goto fail;
-@@ -742,8 +975,15 @@ dx_probe(const struct qstr *d_name, stru
+@@ -750,8 +980,15 @@ dx_probe(const struct qstr *d_name, stru
dxtrace(printk("Look up %x", hash));
while (1)
{
ext4_warning(dir->i_sb,
"dx entry: no count or count > limit");
brelse(bh);
-@@ -784,7 +1024,70 @@ dx_probe(const struct qstr *d_name, stru
+@@ -792,7 +1029,70 @@ dx_probe(const struct qstr *d_name, stru
frame->bh = bh;
frame->entries = entries;
frame->at = at;
bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
if (IS_ERR(bh)) {
*err = PTR_ERR(bh);
-@@ -818,13 +1121,18 @@ fail:
- static void dx_release (struct dx_frame *frames)
- {
- struct dx_root_info *info;
-+ int i;
-+
- if (frames[0].bh == NULL)
- return;
-
- info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data);
-- if (info->indirect_levels)
-- brelse(frames[1].bh);
-- brelse(frames[0].bh);
-+ for (i = 0; i <= info->indirect_levels; i++) {
-+ if (frames[i].bh == NULL)
-+ break;
-+ brelse(frames[i].bh);
-+ frames[i].bh = NULL;
-+ }
- }
-
- /*
-@@ -847,7 +1155,7 @@ static void dx_release (struct dx_frame
+@@ -860,7 +1160,7 @@ static void dx_release (struct dx_frame
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
struct dx_frame *frame,
struct dx_frame *frames,
{
struct dx_frame *p;
struct buffer_head *bh;
-@@ -862,12 +1170,22 @@ static int ext4_htree_next_block(struct
+@@ -875,12 +1175,22 @@ static int ext4_htree_next_block(struct
* this loop, num_frames indicates the number of interior
* nodes need to be read.
*/
p--;
}
-@@ -890,6 +1208,13 @@ static int ext4_htree_next_block(struct
+@@ -903,6 +1213,13 @@ static int ext4_htree_next_block(struct
* block so no check is necessary
*/
while (num_frames--) {
bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX);
if (IS_ERR(bh))
return PTR_ERR(bh);
-@@ -898,6 +1223,7 @@ static int ext4_htree_next_block(struct
+@@ -911,6 +1228,7 @@ static int ext4_htree_next_block(struct
p->bh = bh;
p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
}
return 1;
}
-@@ -966,7 +1292,7 @@ int ext4_htree_fill_tree(struct file *di
- {
- struct dx_hash_info hinfo;
- struct ext4_dir_entry_2 *de;
-- struct dx_frame frames[2], *frame;
-+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
- struct inode *dir;
- ext4_lblk_t block;
- int count = 0;
-@@ -1000,10 +1326,10 @@ int ext4_htree_fill_tree(struct file *di
+@@ -1013,10 +1331,10 @@ int ext4_htree_fill_tree(struct file *di
}
hinfo.hash = start_hash;
hinfo.minor_hash = 0;
/* Add '.' and '..' from the htree header */
if (!start_hash && !start_minor_hash) {
de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
-@@ -1030,7 +1356,7 @@ int ext4_htree_fill_tree(struct file *di
+@@ -1043,7 +1361,7 @@ int ext4_htree_fill_tree(struct file *di
count += ret;
hashval = ~0;
ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
*next_hash = hashval;
if (ret < 0) {
err = ret;
-@@ -1226,7 +1552,7 @@ static int is_dx_internal_node(struct in
- struct buffer_head * ext4_find_entry(struct inode *dir,
+@@ -1236,10 +1554,10 @@ static int is_dx_internal_node(struct in
+ * The returned buffer_head has ->b_count elevated. The caller is expected
+ * to brelse() it when appropriate.
+ */
+-static struct buffer_head * ext4_find_entry (struct inode *dir,
++struct buffer_head *__ext4_find_entry(struct inode *dir,
const struct qstr *d_name,
struct ext4_dir_entry_2 **res_dir,
- int *inlined)
{
struct super_block *sb;
struct buffer_head *bh_use[NAMEI_RA_SIZE];
-@@ -1270,7 +1596,7 @@ struct buffer_head * ext4_find_entry(str
+@@ -1283,7 +1601,7 @@ static struct buffer_head * ext4_find_en
goto restart;
}
if (is_dx(dir)) {
/*
* On success, or if the error was file not found,
* return. Otherwise, fall back to doing a search the
-@@ -1280,6 +1606,7 @@ struct buffer_head * ext4_find_entry(str
+@@ -1297,6 +1615,7 @@ static struct buffer_head * ext4_find_en
return bh;
dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
"falling back\n"));
}
nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
start = EXT4_I(dir)->i_dir_start_lookup;
-@@ -1369,17 +1696,19 @@ cleanup_and_exit:
+@@ -1389,9 +1708,12 @@ cleanup_and_exit:
+ brelse(bh_use[ra_ptr]);
+ return ret;
}
- EXPORT_SYMBOL(ext4_find_entry);
++EXPORT_SYMBOL(__ext4_find_entry);
-static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
- struct ext4_dir_entry_2 **res_dir, int *err)
-+static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
++static struct buffer_head *ext4_dx_find_entry(struct inode *dir,
+ const struct qstr *d_name,
+ struct ext4_dir_entry_2 **res_dir,
+ struct htree_lock *lck, int *err)
{
struct super_block * sb = dir->i_sb;
struct dx_hash_info hinfo;
-- struct dx_frame frames[2], *frame;
-+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
- struct buffer_head *bh;
+@@ -1400,7 +1722,7 @@ static struct buffer_head * ext4_dx_find
ext4_lblk_t block;
int retval;
return NULL;
do {
block = dx_get_block(frame->at);
-@@ -1403,7 +1732,7 @@ static struct buffer_head * ext4_dx_find
+@@ -1424,7 +1746,7 @@ static struct buffer_head * ext4_dx_find
/* Check to see if we should continue to search */
retval = ext4_htree_next_block(dir, hinfo.hash, frame,
if (retval < 0) {
ext4_warning(sb,
"error reading index page in directory #%lu",
-@@ -1429,7 +1758,7 @@ static struct dentry *ext4_lookup(struct
- if (dentry->d_name.len > EXT4_NAME_LEN)
- return ERR_PTR(-ENAMETOOLONG);
-
-- bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
-+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL, NULL);
- if (IS_ERR(bh))
- return (struct dentry *) bh;
- inode = NULL;
-@@ -1489,7 +1818,7 @@ struct dentry *ext4_get_parent(struct de
- struct ext4_dir_entry_2 * de;
- struct buffer_head *bh;
-
-- bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
-+ bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL, NULL);
- if (IS_ERR(bh))
- return (struct dentry *) bh;
- if (!bh)
-@@ -1559,8 +1888,9 @@ static struct ext4_dir_entry_2* dx_pack_
+@@ -1583,8 +1905,9 @@ static struct ext4_dir_entry_2* dx_pack_
* Returns pointer to de in block into which the new entry will be inserted.
*/
static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
{
unsigned blocksize = dir->i_sb->s_blocksize;
unsigned count, continued;
-@@ -1624,7 +1954,14 @@ static struct ext4_dir_entry_2 *do_split
+@@ -1647,7 +1970,14 @@ static struct ext4_dir_entry_2 *do_split
hash2, split, count-split));
/* Fancy dance to stay within two buffers */
de = dx_pack_dirents(data1, blocksize);
de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
(char *) de,
-@@ -1643,13 +1980,21 @@ static struct ext4_dir_entry_2 *do_split
+@@ -1666,13 +1996,21 @@ static struct ext4_dir_entry_2 *do_split
dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
err = ext4_handle_dirty_dirent_node(handle, dir, bh2);
if (err)
goto journal_error;
-@@ -1809,7 +2154,7 @@ static int add_dirent_to_buf(handle_t *h
- */
- dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
- ext4_update_dx_flag(dir);
-- dir->i_version++;
-+ inode_inc_iversion(dir);
- ext4_mark_inode_dirty(handle, dir);
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_dirent_node(handle, dir, bh);
-@@ -1829,7 +2174,7 @@ static int make_indexed_dir(handle_t *ha
- const char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
- struct buffer_head *bh2;
-- struct dx_frame frames[2], *frame;
-+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
- struct dx_entry *entries;
- struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
- struct ext4_dir_entry_tail *t;
-@@ -1923,7 +2268,7 @@ static int make_indexed_dir(handle_t *ha
+@@ -1945,7 +2283,7 @@ static int make_indexed_dir(handle_t *ha
ext4_handle_dirty_dx_node(handle, dir, frame->bh);
ext4_handle_dirty_dirent_node(handle, dir, bh);
- de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
-+ de = do_split(handle,dir, &bh, frames, frame, &hinfo, NULL, &retval);
++ de = do_split(handle, dir, &bh, frames, frame, &hinfo, NULL, &retval);
if (!de) {
/*
* Even if the block split failed, we have to properly write
-@@ -2030,7 +2375,7 @@ out:
+@@ -2051,8 +2389,8 @@ out:
+ * may not sleep between calling this and putting something into
* the entry, as someone else might have used it while you slept.
*/
- int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-- struct inode *inode)
-+ struct inode *inode, struct htree_lock *lck)
+-static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+- struct inode *inode)
++int __ext4_add_entry(handle_t *handle, struct dentry *dentry,
++ struct inode *inode, struct htree_lock *lck)
{
struct inode *dir = dentry->d_parent->d_inode;
struct buffer_head *bh;
-@@ -2066,9 +2411,10 @@ int ext4_add_entry(handle_t *handle, str
+@@ -2087,9 +2425,10 @@ static int ext4_add_entry(handle_t *hand
if (dentry->d_name.len == 2 &&
memcmp(dentry->d_name.name, "..", 2) == 0)
return ext4_update_dotdot(handle, dentry, inode);
ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
dx_fallback++;
ext4_mark_inode_dirty(handle, dir);
-@@ -2114,18 +2460,21 @@ EXPORT_SYMBOL(ext4_add_entry);
+@@ -2129,12 +2468,13 @@ static int ext4_add_entry(handle_t *hand
+ ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
+ return retval;
+ }
++EXPORT_SYMBOL(__ext4_add_entry);
+
+ /*
* Returns 0 for success, or a negative error value
*/
static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
- struct inode *inode)
+ struct inode *inode, struct htree_lock *lck)
{
-- struct dx_frame frames[2], *frame;
-+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct dx_entry *entries, *at;
- struct dx_hash_info hinfo;
- struct buffer_head *bh;
- struct inode *dir = dentry->d_parent->d_inode;
- struct super_block *sb = dir->i_sb;
- struct ext4_dir_entry_2 *de;
-+ int restart;
- int err;
+@@ -2148,7 +2488,7 @@ static int ext4_dx_add_entry(handle_t *h
+ again:
+ restart = 0;
- frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
-+again:
-+ restart = 0;
+ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, lck, &err);
if (!frame)
return err;
entries = frame->entries;
-@@ -2137,33 +2486,53 @@ static int ext4_dx_add_entry(handle_t *h
- goto cleanup;
- }
-
-- BUFFER_TRACE(bh, "get_write_access");
-- err = ext4_journal_get_write_access(handle, bh);
-- if (err)
-- goto journal_error;
--
- err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
- if (err != -ENOSPC)
- goto cleanup;
-
-+ err = 0;
- /* Block full, should compress but for now just split */
- dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
- dx_get_count(entries), dx_get_limit(entries)));
- /* Need to split index? */
- if (dx_get_count(entries) == dx_get_limit(entries)) {
- ext4_lblk_t newblock;
-- unsigned icount = dx_get_count(entries);
-- int levels = frame - frames;
-+ int levels = frame - frames + 1;
-+ unsigned icount;
-+ int add_level = 1;
- struct dx_entry *entries2;
+@@ -2178,6 +2518,11 @@ again:
struct dx_node *node2;
struct buffer_head *bh2;
-- if (levels && (dx_get_count(frames->entries) ==
-- dx_get_limit(frames->entries))) {
-- ext4_warning(sb, "Directory index full!");
+ if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */
+ ext4_htree_safe_relock(lck);
+ restart = 1;
+ goto cleanup;
+ }
-+ while (frame > frames) {
-+ if (dx_get_count((frame - 1)->entries) <
-+ dx_get_limit((frame - 1)->entries)) {
-+ add_level = 0;
-+ break;
-+ }
-+ frame--; /* split higher index block */
-+ at = frame->at;
-+ entries = frame->entries;
-+ restart = 1;
-+ }
-+ if (add_level && levels == ext4_dir_htree_level(sb)) {
-+ ext4_warning(sb, "Directory (ino: %lu) index full, "
-+ "reach max htree level :%d",
-+ dir->i_ino, levels);
-+ if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
-+ ext4_warning(sb, "Large directory feature is"
-+ "not enabled on this "
-+ "filesystem");
-+ }
- err = -ENOSPC;
+ while (frame > frames) {
+ if (dx_get_count((frame - 1)->entries) <
+ dx_get_limit((frame - 1)->entries)) {
+@@ -2277,16 +2622,43 @@ again:
+ restart = 1;
goto cleanup;
}
-+ icount = dx_get_count(entries);
- bh2 = ext4_append(handle, dir, &newblock);
- if (IS_ERR(bh2)) {
- err = PTR_ERR(bh2);
-@@ -2178,7 +2547,7 @@ static int ext4_dx_add_entry(handle_t *h
- err = ext4_journal_get_write_access(handle, frame->bh);
- if (err)
- goto journal_error;
-- if (levels) {
-+ if (!add_level) {
- unsigned icount1 = icount/2, icount2 = icount - icount1;
- unsigned hash2 = dx_get_hash(entries + icount1);
- dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
-@@ -2186,7 +2555,7 @@ static int ext4_dx_add_entry(handle_t *h
-
- BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
- err = ext4_journal_get_write_access(handle,
-- frames[0].bh);
-+ (frame - 1)->bh);
- if (err)
- goto journal_error;
-
-@@ -2202,18 +2571,24 @@ static int ext4_dx_add_entry(handle_t *h
- frame->entries = entries = entries2;
- swap(frame->bh, bh2);
- }
-- dx_insert_block(frames + 0, hash2, newblock);
-- dxtrace(dx_show_index("node", frames[1].entries));
-+ dx_insert_block((frame - 1), hash2, newblock);
-+ dxtrace(dx_show_index("node", frame->entries));
- dxtrace(dx_show_index("node",
- ((struct dx_node *) bh2->b_data)->entries));
- err = ext4_handle_dirty_dx_node(handle, dir, bh2);
- if (err)
- goto journal_error;
- brelse (bh2);
-+ ext4_handle_dirty_metadata(handle, inode,
-+ (frame - 1)->bh);
-+ if (restart) {
-+ ext4_handle_dirty_metadata(handle, inode,
-+ frame->bh);
-+ goto cleanup;
-+ }
- } else {
- struct dx_root_info * info;
-- dxtrace(printk(KERN_DEBUG
-- "Creating second level index...\n"));
-+
- memcpy((char *) entries2, (char *) entries,
- icount * sizeof(struct dx_entry));
- dx_set_limit(entries2, dx_node_limit(dir));
-@@ -2223,35 +2598,63 @@ static int ext4_dx_add_entry(handle_t *h
- dx_set_block(entries + 0, newblock);
- info = dx_get_dx_info((struct ext4_dir_entry_2*)
- frames[0].bh->b_data);
-- info->indirect_levels = 1;
-+ info->indirect_levels += 1;
-+ dxtrace(printk(KERN_DEBUG
-+ "Creating %d level index...\n",
-+ info->indirect_levels));
-+ ext4_handle_dirty_metadata(handle, inode, frame->bh);
-+ ext4_handle_dirty_metadata(handle, inode, bh2);
-+ brelse(bh2);
-+ restart = 1;
-+ goto cleanup;
-+ }
+ } else if (!ext4_htree_dx_locked(lck)) {
+ struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck);
-
-- /* Add new access path frame */
-- frame = frames + 1;
-- frame->at = at = at - entries + entries2;
-- frame->entries = entries = entries2;
-- frame->bh = bh2;
-- err = ext4_journal_get_write_access(handle,
-- frame->bh);
-- if (err)
-- goto journal_error;
++
+ /* not well protected, require DX lock */
+ ext4_htree_dx_need_lock(lck);
+ at = frame > frames ? (frame - 1)->at : NULL;
+ (ld->ld_count != dx_get_count(entries))) {
+ restart = 1;
+ goto cleanup;
- }
-- err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
++ }
+ /* OK, I've got DX lock and nothing changed */
+ frame->at = ld->ld_at;
- if (err) {
- ext4_std_error(inode->i_sb, err);
- goto cleanup;
- }
}
- de = do_split(handle, dir, &bh, frame, &hinfo, &err);
+ de = do_split(handle, dir, &bh, frames, frame, &hinfo, lck, &err);
+ ext4_htree_de_unlock(lck);
brelse(bh);
dx_release(frames);
-+ /* @restart is true means htree-path has been changed, we need to
-+ * repeat dx_probe() to find out valid htree-path */
-+ if (restart && err == 0)
-+ goto again;
- return err;
- }
-
-@@ -2288,7 +2691,7 @@ int ext4_generic_delete_entry(handle_t *
- blocksize);
- else
- de->inode = 0;
-- dir->i_version++;
-+ inode_inc_iversion(dir);
- return 0;
- }
- i += ext4_rec_len_from_disk(de->rec_len, blocksize);
-@@ -2373,7 +2776,7 @@ EXPORT_SYMBOL(ext4_dec_count);
- static int ext4_add_nondir(handle_t *handle,
- struct dentry *dentry, struct inode *inode)
- {
-- int err = ext4_add_entry(handle, dentry, inode);
-+ int err = ext4_add_entry(handle, dentry, inode, NULL);
- if (!err) {
- ext4_mark_inode_dirty(handle, inode);
- unlock_new_inode(inode);
-@@ -2641,7 +3044,7 @@ retry:
- goto out_clear_inode;
- err = ext4_mark_inode_dirty(handle, inode);
- if (!err)
-- err = ext4_add_entry(handle, dentry, inode);
-+ err = ext4_add_entry(handle, dentry, inode, NULL);
- if (err) {
- out_clear_inode:
- clear_nlink(inode);
-@@ -2907,7 +3310,7 @@ static int ext4_rmdir(struct inode *dir,
- dquot_initialize(dentry->d_inode);
-
- retval = -ENOENT;
-- bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
-+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL, NULL);
- if (IS_ERR(bh))
- return PTR_ERR(bh);
- if (!bh)
-@@ -2974,7 +3377,7 @@ static int ext4_unlink(struct inode *dir
- dquot_initialize(dentry->d_inode);
-
- retval = -ENOENT;
-- bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
-+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL, NULL);
- if (IS_ERR(bh))
- return PTR_ERR(bh);
- if (!bh)
-@@ -3153,7 +3556,7 @@ retry:
- ext4_inc_count(handle, inode);
- ihold(inode);
-
-- err = ext4_add_entry(handle, dentry, inode);
-+ err = ext4_add_entry(handle, dentry, inode, NULL);
- if (!err) {
- ext4_mark_inode_dirty(handle, inode);
- d_instantiate(dentry, inode);
-@@ -3183,7 +3556,7 @@ retry:
- struct buffer_head *bh;
- struct ext4_dir_entry_2 *de;
-
-- bh = ext4_find_entry(dir, d_name, &de, NULL);
-+ bh = ext4_find_entry(dir, d_name, &de, NULL, NULL);
- if (IS_ERR(bh))
- return PTR_ERR(bh);
- if (bh) {
-@@ -3230,7 +3633,7 @@ static int ext4_rename(struct inode *old
- if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
- ext4_handle_sync(handle);
-
-- old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
-+ old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL, NULL);
- if (IS_ERR(old.bh))
- return PTR_ERR(old.bh);
- /*
-@@ -3244,7 +3647,7 @@ static int ext4_rename(struct inode *old
-
- new_inode = new_dentry->d_inode;
- new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
-- &new.de, &new.inlined);
-+ &new.de, &new.inlined, NULL);
- if (IS_ERR(new.bh)) {
- if (!new_inode) {
- brelse(new_bh);
-@@ -3275,7 +3678,7 @@ static int ext4_rename(struct inode *old
- goto end_rename;
- }
- if (!new.bh) {
-- retval = ext4_add_entry(handle, new.dentry, old.inode);
-+ retval = ext4_add_entry(handle, new.dentry, old.inode, NULL);
- if (retval)
- goto end_rename;
- } else {
-@@ -3375,7 +3678,7 @@ static int ext4_rename(struct inode *old
- dquot_initialize(new.dir);
-
- old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
-- &old.de, &old.inlined);
-+ &old.de, &old.inlined, NULL);
- /*
- * Check for inode number is _not_ due to possible IO errors.
- * We might rmdir the source, keep it as pwd of some process
-@@ -3475,7 +3678,7 @@ static int ext4_rename(struct inode *old
- goto end_rename;
-
- new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
-- &new.de, &new.inlined);
-+ &new.de, &new.inlined, NULL);
-
- /* RENAME_EXCHANGE case: old *and* new must both exist */
- if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
-Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/inode.c
-===================================================================
---- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/inode.c
-+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/inode.c
-@@ -4264,7 +4264,7 @@ struct inode *ext4_iget(struct super_blo
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
- ei->i_file_acl |=
- ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
-- inode->i_size = ext4_isize(raw_inode);
-+ inode->i_size = ext4_isize(sb, raw_inode);
- ei->i_disksize = inode->i_size;
- #ifdef CONFIG_QUOTA
- ei->i_reserved_quota = 0;
-@@ -4499,7 +4499,7 @@ static int ext4_do_update_inode(handle_t
- raw_inode->i_file_acl_high =
- cpu_to_le16(ei->i_file_acl >> 32);
- raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
-- if (ei->i_disksize != ext4_isize(raw_inode)) {
-+ if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
- ext4_isize_set(raw_inode, ei->i_disksize);
- need_datasync = 1;
- }
-Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/Makefile
-===================================================================
---- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/Makefile
-+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/Makefile
-@@ -8,7 +8,7 @@ ext4-y := balloc.o bitmap.o dir.o file.o
- ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
- ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
- mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
-- xattr_trusted.o inline.o
-+ xattr_trusted.o inline.o htree_lock.o
-
- ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
- ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
-Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/super.c
+ /* @restart is true means htree-path has been changed, we need to
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/super.c
===================================================================
---- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/super.c
-+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/super.c
-@@ -872,6 +872,7 @@ static struct inode *ext4_alloc_inode(st
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/super.c
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/super.c
+@@ -875,6 +875,7 @@ static struct inode *ext4_alloc_inode(st
ei->vfs_inode.i_version = 1;
- spin_lock_init(&ei->i_raw_lock);
+ spin_lock_init(&ei->i_raw_lock);
+ sema_init(&ei->i_append_sem, 1);
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
--- /dev/null
+This INCOMPAT_LARGEDIR feature allows larger directories
+to be created in ldiskfs, both with directory sizes over
+2GB and and a maximum htree depth of 3 instead of the
+current limit of 2. These features are needed in order
+to exceed the current limit of approximately 10M entries
+in a single directory.
+
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/ext4.h
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4.h
+@@ -1344,6 +1344,7 @@ EXT4_INODE_BIT_FNS(state, state_flags)
+ #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200
+ #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400
+ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
++#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000
+
+ #define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
+@@ -1354,7 +1355,8 @@ EXT4_INODE_BIT_FNS(state, state_flags)
+ EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+ EXT4_FEATURE_INCOMPAT_EA_INODE| \
+ EXT4_FEATURE_INCOMPAT_MMP| \
+- EXT4_FEATURE_INCOMPAT_DIRDATA)
++ EXT4_FEATURE_INCOMPAT_DIRDATA| \
++ EXT4_FEATURE_INCOMPAT_LARGEDIR)
+
+ #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+@@ -1612,6 +1614,17 @@ ext4_group_first_block_no(struct super_b
+ */
+ #define ERR_BAD_DX_DIR -75000
+
++/* htree levels for ext4 */
++#define EXT4_HTREE_LEVEL_COMPAT 2
++#define EXT4_HTREE_LEVEL 3
++
++static inline int
++ext4_dir_htree_level(struct super_block *sb)
++{
++ return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
++ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
++}
++
+ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
+ ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
+
+@@ -2005,13 +2018,15 @@ static inline void ext4_r_blocks_count_s
+ es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
+ }
+
+-static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
++static inline loff_t ext4_isize(struct super_block *sb,
++ struct ext4_inode *raw_inode)
+ {
+- if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ||
++ S_ISREG(le16_to_cpu(raw_inode->i_mode)))
+ return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
+ le32_to_cpu(raw_inode->i_size_lo);
+- else
+- return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
++
++ return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
+ }
+
+ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/inode.c
+===================================================================
+--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/inode.c
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/inode.c
+@@ -5470,7 +5470,7 @@ struct inode *ext4_iget(struct super_blo
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
+ ei->i_file_acl |=
+ ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
+- inode->i_size = ext4_isize(raw_inode);
++ inode->i_size = ext4_isize(sb, raw_inode);
+ ei->i_disksize = inode->i_size;
+ #ifdef CONFIG_QUOTA
+ ei->i_reserved_quota = 0;
+@@ -5654,7 +5654,7 @@ static int ext4_do_update_inode(handle_t
+ raw_inode->i_file_acl_high =
+ cpu_to_le16(ei->i_file_acl >> 32);
+ raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
+- if (ei->i_disksize != ext4_isize(raw_inode)) {
++ if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
+ ext4_isize_set(raw_inode, ei->i_disksize);
+ need_datasync = 1;
+ }
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/namei.c
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/namei.c
+@@ -225,7 +225,7 @@ struct dx_root_info * dx_get_dx_info(str
+
+ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
+ {
+- return le32_to_cpu(entry->block) & 0x00ffffff;
++ return le32_to_cpu(entry->block) & 0x0fffffff;
+ }
+
+ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
+@@ -388,7 +388,7 @@ dx_probe(const struct qstr *d_name, stru
+ struct dx_frame *frame = frame_in;
+ u32 hash;
+
+- frame->bh = NULL;
++ memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
+ if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
+ goto fail;
+
+@@ -418,9 +418,16 @@ dx_probe(const struct qstr *d_name, stru
+ goto fail;
+ }
+
+- if ((indirect = info->indirect_levels) > 1) {
+- ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
+- info->indirect_levels);
++ indirect = info->indirect_levels;
++ if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
++ ext4_warning(dir->i_sb,
++ "Directory (ino: %lu) htree depth %#06x exceed "
++ "supported value", dir->i_ino,
++ ext4_dir_htree_level(dir->i_sb));
++ if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
++ ext4_warning(dir->i_sb, "Enable large directory "
++ "feature to access it");
++ }
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
+@@ -512,13 +519,18 @@ fail:
+ static void dx_release (struct dx_frame *frames)
+ {
+ struct dx_root_info *info;
++ int i;
++
+ if (frames[0].bh == NULL)
+ return;
+
+ info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data);
+- if (info->indirect_levels)
+- brelse(frames[1].bh);
+- brelse(frames[0].bh);
++ for (i = 0; i <= info->indirect_levels; i++) {
++ if (frames[i].bh == NULL)
++ break;
++ brelse(frames[i].bh);
++ frames[i].bh = NULL;
++ }
+ }
+
+ /*
+@@ -661,7 +673,7 @@ int ext4_htree_fill_tree(struct file *di
+ {
+ struct dx_hash_info hinfo;
+ struct ext4_dir_entry_2 *de;
+- struct dx_frame frames[2], *frame;
++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+ struct inode *dir;
+ ext4_lblk_t block;
+ int count = 0;
+@@ -1003,7 +1015,7 @@ static struct buffer_head * ext4_dx_find
+ struct super_block * sb;
+ struct dx_hash_info hinfo;
+ u32 hash;
+- struct dx_frame frames[2], *frame;
++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+ struct buffer_head *bh;
+ ext4_lblk_t block;
+ struct ext4_dir_entry_2 *de, *top;
+@@ -1443,7 +1455,7 @@ static int add_dirent_to_buf(handle_t *h
+ */
+ dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+ ext4_update_dx_flag(dir);
+- dir->i_version++;
++ inode_inc_iversion(dir);
+ ext4_mark_inode_dirty(handle, dir);
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, dir, bh);
+@@ -1463,7 +1475,7 @@ static int make_indexed_dir(handle_t *ha
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ struct buffer_head *bh2;
+- struct dx_frame frames[2], *frame;
++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+ struct dx_entry *entries;
+ struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
+ char *data1, *top;
+@@ -1712,15 +1724,18 @@ static int ext4_add_entry(handle_t *hand
+ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+ {
+- struct dx_frame frames[2], *frame;
++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+ struct dx_entry *entries, *at;
+ struct dx_hash_info hinfo;
+ struct buffer_head *bh;
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct super_block *sb = dir->i_sb;
+ struct ext4_dir_entry_2 *de;
++ int restart;
+ int err;
+
++again:
++ restart = 0;
+ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
+ if (!frame)
+ return err;
+@@ -1730,33 +1745,48 @@ static int ext4_dx_add_entry(handle_t *h
+ if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
+ goto cleanup;
+
+- BUFFER_TRACE(bh, "get_write_access");
+- err = ext4_journal_get_write_access(handle, bh);
+- if (err)
+- goto journal_error;
+-
+ err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+ if (err != -ENOSPC)
+ goto cleanup;
+
++ err = 0;
+ /* Block full, should compress but for now just split */
+ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
+ dx_get_count(entries), dx_get_limit(entries)));
+ /* Need to split index? */
+ if (dx_get_count(entries) == dx_get_limit(entries)) {
+ ext4_lblk_t newblock;
+- unsigned icount = dx_get_count(entries);
+- int levels = frame - frames;
++ int levels = frame - frames + 1;
++ unsigned icount;
++ int add_level = 1;
+ struct dx_entry *entries2;
+ struct dx_node *node2;
+ struct buffer_head *bh2;
+
+- if (levels && (dx_get_count(frames->entries) ==
+- dx_get_limit(frames->entries))) {
+- ext4_warning(sb, "Directory index full!");
++ while (frame > frames) {
++ if (dx_get_count((frame - 1)->entries) <
++ dx_get_limit((frame - 1)->entries)) {
++ add_level = 0;
++ break;
++ }
++ frame--; /* split higher index block */
++ at = frame->at;
++ entries = frame->entries;
++ restart = 1;
++ }
++ if (add_level && levels == ext4_dir_htree_level(sb)) {
++ ext4_warning(sb, "Directory (ino: %lu) index full, "
++ "reach max htree level :%d",
++ dir->i_ino, levels);
++ if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
++ ext4_warning(sb, "Large directory feature is"
++ "not enabled on this "
++ "filesystem");
++ }
+ err = -ENOSPC;
+ goto cleanup;
+ }
++ icount = dx_get_count(entries);
+ bh2 = ext4_append (handle, dir, &newblock, &err);
+ if (!(bh2))
+ goto cleanup;
+@@ -1769,7 +1799,7 @@ static int ext4_dx_add_entry(handle_t *h
+ err = ext4_journal_get_write_access(handle, frame->bh);
+ if (err)
+ goto journal_error;
+- if (levels) {
++ if (!add_level) {
+ unsigned icount1 = icount/2, icount2 = icount - icount1;
+ unsigned hash2 = dx_get_hash(entries + icount1);
+ dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
+@@ -1777,7 +1807,7 @@ static int ext4_dx_add_entry(handle_t *h
+
+ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
+ err = ext4_journal_get_write_access(handle,
+- frames[0].bh);
++ (frame - 1)->bh);
+ if (err)
+ goto journal_error;
+
+@@ -1793,18 +1823,24 @@ static int ext4_dx_add_entry(handle_t *h
+ frame->entries = entries = entries2;
+ swap(frame->bh, bh2);
+ }
+- dx_insert_block(frames + 0, hash2, newblock);
+- dxtrace(dx_show_index("node", frames[1].entries));
++ dx_insert_block((frame - 1), hash2, newblock);
++ dxtrace(dx_show_index("node", frame->entries));
+ dxtrace(dx_show_index("node",
+ ((struct dx_node *) bh2->b_data)->entries));
+ err = ext4_handle_dirty_metadata(handle, dir, bh2);
+ if (err)
+ goto journal_error;
+ brelse (bh2);
++ ext4_handle_dirty_metadata(handle, dir,
++ (frame - 1)->bh);
++ if (restart) {
++ ext4_handle_dirty_metadata(handle, dir,
++ frame->bh);
++ goto cleanup;
++ }
+ } else {
+ struct dx_root_info * info;
+- dxtrace(printk(KERN_DEBUG
+- "Creating second level index...\n"));
++
+ memcpy((char *) entries2, (char *) entries,
+ icount * sizeof(struct dx_entry));
+ dx_set_limit(entries2, dx_node_limit(dir));
+@@ -1814,19 +1850,16 @@ static int ext4_dx_add_entry(handle_t *h
+ dx_set_block(entries + 0, newblock);
+ info = dx_get_dx_info((struct ext4_dir_entry_2*)
+ frames[0].bh->b_data);
+- info->indirect_levels = 1;
+-
+- /* Add new access path frame */
+- frame = frames + 1;
+- frame->at = at = at - entries + entries2;
+- frame->entries = entries = entries2;
+- frame->bh = bh2;
+- err = ext4_journal_get_write_access(handle,
+- frame->bh);
+- if (err)
+- goto journal_error;
++ info->indirect_levels += 1;
++ dxtrace(printk(KERN_DEBUG
++ "Creating %d level index...\n",
++ info->indirect_levels));
++ ext4_handle_dirty_metadata(handle, dir, frame->bh);
++ ext4_handle_dirty_metadata(handle, dir, bh2);
++ brelse(bh2);
++ restart = 1;
++ goto cleanup;
+ }
+- err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
+ if (err) {
+ ext4_std_error(inode->i_sb, err);
+ goto cleanup;
+@@ -1840,6 +1873,10 @@ cleanup:
+ if (bh)
+ brelse(bh);
+ dx_release(frames);
++ /* @restart is true means htree-path has been changed, we need to
++ * repeat dx_probe() to find out valid htree-path */
++ if (restart && err == 0)
++ goto again;
+ return err;
+ }
+
+@@ -1874,7 +1911,7 @@ int ext4_delete_entry(handle_t *handle,
+ blocksize);
+ else
+ de->inode = 0;
+- dir->i_version++;
++ inode_inc_iversion(dir);
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ ext4_handle_dirty_metadata(handle, dir, bh);
+ return 0;
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
-@@ -1895,6 +1895,19 @@ extern int ext4_orphan_add(handle_t *, s
+@@ -1895,6 +1895,13 @@ extern int ext4_orphan_add(handle_t *, s
extern int ext4_orphan_del(handle_t *, struct inode *);
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
__u32 start_minor_hash, __u32 *next_hash);
+extern struct inode *ext4_create_inode(handle_t *handle,
+ struct inode * dir, int mode);
-+extern int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-+ struct inode *inode);
+extern int ext4_delete_entry(handle_t *handle, struct inode * dir,
+ struct ext4_dir_entry_2 * de_del,
+ struct buffer_head * bh);
-+extern struct buffer_head * ext4_find_entry(struct inode *dir,
-+ const struct qstr *d_name,
-+ struct ext4_dir_entry_2 ** res_dir);
-+#define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir)
+extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
+ struct inode *inode);
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/jbd2.h>
-@@ -873,9 +874,9 @@ static inline int search_dirblock(struct
- * The returned buffer_head has ->b_count elevated. The caller is expected
- * to brelse() it when appropriate.
- */
--static struct buffer_head * ext4_find_entry (struct inode *dir,
-- const struct qstr *d_name,
-- struct ext4_dir_entry_2 ** res_dir)
-+struct buffer_head * ext4_find_entry(struct inode *dir,
-+ const struct qstr *d_name,
-+ struct ext4_dir_entry_2 ** res_dir)
- {
- struct super_block *sb;
- struct buffer_head *bh_use[NAMEI_RA_SIZE];
-@@ -981,6 +982,7 @@ cleanup_and_exit:
- brelse(bh_use[ra_ptr]);
- return ret;
- }
-+EXPORT_SYMBOL(ext4_find_entry);
-
- static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
- struct ext4_dir_entry_2 **res_dir, int *err)
-@@ -1503,8 +1505,8 @@ static int make_indexed_dir(handle_t *ha
- * may not sleep between calling this and putting something into
- * the entry, as someone else might have used it while you slept.
- */
--static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-- struct inode *inode)
-+int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-+ struct inode *inode)
- {
- struct inode *dir = dentry->d_parent->d_inode;
- struct buffer_head *bh;
-@@ -1555,6 +1557,7 @@ static int ext4_add_entry(handle_t *hand
- ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
- return retval;
- }
-+EXPORT_SYMBOL(ext4_add_entry);
-
- /*
- * Returns 0 for success, or a negative error value
@@ -1698,10 +1701,10 @@ cleanup:
* ext4_delete_entry deletes a directory entry by merging it with the
* previous entry
+++ /dev/null
----
- fs/ext4/Makefile | 2
- fs/ext4/ext4.h | 93 ++++
- fs/ext4/htree_lock.c | 880 +++++++++++++++++++++++++++++++++++++++++++++
- fs/ext4/inode.c | 4
- fs/ext4/namei.c | 585 +++++++++++++++++++++++++----
- include/linux/htree_lock.h | 187 +++++++++
- 6 files changed, 1650 insertions(+), 101 deletions(-)
-
---- a/fs/ext4/Makefile
-+++ b/fs/ext4/Makefile
-@@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
- ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
- ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
- ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
-- mmp.o
-+ htree_lock.o mmp.o
-
- ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
- ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
---- a/fs/ext4/ext4.h
-+++ b/fs/ext4/ext4.h
-@@ -28,6 +28,7 @@
- #include <linux/mutex.h>
- #include <linux/timer.h>
- #include <linux/wait.h>
-+#include <linux/htree_lock.h>
- #include <linux/blockgroup_lock.h>
- #include <linux/percpu_counter.h>
- #ifdef __KERNEL__
-@@ -1402,6 +1403,7 @@ static inline void ext4_clear_state_flag
- #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200
- #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */
- #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
-+#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000
-
- #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
- #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
-@@ -1427,7 +1429,8 @@ static inline void ext4_clear_state_flag
- EXT4_FEATURE_INCOMPAT_FLEX_BG| \
- EXT4_FEATURE_INCOMPAT_EA_INODE| \
- EXT4_FEATURE_INCOMPAT_MMP| \
-- EXT4_FEATURE_INCOMPAT_DIRDATA)
-+ EXT4_FEATURE_INCOMPAT_DIRDATA| \
-+ EXT4_FEATURE_INCOMPAT_LARGEDIR)
-
- #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
-@@ -1690,6 +1693,76 @@ ext4_group_first_block_no(struct super_b
- */
- #define ERR_BAD_DX_DIR -75000
-
-+/* htree levels for ext4 */
-+#define EXT4_HTREE_LEVEL_COMPAT 2
-+#define EXT4_HTREE_LEVEL 3
-+
-+static inline int
-+ext4_dir_htree_level(struct super_block *sb)
-+{
-+ return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
-+ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
-+}
-+
-+/* assume name-hash is protected by upper layer */
-+#define EXT4_HTREE_LOCK_HASH 0
-+
-+enum ext4_pdo_lk_types {
-+#if EXT4_HTREE_LOCK_HASH
-+ EXT4_LK_HASH,
-+#endif
-+ EXT4_LK_DX, /* index block */
-+ EXT4_LK_DE, /* directory entry block */
-+ EXT4_LK_SPIN, /* spinlock */
-+ EXT4_LK_MAX,
-+};
-+
-+/* read-only bit */
-+#define EXT4_LB_RO(b) (1 << (b))
-+/* read + write, high bits for writer */
-+#define EXT4_LB_RW(b) ((1 << (b)) | (1 << (EXT4_LK_MAX + (b))))
-+
-+enum ext4_pdo_lock_bits {
-+ /* DX lock bits */
-+ EXT4_LB_DX_RO = EXT4_LB_RO(EXT4_LK_DX),
-+ EXT4_LB_DX = EXT4_LB_RW(EXT4_LK_DX),
-+ /* DE lock bits */
-+ EXT4_LB_DE_RO = EXT4_LB_RO(EXT4_LK_DE),
-+ EXT4_LB_DE = EXT4_LB_RW(EXT4_LK_DE),
-+ /* DX spinlock bits */
-+ EXT4_LB_SPIN_RO = EXT4_LB_RO(EXT4_LK_SPIN),
-+ EXT4_LB_SPIN = EXT4_LB_RW(EXT4_LK_SPIN),
-+ /* accurate searching */
-+ EXT4_LB_EXACT = EXT4_LB_RO(EXT4_LK_MAX << 1),
-+};
-+
-+enum ext4_pdo_lock_opc {
-+ /* external */
-+ EXT4_HLOCK_READDIR = (EXT4_LB_DE_RO | EXT4_LB_DX_RO),
-+ EXT4_HLOCK_LOOKUP = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO |
-+ EXT4_LB_EXACT),
-+ EXT4_HLOCK_DEL = (EXT4_LB_DE | EXT4_LB_SPIN_RO |
-+ EXT4_LB_EXACT),
-+ EXT4_HLOCK_ADD = (EXT4_LB_DE | EXT4_LB_SPIN_RO),
-+
-+ /* internal */
-+ EXT4_HLOCK_LOOKUP_SAFE = (EXT4_LB_DE_RO | EXT4_LB_DX_RO |
-+ EXT4_LB_EXACT),
-+ EXT4_HLOCK_DEL_SAFE = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT),
-+ EXT4_HLOCK_SPLIT = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN),
-+};
-+
-+extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits);
-+#define ext4_htree_lock_head_free(lhead) htree_lock_head_free(lhead)
-+
-+extern struct htree_lock *ext4_htree_lock_alloc(void);
-+#define ext4_htree_lock_free(lck) htree_lock_free(lck)
-+
-+extern void ext4_htree_lock(struct htree_lock *lck,
-+ struct htree_lock_head *lhead,
-+ struct inode *dir, unsigned flags);
-+#define ext4_htree_unlock(lck) htree_unlock(lck)
-+
- void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
- ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
-
-@@ -1964,14 +2037,16 @@ extern int ext4_htree_fill_tree(struct f
- extern struct inode *ext4_create_inode(handle_t *handle,
- struct inode * dir, int mode);
- extern int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-- struct inode *inode);
-+ struct inode *inode, struct htree_lock *lck);
- extern int ext4_delete_entry(handle_t *handle, struct inode * dir,
- struct ext4_dir_entry_2 * de_del,
- struct buffer_head * bh);
- extern struct buffer_head * ext4_find_entry(struct inode *dir,
- const struct qstr *d_name,
-- struct ext4_dir_entry_2 ** res_dir);
--#define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir)
-+ struct ext4_dir_entry_2 **res_dir,
-+ struct htree_lock *lck);
-+#define ll_ext4_find_entry(inode, dentry, res_dir, lck) \
-+ ext4_find_entry(inode, &(dentry)->d_name, res_dir, lck)
- extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
- struct inode *inode, const void *, const void *);
- extern struct buffer_head *ext4_append(handle_t *handle,
-@@ -2104,13 +2179,15 @@ static inline void ext4_r_blocks_count_s
- es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
- }
-
--static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
-+static inline loff_t ext4_isize(struct super_block *sb,
-+ struct ext4_inode *raw_inode)
- {
-- if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
-+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ||
-+ S_ISREG(le16_to_cpu(raw_inode->i_mode)))
- return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
- le32_to_cpu(raw_inode->i_size_lo);
-- else
-- return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
-+
-+ return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
- }
-
- static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
---- /dev/null
-+++ b/fs/ext4/htree_lock.c
-@@ -0,0 +1,880 @@
-+/*
-+ * fs/ext4/htree_lock.c
-+ *
-+ * Copyright (c) 2011, 2012, Intel Corporation.
-+ *
-+ * Author: Liang Zhen <liang@whamcloud.com>
-+ */
-+#include <linux/jbd2.h>
-+#include <linux/hash.h>
-+#include <linux/module.h>
-+#include <linux/htree_lock.h>
-+
-+enum {
-+ HTREE_LOCK_BIT_EX = (1 << HTREE_LOCK_EX),
-+ HTREE_LOCK_BIT_PW = (1 << HTREE_LOCK_PW),
-+ HTREE_LOCK_BIT_PR = (1 << HTREE_LOCK_PR),
-+ HTREE_LOCK_BIT_CW = (1 << HTREE_LOCK_CW),
-+ HTREE_LOCK_BIT_CR = (1 << HTREE_LOCK_CR),
-+};
-+
-+enum {
-+ HTREE_LOCK_COMPAT_EX = 0,
-+ HTREE_LOCK_COMPAT_PW = HTREE_LOCK_COMPAT_EX | HTREE_LOCK_BIT_CR,
-+ HTREE_LOCK_COMPAT_PR = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_PR,
-+ HTREE_LOCK_COMPAT_CW = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_CW,
-+ HTREE_LOCK_COMPAT_CR = HTREE_LOCK_COMPAT_CW | HTREE_LOCK_BIT_PR |
-+ HTREE_LOCK_BIT_PW,
-+};
-+
-+static int htree_lock_compat[] = {
-+ [HTREE_LOCK_EX] HTREE_LOCK_COMPAT_EX,
-+ [HTREE_LOCK_PW] HTREE_LOCK_COMPAT_PW,
-+ [HTREE_LOCK_PR] HTREE_LOCK_COMPAT_PR,
-+ [HTREE_LOCK_CW] HTREE_LOCK_COMPAT_CW,
-+ [HTREE_LOCK_CR] HTREE_LOCK_COMPAT_CR,
-+};
-+
-+/* max allowed htree-lock depth.
-+ * We only need depth=3 for ext4 although user can have higher value. */
-+#define HTREE_LOCK_DEP_MAX 16
-+
-+#ifdef HTREE_LOCK_DEBUG
-+
-+static char *hl_name[] = {
-+ [HTREE_LOCK_EX] "EX",
-+ [HTREE_LOCK_PW] "PW",
-+ [HTREE_LOCK_PR] "PR",
-+ [HTREE_LOCK_CW] "CW",
-+ [HTREE_LOCK_CR] "CR",
-+};
-+
-+/* lock stats */
-+struct htree_lock_node_stats {
-+ unsigned long long blocked[HTREE_LOCK_MAX];
-+ unsigned long long granted[HTREE_LOCK_MAX];
-+ unsigned long long retried[HTREE_LOCK_MAX];
-+ unsigned long long events;
-+};
-+
-+struct htree_lock_stats {
-+ struct htree_lock_node_stats nodes[HTREE_LOCK_DEP_MAX];
-+ unsigned long long granted[HTREE_LOCK_MAX];
-+ unsigned long long blocked[HTREE_LOCK_MAX];
-+};
-+
-+static struct htree_lock_stats hl_stats;
-+
-+void htree_lock_stat_reset(void)
-+{
-+ memset(&hl_stats, 0, sizeof(hl_stats));
-+}
-+
-+void htree_lock_stat_print(int depth)
-+{
-+ int i;
-+ int j;
-+
-+ printk(KERN_DEBUG "HTREE LOCK STATS:\n");
-+ for (i = 0; i < HTREE_LOCK_MAX; i++) {
-+ printk(KERN_DEBUG "[%s]: G [%10llu], B [%10llu]\n",
-+ hl_name[i], hl_stats.granted[i], hl_stats.blocked[i]);
-+ }
-+ for (i = 0; i < depth; i++) {
-+ printk(KERN_DEBUG "HTREE CHILD [%d] STATS:\n", i);
-+ for (j = 0; j < HTREE_LOCK_MAX; j++) {
-+ printk(KERN_DEBUG
-+ "[%s]: G [%10llu], B [%10llu], R [%10llu]\n",
-+ hl_name[j], hl_stats.nodes[i].granted[j],
-+ hl_stats.nodes[i].blocked[j],
-+ hl_stats.nodes[i].retried[j]);
-+ }
-+ }
-+}
-+
-+#define lk_grant_inc(m) do { hl_stats.granted[m]++; } while (0)
-+#define lk_block_inc(m) do { hl_stats.blocked[m]++; } while (0)
-+#define ln_grant_inc(d, m) do { hl_stats.nodes[d].granted[m]++; } while (0)
-+#define ln_block_inc(d, m) do { hl_stats.nodes[d].blocked[m]++; } while (0)
-+#define ln_retry_inc(d, m) do { hl_stats.nodes[d].retried[m]++; } while (0)
-+#define ln_event_inc(d) do { hl_stats.nodes[d].events++; } while (0)
-+
-+#else /* !DEBUG */
-+
-+void htree_lock_stat_reset(void) {}
-+void htree_lock_stat_print(int depth) {}
-+
-+#define lk_grant_inc(m) do {} while (0)
-+#define lk_block_inc(m) do {} while (0)
-+#define ln_grant_inc(d, m) do {} while (0)
-+#define ln_block_inc(d, m) do {} while (0)
-+#define ln_retry_inc(d, m) do {} while (0)
-+#define ln_event_inc(d) do {} while (0)
-+
-+#endif /* DEBUG */
-+
-+EXPORT_SYMBOL(htree_lock_stat_reset);
-+EXPORT_SYMBOL(htree_lock_stat_print);
-+
-+#define HTREE_DEP_ROOT (-1)
-+
-+#define htree_spin_lock(lhead, dep) \
-+ bit_spin_lock((dep) + 1, &(lhead)->lh_lock)
-+#define htree_spin_unlock(lhead, dep) \
-+ bit_spin_unlock((dep) + 1, &(lhead)->lh_lock)
-+
-+#define htree_key_event_ignore(child, ln) \
-+ (!((child)->lc_events & (1 << (ln)->ln_mode)))
-+
-+static int
-+htree_key_list_empty(struct htree_lock_node *ln)
-+{
-+ return list_empty(&ln->ln_major_list) && list_empty(&ln->ln_minor_list);
-+}
-+
-+static void
-+htree_key_list_del_init(struct htree_lock_node *ln)
-+{
-+ struct htree_lock_node *tmp = NULL;
-+
-+ if (!list_empty(&ln->ln_minor_list)) {
-+ tmp = list_entry(ln->ln_minor_list.next,
-+ struct htree_lock_node, ln_minor_list);
-+ list_del_init(&ln->ln_minor_list);
-+ }
-+
-+ if (list_empty(&ln->ln_major_list))
-+ return;
-+
-+ if (tmp == NULL) { /* not on minor key list */
-+ list_del_init(&ln->ln_major_list);
-+ } else {
-+ BUG_ON(!list_empty(&tmp->ln_major_list));
-+ list_replace_init(&ln->ln_major_list, &tmp->ln_major_list);
-+ }
-+}
-+
-+static void
-+htree_key_list_replace_init(struct htree_lock_node *old,
-+ struct htree_lock_node *new)
-+{
-+ if (!list_empty(&old->ln_major_list))
-+ list_replace_init(&old->ln_major_list, &new->ln_major_list);
-+
-+ if (!list_empty(&old->ln_minor_list))
-+ list_replace_init(&old->ln_minor_list, &new->ln_minor_list);
-+}
-+
-+static void
-+htree_key_event_enqueue(struct htree_lock_child *child,
-+ struct htree_lock_node *ln, int dep, void *event)
-+{
-+ struct htree_lock_node *tmp;
-+
-+ /* NB: ALWAYS called holding lhead::lh_lock(dep) */
-+ BUG_ON(ln->ln_mode == HTREE_LOCK_NL);
-+ if (event == NULL || htree_key_event_ignore(child, ln))
-+ return;
-+
-+ /* shouldn't be a very long list */
-+ list_for_each_entry(tmp, &ln->ln_alive_list, ln_alive_list) {
-+ if (tmp->ln_mode == HTREE_LOCK_NL) {
-+ ln_event_inc(dep);
-+ if (child->lc_callback != NULL)
-+ child->lc_callback(tmp->ln_ev_target, event);
-+ }
-+ }
-+}
-+
-+static int
-+htree_node_lock_enqueue(struct htree_lock *newlk, struct htree_lock *curlk,
-+ unsigned dep, int wait, void *event)
-+{
-+ struct htree_lock_child *child = &newlk->lk_head->lh_children[dep];
-+ struct htree_lock_node *newln = &newlk->lk_nodes[dep];
-+ struct htree_lock_node *curln = &curlk->lk_nodes[dep];
-+
-+ /* NB: ALWAYS called holding lhead::lh_lock(dep) */
-+ /* NB: we only expect PR/PW lock mode at here, only these two modes are
-+ * allowed for htree_node_lock(asserted in htree_node_lock_internal),
-+ * NL is only used for listener, user can't directly require NL mode */
-+ if ((curln->ln_mode == HTREE_LOCK_NL) ||
-+ (curln->ln_mode != HTREE_LOCK_PW &&
-+ newln->ln_mode != HTREE_LOCK_PW)) {
-+ /* no conflict, attach it on granted list of @curlk */
-+ if (curln->ln_mode != HTREE_LOCK_NL) {
-+ list_add(&newln->ln_granted_list,
-+ &curln->ln_granted_list);
-+ } else {
-+ /* replace key owner */
-+ htree_key_list_replace_init(curln, newln);
-+ }
-+
-+ list_add(&newln->ln_alive_list, &curln->ln_alive_list);
-+ htree_key_event_enqueue(child, newln, dep, event);
-+ ln_grant_inc(dep, newln->ln_mode);
-+ return 1; /* still hold lh_lock */
-+ }
-+
-+ if (!wait) { /* can't grant and don't want to wait */
-+ ln_retry_inc(dep, newln->ln_mode);
-+ newln->ln_mode = HTREE_LOCK_INVAL;
-+ return -1; /* don't wait and just return -1 */
-+ }
-+
-+ newlk->lk_task = current;
-+ set_current_state(TASK_UNINTERRUPTIBLE);
-+ /* conflict, attach it on blocked list of curlk */
-+ list_add_tail(&newln->ln_blocked_list, &curln->ln_blocked_list);
-+ list_add(&newln->ln_alive_list, &curln->ln_alive_list);
-+ ln_block_inc(dep, newln->ln_mode);
-+
-+ htree_spin_unlock(newlk->lk_head, dep);
-+ /* wait to be given the lock */
-+ if (newlk->lk_task != NULL)
-+ schedule();
-+ /* granted, no doubt, wake up will set me RUNNING */
-+ if (event == NULL || htree_key_event_ignore(child, newln))
-+ return 0; /* granted without lh_lock */
-+
-+ htree_spin_lock(newlk->lk_head, dep);
-+ htree_key_event_enqueue(child, newln, dep, event);
-+ return 1; /* still hold lh_lock */
-+}
-+
-+/*
-+ * get PR/PW access to particular tree-node according to @dep and @key,
-+ * it will return -1 if @wait is false and can't immediately grant this lock.
-+ * All listeners(HTREE_LOCK_NL) on @dep and with the same @key will get
-+ * @event if it's not NULL.
-+ * NB: ALWAYS called holding lhead::lh_lock
-+ */
-+static int
-+htree_node_lock_internal(struct htree_lock_head *lhead, struct htree_lock *lck,
-+ htree_lock_mode_t mode, u32 key, unsigned dep,
-+ int wait, void *event)
-+{
-+ LIST_HEAD (list);
-+ struct htree_lock *tmp;
-+ struct htree_lock *tmp2;
-+ u16 major;
-+ u16 minor;
-+ u8 reverse;
-+ u8 ma_bits;
-+ u8 mi_bits;
-+
-+ BUG_ON(mode != HTREE_LOCK_PW && mode != HTREE_LOCK_PR);
-+ BUG_ON(htree_node_is_granted(lck, dep));
-+
-+ key = hash_long(key, lhead->lh_hbits);
-+
-+ mi_bits = lhead->lh_hbits >> 1;
-+ ma_bits = lhead->lh_hbits - mi_bits;
-+
-+ lck->lk_nodes[dep].ln_major_key = major = key & ((1U << ma_bits) - 1);
-+ lck->lk_nodes[dep].ln_minor_key = minor = key >> ma_bits;
-+ lck->lk_nodes[dep].ln_mode = mode;
-+
-+ /*
-+ * The major key list is an ordered list, so searches are started
-+ * at the end of the list that is numerically closer to major_key,
-+ * so at most half of the list will be walked (for well-distributed
-+ * keys). The list traversal aborts early if the expected key
-+ * location is passed.
-+ */
-+ reverse = (major >= (1 << (ma_bits - 1)));
-+
-+ if (reverse) {
-+ list_for_each_entry_reverse(tmp,
-+ &lhead->lh_children[dep].lc_list,
-+ lk_nodes[dep].ln_major_list) {
-+ if (tmp->lk_nodes[dep].ln_major_key == major) {
-+ goto search_minor;
-+
-+ } else if (tmp->lk_nodes[dep].ln_major_key < major) {
-+ /* attach _after_ @tmp */
-+ list_add(&lck->lk_nodes[dep].ln_major_list,
-+ &tmp->lk_nodes[dep].ln_major_list);
-+ goto out_grant_major;
-+ }
-+ }
-+
-+ list_add(&lck->lk_nodes[dep].ln_major_list,
-+ &lhead->lh_children[dep].lc_list);
-+ goto out_grant_major;
-+
-+ } else {
-+ list_for_each_entry(tmp, &lhead->lh_children[dep].lc_list,
-+ lk_nodes[dep].ln_major_list) {
-+ if (tmp->lk_nodes[dep].ln_major_key == major) {
-+ goto search_minor;
-+
-+ } else if (tmp->lk_nodes[dep].ln_major_key > major) {
-+ /* insert _before_ @tmp */
-+ list_add_tail(&lck->lk_nodes[dep].ln_major_list,
-+ &tmp->lk_nodes[dep].ln_major_list);
-+ goto out_grant_major;
-+ }
-+ }
-+
-+ list_add_tail(&lck->lk_nodes[dep].ln_major_list,
-+ &lhead->lh_children[dep].lc_list);
-+ goto out_grant_major;
-+ }
-+
-+ search_minor:
-+ /*
-+ * NB: minor_key list doesn't have a "head", @list is just a
-+ * temporary stub for helping list searching, make sure it's removed
-+ * after searching.
-+ * minor_key list is an ordered list too.
-+ */
-+ list_add_tail(&list, &tmp->lk_nodes[dep].ln_minor_list);
-+
-+ reverse = (minor >= (1 << (mi_bits - 1)));
-+
-+ if (reverse) {
-+ list_for_each_entry_reverse(tmp2, &list,
-+ lk_nodes[dep].ln_minor_list) {
-+ if (tmp2->lk_nodes[dep].ln_minor_key == minor) {
-+ goto out_enqueue;
-+
-+ } else if (tmp2->lk_nodes[dep].ln_minor_key < minor) {
-+ /* attach _after_ @tmp2 */
-+ list_add(&lck->lk_nodes[dep].ln_minor_list,
-+ &tmp2->lk_nodes[dep].ln_minor_list);
-+ goto out_grant_minor;
-+ }
-+ }
-+
-+ list_add(&lck->lk_nodes[dep].ln_minor_list, &list);
-+
-+ } else {
-+ list_for_each_entry(tmp2, &list,
-+ lk_nodes[dep].ln_minor_list) {
-+ if (tmp2->lk_nodes[dep].ln_minor_key == minor) {
-+ goto out_enqueue;
-+
-+ } else if (tmp2->lk_nodes[dep].ln_minor_key > minor) {
-+ /* insert _before_ @tmp2 */
-+ list_add_tail(&lck->lk_nodes[dep].ln_minor_list,
-+ &tmp2->lk_nodes[dep].ln_minor_list);
-+ goto out_grant_minor;
-+ }
-+ }
-+
-+ list_add_tail(&lck->lk_nodes[dep].ln_minor_list, &list);
-+ }
-+
-+ out_grant_minor:
-+ if (list.next == &lck->lk_nodes[dep].ln_minor_list) {
-+ /* new lock @lck is the first one on minor_key list, which
-+ * means it has the smallest minor_key and it should
-+ * replace @tmp as minor_key owner */
-+ list_replace_init(&tmp->lk_nodes[dep].ln_major_list,
-+ &lck->lk_nodes[dep].ln_major_list);
-+ }
-+ /* remove the temporary head */
-+ list_del(&list);
-+
-+ out_grant_major:
-+ ln_grant_inc(dep, lck->lk_nodes[dep].ln_mode);
-+ return 1; /* granted with holding lh_lock */
-+
-+ out_enqueue:
-+ list_del(&list); /* remove temprary head */
-+ return htree_node_lock_enqueue(lck, tmp2, dep, wait, event);
-+}
-+
-+/*
-+ * release the key of @lck at level @dep, and grant any blocked locks.
-+ * caller will still listen on @key if @event is not NULL, which means
-+ * caller can see a event (by event_cb) while granting any lock with
-+ * the same key at level @dep.
-+ * NB: ALWAYS called holding lhead::lh_lock
-+ * NB: listener will not block anyone because listening mode is HTREE_LOCK_NL
-+ */
-+static void
-+htree_node_unlock_internal(struct htree_lock_head *lhead,
-+ struct htree_lock *curlk, unsigned dep, void *event)
-+{
-+ struct htree_lock_node *curln = &curlk->lk_nodes[dep];
-+ struct htree_lock *grtlk = NULL;
-+ struct htree_lock_node *grtln;
-+ struct htree_lock *poslk;
-+ struct htree_lock *tmplk;
-+
-+ if (!htree_node_is_granted(curlk, dep))
-+ return;
-+
-+ if (!list_empty(&curln->ln_granted_list)) {
-+ /* there is another granted lock */
-+ grtlk = list_entry(curln->ln_granted_list.next,
-+ struct htree_lock,
-+ lk_nodes[dep].ln_granted_list);
-+ list_del_init(&curln->ln_granted_list);
-+ }
-+
-+ if (grtlk == NULL && !list_empty(&curln->ln_blocked_list)) {
-+ /*
-+ * @curlk is the only granted lock, so we confirmed:
-+ * a) curln is key owner (attached on major/minor_list),
-+ * so if there is any blocked lock, it should be attached
-+ * on curln->ln_blocked_list
-+ * b) we always can grant the first blocked lock
-+ */
-+ grtlk = list_entry(curln->ln_blocked_list.next,
-+ struct htree_lock,
-+ lk_nodes[dep].ln_blocked_list);
-+ BUG_ON(grtlk->lk_task == NULL);
-+ wake_up_process(grtlk->lk_task);
-+ }
-+
-+ if (event != NULL &&
-+ lhead->lh_children[dep].lc_events != HTREE_EVENT_DISABLE) {
-+ curln->ln_ev_target = event;
-+ curln->ln_mode = HTREE_LOCK_NL; /* listen! */
-+ } else {
-+ curln->ln_mode = HTREE_LOCK_INVAL;
-+ }
-+
-+ if (grtlk == NULL) { /* I must be the only one locking this key */
-+ struct htree_lock_node *tmpln;
-+
-+ BUG_ON(htree_key_list_empty(curln));
-+
-+ if (curln->ln_mode == HTREE_LOCK_NL) /* listening */
-+ return;
-+
-+ /* not listening */
-+ if (list_empty(&curln->ln_alive_list)) { /* no more listener */
-+ htree_key_list_del_init(curln);
-+ return;
-+ }
-+
-+ tmpln = list_entry(curln->ln_alive_list.next,
-+ struct htree_lock_node, ln_alive_list);
-+
-+ BUG_ON(tmpln->ln_mode != HTREE_LOCK_NL);
-+
-+ htree_key_list_replace_init(curln, tmpln);
-+ list_del_init(&curln->ln_alive_list);
-+
-+ return;
-+ }
-+
-+ /* have a granted lock */
-+ grtln = &grtlk->lk_nodes[dep];
-+ if (!list_empty(&curln->ln_blocked_list)) {
-+ /* only key owner can be on both lists */
-+ BUG_ON(htree_key_list_empty(curln));
-+
-+ if (list_empty(&grtln->ln_blocked_list)) {
-+ list_add(&grtln->ln_blocked_list,
-+ &curln->ln_blocked_list);
-+ }
-+ list_del_init(&curln->ln_blocked_list);
-+ }
-+ /*
-+ * NB: this is the tricky part:
-+ * We have only two modes for child-lock (PR and PW), also,
-+ * only owner of the key (attached on major/minor_list) can be on
-+ * both blocked_list and granted_list, so @grtlk must be one
-+ * of these two cases:
-+ *
-+ * a) @grtlk is taken from granted_list, which means we've granted
-+ * more than one lock so @grtlk has to be PR, the first blocked
-+ * lock must be PW and we can't grant it at all.
-+ * So even @grtlk is not owner of the key (empty blocked_list),
-+ * we don't care because we can't grant any lock.
-+ * b) we just grant a new lock which is taken from head of blocked
-+ * list, and it should be the first granted lock, and it should
-+ * be the first one linked on blocked_list.
-+ *
-+ * Either way, we can get correct result by iterating blocked_list
-+ * of @grtlk, and don't have to bother on how to find out
-+ * owner of current key.
-+ */
-+ list_for_each_entry_safe(poslk, tmplk, &grtln->ln_blocked_list,
-+ lk_nodes[dep].ln_blocked_list) {
-+ if (grtlk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW ||
-+ poslk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW)
-+ break;
-+ /* grant all readers */
-+ list_del_init(&poslk->lk_nodes[dep].ln_blocked_list);
-+ list_add(&poslk->lk_nodes[dep].ln_granted_list,
-+ &grtln->ln_granted_list);
-+
-+ BUG_ON(poslk->lk_task == NULL);
-+ wake_up_process(poslk->lk_task);
-+ }
-+
-+ /* if @curln is the owner of this key, replace it with @grtln */
-+ if (!htree_key_list_empty(curln))
-+ htree_key_list_replace_init(curln, grtln);
-+
-+ if (curln->ln_mode == HTREE_LOCK_INVAL)
-+ list_del_init(&curln->ln_alive_list);
-+}
-+
-+/*
-+ * it's just wrapper of htree_node_lock_internal, it returns 1 on granted
-+ * and 0 only if @wait is false and can't grant it immediately
-+ */
-+int
-+htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode,
-+ u32 key, unsigned dep, int wait, void *event)
-+{
-+ struct htree_lock_head *lhead = lck->lk_head;
-+ int rc;
-+
-+ BUG_ON(dep >= lck->lk_depth);
-+ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
-+
-+ htree_spin_lock(lhead, dep);
-+ rc = htree_node_lock_internal(lhead, lck, mode, key, dep, wait, event);
-+ if (rc != 0)
-+ htree_spin_unlock(lhead, dep);
-+ return rc >= 0;
-+}
-+EXPORT_SYMBOL(htree_node_lock_try);
-+
-+/* it's wrapper of htree_node_unlock_internal */
-+void
-+htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event)
-+{
-+ struct htree_lock_head *lhead = lck->lk_head;
-+
-+ BUG_ON(dep >= lck->lk_depth);
-+ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
-+
-+ htree_spin_lock(lhead, dep);
-+ htree_node_unlock_internal(lhead, lck, dep, event);
-+ htree_spin_unlock(lhead, dep);
-+}
-+EXPORT_SYMBOL(htree_node_unlock);
-+
-+/* stop listening on child-lock level @dep */
-+void
-+htree_node_stop_listen(struct htree_lock *lck, unsigned dep)
-+{
-+ struct htree_lock_node *ln = &lck->lk_nodes[dep];
-+ struct htree_lock_node *tmp;
-+
-+ BUG_ON(htree_node_is_granted(lck, dep));
-+ BUG_ON(!list_empty(&ln->ln_blocked_list));
-+ BUG_ON(!list_empty(&ln->ln_granted_list));
-+
-+ if (!htree_node_is_listening(lck, dep))
-+ return;
-+
-+ htree_spin_lock(lck->lk_head, dep);
-+ ln->ln_mode = HTREE_LOCK_INVAL;
-+ ln->ln_ev_target = NULL;
-+
-+ if (htree_key_list_empty(ln)) { /* not owner */
-+ list_del_init(&ln->ln_alive_list);
-+ goto out;
-+ }
-+
-+ /* I'm the owner... */
-+ if (list_empty(&ln->ln_alive_list)) { /* no more listener */
-+ htree_key_list_del_init(ln);
-+ goto out;
-+ }
-+
-+ tmp = list_entry(ln->ln_alive_list.next,
-+ struct htree_lock_node, ln_alive_list);
-+
-+ BUG_ON(tmp->ln_mode != HTREE_LOCK_NL);
-+ htree_key_list_replace_init(ln, tmp);
-+ list_del_init(&ln->ln_alive_list);
-+ out:
-+ htree_spin_unlock(lck->lk_head, dep);
-+}
-+EXPORT_SYMBOL(htree_node_stop_listen);
-+
-+/* release all child-locks if we have any */
-+static void
-+htree_node_release_all(struct htree_lock *lck)
-+{
-+ int i;
-+
-+ for (i = 0; i < lck->lk_depth; i++) {
-+ if (htree_node_is_granted(lck, i))
-+ htree_node_unlock(lck, i, NULL);
-+ else if (htree_node_is_listening(lck, i))
-+ htree_node_stop_listen(lck, i);
-+ }
-+}
-+
-+/*
-+ * obtain htree lock, it could be blocked inside if there's conflict
-+ * with any granted or blocked lock and @wait is true.
-+ * NB: ALWAYS called holding lhead::lh_lock
-+ */
-+static int
-+htree_lock_internal(struct htree_lock *lck, int wait)
-+{
-+ struct htree_lock_head *lhead = lck->lk_head;
-+ int granted = 0;
-+ int blocked = 0;
-+ int i;
-+
-+ for (i = 0; i < HTREE_LOCK_MAX; i++) {
-+ if (lhead->lh_ngranted[i] != 0)
-+ granted |= 1 << i;
-+ if (lhead->lh_nblocked[i] != 0)
-+ blocked |= 1 << i;
-+ }
-+ if ((htree_lock_compat[lck->lk_mode] & granted) != granted ||
-+ (htree_lock_compat[lck->lk_mode] & blocked) != blocked) {
-+ /* will block current lock even it just conflicts with any
-+ * other blocked lock, so lock like EX wouldn't starve */
-+ if (!wait)
-+ return -1;
-+ lhead->lh_nblocked[lck->lk_mode]++;
-+ lk_block_inc(lck->lk_mode);
-+
-+ lck->lk_task = current;
-+ list_add_tail(&lck->lk_blocked_list, &lhead->lh_blocked_list);
-+
-+ set_current_state(TASK_UNINTERRUPTIBLE);
-+ htree_spin_unlock(lhead, HTREE_DEP_ROOT);
-+ /* wait to be given the lock */
-+ if (lck->lk_task != NULL)
-+ schedule();
-+ /* granted, no doubt. wake up will set me RUNNING */
-+ return 0; /* without lh_lock */
-+ }
-+ lhead->lh_ngranted[lck->lk_mode]++;
-+ lk_grant_inc(lck->lk_mode);
-+ return 1;
-+}
-+
-+/* release htree lock. NB: ALWAYS called holding lhead::lh_lock */
-+static void
-+htree_unlock_internal(struct htree_lock *lck)
-+{
-+ struct htree_lock_head *lhead = lck->lk_head;
-+ struct htree_lock *tmp;
-+ struct htree_lock *tmp2;
-+ int granted = 0;
-+ int i;
-+
-+ BUG_ON(lhead->lh_ngranted[lck->lk_mode] == 0);
-+
-+ lhead->lh_ngranted[lck->lk_mode]--;
-+ lck->lk_mode = HTREE_LOCK_INVAL;
-+
-+ for (i = 0; i < HTREE_LOCK_MAX; i++) {
-+ if (lhead->lh_ngranted[i] != 0)
-+ granted |= 1 << i;
-+ }
-+ list_for_each_entry_safe(tmp, tmp2,
-+ &lhead->lh_blocked_list, lk_blocked_list) {
-+ /* conflict with any granted lock? */
-+ if ((htree_lock_compat[tmp->lk_mode] & granted) != granted)
-+ break;
-+
-+ list_del_init(&tmp->lk_blocked_list);
-+
-+ BUG_ON(lhead->lh_nblocked[tmp->lk_mode] == 0);
-+
-+ lhead->lh_nblocked[tmp->lk_mode]--;
-+ lhead->lh_ngranted[tmp->lk_mode]++;
-+ granted |= 1 << tmp->lk_mode;
-+
-+ BUG_ON(tmp->lk_task == NULL);
-+ wake_up_process(tmp->lk_task);
-+ }
-+}
-+
-+/* it's wrapper of htree_lock_internal and exported interface.
-+ * It always return 1 with granted lock if @wait is true, it can return 0
-+ * if @wait is false and locking request can't be granted immediately */
-+int
-+htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead,
-+ htree_lock_mode_t mode, int wait)
-+{
-+ int rc;
-+
-+ BUG_ON(lck->lk_depth > lhead->lh_depth);
-+ BUG_ON(lck->lk_head != NULL);
-+ BUG_ON(lck->lk_task != NULL);
-+
-+ lck->lk_head = lhead;
-+ lck->lk_mode = mode;
-+
-+ htree_spin_lock(lhead, HTREE_DEP_ROOT);
-+ rc = htree_lock_internal(lck, wait);
-+ if (rc != 0)
-+ htree_spin_unlock(lhead, HTREE_DEP_ROOT);
-+ return rc >= 0;
-+}
-+EXPORT_SYMBOL(htree_lock_try);
-+
-+/* it's wrapper of htree_unlock_internal and exported interface.
-+ * It will release all htree_node_locks and htree_lock */
-+void
-+htree_unlock(struct htree_lock *lck)
-+{
-+ BUG_ON(lck->lk_head == NULL);
-+ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
-+
-+ htree_node_release_all(lck);
-+
-+ htree_spin_lock(lck->lk_head, HTREE_DEP_ROOT);
-+ htree_unlock_internal(lck);
-+ htree_spin_unlock(lck->lk_head, HTREE_DEP_ROOT);
-+ lck->lk_head = NULL;
-+ lck->lk_task = NULL;
-+}
-+EXPORT_SYMBOL(htree_unlock);
-+
-+/* change lock mode */
-+void
-+htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode)
-+{
-+ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
-+ lck->lk_mode = mode;
-+}
-+EXPORT_SYMBOL(htree_change_mode);
-+
-+/* release htree lock, and lock it again with new mode.
-+ * This function will first release all htree_node_locks and htree_lock,
-+ * then try to gain htree_lock with new @mode.
-+ * It always return 1 with granted lock if @wait is true, it can return 0
-+ * if @wait is false and locking request can't be granted immediately */
-+int
-+htree_change_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, int wait)
-+{
-+ struct htree_lock_head *lhead = lck->lk_head;
-+ int rc;
-+
-+ BUG_ON(lhead == NULL);
-+ BUG_ON(lck->lk_mode == mode);
-+ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL || mode == HTREE_LOCK_INVAL);
-+
-+ htree_node_release_all(lck);
-+
-+ htree_spin_lock(lhead, HTREE_DEP_ROOT);
-+ htree_unlock_internal(lck);
-+ lck->lk_mode = mode;
-+ rc = htree_lock_internal(lck, wait);
-+ if (rc != 0)
-+ htree_spin_unlock(lhead, HTREE_DEP_ROOT);
-+ return rc >= 0;
-+}
-+EXPORT_SYMBOL(htree_change_lock_try);
-+
-+/* create a htree_lock head with @depth levels (number of child-locks),
-+ * it is a per resoruce structure */
-+struct htree_lock_head *
-+htree_lock_head_alloc(unsigned depth, unsigned hbits, unsigned priv)
-+{
-+ struct htree_lock_head *lhead;
-+ int i;
-+
-+ if (depth > HTREE_LOCK_DEP_MAX) {
-+ printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
-+ depth, HTREE_LOCK_DEP_MAX);
-+ return NULL;
-+ }
-+
-+ lhead = kzalloc(offsetof(struct htree_lock_head,
-+ lh_children[depth]) + priv, GFP_NOFS);
-+ if (lhead == NULL)
-+ return NULL;
-+
-+ if (hbits < HTREE_HBITS_MIN)
-+ lhead->lh_hbits = HTREE_HBITS_MIN;
-+ else if (hbits > HTREE_HBITS_MAX)
-+ lhead->lh_hbits = HTREE_HBITS_MAX;
-+
-+ lhead->lh_lock = 0;
-+ lhead->lh_depth = depth;
-+ INIT_LIST_HEAD(&lhead->lh_blocked_list);
-+ if (priv > 0) {
-+ lhead->lh_private = (void *)lhead +
-+ offsetof(struct htree_lock_head, lh_children[depth]);
-+ }
-+
-+ for (i = 0; i < depth; i++) {
-+ INIT_LIST_HEAD(&lhead->lh_children[i].lc_list);
-+ lhead->lh_children[i].lc_events = HTREE_EVENT_DISABLE;
-+ }
-+ return lhead;
-+}
-+EXPORT_SYMBOL(htree_lock_head_alloc);
-+
-+/* free the htree_lock head */
-+void
-+htree_lock_head_free(struct htree_lock_head *lhead)
-+{
-+ int i;
-+
-+ BUG_ON(!list_empty(&lhead->lh_blocked_list));
-+ for (i = 0; i < lhead->lh_depth; i++)
-+ BUG_ON(!list_empty(&lhead->lh_children[i].lc_list));
-+ kfree(lhead);
-+}
-+EXPORT_SYMBOL(htree_lock_head_free);
-+
-+/* register event callback for @events of child-lock at level @dep */
-+void
-+htree_lock_event_attach(struct htree_lock_head *lhead, unsigned dep,
-+ unsigned events, htree_event_cb_t callback)
-+{
-+ BUG_ON(lhead->lh_depth <= dep);
-+ lhead->lh_children[dep].lc_events = events;
-+ lhead->lh_children[dep].lc_callback = callback;
-+}
-+EXPORT_SYMBOL(htree_lock_event_attach);
-+
-+/* allocate a htree_lock, which is per-thread structure, @pbytes is some
-+ * extra-bytes as private data for caller */
-+struct htree_lock *
-+htree_lock_alloc(unsigned depth, unsigned pbytes)
-+{
-+ struct htree_lock *lck;
-+ int i = offsetof(struct htree_lock, lk_nodes[depth]);
-+
-+ if (depth > HTREE_LOCK_DEP_MAX) {
-+ printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
-+ depth, HTREE_LOCK_DEP_MAX);
-+ return NULL;
-+ }
-+ lck = kzalloc(i + pbytes, GFP_NOFS);
-+ if (lck == NULL)
-+ return NULL;
-+
-+ if (pbytes != 0)
-+ lck->lk_private = (void *)lck + i;
-+ lck->lk_mode = HTREE_LOCK_INVAL;
-+ lck->lk_depth = depth;
-+ INIT_LIST_HEAD(&lck->lk_blocked_list);
-+
-+ for (i = 0; i < depth; i++) {
-+ struct htree_lock_node *node = &lck->lk_nodes[i];
-+
-+ node->ln_mode = HTREE_LOCK_INVAL;
-+ INIT_LIST_HEAD(&node->ln_major_list);
-+ INIT_LIST_HEAD(&node->ln_minor_list);
-+ INIT_LIST_HEAD(&node->ln_alive_list);
-+ INIT_LIST_HEAD(&node->ln_blocked_list);
-+ INIT_LIST_HEAD(&node->ln_granted_list);
-+ }
-+
-+ return lck;
-+}
-+EXPORT_SYMBOL(htree_lock_alloc);
-+
-+/* free htree_lock node */
-+void
-+htree_lock_free(struct htree_lock *lck)
-+{
-+ BUG_ON(lck->lk_mode != HTREE_LOCK_INVAL);
-+ kfree(lck);
-+}
-+EXPORT_SYMBOL(htree_lock_free);
---- a/fs/ext4/inode.c
-+++ b/fs/ext4/inode.c
-@@ -4965,7 +4965,7 @@ struct inode *ext4_iget(struct super_blo
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
- ei->i_file_acl |=
- ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
-- inode->i_size = ext4_isize(raw_inode);
-+ inode->i_size = ext4_isize(sb, raw_inode);
- ei->i_disksize = inode->i_size;
- #ifdef CONFIG_QUOTA
- ei->i_reserved_quota = 0;
-@@ -5205,7 +5205,7 @@ static int ext4_do_update_inode(handle_t
- raw_inode->i_file_acl_high =
- cpu_to_le16(ei->i_file_acl >> 32);
- raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
-- if (ei->i_disksize != ext4_isize(raw_inode)) {
-+ if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
- ext4_isize_set(raw_inode, ei->i_disksize);
- need_datasync = 1;
- }
---- a/fs/ext4/namei.c
-+++ b/fs/ext4/namei.c
-@@ -176,7 +176,7 @@ static struct dx_frame *dx_probe(const s
- struct inode *dir,
- struct dx_hash_info *hinfo,
- struct dx_frame *frame,
-- int *err);
-+ struct htree_lock *lck, int *err);
- static void dx_release(struct dx_frame *frames);
- static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
- struct dx_hash_info *hinfo, struct dx_map_entry map[]);
-@@ -189,13 +189,13 @@ static void dx_insert_block(struct dx_fr
- static int ext4_htree_next_block(struct inode *dir, __u32 hash,
- struct dx_frame *frame,
- struct dx_frame *frames,
-- __u32 *start_hash);
-+ __u32 *start_hash, struct htree_lock *lck);
- static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
- const struct qstr *d_name,
- struct ext4_dir_entry_2 **res_dir,
-- int *err);
-+ struct htree_lock *lck, int *err);
- static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
-- struct inode *inode);
-+ struct inode *inode, struct htree_lock *lck);
-
- /*
- * p is at least 6 bytes before the end of page
-@@ -225,7 +225,7 @@ struct dx_root_info * dx_get_dx_info(str
-
- static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
- {
-- return le32_to_cpu(entry->block) & 0x00ffffff;
-+ return le32_to_cpu(entry->block) & 0x0fffffff;
- }
-
- static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
-@@ -368,6 +368,223 @@ struct stats dx_show_entries(struct dx_h
- }
- #endif /* DX_DEBUG */
-
-+/* private data for htree_lock */
-+struct ext4_dir_lock_data {
-+ unsigned ld_flags; /* bits-map for lock types */
-+ unsigned ld_count; /* # entries of the last DX block */
-+ struct dx_entry ld_at_entry; /* copy of leaf dx_entry */
-+ struct dx_entry *ld_at; /* position of leaf dx_entry */
-+};
-+
-+#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private)
-+
-+/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */
-+#define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32)
-+
-+static void ext4_htree_event_cb(void *target, void *event)
-+{
-+ u64 *block = (u64 *)target;
-+
-+ if (*block == dx_get_block((struct dx_entry *)event))
-+ *block = EXT4_HTREE_NODE_CHANGED;
-+}
-+
-+struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits)
-+{
-+ struct htree_lock_head *lhead;
-+
-+ lhead = htree_lock_head_alloc(EXT4_LK_MAX, hbits, 0);
-+ if (lhead != NULL) {
-+ htree_lock_event_attach(lhead, EXT4_LK_SPIN, HTREE_EVENT_WR,
-+ ext4_htree_event_cb);
-+ }
-+ return lhead;
-+}
-+EXPORT_SYMBOL(ext4_htree_lock_head_alloc);
-+
-+struct htree_lock *ext4_htree_lock_alloc(void)
-+{
-+ return htree_lock_alloc(EXT4_LK_MAX,
-+ sizeof(struct ext4_dir_lock_data));
-+}
-+EXPORT_SYMBOL(ext4_htree_lock_alloc);
-+
-+static htree_lock_mode_t ext4_htree_mode(unsigned flags)
-+{
-+ switch (flags) {
-+ default: /* 0 or unknown flags require EX lock */
-+ return HTREE_LOCK_EX;
-+ case EXT4_HLOCK_READDIR:
-+ return HTREE_LOCK_PR;
-+ case EXT4_HLOCK_LOOKUP:
-+ return HTREE_LOCK_CR;
-+ case EXT4_HLOCK_DEL:
-+ case EXT4_HLOCK_ADD:
-+ return HTREE_LOCK_CW;
-+ }
-+}
-+
-+/* return PR for read-only operations, otherwise return EX */
-+static inline htree_lock_mode_t ext4_htree_safe_mode(unsigned flags)
-+{
-+ int writer = (flags & EXT4_LB_DE) == EXT4_LB_DE;
-+
-+ /* 0 requires EX lock */
-+ return (flags == 0 || writer) ? HTREE_LOCK_EX : HTREE_LOCK_PR;
-+}
-+
-+static int ext4_htree_safe_locked(struct htree_lock *lck)
-+{
-+ int writer;
-+
-+ if (lck == NULL || lck->lk_mode == HTREE_LOCK_EX)
-+ return 1;
-+
-+ writer = (ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_DE) ==
-+ EXT4_LB_DE;
-+ if (writer) /* all readers & writers are excluded? */
-+ return lck->lk_mode == HTREE_LOCK_EX;
-+
-+ /* all writers are excluded? */
-+ return lck->lk_mode == HTREE_LOCK_PR ||
-+ lck->lk_mode == HTREE_LOCK_PW ||
-+ lck->lk_mode == HTREE_LOCK_EX;
-+}
-+
-+/* relock htree_lock with EX mode if it's change operation, otherwise
-+ * relock it with PR mode. It's noop if PDO is disabled. */
-+static void ext4_htree_safe_relock(struct htree_lock *lck)
-+{
-+ if (!ext4_htree_safe_locked(lck)) {
-+ unsigned flags = ext4_htree_lock_data(lck)->ld_flags;
-+
-+ htree_change_lock(lck, ext4_htree_safe_mode(flags));
-+ }
-+}
-+
-+void ext4_htree_lock(struct htree_lock *lck, struct htree_lock_head *lhead,
-+ struct inode *dir, unsigned flags)
-+{
-+ htree_lock_mode_t mode = is_dx(dir) ? ext4_htree_mode(flags) :
-+ ext4_htree_safe_mode(flags);
-+
-+ ext4_htree_lock_data(lck)->ld_flags = flags;
-+ htree_lock(lck, lhead, mode);
-+ if (!is_dx(dir))
-+ ext4_htree_safe_relock(lck); /* make sure it's safe locked */
-+}
-+EXPORT_SYMBOL(ext4_htree_lock);
-+
-+static int ext4_htree_node_lock(struct htree_lock *lck, struct dx_entry *at,
-+ unsigned lmask, int wait, void *ev)
-+{
-+ u32 key = (at == NULL) ? 0 : dx_get_block(at);
-+ u32 mode;
-+
-+ /* NOOP if htree is well protected or caller doesn't require the lock */
-+ if (ext4_htree_safe_locked(lck) ||
-+ !(ext4_htree_lock_data(lck)->ld_flags & lmask))
-+ return 1;
-+
-+ mode = (ext4_htree_lock_data(lck)->ld_flags & lmask) == lmask ?
-+ HTREE_LOCK_PW : HTREE_LOCK_PR;
-+ while (1) {
-+ if (htree_node_lock_try(lck, mode, key, ffz(~lmask), wait, ev))
-+ return 1;
-+ if (!(lmask & EXT4_LB_SPIN)) /* not a spinlock */
-+ return 0;
-+ cpu_relax(); /* spin until granted */
-+ }
-+}
-+
-+static int ext4_htree_node_locked(struct htree_lock *lck, unsigned lmask)
-+{
-+ return ext4_htree_safe_locked(lck) ||
-+ htree_node_is_granted(lck, ffz(~lmask));
-+}
-+
-+static void ext4_htree_node_unlock(struct htree_lock *lck,
-+ unsigned lmask, void *buf)
-+{
-+ /* NB: it's safe to call mutiple times or even it's not locked */
-+ if (!ext4_htree_safe_locked(lck) &&
-+ htree_node_is_granted(lck, ffz(~lmask)))
-+ htree_node_unlock(lck, ffz(~lmask), buf);
-+}
-+
-+#define ext4_htree_dx_lock(lck, key) \
-+ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 1, NULL)
-+#define ext4_htree_dx_lock_try(lck, key) \
-+ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 0, NULL)
-+#define ext4_htree_dx_unlock(lck) \
-+ ext4_htree_node_unlock(lck, EXT4_LB_DX, NULL)
-+#define ext4_htree_dx_locked(lck) \
-+ ext4_htree_node_locked(lck, EXT4_LB_DX)
-+
-+static void ext4_htree_dx_need_lock(struct htree_lock *lck)
-+{
-+ struct ext4_dir_lock_data *ld;
-+
-+ if (ext4_htree_safe_locked(lck))
-+ return;
-+
-+ ld = ext4_htree_lock_data(lck);
-+ switch (ld->ld_flags) {
-+ default:
-+ return;
-+ case EXT4_HLOCK_LOOKUP:
-+ ld->ld_flags = EXT4_HLOCK_LOOKUP_SAFE;
-+ return;
-+ case EXT4_HLOCK_DEL:
-+ ld->ld_flags = EXT4_HLOCK_DEL_SAFE;
-+ return;
-+ case EXT4_HLOCK_ADD:
-+ ld->ld_flags = EXT4_HLOCK_SPLIT;
-+ return;
-+ }
-+}
-+
-+#define ext4_htree_de_lock(lck, key) \
-+ ext4_htree_node_lock(lck, key, EXT4_LB_DE, 1, NULL)
-+#define ext4_htree_de_unlock(lck) \
-+ ext4_htree_node_unlock(lck, EXT4_LB_DE, NULL)
-+
-+#define ext4_htree_spin_lock(lck, key, event) \
-+ ext4_htree_node_lock(lck, key, EXT4_LB_SPIN, 0, event)
-+#define ext4_htree_spin_unlock(lck) \
-+ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, NULL)
-+#define ext4_htree_spin_unlock_listen(lck, p) \
-+ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, p)
-+
-+static void ext4_htree_spin_stop_listen(struct htree_lock *lck)
-+{
-+ if (!ext4_htree_safe_locked(lck) &&
-+ htree_node_is_listening(lck, ffz(~EXT4_LB_SPIN)))
-+ htree_node_stop_listen(lck, ffz(~EXT4_LB_SPIN));
-+}
-+
-+enum {
-+ DX_HASH_COL_IGNORE, /* ignore collision while probing frames */
-+ DX_HASH_COL_YES, /* there is collision and it does matter */
-+ DX_HASH_COL_NO, /* there is no collision */
-+};
-+
-+static int dx_probe_hash_collision(struct htree_lock *lck,
-+ struct dx_entry *entries,
-+ struct dx_entry *at, u32 hash)
-+{
-+ if (!(ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_EXACT)) {
-+ return DX_HASH_COL_IGNORE; /* don't care about collision */
-+
-+ } else if (at == entries + dx_get_count(entries) - 1) {
-+ return DX_HASH_COL_IGNORE; /* not in any leaf of this DX */
-+
-+ } else { /* hash collision? */
-+ return ((dx_get_hash(at + 1) & ~1) == hash) ?
-+ DX_HASH_COL_YES : DX_HASH_COL_NO;
-+ }
-+}
-+
- /*
- * Probe for a directory leaf block to search.
- *
-@@ -379,16 +596,17 @@ struct stats dx_show_entries(struct dx_h
- */
- static struct dx_frame *
- dx_probe(const struct qstr *d_name, struct inode *dir,
-- struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
-+ struct dx_hash_info *hinfo, struct dx_frame *frame_in,
-+ struct htree_lock *lck, int *err)
- {
- unsigned count, indirect;
-- struct dx_entry *at, *entries, *p, *q, *m;
-+ struct dx_entry *at, *entries, *p, *q, *m, *dx = NULL;
- struct dx_root_info * info;
- struct buffer_head *bh;
- struct dx_frame *frame = frame_in;
- u32 hash;
-
-- frame->bh = NULL;
-+ memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
- if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
- goto fail;
-
-@@ -418,9 +636,16 @@ dx_probe(const struct qstr *d_name, stru
- goto fail;
- }
-
-- if ((indirect = info->indirect_levels) > 1) {
-- ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
-- info->indirect_levels);
-+ indirect = info->indirect_levels;
-+ if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
-+ ext4_warning(dir->i_sb,
-+ "Directory (ino: %lu) htree depth %#06x exceed "
-+ "supported value", dir->i_ino,
-+ ext4_dir_htree_level(dir->i_sb));
-+ if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
-+ ext4_warning(dir->i_sb, "Enable large directory "
-+ "feature to access it");
-+ }
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
- goto fail;
-@@ -440,8 +665,15 @@ dx_probe(const struct qstr *d_name, stru
- dxtrace(printk("Look up %x", hash));
- while (1)
- {
-+ if (indirect == 0) { /* the last index level */
-+ /* NB: ext4_htree_dx_lock() could be noop if
-+ * DX-lock flag is not set for current operation */
-+ ext4_htree_dx_lock(lck, dx);
-+ ext4_htree_spin_lock(lck, dx, NULL);
-+ }
- count = dx_get_count(entries);
-- if (!count || count > dx_get_limit(entries)) {
-+ if (count == 0 || count > dx_get_limit(entries)) {
-+ ext4_htree_spin_unlock(lck); /* release spin */
- ext4_warning(dir->i_sb,
- "dx entry: no count or count > limit");
- brelse(bh);
-@@ -482,9 +714,73 @@ dx_probe(const struct qstr *d_name, stru
- frame->bh = bh;
- frame->entries = entries;
- frame->at = at;
-- if (!indirect--) return frame;
-+
-+ if (indirect == 0) { /* the last index level */
-+ struct ext4_dir_lock_data *ld;
-+ u64 myblock;
-+
-+ /* By default we only lock DE-block, however, we will
-+ * also lock the last level DX-block if:
-+ * a) there is hash collision
-+ * we will set DX-lock flag (a few lines below)
-+ * and redo to lock DX-block
-+ * see detail in dx_probe_hash_collision()
-+ * b) it's a retry from splitting
-+ * we need to lock the last level DX-block so nobody
-+ * else can split any leaf blocks under the same
-+ * DX-block, see detail in ext4_dx_add_entry()
-+ */
-+ if (ext4_htree_dx_locked(lck)) {
-+ /* DX-block is locked, just lock DE-block
-+ * and return */
-+ ext4_htree_spin_unlock(lck);
-+ if (!ext4_htree_safe_locked(lck))
-+ ext4_htree_de_lock(lck, frame->at);
-+ return frame;
-+ }
-+ /* it's pdirop and no DX lock */
-+ if (dx_probe_hash_collision(lck, entries, at, hash) ==
-+ DX_HASH_COL_YES) {
-+ /* found hash collision, set DX-lock flag
-+ * and retry to abtain DX-lock */
-+ ext4_htree_spin_unlock(lck);
-+ ext4_htree_dx_need_lock(lck);
-+ continue;
-+ }
-+ ld = ext4_htree_lock_data(lck);
-+ /* because I don't lock DX, so @at can't be trusted
-+ * after I release spinlock so I have to save it */
-+ ld->ld_at = at;
-+ ld->ld_at_entry = *at;
-+ ld->ld_count = dx_get_count(entries);
-+
-+ frame->at = &ld->ld_at_entry;
-+ myblock = dx_get_block(at);
-+
-+ /* NB: ordering locking */
-+ ext4_htree_spin_unlock_listen(lck, &myblock);
-+ /* other thread can split this DE-block because:
-+ * a) I don't have lock for the DE-block yet
-+ * b) I released spinlock on DX-block
-+ * if it happened I can detect it by listening
-+ * splitting event on this DE-block */
-+ ext4_htree_de_lock(lck, frame->at);
-+ ext4_htree_spin_stop_listen(lck);
-+
-+ if (myblock == EXT4_HTREE_NODE_CHANGED) {
-+ /* someone split this DE-block before
-+ * I locked it, I need to retry and lock
-+ * valid DE-block */
-+ ext4_htree_de_unlock(lck);
-+ continue;
-+ }
-+ return frame;
-+ }
-+ dx = at;
-+ indirect--;
- if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
- goto fail2;
-+
- at = entries = ((struct dx_node *) bh->b_data)->entries;
- if (dx_get_limit(entries) != dx_node_limit (dir)) {
- ext4_warning(dir->i_sb,
-@@ -512,13 +808,18 @@ fail:
- static void dx_release (struct dx_frame *frames)
- {
- struct dx_root_info *info;
-+ int i;
-+
- if (frames[0].bh == NULL)
- return;
-
- info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data);
-- if (info->indirect_levels)
-- brelse(frames[1].bh);
-- brelse(frames[0].bh);
-+ for (i = 0; i <= info->indirect_levels; i++) {
-+ if (frames[i].bh == NULL)
-+ break;
-+ brelse(frames[i].bh);
-+ frames[i].bh = NULL;
-+ }
- }
-
- /*
-@@ -541,7 +842,7 @@ static void dx_release (struct dx_frame
- static int ext4_htree_next_block(struct inode *dir, __u32 hash,
- struct dx_frame *frame,
- struct dx_frame *frames,
-- __u32 *start_hash)
-+ __u32 *start_hash, struct htree_lock *lck)
- {
- struct dx_frame *p;
- struct buffer_head *bh;
-@@ -556,12 +857,22 @@ static int ext4_htree_next_block(struct
- * this loop, num_frames indicates the number of interior
- * nodes need to be read.
- */
-+ ext4_htree_de_unlock(lck);
- while (1) {
-- if (++(p->at) < p->entries + dx_get_count(p->entries))
-- break;
-+ if (num_frames > 0 || ext4_htree_dx_locked(lck)) {
-+ /* num_frames > 0 :
-+ * DX block
-+ * ext4_htree_dx_locked:
-+ * frame->at is reliable pointer returned by dx_probe,
-+ * otherwise dx_probe already knew no collision */
-+ if (++(p->at) < p->entries + dx_get_count(p->entries))
-+ break;
-+ }
- if (p == frames)
- return 0;
- num_frames++;
-+ if (num_frames == 1)
-+ ext4_htree_dx_unlock(lck);
- p--;
- }
-
-@@ -584,6 +895,13 @@ static int ext4_htree_next_block(struct
- * block so no check is necessary
- */
- while (num_frames--) {
-+ if (num_frames == 0) {
-+ /* it's not always necessary, we just don't want to
-+ * detect hash collision again */
-+ ext4_htree_dx_need_lock(lck);
-+ ext4_htree_dx_lock(lck, p->at);
-+ }
-+
- if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
- 0, &err)))
- return err; /* Failure */
-@@ -592,6 +910,7 @@ static int ext4_htree_next_block(struct
- p->bh = bh;
- p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
- }
-+ ext4_htree_de_lock(lck, p->at);
- return 1;
- }
-
-@@ -661,7 +980,7 @@ int ext4_htree_fill_tree(struct file *di
- {
- struct dx_hash_info hinfo;
- struct ext4_dir_entry_2 *de;
-- struct dx_frame frames[2], *frame;
-+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
- struct inode *dir;
- ext4_lblk_t block;
- int count = 0;
-@@ -684,10 +1003,10 @@ int ext4_htree_fill_tree(struct file *di
- }
- hinfo.hash = start_hash;
- hinfo.minor_hash = 0;
-- frame = dx_probe(NULL, dir, &hinfo, frames, &err);
-+ /* assume it's PR locked */
-+ frame = dx_probe(NULL, dir, &hinfo, frames, NULL, &err);
- if (!frame)
- return err;
--
- /* Add '.' and '..' from the htree header */
- if (!start_hash && !start_minor_hash) {
- de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
-@@ -714,7 +1033,7 @@ int ext4_htree_fill_tree(struct file *di
- count += ret;
- hashval = ~0;
- ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
-- frame, frames, &hashval);
-+ frame, frames, &hashval, NULL);
- *next_hash = hashval;
- if (ret < 0) {
- err = ret;
-@@ -814,9 +1133,17 @@ static void dx_insert_block(struct dx_fr
-
- static void ext4_update_dx_flag(struct inode *inode)
- {
-+ /* Disable it for ldiskfs, because going from a DX directory to
-+ * a non-DX directory while it is in use will completely break
-+ * the htree-locking.
-+ * If we really want to support this operation in the future,
-+ * we need to exclusively lock the directory at here which will
-+ * increase complexity of code */
-+#if 0
- if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_COMPAT_DIR_INDEX))
- ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
-+#endif
- }
-
- /*
-@@ -888,8 +1215,9 @@ static inline int search_dirblock(struct
- * to brelse() it when appropriate.
- */
- struct buffer_head * ext4_find_entry(struct inode *dir,
-- const struct qstr *d_name,
-- struct ext4_dir_entry_2 ** res_dir)
-+ const struct qstr *d_name,
-+ struct ext4_dir_entry_2 **res_dir,
-+ struct htree_lock *lck)
- {
- struct super_block *sb;
- struct buffer_head *bh_use[NAMEI_RA_SIZE];
-@@ -910,7 +1238,7 @@ struct buffer_head * ext4_find_entry(str
- if (namelen > EXT4_NAME_LEN)
- return NULL;
- if (is_dx(dir)) {
-- bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
-+ bh = ext4_dx_find_entry(dir, d_name, res_dir, lck, &err);
- /*
- * On success, or if the error was file not found,
- * return. Otherwise, fall back to doing a search the
-@@ -920,6 +1248,7 @@ struct buffer_head * ext4_find_entry(str
- return bh;
- dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
- "falling back\n"));
-+ ext4_htree_safe_relock(lck);
- }
- nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
- start = EXT4_I(dir)->i_dir_start_lookup;
-@@ -996,13 +1325,15 @@ cleanup_and_exit:
- return ret;
- }
-
--static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
-- struct ext4_dir_entry_2 **res_dir, int *err)
-+static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
-+ const struct qstr *d_name,
-+ struct ext4_dir_entry_2 **res_dir,
-+ struct htree_lock *lck, int *err)
- {
- struct super_block * sb;
- struct dx_hash_info hinfo;
- u32 hash;
-- struct dx_frame frames[2], *frame;
-+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
- struct buffer_head *bh;
- ext4_lblk_t block;
- int retval;
-@@ -1012,13 +1343,16 @@ static struct buffer_head * ext4_dx_find
- sb = dir->i_sb;
- /* NFS may look up ".." - look at dx_root directory block */
- if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
-- if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
-+ if (!(frame = dx_probe(d_name, dir, &hinfo, frames, lck, err)))
- return NULL;
- } else {
- frame = frames;
- frame->bh = NULL; /* for dx_release() */
- frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
- dx_set_block(frame->at, 0); /* dx_root block is 0 */
-+ /* "." and ".." are stored in root DX lock */
-+ ext4_htree_dx_need_lock(lck);
-+ ext4_htree_dx_lock(lck, NULL);
- }
- hash = hinfo.hash;
- do {
-@@ -1041,7 +1375,7 @@ static struct buffer_head * ext4_dx_find
-
- /* Check to see if we should continue to search */
- retval = ext4_htree_next_block(dir, hash, frame,
-- frames, NULL);
-+ frames, NULL, lck);
- if (retval < 0) {
- ext4_warning(sb,
- "error reading index page in directory #%lu",
-@@ -1067,7 +1401,7 @@ static struct dentry *ext4_lookup(struct
- if (dentry->d_name.len > EXT4_NAME_LEN)
- return ERR_PTR(-ENAMETOOLONG);
-
-- bh = ext4_find_entry(dir, &dentry->d_name, &de);
-+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
- inode = NULL;
- if (bh) {
- __u32 ino = le32_to_cpu(de->inode);
-@@ -1134,7 +1468,7 @@ struct dentry *ext4_get_parent(struct de
- struct ext4_dir_entry_2 * de;
- struct buffer_head *bh;
-
-- bh = ext4_find_entry(child->d_inode, &dotdot, &de);
-+ bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
- if (!bh)
- return ERR_PTR(-ENOENT);
- ino = le32_to_cpu(de->inode);
-@@ -1222,8 +1556,9 @@ static struct ext4_dir_entry_2* dx_pack_
- * Returns pointer to de in block into which the new entry will be inserted.
- */
- static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
-- struct buffer_head **bh,struct dx_frame *frame,
-- struct dx_hash_info *hinfo, int *error)
-+ struct buffer_head **bh, struct dx_frame *frames,
-+ struct dx_frame *frame, struct dx_hash_info *hinfo,
-+ struct htree_lock *lck, int *error)
- {
- unsigned blocksize = dir->i_sb->s_blocksize;
- unsigned count, continued;
-@@ -1280,7 +1615,14 @@ static struct ext4_dir_entry_2 *do_split
- hash2, split, count-split));
-
- /* Fancy dance to stay within two buffers */
-- de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
-+ if (hinfo->hash < hash2) {
-+ de2 = dx_move_dirents(data1, data2, map + split,
-+ count - split, blocksize);
-+ } else {
-+ /* make sure we will add entry to the same block which
-+ * we have already locked */
-+ de2 = dx_move_dirents(data1, data2, map, split, blocksize);
-+ }
- de = dx_pack_dirents(data1, blocksize);
- de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
- blocksize);
-@@ -1289,13 +1631,21 @@ static struct ext4_dir_entry_2 *do_split
- dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
- dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
-
-- /* Which block gets the new entry? */
-- if (hinfo->hash >= hash2)
-- {
-- swap(*bh, bh2);
-- de = de2;
-+ ext4_htree_spin_lock(lck, frame > frames ? (frame - 1)->at : NULL,
-+ frame->at); /* notify block is being split */
-+ if (hinfo->hash < hash2) {
-+ dx_insert_block(frame, hash2 + continued, newblock);
-+
-+ } else {
-+ /* switch block number */
-+ dx_insert_block(frame, hash2 + continued,
-+ dx_get_block(frame->at));
-+ dx_set_block(frame->at, newblock);
-+ (frame->at)++;
- }
-- dx_insert_block(frame, hash2 + continued, newblock);
-+ ext4_htree_spin_unlock(lck);
-+ ext4_htree_dx_unlock(lck);
-+
- err = ext4_handle_dirty_metadata(handle, dir, bh2);
- if (err)
- goto journal_error;
-@@ -1406,7 +1756,7 @@ static int add_dirent_to_buf(handle_t *h
- if (!IS_NOCMTIME(dir))
- dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
- ext4_update_dx_flag(dir);
-- dir->i_version++;
-+ inode_inc_iversion(dir);
- ext4_mark_inode_dirty(handle, dir);
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, dir, bh);
-@@ -1426,7 +1776,7 @@ static int make_indexed_dir(handle_t *ha
- const char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
- struct buffer_head *bh2;
-- struct dx_frame frames[2], *frame;
-+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
- struct dx_entry *entries;
- struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
- char *data1, *top;
-@@ -1507,7 +1857,7 @@ static int make_indexed_dir(handle_t *ha
- ext4_handle_dirty_metadata(handle, dir, frame->bh);
- ext4_handle_dirty_metadata(handle, dir, bh);
-
-- de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
-+ de = do_split(handle,dir, &bh, frames, frame, &hinfo, NULL, &retval);
- if (!de) {
- /*
- * Even if the block split failed, we have to properly write
-@@ -1614,7 +1964,7 @@ out:
- * the entry, as someone else might have used it while you slept.
- */
- int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-- struct inode *inode)
-+ struct inode *inode, struct htree_lock *lck)
- {
- struct inode *dir = dentry->d_parent->d_inode;
- struct buffer_head *bh;
-@@ -1633,9 +1983,10 @@ int ext4_add_entry(handle_t *handle, str
- if (dentry->d_name.len == 2 &&
- memcmp(dentry->d_name.name, "..", 2) == 0)
- return ext4_update_dotdot(handle, dentry, inode);
-- retval = ext4_dx_add_entry(handle, dentry, inode);
-+ retval = ext4_dx_add_entry(handle, dentry, inode, lck);
- if (!retval || (retval != ERR_BAD_DX_DIR))
- return retval;
-+ ext4_htree_safe_relock(lck);
- ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
- dx_fallback++;
- ext4_mark_inode_dirty(handle, dir);
-@@ -1673,18 +2024,21 @@ int ext4_add_entry(handle_t *handle, str
- * Returns 0 for success, or a negative error value
- */
- static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
-- struct inode *inode)
-+ struct inode *inode, struct htree_lock *lck)
- {
-- struct dx_frame frames[2], *frame;
-+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
- struct dx_entry *entries, *at;
- struct dx_hash_info hinfo;
- struct buffer_head *bh;
- struct inode *dir = dentry->d_parent->d_inode;
- struct super_block *sb = dir->i_sb;
- struct ext4_dir_entry_2 *de;
-+ int restart;
- int err;
-
-- frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
-+again:
-+ restart = 0;
-+ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, lck, &err);
- if (!frame)
- return err;
- entries = frame->entries;
-@@ -1693,33 +2047,53 @@ static int ext4_dx_add_entry(handle_t *h
- if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
- goto cleanup;
-
-- BUFFER_TRACE(bh, "get_write_access");
-- err = ext4_journal_get_write_access(handle, bh);
-- if (err)
-- goto journal_error;
--
- err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
- if (err != -ENOSPC)
- goto cleanup;
-
-+ err = 0;
- /* Block full, should compress but for now just split */
- dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
- dx_get_count(entries), dx_get_limit(entries)));
- /* Need to split index? */
- if (dx_get_count(entries) == dx_get_limit(entries)) {
- ext4_lblk_t newblock;
-- unsigned icount = dx_get_count(entries);
-- int levels = frame - frames;
-+ int levels = frame - frames + 1;
-+ unsigned icount;
-+ int add_level = 1;
- struct dx_entry *entries2;
- struct dx_node *node2;
- struct buffer_head *bh2;
-
-- if (levels && (dx_get_count(frames->entries) ==
-- dx_get_limit(frames->entries))) {
-- ext4_warning(sb, "Directory index full!");
-+ if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */
-+ ext4_htree_safe_relock(lck);
-+ restart = 1;
-+ goto cleanup;
-+ }
-+ while (frame > frames) {
-+ if (dx_get_count((frame - 1)->entries) <
-+ dx_get_limit((frame - 1)->entries)) {
-+ add_level = 0;
-+ break;
-+ }
-+ frame--; /* split higher index block */
-+ at = frame->at;
-+ entries = frame->entries;
-+ restart = 1;
-+ }
-+ if (add_level && levels == ext4_dir_htree_level(sb)) {
-+ ext4_warning(sb, "Directory (ino: %lu) index full, "
-+ "reach max htree level :%d",
-+ dir->i_ino, levels);
-+ if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
-+ ext4_warning(sb, "Large directory feature is"
-+ "not enabled on this "
-+ "filesystem");
-+ }
- err = -ENOSPC;
- goto cleanup;
- }
-+ icount = dx_get_count(entries);
- bh2 = ext4_append (handle, dir, &newblock, &err);
- if (!(bh2))
- goto cleanup;
-@@ -1732,7 +2106,7 @@ static int ext4_dx_add_entry(handle_t *h
- err = ext4_journal_get_write_access(handle, frame->bh);
- if (err)
- goto journal_error;
-- if (levels) {
-+ if (!add_level) {
- unsigned icount1 = icount/2, icount2 = icount - icount1;
- unsigned hash2 = dx_get_hash(entries + icount1);
- dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
-@@ -1740,7 +2114,7 @@ static int ext4_dx_add_entry(handle_t *h
-
- BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
- err = ext4_journal_get_write_access(handle,
-- frames[0].bh);
-+ (frame - 1)->bh);
- if (err)
- goto journal_error;
-
-@@ -1756,14 +2130,21 @@ static int ext4_dx_add_entry(handle_t *h
- frame->entries = entries = entries2;
- swap(frame->bh, bh2);
- }
-- dx_insert_block(frames + 0, hash2, newblock);
-- dxtrace(dx_show_index("node", frames[1].entries));
-+ dx_insert_block((frame - 1), hash2, newblock);
-+ dxtrace(dx_show_index("node", frame->entries));
- dxtrace(dx_show_index("node",
- ((struct dx_node *) bh2->b_data)->entries));
- err = ext4_handle_dirty_metadata(handle, dir, bh2);
- if (err)
- goto journal_error;
- brelse (bh2);
-+ ext4_handle_dirty_metadata(handle, inode,
-+ (frame - 1)->bh);
-+ if (restart) {
-+ ext4_handle_dirty_metadata(handle, inode,
-+ frame->bh);
-+ goto cleanup;
-+ }
- } else {
- struct dx_root_info * info;
- dxtrace(printk(KERN_DEBUG
-@@ -1777,25 +2158,42 @@ static int ext4_dx_add_entry(handle_t *h
- dx_set_block(entries + 0, newblock);
- info = dx_get_dx_info((struct ext4_dir_entry_2*)
- frames[0].bh->b_data);
-- info->indirect_levels = 1;
--
-- /* Add new access path frame */
-- frame = frames + 1;
-- frame->at = at = at - entries + entries2;
-- frame->entries = entries = entries2;
-- frame->bh = bh2;
-- err = ext4_journal_get_write_access(handle,
-- frame->bh);
-- if (err)
-- goto journal_error;
-+ info->indirect_levels += 1;
-+ dxtrace(printk(KERN_DEBUG
-+ "Creating %d level index...\n",
-+ info->indirect_levels));
-+ ext4_handle_dirty_metadata(handle, inode, frame->bh);
-+ ext4_handle_dirty_metadata(handle, inode, bh2);
-+ brelse(bh2);
-+ restart = 1;
-+ goto cleanup;
- }
-- err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
-- if (err) {
-- ext4_std_error(inode->i_sb, err);
-+ } else if (!ext4_htree_dx_locked(lck)) {
-+ struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck);
-+
-+ /* not well protected, require DX lock */
-+ ext4_htree_dx_need_lock(lck);
-+ at = frame > frames ? (frame - 1)->at : NULL;
-+
-+ /* NB: no risk of deadlock because it's just a try.
-+ *
-+ * NB: we check ld_count for twice, the first time before
-+ * having DX lock, the second time after holding DX lock.
-+ *
-+ * NB: We never free blocks for directory so far, which
-+ * means value returned by dx_get_count() should equal to
-+ * ld->ld_count if nobody split any DE-block under @at,
-+ * and ld->ld_at still points to valid dx_entry. */
-+ if ((ld->ld_count != dx_get_count(entries)) ||
-+ !ext4_htree_dx_lock_try(lck, at) ||
-+ (ld->ld_count != dx_get_count(entries))) {
-+ restart = 1;
- goto cleanup;
- }
-- }
-- de = do_split(handle, dir, &bh, frame, &hinfo, &err);
-+ /* OK, I've got DX lock and nothing changed */
-+ frame->at = ld->ld_at;
-+ }
-+ de = do_split(handle, dir, &bh, frames, frame, &hinfo, lck, &err);
- if (!de)
- goto cleanup;
- err = add_dirent_to_buf(handle, dentry, inode, de, bh);
-@@ -1804,9 +2202,15 @@ static int ext4_dx_add_entry(handle_t *h
- journal_error:
- ext4_std_error(dir->i_sb, err);
- cleanup:
-+ ext4_htree_dx_unlock(lck);
-+ ext4_htree_de_unlock(lck);
- if (bh)
- brelse(bh);
- dx_release(frames);
-+ /* @restart is true means htree-path has been changed, we need to
-+ * repeat dx_probe() to find out valid htree-path */
-+ if (restart && err == 0)
-+ goto again;
- return err;
- }
-
-@@ -1845,7 +2249,7 @@ int ext4_delete_entry(handle_t *handle,
- blocksize);
- else
- de->inode = 0;
-- dir->i_version++;
-+ inode_inc_iversion(dir);
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, dir, bh);
- if (unlikely(err)) {
-@@ -1892,7 +2296,7 @@ static void ext4_dec_count(handle_t *han
- static int ext4_add_nondir(handle_t *handle,
- struct dentry *dentry, struct inode *inode)
- {
-- int err = ext4_add_entry(handle, dentry, inode);
-+ int err = ext4_add_entry(handle, dentry, inode, NULL);
- if (!err) {
- ext4_mark_inode_dirty(handle, inode);
- d_instantiate(dentry, inode);
-@@ -2122,7 +2526,7 @@ retry:
- err = ext4_add_dot_dotdot(handle, dir, inode, NULL, NULL);
- if (err)
- goto out_clear_inode;
-- err = ext4_add_entry(handle, dentry, inode);
-+ err = ext4_add_entry(handle, dentry, inode, NULL);
- if (err)
- goto out_clear_inode;
- ext4_inc_count(handle, dir);
-@@ -2395,7 +2799,7 @@ static int ext4_rmdir(struct inode *dir,
- return PTR_ERR(handle);
-
- retval = -ENOENT;
-- bh = ext4_find_entry(dir, &dentry->d_name, &de);
-+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
- if (!bh)
- goto end_rmdir;
-
-@@ -2460,7 +2864,7 @@ static int ext4_unlink(struct inode *dir
- ext4_handle_sync(handle);
-
- retval = -ENOENT;
-- bh = ext4_find_entry(dir, &dentry->d_name, &de);
-+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
- if (!bh)
- goto end_unlink;
-
-@@ -2628,7 +3032,7 @@ retry:
- ext4_inc_count(handle, inode);
- ihold(inode);
-
-- err = ext4_add_entry(handle, dentry, inode);
-+ err = ext4_add_entry(handle, dentry, inode, NULL);
- if (!err) {
- ext4_mark_inode_dirty(handle, inode);
- d_instantiate(dentry, inode);
-@@ -2676,7 +3080,7 @@ static int ext4_rename(struct inode *old
- if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
- ext4_handle_sync(handle);
-
-- old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
-+ old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL);
- /*
- * Check for inode number is _not_ due to possible IO errors.
- * We might rmdir the source, keep it as pwd of some process
-@@ -2689,7 +3093,7 @@ static int ext4_rename(struct inode *old
- goto end_rename;
-
- new_inode = new_dentry->d_inode;
-- new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
-+ new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de, NULL);
- if (new_bh) {
- if (!new_inode) {
- brelse(new_bh);
-@@ -2719,7 +3123,7 @@ static int ext4_rename(struct inode *old
- goto end_rename;
- }
- if (!new_bh) {
-- retval = ext4_add_entry(handle, new_dentry, old_inode);
-+ retval = ext4_add_entry(handle, new_dentry, old_inode, NULL);
- if (retval)
- goto end_rename;
- } else {
-@@ -2767,7 +3171,8 @@ static int ext4_rename(struct inode *old
- struct buffer_head *old_bh2;
- struct ext4_dir_entry_2 *old_de2;
-
-- old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
-+ old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name,
-+ &old_de2, NULL);
- if (old_bh2) {
- retval = ext4_delete_entry(handle, old_dir,
- old_de2, old_bh2);
---- /dev/null
-+++ b/include/linux/htree_lock.h
-@@ -0,0 +1,187 @@
-+/*
-+ * include/linux/htree_lock.h
-+ *
-+ * Copyright (c) 2011, 2012, Intel Corporation.
-+ *
-+ * Author: Liang Zhen <liang@whamcloud.com>
-+ */
-+
-+/*
-+ * htree lock
-+ *
-+ * htree_lock is an advanced lock, it can support five lock modes (concept is
-+ * taken from DLM) and it's a sleeping lock.
-+ *
-+ * most common use case is:
-+ * - create a htree_lock_head for data
-+ * - each thread (contender) creates it's own htree_lock
-+ * - contender needs to call htree_lock(lock_node, mode) to protect data and
-+ * call htree_unlock to release lock
-+ *
-+ * Also, there is advanced use-case which is more complex, user can have
-+ * PW/PR lock on particular key, it's mostly used while user holding shared
-+ * lock on the htree (CW, CR)
-+ *
-+ * htree_lock(lock_node, HTREE_LOCK_CR); lock the htree with CR
-+ * htree_node_lock(lock_node, HTREE_LOCK_PR, key...); lock @key with PR
-+ * ...
-+ * htree_node_unlock(lock_node);; unlock the key
-+ *
-+ * Another tip is, we can have N-levels of this kind of keys, all we need to
-+ * do is specifying N-levels while creating htree_lock_head, then we can
-+ * lock/unlock a specific level by:
-+ * htree_node_lock(lock_node, mode1, key1, level1...);
-+ * do something;
-+ * htree_node_lock(lock_node, mode1, key2, level2...);
-+ * do something;
-+ * htree_node_unlock(lock_node, level2);
-+ * htree_node_unlock(lock_node, level1);
-+ *
-+ * NB: for multi-level, should be careful about locking order to avoid deadlock
-+ */
-+
-+#ifndef _LINUX_HTREE_LOCK_H
-+#define _LINUX_HTREE_LOCK_H
-+
-+#include <linux/list.h>
-+#include <linux/spinlock.h>
-+#include <linux/sched.h>
-+
-+/*
-+ * Lock Modes
-+ * more details can be found here:
-+ * http://en.wikipedia.org/wiki/Distributed_lock_manager
-+ */
-+typedef enum {
-+ HTREE_LOCK_EX = 0, /* exclusive lock: incompatible with all others */
-+ HTREE_LOCK_PW, /* protected write: allows only CR users */
-+ HTREE_LOCK_PR, /* protected read: allow PR, CR users */
-+ HTREE_LOCK_CW, /* concurrent write: allow CR, CW users */
-+ HTREE_LOCK_CR, /* concurrent read: allow all but EX users */
-+ HTREE_LOCK_MAX, /* number of lock modes */
-+} htree_lock_mode_t;
-+
-+#define HTREE_LOCK_NL HTREE_LOCK_MAX
-+#define HTREE_LOCK_INVAL 0xdead10c
-+
-+enum {
-+ HTREE_HBITS_MIN = 2,
-+ HTREE_HBITS_DEF = 14,
-+ HTREE_HBITS_MAX = 32,
-+};
-+
-+enum {
-+ HTREE_EVENT_DISABLE = (0),
-+ HTREE_EVENT_RD = (1 << HTREE_LOCK_PR),
-+ HTREE_EVENT_WR = (1 << HTREE_LOCK_PW),
-+ HTREE_EVENT_RDWR = (HTREE_EVENT_RD | HTREE_EVENT_WR),
-+};
-+
-+struct htree_lock;
-+
-+typedef void (*htree_event_cb_t)(void *target, void *event);
-+
-+struct htree_lock_child {
-+ struct list_head lc_list; /* granted list */
-+ htree_event_cb_t lc_callback; /* event callback */
-+ unsigned lc_events; /* event types */
-+};
-+
-+struct htree_lock_head {
-+ unsigned long lh_lock; /* bits lock */
-+ /* blocked lock list (htree_lock) */
-+ struct list_head lh_blocked_list;
-+ /* # key levels */
-+ u16 lh_depth;
-+ /* hash bits for key and limit number of locks */
-+ u16 lh_hbits;
-+ /* counters for blocked locks */
-+ u16 lh_nblocked[HTREE_LOCK_MAX];
-+ /* counters for granted locks */
-+ u16 lh_ngranted[HTREE_LOCK_MAX];
-+ /* private data */
-+ void *lh_private;
-+ /* array of children locks */
-+ struct htree_lock_child lh_children[0];
-+};
-+
-+/* htree_lock_node_t is child-lock for a specific key (ln_value) */
-+struct htree_lock_node {
-+ htree_lock_mode_t ln_mode;
-+ /* major hash key */
-+ u16 ln_major_key;
-+ /* minor hash key */
-+ u16 ln_minor_key;
-+ struct list_head ln_major_list;
-+ struct list_head ln_minor_list;
-+ /* alive list, all locks (granted, blocked, listening) are on it */
-+ struct list_head ln_alive_list;
-+ /* blocked list */
-+ struct list_head ln_blocked_list;
-+ /* granted list */
-+ struct list_head ln_granted_list;
-+ void *ln_ev_target;
-+};
-+
-+struct htree_lock {
-+ struct task_struct *lk_task;
-+ struct htree_lock_head *lk_head;
-+ void *lk_private;
-+ unsigned lk_depth;
-+ htree_lock_mode_t lk_mode;
-+ struct list_head lk_blocked_list;
-+ struct htree_lock_node lk_nodes[0];
-+};
-+
-+/* create a lock head, which stands for a resource */
-+struct htree_lock_head *htree_lock_head_alloc(unsigned depth,
-+ unsigned hbits, unsigned priv);
-+/* free a lock head */
-+void htree_lock_head_free(struct htree_lock_head *lhead);
-+/* register event callback for child lock at level @depth */
-+void htree_lock_event_attach(struct htree_lock_head *lhead, unsigned depth,
-+ unsigned events, htree_event_cb_t callback);
-+/* create a lock handle, which stands for a thread */
-+struct htree_lock *htree_lock_alloc(unsigned depth, unsigned pbytes);
-+/* free a lock handle */
-+void htree_lock_free(struct htree_lock *lck);
-+/* lock htree, when @wait is true, 0 is returned if the lock can't
-+ * be granted immediately */
-+int htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead,
-+ htree_lock_mode_t mode, int wait);
-+/* unlock htree */
-+void htree_unlock(struct htree_lock *lck);
-+/* unlock and relock htree with @new_mode */
-+int htree_change_lock_try(struct htree_lock *lck,
-+ htree_lock_mode_t new_mode, int wait);
-+void htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode);
-+/* require child lock (key) of htree at level @dep, @event will be sent to all
-+ * listeners on this @key while lock being granted */
-+int htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode,
-+ u32 key, unsigned dep, int wait, void *event);
-+/* release child lock at level @dep, this lock will listen on it's key
-+ * if @event isn't NULL, event_cb will be called against @lck while granting
-+ * any other lock at level @dep with the same key */
-+void htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event);
-+/* stop listening on child lock at level @dep */
-+void htree_node_stop_listen(struct htree_lock *lck, unsigned dep);
-+/* for debug */
-+void htree_lock_stat_print(int depth);
-+void htree_lock_stat_reset(void);
-+
-+#define htree_lock(lck, lh, mode) htree_lock_try(lck, lh, mode, 1)
-+#define htree_change_lock(lck, mode) htree_change_lock_try(lck, mode, 1)
-+
-+#define htree_lock_mode(lck) ((lck)->lk_mode)
-+
-+#define htree_node_lock(lck, mode, key, dep) \
-+ htree_node_lock_try(lck, mode, key, dep, 1, NULL)
-+/* this is only safe in thread context of lock owner */
-+#define htree_node_is_granted(lck, dep) \
-+ ((lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_INVAL && \
-+ (lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_NL)
-+/* this is only safe in thread context of lock owner */
-+#define htree_node_is_listening(lck, dep) \
-+ ((lck)->lk_nodes[dep].ln_mode == HTREE_LOCK_NL)
-+
-+#endif
rhel6.3/ext4-lookup-dotdot.patch
rhel6.3/ext4-print-inum-in-htree-warning.patch
rhel6.4/ext4-prealloc.patch
+rhel6.3/ext4-use-correct-inode.patch
rhel6.3/ext4-mballoc-extra-checks.patch
rhel6.4/ext4-misc.patch
rhel6.3/ext4-pdir-fix.patch
rhel6.3/ext4-journal-callback.patch
rhel6.5/ext4-ext-walk-space.patch
rhel6.3/ext4-store-tree-generation-at-find.patch
+rhel6.3/ext4-large-dir.patch
rhel6.3/ext4-pdirop.patch
rhel6.4/ext4-extra-isize.patch
rhel6.3/ext4-quota-force-block-alloc-quotaoff.patch
rhel6.3/ext4-export-64bit-name-hash.patch
rhel6.3/ext4-journal-callback.patch
rhel6.3/ext4-store-tree-generation-at-find.patch
+rhel6.3/ext4-large-dir.patch
rhel6.3/ext4-pdirop.patch
rhel6.3/ext4-quota-force-block-alloc-quotaoff.patch
rhel6.3/ext4-quota-dont-update-cmtime.patch
rhel6.3/ext4-nocmtime-2.6.patch
rhel6.3/ext4-export-64bit-name-hash.patch
sles11sp2/ext4-store-tree-generation-at-find.patch
-sles11sp2/ext4-pdirop.patch
+sles11sp2/ext4-large-dir.patch
+rhel6.3/ext4-pdirop.patch
rhel6.3/ext4-max-dir-size.patch
sles11sp2/ext4-max-dir-size-options.patch
rhel6.3/ext4-not-discard-preallocation-umount.patch
sles11sp2/ext4-disable-mb-cache.patch
rhel6.3/ext4-nocmtime-2.6.patch
sles11sp2/ext4-store-tree-generation-at-find.patch
-sles11sp2/ext4-pdirop.patch
+sles11sp2/ext4-large-dir.patch
+rhel6.3/ext4-pdirop.patch
rhel6.3/ext4-max-dir-size.patch
sles11sp2/ext4-max-dir-size-options.patch
rhel6.3/ext4-not-discard-preallocation-umount.patch
rhel7/ext4-large-eas.patch
rhel7/ext4-disable-mb-cache.patch
rhel7/ext4-nocmtime.patch
+rhel7/ext4-large-dir.patch
rhel7/ext4-pdirop.patch
rhel7/ext4-max-dir-size.patch
rhel7/ext4-remove-truncate-warning.patch
};
#define osd_ldiskfs_add_entry(handle, child, cinode, hlock) \
- ldiskfs_add_entry(handle, child, cinode, hlock)
+ __ldiskfs_add_entry(handle, child, cinode, hlock)
#define OSD_OTABLE_IT_CACHE_SIZE 64
#define OSD_OTABLE_IT_CACHE_MASK (~(OSD_OTABLE_IT_CACHE_SIZE - 1))
return dev->od_mdt_map->omm_remote_parent->d_inode->i_ino;
}
-#ifdef JOURNAL_START_HAS_3ARGS
+#ifdef LDISKFS_HT_MISC
# define osd_journal_start_sb(sb, type, nblock) \
ldiskfs_journal_start_sb(sb, type, nblock)
# define osd_ldiskfs_append(handle, inode, nblock, err) \
ldiskfs_append(handle, inode, nblock)
# define osd_ldiskfs_find_entry(dir, name, de, inlined, lock) \
- ldiskfs_find_entry(dir, name, de, inlined, lock)
+ __ldiskfs_find_entry(dir, name, de, inlined, lock)
# define osd_journal_start(inode, type, nblocks) \
ldiskfs_journal_start(inode, type, nblocks)
# define osd_transaction_size(dev) \
# define osd_ldiskfs_append(handle, inode, nblock, err) \
ldiskfs_append(handle, inode, nblock, err)
# define osd_ldiskfs_find_entry(dir, name, de, inlined, lock) \
- ldiskfs_find_entry(dir, name, de, lock)
+ __ldiskfs_find_entry(dir, name, de, lock)
# define osd_journal_start(inode, type, nblocks) \
ldiskfs_journal_start(inode, nblocks)
# define osd_transaction_size(dev) \