Whamcloud - gitweb
LU-6030 ldiskfs: split pdirop patch 64/14264/11
author Yang Sheng <yang.sheng@intel.com>
Mon, 30 Mar 2015 02:40:35 +0000 (10:40 +0800)
committer Oleg Drokin <oleg.drokin@intel.com>
Wed, 10 Jun 2015 02:54:40 +0000 (02:54 +0000)
Split the pdirop patch into two parts: one for N-level htree and largedir,
the other for pdirop and htree-lock. Also do some cleanup
work to reduce the patch size.

Signed-off-by: Yang Sheng <yang.sheng@intel.com>
Change-Id: I08b65d9098be95994f44748dbf14afa9f6d5b372
Reviewed-on: http://review.whamcloud.com/14264
Tested-by: Jenkins
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Bob Glossman <bob.glossman@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
16 files changed:
ldiskfs/kernel_patches/patches/rhel6.3/ext4-large-dir.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/rhel6.3/ext4-osd-iop-common.patch
ldiskfs/kernel_patches/patches/rhel6.3/ext4-pdirop.patch
ldiskfs/kernel_patches/patches/rhel6.3/ext4-use-correct-inode.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/rhel7/ext4-large-dir.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/rhel7/ext4-osd-iop-common.patch
ldiskfs/kernel_patches/patches/rhel7/ext4-pdirop.patch
ldiskfs/kernel_patches/patches/sles11sp2/ext4-large-dir.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/sles11sp2/ext4-osd-iop-common.patch
ldiskfs/kernel_patches/patches/sles11sp2/ext4-pdirop.patch [deleted file]
ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.6.series
ldiskfs/kernel_patches/series/ldiskfs-2.6-sles11.series
ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11.series
ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11sp3.series
ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.series
lustre/osd-ldiskfs/osd_internal.h

diff --git a/ldiskfs/kernel_patches/patches/rhel6.3/ext4-large-dir.patch b/ldiskfs/kernel_patches/patches/rhel6.3/ext4-large-dir.patch
new file mode 100644 (file)
index 0000000..cf5f1f1
--- /dev/null
@@ -0,0 +1,355 @@
+This INCOMPAT_LARGEDIR feature allows larger directories
+to be created in ldiskfs, both with directory sizes over
+2GB and a maximum htree depth of 3 instead of the
+current limit of 2. These features are needed in order
+to exceed the current limit of approximately 10M entries
+in a single directory.
+
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/ext4.h
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4.h
+@@ -1344,6 +1344,7 @@ EXT4_INODE_BIT_FNS(state, state_flags)
+ #define EXT4_FEATURE_INCOMPAT_FLEX_BG         0x0200
+ #define EXT4_FEATURE_INCOMPAT_EA_INODE                0x0400
+ #define EXT4_FEATURE_INCOMPAT_DIRDATA         0x1000
++#define EXT4_FEATURE_INCOMPAT_LARGEDIR                0x4000
+ #define EXT4_FEATURE_COMPAT_SUPP      EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT4_FEATURE_INCOMPAT_SUPP    (EXT4_FEATURE_INCOMPAT_FILETYPE| \
+@@ -1354,7 +1355,8 @@ EXT4_INODE_BIT_FNS(state, state_flags)
+                                        EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+                                        EXT4_FEATURE_INCOMPAT_EA_INODE| \
+                                        EXT4_FEATURE_INCOMPAT_MMP| \
+-                                       EXT4_FEATURE_INCOMPAT_DIRDATA)
++                                       EXT4_FEATURE_INCOMPAT_DIRDATA| \
++                                       EXT4_FEATURE_INCOMPAT_LARGEDIR)
+ #define EXT4_FEATURE_RO_COMPAT_SUPP   (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+                                        EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+@@ -1612,6 +1614,17 @@ ext4_group_first_block_no(struct super_b
+  */
+ #define ERR_BAD_DX_DIR        -75000
++/* htree levels for ext4 */
++#define EXT4_HTREE_LEVEL_COMPAT 2
++#define EXT4_HTREE_LEVEL      3
++
++static inline int
++ext4_dir_htree_level(struct super_block *sb)
++{
++      return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
++              EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
++}
++
+ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
+                       ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
+@@ -2005,13 +2018,15 @@ static inline void ext4_r_blocks_count_s
+       es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
+ }
+-static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
++static inline loff_t ext4_isize(struct super_block *sb,
++                              struct ext4_inode *raw_inode)
+ {
+-      if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
++      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ||
++          S_ISREG(le16_to_cpu(raw_inode->i_mode)))
+               return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
+                       le32_to_cpu(raw_inode->i_size_lo);
+-      else
+-              return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
++
++      return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
+ }
+ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/inode.c
+===================================================================
+--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/inode.c
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/inode.c
+@@ -5470,7 +5470,7 @@ struct inode *ext4_iget(struct super_blo
+       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
+               ei->i_file_acl |=
+                       ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
+-      inode->i_size = ext4_isize(raw_inode);
++      inode->i_size = ext4_isize(sb, raw_inode);
+       ei->i_disksize = inode->i_size;
+ #ifdef CONFIG_QUOTA
+       ei->i_reserved_quota = 0;
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/namei.c
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/namei.c
+@@ -225,7 +225,7 @@ struct dx_root_info * dx_get_dx_info(str
+ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
+ {
+-      return le32_to_cpu(entry->block) & 0x00ffffff;
++      return le32_to_cpu(entry->block) & 0x0fffffff;
+ }
+ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
+@@ -388,7 +388,7 @@ dx_probe(const struct qstr *d_name, stru
+       struct dx_frame *frame = frame_in;
+       u32 hash;
+-      frame->bh = NULL;
++      memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
+       if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
+               goto fail;
+@@ -418,9 +418,16 @@ dx_probe(const struct qstr *d_name, stru
+               goto fail;
+       }
+-      if ((indirect = info->indirect_levels) > 1) {
+-              ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
+-                           info->indirect_levels);
++      indirect = info->indirect_levels;
++      if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
++              ext4_warning(dir->i_sb,
++                           "Directory (ino: %lu) htree depth %#06x exceed "
++                           "supported value", dir->i_ino,
++                           ext4_dir_htree_level(dir->i_sb));
++              if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
++                      ext4_warning(dir->i_sb, "Enable large directory "
++                                              "feature to access it");
++              }
+               brelse(bh);
+               *err = ERR_BAD_DX_DIR;
+               goto fail;
+@@ -512,13 +519,18 @@ fail:
+ static void dx_release (struct dx_frame *frames)
+ {
+       struct dx_root_info *info;
++      int i;
++
+       if (frames[0].bh == NULL)
+               return;
+       info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data);
+-      if (info->indirect_levels)
+-              brelse(frames[1].bh);
+-      brelse(frames[0].bh);
++      for (i = 0; i <= info->indirect_levels; i++) {
++              if (frames[i].bh == NULL)
++                      break;
++              brelse(frames[i].bh);
++              frames[i].bh = NULL;
++      }
+ }
+ /*
+@@ -661,7 +673,7 @@ int ext4_htree_fill_tree(struct file *di
+ {
+       struct dx_hash_info hinfo;
+       struct ext4_dir_entry_2 *de;
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct inode *dir;
+       ext4_lblk_t block;
+       int count = 0;
+@@ -1003,7 +1015,7 @@ static struct buffer_head * ext4_dx_find
+       struct super_block * sb;
+       struct dx_hash_info     hinfo;
+       u32 hash;
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct ext4_dir_entry_2 *de, *top;
+       struct buffer_head *bh;
+       ext4_lblk_t block;
+@@ -1443,7 +1455,7 @@ static int add_dirent_to_buf(handle_t *h
+        */
+       dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+       ext4_update_dx_flag(dir);
+-      dir->i_version++;
++      inode_inc_iversion(dir);
+       ext4_mark_inode_dirty(handle, dir);
+       BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+       err = ext4_handle_dirty_metadata(handle, dir, bh);
+@@ -1463,7 +1475,7 @@ static int make_indexed_dir(handle_t *ha
+       const char      *name = dentry->d_name.name;
+       int             namelen = dentry->d_name.len;
+       struct buffer_head *bh2;
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct dx_entry *entries;
+       struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
+       char            *data1, *top;
+@@ -1712,15 +1724,18 @@ static int ext4_add_entry(handle_t *hand
+ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+                            struct inode *inode)
+ {
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct dx_entry *entries, *at;
+       struct dx_hash_info hinfo;
+       struct buffer_head *bh;
+       struct inode *dir = dentry->d_parent->d_inode;
+       struct super_block *sb = dir->i_sb;
+       struct ext4_dir_entry_2 *de;
++      int restart;
+       int err;
++again:
++      restart = 0;
+       frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
+       if (!frame)
+               return err;
+@@ -1730,33 +1745,48 @@ static int ext4_dx_add_entry(handle_t *h
+       if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
+               goto cleanup;
+-      BUFFER_TRACE(bh, "get_write_access");
+-      err = ext4_journal_get_write_access(handle, bh);
+-      if (err)
+-              goto journal_error;
+-
+       err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+       if (err != -ENOSPC)
+               goto cleanup;
++      err = 0;
+       /* Block full, should compress but for now just split */
+       dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
+                      dx_get_count(entries), dx_get_limit(entries)));
+       /* Need to split index? */
+       if (dx_get_count(entries) == dx_get_limit(entries)) {
+               ext4_lblk_t newblock;
+-              unsigned icount = dx_get_count(entries);
+-              int levels = frame - frames;
++              int levels = frame - frames + 1;
++              unsigned icount;
++              int add_level = 1;
+               struct dx_entry *entries2;
+               struct dx_node *node2;
+               struct buffer_head *bh2;
+-              if (levels && (dx_get_count(frames->entries) ==
+-                             dx_get_limit(frames->entries))) {
+-                      ext4_warning(sb, "Directory index full!");
++              while (frame > frames) {
++                      if (dx_get_count((frame - 1)->entries) <
++                          dx_get_limit((frame - 1)->entries)) {
++                              add_level = 0;
++                              break;
++                      }
++                      frame--; /* split higher index block */
++                      at = frame->at;
++                      entries = frame->entries;
++                      restart = 1;
++              }
++              if (add_level && levels == ext4_dir_htree_level(sb)) {
++                      ext4_warning(sb, "Directory (ino: %lu) index full, "
++                                       "reach max htree level :%d",
++                                       dir->i_ino, levels);
++                      if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
++                              ext4_warning(sb, "Large directory feature is"
++                                               "not enabled on this "
++                                               "filesystem");
++                      }
+                       err = -ENOSPC;
+                       goto cleanup;
+               }
++              icount = dx_get_count(entries);
+               bh2 = ext4_append (handle, dir, &newblock, &err);
+               if (!(bh2))
+                       goto cleanup;
+@@ -1769,7 +1799,7 @@ static int ext4_dx_add_entry(handle_t *h
+               err = ext4_journal_get_write_access(handle, frame->bh);
+               if (err)
+                       goto journal_error;
+-              if (levels) {
++              if (!add_level) {
+                       unsigned icount1 = icount/2, icount2 = icount - icount1;
+                       unsigned hash2 = dx_get_hash(entries + icount1);
+                       dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
+@@ -1777,7 +1807,7 @@ static int ext4_dx_add_entry(handle_t *h
+                       BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
+                       err = ext4_journal_get_write_access(handle,
+-                                                           frames[0].bh);
++                                                          (frame - 1)->bh);
+                       if (err)
+                               goto journal_error;
+@@ -1793,18 +1823,24 @@ static int ext4_dx_add_entry(handle_t *h
+                               frame->entries = entries = entries2;
+                               swap(frame->bh, bh2);
+                       }
+-                      dx_insert_block(frames + 0, hash2, newblock);
+-                      dxtrace(dx_show_index("node", frames[1].entries));
++                      dx_insert_block((frame - 1), hash2, newblock);
++                      dxtrace(dx_show_index("node", frame->entries));
+                       dxtrace(dx_show_index("node",
+                              ((struct dx_node *) bh2->b_data)->entries));
+                       err = ext4_handle_dirty_metadata(handle, dir, bh2);
+                       if (err)
+                               goto journal_error;
+                       brelse (bh2);
++                      ext4_handle_dirty_metadata(handle, dir,
++                                                 (frame - 1)->bh);
++                      if (restart) {
++                              ext4_handle_dirty_metadata(handle, dir,
++                                                         frame->bh);
++                              goto cleanup;
++                      }
+               } else {
+                       struct dx_root_info * info;
+-                      dxtrace(printk(KERN_DEBUG
+-                                     "Creating second level index...\n"));
++
+                       memcpy((char *) entries2, (char *) entries,
+                              icount * sizeof(struct dx_entry));
+                       dx_set_limit(entries2, dx_node_limit(dir));
+@@ -1814,19 +1850,16 @@ static int ext4_dx_add_entry(handle_t *h
+                       dx_set_block(entries + 0, newblock);
+                       info = dx_get_dx_info((struct ext4_dir_entry_2*)
+                                       frames[0].bh->b_data);
+-                      info->indirect_levels = 1;
+-
+-                      /* Add new access path frame */
+-                      frame = frames + 1;
+-                      frame->at = at = at - entries + entries2;
+-                      frame->entries = entries = entries2;
+-                      frame->bh = bh2;
+-                      err = ext4_journal_get_write_access(handle,
+-                                                           frame->bh);
+-                      if (err)
+-                              goto journal_error;
++                      info->indirect_levels += 1;
++                      dxtrace(printk(KERN_DEBUG
++                                     "Creating %d level index...\n",
++                                     info->indirect_levels));
++                      ext4_handle_dirty_metadata(handle, dir, frame->bh);
++                      ext4_handle_dirty_metadata(handle, dir, bh2);
++                      brelse(bh2);
++                      restart = 1;
++                      goto cleanup;
+               }
+-              err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
+               if (err) {
+                       ext4_std_error(inode->i_sb, err);
+                       goto cleanup;
+@@ -1840,6 +1873,10 @@ cleanup:
+       if (bh)
+               brelse(bh);
+       dx_release(frames);
+      /* If @restart is set, the htree path has changed, so we need to
+       * repeat dx_probe() to find the valid htree path */
++      if (restart && err == 0)
++              goto again;
+       return err;
+ }
+@@ -1874,7 +1911,7 @@ int ext4_delete_entry(handle_t *handle,
+                                       blocksize);
+                       else
+                               de->inode = 0;
+-                      dir->i_version++;
++                      inode_inc_iversion(dir);
+                       BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+                       ext4_handle_dirty_metadata(handle, dir, bh);
+                       return 0;
index 31c68a4..e8718f9 100644 (file)
@@ -1,20 +1,14 @@
 --- a/fs/ext4/ext4.h
 +++ b/fs/ext4/ext4.h
-@@ -1778,6 +1778,19 @@ extern int ext4_orphan_add(handle_t *, s
+@@ -1778,6 +1778,13 @@ extern int ext4_orphan_add(handle_t *, s
  extern int ext4_orphan_del(handle_t *, struct inode *);
  extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
                                __u32 start_minor_hash, __u32 *next_hash);
 +extern struct inode *ext4_create_inode(handle_t *handle,
 +                                     struct inode * dir, int mode);
-+extern int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-+                        struct inode *inode);
 +extern int ext4_delete_entry(handle_t *handle, struct inode * dir,
 +                           struct ext4_dir_entry_2 * de_del,
 +                           struct buffer_head * bh);
-+extern struct buffer_head * ext4_find_entry(struct inode *dir,
-+                                          const struct qstr *d_name,
-+                                          struct ext4_dir_entry_2 ** res_dir);
-+#define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir)
 +extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
 +                             struct inode *inode);
  
  #include <linux/fs.h>
  #include <linux/pagemap.h>
  #include <linux/jbd2.h>
-@@ -873,9 +874,9 @@ static inline int search_dirblock(struct
-  * The returned buffer_head has ->b_count elevated.  The caller is expected
-  * to brelse() it when appropriate.
-  */
--static struct buffer_head * ext4_find_entry (struct inode *dir,
--                                      const struct qstr *d_name,
--                                      struct ext4_dir_entry_2 ** res_dir)
-+struct buffer_head * ext4_find_entry(struct inode *dir,
-+                                    const struct qstr *d_name,
-+                                    struct ext4_dir_entry_2 ** res_dir)
- {
-       struct super_block *sb;
-       struct buffer_head *bh_use[NAMEI_RA_SIZE];
-@@ -981,6 +982,7 @@ cleanup_and_exit:
-               brelse(bh_use[ra_ptr]);
-       return ret;
- }
-+EXPORT_SYMBOL(ext4_find_entry);
- static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
-                      struct ext4_dir_entry_2 **res_dir, int *err)
-@@ -1515,8 +1517,8 @@ static int make_indexed_dir(handle_t *ha
-  * may not sleep between calling this and putting something into
-  * the entry, as someone else might have used it while you slept.
-  */
--static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
--                        struct inode *inode)
-+int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-+                 struct inode *inode)
- {
-       struct inode *dir = dentry->d_parent->d_inode;
-       struct buffer_head *bh;
-@@ -1565,6 +1567,7 @@ static int ext4_add_entry(handle_t *hand
-       brelse(bh);
-       return retval;
- }
-+EXPORT_SYMBOL(ext4_add_entry);
- /*
-  * Returns 0 for success, or a negative error value
 @@ -1704,10 +1707,10 @@ cleanup:
   * ext4_delete_entry deletes a directory entry by merging it with the
   * previous entry
index c6e93c3..32adfc1 100644 (file)
@@ -1,5 +1,7 @@
---- /dev/null  2011-12-14 22:16:16.000000000 +0800
-+++ linux-2.6.32-131.6.1-pdo/include/linux/htree_lock.h        2011-12-02 17:09:34.000000000 +0800
+Index: linux-2.6.32-504.3.3.el6.x86_64/include/linux/htree_lock.h
+===================================================================
+--- /dev/null
++++ linux-2.6.32-504.3.3.el6.x86_64/include/linux/htree_lock.h
 @@ -0,0 +1,187 @@
 +/*
 + * include/linux/htree_lock.h
 +      ((lck)->lk_nodes[dep].ln_mode == HTREE_LOCK_NL)
 +
 +#endif
---- /dev/null  2011-12-14 22:16:16.000000000 +0800
-+++ linux-2.6.32-131.6.1-pdo/fs/ext4/htree_lock.c      2011-12-14 22:56:28.000000000 +0800
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/htree_lock.c
+===================================================================
+--- /dev/null
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/htree_lock.c
 @@ -0,0 +1,880 @@
 +/*
 + * fs/ext4/htree_lock.c
 +      kfree(lck);
 +}
 +EXPORT_SYMBOL(htree_lock_free);
---- linux-2.6.32-131.6.1/fs/ext4/ext4.h        2011-10-06 20:10:49.000000000 +0800
-+++ linux-2.6.32-131.6.1-pdo/fs/ext4/ext4.h    2011-12-08 18:25:00.000000000 +0800
-@@ -28,6 +28,7 @@
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/ext4.h
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4.h
+@@ -27,6 +27,7 @@
  #include <linux/mutex.h>
  #include <linux/timer.h>
  #include <linux/wait.h>
  #include <linux/blockgroup_lock.h>
  #include <linux/percpu_counter.h>
  #ifdef __KERNEL__
-@@ -1277,6 +1278,7 @@ EXT4_INODE_BIT_FNS(state, state_flags)
- #define EXT4_FEATURE_INCOMPAT_MMP               0x0100
- #define EXT4_FEATURE_INCOMPAT_FLEX_BG         0x0200
- #define EXT4_FEATURE_INCOMPAT_DIRDATA         0x1000
-+#define EXT4_FEATURE_INCOMPAT_LARGEDIR                0x4000
- #define EXT4_FEATURE_COMPAT_SUPP      EXT2_FEATURE_COMPAT_EXT_ATTR
- #define EXT4_FEATURE_INCOMPAT_SUPP    (EXT4_FEATURE_INCOMPAT_FILETYPE| \
-@@ -1286,7 +1288,8 @@ EXT4_INODE_BIT_FNS(state, state_flags)
-                                        EXT4_FEATURE_INCOMPAT_64BIT| \
-                                        EXT4_FEATURE_INCOMPAT_FLEX_BG| \
-                                        EXT4_FEATURE_INCOMPAT_MMP| \
--                                       EXT4_FEATURE_INCOMPAT_DIRDATA)
-+                                       EXT4_FEATURE_INCOMPAT_DIRDATA| \
-+                                       EXT4_FEATURE_INCOMPAT_LARGEDIR)
- #define EXT4_FEATURE_RO_COMPAT_SUPP   (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
-                                        EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
-@@ -1536,6 +1539,76 @@ ext4_group_first_block_no(struct super_b
-  */
- #define ERR_BAD_DX_DIR        -75000
+@@ -1625,6 +1626,71 @@ ext4_dir_htree_level(struct super_block
+               EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
+ }
  
-+/* htree levels for ext4 */
-+#define EXT4_HTREE_LEVEL_COMPAT 2
-+#define EXT4_HTREE_LEVEL      3
-+
-+static inline int
-+ext4_dir_htree_level(struct super_block *sb)
-+{
-+      return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
-+              EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
-+}
-+
 +/* assume name-hash is protected by upper layer */
 +#define EXT4_HTREE_LOCK_HASH  0
 +
 +                          struct inode *dir, unsigned flags);
 +#define ext4_htree_unlock(lck)                  htree_unlock(lck)
 +
++extern struct buffer_head * __ext4_find_entry(struct inode *dir,
++                                      const struct qstr *d_name,
++                                      struct ext4_dir_entry_2 **res_dir,
++                                      struct htree_lock *lck);
++extern int __ext4_add_entry(handle_t *handle, struct dentry *dentry,
++                    struct inode *inode, struct htree_lock *lck);
  void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
                        ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
  
-@@ -1769,14 +1842,16 @@ extern int ext4_htree_fill_tree(struct f
- extern struct inode *ext4_create_inode(handle_t *handle,
-                                      struct inode * dir, int mode);
- extern int ext4_add_entry(handle_t *handle, struct dentry *dentry,
--                        struct inode *inode);
-+                        struct inode *inode, struct htree_lock *lck);
- extern int ext4_delete_entry(handle_t *handle, struct inode * dir,
-                            struct ext4_dir_entry_2 * de_del,
-                            struct buffer_head * bh);
- extern struct buffer_head * ext4_find_entry(struct inode *dir,
-                                           const struct qstr *d_name,
--                                          struct ext4_dir_entry_2 ** res_dir);
--#define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir)
-+                                          struct ext4_dir_entry_2 **res_dir,
-+                                          struct htree_lock *lck);
-+#define ll_ext4_find_entry(inode, dentry, res_dir, lck) \
-+      ext4_find_entry(inode, &(dentry)->d_name, res_dir, lck)
- extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
-                              struct inode *inode, const void *, const void *);
- extern struct buffer_head *ext4_append(handle_t *handle,
-@@ -1893,13 +1968,15 @@ static inline void ext4_r_blocks_count_s
-       es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
- }
--static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
-+static inline loff_t ext4_isize(struct super_block *sb,
-+                              struct ext4_inode *raw_inode)
- {
--      if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
-+      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ||
-+          S_ISREG(le16_to_cpu(raw_inode->i_mode)))
-               return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
-                       le32_to_cpu(raw_inode->i_size_lo);
--      else
--              return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
-+
-+      return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
- }
- static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
---- linux-2.6.32-131.6.1/fs/ext4/namei.c       2011-10-06 20:10:49.000000000 +0800
-+++ linux-2.6.32-131.6.1-pdo/fs/ext4/namei.c   2011-12-14 22:55:28.000000000 +0800
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/namei.c
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/namei.c
 @@ -176,7 +176,7 @@ static struct dx_frame *dx_probe(const s
                                 struct inode *dir,
                                 struct dx_hash_info *hinfo,
  
  /*
   * p is at least 6 bytes before the end of page
-@@ -225,7 +225,7 @@ struct dx_root_info * dx_get_dx_info(str
- static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
- {
--      return le32_to_cpu(entry->block) & 0x00ffffff;
-+      return le32_to_cpu(entry->block) & 0x0fffffff;
- }
- static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
-@@ -368,6 +368,223 @@ struct stats dx_show_entries(struct dx_h
+@@ -368,6 +368,225 @@ struct stats dx_show_entries(struct dx_h
  }
  #endif /* DX_DEBUG */
  
 +      struct dx_entry         *ld_at;    /* position of leaf dx_entry */
 +};
 +
-+#define ext4_htree_lock_data(l)       ((struct ext4_dir_lock_data *)(l)->lk_private)
++#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private)
++#define ext4_find_entry(dir, name, dirent) __ext4_find_entry(dir, name, dirent, NULL)
++#define ext4_add_entry(handle, dentry, inode) __ext4_add_entry(handle, dentry, inode, NULL)
 +
 +/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */
 +#define EXT4_HTREE_NODE_CHANGED       (0xcafeULL << 32)
  /*
   * Probe for a directory leaf block to search.
   *
-@@ -379,16 +596,17 @@ struct stats dx_show_entries(struct dx_h
+@@ -379,10 +598,11 @@ struct stats dx_show_entries(struct dx_h
   */
  static struct dx_frame *
  dx_probe(const struct qstr *d_name, struct inode *dir,
        struct dx_root_info * info;
        struct buffer_head *bh;
        struct dx_frame *frame = frame_in;
-       u32 hash;
--      frame->bh = NULL;
-+      memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
-       if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
-               goto fail;
-@@ -418,9 +636,16 @@ dx_probe(const struct qstr *d_name, stru
-               goto fail;
-       }
--      if ((indirect = info->indirect_levels) > 1) {
--              ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
--                           info->indirect_levels);
-+      indirect = info->indirect_levels;
-+      if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
-+              ext4_warning(dir->i_sb,
-+                           "Directory (ino: %lu) htree depth %#06x exceed "
-+                           "supported value", dir->i_ino,
-+                           ext4_dir_htree_level(dir->i_sb));
-+              if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
-+                      ext4_warning(dir->i_sb, "Enable large directory "
-+                                              "feature to access it");
-+              }
-               brelse(bh);
-               *err = ERR_BAD_DX_DIR;
-               goto fail;
-@@ -440,8 +665,15 @@ dx_probe(const struct qstr *d_name, stru
+@@ -447,8 +667,15 @@ dx_probe(const struct qstr *d_name, stru
        dxtrace(printk("Look up %x", hash));
        while (1)
        {
                        ext4_warning(dir->i_sb,
                                     "dx entry: no count or count > limit");
                        brelse(bh);
-@@ -482,9 +714,73 @@ dx_probe(const struct qstr *d_name, stru
+@@ -489,9 +716,73 @@ dx_probe(const struct qstr *d_name, stru
                frame->bh = bh;
                frame->entries = entries;
                frame->at = at;
                at = entries = ((struct dx_node *) bh->b_data)->entries;
                if (dx_get_limit(entries) != dx_node_limit (dir)) {
                        ext4_warning(dir->i_sb,
-@@ -512,13 +808,18 @@ fail:
- static void dx_release (struct dx_frame *frames)
- {
-       struct dx_root_info *info;
-+      int i;
-+
-       if (frames[0].bh == NULL)
-               return;
-       info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data);
--      if (info->indirect_levels)
--              brelse(frames[1].bh);
--      brelse(frames[0].bh);
-+      for (i = 0; i <= info->indirect_levels; i++) {
-+              if (frames[i].bh == NULL)
-+                      break;
-+              brelse(frames[i].bh);
-+              frames[i].bh = NULL;
-+      }
- }
- /*
-@@ -541,7 +842,7 @@ static void dx_release (struct dx_frame 
+@@ -553,7 +844,7 @@ static void dx_release (struct dx_frame
  static int ext4_htree_next_block(struct inode *dir, __u32 hash,
                                 struct dx_frame *frame,
                                 struct dx_frame *frames,
  {
        struct dx_frame *p;
        struct buffer_head *bh;
-@@ -556,12 +857,22 @@ static int ext4_htree_next_block(struct 
+@@ -568,12 +859,22 @@ static int ext4_htree_next_block(struct
         * this loop, num_frames indicates the number of interior
         * nodes need to be read.
         */
                p--;
        }
  
-@@ -584,6 +895,13 @@ static int ext4_htree_next_block(struct 
+@@ -596,6 +897,13 @@ static int ext4_htree_next_block(struct
         * block so no check is necessary
         */
        while (num_frames--) {
                if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
                                      0, &err)))
                        return err; /* Failure */
-@@ -592,6 +910,7 @@ static int ext4_htree_next_block(struct 
+@@ -604,6 +912,7 @@ static int ext4_htree_next_block(struct
                p->bh = bh;
                p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
        }
        return 1;
  }
  
-@@ -661,7 +980,7 @@ int ext4_htree_fill_tree(struct file *di
- {
-       struct dx_hash_info hinfo;
-       struct ext4_dir_entry_2 *de;
--      struct dx_frame frames[2], *frame;
-+      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
-       struct inode *dir;
-       ext4_lblk_t block;
-       int count = 0;
-@@ -684,10 +1003,10 @@ int ext4_htree_fill_tree(struct file *di
+@@ -696,10 +1005,10 @@ int ext4_htree_fill_tree(struct file *di
        }
        hinfo.hash = start_hash;
        hinfo.minor_hash = 0;
        /* Add '.' and '..' from the htree header */
        if (!start_hash && !start_minor_hash) {
                de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
-@@ -714,7 +1033,7 @@ int ext4_htree_fill_tree(struct file *di
+@@ -726,7 +1035,7 @@ int ext4_htree_fill_tree(struct file *di
                count += ret;
                hashval = ~0;
                ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
                *next_hash = hashval;
                if (ret < 0) {
                        err = ret;
-@@ -814,9 +1133,17 @@ static void dx_insert_block(struct dx_fr
+@@ -826,9 +1135,17 @@ static void dx_insert_block(struct dx_fr
  
  static void ext4_update_dx_flag(struct inode *inode)
  {
  }
  
  /*
-@@ -889,8 +1216,9 @@ static inline int search_dirblock(struct
+@@ -900,9 +1217,10 @@ static inline int search_dirblock(struct
+  * The returned buffer_head has ->b_count elevated.  The caller is expected
   * to brelse() it when appropriate.
   */
- struct buffer_head * ext4_find_entry(struct inode *dir,
--                                    const struct qstr *d_name,
--                                    struct ext4_dir_entry_2 ** res_dir)
-+                                   const struct qstr *d_name,
-+                                   struct ext4_dir_entry_2 **res_dir,
-+                                   struct htree_lock *lck)
+-static struct buffer_head * ext4_find_entry (struct inode *dir,
++struct buffer_head * __ext4_find_entry(struct inode *dir,
+                                       const struct qstr *d_name,
+-                                      struct ext4_dir_entry_2 ** res_dir)
++                                      struct ext4_dir_entry_2 **res_dir,
++                                      struct htree_lock *lck)
  {
        struct super_block *sb;
        struct buffer_head *bh_use[NAMEI_RA_SIZE];
-@@ -911,7 +1239,7 @@ struct buffer_head * ext4_find_entry(str
+@@ -923,7 +1241,7 @@ static struct buffer_head * ext4_find_en
        if (namelen > EXT4_NAME_LEN)
                return NULL;
        if (is_dx(dir)) {
                /*
                 * On success, or if the error was file not found,
                 * return.  Otherwise, fall back to doing a search the
-@@ -921,6 +1249,7 @@ struct buffer_head * ext4_find_entry(str
+@@ -933,6 +1251,7 @@ static struct buffer_head * ext4_find_en
                        return bh;
                dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
                               "falling back\n"));
        }
        nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
        start = EXT4_I(dir)->i_dir_start_lookup;
-@@ -998,13 +1327,15 @@ cleanup_and_exit:
+@@ -1008,9 +1327,12 @@ cleanup_and_exit:
+               brelse(bh_use[ra_ptr]);
+       return ret;
  }
- EXPORT_SYMBOL(ext4_find_entry);
++EXPORT_SYMBOL(__ext4_find_entry);
  
 -static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
 -                     struct ext4_dir_entry_2 **res_dir, int *err)
  {
        struct super_block * sb;
        struct dx_hash_info     hinfo;
-       u32 hash;
--      struct dx_frame frames[2], *frame;
-+      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
-       struct ext4_dir_entry_2 *de, *top;
-       struct buffer_head *bh;
-       ext4_lblk_t block;
-@@ -1015,13 +1346,16 @@ static struct buffer_head * ext4_dx_find
+@@ -1026,13 +1348,16 @@ static struct buffer_head * ext4_dx_find
        sb = dir->i_sb;
        /* NFS may look up ".." - look at dx_root directory block */
        if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
        }
        hash = hinfo.hash;
        do {
-@@ -1050,7 +1384,7 @@ static struct buffer_head * ext4_dx_find
+@@ -1061,7 +1386,7 @@ static struct buffer_head * ext4_dx_find
                brelse(bh);
                /* Check to see if we should continue to search */
                retval = ext4_htree_next_block(dir, hash, frame,
                if (retval < 0) {
                        ext4_warning(sb,
                             "error reading index page in directory #%lu",
-@@ -1076,7 +1410,7 @@ static struct dentry *ext4_lookup(struct
-       if (dentry->d_name.len > EXT4_NAME_LEN)
-               return ERR_PTR(-ENAMETOOLONG);
--      bh = ext4_find_entry(dir, &dentry->d_name, &de);
-+      bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
-       inode = NULL;
-       if (bh) {
-               __u32 ino = le32_to_cpu(de->inode);
-@@ -1144,7 +1478,7 @@ struct dentry *ext4_get_parent(struct de
-       struct ext4_dir_entry_2 * de;
-       struct buffer_head *bh;
--      bh = ext4_find_entry(child->d_inode, &dotdot, &de);
-+      bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
-       inode = NULL;
-       if (!bh)
-               return ERR_PTR(-ENOENT);
-@@ -1233,8 +1567,9 @@ static struct ext4_dir_entry_2* dx_pack_
+@@ -1244,8 +1569,9 @@ static struct ext4_dir_entry_2* dx_pack_
   * Returns pointer to de in block into which the new entry will be inserted.
   */
  static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
  {
        unsigned blocksize = dir->i_sb->s_blocksize;
        unsigned count, continued;
-@@ -1291,7 +1626,14 @@ static struct ext4_dir_entry_2 *do_split
+@@ -1302,7 +1628,14 @@ static struct ext4_dir_entry_2 *do_split
                                        hash2, split, count-split));
  
        /* Fancy dance to stay within two buffers */
        de = dx_pack_dirents(data1, blocksize);
        de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
                                           blocksize);
-@@ -1300,13 +1642,21 @@ static struct ext4_dir_entry_2 *do_split
+@@ -1311,13 +1644,21 @@ static struct ext4_dir_entry_2 *do_split
        dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
        dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
  
        err = ext4_handle_dirty_metadata(handle, dir, bh2);
        if (err)
                goto journal_error;
-@@ -1418,7 +1768,7 @@ static int add_dirent_to_buf(handle_t *h
-       if (!IS_NOCMTIME(dir))
-               dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
-       ext4_update_dx_flag(dir);
--      dir->i_version++;
-+      inode_inc_iversion(dir);
-       ext4_mark_inode_dirty(handle, dir);
-       BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-       err = ext4_handle_dirty_metadata(handle, dir, bh);
-@@ -1438,7 +1788,7 @@ static int make_indexed_dir(handle_t *ha
-       const char      *name = dentry->d_name.name;
-       int             namelen = dentry->d_name.len;
-       struct buffer_head *bh2;
--      struct dx_frame frames[2], *frame;
-+      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
-       struct dx_entry *entries;
-       struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
-       char            *data1, *top;
-@@ -1517,7 +1867,7 @@ static int make_indexed_dir(handle_t *ha
-       ext4_handle_dirty_metadata(handle, dir, frame->bh);
-       ext4_handle_dirty_metadata(handle, dir, bh);
-
+@@ -1558,7 +1899,7 @@ static int make_indexed_dir(handle_t *ha
+       ext4_handle_dirty_metadata(handle, dir, frame->bh);
+       ext4_handle_dirty_metadata(handle, dir, bh);
 -      de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
 +      de = do_split(handle,dir, &bh, frames, frame, &hinfo, NULL, &retval);
-       if (!de) {
-               /*
-                * Even if the block split failed, we have to properly write
-@@ -1616,7 +1966,7 @@ out:
+       if (!de) {
+               /*
+                * Even if the block split failed, we have to properly write
+@@ -1664,8 +2005,8 @@ out:
+  * may not sleep between calling this and putting something into
   * the entry, as someone else might have used it while you slept.
   */
- int ext4_add_entry(handle_t *handle, struct dentry *dentry,
--                 struct inode *inode)
-+                 struct inode *inode, struct htree_lock *lck)
+-static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+-                        struct inode *inode)
++int __ext4_add_entry(handle_t *handle, struct dentry *dentry,
++                    struct inode *inode, struct htree_lock *lck)
  {
        struct inode *dir = dentry->d_parent->d_inode;
        struct buffer_head *bh;
-@@ -1635,9 +1985,10 @@ int ext4_add_entry(handle_t *handle, str
+@@ -1684,9 +2025,10 @@ static int ext4_add_entry(handle_t *hand
                if (dentry->d_name.len == 2 &&
                    memcmp(dentry->d_name.name, "..", 2) == 0)
                        return ext4_update_dotdot(handle, dentry, inode);
                ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
                dx_fallback++;
                ext4_mark_inode_dirty(handle, dir);
-@@ -1674,18 +2025,21 @@ EXPORT_SYMBOL(ext4_add_entry);
+@@ -1717,12 +2059,13 @@ static int ext4_add_entry(handle_t *hand
+       brelse(bh);
+       return retval;
+ }
++EXPORT_SYMBOL(__ext4_add_entry);
+ /*
   * Returns 0 for success, or a negative error value
   */
  static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 -                           struct inode *inode)
 +                           struct inode *inode, struct htree_lock *lck)
  {
--      struct dx_frame frames[2], *frame;
-+      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
        struct dx_entry *entries, *at;
-       struct dx_hash_info hinfo;
-       struct buffer_head *bh;
-       struct inode *dir = dentry->d_parent->d_inode;
-       struct super_block *sb = dir->i_sb;
-       struct ext4_dir_entry_2 *de;
-+      int restart;
-       int err;
+@@ -1736,7 +2079,7 @@ static int ext4_dx_add_entry(handle_t *h
  
+ again:
+       restart = 0;
 -      frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
-+again:
-+      restart = 0;
 +      frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, lck, &err);
        if (!frame)
                return err;
        entries = frame->entries;
-@@ -1694,33 +2048,53 @@ static int ext4_dx_add_entry(handle_t *h
-       if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
-               goto cleanup;
--      BUFFER_TRACE(bh, "get_write_access");
--      err = ext4_journal_get_write_access(handle, bh);
--      if (err)
--              goto journal_error;
--
-       err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
-       if (err != -ENOSPC)
-               goto cleanup;
-+      err = 0;
-       /* Block full, should compress but for now just split */
-       dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
-                      dx_get_count(entries), dx_get_limit(entries)));
-       /* Need to split index? */
-       if (dx_get_count(entries) == dx_get_limit(entries)) {
-               ext4_lblk_t newblock;
--              unsigned icount = dx_get_count(entries);
--              int levels = frame - frames;
-+              int levels = frame - frames + 1;
-+              unsigned icount;
-+              int add_level = 1;
-               struct dx_entry *entries2;
+@@ -1763,6 +2106,11 @@ again:
                struct dx_node *node2;
                struct buffer_head *bh2;
  
--              if (levels && (dx_get_count(frames->entries) ==
--                             dx_get_limit(frames->entries))) {
--                      ext4_warning(sb, "Directory index full!");
 +              if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */
 +                      ext4_htree_safe_relock(lck);
 +                      restart = 1;
 +                      goto cleanup;
 +              }
-+              while (frame > frames) {
-+                      if (dx_get_count((frame - 1)->entries) <
-+                          dx_get_limit((frame - 1)->entries)) {
-+                              add_level = 0;
-+                              break;
-+                      }
-+                      frame--; /* split higher index block */
-+                      at = frame->at;
-+                      entries = frame->entries;
-+                      restart = 1;
-+              }
-+              if (add_level && levels == ext4_dir_htree_level(sb)) {
-+                      ext4_warning(sb, "Directory (ino: %lu) index full, "
-+                                       "reach max htree level :%d",
-+                                       dir->i_ino, levels);
-+                      if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
-+                              ext4_warning(sb, "Large directory feature is"
-+                                               "not enabled on this "
-+                                               "filesystem");
-+                      }
-                       err = -ENOSPC;
+               while (frame > frames) {
+                       if (dx_get_count((frame - 1)->entries) <
+                           dx_get_limit((frame - 1)->entries)) {
+@@ -1860,16 +2208,43 @@ again:
+                       restart = 1;
                        goto cleanup;
                }
-+              icount = dx_get_count(entries);
-               bh2 = ext4_append (handle, dir, &newblock, &err);
-               if (!(bh2))
-                       goto cleanup;
-@@ -1733,7 +2107,7 @@ static int ext4_dx_add_entry(handle_t *h
-               err = ext4_journal_get_write_access(handle, frame->bh);
-               if (err)
-                       goto journal_error;
--              if (levels) {
-+              if (!add_level) {
-                       unsigned icount1 = icount/2, icount2 = icount - icount1;
-                       unsigned hash2 = dx_get_hash(entries + icount1);
-                       dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
-@@ -1741,7 +2115,7 @@ static int ext4_dx_add_entry(handle_t *h
-                       BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
-                       err = ext4_journal_get_write_access(handle,
--                                                           frames[0].bh);
-+                                                          (frame - 1)->bh);
-                       if (err)
-                               goto journal_error;
-@@ -1757,18 +2131,24 @@ static int ext4_dx_add_entry(handle_t *h
-                               frame->entries = entries = entries2;
-                               swap(frame->bh, bh2);
-                       }
--                      dx_insert_block(frames + 0, hash2, newblock);
--                      dxtrace(dx_show_index("node", frames[1].entries));
-+                      dx_insert_block((frame - 1), hash2, newblock);
-+                      dxtrace(dx_show_index("node", frame->entries));
-                       dxtrace(dx_show_index("node",
-                              ((struct dx_node *) bh2->b_data)->entries));
-                       err = ext4_handle_dirty_metadata(handle, inode, bh2);
-                       if (err)
-                               goto journal_error;
-                       brelse (bh2);
-+                      ext4_handle_dirty_metadata(handle, inode,
-+                                                 (frame - 1)->bh);
-+                      if (restart) {
-+                              ext4_handle_dirty_metadata(handle, inode,
-+                                                         frame->bh);
-+                              goto cleanup;
-+                      }
-               } else {
-                       struct dx_root_info * info;
--                      dxtrace(printk(KERN_DEBUG
--                                     "Creating second level index...\n"));
-+
-                       memcpy((char *) entries2, (char *) entries,
-                              icount * sizeof(struct dx_entry));
-                       dx_set_limit(entries2, dx_node_limit(dir));
-@@ -1778,32 +2158,60 @@ static int ext4_dx_add_entry(handle_t *h
-                       dx_set_block(entries + 0, newblock);
-                       info = dx_get_dx_info((struct ext4_dir_entry_2*)
-                                       frames[0].bh->b_data);
--                      info->indirect_levels = 1;
-+                      info->indirect_levels += 1;
-+                      dxtrace(printk(KERN_DEBUG
-+                                     "Creating %d level index...\n",
-+                                     info->indirect_levels));
-+                      ext4_handle_dirty_metadata(handle, inode, frame->bh);
-+                      ext4_handle_dirty_metadata(handle, inode, bh2);
-+                      brelse(bh2);
-+                      restart = 1;
-+                      goto cleanup;
-+              }
 +      } else if (!ext4_htree_dx_locked(lck)) {
 +              struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck);
--                      /* Add new access path frame */
--                      frame = frames + 1;
--                      frame->at = at = at - entries + entries2;
--                      frame->entries = entries = entries2;
--                      frame->bh = bh2;
--                      err = ext4_journal_get_write_access(handle,
--                                                           frame->bh);
--                      if (err)
--                              goto journal_error;
++
 +              /* not well protected, require DX lock */
 +              ext4_htree_dx_need_lock(lck);
 +              at = frame > frames ? (frame - 1)->at : NULL;
 +                  (ld->ld_count != dx_get_count(entries))) {
 +                      restart = 1;
 +                      goto cleanup;
-               }
--              ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
++              }
 +              /* OK, I've got DX lock and nothing changed */
 +              frame->at = ld->ld_at;
        }
        if (bh)
                brelse(bh);
        dx_release(frames);
-+      /* @restart is true means htree-path has been changed, we need to
-+       * repeat dx_probe() to find out valid htree-path */
-+      if (restart && err == 0)
-+              goto again;
-       return err;
- }
-@@ -1838,7 +2246,7 @@ int ext4_delete_entry(handle_t *handle,
-                                       blocksize);
-                       else
-                               de->inode = 0;
--                      dir->i_version++;
-+                      inode_inc_iversion(dir);
-                       BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-                       ext4_handle_dirty_metadata(handle, dir, bh);
-                       return 0;
-@@ -1882,7 +2290,7 @@ static void ext4_dec_count(handle_t *han
- static int ext4_add_nondir(handle_t *handle,
-               struct dentry *dentry, struct inode *inode)
- {
--      int err = ext4_add_entry(handle, dentry, inode);
-+      int err = ext4_add_entry(handle, dentry, inode, NULL);
-       if (!err) {
-               ext4_mark_inode_dirty(handle, inode);
-               d_instantiate(dentry, inode);
-@@ -2112,7 +2520,7 @@ retry:
-               goto out_stop;
-       }
--      err = ext4_add_entry(handle, dentry, inode);
-+      err = ext4_add_entry(handle, dentry, inode, NULL);
-       if (err)
-               goto out_clear_inode;
-       ext4_inc_count(handle, dir);
-@@ -2381,7 +2789,7 @@ static int ext4_rmdir(struct inode *dir,
-               return PTR_ERR(handle);
-       retval = -ENOENT;
--      bh = ext4_find_entry(dir, &dentry->d_name, &de);
-+      bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
-       if (!bh)
-               goto end_rmdir;
-@@ -2443,7 +2851,7 @@ static int ext4_unlink(struct inode *dir
-               ext4_handle_sync(handle);
-       retval = -ENOENT;
--      bh = ext4_find_entry(dir, &dentry->d_name, &de);
-+      bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
-       if (!bh)
-               goto end_unlink;
-@@ -2567,7 +2975,7 @@ retry:
-       ext4_inc_count(handle, inode);
-       atomic_inc(&inode->i_count);
--      err = ext4_add_entry(handle, dentry, inode);
-+      err = ext4_add_entry(handle, dentry, inode, NULL);
-       if (!err) {
-               ext4_mark_inode_dirty(handle, inode);
-               d_instantiate(dentry, inode);
-@@ -2612,7 +3020,7 @@ static int ext4_rename(struct inode *old
-       if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
-               ext4_handle_sync(handle);
--      old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
-+      old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL);
-       /*
-        *  Check for inode number is _not_ due to possible IO errors.
-        *  We might rmdir the source, keep it as pwd of some process
-@@ -2625,7 +3033,7 @@ static int ext4_rename(struct inode *old
-               goto end_rename;
-       new_inode = new_dentry->d_inode;
--      new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
-+      new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de, NULL);
-       if (new_bh) {
-               if (!new_inode) {
-                       brelse(new_bh);
-@@ -2651,7 +3059,7 @@ static int ext4_rename(struct inode *old
-                       goto end_rename;
-       }
-       if (!new_bh) {
--              retval = ext4_add_entry(handle, new_dentry, old_inode);
-+              retval = ext4_add_entry(handle, new_dentry, old_inode, NULL);
-               if (retval)
-                       goto end_rename;
-       } else {
-@@ -2693,7 +3101,8 @@ static int ext4_rename(struct inode *old
-               struct buffer_head *old_bh2;
-               struct ext4_dir_entry_2 *old_de2;
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/Makefile
+===================================================================
+--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/Makefile
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/Makefile
+@@ -6,6 +6,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
  
--              old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
-+              old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name,
-+                                        &old_de2, NULL);
-               if (old_bh2) {
-                       retval = ext4_delete_entry(handle, old_dir,
-                                                  old_de2, old_bh2);
---- linux-2.6.32-131.6.1/fs/ext4/inode.c       2011-10-06 20:10:49.000000000 +0800
-+++ linux-2.6.32-131.6.1-pdo/fs/ext4/inode.c   2011-12-01 22:02:11.000000000 +0800
-@@ -5112,7 +5112,7 @@ struct inode *ext4_iget(struct super_blo
-       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
-               ei->i_file_acl |=
-                       ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
--      inode->i_size = ext4_isize(raw_inode);
-+      inode->i_size = ext4_isize(sb, raw_inode);
-       ei->i_disksize = inode->i_size;
- #ifdef CONFIG_QUOTA
-       ei->i_reserved_quota = 0;
---- linux-2.6.32-131.6.1/fs/ext4/Makefile      2011-10-06 20:10:49.000000000 +0800
-+++ linux-2.6.32-131.6.1-pdo/fs/ext4/Makefile  2011-10-06 12:21:30.000000000 +0800
-@@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
  ext4-y        := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
                ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
++              htree_lock.o \
                ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
--              mmp.o
-+              htree_lock.o mmp.o
+               mmp.o
  
- ext4-$(CONFIG_EXT4_FS_XATTR)          += xattr.o xattr_user.o xattr_trusted.o
- ext4-$(CONFIG_EXT4_FS_POSIX_ACL)      += acl.o
diff --git a/ldiskfs/kernel_patches/patches/rhel6.3/ext4-use-correct-inode.patch b/ldiskfs/kernel_patches/patches/rhel6.3/ext4-use-correct-inode.patch
new file mode 100644 (file)
index 0000000..5d86d19
--- /dev/null
@@ -0,0 +1,49 @@
+From 5930ea643805feb50a2f8383ae12eb6f10935e49 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Wed, 31 Aug 2011 12:02:51 -0400
+Subject: [PATCH] ext4: call ext4_handle_dirty_metadata with correct inode in
+ ext4_dx_add_entry
+
+ext4_dx_add_entry manipulates bh2 and frames[0].bh, which are two buffer_heads
+that point to directory blocks assigned to the directory inode.  However, the
+function calls ext4_handle_dirty_metadata with the inode of the file that's
+being added to the directory, not the directory inode itself.  Therefore,
+correct the code to dirty the directory buffers with the directory inode, not
+the file inode.
+
+Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Cc: stable@kernel.org
+---
+ fs/ext4/namei.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
+index f0abe43..a067835 100644
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -1585,7 +1585,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+                       dxtrace(dx_show_index("node", frames[1].entries));
+                       dxtrace(dx_show_index("node",
+                              ((struct dx_node *) bh2->b_data)->entries));
+-                      err = ext4_handle_dirty_metadata(handle, inode, bh2);
++                      err = ext4_handle_dirty_metadata(handle, dir, bh2);
+                       if (err)
+                               goto journal_error;
+                       brelse (bh2);
+@@ -1611,7 +1611,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+                       if (err)
+                               goto journal_error;
+               }
+-              ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
++              err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
++              if (err) {
++                      ext4_std_error(inode->i_sb, err);
++                      goto cleanup;
++              }
+       }
+       de = do_split(handle, dir, &bh, frame, &hinfo, &err);
+       if (!de)
+-- 
+2.1.0
+
diff --git a/ldiskfs/kernel_patches/patches/rhel7/ext4-large-dir.patch b/ldiskfs/kernel_patches/patches/rhel7/ext4-large-dir.patch
new file mode 100644 (file)
index 0000000..24ef03e
--- /dev/null
@@ -0,0 +1,342 @@
+This INCOMPAT_LARGEDIR feature allows larger directories
+to be created in ldiskfs, both with directory sizes over
+2GB and a maximum htree depth of 3 instead of the
+current limit of 2. These features are needed in order
+to exceed the current limit of approximately 10M entries
+in a single directory.
+
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h
+===================================================================
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/ext4.h
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h
+@@ -1585,7 +1585,8 @@ static inline void ext4_clear_state_flag
+                                        EXT4_FEATURE_INCOMPAT_EA_INODE| \
+                                        EXT4_FEATURE_INCOMPAT_MMP |    \
+                                        EXT4_FEATURE_INCOMPAT_DIRDATA| \
+-                                       EXT4_FEATURE_INCOMPAT_INLINE_DATA)
++                                       EXT4_FEATURE_INCOMPAT_INLINE_DATA| \
++                                       EXT4_FEATURE_INCOMPAT_LARGEDIR)
+ #define EXT4_FEATURE_RO_COMPAT_SUPP   (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+                                        EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+                                        EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
+@@ -1999,6 +2000,9 @@ struct mmpd_data {
+ # define NORET_TYPE   /**/
+ # define ATTRIB_NORET __attribute__((noreturn))
+ # define NORET_AND    noreturn,
++/* htree levels for ext4 */
++#define EXT4_HTREE_LEVEL_COMPAT 2
++#define EXT4_HTREE_LEVEL      3
+ struct ext4_xattr_ino_array {
+       unsigned int xia_count;         /* # of used item in the array */
+@@ -2472,13 +2476,16 @@ static inline void ext4_r_blocks_count_s
+       es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
+ }
+-static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
++static inline loff_t ext4_isize(struct super_block *sb,
++                              struct ext4_inode *raw_inode)
+ {
+-      if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
++      if (S_ISREG(le16_to_cpu(raw_inode->i_mode)) ||
++          (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) &&
++          S_ISDIR(le16_to_cpu(raw_inode->i_mode))))
+               return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
+                       le32_to_cpu(raw_inode->i_size_lo);
+-      else
+-              return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
++
++      return (loff_t)le32_to_cpu(raw_inode->i_size_lo);
+ }
+ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c
+===================================================================
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/namei.c
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c
+@@ -513,7 +513,14 @@ struct dx_root_info * dx_get_dx_info(str
+ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
+ {
+-      return le32_to_cpu(entry->block) & 0x00ffffff;
++      return le32_to_cpu(entry->block) & 0x0fffffff;
++}
++
++static inline int
++ext4_dir_htree_level(struct super_block *sb)
++{
++      return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
++              EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
+ }
+ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
+@@ -681,7 +688,7 @@ dx_probe(const struct qstr *d_name, stru
+       struct dx_frame *frame = frame_in;
+       u32 hash;
+-      frame->bh = NULL;
++      memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
+       bh = ext4_read_dirblock(dir, 0, INDEX);
+       if (IS_ERR(bh)) {
+               *err = PTR_ERR(bh);
+@@ -714,9 +721,16 @@ dx_probe(const struct qstr *d_name, stru
+               goto fail;
+       }
+-      if ((indirect = info->indirect_levels) > 1) {
+-              ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
+-                           info->indirect_levels);
++      indirect = info->indirect_levels;
++      if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
++              ext4_warning(dir->i_sb,
++                           "inode #%lu: comm %s: htree depth %#06x exceed max depth %u",
++                           dir->i_ino, current->comm, indirect,
++                           ext4_dir_htree_level(dir->i_sb));
++              if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
++                      ext4_warning(dir->i_sb, "Enable large directory "
++                                              "feature to access it");
++              }
+               brelse(bh);
+               *err = ERR_BAD_DX_DIR;
+               goto fail;
+@@ -812,13 +826,18 @@ fail:
+ static void dx_release (struct dx_frame *frames)
+ {
+       struct dx_root_info *info;
++      int i;
++
+       if (frames[0].bh == NULL)
+               return;
+       info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data);
+-      if (info->indirect_levels)
+-              brelse(frames[1].bh);
+-      brelse(frames[0].bh);
++      for (i = 0; i <= info->indirect_levels; i++) {
++              if (frames[i].bh == NULL)
++                      break;
++              brelse(frames[i].bh);
++              frames[i].bh = NULL;
++      }
+ }
+ /*
+@@ -960,7 +979,7 @@ int ext4_htree_fill_tree(struct file *di
+ {
+       struct dx_hash_info hinfo;
+       struct ext4_dir_entry_2 *de;
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct inode *dir;
+       ext4_lblk_t block;
+       int count = 0;
+@@ -1376,7 +1395,7 @@ static struct buffer_head * ext4_dx_find
+ {
+       struct super_block * sb = dir->i_sb;
+       struct dx_hash_info     hinfo;
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct buffer_head *bh;
+       ext4_lblk_t block;
+       int retval;
+@@ -1832,7 +1851,7 @@ static int make_indexed_dir(handle_t *ha
+       const char      *name = dentry->d_name.name;
+       int             namelen = dentry->d_name.len;
+       struct buffer_head *bh2;
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct dx_entry *entries;
+       struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
+       struct ext4_dir_entry_tail *t;
+@@ -2117,15 +2136,18 @@ static int ext4_add_entry(handle_t *hand
+ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+                            struct inode *inode)
+ {
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct dx_entry *entries, *at;
+       struct dx_hash_info hinfo;
+       struct buffer_head *bh;
+       struct inode *dir = dentry->d_parent->d_inode;
+       struct super_block *sb = dir->i_sb;
+       struct ext4_dir_entry_2 *de;
++      int restart;
+       int err;
++again:
++      restart = 0;
+       frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
+       if (!frame)
+               return err;
+@@ -2138,33 +2160,48 @@ static int ext4_dx_add_entry(handle_t *h
+               goto cleanup;
+       }
+-      BUFFER_TRACE(bh, "get_write_access");
+-      err = ext4_journal_get_write_access(handle, bh);
+-      if (err)
+-              goto journal_error;
+-
+       err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+       if (err != -ENOSPC)
+               goto cleanup;
++      err = 0;
+       /* Block full, should compress but for now just split */
+       dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
+                      dx_get_count(entries), dx_get_limit(entries)));
+       /* Need to split index? */
+       if (dx_get_count(entries) == dx_get_limit(entries)) {
+               ext4_lblk_t newblock;
+-              unsigned icount = dx_get_count(entries);
+-              int levels = frame - frames;
++              int levels = frame - frames + 1;
++              unsigned icount;
++              int add_level = 1;
+               struct dx_entry *entries2;
+               struct dx_node *node2;
+               struct buffer_head *bh2;
+-              if (levels && (dx_get_count(frames->entries) ==
+-                             dx_get_limit(frames->entries))) {
+-                      ext4_warning(sb, "Directory index full!");
++              while (frame > frames) {
++                      if (dx_get_count((frame - 1)->entries) <
++                          dx_get_limit((frame - 1)->entries)) {
++                              add_level = 0;
++                              break;
++                      }
++                      frame--; /* split higher index block */
++                      at = frame->at;
++                      entries = frame->entries;
++                      restart = 1;
++              }
++              if (add_level && levels == ext4_dir_htree_level(sb)) {
++                      ext4_warning(sb, "inode %lu: comm %s: index %u: reach max htree level %u",
++                                       dir->i_ino, current->comm, levels,
++                                       ext4_dir_htree_level(sb));
++                      if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
++                              ext4_warning(sb, "Large directory feature is"
++                                               "not enabled on this "
++                                               "filesystem");
++                      }
+                       err = -ENOSPC;
+                       goto cleanup;
+               }
++              icount = dx_get_count(entries);
+               bh2 = ext4_append(handle, dir, &newblock);
+               if (IS_ERR(bh2)) {
+                       err = PTR_ERR(bh2);
+@@ -2179,7 +2216,7 @@ static int ext4_dx_add_entry(handle_t *h
+               err = ext4_journal_get_write_access(handle, frame->bh);
+               if (err)
+                       goto journal_error;
+-              if (levels) {
++              if (!add_level) {
+                       unsigned icount1 = icount/2, icount2 = icount - icount1;
+                       unsigned hash2 = dx_get_hash(entries + icount1);
+                       dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
+@@ -2187,7 +2224,7 @@ static int ext4_dx_add_entry(handle_t *h
+                       BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
+                       err = ext4_journal_get_write_access(handle,
+-                                                           frames[0].bh);
++                                                          (frame - 1)->bh);
+                       if (err)
+                               goto journal_error;
+@@ -2203,19 +2240,25 @@ static int ext4_dx_add_entry(handle_t *h
+                               frame->entries = entries = entries2;
+                               swap(frame->bh, bh2);
+                       }
+-                      dx_insert_block(frames + 0, hash2, newblock);
+-                      dxtrace(dx_show_index("node", frames[1].entries));
++                      dx_insert_block(frame - 1, hash2, newblock);
++                      dxtrace(dx_show_index("node", frame->entries));
+                       dxtrace(dx_show_index("node",
+-                             ((struct dx_node *) bh2->b_data)->entries));
++                             ((struct dx_node *)bh2->b_data)->entries));
+                       err = ext4_handle_dirty_dx_node(handle, dir, bh2);
+                       if (err)
+                               goto journal_error;
+                       brelse (bh2);
++                      ext4_handle_dirty_dirent_node(handle, dir,
++                                                    (frame - 1)->bh);
++                      if (restart) {
++                              ext4_handle_dirty_dirent_node(handle, dir,
++                                                            frame->bh);
++                              goto cleanup;
++                      }
+               } else {
+                       struct dx_root_info * info;
+-                      dxtrace(printk(KERN_DEBUG
+-                                     "Creating second level index...\n"));
+-                      memcpy((char *) entries2, (char *) entries,
++
++                      memcpy((char *)entries2, (char *)entries,
+                              icount * sizeof(struct dx_entry));
+                       dx_set_limit(entries2, dx_node_limit(dir));
+@@ -2224,21 +2267,14 @@ static int ext4_dx_add_entry(handle_t *h
+                       dx_set_block(entries + 0, newblock);
+                       info = dx_get_dx_info((struct ext4_dir_entry_2*)
+                                       frames[0].bh->b_data);
+-                      info->indirect_levels = 1;
+-
+-                      /* Add new access path frame */
+-                      frame = frames + 1;
+-                      frame->at = at = at - entries + entries2;
+-                      frame->entries = entries = entries2;
+-                      frame->bh = bh2;
+-                      err = ext4_journal_get_write_access(handle,
+-                                                           frame->bh);
+-                      if (err)
+-                              goto journal_error;
+-              }
+-              err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
+-              if (err) {
+-                      ext4_std_error(inode->i_sb, err);
++                      info->indirect_levels += 1;
++                      dxtrace(printk(KERN_DEBUG
++                                     "Creating %d level index...\n",
++                                     info->indirect_levels));
++                      ext4_handle_dirty_dirent_node(handle, dir, frame->bh);
++                      ext4_handle_dirty_dirent_node(handle, dir, bh2);
++                      brelse(bh2);
++                      restart = 1;
+                       goto cleanup;
+               }
+       }
+@@ -2253,6 +2289,10 @@ journal_error:
+ cleanup:
+       brelse(bh);
+       dx_release(frames);
++      /* If @restart is set, the htree path has changed, so we need to
++       * repeat dx_probe() to find a valid htree path */
++      if (restart && err == 0)
++              goto again;
+       return err;
+ }
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/inode.c
+===================================================================
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/inode.c
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/inode.c
+@@ -4056,7 +4056,7 @@ struct inode *ext4_iget(struct super_blo
+       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
+               ei->i_file_acl |=
+                       ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
+-      inode->i_size = ext4_isize(raw_inode);
++      inode->i_size = ext4_isize(sb, raw_inode);
+       ei->i_disksize = inode->i_size;
+ #ifdef CONFIG_QUOTA
+       ei->i_reserved_quota = 0;
+@@ -4306,7 +4306,7 @@ static int ext4_do_update_inode(handle_t
+               raw_inode->i_file_acl_high =
+                       cpu_to_le16(ei->i_file_acl >> 32);
+       raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
+-      if (ei->i_disksize != ext4_isize(raw_inode)) {
++      if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
+               ext4_isize_set(raw_inode, ei->i_disksize);
+               need_datasync = 1;
+       }
index d00452f..a351b61 100644 (file)
@@ -2,21 +2,15 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h
 ===================================================================
 --- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/ext4.h
 +++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h
-@@ -2145,6 +2145,19 @@ extern int ext4_orphan_add(handle_t *, s
+@@ -2145,6 +2145,13 @@ extern int ext4_orphan_add(handle_t *, s
  extern int ext4_orphan_del(handle_t *, struct inode *);
  extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
                                __u32 start_minor_hash, __u32 *next_hash);
 +extern struct inode *ext4_create_inode(handle_t *handle,
 +                                     struct inode * dir, int mode);
-+extern int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-+                        struct inode *inode);
 +extern int ext4_delete_entry(handle_t *handle, struct inode * dir,
 +                           struct ext4_dir_entry_2 * de_del,
 +                           struct buffer_head * bh);
-+extern struct buffer_head * ext4_find_entry(struct inode *dir,
-+                                          const struct qstr *d_name,
-+                                          struct ext4_dir_entry_2 ** res_dir,
-+                                          int *inlined);
 +extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
 +                             struct inode *inode);
  extern int search_dir(struct buffer_head *bh,
@@ -26,42 +20,6 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
 ===================================================================
 --- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/namei.c
 +++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
-@@ -1211,7 +1211,7 @@ static int is_dx_internal_node(struct in
-  * The returned buffer_head has ->b_count elevated.  The caller is expected
-  * to brelse() it when appropriate.
-  */
--static struct buffer_head * ext4_find_entry (struct inode *dir,
-+struct buffer_head * ext4_find_entry(struct inode *dir,
-                                       const struct qstr *d_name,
-                                       struct ext4_dir_entry_2 **res_dir,
-                                       int *inlined)
-@@ -1355,6 +1355,7 @@ cleanup_and_exit:
-               brelse(bh_use[ra_ptr]);
-       return ret;
- }
-+EXPORT_SYMBOL(ext4_find_entry);
- static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
-                      struct ext4_dir_entry_2 **res_dir, int *err)
-@@ -1903,8 +1904,8 @@ static int make_indexed_dir(handle_t *ha
-  * may not sleep between calling this and putting something into
-  * the entry, as someone else might have used it while you slept.
-  */
--static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
--                        struct inode *inode)
-+int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-+                 struct inode *inode)
- {
-       struct inode *dir = dentry->d_parent->d_inode;
-       struct buffer_head *bh;
-@@ -1979,6 +1980,7 @@ static int ext4_add_entry(handle_t *hand
-               ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
-       return retval;
- }
-+EXPORT_SYMBOL(ext4_add_entry);
- /*
-  * Returns 0 for success, or a negative error value
 @@ -2165,7 +2167,7 @@ int ext4_generic_delete_entry(handle_t *
        return -ENOENT;
  }
index 7d613aa..3e1b56a 100644 (file)
@@ -12,13 +12,12 @@ threads to simultaneously lookup, create and unlink in parallel.
     
 This patch contains:
  - pdirops support for ldiskfs
- - N-level htree directory
  - integrate with osd-ldiskfs
 
-Index: linux-3.10.0-123.13.2.el7.x86_64/include/linux/htree_lock.h
+Index: linux-3.10.0-229.1.2.fc21.x86_64/include/linux/htree_lock.h
 ===================================================================
 --- /dev/null
-+++ linux-3.10.0-123.13.2.el7.x86_64/include/linux/htree_lock.h
++++ linux-3.10.0-229.1.2.fc21.x86_64/include/linux/htree_lock.h
 @@ -0,0 +1,187 @@
 +/*
 + * include/linux/htree_lock.h
@@ -207,10 +206,10 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/include/linux/htree_lock.h
 +      ((lck)->lk_nodes[dep].ln_mode == HTREE_LOCK_NL)
 +
 +#endif
-Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/htree_lock.c
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/htree_lock.c
 ===================================================================
 --- /dev/null
-+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/htree_lock.c
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/htree_lock.c
 @@ -0,0 +1,880 @@
 +/*
 + * fs/ext4/htree_lock.c
@@ -468,7 +467,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/htree_lock.c
 +                       htree_lock_mode_t mode, u32 key, unsigned dep,
 +                       int wait, void *event)
 +{
-+      LIST_HEAD               (list);
++      LIST_HEAD(list);
 +      struct htree_lock       *tmp;
 +      struct htree_lock       *tmp2;
 +      u16                     major;
@@ -1092,10 +1091,22 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/htree_lock.c
 +      kfree(lck);
 +}
 +EXPORT_SYMBOL(htree_lock_free);
-Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/Makefile
 ===================================================================
---- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/ext4.h
-+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/Makefile
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/Makefile
+@@ -6,6 +6,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
+ ext4-y        := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
+               ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
++              htree_lock.o \
+               ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+               mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
+               xattr_trusted.o inline.o
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h
+===================================================================
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/ext4.h
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/ext4.h
 @@ -27,6 +27,7 @@
  #include <linux/mutex.h>
  #include <linux/timer.h>
@@ -1104,7 +1115,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h
  #include <linux/blockgroup_lock.h>
  #include <linux/percpu_counter.h>
  #include <linux/ratelimit.h>
-@@ -810,6 +811,9 @@ struct ext4_inode_info {
+@@ -821,6 +822,9 @@ struct ext4_inode_info {
        __u32   i_dtime;
        ext4_fsblk_t    i_file_acl;
  
@@ -1114,29 +1125,10 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h
        /*
         * i_block_group is the number of the block group which contains
         * this file's inode.  Constant across the lifetime of the inode,
-@@ -1536,6 +1540,7 @@ static inline void ext4_clear_state_flag
-                                        EXT4_FEATURE_INCOMPAT_META_BG| \
-                                        EXT4_FEATURE_INCOMPAT_EXTENTS| \
-                                        EXT4_FEATURE_INCOMPAT_64BIT| \
-+                                       EXT4_FEATURE_INCOMPAT_LARGEDIR|\
-                                        EXT4_FEATURE_INCOMPAT_FLEX_BG| \
-                                        EXT4_FEATURE_INCOMPAT_EA_INODE| \
-                                        EXT4_FEATURE_INCOMPAT_MMP |    \
-@@ -1954,6 +1959,76 @@ struct mmpd_data {
- # define NORET_TYPE   /**/
- # define ATTRIB_NORET __attribute__((noreturn))
- # define NORET_AND    noreturn,
-+/* htree levels for ext4 */
-+#define EXT4_HTREE_LEVEL_COMPAT 2
-+#define EXT4_HTREE_LEVEL      3
-+
-+static inline int
-+ext4_dir_htree_level(struct super_block *sb)
-+{
-+      return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
-+              EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
-+}
-+
+@@ -1846,6 +1850,71 @@ struct dx_hash_info
+  */
+ #define HASH_NB_ALWAYS                1
 +/* assume name-hash is protected by upper layer */
 +#define EXT4_HTREE_LOCK_HASH  0
 +
@@ -1196,10 +1188,16 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h
 +                          struct inode *dir, unsigned flags);
 +#define ext4_htree_unlock(lck)                  htree_unlock(lck)
 +
++extern struct buffer_head *__ext4_find_entry(struct inode *dir,
++                                      const struct qstr *d_name,
++                                      struct ext4_dir_entry_2 **res_dir,
++                                      int *inlined, struct htree_lock *lck);
++extern int __ext4_add_entry(handle_t *handle, struct dentry *dentry,
++                    struct inode *inode, struct htree_lock *lck);
  
- struct ext4_xattr_ino_array {
-       unsigned int xia_count;         /* # of used item in the array */
-@@ -2050,9 +2125,17 @@ void ext4_insert_dentry(struct inode *in
+ /*
+  * Describe an inode's exact location on disk and in memory
+@@ -2088,9 +2157,17 @@ void ext4_insert_dentry(struct inode *in
                        const char *name, int namelen, void *data);
  static inline void ext4_update_dx_flag(struct inode *inode)
  {
@@ -1217,47 +1215,10 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h
  }
  static unsigned char ext4_filetype_table[] = {
        DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-@@ -2212,14 +2295,14 @@ extern int ext4_htree_fill_tree(struct f
- extern struct inode *ext4_create_inode(handle_t *handle,
-                                      struct inode * dir, int mode);
- extern int ext4_add_entry(handle_t *handle, struct dentry *dentry,
--                        struct inode *inode);
-+                        struct inode *inode, struct htree_lock *lck);
- extern int ext4_delete_entry(handle_t *handle, struct inode * dir,
-                            struct ext4_dir_entry_2 * de_del,
-                            struct buffer_head * bh);
- extern struct buffer_head * ext4_find_entry(struct inode *dir,
-                                           const struct qstr *d_name,
-                                           struct ext4_dir_entry_2 ** res_dir,
--                                          int *inlined);
-+                                          int *inlined, struct htree_lock *lck);
- extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
-                              struct inode *inode, const void *, const void *);
- extern int search_dir(struct buffer_head *bh,
-@@ -2382,13 +2465,15 @@ static inline void ext4_r_blocks_count_s
-       es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
- }
--static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
-+static inline loff_t ext4_isize(struct super_block *sb,
-+                              struct ext4_inode *raw_inode)
- {
--      if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
-+      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ||
-+          S_ISREG(le16_to_cpu(raw_inode->i_mode)))
-               return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
-                       le32_to_cpu(raw_inode->i_size_lo);
--      else
--              return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
-+
-+      return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
- }
- static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
-Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c
 ===================================================================
---- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/namei.c
-+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/namei.c
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/namei.c
 @@ -53,6 +53,7 @@ struct buffer_head *ext4_append(handle_t
                                        ext4_lblk_t *block)
  {
@@ -1284,7 +1245,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
 +      }
        inode->i_size += inode->i_sb->s_blocksize;
        EXT4_I(inode)->i_disksize = inode->i_size;
-       BUFFER_TRACE(bh, "get_write_access");
+       BUFFER_TRACE(bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, bh);
 +      up(&ei->i_append_sem);
        if (err) {
@@ -1316,16 +1277,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
  
  /* checksumming functions */
  void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
-@@ -517,7 +525,7 @@ struct dx_root_info * dx_get_dx_info(str
- static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
- {
--      return le32_to_cpu(entry->block) & 0x00ffffff;
-+      return le32_to_cpu(entry->block) & 0x0fffffff;
- }
- static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
-@@ -667,6 +675,223 @@ struct stats dx_show_entries(struct dx_h
+@@ -668,6 +676,227 @@ struct stats dx_show_entries(struct dx_h
  }
  #endif /* DX_DEBUG */
  
@@ -1338,6 +1290,10 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
 +};
 +
 +#define ext4_htree_lock_data(l)       ((struct ext4_dir_lock_data *)(l)->lk_private)
++#define ext4_find_entry(dir, name, dirent, inline) \
++                      __ext4_find_entry(dir, name, dirent, inline, NULL)
++#define ext4_add_entry(handle, dentry, inode) \
++                      __ext4_add_entry(handle, dentry, inode, NULL)
 +
 +/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */
 +#define EXT4_HTREE_NODE_CHANGED       (0xcafeULL << 32)
@@ -1549,7 +1505,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
  /*
   * Probe for a directory leaf block to search.
   *
-@@ -678,16 +903,17 @@ struct stats dx_show_entries(struct dx_h
+@@ -679,10 +908,11 @@ struct stats dx_show_entries(struct dx_h
   */
  static struct dx_frame *
  dx_probe(const struct qstr *d_name, struct inode *dir,
@@ -1563,34 +1519,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
        struct dx_root_info * info;
        struct buffer_head *bh;
        struct dx_frame *frame = frame_in;
-       u32 hash;
--      frame->bh = NULL;
-+      memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
-       bh = ext4_read_dirblock(dir, 0, INDEX);
-       if (IS_ERR(bh)) {
-               *err = PTR_ERR(bh);
-@@ -720,9 +946,16 @@ dx_probe(const struct qstr *d_name, stru
-               goto fail;
-       }
--      if ((indirect = info->indirect_levels) > 1) {
--              ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
--                           info->indirect_levels);
-+      indirect = info->indirect_levels;
-+      if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
-+              ext4_warning(dir->i_sb,
-+                           "Directory (ino: %lu) htree depth %#06x exceed "
-+                           "supported value", dir->i_ino,
-+                           ext4_dir_htree_level(dir->i_sb));
-+              if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
-+                      ext4_warning(dir->i_sb, "Enable large directory "
-+                                              "feature to access it");
-+              }
-               brelse(bh);
-               *err = ERR_BAD_DX_DIR;
-               goto fail;
-@@ -742,8 +975,15 @@ dx_probe(const struct qstr *d_name, stru
+@@ -750,8 +980,15 @@ dx_probe(const struct qstr *d_name, stru
        dxtrace(printk("Look up %x", hash));
        while (1)
        {
@@ -1607,7 +1536,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
                        ext4_warning(dir->i_sb,
                                     "dx entry: no count or count > limit");
                        brelse(bh);
-@@ -784,7 +1024,70 @@ dx_probe(const struct qstr *d_name, stru
+@@ -792,7 +1029,70 @@ dx_probe(const struct qstr *d_name, stru
                frame->bh = bh;
                frame->entries = entries;
                frame->at = at;
@@ -1679,29 +1608,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
                bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
                if (IS_ERR(bh)) {
                        *err = PTR_ERR(bh);
-@@ -818,13 +1121,18 @@ fail:
- static void dx_release (struct dx_frame *frames)
- {
-       struct dx_root_info *info;
-+      int i;
-+
-       if (frames[0].bh == NULL)
-               return;
-       info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data);
--      if (info->indirect_levels)
--              brelse(frames[1].bh);
--      brelse(frames[0].bh);
-+      for (i = 0; i <= info->indirect_levels; i++) {
-+              if (frames[i].bh == NULL)
-+                      break;
-+              brelse(frames[i].bh);
-+              frames[i].bh = NULL;
-+      }
- }
- /*
-@@ -847,7 +1155,7 @@ static void dx_release (struct dx_frame
+@@ -860,7 +1160,7 @@ static void dx_release (struct dx_frame
  static int ext4_htree_next_block(struct inode *dir, __u32 hash,
                                 struct dx_frame *frame,
                                 struct dx_frame *frames,
@@ -1710,7 +1617,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
  {
        struct dx_frame *p;
        struct buffer_head *bh;
-@@ -862,12 +1170,22 @@ static int ext4_htree_next_block(struct
+@@ -875,12 +1175,22 @@ static int ext4_htree_next_block(struct
         * this loop, num_frames indicates the number of interior
         * nodes need to be read.
         */
@@ -1735,7 +1642,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
                p--;
        }
  
-@@ -890,6 +1208,13 @@ static int ext4_htree_next_block(struct
+@@ -903,6 +1213,13 @@ static int ext4_htree_next_block(struct
         * block so no check is necessary
         */
        while (num_frames--) {
@@ -1749,7 +1656,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
                bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX);
                if (IS_ERR(bh))
                        return PTR_ERR(bh);
-@@ -898,6 +1223,7 @@ static int ext4_htree_next_block(struct
+@@ -911,6 +1228,7 @@ static int ext4_htree_next_block(struct
                p->bh = bh;
                p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
        }
@@ -1757,16 +1664,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
        return 1;
  }
  
-@@ -966,7 +1292,7 @@ int ext4_htree_fill_tree(struct file *di
- {
-       struct dx_hash_info hinfo;
-       struct ext4_dir_entry_2 *de;
--      struct dx_frame frames[2], *frame;
-+      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
-       struct inode *dir;
-       ext4_lblk_t block;
-       int count = 0;
-@@ -1000,10 +1326,10 @@ int ext4_htree_fill_tree(struct file *di
+@@ -1013,10 +1331,10 @@ int ext4_htree_fill_tree(struct file *di
        }
        hinfo.hash = start_hash;
        hinfo.minor_hash = 0;
@@ -1779,7 +1677,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
        /* Add '.' and '..' from the htree header */
        if (!start_hash && !start_minor_hash) {
                de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
-@@ -1030,7 +1356,7 @@ int ext4_htree_fill_tree(struct file *di
+@@ -1043,7 +1361,7 @@ int ext4_htree_fill_tree(struct file *di
                count += ret;
                hashval = ~0;
                ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
@@ -1788,8 +1686,12 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
                *next_hash = hashval;
                if (ret < 0) {
                        err = ret;
-@@ -1226,7 +1552,7 @@ static int is_dx_internal_node(struct in
- struct buffer_head * ext4_find_entry(struct inode *dir,
+@@ -1236,10 +1554,10 @@ static int is_dx_internal_node(struct in
+  * The returned buffer_head has ->b_count elevated.  The caller is expected
+  * to brelse() it when appropriate.
+  */
+-static struct buffer_head * ext4_find_entry (struct inode *dir,
++struct buffer_head *__ext4_find_entry(struct inode *dir,
                                        const struct qstr *d_name,
                                        struct ext4_dir_entry_2 **res_dir,
 -                                      int *inlined)
@@ -1797,7 +1699,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
  {
        struct super_block *sb;
        struct buffer_head *bh_use[NAMEI_RA_SIZE];
-@@ -1270,7 +1596,7 @@ struct buffer_head * ext4_find_entry(str
+@@ -1283,7 +1601,7 @@ static struct buffer_head * ext4_find_en
                goto restart;
        }
        if (is_dx(dir)) {
@@ -1806,7 +1708,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
                /*
                 * On success, or if the error was file not found,
                 * return.  Otherwise, fall back to doing a search the
-@@ -1280,6 +1606,7 @@ struct buffer_head * ext4_find_entry(str
+@@ -1297,6 +1615,7 @@ static struct buffer_head * ext4_find_en
                        return bh;
                dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
                               "falling back\n"));
@@ -1814,22 +1716,22 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
        }
        nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
        start = EXT4_I(dir)->i_dir_start_lookup;
-@@ -1369,17 +1696,19 @@ cleanup_and_exit:
+@@ -1389,9 +1708,12 @@ cleanup_and_exit:
+               brelse(bh_use[ra_ptr]);
+       return ret;
  }
- EXPORT_SYMBOL(ext4_find_entry);
++EXPORT_SYMBOL(__ext4_find_entry);
  
 -static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
 -                     struct ext4_dir_entry_2 **res_dir, int *err)
-+static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
++static struct buffer_head *ext4_dx_find_entry(struct inode *dir,
 +                              const struct qstr *d_name,
 +                              struct ext4_dir_entry_2 **res_dir,
 +                              struct htree_lock *lck, int *err)
  {
        struct super_block * sb = dir->i_sb;
        struct dx_hash_info     hinfo;
--      struct dx_frame frames[2], *frame;
-+      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
-       struct buffer_head *bh;
+@@ -1400,7 +1722,7 @@ static struct buffer_head * ext4_dx_find
        ext4_lblk_t block;
        int retval;
  
@@ -1838,7 +1740,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
                return NULL;
        do {
                block = dx_get_block(frame->at);
-@@ -1403,7 +1732,7 @@ static struct buffer_head * ext4_dx_find
+@@ -1424,7 +1746,7 @@ static struct buffer_head * ext4_dx_find
  
                /* Check to see if we should continue to search */
                retval = ext4_htree_next_block(dir, hinfo.hash, frame,
@@ -1847,25 +1749,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
                if (retval < 0) {
                        ext4_warning(sb,
                             "error reading index page in directory #%lu",
-@@ -1429,7 +1758,7 @@ static struct dentry *ext4_lookup(struct
-       if (dentry->d_name.len > EXT4_NAME_LEN)
-               return ERR_PTR(-ENAMETOOLONG);
--      bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
-+      bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL, NULL);
-       if (IS_ERR(bh))
-               return (struct dentry *) bh;
-       inode = NULL;
-@@ -1489,7 +1818,7 @@ struct dentry *ext4_get_parent(struct de
-       struct ext4_dir_entry_2 * de;
-       struct buffer_head *bh;
--      bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
-+      bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL, NULL);
-       if (IS_ERR(bh))
-               return (struct dentry *) bh;
-       if (!bh)
-@@ -1559,8 +1888,9 @@ static struct ext4_dir_entry_2* dx_pack_
+@@ -1583,8 +1905,9 @@ static struct ext4_dir_entry_2* dx_pack_
   * Returns pointer to de in block into which the new entry will be inserted.
   */
  static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
@@ -1877,7 +1761,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
  {
        unsigned blocksize = dir->i_sb->s_blocksize;
        unsigned count, continued;
-@@ -1624,7 +1954,14 @@ static struct ext4_dir_entry_2 *do_split
+@@ -1647,7 +1970,14 @@ static struct ext4_dir_entry_2 *do_split
                                        hash2, split, count-split));
  
        /* Fancy dance to stay within two buffers */
@@ -1893,7 +1777,7 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
        de = dx_pack_dirents(data1, blocksize);
        de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
                                           (char *) de,
-@@ -1643,13 +1980,21 @@ static struct ext4_dir_entry_2 *do_split
+@@ -1666,13 +1996,21 @@ static struct ext4_dir_entry_2 *do_split
        dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
        dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
  
@@ -1921,43 +1805,27 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
        err = ext4_handle_dirty_dirent_node(handle, dir, bh2);
        if (err)
                goto journal_error;
-@@ -1809,7 +2154,7 @@ static int add_dirent_to_buf(handle_t *h
-        */
-       dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
-       ext4_update_dx_flag(dir);
--      dir->i_version++;
-+      inode_inc_iversion(dir);
-       ext4_mark_inode_dirty(handle, dir);
-       BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-       err = ext4_handle_dirty_dirent_node(handle, dir, bh);
-@@ -1829,7 +2174,7 @@ static int make_indexed_dir(handle_t *ha
-       const char      *name = dentry->d_name.name;
-       int             namelen = dentry->d_name.len;
-       struct buffer_head *bh2;
--      struct dx_frame frames[2], *frame;
-+      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
-       struct dx_entry *entries;
-       struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
-       struct ext4_dir_entry_tail *t;
-@@ -1923,7 +2268,7 @@ static int make_indexed_dir(handle_t *ha
+@@ -1945,7 +2283,7 @@ static int make_indexed_dir(handle_t *ha
        ext4_handle_dirty_dx_node(handle, dir, frame->bh);
        ext4_handle_dirty_dirent_node(handle, dir, bh);
  
 -      de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
-+      de = do_split(handle,dir, &bh, frames, frame, &hinfo, NULL, &retval);
++      de = do_split(handle, dir, &bh, frames, frame, &hinfo, NULL, &retval);
        if (!de) {
                /*
                 * Even if the block split failed, we have to properly write
-@@ -2030,7 +2375,7 @@ out:
+@@ -2051,8 +2389,8 @@ out:
+  * may not sleep between calling this and putting something into
   * the entry, as someone else might have used it while you slept.
   */
- int ext4_add_entry(handle_t *handle, struct dentry *dentry,
--                 struct inode *inode)
-+                 struct inode *inode, struct htree_lock *lck)
+-static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+-                        struct inode *inode)
++int __ext4_add_entry(handle_t *handle, struct dentry *dentry,
++                    struct inode *inode, struct htree_lock *lck)
  {
        struct inode *dir = dentry->d_parent->d_inode;
        struct buffer_head *bh;
-@@ -2066,9 +2411,10 @@ int ext4_add_entry(handle_t *handle, str
+@@ -2087,9 +2425,10 @@ static int ext4_add_entry(handle_t *hand
                if (dentry->d_name.len == 2 &&
                    memcmp(dentry->d_name.name, "..", 2) == 0)
                        return ext4_update_dotdot(handle, dentry, inode);
@@ -1969,169 +1837,49 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
                ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
                dx_fallback++;
                ext4_mark_inode_dirty(handle, dir);
-@@ -2114,18 +2460,21 @@ EXPORT_SYMBOL(ext4_add_entry);
+@@ -2129,12 +2468,13 @@ static int ext4_add_entry(handle_t *hand
+               ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
+       return retval;
+ }
++EXPORT_SYMBOL(__ext4_add_entry);
+ /*
   * Returns 0 for success, or a negative error value
   */
  static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 -                           struct inode *inode)
 +                           struct inode *inode, struct htree_lock *lck)
  {
--      struct dx_frame frames[2], *frame;
-+      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
        struct dx_entry *entries, *at;
-       struct dx_hash_info hinfo;
-       struct buffer_head *bh;
-       struct inode *dir = dentry->d_parent->d_inode;
-       struct super_block *sb = dir->i_sb;
-       struct ext4_dir_entry_2 *de;
-+      int restart;
-       int err;
+@@ -2148,7 +2488,7 @@ static int ext4_dx_add_entry(handle_t *h
  
+ again:
+       restart = 0;
 -      frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
-+again:
-+      restart = 0;
 +      frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, lck, &err);
        if (!frame)
                return err;
        entries = frame->entries;
-@@ -2137,33 +2486,53 @@ static int ext4_dx_add_entry(handle_t *h
-               goto cleanup;
-       }
--      BUFFER_TRACE(bh, "get_write_access");
--      err = ext4_journal_get_write_access(handle, bh);
--      if (err)
--              goto journal_error;
--
-       err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
-       if (err != -ENOSPC)
-               goto cleanup;
-+      err = 0;
-       /* Block full, should compress but for now just split */
-       dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
-                      dx_get_count(entries), dx_get_limit(entries)));
-       /* Need to split index? */
-       if (dx_get_count(entries) == dx_get_limit(entries)) {
-               ext4_lblk_t newblock;
--              unsigned icount = dx_get_count(entries);
--              int levels = frame - frames;
-+              int levels = frame - frames + 1;
-+              unsigned icount;
-+              int add_level = 1;
-               struct dx_entry *entries2;
+@@ -2178,6 +2518,11 @@ again:
                struct dx_node *node2;
                struct buffer_head *bh2;
  
--              if (levels && (dx_get_count(frames->entries) ==
--                             dx_get_limit(frames->entries))) {
--                      ext4_warning(sb, "Directory index full!");
 +              if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */
 +                      ext4_htree_safe_relock(lck);
 +                      restart = 1;
 +                      goto cleanup;
 +              }
-+              while (frame > frames) {
-+                      if (dx_get_count((frame - 1)->entries) <
-+                          dx_get_limit((frame - 1)->entries)) {
-+                              add_level = 0;
-+                              break;
-+                      }
-+                      frame--; /* split higher index block */
-+                      at = frame->at;
-+                      entries = frame->entries;
-+                      restart = 1;
-+              }
-+              if (add_level && levels == ext4_dir_htree_level(sb)) {
-+                      ext4_warning(sb, "Directory (ino: %lu) index full, "
-+                                       "reach max htree level :%d",
-+                                       dir->i_ino, levels);
-+                      if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
-+                              ext4_warning(sb, "Large directory feature is"
-+                                               "not enabled on this "
-+                                               "filesystem");
-+                      }
-                       err = -ENOSPC;
+               while (frame > frames) {
+                       if (dx_get_count((frame - 1)->entries) <
+                           dx_get_limit((frame - 1)->entries)) {
+@@ -2277,16 +2622,43 @@ again:
+                       restart = 1;
                        goto cleanup;
                }
-+              icount = dx_get_count(entries);
-               bh2 = ext4_append(handle, dir, &newblock);
-               if (IS_ERR(bh2)) {
-                       err = PTR_ERR(bh2);
-@@ -2178,7 +2547,7 @@ static int ext4_dx_add_entry(handle_t *h
-               err = ext4_journal_get_write_access(handle, frame->bh);
-               if (err)
-                       goto journal_error;
--              if (levels) {
-+              if (!add_level) {
-                       unsigned icount1 = icount/2, icount2 = icount - icount1;
-                       unsigned hash2 = dx_get_hash(entries + icount1);
-                       dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
-@@ -2186,7 +2555,7 @@ static int ext4_dx_add_entry(handle_t *h
-                       BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
-                       err = ext4_journal_get_write_access(handle,
--                                                           frames[0].bh);
-+                                                          (frame - 1)->bh);
-                       if (err)
-                               goto journal_error;
-@@ -2202,18 +2571,24 @@ static int ext4_dx_add_entry(handle_t *h
-                               frame->entries = entries = entries2;
-                               swap(frame->bh, bh2);
-                       }
--                      dx_insert_block(frames + 0, hash2, newblock);
--                      dxtrace(dx_show_index("node", frames[1].entries));
-+                      dx_insert_block((frame - 1), hash2, newblock);
-+                      dxtrace(dx_show_index("node", frame->entries));
-                       dxtrace(dx_show_index("node",
-                              ((struct dx_node *) bh2->b_data)->entries));
-                       err = ext4_handle_dirty_dx_node(handle, dir, bh2);
-                       if (err)
-                               goto journal_error;
-                       brelse (bh2);
-+                      ext4_handle_dirty_metadata(handle, inode,
-+                                                 (frame - 1)->bh);
-+                      if (restart) {
-+                              ext4_handle_dirty_metadata(handle, inode,
-+                                                         frame->bh);
-+                              goto cleanup;
-+                      }
-               } else {
-                       struct dx_root_info * info;
--                      dxtrace(printk(KERN_DEBUG
--                                     "Creating second level index...\n"));
-+
-                       memcpy((char *) entries2, (char *) entries,
-                              icount * sizeof(struct dx_entry));
-                       dx_set_limit(entries2, dx_node_limit(dir));
-@@ -2223,35 +2598,63 @@ static int ext4_dx_add_entry(handle_t *h
-                       dx_set_block(entries + 0, newblock);
-                       info = dx_get_dx_info((struct ext4_dir_entry_2*)
-                                       frames[0].bh->b_data);
--                      info->indirect_levels = 1;
-+                      info->indirect_levels += 1;
-+                      dxtrace(printk(KERN_DEBUG
-+                                     "Creating %d level index...\n",
-+                                     info->indirect_levels));
-+                      ext4_handle_dirty_metadata(handle, inode, frame->bh);
-+                      ext4_handle_dirty_metadata(handle, inode, bh2);
-+                      brelse(bh2);
-+                      restart = 1;
-+                      goto cleanup;
-+              }
 +      } else if (!ext4_htree_dx_locked(lck)) {
 +              struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck);
--                      /* Add new access path frame */
--                      frame = frames + 1;
--                      frame->at = at = at - entries + entries2;
--                      frame->entries = entries = entries2;
--                      frame->bh = bh2;
--                      err = ext4_journal_get_write_access(handle,
--                                                           frame->bh);
--                      if (err)
--                              goto journal_error;
++
 +              /* not well protected, require DX lock */
 +              ext4_htree_dx_need_lock(lck);
 +              at = frame > frames ? (frame - 1)->at : NULL;
@@ -2150,14 +1898,9 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
 +                  (ld->ld_count != dx_get_count(entries))) {
 +                      restart = 1;
 +                      goto cleanup;
-               }
--              err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
++              }
 +              /* OK, I've got DX lock and nothing changed */
 +              frame->at = ld->ld_at;
-               if (err) {
-                       ext4_std_error(inode->i_sb, err);
-                       goto cleanup;
-               }
        }
 -      de = do_split(handle, dir, &bh, frame, &hinfo, &err);
 +      de = do_split(handle, dir, &bh, frames, frame, &hinfo, lck, &err);
@@ -2174,164 +1917,15 @@ Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
 +      ext4_htree_de_unlock(lck);
        brelse(bh);
        dx_release(frames);
-+      /* @restart is true means htree-path has been changed, we need to
-+       * repeat dx_probe() to find out valid htree-path */
-+      if (restart && err == 0)
-+              goto again;
-       return err;
- }
-@@ -2288,7 +2691,7 @@ int ext4_generic_delete_entry(handle_t *
-                                       blocksize);
-                       else
-                               de->inode = 0;
--                      dir->i_version++;
-+                      inode_inc_iversion(dir);
-                       return 0;
-               }
-               i += ext4_rec_len_from_disk(de->rec_len, blocksize);
-@@ -2373,7 +2776,7 @@ EXPORT_SYMBOL(ext4_dec_count);
- static int ext4_add_nondir(handle_t *handle,
-               struct dentry *dentry, struct inode *inode)
- {
--      int err = ext4_add_entry(handle, dentry, inode);
-+      int err = ext4_add_entry(handle, dentry, inode, NULL);
-       if (!err) {
-               ext4_mark_inode_dirty(handle, inode);
-               unlock_new_inode(inode);
-@@ -2641,7 +3044,7 @@ retry:
-               goto out_clear_inode;
-       err = ext4_mark_inode_dirty(handle, inode);
-       if (!err)
--              err = ext4_add_entry(handle, dentry, inode);
-+              err = ext4_add_entry(handle, dentry, inode, NULL);
-       if (err) {
- out_clear_inode:
-               clear_nlink(inode);
-@@ -2907,7 +3310,7 @@ static int ext4_rmdir(struct inode *dir,
-       dquot_initialize(dentry->d_inode);
-       retval = -ENOENT;
--      bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
-+      bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL, NULL);
-       if (IS_ERR(bh))
-               return PTR_ERR(bh);
-       if (!bh)
-@@ -2974,7 +3377,7 @@ static int ext4_unlink(struct inode *dir
-       dquot_initialize(dentry->d_inode);
-       retval = -ENOENT;
--      bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
-+      bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL, NULL);
-       if (IS_ERR(bh))
-               return PTR_ERR(bh);
-       if (!bh)
-@@ -3153,7 +3556,7 @@ retry:
-       ext4_inc_count(handle, inode);
-       ihold(inode);
--      err = ext4_add_entry(handle, dentry, inode);
-+      err = ext4_add_entry(handle, dentry, inode, NULL);
-       if (!err) {
-               ext4_mark_inode_dirty(handle, inode);
-               d_instantiate(dentry, inode);
-@@ -3183,7 +3556,7 @@ retry:
-       struct buffer_head *bh;
-       struct ext4_dir_entry_2 *de;
-
--      bh = ext4_find_entry(dir, d_name, &de, NULL);
-+      bh = ext4_find_entry(dir, d_name, &de, NULL, NULL);
-       if (IS_ERR(bh))
-               return PTR_ERR(bh);
-       if (bh) {
-@@ -3230,7 +3633,7 @@ static int ext4_rename(struct inode *old
-       if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
-               ext4_handle_sync(handle);
--      old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
-+      old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL, NULL);
-       if (IS_ERR(old.bh))
-               return PTR_ERR(old.bh);
-       /*
-@@ -3244,7 +3647,7 @@ static int ext4_rename(struct inode *old
-       new_inode = new_dentry->d_inode;
-       new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
--                               &new.de, &new.inlined);
-+                               &new.de, &new.inlined, NULL);
-       if (IS_ERR(new.bh)) {
-               if (!new_inode) {
-                       brelse(new_bh);
-@@ -3275,7 +3678,7 @@ static int ext4_rename(struct inode *old
-                       goto end_rename;
-       }
-       if (!new.bh) {
--              retval = ext4_add_entry(handle, new.dentry, old.inode);
-+              retval = ext4_add_entry(handle, new.dentry, old.inode, NULL);
-               if (retval)
-                       goto end_rename;
-       } else {
-@@ -3375,7 +3678,7 @@ static int ext4_rename(struct inode *old
-       dquot_initialize(new.dir);
-       old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
--                               &old.de, &old.inlined);
-+                               &old.de, &old.inlined, NULL);
-       /*
-        *  Check for inode number is _not_ due to possible IO errors.
-        *  We might rmdir the source, keep it as pwd of some process
-@@ -3475,7 +3678,7 @@ static int ext4_rename(struct inode *old
-               goto end_rename;
-
-       new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
--                               &new.de, &new.inlined);
-+                               &new.de, &new.inlined, NULL);
-
-       /* RENAME_EXCHANGE case: old *and* new must both exist */
-       if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
-Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/inode.c
-===================================================================
---- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/inode.c
-+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/inode.c
-@@ -4264,7 +4264,7 @@ struct inode *ext4_iget(struct super_blo
-       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
-               ei->i_file_acl |=
-                       ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
--      inode->i_size = ext4_isize(raw_inode);
-+      inode->i_size = ext4_isize(sb, raw_inode);
-       ei->i_disksize = inode->i_size;
- #ifdef CONFIG_QUOTA
-       ei->i_reserved_quota = 0;
-@@ -4499,7 +4499,7 @@ static int ext4_do_update_inode(handle_t
-               raw_inode->i_file_acl_high =
-                       cpu_to_le16(ei->i_file_acl >> 32);
-       raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
--      if (ei->i_disksize != ext4_isize(raw_inode)) {
-+      if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
-               ext4_isize_set(raw_inode, ei->i_disksize);
-               need_datasync = 1;
-       }
-Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/Makefile
-===================================================================
---- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/Makefile
-+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/Makefile
-@@ -8,7 +8,7 @@ ext4-y := balloc.o bitmap.o dir.o file.o
-               ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-               ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
-               mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
--              xattr_trusted.o inline.o
-+              xattr_trusted.o inline.o htree_lock.o
- ext4-$(CONFIG_EXT4_FS_POSIX_ACL)      += acl.o
- ext4-$(CONFIG_EXT4_FS_SECURITY)               += xattr_security.o
-Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/super.c
+       /* @restart is true means htree-path has been changed, we need to
+Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/super.c
 ===================================================================
---- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/super.c
-+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/super.c
-@@ -872,6 +872,7 @@ static struct inode *ext4_alloc_inode(st
+--- linux-3.10.0-229.1.2.fc21.x86_64.orig/fs/ext4/super.c
++++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/super.c
+@@ -875,6 +875,7 @@ static struct inode *ext4_alloc_inode(st
  
        ei->vfs_inode.i_version = 1;
-       spin_lock_init(&ei->i_raw_lock);
+       spin_lock_init(&ei->i_raw_lock);
 +      sema_init(&ei->i_append_sem, 1);
        INIT_LIST_HEAD(&ei->i_prealloc_list);
        spin_lock_init(&ei->i_prealloc_lock);
diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-large-dir.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-large-dir.patch
new file mode 100644 (file)
index 0000000..3ef66c1
--- /dev/null
@@ -0,0 +1,364 @@
+This INCOMPAT_LARGEDIR feature allows larger directories
+to be created in ldiskfs, both with directory sizes over
+2GB and a maximum htree depth of 3 instead of the
+current limit of 2. These features are needed in order
+to exceed the current limit of approximately 10M entries
+in a single directory.
+
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/ext4.h
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4.h
+@@ -1344,6 +1344,7 @@ EXT4_INODE_BIT_FNS(state, state_flags)
+ #define EXT4_FEATURE_INCOMPAT_FLEX_BG         0x0200
+ #define EXT4_FEATURE_INCOMPAT_EA_INODE                0x0400
+ #define EXT4_FEATURE_INCOMPAT_DIRDATA         0x1000 /* data in dirent */
++#define EXT4_FEATURE_INCOMPAT_LARGEDIR                0x4000
+ #define EXT4_FEATURE_COMPAT_SUPP      EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT4_FEATURE_INCOMPAT_SUPP    (EXT4_FEATURE_INCOMPAT_FILETYPE| \
+@@ -1354,7 +1355,8 @@ EXT4_INODE_BIT_FNS(state, state_flags)
+                                        EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+                                        EXT4_FEATURE_INCOMPAT_EA_INODE| \
+                                        EXT4_FEATURE_INCOMPAT_MMP| \
+-                                       EXT4_FEATURE_INCOMPAT_DIRDATA)
++                                       EXT4_FEATURE_INCOMPAT_DIRDATA| \
++                                       EXT4_FEATURE_INCOMPAT_LARGEDIR)
+ #define EXT4_FEATURE_RO_COMPAT_SUPP   (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+                                        EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+@@ -1612,6 +1614,17 @@ ext4_group_first_block_no(struct super_b
+  */
+ #define ERR_BAD_DX_DIR        -75000
++/* htree levels for ext4 */
++#define EXT4_HTREE_LEVEL_COMPAT 2
++#define EXT4_HTREE_LEVEL      3
++
++static inline int
++ext4_dir_htree_level(struct super_block *sb)
++{
++      return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
++              EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
++}
++
+ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
+                       ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
+@@ -2005,13 +2018,15 @@ static inline void ext4_r_blocks_count_s
+       es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
+ }
+-static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
++static inline loff_t ext4_isize(struct super_block *sb,
++                              struct ext4_inode *raw_inode)
+ {
+-      if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
++      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ||
++          S_ISREG(le16_to_cpu(raw_inode->i_mode)))
+               return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
+                       le32_to_cpu(raw_inode->i_size_lo);
+-      else
+-              return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
++
++      return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
+ }
+ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/inode.c
+===================================================================
+--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/inode.c
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/inode.c
+@@ -5470,7 +5470,7 @@ struct inode *ext4_iget(struct super_blo
+       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
+               ei->i_file_acl |=
+                       ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
+-      inode->i_size = ext4_isize(raw_inode);
++      inode->i_size = ext4_isize(sb, raw_inode);
+       ei->i_disksize = inode->i_size;
+ #ifdef CONFIG_QUOTA
+       ei->i_reserved_quota = 0;
+@@ -5654,7 +5654,7 @@ static int ext4_do_update_inode(handle_t
+               raw_inode->i_file_acl_high =
+                       cpu_to_le16(ei->i_file_acl >> 32);
+       raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
+-      if (ei->i_disksize != ext4_isize(raw_inode)) {
++      if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
+               ext4_isize_set(raw_inode, ei->i_disksize);
+               need_datasync = 1;
+       }
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/namei.c
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/namei.c
+@@ -225,7 +225,7 @@ struct dx_root_info * dx_get_dx_info(str
+ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
+ {
+-      return le32_to_cpu(entry->block) & 0x00ffffff;
++      return le32_to_cpu(entry->block) & 0x0fffffff;
+ }
+ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
+@@ -388,7 +388,7 @@ dx_probe(const struct qstr *d_name, stru
+       struct dx_frame *frame = frame_in;
+       u32 hash;
+-      frame->bh = NULL;
++      memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
+       if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
+               goto fail;
+@@ -418,9 +418,16 @@ dx_probe(const struct qstr *d_name, stru
+               goto fail;
+       }
+-      if ((indirect = info->indirect_levels) > 1) {
+-              ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
+-                           info->indirect_levels);
++      indirect = info->indirect_levels;
++      if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
++              ext4_warning(dir->i_sb,
++                           "Directory (ino: %lu) htree depth %#06x exceed "
++                           "supported value", dir->i_ino,
++                           ext4_dir_htree_level(dir->i_sb));
++              if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
++                      ext4_warning(dir->i_sb, "Enable large directory "
++                                              "feature to access it");
++              }
+               brelse(bh);
+               *err = ERR_BAD_DX_DIR;
+               goto fail;
+@@ -512,13 +519,18 @@ fail:
+ static void dx_release (struct dx_frame *frames)
+ {
+       struct dx_root_info *info;
++      int i;
++
+       if (frames[0].bh == NULL)
+               return;
+       info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data);
+-      if (info->indirect_levels)
+-              brelse(frames[1].bh);
+-      brelse(frames[0].bh);
++      for (i = 0; i <= info->indirect_levels; i++) {
++              if (frames[i].bh == NULL)
++                      break;
++              brelse(frames[i].bh);
++              frames[i].bh = NULL;
++      }
+ }
+ /*
+@@ -661,7 +673,7 @@ int ext4_htree_fill_tree(struct file *di
+ {
+       struct dx_hash_info hinfo;
+       struct ext4_dir_entry_2 *de;
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct inode *dir;
+       ext4_lblk_t block;
+       int count = 0;
+@@ -1003,7 +1015,7 @@ static struct buffer_head * ext4_dx_find
+       struct super_block * sb;
+       struct dx_hash_info     hinfo;
+       u32 hash;
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct buffer_head *bh;
+       ext4_lblk_t block;
+       struct ext4_dir_entry_2 *de, *top;
+@@ -1443,7 +1455,7 @@ static int add_dirent_to_buf(handle_t *h
+        */
+       dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+       ext4_update_dx_flag(dir);
+-      dir->i_version++;
++      inode_inc_iversion(dir);
+       ext4_mark_inode_dirty(handle, dir);
+       BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+       err = ext4_handle_dirty_metadata(handle, dir, bh);
+@@ -1463,7 +1475,7 @@ static int make_indexed_dir(handle_t *ha
+       const char      *name = dentry->d_name.name;
+       int             namelen = dentry->d_name.len;
+       struct buffer_head *bh2;
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct dx_entry *entries;
+       struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
+       char            *data1, *top;
+@@ -1712,15 +1724,18 @@ static int ext4_add_entry(handle_t *hand
+ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+                            struct inode *inode)
+ {
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct dx_entry *entries, *at;
+       struct dx_hash_info hinfo;
+       struct buffer_head *bh;
+       struct inode *dir = dentry->d_parent->d_inode;
+       struct super_block *sb = dir->i_sb;
+       struct ext4_dir_entry_2 *de;
++      int restart;
+       int err;
++again:
++      restart = 0;
+       frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
+       if (!frame)
+               return err;
+@@ -1730,33 +1745,48 @@ static int ext4_dx_add_entry(handle_t *h
+       if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
+               goto cleanup;
+-      BUFFER_TRACE(bh, "get_write_access");
+-      err = ext4_journal_get_write_access(handle, bh);
+-      if (err)
+-              goto journal_error;
+-
+       err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+       if (err != -ENOSPC)
+               goto cleanup;
++      err = 0;
+       /* Block full, should compress but for now just split */
+       dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
+                      dx_get_count(entries), dx_get_limit(entries)));
+       /* Need to split index? */
+       if (dx_get_count(entries) == dx_get_limit(entries)) {
+               ext4_lblk_t newblock;
+-              unsigned icount = dx_get_count(entries);
+-              int levels = frame - frames;
++              int levels = frame - frames + 1;
++              unsigned icount;
++              int add_level = 1;
+               struct dx_entry *entries2;
+               struct dx_node *node2;
+               struct buffer_head *bh2;
+-              if (levels && (dx_get_count(frames->entries) ==
+-                             dx_get_limit(frames->entries))) {
+-                      ext4_warning(sb, "Directory index full!");
++              while (frame > frames) {
++                      if (dx_get_count((frame - 1)->entries) <
++                          dx_get_limit((frame - 1)->entries)) {
++                              add_level = 0;
++                              break;
++                      }
++                      frame--; /* split higher index block */
++                      at = frame->at;
++                      entries = frame->entries;
++                      restart = 1;
++              }
++              if (add_level && levels == ext4_dir_htree_level(sb)) {
++                      ext4_warning(sb, "Directory (ino: %lu) index full, "
++                                       "reach max htree level :%d",
++                                       dir->i_ino, levels);
++                      if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
++                              ext4_warning(sb, "Large directory feature is"
++                                               "not enabled on this "
++                                               "filesystem");
++                      }
+                       err = -ENOSPC;
+                       goto cleanup;
+               }
++              icount = dx_get_count(entries);
+               bh2 = ext4_append (handle, dir, &newblock, &err);
+               if (!(bh2))
+                       goto cleanup;
+@@ -1769,7 +1799,7 @@ static int ext4_dx_add_entry(handle_t *h
+               err = ext4_journal_get_write_access(handle, frame->bh);
+               if (err)
+                       goto journal_error;
+-              if (levels) {
++              if (!add_level) {
+                       unsigned icount1 = icount/2, icount2 = icount - icount1;
+                       unsigned hash2 = dx_get_hash(entries + icount1);
+                       dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
+@@ -1777,7 +1807,7 @@ static int ext4_dx_add_entry(handle_t *h
+                       BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
+                       err = ext4_journal_get_write_access(handle,
+-                                                           frames[0].bh);
++                                                          (frame - 1)->bh);
+                       if (err)
+                               goto journal_error;
+@@ -1793,18 +1823,24 @@ static int ext4_dx_add_entry(handle_t *h
+                               frame->entries = entries = entries2;
+                               swap(frame->bh, bh2);
+                       }
+-                      dx_insert_block(frames + 0, hash2, newblock);
+-                      dxtrace(dx_show_index("node", frames[1].entries));
++                      dx_insert_block((frame - 1), hash2, newblock);
++                      dxtrace(dx_show_index("node", frame->entries));
+                       dxtrace(dx_show_index("node",
+                              ((struct dx_node *) bh2->b_data)->entries));
+                       err = ext4_handle_dirty_metadata(handle, dir, bh2);
+                       if (err)
+                               goto journal_error;
+                       brelse (bh2);
++                      ext4_handle_dirty_metadata(handle, dir,
++                                                 (frame - 1)->bh);
++                      if (restart) {
++                              ext4_handle_dirty_metadata(handle, dir,
++                                                         frame->bh);
++                              goto cleanup;
++                      }
+               } else {
+                       struct dx_root_info * info;
+-                      dxtrace(printk(KERN_DEBUG
+-                                     "Creating second level index...\n"));
++
+                       memcpy((char *) entries2, (char *) entries,
+                              icount * sizeof(struct dx_entry));
+                       dx_set_limit(entries2, dx_node_limit(dir));
+@@ -1814,19 +1850,16 @@ static int ext4_dx_add_entry(handle_t *h
+                       dx_set_block(entries + 0, newblock);
+                       info = dx_get_dx_info((struct ext4_dir_entry_2*)
+                                       frames[0].bh->b_data);
+-                      info->indirect_levels = 1;
+-
+-                      /* Add new access path frame */
+-                      frame = frames + 1;
+-                      frame->at = at = at - entries + entries2;
+-                      frame->entries = entries = entries2;
+-                      frame->bh = bh2;
+-                      err = ext4_journal_get_write_access(handle,
+-                                                           frame->bh);
+-                      if (err)
+-                              goto journal_error;
++                      info->indirect_levels += 1;
++                      dxtrace(printk(KERN_DEBUG
++                                     "Creating %d level index...\n",
++                                     info->indirect_levels));
++                      ext4_handle_dirty_metadata(handle, dir, frame->bh);
++                      ext4_handle_dirty_metadata(handle, dir, bh2);
++                      brelse(bh2);
++                      restart = 1;
++                      goto cleanup;
+               }
+-              err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
+               if (err) {
+                       ext4_std_error(inode->i_sb, err);
+                       goto cleanup;
+@@ -1840,6 +1873,10 @@ cleanup:
+       if (bh)
+               brelse(bh);
+       dx_release(frames);
++      /* @restart is true means htree-path has been changed, we need to
++       * repeat dx_probe() to find out valid htree-path */
++      if (restart && err == 0)
++              goto again;
+       return err;
+ }
+@@ -1874,7 +1911,7 @@ int ext4_delete_entry(handle_t *handle,
+                                       blocksize);
+                       else
+                               de->inode = 0;
+-                      dir->i_version++;
++                      inode_inc_iversion(dir);
+                       BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+                       ext4_handle_dirty_metadata(handle, dir, bh);
+                       return 0;
index a9014f2..0f101ed 100644 (file)
@@ -5,21 +5,15 @@
 
 --- a/fs/ext4/ext4.h
 +++ b/fs/ext4/ext4.h
-@@ -1895,6 +1895,19 @@ extern int ext4_orphan_add(handle_t *, s
+@@ -1895,6 +1895,13 @@ extern int ext4_orphan_add(handle_t *, s
  extern int ext4_orphan_del(handle_t *, struct inode *);
  extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
                                __u32 start_minor_hash, __u32 *next_hash);
 +extern struct inode *ext4_create_inode(handle_t *handle,
 +                                     struct inode * dir, int mode);
-+extern int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-+                        struct inode *inode);
 +extern int ext4_delete_entry(handle_t *handle, struct inode * dir,
 +                           struct ext4_dir_entry_2 * de_del,
 +                           struct buffer_head * bh);
-+extern struct buffer_head * ext4_find_entry(struct inode *dir,
-+                                          const struct qstr *d_name,
-+                                          struct ext4_dir_entry_2 ** res_dir);
-+#define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir)
 +extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
 +                             struct inode *inode);
 
  #include <linux/fs.h>
  #include <linux/pagemap.h>
  #include <linux/jbd2.h>
-@@ -873,9 +874,9 @@ static inline int search_dirblock(struct
-  * The returned buffer_head has ->b_count elevated.  The caller is expected
-  * to brelse() it when appropriate.
-  */
--static struct buffer_head * ext4_find_entry (struct inode *dir,
--                                      const struct qstr *d_name,
--                                      struct ext4_dir_entry_2 ** res_dir)
-+struct buffer_head * ext4_find_entry(struct inode *dir,
-+                                    const struct qstr *d_name,
-+                                    struct ext4_dir_entry_2 ** res_dir)
- {
-       struct super_block *sb;
-       struct buffer_head *bh_use[NAMEI_RA_SIZE];
-@@ -981,6 +982,7 @@ cleanup_and_exit:
-               brelse(bh_use[ra_ptr]);
-       return ret;
- }
-+EXPORT_SYMBOL(ext4_find_entry);
-
- static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
-                      struct ext4_dir_entry_2 **res_dir, int *err)
-@@ -1503,8 +1505,8 @@ static int make_indexed_dir(handle_t *ha
-  * may not sleep between calling this and putting something into
-  * the entry, as someone else might have used it while you slept.
-  */
--static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
--                        struct inode *inode)
-+int ext4_add_entry(handle_t *handle, struct dentry *dentry,
-+                 struct inode *inode)
- {
-       struct inode *dir = dentry->d_parent->d_inode;
-       struct buffer_head *bh;
-@@ -1555,6 +1557,7 @@ static int ext4_add_entry(handle_t *hand
-               ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
-       return retval;
- }
-+EXPORT_SYMBOL(ext4_add_entry);
-
- /*
-  * Returns 0 for success, or a negative error value
 @@ -1698,10 +1701,10 @@ cleanup:
   * ext4_delete_entry deletes a directory entry by merging it with the
   * previous entry
diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-pdirop.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-pdirop.patch
deleted file mode 100644 (file)
index 4d2acff..0000000
+++ /dev/null
@@ -1,2273 +0,0 @@
----
- fs/ext4/Makefile           |    2
- fs/ext4/ext4.h             |   93 ++++
- fs/ext4/htree_lock.c       |  880 +++++++++++++++++++++++++++++++++++++++++++++
- fs/ext4/inode.c            |    4
- fs/ext4/namei.c            |  585 +++++++++++++++++++++++++----
- include/linux/htree_lock.h |  187 +++++++++
- 6 files changed, 1650 insertions(+), 101 deletions(-)
-
---- a/fs/ext4/Makefile
-+++ b/fs/ext4/Makefile
-@@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
- ext4-y        := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
-               ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-               ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
--              mmp.o
-+              htree_lock.o mmp.o
-
- ext4-$(CONFIG_EXT4_FS_XATTR)          += xattr.o xattr_user.o xattr_trusted.o
- ext4-$(CONFIG_EXT4_FS_POSIX_ACL)      += acl.o
---- a/fs/ext4/ext4.h
-+++ b/fs/ext4/ext4.h
-@@ -28,6 +28,7 @@
- #include <linux/mutex.h>
- #include <linux/timer.h>
- #include <linux/wait.h>
-+#include <linux/htree_lock.h>
- #include <linux/blockgroup_lock.h>
- #include <linux/percpu_counter.h>
- #ifdef __KERNEL__
-@@ -1402,6 +1403,7 @@ static inline void ext4_clear_state_flag
- #define EXT4_FEATURE_INCOMPAT_FLEX_BG         0x0200
- #define EXT4_FEATURE_INCOMPAT_EA_INODE                0x0400 /* EA in inode */
- #define EXT4_FEATURE_INCOMPAT_DIRDATA         0x1000 /* data in dirent */
-+#define EXT4_FEATURE_INCOMPAT_LARGEDIR                0x4000
-
- #define EXT2_FEATURE_COMPAT_SUPP      EXT4_FEATURE_COMPAT_EXT_ATTR
- #define EXT2_FEATURE_INCOMPAT_SUPP    (EXT4_FEATURE_INCOMPAT_FILETYPE| \
-@@ -1427,7 +1429,8 @@ static inline void ext4_clear_state_flag
-                                        EXT4_FEATURE_INCOMPAT_FLEX_BG| \
-                                        EXT4_FEATURE_INCOMPAT_EA_INODE| \
-                                        EXT4_FEATURE_INCOMPAT_MMP| \
--                                       EXT4_FEATURE_INCOMPAT_DIRDATA)
-+                                       EXT4_FEATURE_INCOMPAT_DIRDATA| \
-+                                       EXT4_FEATURE_INCOMPAT_LARGEDIR)
-
- #define EXT4_FEATURE_RO_COMPAT_SUPP   (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
-                                        EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
-@@ -1690,6 +1693,76 @@ ext4_group_first_block_no(struct super_b
-  */
- #define ERR_BAD_DX_DIR        -75000
-
-+/* htree levels for ext4 */
-+#define EXT4_HTREE_LEVEL_COMPAT 2
-+#define EXT4_HTREE_LEVEL      3
-+
-+static inline int
-+ext4_dir_htree_level(struct super_block *sb)
-+{
-+      return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
-+              EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
-+}
-+
-+/* assume name-hash is protected by upper layer */
-+#define EXT4_HTREE_LOCK_HASH  0
-+
-+enum ext4_pdo_lk_types {
-+#if EXT4_HTREE_LOCK_HASH
-+      EXT4_LK_HASH,
-+#endif
-+      EXT4_LK_DX,             /* index block */
-+      EXT4_LK_DE,             /* directory entry block */
-+      EXT4_LK_SPIN,           /* spinlock */
-+      EXT4_LK_MAX,
-+};
-+
-+/* read-only bit */
-+#define EXT4_LB_RO(b)         (1 << (b))
-+/* read + write, high bits for writer */
-+#define EXT4_LB_RW(b)         ((1 << (b)) | (1 << (EXT4_LK_MAX + (b))))
-+
-+enum ext4_pdo_lock_bits {
-+      /* DX lock bits */
-+      EXT4_LB_DX_RO           = EXT4_LB_RO(EXT4_LK_DX),
-+      EXT4_LB_DX              = EXT4_LB_RW(EXT4_LK_DX),
-+      /* DE lock bits */
-+      EXT4_LB_DE_RO           = EXT4_LB_RO(EXT4_LK_DE),
-+      EXT4_LB_DE              = EXT4_LB_RW(EXT4_LK_DE),
-+      /* DX spinlock bits */
-+      EXT4_LB_SPIN_RO         = EXT4_LB_RO(EXT4_LK_SPIN),
-+      EXT4_LB_SPIN            = EXT4_LB_RW(EXT4_LK_SPIN),
-+      /* accurate searching */
-+      EXT4_LB_EXACT           = EXT4_LB_RO(EXT4_LK_MAX << 1),
-+};
-+
-+enum ext4_pdo_lock_opc {
-+      /* external */
-+      EXT4_HLOCK_READDIR      = (EXT4_LB_DE_RO | EXT4_LB_DX_RO),
-+      EXT4_HLOCK_LOOKUP       = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO |
-+                                 EXT4_LB_EXACT),
-+      EXT4_HLOCK_DEL          = (EXT4_LB_DE | EXT4_LB_SPIN_RO |
-+                                 EXT4_LB_EXACT),
-+      EXT4_HLOCK_ADD          = (EXT4_LB_DE | EXT4_LB_SPIN_RO),
-+
-+      /* internal */
-+      EXT4_HLOCK_LOOKUP_SAFE  = (EXT4_LB_DE_RO | EXT4_LB_DX_RO |
-+                                 EXT4_LB_EXACT),
-+      EXT4_HLOCK_DEL_SAFE     = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT),
-+      EXT4_HLOCK_SPLIT        = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN),
-+};
-+
-+extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits);
-+#define ext4_htree_lock_head_free(lhead)      htree_lock_head_free(lhead)
-+
-+extern struct htree_lock *ext4_htree_lock_alloc(void);
-+#define ext4_htree_lock_free(lck)             htree_lock_free(lck)
-+
-+extern void ext4_htree_lock(struct htree_lock *lck,
-+                          struct htree_lock_head *lhead,
-+                          struct inode *dir, unsigned flags);
-+#define ext4_htree_unlock(lck)                  htree_unlock(lck)
-+
- void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
-                       ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
-
-@@ -1964,14 +2037,16 @@ extern int ext4_htree_fill_tree(struct f
- extern struct inode *ext4_create_inode(handle_t *handle,
-                                      struct inode * dir, int mode);
- extern int ext4_add_entry(handle_t *handle, struct dentry *dentry,
--                        struct inode *inode);
-+                        struct inode *inode, struct htree_lock *lck);
- extern int ext4_delete_entry(handle_t *handle, struct inode * dir,
-                            struct ext4_dir_entry_2 * de_del,
-                            struct buffer_head * bh);
- extern struct buffer_head * ext4_find_entry(struct inode *dir,
-                                           const struct qstr *d_name,
--                                          struct ext4_dir_entry_2 ** res_dir);
--#define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir)
-+                                          struct ext4_dir_entry_2 **res_dir,
-+                                          struct htree_lock *lck);
-+#define ll_ext4_find_entry(inode, dentry, res_dir, lck) \
-+      ext4_find_entry(inode, &(dentry)->d_name, res_dir, lck)
- extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
-                              struct inode *inode, const void *, const void *);
- extern struct buffer_head *ext4_append(handle_t *handle,
-@@ -2104,13 +2179,15 @@ static inline void ext4_r_blocks_count_s
-       es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
- }
-
--static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
-+static inline loff_t ext4_isize(struct super_block *sb,
-+                              struct ext4_inode *raw_inode)
- {
--      if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
-+      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ||
-+          S_ISREG(le16_to_cpu(raw_inode->i_mode)))
-               return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
-                       le32_to_cpu(raw_inode->i_size_lo);
--      else
--              return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
-+
-+      return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
- }
-
- static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
---- /dev/null
-+++ b/fs/ext4/htree_lock.c
-@@ -0,0 +1,880 @@
-+/*
-+ * fs/ext4/htree_lock.c
-+ *
-+ * Copyright (c) 2011, 2012, Intel Corporation.
-+ *
-+ * Author: Liang Zhen <liang@whamcloud.com>
-+ */
-+#include <linux/jbd2.h>
-+#include <linux/hash.h>
-+#include <linux/module.h>
-+#include <linux/htree_lock.h>
-+
-+enum {
-+      HTREE_LOCK_BIT_EX       = (1 << HTREE_LOCK_EX),
-+      HTREE_LOCK_BIT_PW       = (1 << HTREE_LOCK_PW),
-+      HTREE_LOCK_BIT_PR       = (1 << HTREE_LOCK_PR),
-+      HTREE_LOCK_BIT_CW       = (1 << HTREE_LOCK_CW),
-+      HTREE_LOCK_BIT_CR       = (1 << HTREE_LOCK_CR),
-+};
-+
-+enum {
-+      HTREE_LOCK_COMPAT_EX    = 0,
-+      HTREE_LOCK_COMPAT_PW    = HTREE_LOCK_COMPAT_EX | HTREE_LOCK_BIT_CR,
-+      HTREE_LOCK_COMPAT_PR    = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_PR,
-+      HTREE_LOCK_COMPAT_CW    = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_CW,
-+      HTREE_LOCK_COMPAT_CR    = HTREE_LOCK_COMPAT_CW | HTREE_LOCK_BIT_PR |
-+                                HTREE_LOCK_BIT_PW,
-+};
-+
-+static int htree_lock_compat[] = {
-+      [HTREE_LOCK_EX]         HTREE_LOCK_COMPAT_EX,
-+      [HTREE_LOCK_PW]         HTREE_LOCK_COMPAT_PW,
-+      [HTREE_LOCK_PR]         HTREE_LOCK_COMPAT_PR,
-+      [HTREE_LOCK_CW]         HTREE_LOCK_COMPAT_CW,
-+      [HTREE_LOCK_CR]         HTREE_LOCK_COMPAT_CR,
-+};
-+
-+/* max allowed htree-lock depth.
-+ * We only need depth=3 for ext4 although user can have higher value. */
-+#define HTREE_LOCK_DEP_MAX    16
-+
-+#ifdef HTREE_LOCK_DEBUG
-+
-+static char *hl_name[] = {
-+      [HTREE_LOCK_EX]         "EX",
-+      [HTREE_LOCK_PW]         "PW",
-+      [HTREE_LOCK_PR]         "PR",
-+      [HTREE_LOCK_CW]         "CW",
-+      [HTREE_LOCK_CR]         "CR",
-+};
-+
-+/* lock stats */
-+struct htree_lock_node_stats {
-+      unsigned long long      blocked[HTREE_LOCK_MAX];
-+      unsigned long long      granted[HTREE_LOCK_MAX];
-+      unsigned long long      retried[HTREE_LOCK_MAX];
-+      unsigned long long      events;
-+};
-+
-+struct htree_lock_stats {
-+      struct htree_lock_node_stats    nodes[HTREE_LOCK_DEP_MAX];
-+      unsigned long long      granted[HTREE_LOCK_MAX];
-+      unsigned long long      blocked[HTREE_LOCK_MAX];
-+};
-+
-+static struct htree_lock_stats hl_stats;
-+
-+void htree_lock_stat_reset(void)
-+{
-+      memset(&hl_stats, 0, sizeof(hl_stats));
-+}
-+
-+void htree_lock_stat_print(int depth)
-+{
-+      int     i;
-+      int     j;
-+
-+      printk(KERN_DEBUG "HTREE LOCK STATS:\n");
-+      for (i = 0; i < HTREE_LOCK_MAX; i++) {
-+              printk(KERN_DEBUG "[%s]: G [%10llu], B [%10llu]\n",
-+                     hl_name[i], hl_stats.granted[i], hl_stats.blocked[i]);
-+      }
-+      for (i = 0; i < depth; i++) {
-+              printk(KERN_DEBUG "HTREE CHILD [%d] STATS:\n", i);
-+              for (j = 0; j < HTREE_LOCK_MAX; j++) {
-+                      printk(KERN_DEBUG
-+                              "[%s]: G [%10llu], B [%10llu], R [%10llu]\n",
-+                              hl_name[j], hl_stats.nodes[i].granted[j],
-+                              hl_stats.nodes[i].blocked[j],
-+                              hl_stats.nodes[i].retried[j]);
-+              }
-+      }
-+}
-+
-+#define lk_grant_inc(m)       do { hl_stats.granted[m]++; } while (0)
-+#define lk_block_inc(m)       do { hl_stats.blocked[m]++; } while (0)
-+#define ln_grant_inc(d, m)    do { hl_stats.nodes[d].granted[m]++; } while (0)
-+#define ln_block_inc(d, m)    do { hl_stats.nodes[d].blocked[m]++; } while (0)
-+#define ln_retry_inc(d, m)    do { hl_stats.nodes[d].retried[m]++; } while (0)
-+#define ln_event_inc(d)       do { hl_stats.nodes[d].events++; } while (0)
-+
-+#else /* !DEBUG */
-+
-+void htree_lock_stat_reset(void) {}
-+void htree_lock_stat_print(int depth) {}
-+
-+#define lk_grant_inc(m)             do {} while (0)
-+#define lk_block_inc(m)             do {} while (0)
-+#define ln_grant_inc(d, m)    do {} while (0)
-+#define ln_block_inc(d, m)    do {} while (0)
-+#define ln_retry_inc(d, m)    do {} while (0)
-+#define ln_event_inc(d)             do {} while (0)
-+
-+#endif /* DEBUG */
-+
-+EXPORT_SYMBOL(htree_lock_stat_reset);
-+EXPORT_SYMBOL(htree_lock_stat_print);
-+
-+#define HTREE_DEP_ROOT                  (-1) /* "depth" of the top-level lock */
-+
-+#define htree_spin_lock(lhead, dep)                           \
-+      bit_spin_lock((dep) + 1, &(lhead)->lh_lock) /* bit 0 is the root */
-+#define htree_spin_unlock(lhead, dep)                         \
-+      bit_spin_unlock((dep) + 1, &(lhead)->lh_lock) /* same bit as lock */
-+
-+#define htree_key_event_ignore(child, ln)                     \
-+      (!((child)->lc_events & (1 << (ln)->ln_mode))) /* no bit for ln_mode */
-+
-+/* return non-zero if @ln is linked on neither the major nor the minor key
-+ * list */
-+static int
-+htree_key_list_empty(struct htree_lock_node *ln)
-+{
-+      if (!list_empty(&ln->ln_major_list))
-+              return 0;
-+
-+      return list_empty(&ln->ln_minor_list) != 0;
-+}
-+
-+/* unlink @ln from the minor and major key lists; if @ln was on the major
-+ * list and had a successor on its minor list, that successor takes over
-+ * the position of @ln on the major list */
-+static void
-+htree_key_list_del_init(struct htree_lock_node *ln)
-+{
-+      struct htree_lock_node *heir = NULL;
-+
-+      if (!list_empty(&ln->ln_minor_list)) {
-+              /* remember the next node before leaving the minor list */
-+              heir = list_entry(ln->ln_minor_list.next,
-+                                struct htree_lock_node, ln_minor_list);
-+              list_del_init(&ln->ln_minor_list);
-+      }
-+
-+      if (!list_empty(&ln->ln_major_list)) {
-+              if (heir != NULL) {
-+                      BUG_ON(!list_empty(&heir->ln_major_list));
-+                      list_replace_init(&ln->ln_major_list,
-+                                        &heir->ln_major_list);
-+              } else { /* was not on a minor key list */
-+                      list_del_init(&ln->ln_major_list);
-+              }
-+      }
-+}
-+
-+/* for each of the major and minor key lists that @old is linked on, put
-+ * @new in its place and re-initialize the list head of @old */
-+static void
-+htree_key_list_replace_init(struct htree_lock_node *old,
-+                          struct htree_lock_node *new)
-+{
-+      struct list_head *major = &old->ln_major_list;
-+      struct list_head *minor = &old->ln_minor_list;
-+
-+      if (!list_empty(major))
-+              list_replace_init(major, &new->ln_major_list);
-+
-+      if (!list_empty(minor))
-+              list_replace_init(minor, &new->ln_minor_list);
-+}
-+
-+/* deliver @event to every listener (HTREE_LOCK_NL node) found on the alive
-+ * list of @ln, through the callback registered on @child.
-+ * Nothing is delivered if @event is NULL or if @child has no event bit set
-+ * for the mode of @ln.
-+ * NB: ALWAYS called holding lhead::lh_lock(dep) */
-+static void
-+htree_key_event_enqueue(struct htree_lock_child *child,
-+                      struct htree_lock_node *ln, int dep, void *event)
-+{
-+      struct htree_lock_node *peer;
-+
-+      BUG_ON(ln->ln_mode == HTREE_LOCK_NL);
-+      if (event == NULL)
-+              return;
-+      if (htree_key_event_ignore(child, ln))
-+              return;
-+
-+      /* shouldn't be a very long list */
-+      list_for_each_entry(peer, &ln->ln_alive_list, ln_alive_list) {
-+              if (peer->ln_mode != HTREE_LOCK_NL)
-+                      continue;
-+
-+              ln_event_inc(dep);
-+              if (child->lc_callback == NULL)
-+                      continue;
-+
-+              child->lc_callback(peer->ln_ev_target, event);
-+      }
-+}
-+
-+static int
-+htree_node_lock_enqueue(struct htree_lock *newlk, struct htree_lock *curlk,
-+                      unsigned dep, int wait, void *event)
-+{
-+      struct htree_lock_child *child = &newlk->lk_head->lh_children[dep];
-+      struct htree_lock_node *newln = &newlk->lk_nodes[dep];
-+      struct htree_lock_node *curln = &curlk->lk_nodes[dep];
-+
-+      /*
-+       * Attach @newlk to @curlk, whose node at level @dep has the same key.
-+       * Returns 1 if granted with lh_lock(dep) still held, 0 if granted
-+       * after sleeping with lh_lock(dep) already released, and -1 if the
-+       * request conflicts and @wait is false (ln_mode is reset to INVAL).
-+       * NB: ALWAYS called holding lhead::lh_lock(dep)
-+       */
-+      /* NB: we only expect PR/PW lock modes here, only these two modes are
-+       * allowed for htree_node_lock(asserted in htree_node_lock_internal),
-+       * NL is only used for listener, user can't directly require NL mode */
-+      if ((curln->ln_mode == HTREE_LOCK_NL) ||
-+          (curln->ln_mode != HTREE_LOCK_PW &&
-+           newln->ln_mode != HTREE_LOCK_PW)) {
-+              /* no conflict, attach it on granted list of @curlk */
-+              if (curln->ln_mode != HTREE_LOCK_NL) {
-+                      list_add(&newln->ln_granted_list,
-+                               &curln->ln_granted_list);
-+              } else {
-+                      /* replace key owner */
-+                      htree_key_list_replace_init(curln, newln);
-+              }
-+
-+              list_add(&newln->ln_alive_list, &curln->ln_alive_list);
-+              htree_key_event_enqueue(child, newln, dep, event);
-+              ln_grant_inc(dep, newln->ln_mode);
-+              return 1; /* still hold lh_lock */
-+      }
-+
-+      if (!wait) { /* can't grant and don't want to wait */
-+              ln_retry_inc(dep, newln->ln_mode);
-+              newln->ln_mode = HTREE_LOCK_INVAL;
-+              return -1; /* don't wait and just return -1 */
-+      }
-+
-+      newlk->lk_task = current;
-+      set_current_state(TASK_UNINTERRUPTIBLE);
-+      /* conflict, attach it on blocked list of curlk */
-+      list_add_tail(&newln->ln_blocked_list, &curln->ln_blocked_list);
-+      list_add(&newln->ln_alive_list, &curln->ln_alive_list);
-+      ln_block_inc(dep, newln->ln_mode);
-+
-+      htree_spin_unlock(newlk->lk_head, dep);
-+      /* wait to be given the lock */
-+      if (newlk->lk_task != NULL)
-+              schedule();
-+      /* granted, no doubt, wake up will set me RUNNING */
-+      if (event == NULL || htree_key_event_ignore(child, newln))
-+              return 0; /* granted without lh_lock */
-+
-+      /* an event has to be delivered, which requires lh_lock(dep) again */
-+      htree_spin_lock(newlk->lk_head, dep);
-+      htree_key_event_enqueue(child, newln, dep, event);
-+      return 1; /* still hold lh_lock */
-+}
-+
-+/*
-+ * get PR/PW access to a particular tree-node according to @dep and @key.
-+ * @key is hashed down to lh_hbits bits and split into a major key (the low
-+ * ma_bits bits) and a minor key (the remaining high bits); locks are kept
-+ * on a list ordered by major key, and locks sharing a major key are chained
-+ * on a list ordered by minor key.
-+ * Returns 1 if no other lock has the same key (granted, lh_lock still held),
-+ * otherwise whatever htree_node_lock_enqueue() returns, which is -1 if
-+ * @wait is false and the lock can't be granted immediately.
-+ * All listeners(HTREE_LOCK_NL) on @dep and with the same @key will get
-+ * @event if it's not NULL.
-+ * NB: ALWAYS called holding lhead::lh_lock
-+ */
-+static int
-+htree_node_lock_internal(struct htree_lock_head *lhead, struct htree_lock *lck,
-+                       htree_lock_mode_t mode, u32 key, unsigned dep,
-+                       int wait, void *event)
-+{
-+      LIST_HEAD               (list);
-+      struct htree_lock       *tmp;
-+      struct htree_lock       *tmp2;
-+      u16                     major;
-+      u16                     minor;
-+      u8                      reverse;
-+      u8                      ma_bits;
-+      u8                      mi_bits;
-+
-+      BUG_ON(mode != HTREE_LOCK_PW && mode != HTREE_LOCK_PR);
-+      BUG_ON(htree_node_is_granted(lck, dep));
-+
-+      key = hash_long(key, lhead->lh_hbits);
-+
-+      mi_bits = lhead->lh_hbits >> 1;
-+      ma_bits = lhead->lh_hbits - mi_bits;
-+
-+      lck->lk_nodes[dep].ln_major_key = major = key & ((1U << ma_bits) - 1);
-+      lck->lk_nodes[dep].ln_minor_key = minor = key >> ma_bits;
-+      lck->lk_nodes[dep].ln_mode = mode;
-+
-+      /*
-+       * The major key list is an ordered list, so searches are started
-+       * at the end of the list that is numerically closer to major_key,
-+       * so at most half of the list will be walked (for well-distributed
-+       * keys). The list traversal aborts early if the expected key
-+       * location is passed.
-+       */
-+      reverse = (major >= (1 << (ma_bits - 1)));
-+
-+      if (reverse) {
-+              list_for_each_entry_reverse(tmp,
-+                                      &lhead->lh_children[dep].lc_list,
-+                                      lk_nodes[dep].ln_major_list) {
-+                      if (tmp->lk_nodes[dep].ln_major_key == major) {
-+                              goto search_minor;
-+
-+                      } else if (tmp->lk_nodes[dep].ln_major_key < major) {
-+                              /* attach _after_ @tmp */
-+                              list_add(&lck->lk_nodes[dep].ln_major_list,
-+                                       &tmp->lk_nodes[dep].ln_major_list);
-+                              goto out_grant_major;
-+                      }
-+              }
-+
-+              list_add(&lck->lk_nodes[dep].ln_major_list,
-+                       &lhead->lh_children[dep].lc_list);
-+              goto out_grant_major;
-+
-+      } else {
-+              list_for_each_entry(tmp, &lhead->lh_children[dep].lc_list,
-+                                  lk_nodes[dep].ln_major_list) {
-+                      if (tmp->lk_nodes[dep].ln_major_key == major) {
-+                              goto search_minor;
-+
-+                      } else if (tmp->lk_nodes[dep].ln_major_key > major) {
-+                              /* insert _before_ @tmp */
-+                              list_add_tail(&lck->lk_nodes[dep].ln_major_list,
-+                                      &tmp->lk_nodes[dep].ln_major_list);
-+                              goto out_grant_major;
-+                      }
-+              }
-+
-+              list_add_tail(&lck->lk_nodes[dep].ln_major_list,
-+                            &lhead->lh_children[dep].lc_list);
-+              goto out_grant_major;
-+      }
-+
-+ search_minor:
-+      /*
-+       * NB: minor_key list doesn't have a "head", @list is just a
-+       * temporary stub for helping list searching, make sure it's removed
-+       * after searching.
-+       * minor_key list is an ordered list too.
-+       */
-+      list_add_tail(&list, &tmp->lk_nodes[dep].ln_minor_list);
-+
-+      reverse = (minor >= (1 << (mi_bits - 1)));
-+
-+      if (reverse) {
-+              list_for_each_entry_reverse(tmp2, &list,
-+                                          lk_nodes[dep].ln_minor_list) {
-+                      if (tmp2->lk_nodes[dep].ln_minor_key == minor) {
-+                              goto out_enqueue;
-+
-+                      } else if (tmp2->lk_nodes[dep].ln_minor_key < minor) {
-+                              /* attach _after_ @tmp2 */
-+                              list_add(&lck->lk_nodes[dep].ln_minor_list,
-+                                       &tmp2->lk_nodes[dep].ln_minor_list);
-+                              goto out_grant_minor;
-+                      }
-+              }
-+
-+              list_add(&lck->lk_nodes[dep].ln_minor_list, &list);
-+
-+      } else {
-+              list_for_each_entry(tmp2, &list,
-+                                  lk_nodes[dep].ln_minor_list) {
-+                      if (tmp2->lk_nodes[dep].ln_minor_key == minor) {
-+                              goto out_enqueue;
-+
-+                      } else if (tmp2->lk_nodes[dep].ln_minor_key > minor) {
-+                              /* insert _before_ @tmp2 */
-+                              list_add_tail(&lck->lk_nodes[dep].ln_minor_list,
-+                                      &tmp2->lk_nodes[dep].ln_minor_list);
-+                              goto out_grant_minor;
-+                      }
-+              }
-+
-+              list_add_tail(&lck->lk_nodes[dep].ln_minor_list, &list);
-+      }
-+
-+ out_grant_minor:
-+      if (list.next == &lck->lk_nodes[dep].ln_minor_list) {
-+              /* new lock @lck is the first one on minor_key list, which
-+               * means it has the smallest minor_key and it should
-+               * replace @tmp as minor_key owner */
-+              list_replace_init(&tmp->lk_nodes[dep].ln_major_list,
-+                                &lck->lk_nodes[dep].ln_major_list);
-+      }
-+      /* remove the temporary head */
-+      list_del(&list);
-+
-+ out_grant_major:
-+      ln_grant_inc(dep, lck->lk_nodes[dep].ln_mode);
-+      return 1; /* granted with holding lh_lock */
-+
-+ out_enqueue:
-+      list_del(&list); /* remove temporary head */
-+      return htree_node_lock_enqueue(lck, tmp2, dep, wait, event);
-+}
-+
-+/*
-+ * release the key of @curlk at level @dep, and grant any blocked locks.
-+ * caller will still listen on @key if @event is not NULL (and events are
-+ * not disabled for this level), which means caller can see an event
-+ * (by event_cb) while granting any lock with the same key at level @dep.
-+ * Does nothing if @curlk is not granted at level @dep.
-+ * NB: ALWAYS called holding lhead::lh_lock
-+ * NB: listener will not block anyone because listening mode is HTREE_LOCK_NL
-+ */
-+static void
-+htree_node_unlock_internal(struct htree_lock_head *lhead,
-+                         struct htree_lock *curlk, unsigned dep, void *event)
-+{
-+      struct htree_lock_node  *curln = &curlk->lk_nodes[dep];
-+      struct htree_lock       *grtlk = NULL;
-+      struct htree_lock_node  *grtln;
-+      struct htree_lock       *poslk;
-+      struct htree_lock       *tmplk;
-+
-+      if (!htree_node_is_granted(curlk, dep))
-+              return;
-+
-+      if (!list_empty(&curln->ln_granted_list)) {
-+              /* there is another granted lock */
-+              grtlk = list_entry(curln->ln_granted_list.next,
-+                                 struct htree_lock,
-+                                 lk_nodes[dep].ln_granted_list);
-+              list_del_init(&curln->ln_granted_list);
-+      }
-+
-+      if (grtlk == NULL && !list_empty(&curln->ln_blocked_list)) {
-+              /*
-+               * @curlk is the only granted lock, so we confirmed:
-+               * a) curln is key owner (attached on major/minor_list),
-+               *    so if there is any blocked lock, it should be attached
-+               *    on curln->ln_blocked_list
-+               * b) we always can grant the first blocked lock
-+               */
-+              grtlk = list_entry(curln->ln_blocked_list.next,
-+                                 struct htree_lock,
-+                                 lk_nodes[dep].ln_blocked_list);
-+              BUG_ON(grtlk->lk_task == NULL);
-+              wake_up_process(grtlk->lk_task);
-+      }
-+
-+      if (event != NULL &&
-+          lhead->lh_children[dep].lc_events != HTREE_EVENT_DISABLE) {
-+              curln->ln_ev_target = event;
-+              curln->ln_mode = HTREE_LOCK_NL; /* listen! */
-+      } else {
-+              curln->ln_mode = HTREE_LOCK_INVAL;
-+      }
-+
-+      if (grtlk == NULL) { /* I must be the only one locking this key */
-+              struct htree_lock_node *tmpln;
-+
-+              BUG_ON(htree_key_list_empty(curln));
-+
-+              if (curln->ln_mode == HTREE_LOCK_NL) /* listening */
-+                      return;
-+
-+              /* not listening */
-+              if (list_empty(&curln->ln_alive_list)) { /* no more listener */
-+                      htree_key_list_del_init(curln);
-+                      return;
-+              }
-+
-+              tmpln = list_entry(curln->ln_alive_list.next,
-+                                 struct htree_lock_node, ln_alive_list);
-+
-+              BUG_ON(tmpln->ln_mode != HTREE_LOCK_NL);
-+
-+              /* hand key ownership over to the next listener */
-+              htree_key_list_replace_init(curln, tmpln);
-+              list_del_init(&curln->ln_alive_list);
-+
-+              return;
-+      }
-+
-+      /* have a granted lock */
-+      grtln = &grtlk->lk_nodes[dep];
-+      if (!list_empty(&curln->ln_blocked_list)) {
-+              /* only key owner can be on both lists */
-+              BUG_ON(htree_key_list_empty(curln));
-+
-+              if (list_empty(&grtln->ln_blocked_list)) {
-+                      list_add(&grtln->ln_blocked_list,
-+                               &curln->ln_blocked_list);
-+              }
-+              list_del_init(&curln->ln_blocked_list);
-+      }
-+      /*
-+       * NB: this is the tricky part:
-+       * We have only two modes for child-lock (PR and PW), also,
-+       * only owner of the key (attached on major/minor_list) can be on
-+       * both blocked_list and granted_list, so @grtlk must be one
-+       * of these two cases:
-+       *
-+       * a) @grtlk is taken from granted_list, which means we've granted
-+       *    more than one lock so @grtlk has to be PR, the first blocked
-+       *    lock must be PW and we can't grant it at all.
-+       *    So even @grtlk is not owner of the key (empty blocked_list),
-+       *    we don't care because we can't grant any lock.
-+       * b) we just grant a new lock which is taken from head of blocked
-+       *    list, and it should be the first granted lock, and it should
-+       *    be the first one linked on blocked_list.
-+       *
-+       * Either way, we can get correct result by iterating blocked_list
-+       * of @grtlk, and don't have to bother on how to find out
-+       * owner of current key.
-+       */
-+      list_for_each_entry_safe(poslk, tmplk, &grtln->ln_blocked_list,
-+                               lk_nodes[dep].ln_blocked_list) {
-+              if (grtlk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW ||
-+                  poslk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW)
-+                      break;
-+              /* grant all readers */
-+              list_del_init(&poslk->lk_nodes[dep].ln_blocked_list);
-+              list_add(&poslk->lk_nodes[dep].ln_granted_list,
-+                       &grtln->ln_granted_list);
-+
-+              BUG_ON(poslk->lk_task == NULL);
-+              wake_up_process(poslk->lk_task);
-+      }
-+
-+      /* if @curln is the owner of this key, replace it with @grtln */
-+      if (!htree_key_list_empty(curln))
-+              htree_key_list_replace_init(curln, grtln);
-+
-+      if (curln->ln_mode == HTREE_LOCK_INVAL) /* not staying as listener */
-+              list_del_init(&curln->ln_alive_list);
-+}
-+
-+/*
-+ * exported wrapper of htree_node_lock_internal: takes lh_lock(dep), asks for
-+ * the node lock and makes sure lh_lock(dep) is released on return.
-+ * Returns 1 if the lock is granted, and 0 only if @wait is false and the
-+ * lock can't be granted immediately.
-+ */
-+int
-+htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode,
-+                  u32 key, unsigned dep, int wait, void *event)
-+{
-+      struct htree_lock_head *head = lck->lk_head;
-+      int ret;
-+
-+      BUG_ON(dep >= lck->lk_depth);
-+      BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
-+
-+      htree_spin_lock(head, dep);
-+      ret = htree_node_lock_internal(head, lck, mode, key, dep, wait, event);
-+      if (ret == 0) /* granted, and lh_lock was already released */
-+              return 1;
-+
-+      htree_spin_unlock(head, dep);
-+      return ret > 0;
-+}
-+EXPORT_SYMBOL(htree_node_lock_try);
-+
-+/* exported wrapper of htree_node_unlock_internal: release the child-lock of
-+ * @lck at level @dep while holding lh_lock(dep); @event is passed through */
-+void
-+htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event)
-+{
-+      struct htree_lock_head *head;
-+
-+      BUG_ON(dep >= lck->lk_depth);
-+      BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
-+
-+      head = lck->lk_head;
-+
-+      htree_spin_lock(head, dep);
-+      htree_node_unlock_internal(head, lck, dep, event);
-+      htree_spin_unlock(head, dep);
-+}
-+EXPORT_SYMBOL(htree_node_unlock);
-+
-+/* stop listening on child-lock level @dep.
-+ * The node must not be granted nor linked on a blocked/granted list; if it
-+ * is not listening either this is a no-op. */
-+void
-+htree_node_stop_listen(struct htree_lock *lck, unsigned dep)
-+{
-+      struct htree_lock_node *ln = &lck->lk_nodes[dep];
-+      struct htree_lock_node *heir;
-+
-+      BUG_ON(htree_node_is_granted(lck, dep));
-+      BUG_ON(!list_empty(&ln->ln_blocked_list));
-+      BUG_ON(!list_empty(&ln->ln_granted_list));
-+
-+      if (!htree_node_is_listening(lck, dep))
-+              return;
-+
-+      htree_spin_lock(lck->lk_head, dep);
-+      ln->ln_mode = HTREE_LOCK_INVAL;
-+      ln->ln_ev_target = NULL;
-+
-+      if (htree_key_list_empty(ln)) {
-+              /* not owner, just leave the alive list */
-+              list_del_init(&ln->ln_alive_list);
-+      } else if (list_empty(&ln->ln_alive_list)) {
-+              /* I'm the owner and there is no more listener */
-+              htree_key_list_del_init(ln);
-+      } else {
-+              /* I'm the owner, pass the key on to the next listener */
-+              heir = list_entry(ln->ln_alive_list.next,
-+                                struct htree_lock_node, ln_alive_list);
-+
-+              BUG_ON(heir->ln_mode != HTREE_LOCK_NL);
-+              htree_key_list_replace_init(ln, heir);
-+              list_del_init(&ln->ln_alive_list);
-+      }
-+
-+      htree_spin_unlock(lck->lk_head, dep);
-+}
-+EXPORT_SYMBOL(htree_node_stop_listen);
-+
-+/* walk every level of @lck: unlock the child-lock if it is granted (without
-+ * becoming a listener), otherwise stop listening if it is a listener */
-+static void
-+htree_node_release_all(struct htree_lock *lck)
-+{
-+      int     dep;
-+
-+      for (dep = 0; dep < lck->lk_depth; dep++) {
-+              if (htree_node_is_granted(lck, dep)) {
-+                      htree_node_unlock(lck, dep, NULL);
-+                      continue;
-+              }
-+
-+              if (htree_node_is_listening(lck, dep))
-+                      htree_node_stop_listen(lck, dep);
-+      }
-+}
-+
-+/*
-+ * obtain htree lock, it could be blocked inside if there's conflict
-+ * with any granted or blocked lock and @wait is true.
-+ * Returns 1 if granted immediately (lh_lock still held), 0 if granted
-+ * after sleeping (lh_lock already released) and -1 if there is a conflict
-+ * and @wait is false (lh_lock still held, nothing is queued).
-+ * NB: ALWAYS called holding lhead::lh_lock
-+ */
-+static int
-+htree_lock_internal(struct htree_lock *lck, int wait)
-+{
-+      struct htree_lock_head *lhead = lck->lk_head;
-+      int     granted = 0;
-+      int     blocked = 0;
-+      int     i;
-+
-+      /* build bitmasks of the modes that are currently granted/blocked */
-+      for (i = 0; i < HTREE_LOCK_MAX; i++) {
-+              if (lhead->lh_ngranted[i] != 0)
-+                      granted |= 1 << i;
-+              if (lhead->lh_nblocked[i] != 0)
-+                      blocked |= 1 << i;
-+      }
-+      if ((htree_lock_compat[lck->lk_mode] & granted) != granted ||
-+          (htree_lock_compat[lck->lk_mode] & blocked) != blocked) {
-+              /* will block current lock even it just conflicts with any
-+               * other blocked lock, so lock like EX wouldn't starve */
-+              if (!wait)
-+                      return -1;
-+              lhead->lh_nblocked[lck->lk_mode]++;
-+              lk_block_inc(lck->lk_mode);
-+
-+              lck->lk_task = current;
-+              list_add_tail(&lck->lk_blocked_list, &lhead->lh_blocked_list);
-+
-+              set_current_state(TASK_UNINTERRUPTIBLE);
-+              htree_spin_unlock(lhead, HTREE_DEP_ROOT);
-+              /* wait to be given the lock */
-+              if (lck->lk_task != NULL)
-+                      schedule();
-+              /* granted, no doubt. wake up will set me RUNNING */
-+              return 0; /* without lh_lock */
-+      }
-+      lhead->lh_ngranted[lck->lk_mode]++;
-+      lk_grant_inc(lck->lk_mode);
-+      return 1;
-+}
-+
-+/* release htree lock: drop the granted count of the mode of @lck, mark @lck
-+ * as HTREE_LOCK_INVAL, then grant and wake blocked locks in queue order
-+ * until the first one that conflicts with what is granted.
-+ * NB: ALWAYS called holding lhead::lh_lock */
-+static void
-+htree_unlock_internal(struct htree_lock *lck)
-+{
-+      struct htree_lock_head *lhead = lck->lk_head;
-+      struct htree_lock *tmp;
-+      struct htree_lock *tmp2;
-+      int granted = 0;
-+      int i;
-+
-+      BUG_ON(lhead->lh_ngranted[lck->lk_mode] == 0);
-+
-+      lhead->lh_ngranted[lck->lk_mode]--;
-+      lck->lk_mode = HTREE_LOCK_INVAL;
-+
-+      /* bitmask of the modes that are still granted after this release */
-+      for (i = 0; i < HTREE_LOCK_MAX; i++) {
-+              if (lhead->lh_ngranted[i] != 0)
-+                      granted |= 1 << i;
-+      }
-+      list_for_each_entry_safe(tmp, tmp2,
-+                               &lhead->lh_blocked_list, lk_blocked_list) {
-+              /* conflict with any granted lock? */
-+              if ((htree_lock_compat[tmp->lk_mode] & granted) != granted)
-+                      break;
-+
-+              list_del_init(&tmp->lk_blocked_list);
-+
-+              BUG_ON(lhead->lh_nblocked[tmp->lk_mode] == 0);
-+
-+              lhead->lh_nblocked[tmp->lk_mode]--;
-+              lhead->lh_ngranted[tmp->lk_mode]++;
-+              granted |= 1 << tmp->lk_mode; /* later waiters must fit too */
-+
-+              BUG_ON(tmp->lk_task == NULL);
-+              wake_up_process(tmp->lk_task);
-+      }
-+}
-+
-+/* exported wrapper of htree_lock_internal: bind @lck to @lhead with @mode
-+ * and try to obtain the htree lock; lh_lock is released on return.
-+ * It always returns 1 with granted lock if @wait is true, it can return 0
-+ * if @wait is false and locking request can't be granted immediately */
-+int
-+htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead,
-+             htree_lock_mode_t mode, int wait)
-+{
-+      int     ret;
-+
-+      BUG_ON(lck->lk_depth > lhead->lh_depth);
-+      BUG_ON(lck->lk_head != NULL);
-+      BUG_ON(lck->lk_task != NULL);
-+
-+      lck->lk_head = lhead;
-+      lck->lk_mode = mode;
-+
-+      htree_spin_lock(lhead, HTREE_DEP_ROOT);
-+      ret = htree_lock_internal(lck, wait);
-+      if (ret == 0) /* granted after sleeping, lh_lock already released */
-+              return 1;
-+
-+      htree_spin_unlock(lhead, HTREE_DEP_ROOT);
-+      return ret > 0;
-+}
-+EXPORT_SYMBOL(htree_lock_try);
-+
-+/* exported wrapper of htree_unlock_internal.
-+ * It releases all htree_node_locks first, then the htree_lock itself, and
-+ * finally detaches @lck from its head (lk_head and lk_task become NULL) */
-+void
-+htree_unlock(struct htree_lock *lck)
-+{
-+      struct htree_lock_head *head;
-+
-+      BUG_ON(lck->lk_head == NULL);
-+      BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
-+
-+      htree_node_release_all(lck);
-+
-+      head = lck->lk_head;
-+      htree_spin_lock(head, HTREE_DEP_ROOT);
-+      htree_unlock_internal(lck);
-+      htree_spin_unlock(head, HTREE_DEP_ROOT);
-+
-+      lck->lk_head = NULL;
-+      lck->lk_task = NULL;
-+}
-+EXPORT_SYMBOL(htree_unlock);
-+
-+/* change lock mode: only overwrites lk_mode of @lck, which must currently
-+ * hold a valid mode; no lock is taken and no counter of the head is
-+ * touched here */
-+void
-+htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode)
-+{
-+      BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
-+      lck->lk_mode = mode;
-+}
-+EXPORT_SYMBOL(htree_change_mode);
-+
-+/* release htree lock, and lock it again with new mode.
-+ * All htree_node_locks and the htree_lock are released first, then the
-+ * htree_lock is requested again with the new @mode, which must differ from
-+ * the current one.
-+ * It always returns 1 with granted lock if @wait is true, it can return 0
-+ * if @wait is false and locking request can't be granted immediately */
-+int
-+htree_change_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, int wait)
-+{
-+      struct htree_lock_head *head = lck->lk_head;
-+      int ret;
-+
-+      BUG_ON(head == NULL);
-+      BUG_ON(lck->lk_mode == mode);
-+      BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL || mode == HTREE_LOCK_INVAL);
-+
-+      htree_node_release_all(lck);
-+
-+      htree_spin_lock(head, HTREE_DEP_ROOT);
-+      htree_unlock_internal(lck);
-+      lck->lk_mode = mode;
-+      ret = htree_lock_internal(lck, wait);
-+      if (ret == 0) /* granted after sleeping, lh_lock already released */
-+              return 1;
-+
-+      htree_spin_unlock(head, HTREE_DEP_ROOT);
-+      return ret > 0;
-+}
-+EXPORT_SYMBOL(htree_change_lock_try);
-+
-+/*
-+ * create a htree_lock head, which is a per resource structure.
-+ *
-+ * @depth: number of levels (child-locks), at most HTREE_LOCK_DEP_MAX
-+ * @hbits: requested number of hash bits for child-lock keys, clamped to
-+ *         [HTREE_HBITS_MIN, HTREE_HBITS_MAX]
-+ * @priv:  bytes of private data to reserve behind the children array
-+ *
-+ * Returns the new head, or NULL if @depth is too large or the allocation
-+ * fails. lh_private points to the private area only if @priv > 0.
-+ */
-+struct htree_lock_head *
-+htree_lock_head_alloc(unsigned depth, unsigned hbits, unsigned priv)
-+{
-+      struct htree_lock_head *lhead;
-+      int  i;
-+
-+      if (depth > HTREE_LOCK_DEP_MAX) {
-+              printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
-+                      depth, HTREE_LOCK_DEP_MAX);
-+              return NULL;
-+      }
-+
-+      lhead = kzalloc(offsetof(struct htree_lock_head,
-+                               lh_children[depth]) + priv, GFP_NOFS);
-+      if (lhead == NULL)
-+              return NULL;
-+
-+      /* clamp first, then ALWAYS store: an in-range @hbits must not leave
-+       * lh_hbits at the zero it got from kzalloc(), because the node-lock
-+       * code hashes keys with lh_hbits and shifts by values derived from
-+       * it */
-+      if (hbits < HTREE_HBITS_MIN)
-+              hbits = HTREE_HBITS_MIN;
-+      else if (hbits > HTREE_HBITS_MAX)
-+              hbits = HTREE_HBITS_MAX;
-+      lhead->lh_hbits = hbits;
-+
-+      lhead->lh_lock = 0;
-+      lhead->lh_depth = depth;
-+      INIT_LIST_HEAD(&lhead->lh_blocked_list);
-+      if (priv > 0) {
-+              /* private area starts right behind the last child */
-+              lhead->lh_private = (void *)lhead +
-+                      offsetof(struct htree_lock_head, lh_children[depth]);
-+      }
-+
-+      for (i = 0; i < depth; i++) {
-+              INIT_LIST_HEAD(&lhead->lh_children[i].lc_list);
-+              lhead->lh_children[i].lc_events = HTREE_EVENT_DISABLE;
-+      }
-+      return lhead;
-+}
-+EXPORT_SYMBOL(htree_lock_head_alloc);
-+
-+/* free the htree_lock head; it is a BUG if any lock is still blocked on it
-+ * or still linked on the list of any of its children */
-+void
-+htree_lock_head_free(struct htree_lock_head *lhead)
-+{
-+      int     dep = 0;
-+
-+      BUG_ON(!list_empty(&lhead->lh_blocked_list));
-+      while (dep < lhead->lh_depth) {
-+              BUG_ON(!list_empty(&lhead->lh_children[dep].lc_list));
-+              dep++;
-+      }
-+      kfree(lhead);
-+}
-+EXPORT_SYMBOL(htree_lock_head_free);
-+
-+/* register @callback for @events of the child-lock at level @dep, which
-+ * must be a valid level of @lhead; previous settings are overwritten */
-+void
-+htree_lock_event_attach(struct htree_lock_head *lhead, unsigned dep,
-+                      unsigned events, htree_event_cb_t callback)
-+{
-+      struct htree_lock_child *child;
-+
-+      BUG_ON(dep >= lhead->lh_depth);
-+
-+      child = &lhead->lh_children[dep];
-+      child->lc_events = events;
-+      child->lc_callback = callback;
-+}
-+EXPORT_SYMBOL(htree_lock_event_attach);
-+
-+/* allocate a htree_lock, which is a per-thread structure, with @depth
-+ * child-lock nodes followed by @pbytes extra bytes of private data for the
-+ * caller (lk_private is only set if @pbytes is not 0).
-+ * Returns NULL if @depth exceeds HTREE_LOCK_DEP_MAX or allocation fails */
-+struct htree_lock *
-+htree_lock_alloc(unsigned depth, unsigned pbytes)
-+{
-+      struct htree_lock_node *node;
-+      struct htree_lock *lck;
-+      int base = offsetof(struct htree_lock, lk_nodes[depth]);
-+      int dep;
-+
-+      if (depth > HTREE_LOCK_DEP_MAX) {
-+              printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
-+                      depth, HTREE_LOCK_DEP_MAX);
-+              return NULL;
-+      }
-+
-+      lck = kzalloc(base + pbytes, GFP_NOFS);
-+      if (lck == NULL)
-+              return NULL;
-+
-+      lck->lk_mode = HTREE_LOCK_INVAL;
-+      lck->lk_depth = depth;
-+      INIT_LIST_HEAD(&lck->lk_blocked_list);
-+      if (pbytes != 0) /* private data lives right behind the nodes */
-+              lck->lk_private = (void *)lck + base;
-+
-+      for (dep = 0; dep < depth; dep++) {
-+              node = &lck->lk_nodes[dep];
-+
-+              node->ln_mode = HTREE_LOCK_INVAL;
-+              INIT_LIST_HEAD(&node->ln_major_list);
-+              INIT_LIST_HEAD(&node->ln_minor_list);
-+              INIT_LIST_HEAD(&node->ln_alive_list);
-+              INIT_LIST_HEAD(&node->ln_blocked_list);
-+              INIT_LIST_HEAD(&node->ln_granted_list);
-+      }
-+
-+      return lck;
-+}
-+EXPORT_SYMBOL(htree_lock_alloc);
-+
-+/* free htree_lock node; it is a BUG to free a lock whose mode is not
-+ * HTREE_LOCK_INVAL, the mode that htree_lock_alloc() starts with and that
-+ * htree_unlock_internal() restores on release */
-+void
-+htree_lock_free(struct htree_lock *lck)
-+{
-+      BUG_ON(lck->lk_mode != HTREE_LOCK_INVAL);
-+      kfree(lck);
-+}
-+EXPORT_SYMBOL(htree_lock_free);
---- a/fs/ext4/inode.c
-+++ b/fs/ext4/inode.c
-@@ -4965,7 +4965,7 @@ struct inode *ext4_iget(struct super_blo
-       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
-               ei->i_file_acl |=
-                       ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
--      inode->i_size = ext4_isize(raw_inode);
-+      inode->i_size = ext4_isize(sb, raw_inode);
-       ei->i_disksize = inode->i_size;
- #ifdef CONFIG_QUOTA
-       ei->i_reserved_quota = 0;
-@@ -5205,7 +5205,7 @@ static int ext4_do_update_inode(handle_t
-               raw_inode->i_file_acl_high =
-                       cpu_to_le16(ei->i_file_acl >> 32);
-       raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
--      if (ei->i_disksize != ext4_isize(raw_inode)) {
-+      if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
-               ext4_isize_set(raw_inode, ei->i_disksize);
-               need_datasync = 1;
-       }
---- a/fs/ext4/namei.c
-+++ b/fs/ext4/namei.c
-@@ -176,7 +176,7 @@ static struct dx_frame *dx_probe(const s
-                                struct inode *dir,
-                                struct dx_hash_info *hinfo,
-                                struct dx_frame *frame,
--                               int *err);
-+                               struct htree_lock *lck, int *err);
- static void dx_release(struct dx_frame *frames);
- static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
-                      struct dx_hash_info *hinfo, struct dx_map_entry map[]);
-@@ -189,13 +189,13 @@ static void dx_insert_block(struct dx_fr
- static int ext4_htree_next_block(struct inode *dir, __u32 hash,
-                                struct dx_frame *frame,
-                                struct dx_frame *frames,
--                               __u32 *start_hash);
-+                               __u32 *start_hash, struct htree_lock *lck);
- static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
-               const struct qstr *d_name,
-               struct ext4_dir_entry_2 **res_dir,
--              int *err);
-+              struct htree_lock *lck, int *err);
- static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
--                           struct inode *inode);
-+                           struct inode *inode, struct htree_lock *lck);
-
- /*
-  * p is at least 6 bytes before the end of page
-@@ -225,7 +225,7 @@ struct dx_root_info * dx_get_dx_info(str
-
- static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
- {
--      return le32_to_cpu(entry->block) & 0x00ffffff;
-+      return le32_to_cpu(entry->block) & 0x0fffffff;
- }
-
- static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
-@@ -368,6 +368,223 @@ struct stats dx_show_entries(struct dx_h
- }
- #endif /* DX_DEBUG */
-
-+/* private data for htree_lock: ext4_htree_lock_alloc() reserves room for
-+ * one of these and ext4_htree_lock_data() casts lk_private back to it */
-+struct ext4_dir_lock_data {
-+      unsigned                ld_flags;  /* bits-map for lock types */
-+      unsigned                ld_count;  /* # entries of the last DX block */
-+      struct dx_entry         ld_at_entry; /* copy of leaf dx_entry */
-+      struct dx_entry         *ld_at;    /* position of leaf dx_entry */
-+};
-+
-+#define ext4_htree_lock_data(l)       ((struct ext4_dir_lock_data *)(l)->lk_private)
-+
-+/* NB: ext4_lblk_t is 32 bits, so a value with bits set above bit 31 can
-+ * never be a valid block number; it is used to flag a changed node */
-+#define EXT4_HTREE_NODE_CHANGED       (0xcafeULL << 32)
-+
-+/* htree_lock event callback: @target points to a u64 block number and
-+ * @event to a dx_entry; if the entry refers to that same block, the target
-+ * is overwritten with EXT4_HTREE_NODE_CHANGED */
-+static void ext4_htree_event_cb(void *target, void *event)
-+{
-+      struct dx_entry *entry = (struct dx_entry *)event;
-+      u64 *watched = (u64 *)target;
-+
-+      if (*watched != dx_get_block(entry))
-+              return;
-+
-+      *watched = EXT4_HTREE_NODE_CHANGED;
-+}
-+
-+struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits)
-+{
-+      struct htree_lock_head *lhead;
-+
-+      lhead = htree_lock_head_alloc(EXT4_LK_MAX, hbits, 0);
-+      if (lhead != NULL) {
-+              htree_lock_event_attach(lhead, EXT4_LK_SPIN, HTREE_EVENT_WR,
-+                                      ext4_htree_event_cb);
-+      }
-+      return lhead;
-+}
-+EXPORT_SYMBOL(ext4_htree_lock_head_alloc);
-+
-+struct htree_lock *ext4_htree_lock_alloc(void)
-+{
-+      return htree_lock_alloc(EXT4_LK_MAX,
-+                              sizeof(struct ext4_dir_lock_data));
-+}
-+EXPORT_SYMBOL(ext4_htree_lock_alloc);
-+
-+static htree_lock_mode_t ext4_htree_mode(unsigned flags)
-+{
-+      switch (flags) {
-+      default: /* 0 or unknown flags require EX lock */
-+              return HTREE_LOCK_EX;
-+      case EXT4_HLOCK_READDIR:
-+              return HTREE_LOCK_PR;
-+      case EXT4_HLOCK_LOOKUP:
-+              return HTREE_LOCK_CR;
-+      case EXT4_HLOCK_DEL:
-+      case EXT4_HLOCK_ADD:
-+              return HTREE_LOCK_CW;
-+      }
-+}
-+
-+/* return PR for read-only operations, otherwise return EX */
-+static inline htree_lock_mode_t ext4_htree_safe_mode(unsigned flags)
-+{
-+      int writer = (flags & EXT4_LB_DE) == EXT4_LB_DE;
-+
-+      /* 0 requires EX lock */
-+      return (flags == 0 || writer) ? HTREE_LOCK_EX : HTREE_LOCK_PR;
-+}
-+
-+static int ext4_htree_safe_locked(struct htree_lock *lck)
-+{
-+      int writer;
-+
-+      if (lck == NULL || lck->lk_mode == HTREE_LOCK_EX)
-+              return 1;
-+
-+      writer = (ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_DE) ==
-+               EXT4_LB_DE;
-+      if (writer) /* all readers & writers are excluded? */
-+              return lck->lk_mode == HTREE_LOCK_EX;
-+
-+      /* all writers are excluded? */
-+      return lck->lk_mode == HTREE_LOCK_PR ||
-+             lck->lk_mode == HTREE_LOCK_PW ||
-+             lck->lk_mode == HTREE_LOCK_EX;
-+}
-+
-+/* relock htree_lock with EX mode if it's a change operation, otherwise
-+ * relock it with PR mode. It's a noop if PDO is disabled. */
-+static void ext4_htree_safe_relock(struct htree_lock *lck)
-+{
-+      if (!ext4_htree_safe_locked(lck)) {
-+              unsigned flags = ext4_htree_lock_data(lck)->ld_flags;
-+
-+              htree_change_lock(lck, ext4_htree_safe_mode(flags));
-+      }
-+}
-+
-+void ext4_htree_lock(struct htree_lock *lck, struct htree_lock_head *lhead,
-+                   struct inode *dir, unsigned flags)
-+{
-+      htree_lock_mode_t mode = is_dx(dir) ? ext4_htree_mode(flags) :
-+                                            ext4_htree_safe_mode(flags);
-+
-+      ext4_htree_lock_data(lck)->ld_flags = flags;
-+      htree_lock(lck, lhead, mode);
-+      if (!is_dx(dir))
-+              ext4_htree_safe_relock(lck); /* make sure it's safe locked */
-+}
-+EXPORT_SYMBOL(ext4_htree_lock);
-+
-+static int ext4_htree_node_lock(struct htree_lock *lck, struct dx_entry *at,
-+                              unsigned lmask, int wait, void *ev)
-+{
-+      u32     key = (at == NULL) ? 0 : dx_get_block(at);
-+      u32     mode;
-+
-+      /* NOOP if htree is well protected or caller doesn't require the lock */
-+      if (ext4_htree_safe_locked(lck) ||
-+         !(ext4_htree_lock_data(lck)->ld_flags & lmask))
-+              return 1;
-+
-+      mode = (ext4_htree_lock_data(lck)->ld_flags & lmask) == lmask ?
-+              HTREE_LOCK_PW : HTREE_LOCK_PR;
-+      while (1) {
-+              if (htree_node_lock_try(lck, mode, key, ffz(~lmask), wait, ev))
-+                      return 1;
-+              if (!(lmask & EXT4_LB_SPIN)) /* not a spinlock */
-+                      return 0;
-+              cpu_relax(); /* spin until granted */
-+      }
-+}
-+
-+static int ext4_htree_node_locked(struct htree_lock *lck, unsigned lmask)
-+{
-+      return ext4_htree_safe_locked(lck) ||
-+             htree_node_is_granted(lck, ffz(~lmask));
-+}
-+
-+static void ext4_htree_node_unlock(struct htree_lock *lck,
-+                                 unsigned lmask, void *buf)
-+{
-+      /* NB: it's safe to call multiple times, or even if it's not locked */
-+      if (!ext4_htree_safe_locked(lck) &&
-+           htree_node_is_granted(lck, ffz(~lmask)))
-+              htree_node_unlock(lck, ffz(~lmask), buf);
-+}
-+
-+#define ext4_htree_dx_lock(lck, key)          \
-+      ext4_htree_node_lock(lck, key, EXT4_LB_DX, 1, NULL)
-+#define ext4_htree_dx_lock_try(lck, key)      \
-+      ext4_htree_node_lock(lck, key, EXT4_LB_DX, 0, NULL)
-+#define ext4_htree_dx_unlock(lck)             \
-+      ext4_htree_node_unlock(lck, EXT4_LB_DX, NULL)
-+#define ext4_htree_dx_locked(lck)             \
-+      ext4_htree_node_locked(lck, EXT4_LB_DX)
-+
-+static void ext4_htree_dx_need_lock(struct htree_lock *lck)
-+{
-+      struct ext4_dir_lock_data *ld;
-+
-+      if (ext4_htree_safe_locked(lck))
-+              return;
-+
-+      ld = ext4_htree_lock_data(lck);
-+      switch (ld->ld_flags) {
-+      default:
-+              return;
-+      case EXT4_HLOCK_LOOKUP:
-+              ld->ld_flags = EXT4_HLOCK_LOOKUP_SAFE;
-+              return;
-+      case EXT4_HLOCK_DEL:
-+              ld->ld_flags = EXT4_HLOCK_DEL_SAFE;
-+              return;
-+      case EXT4_HLOCK_ADD:
-+              ld->ld_flags = EXT4_HLOCK_SPLIT;
-+              return;
-+      }
-+}
-+
-+#define ext4_htree_de_lock(lck, key)          \
-+      ext4_htree_node_lock(lck, key, EXT4_LB_DE, 1, NULL)
-+#define ext4_htree_de_unlock(lck)             \
-+      ext4_htree_node_unlock(lck, EXT4_LB_DE, NULL)
-+
-+#define ext4_htree_spin_lock(lck, key, event) \
-+      ext4_htree_node_lock(lck, key, EXT4_LB_SPIN, 0, event)
-+#define ext4_htree_spin_unlock(lck)           \
-+      ext4_htree_node_unlock(lck, EXT4_LB_SPIN, NULL)
-+#define ext4_htree_spin_unlock_listen(lck, p) \
-+      ext4_htree_node_unlock(lck, EXT4_LB_SPIN, p)
-+
-+static void ext4_htree_spin_stop_listen(struct htree_lock *lck)
-+{
-+      if (!ext4_htree_safe_locked(lck) &&
-+          htree_node_is_listening(lck, ffz(~EXT4_LB_SPIN)))
-+              htree_node_stop_listen(lck, ffz(~EXT4_LB_SPIN));
-+}
-+
-+enum {
-+      DX_HASH_COL_IGNORE,     /* ignore collision while probing frames */
-+      DX_HASH_COL_YES,        /* there is collision and it does matter */
-+      DX_HASH_COL_NO,         /* there is no collision */
-+};
-+
-+static int dx_probe_hash_collision(struct htree_lock *lck,
-+                                 struct dx_entry *entries,
-+                                 struct dx_entry *at, u32 hash)
-+{
-+      if (!(ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_EXACT)) {
-+              return DX_HASH_COL_IGNORE; /* don't care about collision */
-+
-+      } else if (at == entries + dx_get_count(entries) - 1) {
-+              return DX_HASH_COL_IGNORE; /* not in any leaf of this DX */
-+
-+      } else { /* hash collision? */
-+              return ((dx_get_hash(at + 1) & ~1) == hash) ?
-+                      DX_HASH_COL_YES : DX_HASH_COL_NO;
-+      }
-+}
-+
- /*
-  * Probe for a directory leaf block to search.
-  *
-@@ -379,16 +596,17 @@ struct stats dx_show_entries(struct dx_h
-  */
- static struct dx_frame *
- dx_probe(const struct qstr *d_name, struct inode *dir,
--       struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
-+       struct dx_hash_info *hinfo, struct dx_frame *frame_in,
-+       struct htree_lock *lck, int *err)
- {
-       unsigned count, indirect;
--      struct dx_entry *at, *entries, *p, *q, *m;
-+      struct dx_entry *at, *entries, *p, *q, *m, *dx = NULL;
-       struct dx_root_info * info;
-       struct buffer_head *bh;
-       struct dx_frame *frame = frame_in;
-       u32 hash;
-
--      frame->bh = NULL;
-+      memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
-       if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
-               goto fail;
-
-@@ -418,9 +636,16 @@ dx_probe(const struct qstr *d_name, stru
-               goto fail;
-       }
-
--      if ((indirect = info->indirect_levels) > 1) {
--              ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
--                           info->indirect_levels);
-+      indirect = info->indirect_levels;
-+      if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
-+              ext4_warning(dir->i_sb,
-+                           "Directory (ino: %lu) htree depth %#06x exceed "
-+                           "supported value", dir->i_ino,
-+                           ext4_dir_htree_level(dir->i_sb));
-+              if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
-+                      ext4_warning(dir->i_sb, "Enable large directory "
-+                                              "feature to access it");
-+              }
-               brelse(bh);
-               *err = ERR_BAD_DX_DIR;
-               goto fail;
-@@ -440,8 +665,15 @@ dx_probe(const struct qstr *d_name, stru
-       dxtrace(printk("Look up %x", hash));
-       while (1)
-       {
-+              if (indirect == 0) { /* the last index level */
-+                      /* NB: ext4_htree_dx_lock() could be noop if
-+                       * DX-lock flag is not set for current operation */
-+                      ext4_htree_dx_lock(lck, dx);
-+                      ext4_htree_spin_lock(lck, dx, NULL);
-+              }
-               count = dx_get_count(entries);
--              if (!count || count > dx_get_limit(entries)) {
-+              if (count == 0 || count > dx_get_limit(entries)) {
-+                      ext4_htree_spin_unlock(lck); /* release spin */
-                       ext4_warning(dir->i_sb,
-                                    "dx entry: no count or count > limit");
-                       brelse(bh);
-@@ -482,9 +714,73 @@ dx_probe(const struct qstr *d_name, stru
-               frame->bh = bh;
-               frame->entries = entries;
-               frame->at = at;
--              if (!indirect--) return frame;
-+
-+              if (indirect == 0) { /* the last index level */
-+                      struct ext4_dir_lock_data *ld;
-+                      u64 myblock;
-+
-+                      /* By default we only lock DE-block, however, we will
-+                       * also lock the last level DX-block if:
-+                       * a) there is hash collision
-+                       *    we will set DX-lock flag (a few lines below)
-+                       *    and redo to lock DX-block
-+                       *    see detail in dx_probe_hash_collision()
-+                       * b) it's a retry from splitting
-+                       *    we need to lock the last level DX-block so nobody
-+                       *    else can split any leaf blocks under the same
-+                       *    DX-block, see detail in ext4_dx_add_entry()
-+                       */
-+                      if (ext4_htree_dx_locked(lck)) {
-+                              /* DX-block is locked, just lock DE-block
-+                               * and return */
-+                              ext4_htree_spin_unlock(lck);
-+                              if (!ext4_htree_safe_locked(lck))
-+                                      ext4_htree_de_lock(lck, frame->at);
-+                              return frame;
-+                      }
-+                      /* it's pdirop and no DX lock */
-+                      if (dx_probe_hash_collision(lck, entries, at, hash) ==
-+                          DX_HASH_COL_YES) {
-+                              /* found hash collision, set DX-lock flag
-+                               * and retry to obtain DX-lock */
-+                              ext4_htree_spin_unlock(lck);
-+                              ext4_htree_dx_need_lock(lck);
-+                              continue;
-+                      }
-+                      ld = ext4_htree_lock_data(lck);
-+                      /* because I don't lock DX, @at can't be trusted after
-+                       * I release the spinlock, so I have to save it */
-+                      ld->ld_at = at;
-+                      ld->ld_at_entry = *at;
-+                      ld->ld_count = dx_get_count(entries);
-+
-+                      frame->at = &ld->ld_at_entry;
-+                      myblock = dx_get_block(at);
-+
-+                      /* NB: ordering locking */
-+                      ext4_htree_spin_unlock_listen(lck, &myblock);
-+                      /* other thread can split this DE-block because:
-+                       * a) I don't have lock for the DE-block yet
-+                       * b) I released spinlock on DX-block
-+                       * if it happened I can detect it by listening
-+                       * splitting event on this DE-block */
-+                      ext4_htree_de_lock(lck, frame->at);
-+                      ext4_htree_spin_stop_listen(lck);
-+
-+                      if (myblock == EXT4_HTREE_NODE_CHANGED) {
-+                              /* someone split this DE-block before
-+                               * I locked it, I need to retry and lock
-+                               * valid DE-block */
-+                              ext4_htree_de_unlock(lck);
-+                              continue;
-+                      }
-+                      return frame;
-+              }
-+              dx = at;
-+              indirect--;
-               if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
-                       goto fail2;
-+
-               at = entries = ((struct dx_node *) bh->b_data)->entries;
-               if (dx_get_limit(entries) != dx_node_limit (dir)) {
-                       ext4_warning(dir->i_sb,
-@@ -512,13 +808,18 @@ fail:
- static void dx_release (struct dx_frame *frames)
- {
-       struct dx_root_info *info;
-+      int i;
-+
-       if (frames[0].bh == NULL)
-               return;
-
-       info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data);
--      if (info->indirect_levels)
--              brelse(frames[1].bh);
--      brelse(frames[0].bh);
-+      for (i = 0; i <= info->indirect_levels; i++) {
-+              if (frames[i].bh == NULL)
-+                      break;
-+              brelse(frames[i].bh);
-+              frames[i].bh = NULL;
-+      }
- }
-
- /*
-@@ -541,7 +842,7 @@ static void dx_release (struct dx_frame
- static int ext4_htree_next_block(struct inode *dir, __u32 hash,
-                                struct dx_frame *frame,
-                                struct dx_frame *frames,
--                               __u32 *start_hash)
-+                               __u32 *start_hash, struct htree_lock *lck)
- {
-       struct dx_frame *p;
-       struct buffer_head *bh;
-@@ -556,12 +857,22 @@ static int ext4_htree_next_block(struct
-        * this loop, num_frames indicates the number of interior
-        * nodes need to be read.
-        */
-+      ext4_htree_de_unlock(lck);
-       while (1) {
--              if (++(p->at) < p->entries + dx_get_count(p->entries))
--                      break;
-+              if (num_frames > 0 || ext4_htree_dx_locked(lck)) {
-+                      /* num_frames > 0 :
-+                       *   DX block
-+                       * ext4_htree_dx_locked:
-+                       *   frame->at is reliable pointer returned by dx_probe,
-+                       *   otherwise dx_probe already knew no collision */
-+                      if (++(p->at) < p->entries + dx_get_count(p->entries))
-+                              break;
-+              }
-               if (p == frames)
-                       return 0;
-               num_frames++;
-+              if (num_frames == 1)
-+                      ext4_htree_dx_unlock(lck);
-               p--;
-       }
-
-@@ -584,6 +895,13 @@ static int ext4_htree_next_block(struct
-        * block so no check is necessary
-        */
-       while (num_frames--) {
-+              if (num_frames == 0) {
-+                      /* it's not always necessary, we just don't want to
-+                       * detect hash collision again */
-+                      ext4_htree_dx_need_lock(lck);
-+                      ext4_htree_dx_lock(lck, p->at);
-+              }
-+
-               if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
-                                     0, &err)))
-                       return err; /* Failure */
-@@ -592,6 +910,7 @@ static int ext4_htree_next_block(struct
-               p->bh = bh;
-               p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
-       }
-+      ext4_htree_de_lock(lck, p->at);
-       return 1;
- }
-
-@@ -661,7 +980,7 @@ int ext4_htree_fill_tree(struct file *di
- {
-       struct dx_hash_info hinfo;
-       struct ext4_dir_entry_2 *de;
--      struct dx_frame frames[2], *frame;
-+      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
-       struct inode *dir;
-       ext4_lblk_t block;
-       int count = 0;
-@@ -684,10 +1003,10 @@ int ext4_htree_fill_tree(struct file *di
-       }
-       hinfo.hash = start_hash;
-       hinfo.minor_hash = 0;
--      frame = dx_probe(NULL, dir, &hinfo, frames, &err);
-+      /* assume it's PR locked */
-+      frame = dx_probe(NULL, dir, &hinfo, frames, NULL, &err);
-       if (!frame)
-               return err;
--
-       /* Add '.' and '..' from the htree header */
-       if (!start_hash && !start_minor_hash) {
-               de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
-@@ -714,7 +1033,7 @@ int ext4_htree_fill_tree(struct file *di
-               count += ret;
-               hashval = ~0;
-               ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
--                                          frame, frames, &hashval);
-+                                          frame, frames, &hashval, NULL);
-               *next_hash = hashval;
-               if (ret < 0) {
-                       err = ret;
-@@ -814,9 +1133,17 @@ static void dx_insert_block(struct dx_fr
-
- static void ext4_update_dx_flag(struct inode *inode)
- {
-+      /* Disable it for ldiskfs, because going from a DX directory to
-+       * a non-DX directory while it is in use will completely break
-+       * the htree-locking.
-+       * If we really want to support this operation in the future,
-+       * we need to exclusively lock the directory here, which will
-+       * increase the complexity of the code */
-+#if 0
-       if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
-                                    EXT4_FEATURE_COMPAT_DIR_INDEX))
-               ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
-+#endif
- }
-
- /*
-@@ -888,8 +1215,9 @@ static inline int search_dirblock(struct
-  * to brelse() it when appropriate.
-  */
- struct buffer_head * ext4_find_entry(struct inode *dir,
--                                    const struct qstr *d_name,
--                                    struct ext4_dir_entry_2 ** res_dir)
-+                                   const struct qstr *d_name,
-+                                   struct ext4_dir_entry_2 **res_dir,
-+                                   struct htree_lock *lck)
- {
-       struct super_block *sb;
-       struct buffer_head *bh_use[NAMEI_RA_SIZE];
-@@ -910,7 +1238,7 @@ struct buffer_head * ext4_find_entry(str
-       if (namelen > EXT4_NAME_LEN)
-               return NULL;
-       if (is_dx(dir)) {
--              bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
-+              bh = ext4_dx_find_entry(dir, d_name, res_dir, lck, &err);
-               /*
-                * On success, or if the error was file not found,
-                * return.  Otherwise, fall back to doing a search the
-@@ -920,6 +1248,7 @@ struct buffer_head * ext4_find_entry(str
-                       return bh;
-               dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
-                              "falling back\n"));
-+              ext4_htree_safe_relock(lck);
-       }
-       nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
-       start = EXT4_I(dir)->i_dir_start_lookup;
-@@ -996,13 +1325,15 @@ cleanup_and_exit:
-       return ret;
- }
-
--static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
--                     struct ext4_dir_entry_2 **res_dir, int *err)
-+static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
-+                                             const struct qstr *d_name,
-+                                             struct ext4_dir_entry_2 **res_dir,
-+                                             struct htree_lock *lck, int *err)
- {
-       struct super_block * sb;
-       struct dx_hash_info     hinfo;
-       u32 hash;
--      struct dx_frame frames[2], *frame;
-+      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
-       struct buffer_head *bh;
-       ext4_lblk_t block;
-       int retval;
-@@ -1012,13 +1343,16 @@ static struct buffer_head * ext4_dx_find
-       sb = dir->i_sb;
-       /* NFS may look up ".." - look at dx_root directory block */
-       if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
--              if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
-+              if (!(frame = dx_probe(d_name, dir, &hinfo, frames, lck, err)))
-                       return NULL;
-       } else {
-               frame = frames;
-               frame->bh = NULL;                       /* for dx_release() */
-               frame->at = (struct dx_entry *)frames;  /* hack for zero entry*/
-               dx_set_block(frame->at, 0);             /* dx_root block is 0 */
-+              /* "." and ".." are stored in the root DX block */
-+              ext4_htree_dx_need_lock(lck);
-+              ext4_htree_dx_lock(lck, NULL);
-       }
-       hash = hinfo.hash;
-       do {
-@@ -1041,7 +1375,7 @@ static struct buffer_head * ext4_dx_find
-
-               /* Check to see if we should continue to search */
-               retval = ext4_htree_next_block(dir, hash, frame,
--                                             frames, NULL);
-+                                             frames, NULL, lck);
-               if (retval < 0) {
-                       ext4_warning(sb,
-                            "error reading index page in directory #%lu",
-@@ -1067,7 +1401,7 @@ static struct dentry *ext4_lookup(struct
-       if (dentry->d_name.len > EXT4_NAME_LEN)
-               return ERR_PTR(-ENAMETOOLONG);
-
--      bh = ext4_find_entry(dir, &dentry->d_name, &de);
-+      bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
-       inode = NULL;
-       if (bh) {
-               __u32 ino = le32_to_cpu(de->inode);
-@@ -1134,7 +1468,7 @@ struct dentry *ext4_get_parent(struct de
-       struct ext4_dir_entry_2 * de;
-       struct buffer_head *bh;
-
--      bh = ext4_find_entry(child->d_inode, &dotdot, &de);
-+      bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
-       if (!bh)
-               return ERR_PTR(-ENOENT);
-       ino = le32_to_cpu(de->inode);
-@@ -1222,8 +1556,9 @@ static struct ext4_dir_entry_2* dx_pack_
-  * Returns pointer to de in block into which the new entry will be inserted.
-  */
- static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
--                      struct buffer_head **bh,struct dx_frame *frame,
--                      struct dx_hash_info *hinfo, int *error)
-+                      struct buffer_head **bh, struct dx_frame *frames,
-+                      struct dx_frame *frame, struct dx_hash_info *hinfo,
-+                      struct htree_lock *lck, int *error)
- {
-       unsigned blocksize = dir->i_sb->s_blocksize;
-       unsigned count, continued;
-@@ -1280,7 +1615,14 @@ static struct ext4_dir_entry_2 *do_split
-                                       hash2, split, count-split));
-
-       /* Fancy dance to stay within two buffers */
--      de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
-+      if (hinfo->hash < hash2) {
-+              de2 = dx_move_dirents(data1, data2, map + split,
-+                                    count - split, blocksize);
-+      } else {
-+              /* make sure we will add entry to the same block which
-+               * we have already locked */
-+              de2 = dx_move_dirents(data1, data2, map, split, blocksize);
-+      }
-       de = dx_pack_dirents(data1, blocksize);
-       de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
-                                          blocksize);
-@@ -1289,13 +1631,21 @@ static struct ext4_dir_entry_2 *do_split
-       dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
-       dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
-
--      /* Which block gets the new entry? */
--      if (hinfo->hash >= hash2)
--      {
--              swap(*bh, bh2);
--              de = de2;
-+      ext4_htree_spin_lock(lck, frame > frames ? (frame - 1)->at : NULL,
-+                           frame->at); /* notify block is being split */
-+      if (hinfo->hash < hash2) {
-+              dx_insert_block(frame, hash2 + continued, newblock);
-+
-+      } else {
-+              /* switch block number */
-+              dx_insert_block(frame, hash2 + continued,
-+                              dx_get_block(frame->at));
-+              dx_set_block(frame->at, newblock);
-+              (frame->at)++;
-       }
--      dx_insert_block(frame, hash2 + continued, newblock);
-+      ext4_htree_spin_unlock(lck);
-+      ext4_htree_dx_unlock(lck);
-+
-       err = ext4_handle_dirty_metadata(handle, dir, bh2);
-       if (err)
-               goto journal_error;
-@@ -1406,7 +1756,7 @@ static int add_dirent_to_buf(handle_t *h
-       if (!IS_NOCMTIME(dir))
-               dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
-       ext4_update_dx_flag(dir);
--      dir->i_version++;
-+      inode_inc_iversion(dir);
-       ext4_mark_inode_dirty(handle, dir);
-       BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-       err = ext4_handle_dirty_metadata(handle, dir, bh);
-@@ -1426,7 +1776,7 @@ static int make_indexed_dir(handle_t *ha
-       const char      *name = dentry->d_name.name;
-       int             namelen = dentry->d_name.len;
-       struct buffer_head *bh2;
--      struct dx_frame frames[2], *frame;
-+      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
-       struct dx_entry *entries;
-       struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
-       char            *data1, *top;
-@@ -1507,7 +1857,7 @@ static int make_indexed_dir(handle_t *ha
-       ext4_handle_dirty_metadata(handle, dir, frame->bh);
-       ext4_handle_dirty_metadata(handle, dir, bh);
-
--      de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
-+      de = do_split(handle,dir, &bh, frames, frame, &hinfo, NULL, &retval);
-       if (!de) {
-               /*
-                * Even if the block split failed, we have to properly write
-@@ -1614,7 +1964,7 @@ out:
-  * the entry, as someone else might have used it while you slept.
-  */
- int ext4_add_entry(handle_t *handle, struct dentry *dentry,
--                 struct inode *inode)
-+                 struct inode *inode, struct htree_lock *lck)
- {
-       struct inode *dir = dentry->d_parent->d_inode;
-       struct buffer_head *bh;
-@@ -1633,9 +1983,10 @@ int ext4_add_entry(handle_t *handle, str
-               if (dentry->d_name.len == 2 &&
-                   memcmp(dentry->d_name.name, "..", 2) == 0)
-                       return ext4_update_dotdot(handle, dentry, inode);
--              retval = ext4_dx_add_entry(handle, dentry, inode);
-+              retval = ext4_dx_add_entry(handle, dentry, inode, lck);
-               if (!retval || (retval != ERR_BAD_DX_DIR))
-                       return retval;
-+              ext4_htree_safe_relock(lck);
-               ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
-               dx_fallback++;
-               ext4_mark_inode_dirty(handle, dir);
-@@ -1673,18 +2024,21 @@ int ext4_add_entry(handle_t *handle, str
-  * Returns 0 for success, or a negative error value
-  */
- static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
--                           struct inode *inode)
-+                           struct inode *inode, struct htree_lock *lck)
- {
--      struct dx_frame frames[2], *frame;
-+      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
-       struct dx_entry *entries, *at;
-       struct dx_hash_info hinfo;
-       struct buffer_head *bh;
-       struct inode *dir = dentry->d_parent->d_inode;
-       struct super_block *sb = dir->i_sb;
-       struct ext4_dir_entry_2 *de;
-+      int restart;
-       int err;
-
--      frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
-+again:
-+      restart = 0;
-+      frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, lck, &err);
-       if (!frame)
-               return err;
-       entries = frame->entries;
-@@ -1693,33 +2047,53 @@ static int ext4_dx_add_entry(handle_t *h
-       if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
-               goto cleanup;
-
--      BUFFER_TRACE(bh, "get_write_access");
--      err = ext4_journal_get_write_access(handle, bh);
--      if (err)
--              goto journal_error;
--
-       err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
-       if (err != -ENOSPC)
-               goto cleanup;
-
-+      err = 0;
-       /* Block full, should compress but for now just split */
-       dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
-                      dx_get_count(entries), dx_get_limit(entries)));
-       /* Need to split index? */
-       if (dx_get_count(entries) == dx_get_limit(entries)) {
-               ext4_lblk_t newblock;
--              unsigned icount = dx_get_count(entries);
--              int levels = frame - frames;
-+              int levels = frame - frames + 1;
-+              unsigned icount;
-+              int add_level = 1;
-               struct dx_entry *entries2;
-               struct dx_node *node2;
-               struct buffer_head *bh2;
-
--              if (levels && (dx_get_count(frames->entries) ==
--                             dx_get_limit(frames->entries))) {
--                      ext4_warning(sb, "Directory index full!");
-+              if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */
-+                      ext4_htree_safe_relock(lck);
-+                      restart = 1;
-+                      goto cleanup;
-+              }
-+              while (frame > frames) {
-+                      if (dx_get_count((frame - 1)->entries) <
-+                          dx_get_limit((frame - 1)->entries)) {
-+                              add_level = 0;
-+                              break;
-+                      }
-+                      frame--; /* split higher index block */
-+                      at = frame->at;
-+                      entries = frame->entries;
-+                      restart = 1;
-+              }
-+              if (add_level && levels == ext4_dir_htree_level(sb)) {
-+                      ext4_warning(sb, "Directory (ino: %lu) index full, "
-+                                       "reach max htree level :%d",
-+                                       dir->i_ino, levels);
-+                      if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
-+                              ext4_warning(sb, "Large directory feature is"
-+                                               "not enabled on this "
-+                                               "filesystem");
-+                      }
-                       err = -ENOSPC;
-                       goto cleanup;
-               }
-+              icount = dx_get_count(entries);
-               bh2 = ext4_append (handle, dir, &newblock, &err);
-               if (!(bh2))
-                       goto cleanup;
-@@ -1732,7 +2106,7 @@ static int ext4_dx_add_entry(handle_t *h
-               err = ext4_journal_get_write_access(handle, frame->bh);
-               if (err)
-                       goto journal_error;
--              if (levels) {
-+              if (!add_level) {
-                       unsigned icount1 = icount/2, icount2 = icount - icount1;
-                       unsigned hash2 = dx_get_hash(entries + icount1);
-                       dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
-@@ -1740,7 +2114,7 @@ static int ext4_dx_add_entry(handle_t *h
-
-                       BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
-                       err = ext4_journal_get_write_access(handle,
--                                                           frames[0].bh);
-+                                                          (frame - 1)->bh);
-                       if (err)
-                               goto journal_error;
-
-@@ -1756,14 +2130,21 @@ static int ext4_dx_add_entry(handle_t *h
-                               frame->entries = entries = entries2;
-                               swap(frame->bh, bh2);
-                       }
--                      dx_insert_block(frames + 0, hash2, newblock);
--                      dxtrace(dx_show_index("node", frames[1].entries));
-+                      dx_insert_block((frame - 1), hash2, newblock);
-+                      dxtrace(dx_show_index("node", frame->entries));
-                       dxtrace(dx_show_index("node",
-                              ((struct dx_node *) bh2->b_data)->entries));
-                       err = ext4_handle_dirty_metadata(handle, dir, bh2);
-                       if (err)
-                               goto journal_error;
-                       brelse (bh2);
-+                      ext4_handle_dirty_metadata(handle, inode,
-+                                                 (frame - 1)->bh);
-+                      if (restart) {
-+                              ext4_handle_dirty_metadata(handle, inode,
-+                                                         frame->bh);
-+                              goto cleanup;
-+                      }
-               } else {
-                       struct dx_root_info * info;
-                       dxtrace(printk(KERN_DEBUG
-@@ -1777,25 +2158,42 @@ static int ext4_dx_add_entry(handle_t *h
-                       dx_set_block(entries + 0, newblock);
-                       info = dx_get_dx_info((struct ext4_dir_entry_2*)
-                                       frames[0].bh->b_data);
--                      info->indirect_levels = 1;
--
--                      /* Add new access path frame */
--                      frame = frames + 1;
--                      frame->at = at = at - entries + entries2;
--                      frame->entries = entries = entries2;
--                      frame->bh = bh2;
--                      err = ext4_journal_get_write_access(handle,
--                                                           frame->bh);
--                      if (err)
--                              goto journal_error;
-+                      info->indirect_levels += 1;
-+                      dxtrace(printk(KERN_DEBUG
-+                              "Creating %d level index...\n",
-+                              info->indirect_levels));
-+                      ext4_handle_dirty_metadata(handle, inode, frame->bh);
-+                      ext4_handle_dirty_metadata(handle, inode, bh2);
-+                      brelse(bh2);
-+                      restart = 1;
-+                      goto cleanup;
-               }
--              err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
--              if (err) {
--                      ext4_std_error(inode->i_sb, err);
-+      } else if (!ext4_htree_dx_locked(lck)) {
-+              struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck);
-+
-+              /* not well protected, require DX lock */
-+              ext4_htree_dx_need_lock(lck);
-+              at = frame > frames ? (frame - 1)->at : NULL;
-+
-+              /* NB: no risk of deadlock because it's just a try.
-+               *
-+               * NB: we check ld_count twice, the first time before
-+               * having DX lock, the second time after holding DX lock.
-+               *
-+               * NB: We never free blocks for directory so far, which
-+               * means value returned by dx_get_count() should equal to
-+               * ld->ld_count if nobody split any DE-block under @at,
-+               * and ld->ld_at still points to valid dx_entry. */
-+              if ((ld->ld_count != dx_get_count(entries)) ||
-+                  !ext4_htree_dx_lock_try(lck, at) ||
-+                  (ld->ld_count != dx_get_count(entries))) {
-+                      restart = 1;
-                       goto cleanup;
-               }
--      }
--      de = do_split(handle, dir, &bh, frame, &hinfo, &err);
-+              /* OK, I've got DX lock and nothing changed */
-+              frame->at = ld->ld_at;
-+        }
-+      de = do_split(handle, dir, &bh, frames, frame, &hinfo, lck, &err);
-       if (!de)
-               goto cleanup;
-       err = add_dirent_to_buf(handle, dentry, inode, de, bh);
-@@ -1804,9 +2202,15 @@ static int ext4_dx_add_entry(handle_t *h
- journal_error:
-       ext4_std_error(dir->i_sb, err);
- cleanup:
-+      ext4_htree_dx_unlock(lck);
-+      ext4_htree_de_unlock(lck);
-       if (bh)
-               brelse(bh);
-       dx_release(frames);
-+      /* @restart is true means htree-path has been changed, we need to
-+       * repeat dx_probe() to find out valid htree-path */
-+      if (restart && err == 0)
-+              goto again;
-       return err;
- }
-
-@@ -1845,7 +2249,7 @@ int ext4_delete_entry(handle_t *handle,
-                                       blocksize);
-                       else
-                               de->inode = 0;
--                      dir->i_version++;
-+                      inode_inc_iversion(dir);
-                       BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-                       err = ext4_handle_dirty_metadata(handle, dir, bh);
-                       if (unlikely(err)) {
-@@ -1892,7 +2296,7 @@ static void ext4_dec_count(handle_t *han
- static int ext4_add_nondir(handle_t *handle,
-               struct dentry *dentry, struct inode *inode)
- {
--      int err = ext4_add_entry(handle, dentry, inode);
-+      int err = ext4_add_entry(handle, dentry, inode, NULL);
-       if (!err) {
-               ext4_mark_inode_dirty(handle, inode);
-               d_instantiate(dentry, inode);
-@@ -2122,7 +2526,7 @@ retry:
-       err = ext4_add_dot_dotdot(handle, dir, inode, NULL, NULL);
-       if (err)
-               goto out_clear_inode;
--      err = ext4_add_entry(handle, dentry, inode);
-+      err = ext4_add_entry(handle, dentry, inode, NULL);
-       if (err)
-               goto out_clear_inode;
-       ext4_inc_count(handle, dir);
-@@ -2395,7 +2799,7 @@ static int ext4_rmdir(struct inode *dir,
-               return PTR_ERR(handle);
-
-       retval = -ENOENT;
--      bh = ext4_find_entry(dir, &dentry->d_name, &de);
-+      bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
-       if (!bh)
-               goto end_rmdir;
-
-@@ -2460,7 +2864,7 @@ static int ext4_unlink(struct inode *dir
-               ext4_handle_sync(handle);
-
-       retval = -ENOENT;
--      bh = ext4_find_entry(dir, &dentry->d_name, &de);
-+      bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
-       if (!bh)
-               goto end_unlink;
-
-@@ -2628,7 +3032,7 @@ retry:
-       ext4_inc_count(handle, inode);
-       ihold(inode);
-
--      err = ext4_add_entry(handle, dentry, inode);
-+      err = ext4_add_entry(handle, dentry, inode, NULL);
-       if (!err) {
-               ext4_mark_inode_dirty(handle, inode);
-               d_instantiate(dentry, inode);
-@@ -2676,7 +3080,7 @@ static int ext4_rename(struct inode *old
-       if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
-               ext4_handle_sync(handle);
-
--      old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
-+      old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL);
-       /*
-        *  Check for inode number is _not_ due to possible IO errors.
-        *  We might rmdir the source, keep it as pwd of some process
-@@ -2689,7 +3093,7 @@ static int ext4_rename(struct inode *old
-               goto end_rename;
-
-       new_inode = new_dentry->d_inode;
--      new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
-+      new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de, NULL);
-       if (new_bh) {
-               if (!new_inode) {
-                       brelse(new_bh);
-@@ -2719,7 +3123,7 @@ static int ext4_rename(struct inode *old
-                       goto end_rename;
-       }
-       if (!new_bh) {
--              retval = ext4_add_entry(handle, new_dentry, old_inode);
-+              retval = ext4_add_entry(handle, new_dentry, old_inode, NULL);
-               if (retval)
-                       goto end_rename;
-       } else {
-@@ -2767,7 +3171,8 @@ static int ext4_rename(struct inode *old
-               struct buffer_head *old_bh2;
-               struct ext4_dir_entry_2 *old_de2;
-
--              old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
-+              old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name,
-+                                        &old_de2, NULL);
-               if (old_bh2) {
-                       retval = ext4_delete_entry(handle, old_dir,
-                                                  old_de2, old_bh2);
---- /dev/null
-+++ b/include/linux/htree_lock.h
-@@ -0,0 +1,187 @@
-+/*
-+ * include/linux/htree_lock.h
-+ *
-+ * Copyright (c) 2011, 2012, Intel Corporation.
-+ *
-+ * Author: Liang Zhen <liang@whamcloud.com>
-+ */
-+
-+/*
-+ * htree lock
-+ *
-+ * htree_lock is an advanced lock, it can support five lock modes (concept is
-+ * taken from DLM) and it's a sleeping lock.
-+ *
-+ * most common use case is:
-+ * - create a htree_lock_head for data
-+ * - each thread (contender) creates its own htree_lock
-+ * - contender needs to call htree_lock(lock_node, mode) to protect data and
-+ *   call htree_unlock to release lock
-+ *
-+ * Also, there is advanced use-case which is more complex, user can have
-+ * PW/PR lock on particular key, it's mostly used while user holding shared
-+ * lock on the htree (CW, CR)
-+ *
-+ * htree_lock(lock_node, HTREE_LOCK_CR); lock the htree with CR
-+ * htree_node_lock(lock_node, HTREE_LOCK_PR, key...); lock @key with PR
-+ * ...
-+ * htree_node_unlock(lock_node); unlock the key
-+ *
-+ * Another tip is, we can have N-levels of this kind of keys, all we need to
-+ * do is specifying N-levels while creating htree_lock_head, then we can
-+ * lock/unlock a specific level by:
-+ * htree_node_lock(lock_node, mode1, key1, level1...);
-+ * do something;
-+ * htree_node_lock(lock_node, mode1, key2, level2...);
-+ * do something;
-+ * htree_node_unlock(lock_node, level2);
-+ * htree_node_unlock(lock_node, level1);
-+ *
-+ * NB: for multi-level, should be careful about locking order to avoid deadlock
-+ */
-+
-+#ifndef _LINUX_HTREE_LOCK_H
-+#define _LINUX_HTREE_LOCK_H
-+
-+#include <linux/list.h>
-+#include <linux/spinlock.h>
-+#include <linux/sched.h>
-+
-+/*
-+ * Lock Modes
-+ * more details can be found here:
-+ * http://en.wikipedia.org/wiki/Distributed_lock_manager
-+ */
-+typedef enum {
-+      HTREE_LOCK_EX   = 0, /* exclusive lock: incompatible with all others */
-+      HTREE_LOCK_PW,       /* protected write: allows only CR users */
-+      HTREE_LOCK_PR,       /* protected read: allow PR, CR users */
-+      HTREE_LOCK_CW,       /* concurrent write: allow CR, CW users */
-+      HTREE_LOCK_CR,       /* concurrent read: allow all but EX users */
-+      HTREE_LOCK_MAX,      /* number of lock modes */
-+} htree_lock_mode_t;
-+
-+#define HTREE_LOCK_NL         HTREE_LOCK_MAX
-+#define HTREE_LOCK_INVAL      0xdead10c
-+
-+enum {
-+      HTREE_HBITS_MIN         = 2,
-+      HTREE_HBITS_DEF         = 14,
-+      HTREE_HBITS_MAX         = 32,
-+};
-+
-+enum {
-+      HTREE_EVENT_DISABLE     = (0),
-+      HTREE_EVENT_RD          = (1 << HTREE_LOCK_PR),
-+      HTREE_EVENT_WR          = (1 << HTREE_LOCK_PW),
-+      HTREE_EVENT_RDWR        = (HTREE_EVENT_RD | HTREE_EVENT_WR),
-+};
-+
-+struct htree_lock;
-+
-+typedef void (*htree_event_cb_t)(void *target, void *event);
-+
-+struct htree_lock_child {
-+      struct list_head        lc_list;        /* granted list */
-+      htree_event_cb_t        lc_callback;    /* event callback */
-+      unsigned                lc_events;      /* event types */
-+};
-+
-+struct htree_lock_head {
-+      unsigned long           lh_lock;        /* bits lock */
-+      /* blocked lock list (htree_lock) */
-+      struct list_head        lh_blocked_list;
-+      /* # key levels */
-+      u16                     lh_depth;
-+      /* hash bits for key and limit number of locks */
-+      u16                     lh_hbits;
-+      /* counters for blocked locks */
-+      u16                     lh_nblocked[HTREE_LOCK_MAX];
-+      /* counters for granted locks */
-+      u16                     lh_ngranted[HTREE_LOCK_MAX];
-+      /* private data */
-+      void                    *lh_private;
-+      /* array of children locks */
-+      struct htree_lock_child lh_children[0];
-+};
-+
-+/* htree_lock_node_t is child-lock for a specific key (ln_value) */
-+struct htree_lock_node {
-+      htree_lock_mode_t       ln_mode;
-+      /* major hash key */
-+      u16                     ln_major_key;
-+      /* minor hash key */
-+      u16                     ln_minor_key;
-+      struct list_head        ln_major_list;
-+      struct list_head        ln_minor_list;
-+      /* alive list, all locks (granted, blocked, listening) are on it */
-+      struct list_head        ln_alive_list;
-+      /* blocked list */
-+      struct list_head        ln_blocked_list;
-+      /* granted list */
-+      struct list_head        ln_granted_list;
-+      void                    *ln_ev_target;
-+};
-+
-+struct htree_lock {
-+      struct task_struct      *lk_task;
-+      struct htree_lock_head  *lk_head;
-+      void                    *lk_private;
-+      unsigned                lk_depth;
-+      htree_lock_mode_t       lk_mode;
-+      struct list_head        lk_blocked_list;
-+      struct htree_lock_node  lk_nodes[0];
-+};
-+
-+/* create a lock head, which stands for a resource */
-+struct htree_lock_head *htree_lock_head_alloc(unsigned depth,
-+                                            unsigned hbits, unsigned priv);
-+/* free a lock head */
-+void htree_lock_head_free(struct htree_lock_head *lhead);
-+/* register event callback for child lock at level @depth */
-+void htree_lock_event_attach(struct htree_lock_head *lhead, unsigned depth,
-+                           unsigned events, htree_event_cb_t callback);
-+/* create a lock handle, which stands for a thread */
-+struct htree_lock *htree_lock_alloc(unsigned depth, unsigned pbytes);
-+/* free a lock handle */
-+void htree_lock_free(struct htree_lock *lck);
-+/* lock htree, when @wait is false, 0 is returned if the lock can't
-+ * be granted immediately */
-+int htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead,
-+                 htree_lock_mode_t mode, int wait);
-+/* unlock htree */
-+void htree_unlock(struct htree_lock *lck);
-+/* unlock and relock htree with @new_mode */
-+int htree_change_lock_try(struct htree_lock *lck,
-+                        htree_lock_mode_t new_mode, int wait);
-+void htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode);
-+/* require child lock (key) of htree at level @dep, @event will be sent to all
-+ * listeners on this @key while lock being granted */
-+int htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode,
-+                      u32 key, unsigned dep, int wait, void *event);
-+/* release child lock at level @dep, this lock will listen on its key
-+ * if @event isn't NULL, event_cb will be called against @lck while granting
-+ * any other lock at level @dep with the same key */
-+void htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event);
-+/* stop listening on child lock at level @dep */
-+void htree_node_stop_listen(struct htree_lock *lck, unsigned dep);
-+/* for debug */
-+void htree_lock_stat_print(int depth);
-+void htree_lock_stat_reset(void);
-+
-+#define htree_lock(lck, lh, mode)     htree_lock_try(lck, lh, mode, 1)
-+#define htree_change_lock(lck, mode)  htree_change_lock_try(lck, mode, 1)
-+
-+#define htree_lock_mode(lck)          ((lck)->lk_mode)
-+
-+#define htree_node_lock(lck, mode, key, dep)  \
-+      htree_node_lock_try(lck, mode, key, dep, 1, NULL)
-+/* this is only safe in thread context of lock owner */
-+#define htree_node_is_granted(lck, dep)               \
-+      ((lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_INVAL && \
-+       (lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_NL)
-+/* this is only safe in thread context of lock owner */
-+#define htree_node_is_listening(lck, dep)     \
-+      ((lck)->lk_nodes[dep].ln_mode == HTREE_LOCK_NL)
-+
-+#endif
index 3f4f9a2..841cd67 100644 (file)
@@ -11,6 +11,7 @@ rhel6.3/ext4-inode-version.patch
 rhel6.3/ext4-lookup-dotdot.patch
 rhel6.3/ext4-print-inum-in-htree-warning.patch
 rhel6.4/ext4-prealloc.patch
+rhel6.3/ext4-use-correct-inode.patch
 rhel6.3/ext4-mballoc-extra-checks.patch
 rhel6.4/ext4-misc.patch
 rhel6.3/ext4-pdir-fix.patch
@@ -28,6 +29,7 @@ rhel6.3/ext4-nocmtime-2.6.patch
 rhel6.3/ext4-journal-callback.patch
 rhel6.5/ext4-ext-walk-space.patch
 rhel6.3/ext4-store-tree-generation-at-find.patch
+rhel6.3/ext4-large-dir.patch
 rhel6.3/ext4-pdirop.patch
 rhel6.4/ext4-extra-isize.patch
 rhel6.3/ext4-quota-force-block-alloc-quotaoff.patch
index a69ef69..9bbf666 100644 (file)
@@ -30,6 +30,7 @@ rhel6.3/ext4-nocmtime-2.6.patch
 rhel6.3/ext4-export-64bit-name-hash.patch
 rhel6.3/ext4-journal-callback.patch
 rhel6.3/ext4-store-tree-generation-at-find.patch
+rhel6.3/ext4-large-dir.patch
 rhel6.3/ext4-pdirop.patch
 rhel6.3/ext4-quota-force-block-alloc-quotaoff.patch
 rhel6.3/ext4-quota-dont-update-cmtime.patch
index dabcea3..6b6d9d2 100644 (file)
@@ -31,7 +31,8 @@ sles11sp2/ext4-disable-mb-cache.patch
 rhel6.3/ext4-nocmtime-2.6.patch
 rhel6.3/ext4-export-64bit-name-hash.patch
 sles11sp2/ext4-store-tree-generation-at-find.patch
-sles11sp2/ext4-pdirop.patch
+sles11sp2/ext4-large-dir.patch
+rhel6.3/ext4-pdirop.patch
 rhel6.3/ext4-max-dir-size.patch
 sles11sp2/ext4-max-dir-size-options.patch
 rhel6.3/ext4-not-discard-preallocation-umount.patch
index 025e41a..4d9d6bf 100644 (file)
@@ -30,7 +30,8 @@ sles11sp2/ext4-large-eas.patch
 sles11sp2/ext4-disable-mb-cache.patch
 rhel6.3/ext4-nocmtime-2.6.patch
 sles11sp2/ext4-store-tree-generation-at-find.patch
-sles11sp2/ext4-pdirop.patch
+sles11sp2/ext4-large-dir.patch
+rhel6.3/ext4-pdirop.patch
 rhel6.3/ext4-max-dir-size.patch
 sles11sp2/ext4-max-dir-size-options.patch
 rhel6.3/ext4-not-discard-preallocation-umount.patch
index 8437fdb..2ae3b1e 100644 (file)
@@ -12,6 +12,7 @@ rhel7/ext4-data-in-dirent.patch
 rhel7/ext4-large-eas.patch
 rhel7/ext4-disable-mb-cache.patch
 rhel7/ext4-nocmtime.patch
+rhel7/ext4-large-dir.patch
 rhel7/ext4-pdirop.patch
 rhel7/ext4-max-dir-size.patch
 rhel7/ext4-remove-truncate-warning.patch
index 296aa2f..2cc5ca6 100644 (file)
@@ -168,7 +168,7 @@ struct osd_mdobj_map {
 };
 
 #define osd_ldiskfs_add_entry(handle, child, cinode, hlock) \
-        ldiskfs_add_entry(handle, child, cinode, hlock)
+       __ldiskfs_add_entry(handle, child, cinode, hlock)
 
 #define OSD_OTABLE_IT_CACHE_SIZE       64
 #define OSD_OTABLE_IT_CACHE_MASK       (~(OSD_OTABLE_IT_CACHE_SIZE - 1))
@@ -1095,13 +1095,13 @@ static inline unsigned long osd_remote_parent_ino(struct osd_device *dev)
        return dev->od_mdt_map->omm_remote_parent->d_inode->i_ino;
 }
 
-#ifdef JOURNAL_START_HAS_3ARGS
+#ifdef LDISKFS_HT_MISC
 # define osd_journal_start_sb(sb, type, nblock) \
                ldiskfs_journal_start_sb(sb, type, nblock)
 # define osd_ldiskfs_append(handle, inode, nblock, err) \
                ldiskfs_append(handle, inode, nblock)
 # define osd_ldiskfs_find_entry(dir, name, de, inlined, lock) \
-               ldiskfs_find_entry(dir, name, de, inlined, lock)
+               __ldiskfs_find_entry(dir, name, de, inlined, lock)
 # define osd_journal_start(inode, type, nblocks) \
                ldiskfs_journal_start(inode, type, nblocks)
 # define osd_transaction_size(dev) \
@@ -1113,7 +1113,7 @@ static inline unsigned long osd_remote_parent_ino(struct osd_device *dev)
 # define osd_ldiskfs_append(handle, inode, nblock, err) \
                ldiskfs_append(handle, inode, nblock, err)
 # define osd_ldiskfs_find_entry(dir, name, de, inlined, lock) \
-               ldiskfs_find_entry(dir, name, de, lock)
+               __ldiskfs_find_entry(dir, name, de, lock)
 # define osd_journal_start(inode, type, nblocks) \
                ldiskfs_journal_start(inode, nblocks)
 # define osd_transaction_size(dev) \