Whamcloud - gitweb
Branch HEAD
authoryangsheng <yangsheng>
Mon, 4 Feb 2008 07:33:48 +0000 (07:33 +0000)
committeryangsheng <yangsheng>
Mon, 4 Feb 2008 07:33:48 +0000 (07:33 +0000)
b=14482
i=alex
i=adilger

Move iam patches to RHEL5 kernel.

ldiskfs/kernel_patches/patches/ext3-iam-2.6.18-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5.series

diff --git a/ldiskfs/kernel_patches/patches/ext3-iam-2.6.18-rhel5.patch b/ldiskfs/kernel_patches/patches/ext3-iam-2.6.18-rhel5.patch
new file mode 100644 (file)
index 0000000..27f8fe1
--- /dev/null
@@ -0,0 +1,2272 @@
+Index: linux-stage/include/linux/ext3_fs.h
+===================================================================
+--- linux-stage.orig/include/linux/ext3_fs.h   2007-11-26 23:09:05.000000000 +0300
++++ linux-stage/include/linux/ext3_fs.h        2007-11-26 23:09:06.000000000 +0300
+@@ -812,6 +812,9 @@
+ #define DX_HASH_LEGACY                0
+ #define DX_HASH_HALF_MD4      1
+ #define DX_HASH_TEA           2
++#define DX_HASH_R5            6
++#define DX_HASH_SAME          7
++#define DX_HASH_MAX           7
+ #ifdef __KERNEL__
+@@ -942,9 +945,6 @@
+ extern void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv);
+ /* dir.c */
+-extern int ext3_check_dir_entry(const char *, struct inode *,
+-                              struct ext3_dir_entry_2 *,
+-                              struct buffer_head *, unsigned long);
+ extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
+                                   __u32 minor_hash,
+                                   struct ext3_dir_entry_2 *dirent);
+Index: linux-stage/include/linux/ext3_fs_i.h
+===================================================================
+--- linux-stage.orig/include/linux/ext3_fs_i.h 2007-11-26 23:09:04.000000000 +0300
++++ linux-stage/include/linux/ext3_fs_i.h      2007-11-26 23:16:00.000000000 +0300
+@@ -20,6 +20,7 @@
+ #include <linux/rbtree.h>
+ #include <linux/seqlock.h>
+ #include <linux/mutex.h>
++#include <linux/dynlocks.h>
+ #define HAVE_DISK_INODE_VERSION
+@@ -157,6 +158,11 @@
+       struct mutex truncate_mutex;
+       struct inode vfs_inode;
++      /* following fields for parallel directory operations -bzzz */
++      struct dynlock   i_htree_lock;
++      struct semaphore i_append_sem;
++      struct semaphore i_rename_sem;
++
+       struct ext3_ext_cache i_cached_extent;
+       /* mballoc */
+Index: linux-stage/fs/ext3/super.c
+===================================================================
+--- linux-stage.orig/fs/ext3/super.c   2007-11-26 23:09:05.000000000 +0300
++++ linux-stage/fs/ext3/super.c        2007-11-26 23:09:06.000000000 +0300
+@@ -464,6 +464,10 @@
+       ei->i_block_alloc_info = NULL;
+       ei->vfs_inode.i_version = 1;
++      dynlock_init(&ei->i_htree_lock);
++      sema_init(&ei->i_rename_sem, 1);
++      sema_init(&ei->i_append_sem, 1);
++
+       memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent));
+       INIT_LIST_HEAD(&ei->i_prealloc_list);
+       spin_lock_init(&ei->i_prealloc_lock);
+@@ -695,7 +699,7 @@
+       Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+       Opt_grpquota,
+       Opt_extents, Opt_noextents, Opt_extdebug,
+-      Opt_mballoc, Opt_nomballoc, Opt_stripe,
++      Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_hashfunc,
+ };
+ static match_table_t tokens = {
+@@ -756,6 +760,7 @@
+       {Opt_stripe, "stripe=%u"},
+       {Opt_err, NULL},
+       {Opt_resize, "resize"},
++      {Opt_hashfunc,"hash=%s"},
+ };
+ static ext3_fsblk_t get_sb_block(void **data)
+@@ -779,6 +784,7 @@
+       return sb_block;
+ }
++int user_selected_hash_function = -1;
+ static int parse_options (char *options, struct super_block *sb,
+                         unsigned int *inum, unsigned long *journal_devnum,
+                         ext3_fsblk_t *n_blocks_count, int is_remount)
+@@ -1120,6 +1126,22 @@
+                               return 0;
+                       sbi->s_stripe = option;
+                       break;
++              case Opt_hashfunc:
++                      if (strncmp (args[0].from,"legacy",6) == 0){
++                                user_selected_hash_function = 0;
++                        } else if (strncmp (args[0].from,"half_md4",8) == 0){
++                                user_selected_hash_function = 1;
++                        } else if (strncmp (args[0].from,"tea",3) == 0){
++                                user_selected_hash_function = 2;
++                        } else if (strncmp (args[0].from,"r5",2) == 0){
++                                user_selected_hash_function = 3;
++                        } else if (strncmp (args[0].from,"same",4) == 0){
++                                user_selected_hash_function = 4;
++                        } else {
++                                printk ("Hashfunc name wrong\n");
++                                return 0;
++                        }
++                      break;
+               default:
+                       printk (KERN_ERR
+                               "EXT3-fs: Unrecognized mount option \"%s\" "
+Index: linux-stage/fs/ext3/namei.c
+===================================================================
+--- linux-stage.orig/fs/ext3/namei.c   2007-11-26 23:09:04.000000000 +0300
++++ linux-stage/fs/ext3/namei.c        2007-11-26 23:09:06.000000000 +0300
+@@ -24,6 +24,7 @@
+  *    Theodore Ts'o, 2002
+  */
++#include <linux/module.h>
+ #include <linux/fs.h>
+ #include <linux/pagemap.h>
+ #include <linux/jbd.h>
+@@ -36,6 +37,7 @@
+ #include <linux/quotaops.h>
+ #include <linux/buffer_head.h>
+ #include <linux/smp_lock.h>
++#include <linux/lustre_iam.h>
+ #include "namei.h"
+ #include "xattr.h"
+@@ -50,25 +52,29 @@
+ #define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+ #define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
+-static struct buffer_head *ext3_append(handle_t *handle,
++
++struct buffer_head *ext3_append(handle_t *handle,
+                                       struct inode *inode,
+                                       u32 *block, int *err)
+ {
+       struct buffer_head *bh;
++      struct ext3_inode_info *ei = EXT3_I(inode);
++      /* with parallel dir operations all appends
++       * have to be serialized -bzzz */
++      down(&ei->i_append_sem);
+       *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+-      if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
++      bh = ext3_bread(handle, inode, *block, 1, err);
++      if (bh != NULL) {
+               inode->i_size += inode->i_sb->s_blocksize;
+-              EXT3_I(inode)->i_disksize = inode->i_size;
+-              ext3_journal_get_write_access(handle,bh);
++              ei->i_disksize = inode->i_size;
+       }
++      up(&ei->i_append_sem);
++      
+       return bh;
+ }
+-#ifndef assert
+-#define assert(test) J_ASSERT(test)
+-#endif
+ #ifndef swap
+ #define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
+@@ -77,167 +83,84 @@
+ #define dxtrace(command) 
+ #endif
+-struct fake_dirent
+-{
+-      __le32 inode;
+-      __le16 rec_len;
+-      u8 name_len;
+-      u8 file_type;
+-};
+-
+-struct dx_countlimit
+-{
+-      __le16 limit;
+-      __le16 count;
+-};
+-
+-struct dx_entry
+-{
+-      __le32 hash;
+-      __le32 block;
+-};
+-
+-/*
+- * dx_root_info is laid out so that if it should somehow get overlaid by a
+- * dirent the two low bits of the hash version will be zero.  Therefore, the
+- * hash version mod 4 should never be 0.  Sincerely, the paranoia department.
+- */
+-
+-struct dx_root
+-{
+-      struct fake_dirent dot;
+-      char dot_name[4];
+-      struct fake_dirent dotdot;
+-      char dotdot_name[4];
+-      struct dx_root_info
+-      {
+-              __le32 reserved_zero;
+-              u8 hash_version;
+-              u8 info_length; /* 8 */
+-              u8 indirect_levels;
+-              u8 unused_flags;
+-      }
+-      info;
+-      struct dx_entry entries[0];
+-};
+-
+-struct dx_node
+-{
+-      struct fake_dirent fake;
+-      struct dx_entry entries[0];
+-};
+-
+-
+-struct dx_frame
+-{
+-      struct buffer_head *bh;
+-      struct dx_entry *entries;
+-      struct dx_entry *at;
+-};
+-
+-struct dx_map_entry
+-{
+-      u32 hash;
+-      u16 offs;
+-      u16 size;
+-};
+-
+ #ifdef CONFIG_EXT3_INDEX
+-static inline unsigned dx_get_block (struct dx_entry *entry);
+-static void dx_set_block (struct dx_entry *entry, unsigned value);
+-static inline unsigned dx_get_hash (struct dx_entry *entry);
+-static void dx_set_hash (struct dx_entry *entry, unsigned value);
+-static unsigned dx_get_count (struct dx_entry *entries);
+-static unsigned dx_get_limit (struct dx_entry *entries);
+-static void dx_set_count (struct dx_entry *entries, unsigned value);
+-static void dx_set_limit (struct dx_entry *entries, unsigned value);
+-static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
+-static unsigned dx_node_limit (struct inode *dir);
+-static struct dx_frame *dx_probe(struct dentry *dentry,
+-                               struct inode *dir,
+-                               struct dx_hash_info *hinfo,
+-                               struct dx_frame *frame,
+-                               int *err);
+-static void dx_release (struct dx_frame *frames);
++static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry);
++static void dx_set_block(struct iam_path *p,
++                       struct iam_entry *entry, unsigned value);
++static unsigned dx_get_limit(struct iam_entry *entries);
++static void dx_set_count(struct iam_entry *entries, unsigned value);
++static void dx_set_limit(struct iam_entry *entries, unsigned value);
++static unsigned dx_root_limit(struct iam_path *p);
++static unsigned dx_node_limit(struct iam_path *p);
++static int dx_probe(struct qstr *name,
++                  struct inode *dir,
++                  struct dx_hash_info *hinfo,
++                  struct iam_path *path);
+ static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
+                       struct dx_hash_info *hinfo, struct dx_map_entry map[]);
+ static void dx_sort_map(struct dx_map_entry *map, unsigned count);
+ static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
+               struct dx_map_entry *offsets, int count);
+ static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+-static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
+-static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+-                               struct dx_frame *frame,
+-                               struct dx_frame *frames, 
+-                               __u32 *start_hash);
+ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
+                      struct ext3_dir_entry_2 **res_dir, int *err);
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+                            struct inode *inode);
+-
+-/*
+- * Future: use high four bits of block for coalesce-on-delete flags
+- * Mask them off for now.
+- */
+-
+-static inline unsigned dx_get_block (struct dx_entry *entry)
++static inline void dx_set_limit(struct iam_entry *entries, unsigned value)
+ {
+-      return le32_to_cpu(entry->block) & 0x00ffffff;
+-}
+-
+-static inline void dx_set_block (struct dx_entry *entry, unsigned value)
+-{
+-      entry->block = cpu_to_le32(value);
+-}
+-
+-static inline unsigned dx_get_hash (struct dx_entry *entry)
+-{
+-      return le32_to_cpu(entry->hash);
+-}
+-
+-static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
+-{
+-      entry->hash = cpu_to_le32(value);
+-}
+-
+-static inline unsigned dx_get_count (struct dx_entry *entries)
+-{
+-      return le16_to_cpu(((struct dx_countlimit *) entries)->count);
+-}
+-
+-static inline unsigned dx_get_limit (struct dx_entry *entries)
+-{
+-      return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
++      ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
+ }
+-static inline void dx_set_count (struct dx_entry *entries, unsigned value)
++int dx_index_is_compat(struct iam_path *path)
+ {
+-      ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
++      return iam_path_descr(path) == &iam_htree_compat_param;
+ }
+-static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
+-{
+-      ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
+-}
+-static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
++int dx_node_check(struct iam_path *p, struct iam_frame *f)
+ {
+-      unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
+-              EXT3_DIR_REC_LEN(2) - infosize;
+-      return 0? 20: entry_space / sizeof(struct dx_entry);
+-}
++      struct iam_entry     *e;
++      struct iam_container *c;
++      unsigned count;
++      unsigned  i;
++      iam_ptr_t  blk;
++      iam_ptr_t  root;
++      struct inode *inode;
+-static inline unsigned dx_node_limit (struct inode *dir)
+-{
+-      unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
+-      return 0? 22: entry_space / sizeof(struct dx_entry);
++      c = p->ip_container;
++      e = dx_node_get_entries(p, f);
++      count = dx_get_count(e);
++      e = iam_entry_shift(p, e, 1);
++      root = iam_path_descr(p)->id_ops->id_root_ptr(c);
++
++      inode = iam_path_obj(p);
++      for (i = 0; i < count - 1; ++i, e = iam_entry_shift(p, e, 1)) {
++              iam_ikeycpy(c, iam_path_ikey(p, 0), iam_path_ikey(p, 1));
++              iam_get_ikey(p, e, iam_path_ikey(p, 1));
++              if (i > 0 &&
++                  iam_ikeycmp(c, iam_path_ikey(p, 0),
++                              iam_path_ikey(p, 1)) > 0)
++                      return 0;
++              blk = dx_get_block(p, e);
++              /*
++               * Disable this check as it is racy.
++               */
++              if (0 && inode->i_size < (blk + 1) * inode->i_sb->s_blocksize)
++      return 0;
++              /*
++               * By definition of a tree, no node points to the root.
++               */
++              if (blk == root)
++                      return 0;
++              }
++      return 1;
+ }
+ /*
+  * Debug
+  */
+ #ifdef DX_DEBUG
+-static void dx_show_index (char * label, struct dx_entry *entries)
++static void dx_show_index (char * label, struct iam_entry *entries)
+ {
+         int i, n = dx_get_count (entries);
+         printk("%s index ", label);
+@@ -288,7 +212,7 @@
+ }
+ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
+-                           struct dx_entry *entries, int levels)
++                           struct iam_entry *entries, int levels)
+ {
+       unsigned blocksize = dir->i_sb->s_blocksize;
+       unsigned count = dx_get_count (entries), names = 0, space = 0, i;
+@@ -319,134 +243,368 @@
+ #endif /* DX_DEBUG */
+ /*
+- * Probe for a directory leaf block to search.
++ * Per-node tree locking.
+- *
+- * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
+- * error in the directory index, and the caller should fall back to
+- * searching the directory normally.  The callers of dx_probe **MUST**
+- * check for this error code, and make sure it never gets reflected
+- * back to userspace.
+  */
+-static struct dx_frame *
+-dx_probe(struct dentry *dentry, struct inode *dir,
+-       struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
+-{
+-      unsigned count, indirect;
+-      struct dx_entry *at, *entries, *p, *q, *m;
+-      struct dx_root *root;
+-      struct buffer_head *bh;
+-      struct dx_frame *frame = frame_in;
+-      u32 hash;
+-      frame->bh = NULL;
+-      if (dentry)
+-              dir = dentry->d_parent->d_inode;
+-      if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
+-              goto fail;
+-      root = (struct dx_root *) bh->b_data;
+-      if (root->info.hash_version != DX_HASH_TEA &&
+-          root->info.hash_version != DX_HASH_HALF_MD4 &&
+-          root->info.hash_version != DX_HASH_LEGACY) {
+-              ext3_warning(dir->i_sb, __FUNCTION__,
+-                           "Unrecognised inode hash code %d",
+-                           root->info.hash_version);
+-              brelse(bh);
+-              *err = ERR_BAD_DX_DIR;
+-              goto fail;
++/* FIXME: this should be reworked using bb_spin_lock
++ * introduced in -mm tree
++ */
++#define BH_DXLock     25
++
++#define DX_DEBUG (1)
++
++#if DX_DEBUG
++static struct dx_lock_stats {
++      unsigned dls_bh_lock;
++      unsigned dls_bh_busy;
++      unsigned dls_bh_again;
++      unsigned dls_bh_full_again;
++} dx_lock_stats = { 0, };
++#define DX_DEVAL(x) x
++#else
++#define DX_DEVAL(x)
++#endif
++
++static inline void dx_lock_bh(struct buffer_head volatile *bh)
++{
++      DX_DEVAL(dx_lock_stats.dls_bh_lock++);
++#ifdef CONFIG_SMP
++        while (test_and_set_bit(BH_DXLock, &bh->b_state)) {
++              DX_DEVAL(dx_lock_stats.dls_bh_busy++);
++                while (test_bit(BH_DXLock, &bh->b_state))
++                        cpu_relax();
++        }
++#endif
++}
++
++static inline void dx_unlock_bh(struct buffer_head *bh)
++{
++#ifdef CONFIG_SMP
++        smp_mb__before_clear_bit();
++        clear_bit(BH_DXLock, &bh->b_state);
++#endif
++}
++
++/*
++ * these locking primitives are used to protect parts
++ * of dir's htree. protection unit is block: leaf or index
++ */
++struct dynlock_handle *dx_lock_htree(struct inode *dir, unsigned long value,
++                                   enum dynlock_type lt)
++{
++      return dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, lt, GFP_NOFS);
++}
++
++void dx_unlock_htree(struct inode *dir, struct dynlock_handle *lh)
++{
++      if (lh != NULL)
++              dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lh);
++}
++
++static void dx_unlock_array(struct inode *dir, struct dynlock_handle **lh)
++{
++      int i;
++
++      for (i = 0; i < DX_MAX_TREE_HEIGHT; ++i, ++lh) {
++              if (*lh != NULL) {
++                      dx_unlock_htree(dir, *lh);
++                      *lh = NULL;
++              }
+       }
+-      hinfo->hash_version = root->info.hash_version;
+-      hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+-      if (dentry)
+-              ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
+-      hash = hinfo->hash;
+-
+-      if (root->info.unused_flags & 1) {
+-              ext3_warning(dir->i_sb, __FUNCTION__,
+-                           "Unimplemented inode hash flags: %#06x",
+-                           root->info.unused_flags);
+-              brelse(bh);
+-              *err = ERR_BAD_DX_DIR;
+-              goto fail;
++}
++
++/*
++ * dx_find_position
++ *
++ * search position of specified hash in index
++ *
++ */
++
++struct iam_entry *dx_find_position(struct iam_path *path,
++                                 struct iam_frame *frame)
++{
++      int count;
++      struct iam_entry *p;
++      struct iam_entry *q;
++      struct iam_entry *m;
++
++      count = dx_get_count(frame->entries);
++      assert_corr(count && count <= dx_get_limit(frame->entries));
++      p = iam_entry_shift(path, frame->entries,
++                          dx_index_is_compat(path) ? 1 : 2);
++      q = iam_entry_shift(path, frame->entries, count - 1);
++      while (p <= q) {
++              m = iam_entry_shift(path, p, iam_entry_diff(path, q, p) / 2);
++              if (iam_ikeycmp(path->ip_container, iam_ikey_at(path, m),
++                              path->ip_ikey_target) > 0)
++                      q = iam_entry_shift(path, m, -1);
++              else
++                      p = iam_entry_shift(path, m, +1);
+       }
++      return iam_entry_shift(path, p, -1);
++}
+-      if ((indirect = root->info.indirect_levels) > 1) {
+-              ext3_warning(dir->i_sb, __FUNCTION__,
+-                           "Unimplemented inode hash depth: %#06x",
+-                           root->info.indirect_levels);
+-              brelse(bh);
+-              *err = ERR_BAD_DX_DIR;
+-              goto fail;
++static iam_ptr_t dx_find_ptr(struct iam_path *path, struct iam_frame *frame)
++{
++      return dx_get_block(path, dx_find_position(path, frame));
++}
++
++/*
++ * Fast check for frame consistency.
++ */
++static int dx_check_fast(struct iam_path *path, struct iam_frame *frame)
++{
++      struct iam_container *bag;
++      struct iam_entry *next;
++      struct iam_entry *last;
++      struct iam_entry *entries;
++      struct iam_entry *at;
++
++      bag     = path->ip_container;
++      at      = frame->at;
++      entries = frame->entries;
++      last    = iam_entry_shift(path, entries, dx_get_count(entries) - 1);
++
++      if (unlikely(at > last))
++              return -EAGAIN;
++
++      if (unlikely(dx_get_block(path, at) != frame->leaf))
++              return -EAGAIN;
++
++      if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, at),
++                               path->ip_ikey_target) > 0))
++              return -EAGAIN;
++
++      next = iam_entry_shift(path, at, +1);
++      if (next <= last) {
++              if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, next),
++                                       path->ip_ikey_target) <= 0))
++                      return -EAGAIN;
+       }
++      return 0;
++}
+-      entries = (struct dx_entry *) (((char *)&root->info) +
+-                                     root->info.info_length);
+-      assert(dx_get_limit(entries) == dx_root_limit(dir,
+-                                                    root->info.info_length));
+-      dxtrace (printk("Look up %x", hash));
+-      while (1)
+-      {
+-              count = dx_get_count(entries);
+-              assert (count && count <= dx_get_limit(entries));
+-              p = entries + 1;
+-              q = entries + count - 1;
+-              while (p <= q)
+-              {
+-                      m = p + (q - p)/2;
+-                      dxtrace(printk("."));
+-                      if (dx_get_hash(m) > hash)
+-                              q = m - 1;
+-                      else
+-                              p = m + 1;
+-              }
++/*
++ * returns 0 if path was unchanged, -EAGAIN otherwise.
++ */
++static int dx_check_path(struct iam_path *path, struct iam_frame *frame)
++{
++      int equal;
+-              if (0) // linear search cross check
+-              {
+-                      unsigned n = count - 1;
+-                      at = entries;
+-                      while (n--)
+-                      {
+-                              dxtrace(printk(","));
+-                              if (dx_get_hash(++at) > hash)
+-                              {
+-                                      at--;
+-                                      break;
+-                              }
++      dx_lock_bh(frame->bh);
++      equal = dx_check_fast(path, frame) == 0 ||
++              frame->leaf == dx_find_ptr(path, frame);
++      DX_DEVAL(dx_lock_stats.dls_bh_again += !equal);
++      dx_unlock_bh(frame->bh);
++      
++      return equal ? 0 : -EAGAIN;
++}
++
++/*
++ * returns 0 if path was unchanged, -EAGAIN otherwise.
++ */
++static int dx_check_full_path(struct iam_path *path, int search)
++{
++      struct iam_frame *bottom;
++      struct iam_frame *scan;
++      int i;
++      int result;
++
++      do_corr(schedule());
++
++      for (bottom = path->ip_frames, i = 0;
++           i < DX_MAX_TREE_HEIGHT && bottom->bh != NULL; ++bottom, ++i) {
++              ; /* find last filled in frame */
++      }
++
++      /*
++       * Lock frames, bottom to top.
++       */
++      for (scan = bottom - 1; scan >= path->ip_frames; --scan)
++              dx_lock_bh(scan->bh);
++      /*
++       * Check them top to bottom.
++       */
++      result = 0;
++      for (scan = path->ip_frames; scan < bottom; ++scan) {
++              struct iam_entry *pos;
++
++              if (search) {
++                      if (dx_check_fast(path, scan) == 0)
++                              continue;
++
++                      pos = dx_find_position(path, scan);
++                      if (scan->leaf != dx_get_block(path, pos)) {
++                              result = -EAGAIN;
++                              break;
++                      }
++                      scan->at = pos;
++              } else {
++                      pos = iam_entry_shift(path, scan->entries,
++                                            dx_get_count(scan->entries) - 1);
++                      if (scan->at > pos ||
++                          scan->leaf != dx_get_block(path, scan->at)) {
++                              result = -EAGAIN;
++                              break;
+                       }
+-                      assert (at == p - 1);
+               }
+-
+-              at = p - 1;
+-              dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
+-              frame->bh = bh;
+-              frame->entries = entries;
+-              frame->at = at;
+-              if (!indirect--) return frame;
+-              if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
+-                      goto fail2;
+-              at = entries = ((struct dx_node *) bh->b_data)->entries;
+-              assert (dx_get_limit(entries) == dx_node_limit (dir));
+-              frame++;
+-      }
+-fail2:
+-      while (frame >= frame_in) {
+-              brelse(frame->bh);
+-              frame--;
+       }
+-fail:
+-      return NULL;
++
++      /*
++       * Unlock top to bottom.
++       */
++      for (scan = path->ip_frames; scan < bottom; ++scan)
++              dx_unlock_bh(scan->bh);
++      DX_DEVAL(dx_lock_stats.dls_bh_full_again += !!result);
++      do_corr(schedule());
++
++      return result;
+ }
+-static void dx_release (struct dx_frame *frames)
++static int dx_lookup_try(struct iam_path *path)
++{
++      u32 ptr;
++      int err = 0;
++      int i;
++
++      struct iam_descr *param;
++      struct iam_frame *frame;
++      struct iam_container *c;
++
++      param = iam_path_descr(path);
++      c = path->ip_container;
++      
++                   ptr = param->id_ops->id_root_ptr(c);
++      for (frame = path->ip_frames, i = 0; i <= path->ip_indirect;
++           ++frame, ++i) {
++              err = param->id_ops->id_node_read(c, (iam_ptr_t)ptr, NULL,
++                                                &frame->bh);
++              do_corr(schedule());
++
++              dx_lock_bh(frame->bh);
++              /*
++               * node must be initialized under bh lock because concurrent
++               * creation procedure may change it and dx_lookup_try() will
++               * see obsolete tree height. -bzzz
++               */
++              if (err != 0)
++                      break;
++
++              if (EXT3_INVARIANT_ON) {
++                      err = param->id_ops->id_node_check(path, frame);
++                      if (err != 0)
++                              break;
++              }
++
++              err = param->id_ops->id_node_load(path, frame);
++              if (err != 0)
++                      break;
++
++              assert_inv(dx_node_check(path, frame));
++              /*
++               * splitting may change root index block and move hash we're
++               * looking for into another index block so, we have to check
++               * this situation and repeat from beginning if path got changed
++               * -bzzz
++               */
++              if (i > 0) {
++                      err = dx_check_path(path, frame - 1);
++                      if (err != 0)
++                              break;
++              }
++
++              frame->at = dx_find_position(path, frame);
++              frame->curidx = ptr;
++              frame->leaf = ptr = dx_get_block(path, frame->at);
++
++              dx_unlock_bh(frame->bh);
++              do_corr(schedule());
++      }
++      if (err != 0)
++              dx_unlock_bh(frame->bh);
++      path->ip_frame = --frame;
++      return err;
++}
++
++static int dx_lookup(struct iam_path *path)
++{
++      int err;
++      int i;
++
++      for (i = 0; i < DX_MAX_TREE_HEIGHT; ++ i)
++              assert(path->ip_frames[i].bh == NULL);
++
++      do {
++              err = dx_lookup_try(path);
++              do_corr(schedule());
++              if (err != 0)
++                      iam_path_fini(path);
++      } while (err == -EAGAIN);
++
++      return err;
++}
++
++/*
++ * Performs path lookup and returns with found leaf (if any) locked by htree
++ * lock.
++ */
++int dx_lookup_lock(struct iam_path *path,
++                 struct dynlock_handle **dl, enum dynlock_type lt)
+ {
+-      if (frames[0].bh == NULL)
+-              return;
++      int result;
++      struct inode *dir;
+-      if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
+-              brelse(frames[1].bh);
+-      brelse(frames[0].bh);
++      dir = iam_path_obj(path);
++      while ((result = dx_lookup(path)) == 0) {
++              do_corr(schedule());
++              *dl = dx_lock_htree(dir, path->ip_frame->leaf, lt);
++              if (*dl == NULL) {
++                      iam_path_fini(path);
++                      result = -ENOMEM;
++                      break;
++              }
++              do_corr(schedule());
++              /*
++               * while locking leaf we just found may get split so we need
++               * to check this -bzzz
++               */
++              if (dx_check_full_path(path, 1) == 0)
++                      break;
++              dx_unlock_htree(dir, *dl);
++              *dl = NULL;
++              iam_path_fini(path);
++      }
++      return result;
+ }
+ /*
++ * Probe for a directory leaf block to search.
++ *
++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
++ * error in the directory index, and the caller should fall back to
++ * searching the directory normally.  The callers of dx_probe **MUST**
++ * check for this error code, and make sure it never gets reflected
++ * back to userspace.
++ */
++static int dx_probe(struct qstr *name, struct inode *dir,
++                  struct dx_hash_info *hinfo, struct iam_path *path)
++{
++      int err;
++      struct iam_path_compat *ipc;
++      
++      assert_corr(path->ip_data != NULL);
++      ipc = container_of(path->ip_data, struct iam_path_compat, ipc_descr);
++      ipc->ipc_qstr  = name;
++      ipc->ipc_hinfo = hinfo;
++
++      assert_corr(dx_index_is_compat(path));
++      err = dx_lookup(path);
++      assert_corr(err != 0 || path->ip_frames[path->ip_indirect].bh != NULL);
++      return err;
++}
++
++
++/*
+  * This function increments the frame pointer to search the next leaf
+  * block, and reads in the necessary intervening nodes if the search
+  * should be necessary.  Whether or not the search is necessary is
+@@ -463,17 +632,16 @@
+  * If start_hash is non-null, it will be filled in with the starting
+  * hash of the next page.
+  */
+-static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+-                               struct dx_frame *frame,
+-                               struct dx_frame *frames, 
+-                               __u32 *start_hash)
++static int ext3_htree_advance(struct inode *dir, __u32 hash,
++                            struct iam_path *path, __u32 *start_hash,
++                            int compat)
+ {
+-      struct dx_frame *p;
++      struct iam_frame *p;
+       struct buffer_head *bh;
+       int err, num_frames = 0;
+       __u32 bhash;
+-      p = frame;
++      p = path->ip_frame;
+       /*
+        * Find the next leaf page by incrementing the frame pointer.
+        * If we run out of entries in the interior node, loop around and
+@@ -482,14 +650,26 @@
+        * nodes need to be read.
+        */
+       while (1) {
+-              if (++(p->at) < p->entries + dx_get_count(p->entries))
++              do_corr(schedule());
++              dx_lock_bh(p->bh);
++              p->at = iam_entry_shift(path, p->at, +1);
++              if (p->at < iam_entry_shift(path, p->entries,
++                                          dx_get_count(p->entries))) {
++                      p->leaf = dx_get_block(path, p->at);
++                      dx_unlock_bh(p->bh);
+                       break;
+-              if (p == frames)
++              }
++              dx_unlock_bh(p->bh);
++              if (p == path->ip_frames)
+                       return 0;
+               num_frames++;
+-              p--;
++              --p;
+       }
++      if (compat) {
++              /*
++               * Htree hash magic.
++               */
+       /*
+        * If the hash is 1, then continue only if the next page has a
+        * continuation hash of any value.  This is used for readdir
+@@ -497,30 +677,146 @@
+        * desired contiuation hash.  If it doesn't, return since
+        * there's no point to read in the successive index pages.
+        */
+-      bhash = dx_get_hash(p->at);
++              iam_get_ikey(path, p->at, (struct iam_ikey *)&bhash);
+       if (start_hash)
+               *start_hash = bhash;
+       if ((hash & 1) == 0) {
+               if ((bhash & ~1) != hash)
+                       return 0;
+       }
++      }
+       /*
+        * If the hash is HASH_NB_ALWAYS, we always go to the next
+        * block so no check is necessary
+        */
+       while (num_frames--) {
+-              if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
+-                                    0, &err)))
++              iam_ptr_t idx;
++
++              do_corr(schedule());
++              dx_lock_bh(p->bh);
++              idx = p->leaf = dx_get_block(path, p->at);
++              dx_unlock_bh(p->bh);
++              err = iam_path_descr(path)->id_ops->
++                      id_node_read(path->ip_container, idx, NULL, &bh);
++              if (err != 0)
+                       return err; /* Failure */
+-              p++;
+-              brelse (p->bh);
++              ++p;
++              brelse(p->bh);
++              assert_corr(p->bh != bh);
+               p->bh = bh;
+-              p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
++              p->entries = dx_node_get_entries(path, p);
++              p->at = iam_entry_shift(path, p->entries, !compat);
++              assert_corr(p->curidx != idx);
++              p->curidx = idx;
++              dx_lock_bh(p->bh);
++              assert_corr(p->leaf != dx_get_block(path, p->at));
++              p->leaf = dx_get_block(path, p->at);
++              dx_unlock_bh(p->bh);
++              assert_inv(dx_node_check(path, p));
+       }
+       return 1;
+ }
+-
++int iam_index_lock(struct iam_path *path, struct dynlock_handle **lh)
++{
++      struct iam_frame *f;
++
++      for (f = path->ip_frame; f >= path->ip_frames; --f, ++lh) {
++              do_corr(schedule());
++              *lh = dx_lock_htree(iam_path_obj(path), f->curidx, DLT_READ);
++              if (*lh == NULL)
++                      return -ENOMEM;
++      }
++      return 0;
++}
++
++static int iam_index_advance(struct iam_path *path)
++{
++      return ext3_htree_advance(iam_path_obj(path), 0, path, NULL, 0);
++}
++
++/*
++ * Advance index part of @path to point to the next leaf. Returns 1 on
++ * success, 0 when end of container was reached. Leaf node is locked.
++ */
++int iam_index_next(struct iam_container *c, struct iam_path *path)
++{
++      iam_ptr_t cursor;
++      struct dynlock_handle *lh[DX_MAX_TREE_HEIGHT] = { 0, };
++      int result;
++      struct inode *object;
++
++      /*
++       * Locking for iam_index_next()... is to be described.
++       */
++
++      object = c->ic_object;
++      cursor = path->ip_frame->leaf;
++
++      while (1) {
++              result = iam_index_lock(path, lh);
++              do_corr(schedule());
++              if (result < 0)
++                      break;
++              
++              result = dx_check_full_path(path, 0);
++              if (result == 0 && cursor == path->ip_frame->leaf) {
++                      result = iam_index_advance(path);
++
++                      assert_corr(result == 0 ||
++                                  cursor != path->ip_frame->leaf);
++                      break;
++              }
++              do {
++                      dx_unlock_array(object, lh);
++
++                      iam_path_release(path);
++                      do_corr(schedule());
++
++                      result = dx_lookup(path);
++                      if (result < 0)
++                              break;
++
++                      while (path->ip_frame->leaf != cursor) {
++                              do_corr(schedule());
++
++                              result = iam_index_lock(path, lh);
++                              do_corr(schedule());
++                              if (result < 0)
++                                      break;
++
++                              result = dx_check_full_path(path, 0);
++                              if (result != 0)
++                                      break;
++
++                              result = iam_index_advance(path);
++                              if (result == 0) {
++                                      ext3_error(object->i_sb, __FUNCTION__,
++                                                 "cannot find cursor: %u\n",
++                                                 cursor);
++                                      result = -EIO;
++                              }
++                              if (result < 0)
++                                      break;
++                              result = dx_check_full_path(path, 0);
++                              if (result != 0)
++                                      break;
++                              dx_unlock_array(object, lh);
++                      }
++              } while (result == -EAGAIN);
++              if (result < 0)
++                      break;
++      }
++      dx_unlock_array(object, lh);
++      return result;
++}
++
++int ext3_htree_next_block(struct inode *dir, __u32 hash,
++                        struct iam_path *path, __u32 *start_hash)
++{
++      return ext3_htree_advance(dir, hash, path, start_hash, 1);
++}
++
+ /*
+  * p is at least 6 bytes before the end of page
+  */
+@@ -593,7 +889,8 @@
+ {
+       struct dx_hash_info hinfo;
+       struct ext3_dir_entry_2 *de;
+-      struct dx_frame frames[2], *frame;
++      struct iam_path_compat cpath;
++      struct iam_path *path = &cpath.ipc_path;
+       struct inode *dir;
+       int block, err;
+       int count = 0;
+@@ -603,6 +900,7 @@
+       dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
+                      start_minor_hash));
+       dir = dir_file->f_dentry->d_inode;
++      iam_path_compat_init(&cpath, dir);
+       if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
+               hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+               hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+@@ -613,19 +911,19 @@
+       }
+       hinfo.hash = start_hash;
+       hinfo.minor_hash = 0;
+-      frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
+-      if (!frame)
++      err = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, path);
++      if (err != 0)
+               return err;
+       /* Add '.' and '..' from the htree header */
+       if (!start_hash && !start_minor_hash) {
+-              de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
++              de = (struct ext3_dir_entry_2 *) path->ip_frames[0].bh->b_data;
+               if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
+                       goto errout;
+               count++;
+       }
+       if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
+-              de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
++              de = (struct ext3_dir_entry_2 *) path->ip_frames[0].bh->b_data;
+               de = ext3_next_entry(de);
+               if ((err = ext3_htree_store_dirent(dir_file, 2, 0, de)) != 0)
+                       goto errout;
+@@ -633,7 +931,7 @@
+       }
+       while (1) {
+-              block = dx_get_block(frame->at);
++              block = dx_get_block(path, path->ip_frame->at);
+               ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
+                                            start_hash, start_minor_hash);
+               if (ret < 0) {
+@@ -642,8 +940,8 @@
+               }
+               count += ret;
+               hashval = ~0;
+-              ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, 
+-                                          frame, frames, &hashval);
++              ret = ext3_htree_next_block(dir,
++                                          HASH_NB_ALWAYS, path, &hashval);
+               *next_hash = hashval;
+               if (ret < 0) {
+                       err = ret;
+@@ -658,12 +956,12 @@
+                   (count && ((hashval & 1) == 0)))
+                       break;
+       }
+-      dx_release(frames);
+-      dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", 
++      iam_path_fini(path);
++      dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
+                      count, *next_hash));
+       return count;
+ errout:
+-      dx_release(frames);
++      iam_path_fini(path);
+       return (err);
+ }
+@@ -695,7 +1011,6 @@
+                       map_tail--;
+                       map_tail->hash = h.hash;
+                       map_tail->offs = (u16) ((char *) de - base);
+-                      map_tail->size = le16_to_cpu(de->rec_len);
+                       count++;
+                       cond_resched();
+               }
+@@ -723,19 +1021,45 @@
+       } while(more);
+ }
+-static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
++void iam_insert_key(struct iam_path *path, struct iam_frame *frame,
++                  const struct iam_ikey *key, iam_ptr_t ptr)
+ {
+-      struct dx_entry *entries = frame->entries;
+-      struct dx_entry *old = frame->at, *new = old + 1;
++      struct iam_entry *entries = frame->entries;
++      struct iam_entry *new = iam_entry_shift(path, frame->at, +1);
+       int count = dx_get_count(entries);
+-      assert(count < dx_get_limit(entries));
+-      assert(old < entries + count);
+-      memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
+-      dx_set_hash(new, hash);
+-      dx_set_block(new, block);
++      /*
++       * Unfortunately we cannot assert this, as this function is sometimes
++       * called by VFS under i_sem and without pdirops lock.
++       */
++      assert_corr(1 || iam_frame_is_locked(path, frame));
++      assert_corr(count < dx_get_limit(entries));
++      assert_corr(frame->at < iam_entry_shift(path, entries, count));
++      assert_inv(dx_node_check(path, frame));
++
++      memmove(iam_entry_shift(path, new, 1), new,
++              (char *)iam_entry_shift(path, entries, count) - (char *)new);
++      dx_set_ikey(path, new, key);
++      dx_set_block(path, new, ptr);
+       dx_set_count(entries, count + 1);
++      assert_inv(dx_node_check(path, frame));
++}
++
++void iam_insert_key_lock(struct iam_path *path, struct iam_frame *frame,
++                       const struct iam_ikey *key, iam_ptr_t ptr)
++{
++      dx_lock_bh(frame->bh);
++      iam_insert_key(path, frame, key, ptr);
++      dx_unlock_bh(frame->bh);
++}
++
++void dx_insert_block(struct iam_path *path, struct iam_frame *frame,
++                   u32 hash, u32 block)
++{
++      assert_corr(dx_index_is_compat(path));
++      iam_insert_key(path, frame, (struct iam_ikey *)&hash, block);
+ }
++
+ #endif
+@@ -934,7 +1258,11 @@
+       struct super_block * sb;
+       struct dx_hash_info     hinfo;
+       u32 hash;
+-      struct dx_frame frames[2], *frame;
++      struct iam_path_compat cpath;
++      struct iam_path *path = &cpath.ipc_path;
++      struct iam_entry_compat dummy_dot = {
++              .block = 0
++      };
+       struct ext3_dir_entry_2 *de, *top;
+       struct buffer_head *bh;
+       unsigned long block;
+@@ -943,21 +1271,25 @@
+       const u8 *name = dentry->d_name.name;
+       struct inode *dir = dentry->d_parent->d_inode;
++      iam_path_compat_init(&cpath, dir);
++
+       sb = dir->i_sb;
+       /* NFS may look up ".." - look at dx_root directory block */
+       if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
+-              if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err)))
++              *err = dx_probe(&dentry->d_name, NULL, &hinfo, path);
++              if (*err != 0)
+                       return NULL;
+       } else {
+-              frame = frames;
+-              frame->bh = NULL;                       /* for dx_release() */
+-              frame->at = (struct dx_entry *)frames;  /* hack for zero entry*/
+-              dx_set_block(frame->at, 0);             /* dx_root block is 0 */
++              path->ip_frame->bh = NULL;              /* for iam_path_fini() */
++              path->ip_frame->at = (void *)&dummy_dot;/* hack for zero entry*/
+       }
+       hash = hinfo.hash;
+       do {
+-              block = dx_get_block(frame->at);
+-              if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
++              block = dx_get_block(path, path->ip_frame->at);
++              *err = iam_path_descr(path)->id_ops->id_node_read(path->ip_container,
++                                                        (iam_ptr_t)block,
++                                                   NULL, &bh);
++              if (*err != 0)
+                       goto errout;
+               de = (struct ext3_dir_entry_2 *) bh->b_data;
+               top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
+@@ -972,13 +1304,12 @@
+                               goto errout;
+                       }
+                       *res_dir = de;
+-                      dx_release (frames);
++                      iam_path_fini(path);
+                       return bh;
+               }
+               brelse (bh);
+               /* Check to see if we should continue to search */
+-              retval = ext3_htree_next_block(dir, hash, frame,
+-                                             frames, NULL);
++              retval = ext3_htree_next_block(dir, hash, path, NULL);
+               if (retval < 0) {
+                       ext3_warning(sb, __FUNCTION__,
+                            "error reading index page in directory #%lu",
+@@ -991,7 +1322,7 @@
+       *err = -ENOENT;
+ errout:
+       dxtrace(printk("%s not found\n", name));
+-      dx_release (frames);
++      iam_path_fini(path);
+       return NULL;
+ }
+ #endif
+@@ -1124,19 +1455,69 @@
+  * Allocate a new block, and move entries so that they are approx. equally full.
+  * Returns pointer to de in block into which the new entry will be inserted.
+  */
+-static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
+-                      struct buffer_head **bh,struct dx_frame *frame,
+-                      struct dx_hash_info *hinfo, int *error)
++struct ext3_dir_entry_2 *move_entries(struct inode *dir,
++                                    struct dx_hash_info *hinfo,
++                                    struct buffer_head **bh1,
++                                    struct buffer_head **bh2,
++                                    __u32 *delim_hash)
+ {
++      char *data1;
++      char *data2;
+       unsigned blocksize = dir->i_sb->s_blocksize;
+-      unsigned count, continued;
++      unsigned count;
++      unsigned continued;
++      unsigned split;
++      u32 hash2;
++
++      struct dx_map_entry     *map;
++      struct ext3_dir_entry_2 *de1;
++      struct ext3_dir_entry_2 *de2;
++
++      data1 = (*bh1)->b_data;
++      data2 = (*bh2)->b_data;
++
++      /* create map in the end of data2 block */
++      map = (struct dx_map_entry *) (data2 + blocksize);
++      count = dx_make_map((struct ext3_dir_entry_2 *) data1,
++                          blocksize, hinfo, map);
++      map -= count;
++      split = count/2; // need to adjust to actual middle
++      dx_sort_map(map, count);
++      hash2 = map[split].hash;
++      continued = hash2 == map[split - 1].hash;
++      dxtrace(printk("Split block %i at %x, %i/%i\n",
++              frame->leaf, hash2, split, count - split));
++
++      /* Fancy dance to stay within two buffers */
++      de2 = dx_move_dirents(data1, data2, map + split, count - split);
++      de1 = dx_pack_dirents(data1, blocksize);
++      de1->rec_len = cpu_to_le16(data1 + blocksize - (char *) de1);
++      de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
++      dxtrace(dx_show_leaf(hinfo,
++                           (struct ext3_dir_entry_2 *) data1, blocksize, 1));
++      dxtrace(dx_show_leaf(hinfo,
++                           (struct ext3_dir_entry_2 *) data2, blocksize, 1));
++
++      /* Which block gets the new entry? */
++      if (hinfo->hash >= hash2) {
++              swap(*bh1, *bh2);
++              de1 = de2;
++      }
++      *delim_hash = hash2 + continued;
++      return de1;
++}
++
++/* Allocate new node, and split leaf node @bh into it, inserting new pointer
++ * into parent node identified by @frame */
++static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct iam_path *path,
++                      struct buffer_head **bh,struct iam_frame *frame,
++                      struct dx_hash_info *hinfo, int *error)
++{
++      struct inode *dir = iam_path_obj(path);
+       struct buffer_head *bh2;
+       u32 newblock;
+       u32 hash2;
+-      struct dx_map_entry *map;
+-      char *data1 = (*bh)->b_data, *data2;
+-      unsigned split, move, size, i;
+-      struct ext3_dir_entry_2 *de = NULL, *de2;
++      struct ext3_dir_entry_2 *de = NULL;
+       int     err;
+       bh2 = ext3_append (handle, dir, &newblock, error);
+@@ -1161,46 +1542,9 @@
+       if (err)
+               goto journal_error;
+-      data2 = bh2->b_data;
+-
+-      /* create map in the end of data2 block */
+-      map = (struct dx_map_entry *) (data2 + blocksize);
+-      count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
+-                           blocksize, hinfo, map);
+-      map -= count;
+-      dx_sort_map (map, count);
+-      /* Split the existing block in the middle, size-wise */
+-      size = 0;
+-      move = 0;
+-      for (i = count-1; i >= 0; i--) {
+-              /* is more than half of this entry in 2nd half of the block? */
+-              if (size + map[i].size/2 > blocksize/2)
+-                      break;
+-              size += map[i].size;
+-              move++;
+-      }
+-      /* map index at which we will split */
+-      split = count - move;
+-      hash2 = map[split].hash;
+-      continued = hash2 == map[split - 1].hash;
+-      dxtrace(printk("Split block %i at %x, %i/%i\n",
+-              dx_get_block(frame->at), hash2, split, count-split));
+-
+-      /* Fancy dance to stay within two buffers */
+-      de2 = dx_move_dirents(data1, data2, map + split, count - split);
+-      de = dx_pack_dirents(data1,blocksize);
+-      de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
+-      de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
+-      dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
+-      dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
++      de = move_entries(dir, hinfo, bh, &bh2, &hash2);
+-      /* Which block gets the new entry? */
+-      if (hinfo->hash >= hash2)
+-      {
+-              swap(*bh, bh2);
+-              de = de2;
+-      }
+-      dx_insert_block (frame, hash2 + continued, newblock);
++      dx_insert_block(path, frame, hash2, newblock);
+       err = ext3_journal_dirty_metadata (handle, bh2);
+       if (err)
+               goto journal_error;
+@@ -1203,6 +1558,63 @@
+ }
+ #endif
++struct ext3_dir_entry_2 *find_insertion_point(struct inode *dir,
++                                            struct buffer_head *bh,
++                                            const char *name, int namelen)
++{
++      struct ext3_dir_entry_2 *de;
++      char *top;
++      unsigned long offset;
++      int nlen;
++      int rlen;
++      int reclen;
++
++      reclen = EXT3_DIR_REC_LEN(namelen);
++      de = (struct ext3_dir_entry_2 *)bh->b_data;
++      top = bh->b_data + dir->i_sb->s_blocksize - reclen;
++      offset = 0;
++      while ((char *) de <= top) {
++              if (!ext3_check_dir_entry("ext3_add_entry",
++                                        dir, de, bh, offset))
++                      return ERR_PTR(-EIO);
++              if (ext3_match(namelen, name, de))
++                      return ERR_PTR(-EEXIST);
++              nlen = EXT3_DIR_REC_LEN(de->name_len);
++              rlen = le16_to_cpu(de->rec_len);
++              if ((de->inode? rlen - nlen: rlen) >= reclen)
++                      return de;
++              de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
++              offset += rlen;
++      }
++      return ERR_PTR(-ENOSPC);
++}
++
++struct ext3_dir_entry_2 *split_entry(struct inode *dir,
++                                   struct ext3_dir_entry_2 *de,
++                                   unsigned long ino, mode_t mode,
++                                   const char *name, int namelen)
++{
++      int nlen;
++      int rlen;
++
++      nlen = EXT3_DIR_REC_LEN(de->name_len);
++      rlen = le16_to_cpu(de->rec_len);
++      if (de->inode) {
++              struct ext3_dir_entry_2 *de1;
++
++              de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
++              de1->rec_len = cpu_to_le16(rlen - nlen);
++              de->rec_len = cpu_to_le16(nlen);
++              de = de1;
++      }
++      de->file_type = EXT3_FT_UNKNOWN;
++      de->inode = cpu_to_le32(ino);
++      if (ino != 0)
++              ext3_set_de_type(dir->i_sb, de, mode);
++      de->name_len = namelen;
++      memcpy(de->name, name, namelen);
++      return de;
++}
+ /*
+  * Add a new entry into a directory (leaf) block.  If de is non-NULL,
+@@ -1222,34 +1634,16 @@
+       struct inode    *dir = dentry->d_parent->d_inode;
+       const char      *name = dentry->d_name.name;
+       int             namelen = dentry->d_name.len;
+-      unsigned long   offset = 0;
+-      unsigned short  reclen;
+-      int             nlen, rlen, err;
+-      char            *top;
++      int             err;
+-      reclen = EXT3_DIR_REC_LEN(namelen);
+       if (!de) {
+-              de = (struct ext3_dir_entry_2 *)bh->b_data;
+-              top = bh->b_data + dir->i_sb->s_blocksize - reclen;
+-              while ((char *) de <= top) {
+-                      if (!ext3_check_dir_entry("ext3_add_entry", dir, de,
+-                                                bh, offset)) {
+-                              brelse (bh);
+-                              return -EIO;
+-                      }
+-                      if (ext3_match (namelen, name, de)) {
+-                              brelse (bh);
+-                              return -EEXIST;
+-                      }
+-                      nlen = EXT3_DIR_REC_LEN(de->name_len);
+-                      rlen = le16_to_cpu(de->rec_len);
+-                      if ((de->inode? rlen - nlen: rlen) >= reclen)
+-                              break;
+-                      de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
+-                      offset += rlen;
++              de = find_insertion_point(dir, bh, name, namelen);
++              if (IS_ERR(de)) {
++                      err = PTR_ERR(de);
++                      if (err != -ENOSPC)
++                              brelse(bh);
++                      return err;
+               }
+-              if ((char *) de > top)
+-                      return -ENOSPC;
+       }
+       BUFFER_TRACE(bh, "get_write_access");
+       err = ext3_journal_get_write_access(handle, bh);
+@@ -1260,22 +1654,9 @@
+       }
+       /* By now the buffer is marked for journaling */
+-      nlen = EXT3_DIR_REC_LEN(de->name_len);
+-      rlen = le16_to_cpu(de->rec_len);
+-      if (de->inode) {
+-              struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
+-              de1->rec_len = cpu_to_le16(rlen - nlen);
+-              de->rec_len = cpu_to_le16(nlen);
+-              de = de1;
+-      }
+-      de->file_type = EXT3_FT_UNKNOWN;
+-      if (inode) {
+-              de->inode = cpu_to_le32(inode->i_ino);
+-              ext3_set_de_type(dir->i_sb, de, inode->i_mode);
+-      } else
+-              de->inode = 0;
+-      de->name_len = namelen;
+-      memcpy (de->name, name, namelen);
++
++      split_entry(dir, de, inode ? inode->i_ino : 0,
++                  inode ? inode->i_mode : 0, name, namelen);
+       /*
+        * XXX shouldn't update any times until successful
+        * completion of syscall, but too many callers depend
+@@ -1304,6 +1685,7 @@
+  * This converts a one block unindexed directory to a 3 block indexed
+  * directory, and adds the dentry to the indexed directory.
+  */
++extern int user_selected_hash_function;
+ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
+                           struct inode *inode, struct buffer_head *bh)
+ {
+@@ -1312,8 +1694,9 @@
+       int             namelen = dentry->d_name.len;
+       struct buffer_head *bh2;
+       struct dx_root  *root;
+-      struct dx_frame frames[2], *frame;
+-      struct dx_entry *entries;
++      struct iam_path_compat cpath;
++      struct iam_path *path = &cpath.ipc_path;
++      struct iam_entry *entries;
+       struct ext3_dir_entry_2 *de, *de2;
+       char            *data1, *top;
+       unsigned        len;
+@@ -1323,6 +1706,7 @@
+       u32             block;
+       struct fake_dirent *fde;
++      iam_path_compat_init(&cpath, dir);
+       blocksize =  dir->i_sb->s_blocksize;
+       dxtrace(printk("Creating index\n"));
+       retval = ext3_journal_get_write_access(handle, bh);
+@@ -1357,23 +1741,25 @@
+       memset (&root->info, 0, sizeof(root->info));
+       root->info.info_length = sizeof(root->info);
+       root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+-      entries = root->entries;
+-      dx_set_block (entries, 1);
++      if (user_selected_hash_function >= 0 &&
++          user_selected_hash_function <= DX_HASH_MAX)
++              root->info.hash_version = user_selected_hash_function;
++      entries = (void *)root->entries;
++      dx_set_block (path, entries, 1);
+       dx_set_count (entries, 1);
+-      dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
++      dx_set_limit (entries, dx_root_limit(path));
+       /* Initialize as for dx_probe */
+       hinfo.hash_version = root->info.hash_version;
+       hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+       ext3fs_dirhash(name, namelen, &hinfo);
+-      frame = frames;
+-      frame->entries = entries;
+-      frame->at = entries;
+-      frame->bh = bh;
++      path->ip_frame->entries = entries;
++      path->ip_frame->at = entries;
++      path->ip_frame->bh = bh;
+       bh = bh2;
+-      de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
+-      dx_release (frames);
+-      if (!(de))
++      de = do_split(handle, path, &bh, path->ip_frame, &hinfo, &retval);
++      iam_path_fini(path);
++      if (!de)
+               return retval;
+       return add_dirent_to_buf(handle, dentry, inode, de, bh);
+@@ -1444,139 +1830,384 @@
+       return add_dirent_to_buf(handle, dentry, inode, de, bh);
+ }
++static int shift_entries(struct iam_path *path,
++                       struct iam_frame *frame, unsigned count,
++                       struct iam_entry *entries, struct iam_entry *entries2,
++                       u32 newblock)
++{
++      unsigned count1;
++      unsigned count2;
++      int delta;
++
++      struct iam_frame *parent = frame - 1;
++      struct iam_ikey *pivot = iam_path_ikey(path, 3);
++
++      delta = dx_index_is_compat(path) ? 0 : +1;
++
++      count1 = count/2 + delta;
++      count2 = count - count1;
++      iam_get_ikey(path, iam_entry_shift(path, entries, count1), pivot);
++
++      dxtrace(printk("Split index %i/%i\n", count1, count2));
++
++      memcpy((char *) iam_entry_shift(path, entries2, delta),
++             (char *) iam_entry_shift(path, entries, count1),
++             count2 * iam_entry_size(path));
++
++      dx_set_count(entries2, count2 + delta);
++      dx_set_limit(entries2, dx_node_limit(path));
++
++      /*
++       * NOTE: very subtle piece of code: a competing dx_probe() may find 2nd
++       * level index in root index, then we insert new index here and set
++       * new count in that 2nd level index. so, dx_probe() may see 2nd level
++       * index w/o hash it looks for. the solution is to check root index
++       * after we have locked the just-found 2nd level index -bzzz
++       */
++      iam_insert_key_lock(path, parent, pivot, newblock);
++
++      /*
++       * now old and new 2nd level index blocks contain all pointers, so
++       * dx_probe() may find it in the both.  it's OK -bzzz
++       */
++      dx_lock_bh(frame->bh);
++      dx_set_count(entries, count1);
++      dx_unlock_bh(frame->bh);
++
++      /*
++       * now old 2nd level index block points to first half of leaves. it's
++       * important that dx_probe() must check root index block for changes
++       * under dx_lock_bh(frame->bh) -bzzz
++       */
++
++      return count1;
++}
++
+ #ifdef CONFIG_EXT3_INDEX
+-/*
+- * Returns 0 for success, or a negative error value
+- */
+-static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+-                           struct inode *inode)
++int split_index_node(handle_t *handle, struct iam_path *path,
++                   struct dynlock_handle **lh)
+ {
+-      struct dx_frame frames[2], *frame;
+-      struct dx_entry *entries, *at;
+-      struct dx_hash_info hinfo;
+-      struct buffer_head * bh;
+-      struct inode *dir = dentry->d_parent->d_inode;
+-      struct super_block * sb = dir->i_sb;
+-      struct ext3_dir_entry_2 *de;
+-      int err;
+-      frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
+-      if (!frame)
+-              return err;
+-      entries = frame->entries;
+-      at = frame->at;
++      struct iam_entry *entries;   /* old block contents */
++      struct iam_entry *entries2;  /* new block contents */
++      struct iam_frame *frame, *safe;
++      struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0};
++      u32 newblock[DX_MAX_TREE_HEIGHT] = {0};
++      struct dynlock_handle *lock[DX_MAX_TREE_HEIGHT] = {NULL,};
++      struct dynlock_handle *new_lock[DX_MAX_TREE_HEIGHT] = {NULL,};
++      struct inode *dir = iam_path_obj(path);
++      struct iam_descr *descr;
++      int nr_splet;
++      int i, err;
+-      if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
+-              goto cleanup;
++      descr = iam_path_descr(path);
++      /*
++       * Algorithm below depends on this.
++       */
++      assert_corr(dx_root_limit(path) < dx_node_limit(path));
+-      BUFFER_TRACE(bh, "get_write_access");
+-      err = ext3_journal_get_write_access(handle, bh);
+-      if (err)
+-              goto journal_error;
++      frame = path->ip_frame;
++      entries = frame->entries;
+-      err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+-      if (err != -ENOSPC) {
+-              bh = NULL;
+-              goto cleanup;
+-      }
++      /*
++       * Tall-tree handling: we might have to split multiple index blocks
++       * all the way up to tree root. Tricky point here is error handling:
++       * to avoid complicated undo/rollback we
++       *
++       *   - first allocate all necessary blocks
++       *
++       *   - insert pointers into them atomically.
++       */
++
++      /*
++       * Locking: leaf is already locked. htree-locks are acquired on all
++       * index nodes that require split bottom-to-top, on the "safe" node,
++       * and on all new nodes
++       */
+-      /* Block full, should compress but for now just split */
+       dxtrace(printk("using %u of %u node entries\n",
+                      dx_get_count(entries), dx_get_limit(entries)));
+-      /* Need to split index? */
+-      if (dx_get_count(entries) == dx_get_limit(entries)) {
+-              u32 newblock;
+-              unsigned icount = dx_get_count(entries);
+-              int levels = frame - frames;
+-              struct dx_entry *entries2;
+-              struct dx_node *node2;
+-              struct buffer_head *bh2;
+-              if (levels && (dx_get_count(frames->entries) ==
+-                             dx_get_limit(frames->entries))) {
+-                      ext3_warning(sb, __FUNCTION__,
+-                                   "Directory index full!");
++      /* What levels need split? */
++      for (nr_splet = 0; frame >= path->ip_frames &&
++           dx_get_count(frame->entries) == dx_get_limit(frame->entries);
++           --frame, ++nr_splet) {
++              do_corr(schedule());
++              if (nr_splet == DX_MAX_TREE_HEIGHT) {
++                      ext3_warning(dir->i_sb, __FUNCTION__,
++                                   "Directory index full!\n");
+                       err = -ENOSPC;
+                       goto cleanup;
+               }
+-              bh2 = ext3_append (handle, dir, &newblock, &err);
+-              if (!(bh2))
++      }
++
++      safe = frame;
++
++      /*
++       * Lock all nodes, bottom to top.
++       */
++      for (frame = path->ip_frame, i = nr_splet; i >= 0; --i, --frame) {
++              do_corr(schedule());
++              lock[i] = dx_lock_htree(dir, frame->curidx, DLT_WRITE);
++              if (lock[i] == NULL) {
++                      err = -ENOMEM;
++                      goto cleanup;
++              }
++      }
++
++      /*
++       * Check for concurrent index modification.
++       */
++      err = dx_check_full_path(path, 1);
++      if (err)
++              goto cleanup;
++      /*
++       * And check that the same number of nodes is to be split.
++       */
++      for (i = 0, frame = path->ip_frame; frame >= path->ip_frames &&
++           dx_get_count(frame->entries) == dx_get_limit(frame->entries);
++           --frame, ++i) {
++              ;
++      }
++      if (i != nr_splet) {
++              err = -EAGAIN;
++              goto cleanup;
++      }
++
++      /* Go back down, allocating blocks, locking them, and adding into
++       * transaction... */
++      for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
++              bh_new[i] = ext3_append (handle, dir, &newblock[i], &err);
++              do_corr(schedule());
++              if (!bh_new[i] ||
++                  descr->id_ops->id_node_init(path->ip_container,
++                                              bh_new[i], 0) != 0)
++                      goto cleanup;
++              new_lock[i] = dx_lock_htree(dir, newblock[i], DLT_WRITE);
++              if (new_lock[i] == NULL) {
++                      err = -ENOMEM;
+                       goto cleanup;
+-              node2 = (struct dx_node *)(bh2->b_data);
+-              entries2 = node2->entries;
+-              node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
+-              node2->fake.inode = 0;
++              }
++              do_corr(schedule());
+               BUFFER_TRACE(frame->bh, "get_write_access");
+               err = ext3_journal_get_write_access(handle, frame->bh);
+               if (err)
+                       goto journal_error;
+-              if (levels) {
+-                      unsigned icount1 = icount/2, icount2 = icount - icount1;
+-                      unsigned hash2 = dx_get_hash(entries + icount1);
+-                      dxtrace(printk("Split index %i/%i\n", icount1, icount2));
+-
+-                      BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
+-                      err = ext3_journal_get_write_access(handle,
+-                                                           frames[0].bh);
++      }
++      /* Add "safe" node to transaction too */
++      if (safe + 1 != path->ip_frames) {
++              do_corr(schedule());
++              err = ext3_journal_get_write_access(handle, safe->bh);
++              if (err)
++                      goto journal_error;
++      }
++
++      /* Go through nodes once more, inserting pointers */
++      for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
++              unsigned count;
++              int idx;
++              struct buffer_head *bh2;
++              struct buffer_head *bh;
++
++              entries = frame->entries;
++              count = dx_get_count(entries);
++              idx = iam_entry_diff(path, frame->at, entries);
++
++              bh2 = bh_new[i];
++              entries2 = dx_get_entries(path, bh2->b_data, 0);
++
++              bh = frame->bh;
++              if (frame == path->ip_frames) {
++                      /* splitting root node. Tricky point:
++                       *
++                       * In the "normal" B-tree we'd split root *and* add
++                       * new root to the tree with pointers to the old root
++                       * and its sibling (thus introducing two new nodes).
++                       *
++                       * In htree it's enough to add one node, because
++                       * capacity of the root node is smaller than that of
++                       * non-root one.
++                       */
++                      struct iam_frame *frames;
++                      struct iam_entry *next;
++
++                      assert_corr(i == 0);
++
++                      do_corr(schedule());
++
++                      frames = path->ip_frames;
++                      memcpy((char *) entries2, (char *) entries,
++                             count * iam_entry_size(path));
++                      dx_set_limit(entries2, dx_node_limit(path));
++
++                      /* Set up root */
++                      dx_lock_bh(frame->bh);
++                      next = descr->id_ops->id_root_inc(path->ip_container,
++                                                        path, frame);
++                      dx_set_block(path, next, newblock[0]);
++                      dx_unlock_bh(frame->bh);
++
++                      do_corr(schedule());
++                      /* Shift frames in the path */
++                      memmove(frames + 2, frames + 1,
++                              (sizeof path->ip_frames) - 2 * sizeof frames[0]);
++                      /* Add new access path frame */
++                      frames[1].at = iam_entry_shift(path, entries2, idx);
++                      frames[1].entries = entries = entries2;
++                      frames[1].bh = bh2;
++                      assert_inv(dx_node_check(path, frame));
++                      ++ path->ip_frame;
++                      ++ frame;
++                      assert_inv(dx_node_check(path, frame));
++                      bh_new[0] = NULL; /* buffer head is "consumed" */
++                      err = ext3_journal_get_write_access(handle, bh2);
+                       if (err)
+                               goto journal_error;
++                      do_corr(schedule());
++              } else {
++                      /* splitting non-root index node. */
++                      struct iam_frame *parent = frame - 1;
+-                      memcpy ((char *) entries2, (char *) (entries + icount1),
+-                              icount2 * sizeof(struct dx_entry));
+-                      dx_set_count (entries, icount1);
+-                      dx_set_count (entries2, icount2);
+-                      dx_set_limit (entries2, dx_node_limit(dir));
+-
++                      do_corr(schedule());
++                      count = shift_entries(path, frame, count,
++                                            entries, entries2, newblock[i]);
+                       /* Which index block gets the new entry? */
+-                      if (at - entries >= icount1) {
+-                              frame->at = at = at - entries - icount1 + entries2;
++                      if (idx >= count) {
++                              int d = dx_index_is_compat(path) ? 0 : +1;
++
++                              frame->at = iam_entry_shift(path, entries2,
++                                                          idx - count + d);
+                               frame->entries = entries = entries2;
++                              frame->curidx = newblock[i];
+                               swap(frame->bh, bh2);
++                              assert_corr(lock[i + 1] != NULL);
++                              assert_corr(new_lock[i] != NULL);
++                              swap(lock[i + 1], new_lock[i]);
++                              bh_new[i] = bh2;
++                              parent->at = iam_entry_shift(path,
++                                                           parent->at, +1);
+                       }
+-                      dx_insert_block (frames + 0, hash2, newblock);
+-                      dxtrace(dx_show_index ("node", frames[1].entries));
++                      assert_inv(dx_node_check(path, frame));
++                      assert_inv(dx_node_check(path, parent));
++                      dxtrace(dx_show_index ("node", frame->entries));
+                       dxtrace(dx_show_index ("node",
+                              ((struct dx_node *) bh2->b_data)->entries));
+                       err = ext3_journal_dirty_metadata(handle, bh2);
+                       if (err)
+                               goto journal_error;
+-                      brelse (bh2);
+-              } else {
+-                      dxtrace(printk("Creating second level index...\n"));
+-                      memcpy((char *) entries2, (char *) entries,
+-                             icount * sizeof(struct dx_entry));
+-                      dx_set_limit(entries2, dx_node_limit(dir));
+-
+-                      /* Set up root */
+-                      dx_set_count(entries, 1);
+-                      dx_set_block(entries + 0, newblock);
+-                      ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
+-
+-                      /* Add new access path frame */
+-                      frame = frames + 1;
+-                      frame->at = at = at - entries + entries2;
+-                      frame->entries = entries = entries2;
+-                      frame->bh = bh2;
+-                      err = ext3_journal_get_write_access(handle,
+-                                                           frame->bh);
++                      do_corr(schedule());
++                      err = ext3_journal_dirty_metadata(handle, parent->bh);
+                       if (err)
+                               goto journal_error;
+               }
+-              ext3_journal_dirty_metadata(handle, frames[0].bh);
++              do_corr(schedule());
++              err = ext3_journal_dirty_metadata(handle, bh);
++              if (err)
++                      goto journal_error;
++      }
++              /*
++               * This function was called to make insertion of new leaf
++               * possible. Check that it fulfilled its obligations.
++               */
++              assert_corr(dx_get_count(path->ip_frame->entries) <
++                          dx_get_limit(path->ip_frame->entries));
++      assert_corr(lock[nr_splet] != NULL);
++      *lh = lock[nr_splet];
++      lock[nr_splet] = NULL;
++      if (nr_splet > 0) {
++              /*
++               * Log ->i_size modification.
++               */
++              err = ext3_mark_inode_dirty(handle, dir);
++              if (err)
++                      goto journal_error;
++      }
++      goto cleanup;
++journal_error:
++      ext3_std_error(dir->i_sb, err);
++
++cleanup:
++      dx_unlock_array(dir, lock);
++      dx_unlock_array(dir, new_lock);
++
++      assert_corr(err || iam_frame_is_locked(path, path->ip_frame));
++
++      do_corr(schedule());
++      for (i = 0; i < ARRAY_SIZE(bh_new); ++i) {
++              if (bh_new[i] != NULL)
++                      brelse(bh_new[i]);
++      }
++      return err;
++}
++
++/*
++ * Returns 0 for success, or a negative error value
++ */
++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
++                           struct inode *inode)
++{
++      struct iam_path_compat cpath;
++      struct iam_path *path = &cpath.ipc_path;
++      struct iam_descr *param;
++      struct iam_frame *frame;
++      struct dx_hash_info hinfo;
++      struct buffer_head * bh = NULL;
++      struct inode *dir = dentry->d_parent->d_inode;
++      struct ext3_dir_entry_2 *de;
++      struct dynlock_handle *dummy = NULL;
++      int err;
++      size_t isize;
++
++      iam_path_compat_init(&cpath, dir);
++      param = iam_path_descr(path);
++
++      err = dx_probe(&dentry->d_name, NULL, &hinfo, path);
++      if (err != 0)
++              return err;
++      frame = path->ip_frame;
++
++      isize = dir->i_size;
++
++      err = param->id_ops->id_node_read(path->ip_container,
++                      (iam_ptr_t)dx_get_block(path, frame->at),
++                                handle, &bh);
++      if (err != 0)
++              goto cleanup;
++
++      BUFFER_TRACE(bh, "get_write_access");
++      err = ext3_journal_get_write_access(handle, bh);
++      if (err)
++              goto journal_error;
++
++      err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
++      if (err != -ENOSPC) {
++              bh = NULL;
++              goto cleanup;
+       }
+-      de = do_split(handle, dir, &bh, frame, &hinfo, &err);
++      
++      err = split_index_node(handle, path, &dummy);
++      if (err)
++              goto cleanup;   
++
++      /*copy split inode too*/
++      de = do_split(handle, path, &bh, path->ip_frame, &hinfo, &err);
+       if (!de)
+               goto cleanup;
++
++      assert_inv(dx_node_check(path, frame));
+       err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+-      bh = NULL;
+-      goto cleanup;
++      goto cleanup2;
+ journal_error:
+       ext3_std_error(dir->i_sb, err);
+ cleanup:
+       if (bh)
+               brelse(bh);
+-      dx_release(frames);
++cleanup2:
++      dx_unlock_htree(dir, dummy);
++      if (err)
++              inode->i_size = isize;
++      iam_path_fini(path);
+       return err;
+ }
+ #endif
+@@ -1678,6 +2309,26 @@
+       return ext3_new_inode(handle, dir, mode, inum);
+ }
++struct inode *ext3_create_inode(handle_t *handle, struct inode * dir, int mode)
++{
++      struct inode *inode;
++
++      inode = ext3_new_inode(handle, dir, mode, 0);
++      if (!IS_ERR(inode)) {
++              if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode)) {
++#ifdef CONFIG_LDISKFS_FS_XATTR
++                      inode->i_op = &ext3_special_inode_operations;
++#endif
++              } else {
++                      inode->i_op = &ext3_file_inode_operations;
++                      inode->i_fop = &ext3_file_operations;
++                      ext3_set_aops(inode);
++              }
++      }
++      return inode;
++}
++EXPORT_SYMBOL(ext3_create_inode);
++
+ /*
+  * By the time this is called, we already have created
+  * the directory cache entry for the new file, but it
+Index: linux-stage/fs/ext3/ioctl.c
+===================================================================
+--- linux-stage.orig/fs/ext3/ioctl.c   2007-11-26 23:09:03.000000000 +0300
++++ linux-stage/fs/ext3/ioctl.c        2007-11-26 23:09:06.000000000 +0300
+@@ -16,6 +16,7 @@
+ #include <asm/uaccess.h>
+ #include <linux/namei.h>
++#include <linux/lustre_iam.h>
+ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
+               unsigned long arg)
+@@ -275,6 +276,6 @@
+       default:
+-              return -ENOTTY;
++              return iam_uapi_ioctl(inode, filp, cmd, arg);
+       }
+ }
+Index: linux-stage/fs/ext3/file.c
+===================================================================
+--- linux-stage.orig/fs/ext3/file.c    2007-11-26 23:08:59.000000000 +0300
++++ linux-stage/fs/ext3/file.c 2007-11-26 23:09:06.000000000 +0300
+@@ -23,6 +23,7 @@
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/lustre_iam.h>
+ #include "xattr.h"
+ #include "acl.h"
+@@ -41,8 +42,12 @@
+               ext3_discard_reservation(inode);
+               mutex_unlock(&EXT3_I(inode)->truncate_mutex);
+       }
+-      if (is_dx(inode) && filp->private_data)
+-              ext3_htree_free_dir_info(filp->private_data);
++      if (is_dx(inode) && filp->private_data) {
++              if (S_ISDIR(inode->i_mode))
++                      ext3_htree_free_dir_info(filp->private_data);
++              else
++                      ext3_iam_release(filp, inode);
++      }
+       return 0;
+ }
+Index: linux-stage/fs/ext3/hash.c
+===================================================================
+--- linux-stage.orig/fs/ext3/hash.c    2007-11-26 23:08:59.000000000 +0300
++++ linux-stage/fs/ext3/hash.c 2007-11-26 23:09:06.000000000 +0300
+@@ -49,6 +49,23 @@
+       return (hash0 << 1);
+ }
++static __u32 dx_r5_hash(const signed char *msg, int len)
++{
++      __u32 a = 0;
++      while (len--) {
++              a += *msg << 4;
++              a += *msg >> 4;
++              a *= 11;
++              msg++;
++      }
++      return a;
++}
++
++static __u32 dx_same_hash(const signed char *msg, int len)
++{
++      return 0xcafebabeUL;
++}
++
+ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+ {
+       __u32   pad, val;
+@@ -139,6 +156,12 @@
+               hash = buf[0];
+               minor_hash = buf[1];
+               break;
++      case DX_HASH_R5:
++              hash = dx_r5_hash(name, len);
++              break;
++      case DX_HASH_SAME:
++              hash = dx_same_hash(name, len);
++              break;
+       default:
+               hinfo->hash = 0;
+               return -1;
+Index: linux-stage/fs/ext3/Makefile
+===================================================================
+--- linux-stage.orig/fs/ext3/Makefile  2007-11-26 23:09:05.000000000 +0300
++++ linux-stage/fs/ext3/Makefile       2007-11-26 23:09:06.000000000 +0300
+@@ -6,7 +6,7 @@
+ ext3-y        := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+          ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
+-         mballoc.o dynlocks.o
++         mballoc.o dynlocks.o iam.o iam_lfix.o iam_lvar.o iam_htree.o iam_uapi.o
+ ext3-$(CONFIG_EXT3_FS_XATTR)   += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
+Index: linux-stage/fs/ext3/dir.c
+===================================================================
+--- linux-stage.orig/fs/ext3/dir.c     2007-11-26 23:09:04.000000000 +0300
++++ linux-stage/fs/ext3/dir.c  2007-11-26 23:09:06.000000000 +0300
+@@ -28,6 +28,7 @@
+ #include <linux/smp_lock.h>
+ #include <linux/slab.h>
+ #include <linux/rbtree.h>
++#include <linux/lustre_iam.h>
+ static unsigned char ext3_filetype_table[] = {
+       DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+@@ -61,6 +62,7 @@
+ }
+                              
++#if EXT3_INVARIANT_ON
+ int ext3_check_dir_entry (const char * function, struct inode * dir,
+                         struct ext3_dir_entry_2 * de,
+                         struct buffer_head * bh,
+@@ -90,6 +92,7 @@
+                       rlen, de->name_len);
+       return error_msg == NULL ? 1 : 0;
+ }
++#endif
+ static int ext3_readdir(struct file * filp,
+                        void * dirent, filldir_t filldir)
+@@ -304,12 +307,14 @@
+       root->rb_node = NULL;
+ }
++extern struct iam_private_info *ext3_iam_alloc_info(int flags);
++extern void ext3_iam_release_info(struct iam_private_info *info);
+ static struct dir_private_info *create_dir_info(loff_t pos)
+ {
+       struct dir_private_info *p;
+-      p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
++      p = (void *)ext3_iam_alloc_info(GFP_KERNEL);
+       if (!p)
+               return NULL;
+       p->root.rb_node = NULL;
+@@ -325,6 +330,7 @@
+ void ext3_htree_free_dir_info(struct dir_private_info *p)
+ {
+       free_rb_tree_fname(&p->root);
++      ext3_iam_release_info((void *)p);
+       kfree(p);
+ }
index 1d9f2f5..f686f4c 100644 (file)
@@ -16,3 +16,8 @@ ext3-inode-version-2.6.18-vanilla.patch
 ext3-mmp-2.6.18-vanilla.patch
 ext3-unlink-race.patch
 ext3-statfs-2.6-rhel5.patch
+ext3-dynlocks-common.patch
+ext3-dynlocks-2.6.18-vanilla.patch
+ext3-iam-common.patch
+ext3-iam-2.6.18-rhel5.patch
+ext3-orphans-delay.patch