Whamcloud - gitweb
- add cmd3 patches for ldiskfs
authortappro <tappro>
Thu, 14 Jun 2007 10:07:23 +0000 (10:07 +0000)
committertappro <tappro>
Thu, 14 Jun 2007 10:07:23 +0000 (10:07 +0000)
13 files changed:
ldiskfs/kernel_patches/patches/ext3-hash-selection.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext3-htree-comments.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext3-htree-path-ops.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext3-htree-path.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext3-htree-r5-hash.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext3-iam-ops.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext3-iam-separate.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext3-iam-uapi.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext3-orphans-delay.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext3-pdirops-2.6.9.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext3-tall-htree.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-rhel4.patch
ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series

diff --git a/ldiskfs/kernel_patches/patches/ext3-hash-selection.patch b/ldiskfs/kernel_patches/patches/ext3-hash-selection.patch
new file mode 100644 (file)
index 0000000..40eb9fe
--- /dev/null
@@ -0,0 +1,125 @@
+Index: linux-2.6.9/fs/ext3/hash.c
+===================================================================
+--- linux-2.6.9.orig/fs/ext3/hash.c    2006-04-23 22:39:01.000000000 +0800
++++ linux-2.6.9/fs/ext3/hash.c 2006-04-23 22:39:16.000000000 +0800
+@@ -127,6 +127,11 @@
+       return a;
+ }
++static __u32 dx_same_hash(const signed char *msg, int len)
++{
++      return 0xcafebabeUL;
++}
++
+ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+ {
+       __u32   pad, val;
+@@ -220,6 +225,9 @@
+       case DX_HASH_R5:
+               hash = dx_r5_hash(name, len);
+               break;
++      case DX_HASH_SAME:
++              hash = dx_same_hash(name, len);
++              break;
+       default:
+               hinfo->hash = 0;
+               return -1;
+Index: linux-2.6.9/fs/ext3/super.c
+===================================================================
+--- linux-2.6.9.orig/fs/ext3/super.c   2006-08-17 09:26:01.000000000 +0300
++++ linux-2.6.9/fs/ext3/super.c        2006-08-17 09:31:22.000000000 +0300
+@@ -599,6 +599,7 @@ enum {
+       Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+       Opt_extents, Opt_noextents, Opt_extdebug,
+       Opt_mballoc, Opt_nomballoc, Opt_stripe,
++        Opt_hashfunc,
+ };
+ static match_table_t tokens = {
+@@ -655,6 +656,7 @@ static match_table_t tokens = {
+       {Opt_stripe, "stripe=%u"},
+       {Opt_err, NULL},
+       {Opt_resize, "resize"},
++      {Opt_hashfunc,"hash=%s"},
+ };
+ static unsigned long get_sb_block(void **data)
+@@ -679,6 +681,7 @@ static unsigned long get_sb_block(void *
+       return sb_block;
+ }
++int user_selected_hash_function = -1;
+ static int parse_options (char * options, struct super_block *sb,
+                         unsigned long * inum, unsigned long *n_blocks_count, int is_remount)
+ {
+@@ -980,6 +983,22 @@ clear_qf_name:
+                               return 0;
+                       sbi->s_stripe = option;
+                       break;
++              case Opt_hashfunc:
++                      if (strncmp (args[0].from,"legacy",6) == 0){
++                                user_selected_hash_function = 0;
++                        } else if (strncmp (args[0].from,"half_md4",8) == 0){
++                                user_selected_hash_function = 1;
++                        } else if (strncmp (args[0].from,"tea",3) == 0){
++                                user_selected_hash_function = 2;
++                        } else if (strncmp (args[0].from,"r5",2) == 0){
++                                user_selected_hash_function = 3;
++                        } else if (strncmp (args[0].from,"same",4) == 0){
++                                user_selected_hash_function = 4;
++                        } else {
++                                printk ("Hashfunc name wrong\n");
++                                return 0;
++                        }
++                      break;
+               default:
+                       printk (KERN_ERR
+                               "EXT3-fs: Unrecognized mount option \"%s\" "
+Index: linux-2.6.9/fs/ext3/namei.c
+===================================================================
+--- linux-2.6.9.orig/fs/ext3/namei.c   2006-04-23 22:39:02.000000000 +0800
++++ linux-2.6.9/fs/ext3/namei.c        2006-04-23 22:39:16.000000000 +0800
+@@ -365,10 +365,7 @@
+               struct htree_cookie *hc = cookie;
+               root = data;
+-              if (root->info.hash_version != DX_HASH_TEA &&
+-                  root->info.hash_version != DX_HASH_HALF_MD4 &&
+-                  root->info.hash_version != DX_HASH_R5 &&
+-                  root->info.hash_version != DX_HASH_LEGACY) {
++              if (root->info.hash_version > DX_HASH_MAX) {
+                       ext3_warning(sb, __FUNCTION__,
+                                    "Unrecognised inode hash code %d",
+                                    root->info.hash_version);
+@@ -1467,6 +1464,7 @@
+  * This converts a one block unindexed directory to a 3 block indexed
+  * directory, and adds the dentry to the indexed directory.
+  */
++extern int user_selected_hash_function;
+ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
+                           struct inode *inode, struct buffer_head *bh)
+ {
+@@ -1522,7 +1520,9 @@
+       memset (&root->info, 0, sizeof(root->info));
+       root->info.info_length = sizeof(root->info);
+       root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+-      root->info.hash_version = DX_HASH_R5;
++      if (user_selected_hash_function >= 0 &&
++          user_selected_hash_function <= DX_HASH_MAX)
++              root->info.hash_version = user_selected_hash_function;
+       entries = (void *)root->entries;
+       dx_set_block (&path, entries, 1);
+       dx_set_count (entries, 1);
+Index: linux-2.6.9/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.9.orig/include/linux/ext3_fs.h   2006-04-23 22:39:01.000000000 +0800
++++ linux-2.6.9/include/linux/ext3_fs.h        2006-04-23 22:39:16.000000000 +0800
+@@ -665,6 +665,8 @@
+ #define DX_HASH_HALF_MD4      1
+ #define DX_HASH_TEA           2
+ #define DX_HASH_R5            3
++#define DX_HASH_SAME          4
++#define DX_HASH_MAX           4
+ /* hash info structure used by the directory hash */
+ struct dx_hash_info
diff --git a/ldiskfs/kernel_patches/patches/ext3-htree-comments.patch b/ldiskfs/kernel_patches/patches/ext3-htree-comments.patch
new file mode 100644 (file)
index 0000000..159add6
--- /dev/null
@@ -0,0 +1,1643 @@
+Index: linux-2.6.9/fs/ext3/namei.c
+===================================================================
+--- linux-2.6.9.orig/fs/ext3/namei.c   2006-04-17 18:32:12.000000000 +0800
++++ linux-2.6.9/fs/ext3/namei.c        2006-04-23 21:40:41.000000000 +0800
+@@ -24,6 +24,78 @@
+  *    Theodore Ts'o, 2002
+  */
++/*
++ * iam: big theory statement.
++ *
++ * iam (Index Access Module) is a module providing abstraction of persistent
++ * transactional container on top of generalized ext3 htree.
++ *
++ * iam supports:
++ *
++ *     - key, pointer, and record size specifiable per container.
++ *
++ *     - trees taller than 2 index levels.
++ *
++ *     - read/write to existing ext3 htree directories as iam containers.
++ *
++ * iam container is a tree, consisting of leaf nodes containing keys and
++ * records stored in this container, and index nodes, containing keys and
++ * pointers to leaf or index nodes.
++ *
++ * iam does not work with keys directly, instead it calls user-supplied key
++ * comparison function (->dpo_keycmp()).
++ *
++ * Pointers are (currently) interpreted as logical offsets (measured in
++ * blocksful) within underlying flat file on top of which iam tree lives.
++ *
++ * On-disk format:
++ *
++ * iam mostly tries to reuse existing htree formats.
++ *
++ * Format of index node:
++ *
++ * +-----+-------+-------+-------+------+-------+------------+
++ * |     | count |       |       |      |       |            |
++ * | gap |   /   | entry | entry | .... | entry | free space |
++ * |     | limit |       |       |      |       |            |
++ * +-----+-------+-------+-------+------+-------+------------+
++ *
++ *       gap           this part of node is never accessed by iam code. It
++ *                     exists for binary compatibility with ext3 htree (that,
++ *                     in turn, stores fake struct ext2_dirent for ext2
++ *                     compatibility), and to keep some unspecified per-node
++ *                     data. Gap can be different for root and non-root index
++ *                     nodes. Gap size can be specified for each container
++ *                     (gap of 0 is allowed).
++ *
++ *       count/limit   current number of entries in this node, and the maximal
++ *                     number of entries that can fit into node. count/limit
++ *                     has the same size as entry, and is itself counted in
++ *                     count.
++ *
++ *       entry         index entry: consists of a key immediately followed by
++ *                     a pointer to a child node. Size of a key and size of a
++ *                     pointer depends on container. Entry has neither
++ *                     alignment nor padding.
++ *
++ *       free space    portion of node new entries are added to
++ *
++ * Entries in index node are sorted by their key value.
++ *
++ *
++ *
++ *
++ *
++ *
++ *
++ *
++ *
++ *
++ *
++ *
++ *
++ */
++
+ #include <linux/fs.h>
+ #include <linux/pagemap.h>
+ #include <linux/jbd.h>
+@@ -98,14 +170,6 @@
+       __le16 count;
+ };
+-struct dx_entry; /* incomplete type */
+-struct dx_key;   /* incomplete type */
+-
+-struct dx_entry_compat {
+-      __le32 hash;
+-      __le32 block;
+-};
+-
+ /*
+  * dx_root_info is laid out so that if it should somehow get overlaid by a
+  * dirent the two low bits of the hash version will be zero.  Therefore, the
+@@ -135,111 +199,513 @@
+       struct {} entries[0];
+ };
+-
+-struct dx_frame
+-{
+-      struct buffer_head *bh;
+-      struct dx_entry *entries;
+-      struct dx_entry *at;
+-};
+-
+ struct dx_map_entry
+ {
+       u32 hash;
+       u32 offs;
+ };
+-struct dx_path;
+-struct dx_param {
+-      size_t       dpo_key_size;
+-      size_t       dpo_ptr_size;
+-      size_t       dpo_node_gap;
+-      size_t       dpo_root_gap;
+-
+-      u32 (*dpo_root_ptr)(struct dx_path *path);
+-      int (*dpo_node_check)(struct dx_path *path,
+-                            struct dx_frame *frame, void *cookie);
+-      int (*dpo_node_init)(struct dx_path *path,
+-                           struct buffer_head *bh, int root);
+-      int (*dpo_keycmp)(struct dx_path *path,
+-                        struct dx_key *k1, struct dx_key *k2);
++/*
++ * Entry within index tree node. Consists of a key immediately followed
++ * (without padding) by a pointer to the child node.
++ *
++ * Both key and pointer are of variable size, hence incomplete type.
++ */
++struct iam_entry;
++
++struct iam_entry_compat {
++      __le32 hash;
++      __le32 block;
++};
++
++/*
++ * Incomplete type used to refer to keys in iam container.
++ *
++ * As key size can be different from container to container, iam has to use
++ * incomplete type. Clients cast pointer to iam_key to real key type and back.
++ */
++struct iam_key;
++
++/* Incomplete type use to refer to the records stored in iam containers. */
++struct iam_rec;
++
++typedef __u64 iam_ptr_t;
++
++/*
++ * Index node traversed during tree lookup.
++ */
++struct iam_frame {
++      struct buffer_head *bh;    /* buffer holding node data */
++      struct iam_entry *entries; /* array of entries */
++      struct iam_entry *at;      /* target entry, found by binary search */
++};
++
++/* leaf node reached by tree lookup */
++struct iam_leaf {
++      struct buffer_head *bh;
++      struct iam_leaf_entry *entries;
++      struct iam_leaf_entry *at;
++};
++
++struct iam_path;
++struct iam_container;
++
++/*
++ * Parameters, describing a flavor of iam container.
++ */
++struct iam_descr {
++      /*
++       * Size of a key in this container, in bytes.
++       */
++      size_t       id_key_size;
++      /*
++       * Size of a pointer to the next level (stored in index nodes), in
++       * bytes.
++       */
++      size_t       id_ptr_size;
++      /*
++       * Size of a record (stored in leaf nodes), in bytes.
++       */
++      size_t       id_rec_size;
++      /*
++       * Size of unused (by iam) space at the beginning of every non-root
++       * node, in bytes. Used for compatibility with ext3.
++       */
++      size_t       id_node_gap;
++      /*
++       * Size of unused (by iam) space at the beginning of root node, in
++       * bytes. Used for compatibility with ext3.
++       */
++      size_t       id_root_gap;
++
++      /*
++       * Returns pointer (in the same sense as pointer in index entry) to
++       * the root node.
++       */
++      __u32 (*id_root_ptr)(struct iam_container *c);
++
++      /*
++       * Check validity and consistency of index node. This is called when
++       * iam just loaded new node into frame.
++       */
++      int (*id_node_check)(struct iam_path *path, struct iam_frame *frame);
++      /*
++       * Initialize new node (stored in @bh) that is going to be added into
++       * tree.
++       */
++      int (*id_node_init)(struct iam_container *c,
++                          struct buffer_head *bh, int root);
++      int (*id_node_read)(struct iam_container *c, iam_ptr_t ptr,
++                          handle_t *h, struct buffer_head **bh);
++      /*
++       * Key comparison function. Returns -1, 0, +1.
++       */
++      int (*id_keycmp)(struct iam_container *c,
++                       struct iam_key *k1, struct iam_key *k2);
++      /*
++       * Create new container.
++       *
++       * Newly created container has a root node and a single leaf. Leaf
++       * contains single record with the smallest possible key.
++       */
++      int (*id_create)(struct iam_container *c);
++      struct {
++              /*
++               * leaf operations.
++               */
++              /*
++               * returns true iff leaf is positioned at the last entry.
++               */
++              int (*at_end)(struct iam_container *c, struct iam_leaf *l);
++              /* position leaf at the first entry */
++              void (*start)(struct iam_container *c, struct iam_leaf *l);
++              /* move leaf to the next entry. */
++              void (*next)(struct iam_container *c, struct iam_leaf *l);
++              /* return key of current leaf record in @k */
++              void (*key)(struct iam_container *c, struct iam_leaf *l,
++                          struct iam_key *k);
++              /* return pointer to entry body */
++              struct iam_rec *(*rec)(struct iam_container *c,
++                                     struct iam_leaf *l);
++      } id_leaf;
++};
++
++struct iam_container {
++      /*
++       * Underlying flat file. IO against this object is issued to
++       * read/write nodes.
++       */
++      struct inode     *ic_object;
++      /*
++       * container flavor.
++       */
++      struct iam_descr *ic_descr;
++      /*
++       * pointer to flavor-specific per-container data.
++       */
++      void             *ic_descr_data;
+ };
+ /*
+  * Structure to keep track of a path drilled through htree.
+  */
+-struct dx_path {
+-      struct inode         *dp_object;
+-      struct dx_param      *dp_param;
+-      int                   dp_indirect;
+-      struct dx_frame       dp_frames[DX_MAX_TREE_HEIGHT];
+-      struct dx_frame      *dp_frame;
+-      struct dx_key        *dp_key_target;
+-      struct dx_key        *dp_key_scratch[DX_SCRATCH_KEYS];
+-};
+-
+-struct dx_path_compat {
+-      struct dx_path dpc_path;
+-      __u32          dpc_scrach[DX_SCRATCH_KEYS];
+-};
+-
+-static u32 htree_root_ptr(struct dx_path *p);
+-static int htree_node_check(struct dx_path *path,
+-                          struct dx_frame *frame, void *cookie);
+-static int htree_node_init(struct dx_path *path,
++struct iam_path {
++      /*
++       * Parent container.
++       */
++      struct iam_container  *ip_container;
++      /*
++       * Number of index levels minus one.
++       */
++      int                    ip_indirect;
++      /*
++       * Nodes that top-to-bottom traversal passed through.
++       */
++      struct iam_frame       ip_frames[DX_MAX_TREE_HEIGHT];
++      /*
++       * Last filled frame in ->ip_frames. Refers to the 'twig' node (one
++       * immediately above leaf).
++       */
++      struct iam_frame      *ip_frame;
++      /*
++       * Leaf node: a child of ->ip_frame.
++       */
++      struct iam_leaf       *ip_leaf;
++      /*
++       * Key searched for.
++       */
++      struct iam_key        *ip_key_target;
++      /*
++       * Scratch-pad area for temporary keys.
++       */
++      struct iam_key        *ip_key_scratch[DX_SCRATCH_KEYS];
++      /*
++       * pointer to flavor-specific per-container data.
++       */
++      void                  *ip_descr_data;
++};
++
++/*
++ * Helper structure for legacy htrees.
++ */
++struct iam_path_compat {
++      struct iam_path      ipc_path;
++      struct iam_container ipc_container;
++      __u32                ipc_scrach[DX_SCRATCH_KEYS];
++};
++
++static u32 htree_root_ptr(struct iam_container *c);
++static int htree_node_check(struct iam_path *path, struct iam_frame *frame);
++static int htree_node_init(struct iam_container *c,
+                          struct buffer_head *bh, int root);
+-static int htree_keycmp(struct dx_path *path,
+-                      struct dx_key *k1, struct dx_key *k2);
++static int htree_keycmp(struct iam_container *c,
++                      struct iam_key *k1, struct iam_key *k2);
++static int htree_node_read(struct iam_container *c, iam_ptr_t ptr,
++                         handle_t *h, struct buffer_head **bh);
++
++/*
++ * Parameters describing iam compatibility mode in which existing ext3 htrees
++ * can be manipulated.
++ */
++static struct iam_descr htree_compat_param = {
++      .id_key_size = sizeof ((struct dx_map_entry *)NULL)->hash,
++      .id_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs,
++      .id_node_gap = offsetof(struct dx_node, entries),
++      .id_root_gap = offsetof(struct dx_root, entries),
++
++      .id_root_ptr   = htree_root_ptr,
++      .id_node_check = htree_node_check,
++      .id_node_init  = htree_node_init,
++      .id_node_read  = htree_node_read,
++      .id_keycmp     = htree_keycmp
++};
++
++
++struct iam_key;
++struct iam_rec;
++struct iam_descr;
++struct iam_container;
++struct iam_path;
+-static struct dx_param htree_compat_param = {
+-      .dpo_key_size = sizeof ((struct dx_map_entry *)NULL)->hash,
+-      .dpo_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs,
+-      .dpo_node_gap = offsetof(struct dx_node, entries),
+-      .dpo_root_gap = offsetof(struct dx_root, entries),
+-
+-      .dpo_root_ptr   = htree_root_ptr,
+-      .dpo_node_check = htree_node_check,
+-      .dpo_node_init  = htree_node_init,
+-      .dpo_keycmp     = htree_keycmp
++/*
++ * Initialize container @c, acquires additional reference on @inode.
++ */
++int iam_container_init(struct iam_container *c,
++                     struct iam_descr *descr, struct inode *inode);
++/*
++ * Finalize container @c, release all resources.
++ */
++void iam_container_fini(struct iam_container *c);
++
++/*
++ * Search container @c for record with key @k. If record is found, its data
++ * are moved into @r.
++ *
++ *
++ *
++ * Return values: +ve: found, 0: not-found, -ve: error
++ */
++int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r);
++/*
++ * Insert new record @r with key @k into container @c (within context of
++ * transaction @h).
++ *
++ * Return values: 0: success, -ve: error, including -EEXIST when record with
++ * given key is already present.
++ *
++ * postcondition: ergo(result == 0 || result == -EEXIST,
++ *                                  iam_lookup(c, k, r2) > 0 &&
++ *                                  !memcmp(r, r2, c->ic_descr->id_rec_size));
++ */
++int iam_insert(handle_t *h, struct iam_container *c,
++             struct iam_key *k, struct iam_rec *r);
++/*
++ * Replace existing record with key @k, or insert new one. New record data are
++ * in @r.
++ *
++ * Return values: 0: success, -ve: error.
++ *
++ * postcondition: ergo(result == 0, iam_lookup(c, k, r2) > 0 &&
++ *                                  !memcmp(r, r2, c->ic_descr->id_rec_size));
++ */
++int iam_update(handle_t *h, struct iam_container *c,
++             struct iam_key *k, struct iam_rec *r);
++/*
++ * Delete existing record with key @k.
++ *
++ * Return values: 0: success, -ENOENT: not-found, -ve: other error.
++ *
++ * postcondition: ergo(result == 0 || result == -ENOENT,
++ *                                 !iam_lookup(c, k, *));
++ */
++int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k);
++
++/*
++ * iam cursor (iterator) api.
++ */
++
++/*
++ * Flags controlling iterator functionality.
++ */
++enum iam_it_flags {
++      /*
++       * this iterator will move (iam_it_{prev,next}() will be called on it)
++       */
++      IAM_IT_MOVE  = (1 << 0),
++      /*
++       * tree can be updated through this iterator.
++       */
++      IAM_IT_WRITE = (1 << 1)
+ };
++/*
++ * States of iterator state machine.
++ */
++enum iam_it_state {
++      /* initial state */
++      IAM_IT_DETACHED,
++      /* iterator is above particular record in the container */
++      IAM_IT_ATTACHED
++};
++
++/*
++ * Iterator.
++ *
++ * Immediately after call to iam_it_init() iterator is in "detached"
++ * (IAM_IT_DETACHED) state: it is associated with given parent container, but
++ * doesn't point to any particular record in this container.
++ *
++ * After successful call to iam_it_get() and until corresponding call to
++ * iam_it_put() iterator is in "attached" state (IAM_IT_ATTACHED).
++ *
++ * Attached iterator can move through records in a container (provided
++ * IAM_IT_MOVE permission) in a key order, can get record and key values as it
++ * passes over them, and can modify container (provided IAM_IT_WRITE
++ * permission).
++ *
++ * Concurrency: iterators are supposed to be local to thread. Interfaces below
++ * do no internal serialization.
++ *
++ */
++struct iam_iterator {
++      /*
++       * iterator flags, taken from enum iam_it_flags.
++       */
++      __u32                 ii_flags;
++      enum iam_it_state     ii_state;
++      /*
++       * path to the record. Valid in IAM_IT_ATTACHED state.
++       */
++      struct iam_path       ii_path;
++};
++
++static inline struct iam_key *keycpy(struct iam_container *c,
++                                   struct iam_key *k1, struct iam_key *k2)
++{
++      return memcpy(k1, k2, c->ic_descr->id_key_size);
++}
++
++static inline int keycmp(struct iam_container *c,
++                       struct iam_key *k1, struct iam_key *k2)
++{
++      return c->ic_descr->id_keycmp(c, k1, k2);
++}
++
++static struct iam_container *iam_it_container(struct iam_iterator *it)
++{
++      return it->ii_path.ip_container;
++}
++
++static inline int it_keycmp(struct iam_iterator *it,
++                          struct iam_key *k1, struct iam_key *k2)
++{
++      return keycmp(iam_it_container(it), k1, k2);
++}
++
++/*
++ * Initialize iterator to IAM_IT_DETACHED state.
++ *
++ * postcondition: it_state(it) == IAM_IT_DETACHED
++ */
++int  iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags);
++/*
++ * Finalize iterator and release all resources.
++ *
++ * precondition: it_state(it) == IAM_IT_DETACHED
++ */
++void iam_it_fini(struct iam_iterator *it);
++
++/*
++ * Attach iterator. After successful completion, @it points to record with the
++ * largest key not larger than @k. Semantics of ->id_create() method guarantee
++ * that such record will always be found.
++ *
++ * Return value: 0: positioned on existing record,
++ *             -ve: error.
++ *
++ * precondition:  it_state(it) == IAM_IT_DETACHED
++ * postcondition: ergo(result == 0,
++ *                     (it_state(it) == IAM_IT_ATTACHED &&
++ *                      it_keycmp(it, iam_it_key_get(it, *), k) < 0))
++ */
++int iam_it_get(struct iam_iterator *it, struct iam_key *k);
++
++/*
++ * Duplicates iterator.
++ *
++ * postcondition: it_state(dst) == it_state(src) &&
++ *                iam_it_container(dst) == iam_it_container(src) &&
++ *                dst->ii_flags = src->ii_flags &&
++ *                ergo(it_state(it) == IAM_IT_ATTACHED,
++ *                     iam_it_rec_get(dst) == iam_it_rec_get(src) &&
++ *                     iam_it_key_get(dst, *1) == iam_it_key_get(src, *2))
++ */
++void iam_it_dup(struct iam_iterator *dst, struct iam_iterator *src);
++
++/*
++ * Detach iterator. Does nothing in detached state.
++ *
++ * postcondition: it_state(it) == IAM_IT_DETACHED
++ */
++void iam_it_put(struct iam_iterator *it);
++
++/*
++ * Move iterator one record right.
++ *
++ * Return value: 0: success,
++ *              +1: end of container reached
++ *             -ve: error
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_MOVE
++ * postcondition: ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED)
++ */
++int iam_it_next(struct iam_iterator *it);
++
++/*
++ * Return pointer to the record under iterator.
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
++ */
++const struct iam_rec *iam_it_rec_get(struct iam_iterator *it);
++
++/*
++ * Replace contents of record under iterator.
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
++ * postcondition: it_state(it) == IAM_IT_ATTACHED &&
++ *                ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...))
++ */
++int iam_it_rec_set(handle_t *h, struct iam_iterator *it, struct iam_rec *r);
++
++/*
++ * Place key under iterator in @k, return @k
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
++ */
++const struct iam_key *iam_it_key_get(struct iam_iterator *it,
++                                   struct iam_key *k);
++
++/*
++ * Insert new record with key @k and contents from @r, shifting records to the
++ * right.
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED &&
++ *                it->ii_flags&IAM_IT_WRITE &&
++ *                it_keycmp(it, iam_it_key_get(it, *), k) < 0
++ * postcondition: it_state(it) == IAM_IT_ATTACHED &&
++ *                ergo(result == 0,
++ *                     it_keycmp(it, iam_it_key_get(it, *), k) == 0 &&
++ *                     !memcmp(iam_it_rec_get(it), r, ...))
++ */
++int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
++                    struct iam_key *k, struct iam_rec *r);
++/*
++ * Delete record under iterator.
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
++ */
++int iam_it_rec_delete(handle_t *h, struct iam_iterator *it);
+ #ifdef CONFIG_EXT3_INDEX
+-static inline unsigned dx_get_block(struct dx_path *p, struct dx_entry *entry);
+-static void dx_set_block(struct dx_path *p,
+-                       struct dx_entry *entry, unsigned value);
+-static inline struct dx_key *dx_get_key(struct dx_path *p,
+-                                      struct dx_entry *entry,
+-                                      struct dx_key *key);
+-static void dx_set_key(struct dx_path *p, struct dx_entry *entry,
+-                     struct dx_key *key);
+-static unsigned dx_get_count(struct dx_entry *entries);
+-static unsigned dx_get_limit(struct dx_entry *entries);
+-static void dx_set_count(struct dx_entry *entries, unsigned value);
+-static void dx_set_limit(struct dx_entry *entries, unsigned value);
+-static unsigned dx_root_limit(struct dx_path *p);
+-static unsigned dx_node_limit(struct dx_path *p);
++static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry);
++static void dx_set_block(struct iam_path *p,
++                       struct iam_entry *entry, unsigned value);
++static inline struct iam_key *dx_get_key(struct iam_path *p,
++                                      struct iam_entry *entry,
++                                      struct iam_key *key);
++static void dx_set_key(struct iam_path *p, struct iam_entry *entry,
++                     struct iam_key *key);
++static unsigned dx_get_count(struct iam_entry *entries);
++static unsigned dx_get_limit(struct iam_entry *entries);
++static void dx_set_count(struct iam_entry *entries, unsigned value);
++static void dx_set_limit(struct iam_entry *entries, unsigned value);
++static unsigned dx_root_limit(struct iam_path *p);
++static unsigned dx_node_limit(struct iam_path *p);
+ static int dx_probe(struct dentry *dentry,
+                   struct inode *dir,
+                   struct dx_hash_info *hinfo,
+-                  struct dx_path *path);
++                  struct iam_path *path);
+ static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
+                       struct dx_hash_info *hinfo, struct dx_map_entry map[]);
+ static void dx_sort_map(struct dx_map_entry *map, unsigned count);
+ static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
+               struct dx_map_entry *offsets, int count);
+ static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+-static void dx_insert_block (struct dx_path *path,
+-                           struct dx_frame *frame, u32 hash, u32 block);
++static void dx_insert_block (struct iam_path *path,
++                           struct iam_frame *frame, u32 hash, u32 block);
+ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+-                               struct dx_path *path, __u32 *start_hash);
++                               struct iam_path *path, __u32 *start_hash);
+ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
+                      struct ext3_dir_entry_2 **res_dir, int *err);
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+                            struct inode *inode);
+-static inline void dx_path_init(struct dx_path *path, struct inode *inode);
+-static inline void dx_path_fini(struct dx_path *path);
++static inline void iam_path_init(struct iam_path *path,
++                               struct iam_container *c);
++static inline void iam_path_fini(struct iam_path *path);
+ /*
+@@ -247,153 +713,154 @@
+  * Mask them off for now.
+  */
+-static inline void *entry_off(struct dx_entry *entry, ptrdiff_t off)
++static inline void *entry_off(struct iam_entry *entry, ptrdiff_t off)
+ {
+       return (void *)((char *)entry + off);
+ }
+-static inline size_t dx_entry_size(struct dx_path *p)
++static inline struct iam_descr *path_descr(struct iam_path *p)
+ {
+-      return p->dp_param->dpo_key_size + p->dp_param->dpo_ptr_size;
++      return p->ip_container->ic_descr;
+ }
+-static inline struct dx_entry *dx_entry_shift(struct dx_path *p,
+-                                            struct dx_entry *entry, int shift)
++static inline struct inode *path_obj(struct iam_path *p)
++{
++      return p->ip_container->ic_object;
++}
++
++static inline size_t iam_entry_size(struct iam_path *p)
++{
++      return path_descr(p)->id_key_size + path_descr(p)->id_ptr_size;
++}
++
++static inline struct iam_entry *iam_entry_shift(struct iam_path *p,
++                                            struct iam_entry *entry, int shift)
+ {
+       void *e = entry;
+-      return e + shift * dx_entry_size(p);
++      return e + shift * iam_entry_size(p);
+ }
+-static inline ptrdiff_t dx_entry_diff(struct dx_path *p,
+-                                    struct dx_entry *e1, struct dx_entry *e2)
++static inline ptrdiff_t iam_entry_diff(struct iam_path *p,
++                                    struct iam_entry *e1, struct iam_entry *e2)
+ {
+       ptrdiff_t diff;
+       diff = (void *)e1 - (void *)e2;
+-      assert(diff / dx_entry_size(p) * dx_entry_size(p) == diff);
+-      return diff / dx_entry_size(p);
++      assert(diff / iam_entry_size(p) * iam_entry_size(p) == diff);
++      return diff / iam_entry_size(p);
+ }
+-static inline unsigned dx_get_block(struct dx_path *p, struct dx_entry *entry)
++static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry)
+ {
+-      return le32_to_cpu(*(u32 *)entry_off(entry, p->dp_param->dpo_key_size))
++      return le32_to_cpu(*(u32 *)entry_off(entry, path_descr(p)->id_key_size))
+               & 0x00ffffff;
+ }
+-static inline void dx_set_block(struct dx_path *p,
+-                              struct dx_entry *entry, unsigned value)
++static inline void dx_set_block(struct iam_path *p,
++                              struct iam_entry *entry, unsigned value)
+ {
+-      *(u32*)entry_off(entry, p->dp_param->dpo_key_size) = cpu_to_le32(value);
++      *(u32*)entry_off(entry,
++                       path_descr(p)->id_key_size) = cpu_to_le32(value);
+ }
+-static inline struct dx_key *dx_get_key(struct dx_path *p,
+-                                      struct dx_entry *entry,
+-                                      struct dx_key *key)
++static inline struct iam_key *dx_get_key(struct iam_path *p,
++                                      struct iam_entry *entry,
++                                      struct iam_key *key)
+ {
+-      memcpy(key, entry, p->dp_param->dpo_key_size);
++      memcpy(key, entry, path_descr(p)->id_key_size);
+       return key;
+ }
+-static inline struct dx_key *dx_key_at(struct dx_path *p,
+-                                     struct dx_entry *entry)
++static inline struct iam_key *iam_key_at(struct iam_path *p,
++                                     struct iam_entry *entry)
+ {
+-      return (struct dx_key *)entry;
++      return (struct iam_key *)entry;
+ }
+-static inline void dx_set_key(struct dx_path *p,
+-                            struct dx_entry *entry, struct dx_key *key)
++static inline void dx_set_key(struct iam_path *p,
++                            struct iam_entry *entry, struct iam_key *key)
+ {
+-      memcpy(entry, key, p->dp_param->dpo_key_size);
++      memcpy(entry, key, path_descr(p)->id_key_size);
+ }
+-static inline unsigned dx_get_count (struct dx_entry *entries)
++static inline unsigned dx_get_count (struct iam_entry *entries)
+ {
+       return le16_to_cpu(((struct dx_countlimit *) entries)->count);
+ }
+-static inline unsigned dx_get_limit (struct dx_entry *entries)
++static inline unsigned dx_get_limit (struct iam_entry *entries)
+ {
+       return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
+ }
+-static inline void dx_set_count (struct dx_entry *entries, unsigned value)
++static inline void dx_set_count (struct iam_entry *entries, unsigned value)
+ {
+       ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
+ }
+-static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
++static inline void dx_set_limit (struct iam_entry *entries, unsigned value)
+ {
+       ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
+ }
+-static inline unsigned dx_root_limit(struct dx_path *p)
++static inline unsigned dx_root_limit(struct iam_path *p)
+ {
+-      struct dx_param *param = p->dp_param;
+-      unsigned entry_space   = p->dp_object->i_sb->s_blocksize -
+-              param->dpo_root_gap;
+-      return entry_space / (param->dpo_key_size + param->dpo_ptr_size);
++      struct iam_descr *param = path_descr(p);
++      unsigned entry_space = path_obj(p)->i_sb->s_blocksize -
++              param->id_root_gap;
++      return entry_space / (param->id_key_size + param->id_ptr_size);
+ }
+-static inline unsigned dx_node_limit(struct dx_path *p)
++static inline unsigned dx_node_limit(struct iam_path *p)
+ {
+-      struct dx_param *param = p->dp_param;
+-      unsigned entry_space   = p->dp_object->i_sb->s_blocksize -
+-              param->dpo_node_gap;
+-      return entry_space / (param->dpo_key_size + param->dpo_ptr_size);
++      struct iam_descr *param = path_descr(p);
++      unsigned entry_space   = path_obj(p)->i_sb->s_blocksize -
++              param->id_node_gap;
++      return entry_space / (param->id_key_size + param->id_ptr_size);
+ }
+-static inline int dx_index_is_compat(struct dx_path *path)
++static inline int dx_index_is_compat(struct iam_path *path)
+ {
+-      return path->dp_param == &htree_compat_param;
++      return path_descr(path) == &htree_compat_param;
+ }
+-static struct dx_entry *dx_get_entries(struct dx_path *path, void *data,
++static struct iam_entry *dx_get_entries(struct iam_path *path, void *data,
+                                      int root)
+ {
+       return data +
+               (root ?
+-               path->dp_param->dpo_root_gap : path->dp_param->dpo_node_gap);
++               path_descr(path)->id_root_gap : path_descr(path)->id_node_gap);
+ }
+-static struct dx_entry *dx_node_get_entries(struct dx_path *path,
+-                                          struct dx_frame *frame)
++static struct iam_entry *dx_node_get_entries(struct iam_path *path,
++                                          struct iam_frame *frame)
+ {
+       return dx_get_entries(path,
+-                            frame->bh->b_data, frame == path->dp_frames);
+-}
+-
+-static inline struct dx_key *keycpy(struct dx_path *p,
+-                                  struct dx_key *k1, struct dx_key *k2)
+-{
+-      return memcpy(k1, k2, p->dp_param->dpo_key_size);
+-}
+-
+-static inline int keycmp(struct dx_path *p,
+-                       struct dx_key *k1, struct dx_key *k2)
+-{
+-      return p->dp_param->dpo_keycmp(p, k1, k2);
++                            frame->bh->b_data, frame == path->ip_frames);
+ }
+-static int dx_node_check(struct dx_path *p, struct dx_frame *f)
++static int dx_node_check(struct iam_path *p, struct iam_frame *f)
+ {
+-      struct dx_entry *e;
++      struct iam_entry     *e;
++      struct iam_container *c;
+       unsigned count;
+       unsigned  i;
++      c = p->ip_container;
+       e = dx_node_get_entries(p, f);
+       count = dx_get_count(e);
+-      e = dx_entry_shift(p, e, 1);
+-      for (i = 0; i < count - 1; ++i, e = dx_entry_shift(p, e, 1)) {
+-              keycpy(p, p->dp_key_scratch[0], p->dp_key_scratch[1]);
+-              dx_get_key(p, e, p->dp_key_scratch[1]);
++      e = iam_entry_shift(p, e, 1);
++      for (i = 0; i < count - 1; ++i, e = iam_entry_shift(p, e, 1)) {
++              keycpy(c, p->ip_key_scratch[0], p->ip_key_scratch[1]);
++              dx_get_key(p, e, p->ip_key_scratch[1]);
+               if (i > 0 &&
+-                  keycmp(p, p->dp_key_scratch[0], p->dp_key_scratch[1]) > 0)
++                  keycmp(c, p->ip_key_scratch[0], p->ip_key_scratch[1]) > 0)
+                       return 0;
+       }
+       return 1;
+ }
+-static u32 htree_root_ptr(struct dx_path *path)
++static u32 htree_root_ptr(struct iam_container *c)
+ {
+       return 0;
+ }
+@@ -403,20 +870,19 @@
+       struct dentry       *dentry;
+ };
+-static int htree_node_check(struct dx_path *path, struct dx_frame *frame,
+-                          void *cookie)
++static int htree_node_check(struct iam_path *path, struct iam_frame *frame)
+ {
+       void *data;
+-      struct dx_entry *entries;
++      struct iam_entry *entries;
+       struct super_block *sb;
+       data = frame->bh->b_data;
+       entries = dx_node_get_entries(path, frame);
+-      sb = path->dp_object->i_sb;
+-      if (frame == path->dp_frames) {
++      sb = path_obj(path)->i_sb;
++      if (frame == path->ip_frames) {
+               /* root node */
+               struct dx_root *root;
+-              struct htree_cookie *hc = cookie;
++              struct htree_cookie *hc = path->ip_descr_data;
+               root = data;
+               if (root->info.hash_version > DX_HASH_MAX) {
+@@ -433,8 +899,8 @@
+                       return ERR_BAD_DX_DIR;
+               }
+-              path->dp_indirect = root->info.indirect_levels;
+-              if (path->dp_indirect > DX_MAX_TREE_HEIGHT - 1) {
++              path->ip_indirect = root->info.indirect_levels;
++              if (path->ip_indirect > DX_MAX_TREE_HEIGHT - 1) {
+                       ext3_warning(sb, __FUNCTION__,
+                                    "Unimplemented inode hash depth: %#06x",
+                                    root->info.indirect_levels);
+@@ -450,17 +916,17 @@
+               if (hc->dentry)
+                       ext3fs_dirhash(hc->dentry->d_name.name,
+                                      hc->dentry->d_name.len, hc->hinfo);
+-              path->dp_key_target = (struct dx_key *)&hc->hinfo->hash;
++              path->ip_key_target = (struct iam_key *)&hc->hinfo->hash;
+       } else {
+               /* non-root index */
+-              assert(entries == data + path->dp_param->dpo_node_gap);
++              assert(entries == data + path_descr(path)->id_node_gap);
+               assert(dx_get_limit(entries) == dx_node_limit(path));
+       }
+       frame->entries = frame->at = entries;
+       return 0;
+ }
+-static int htree_node_init(struct dx_path *path,
++static int htree_node_init(struct iam_container *c,
+                          struct buffer_head *bh, int root)
+ {
+       struct dx_node *node;
+@@ -468,13 +934,24 @@
+       assert(!root);
+       node = (void *)bh->b_data;
+-      node->fake.rec_len = cpu_to_le16(path->dp_object->i_sb->s_blocksize);
++      node->fake.rec_len = cpu_to_le16(c->ic_object->i_sb->s_blocksize);
+       node->fake.inode = 0;
+       return 0;
+ }
+-static int htree_keycmp(struct dx_path *path,
+-                      struct dx_key *k1, struct dx_key *k2)
++static int htree_node_read(struct iam_container *c, iam_ptr_t ptr,
++                         handle_t *handle, struct buffer_head **bh)
++{
++      int result = 0;
++
++      *bh = ext3_bread(handle, c->ic_object, (int)ptr, 0, &result);
++      if (*bh == NULL)
++              result = -EIO;
++      return result;
++}
++
++static int htree_keycmp(struct iam_container *c,
++                      struct iam_key *k1, struct iam_key *k2)
+ {
+       __u32 p1 = le32_to_cpu(*(__u32 *)k1);
+       __u32 p2 = le32_to_cpu(*(__u32 *)k2);
+@@ -486,7 +963,7 @@
+  * Debug
+  */
+ #ifdef DX_DEBUG
+-static void dx_show_index (char * label, struct dx_entry *entries)
++static void dx_show_index (char * label, struct iam_entry *entries)
+ {
+         int i, n = dx_get_count (entries);
+         printk("%s index ", label);
+@@ -535,7 +1012,7 @@
+ }
+ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
+-                           struct dx_entry *entries, int levels)
++                           struct iam_entry *entries, int levels)
+ {
+       unsigned blocksize = dir->i_sb->s_blocksize;
+       unsigned count = dx_get_count (entries), names = 0, space = 0, i;
+@@ -565,32 +1042,33 @@
+ }
+ #endif /* DX_DEBUG */
+-static int dx_lookup(struct dx_path *path, void *cookie)
++static int dx_lookup(struct iam_path *path)
+ {
+       u32 ptr;
+-      int err;
++      int err = 0;
+       int i;
+-      struct dx_param *param;
+-      struct dx_frame *frame;
+-
+-      param = path->dp_param;
++      struct iam_descr *param;
++      struct iam_frame *frame;
++      struct iam_container *c;
+-      for (frame = path->dp_frames, i = 0,
+-           ptr = param->dpo_root_ptr(path); i <= path->dp_indirect;
++      param = path_descr(path);
++      c = path->ip_container;
++      
++      for (frame = path->ip_frames, i = 0,
++                   ptr = param->id_root_ptr(path->ip_container);
++           i <= path->ip_indirect;
+            ptr = dx_get_block(path, frame->at), ++frame, ++i) {
+-              struct dx_entry *entries;
+-              struct dx_entry *p;
+-              struct dx_entry *q;
+-              struct dx_entry *m;
++              struct iam_entry *entries;
++              struct iam_entry *p;
++              struct iam_entry *q;
++              struct iam_entry *m;
+               unsigned count;
+-              frame->bh = ext3_bread(NULL, path->dp_object, ptr, 0, &err);
+-              if (frame->bh == NULL) {
+-                      err = -EIO;
++              err = param->id_node_read(c, (iam_ptr_t)ptr, NULL, &frame->bh);
++              if (err != 0)
+                       break;
+-              }
+-              err = param->dpo_node_check(path, frame, cookie);
++              err = param->id_node_check(path, frame);
+               if (err != 0)
+                       break;
+@@ -599,37 +1077,37 @@
+               entries = frame->entries;
+               count = dx_get_count(entries);
+               assert(count && count <= dx_get_limit(entries));
+-              p = dx_entry_shift(path, entries, 1);
+-              q = dx_entry_shift(path, entries, count - 1);
++              p = iam_entry_shift(path, entries, 1);
++              q = iam_entry_shift(path, entries, count - 1);
+               while (p <= q) {
+-                      m = dx_entry_shift(path,
+-                                         p, dx_entry_diff(path, q, p) / 2);
++                      m = iam_entry_shift(path,
++                                         p, iam_entry_diff(path, q, p) / 2);
+                       dxtrace(printk("."));
+-                      if (keycmp(path, dx_key_at(path, m),
+-                                 path->dp_key_target) > 0)
+-                              q = dx_entry_shift(path, m, -1);
++                      if (keycmp(c, iam_key_at(path, m),
++                                 path->ip_key_target) > 0)
++                              q = iam_entry_shift(path, m, -1);
+                       else
+-                              p = dx_entry_shift(path, m, +1);
++                              p = iam_entry_shift(path, m, +1);
+               }
+-              frame->at = dx_entry_shift(path, p, -1);
++              frame->at = iam_entry_shift(path, p, -1);
+               if (1) { // linear search cross check
+                       unsigned n = count - 1;
+-                      struct dx_entry *at;
++                      struct iam_entry *at;
+                       at = entries;
+                       while (n--) {
+                               dxtrace(printk(","));
+-                              at = dx_entry_shift(path, at, +1);
+-                              if (keycmp(path, dx_key_at(path, at),
+-                                         path->dp_key_target) > 0) {
+-                                      if (at != dx_entry_shift(path, frame->at, 1)) {
++                              at = iam_entry_shift(path, at, +1);
++                              if (keycmp(c, iam_key_at(path, at),
++                                         path->ip_key_target) > 0) {
++                                      if (at != iam_entry_shift(path, frame->at, 1)) {
+                                               BREAKPOINT;
+                                               printk(KERN_EMERG "%i\n",
+-                                                     keycmp(path, dx_key_at(path, at),
+-                                                            path->dp_key_target));
++                                                     keycmp(c, iam_key_at(path, at),
++                                                            path->ip_key_target));
+                                       }
+-                                      at = dx_entry_shift(path, at, -1);
++                                      at = iam_entry_shift(path, at, -1);
+                                       break;
+                               }
+                       }
+@@ -637,8 +1115,8 @@
+               }
+       }
+       if (err != 0)
+-              dx_path_fini(path);
+-      path->dp_frame = --frame;
++              iam_path_fini(path);
++      path->ip_frame = --frame;
+       return err;
+ }
+@@ -652,7 +1130,7 @@
+  * back to userspace.
+  */
+ static int dx_probe(struct dentry *dentry, struct inode *dir,
+-                  struct dx_hash_info *hinfo, struct dx_path *path)
++                  struct dx_hash_info *hinfo, struct iam_path *path)
+ {
+       int err;
+       struct htree_cookie hc = {
+@@ -661,39 +1139,78 @@
+       };
+       assert(dx_index_is_compat(path));
+-      err = dx_lookup(path, &hc);
+-      assert(err != 0 || path->dp_frames[path->dp_indirect].bh != NULL);
++      path->ip_descr_data = &hc;
++      err = dx_lookup(path);
++      assert(err != 0 || path->ip_frames[path->ip_indirect].bh != NULL);
+       return err;
+ }
+-static inline void dx_path_init(struct dx_path *path, struct inode *inode)
++/*
++ * Initialize container @c, acquires additional reference on @inode.
++ */
++int iam_container_init(struct iam_container *c,
++                     struct iam_descr *descr, struct inode *inode)
++{
++      memset(c, 0, sizeof *c);
++      c->ic_descr  = descr;
++      c->ic_object = igrab(inode);
++      if (c->ic_object != NULL)
++              return 0;
++      else
++              return -ENOENT;
++}
++
++/*
++ * Finalize container @c, release all resources.
++ */
++void iam_container_fini(struct iam_container *c)
++{
++      if (c->ic_object != NULL) {
++              iput(c->ic_object);
++              c->ic_object = NULL;
++      }
++}
++
++static inline void iam_path_init(struct iam_path *path, struct iam_container *c)
+ {
+       memset(path, 0, sizeof *path);
+-      path->dp_object = inode;
+-      path->dp_frame = path->dp_frames;
++      path->ip_container = c;
++      path->ip_frame = path->ip_frames;
+ }
+-static inline void dx_path_fini(struct dx_path *path)
++static inline void iam_path_fini(struct iam_path *path)
+ {
+       int i;
+-      for (i = 0; i < ARRAY_SIZE(path->dp_frames); i++) {
+-              if (path->dp_frames[i].bh != NULL) {
+-                      brelse(path->dp_frames[i].bh);
+-                      path->dp_frames[i].bh = NULL;
++      for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) {
++              if (path->ip_frames[i].bh != NULL) {
++                      brelse(path->ip_frames[i].bh);
++                      path->ip_frames[i].bh = NULL;
+               }
+       }
+ }
+-static void dx_path_compat_init(struct dx_path_compat *path,
+-                              struct inode *inode)
++static void iam_path_compat_init(struct iam_path_compat *path,
++                               struct inode *inode)
+ {
+       int i;
+-      dx_path_init(&path->dpc_path, inode);
+-      path->dpc_path.dp_param = &htree_compat_param;
+-      for (i = 0; i < ARRAY_SIZE(path->dpc_path.dp_key_scratch); ++i)
+-              path->dpc_path.dp_key_scratch[i] =
+-                      (struct dx_key *)&path->dpc_scrach[i];
++
++      iam_container_init(&path->ipc_container, &htree_compat_param, inode);
++      /*
++       * XXX hack allowing finalization of iam_path_compat with
++       * iam_path_fini().
++       */
++      iput(inode);
++      iam_path_init(&path->ipc_path, &path->ipc_container);
++      for (i = 0; i < ARRAY_SIZE(path->ipc_path.ip_key_scratch); ++i)
++              path->ipc_path.ip_key_scratch[i] =
++                      (struct iam_key *)&path->ipc_scrach[i];
++}
++
++static void iam_path_compat_fini(struct iam_path_compat *path)
++{
++      iam_path_fini(&path->ipc_path);
++      iam_container_fini(&path->ipc_container);
+ }
+ /*
+@@ -714,16 +1231,16 @@
+  * hash of the next page.
+  */
+ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+-                               struct dx_path *path, __u32 *start_hash)
++                               struct iam_path *path, __u32 *start_hash)
+ {
+-      struct dx_frame *p;
++      struct iam_frame *p;
+       struct buffer_head *bh;
+       int err, num_frames = 0;
+       __u32 bhash;
+       assert(dx_index_is_compat(path));
+-      p = path->dp_frame;
++      p = path->ip_frame;
+       /*
+        * Find the next leaf page by incrementing the frame pointer.
+        * If we run out of entries in the interior node, loop around and
+@@ -732,11 +1249,11 @@
+        * nodes need to be read.
+        */
+       while (1) {
+-              p->at = dx_entry_shift(path, p->at, +1);
+-              if (p->at < dx_entry_shift(path, p->entries,
++              p->at = iam_entry_shift(path, p->at, +1);
++              if (p->at < iam_entry_shift(path, p->entries,
+                                          dx_get_count(p->entries)))
+                       break;
+-              if (p == path->dp_frames)
++              if (p == path->ip_frames)
+                       return 0;
+               num_frames++;
+               --p;
+@@ -749,7 +1266,7 @@
+        * desired contiuation hash.  If it doesn't, return since
+        * there's no point to read in the successive index pages.
+        */
+-      dx_get_key(path, p->at, (struct dx_key *)&bhash);
++      dx_get_key(path, p->at, (struct iam_key *)&bhash);
+       if (start_hash)
+               *start_hash = bhash;
+       if ((hash & 1) == 0) {
+@@ -761,8 +1278,10 @@
+        * block so no check is necessary
+        */
+       while (num_frames--) {
+-              if (!(bh = ext3_bread(NULL, dir,
+-                                    dx_get_block(path, p->at), 0, &err)))
++              err = path_descr(path)->id_node_read(path->ip_container,
++                                                   (iam_ptr_t)dx_get_block(path, p->at),
++                                                   NULL, &bh);
++              if (err != 0)
+                       return err; /* Failure */
+               ++p;
+               brelse (p->bh);
+@@ -837,8 +1356,8 @@
+ {
+       struct dx_hash_info hinfo;
+       struct ext3_dir_entry_2 *de;
+-      struct dx_path_compat cpath;
+-      struct dx_path *path = &cpath.dpc_path;
++      struct iam_path_compat cpath;
++      struct iam_path *path = &cpath.ipc_path;
+       struct inode *dir;
+       int block, err;
+       int count = 0;
+@@ -848,7 +1367,7 @@
+       dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
+                      start_minor_hash));
+       dir = dir_file->f_dentry->d_inode;
+-      dx_path_compat_init(&cpath, dir);
++      iam_path_compat_init(&cpath, dir);
+       if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
+               hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+               hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+@@ -865,7 +1384,7 @@
+       /* Add '.' and '..' from the htree header */
+       if (!start_hash && !start_minor_hash) {
+-              de = (struct ext3_dir_entry_2 *) path->dp_frames[0].bh->b_data;
++              de = (struct ext3_dir_entry_2 *) path->ip_frames[0].bh->b_data;
+               if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
+                       goto errout;
+               de = ext3_next_entry(de);
+@@ -875,7 +1394,7 @@
+       }
+       while (1) {
+-              block = dx_get_block(path, path->dp_frame->at);
++              block = dx_get_block(path, path->ip_frame->at);
+               ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
+                                            start_hash, start_minor_hash);
+               if (ret < 0) {
+@@ -900,12 +1419,12 @@
+                   (count && ((hashval & 1) == 0)))
+                       break;
+       }
+-      dx_path_fini(path);
++      iam_path_fini(path);
+       dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
+                      count, *next_hash));
+       return count;
+ errout:
+-      dx_path_fini(path);
++      iam_path_fini(path);
+       return (err);
+ }
+@@ -964,18 +1483,18 @@
+       } while(more);
+ }
+-static void dx_insert_block(struct dx_path *path,
+-                          struct dx_frame *frame, u32 hash, u32 block)
++static void dx_insert_block(struct iam_path *path,
++                          struct iam_frame *frame, u32 hash, u32 block)
+ {
+-      struct dx_entry *entries = frame->entries;
+-      struct dx_entry *old = frame->at, *new = dx_entry_shift(path, old, +1);
++      struct iam_entry *entries = frame->entries;
++      struct iam_entry *old = frame->at, *new = iam_entry_shift(path, old, +1);
+       int count = dx_get_count(entries);
+       assert(count < dx_get_limit(entries));
+-      assert(old < dx_entry_shift(path, entries, count));
+-      memmove(dx_entry_shift(path, new, 1), new,
+-              (char *)dx_entry_shift(path, entries, count) - (char *)new);
+-      dx_set_key(path, new, (struct dx_key *)&hash);
++      assert(old < iam_entry_shift(path, entries, count));
++      memmove(iam_entry_shift(path, new, 1), new,
++              (char *)iam_entry_shift(path, entries, count) - (char *)new);
++      dx_set_key(path, new, (struct iam_key *)&hash);
+       dx_set_block(path, new, block);
+       dx_set_count(entries, count + 1);
+ }
+@@ -1177,9 +1696,9 @@
+       struct super_block * sb;
+       struct dx_hash_info     hinfo;
+       u32 hash;
+-      struct dx_path_compat cpath;
+-      struct dx_path *path = &cpath.dpc_path;
+-      struct dx_entry_compat dummy_dot = {
++      struct iam_path_compat cpath;
++      struct iam_path *path = &cpath.ipc_path;
++      struct iam_entry_compat dummy_dot = {
+               .block = 0
+       };
+       struct ext3_dir_entry_2 *de, *top;
+@@ -1190,8 +1709,8 @@
+       const u8 *name = dentry->d_name.name;
+       struct inode *dir = dentry->d_parent->d_inode;
+-      dx_path_compat_init(&cpath, dir);
+-      
++      iam_path_compat_init(&cpath, dir);
++
+       sb = dir->i_sb;
+       /* NFS may look up ".." - look at dx_root directory block */
+       if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
+@@ -1199,13 +1718,15 @@
+               if (*err != 0)
+                       return NULL;
+       } else {
+-              path->dp_frame->bh = NULL;              /* for dx_path_fini() */
+-              path->dp_frame->at = (void *)&dummy_dot;/* hack for zero entry*/
++              path->ip_frame->bh = NULL;              /* for iam_path_fini() */
++              path->ip_frame->at = (void *)&dummy_dot;/* hack for zero entry*/
+       }
+       hash = hinfo.hash;
+       do {
+-              block = dx_get_block(path, path->dp_frame->at);
+-              if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
++              block = dx_get_block(path, path->ip_frame->at);
++              *err = path_descr(path)->id_node_read(path->ip_container, (iam_ptr_t)block,
++                                                   NULL, &bh);
++              if (*err != 0)
+                       goto errout;
+               de = (struct ext3_dir_entry_2 *) bh->b_data;
+               top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
+@@ -1220,7 +1741,7 @@
+                               goto errout;
+                       }
+                       *res_dir = de;
+-                      dx_path_fini(path);
++                      iam_path_fini(path);
+                       return bh;
+               }
+               brelse (bh);
+@@ -1238,7 +1759,7 @@
+       *err = -ENOENT;
+ errout:
+       dxtrace(printk("%s not found\n", name));
+-      dx_path_fini(path);
++      iam_path_fini(path);
+       return NULL;
+ }
+ #endif
+@@ -1363,11 +1884,11 @@
+ /* Allocate new node, and split leaf node @bh into it, inserting new pointer
+  * into parent node identified by @frame */
+-static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct dx_path *path,
+-                      struct buffer_head **bh,struct dx_frame *frame,
++static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct iam_path *path,
++                      struct buffer_head **bh,struct iam_frame *frame,
+                       struct dx_hash_info *hinfo, int *error)
+ {
+-      struct inode *dir = path->dp_object;
++      struct inode *dir = path_obj(path);
+       unsigned blocksize = dir->i_sb->s_blocksize;
+       unsigned count, continued;
+       struct buffer_head *bh2;
+@@ -1553,9 +2074,9 @@
+       int             namelen = dentry->d_name.len;
+       struct buffer_head *bh2;
+       struct dx_root  *root;
+-      struct dx_path_compat cpath;
+-      struct dx_path *path = &cpath.dpc_path;
+-      struct dx_entry *entries;
++      struct iam_path_compat cpath;
++      struct iam_path *path = &cpath.ipc_path;
++      struct iam_entry *entries;
+       struct ext3_dir_entry_2 *de, *de2;
+       char            *data1, *top;
+       unsigned        len;
+@@ -1565,7 +2086,7 @@
+       u32             block;
+       struct fake_dirent *fde;
+-      dx_path_compat_init(&cpath, dir);
++      iam_path_compat_init(&cpath, dir);
+       blocksize =  dir->i_sb->s_blocksize;
+       dxtrace(printk("Creating index\n"));
+       retval = ext3_journal_get_write_access(handle, bh);
+@@ -1612,12 +2133,12 @@
+       hinfo.hash_version = root->info.hash_version;
+       hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+       ext3fs_dirhash(name, namelen, &hinfo);
+-      path->dp_frame->entries = entries;
+-      path->dp_frame->at = entries;
+-      path->dp_frame->bh = bh;
++      path->ip_frame->entries = entries;
++      path->ip_frame->at = entries;
++      path->ip_frame->bh = bh;
+       bh = bh2;
+-      de = do_split(handle, path, &bh, path->dp_frame, &hinfo, &retval);
+-      dx_path_fini(path);
++      de = do_split(handle, path, &bh, path->ip_frame, &hinfo, &retval);
++      iam_path_fini(path);
+       if (!de)
+               return retval;
+@@ -1698,12 +2219,12 @@
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+                            struct inode *inode)
+ {
+-      struct dx_path_compat cpath;
+-      struct dx_path *path = &cpath.dpc_path;
+-      struct dx_param *param;
+-      struct dx_frame *frame, *safe;
+-      struct dx_entry *entries;   /* old block contents */
+-      struct dx_entry *entries2;  /* new block contents */
++      struct iam_path_compat cpath;
++      struct iam_path *path = &cpath.ipc_path;
++      struct iam_descr *param;
++      struct iam_frame *frame, *safe;
++      struct iam_entry *entries;   /* old block contents */
++      struct iam_entry *entries2;  /* new block contents */
+       struct dx_hash_info hinfo;
+       struct buffer_head * bh;
+       struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0};
+@@ -1716,20 +2237,22 @@
+       int i;
+       size_t isize;
+-      dx_path_compat_init(&cpath, dir);
+-      param = path->dp_param;
++      iam_path_compat_init(&cpath, dir);
++      param = path_descr(path);
+       err = dx_probe(dentry, NULL, &hinfo, path);
+       if (err != 0)
+               return err;
+-      frame = path->dp_frame;
++      frame = path->ip_frame;
+       entries = frame->entries;
+       /* XXX nikita: global serialization! */
+       isize = dir->i_size;
+-      if (!(bh = ext3_bread(handle, dir,
+-                            dx_get_block(path, frame->at), 0, &err)))
++      err = param->id_node_read(path->ip_container, 
++                                (iam_ptr_t)dx_get_block(path, 
++                                frame->at), handle, &bh);
++      if (err != 0)
+               goto cleanup;
+       BUFFER_TRACE(bh, "get_write_access");
+@@ -1761,7 +2284,7 @@
+                      dx_get_count(entries), dx_get_limit(entries)));
+       /* What levels need split? */
+-      for (nr_splet = 0; frame >= path->dp_frames &&
++      for (nr_splet = 0; frame >= path->ip_frames &&
+            dx_get_count(frame->entries) == dx_get_limit(frame->entries);
+            --frame, ++nr_splet) {
+               if (nr_splet == DX_MAX_TREE_HEIGHT) {
+@@ -1778,7 +2301,7 @@
+       for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
+               bh_new[i] = ext3_append (handle, dir, &newblock[i], &err);
+               if (!bh_new[i] ||
+-                  param->dpo_node_init(path, bh_new[i], 0) != 0)
++                  param->id_node_init(path->ip_container, bh_new[i], 0) != 0)
+                       goto cleanup;
+               BUFFER_TRACE(frame->bh, "get_write_access");
+               err = ext3_journal_get_write_access(handle, frame->bh);
+@@ -1786,7 +2309,7 @@
+                       goto journal_error;
+       }
+       /* Add "safe" node to transaction too */
+-      if (safe + 1 != path->dp_frames) {
++      if (safe + 1 != path->ip_frames) {
+               err = ext3_journal_get_write_access(handle, safe->bh);
+               if (err)
+                       goto journal_error;
+@@ -1800,12 +2323,12 @@
+               entries = frame->entries;
+               count = dx_get_count(entries);
+-              idx = dx_entry_diff(path, frame->at, entries);
++              idx = iam_entry_diff(path, frame->at, entries);
+               bh2 = bh_new[i];
+               entries2 = dx_get_entries(path, bh2->b_data, 0);
+-              if (frame == path->dp_frames) {
++              if (frame == path->ip_frames) {
+                       /* splitting root node. Tricky point:
+                        *
+                        * In the "normal" B-tree we'd split root *and* add
+@@ -1818,14 +2341,14 @@
+                        */
+                       struct dx_root *root;
+                       u8 indirects;
+-                      struct dx_frame *frames;
++                      struct iam_frame *frames;
+-                      frames = path->dp_frames;
++                      frames = path->ip_frames;
+                       root = (struct dx_root *) frames->bh->b_data;
+                       indirects = root->info.indirect_levels;
+                       dxtrace(printk("Creating new root %d\n", indirects));
+                       memcpy((char *) entries2, (char *) entries,
+-                             count * dx_entry_size(path));
++                             count * iam_entry_size(path));
+                       dx_set_limit(entries2, dx_node_limit(path));
+                       /* Set up root */
+@@ -1835,9 +2358,9 @@
+                       /* Shift frames in the path */
+                       memmove(frames + 2, frames + 1,
+-                              (sizeof path->dp_frames) - 2 * sizeof frames[0]);
++                              (sizeof path->ip_frames) - 2 * sizeof frames[0]);
+                       /* Add new access path frame */
+-                      frames[1].at = dx_entry_shift(path, entries2, idx);
++                      frames[1].at = iam_entry_shift(path, entries2, idx);
+                       frames[1].entries = entries = entries2;
+                       frames[1].bh = bh2;
+                       assert(dx_node_check(path, frame));
+@@ -1853,22 +2376,22 @@
+                       unsigned hash2;
+                       dx_get_key(path,
+-                                 dx_entry_shift(path, entries, count1),
+-                                 (struct dx_key *)&hash2);
++                                 iam_entry_shift(path, entries, count1),
++                                 (struct iam_key *)&hash2);
+                       dxtrace(printk("Split index %i/%i\n", count1, count2));
+                       memcpy ((char *) entries2,
+-                              (char *) dx_entry_shift(path, entries, count1),
+-                              count2 * dx_entry_size(path));
++                              (char *) iam_entry_shift(path, entries, count1),
++                              count2 * iam_entry_size(path));
+                       dx_set_count (entries, count1);
+                       dx_set_count (entries2, count2);
+                       dx_set_limit (entries2, dx_node_limit(path));
+                       /* Which index block gets the new entry? */
+                       if (idx >= count1) {
+-                              frame->at = dx_entry_shift(path, entries2,
+-                                                         idx - count1);
++                              frame->at = iam_entry_shift(path, entries2,
++                                                          idx - count1);
+                               frame->entries = entries = entries2;
+                               swap(frame->bh, bh2);
+                               bh_new[i] = bh2;
+@@ -1903,7 +2426,7 @@
+       }
+       if (err)
+               inode->i_size = isize;
+-      dx_path_fini(path);
++      iam_path_fini(path);
+       return err;
+ }
+ #endif
diff --git a/ldiskfs/kernel_patches/patches/ext3-htree-path-ops.patch b/ldiskfs/kernel_patches/patches/ext3-htree-path-ops.patch
new file mode 100644 (file)
index 0000000..ec66561
--- /dev/null
@@ -0,0 +1,1125 @@
+Index: iam-src/fs/ext3/namei.c
+===================================================================
+--- iam-src.orig/fs/ext3/namei.c       2006-02-15 18:31:48.000000000 +0300
++++ iam-src/fs/ext3/namei.c    2006-02-15 21:25:34.000000000 +0300
+@@ -51,7 +51,10 @@
+ /*
+  * Maximal number of non-leaf levels in htree. In the stock ext3 this is 2.
+  */
+-#define DX_MAX_TREE_HEIGHT (5)
++enum {
++      DX_MAX_TREE_HEIGHT = 5,
++      DX_SCRATCH_KEYS    = 2
++};
+ static struct buffer_head *ext3_append(handle_t *handle,
+                                       struct inode *inode,
+@@ -83,22 +86,22 @@ static struct buffer_head *ext3_append(h
+ #define dxtrace(command)
+ #endif
+-struct fake_dirent
+-{
++struct fake_dirent {
+       __le32 inode;
+       __le16 rec_len;
+       u8 name_len;
+       u8 file_type;
+ };
+-struct dx_countlimit
+-{
++struct dx_countlimit {
+       __le16 limit;
+       __le16 count;
+ };
+-struct dx_entry
+-{
++struct dx_entry; /* incomplete type */
++struct dx_key;   /* incomplete type */
++
++struct dx_entry_compat {
+       __le32 hash;
+       __le32 block;
+ };
+@@ -109,8 +112,7 @@ struct dx_entry
+  * hash version mod 4 should never be 0.  Sincerely, the paranoia department.
+  */
+-struct dx_root
+-{
++struct dx_root {
+       struct fake_dirent dot;
+       char dot_name[4];
+       struct fake_dirent dotdot;
+@@ -124,13 +126,13 @@ struct dx_root
+               u8 unused_flags;
+       }
+       info;
+-      struct dx_entry entries[0];
++      struct {} entries[0];
+ };
+ struct dx_node
+ {
+       struct fake_dirent fake;
+-      struct dx_entry entries[0];
++      struct {} entries[0];
+ };
+@@ -147,38 +149,88 @@ struct dx_map_entry
+       u32 offs;
+ };
++struct dx_path;
++struct dx_param {
++      size_t       dpo_key_size;
++      size_t       dpo_ptr_size;
++      size_t       dpo_node_gap;
++      size_t       dpo_root_gap;
++
++      u32 (*dpo_root_ptr)(struct dx_path *path);
++      int (*dpo_node_check)(struct dx_path *path,
++                            struct dx_frame *frame, void *cookie);
++      int (*dpo_node_init)(struct dx_path *path,
++                           struct buffer_head *bh, int root);
++      int (*dpo_keycmp)(struct dx_path *path,
++                        struct dx_key *k1, struct dx_key *k2);
++};
++
+ /*
+  * Structure to keep track of a path drilled through htree.
+  */
+ struct dx_path {
+-      struct inode    *dp_object;
+-      struct dx_frame  dp_frames[DX_MAX_TREE_HEIGHT];
+-      struct dx_frame *dp_frame;
++      struct inode         *dp_object;
++      struct dx_param      *dp_param;
++      int                   dp_indirect;
++      struct dx_frame       dp_frames[DX_MAX_TREE_HEIGHT];
++      struct dx_frame      *dp_frame;
++      struct dx_key        *dp_key_target;
++      struct dx_key        *dp_key_scratch[DX_SCRATCH_KEYS];
++};
++
++struct dx_path_compat {
++      struct dx_path dpc_path;
++      __u32          dpc_scrach[DX_SCRATCH_KEYS];
+ };
++static u32 htree_root_ptr(struct dx_path *p);
++static int htree_node_check(struct dx_path *path,
++                          struct dx_frame *frame, void *cookie);
++static int htree_node_init(struct dx_path *path,
++                         struct buffer_head *bh, int root);
++static int htree_keycmp(struct dx_path *path,
++                      struct dx_key *k1, struct dx_key *k2);
++
++static struct dx_param htree_compat_param = {
++      .dpo_key_size = sizeof ((struct dx_map_entry *)NULL)->hash,
++      .dpo_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs,
++      .dpo_node_gap = offsetof(struct dx_node, entries),
++      .dpo_root_gap = offsetof(struct dx_root, entries),
++
++      .dpo_root_ptr   = htree_root_ptr,
++      .dpo_node_check = htree_node_check,
++      .dpo_node_init  = htree_node_init,
++      .dpo_keycmp     = htree_keycmp
++};
++
++
+ #ifdef CONFIG_EXT3_INDEX
+-static inline unsigned dx_get_block (struct dx_entry *entry);
+-static void dx_set_block (struct dx_entry *entry, unsigned value);
+-static inline unsigned dx_get_hash (struct dx_entry *entry);
+-static void dx_set_hash (struct dx_entry *entry, unsigned value);
+-static unsigned dx_get_count (struct dx_entry *entries);
+-static unsigned dx_get_limit (struct dx_entry *entries);
+-static void dx_set_count (struct dx_entry *entries, unsigned value);
+-static void dx_set_limit (struct dx_entry *entries, unsigned value);
+-static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
+-static unsigned dx_node_limit (struct inode *dir);
+-static struct dx_frame *dx_probe(struct dentry *dentry,
+-                               struct inode *dir,
+-                               struct dx_hash_info *hinfo,
+-                               struct dx_path *path,
+-                               int *err);
++static inline unsigned dx_get_block(struct dx_path *p, struct dx_entry *entry);
++static void dx_set_block(struct dx_path *p,
++                       struct dx_entry *entry, unsigned value);
++static inline struct dx_key *dx_get_key(struct dx_path *p,
++                                      struct dx_entry *entry,
++                                      struct dx_key *key);
++static void dx_set_key(struct dx_path *p, struct dx_entry *entry,
++                     struct dx_key *key);
++static unsigned dx_get_count(struct dx_entry *entries);
++static unsigned dx_get_limit(struct dx_entry *entries);
++static void dx_set_count(struct dx_entry *entries, unsigned value);
++static void dx_set_limit(struct dx_entry *entries, unsigned value);
++static unsigned dx_root_limit(struct dx_path *p);
++static unsigned dx_node_limit(struct dx_path *p);
++static int dx_probe(struct dentry *dentry,
++                  struct inode *dir,
++                  struct dx_hash_info *hinfo,
++                  struct dx_path *path);
+ static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
+                       struct dx_hash_info *hinfo, struct dx_map_entry map[]);
+ static void dx_sort_map(struct dx_map_entry *map, unsigned count);
+ static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
+               struct dx_map_entry *offsets, int count);
+ static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+-static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
++static void dx_insert_block (struct dx_path *path,
++                           struct dx_frame *frame, u32 hash, u32 block);
+ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+                                struct dx_path *path, __u32 *start_hash);
+ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
+@@ -186,29 +238,72 @@ static struct buffer_head * ext3_dx_find
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+                            struct inode *inode);
++static inline void dx_path_init(struct dx_path *path, struct inode *inode);
++static inline void dx_path_fini(struct dx_path *path);
++
++
+ /*
+  * Future: use high four bits of block for coalesce-on-delete flags
+  * Mask them off for now.
+  */
+-static inline unsigned dx_get_block (struct dx_entry *entry)
++static inline void *entry_off(struct dx_entry *entry, ptrdiff_t off)
+ {
+-      return le32_to_cpu(entry->block) & 0x00ffffff;
++      return (void *)((char *)entry + off);
+ }
+-static inline void dx_set_block (struct dx_entry *entry, unsigned value)
++static inline size_t dx_entry_size(struct dx_path *p)
+ {
+-      entry->block = cpu_to_le32(value);
++      return p->dp_param->dpo_key_size + p->dp_param->dpo_ptr_size;
+ }
+-static inline unsigned dx_get_hash (struct dx_entry *entry)
++static inline struct dx_entry *dx_entry_shift(struct dx_path *p,
++                                            struct dx_entry *entry, int shift)
+ {
+-      return le32_to_cpu(entry->hash);
++      void *e = entry;
++      return e + shift * dx_entry_size(p);
++}
++
++static inline ptrdiff_t dx_entry_diff(struct dx_path *p,
++                                    struct dx_entry *e1, struct dx_entry *e2)
++{
++      ptrdiff_t diff;
++
++      diff = (void *)e1 - (void *)e2;
++      assert(diff / dx_entry_size(p) * dx_entry_size(p) == diff);
++      return diff / dx_entry_size(p);
+ }
+-static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
++static inline unsigned dx_get_block(struct dx_path *p, struct dx_entry *entry)
+ {
+-      entry->hash = cpu_to_le32(value);
++      return le32_to_cpu(*(u32 *)entry_off(entry, p->dp_param->dpo_key_size))
++              & 0x00ffffff;
++}
++
++static inline void dx_set_block(struct dx_path *p,
++                              struct dx_entry *entry, unsigned value)
++{
++      *(u32*)entry_off(entry, p->dp_param->dpo_key_size) = cpu_to_le32(value);
++}
++
++static inline struct dx_key *dx_get_key(struct dx_path *p,
++                                      struct dx_entry *entry,
++                                      struct dx_key *key)
++{
++      memcpy(key, entry, p->dp_param->dpo_key_size);
++      return key;
++}
++
++static inline struct dx_key *dx_key_at(struct dx_path *p,
++                                     struct dx_entry *entry)
++{
++      return (struct dx_key *)entry;
++}
++
++static inline void dx_set_key(struct dx_path *p,
++                            struct dx_entry *entry, struct dx_key *key)
++{
++      memcpy(entry, key, p->dp_param->dpo_key_size);
+ }
+ static inline unsigned dx_get_count (struct dx_entry *entries)
+@@ -231,17 +326,163 @@ static inline void dx_set_limit (struct 
+       ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
+ }
+-static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
++static inline unsigned dx_root_limit(struct dx_path *p)
++{
++      struct dx_param *param = p->dp_param;
++      unsigned entry_space   = p->dp_object->i_sb->s_blocksize -
++              param->dpo_root_gap;
++      return entry_space / (param->dpo_key_size + param->dpo_ptr_size);
++}
++
++static inline unsigned dx_node_limit(struct dx_path *p)
++{
++      struct dx_param *param = p->dp_param;
++      unsigned entry_space   = p->dp_object->i_sb->s_blocksize -
++              param->dpo_node_gap;
++      return entry_space / (param->dpo_key_size + param->dpo_ptr_size);
++}
++
++static inline int dx_index_is_compat(struct dx_path *path)
++{
++      return path->dp_param == &htree_compat_param;
++}
++
++static struct dx_entry *dx_get_entries(struct dx_path *path, void *data,
++                                     int root)
++{
++      return data +
++              (root ?
++               path->dp_param->dpo_root_gap : path->dp_param->dpo_node_gap);
++}
++
++static struct dx_entry *dx_node_get_entries(struct dx_path *path,
++                                          struct dx_frame *frame)
++{
++      return dx_get_entries(path,
++                            frame->bh->b_data, frame == path->dp_frames);
++}
++
++static inline struct dx_key *keycpy(struct dx_path *p,
++                                  struct dx_key *k1, struct dx_key *k2)
++{
++      return memcpy(k1, k2, p->dp_param->dpo_key_size);
++}
++
++static inline int keycmp(struct dx_path *p,
++                       struct dx_key *k1, struct dx_key *k2)
++{
++      return p->dp_param->dpo_keycmp(p, k1, k2);
++}
++
++static int dx_node_check(struct dx_path *p, struct dx_frame *f)
++{
++      struct dx_entry *e;
++      unsigned count;
++      unsigned  i;
++
++      e = dx_node_get_entries(p, f);
++      count = dx_get_count(e);
++      e = dx_entry_shift(p, e, 1);
++      for (i = 0; i < count - 1; ++i, e = dx_entry_shift(p, e, 1)) {
++              keycpy(p, p->dp_key_scratch[0], p->dp_key_scratch[1]);
++              dx_get_key(p, e, p->dp_key_scratch[1]);
++              if (i > 0 &&
++                  keycmp(p, p->dp_key_scratch[0], p->dp_key_scratch[1]) > 0)
++                      return 0;
++      }
++      return 1;
++}
++
++static u32 htree_root_ptr(struct dx_path *path)
++{
++      return 0;
++}
++
++struct htree_cookie {
++      struct dx_hash_info *hinfo;
++      struct dentry       *dentry;
++};
++
++static int htree_node_check(struct dx_path *path, struct dx_frame *frame,
++                          void *cookie)
++{
++      void *data;
++      struct dx_entry *entries;
++      struct super_block *sb;
++
++      data = frame->bh->b_data;
++      entries = dx_node_get_entries(path, frame);
++      sb = path->dp_object->i_sb;
++      if (frame == path->dp_frames) {
++              /* root node */
++              struct dx_root *root;
++              struct htree_cookie *hc = cookie;
++
++              root = data;
++              if (root->info.hash_version != DX_HASH_TEA &&
++                  root->info.hash_version != DX_HASH_HALF_MD4 &&
++                  root->info.hash_version != DX_HASH_R5 &&
++                  root->info.hash_version != DX_HASH_LEGACY) {
++                      ext3_warning(sb, __FUNCTION__,
++                                   "Unrecognised inode hash code %d",
++                                   root->info.hash_version);
++                      return ERR_BAD_DX_DIR;
++              }
++
++              if (root->info.unused_flags & 1) {
++                      ext3_warning(sb, __FUNCTION__,
++                                   "Unimplemented inode hash flags: %#06x",
++                                   root->info.unused_flags);
++                      return ERR_BAD_DX_DIR;
++              }
++
++              path->dp_indirect = root->info.indirect_levels;
++              if (path->dp_indirect > DX_MAX_TREE_HEIGHT - 1) {
++                      ext3_warning(sb, __FUNCTION__,
++                                   "Unimplemented inode hash depth: %#06x",
++                                   root->info.indirect_levels);
++                      return ERR_BAD_DX_DIR;
++              }
++
++              assert((char *)entries == (((char *)&root->info) +
++                                         root->info.info_length));
++              assert(dx_get_limit(entries) == dx_root_limit(path));
++
++              hc->hinfo->hash_version = root->info.hash_version;
++              hc->hinfo->seed = EXT3_SB(sb)->s_hash_seed;
++              if (hc->dentry)
++                      ext3fs_dirhash(hc->dentry->d_name.name,
++                                     hc->dentry->d_name.len, hc->hinfo);
++              path->dp_key_target = (struct dx_key *)&hc->hinfo->hash;
++      } else {
++              /* non-root index */
++              assert(entries == data + path->dp_param->dpo_node_gap);
++              assert(dx_get_limit(entries) == dx_node_limit(path));
++      }
++      frame->entries = frame->at = entries;
++      return 0;
++}
++
++static int htree_node_init(struct dx_path *path,
++                         struct buffer_head *bh, int root)
+ {
+-      unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
+-              EXT3_DIR_REC_LEN(2) - infosize;
+-      return 0? 20: entry_space / sizeof(struct dx_entry);
++      struct dx_node *node;
++
++      assert(!root);
++
++      node = (void *)bh->b_data;
++      node->fake.rec_len = cpu_to_le16(path->dp_object->i_sb->s_blocksize);
++      node->fake.inode = 0;
++      return 0;
+ }
+-static inline unsigned dx_node_limit (struct inode *dir)
++static int htree_keycmp(struct dx_path *path,
++                      struct dx_key *k1, struct dx_key *k2)
+ {
+-      unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
+-      return 0? 22: entry_space / sizeof(struct dx_entry);
++      __u32 p1 = le32_to_cpu(*(__u32 *)k1);
++      __u32 p2 = le32_to_cpu(*(__u32 *)k2);
++
++      return p1 > p2 ? +1 : (p1 < p2 ? -1 : 0);
+ }
+ /*
+@@ -327,123 +568,105 @@ struct stats dx_show_entries(struct dx_h
+ }
+ #endif /* DX_DEBUG */
+-/*
+- * Probe for a directory leaf block to search.
+- *
+- * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
+- * error in the directory index, and the caller should fall back to
+- * searching the directory normally.  The callers of dx_probe **MUST**
+- * check for this error code, and make sure it never gets reflected
+- * back to userspace.
+- */
+-static struct dx_frame *
+-dx_probe(struct dentry *dentry, struct inode *dir,
+-       struct dx_hash_info *hinfo, struct dx_path *path, int *err)
+-{
+-      unsigned count, indirect;
+-      struct dx_entry *at, *entries, *p, *q, *m;
+-      struct dx_root *root;
+-      struct buffer_head *bh;
+-      struct dx_frame *frame = path->dp_frames;
+-      u32 hash;
++static int dx_lookup(struct dx_path *path, void *cookie)
++{
++      u32 ptr;
++      int err;
++      int i;
+-      frame->bh = NULL;
+-      if (dentry)
+-              dir = dentry->d_parent->d_inode;
+-      if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
+-              goto fail;
+-      root = (struct dx_root *) bh->b_data;
+-      if (root->info.hash_version != DX_HASH_TEA &&
+-          root->info.hash_version != DX_HASH_HALF_MD4 &&
+-          root->info.hash_version != DX_HASH_R5 &&
+-          root->info.hash_version != DX_HASH_LEGACY) {
+-              ext3_warning(dir->i_sb, __FUNCTION__,
+-                           "Unrecognised inode hash code %d", root->info.hash_version);
+-              brelse(bh);
+-              *err = ERR_BAD_DX_DIR;
+-              goto fail;
+-      }
+-      hinfo->hash_version = root->info.hash_version;
+-      hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+-      if (dentry)
+-              ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
+-      hash = hinfo->hash;
+-
+-      if (root->info.unused_flags & 1) {
+-              ext3_warning(dir->i_sb, __FUNCTION__,
+-                           "Unimplemented inode hash flags: %#06x",
+-                           root->info.unused_flags);
+-              brelse(bh);
+-              *err = ERR_BAD_DX_DIR;
+-              goto fail;
+-      }
++      struct dx_param *param;
++      struct dx_frame *frame;
+-      if ((indirect = root->info.indirect_levels) > DX_MAX_TREE_HEIGHT - 1) {
+-              ext3_warning(dir->i_sb, __FUNCTION__,
+-                           "Unimplemented inode hash depth: %#06x",
+-                           root->info.indirect_levels);
+-              brelse(bh);
+-              *err = ERR_BAD_DX_DIR;
+-              goto fail;
+-      }
++      param = path->dp_param;
+-      entries = (struct dx_entry *) (((char *)&root->info) +
+-                                     root->info.info_length);
+-      assert(dx_get_limit(entries) == dx_root_limit(dir,
+-                                                    root->info.info_length));
+-      dxtrace (printk("Look up %x", hash));
+-      while (1)
+-      {
++      for (frame = path->dp_frames, i = 0,
++           ptr = param->dpo_root_ptr(path); i <= path->dp_indirect;
++           ptr = dx_get_block(path, frame->at), ++frame, ++i) {
++              struct dx_entry *entries;
++              struct dx_entry *p;
++              struct dx_entry *q;
++              struct dx_entry *m;
++              unsigned count;
++
++              frame->bh = ext3_bread(NULL, path->dp_object, ptr, 0, &err);
++              if (frame->bh == NULL) {
++                      err = -EIO;
++                      break;
++              }
++              err = param->dpo_node_check(path, frame, cookie);
++              if (err != 0)
++                      break;
++
++              assert(dx_node_check(path, frame));
++
++              entries = frame->entries;
+               count = dx_get_count(entries);
+-              assert (count && count <= dx_get_limit(entries));
+-              p = entries + 1;
+-              q = entries + count - 1;
+-              while (p <= q)
+-              {
+-                      m = p + (q - p)/2;
++              assert(count && count <= dx_get_limit(entries));
++              p = dx_entry_shift(path, entries, 1);
++              q = dx_entry_shift(path, entries, count - 1);
++              while (p <= q) {
++                      m = dx_entry_shift(path,
++                                         p, dx_entry_diff(path, q, p) / 2);
+                       dxtrace(printk("."));
+-                      if (dx_get_hash(m) > hash)
+-                              q = m - 1;
++                      if (keycmp(path, dx_key_at(path, m),
++                                 path->dp_key_target) > 0)
++                              q = dx_entry_shift(path, m, -1);
+                       else
+-                              p = m + 1;
++                              p = dx_entry_shift(path, m, +1);
+               }
+-              if (0) // linear search cross check
+-              {
++              frame->at = dx_entry_shift(path, p, -1);
++              if (1) { // linear search cross check
+                       unsigned n = count - 1;
++                      struct dx_entry *at;
++
+                       at = entries;
+-                      while (n--)
+-                      {
++                      while (n--) {
+                               dxtrace(printk(","));
+-                              if (dx_get_hash(++at) > hash)
+-                              {
+-                                      at--;
++                              at = dx_entry_shift(path, at, +1);
++                              if (keycmp(path, dx_key_at(path, at),
++                                         path->dp_key_target) > 0) {
++                                      if (at != dx_entry_shift(path, frame->at, 1)) {
++                                              BREAKPOINT;
++                                              printk(KERN_EMERG "%i\n",
++                                                     keycmp(path, dx_key_at(path, at),
++                                                            path->dp_key_target));
++                                      }
++                                      at = dx_entry_shift(path, at, -1);
+                                       break;
+                               }
+                       }
+-                      assert (at == p - 1);
++                      assert(at == frame->at);
+               }
+-
+-              at = p - 1;
+-              dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
+-              frame->bh = bh;
+-              frame->entries = entries;
+-              frame->at = at;
+-              if (!indirect--)
+-                      return path->dp_frame = frame;
+-              if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
+-                      goto fail2;
+-              at = entries = ((struct dx_node *) bh->b_data)->entries;
+-              assert (dx_get_limit(entries) == dx_node_limit (dir));
+-              frame++;
+-      }
+-fail2:
+-      while (frame >= path->dp_frames) {
+-              brelse(frame->bh);
+-              frame--;
+       }
+-fail:
+-      return NULL;
++      if (err != 0)
++              dx_path_fini(path);
++      path->dp_frame = --frame;
++      return err;
++}
++
++/*
++ * Probe for a directory leaf block to search.
++ *
++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
++ * error in the directory index, and the caller should fall back to
++ * searching the directory normally.  The callers of dx_probe **MUST**
++ * check for this error code, and make sure it never gets reflected
++ * back to userspace.
++ */
++static int dx_probe(struct dentry *dentry, struct inode *dir,
++                  struct dx_hash_info *hinfo, struct dx_path *path)
++{
++      int err;
++      struct htree_cookie hc = {
++              .dentry = dentry,
++              .hinfo  = hinfo
++      };
++
++      assert(dx_index_is_compat(path));
++      err = dx_lookup(path, &hc);
++      assert(err != 0 || path->dp_frames[path->dp_indirect].bh != NULL);
++      return err;
+ }
+ static inline void dx_path_init(struct dx_path *path, struct inode *inode)
+@@ -458,11 +681,24 @@ static inline void dx_path_fini(struct d
+       int i;
+       for (i = 0; i < ARRAY_SIZE(path->dp_frames); i++) {
+-              if (path->dp_frames[i].bh != NULL)
++              if (path->dp_frames[i].bh != NULL) {
+                       brelse(path->dp_frames[i].bh);
++                      path->dp_frames[i].bh = NULL;
++              }
+       }
+ }
++static void dx_path_compat_init(struct dx_path_compat *path,
++                              struct inode *inode)
++{
++      int i;
++      dx_path_init(&path->dpc_path, inode);
++      path->dpc_path.dp_param = &htree_compat_param;
++      for (i = 0; i < ARRAY_SIZE(path->dpc_path.dp_key_scratch); ++i)
++              path->dpc_path.dp_key_scratch[i] =
++                      (struct dx_key *)&path->dpc_scrach[i];
++}
++
+ /*
+  * This function increments the frame pointer to search the next leaf
+  * block, and reads in the necessary intervening nodes if the search
+@@ -488,6 +724,8 @@ static int ext3_htree_next_block(struct 
+       int err, num_frames = 0;
+       __u32 bhash;
++      assert(dx_index_is_compat(path));
++
+       p = path->dp_frame;
+       /*
+        * Find the next leaf page by incrementing the frame pointer.
+@@ -497,7 +735,9 @@ static int ext3_htree_next_block(struct 
+        * nodes need to be read.
+        */
+       while (1) {
+-              if (++(p->at) < p->entries + dx_get_count(p->entries))
++              p->at = dx_entry_shift(path, p->at, +1);
++              if (p->at < dx_entry_shift(path, p->entries,
++                                         dx_get_count(p->entries)))
+                       break;
+               if (p == path->dp_frames)
+                       return 0;
+@@ -512,7 +752,7 @@ static int ext3_htree_next_block(struct 
+        * desired contiuation hash.  If it doesn't, return since
+        * there's no point to read in the successive index pages.
+        */
+-      bhash = dx_get_hash(p->at);
++      dx_get_key(path, p->at, (struct dx_key *)&bhash);
+       if (start_hash)
+               *start_hash = bhash;
+       if ((hash & 1) == 0) {
+@@ -524,12 +764,14 @@ static int ext3_htree_next_block(struct 
+        * block so no check is necessary
+        */
+       while (num_frames--) {
+-              if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), 0, &err)))
++              if (!(bh = ext3_bread(NULL, dir,
++                                    dx_get_block(path, p->at), 0, &err)))
+                       return err; /* Failure */
+               ++p;
+               brelse (p->bh);
+               p->bh = bh;
+-              p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
++              p->at = p->entries = dx_node_get_entries(path, p);
++              assert(dx_node_check(path, p));
+       }
+       return 1;
+ }
+@@ -598,7 +840,8 @@ int ext3_htree_fill_tree(struct file *di
+ {
+       struct dx_hash_info hinfo;
+       struct ext3_dir_entry_2 *de;
+-      struct dx_path path;
++      struct dx_path_compat cpath;
++      struct dx_path *path = &cpath.dpc_path;
+       struct inode *dir;
+       int block, err;
+       int count = 0;
+@@ -608,7 +851,7 @@ int ext3_htree_fill_tree(struct file *di
+       dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
+                      start_minor_hash));
+       dir = dir_file->f_dentry->d_inode;
+-      dx_path_init(&path, dir);
++      dx_path_compat_init(&cpath, dir);
+       if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
+               hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+               hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+@@ -619,12 +862,13 @@ int ext3_htree_fill_tree(struct file *di
+       }
+       hinfo.hash = start_hash;
+       hinfo.minor_hash = 0;
+-      if (!dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, &path, &err))
++      err = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, path);
++      if (err != 0)
+               return err;
+       /* Add '.' and '..' from the htree header */
+       if (!start_hash && !start_minor_hash) {
+-              de = (struct ext3_dir_entry_2 *) path.dp_frames[0].bh->b_data;
++              de = (struct ext3_dir_entry_2 *) path->dp_frames[0].bh->b_data;
+               if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
+                       goto errout;
+               de = ext3_next_entry(de);
+@@ -634,7 +878,7 @@ int ext3_htree_fill_tree(struct file *di
+       }
+       while (1) {
+-              block = dx_get_block(path.dp_frame->at);
++              block = dx_get_block(path, path->dp_frame->at);
+               ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
+                                            start_hash, start_minor_hash);
+               if (ret < 0) {
+@@ -643,7 +887,8 @@ int ext3_htree_fill_tree(struct file *di
+               }
+               count += ret;
+               hashval = ~0;
+-              ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, &path, &hashval);
++              ret = ext3_htree_next_block(dir,
++                                          HASH_NB_ALWAYS, path, &hashval);
+               *next_hash = hashval;
+               if (ret < 0) {
+                       err = ret;
+@@ -658,12 +903,12 @@ int ext3_htree_fill_tree(struct file *di
+                   (count && ((hashval & 1) == 0)))
+                       break;
+       }
+-      dx_path_fini(&path);
++      dx_path_fini(path);
+       dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
+                      count, *next_hash));
+       return count;
+ errout:
+-      dx_path_fini(&path);
++      dx_path_fini(path);
+       return (err);
+ }
+@@ -722,17 +967,19 @@ static void dx_sort_map (struct dx_map_e
+       } while(more);
+ }
+-static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
++static void dx_insert_block(struct dx_path *path,
++                          struct dx_frame *frame, u32 hash, u32 block)
+ {
+       struct dx_entry *entries = frame->entries;
+-      struct dx_entry *old = frame->at, *new = old + 1;
++      struct dx_entry *old = frame->at, *new = dx_entry_shift(path, old, +1);
+       int count = dx_get_count(entries);
+       assert(count < dx_get_limit(entries));
+-      assert(old < entries + count);
+-      memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
+-      dx_set_hash(new, hash);
+-      dx_set_block(new, block);
++      assert(old < dx_entry_shift(path, entries, count));
++      memmove(dx_entry_shift(path, new, 1), new,
++              (char *)dx_entry_shift(path, entries, count) - (char *)new);
++      dx_set_key(path, new, (struct dx_key *)&hash);
++      dx_set_block(path, new, block);
+       dx_set_count(entries, count + 1);
+ }
+ #endif
+@@ -933,8 +1180,11 @@ static struct buffer_head * ext3_dx_find
+       struct super_block * sb;
+       struct dx_hash_info     hinfo;
+       u32 hash;
+-      struct dx_path path;
+-      struct dx_entry dummy_dot;
++      struct dx_path_compat cpath;
++      struct dx_path *path = &cpath.dpc_path;
++      struct dx_entry_compat dummy_dot = {
++              .block = 0
++      };
+       struct ext3_dir_entry_2 *de, *top;
+       struct buffer_head *bh;
+       unsigned long block;
+@@ -943,20 +1193,21 @@ static struct buffer_head * ext3_dx_find
+       const u8 *name = dentry->d_name.name;
+       struct inode *dir = dentry->d_parent->d_inode;
+-      dx_path_init(&path, dir);
++      dx_path_compat_init(&cpath, dir);
++
+       sb = dir->i_sb;
+       /* NFS may look up ".." - look at dx_root directory block */
+       if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
+-              if (!(dx_probe(dentry, NULL, &hinfo, &path, err)))
++              *err = dx_probe(dentry, NULL, &hinfo, path);
++              if (*err != 0)
+                       return NULL;
+       } else {
+-              path.dp_frame->bh = NULL;                       /* for dx_path_fini() */
+-              path.dp_frame->at = &dummy_dot;         /* hack for zero entry*/
+-              dx_set_block(path.dp_frame->at, 0);     /* dx_root block is 0 */
++              path->dp_frame->bh = NULL;              /* for dx_path_fini() */
++              path->dp_frame->at = (void *)&dummy_dot;/* hack for zero entry*/
+       }
+       hash = hinfo.hash;
+       do {
+-              block = dx_get_block(path.dp_frame->at);
++              block = dx_get_block(path, path->dp_frame->at);
+               if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
+                       goto errout;
+               de = (struct ext3_dir_entry_2 *) bh->b_data;
+@@ -972,12 +1223,12 @@ static struct buffer_head * ext3_dx_find
+                               goto errout;
+                       }
+                       *res_dir = de;
+-                      dx_path_fini(&path);
++                      dx_path_fini(path);
+                       return bh;
+               }
+               brelse (bh);
+               /* Check to see if we should continue to search */
+-              retval = ext3_htree_next_block(dir, hash, &path, NULL);
++              retval = ext3_htree_next_block(dir, hash, path, NULL);
+               if (retval < 0) {
+                       ext3_warning(sb, __FUNCTION__,
+                            "error reading index page in directory #%lu",
+@@ -990,7 +1241,7 @@ static struct buffer_head * ext3_dx_find
+       *err = -ENOENT;
+ errout:
+       dxtrace(printk("%s not found\n", name));
+-      dx_path_fini(&path);
++      dx_path_fini(path);
+       return NULL;
+ }
+ #endif
+@@ -1115,10 +1366,11 @@ static struct ext3_dir_entry_2* dx_pack_
+ /* Allocate new node, and split leaf node @bh into it, inserting new pointer
+  * into parent node identified by @frame */
+-static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
++static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct dx_path *path,
+                       struct buffer_head **bh,struct dx_frame *frame,
+                       struct dx_hash_info *hinfo, int *error)
+ {
++      struct inode *dir = path->dp_object;
+       unsigned blocksize = dir->i_sb->s_blocksize;
+       unsigned count, continued;
+       struct buffer_head *bh2;
+@@ -1180,7 +1432,7 @@ static struct ext3_dir_entry_2 *do_split
+               swap(*bh, bh2);
+               de = de2;
+       }
+-      dx_insert_block (frame, hash2 + continued, newblock);
++      dx_insert_block(path, frame, hash2 + continued, newblock);
+       err = ext3_journal_dirty_metadata (handle, bh2);
+       if (err)
+               goto journal_error;
+@@ -1303,7 +1555,8 @@ static int make_indexed_dir(handle_t *ha
+       int             namelen = dentry->d_name.len;
+       struct buffer_head *bh2;
+       struct dx_root  *root;
+-      struct dx_path path;
++      struct dx_path_compat cpath;
++      struct dx_path *path = &cpath.dpc_path;
+       struct dx_entry *entries;
+       struct ext3_dir_entry_2 *de, *de2;
+       char            *data1, *top;
+@@ -1314,7 +1567,7 @@ static int make_indexed_dir(handle_t *ha
+       u32             block;
+       struct fake_dirent *fde;
+-      dx_path_init(&path, dir);
++      dx_path_compat_init(&cpath, dir);
+       blocksize =  dir->i_sb->s_blocksize;
+       dxtrace(printk("Creating index\n"));
+       retval = ext3_journal_get_write_access(handle, bh);
+@@ -1350,21 +1603,21 @@ static int make_indexed_dir(handle_t *ha
+       root->info.info_length = sizeof(root->info);
+       root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+       root->info.hash_version = DX_HASH_R5;
+-      entries = root->entries;
+-      dx_set_block (entries, 1);
++      entries = (void *)root->entries;
++      dx_set_block (path, entries, 1);
+       dx_set_count (entries, 1);
+-      dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
++      dx_set_limit (entries, dx_root_limit(path));
+       /* Initialize as for dx_probe */
+       hinfo.hash_version = root->info.hash_version;
+       hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+       ext3fs_dirhash(name, namelen, &hinfo);
+-      path.dp_frame->entries = entries;
+-      path.dp_frame->at = entries;
+-      path.dp_frame->bh = bh;
++      path->dp_frame->entries = entries;
++      path->dp_frame->at = entries;
++      path->dp_frame->bh = bh;
+       bh = bh2;
+-      de = do_split(handle,dir, &bh, path.dp_frame, &hinfo, &retval);
+-      dx_path_fini(&path);
++      de = do_split(handle, path, &bh, path->dp_frame, &hinfo, &retval);
++      dx_path_fini(path);
+       if (!de)
+               return retval;
+@@ -1445,9 +1698,10 @@ static int ext3_add_entry (handle_t *han
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+                            struct inode *inode)
+ {
+-      struct dx_path path;
++      struct dx_path_compat cpath;
++      struct dx_path *path = &cpath.dpc_path;
++      struct dx_param *param;
+       struct dx_frame *frame, *safe;
+-      struct dx_node *node2;
+       struct dx_entry *entries;   /* old block contents */
+       struct dx_entry *entries2;  /* new block contents */
+       struct dx_hash_info hinfo;
+@@ -1462,16 +1716,20 @@ static int ext3_dx_add_entry(handle_t *h
+       int i;
+       size_t isize;
+-      dx_path_init(&path, dir);
+-      if (!dx_probe(dentry, NULL, &hinfo, &path, &err))
++      dx_path_compat_init(&cpath, dir);
++      param = path->dp_param;
++
++      err = dx_probe(dentry, NULL, &hinfo, path);
++      if (err != 0)
+               return err;
+-      frame = path.dp_frame;
++      frame = path->dp_frame;
+       entries = frame->entries;
+       /* XXX nikita: global serialization! */
+       isize = dir->i_size;
+-      if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
++      if (!(bh = ext3_bread(handle, dir,
++                            dx_get_block(path, frame->at), 0, &err)))
+               goto cleanup;
+       BUFFER_TRACE(bh, "get_write_access");
+@@ -1503,7 +1761,7 @@ static int ext3_dx_add_entry(handle_t *h
+                      dx_get_count(entries), dx_get_limit(entries)));
+       /* What levels need split? */
+-      for (nr_splet = 0; frame >= path.dp_frames &&
++      for (nr_splet = 0; frame >= path->dp_frames &&
+            dx_get_count(frame->entries) == dx_get_limit(frame->entries);
+            --frame, ++nr_splet) {
+               if (nr_splet == DX_MAX_TREE_HEIGHT) {
+@@ -1519,19 +1777,16 @@ static int ext3_dx_add_entry(handle_t *h
+        * transaction... */
+       for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
+               bh_new[i] = ext3_append (handle, dir, &newblock[i], &err);
+-              if (!bh_new[i])
++              if (!bh_new[i] ||
++                  param->dpo_node_init(path, bh_new[i], 0) != 0)
+                       goto cleanup;
+-              node2 = (struct dx_node *)(bh_new[i]->b_data);
+-              entries2 = node2->entries;
+-              node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
+-              node2->fake.inode = 0;
+               BUFFER_TRACE(frame->bh, "get_write_access");
+               err = ext3_journal_get_write_access(handle, frame->bh);
+               if (err)
+                       goto journal_error;
+       }
+       /* Add "safe" node to transaction too */
+-      if (safe + 1 != path.dp_frames) {
++      if (safe + 1 != path->dp_frames) {
+               err = ext3_journal_get_write_access(handle, safe->bh);
+               if (err)
+                       goto journal_error;
+@@ -1545,13 +1800,12 @@ static int ext3_dx_add_entry(handle_t *h
+               entries = frame->entries;
+               count = dx_get_count(entries);
+-              idx = frame->at - entries;
++              idx = dx_entry_diff(path, frame->at, entries);
+               bh2 = bh_new[i];
+-              node2 = (struct dx_node *)(bh2->b_data);
+-              entries2 = node2->entries;
++              entries2 = dx_get_entries(path, bh2->b_data, 0);
+-              if (frame == path.dp_frames) {
++              if (frame == path->dp_frames) {
+                       /* splitting root node. Tricky point:
+                        *
+                        * In the "normal" B-tree we'd split root *and* add
+@@ -1566,27 +1820,29 @@ static int ext3_dx_add_entry(handle_t *h
+                       u8 indirects;
+                       struct dx_frame *frames;
+-                      frames = path.dp_frames;
++                      frames = path->dp_frames;
+                       root = (struct dx_root *) frames->bh->b_data;
+                       indirects = root->info.indirect_levels;
+                       dxtrace(printk("Creating new root %d\n", indirects));
+                       memcpy((char *) entries2, (char *) entries,
+-                             count * sizeof(struct dx_entry));
+-                      dx_set_limit(entries2, dx_node_limit(dir));
++                             count * dx_entry_size(path));
++                      dx_set_limit(entries2, dx_node_limit(path));
+                       /* Set up root */
+                       dx_set_count(entries, 1);
+-                      dx_set_block(entries + 0, newblock[i]);
++                      dx_set_block(path, entries, newblock[i]);
+                       root->info.indirect_levels = indirects + 1;
+                       /* Shift frames in the path */
+                       memmove(frames + 2, frames + 1,
+-                              (sizeof path.dp_frames) - 2 * sizeof frames[0]);
++                              (sizeof path->dp_frames) - 2 * sizeof frames[0]);
+                       /* Add new access path frame */
+-                      frames[1].at = entries2 + idx;
++                      frames[1].at = dx_entry_shift(path, entries2, idx);
+                       frames[1].entries = entries = entries2;
+                       frames[1].bh = bh2;
++                      assert(dx_node_check(path, frame));
+                       ++ frame;
++                      assert(dx_node_check(path, frame));
+                       bh_new[i] = NULL; /* buffer head is "consumed" */
+                       err = ext3_journal_get_write_access(handle, bh2);
+                       if (err)
+@@ -1594,23 +1850,32 @@ static int ext3_dx_add_entry(handle_t *h
+               } else {
+                       /* splitting non-root index node. */
+                       unsigned count1 = count/2, count2 = count - count1;
+-                      unsigned hash2 = dx_get_hash(entries + count1);
++                      unsigned hash2;
++
++                      dx_get_key(path,
++                                 dx_entry_shift(path, entries, count1),
++                                 (struct dx_key *)&hash2);
++
+                       dxtrace(printk("Split index %i/%i\n", count1, count2));
+-                      memcpy ((char *) entries2, (char *) (entries + count1),
+-                              count2 * sizeof(struct dx_entry));
++                      memcpy ((char *) entries2,
++                              (char *) dx_entry_shift(path, entries, count1),
++                              count2 * dx_entry_size(path));
+                       dx_set_count (entries, count1);
+                       dx_set_count (entries2, count2);
+-                      dx_set_limit (entries2, dx_node_limit(dir));
++                      dx_set_limit (entries2, dx_node_limit(path));
+                       /* Which index block gets the new entry? */
+                       if (idx >= count1) {
+-                              frame->at = entries2 + idx - count1;
++                              frame->at = dx_entry_shift(path, entries2,
++                                                         idx - count1);
+                               frame->entries = entries = entries2;
+                               swap(frame->bh, bh2);
+                               bh_new[i] = bh2;
+                       }
+-                      dx_insert_block (frame - 1, hash2, newblock[i]);
++                      dx_insert_block(path, frame - 1, hash2, newblock[i]);
++                      assert(dx_node_check(path, frame));
++                      assert(dx_node_check(path, frame - 1));
+                       dxtrace(dx_show_index ("node", frame->entries));
+                       dxtrace(dx_show_index ("node",
+                              ((struct dx_node *) bh2->b_data)->entries));
+@@ -1619,9 +1884,10 @@ static int ext3_dx_add_entry(handle_t *h
+                               goto journal_error;
+               }
+       }
+-      de = do_split(handle, dir, &bh, --frame, &hinfo, &err);
++      de = do_split(handle, path, &bh, --frame, &hinfo, &err);
+       if (!de)
+               goto cleanup;
++      assert(dx_node_check(path, frame));
+       err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+       goto cleanup2;
+@@ -1637,7 +1903,7 @@ cleanup2:
+       }
+       if (err)
+               inode->i_size = isize;
+-      dx_path_fini(&path);
++      dx_path_fini(path);
+       return err;
+ }
+ #endif
diff --git a/ldiskfs/kernel_patches/patches/ext3-htree-path.patch b/ldiskfs/kernel_patches/patches/ext3-htree-path.patch
new file mode 100644 (file)
index 0000000..893d1d1
--- /dev/null
@@ -0,0 +1,406 @@
+Index: iam-src/fs/ext3/namei.c
+===================================================================
+--- iam-src.orig/fs/ext3/namei.c       2006-02-09 20:44:02.000000000 +0300
++++ iam-src/fs/ext3/namei.c    2006-02-10 18:23:32.000000000 +0300
+@@ -147,6 +147,15 @@ struct dx_map_entry
+       u32 offs;
+ };
++/*
++ * Structure to keep track of a path drilled through htree.
++ */
++struct dx_path {
++      struct inode    *dp_object;
++      struct dx_frame  dp_frames[DX_MAX_TREE_HEIGHT];
++      struct dx_frame *dp_frame;
++};
++
+ #ifdef CONFIG_EXT3_INDEX
+ static inline unsigned dx_get_block (struct dx_entry *entry);
+ static void dx_set_block (struct dx_entry *entry, unsigned value);
+@@ -161,9 +170,8 @@ static unsigned dx_node_limit (struct in
+ static struct dx_frame *dx_probe(struct dentry *dentry,
+                                struct inode *dir,
+                                struct dx_hash_info *hinfo,
+-                               struct dx_frame *frame,
++                               struct dx_path *path,
+                                int *err);
+-static void dx_release (struct dx_frame *frames);
+ static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
+                       struct dx_hash_info *hinfo, struct dx_map_entry map[]);
+ static void dx_sort_map(struct dx_map_entry *map, unsigned count);
+@@ -172,9 +180,7 @@ static struct ext3_dir_entry_2 *dx_move_
+ static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+ static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
+ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+-                               struct dx_frame *frame,
+-                               struct dx_frame *frames,
+-                               __u32 *start_hash);
++                               struct dx_path *path, __u32 *start_hash);
+ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
+                      struct ext3_dir_entry_2 **res_dir, int *err);
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+@@ -332,13 +338,13 @@ struct stats dx_show_entries(struct dx_h
+  */
+ static struct dx_frame *
+ dx_probe(struct dentry *dentry, struct inode *dir,
+-       struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
++       struct dx_hash_info *hinfo, struct dx_path *path, int *err)
+ {
+       unsigned count, indirect;
+       struct dx_entry *at, *entries, *p, *q, *m;
+       struct dx_root *root;
+       struct buffer_head *bh;
+-      struct dx_frame *frame = frame_in;
++      struct dx_frame *frame = path->dp_frames;
+       u32 hash;
+       frame->bh = NULL;
+@@ -352,8 +358,7 @@ dx_probe(struct dentry *dentry, struct i
+           root->info.hash_version != DX_HASH_R5 &&
+           root->info.hash_version != DX_HASH_LEGACY) {
+               ext3_warning(dir->i_sb, __FUNCTION__,
+-                           "Unrecognised inode hash code %d",
+-                           root->info.hash_version);
++                           "Unrecognised inode hash code %d", root->info.hash_version);
+               brelse(bh);
+               *err = ERR_BAD_DX_DIR;
+               goto fail;
+@@ -424,7 +429,8 @@ dx_probe(struct dentry *dentry, struct i
+               frame->bh = bh;
+               frame->entries = entries;
+               frame->at = at;
+-              if (!indirect--) return frame;
++              if (!indirect--)
++                      return path->dp_frame = frame;
+               if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
+                       goto fail2;
+               at = entries = ((struct dx_node *) bh->b_data)->entries;
+@@ -432,7 +438,7 @@ dx_probe(struct dentry *dentry, struct i
+               frame++;
+       }
+ fail2:
+-      while (frame >= frame_in) {
++      while (frame >= path->dp_frames) {
+               brelse(frame->bh);
+               frame--;
+       }
+@@ -440,16 +446,20 @@ fail:
+       return NULL;
+ }
+-static void dx_release (struct dx_frame *frames)
++static inline void dx_path_init(struct dx_path *path, struct inode *inode)
+ {
+-      int height;
++      memset(path, 0, sizeof *path);
++      path->dp_object = inode;
++      path->dp_frame = path->dp_frames;
++}
+-      if (frames[0].bh == NULL)
+-              return;
+-      height = ((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels;
+-      for (; height >= 0; height--) {
+-              assert(frames[height].bh != NULL);
+-              brelse(frames[height].bh);
++static inline void dx_path_fini(struct dx_path *path)
++{
++      int i;
++
++      for (i = 0; i < ARRAY_SIZE(path->dp_frames); i++) {
++              if (path->dp_frames[i].bh != NULL)
++                      brelse(path->dp_frames[i].bh);
+       }
+ }
+@@ -471,16 +481,14 @@ static void dx_release (struct dx_frame 
+  * hash of the next page.
+  */
+ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+-                               struct dx_frame *frame,
+-                               struct dx_frame *frames,
+-                               __u32 *start_hash)
++                               struct dx_path *path, __u32 *start_hash)
+ {
+       struct dx_frame *p;
+       struct buffer_head *bh;
+       int err, num_frames = 0;
+       __u32 bhash;
+-      p = frame;
++      p = path->dp_frame;
+       /*
+        * Find the next leaf page by incrementing the frame pointer.
+        * If we run out of entries in the interior node, loop around and
+@@ -491,10 +499,10 @@ static int ext3_htree_next_block(struct 
+       while (1) {
+               if (++(p->at) < p->entries + dx_get_count(p->entries))
+                       break;
+-              if (p == frames)
++              if (p == path->dp_frames)
+                       return 0;
+               num_frames++;
+-              p--;
++              --p;
+       }
+       /*
+@@ -516,10 +524,9 @@ static int ext3_htree_next_block(struct 
+        * block so no check is necessary
+        */
+       while (num_frames--) {
+-              if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
+-                                    0, &err)))
++              if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), 0, &err)))
+                       return err; /* Failure */
+-              p++;
++              ++p;
+               brelse (p->bh);
+               p->bh = bh;
+               p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
+@@ -591,7 +598,7 @@ int ext3_htree_fill_tree(struct file *di
+ {
+       struct dx_hash_info hinfo;
+       struct ext3_dir_entry_2 *de;
+-      struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame;
++      struct dx_path path;
+       struct inode *dir;
+       int block, err;
+       int count = 0;
+@@ -601,6 +608,7 @@ int ext3_htree_fill_tree(struct file *di
+       dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
+                      start_minor_hash));
+       dir = dir_file->f_dentry->d_inode;
++      dx_path_init(&path, dir);
+       if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
+               hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+               hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+@@ -611,13 +619,12 @@ int ext3_htree_fill_tree(struct file *di
+       }
+       hinfo.hash = start_hash;
+       hinfo.minor_hash = 0;
+-      frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
+-      if (!frame)
++      if (!dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, &path, &err))
+               return err;
+       /* Add '.' and '..' from the htree header */
+       if (!start_hash && !start_minor_hash) {
+-              de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
++              de = (struct ext3_dir_entry_2 *) path.dp_frames[0].bh->b_data;
+               if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
+                       goto errout;
+               de = ext3_next_entry(de);
+@@ -627,7 +634,7 @@ int ext3_htree_fill_tree(struct file *di
+       }
+       while (1) {
+-              block = dx_get_block(frame->at);
++              block = dx_get_block(path.dp_frame->at);
+               ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
+                                            start_hash, start_minor_hash);
+               if (ret < 0) {
+@@ -636,8 +643,7 @@ int ext3_htree_fill_tree(struct file *di
+               }
+               count += ret;
+               hashval = ~0;
+-              ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
+-                                          frame, frames, &hashval);
++              ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, &path, &hashval);
+               *next_hash = hashval;
+               if (ret < 0) {
+                       err = ret;
+@@ -652,12 +658,12 @@ int ext3_htree_fill_tree(struct file *di
+                   (count && ((hashval & 1) == 0)))
+                       break;
+       }
+-      dx_release(frames);
++      dx_path_fini(&path);
+       dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
+                      count, *next_hash));
+       return count;
+ errout:
+-      dx_release(frames);
++      dx_path_fini(&path);
+       return (err);
+ }
+@@ -927,7 +933,8 @@ static struct buffer_head * ext3_dx_find
+       struct super_block * sb;
+       struct dx_hash_info     hinfo;
+       u32 hash;
+-      struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame;
++      struct dx_path path;
++      struct dx_entry dummy_dot;
+       struct ext3_dir_entry_2 *de, *top;
+       struct buffer_head *bh;
+       unsigned long block;
+@@ -936,20 +943,20 @@ static struct buffer_head * ext3_dx_find
+       const u8 *name = dentry->d_name.name;
+       struct inode *dir = dentry->d_parent->d_inode;
++      dx_path_init(&path, dir);
+       sb = dir->i_sb;
+       /* NFS may look up ".." - look at dx_root directory block */
+       if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
+-              if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err)))
++              if (!(dx_probe(dentry, NULL, &hinfo, &path, err)))
+                       return NULL;
+       } else {
+-              frame = frames;
+-              frame->bh = NULL;                       /* for dx_release() */
+-              frame->at = (struct dx_entry *)frames;  /* hack for zero entry*/
+-              dx_set_block(frame->at, 0);             /* dx_root block is 0 */
++              path.dp_frame->bh = NULL;                       /* for dx_path_fini() */
++              path.dp_frame->at = &dummy_dot;         /* hack for zero entry*/
++              dx_set_block(path.dp_frame->at, 0);     /* dx_root block is 0 */
+       }
+       hash = hinfo.hash;
+       do {
+-              block = dx_get_block(frame->at);
++              block = dx_get_block(path.dp_frame->at);
+               if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
+                       goto errout;
+               de = (struct ext3_dir_entry_2 *) bh->b_data;
+@@ -965,13 +972,12 @@ static struct buffer_head * ext3_dx_find
+                               goto errout;
+                       }
+                       *res_dir = de;
+-                      dx_release (frames);
++                      dx_path_fini(&path);
+                       return bh;
+               }
+               brelse (bh);
+               /* Check to see if we should continue to search */
+-              retval = ext3_htree_next_block(dir, hash, frame,
+-                                             frames, NULL);
++              retval = ext3_htree_next_block(dir, hash, &path, NULL);
+               if (retval < 0) {
+                       ext3_warning(sb, __FUNCTION__,
+                            "error reading index page in directory #%lu",
+@@ -984,7 +990,7 @@ static struct buffer_head * ext3_dx_find
+       *err = -ENOENT;
+ errout:
+       dxtrace(printk("%s not found\n", name));
+-      dx_release (frames);
++      dx_path_fini(&path);
+       return NULL;
+ }
+ #endif
+@@ -1297,7 +1303,7 @@ static int make_indexed_dir(handle_t *ha
+       int             namelen = dentry->d_name.len;
+       struct buffer_head *bh2;
+       struct dx_root  *root;
+-      struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame;
++      struct dx_path path;
+       struct dx_entry *entries;
+       struct ext3_dir_entry_2 *de, *de2;
+       char            *data1, *top;
+@@ -1308,6 +1314,7 @@ static int make_indexed_dir(handle_t *ha
+       u32             block;
+       struct fake_dirent *fde;
++      dx_path_init(&path, dir);
+       blocksize =  dir->i_sb->s_blocksize;
+       dxtrace(printk("Creating index\n"));
+       retval = ext3_journal_get_write_access(handle, bh);
+@@ -1352,14 +1359,13 @@ static int make_indexed_dir(handle_t *ha
+       hinfo.hash_version = root->info.hash_version;
+       hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+       ext3fs_dirhash(name, namelen, &hinfo);
+-      frame = frames;
+-      frame->entries = entries;
+-      frame->at = entries;
+-      frame->bh = bh;
++      path.dp_frame->entries = entries;
++      path.dp_frame->at = entries;
++      path.dp_frame->bh = bh;
+       bh = bh2;
+-      de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
+-      dx_release (frames);
+-      if (!(de))
++      de = do_split(handle,dir, &bh, path.dp_frame, &hinfo, &retval);
++      dx_path_fini(&path);
++      if (!de)
+               return retval;
+       return add_dirent_to_buf(handle, dentry, inode, de, bh);
+@@ -1439,7 +1445,8 @@ static int ext3_add_entry (handle_t *han
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+                            struct inode *inode)
+ {
+-      struct dx_frame frames[DX_MAX_TREE_HEIGHT] = {{0,},}, *frame, *safe;
++      struct dx_path path;
++      struct dx_frame *frame, *safe;
+       struct dx_node *node2;
+       struct dx_entry *entries;   /* old block contents */
+       struct dx_entry *entries2;  /* new block contents */
+@@ -1455,9 +1462,10 @@ static int ext3_dx_add_entry(handle_t *h
+       int i;
+       size_t isize;
+-      frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
+-      if (!frame)
++      dx_path_init(&path, dir);
++      if (!dx_probe(dentry, NULL, &hinfo, &path, &err))
+               return err;
++      frame = path.dp_frame;
+       entries = frame->entries;
+       /* XXX nikita: global serialization! */
+@@ -1495,7 +1503,7 @@ static int ext3_dx_add_entry(handle_t *h
+                      dx_get_count(entries), dx_get_limit(entries)));
+       /* What levels need split? */
+-      for (nr_splet = 0; frame >= frames &&
++      for (nr_splet = 0; frame >= path.dp_frames &&
+            dx_get_count(frame->entries) == dx_get_limit(frame->entries);
+            --frame, ++nr_splet) {
+               if (nr_splet == DX_MAX_TREE_HEIGHT) {
+@@ -1523,7 +1531,7 @@ static int ext3_dx_add_entry(handle_t *h
+                       goto journal_error;
+       }
+       /* Add "safe" node to transaction too */
+-      if (safe + 1 != frames) {
++      if (safe + 1 != path.dp_frames) {
+               err = ext3_journal_get_write_access(handle, safe->bh);
+               if (err)
+                       goto journal_error;
+@@ -1543,7 +1551,7 @@ static int ext3_dx_add_entry(handle_t *h
+               node2 = (struct dx_node *)(bh2->b_data);
+               entries2 = node2->entries;
+-              if (frame == frames) {
++              if (frame == path.dp_frames) {
+                       /* splitting root node. Tricky point:
+                        *
+                        * In the "normal" B-tree we'd split root *and* add
+@@ -1556,7 +1564,9 @@ static int ext3_dx_add_entry(handle_t *h
+                        */
+                       struct dx_root *root;
+                       u8 indirects;
++                      struct dx_frame *frames;
++                      frames = path.dp_frames;
+                       root = (struct dx_root *) frames->bh->b_data;
+                       indirects = root->info.indirect_levels;
+                       dxtrace(printk("Creating new root %d\n", indirects));
+@@ -1571,7 +1581,7 @@ static int ext3_dx_add_entry(handle_t *h
+                       /* Shift frames in the path */
+                       memmove(frames + 2, frames + 1,
+-                              (sizeof frames) - 2 * sizeof frames[0]);
++                              (sizeof path.dp_frames) - 2 * sizeof frames[0]);
+                       /* Add new access path frame */
+                       frames[1].at = entries2 + idx;
+                       frames[1].entries = entries = entries2;
+@@ -1627,7 +1637,7 @@ cleanup2:
+       }
+       if (err)
+               inode->i_size = isize;
+-      dx_release(frames);
++      dx_path_fini(&path);
+       return err;
+ }
+ #endif
diff --git a/ldiskfs/kernel_patches/patches/ext3-htree-r5-hash.patch b/ldiskfs/kernel_patches/patches/ext3-htree-r5-hash.patch
new file mode 100644 (file)
index 0000000..48897e7
--- /dev/null
@@ -0,0 +1,88 @@
+Index: iam-src/fs/ext3/hash.c
+===================================================================
+--- iam-src.orig/fs/ext3/hash.c        2006-02-11 01:08:59.000000000 +0300
++++ iam-src/fs/ext3/hash.c     2006-02-11 20:46:22.000000000 +0300
+@@ -4,7 +4,7 @@
+  * Copyright (C) 2002 by Theodore Ts'o
+  *
+  * This file is released under the GPL v2.
+- * 
++ *
+  * This file may be redistributed under the terms of the GNU Public
+  * License.
+  */
+@@ -115,6 +115,18 @@ static __u32 dx_hack_hash (const char *n
+       return (hash0 << 1);
+ }
++static __u32 dx_r5_hash(const signed char *msg, int len)
++{
++      __u32 a = 0;
++      while (len--) {
++              a += *msg << 4;
++              a += *msg >> 4;
++              a *= 11;
++              msg++;
++      }
++      return a;
++}
++
+ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+ {
+       __u32   pad, val;
+@@ -146,11 +158,11 @@ static void str2hashbuf(const char *msg,
+  * Returns the hash of a filename.  If len is 0 and name is NULL, then
+  * this function can be used to test whether or not a hash version is
+  * supported.
+- * 
++ *
+  * The seed is an 4 longword (32 bits) "secret" which can be used to
+  * uniquify a hash.  If the seed is all zero's, then some default seed
+  * may be used.
+- * 
++ *
+  * A particular hash version specifies whether or not the seed is
+  * represented, and whether or not the returned hash is 32 bits or 64
+  * bits.  32 bit hashes will return 0 for the minor hash.
+@@ -205,6 +217,9 @@ int ext3fs_dirhash(const char *name, int
+               hash = buf[0];
+               minor_hash = buf[1];
+               break;
++      case DX_HASH_R5:
++              hash = dx_r5_hash(name, len);
++              break;
+       default:
+               hinfo->hash = 0;
+               return -1;
+Index: iam-src/fs/ext3/namei.c
+===================================================================
+--- iam-src.orig/fs/ext3/namei.c       2006-02-11 01:09:12.000000000 +0300
++++ iam-src/fs/ext3/namei.c    2006-02-11 20:45:58.000000000 +0300
+@@ -370,6 +370,7 @@ dx_probe(struct dentry *dentry, struct i
+       root = (struct dx_root *) bh->b_data;
+       if (root->info.hash_version != DX_HASH_TEA &&
+           root->info.hash_version != DX_HASH_HALF_MD4 &&
++          root->info.hash_version != DX_HASH_R5 &&
+           root->info.hash_version != DX_HASH_LEGACY) {
+               ext3_warning(dir->i_sb, __FUNCTION__,
+                            "Unrecognised inode hash code %d", root->info.hash_version);
+@@ -1363,6 +1364,7 @@ static int make_indexed_dir(handle_t *ha
+       memset (&root->info, 0, sizeof(root->info));
+       root->info.info_length = sizeof(root->info);
+       root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
++      root->info.hash_version = DX_HASH_R5;
+       entries = root->entries;
+       dx_set_block (entries, 1);
+       dx_set_count (entries, 1);
+Index: iam-src/include/linux/ext3_fs.h
+===================================================================
+--- iam-src.orig/include/linux/ext3_fs.h       2006-02-11 01:08:59.000000000 +0300
++++ iam-src/include/linux/ext3_fs.h    2006-02-11 20:45:58.000000000 +0300
+@@ -665,6 +665,7 @@ struct ext3_dir_entry_2 {
+ #define DX_HASH_LEGACY                0
+ #define DX_HASH_HALF_MD4      1
+ #define DX_HASH_TEA           2
++#define DX_HASH_R5            3
+ /* hash info structure used by the directory hash */
+ struct dx_hash_info
diff --git a/ldiskfs/kernel_patches/patches/ext3-iam-ops.patch b/ldiskfs/kernel_patches/patches/ext3-iam-ops.patch
new file mode 100644 (file)
index 0000000..e59f0c8
--- /dev/null
@@ -0,0 +1,1178 @@
+Index: iam/fs/ext3/namei.c
+===================================================================
+--- iam.orig/fs/ext3/namei.c
++++ iam/fs/ext3/namei.c
+@@ -82,13 +82,16 @@
+  *
+  * Entries in index node are sorted by their key value.
+  *
++ * Format of leaf node:
+  *
+- *
+- *
+- *
+- *
+- *
+- *
++ * +-----+-------+-------+-------+------+-------+------------+
++ * |     | count |       |       |      |       |            |
++ * | gap |   /   | leaf  | leaf  | .... | leaf  | free space |
++ * |     | limit |       |       |      |       |            |
++ * +-----+-------+-------+-------+------+-------+------------+
++ *
++ *       leaf          For a leaf entry: consists of a rec immediately followed by
++ *                     a key. Size of a key and size of a rec depend on container.
+  *
+  *
+  *
+@@ -96,6 +99,7 @@
+  *
+  */
++#include <linux/module.h>
+ #include <linux/fs.h>
+ #include <linux/pagemap.h>
+ #include <linux/jbd.h>
+@@ -111,7 +115,7 @@
+ #include "xattr.h"
+ #include "iopen.h"
+ #include "acl.h"
+-
++#include <linux/lustre_iam.h>
+ /*
+  * define how far ahead to read directories while searching them.
+  */
+@@ -120,13 +124,6 @@
+ #define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+ #define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
+-/*
+- * Maximal number of non-leaf levels in htree. In the stock ext3 this is 2.
+- */
+-enum {
+-      DX_MAX_TREE_HEIGHT = 5,
+-      DX_SCRATCH_KEYS    = 2
+-};
+ static struct buffer_head *ext3_append(handle_t *handle,
+                                       struct inode *inode,
+@@ -205,194 +202,6 @@ struct dx_map_entry
+       u32 offs;
+ };
+-/*
+- * Entry within index tree node. Consists of a key immediately followed
+- * (without padding) by a pointer to the child node.
+- *
+- * Both key and pointer are of variable size, hence incomplete type.
+- */
+-struct iam_entry;
+-
+-struct iam_entry_compat {
+-      __le32 hash;
+-      __le32 block;
+-};
+-
+-/*
+- * Incomplete type used to refer to keys in iam container.
+- *
+- * As key size can be different from container to container, iam has to use
+- * incomplete type. Clients cast pointer to iam_key to real key type and back.
+- */
+-struct iam_key;
+-
+-/* Incomplete type use to refer to the records stored in iam containers. */
+-struct iam_rec;
+-
+-typedef __u64 iam_ptr_t;
+-
+-/*
+- * Index node traversed during tree lookup.
+- */
+-struct iam_frame {
+-      struct buffer_head *bh;    /* buffer holding node data */
+-      struct iam_entry *entries; /* array of entries */
+-      struct iam_entry *at;      /* target entry, found by binary search */
+-};
+-
+-/* leaf node reached by tree lookup */
+-struct iam_leaf {
+-      struct buffer_head *bh;
+-      struct iam_leaf_entry *entries;
+-      struct iam_leaf_entry *at;
+-};
+-
+-struct iam_path;
+-struct iam_container;
+-
+-/*
+- * Parameters, describing a flavor of iam container.
+- */
+-struct iam_descr {
+-      /*
+-       * Size of a key in this container, in bytes.
+-       */
+-      size_t       id_key_size;
+-      /*
+-       * Size of a pointer to the next level (stored in index nodes), in
+-       * bytes.
+-       */
+-      size_t       id_ptr_size;
+-      /*
+-       * Size of a record (stored in leaf nodes), in bytes.
+-       */
+-      size_t       id_rec_size;
+-      /*
+-       * Size of unused (by iam) space at the beginning of every non-root
+-       * node, in bytes. Used for compatibility with ext3.
+-       */
+-      size_t       id_node_gap;
+-      /*
+-       * Size of unused (by iam) space at the beginning of root node, in
+-       * bytes. Used for compatibility with ext3.
+-       */
+-      size_t       id_root_gap;
+-
+-      /*
+-       * Returns pointer (in the same sense as pointer in index entry) to
+-       * the root node.
+-       */
+-      __u32 (*id_root_ptr)(struct iam_container *c);
+-
+-      /*
+-       * Check validity and consistency of index node. This is called when
+-       * iam just loaded new node into frame.
+-       */
+-      int (*id_node_check)(struct iam_path *path, struct iam_frame *frame);
+-      /*
+-       * Initialize new node (stored in @bh) that is going to be added into
+-       * tree.
+-       */
+-      int (*id_node_init)(struct iam_container *c,
+-                          struct buffer_head *bh, int root);
+-      int (*id_node_read)(struct iam_container *c, iam_ptr_t ptr,
+-                          handle_t *h, struct buffer_head **bh);
+-      /*
+-       * Key comparison function. Returns -1, 0, +1.
+-       */
+-      int (*id_keycmp)(struct iam_container *c,
+-                       struct iam_key *k1, struct iam_key *k2);
+-      /*
+-       * Create new container.
+-       *
+-       * Newly created container has a root node and a single leaf. Leaf
+-       * contains single record with the smallest possible key.
+-       */
+-      int (*id_create)(struct iam_container *c);
+-      struct {
+-              /*
+-               * leaf operations.
+-               */
+-              /*
+-               * returns true iff leaf is positioned at the last entry.
+-               */
+-              int (*at_end)(struct iam_container *c, struct iam_leaf *l);
+-              /* position leaf at the first entry */
+-              void (*start)(struct iam_container *c, struct iam_leaf *l);
+-              /* more leaf to the next entry. */
+-              void (*next)(struct iam_container *c, struct iam_leaf *l);
+-              /* return key of current leaf record in @k */
+-              void (*key)(struct iam_container *c, struct iam_leaf *l,
+-                          struct iam_key *k);
+-              /* return pointer to entry body */
+-              struct iam_rec *(*rec)(struct iam_container *c,
+-                                     struct iam_leaf *l);
+-      } id_leaf;
+-};
+-
+-struct iam_container {
+-      /*
+-       * Underlying flat file. IO against this object is issued to
+-       * read/write nodes.
+-       */
+-      struct inode     *ic_object;
+-      /*
+-       * container flavor.
+-       */
+-      struct iam_descr *ic_descr;
+-      /*
+-       * pointer to flavor-specific per-container data.
+-       */
+-      void             *ic_descr_data;
+-};
+-
+-/*
+- * Structure to keep track of a path drilled through htree.
+- */
+-struct iam_path {
+-      /*
+-       * Parent container.
+-       */
+-      struct iam_container  *ip_container;
+-      /*
+-       * Number of index levels minus one.
+-       */
+-      int                    ip_indirect;
+-      /*
+-       * Nodes that top-to-bottom traversal passed through.
+-       */
+-      struct iam_frame       ip_frames[DX_MAX_TREE_HEIGHT];
+-      /*
+-       * Last filled frame in ->ip_frames. Refers to the 'twig' node (one
+-       * immediately above leaf).
+-       */
+-      struct iam_frame      *ip_frame;
+-      /*
+-       * Leaf node: a child of ->ip_frame.
+-       */
+-      struct iam_leaf       *ip_leaf;
+-      /*
+-       * Key searched for.
+-       */
+-      struct iam_key        *ip_key_target;
+-      /*
+-       * Scratch-pad area for temporary keys.
+-       */
+-      struct iam_key        *ip_key_scratch[DX_SCRATCH_KEYS];
+-      /*
+-       * pointer to flavor-specific per-container data.
+-       */
+-      void                  *ip_descr_data;
+-};
+-
+-/*
+- * Helper structure for legacy htrees.
+- */
+-struct iam_path_compat {
+-      struct iam_path      ipc_path;
+-      struct iam_container ipc_container;
+-      __u32                ipc_scrach[DX_SCRATCH_KEYS];
+-};
+ static u32 htree_root_ptr(struct iam_container *c);
+ static int htree_node_check(struct iam_path *path, struct iam_frame *frame);
+@@ -427,58 +236,7 @@ struct iam_descr;
+ struct iam_container;
+ struct iam_path;
+-/*
+- * Initialize container @c, acquires additional reference on @inode.
+- */
+-int iam_container_init(struct iam_container *c,
+-                     struct iam_descr *descr, struct inode *inode);
+-/*
+- * Finalize container @c, release all resources.
+- */
+-void iam_container_fini(struct iam_container *c);
+-/*
+- * Search container @c for record with key @k. If record is found, its data
+- * are moved into @r.
+- *
+- *
+- *
+- * Return values: +ve: found, 0: not-found, -ve: error
+- */
+-int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r);
+-/*
+- * Insert new record @r with key @k into container @c (within context of
+- * transaction @h.
+- *
+- * Return values: 0: success, -ve: error, including -EEXIST when record with
+- * given key is already present.
+- *
+- * postcondition: ergo(result == 0 || result == -EEXIST,
+- *                                  iam_lookup(c, k, r2) > 0 &&
+- *                                  !memcmp(r, r2, c->ic_descr->id_rec_size));
+- */
+-int iam_insert(handle_t *h, struct iam_container *c,
+-             struct iam_key *k, struct iam_rec *r);
+-/*
+- * Replace existing record with key @k, or insert new one. New record data are
+- * in @r.
+- *
+- * Return values: 0: success, -ve: error.
+- *
+- * postcondition: ergo(result == 0, iam_lookup(c, k, r2) > 0 &&
+- *                                  !memcmp(r, r2, c->ic_descr->id_rec_size));
+- */
+-int iam_update(handle_t *h, struct iam_container *c,
+-             struct iam_key *k, struct iam_rec *r);
+-/*
+- * Delete existing record with key @k.
+- *
+- * Return values: 0: success, -ENOENT: not-found, -ve: other error.
+- *
+- * postcondition: ergo(result == 0 || result == -ENOENT,
+- *                                 !iam_lookup(c, k, *));
+- */
+-int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k);
+ /*
+  * iam cursor (iterator) api.
+@@ -508,6 +266,11 @@ enum iam_it_state {
+       IAM_IT_ATTACHED
+ };
++struct htree_cookie {
++      struct dx_hash_info *hinfo;
++      struct dentry       *dentry;
++};
++
+ /*
+  * Iterator.
+  *
+@@ -704,7 +467,7 @@ static int ext3_dx_add_entry(handle_t *h
+                            struct inode *inode);
+ static inline void iam_path_init(struct iam_path *path,
+-                               struct iam_container *c);
++                               struct iam_container *c, struct htree_cookie *hc);
+ static inline void iam_path_fini(struct iam_path *path);
+@@ -865,11 +628,6 @@ static u32 htree_root_ptr(struct iam_con
+       return 0;
+ }
+-struct htree_cookie {
+-      struct dx_hash_info *hinfo;
+-      struct dentry       *dentry;
+-};
+-
+ static int htree_node_check(struct iam_path *path, struct iam_frame *frame)
+ {
+       void *data;
+@@ -1171,11 +929,13 @@ void iam_container_fini(struct iam_conta
+       }
+ }
+-static inline void iam_path_init(struct iam_path *path, struct iam_container *c)
++static inline void iam_path_init(struct iam_path *path, struct iam_container *c, 
++                               struct htree_cookie *hc)
+ {
+       memset(path, 0, sizeof *path);
+       path->ip_container = c;
+       path->ip_frame = path->ip_frames;
++      path->ip_descr_data = hc;
+ }
+ static inline void iam_path_fini(struct iam_path *path)
+@@ -1201,7 +961,7 @@ static void iam_path_compat_init(struct 
+        * iam_path_fini().
+        */
+       iput(inode);
+-      iam_path_init(&path->ipc_path, &path->ipc_container);
++      iam_path_init(&path->ipc_path, &path->ipc_container, NULL);
+       for (i = 0; i < ARRAY_SIZE(path->ipc_path.ip_key_scratch); ++i)
+               path->ipc_path.ip_key_scratch[i] =
+                       (struct iam_key *)&path->ipc_scrach[i];
+@@ -1213,6 +973,425 @@ static void iam_path_compat_fini(struct 
+       iam_container_fini(&path->ipc_container);
+ }
++static int iam_leaf_init(struct iam_path *path, struct iam_leaf *leaf)
++{
++      int block, err;
++      struct buffer_head *bh;
++      
++      block = dx_get_block(path, path->ip_frame->at);
++      err = path_descr(path)->id_node_read(path->ip_container, block, 
++                                           NULL, &bh);
++      if (err)
++              return err;
++
++      leaf->bh = bh;
++      leaf->entries = (struct iam_leaf_entry *)bh->b_data;
++      return 0;
++}
++
++static void iam_leaf_fini(struct iam_leaf *leaf)
++{
++      if (leaf->bh)
++              brelse(leaf->bh);
++}
++
++/*
++ * Search container @c for record with key @k. If record is found, its data
++ * are moved into @r.
++ *
++ *
++ *
++ * Return values: +ve: found, 0: not-found, -ve: error
++ */
++
++int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r)
++{
++      struct dx_hash_info     hinfo;
++      struct iam_path_compat cpath;
++      struct iam_path *path = &cpath.ipc_path;
++      struct htree_cookie hc = {
++              .hinfo  = &hinfo
++      };
++      int err, i;
++
++      iam_path_init(path, c, &hc);
++      for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
++              path->ip_key_scratch[i] =
++                      (struct iam_key *)&cpath.ipc_scrach[i];
++      err = dx_lookup(path);
++      do {
++              struct iam_leaf leaf;
++              err = iam_leaf_init(path, &leaf);
++              if (err)
++                      goto errout;
++
++              for (path_descr(path)->id_leaf.start(c, &leaf);
++                   !path_descr(path)->id_leaf.at_end(c, &leaf);
++                   path_descr(path)->id_leaf.next(c, &leaf)) {
++                      struct iam_key *key;
++
++                      key = kmalloc(path_descr(path)->id_key_size, GFP_KERNEL);
++                      path_descr(path)->id_leaf.key(c, &leaf, key);
++                      if (keycmp(c, k, key) == 0) {
++                              memcpy(r, path_descr(path)->id_leaf.rec(c, &leaf),
++                                     path_descr(path)->id_rec_size);
++                              iam_path_fini(path);
++                              iam_leaf_fini(&leaf);
++                              return 0;
++                      }
++              }
++
++              iam_leaf_fini(&leaf);
++              /* Check to see if we should continue to search */
++              err = ext3_htree_next_block(c->ic_object, hinfo.hash, path, NULL);
++              if (err < 0)
++                      goto errout;
++      } while (err == 1);
++errout:
++      iam_path_fini(path);
++      return(err);
++}
++EXPORT_SYMBOL(iam_lookup);
++
++static inline size_t iam_leaf_entry_size(struct iam_path *p)
++{
++      return path_descr(p)->id_rec_size + path_descr(p)->id_key_size;
++}
++
++static inline ptrdiff_t iam_leaf_entry_diff(struct iam_path *p,
++                                    struct iam_leaf_entry *e1, struct iam_leaf_entry *e2)
++{
++      ptrdiff_t diff;
++
++      diff = (void *)e1 - (void *)e2;
++      assert(diff / iam_leaf_entry_size(p) * iam_leaf_entry_size(p) == diff);
++      return diff / iam_leaf_entry_size(p);
++}
++
++static inline struct iam_leaf_entry* 
++iam_leaf_entry_shift(struct iam_path *p, struct iam_leaf_entry *entry, int shift)
++{
++      void *e = entry;
++      return e + shift * iam_leaf_entry_size(p);
++}
++
++static inline struct iam_key *
++dx_leaf_get_key(struct iam_path *p, struct iam_leaf_entry *e, struct iam_key *key)
++{
++      memcpy(key, e, path_descr(p)->id_key_size);
++      return key;
++}
++
++static inline struct iam_key *
++iam_leaf_key_at(struct iam_path *p, struct iam_leaf_entry *entry)
++{
++      void *e = entry;
++      return e + path_descr(p)->id_rec_size;
++}
++static inline struct iam_leaf_entry *
++iam_leaf_entry_at(struct iam_path *p, struct iam_leaf_entry *entry)
++{
++      return entry; 
++}
++
++static int iam_leaf_lookup(struct iam_path *path, struct iam_leaf *leaf, 
++                         struct iam_key *k)
++{
++      struct iam_leaf_entry *p, *q, *m;
++      struct iam_leaf_entry *entries = leaf->entries;
++      int count = dx_get_count((struct iam_entry *)entries);
++      
++      p = iam_leaf_entry_shift(path, entries, 1);
++      q = iam_leaf_entry_shift(path, entries, count - 1);
++      while (p <= q) {
++              m = iam_leaf_entry_shift(path,
++                                 p, iam_leaf_entry_diff(path, q, p) / 2);
++              dxtrace(printk("."));
++              if (keycmp(path->ip_container, iam_leaf_key_at(path, m),
++                         path->ip_key_target) > 0)
++                      q = iam_leaf_entry_shift(path, m, -1);
++              else
++                      p = iam_leaf_entry_shift(path, m, +1);
++      }
++      leaf->at = q; 
++      return 0;
++}
++
++/* XXX what kind of lock should be taken for this entry: WangDi */
++static int iam_leaf_insert(handle_t *handle, struct iam_path *path, 
++                         struct iam_key *k, struct iam_rec *r)
++{
++      struct iam_leaf leaf;
++      struct iam_leaf_entry *p, *q;
++      int err, count;
++
++      err = iam_leaf_init(path, &leaf);
++      if (err)
++              goto errout;
++      path_descr(path)->id_leaf.start(path->ip_container, &leaf);
++      count = dx_get_count((struct iam_entry *)leaf.entries);
++      if (dx_get_count((struct iam_entry *)leaf.entries) >= 
++          dx_get_limit((struct iam_entry *)leaf.entries)){
++              err = -ENOSPC;
++              goto errout;
++      }
++
++      err = iam_leaf_lookup(path, &leaf, k);
++      if (err)
++              goto errout;
++      
++      /* insert the k/r pair into the leaf entries */
++      p = iam_leaf_entry_shift(path, leaf.at, 1);
++      q = iam_leaf_entry_shift(path, leaf.entries, count - 1);
++      while (q < p) {
++              memcpy(iam_leaf_entry_shift(path, q, 1), q, iam_leaf_entry_size(path));
++              q = iam_leaf_entry_shift(path, q, -1);  
++      }
++      memcpy(iam_leaf_entry_at(path, p), r, path_descr(path)->id_rec_size);
++      memcpy(iam_leaf_key_at(path, p), k, path_descr(path)->id_key_size);
++
++      dx_set_count((struct iam_entry*)leaf.entries, count + 1);
++      err = ext3_journal_dirty_metadata(handle, leaf.bh);
++      if (err)
++              ext3_std_error(path->ip_container->ic_object->i_sb, err);
++errout:       
++      iam_leaf_fini(&leaf);
++      return err;
++} 
++
++static int split_leaf_node(handle_t *handle, struct iam_path *path)
++{
++      struct inode *dir = path_obj(path);
++      unsigned continued = 0;
++      struct buffer_head *bh2;
++      u32 newblock, hash_split;
++      char *data2;
++      struct iam_leaf leaf;
++      unsigned split;
++      int     err;
++
++      bh2 = ext3_append (handle, dir, &newblock, &err);
++      if (!(bh2)) {
++              err = -ENOSPC;
++              goto errout;
++      }
++      err = iam_leaf_init(path, &leaf);
++      if (err)
++              goto errout;
++
++      BUFFER_TRACE(leaf.bh, "get_write_access");
++      err = ext3_journal_get_write_access(handle, leaf.bh);
++      if (err) {
++      journal_error:
++              iam_leaf_fini(&leaf);
++              brelse(bh2);
++              ext3_std_error(dir->i_sb, err);
++              err = -EIO;
++              goto errout;
++      }
++      data2 = bh2->b_data;
++      split = dx_get_count((struct iam_entry*)leaf.entries)/2;
++      hash_split = *(__u32*)iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split));
++      if (keycmp(path->ip_container, iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split)),
++                 iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split -1))) == 0)
++              continued = 1;
++
++      memcpy(iam_leaf_entry_shift(path, (struct iam_leaf_entry *)data2, 1),
++             iam_leaf_entry_shift(path, leaf.entries, split),
++             split * iam_leaf_entry_size(path));
++ 
++      /* Which block gets the new entry? */
++      dx_insert_block(path, path->ip_frame, hash_split + continued, newblock);
++      err = ext3_journal_dirty_metadata (handle, bh2);
++      if (err)
++              goto journal_error;
++      err = ext3_journal_dirty_metadata (handle, leaf.bh);
++      if (err)
++              goto journal_error;
++      brelse (bh2);
++      iam_leaf_fini(&leaf);
++errout:
++      return err;
++}
++
++static int split_index_node(handle_t *handle, struct iam_path *path);
++/*
++ * Insert new record @r with key @k into container @c (within context of
++ * transaction @h.
++ *
++ * Return values: 0: success, -ve: error, including -EEXIST when record with
++ * given key is already present.
++ *
++ * postcondition: ergo(result == 0 || result == -EEXIST,
++ *                                  iam_lookup(c, k, r2) > 0 &&
++ *                                  !memcmp(r, r2, c->ic_descr->id_rec_size));
++ */
++int iam_insert(handle_t *handle, struct iam_container *c, struct iam_key *k, 
++             struct iam_rec *r)
++{
++      struct dx_hash_info     hinfo;
++      struct iam_path_compat cpath;
++      struct iam_path *path = &cpath.ipc_path;
++      struct htree_cookie hc = {
++              .hinfo  = &hinfo
++      };
++      int err, i;
++
++      iam_path_init(path, c, &hc);
++      for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
++              path->ip_key_scratch[i] =
++                      (struct iam_key *)&cpath.ipc_scrach[i];
++      err = dx_lookup(path);
++      if (err)
++              goto errout; 
++
++      err = iam_leaf_insert(handle, path, k, r);
++      
++      if (err != -ENOSPC) 
++              goto errout;    
++
++      err = split_index_node(handle, path);
++      if (err)
++              goto errout;    
++
++      err = split_leaf_node(handle, path);
++      if (err)
++              goto errout;
++      
++      err = iam_leaf_insert(handle, path, k, r);
++errout:
++      iam_path_fini(path);
++      return(err);
++}
++
++EXPORT_SYMBOL(iam_insert);
++static int iam_leaf_delete(handle_t *handle, struct iam_path *path, 
++                         struct iam_key *k)
++{
++      struct iam_leaf leaf;
++      struct iam_leaf_entry *p, *q;
++      int err, count;
++
++      err = iam_leaf_init(path, &leaf);
++      if (err)
++              goto errout;
++      
++      err = iam_leaf_lookup(path, &leaf, k);
++      if (err)
++              goto errout;
++
++      count = dx_get_count((struct iam_entry*)leaf.entries);
++      /* delete the entry with key k from the leaf entries */
++      p = iam_leaf_entry_shift(path, leaf.at, 1);
++      q = iam_leaf_entry_shift(path, leaf.entries, count - 1);
++      while (p < q) {
++              memcpy(p, iam_leaf_entry_shift(path, p, 1), iam_leaf_entry_size(path));
++              p = iam_leaf_entry_shift(path, p, 1);
++      }
++      dx_set_count((struct iam_entry*)leaf.entries, count - 1);
++
++      err = ext3_journal_dirty_metadata(handle, leaf.bh);
++      if (err)
++              ext3_std_error(path_obj(path)->i_sb, err);
++errout:       
++      iam_leaf_fini(&leaf);
++      return err;
++}
++
++/*
++ * Delete existing record with key @k.
++ *
++ * Return values: 0: success, -ENOENT: not-found, -ve: other error.
++ *
++ * postcondition: ergo(result == 0 || result == -ENOENT,
++ *                                 !iam_lookup(c, k, *));
++ */
++int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k)
++{
++      struct dx_hash_info     hinfo;
++      struct iam_path_compat cpath;
++      struct iam_path *path = &cpath.ipc_path;
++      struct htree_cookie hc = {
++              .hinfo  = &hinfo
++      };
++      int err, i;
++
++      iam_path_init(path, c, &hc);
++      for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
++              path->ip_key_scratch[i] =
++                      (struct iam_key *)&cpath.ipc_scrach[i];
++      err = dx_lookup(path);
++      if (err)
++              goto errout; 
++
++      err = iam_leaf_delete(h, path, k);
++errout:
++      iam_path_fini(path);
++      return err;
++}
++
++EXPORT_SYMBOL(iam_delete);
++
++static int iam_leaf_update(handle_t *handle, struct iam_path *path, 
++                         struct iam_key *k, struct iam_rec *r)
++{
++      struct iam_leaf leaf;
++      int err;
++
++      err = iam_leaf_init(path, &leaf);
++      if (err)
++              goto errout;
++      
++      err = iam_leaf_lookup(path, &leaf, k);
++      if (err)
++              goto errout;
++
++      memcpy(iam_leaf_entry_at(path, leaf.at), r, path_descr(path)->id_rec_size);
++      memcpy(iam_leaf_key_at(path, leaf.at), k, path_descr(path)->id_key_size);
++
++      err = ext3_journal_dirty_metadata(handle, leaf.bh);
++      if (err)
++              ext3_std_error(path_obj(path)->i_sb, err);
++errout:       
++      iam_leaf_fini(&leaf);
++      return err;
++}
++/*
++ * Replace existing record with key @k, or insert new one. New record data are
++ * in @r.
++ *
++ * Return values: 0: success, -ve: error.
++ *
++ * postcondition: ergo(result == 0, iam_lookup(c, k, r2) > 0 &&
++ *                                  !memcmp(r, r2, c->ic_descr->id_rec_size));
++ */
++int iam_update(handle_t *h, struct iam_container *c,
++             struct iam_key *k, struct iam_rec *r)
++{
++      struct dx_hash_info     hinfo;
++      struct iam_path_compat cpath;
++      struct iam_path *path = &cpath.ipc_path;
++      struct htree_cookie hc = {
++              .hinfo  = &hinfo
++      };
++      int err, i;
++      
++      iam_path_init(path, c, &hc);
++      for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
++              path->ip_key_scratch[i] =
++                      (struct iam_key *)&cpath.ipc_scrach[i];
++      err = dx_lookup(path);
++      if (err)
++              goto errout; 
++
++      err = iam_leaf_update(h, path, k, r);
++errout:
++      iam_path_fini(path);
++      return err;
++}
++
++EXPORT_SYMBOL(iam_update);
++
+ /*
+  * This function increments the frame pointer to search the next leaf
+  * block, and reads in the necessary intervening nodes if the search
+@@ -2245,59 +2424,21 @@ static int ext3_add_entry (handle_t *han
+ }
+ #ifdef CONFIG_EXT3_INDEX
+-/*
+- * Returns 0 for success, or a negative error value
+- */
+-static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+-                           struct inode *inode)
++static int split_index_node(handle_t *handle, struct iam_path *path)
+ {
+-      struct iam_path_compat cpath;
+-      struct iam_path *path = &cpath.ipc_path;
+-      struct iam_descr *param;
+-      struct iam_frame *frame, *safe;
++
+       struct iam_entry *entries;   /* old block contents */
+       struct iam_entry *entries2;  /* new block contents */
+-      struct dx_hash_info hinfo;
+-      struct buffer_head * bh;
++      struct iam_frame *frame, *safe;
+       struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0};
+-      struct inode *dir = dentry->d_parent->d_inode;
+-      struct super_block * sb = dir->i_sb;
+-      struct ext3_dir_entry_2 *de;
+       u32 newblock[DX_MAX_TREE_HEIGHT] = {0};
+-      int err;
++      struct inode *dir = path_obj(path);
+       int nr_splet;
+-      int i;
+-      size_t isize;
+-
+-      iam_path_compat_init(&cpath, dir);
+-      param = path_descr(path);
++      int i, err;
+-      err = dx_probe(dentry, NULL, &hinfo, path);
+-      if (err != 0)
+-              return err;
+       frame = path->ip_frame;
+       entries = frame->entries;
+-      /* XXX nikita: global serialization! */
+-      isize = dir->i_size;
+-
+-      err = param->id_node_read(path->ip_container, 
+-                                (iam_ptr_t)dx_get_block(path, 
+-                                frame->at), handle, &bh);
+-      if (err != 0)
+-              goto cleanup;
+-
+-      BUFFER_TRACE(bh, "get_write_access");
+-      err = ext3_journal_get_write_access(handle, bh);
+-      if (err)
+-              goto journal_error;
+-
+-      err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+-      if (err != -ENOSPC) {
+-              bh = NULL;
+-              goto cleanup;
+-      }
+-
+       /*
+        * Tall-tree handling: we might have to split multiple index blocks
+        * all the way up to tree root. Tricky point here is error handling:
+@@ -2320,7 +2461,7 @@ static int ext3_dx_add_entry(handle_t *h
+            dx_get_count(frame->entries) == dx_get_limit(frame->entries);
+            --frame, ++nr_splet) {
+               if (nr_splet == DX_MAX_TREE_HEIGHT) {
+-                      ext3_warning(sb, __FUNCTION__,
++                      ext3_warning(dir->i_sb, __FUNCTION__,
+                                    "Directory index full!\n");
+                       err = -ENOSPC;
+                       goto cleanup;
+@@ -2333,7 +2474,7 @@ static int ext3_dx_add_entry(handle_t *h
+       for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
+               bh_new[i] = ext3_append (handle, dir, &newblock[i], &err);
+               if (!bh_new[i] ||
+-                  param->id_node_init(path->ip_container, bh_new[i], 0) != 0)
++                  path_descr(path)->id_node_init(path->ip_container, bh_new[i], 0) != 0)
+                       goto cleanup;
+               BUFFER_TRACE(frame->bh, "get_write_access");
+               err = ext3_journal_get_write_access(handle, frame->bh);
+@@ -2439,9 +2580,71 @@ static int ext3_dx_add_entry(handle_t *h
+                               goto journal_error;
+               }
+       }
++      goto cleanup;
++journal_error:
++      ext3_std_error(dir->i_sb, err);
++
++cleanup:
++      for (i = 0; i < ARRAY_SIZE(bh_new); ++i) {
++              if (bh_new[i] != NULL)
++                      brelse(bh_new[i]);
++      }
++      return err;
++}
++
++/*
++ * Returns 0 for success, or a negative error value
++ */
++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
++                           struct inode *inode)
++{
++      struct iam_path_compat cpath;
++      struct iam_path *path = &cpath.ipc_path;
++      struct iam_descr *param;
++      struct iam_frame *frame;
++      struct dx_hash_info hinfo;
++      struct buffer_head * bh = NULL;
++      struct inode *dir = dentry->d_parent->d_inode;
++      struct ext3_dir_entry_2 *de;
++      int err;
++      size_t isize;
++
++      iam_path_compat_init(&cpath, dir);
++      param = path_descr(path);
++
++      err = dx_probe(dentry, NULL, &hinfo, path);
++      if (err != 0)
++              return err;
++      frame = path->ip_frame;
++
++      /* XXX nikita: global serialization! */
++      isize = dir->i_size;
++
++      err = param->id_node_read(path->ip_container, (iam_ptr_t)dx_get_block(path, frame->at), 
++                                handle, &bh);
++      if (err != 0)
++              goto cleanup;
++
++      BUFFER_TRACE(bh, "get_write_access");
++      err = ext3_journal_get_write_access(handle, bh);
++      if (err)
++              goto journal_error;
++
++      err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
++      if (err != -ENOSPC) {
++              bh = NULL;
++              goto cleanup;
++      }
++      
++      err = split_index_node(handle, path);
++      if (err)
++              goto cleanup;   
++
++      /*copy split inode too*/
+       de = do_split(handle, path, &bh, --frame, &hinfo, &err);
+       if (!de)
+               goto cleanup;
++
+       assert(dx_node_check(path, frame));
+       err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+       goto cleanup2;
+@@ -2452,10 +2655,6 @@ cleanup:
+       if (bh)
+               brelse(bh);
+ cleanup2:
+-      for (i = 0; i < ARRAY_SIZE(bh_new); ++i) {
+-              if (bh_new[i] != NULL)
+-                      brelse(bh_new[i]);
+-      }
+       if (err)
+               inode->i_size = isize;
+       iam_path_fini(path);
+Index: iam/include/linux/lustre_iam.h
+===================================================================
+--- iam.orig/include/linux/lustre_iam.h
++++ iam/include/linux/lustre_iam.h
+@@ -0,0 +1,212 @@
++/*
++ * Maximal number of non-leaf levels in htree. In the stock ext3 this is 2.
++ */
++enum {
++      DX_MAX_TREE_HEIGHT = 5,
++      DX_SCRATCH_KEYS    = 2
++};
++
++/*
++ * Entry within index tree node. Consists of a key immediately followed
++ * (without padding) by a pointer to the child node.
++ *
++ * Both key and pointer are of variable size, hence incomplete type.
++ */
++struct iam_entry;
++
++struct iam_entry_compat {
++      __le32 hash;
++      __le32 block;
++};
++
++/*
++ * Incomplete type used to refer to keys in iam container.
++ *
++ * As key size can be different from container to container, iam has to use
++ * incomplete type. Clients cast pointer to iam_key to real key type and back.
++ */
++struct iam_key;
++
++/* Incomplete type use to refer to the records stored in iam containers. */
++struct iam_rec;
++
++typedef __u64 iam_ptr_t;
++
++/*
++ * Index node traversed during tree lookup.
++ */
++struct iam_frame {
++      struct buffer_head *bh;    /* buffer holding node data */
++      struct iam_entry *entries; /* array of entries */
++      struct iam_entry *at;      /* target entry, found by binary search */
++};
++
++/* leaf node reached by tree lookup */
++#define iam_leaf_entry iam_rec
++struct iam_leaf {
++      struct buffer_head *bh;
++      struct iam_leaf_entry *entries;
++      struct iam_leaf_entry *at;
++};
++
++struct iam_path;
++struct iam_container;
++
++/*
++ * Parameters, describing a flavor of iam container.
++ */
++struct iam_descr {
++      /*
++       * Size of a key in this container, in bytes.
++       */
++      size_t       id_key_size;
++      /*
++       * Size of a pointer to the next level (stored in index nodes), in
++       * bytes.
++       */
++      size_t       id_ptr_size;
++      /*
++       * Size of a record (stored in leaf nodes), in bytes.
++       */
++      size_t       id_rec_size;
++      /*
++       * Size of unused (by iam) space at the beginning of every non-root
++       * node, in bytes. Used for compatibility with ext3.
++       */
++      size_t       id_node_gap;
++      /*
++       * Size of unused (by iam) space at the beginning of root node, in
++       * bytes. Used for compatibility with ext3.
++       */
++      size_t       id_root_gap;
++
++      /*
++       * Returns pointer (in the same sense as pointer in index entry) to
++       * the root node.
++       */
++      __u32 (*id_root_ptr)(struct iam_container *c);
++
++      /*
++       * Check validity and consistency of index node. This is called when
++       * iam just loaded new node into frame.
++       */
++      int (*id_node_check)(struct iam_path *path, struct iam_frame *frame);
++      /*
++       * Initialize new node (stored in @bh) that is going to be added into
++       * tree.
++       */
++      int (*id_node_init)(struct iam_container *c,
++                          struct buffer_head *bh, int root);
++      int (*id_node_read)(struct iam_container *c, iam_ptr_t ptr,
++                          handle_t *h, struct buffer_head **bh);
++      /*
++       * Key comparison function. Returns -1, 0, +1.
++       */
++      int (*id_keycmp)(struct iam_container *c,
++                       struct iam_key *k1, struct iam_key *k2);
++      /*
++       * Create new container.
++       *
++       * Newly created container has a root node and a single leaf. Leaf
++       * contains single record with the smallest possible key.
++       */
++      int (*id_create)(struct iam_container *c);
++      struct {
++              /*
++               * leaf operations.
++               */
++              /*
++               * returns true iff leaf is positioned at the last entry.
++               */
++              int (*at_end)(struct iam_container *c, struct iam_leaf *l);
++              /* position leaf at the first entry */
++              void (*start)(struct iam_container *c, struct iam_leaf *l);
++              /* more leaf to the next entry. */
++              void (*next)(struct iam_container *c, struct iam_leaf *l);
++              /* return key of current leaf record in @k */
++              void (*key)(struct iam_container *c, struct iam_leaf *l,
++                          struct iam_key *k);
++              /* return pointer to entry body */
++              struct iam_rec *(*rec)(struct iam_container *c,
++                                     struct iam_leaf *l);
++      } id_leaf;
++};
++
++struct iam_container {
++      /*
++       * Underlying flat file. IO against this object is issued to
++       * read/write nodes.
++       */
++      struct inode     *ic_object;
++      /*
++       * container flavor.
++       */
++      struct iam_descr *ic_descr;
++      /*
++       * pointer to flavor-specific per-container data.
++       */
++      void             *ic_descr_data;
++};
++
++/*
++ * Structure to keep track of a path drilled through htree.
++ */
++struct iam_path {
++      /*
++       * Parent container.
++       */
++      struct iam_container  *ip_container;
++      /*
++       * Number of index levels minus one.
++       */
++      int                    ip_indirect;
++      /*
++       * Nodes that top-to-bottom traversal passed through.
++       */
++      struct iam_frame       ip_frames[DX_MAX_TREE_HEIGHT];
++      /*
++       * Last filled frame in ->ip_frames. Refers to the 'twig' node (one
++       * immediately above leaf).
++       */
++      struct iam_frame      *ip_frame;
++      /*
++       * Leaf node: a child of ->ip_frame.
++       */
++      struct iam_leaf       *ip_leaf;
++      /*
++       * Key searched for.
++       */
++      struct iam_key        *ip_key_target;
++      /*
++       * Scratch-pad area for temporary keys.
++       */
++      struct iam_key        *ip_key_scratch[DX_SCRATCH_KEYS];
++      /*
++       * pointer to flavor-specific per-container data.
++       */
++      void                  *ip_descr_data;
++};
++
++/*
++ * Helper structure for legacy htrees.
++ */
++struct iam_path_compat {
++      struct iam_path      ipc_path;
++      struct iam_container ipc_container;
++      __u32                ipc_scrach[DX_SCRATCH_KEYS];
++};
++
++int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r);
++int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k);
++int iam_update(handle_t *h, struct iam_container *c, struct iam_key *k, struct iam_rec *r);
++int iam_insert(handle_t *handle, struct iam_container *c, struct iam_key *k, struct iam_rec *r);
++/*
++ * Initialize container @c, acquires additional reference on @inode.
++ */
++int iam_container_init(struct iam_container *c,
++                     struct iam_descr *descr, struct inode *inode);
++/*
++ * Finalize container @c, release all resources.
++ */
++void iam_container_fini(struct iam_container *c);
++
diff --git a/ldiskfs/kernel_patches/patches/ext3-iam-separate.patch b/ldiskfs/kernel_patches/patches/ext3-iam-separate.patch
new file mode 100644 (file)
index 0000000..717ecce
--- /dev/null
@@ -0,0 +1,6758 @@
+Index: iam/include/linux/ext3_fs.h
+===================================================================
+--- iam.orig/include/linux/ext3_fs.h   2007-05-23 11:18:17.000000000 +0800
++++ iam/include/linux/ext3_fs.h        2007-05-23 11:18:20.000000000 +0800
+@@ -758,9 +758,7 @@
+ extern void rsv_window_add(struct super_block *sb, struct reserve_window_node *rsv);
+ /* dir.c */
+-extern int ext3_check_dir_entry(const char *, struct inode *,
+-                              struct ext3_dir_entry_2 *,
+-                              struct buffer_head *, unsigned long);
++
+ extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
+                                   __u32 minor_hash,
+                                   struct ext3_dir_entry_2 *dirent);
+Index: iam/include/linux/lustre_iam.h
+===================================================================
+--- iam.orig/include/linux/lustre_iam.h        2007-05-23 11:18:18.000000000 +0800
++++ iam/include/linux/lustre_iam.h     2007-05-23 11:18:20.000000000 +0800
+@@ -1,9 +1,68 @@
++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++ * vim:expandtab:shiftwidth=8:tabstop=8:
++ *
++ *  lustre_iam.c
++ *  Top-level entry points into osd module
++ *
++ *  Copyright (c) 2006 Cluster File Systems, Inc.
++ *   Author: Wang Di <wangdi@clusterfs.com>
++ *   Author: Nikita Danilov <nikita@clusterfs.com>
++ *
++ *   This file is part of the Lustre file system, http://www.lustre.org
++ *   Lustre is a trademark of Cluster File Systems, Inc.
++ *
++ *   You may have signed or agreed to another license before downloading
++ *   this software.  If so, you are bound by the terms and conditions
++ *   of that agreement, and the following does not apply to you.  See the
++ *   LICENSE file included with this distribution for more information.
++ *
++ *   If you did not agree to a different license, then this copy of Lustre
++ *   is open source software; you can redistribute it and/or modify it
++ *   under the terms of version 2 of the GNU General Public License as
++ *   published by the Free Software Foundation.
++ *
++ *   In either case, Lustre is distributed in the hope that it will be
++ *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
++ *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ *   license text for more details.
++ */
++
++#ifndef __LINUX_LUSTRE_IAM_H__
++#define __LINUX_LUSTRE_IAM_H__
++
++/* handle_t, journal_start(), journal_stop() */
++#include <linux/jbd.h>
++
+ /*
+- * Maximal number of non-leaf levels in htree. In the stock ext3 this is 2.
++ *  linux/include/linux/lustre_iam.h
+  */
++
+ enum {
++        /*
++         * Maximal number of non-leaf levels in htree. In the stock ext3 this
++         * is 2.
++         */
+       DX_MAX_TREE_HEIGHT = 5,
+-      DX_SCRATCH_KEYS    = 2
++        /*
++         * Scratch keys used by generic code for temporaries.
++         *
++         * Allocation:
++         *
++         *         [0] reserved for assertions and as a staging area for
++         *         record keys immediately used for key comparisons.
++         *
++         *         [1] reserved for record key, stored during iteration over
++         *         node records (see dx_node_check()).
++         *
++         *         [2] reserved for leaf node operations.
++         *
++         *         [3] reserved for index operations.
++         */
++      DX_SCRATCH_KEYS    = 4,
++        /*
++         * Maximal format name length.
++         */
++        DX_FMT_NAME_LEN    = 16
+ };
+ /*
+@@ -30,6 +89,11 @@
+ /* Incomplete type use to refer to the records stored in iam containers. */
+ struct iam_rec;
++struct iam_cookie {
++      struct iam_key *ic_key;
++      struct iam_rec *ic_rec;
++};
++
+ typedef __u64 iam_ptr_t;
+ /*
+@@ -41,45 +105,25 @@
+       struct iam_entry *at;      /* target entry, found by binary search */
+ };
+-/* leaf node reached by tree lookup */
+-#define iam_leaf_entry iam_rec
+-struct iam_leaf {
+-      struct buffer_head *bh;
+-      struct iam_leaf_entry *entries;
+-      struct iam_leaf_entry *at;
+-};
++/*
++ * Opaque entry in the leaf node.
++ */
++struct iam_lentry;
+ struct iam_path;
+ struct iam_container;
+-/*
+- * Parameters, describing a flavor of iam container.
+- */
+-struct iam_descr {
+-      /*
+-       * Size of a key in this container, in bytes.
+-       */
+-      size_t       id_key_size;
+-      /*
+-       * Size of a pointer to the next level (stored in index nodes), in
+-       * bytes.
+-       */
+-      size_t       id_ptr_size;
+-      /*
+-       * Size of a record (stored in leaf nodes), in bytes.
+-       */
+-      size_t       id_rec_size;
+-      /*
+-       * Size of unused (by iam) space at the beginning of every non-root
+-       * node, in bytes. Used for compatibility with ext3.
+-       */
+-      size_t       id_node_gap;
+-      /*
+-       * Size of unused (by iam) space at the beginning of root node, in
+-       * bytes. Used for compatibility with ext3.
+-       */
+-      size_t       id_root_gap;
++/* leaf node reached by tree lookup */
++struct iam_leaf {
++        struct iam_path    *il_path;
++      struct buffer_head *il_bh;
++      struct iam_lentry  *il_entries;
++      struct iam_lentry  *il_at;
++      void               *il_descr_data;
++};
++
++struct iam_operations {
+       /*
+        * Returns pointer (in the same sense as pointer in index entry) to
+        * the root node.
+@@ -102,8 +146,8 @@
+       /*
+        * Key comparison function. Returns -1, 0, +1.
+        */
+-      int (*id_keycmp)(struct iam_container *c,
+-                       struct iam_key *k1, struct iam_key *k2);
++      int (*id_keycmp)(const struct iam_container *c,
++                       const struct iam_key *k1, const struct iam_key *k2);
+       /*
+        * Create new container.
+        *
+@@ -111,25 +155,113 @@
+        * contains single record with the smallest possible key.
+        */
+       int (*id_create)(struct iam_container *c);
+-      struct {
++        /*
++         * Format name.
++         */
++        char id_name[DX_FMT_NAME_LEN];
++};
++
++struct iam_leaf_operations {
+               /*
+                * leaf operations.
+                */
++
++        /*
++         * initialize just loaded leaf node.
++         */
++        int (*init)(struct iam_leaf *p);
++        /*
++         * Format new node.
++         */
++        void (*init_new)(struct iam_container *c, struct buffer_head *bh);
++        /*
++         * Release resources.
++         */
++        void (*fini)(struct iam_leaf *l);
+               /*
+                * returns true iff leaf is positioned at the last entry.
+                */
+-              int (*at_end)(struct iam_container *c, struct iam_leaf *l);
++        int (*at_end)(const struct iam_leaf *l);
+               /* position leaf at the first entry */
+-              void (*start)(struct iam_container *c, struct iam_leaf *l);
++        void (*start)(struct iam_leaf *l);
+               /* more leaf to the next entry. */
+-              void (*next)(struct iam_container *c, struct iam_leaf *l);
+-              /* return key of current leaf record in @k */
+-              void (*key)(struct iam_container *c, struct iam_leaf *l,
+-                          struct iam_key *k);
+-              /* return pointer to entry body */
+-              struct iam_rec *(*rec)(struct iam_container *c,
+-                                     struct iam_leaf *l);
+-      } id_leaf;
++        void (*next)(struct iam_leaf *l);
++        /* return key of current leaf record. This method may return
++         * either pointer to the key stored in node, or copy key into
++         * @k buffer supplied by caller and return pointer to this
++         * buffer. The latter approach is used when keys in nodes are
++         * not stored in plain form (e.g., htree doesn't store keys at
++         * all).
++         *
++         * Caller should assume that returned pointer is only valid
++         * while leaf node is pinned and locked.*/
++        struct iam_key *(*key)(const struct iam_leaf *l, struct iam_key *k);
++        /* return pointer to entry body. Pointer is valid while
++           corresponding leaf node is locked and pinned. */
++        struct iam_rec *(*rec)(const struct iam_leaf *l);
++
++        void (*key_set)(struct iam_leaf *l, const struct iam_key *k);
++        void (*rec_set)(struct iam_leaf *l, const struct iam_rec *r);
++
++        /*
++         * Search leaf @l for a record with key @k or for a place
++         * where such record is to be inserted.
++         *
++         * Scratch keys from @path can be used.
++         */
++        int (*lookup)(struct iam_leaf *l, const struct iam_key *k);
++
++        int (*can_add)(const struct iam_leaf *l,
++                       const struct iam_key *k, const struct iam_rec *r);
++        /*
++         * add rec for a leaf
++         */
++        void (*rec_add)(struct iam_leaf *l,
++                        const struct iam_key *k, const struct iam_rec *r);
++        /*
++         * remove rec for a leaf
++         */
++        void (*rec_del)(struct iam_leaf *l);
++        /*
++         * split leaf node, moving some entries into @bh (the latter currently
++         * is assumed to be empty).
++         */
++        void (*split)(struct iam_leaf *l, struct buffer_head *bh);
++};
++
++struct iam_path *iam_leaf_path(const struct iam_leaf *leaf);
++struct iam_container *iam_leaf_container(const struct iam_leaf *leaf);
++
++/*
++ * Parameters, describing a flavor of iam container.
++ */
++struct iam_descr {
++      /*
++       * Size of a key in this container, in bytes.
++       */
++      size_t       id_key_size;
++      /*
++       * Size of a pointer to the next level (stored in index nodes), in
++       * bytes.
++       */
++      size_t       id_ptr_size;
++      /*
++       * Size of a record (stored in leaf nodes), in bytes.
++       */
++      size_t       id_rec_size;
++      /*
++       * Size of unused (by iam) space at the beginning of every non-root
++       * node, in bytes. Used for compatibility with ext3.
++       */
++      size_t       id_node_gap;
++      /*
++       * Size of unused (by iam) space at the beginning of root node, in
++       * bytes. Used for compatibility with ext3.
++       */
++      size_t       id_root_gap;
++
++        struct iam_operations           *id_ops;
++        struct iam_leaf_operations      *id_leaf_ops;
+ };
+ struct iam_container {
+@@ -142,10 +274,17 @@
+        * container flavor.
+        */
+       struct iam_descr *ic_descr;
++};
++
++/*
++ * description-specific part of iam_path. This is usually embedded into larger
++ * structure.
++ */
++struct iam_path_descr {
+       /*
+-       * pointer to flavor-specific per-container data.
++       * Scratch-pad area for temporary keys.
+        */
+-      void             *ic_descr_data;
++      struct iam_key        *ipd_key_scratch[DX_SCRATCH_KEYS];
+ };
+ /*
+@@ -172,36 +311,240 @@
+       /*
+        * Leaf node: a child of ->ip_frame.
+        */
+-      struct iam_leaf       *ip_leaf;
++      struct iam_leaf        ip_leaf;
+       /*
+        * Key searched for.
+        */
+-      struct iam_key        *ip_key_target;
+-      /*
+-       * Scratch-pad area for temporary keys.
+-       */
+-      struct iam_key        *ip_key_scratch[DX_SCRATCH_KEYS];
++      const struct iam_key  *ip_key_target;
+       /*
+-       * pointer to flavor-specific per-container data.
++       * Description-specific data.
+        */
+-      void                  *ip_descr_data;
++      struct iam_path_descr *ip_data;
+ };
++struct dx_hash_info;
++
+ /*
+  * Helper structure for legacy htrees.
+  */
+ struct iam_path_compat {
+       struct iam_path      ipc_path;
+       struct iam_container ipc_container;
+-      __u32                ipc_scrach[DX_SCRATCH_KEYS];
++      __u32                 ipc_scratch[DX_SCRATCH_KEYS];
++      struct dx_hash_info  *ipc_hinfo;
++      struct dentry        *ipc_dentry;
++      struct iam_path_descr ipc_descr;
++};
++
++/*
++ * iam cursor (iterator) api.
++ */
++
++/*
++ * States of iterator state machine.
++ */
++enum iam_it_state {
++      /* initial state */
++      IAM_IT_DETACHED,
++      /* iterator is above particular record in the container */
++      IAM_IT_ATTACHED
+ };
+-int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r);
+-int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k);
+-int iam_update(handle_t *h, struct iam_container *c, struct iam_key *k, struct iam_rec *r);
+-int iam_insert(handle_t *handle, struct iam_container *c, struct iam_key *k, struct iam_rec *r);
+ /*
+- * Initialize container @c, acquires additional reference on @inode.
++ * Flags controlling iterator functionality.
++ */
++enum iam_it_flags {
++      /*
++       * this iterator will move (iam_it_{prev,next}() will be called on it)
++       */
++      IAM_IT_MOVE  = (1 << 0),
++      /*
++       * tree can be updated through this iterator.
++       */
++      IAM_IT_WRITE = (1 << 1)
++};
++
++/*
++ * Iterator.
++ *
++ * Immediately after call to iam_it_init() iterator is in "detached"
++ * (IAM_IT_DETACHED) state: it is associated with given parent container, but
++ * doesn't point to any particular record in this container.
++ *
++ * After successful call to iam_it_get() and until corresponding call to
++ * iam_it_put() iterator is in "attached" state (IAM_IT_ATTACHED).
++ *
++ * Attached iterator can move through records in a container (provided
++ * IAM_IT_MOVE permission) in a key order, can get record and key values as it
++ * passes over them, and can modify container (provided IAM_IT_WRITE
++ * permission).
++ *
++ * Concurrency: iterators are supposed to be local to thread. Interfaces below
++ * do no internal serialization.
++ *
++ */
++struct iam_iterator {
++      /*
++       * iterator flags, taken from enum iam_it_flags.
++       */
++      __u32                 ii_flags;
++      enum iam_it_state     ii_state;
++      /*
++       * path to the record. Valid in IAM_IT_ATTACHED state.
++       */
++      struct iam_path       ii_path;
++};
++
++void iam_path_init(struct iam_path *path, struct iam_container *c,
++                 struct iam_path_descr *pd);
++void iam_path_fini(struct iam_path *path);
++
++void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode);
++void iam_path_compat_fini(struct iam_path_compat *path);
++
++struct iam_path_descr *iam_ipd_alloc(void *area, int keysize);
++void iam_ipd_free(struct iam_path_descr *ipd);
++
++/*
++ * Initialize iterator to IAM_IT_DETACHED state.
++ *
++ * postcondition: it_state(it) == IAM_IT_DETACHED
++ */
++int  iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags,
++               struct iam_path_descr *pd);
++/*
++ * Finalize iterator and release all resources.
++ *
++ * precondition: it_state(it) == IAM_IT_DETACHED
++ */
++void iam_it_fini(struct iam_iterator *it);
++
++/*
++ * Attach iterator. After successful completion, @it points to record with the
++ * largest key not larger than @k. Semantics of ->id_create() method guarantee
++ * that such record will always be found.
++ *
++ * Return value: 0: positioned on existing record,
++ *             -ve: error.
++ *
++ * precondition:  it_state(it) == IAM_IT_DETACHED
++ * postcondition: ergo(result == 0,
++ *                     (it_state(it) == IAM_IT_ATTACHED &&
++ *                      it_keycmp(it, iam_it_key_get(it, *), k) < 0))
++ */
++int iam_it_get(struct iam_iterator *it, const struct iam_key *k);
++
++/*
++ * Duplicates iterator.
++ *
++ * postcondition: it_state(dst) == it_state(src) &&
++ *                iam_it_container(dst) == iam_it_container(src) &&
+ *                dst->ii_flags == src->ii_flags &&
++ *                ergo(it_state(it) == IAM_IT_ATTACHED,
++ *                     iam_it_rec_get(dst) == iam_it_rec_get(src) &&
++ *                     iam_it_key_get(dst, *1) == iam_it_key_get(src, *2))
++ */
++void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src);
++
++/*
++ * Detach iterator. Does nothing if in detached state.
++ *
++ * postcondition: it_state(it) == IAM_IT_DETACHED
++ */
++void iam_it_put(struct iam_iterator *it);
++
++/*
++ * Move iterator one record right.
++ *
++ * Return value: 0: success,
++ *              +1: end of container reached
++ *             -ve: error
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_MOVE
++ * postcondition: ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED)
++ */
++int iam_it_next(struct iam_iterator *it);
++
++/*
++ * Return pointer to the record under iterator.
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
++ */
++struct iam_rec *iam_it_rec_get(const struct iam_iterator *it);
++
++/*
++ * Replace contents of record under iterator.
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
++ * postcondition: it_state(it) == IAM_IT_ATTACHED &&
++ *                ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...))
++ */
++int iam_it_rec_set(handle_t *h, struct iam_iterator *it, struct iam_rec *r);
++
++/*
++ * Place key under iterator in @k, return @k
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
++ */
++struct iam_key *iam_it_key_get(const struct iam_iterator *it,
++                               struct iam_key *k);
++
++/*
++ * Insert new record with key @k and contents from @r, shifting records to the
++ * right.
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED &&
++ *                it->ii_flags&IAM_IT_WRITE &&
++ *                it_keycmp(it, iam_it_key_get(it, *), k) < 0
++ * postcondition: it_state(it) == IAM_IT_ATTACHED &&
++ *                ergo(result == 0,
++ *                     it_keycmp(it, iam_it_key_get(it, *), k) == 0 &&
++ *                     !memcmp(iam_it_rec_get(it), r, ...))
++ */
++int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
++                    const struct iam_key *k, const struct iam_rec *r);
++/*
++ * Delete record under iterator.
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
++ */
++int iam_it_rec_delete(handle_t *h, struct iam_iterator *it);
++
++typedef __u64 iam_pos_t;
++
++/*
++ * Convert iterator to cookie.
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED &&
++ *                path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t)
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
++ */
++iam_pos_t iam_it_store(const struct iam_iterator *it);
++
++/*
++ * Restore iterator from cookie.
++ *
++ * precondition:  it_state(it) == IAM_IT_DETACHED && it->ii_flags&IAM_IT_MOVE &&
++ *                path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t)
++ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED &&
++ *                                  iam_it_store(it) == pos)
++ */
++int iam_it_load(struct iam_iterator *it, iam_pos_t pos);
++
++int iam_lookup(struct iam_container *c, const struct iam_key *k,
++               struct iam_rec *r, struct iam_path_descr *pd);
++int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k,
++             struct iam_path_descr *pd);
++int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k,
++             struct iam_rec *r, struct iam_path_descr *pd);
++int iam_insert(handle_t *handle, struct iam_container *c,
++               const struct iam_key *k,
++             struct iam_rec *r, struct iam_path_descr *pd);
++/*
++ * Initialize container @c.
+  */
+ int iam_container_init(struct iam_container *c,
+                      struct iam_descr *descr, struct inode *inode);
+@@ -210,3 +553,170 @@
+  */
+ void iam_container_fini(struct iam_container *c);
++/*
++ * Determine container format.
++ */
++int iam_container_setup(struct iam_container *c);
++
++#ifndef assert
++#define assert(test) J_ASSERT(test)
++#endif
++
++static inline struct iam_descr *iam_container_descr(struct iam_container *c)
++{
++        return c->ic_descr;
++}
++
++static inline struct iam_descr *iam_path_descr(const struct iam_path *p)
++{
++      return p->ip_container->ic_descr;
++}
++
++static inline struct inode *iam_path_obj(struct iam_path *p)
++{
++      return p->ip_container->ic_object;
++}
++
++static inline void iam_keycpy(const struct iam_container *c,
++                              struct iam_key *k1, const struct iam_key *k2)
++{
++      memcpy(k1, k2, c->ic_descr->id_key_size);
++}
++
++static inline int iam_keycmp(const struct iam_container *c,
++                           const struct iam_key *k1, const struct iam_key *k2)
++{
++      return c->ic_descr->id_ops->id_keycmp(c, k1, k2);
++}
++
++static inline void iam_reccpy(const struct iam_path *p, struct iam_rec *rec_dst,
++                            const struct iam_rec *rec_src)
++{
++      memcpy(rec_dst, rec_src, iam_path_descr(p)->id_rec_size);
++}
++
++static inline void *iam_entry_off(struct iam_entry *entry, size_t off)
++{
++      return (void *)((char *)entry + off);
++}
++
++/*XXX These stuff put here, just because they are used by iam.c and namei.c*/
++static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry)
++{
++      return le32_to_cpu(*(u32*)iam_entry_off(entry,
++                                              iam_path_descr(p)->id_key_size))
++              & 0x00ffffff;
++}
++
++static inline void dx_set_block(struct iam_path *p,
++                              struct iam_entry *entry, unsigned value)
++{
++      *(u32*)iam_entry_off(entry,
++                           iam_path_descr(p)->id_key_size) =
++              cpu_to_le32(value);
++}
++
++static inline void dx_set_key(struct iam_path *p, struct iam_entry *entry,
++                              const struct iam_key *key)
++{
++        iam_keycpy(p->ip_container, iam_entry_off(entry, 0), key);
++}
++
++struct dx_countlimit {
++      __le16 limit;
++      __le16 count;
++};
++
++static inline unsigned dx_get_count(struct iam_entry *entries)
++{
++      return le16_to_cpu(((struct dx_countlimit *) entries)->count);
++}
++
++static inline unsigned dx_get_limit(struct iam_entry *entries)
++{
++      return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
++}
++
++static inline void dx_set_count(struct iam_entry *entries, unsigned value)
++{
++      ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
++}
++
++static inline unsigned dx_node_limit(struct iam_path *p)
++{
++      struct iam_descr *param = iam_path_descr(p);
++      unsigned entry_space   = iam_path_obj(p)->i_sb->s_blocksize -
++              param->id_node_gap;
++      return entry_space / (param->id_key_size + param->id_ptr_size);
++}
++
++static inline struct iam_entry *dx_get_entries(struct iam_path *path,
++                                             void *data, int root)
++{
++      struct iam_descr *param = iam_path_descr(path);
++      return data + (root ? param->id_root_gap : param->id_node_gap);
++}
++
++
++static inline struct iam_entry *dx_node_get_entries(struct iam_path *path,
++                                                  struct iam_frame *frame)
++{
++      return dx_get_entries(path,
++                            frame->bh->b_data, frame == path->ip_frames);
++}
++
++static inline struct iam_key *iam_path_key(const struct iam_path *path, int nr)
++{
++      assert(0 <= nr && nr < ARRAY_SIZE(path->ip_data->ipd_key_scratch));
++      return path->ip_data->ipd_key_scratch[nr];
++}
++
++int dx_lookup(struct iam_path *path);
++void dx_insert_block(struct iam_path *path, struct iam_frame *frame,
++                   u32 hash, u32 block);
++
++int ext3_htree_next_block(struct inode *dir, __u32 hash,
++                        struct iam_path *path, __u32 *start_hash);
++
++struct buffer_head *ext3_append(handle_t *handle, struct inode *inode,
++                              u32 *block, int *err);
++int split_index_node(handle_t *handle, struct iam_path *path);
++
++/*
++ * external
++ */
++void iam_container_write_lock(struct iam_container *c);
++void iam_container_write_unlock(struct iam_container *c);
++
++void iam_container_read_lock(struct iam_container *c);
++void iam_container_read_unlock(struct iam_container *c);
++
++int iam_index_next(struct iam_container *c, struct iam_path *p);
++int iam_read_leaf(struct iam_path *p);
++
++int iam_node_read(struct iam_container *c, iam_ptr_t ptr,
++                handle_t *handle, struct buffer_head **bh);
++
++void iam_insert_key(struct iam_path *path, struct iam_frame *frame,
++                  const struct iam_key *key, iam_ptr_t ptr);
++
++int  iam_leaf_at_end(const struct iam_leaf *l);
++void iam_leaf_next(struct iam_leaf *folio);
++
++struct iam_path *iam_leaf_path(const struct iam_leaf *leaf);
++struct iam_container *iam_leaf_container(const struct iam_leaf *leaf);
++struct iam_descr *iam_leaf_descr(const struct iam_leaf *leaf);
++struct iam_leaf_operations *iam_leaf_ops(const struct iam_leaf *leaf);
++
++
++struct iam_format {
++        int (*if_guess)(struct iam_container *c);
++        struct list_head if_linkage;
++};
++
++void iam_format_register(struct iam_format *fmt);
++
++void iam_lfix_format_init(void);
++
++/* __LINUX_LUSTRE_IAM_H__ */
++#endif
+Index: iam/fs/ext3/iam.c
+===================================================================
+--- iam.orig/fs/ext3/iam.c     2007-05-23 09:56:30.476305206 +0800
++++ iam/fs/ext3/iam.c  2007-05-23 11:18:20.000000000 +0800
+@@ -0,0 +1,1436 @@
++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++ * vim:expandtab:shiftwidth=8:tabstop=8:
++ *
++ *  iam.c
++ *  Top-level entry points into iam module
++ *
++ *  Copyright (c) 2006 Cluster File Systems, Inc.
++ *   Author: Wang Di <wangdi@clusterfs.com>
++ *   Author: Nikita Danilov <nikita@clusterfs.com>
++ *
++ *   This file is part of the Lustre file system, http://www.lustre.org
++ *   Lustre is a trademark of Cluster File Systems, Inc.
++ *
++ *   You may have signed or agreed to another license before downloading
++ *   this software.  If so, you are bound by the terms and conditions
++ *   of that agreement, and the following does not apply to you.  See the
++ *   LICENSE file included with this distribution for more information.
++ *
++ *   If you did not agree to a different license, then this copy of Lustre
++ *   is open source software; you can redistribute it and/or modify it
++ *   under the terms of version 2 of the GNU General Public License as
++ *   published by the Free Software Foundation.
++ *
++ *   In either case, Lustre is distributed in the hope that it will be
++ *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
++ *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ *   license text for more details.
++ */
++
++/*
++ * iam: big theory statement.
++ *
++ * iam (Index Access Module) is a module providing abstraction of persistent
++ * transactional container on top of generalized ext3 htree.
++ *
++ * iam supports:
++ *
++ *     - key, pointer, and record size specifiable per container.
++ *
++ *     - trees taller than 2 index levels.
++ *
++ *     - read/write to existing ext3 htree directories as iam containers.
++ *
++ * iam container is a tree, consisting of leaf nodes containing keys and
++ * records stored in this container, and index nodes, containing keys and
++ * pointers to leaf or index nodes.
++ *
++ * iam does not work with keys directly, instead it calls user-supplied key
++ * comparison function (->dpo_keycmp()).
++ *
++ * Pointers are (currently) interpreted as logical offsets (measured in
++ * blocksful) within underlying flat file on top of which iam tree lives.
++ *
++ * On-disk format:
++ *
++ * iam mostly tries to reuse existing htree formats.
++ *
++ * Format of index node:
++ *
++ * +-----+-------+-------+-------+------+-------+------------+
++ * |     | count |       |       |      |       |            |
++ * | gap |   /   | entry | entry | .... | entry | free space |
++ * |     | limit |       |       |      |       |            |
++ * +-----+-------+-------+-------+------+-------+------------+
++ *
++ *       gap           this part of node is never accessed by iam code. It
++ *                     exists for binary compatibility with ext3 htree (that,
++ *                     in turn, stores fake struct ext2_dirent for ext2
++ *                     compatibility), and to keep some unspecified per-node
++ *                     data. Gap can be different for root and non-root index
++ *                     nodes. Gap size can be specified for each container
++ *                     (gap of 0 is allowed).
++ *
++ *       count/limit   current number of entries in this node, and the maximal
++ *                     number of entries that can fit into node. count/limit
++ *                     has the same size as entry, and is itself counted in
++ *                     count.
++ *
++ *       entry         index entry: consists of a key immediately followed by
++ *                     a pointer to a child node. Size of a key and size of a
++ *                     pointer depends on container. Entry has neither
++ *                     alignment nor padding.
++ *
++ *       free space    portion of node new entries are added to
++ *
++ * Entries in index node are sorted by their key value.
++ *
++ * Format of a leaf node is not specified. Generic iam code accesses leaf
++ * nodes through ->id_leaf methods in struct iam_descr.
++ *
++ */
++
++#include <linux/module.h>
++#include <linux/fs.h>
++#include <linux/pagemap.h>
++#include <linux/jbd.h>
++#include <linux/time.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/fcntl.h>
++#include <linux/stat.h>
++#include <linux/string.h>
++#include <linux/quotaops.h>
++#include <linux/buffer_head.h>
++#include <linux/smp_lock.h>
++#include <linux/lustre_iam.h>
++
++#include <libcfs/libcfs.h>
++#include <libcfs/kp30.h>
++
++#include "xattr.h"
++#include "iopen.h"
++#include "acl.h"
++
++/*
++ * List of all registered formats.
++ *
++ * No locking. Callers synchronize.
++ */
++static LIST_HEAD(iam_formats);
++
++void iam_format_register(struct iam_format *fmt)
++{
++        list_add(&fmt->if_linkage, &iam_formats);
++}
++EXPORT_SYMBOL(iam_format_register);
++
++/*
++ * Determine format of given container. This is done by scanning list of
++ * registered formats and calling ->if_guess() method of each in turn.
++ */
++static int iam_format_guess(struct iam_container *c)
++{
++        int result;
++        struct iam_format *fmt;
++
++        /*
++         * XXX temporary initialization hook.
++         */
++        {
++                static int initialized = 0;
++
++                if (!initialized) {
++                        /*
++                         * Keep that order: htree should be registered first,
++                         * so that iam_htree_guess() runs last.
++                         */
++                        iam_htree_format_init();
++                        iam_lvar_format_init();
++                        iam_lfix_format_init();
++                        initialized = 1;
++                }
++        }
++
++        result = -ENOENT;
++        list_for_each_entry(fmt, &iam_formats, if_linkage) {
++                result = fmt->if_guess(c);
++                if (result == 0)
++                        break;
++        }
++        return result;
++}
++
++/*
++ * Initialize container @c.
++ */
++int iam_container_init(struct iam_container *c,
++                     struct iam_descr *descr, struct inode *inode)
++{
++      memset(c, 0, sizeof *c);
++      c->ic_descr  = descr;
++      c->ic_object = inode;
++        init_rwsem(&c->ic_sem);
++        return 0;
++}
++EXPORT_SYMBOL(iam_container_init);
++
++/*
++ * Determine container format.
++ */
++int iam_container_setup(struct iam_container *c)
++{
++        return iam_format_guess(c);
++}
++EXPORT_SYMBOL(iam_container_setup);
++
++/*
++ * Finalize container @c, release all resources.
++ */
++void iam_container_fini(struct iam_container *c)
++{
++}
++EXPORT_SYMBOL(iam_container_fini);
++
++void iam_path_init(struct iam_path *path, struct iam_container *c,
++                   struct iam_path_descr *pd)
++{
++      memset(path, 0, sizeof *path);
++      path->ip_container = c;
++      path->ip_frame = path->ip_frames;
++      path->ip_data = pd;
++        path->ip_leaf.il_path = path;
++}
++
++static void iam_leaf_fini(struct iam_leaf *leaf);
++
++void iam_path_release(struct iam_path *path)
++{
++      int i;
++
++      for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) {
++              if (path->ip_frames[i].bh != NULL) {
++                      brelse(path->ip_frames[i].bh);
++                      path->ip_frames[i].bh = NULL;
++              }
++      }
++}
++
++void iam_path_fini(struct iam_path *path)
++{
++      iam_leaf_fini(&path->ip_leaf);
++        iam_path_release(path);
++}
++
++void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode)
++{
++      int i;
++
++        path->ipc_hinfo = &path->ipc_hinfo_area;
++      for (i = 0; i < ARRAY_SIZE(path->ipc_scratch); ++i)
++              path->ipc_descr.ipd_key_scratch[i] =
++                      (struct iam_ikey *)&path->ipc_scratch[i];
++
++      iam_container_init(&path->ipc_container,
++                           &iam_htree_compat_param, inode);
++      iam_path_init(&path->ipc_path, &path->ipc_container, &path->ipc_descr);
++}
++
++void iam_path_compat_fini(struct iam_path_compat *path)
++{
++      iam_path_fini(&path->ipc_path);
++      iam_container_fini(&path->ipc_container);
++}
++
++/*
++ * Helper function initializing iam_path_descr and its key scratch area.
++ */
++struct iam_path_descr *iam_ipd_alloc(void *area, int keysize)
++{
++        struct iam_path_descr *ipd;
++        void *karea;
++        int i;
++
++        ipd = area;
++        karea = ipd + 1;
++        for (i = 0; i < ARRAY_SIZE(ipd->ipd_key_scratch); ++i, karea += keysize)
++                ipd->ipd_key_scratch[i] = karea;
++        return ipd;
++}
++EXPORT_SYMBOL(iam_ipd_alloc);
++
++void iam_ipd_free(struct iam_path_descr *ipd)
++{
++}
++EXPORT_SYMBOL(iam_ipd_free);
++
++int iam_node_read(struct iam_container *c, iam_ptr_t ptr,
++                  handle_t *h, struct buffer_head **bh)
++{
++      int result = 0;
++
++      *bh = ext3_bread(h, c->ic_object, (int)ptr, 0, &result);
++      if (*bh == NULL)
++              result = -EIO;
++      return result;
++}
++
++/*
++ * Return pointer to current leaf record. Pointer is valid while corresponding
++ * leaf node is locked and pinned.
++ */
++static struct iam_rec *iam_leaf_rec(const struct iam_leaf *leaf)
++{
++      return iam_leaf_ops(leaf)->rec(leaf);
++}
++
++/*
++ * Return pointer to the current leaf key. This function returns pointer to
++ * the key stored in node.
++ *
++ * Caller should assume that returned pointer is only valid while leaf node is
++ * pinned and locked.
++ */
++static struct iam_key *iam_leaf_key(const struct iam_leaf *leaf)
++{
++      return iam_leaf_ops(leaf)->key(leaf);
++}
++
++static int iam_leaf_key_size(const struct iam_leaf *leaf)
++{
++      return iam_leaf_ops(leaf)->key_size(leaf);
++}
++
++static struct iam_ikey *iam_leaf_ikey(const struct iam_leaf *leaf,
++                                      struct iam_ikey *key)
++{
++      return iam_leaf_ops(leaf)->ikey(leaf, key);
++}
++
++static int iam_leaf_keycmp(const struct iam_leaf *leaf,
++                           const struct iam_key *key)
++{
++      return iam_leaf_ops(leaf)->key_cmp(leaf, key);
++}
++
++static int iam_leaf_keyeq(const struct iam_leaf *leaf,
++                          const struct iam_key *key)
++{
++      return iam_leaf_ops(leaf)->key_eq(leaf, key);
++}
++
++#if EXT3_INVARIANT_ON
++static int iam_leaf_check(struct iam_leaf *leaf);
++extern int dx_node_check(struct iam_path *p, struct iam_frame *f);
++
++static int iam_path_check(struct iam_path *p)
++{
++        int i;
++        int result;
++        struct iam_frame *f;
++        struct iam_descr *param;
++
++        result = 1;
++        param = iam_path_descr(p);
++        for (i = 0; result && i < ARRAY_SIZE(p->ip_frames); ++i) {
++                f = &p->ip_frames[i];
++                if (f->bh != NULL) {
++                        result = dx_node_check(p, f);
++                        if (result)
++                                result = !param->id_ops->id_node_check(p, f);
++                }
++        }
++        if (result && p->ip_leaf.il_bh != NULL)
++                result = iam_leaf_check(&p->ip_leaf);
++        if (result == 0) {
++                ext3_std_error(iam_path_obj(p)->i_sb, result);
++        }
++        return result;
++}
++#endif
++
++static int iam_leaf_load(struct iam_path *path)
++{
++      iam_ptr_t block;
++      int err;
++      struct iam_container *c;
++      struct buffer_head   *bh;
++      struct iam_leaf      *leaf;
++      struct iam_descr     *descr;
++      
++      c     = path->ip_container;
++      leaf  = &path->ip_leaf;
++      descr = iam_path_descr(path);
++      block = path->ip_frame->leaf;
++        if (block == 0) {
++                /* XXX bug 11027 */
++                printk(KERN_EMERG "wrong leaf: %lu %d [%p %p %p]\n",
++                       (long unsigned)path->ip_frame->leaf,
++                       dx_get_count(dx_node_get_entries(path, path->ip_frame)),
++                       path->ip_frames[0].bh, path->ip_frames[1].bh,
++                       path->ip_frames[2].bh);
++        }
++      err   = descr->id_ops->id_node_read(c, block, NULL, &bh);
++      if (err == 0) {
++              leaf->il_bh = bh;
++                leaf->il_curidx = block;
++              err = iam_leaf_ops(leaf)->init(leaf);
++                assert_inv(ergo(err == 0, iam_leaf_check(leaf)));
++      }
++      return err;
++}
++
++static void iam_leaf_unlock(struct iam_leaf *leaf)
++{
++        if (leaf->il_lock != NULL) {
++                dx_unlock_htree(iam_leaf_container(leaf)->ic_object,
++                                leaf->il_lock);
++              do_corr(schedule());
++                leaf->il_lock = NULL;
++        }
++}
++
++static void iam_leaf_fini(struct iam_leaf *leaf)
++{
++        if (leaf->il_path != NULL) {
++                iam_leaf_unlock(leaf);
++                assert_inv(ergo(leaf->il_bh != NULL, iam_leaf_check(leaf)));
++                iam_leaf_ops(leaf)->fini(leaf);
++                if (leaf->il_bh) {
++                        brelse(leaf->il_bh);
++                        leaf->il_bh = NULL;
++                        leaf->il_curidx = 0;
++                }
++        }
++}
++
++static void iam_leaf_start(struct iam_leaf *folio)
++{
++      iam_leaf_ops(folio)->start(folio);
++}
++
++void iam_leaf_next(struct iam_leaf *folio)
++{
++      iam_leaf_ops(folio)->next(folio);
++}
++
++static void iam_leaf_rec_add(struct iam_leaf *leaf, const struct iam_key *key,
++                             const struct iam_rec *rec)
++{
++        iam_leaf_ops(leaf)->rec_add(leaf, key, rec);
++}
++
++static void iam_rec_del(struct iam_leaf *leaf, int shift)
++{
++        iam_leaf_ops(leaf)->rec_del(leaf, shift);
++}
++
++int iam_leaf_at_end(const struct iam_leaf *leaf)
++{
++        return iam_leaf_ops(leaf)->at_end(leaf);
++}
++
++void iam_leaf_split(struct iam_leaf *l, struct buffer_head **bh, iam_ptr_t nr)
++{
++        iam_leaf_ops(l)->split(l, bh, nr);
++}
++
++int iam_leaf_can_add(const struct iam_leaf *l,
++                     const struct iam_key *k, const struct iam_rec *r)
++{
++        return iam_leaf_ops(l)->can_add(l, k, r);
++}
++
++#if EXT3_INVARIANT_ON
++static int iam_leaf_check(struct iam_leaf *leaf)
++{
++        return 1;
++#if 0
++        struct iam_lentry    *orig;
++        struct iam_path      *path;
++        struct iam_container *bag;
++        struct iam_ikey       *k0;
++        struct iam_ikey       *k1;
++        int result;
++        int first;
++
++        orig = leaf->il_at;
++        path = iam_leaf_path(leaf);
++        bag  = iam_leaf_container(leaf);
++
++        result = iam_leaf_ops(leaf)->init(leaf);
++        if (result != 0)
++                return result;
++
++        first = 1;
++        iam_leaf_start(leaf);
++        k0 = iam_path_ikey(path, 0);
++        k1 = iam_path_ikey(path, 1);
++        while (!iam_leaf_at_end(leaf)) {
++              iam_ikeycpy(bag, k0, k1);
++              iam_ikeycpy(bag, k1, iam_leaf_ikey(leaf, k1));
++                if (!first && iam_ikeycmp(bag, k0, k1) > 0) {
++                        return 0;
++                }
++                first = 0;
++                iam_leaf_next(leaf);
++        }
++        leaf->il_at = orig;
++        return 1;
++#endif
++}
++#endif
++
++static int iam_txn_dirty(handle_t *handle,
++                         struct iam_path *path, struct buffer_head *bh)
++{
++        int result;
++
++        result = ext3_journal_dirty_metadata(handle, bh);
++        if (result != 0)
++                ext3_std_error(iam_path_obj(path)->i_sb, result);
++        return result;
++}
++
++static int iam_txn_add(handle_t *handle,
++                       struct iam_path *path, struct buffer_head *bh)
++{
++        int result;
++
++        result = ext3_journal_get_write_access(handle, bh);
++        if (result != 0)
++                ext3_std_error(iam_path_obj(path)->i_sb, result);
++        return result;
++}
++
++/***********************************************************************/
++/* iterator interface                                                  */
++/***********************************************************************/
++
++static enum iam_it_state it_state(const struct iam_iterator *it)
++{
++        return it->ii_state;
++}
++
++/*
++ * Helper function returning the container the iterator's path belongs to.
++ */
++static struct iam_container *iam_it_container(const struct iam_iterator *it)
++{
++      return it->ii_path.ip_container;
++}
++
++static inline int it_keycmp(const struct iam_iterator *it,
++                            const struct iam_key *k)
++{
++      return iam_leaf_keycmp(&it->ii_path.ip_leaf, k);
++}
++
++static inline int it_keyeq(const struct iam_iterator *it,
++                           const struct iam_key *k)
++{
++      return iam_leaf_keyeq(&it->ii_path.ip_leaf, k);
++}
++
++static int it_ikeycmp(const struct iam_iterator *it, const struct iam_ikey *ik)
++{
++        return iam_ikeycmp(it->ii_path.ip_container,
++                           iam_leaf_ikey(&it->ii_path.ip_leaf,
++                                         iam_path_ikey(&it->ii_path, 0)), ik);
++}
++
++static inline int it_at_rec(const struct iam_iterator *it)
++{
++        return !iam_leaf_at_end(&it->ii_path.ip_leaf);
++}
++
++static inline int it_before(const struct iam_iterator *it)
++{
++        return it_state(it) == IAM_IT_SKEWED && it_at_rec(it);
++}
++
++/*
++ * Helper wrapper around iam_it_get(): returns 0 (success) only when record
++ * with exactly the same key as asked is found.
++ */
++static int iam_it_get_exact(struct iam_iterator *it, const struct iam_key *k)
++{
++        int result;
++
++        result = iam_it_get(it, k);
++        if (result > 0)
++                result = 0;
++        else if (result == 0)
++                /*
++                 * Return -ENOENT if cursor is located above record with a key
++                 * different from one specified, or in the empty leaf.
++                 *
++                 * XXX returning -ENOENT only works if iam_it_get() never
++                 * returns -ENOENT as a legitimate error.
++                 */
++                result = -ENOENT;
++        return result;
++}
++
++void iam_container_write_lock(struct iam_container *ic)
++{
++        down_write(&ic->ic_sem);
++}
++
++void iam_container_write_unlock(struct iam_container *ic)
++{
++        up_write(&ic->ic_sem);
++}
++
++void iam_container_read_lock(struct iam_container *ic)
++{
++        down_read(&ic->ic_sem);
++}
++
++void iam_container_read_unlock(struct iam_container *ic)
++{
++        up_read(&ic->ic_sem);
++}
++
++/*
++ * Initialize iterator to IAM_IT_DETACHED state.
++ *
++ * postcondition: it_state(it) == IAM_IT_DETACHED
++ */
++int  iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags,
++               struct iam_path_descr *pd)
++{
++      memset(it, 0, sizeof *it);
++      it->ii_flags  = flags;
++      it->ii_state  = IAM_IT_DETACHED;
++      iam_path_init(&it->ii_path, c, pd);
++      return 0;
++}
++EXPORT_SYMBOL(iam_it_init);
++
++/*
++ * Finalize iterator and release all resources.
++ *
++ * precondition: it_state(it) == IAM_IT_DETACHED
++ */
++void iam_it_fini(struct iam_iterator *it)
++{
++      assert_corr(it_state(it) == IAM_IT_DETACHED);
++      iam_path_fini(&it->ii_path);
++}
++EXPORT_SYMBOL(iam_it_fini);
++
++/*
++ * Performs tree top-to-bottom traversal starting from root, and loads leaf
++ * node.
++ */
++static int iam_path_lookup(struct iam_path *path, int index)
++{
++      struct iam_container *c;
++      struct iam_descr *descr;
++      struct iam_leaf  *leaf;
++      int result;
++      
++      c = path->ip_container;
++      leaf = &path->ip_leaf;
++      descr = iam_path_descr(path);
++      result = dx_lookup_lock(path, &leaf->il_lock, DLT_WRITE);
++        assert_inv(iam_path_check(path));
++        do_corr(schedule());
++      if (result == 0) {
++              result = iam_leaf_load(path);
++                assert_inv(ergo(result == 0, iam_leaf_check(leaf)));
++              if (result == 0) {
++                        do_corr(schedule());
++                        if (index)
++                                result = iam_leaf_ops(leaf)->
++                                        ilookup(leaf, path->ip_ikey_target);
++                        else
++                                result = iam_leaf_ops(leaf)->
++                                        lookup(leaf, path->ip_key_target);
++                        do_corr(schedule());
++                }
++                if (result < 0)
++                        iam_leaf_unlock(leaf);
++      }
++      return result;
++}
++
++/*
++ * Common part of iam_it_{i,}get().
++ */
++static int __iam_it_get(struct iam_iterator *it, int index)
++{
++        int result;
++        assert_corr(it_state(it) == IAM_IT_DETACHED);
++
++        result = iam_path_lookup(&it->ii_path, index);
++        if (result >= 0) {
++                int collision;
++
++                collision = result & IAM_LOOKUP_LAST;
++                switch (result & ~IAM_LOOKUP_LAST) {
++                case IAM_LOOKUP_EXACT:
++                        result = +1;
++                        it->ii_state = IAM_IT_ATTACHED;
++                        break;
++                case IAM_LOOKUP_OK:
++                        result = 0;
++                        it->ii_state = IAM_IT_ATTACHED;
++                        break;
++                case IAM_LOOKUP_BEFORE:
++                case IAM_LOOKUP_EMPTY:
++                        result = 0;
++                        it->ii_state = IAM_IT_SKEWED;
++                        break;
++                default:
++                        assert(0);
++                }
++                result |= collision;
++        }
++        /*
++         * See iam_it_get_exact() for explanation.
++         */
++        assert_corr(result != -ENOENT);
++        return result;
++}
++
++/*
++ * Correct hash, but not the same key was found, iterate through hash
++ * collision chain, looking for correct record.
++ */
++static int iam_it_collision(struct iam_iterator *it)
++{
++        int result;
++
++        assert(ergo(it_at_rec(it), !it_keyeq(it, it->ii_path.ip_key_target)));
++
++        while ((result = iam_it_next(it)) == 0) {
++              do_corr(schedule());
++                if (it_ikeycmp(it, it->ii_path.ip_ikey_target) != 0)
++                        return -ENOENT;
++                if (it_keyeq(it, it->ii_path.ip_key_target))
++                        return 0;
++        }
++        return result;
++}
++
++/*
++ * Attach iterator. After successful completion, @it points to record with
++ * least key not larger than @k.
++ *
++ * Return value: 0: positioned on existing record,
++ *             +ve: exact position found,
++ *             -ve: error.
++ *
++ * precondition:  it_state(it) == IAM_IT_DETACHED
++ * postcondition: ergo(result == 0 && it_state(it) == IAM_IT_ATTACHED,
++ *                     it_keycmp(it, k) <= 0)
++ */
++int iam_it_get(struct iam_iterator *it, const struct iam_key *k)
++{
++        int result;
++        assert_corr(it_state(it) == IAM_IT_DETACHED);
++
++        it->ii_path.ip_ikey_target = NULL;
++        it->ii_path.ip_key_target  = k;
++
++        result = __iam_it_get(it, 0);
++
++        if (result == IAM_LOOKUP_LAST) {
++                result = iam_it_collision(it);
++                if (result != 0) {
++                        iam_it_put(it);
++                        iam_it_fini(it);
++                        result = __iam_it_get(it, 0);
++                } else
++                        result = +1;
++        }
++        if (result > 0)
++                result &= ~IAM_LOOKUP_LAST;
++
++        assert_corr(ergo(result > 0, it_keycmp(it, k) == 0));
++      assert_corr(ergo(result == 0 && it_state(it) == IAM_IT_ATTACHED,
++                         it_keycmp(it, k) <= 0));
++        return result;
++}
++EXPORT_SYMBOL(iam_it_get);
++
++/*
++ * Attach iterator by index key.
++ */
++static int iam_it_iget(struct iam_iterator *it, const struct iam_ikey *k)
++{
++        assert_corr(it_state(it) == IAM_IT_DETACHED);
++
++        it->ii_path.ip_ikey_target = k;
++        return __iam_it_get(it, 1) & ~IAM_LOOKUP_LAST;
++}
++
++/*
++ * Attach iterator, and assure it points to the record (not skewed).
++ *
++ * Return value: 0: positioned on existing record,
++ *             +ve: exact position found,
++ *             -ve: error.
++ *
++ * precondition:  it_state(it) == IAM_IT_DETACHED &&
++ *                !(it->ii_flags&IAM_IT_WRITE)
++ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED)
++ */
++int iam_it_get_at(struct iam_iterator *it, const struct iam_key *k)
++{
++        int result;
++        assert_corr(it_state(it) == IAM_IT_DETACHED &&
++                    !(it->ii_flags&IAM_IT_WRITE));
++        result = iam_it_get(it, k);
++        if (result == 0) {
++                if (it_state(it) != IAM_IT_ATTACHED) {
++                        assert_corr(it_state(it) == IAM_IT_SKEWED);
++                        result = iam_it_next(it);
++                }
++        }
++        assert_corr(ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED));
++        return result;
++}
++EXPORT_SYMBOL(iam_it_get_at);
++
++/*
++ * Duplicates iterator.
++ *
++ * postcondition: it_state(dst) == it_state(src) &&
++ *                iam_it_container(dst) == iam_it_container(src) &&
++ *                dst->ii_flags == src->ii_flags &&
++ *                ergo(it_state(src) == IAM_IT_ATTACHED,
++ *                     iam_it_rec_get(dst) == iam_it_rec_get(src) &&
++ *                     iam_it_key_get(dst) == iam_it_key_get(src))
++ */
++void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src)
++{
++        dst->ii_flags     = src->ii_flags;
++        dst->ii_state     = src->ii_state;
++        /* XXX not yet. iam_path_dup(&dst->ii_path, &src->ii_path); */
++        /*
++         * XXX: duplicate lock.
++         */
++	assert_corr(it_state(dst) == it_state(src));
++	assert_corr(iam_it_container(dst) == iam_it_container(src));
++	assert_corr(dst->ii_flags == src->ii_flags);
++	assert_corr(ergo(it_state(src) == IAM_IT_ATTACHED,
++	            iam_it_rec_get(dst) == iam_it_rec_get(src) &&
++	            iam_it_key_get(dst) == iam_it_key_get(src)));
++
++}
++
++/*
++ * Detach iterator. Does nothing if the iterator is already in detached state.
++ *
++ * postcondition: it_state(it) == IAM_IT_DETACHED
++ */
++void iam_it_put(struct iam_iterator *it)
++{
++        if (it->ii_state != IAM_IT_DETACHED) {
++                it->ii_state = IAM_IT_DETACHED;
++              iam_leaf_fini(&it->ii_path.ip_leaf);
++        }
++}
++EXPORT_SYMBOL(iam_it_put);
++
++static struct iam_ikey *iam_it_ikey_get(const struct iam_iterator *it,
++                                        struct iam_ikey *ikey);
++/*
++ * Move iterator one record right.
++ *
++ * Return value: 0: success,
++ *              +1: end of container reached
++ *             -ve: error
++ *
++ * precondition:  (it_state(it) == IAM_IT_ATTACHED ||
++ *                 it_state(it) == IAM_IT_SKEWED) && it->ii_flags&IAM_IT_MOVE
++ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED) &&
++ *                ergo(result >  0, it_state(it) == IAM_IT_DETACHED)
++ */
++int iam_it_next(struct iam_iterator *it)
++{
++        int result;
++        struct iam_path      *path;
++        struct iam_leaf      *leaf;
++        struct inode         *obj;
++        do_corr(struct iam_ikey *ik_orig);
++
++        /* assert_corr(it->ii_flags&IAM_IT_MOVE); */
++        assert_corr(it_state(it) == IAM_IT_ATTACHED ||
++                    it_state(it) == IAM_IT_SKEWED);
++
++        path = &it->ii_path;
++        leaf = &path->ip_leaf;
++        obj  = iam_path_obj(path);
++
++        assert_corr(iam_leaf_is_locked(leaf));
++
++        result = 0;
++        do_corr(ik_orig = it_at_rec(it) ?
++                iam_it_ikey_get(it, iam_path_ikey(path, 2)) : NULL);
++        if (it_before(it)) {
++                assert_corr(!iam_leaf_at_end(leaf));
++                it->ii_state = IAM_IT_ATTACHED;
++        } else {
++                if (!iam_leaf_at_end(leaf))
++                        /* advance within leaf node */
++                        iam_leaf_next(leaf);
++                /*
++                 * multiple iterations may be necessary due to empty leaves.
++                 */
++                while (result == 0 && iam_leaf_at_end(leaf)) {
++                        do_corr(schedule());
++                        /* advance index portion of the path */
++                        result = iam_index_next(iam_it_container(it), path);
++                        assert_corr(iam_leaf_is_locked(leaf));
++                        if (result == 1) {
++                                struct dynlock_handle *lh;
++                                lh = dx_lock_htree(obj, path->ip_frame->leaf,
++                                                   DLT_WRITE);
++                                if (lh != NULL) {
++                                        iam_leaf_fini(leaf);
++                                        leaf->il_lock = lh;
++                                        result = iam_leaf_load(path);
++                                        if (result == 0)
++                                                iam_leaf_start(leaf);
++                                } else
++                                        result = -ENOMEM;
++                        } else if (result == 0)
++                                /* end of container reached */
++                                result = +1;
++                        if (result != 0)
++                                iam_it_put(it);
++                }
++                if (result == 0)
++                        it->ii_state = IAM_IT_ATTACHED;
++        }
++        assert_corr(ergo(result == 0, it_state(it) == IAM_IT_ATTACHED));
++        assert_corr(ergo(result >  0, it_state(it) == IAM_IT_DETACHED));
++        assert_corr(ergo(result == 0 && ik_orig != NULL,
++                         it_ikeycmp(it, ik_orig) >= 0));
++        return result;
++}
++EXPORT_SYMBOL(iam_it_next);
++
++/*
++ * Return pointer to the record under iterator.
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED && it_at_rec(it)
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
++ */
++struct iam_rec *iam_it_rec_get(const struct iam_iterator *it)
++{
++        assert_corr(it_state(it) == IAM_IT_ATTACHED);
++        assert_corr(it_at_rec(it));
++        return iam_leaf_rec(&it->ii_path.ip_leaf);
++}
++EXPORT_SYMBOL(iam_it_rec_get);
++
++static void iam_it_reccpy(struct iam_iterator *it, const struct iam_rec *r)
++{
++        struct iam_leaf *folio;
++
++        folio = &it->ii_path.ip_leaf;
++        iam_leaf_ops(folio)->rec_set(folio, r);
++}
++
++/*
++ * Replace contents of record under iterator.
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED &&
++ *                it->ii_flags&IAM_IT_WRITE
++ * postcondition: it_state(it) == IAM_IT_ATTACHED &&
++ *                ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...))
++ */
++int iam_it_rec_set(handle_t *h,
++                   struct iam_iterator *it, const struct iam_rec *r)
++{
++        int result;
++        struct iam_path *path;
++        struct buffer_head *bh;
++
++        assert_corr(it_state(it) == IAM_IT_ATTACHED &&
++                    it->ii_flags&IAM_IT_WRITE);
++        assert_corr(it_at_rec(it));
++
++        path = &it->ii_path;
++        bh   = path->ip_leaf.il_bh;
++        result = iam_txn_add(h, path, bh);
++        if (result == 0) {
++                iam_it_reccpy(it, r);
++                result = iam_txn_dirty(h, path, bh);
++        }
++        return result;
++}
++EXPORT_SYMBOL(iam_it_rec_set);
++
++/*
++ * Return pointer to the index key under iterator.
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED ||
++ *                it_state(it) == IAM_IT_SKEWED
++ */
++static struct iam_ikey *iam_it_ikey_get(const struct iam_iterator *it,
++                                        struct iam_ikey *ikey)
++{
++        assert_corr(it_state(it) == IAM_IT_ATTACHED ||
++                    it_state(it) == IAM_IT_SKEWED);
++        assert_corr(it_at_rec(it));
++        return iam_leaf_ikey(&it->ii_path.ip_leaf, ikey);
++}
++
++/*
++ * Return pointer to the key under iterator.
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED ||
++ *                it_state(it) == IAM_IT_SKEWED
++ */
++struct iam_key *iam_it_key_get(const struct iam_iterator *it)
++{
++        assert_corr(it_state(it) == IAM_IT_ATTACHED ||
++                    it_state(it) == IAM_IT_SKEWED);
++        assert_corr(it_at_rec(it));
++        return iam_leaf_key(&it->ii_path.ip_leaf);
++}
++EXPORT_SYMBOL(iam_it_key_get);
++
++/*
++ * Return size of key under iterator (in bytes)
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED ||
++ *                it_state(it) == IAM_IT_SKEWED
++ */
++int iam_it_key_size(const struct iam_iterator *it)
++{
++        assert_corr(it_state(it) == IAM_IT_ATTACHED ||
++                    it_state(it) == IAM_IT_SKEWED);
++        assert_corr(it_at_rec(it));
++        return iam_leaf_key_size(&it->ii_path.ip_leaf);
++}
++EXPORT_SYMBOL(iam_it_key_size);
++
++/*
++ * Insertion of new record. Interaction with jbd during non-trivial case (when
++ * split happens) is as following:
++ *
++ *  - new leaf node is involved into transaction by ext3_append();
++ *
++ *  - old leaf node is involved into transaction by iam_add_rec();
++ *
++ *  - leaf where insertion point ends in, is marked dirty by iam_add_rec();
++ *
++ *  - leaf without insertion point is marked dirty (as @new_leaf) by
++ *  iam_new_leaf();
++ *
++ *  - split index nodes are involved into transaction and marked dirty by
++ *  split_index_node().
++ *
++ *  - "safe" index node, which is no split, but where new pointer is inserted
++ *  is involved into transaction and marked dirty by split_index_node().
++ *
++ *  - index node where pointer to new leaf is inserted is involved into
++ *  transaction by split_index_node() and marked dirty by iam_add_rec().
++ *
++ *  - inode is marked dirty by iam_add_rec().
++ *
++ */
++
++static int iam_new_leaf(handle_t *handle, struct iam_leaf *leaf)
++{
++        int err;
++        iam_ptr_t blknr;
++        struct buffer_head   *new_leaf;
++        struct buffer_head   *old_leaf;
++        struct iam_container *c;
++        struct inode         *obj;
++        struct iam_path      *path;
++
++        assert_inv(iam_leaf_check(leaf));
++
++        c = iam_leaf_container(leaf);
++        path = leaf->il_path;
++
++        obj = c->ic_object;
++        new_leaf = ext3_append(handle, obj, (__u32 *)&blknr, &err);
++        do_corr(schedule());
++        if (new_leaf != NULL) {
++                struct dynlock_handle *lh;
++
++                lh = dx_lock_htree(obj, blknr, DLT_WRITE);
++                do_corr(schedule());
++                if (lh != NULL) {
++                        iam_leaf_ops(leaf)->init_new(c, new_leaf);
++                        do_corr(schedule());
++                        old_leaf = leaf->il_bh;
++                        iam_leaf_split(leaf, &new_leaf, blknr);
++                        if (old_leaf != leaf->il_bh) {
++                                /*
++                                 * Switched to the new leaf.
++                                 */
++                                iam_leaf_unlock(leaf);
++                                leaf->il_lock = lh;
++                                path->ip_frame->leaf = blknr;
++                        } else
++                                dx_unlock_htree(obj, lh);
++                        do_corr(schedule());
++                        err = iam_txn_dirty(handle, path, new_leaf);
++                        brelse(new_leaf);
++                        if (err == 0)
++                                err = ext3_mark_inode_dirty(handle, obj);
++                        do_corr(schedule());
++                } else
++                        err = -ENOMEM;
++        }
++        assert_inv(iam_leaf_check(leaf));
++        assert_inv(iam_leaf_check(&iam_leaf_path(leaf)->ip_leaf));
++        assert_inv(iam_path_check(iam_leaf_path(leaf)));
++        return err;
++}
++
++static int iam_add_rec(handle_t *handle, struct iam_iterator *it,
++                       struct iam_path *path,
++                       const struct iam_key *k, const struct iam_rec *r)
++{
++      int err;
++        struct iam_leaf *leaf;
++
++        leaf = &path->ip_leaf;
++        assert_inv(iam_leaf_check(leaf));
++        assert_inv(iam_path_check(path));
++        err = iam_txn_add(handle, path, leaf->il_bh);
++        if (err == 0) {
++              do_corr(schedule());
++                if (!iam_leaf_can_add(leaf, k, r)) {
++                        struct dynlock_handle *lh = NULL;
++
++                        do {
++                                assert_corr(lh == NULL);
++                                do_corr(schedule());
++                                err = split_index_node(handle, path, &lh);
++                                if (err == -EAGAIN) {
++                                        assert_corr(lh == NULL);
++
++                                        iam_path_fini(path);
++                                        it->ii_state = IAM_IT_DETACHED;
++
++                                        do_corr(schedule());
++                                        err = iam_it_get_exact(it, k);
++                                        if (err == -ENOENT)
++                                                err = +1; /* repeat split */
++                                        else if (err == 0)
++                                                err = -EEXIST;
++                                }
++                        } while (err > 0);
++                        assert_inv(iam_path_check(path));
++                        if (err == 0) {
++                                assert_corr(lh != NULL);
++                                do_corr(schedule());
++                                err = iam_new_leaf(handle, leaf);
++                                if (err == 0)
++                                        err = iam_txn_dirty(handle, path,
++                                                            path->ip_frame->bh);
++                        }
++                        dx_unlock_htree(iam_path_obj(path), lh);
++                        do_corr(schedule());
++                }
++                if (err == 0) {
++                        iam_leaf_rec_add(leaf, k, r);
++                        err = iam_txn_dirty(handle, path, leaf->il_bh);
++                }
++        }
++        assert_inv(iam_leaf_check(leaf));
++        assert_inv(iam_leaf_check(&path->ip_leaf));
++        assert_inv(iam_path_check(path));
++      return err;
++}
++
++/*
++ * Insert new record with key @k and contents from @r, shifting records to the
++ * right. On success, iterator is positioned on the newly inserted record.
++ *
++ * precondition: it->ii_flags&IAM_IT_WRITE &&
++ *               (it_state(it) == IAM_IT_ATTACHED ||
++ *                it_state(it) == IAM_IT_SKEWED) &&
++ *               ergo(it_state(it) == IAM_IT_ATTACHED,
++ *                    it_keycmp(it, k) <= 0) &&
++ *               ergo(it_before(it), it_keycmp(it, k) > 0));
++ * postcondition: ergo(result == 0,
++ *                     it_state(it) == IAM_IT_ATTACHED &&
++ *                     it_keycmp(it, k) == 0 &&
++ *                     !memcmp(iam_it_rec_get(it), r, ...))
++ */
++int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
++                      const struct iam_key *k, const struct iam_rec *r)
++{
++        int result;
++        struct iam_path *path;
++
++        path = &it->ii_path;
++
++        assert_corr(it->ii_flags&IAM_IT_WRITE);
++        assert_corr(it_state(it) == IAM_IT_ATTACHED ||
++                    it_state(it) == IAM_IT_SKEWED);
++        assert_corr(ergo(it_state(it) == IAM_IT_ATTACHED,
++                         it_keycmp(it, k) <= 0));
++        assert_corr(ergo(it_before(it), it_keycmp(it, k) > 0));
++      result = iam_add_rec(h, it, path, k, r);
++        if (result == 0)
++                it->ii_state = IAM_IT_ATTACHED;
++        assert_corr(ergo(result == 0,
++                         it_state(it) == IAM_IT_ATTACHED &&
++                         it_keycmp(it, k) == 0));
++        return result;
++}
++EXPORT_SYMBOL(iam_it_rec_insert);
++
++/*
++ * Delete record under iterator.
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED &&
++ *                it->ii_flags&IAM_IT_WRITE &&
++ *                it_at_rec(it)
++ * postcondition: it_state(it) == IAM_IT_ATTACHED ||
++ *                it_state(it) == IAM_IT_DETACHED
++ */
++int iam_it_rec_delete(handle_t *h, struct iam_iterator *it)
++{
++        int result;
++        struct iam_leaf *leaf;
++        struct iam_path *path;
++
++        assert_corr(it_state(it) == IAM_IT_ATTACHED &&
++                    it->ii_flags&IAM_IT_WRITE);
++        assert_corr(it_at_rec(it));
++
++        path = &it->ii_path;
++        leaf = &path->ip_leaf;
++
++        assert_inv(iam_leaf_check(leaf));
++        assert_inv(iam_path_check(path));
++
++        result = iam_txn_add(h, path, leaf->il_bh);
++        /*
++         * no compaction for now.
++         */
++        if (result == 0) {
++                iam_rec_del(leaf, it->ii_flags&IAM_IT_MOVE);
++                result = iam_txn_dirty(h, path, leaf->il_bh);
++                if (result == 0 && iam_leaf_at_end(leaf) &&
++                    it->ii_flags&IAM_IT_MOVE) {
++                        result = iam_it_next(it);
++                        if (result > 0)
++                                result = 0;
++                }
++        }
++        assert_inv(iam_leaf_check(leaf));
++        assert_inv(iam_path_check(path));
++        assert_corr(it_state(it) == IAM_IT_ATTACHED ||
++                    it_state(it) == IAM_IT_DETACHED);
++      return result;
++}
++EXPORT_SYMBOL(iam_it_rec_delete);
++
++/*
++ * Convert iterator to cookie.
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED &&
++ *                iam_it_container(it)->ic_descr->id_ikey_size <= sizeof(iam_pos_t)
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
++ */
++iam_pos_t iam_it_store(const struct iam_iterator *it)
++{
++        iam_pos_t result;
++
++        assert_corr(it_state(it) == IAM_IT_ATTACHED);
++        assert_corr(it_at_rec(it));
++        assert_corr(iam_it_container(it)->ic_descr->id_ikey_size <=
++                    sizeof result);
++
++        result = 0;
++        return *(iam_pos_t *)iam_it_ikey_get(it, (void *)&result);
++}
++EXPORT_SYMBOL(iam_it_store);
++
++/*
++ * Restore iterator from cookie.
++ *
++ * precondition:  it_state(it) == IAM_IT_DETACHED && it->ii_flags&IAM_IT_MOVE &&
++ *                iam_it_container(it)->ic_descr->id_ikey_size <= sizeof(iam_pos_t)
++ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED &&
++ *                                  iam_it_store(it) == pos)
++ */
++int iam_it_load(struct iam_iterator *it, iam_pos_t pos)
++{
++        assert_corr(it_state(it) == IAM_IT_DETACHED &&
++                    it->ii_flags&IAM_IT_MOVE);
++        assert_corr(iam_it_container(it)->ic_descr->id_ikey_size <= sizeof pos);
++        return iam_it_iget(it, (struct iam_ikey *)&pos);
++}
++EXPORT_SYMBOL(iam_it_load);
++
++/***********************************************************************/
++/* invariants                                                          */
++/***********************************************************************/
++
++static inline int ptr_inside(void *base, size_t size, void *ptr)
++{
++        return (base <= ptr) && (ptr < base + size);
++}
++
++int iam_frame_invariant(struct iam_frame *f)
++{
++        return
++                (f->bh != NULL &&
++                f->bh->b_data != NULL &&
++                ptr_inside(f->bh->b_data, f->bh->b_size, f->entries) &&
++                ptr_inside(f->bh->b_data, f->bh->b_size, f->at) &&
++                f->entries <= f->at);
++}
++int iam_leaf_invariant(struct iam_leaf *l)
++{
++        return
++                l->il_bh != NULL &&
++                l->il_bh->b_data != NULL &&
++                ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_entries) &&
++                ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_at) &&
++                l->il_entries <= l->il_at;
++}
++
++int iam_path_invariant(struct iam_path *p)
++{
++        int i;
++
++        if (p->ip_container == NULL ||
++            p->ip_indirect < 0 || p->ip_indirect > DX_MAX_TREE_HEIGHT - 1 ||
++            p->ip_frame != p->ip_frames + p->ip_indirect ||
++            !iam_leaf_invariant(&p->ip_leaf))
++                return 0;
++        for (i = 0; i < ARRAY_SIZE(p->ip_frames); ++i) {
++                if (i <= p->ip_indirect) {
++                        if (!iam_frame_invariant(&p->ip_frames[i]))
++                                return 0;
++                }
++        }
++        return 1;
++}
++
++int iam_it_invariant(struct iam_iterator *it)
++{
++        return
++                (it->ii_state == IAM_IT_DETACHED ||
++                 it->ii_state == IAM_IT_ATTACHED ||
++                 it->ii_state == IAM_IT_SKEWED) &&
++                !(it->ii_flags & ~(IAM_IT_MOVE | IAM_IT_WRITE)) &&
++                ergo(it->ii_state == IAM_IT_ATTACHED ||
++                     it->ii_state == IAM_IT_SKEWED,
++                     iam_path_invariant(&it->ii_path) &&
++                     equi(it_at_rec(it), it->ii_state == IAM_IT_SKEWED));
++}
++
++/*
++ * Search container @c for record with key @k. If record is found, its data
++ * are moved into @r.
++ *
++ * Return values: 0: found, -ENOENT: not-found, -ve: error
++ */
++int iam_lookup(struct iam_container *c, const struct iam_key *k,
++               struct iam_rec *r, struct iam_path_descr *pd)
++{
++        struct iam_iterator it;
++        int result;
++
++        iam_it_init(&it, c, 0, pd);
++
++        result = iam_it_get_exact(&it, k);
++        if (result == 0)
++                /*
++                 * record with required key found, copy it into user buffer
++                 */
++                iam_reccpy(&it.ii_path.ip_leaf, r);
++        iam_it_put(&it);
++        iam_it_fini(&it);
++        return result;
++}
++EXPORT_SYMBOL(iam_lookup);
++
++/*
++ * Insert new record @r with key @k into container @c (within context of
++ * transaction @h).
++ *
++ * Return values: 0: success, -ve: error, including -EEXIST when record with
++ * given key is already present.
++ *
++ * postcondition: ergo(result == 0 || result == -EEXIST,
++ *                                  iam_lookup(c, k, r2) == 0);
++ */
++int iam_insert(handle_t *h, struct iam_container *c, const struct iam_key *k,
++               const struct iam_rec *r, struct iam_path_descr *pd)
++{
++        struct iam_iterator it;
++        int result;
++
++        iam_it_init(&it, c, IAM_IT_WRITE, pd);
++
++        result = iam_it_get_exact(&it, k);
++        if (result == -ENOENT)
++                result = iam_it_rec_insert(h, &it, k, r);
++        else if (result == 0)
++                result = -EEXIST;
++        iam_it_put(&it);
++        iam_it_fini(&it);
++        return result;
++}
++EXPORT_SYMBOL(iam_insert);
++
++/*
++ * Update record with the key @k in container @c (within context of
++ * transaction @h), new record is given by @r.
++ *
++ * Return values: 0: success, -ve: error, including -ENOENT if no record with
++ * the given key found.
++ */
++int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k,
++               const struct iam_rec *r, struct iam_path_descr *pd)
++{
++        struct iam_iterator it;
++        int result;
++
++        iam_it_init(&it, c, IAM_IT_WRITE, pd);
++
++        result = iam_it_get_exact(&it, k);
++        if (result == 0)
++                /* propagate txn add/dirty errors to the caller */
++                result = iam_it_rec_set(h, &it, r);
++        iam_it_put(&it);
++        iam_it_fini(&it);
++        return result;
++}
++EXPORT_SYMBOL(iam_update);
++
++/*
++ * Delete existing record with key @k.
++ *
++ * Return values: 0: success, -ENOENT: not-found, -ve: other error.
++ *
++ * postcondition: ergo(result == 0 || result == -ENOENT,
++ *                                 iam_lookup(c, k, *) == -ENOENT);
++ */
++int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k,
++             struct iam_path_descr *pd)
++{
++        struct iam_iterator it;
++        int result;
++
++        iam_it_init(&it, c, IAM_IT_WRITE, pd);
++
++        result = iam_it_get_exact(&it, k);
++        if (result == 0)
++                /* propagate txn add/dirty errors to the caller */
++                result = iam_it_rec_delete(h, &it);
++        iam_it_put(&it);
++        iam_it_fini(&it);
++        return result;
++}
++EXPORT_SYMBOL(iam_delete);
++
+Index: iam/fs/ext3/namei.c
+===================================================================
+--- iam.orig/fs/ext3/namei.c   2007-05-23 11:18:18.000000000 +0800
++++ iam/fs/ext3/namei.c        2007-05-23 11:18:20.000000000 +0800
+@@ -24,81 +24,6 @@
+  *    Theodore Ts'o, 2002
+  */
+-/*
+- * iam: big theory statement.
+- *
+- * iam (Index Access Module) is a module providing abstraction of persistent
+- * transactional container on top of generalized ext3 htree.
+- *
+- * iam supports:
+- *
+- *     - key, pointer, and record size specifiable per container.
+- *
+- *     - trees taller than 2 index levels.
+- *
+- *     - read/write to existing ext3 htree directories as iam containers.
+- *
+- * iam container is a tree, consisting of leaf nodes containing keys and
+- * records stored in this container, and index nodes, containing keys and
+- * pointers to leaf or index nodes.
+- *
+- * iam does not work with keys directly, instead it calls user-supplied key
+- * comparison function (->dpo_keycmp()).
+- *
+- * Pointers are (currently) interpreted as logical offsets (measured in
+- * blocksful) within underlying flat file on top of which iam tree lives.
+- *
+- * On-disk format:
+- *
+- * iam mostly tries to reuse existing htree formats.
+- *
+- * Format of index node:
+- *
+- * +-----+-------+-------+-------+------+-------+------------+
+- * |     | count |       |       |      |       |            |
+- * | gap |   /   | entry | entry | .... | entry | free space |
+- * |     | limit |       |       |      |       |            |
+- * +-----+-------+-------+-------+------+-------+------------+
+- *
+- *       gap           this part of node is never accessed by iam code. It
+- *                     exists for binary compatibility with ext3 htree (that,
+- *                     in turn, stores fake struct ext2_dirent for ext2
+- *                     compatibility), and to keep some unspecified per-node
+- *                     data. Gap can be different for root and non-root index
+- *                     nodes. Gap size can be specified for each container
+- *                     (gap of 0 is allowed).
+- *
+- *       count/limit   current number of entries in this node, and the maximal
+- *                     number of entries that can fit into node. count/limit
+- *                     has the same size as entry, and is itself counted in
+- *                     count.
+- *
+- *       entry         index entry: consists of a key immediately followed by
+- *                     a pointer to a child node. Size of a key and size of a
+- *                     pointer depends on container. Entry has neither
+- *                     alignment nor padding.
+- *
+- *       free space    portion of node new entries are added to
+- *
+- * Entries in index node are sorted by their key value.
+- *
+- * Format of leaf node:
+- *
+- * +-----+-------+-------+-------+------+-------+------------+
+- * |     | count |       |       |      |       |            |
+- * | gap |   /   | leaf  | leaf  | .... | leaf  | free space |
+- * |     | limit |       |       |      |       |            |
+- * +-----+-------+-------+-------+------+-------+------------+
+-
+- *       leaf          For leaf entry: consists of a rec immediately followd by 
+- *                     a key. size of a key and size of a rec depends on container.  
+- *
+- *
+- *
+- *
+- *
+- */
+-
+ #include <linux/module.h>
+ #include <linux/fs.h>
+ #include <linux/pagemap.h>
+@@ -112,10 +37,10 @@
+ #include <linux/quotaops.h>
+ #include <linux/buffer_head.h>
+ #include <linux/smp_lock.h>
++#include <linux/lustre_iam.h>
+ #include "xattr.h"
+ #include "iopen.h"
+ #include "acl.h"
+-#include <linux/lustre_iam.h>
+ /*
+  * define how far ahead to read directories while searching them.
+  */
+@@ -125,7 +50,7 @@
+ #define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
+-static struct buffer_head *ext3_append(handle_t *handle,
++struct buffer_head *ext3_append(handle_t *handle,
+                                       struct inode *inode,
+                                       u32 *block, int *err)
+ {
+@@ -136,14 +61,15 @@
+       if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
+               inode->i_size += inode->i_sb->s_blocksize;
+               EXT3_I(inode)->i_disksize = inode->i_size;
+-              ext3_journal_get_write_access(handle,bh);
++              *err = ext3_journal_get_write_access(handle, bh);
++              if (*err != 0) {
++                      brelse(bh);
++                      bh = NULL;
++              }
+       }
+       return bh;
+ }
+-#ifndef assert
+-#define assert(test) J_ASSERT(test)
+-#endif
+ #ifndef swap
+ #define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
+@@ -155,293 +81,10 @@
+ #define dxtrace(command)
+ #endif
+-struct fake_dirent {
+-      __le32 inode;
+-      __le16 rec_len;
+-      u8 name_len;
+-      u8 file_type;
+-};
+-
+-struct dx_countlimit {
+-      __le16 limit;
+-      __le16 count;
+-};
+-
+-/*
+- * dx_root_info is laid out so that if it should somehow get overlaid by a
+- * dirent the two low bits of the hash version will be zero.  Therefore, the
+- * hash version mod 4 should never be 0.  Sincerely, the paranoia department.
+- */
+-
+-struct dx_root {
+-      struct fake_dirent dot;
+-      char dot_name[4];
+-      struct fake_dirent dotdot;
+-      char dotdot_name[4];
+-      struct dx_root_info
+-      {
+-              __le32 reserved_zero;
+-              u8 hash_version;
+-              u8 info_length; /* 8 */
+-              u8 indirect_levels;
+-              u8 unused_flags;
+-      }
+-      info;
+-      struct {} entries[0];
+-};
+-
+-struct dx_node
+-{
+-      struct fake_dirent fake;
+-      struct {} entries[0];
+-};
+-
+-struct dx_map_entry
+-{
+-      u32 hash;
+-      u32 offs;
+-};
+-
+-
+-static u32 htree_root_ptr(struct iam_container *c);
+-static int htree_node_check(struct iam_path *path, struct iam_frame *frame);
+-static int htree_node_init(struct iam_container *c,
+-                         struct buffer_head *bh, int root);
+-static int htree_keycmp(struct iam_container *c,
+-                      struct iam_key *k1, struct iam_key *k2);
+-static int htree_node_read(struct iam_container *c, iam_ptr_t ptr,
+-                         handle_t *h, struct buffer_head **bh);
+-
+-/*
+- * Parameters describing iam compatibility mode in which existing ext3 htrees
+- * can be manipulated.
+- */
+-static struct iam_descr htree_compat_param = {
+-      .id_key_size = sizeof ((struct dx_map_entry *)NULL)->hash,
+-      .id_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs,
+-      .id_node_gap = offsetof(struct dx_node, entries),
+-      .id_root_gap = offsetof(struct dx_root, entries),
+-
+-      .id_root_ptr   = htree_root_ptr,
+-      .id_node_check = htree_node_check,
+-      .id_node_init  = htree_node_init,
+-      .id_node_read  = htree_node_read,
+-      .id_keycmp     = htree_keycmp
+-};
+-
+-
+-struct iam_key;
+-struct iam_rec;
+-struct iam_descr;
+-struct iam_container;
+-struct iam_path;
+-
+-
+-
+-/*
+- * iam cursor (iterator) api.
+- */
+-
+-/*
+- * Flags controlling iterator functionality.
+- */
+-enum iam_it_flags {
+-      /*
+-       * this iterator will move (iam_it_{prev,next}() will be called on it)
+-       */
+-      IAM_IT_MOVE  = (1 << 0),
+-      /*
+-       * tree can be updated through this iterator.
+-       */
+-      IAM_IT_WRITE = (1 << 1)
+-};
+-
+-/*
+- * States of iterator state machine.
+- */
+-enum iam_it_state {
+-      /* initial state */
+-      IAM_IT_DETACHED,
+-      /* iterator is above particular record in the container */
+-      IAM_IT_ATTACHED
+-};
+-
+-struct htree_cookie {
+-      struct dx_hash_info *hinfo;
+-      struct dentry       *dentry;
+-};
+-
+-/*
+- * Iterator.
+- *
+- * Immediately after call to iam_it_init() iterator is in "detached"
+- * (IAM_IT_DETACHED) state: it is associated with given parent container, but
+- * doesn't point to any particular record in this container.
+- *
+- * After successful call to iam_it_get() and until corresponding call to
+- * iam_it_put() iterator is in "attached" state (IAM_IT_ATTACHED).
+- *
+- * Attached iterator can move through records in a container (provided
+- * IAM_IT_MOVE permission) in a key order, can get record and key values as it
+- * passes over them, and can modify container (provided IAM_IT_WRITE
+- * permission).
+- *
+- * Concurrency: iterators are supposed to be local to thread. Interfaces below
+- * do no internal serialization.
+- *
+- */
+-struct iam_iterator {
+-      /*
+-       * iterator flags, taken from enum iam_it_flags.
+-       */
+-      __u32                 ii_flags;
+-      enum iam_it_state     ii_state;
+-      /*
+-       * path to the record. Valid in IAM_IT_ATTACHED state.
+-       */
+-      struct iam_path       ii_path;
+-};
+-
+-static inline struct iam_key *keycpy(struct iam_container *c,
+-                                   struct iam_key *k1, struct iam_key *k2)
+-{
+-      return memcpy(k1, k2, c->ic_descr->id_key_size);
+-}
+-
+-static inline int keycmp(struct iam_container *c,
+-                       struct iam_key *k1, struct iam_key *k2)
+-{
+-      return c->ic_descr->id_keycmp(c, k1, k2);
+-}
+-
+-static struct iam_container *iam_it_container(struct iam_iterator *it)
+-{
+-      return it->ii_path.ip_container;
+-}
+-
+-static inline int it_keycmp(struct iam_iterator *it,
+-                          struct iam_key *k1, struct iam_key *k2)
+-{
+-      return keycmp(iam_it_container(it), k1, k2);
+-}
+-
+-/*
+- * Initialize iterator to IAM_IT_DETACHED state.
+- *
+- * postcondition: it_state(it) == IAM_IT_DETACHED
+- */
+-int  iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags);
+-/*
+- * Finalize iterator and release all resources.
+- *
+- * precondition: it_state(it) == IAM_IT_DETACHED
+- */
+-void iam_it_fini(struct iam_iterator *it);
+-
+-/*
+- * Attach iterator. After successful completion, @it points to record with the
+- * largest key not larger than @k. Semantics of ->id_create() method guarantee
+- * that such record will always be found.
+- *
+- * Return value: 0: positioned on existing record,
+- *             -ve: error.
+- *
+- * precondition:  it_state(it) == IAM_IT_DETACHED
+- * postcondition: ergo(result == 0,
+- *                     (it_state(it) == IAM_IT_ATTACHED &&
+- *                      it_keycmp(it, iam_it_key_get(it, *), k) < 0))
+- */
+-int iam_it_get(struct iam_iterator *it, struct iam_key *k);
+-
+-/*
+- * Duplicates iterator.
+- *
+- * postcondition: it_state(dst) == it_state(src) &&
+- *                iam_it_container(dst) == iam_it_container(src) &&
+- *                dst->ii_flags = src->ii_flags &&
+- *                ergo(it_state(it) == IAM_IT_ATTACHED,
+- *                     iam_it_rec_get(dst) == iam_it_rec_get(src) &&
+- *                     iam_it_key_get(dst, *1) == iam_it_key_get(src, *2))
+- */
+-void iam_it_dup(struct iam_iterator *dst, struct iam_iterator *src);
+-
+-/*
+- * Detach iterator. Does nothing it detached state.
+- *
+- * postcondition: it_state(it) == IAM_IT_DETACHED
+- */
+-void iam_it_put(struct iam_iterator *it);
+-
+-/*
+- * Move iterator one record right.
+- *
+- * Return value: 0: success,
+- *              +1: end of container reached
+- *             -ve: error
+- *
+- * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_MOVE
+- * postcondition: ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED)
+- */
+-int iam_it_next(struct iam_iterator *it);
+-
+-/*
+- * Return pointer to the record under iterator.
+- *
+- * precondition:  it_state(it) == IAM_IT_ATTACHED
+- * postcondition: it_state(it) == IAM_IT_ATTACHED
+- */
+-const struct iam_rec *iam_it_rec_get(struct iam_iterator *it);
+-
+-/*
+- * Replace contents of record under iterator.
+- *
+- * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
+- * postcondition: it_state(it) == IAM_IT_ATTACHED &&
+- *                ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...))
+- */
+-int iam_it_rec_set(handle_t *h, struct iam_iterator *it, struct iam_rec *r);
+-
+-/*
+- * Place key under iterator in @k, return @k
+- *
+- * precondition:  it_state(it) == IAM_IT_ATTACHED
+- * postcondition: it_state(it) == IAM_IT_ATTACHED
+- */
+-const struct iam_key *iam_it_key_get(struct iam_iterator *it,
+-                                   struct iam_key *k);
+-
+-/*
+- * Insert new record with key @k and contents from @r, shifting records to the
+- * right.
+- *
+- * precondition:  it_state(it) == IAM_IT_ATTACHED &&
+- *                it->ii_flags&IAM_IT_WRITE &&
+- *                it_keycmp(it, iam_it_key_get(it, *), k) < 0
+- * postcondition: it_state(it) == IAM_IT_ATTACHED &&
+- *                ergo(result == 0,
+- *                     it_keycmp(it, iam_it_key_get(it, *), k) == 0 &&
+- *                     !memcmp(iam_it_rec_get(it), r, ...))
+- */
+-int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
+-                    struct iam_key *k, struct iam_rec *r);
+-/*
+- * Delete record under iterator.
+- *
+- * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
+- * postcondition: it_state(it) == IAM_IT_ATTACHED
+- */
+-int iam_it_rec_delete(handle_t *h, struct iam_iterator *it);
+-
+ #ifdef CONFIG_EXT3_INDEX
+ static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry);
+ static void dx_set_block(struct iam_path *p,
+                        struct iam_entry *entry, unsigned value);
+-static inline struct iam_key *dx_get_key(struct iam_path *p,
+-                                      struct iam_entry *entry,
+-                                      struct iam_key *key);
+-static void dx_set_key(struct iam_path *p, struct iam_entry *entry,
+-                     struct iam_key *key);
+-static unsigned dx_get_count(struct iam_entry *entries);
+ static unsigned dx_get_limit(struct iam_entry *entries);
+ static void dx_set_count(struct iam_entry *entries, unsigned value);
+ static void dx_set_limit(struct iam_entry *entries, unsigned value);
+@@ -457,264 +100,62 @@
+ static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
+               struct dx_map_entry *offsets, int count);
+ static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+-static void dx_insert_block (struct iam_path *path,
+-                           struct iam_frame *frame, u32 hash, u32 block);
+-static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+-                               struct iam_path *path, __u32 *start_hash);
+ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
+                      struct ext3_dir_entry_2 **res_dir, int *err);
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+                            struct inode *inode);
+-static inline void iam_path_init(struct iam_path *path,
+-                               struct iam_container *c, struct htree_cookie *hc);
+-static inline void iam_path_fini(struct iam_path *path);
+-
+-
+-/*
+- * Future: use high four bits of block for coalesce-on-delete flags
+- * Mask them off for now.
+- */
+-
+-static inline void *entry_off(struct iam_entry *entry, ptrdiff_t off)
+-{
+-      return (void *)((char *)entry + off);
+-}
+-
+-static inline struct iam_descr *path_descr(struct iam_path *p)
+-{
+-      return p->ip_container->ic_descr;
+-}
+-
+-static inline struct inode *path_obj(struct iam_path *p)
+-{
+-      return p->ip_container->ic_object;
+-}
+-
+-static inline size_t iam_entry_size(struct iam_path *p)
+-{
+-      return path_descr(p)->id_key_size + path_descr(p)->id_ptr_size;
+-}
+-
+-static inline struct iam_entry *iam_entry_shift(struct iam_path *p,
+-                                            struct iam_entry *entry, int shift)
+-{
+-      void *e = entry;
+-      return e + shift * iam_entry_size(p);
+-}
+-
+-static inline ptrdiff_t iam_entry_diff(struct iam_path *p,
+-                                    struct iam_entry *e1, struct iam_entry *e2)
+-{
+-      ptrdiff_t diff;
+-
+-      diff = (void *)e1 - (void *)e2;
+-      assert(diff / iam_entry_size(p) * iam_entry_size(p) == diff);
+-      return diff / iam_entry_size(p);
+-}
+-
+-static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry)
+-{
+-      return le32_to_cpu(*(u32 *)entry_off(entry, path_descr(p)->id_key_size))
+-              & 0x00ffffff;
+-}
+-
+-static inline void dx_set_block(struct iam_path *p,
+-                              struct iam_entry *entry, unsigned value)
+-{
+-      *(u32*)entry_off(entry,
+-                       path_descr(p)->id_key_size) = cpu_to_le32(value);
+-}
+-
+-static inline struct iam_key *dx_get_key(struct iam_path *p,
+-                                      struct iam_entry *entry,
+-                                      struct iam_key *key)
+-{
+-      memcpy(key, entry, path_descr(p)->id_key_size);
+-      return key;
+-}
+-
+-static inline struct iam_key *iam_key_at(struct iam_path *p,
+-                                     struct iam_entry *entry)
+-{
+-      return (struct iam_key *)entry;
+-}
+-
+-static inline void dx_set_key(struct iam_path *p,
+-                            struct iam_entry *entry, struct iam_key *key)
+-{
+-      memcpy(entry, key, path_descr(p)->id_key_size);
+-}
+-
+-static inline unsigned dx_get_count (struct iam_entry *entries)
+-{
+-      return le16_to_cpu(((struct dx_countlimit *) entries)->count);
+-}
+-
+-static inline unsigned dx_get_limit (struct iam_entry *entries)
+-{
+-      return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
+-}
+-
+-static inline void dx_set_count (struct iam_entry *entries, unsigned value)
+-{
+-      ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
+-}
+-
+-static inline void dx_set_limit (struct iam_entry *entries, unsigned value)
++static inline void dx_set_limit(struct iam_entry *entries, unsigned value)
+ {
+       ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
+ }
+-static inline unsigned dx_root_limit(struct iam_path *p)
++int dx_index_is_compat(struct iam_path *path)
+ {
+-      struct iam_descr *param = path_descr(p);
+-      unsigned entry_space = path_obj(p)->i_sb->s_blocksize -
+-              param->id_root_gap;
+-      return entry_space / (param->id_key_size + param->id_ptr_size);
++      return iam_path_descr(path) == &iam_htree_compat_param;
+ }
+-static inline unsigned dx_node_limit(struct iam_path *p)
+-{
+-      struct iam_descr *param = path_descr(p);
+-      unsigned entry_space   = path_obj(p)->i_sb->s_blocksize -
+-              param->id_node_gap;
+-      return entry_space / (param->id_key_size + param->id_ptr_size);
+-}
+-
+-static inline int dx_index_is_compat(struct iam_path *path)
+-{
+-      return path_descr(path) == &htree_compat_param;
+-}
+-
+-static struct iam_entry *dx_get_entries(struct iam_path *path, void *data,
+-                                     int root)
+-{
+-      return data +
+-              (root ?
+-               path_descr(path)->id_root_gap : path_descr(path)->id_node_gap);
+-}
+-
+-static struct iam_entry *dx_node_get_entries(struct iam_path *path,
+-                                          struct iam_frame *frame)
+-{
+-      return dx_get_entries(path,
+-                            frame->bh->b_data, frame == path->ip_frames);
+-}
+-static int dx_node_check(struct iam_path *p, struct iam_frame *f)
++int dx_node_check(struct iam_path *p, struct iam_frame *f)
+ {
+       struct iam_entry     *e;
+       struct iam_container *c;
+       unsigned count;
+       unsigned  i;
++      iam_ptr_t  blk;
++      iam_ptr_t  root;
++      struct inode *inode;
+       c = p->ip_container;
+       e = dx_node_get_entries(p, f);
+       count = dx_get_count(e);
+       e = iam_entry_shift(p, e, 1);
++      root = iam_path_descr(p)->id_ops->id_root_ptr(c);
++
++      inode = iam_path_obj(p);
+       for (i = 0; i < count - 1; ++i, e = iam_entry_shift(p, e, 1)) {
+-              keycpy(c, p->ip_key_scratch[0], p->ip_key_scratch[1]);
+-              dx_get_key(p, e, p->ip_key_scratch[1]);
++              iam_ikeycpy(c, iam_path_ikey(p, 0), iam_path_ikey(p, 1));
++              iam_get_ikey(p, e, iam_path_ikey(p, 1));
+               if (i > 0 &&
+-                  keycmp(c, p->ip_key_scratch[0], p->ip_key_scratch[1]) > 0)
++                  iam_ikeycmp(c, iam_path_ikey(p, 0),
++                              iam_path_ikey(p, 1)) > 0) {
++                      BREAKPOINT();
+                       return 0;
+       }
+-      return 1;
+-}
+-
+-static u32 htree_root_ptr(struct iam_container *c)
+-{
++              blk = dx_get_block(p, e);
++              if (inode->i_size < (blk + 1) * inode->i_sb->s_blocksize) {
++                      BREAKPOINT();
+       return 0;
+-}
+-
+-static int htree_node_check(struct iam_path *path, struct iam_frame *frame)
+-{
+-      void *data;
+-      struct iam_entry *entries;
+-      struct super_block *sb;
+-
+-      data = frame->bh->b_data;
+-      entries = dx_node_get_entries(path, frame);
+-      sb = path_obj(path)->i_sb;
+-      if (frame == path->ip_frames) {
+-              /* root node */
+-              struct dx_root *root;
+-              struct htree_cookie *hc = path->ip_descr_data;
+-
+-              root = data;
+-              if (root->info.hash_version > DX_HASH_MAX) {
+-                      ext3_warning(sb, __FUNCTION__,
+-                                   "Unrecognised inode hash code %d",
+-                                   root->info.hash_version);
+-                      return ERR_BAD_DX_DIR;
+               }
+-
+-              if (root->info.unused_flags & 1) {
+-                      ext3_warning(sb, __FUNCTION__,
+-                                   "Unimplemented inode hash flags: %#06x",
+-                                   root->info.unused_flags);
+-                      return ERR_BAD_DX_DIR;
+-              }
+-
+-              path->ip_indirect = root->info.indirect_levels;
+-              if (path->ip_indirect > DX_MAX_TREE_HEIGHT - 1) {
+-                      ext3_warning(sb, __FUNCTION__,
+-                                   "Unimplemented inode hash depth: %#06x",
+-                                   root->info.indirect_levels);
+-                      return ERR_BAD_DX_DIR;
++              /*
++               * By definition of a tree, no node points to the root.
++               */
++              if (blk == root) {
++                      BREAKPOINT();
++                      return 0;
+               }
+-
+-              assert((char *)entries == (((char *)&root->info) +
+-                                         root->info.info_length));
+-              assert(dx_get_limit(entries) == dx_root_limit(path));
+-
+-              hc->hinfo->hash_version = root->info.hash_version;
+-              hc->hinfo->seed = EXT3_SB(sb)->s_hash_seed;
+-              if (hc->dentry)
+-                      ext3fs_dirhash(hc->dentry->d_name.name,
+-                                     hc->dentry->d_name.len, hc->hinfo);
+-              path->ip_key_target = (struct iam_key *)&hc->hinfo->hash;
+-      } else {
+-              /* non-root index */
+-              assert(entries == data + path_descr(path)->id_node_gap);
+-              assert(dx_get_limit(entries) == dx_node_limit(path));
+       }
+-      frame->entries = frame->at = entries;
+-      return 0;
+-}
+-
+-static int htree_node_init(struct iam_container *c,
+-                         struct buffer_head *bh, int root)
+-{
+-      struct dx_node *node;
+-
+-      assert(!root);
+-
+-      node = (void *)bh->b_data;
+-      node->fake.rec_len = cpu_to_le16(c->ic_object->i_sb->s_blocksize);
+-      node->fake.inode = 0;
+-      return 0;
+-}
+-
+-static int htree_node_read(struct iam_container *c, iam_ptr_t ptr,
+-                         handle_t *handle, struct buffer_head **bh)
+-{
+-      int result = 0;
+-
+-      *bh = ext3_bread(handle, c->ic_object, (int)ptr, 0, &result);
+-      if (*bh == NULL)
+-              result = -EIO;
+-      return result;
+-}
+-
+-static int htree_keycmp(struct iam_container *c,
+-                      struct iam_key *k1, struct iam_key *k2)
+-{
+-      __u32 p1 = le32_to_cpu(*(__u32 *)k1);
+-      __u32 p2 = le32_to_cpu(*(__u32 *)k2);
+-
+-      return p1 > p2 ? +1 : (p1 < p2 ? -1 : 0);
++      return 1;
+ }
+ /*
+@@ -797,601 +238,124 @@
+               printk("%snames %u, fullness %u (%u%%)\n", levels?"":"   ",
+                       names, space/bcount,(space/bcount)*100/blocksize);
+       return (struct stats) { names, space, bcount};
+-}
+-#endif /* DX_DEBUG */
+-
+-static int dx_lookup(struct iam_path *path)
+-{
+-      u32 ptr;
+-      int err = 0;
+-      int i;
+-
+-      struct iam_descr *param;
+-      struct iam_frame *frame;
+-      struct iam_container *c;
+-
+-      param = path_descr(path);
+-      c = path->ip_container;
+-      
+-      for (frame = path->ip_frames, i = 0,
+-                   ptr = param->id_root_ptr(path->ip_container);
+-           i <= path->ip_indirect;
+-           ptr = dx_get_block(path, frame->at), ++frame, ++i) {
+-              struct iam_entry *entries;
+-              struct iam_entry *p;
+-              struct iam_entry *q;
+-              struct iam_entry *m;
+-              unsigned count;
+-
+-              err = param->id_node_read(c, (iam_ptr_t)ptr, NULL, &frame->bh);
+-              if (err != 0)
+-                      break;
+-              err = param->id_node_check(path, frame);
+-              if (err != 0)
+-                      break;
+-
+-              assert(dx_node_check(path, frame));
+-
+-              entries = frame->entries;
+-              count = dx_get_count(entries);
+-              assert(count && count <= dx_get_limit(entries));
+-              p = iam_entry_shift(path, entries, 1);
+-              q = iam_entry_shift(path, entries, count - 1);
+-              while (p <= q) {
+-                      m = iam_entry_shift(path,
+-                                         p, iam_entry_diff(path, q, p) / 2);
+-                      dxtrace(printk("."));
+-                      if (keycmp(c, iam_key_at(path, m),
+-                                 path->ip_key_target) > 0)
+-                              q = iam_entry_shift(path, m, -1);
+-                      else
+-                              p = iam_entry_shift(path, m, +1);
+-              }
+-
+-              frame->at = iam_entry_shift(path, p, -1);
+-              if (1) { // linear search cross check
+-                      unsigned n = count - 1;
+-                      struct iam_entry *at;
+-
+-                      at = entries;
+-                      while (n--) {
+-                              dxtrace(printk(","));
+-                              at = iam_entry_shift(path, at, +1);
+-                              if (keycmp(c, iam_key_at(path, at),
+-                                         path->ip_key_target) > 0) {
+-                                      if (at != iam_entry_shift(path, frame->at, 1)) {
+-                                              BREAKPOINT;
+-                                              printk(KERN_EMERG "%i\n",
+-                                                     keycmp(c, iam_key_at(path, at),
+-                                                            path->ip_key_target));
+-                                      }
+-                                      at = iam_entry_shift(path, at, -1);
+-                                      break;
+-                              }
+-                      }
+-                      assert(at == frame->at);
+-              }
+-      }
+-      if (err != 0)
+-              iam_path_fini(path);
+-      path->ip_frame = --frame;
+-      return err;
+-}
+-
+-/*
+- * Probe for a directory leaf block to search.
+- *
+- * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
+- * error in the directory index, and the caller should fall back to
+- * searching the directory normally.  The callers of dx_probe **MUST**
+- * check for this error code, and make sure it never gets reflected
+- * back to userspace.
+- */
+-static int dx_probe(struct dentry *dentry, struct inode *dir,
+-                  struct dx_hash_info *hinfo, struct iam_path *path)
+-{
+-      int err;
+-      struct htree_cookie hc = {
+-              .dentry = dentry,
+-              .hinfo  = hinfo
+-      };
+-
+-      assert(dx_index_is_compat(path));
+-      path->ip_descr_data = &hc;
+-      err = dx_lookup(path);
+-      assert(err != 0 || path->ip_frames[path->ip_indirect].bh != NULL);
+-      return err;
+-}
+-
+-/*
+- * Initialize container @c, acquires additional reference on @inode.
+- */
+-int iam_container_init(struct iam_container *c,
+-                     struct iam_descr *descr, struct inode *inode)
+-{
+-      memset(c, 0, sizeof *c);
+-      c->ic_descr  = descr;
+-      c->ic_object = igrab(inode);
+-      if (c->ic_object != NULL)
+-              return 0;
+-      else
+-              return -ENOENT;
+-}
+-
+-/*
+- * Finalize container @c, release all resources.
+- */
+-void iam_container_fini(struct iam_container *c)
+-{
+-      if (c->ic_object != NULL) {
+-              iput(c->ic_object);
+-              c->ic_object = NULL;
+-      }
+-}
+-
+-static inline void iam_path_init(struct iam_path *path, struct iam_container *c, 
+-                               struct htree_cookie *hc)
+-{
+-      memset(path, 0, sizeof *path);
+-      path->ip_container = c;
+-      path->ip_frame = path->ip_frames;
+-      path->ip_descr_data = hc;
+-}
+-
+-static inline void iam_path_fini(struct iam_path *path)
+-{
+-      int i;
+-
+-      for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) {
+-              if (path->ip_frames[i].bh != NULL) {
+-                      brelse(path->ip_frames[i].bh);
+-                      path->ip_frames[i].bh = NULL;
+-              }
+-      }
+-}
+-
+-static void iam_path_compat_init(struct iam_path_compat *path,
+-                               struct inode *inode)
+-{
+-      int i;
+-
+-      iam_container_init(&path->ipc_container, &htree_compat_param, inode);
+-      /*
+-       * XXX hack allowing finalization of iam_path_compat with
+-       * iam_path_fini().
+-       */
+-      iput(inode);
+-      iam_path_init(&path->ipc_path, &path->ipc_container, NULL);
+-      for (i = 0; i < ARRAY_SIZE(path->ipc_path.ip_key_scratch); ++i)
+-              path->ipc_path.ip_key_scratch[i] =
+-                      (struct iam_key *)&path->ipc_scrach[i];
+-}
+-
+-static void iam_path_compat_fini(struct iam_path_compat *path)
+-{
+-      iam_path_fini(&path->ipc_path);
+-      iam_container_fini(&path->ipc_container);
+-}
+-
+-static int iam_leaf_init(struct iam_path *path, struct iam_leaf *leaf)
+-{
+-      int block, err;
+-      struct buffer_head *bh;
+-      
+-      block = dx_get_block(path, path->ip_frame->at);
+-      err = path_descr(path)->id_node_read(path->ip_container, block, 
+-                                           NULL, &bh);
+-      if (err)
+-              return err;
+-
+-      leaf->bh = bh;
+-      leaf->entries = (struct iam_leaf_entry *)bh->b_data;
+-      return 0;
+-}
+-
+-static void iam_leaf_fini(struct iam_leaf *leaf)
+-{
+-      if (leaf->bh)
+-              brelse(leaf->bh);
+-}
+-
+-/*
+- * Search container @c for record with key @k. If record is found, its data
+- * are moved into @r.
+- *
+- *
+- *
+- * Return values: +ve: found, 0: not-found, -ve: error
+- */
+-
+-int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r)
+-{
+-      struct dx_hash_info     hinfo;
+-      struct iam_path_compat cpath;
+-      struct iam_path *path = &cpath.ipc_path;
+-      struct htree_cookie hc = {
+-              .hinfo  = &hinfo
+-      };
+-      int err, i;
+-
+-      iam_path_init(path, c, &hc);
+-      for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
+-              path->ip_key_scratch[i] =
+-                      (struct iam_key *)&cpath.ipc_scrach[i];
+-      err = dx_lookup(path);
+-      do {
+-              struct iam_leaf leaf;
+-              err = iam_leaf_init(path, &leaf);
+-              if (err)
+-                      goto errout;
+-
+-              for (path_descr(path)->id_leaf.start(c, &leaf);
+-                   !path_descr(path)->id_leaf.at_end(c, &leaf);
+-                   path_descr(path)->id_leaf.next(c, &leaf)) {
+-                      struct iam_key *key;
+-
+-                      key = kmalloc(path_descr(path)->id_key_size, GFP_KERNEL);
+-                      path_descr(path)->id_leaf.key(c, &leaf, key);
+-                      if (keycmp(c, k, key) == 0) {
+-                              memcpy(r, path_descr(path)->id_leaf.rec(c, &leaf),
+-                                     path_descr(path)->id_rec_size);
+-                              iam_path_fini(path);
+-                              iam_leaf_fini(&leaf);
+-                              return 0;
+-                      }
+-              }
+-
+-              iam_leaf_fini(&leaf);
+-              /* Check to see if we should continue to search */
+-              err = ext3_htree_next_block(c->ic_object, hinfo.hash, path, NULL);
+-              if (err < 0)
+-                      goto errout;
+-      } while (err == 1);
+-errout:
+-      iam_path_fini(path);
+-      return(err);
+-}
+-EXPORT_SYMBOL(iam_lookup);
+-
+-static inline size_t iam_leaf_entry_size(struct iam_path *p)
+-{
+-      return path_descr(p)->id_rec_size + path_descr(p)->id_key_size;
+-}
+-
+-static inline ptrdiff_t iam_leaf_entry_diff(struct iam_path *p,
+-                                    struct iam_leaf_entry *e1, struct iam_leaf_entry *e2)
+-{
+-      ptrdiff_t diff;
+-
+-      diff = (void *)e1 - (void *)e2;
+-      assert(diff / iam_leaf_entry_size(p) * iam_leaf_entry_size(p) == diff);
+-      return diff / iam_leaf_entry_size(p);
+-}
+-
+-static inline struct iam_leaf_entry* 
+-iam_leaf_entry_shift(struct iam_path *p, struct iam_leaf_entry *entry, int shift)
+-{
+-      void *e = entry;
+-      return e + shift * iam_leaf_entry_size(p);
+-}
+-
+-static inline struct iam_key *
+-dx_leaf_get_key(struct iam_path *p, struct iam_leaf_entry *e, struct iam_key *key)
+-{
+-      memcpy(key, e, path_descr(p)->id_key_size);
+-      return key;
+-}
+-
+-static inline struct iam_key *
+-iam_leaf_key_at(struct iam_path *p, struct iam_leaf_entry *entry)
+-{
+-      void *e = entry;
+-      return e + path_descr(p)->id_rec_size;
+-}
+-static inline struct iam_leaf_entry *
+-iam_leaf_entry_at(struct iam_path *p, struct iam_leaf_entry *entry)
+-{
+-      return entry; 
+-}
+-
+-static int iam_leaf_lookup(struct iam_path *path, struct iam_leaf *leaf, 
+-                         struct iam_key *k)
+-{
+-      struct iam_leaf_entry *p, *q, *m;
+-      struct iam_leaf_entry *entries = leaf->entries;
+-      int count = dx_get_count((struct iam_entry *)entries);
+-      
+-      p = iam_leaf_entry_shift(path, entries, 1);
+-      q = iam_leaf_entry_shift(path, entries, count - 1);
+-      while (p <= q) {
+-              m = iam_leaf_entry_shift(path,
+-                                 p, iam_leaf_entry_diff(path, q, p) / 2);
+-              dxtrace(printk("."));
+-              if (keycmp(path->ip_container, iam_leaf_key_at(path, m),
+-                         path->ip_key_target) > 0)
+-                      q = iam_leaf_entry_shift(path, m, -1);
+-              else
+-                      p = iam_leaf_entry_shift(path, m, +1);
+-      }
+-      leaf->at = q; 
+-      return 0;
+-}
+-
+-/*XXX what kind of lock should this entry be locked: WangDi */
+-static int iam_leaf_insert(handle_t *handle, struct iam_path *path, 
+-                         struct iam_key *k, struct iam_rec *r)
+-{
+-      struct iam_leaf leaf;
+-      struct iam_leaf_entry *p, *q;
+-      int err, count;
+-
+-      err = iam_leaf_init(path, &leaf);
+-      if (err)
+-              goto errout;
+-      path_descr(path)->id_leaf.start(path->ip_container, &leaf);
+-      count = dx_get_count((struct iam_entry *)leaf.entries);
+-      if (dx_get_count((struct iam_entry *)leaf.entries) >= 
+-          dx_get_limit((struct iam_entry *)leaf.entries)){
+-              err = -ENOSPC;
+-              goto errout;
+-      }
+-
+-      err = iam_leaf_lookup(path, &leaf, k);
+-      if (err)
+-              goto errout;
+-      
+-      /*insert the k/r to leaf entries*/
+-      p = iam_leaf_entry_shift(path, leaf.at, 1);
+-      q = iam_leaf_entry_shift(path, leaf.entries, count - 1);
+-      while (q < p) {
+-              memcpy(iam_leaf_entry_shift(path, q, 1), q, iam_leaf_entry_size(path));
+-              q = iam_leaf_entry_shift(path, q, -1);  
+-      }
+-      memcpy(iam_leaf_entry_at(path, p), r, path_descr(path)->id_rec_size);
+-      memcpy(iam_leaf_key_at(path, p), k, path_descr(path)->id_key_size);
+-
+-      dx_set_count((struct iam_entry*)leaf.entries, count + 1);
+-      err = ext3_journal_dirty_metadata(handle, leaf.bh);
+-      if (err)
+-              ext3_std_error(path->ip_container->ic_object->i_sb, err);
+-errout:       
+-      iam_leaf_fini(&leaf);
+-      return err;
+-} 
+-
+-static int split_leaf_node(handle_t *handle, struct iam_path *path)
+-{
+-      struct inode *dir = path_obj(path);
+-      unsigned continued = 0;
+-      struct buffer_head *bh2;
+-      u32 newblock, hash_split;
+-      char *data2;
+-      struct iam_leaf leaf;
+-      unsigned split;
+-      int     err;
+-
+-      bh2 = ext3_append (handle, dir, &newblock, &err);
+-      if (!(bh2)) {
+-              err = -ENOSPC;
+-              goto errout;
+-      }
+-      err = iam_leaf_init(path, &leaf);
+-      if (err)
+-              goto errout;
+-
+-      BUFFER_TRACE(leaf.bh, "get_write_access");
+-      err = ext3_journal_get_write_access(handle, leaf.bh);
+-      if (err) {
+-      journal_error:
+-              iam_leaf_fini(&leaf);
+-              brelse(bh2);
+-              ext3_std_error(dir->i_sb, err);
+-              err = -EIO;
+-              goto errout;
+-      }
+-      data2 = bh2->b_data;
+-      split = dx_get_count((struct iam_entry*)leaf.entries)/2;
+-      hash_split = *(__u32*)iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split));
+-      if (keycmp(path->ip_container, iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split)),
+-                 iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split -1))) == 0)
+-              continued = 1;
+-
+-      memcpy(iam_leaf_entry_shift(path, (struct iam_leaf_entry *)data2, 1),
+-             iam_leaf_entry_shift(path, leaf.entries, split),
+-             split * iam_leaf_entry_size(path));
+- 
+-      /* Which block gets the new entry? */
+-      dx_insert_block(path, path->ip_frame, hash_split + continued, newblock);
+-      err = ext3_journal_dirty_metadata (handle, bh2);
+-      if (err)
+-              goto journal_error;
+-      err = ext3_journal_dirty_metadata (handle, leaf.bh);
+-      if (err)
+-              goto journal_error;
+-      brelse (bh2);
+-      iam_leaf_fini(&leaf);
+-errout:
+-      return err;
+-}
+-
+-static int split_index_node(handle_t *handle, struct iam_path *path);
+-/*
+- * Insert new record @r with key @k into container @c (within context of
+- * transaction @h.
+- *
+- * Return values: 0: success, -ve: error, including -EEXIST when record with
+- * given key is already present.
+- *
+- * postcondition: ergo(result == 0 || result == -EEXIST,
+- *                                  iam_lookup(c, k, r2) > 0 &&
+- *                                  !memcmp(r, r2, c->ic_descr->id_rec_size));
+- */
+-int iam_insert(handle_t *handle, struct iam_container *c, struct iam_key *k, 
+-             struct iam_rec *r)
++}
++#endif /* DX_DEBUG */
++
++int dx_lookup(struct iam_path *path)
+ {
+-      struct dx_hash_info     hinfo;
+-      struct iam_path_compat cpath;
+-      struct iam_path *path = &cpath.ipc_path;
+-      struct htree_cookie hc = {
+-              .hinfo  = &hinfo
+-      };
+-      int err, i;
++      u32 ptr;
++      int err = 0;
++      int i;
++      int delta;
+-      iam_path_init(path, c, &hc);
+-      for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
+-              path->ip_key_scratch[i] =
+-                      (struct iam_key *)&cpath.ipc_scrach[i];
+-      err = dx_lookup(path);
+-      if (err)
+-              goto errout; 
++      struct iam_descr *param;
++      struct iam_frame *frame;
++      struct iam_container *c;
+-      err = iam_leaf_insert(handle, path, k, r);
++      param = iam_path_descr(path);
++      c = path->ip_container;
+       
+-      if (err != -ENOSPC) 
+-              goto errout;    
++      delta = dx_index_is_compat(path) ? 1 : 2;
+-      err = split_index_node(handle, path);
+-      if (err)
+-              goto errout;    
++      for (frame = path->ip_frames, i = 0,
++                   ptr = param->id_ops->id_root_ptr(c);
++           i <= path->ip_indirect;
++           ptr = dx_get_block(path, frame->at), ++frame, ++i) {
++              struct iam_entry *entries;
++              struct iam_entry *p;
++              struct iam_entry *q;
++              struct iam_entry *m;
++              unsigned count;
+-      err = split_leaf_node(handle, path);
+-      if (err)
+-              goto errout;
+-      
+-      err = iam_leaf_insert(handle, path, k, r);
+-errout:
+-      iam_path_fini(path);
+-      return(err);
+-}
++              err = param->id_ops->id_node_read(c, (iam_ptr_t)ptr, NULL,
++                                                &frame->bh);
++              if (err != 0)
++                      break;
+-EXPORT_SYMBOL(iam_insert);
+-static int iam_leaf_delete(handle_t *handle, struct iam_path *path, 
+-                         struct iam_key *k)
+-{
+-      struct iam_leaf leaf;
+-      struct iam_leaf_entry *p, *q;
+-      int err, count;
++              if (EXT3_INVARIANT_ON) {
++                      err = param->id_ops->id_node_check(path, frame);
++                      if (err != 0)
++                              break;
++              }
+-      err = iam_leaf_init(path, &leaf);
+-      if (err)
+-              goto errout;
++              err = param->id_ops->id_node_load(path, frame);
++              if (err != 0)
++                      break;
++
++              assert_inv(dx_node_check(path, frame));
+       
+-      err = iam_leaf_lookup(path, &leaf, k);
+-      if (err)
+-              goto errout;
++              entries = frame->entries;
++              count = dx_get_count(entries);
++              assert_corr(count && count <= dx_get_limit(entries));
++              p = iam_entry_shift(path, entries, delta);
++              q = iam_entry_shift(path, entries, count - 1);
++              while (p <= q) {
++                      m = iam_entry_shift(path,
++                                         p, iam_entry_diff(path, q, p) / 2);
++                      dxtrace(printk("."));
++                      if (iam_ikeycmp(c, iam_ikey_at(path, m),
++                                      path->ip_ikey_target) > 0)
++                              q = iam_entry_shift(path, m, -1);
++                      else
++                              p = iam_entry_shift(path, m, +1);
++              }
+-      count = dx_get_count((struct iam_entry*)leaf.entries);
+-      /*delete the k to leaf entries*/
+-      p = iam_leaf_entry_shift(path, leaf.at, 1);
+-      q = iam_leaf_entry_shift(path, leaf.entries, count - 1);
+-      while (p < q) {
+-              memcpy(p, iam_leaf_entry_shift(path, p, 1), iam_leaf_entry_size(path));
+-              p = iam_leaf_entry_shift(path, p, 1);
+-      }
+-      dx_set_count((struct iam_entry*)leaf.entries, count - 1);
++              frame->at = iam_entry_shift(path, p, -1);
++              if (EXT3_INVARIANT_ON) { // linear search cross check
++                      unsigned n = count - 1;
++                      struct iam_entry *at;
+-      err = ext3_journal_dirty_metadata(handle, leaf.bh);
+-      if (err)
+-              ext3_std_error(path_obj(path)->i_sb, err);
+-errout:       
+-      iam_leaf_fini(&leaf);
++                      at = entries;
++                      while (n--) {
++                              dxtrace(printk(","));
++                              at = iam_entry_shift(path, at, +1);
++                              if (iam_ikeycmp(c, iam_ikey_at(path, at),
++                                             path->ip_ikey_target) > 0) {
++                                      if (at != iam_entry_shift(path, frame->at, 1)) {
++                                              BREAKPOINT();
++                                              printk(KERN_EMERG "%i\n",
++                                                     iam_ikeycmp(c, iam_ikey_at(path, at),
++                                                            path->ip_ikey_target));
++                                      }
++                                      at = iam_entry_shift(path, at, -1);
++                                      break;
++                              }
++                      }
++                      assert_corr(at == frame->at);
++              }
++      }
++      if (err != 0)
++              iam_path_fini(path);
++      path->ip_frame = --frame;
+       return err;
+ }
+ /*
+- * Delete existing record with key @k.
+- *
+- * Return values: 0: success, -ENOENT: not-found, -ve: other error.
++ * Probe for a directory leaf block to search.
+  *
+- * postcondition: ergo(result == 0 || result == -ENOENT,
+- *                                 !iam_lookup(c, k, *));
++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
++ * error in the directory index, and the caller should fall back to
++ * searching the directory normally.  The callers of dx_probe **MUST**
++ * check for this error code, and make sure it never gets reflected
++ * back to userspace.
+  */
+-int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k)
+-{
+-      struct dx_hash_info     hinfo;
+-      struct iam_path_compat cpath;
+-      struct iam_path *path = &cpath.ipc_path;
+-      struct htree_cookie hc = {
+-              .hinfo  = &hinfo
+-      };
+-      int err, i;
+-
+-      iam_path_init(path, c, &hc);
+-      for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
+-              path->ip_key_scratch[i] =
+-                      (struct iam_key *)&cpath.ipc_scrach[i];
+-      err = dx_lookup(path);
+-      if (err)
+-              goto errout; 
+-
+-      err = iam_leaf_delete(h, path, k);
+-errout:
+-      iam_path_fini(path);
+-      return err;
+-}
+-
+-EXPORT_SYMBOL(iam_delete);
+-
+-static int iam_leaf_update(handle_t *handle, struct iam_path *path, 
+-                         struct iam_key *k, struct iam_rec *r)
++static int dx_probe(struct dentry *dentry, struct inode *dir,
++                  struct dx_hash_info *hinfo, struct iam_path *path)
+ {
+-      struct iam_leaf leaf;
+       int err;
+-
+-      err = iam_leaf_init(path, &leaf);
+-      if (err)
+-              goto errout;
++      struct iam_path_compat *ipc;
+       
+-      err = iam_leaf_lookup(path, &leaf, k);
+-      if (err)
+-              goto errout;
+-
+-      memcpy(iam_leaf_entry_at(path, leaf.at), r, path_descr(path)->id_rec_size);
+-      memcpy(iam_leaf_key_at(path, leaf.at), k, path_descr(path)->id_key_size);
++      assert_corr(path->ip_data != NULL);
++      ipc = container_of(path->ip_data, struct iam_path_compat, ipc_descr);
++      ipc->ipc_dentry = dentry;
++      ipc->ipc_hinfo = hinfo;
+-      err = ext3_journal_dirty_metadata(handle, leaf.bh);
+-      if (err)
+-              ext3_std_error(path_obj(path)->i_sb, err);
+-errout:       
+-      iam_leaf_fini(&leaf);
+-      return err;
+-}
+-/*
+- * Replace existing record with key @k, or insert new one. New record data are
+- * in @r.
+- *
+- * Return values: 0: success, -ve: error.
+- *
+- * postcondition: ergo(result == 0, iam_lookup(c, k, r2) > 0 &&
+- *                                  !memcmp(r, r2, c->ic_descr->id_rec_size));
+- */
+-int iam_update(handle_t *h, struct iam_container *c,
+-             struct iam_key *k, struct iam_rec *r)
+-{
+-      struct dx_hash_info     hinfo;
+-      struct iam_path_compat cpath;
+-      struct iam_path *path = &cpath.ipc_path;
+-      struct htree_cookie hc = {
+-              .hinfo  = &hinfo
+-      };
+-      int err, i;
+-      
+-      iam_path_init(path, c, &hc);
+-      for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
+-              path->ip_key_scratch[i] =
+-                      (struct iam_key *)&cpath.ipc_scrach[i];
++      assert_corr(dx_index_is_compat(path));
+       err = dx_lookup(path);
+-      if (err)
+-              goto errout; 
+-
+-      err = iam_leaf_update(h, path, k, r);
+-errout:
+-      iam_path_fini(path);
++      assert_corr(err != 0 || path->ip_frames[path->ip_indirect].bh != NULL);
+       return err;
+ }
+-EXPORT_SYMBOL(iam_update);
+-
+ /*
+  * This function increments the frame pointer to search the next leaf
+  * block, and reads in the necessary intervening nodes if the search
+@@ -1409,16 +373,15 @@
+  * If start_hash is non-null, it will be filled in with the starting
+  * hash of the next page.
+  */
+-static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+-                               struct iam_path *path, __u32 *start_hash)
++static int ext3_htree_advance(struct inode *dir, __u32 hash,
++                            struct iam_path *path, __u32 *start_hash,
++                            int compat)
+ {
+       struct iam_frame *p;
+       struct buffer_head *bh;
+       int err, num_frames = 0;
+       __u32 bhash;
+-      assert(dx_index_is_compat(path));
+-
+       p = path->ip_frame;
+       /*
+        * Find the next leaf page by incrementing the frame pointer.
+@@ -1438,6 +401,10 @@
+               --p;
+       }
++      if (compat) {
++              /*
++               * Htree hash magic.
++               */
+       /*
+        * If the hash is 1, then continue only if the next page has a
+        * continuation hash of any value.  This is used for readdir
+@@ -1445,19 +412,21 @@
+        * desired contiuation hash.  If it doesn't, return since
+        * there's no point to read in the successive index pages.
+        */
+-      dx_get_key(path, p->at, (struct iam_key *)&bhash);
++              iam_get_ikey(path, p->at, (struct iam_ikey *)&bhash);
+       if (start_hash)
+               *start_hash = bhash;
+       if ((hash & 1) == 0) {
+               if ((bhash & ~1) != hash)
+                       return 0;
+       }
++      }
+       /*
+        * If the hash is HASH_NB_ALWAYS, we always go to the next
+        * block so no check is necessary
+        */
+       while (num_frames--) {
+-              err = path_descr(path)->id_node_read(path->ip_container,
++              err = iam_path_descr(path)->id_ops->
++                      id_node_read(path->ip_container,
+                                                    (iam_ptr_t)dx_get_block(path, p->at),
+                                                    NULL, &bh);
+               if (err != 0)
+@@ -1465,12 +434,23 @@
+               ++p;
+               brelse (p->bh);
+               p->bh = bh;
+-              p->at = p->entries = dx_node_get_entries(path, p);
+-              assert(dx_node_check(path, p));
++              p->entries = dx_node_get_entries(path, p);
++              p->at = iam_entry_shift(path, p->entries, !compat);
++              assert_inv(dx_node_check(path, p));
+       }
+       return 1;
+ }
++int iam_index_next(struct iam_container *c, struct iam_path *path)
++{
++      return ext3_htree_advance(c->ic_object, 0, path, NULL, 0);
++}
++
++int ext3_htree_next_block(struct inode *dir, __u32 hash,
++                        struct iam_path *path, __u32 *start_hash)
++{
++      return ext3_htree_advance(dir, hash, path, start_hash, 1);
++}
+ /*
+  * p is at least 6 bytes before the end of page
+@@ -1662,21 +642,30 @@
+       } while(more);
+ }
+-static void dx_insert_block(struct iam_path *path,
+-                          struct iam_frame *frame, u32 hash, u32 block)
++void iam_insert_key(struct iam_path *path, struct iam_frame *frame,
++                  const struct iam_ikey *key, iam_ptr_t ptr)
+ {
+       struct iam_entry *entries = frame->entries;
+-      struct iam_entry *old = frame->at, *new = iam_entry_shift(path, old, +1);
++      struct iam_entry *new = iam_entry_shift(path, frame->at, +1);
+       int count = dx_get_count(entries);
+-      assert(count < dx_get_limit(entries));
+-      assert(old < iam_entry_shift(path, entries, count));
++      assert_corr(count < dx_get_limit(entries));
++      assert_corr(frame->at < iam_entry_shift(path, entries, count));
++
+       memmove(iam_entry_shift(path, new, 1), new,
+               (char *)iam_entry_shift(path, entries, count) - (char *)new);
+-      dx_set_key(path, new, (struct iam_key *)&hash);
+-      dx_set_block(path, new, block);
++      dx_set_ikey(path, new, key);
++      dx_set_block(path, new, ptr);
+       dx_set_count(entries, count + 1);
+ }
++
++void dx_insert_block(struct iam_path *path, struct iam_frame *frame,
++                   u32 hash, u32 block)
++{
++      assert_corr(dx_index_is_compat(path));
++      iam_insert_key(path, frame, (struct iam_ikey *)&hash, block);
++}
++
+ #endif
+@@ -1903,7 +892,8 @@
+       hash = hinfo.hash;
+       do {
+               block = dx_get_block(path, path->ip_frame->at);
+-              *err = path_descr(path)->id_node_read(path->ip_container, (iam_ptr_t)block,
++              *err = iam_path_descr(path)->id_ops->id_node_read(path->ip_container,
++                                                        (iam_ptr_t)block,
+                                                    NULL, &bh);
+               if (*err != 0)
+                       goto errout;
+@@ -2093,22 +1083,69 @@
+       return prev;
+ }
++struct ext3_dir_entry_2 *move_entries(struct inode *dir,
++                                    struct dx_hash_info *hinfo,
++                                    struct buffer_head **bh1,
++                                    struct buffer_head **bh2,
++                                    __u32 *delim_hash)
++{
++      char *data1;
++      char *data2;
++      unsigned blocksize = dir->i_sb->s_blocksize;
++      unsigned count;
++      unsigned continued;
++      unsigned split;
++      u32 hash2;
++
++      struct dx_map_entry     *map;
++      struct ext3_dir_entry_2 *de1;
++      struct ext3_dir_entry_2 *de2;
++
++      data1 = (*bh1)->b_data;
++      data2 = (*bh2)->b_data;
++
++      /* create map in the end of data2 block */
++      map = (struct dx_map_entry *) (data2 + blocksize);
++      count = dx_make_map((struct ext3_dir_entry_2 *) data1,
++                          blocksize, hinfo, map);
++      map -= count;
++      split = count/2; // need to adjust to actual middle
++      dx_sort_map(map, count);
++      hash2 = map[split].hash;
++      continued = hash2 == map[split - 1].hash;
++      dxtrace(printk("Split block at %x, %i/%i\n",
++              hash2, split, count - split));
++
++      /* Fancy dance to stay within two buffers */
++      de2 = dx_move_dirents(data1, data2, map + split, count - split);
++      de1 = dx_pack_dirents(data1, blocksize);
++      de1->rec_len = cpu_to_le16(data1 + blocksize - (char *) de1);
++      de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
++      dxtrace(dx_show_leaf(hinfo,
++                           (struct ext3_dir_entry_2 *) data1, blocksize, 1));
++      dxtrace(dx_show_leaf(hinfo,
++                           (struct ext3_dir_entry_2 *) data2, blocksize, 1));
++
++      /* Which block gets the new entry? */
++      if (hinfo->hash >= hash2) {
++              swap(*bh1, *bh2);
++              de1 = de2;
++      }
++      *delim_hash = hash2 + continued;
++      return de1;
++}
++
+ /* Allocate new node, and split leaf node @bh into it, inserting new pointer
+  * into parent node identified by @frame */
+ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct iam_path *path,
+                       struct buffer_head **bh,struct iam_frame *frame,
+                       struct dx_hash_info *hinfo, int *error)
+ {
+-      struct inode *dir = path_obj(path);
+-      unsigned blocksize = dir->i_sb->s_blocksize;
+-      unsigned count, continued;
++      struct inode *dir = iam_path_obj(path);
+       struct buffer_head *bh2;
+       u32 newblock;
+       u32 hash2;
+-      struct dx_map_entry *map;
+-      char *data1 = (*bh)->b_data, *data2;
+-      unsigned split;
+-      struct ext3_dir_entry_2 *de = NULL, *de2;
++      struct ext3_dir_entry_2 *de = NULL;
+       int     err;
+       bh2 = ext3_append (handle, dir, &newblock, error);
+@@ -2133,35 +1170,9 @@
+       if (err)
+               goto journal_error;
+-      data2 = bh2->b_data;
+-
+-      /* create map in the end of data2 block */
+-      map = (struct dx_map_entry *) (data2 + blocksize);
+-      count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
+-                           blocksize, hinfo, map);
+-      map -= count;
+-      split = count/2; // need to adjust to actual middle
+-      dx_sort_map (map, count);
+-      hash2 = map[split].hash;
+-      continued = hash2 == map[split - 1].hash;
+-      dxtrace(printk("Split block %i at %x, %i/%i\n",
+-              dx_get_block(frame->at), hash2, split, count-split));
+-
+-      /* Fancy dance to stay within two buffers */
+-      de2 = dx_move_dirents(data1, data2, map + split, count - split);
+-      de = dx_pack_dirents(data1,blocksize);
+-      de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
+-      de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
+-      dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
+-      dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
++      de = move_entries(dir, hinfo, bh, &bh2, &hash2);
+-      /* Which block gets the new entry? */
+-      if (hinfo->hash >= hash2)
+-      {
+-              swap(*bh, bh2);
+-              de = de2;
+-      }
+-      dx_insert_block(path, frame, hash2 + continued, newblock);
++      dx_insert_block(path, frame, hash2, newblock);
+       err = ext3_journal_dirty_metadata (handle, bh2);
+       if (err)
+               goto journal_error;
+@@ -2175,6 +1186,63 @@
+ }
+ #endif
++struct ext3_dir_entry_2 *find_insertion_point(struct inode *dir,
++                                            struct buffer_head *bh,
++                                            const char *name, int namelen)
++{
++      struct ext3_dir_entry_2 *de;
++      char *top;
++      unsigned long offset;
++      int nlen;
++      int rlen;
++      int reclen;
++
++      reclen = EXT3_DIR_REC_LEN(namelen);
++      de = (struct ext3_dir_entry_2 *)bh->b_data;
++      top = bh->b_data + dir->i_sb->s_blocksize - reclen;
++      offset = 0;
++      while ((char *) de <= top) {
++              if (!ext3_check_dir_entry("ext3_add_entry",
++                                        dir, de, bh, offset))
++                      return ERR_PTR(-EIO);
++              if (ext3_match(namelen, name, de))
++                      return ERR_PTR(-EEXIST);
++              nlen = EXT3_DIR_REC_LEN(de->name_len);
++              rlen = le16_to_cpu(de->rec_len);
++              if ((de->inode? rlen - nlen: rlen) >= reclen)
++                      return de;
++              de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
++              offset += rlen;
++      }
++      return ERR_PTR(-ENOSPC);
++}
++
++struct ext3_dir_entry_2 *split_entry(struct inode *dir,
++                                   struct ext3_dir_entry_2 *de,
++                                   unsigned long ino, mode_t mode,
++                                   const char *name, int namelen)
++{
++      int nlen;
++      int rlen;
++
++      nlen = EXT3_DIR_REC_LEN(de->name_len);
++      rlen = le16_to_cpu(de->rec_len);
++      if (de->inode) {
++              struct ext3_dir_entry_2 *de1;
++
++              de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
++              de1->rec_len = cpu_to_le16(rlen - nlen);
++              de->rec_len = cpu_to_le16(nlen);
++              de = de1;
++      }
++      de->file_type = EXT3_FT_UNKNOWN;
++      de->inode = cpu_to_le32(ino);
++      if (ino != 0)
++              ext3_set_de_type(dir->i_sb, de, mode);
++      de->name_len = namelen;
++      memcpy(de->name, name, namelen);
++      return de;
++}
+ /*
+  * Add a new entry into a directory (leaf) block.  If de is non-NULL,
+@@ -2194,34 +1262,16 @@
+       struct inode    *dir = dentry->d_parent->d_inode;
+       const char      *name = dentry->d_name.name;
+       int             namelen = dentry->d_name.len;
+-      unsigned long   offset = 0;
+-      unsigned short  reclen;
+-      int             nlen, rlen, err;
+-      char            *top;
++      int             err;
+-      reclen = EXT3_DIR_REC_LEN(namelen);
+       if (!de) {
+-              de = (struct ext3_dir_entry_2 *)bh->b_data;
+-              top = bh->b_data + dir->i_sb->s_blocksize - reclen;
+-              while ((char *) de <= top) {
+-                      if (!ext3_check_dir_entry("ext3_add_entry", dir, de,
+-                                                bh, offset)) {
+-                              brelse (bh);
+-                              return -EIO;
+-                      }
+-                      if (ext3_match (namelen, name, de)) {
+-                              brelse (bh);
+-                              return -EEXIST;
+-                      }
+-                      nlen = EXT3_DIR_REC_LEN(de->name_len);
+-                      rlen = le16_to_cpu(de->rec_len);
+-                      if ((de->inode? rlen - nlen: rlen) >= reclen)
+-                              break;
+-                      de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
+-                      offset += rlen;
++              de = find_insertion_point(dir, bh, name, namelen);
++              if (IS_ERR(de)) {
++                      err = PTR_ERR(de);
++                      if (err != -ENOSPC)
++                              brelse(bh);
++                      return err;
+               }
+-              if ((char *) de > top)
+-                      return -ENOSPC;
+       }
+       BUFFER_TRACE(bh, "get_write_access");
+       err = ext3_journal_get_write_access(handle, bh);
+@@ -2232,22 +1282,9 @@
+       }
+       /* By now the buffer is marked for journaling */
+-      nlen = EXT3_DIR_REC_LEN(de->name_len);
+-      rlen = le16_to_cpu(de->rec_len);
+-      if (de->inode) {
+-              struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
+-              de1->rec_len = cpu_to_le16(rlen - nlen);
+-              de->rec_len = cpu_to_le16(nlen);
+-              de = de1;
+-      }
+-      de->file_type = EXT3_FT_UNKNOWN;
+-      if (inode) {
+-              de->inode = cpu_to_le32(inode->i_ino);
+-              ext3_set_de_type(dir->i_sb, de, inode->i_mode);
+-      } else
+-              de->inode = 0;
+-      de->name_len = namelen;
+-      memcpy (de->name, name, namelen);
++
++      split_entry(dir, de, inode ? inode->i_ino : 0,
++                  inode ? inode->i_mode : 0, name, namelen);
+       /*
+        * XXX shouldn't update any times until successful
+        * completion of syscall, but too many callers depend
+@@ -2423,8 +1460,40 @@
+       return add_dirent_to_buf(handle, dentry, inode, de, bh);
+ }
++static int shift_entries(struct iam_path *path,
++                       struct iam_frame *frame, unsigned count,
++                       struct iam_entry *entries, struct iam_entry *entries2,
++                       u32 newblock)
++{
++      unsigned count1;
++      unsigned count2;
++      int delta;
++
++      struct iam_frame *parent = frame - 1;
++      struct iam_ikey *pivot = iam_path_ikey(path, 3);
++
++      delta = dx_index_is_compat(path) ? 0 : +1;
++
++      count1 = count/2 + delta;
++      count2 = count - count1;
++      iam_get_ikey(path, iam_entry_shift(path, entries, count1), pivot);
++
++      dxtrace(printk("Split index %i/%i\n", count1, count2));
++
++      memcpy((char *) iam_entry_shift(path, entries2, delta),
++             (char *) iam_entry_shift(path, entries, count1),
++             count2 * iam_entry_size(path));
++
++      dx_set_count(entries, count1);
++      dx_set_count(entries2, count2 + delta);
++      dx_set_limit(entries2, dx_node_limit(path));
++
++      iam_insert_key(path, parent, pivot, newblock);
++      return count1;
++}
++
+ #ifdef CONFIG_EXT3_INDEX
+-static int split_index_node(handle_t *handle, struct iam_path *path)
++int split_index_node(handle_t *handle, struct iam_path *path)
+ {
+       struct iam_entry *entries;   /* old block contents */
+@@ -2432,10 +1501,17 @@
+       struct iam_frame *frame, *safe;
+       struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0};
+       u32 newblock[DX_MAX_TREE_HEIGHT] = {0};
+-      struct inode *dir = path_obj(path);
++      struct inode *dir = iam_path_obj(path);
++      struct iam_descr *descr;
+       int nr_splet;
+       int i, err;
++      descr = iam_path_descr(path);
++      /*
++       * Algorithm below depends on this.
++       */
++      assert_corr(dx_root_limit(path) < dx_node_limit(path));
++
+       frame = path->ip_frame;
+       entries = frame->entries;
+@@ -2474,7 +1550,8 @@
+       for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
+               bh_new[i] = ext3_append (handle, dir, &newblock[i], &err);
+               if (!bh_new[i] ||
+-                  path_descr(path)->id_node_init(path->ip_container, bh_new[i], 0) != 0)
++                  descr->id_ops->id_node_init(path->ip_container,
++                                              bh_new[i], 0) != 0)
+                       goto cleanup;
+               BUFFER_TRACE(frame->bh, "get_write_access");
+               err = ext3_journal_get_write_access(handle, frame->bh);
+@@ -2493,6 +1570,7 @@
+               unsigned count;
+               int idx;
+               struct buffer_head *bh2;
++              struct buffer_head *bh;
+               entries = frame->entries;
+               count = dx_get_count(entries);
+@@ -2501,6 +1579,7 @@
+               bh2 = bh_new[i];
+               entries2 = dx_get_entries(path, bh2->b_data, 0);
++              bh = frame->bh;
+               if (frame == path->ip_frames) {
+                       /* splitting root node. Tricky point:
+                        *
+@@ -2512,22 +1591,20 @@
+                        * capacity of the root node is smaller than that of
+                        * non-root one.
+                        */
+-                      struct dx_root *root;
+-                      u8 indirects;
+                       struct iam_frame *frames;
++                      struct iam_entry *next;
++
++                      assert_corr(i == 0);
+                       frames = path->ip_frames;
+-                      root = (struct dx_root *) frames->bh->b_data;
+-                      indirects = root->info.indirect_levels;
+-                      dxtrace(printk("Creating new root %d\n", indirects));
+                       memcpy((char *) entries2, (char *) entries,
+                              count * iam_entry_size(path));
+                       dx_set_limit(entries2, dx_node_limit(path));
+                       /* Set up root */
+-                      dx_set_count(entries, 1);
+-                      dx_set_block(path, entries, newblock[i]);
+-                      root->info.indirect_levels = indirects + 1;
++                      next = descr->id_ops->id_root_inc(path->ip_container,
++                                                        path, frame);
++                      dx_set_block(path, next, newblock[0]);
+                       /* Shift frames in the path */
+                       memmove(frames + 2, frames + 1,
+@@ -2536,49 +1613,61 @@
+                       frames[1].at = iam_entry_shift(path, entries2, idx);
+                       frames[1].entries = entries = entries2;
+                       frames[1].bh = bh2;
+-                      assert(dx_node_check(path, frame));
++                      assert_inv(dx_node_check(path, frame));
++                      ++ path->ip_frame;
+                       ++ frame;
+-                      assert(dx_node_check(path, frame));
+-                      bh_new[i] = NULL; /* buffer head is "consumed" */
++                      assert_inv(dx_node_check(path, frame));
++                      bh_new[0] = NULL; /* buffer head is "consumed" */
+                       err = ext3_journal_get_write_access(handle, bh2);
+                       if (err)
+                               goto journal_error;
+               } else {
+                       /* splitting non-root index node. */
+-                      unsigned count1 = count/2, count2 = count - count1;
+-                      unsigned hash2;
+-
+-                      dx_get_key(path,
+-                                 iam_entry_shift(path, entries, count1),
+-                                 (struct iam_key *)&hash2);
+-
+-                      dxtrace(printk("Split index %i/%i\n", count1, count2));
+-
+-                      memcpy ((char *) entries2,
+-                              (char *) iam_entry_shift(path, entries, count1),
+-                              count2 * iam_entry_size(path));
+-                      dx_set_count (entries, count1);
+-                      dx_set_count (entries2, count2);
+-                      dx_set_limit (entries2, dx_node_limit(path));
++                      struct iam_frame *parent = frame - 1;
++                      count = shift_entries(path, frame, count,
++                                            entries, entries2, newblock[i]);
+                       /* Which index block gets the new entry? */
+-                      if (idx >= count1) {
++                      if (idx >= count) {
++                              int d = dx_index_is_compat(path) ? 0 : +1;
++
+                               frame->at = iam_entry_shift(path, entries2,
+-                                                          idx - count1);
++                                                          idx - count + d);
+                               frame->entries = entries = entries2;
+                               swap(frame->bh, bh2);
+                               bh_new[i] = bh2;
++                              parent->at = iam_entry_shift(path,
++                                                           parent->at, +1);
+                       }
+-                      dx_insert_block(path, frame - 1, hash2, newblock[i]);
+-                      assert(dx_node_check(path, frame));
+-                      assert(dx_node_check(path, frame - 1));
++                      assert_inv(dx_node_check(path, frame));
++                      assert_inv(dx_node_check(path, parent));
+                       dxtrace(dx_show_index ("node", frame->entries));
+                       dxtrace(dx_show_index ("node",
+                              ((struct dx_node *) bh2->b_data)->entries));
+                       err = ext3_journal_dirty_metadata(handle, bh2);
+                       if (err)
+                               goto journal_error;
++                      err = ext3_journal_dirty_metadata(handle, parent->bh);
++                      if (err)
++                              goto journal_error;
++              }
++              err = ext3_journal_dirty_metadata(handle, bh);
++              if (err)
++                      goto journal_error;
++              /*
++               * This function was called to make insertion of new leaf
++               * possible. Check that it fulfilled its obligations.
++               */
++              assert_corr(dx_get_count(path->ip_frame->entries) <
++                          dx_get_limit(path->ip_frame->entries));
+               }
++      if (nr_splet > 0) {
++              /*
++               * Log ->i_size modification.
++               */
++              err = ext3_mark_inode_dirty(handle, dir);
++              if (err)
++                      goto journal_error;
+       }
+       goto cleanup;
+ journal_error:
+@@ -2610,7 +1699,7 @@
+       size_t isize;
+       iam_path_compat_init(&cpath, dir);
+-      param = path_descr(path);
++      param = iam_path_descr(path);
+       err = dx_probe(dentry, NULL, &hinfo, path);
+       if (err != 0)
+@@ -2620,7 +1709,8 @@
+       /* XXX nikita: global serialization! */
+       isize = dir->i_size;
+-      err = param->id_node_read(path->ip_container, (iam_ptr_t)dx_get_block(path, frame->at), 
++      err = param->id_ops->id_node_read(path->ip_container,
++                      (iam_ptr_t)dx_get_block(path, frame->at),
+                                 handle, &bh);
+       if (err != 0)
+               goto cleanup;
+@@ -2641,11 +1731,11 @@
+               goto cleanup;   
+       /*copy split inode too*/
+-      de = do_split(handle, path, &bh, --frame, &hinfo, &err);
++      de = do_split(handle, path, &bh, path->ip_frame, &hinfo, &err);
+       if (!de)
+               goto cleanup;
+-      assert(dx_node_check(path, frame));
++      assert_inv(dx_node_check(path, frame));
+       err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+       goto cleanup2;
+@@ -2752,6 +1842,26 @@
+       return ext3_new_inode(handle, dir, mode, inum);
+ }
++struct inode *ext3_create_inode(handle_t *handle, struct inode * dir, int mode)
++{
++      struct inode *inode;
++
++      inode = ext3_new_inode(handle, dir, mode, 0);
++      if (!IS_ERR(inode)) {
++              if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode)) {
++#ifdef CONFIG_LDISKFS_FS_XATTR
++                      inode->i_op = &ext3_special_inode_operations;
++#endif
++              } else {
++                      inode->i_op = &ext3_file_inode_operations;
++                      inode->i_fop = &ext3_file_operations;
++                      ext3_set_aops(inode);
++              }
++      }
++      return inode;
++}
++EXPORT_SYMBOL(ext3_create_inode);
++
+ /*
+  * By the time this is called, we already have created
+  * the directory cache entry for the new file, but it
+Index: iam/fs/ext3/Makefile
+===================================================================
+--- iam.orig/fs/ext3/Makefile  2007-05-23 11:18:11.000000000 +0800
++++ iam/fs/ext3/Makefile       2007-05-23 11:18:20.000000000 +0800
+@@ -6,7 +6,7 @@
+ ext3-y        := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+          ioctl.o namei.o super.o symlink.o hash.o resize.o \
+-         extents.o mballoc.o
++         extents.o mballoc.o iam.o iam_lfix.o
+ ext3-$(CONFIG_EXT3_FS_XATTR)   += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
+Index: iam/fs/ext3/iam_lvar.c
+===================================================================
+--- iam.orig/fs/ext3/iam_lvar.c        2007-05-23 09:56:30.476305206 +0800
++++ iam/fs/ext3/iam_lvar.c     2007-05-23 11:19:15.000000000 +0800
+@@ -0,0 +1,1080 @@
++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++ * vim:expandtab:shiftwidth=8:tabstop=8:
++ *
++ *  iam_lvar.c
++ *  implementation of iam format for fixed size records, variable sized keys.
++ *
++ *  Copyright (c) 2006 Cluster File Systems, Inc.
++ *   Author: Nikita Danilov <nikita@clusterfs.com>
++ *
++ *   This file is part of the Lustre file system, http://www.lustre.org
++ *   Lustre is a trademark of Cluster File Systems, Inc.
++ *
++ *   You may have signed or agreed to another license before downloading
++ *   this software.  If so, you are bound by the terms and conditions
++ *   of that agreement, and the following does not apply to you.  See the
++ *   LICENSE file included with this distribution for more information.
++ *
++ *   If you did not agree to a different license, then this copy of Lustre
++ *   is open source software; you can redistribute it and/or modify it
++ *   under the terms of version 2 of the GNU General Public License as
++ *   published by the Free Software Foundation.
++ *
++ *   In either case, Lustre is distributed in the hope that it will be
++ *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
++ *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ *   license text for more details.
++ */
++
++#include <linux/types.h>
++#include <linux/jbd.h>
++/* ext3_error() */
++#include <linux/ext3_fs.h>
++
++#include <linux/lustre_iam.h>
++
++#include <libcfs/libcfs.h>
++#include <libcfs/kp30.h>
++
++/*
++ * Leaf operations.
++ */
++
++enum {
++        IAM_LVAR_LEAF_MAGIC = 0x1973 /* This is duplicated in
++                                      * lustre/utils/create_iam.c */
++};
++
++/* This is duplicated in lustre/utils/create_iam.c */
++struct lvar_leaf_header {
++        __le16 vlh_magic; /* magic number IAM_LVAR_LEAF_MAGIC */
++        __le16 vlh_used;  /* used bytes, including header */
++};
++
++/*
++ * Format of leaf entry:
++ *
++ * __le16 keysize
++ *     u8 key[keysize]
++ *     u8 record[rec_size]
++ *
++ * Entries are ordered in key order.
++ */
++
++/* This is duplicated in lustre/utils/create_iam.c */
++typedef __u32 lvar_hash_t;
++
++/* This is duplicated in lustre/utils/create_iam.c */
++struct lvar_leaf_entry {
++        __le32 vle_hash;
++        __le16 vle_keysize;
++        u8     vle_key[0];
++};
++
++#define PDIFF(ptr0, ptr1) (((char *)(ptr0)) - ((char *)(ptr1)))
++
++
++static inline int blocksize(const struct iam_leaf *leaf)
++{
++        return iam_leaf_container(leaf)->ic_object->i_sb->s_blocksize;
++}
++
++static inline const char *kchar(const struct iam_key *key)
++{
++        return (void *)key;
++}
++
++static inline struct iam_lentry *lvar_lentry(const struct lvar_leaf_entry *ent)
++{
++        return (struct iam_lentry *)ent;
++}
++
++static inline struct lvar_leaf_entry *lentry_lvar(const struct iam_lentry *lent)
++{
++        return (struct lvar_leaf_entry *)lent;
++}
++
++
++static inline int e_keysize(const struct lvar_leaf_entry *ent)
++{
++        return le16_to_cpu(ent->vle_keysize);
++}
++
++/* This is duplicated in lustre/utils/create_iam.c */
++enum {
++        LVAR_PAD   = 4,
++        LVAR_ROUND = LVAR_PAD - 1
++};
++
++static inline int getsize(const struct iam_leaf *leaf, int namelen, int recsize)
++{
++        CLASSERT(!(LVAR_PAD & (LVAR_PAD - 1)));
++
++        return (offsetof(struct lvar_leaf_entry, vle_key) +
++                namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND;
++}
++
++static inline int rec_size(const struct iam_rec *rec)
++{
++        return *(const char *)rec;
++}
++
++static inline struct iam_rec *e_rec(const struct lvar_leaf_entry *ent)
++{
++        return ((void *)ent) +
++                offsetof(struct lvar_leaf_entry, vle_key) + e_keysize(ent);
++}
++
++static inline int e_size(const struct iam_leaf *leaf,
++                         const struct lvar_leaf_entry *ent)
++{
++        return getsize(leaf, e_keysize(ent), rec_size(e_rec(ent)));
++}
++
++static inline char *e_char(const struct lvar_leaf_entry *ent)
++{
++        return (char *)&ent->vle_key;
++}
++
++static inline struct iam_key *e_key(const struct lvar_leaf_entry *ent)
++{
++        return (struct iam_key *)e_char(ent);
++}
++
++static inline lvar_hash_t e_hash(const struct lvar_leaf_entry *ent)
++{
++        return le32_to_cpu(ent->vle_hash);
++}
++
++static void e_print(const struct lvar_leaf_entry *ent)
++{
++        printk("        %p %8.8x \"%*.*s\"\n", ent, e_hash(ent),
++               e_keysize(ent), e_keysize(ent), e_char(ent));
++}
++#if 0
++static int e_check(const struct iam_leaf *leaf,
++                   const struct lvar_leaf_entry *ent)
++{
++        const void *point = ent;
++        const void *start = leaf->il_bh->b_data;
++        return
++                start + sizeof(struct lvar_leaf_header) <= point &&
++                point + e_size(leaf, ent) < start + blocksize(leaf);
++}
++#endif
++
++static inline struct lvar_leaf_entry *e_next(const struct iam_leaf *leaf,
++                                             const struct lvar_leaf_entry *ent)
++{
++        return ((void *)ent) + e_size(leaf, ent);
++}
++
++#define LVAR_HASH_SANDWICH  (0)
++#define LVAR_HASH_TEA       (1)
++#define LVAR_HASH_R5        (0)
++#define LVAR_HASH_PREFIX    (0)
++
++static __u32 hash_build0(const char *name, int namelen)
++{
++        __u32 result;
++
++        if (namelen == 0)
++                return 0;
++        if (strncmp(name, ".", 1) == 0 && namelen == 1)
++                return 1;
++        if (strncmp(name, "..", 2) == 0 && namelen == 2)
++                return 2;
++
++        if (LVAR_HASH_PREFIX) {
++                result = 0;
++                strncpy((void *)&result,
++                        name, min(namelen, (int)sizeof result));
++        } else {
++                struct dx_hash_info hinfo;
++
++                if (LVAR_HASH_TEA)
++                        hinfo.hash_version = DX_HASH_TEA;
++                else
++                        hinfo.hash_version = DX_HASH_R5;
++                hinfo.seed = 0;
++                ext3fs_dirhash(name, namelen, &hinfo);
++                result = hinfo.hash;
++                if (LVAR_HASH_SANDWICH) {
++                        __u32 result2;
++
++                        hinfo.hash_version = DX_HASH_TEA;
++                        hinfo.seed = 0;
++                        ext3fs_dirhash(name, namelen, &hinfo);
++                        result2 = hinfo.hash;
++                        result = (0xfc000000 & result2) | (0x03ffffff & result);
++                }
++        }
++        return result;
++}
++
++enum {
++        HASH_GRAY_AREA = 1024,
++        MAX_HASH_SIZE  = 0x7fffffffUL
++};
++
++static __u32 hash_build(const char *name, int namelen)
++{
++        __u32 hash;
++
++        hash = (hash_build0(name, namelen) << 1) & MAX_HASH_SIZE;
++        if (hash > MAX_HASH_SIZE - HASH_GRAY_AREA)
++                hash &= HASH_GRAY_AREA - 1;
++        return hash;
++}
++
++static inline lvar_hash_t get_hash(const struct iam_container *bag,
++                                   const char *name, int namelen)
++{
++        return hash_build(name, namelen);
++}
++
++static inline int e_eq(const struct lvar_leaf_entry *ent,
++                       const char *name, int namelen)
++{
++        return namelen == e_keysize(ent) && !memcmp(e_char(ent), name, namelen);
++}
++
++static inline int e_cmp(const struct iam_leaf *leaf,
++                        const struct lvar_leaf_entry *ent, lvar_hash_t hash)
++{
++        lvar_hash_t ehash;
++
++        ehash = e_hash(ent);
++        return ehash == hash ? 0 : (ehash < hash ? -1 : +1);
++}
++
++static struct lvar_leaf_header *n_head(const struct iam_leaf *l)
++{
++        return (struct lvar_leaf_header *)l->il_bh->b_data;
++}
++
++static int h_used(const struct lvar_leaf_header *hdr)
++{
++        return le16_to_cpu(hdr->vlh_used);
++}
++
++static void h_used_adj(const struct iam_leaf *leaf,
++                       struct lvar_leaf_header *hdr, int adj)
++{
++        int used;
++
++        used = h_used(hdr) + adj;
++        assert_corr(sizeof *hdr <= used && used <= blocksize(leaf));
++        hdr->vlh_used = cpu_to_le16(used);
++}
++
++static struct lvar_leaf_entry *n_start(const struct iam_leaf *leaf)
++{
++        return (void *)leaf->il_bh->b_data + sizeof(struct lvar_leaf_header);
++}
++
++static struct lvar_leaf_entry *n_end(const struct iam_leaf *l)
++{
++        return (void *)l->il_bh->b_data + h_used(n_head(l));
++}
++
++static struct lvar_leaf_entry *n_cur(const struct iam_leaf *l)
++{
++        return lentry_lvar(l->il_at);
++}
++
++void n_print(const struct iam_leaf *l)
++{
++        struct lvar_leaf_entry *scan;
++
++        printk(KERN_EMERG "used: %d\n", h_used(n_head(l)));
++        for (scan = n_start(l); scan < n_end(l); scan = e_next(l, scan))
++                e_print(scan);
++}
++
++#if EXT3_CORRECTNESS_ON
++static int n_at_rec(const struct iam_leaf *folio)
++{
++        return
++                n_start(folio) <= lentry_lvar(folio->il_at) &&
++                lentry_lvar(folio->il_at) < n_end(folio);
++}
++
++#if EXT3_INVARIANT_ON
++static int n_invariant(const struct iam_leaf *leaf)
++{
++        struct iam_path        *path;
++        struct lvar_leaf_entry *scan;
++        struct lvar_leaf_entry *end;
++        lvar_hash_t             hash;
++        lvar_hash_t             nexthash;
++        lvar_hash_t             starthash;
++
++        end  = n_end(leaf);
++        hash = 0;
++        path = leaf->il_path;
++
++        if (h_used(n_head(leaf)) > blocksize(leaf))
++                return 0;
++
++        /*
++         * Delimiting key in the parent index node. Clear least bit to account
++         * for hash collision marker.
++         */
++        starthash = *(lvar_hash_t *)iam_ikey_at(path, path->ip_frame->at) & ~1;
++        for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
++                nexthash = e_hash(scan);
++                if (nexthash != get_hash(iam_leaf_container(leaf),
++                                         e_char(scan), e_keysize(scan))) {
++                        BREAKPOINT();
++                        return 0;
++                }
++                if (0 && nexthash < starthash) {
++                        /*
++                         * Unfortunately this useful invariant cannot be
++                         * reliably checked as parent node is not necessarily
++                         * locked.
++                         */
++                        n_print(leaf);
++                        printk("%#x < %#x\n", nexthash, starthash);
++                        dump_stack();
++                        return 0;
++                }
++                if (nexthash < hash) {
++                        BREAKPOINT();
++                        return 0;
++                }
++                hash = nexthash;
++        }
++        if (scan != end) {
++                BREAKPOINT();
++                return 0;
++        }
++        return 1;
++}
++/* EXT3_INVARIANT_ON */
++#endif
++
++/* EXT3_CORRECTNESS_ON */
++#endif
++
++static struct iam_ikey *lvar_ikey(const struct iam_leaf *l,
++                                  struct iam_ikey *key)
++{
++        lvar_hash_t *hash;
++
++        assert_corr(n_at_rec(l));
++
++        hash = (void *)key;
++        *hash = e_hash(n_cur(l));
++        return key;
++}
++
++static struct iam_key *lvar_key(const struct iam_leaf *l)
++{
++        return e_key(n_cur(l));
++}
++
++static int lvar_key_size(const struct iam_leaf *l)
++{
++        return e_keysize(n_cur(l));
++}
++
++static void lvar_start(struct iam_leaf *l)
++{
++        l->il_at = lvar_lentry(n_start(l));
++}
++
++static int lvar_init(struct iam_leaf *l)
++{
++        int result;
++        int used;
++        struct lvar_leaf_header *head;
++
++        assert_corr(l->il_bh != NULL);
++
++        head = n_head(l);
++        used = h_used(head);
++        if (head->vlh_magic == le16_to_cpu(IAM_LVAR_LEAF_MAGIC) &&
++            used <= blocksize(l)) {
++                l->il_at = l->il_entries = lvar_lentry(n_start(l));
++                result = 0;
++        } else {
++                struct inode *obj;
++
++                obj = iam_leaf_container(l)->ic_object;
++                ext3_error(obj->i_sb, __FUNCTION__,
++                           "Wrong magic in node %llu (#%lu): %#x != %#x or "
++                           "wrong used: %i",
++                           (unsigned long long)l->il_bh->b_blocknr, obj->i_ino,
++                           head->vlh_magic, le16_to_cpu(IAM_LVAR_LEAF_MAGIC),
++                           used);
++                result = -EIO;
++        }
++        return result;
++}
++
++static void lvar_fini(struct iam_leaf *l)
++{
++        l->il_entries = l->il_at = NULL;
++}
++
++struct iam_rec *lvar_rec(const struct iam_leaf *l)
++{
++        assert_corr(n_at_rec(l));
++        return e_rec(n_cur(l));
++}
++
++static void lvar_next(struct iam_leaf *l)
++{
++        assert_corr(n_at_rec(l));
++        assert_corr(iam_leaf_is_locked(l));
++        l->il_at = lvar_lentry(e_next(l, n_cur(l)));
++}
++
++static int lvar_lookup(struct iam_leaf *leaf, const struct iam_key *k)
++{
++        struct lvar_leaf_entry *found;
++        struct lvar_leaf_entry *scan;
++        struct lvar_leaf_entry *end;
++        int                     result;
++        const char             *name;
++        int                     namelen;
++        int                     found_equal;
++        lvar_hash_t             hash;
++        int                     last;
++
++        assert_inv(n_invariant(leaf));
++        end = n_end(leaf);
++
++        name = kchar(k);
++        namelen = strlen(name);
++        hash = get_hash(iam_leaf_container(leaf), name, namelen);
++        found = NULL;
++        found_equal = 0;
++        last = 1;
++
++        for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
++                lvar_hash_t scan_hash;
++
++                scan_hash = e_hash(scan);
++                if (scan_hash < hash)
++                        found = scan;
++                else if (scan_hash == hash) {
++                        if (e_eq(scan, name, namelen)) {
++                                /*
++                                 * perfect match
++                                 */
++                                leaf->il_at = lvar_lentry(scan);
++                                return IAM_LOOKUP_EXACT;
++                        } else if (!found_equal) {
++                                        found = scan;
++                                        found_equal = 1;
++                        }
++                } else {
++                        last = 0;
++                        break;
++                }
++        }
++        if (found == NULL) {
++                /*
++                 * @k is less than all hashes in the leaf.
++                 */
++                lvar_start(leaf);
++                result = IAM_LOOKUP_BEFORE;
++        } else {
++                leaf->il_at = lvar_lentry(found);
++                result = IAM_LOOKUP_OK;
++                assert_corr(n_at_rec(leaf));
++        }
++        if (last)
++                result |= IAM_LOOKUP_LAST;
++        assert_inv(n_invariant(leaf));
++
++        return result;
++}
++
++static int lvar_ilookup(struct iam_leaf *leaf, const struct iam_ikey *ik)
++{
++        struct lvar_leaf_entry *scan;
++        struct lvar_leaf_entry *end;
++        lvar_hash_t             hash;
++
++        assert_inv(n_invariant(leaf));
++        end  = n_end(leaf);
++        hash = *(const lvar_hash_t *)ik;
++
++        lvar_start(leaf);
++        for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
++                lvar_hash_t scan_hash;
++
++                scan_hash = e_hash(scan);
++                if (scan_hash > hash)
++                        return scan == n_start(leaf) ?
++                                IAM_LOOKUP_BEFORE : IAM_LOOKUP_OK;
++                leaf->il_at = lvar_lentry(scan);
++                if (scan_hash == hash)
++                        return IAM_LOOKUP_EXACT;
++        }
++        assert_inv(n_invariant(leaf));
++        /*
++         * @ik is greater than any key in the node. Return last record in the
++         * node.
++         */
++        return IAM_LOOKUP_OK;
++}
++
++static void __lvar_key_set(struct iam_leaf *l, const struct iam_key *k)
++{
++        memcpy(e_key(n_cur(l)), k, e_keysize(n_cur(l)));
++}
++
++static void lvar_key_set(struct iam_leaf *l, const struct iam_key *k)
++{
++        assert_corr(n_at_rec(l));
++        assert_corr(strlen(kchar(k)) == e_keysize(n_cur(l)));
++        assert_corr(iam_leaf_is_locked(l));
++        __lvar_key_set(l, k);
++        assert_inv(n_invariant(l));
++}
++
++static int lvar_key_cmp(const struct iam_leaf *l, const struct iam_key *k)
++{
++        lvar_hash_t hash;
++        const char *name;
++
++        name = kchar(k);
++
++        hash = get_hash(iam_leaf_container(l), name, strlen(name));
++        return e_cmp(l, n_cur(l), hash);
++}
++
++static int lvar_key_eq(const struct iam_leaf *l, const struct iam_key *k)
++{
++        const char *name;
++
++        name = kchar(k);
++        return e_eq(n_cur(l), name, strlen(name));
++}
++
++static void __lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r)
++{
++        memcpy(e_rec(n_cur(l)), r, rec_size(r));
++}
++
++static void lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r)
++{
++        assert_corr(n_at_rec(l));
++        assert_corr(iam_leaf_is_locked(l));
++        __lvar_rec_set(l, r);
++        assert_inv(n_invariant(l));
++}
++
++static void lvar_rec_get(const struct iam_leaf *l, struct iam_rec *r)
++{
++        struct iam_rec *rec;
++
++        rec = e_rec(n_cur(l));
++        assert_corr(n_at_rec(l));
++        assert_corr(iam_leaf_is_locked(l));
++        memcpy(r, rec, rec_size(rec));
++        assert_inv(n_invariant(l));
++}
++
++static int lvar_can_add(const struct iam_leaf *l,
++                        const struct iam_key *k, const struct iam_rec *r)
++{
++        assert_corr(iam_leaf_is_locked(l));
++        return
++                h_used(n_head(l)) +
++                getsize(l, strlen(kchar(k)), rec_size(r)) <= blocksize(l);
++}
++
++static int lvar_at_end(const struct iam_leaf *folio)
++{
++        assert_corr(iam_leaf_is_locked(folio));
++        return n_cur(folio) == n_end(folio);
++}
++
++static void lvar_rec_add(struct iam_leaf *leaf,
++                         const struct iam_key *k, const struct iam_rec *r)
++{
++        const char *key;
++        int   ksize;
++        int   shift;
++        void *end;
++        void *start;
++        ptrdiff_t diff;
++
++        assert_corr(lvar_can_add(leaf, k, r));
++        assert_inv(n_invariant(leaf));
++        assert_corr(iam_leaf_is_locked(leaf));
++
++        key   = kchar(k);
++        ksize = strlen(key);
++        shift = getsize(leaf, ksize, rec_size(r));
++
++        if (!lvar_at_end(leaf)) {
++                assert_corr(n_cur(leaf) < n_end(leaf));
++                end = n_end(leaf);
++                if (lvar_key_cmp(leaf, k) <= 0)
++                        lvar_next(leaf);
++                else
++                        /*
++                         * Another exceptional case: insertion with the key
++                         * less than least key in the leaf.
++                         */
++                        assert_corr(leaf->il_at == leaf->il_entries);
++
++                start = leaf->il_at;
++                diff  = PDIFF(end, start);
++                assert_corr(diff >= 0);
++                memmove(start + shift, start, diff);
++        }
++        h_used_adj(leaf, n_head(leaf), shift);
++        n_cur(leaf)->vle_keysize = cpu_to_le16(ksize);
++        n_cur(leaf)->vle_hash = cpu_to_le32(get_hash(iam_leaf_container(leaf),
++                                                     key, ksize));
++        __lvar_key_set(leaf, k);
++        __lvar_rec_set(leaf, r);
++        assert_corr(n_at_rec(leaf));
++        assert_inv(n_invariant(leaf));
++}
++
++static void lvar_rec_del(struct iam_leaf *leaf, int shift)
++{
++        void *next;
++        void *end;
++        int nob;
++
++        assert_corr(n_at_rec(leaf));
++        assert_inv(n_invariant(leaf));
++        assert_corr(iam_leaf_is_locked(leaf));
++
++        end  = n_end(leaf);
++        next = e_next(leaf, n_cur(leaf));
++        nob  = e_size(leaf, n_cur(leaf));
++        memmove(leaf->il_at, next, end - next);
++        h_used_adj(leaf, n_head(leaf), -nob);
++        assert_inv(n_invariant(leaf));
++}
++
++static void lvar_init_new(struct iam_container *c, struct buffer_head *bh)
++{
++        struct lvar_leaf_header *hdr;
++
++        hdr = (struct lvar_leaf_header *)bh->b_data;
++        hdr->vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC);
++        hdr->vlh_used  = sizeof *hdr;
++}
++
++static struct lvar_leaf_entry *find_pivot(const struct iam_leaf *leaf,
++                                          struct lvar_leaf_entry **prev)
++{
++        void *scan;
++        void *start;
++        int threshold;
++
++        *prev = NULL;
++        threshold = blocksize(leaf) / 2;
++        for (scan = start = n_start(leaf); scan - start <= threshold;
++             *prev = scan, scan = e_next(leaf, scan)) {
++                ;
++        }
++        return scan;
++}
++
++static void lvar_split(struct iam_leaf *leaf, struct buffer_head **bh,
++                       iam_ptr_t new_blknr)
++{
++        struct lvar_leaf_entry  *first_to_move;
++        struct lvar_leaf_entry  *last_to_stay;
++        struct iam_path         *path;
++        struct lvar_leaf_header *hdr;
++        struct buffer_head      *new_leaf;
++
++        ptrdiff_t   tomove;
++        lvar_hash_t hash;
++
++        assert_inv(n_invariant(leaf));
++        assert_corr(iam_leaf_is_locked(leaf));
++
++        new_leaf = *bh;
++        path = iam_leaf_path(leaf);
++
++        hdr = (void *)new_leaf->b_data;
++
++        first_to_move = find_pivot(leaf, &last_to_stay);
++        assert_corr(last_to_stay != NULL);
++        assert_corr(e_next(leaf, last_to_stay) == first_to_move);
++
++        hash = e_hash(first_to_move);
++        if (hash == e_hash(last_to_stay))
++                /*
++                 * Duplicate hash.
++                 */
++                hash |= 1;
++
++        tomove = PDIFF(n_end(leaf), first_to_move);
++        memmove(hdr + 1, first_to_move, tomove);
++
++        h_used_adj(leaf, hdr, tomove);
++        h_used_adj(leaf, n_head(leaf), -tomove);
++
++        assert_corr(n_end(leaf) == first_to_move);
++
++        if (n_cur(leaf) >= first_to_move) {
++                /*
++                 * insertion point moves into new leaf.
++                 */
++                ptrdiff_t shift;
++                int result;
++
++                shift = PDIFF(leaf->il_at, first_to_move);
++                *bh = leaf->il_bh;
++                leaf->il_bh = new_leaf;
++                leaf->il_curidx = new_blknr;
++
++                assert_corr(iam_leaf_is_locked(leaf));
++                result = lvar_init(leaf);
++                /*
++                 * init cannot fail, as node was just initialized.
++                 */
++                assert_corr(result == 0);
++                leaf->il_at = ((void *)leaf->il_at) + shift;
++        }
++        /*
++         * Insert pointer to the new node (together with the least key in
++         * the node) into index node.
++         */
++        iam_insert_key_lock(path, path->ip_frame, (struct iam_ikey *)&hash,
++                            new_blknr);
++        assert_corr(n_cur(leaf) < n_end(leaf));
++        assert_inv(n_invariant(leaf));
++}
++
++static struct iam_leaf_operations lvar_leaf_ops = {
++        .init           = lvar_init,
++        .init_new       = lvar_init_new,
++        .fini           = lvar_fini,
++        .start          = lvar_start,
++        .next           = lvar_next,
++        .key            = lvar_key,
++        .ikey           = lvar_ikey,
++        .rec            = lvar_rec,
++        .key_set        = lvar_key_set,
++        .key_cmp        = lvar_key_cmp,
++        .key_eq         = lvar_key_eq,
++        .key_size       = lvar_key_size,
++        .rec_set        = lvar_rec_set,
++        .rec_get        = lvar_rec_get,
++        .lookup         = lvar_lookup,
++        .ilookup        = lvar_ilookup,
++        .at_end         = lvar_at_end,
++        .rec_add        = lvar_rec_add,
++        .rec_del        = lvar_rec_del,
++        .can_add        = lvar_can_add,
++        .split          = lvar_split
++};
++
++/*
++ * Index operations.
++ */
++
++enum {
++        /* This is duplicated in lustre/utils/create_iam.c */
++        /* egrep -i '^o?x?[olabcdef]*$' /usr/share/dict/words */
++        IAM_LVAR_ROOT_MAGIC = 0xb01dface
++};
++
++/* This is duplicated in lustre/utils/create_iam.c */
++struct lvar_root {
++        __le32 vr_magic;
++        __le16 vr_recsize;
++        __le16 vr_ptrsize;
++        u8     vr_indirect_levels;
++        u8     vr_padding0;
++        __le16 vr_padding1;
++};
++
++static __u32 lvar_root_ptr(struct iam_container *c)
++{
++        return 0;
++}
++
++static int lvar_node_init(struct iam_container *c, struct buffer_head *bh,
++                          int root)
++{
++        return 0;
++}
++
++static struct iam_entry *lvar_root_inc(struct iam_container *c,
++                                       struct iam_path *path,
++                                       struct iam_frame *frame)
++{
++        struct lvar_root *root;
++        struct iam_entry *entries;
++
++        assert_corr(iam_frame_is_locked(path, frame));
++        entries = frame->entries;
++
++        dx_set_count(entries, 2);
++        assert_corr(dx_get_limit(entries) == dx_root_limit(path));
++
++        root = (void *)frame->bh->b_data;
++        assert_corr(le32_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC); /* vr_magic is __le32 */
++        root->vr_indirect_levels ++;
++        frame->at = entries = iam_entry_shift(path, entries, 1);
++        memset(iam_ikey_at(path, entries), 0,
++               iam_path_descr(path)->id_ikey_size);
++        return entries;
++}
++
++static int lvar_node_check(struct iam_path *path, struct iam_frame *frame)
++{
++        unsigned count;
++        unsigned limit;
++        unsigned limit_correct;
++        struct iam_entry *entries;
++
++        entries = dx_node_get_entries(path, frame);
++
++        if (frame == path->ip_frames) {
++                struct lvar_root *root;
++
++                root = (void *)frame->bh->b_data;
++                if (le32_to_cpu(root->vr_magic) != IAM_LVAR_ROOT_MAGIC) /* vr_magic is __le32 */
++                        return -EIO;
++                limit_correct = dx_root_limit(path);
++        } else
++                limit_correct = dx_node_limit(path);
++        count = dx_get_count(entries);
++        limit = dx_get_limit(entries);
++        if (count > limit)
++                return -EIO;
++        if (limit != limit_correct)
++                return -EIO;
++        return 0;
++}
++
++static int lvar_node_load(struct iam_path *path, struct iam_frame *frame)
++{
++        struct iam_entry *entries;
++        void *data;
++        entries = dx_node_get_entries(path, frame);
++
++        data = frame->bh->b_data;
++
++        if (frame == path->ip_frames) {
++                struct lvar_root *root;
++                const char *name;
++
++                root = data;
++                name = kchar(path->ip_key_target);
++                path->ip_indirect = root->vr_indirect_levels;
++                if (path->ip_ikey_target == NULL) {
++                        path->ip_ikey_target = iam_path_ikey(path, 4);
++                        *(lvar_hash_t *)path->ip_ikey_target =
++                                get_hash(path->ip_container, name,
++                                         strlen(name));
++                }
++        }
++        frame->entries = frame->at = entries;
++        return 0;
++}
++
++static int lvar_ikeycmp(const struct iam_container *c,
++                        const struct iam_ikey *k1, const struct iam_ikey *k2)
++{
++      lvar_hash_t p1 = le32_to_cpu(*(lvar_hash_t *)k1);
++      lvar_hash_t p2 = le32_to_cpu(*(lvar_hash_t *)k2);
++
++      return p1 > p2 ? +1 : (p1 < p2 ? -1 : 0);
++}
++
++static struct iam_path_descr *lvar_ipd_alloc(const struct iam_container *c,
++                                             void *area)
++{
++        return iam_ipd_alloc(area, c->ic_descr->id_ikey_size);
++}
++
++static int root_limit(int rootgap, int blocksize, int size)
++{
++        int limit;
++        int nlimit;
++
++        limit = (blocksize - rootgap) / size;
++        nlimit = blocksize / size;
++        if (limit == nlimit)
++                limit--;
++        return limit;
++}
++
++static int lvar_root_limit(int blocksize, int size)
++{
++        return root_limit(sizeof(struct lvar_root), blocksize, size);
++}
++
++static void lvar_root(void *buf,
++                      int blocksize, int keysize, int ptrsize, int recsize)
++{
++        struct lvar_root *root;
++        struct dx_countlimit *limit;
++        void                 *entry;
++        int isize;
++
++        isize = sizeof(lvar_hash_t) + ptrsize;
++        root = buf;
++        *root = (typeof(*root)) {
++                .vr_magic            = cpu_to_le32(IAM_LVAR_ROOT_MAGIC),
++                .vr_recsize          = cpu_to_le16(recsize),
++                .vr_ptrsize          = cpu_to_le16(ptrsize),
++                .vr_indirect_levels  = 0
++        };
++
++        limit = (void *)(root + 1);
++        *limit = (typeof(*limit)){
++                /*
++                 * limit itself + one pointer to the leaf.
++                 */
++                .count = cpu_to_le16(2),
++                .limit = cpu_to_le16(lvar_root_limit(blocksize,
++                                     sizeof (lvar_hash_t) + ptrsize))
++        };
++
++        entry = root + 1;
++        /*
++         * Skip over @limit.
++         */
++        entry += isize;
++
++        /*
++         * Entry format is <key> followed by <ptr>. In the minimal tree
++         * consisting of a root and single node, <key> is a minimal possible
++         * key.
++         */
++        *(lvar_hash_t *)entry = 0;
++        entry += sizeof(lvar_hash_t);
++        /* now @entry points to <ptr> */
++        if (ptrsize == 4)
++                *(u_int32_t *)entry = cpu_to_le32(1);
++        else
++                *(u_int64_t *)entry = cpu_to_le64(1);
++}
++
++static int lvar_esize(int namelen, int recsize)
++{
++        return (offsetof(struct lvar_leaf_entry, vle_key) +
++                namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND;
++}
++
++static void lvar_leaf(void *buf,
++                      int blocksize, int keysize, int ptrsize, int recsize)
++{
++        struct lvar_leaf_header *head;
++        struct lvar_leaf_entry  *entry;
++
++        /* form leaf */
++        head = buf;
++        *head = (typeof(*head)) {
++                .vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC),
++                .vlh_used  = cpu_to_le16(sizeof *head + lvar_esize(0, recsize))
++        };
++        entry = (void *)(head + 1);
++        *entry = (typeof(*entry)) {
++                .vle_hash    = 0,
++                .vle_keysize = 0
++        };
++        memset(e_rec(entry), 0, recsize);
++        *(char *)e_rec(entry) = recsize;
++}
++
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++
++int iam_lvar_create(struct inode *obj,
++                    int keysize, int ptrsize, int recsize, handle_t *handle)
++{
++        struct buffer_head *root_node;
++        struct buffer_head *leaf_node;
++        struct super_block *sb;
++
++        u32 blknr;
++        int result;
++        unsigned long bsize;
++
++        assert_corr(obj->i_size == 0);
++
++        sb = obj->i_sb;
++        bsize = sb->s_blocksize;
++        root_node = ext3_append(handle, obj, &blknr, &result);
++        leaf_node = ext3_append(handle, obj, &blknr, &result);
++        if (root_node != NULL && leaf_node != NULL) {
++                lvar_root(root_node->b_data, bsize, keysize, ptrsize, recsize);
++                lvar_leaf(leaf_node->b_data, bsize, keysize, ptrsize, recsize);
++                ext3_mark_inode_dirty(handle, obj);
++                result = ext3_journal_dirty_metadata(handle, root_node);
++                if (result == 0)
++                        result = ext3_journal_dirty_metadata(handle, leaf_node);
++                if (result != 0)
++                        ext3_std_error(sb, result);
++        }
++        brelse(leaf_node);
++        brelse(root_node);
++        return result;
++}
++EXPORT_SYMBOL(iam_lvar_create);
++
++static struct iam_operations lvar_ops = {
++        .id_root_ptr    = lvar_root_ptr,
++        .id_node_read   = iam_node_read,
++        .id_node_init   = lvar_node_init,
++        .id_node_check  = lvar_node_check,
++        .id_node_load   = lvar_node_load,
++        .id_ikeycmp     = lvar_ikeycmp,
++        .id_root_inc    = lvar_root_inc,
++        .id_ipd_alloc   = lvar_ipd_alloc,
++        .id_ipd_free    = iam_ipd_free,
++        .id_name        = "lvar"
++};
++
++static int lvar_guess(struct iam_container *c)
++{
++        int result;
++        struct buffer_head *bh;
++        const struct lvar_root *root;
++
++        assert_corr(c->ic_object != NULL);
++
++        result = iam_node_read(c, lvar_root_ptr(c), NULL, &bh);
++        if (result == 0) {
++                root = (void *)bh->b_data;
++                if (le64_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC) {
++                        struct iam_descr *descr;
++
++                        descr = c->ic_descr;
++                        descr->id_key_size  = EXT3_NAME_LEN;
++                        descr->id_ikey_size = sizeof (lvar_hash_t);
++                        descr->id_rec_size  = le16_to_cpu(root->vr_recsize);
++                        descr->id_ptr_size  = le16_to_cpu(root->vr_ptrsize);
++                        descr->id_root_gap  = sizeof *root;
++                        descr->id_node_gap  = 0;
++                        descr->id_ops       = &lvar_ops;
++                        descr->id_leaf_ops  = &lvar_leaf_ops;
++                } else
++                        result = -EBADF;
++                brelse(bh);
++        }
++        return result;
++}
++
++static struct iam_format lvar_format = {
++        .if_guess = lvar_guess
++};
++
++void iam_lvar_format_init(void)
++{
++        iam_format_register(&lvar_format);
++}
++
+Index: iam/fs/ext3/iam_lfix.c
+===================================================================
+--- iam.orig/fs/ext3/iam_lfix.c        2007-05-23 09:56:30.476305206 +0800
++++ iam/fs/ext3/iam_lfix.c     2007-05-23 11:18:20.000000000 +0800
+@@ -0,0 +1,735 @@
++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++ * vim:expandtab:shiftwidth=8:tabstop=8:
++ *
++ *  iam_lfix.c
++ *  implementation of iam format for fixed size records.
++ *
++ *  Copyright (c) 2006 Cluster File Systems, Inc.
++ *   Author: Wang Di <wangdi@clusterfs.com>
++ *   Author: Nikita Danilov <nikita@clusterfs.com>
++ *
++ *   This file is part of the Lustre file system, http://www.lustre.org
++ *   Lustre is a trademark of Cluster File Systems, Inc.
++ *
++ *   You may have signed or agreed to another license before downloading
++ *   this software.  If so, you are bound by the terms and conditions
++ *   of that agreement, and the following does not apply to you.  See the
++ *   LICENSE file included with this distribution for more information.
++ *
++ *   If you did not agree to a different license, then this copy of Lustre
++ *   is open source software; you can redistribute it and/or modify it
++ *   under the terms of version 2 of the GNU General Public License as
++ *   published by the Free Software Foundation.
++ *
++ *   In either case, Lustre is distributed in the hope that it will be
++ *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
++ *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ *   license text for more details.
++ */
++
++#include <linux/types.h>
++#include <linux/jbd.h>
++/* ext3_error() */
++#include <linux/ext3_fs.h>
++
++#include <linux/lustre_iam.h>
++
++#include <libcfs/libcfs.h>
++#include <libcfs/kp30.h>
++
++/*
++ * Leaf operations.
++ */
++
++enum {
++        IAM_LEAF_HEADER_MAGIC = 0x1976 /* This is duplicated in
++                                        * lustre/utils/create_iam.c */
++};
++
++/* This is duplicated in lustre/utils/create_iam.c */
++struct iam_leaf_head {
++        __le16 ill_magic;
++        __le16 ill_count;
++};
++
++static inline int iam_lfix_entry_size(const struct iam_leaf *l)
++{
++        return iam_leaf_descr(l)->id_key_size + iam_leaf_descr(l)->id_rec_size;
++}
++
++static inline struct iam_lentry *
++iam_lfix_shift(const struct iam_leaf *l, struct iam_lentry *entry, int shift)
++{
++        return (void *)entry + shift * iam_lfix_entry_size(l);
++}
++
++static inline struct iam_key *iam_leaf_key_at(struct iam_lentry *entry)
++{
++        return (struct iam_key *)entry;
++}
++
++static inline int lfix_keycmp(const struct iam_container *c,
++                              const struct iam_key *k1,
++                              const struct iam_key *k2)
++{
++      return memcmp(k1, k2, c->ic_descr->id_key_size);
++}
++
++static struct iam_leaf_head *iam_get_head(const struct iam_leaf *l)
++{
++        return (struct iam_leaf_head *)l->il_bh->b_data;
++}
++
++static struct iam_lentry *iam_entries(const struct buffer_head *bh)
++{
++        return (void *)bh->b_data + sizeof(struct iam_leaf_head);
++}
++
++static struct iam_lentry *iam_get_lentries(const struct iam_leaf *l)
++{
++        return iam_entries(l->il_bh);
++}
++
++static int leaf_count_limit(const struct iam_leaf *leaf)
++{
++        int free_space;
++
++        free_space = iam_leaf_container(leaf)->ic_object->i_sb->s_blocksize;
++        free_space -= sizeof(struct iam_leaf_head);
++        return free_space / iam_lfix_entry_size(leaf);
++}
++
++static int lentry_count_get(const struct iam_leaf *leaf)
++{
++        return le16_to_cpu(iam_get_head(leaf)->ill_count);
++}
++
++static void lentry_count_set(struct iam_leaf *leaf, unsigned count)
++{
++        assert_corr(0 <= count && count <= leaf_count_limit(leaf));
++        iam_get_head(leaf)->ill_count = cpu_to_le16(count);
++}
++
++static struct iam_lentry *iam_lfix_get_end(const struct iam_leaf *l);
++
++#if EXT3_CORRECTNESS_ON || EXT3_INVARIANT_ON
++static int iam_leaf_at_rec(const struct iam_leaf *folio)
++{
++        return
++                iam_get_lentries(folio) <= folio->il_at &&
++                folio->il_at < iam_lfix_get_end(folio);
++}
++#endif
++
++static struct iam_ikey *iam_lfix_ikey(const struct iam_leaf *l,
++                                      struct iam_ikey *key)
++{
++        void *ie = l->il_at;
++        assert_corr(iam_leaf_at_rec(l));
++        return (struct iam_ikey*)ie;
++}
++
++static struct iam_key *iam_lfix_key(const struct iam_leaf *l)
++{
++        void *ie = l->il_at;
++        assert_corr(iam_leaf_at_rec(l));
++        return (struct iam_key*)ie;
++}
++
++static int iam_lfix_key_size(const struct iam_leaf *l)
++{
++        return iam_leaf_descr(l)->id_key_size;
++}
++
++static void iam_lfix_start(struct iam_leaf *l)
++{
++        l->il_at = iam_get_lentries(l);
++}
++
++static inline ptrdiff_t iam_lfix_diff(const struct iam_leaf *l,
++                                      const struct iam_lentry *e1,
++                                      const struct iam_lentry *e2)
++{
++        ptrdiff_t diff;
++        int esize;
++
++        esize = iam_lfix_entry_size(l);
++        diff = (void *)e1 - (void *)e2;
++        assert_corr(diff / esize * esize == diff);
++        return diff / esize;
++}
++
++static int iam_lfix_init(struct iam_leaf *l)
++{
++        int result;
++        struct iam_leaf_head *ill;
++        int count;
++
++        assert_corr(l->il_bh != NULL);
++
++        ill = iam_get_head(l);
++        count = le16_to_cpu(ill->ill_count);
++        if (ill->ill_magic == le16_to_cpu(IAM_LEAF_HEADER_MAGIC) &&
++            0 <= count && count <= leaf_count_limit(l)) {
++                l->il_at = l->il_entries = iam_get_lentries(l);
++                result = 0;
++        } else {
++                struct inode *obj;
++
++                obj = iam_leaf_container(l)->ic_object;
++                ext3_error(obj->i_sb, __FUNCTION__,
++                           "Wrong magic in node %llu (#%lu): %#x != %#x or "
++                           "wrong count: %i (%i)",
++                           (unsigned long long)l->il_bh->b_blocknr, obj->i_ino,
++                           ill->ill_magic, le16_to_cpu(IAM_LEAF_HEADER_MAGIC),
++                           count, leaf_count_limit(l));
++                result = -EIO;
++        }
++        return result;
++}
++
++static void iam_lfix_fini(struct iam_leaf *l)
++{
++        l->il_entries = l->il_at = NULL;
++}
++
++static struct iam_lentry *iam_lfix_get_end(const struct iam_leaf *l)
++{
++        int count = lentry_count_get(l);
++        struct iam_lentry *ile = iam_lfix_shift(l, l->il_entries, count);
++
++        return ile;
++}
++
++struct iam_rec *iam_lfix_rec(const struct iam_leaf *l)
++{
++        void *e = l->il_at;
++        assert_corr(iam_leaf_at_rec(l));
++        return e + iam_leaf_descr(l)->id_key_size;
++}
++
++static void iam_lfix_next(struct iam_leaf *l)
++{
++        assert_corr(iam_leaf_at_rec(l));
++        l->il_at = iam_lfix_shift(l, l->il_at, 1);
++}
++
++/*
++ * Bug chasing.
++ */
++int lfix_dump = 0;
++EXPORT_SYMBOL(lfix_dump);
++
++static char hdigit(char ch)
++{
++        static char d[] = "0123456789abcdef";
++        return d[ch & 0xf];
++}
++
++static char *hex(char ch, char *area)
++{
++        area[0] = hdigit(ch >> 4);
++        area[1] = hdigit(ch);
++        area[2] = 0;
++        return area;
++}
++
++static void l_print(struct iam_leaf *leaf, struct iam_lentry *entry)
++{
++        int i;
++        char *area;
++        char h[3];
++
++        area = (char *)entry;
++        printk(KERN_EMERG "[");
++        for (i = iam_lfix_key_size(leaf); i > 0; --i, ++area)
++                printk("%s", hex(*area, h));
++        printk("]-(");
++        for (i = iam_leaf_descr(leaf)->id_rec_size; i > 0; --i, ++area)
++                printk("%s", hex(*area, h));
++        printk(")\n");
++}
++
++static void lfix_print(struct iam_leaf *leaf)
++{
++        struct iam_lentry *entry;
++        int count;
++        int i;
++
++        entry = leaf->il_entries;
++        count = lentry_count_get(leaf);
++        printk(KERN_EMERG "lfix: %p %p %d\n", leaf, leaf->il_at, count);
++        for (i = 0; i < count; ++i, entry = iam_lfix_shift(leaf, entry, 1))
++                l_print(leaf, entry);
++}
++
++static int iam_lfix_lookup(struct iam_leaf *l, const struct iam_key *k)
++{
++        struct iam_lentry *p, *q, *m, *t;
++        struct iam_container *c;
++        int count;
++        int result;
++
++        count = lentry_count_get(l);
++        if (count == 0)
++                return IAM_LOOKUP_EMPTY;
++
++        result = IAM_LOOKUP_OK;
++        c = iam_leaf_container(l);
++
++        p = l->il_entries;
++        q = iam_lfix_shift(l, p, count - 1);
++        if (lfix_keycmp(c, k, iam_leaf_key_at(p)) < 0) {
++                /*
++                 * @k is less than the least key in the leaf
++                 */
++                l->il_at = p;
++                result = IAM_LOOKUP_BEFORE;
++        } else if (lfix_keycmp(c, iam_leaf_key_at(q), k) <= 0) {
++                l->il_at = q;
++        } else {
++                /*
++                 * EWD1293
++                 */
++                while (iam_lfix_shift(l, p, 1) != q) {
++                        m = iam_lfix_shift(l, p, iam_lfix_diff(l, q, p) / 2);
++                        assert_corr(p < m && m < q);
++                        if (lfix_keycmp(c, iam_leaf_key_at(m), k) <= 0)
++                                p = m;
++                        else
++                                q = m;
++                }
++                assert_corr(lfix_keycmp(c, iam_leaf_key_at(p), k) <= 0 &&
++                            lfix_keycmp(c, k, iam_leaf_key_at(q)) < 0);
++                /*
++                 * skip over records with duplicate keys.
++                 */
++                while (p > l->il_entries) {
++                        t = iam_lfix_shift(l, p, -1);
++                        if (lfix_keycmp(c, iam_leaf_key_at(t), k) == 0)
++                                p = t;
++                        else
++                                break;
++                }
++                l->il_at = p;
++        }
++        assert_corr(iam_leaf_at_rec(l));
++
++        if (lfix_keycmp(c, iam_leaf_key_at(l->il_at), k) == 0)
++                result = IAM_LOOKUP_EXACT;
++
++        if (lfix_dump)
++                lfix_print(l);
++
++        return result;
++}
++
++static int iam_lfix_ilookup(struct iam_leaf *l, const struct iam_ikey *ik)
++{
++        assert(0);
++        return IAM_LOOKUP_OK;
++}
++
++static void iam_lfix_key_set(struct iam_leaf *l, const struct iam_key *k)
++{
++        assert_corr(iam_leaf_at_rec(l));
++        memcpy(iam_leaf_key_at(l->il_at), k, iam_leaf_descr(l)->id_key_size);
++}
++
++static int iam_lfix_key_cmp(const struct iam_leaf *l, const struct iam_key *k)
++{
++        return lfix_keycmp(iam_leaf_container(l), iam_leaf_key_at(l->il_at), k);
++}
++
++static int iam_lfix_key_eq(const struct iam_leaf *l, const struct iam_key *k)
++{
++        return !lfix_keycmp(iam_leaf_container(l),
++                            iam_leaf_key_at(l->il_at), k);
++}
++
++static void iam_lfix_rec_set(struct iam_leaf *l, const struct iam_rec *r)
++{
++        assert_corr(iam_leaf_at_rec(l));
++        memcpy(iam_lfix_rec(l), r, iam_leaf_descr(l)->id_rec_size);
++}
++
++static void iam_lfix_rec_get(const struct iam_leaf *l, struct iam_rec *r)
++{
++        assert_corr(iam_leaf_at_rec(l));
++        memcpy(r, iam_lfix_rec(l), iam_leaf_descr(l)->id_rec_size);
++}
++
++static void iam_lfix_rec_add(struct iam_leaf *leaf,
++                             const struct iam_key *k, const struct iam_rec *r)
++{
++        struct iam_lentry *end;
++        struct iam_lentry *cur;
++        struct iam_lentry *start;
++        ptrdiff_t diff;
++        int count;
++
++        assert_corr(iam_leaf_can_add(leaf, k, r));
++
++        count = lentry_count_get(leaf);
++        /*
++         * This branch handles two exceptional cases:
++         *
++         *   - leaf positioned beyond last record, and
++         *
++         *   - empty leaf.
++         */
++        if (!iam_leaf_at_end(leaf)) {
++                end   = iam_lfix_get_end(leaf);
++                cur   = leaf->il_at;
++                if (lfix_keycmp(iam_leaf_container(leaf),
++                               k, iam_leaf_key_at(cur)) >= 0)
++                        iam_lfix_next(leaf);
++                else
++                        /*
++                         * Another exceptional case: insertion with the key
++                         * less than least key in the leaf.
++                         */
++                        assert_corr(cur == leaf->il_entries);
++
++                start = leaf->il_at;
++                diff  = (void *)end - (void *)start;
++                assert_corr(diff >= 0);
++                memmove(iam_lfix_shift(leaf, start, 1), start, diff);
++        }
++        lentry_count_set(leaf, count + 1);
++        iam_lfix_key_set(leaf, k);
++        iam_lfix_rec_set(leaf, r);
++        assert_corr(iam_leaf_at_rec(leaf));
++}
++
++static void iam_lfix_rec_del(struct iam_leaf *leaf, int shift)
++{
++        struct iam_lentry *next, *end;
++        int count;
++        ptrdiff_t diff;
++
++        assert_corr(iam_leaf_at_rec(leaf));
++
++        count = lentry_count_get(leaf);
++        end = iam_lfix_get_end(leaf);
++        next = iam_lfix_shift(leaf, leaf->il_at, 1);
++        diff = (void *)end - (void *)next;
++        memmove(leaf->il_at, next, diff);
++
++        lentry_count_set(leaf, count - 1);
++}
++
++static int iam_lfix_can_add(const struct iam_leaf *l,
++                            const struct iam_key *k, const struct iam_rec *r)
++{
++        return lentry_count_get(l) < leaf_count_limit(l);
++}
++
++static int iam_lfix_at_end(const struct iam_leaf *folio)
++{
++        return folio->il_at == iam_lfix_get_end(folio);
++}
++
++static void iam_lfix_init_new(struct iam_container *c, struct buffer_head *bh)
++{
++        struct iam_leaf_head *hdr;
++
++        hdr = (struct iam_leaf_head*)bh->b_data;
++        hdr->ill_magic = cpu_to_le16(IAM_LEAF_HEADER_MAGIC);
++        hdr->ill_count = cpu_to_le16(0);
++}
++
++static void iam_lfix_split(struct iam_leaf *l, struct buffer_head **bh,
++                           iam_ptr_t new_blknr)
++{
++        struct iam_path       *path;
++        struct iam_leaf_head  *hdr;
++        const struct iam_ikey *pivot;
++        struct buffer_head    *new_leaf;
++
++        unsigned count;
++        unsigned split;
++
++        void *start;
++        void *finis;
++
++        new_leaf = *bh;
++        path = iam_leaf_path(l);
++
++        hdr = (void *)new_leaf->b_data;
++
++        count = lentry_count_get(l);
++        split = count / 2;
++
++        start = iam_lfix_shift(l, iam_get_lentries(l), split);
++        finis = iam_lfix_shift(l, iam_get_lentries(l), count);
++
++        pivot = (const struct iam_ikey *)iam_leaf_key_at(start);
++
++        memmove(iam_entries(new_leaf), start, finis - start);
++        hdr->ill_count = count - split;
++        lentry_count_set(l, split);
++        if ((void *)l->il_at >= start) {
++                /*
++                 * insertion point moves into new leaf.
++                 */
++                int shift;
++                int result;
++
++                shift = iam_lfix_diff(l, l->il_at, start);
++                *bh = l->il_bh;
++                l->il_bh = new_leaf;
++                l->il_curidx = new_blknr;
++                result = iam_lfix_init(l);
++                /*
++                 * init cannot fail, as node was just initialized.
++                 */
++                assert_corr(result == 0);
++                l->il_at = iam_lfix_shift(l, iam_get_lentries(l), shift);
++        }
++        /*
++         * Insert pointer to the new node (together with the least key in
++         * the node) into index node.
++         */
++        iam_insert_key_lock(path, path->ip_frame, pivot, new_blknr);
++}
++
++static struct iam_leaf_operations iam_lfix_leaf_ops = {
++        .init           = iam_lfix_init,
++        .init_new       = iam_lfix_init_new,
++        .fini           = iam_lfix_fini,
++        .start          = iam_lfix_start,
++        .next           = iam_lfix_next,
++        .key            = iam_lfix_key,
++        .ikey           = iam_lfix_ikey,
++        .rec            = iam_lfix_rec,
++        .key_set        = iam_lfix_key_set,
++        .key_cmp        = iam_lfix_key_cmp,
++        .key_eq         = iam_lfix_key_eq,
++        .key_size       = iam_lfix_key_size,
++        .rec_set        = iam_lfix_rec_set,
++        .rec_get        = iam_lfix_rec_get,
++        .lookup         = iam_lfix_lookup,
++        .ilookup        = iam_lfix_ilookup,
++        .at_end         = iam_lfix_at_end,
++        .rec_add        = iam_lfix_rec_add,
++        .rec_del        = iam_lfix_rec_del,
++        .can_add        = iam_lfix_can_add,
++        .split          = iam_lfix_split
++};
++
++/*
++ * Index operations.
++ */
++
++enum {
++        /* This is duplicated in lustre/utils/create_iam.c */
++        /*
++         * Then shalt thou see the dew-BEDABBLED wretch
++         * Turn, and return, indenting with the way;
++         * Each envious brier his weary legs doth scratch,
++         * Each shadow makes him stop, each murmur stay:
++         * For misery is trodden on by many,
++         * And being low never relieved by any.
++         */
++        IAM_LFIX_ROOT_MAGIC = 0xbedabb1edULL // d01efull
++};
++
++/* This is duplicated in lustre/utils/create_iam.c */
++struct iam_lfix_root {
++        __le64  ilr_magic;
++        __le16  ilr_keysize;
++        __le16  ilr_recsize;
++        __le16  ilr_ptrsize;
++        u8      ilr_indirect_levels;
++        u8      ilr_padding;
++};
++
++static __u32 iam_lfix_root_ptr(struct iam_container *c)
++{
++        return 0;
++}
++
++static int iam_lfix_node_init(struct iam_container *c, struct buffer_head *bh,
++                              int root)
++{
++        return 0;
++}
++
++static struct iam_entry *iam_lfix_root_inc(struct iam_container *c,
++                                           struct iam_path *path,
++                                           struct iam_frame *frame)
++{
++        struct iam_lfix_root *root;
++        struct iam_entry     *entries;
++
++        entries = frame->entries;
++
++        dx_set_count(entries, 2);
++        assert_corr(dx_get_limit(entries) == dx_root_limit(path));
++
++        root = (void *)frame->bh->b_data;
++        assert_corr(le64_to_cpu(root->ilr_magic) == IAM_LFIX_ROOT_MAGIC);
++        root->ilr_indirect_levels ++;
++        frame->at = entries = iam_entry_shift(path, entries, 1);
++        memset(iam_ikey_at(path, entries), 0,
++               iam_path_descr(path)->id_ikey_size);
++        return entries;
++}
++
++static int iam_lfix_node_check(struct iam_path *path, struct iam_frame *frame)
++{
++        unsigned count;
++        unsigned limit;
++        unsigned limit_correct;
++        struct iam_entry *entries;
++
++        entries = dx_node_get_entries(path, frame);
++
++        if (frame == path->ip_frames) {
++                struct iam_lfix_root *root;
++
++                root = (void *)frame->bh->b_data;
++                if (le64_to_cpu(root->ilr_magic) != IAM_LFIX_ROOT_MAGIC) {
++                        return -EIO;
++                }
++                limit_correct = dx_root_limit(path);
++        } else
++                limit_correct = dx_node_limit(path);
++        count = dx_get_count(entries);
++        limit = dx_get_limit(entries);
++        if (count > limit) {
++                return -EIO;
++        }
++        if (limit != limit_correct) {
++                return -EIO;
++        }
++        return 0;
++}
++
++static int iam_lfix_node_load(struct iam_path *path, struct iam_frame *frame)
++{
++        struct iam_entry *entries;
++        void *data;
++        entries = dx_node_get_entries(path, frame);
++
++        data = frame->bh->b_data;
++
++        if (frame == path->ip_frames) {
++                struct iam_lfix_root *root;
++
++                root = data;
++                path->ip_indirect = root->ilr_indirect_levels;
++                if (path->ip_ikey_target == NULL)
++                        path->ip_ikey_target =
++                                (struct iam_ikey *)path->ip_key_target;
++        }
++        frame->entries = frame->at = entries;
++        return 0;
++}
++
++static int iam_lfix_ikeycmp(const struct iam_container *c,
++                            const struct iam_ikey *k1,
++                            const struct iam_ikey *k2)
++{
++        return memcmp(k1, k2, c->ic_descr->id_ikey_size);
++}
++
++static struct iam_path_descr *iam_lfix_ipd_alloc(const struct iam_container *c,
++                                                 void *area)
++{
++        return iam_ipd_alloc(area, c->ic_descr->id_ikey_size);
++}
++
++static struct iam_operations iam_lfix_ops = {
++        .id_root_ptr    = iam_lfix_root_ptr,
++        .id_node_read   = iam_node_read,
++        .id_node_init   = iam_lfix_node_init,
++        .id_node_check  = iam_lfix_node_check,
++        .id_node_load   = iam_lfix_node_load,
++        .id_ikeycmp     = iam_lfix_ikeycmp,
++        .id_root_inc    = iam_lfix_root_inc,
++        .id_ipd_alloc   = iam_lfix_ipd_alloc,
++        .id_ipd_free    = iam_ipd_free,
++        .id_name        = "lfix"
++};
++
++static int iam_lfix_guess(struct iam_container *c)
++{
++        int result;
++        struct buffer_head *bh;
++        const struct iam_lfix_root *root;
++
++        assert_corr(c->ic_object != NULL);
++
++        result = iam_node_read(c, iam_lfix_root_ptr(c), NULL, &bh);
++        if (result == 0) {
++                root = (void *)bh->b_data;
++                if (le64_to_cpu(root->ilr_magic) == IAM_LFIX_ROOT_MAGIC) {
++                        struct iam_descr *descr;
++
++                        descr = c->ic_descr;
++                        descr->id_key_size  = le16_to_cpu(root->ilr_keysize);
++                        descr->id_ikey_size = le16_to_cpu(root->ilr_keysize);
++                        descr->id_rec_size  = le16_to_cpu(root->ilr_recsize);
++                        descr->id_ptr_size  = le16_to_cpu(root->ilr_ptrsize);
++                        descr->id_root_gap  = sizeof(struct iam_lfix_root);
++                        descr->id_node_gap  = 0;
++                        descr->id_ops       = &iam_lfix_ops;
++                        descr->id_leaf_ops  = &iam_lfix_leaf_ops;
++                } else
++                        result = -EBADF;
++                brelse(bh);
++        }
++        return result;
++}
++
++static struct iam_format iam_lfix_format = {
++        .if_guess = iam_lfix_guess
++};
++
++void iam_lfix_format_init(void)
++{
++        iam_format_register(&iam_lfix_format);
++}
++
++/*
++ * Debugging aid.
++ */
++
++#define KEYSIZE (8)
++#define RECSIZE (8)
++#define PTRSIZE (4)
++
++#define LFIX_ROOT_RECNO \
++        ((4096 - sizeof(struct iam_lfix_root)) / (KEYSIZE + PTRSIZE))
++
++#define LFIX_INDEX_RECNO (4096 / (KEYSIZE + PTRSIZE))
++
++#define LFIX_LEAF_RECNO \
++        ((4096 - sizeof(struct iam_leaf_head)) / (KEYSIZE + RECSIZE))
++
++struct lfix_root {
++        struct iam_lfix_root lr_root;
++        struct {
++                char key[KEYSIZE];
++                char ptr[PTRSIZE];
++        } lr_entry[LFIX_ROOT_RECNO];
++};
++
++struct lfix_index {
++        struct dx_countlimit li_cl;
++        char   li_padding[KEYSIZE + PTRSIZE - sizeof(struct dx_countlimit)];
++        struct {
++                char key[KEYSIZE];
++                char ptr[PTRSIZE];
++        } li_entry[LFIX_INDEX_RECNO - 1];
++};
++
++struct lfix_leaf {
++        struct iam_leaf_head ll_head;
++        struct {
++                char key[KEYSIZE];
++                char rec[RECSIZE];
++        } ll_entry[LFIX_LEAF_RECNO];
++};
+Index: iam/fs/ext3/iam_htree.c
+===================================================================
+--- iam.orig/fs/ext3/iam_htree.c       2007-05-23 09:56:30.476305206 +0800
++++ iam/fs/ext3/iam_htree.c    2007-05-23 11:18:20.000000000 +0800
+@@ -0,0 +1,687 @@
++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++ * vim:expandtab:shiftwidth=8:tabstop=8:
++ *
++ *  iam_htree.c
++ *  implementation of iam format for ext3/htree.
++ *
++ *  Copyright (c) 2006 Cluster File Systems, Inc.
++ *   Author: Nikita Danilov <nikita@clusterfs.com>
++ *
++ *   This file is part of the Lustre file system, http://www.lustre.org
++ *   Lustre is a trademark of Cluster File Systems, Inc.
++ *
++ *   You may have signed or agreed to another license before downloading
++ *   this software.  If so, you are bound by the terms and conditions
++ *   of that agreement, and the following does not apply to you.  See the
++ *   LICENSE file included with this distribution for more information.
++ *
++ *   If you did not agree to a different license, then this copy of Lustre
++ *   is open source software; you can redistribute it and/or modify it
++ *   under the terms of version 2 of the GNU General Public License as
++ *   published by the Free Software Foundation.
++ *
++ *   In either case, Lustre is distributed in the hope that it will be
++ *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
++ *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ *   license text for more details.
++ */
++
++#include <linux/types.h>
++#include <linux/jbd.h>
++/* ext3_error(), EXT3_DIR_ROUND() */
++#include <linux/ext3_fs.h>
++
++#include <linux/lustre_iam.h>
++
++#include <libcfs/libcfs.h>
++#include <libcfs/kp30.h>
++
++static inline struct ext3_dir_entry_2 *dent(struct iam_lentry *ent)
++{
++        return (struct ext3_dir_entry_2 *)ent;
++}
++
++static inline struct iam_path_compat *getipc(const struct iam_leaf *folio)
++{
++        struct iam_path *path;
++
++        path = iam_leaf_path(folio);
++        assert_corr(dx_index_is_compat(path));
++        assert_corr(path->ip_data != NULL);
++        return container_of(path->ip_data, struct iam_path_compat, ipc_descr);
++}
++
++static inline struct ext3_dir_entry_2 *getent(const struct iam_leaf *folio)
++{
++        return dent(folio->il_at);
++}
++
++static __u32 hashname(const struct iam_leaf *folio,
++                      const char *name, int namelen)
++{
++        int result;
++        struct dx_hash_info *hinfo;
++
++        hinfo = getipc(folio)->ipc_hinfo;
++        assert_corr(hinfo != NULL);
++        result = ext3fs_dirhash(name, namelen, hinfo);
++        assert_corr(result == 0);
++        return hinfo->hash;
++}
++
++static __u32 gethash(const struct iam_leaf *folio,
++                     const struct ext3_dir_entry_2 *ent)
++{
++        return hashname(folio, ent->name, ent->name_len);
++}
++
++static inline size_t recsize(size_t namelen)
++{
++        return EXT3_DIR_REC_LEN(namelen);
++}
++
++static struct ext3_dir_entry_2 *getlast(const struct iam_leaf *folio, int namelen)
++{
++        return
++                (void *)folio->il_bh->b_data +
++                iam_leaf_container(folio)->ic_object->i_sb->s_blocksize -
++                recsize(namelen);
++}
++
++static struct ext3_dir_entry_2 *gettop(const struct iam_leaf *folio)
++{
++        return getlast(folio, 0);
++}
++
++static inline int ent_is_live(const struct ext3_dir_entry_2 *ent)
++{
++        return ent->inode != 0;
++}
++
++static struct ext3_dir_entry_2 *entnext(const struct ext3_dir_entry_2 *ent)
++{
++      return (void *)ent + le16_to_cpu(ent->rec_len);
++}
++
++static struct ext3_dir_entry_2 *skipdead(struct ext3_dir_entry_2 *ent)
++{
++        if (!ent_is_live(ent))
++                ent = entnext(ent);
++        /*
++         * There can be no more than one dead entry in a row.
++         */
++        return ent;
++}
++
++static struct ext3_dir_entry_2 *getstart(const struct iam_leaf *folio)
++{
++        return (void *)folio->il_bh->b_data;
++}
++
++static int getfreespace(const struct ext3_dir_entry_2 *ent)
++{
++        int free;
++
++        free = le16_to_cpu(ent->rec_len);
++        if (ent_is_live(ent))
++                free -= recsize(ent->name_len);
++        assert_corr(free >= 0);
++        return free;
++}
++
++static int entcmp(const struct iam_leaf *folio,
++                  const struct ext3_dir_entry_2 *e0, const struct ext3_dir_entry_2 *e1)
++{
++        __u32 hash0;
++        __u32 hash1;
++
++        assert_corr(ent_is_live(e0));
++        assert_corr(ent_is_live(e1));
++
++        hash0 = gethash(folio, e0);
++        hash1 = gethash(folio, e1);
++        if (hash0 < hash1)
++                return -1;
++        else if (hash0 > hash1)
++                return +1;
++        else if (e0 < e1)
++                return -1;
++        else if (e0 > e1)
++                return +1;
++        else
++                return 0;
++}
++
++#if EXT3_CORRECTNESS_ON || EXT3_INVARIANT_ON
++static int iam_leaf_at_rec(const struct iam_leaf *folio)
++{
++        struct ext3_dir_entry_2 *ent;
++
++        ent = getent(folio);
++        return getstart(folio) <= ent &&
++                ent < gettop(folio) && ent_is_live(ent);
++}
++#endif
++
++/*
++ * Leaf operations.
++ */
++
++static struct iam_ikey *iam_htree_ikey(const struct iam_leaf *l,
++                                       struct iam_ikey *key)
++{
++        __u32 *hash;
++        assert_corr(iam_leaf_at_rec(l));
++
++        hash = (void *)key;
++        *hash = gethash(l, getent(l));
++        return key;
++}
++
++static struct iam_key *iam_htree_key(const struct iam_leaf *l)
++{
++        assert_corr(iam_leaf_at_rec(l));
++
++        return (struct iam_key *)&getent(l)->name;
++}
++
++static int iam_htree_key_size(const struct iam_leaf *l)
++{
++        assert_corr(iam_leaf_at_rec(l));
++
++        return getent(l)->name_len;
++}
++
++static void iam_htree_start(struct iam_leaf *l)
++{
++        l->il_at = (void *)skipdead(getstart(l));
++}
++
++static int iam_htree_init(struct iam_leaf *l)
++{
++        assert_corr(l->il_bh != NULL);
++
++        l->il_at = l->il_entries = (void *)getstart(l);
++        return 0;
++}
++
++static void iam_htree_fini(struct iam_leaf *l)
++{
++        l->il_entries = l->il_at = NULL;
++}
++
++struct iam_rec *iam_htree_rec(const struct iam_leaf *l)
++{
++        assert_corr(iam_leaf_at_rec(l));
++        return (void *)&getent(l)->inode;
++}
++
++static void iam_htree_next(struct iam_leaf *l)
++{
++        struct ext3_dir_entry_2 *scan;
++        struct ext3_dir_entry_2 *found;
++
++        assert_corr(iam_leaf_at_rec(l));
++        found = NULL;
++        for (scan = getstart(l); scan < gettop(l); scan = entnext(scan)) {
++                if (scan != getent(l) && ent_is_live(scan) &&
++                    entcmp(l, getent(l), scan) < 0 &&
++                    (found == NULL || entcmp(l, scan, found) < 0))
++                        found = scan;
++        }
++        assert_corr(ergo(found != NULL,
++                         gethash(l, getent(l)) <= gethash(l, found)));
++        l->il_at = (void *)(found ? : gettop(l));
++}
++
++static int iam_htree_at_end(const struct iam_leaf *folio)
++{
++        return getent(folio) >= gettop(folio);
++}
++
++
++static inline int match(int len, const char *const name,
++                        struct ext3_dir_entry_2 *de)
++{
++      if (len != de->name_len)
++              return 0;
++      if (!de->inode)
++              return 0;
++      return !memcmp(name, de->name, len);
++}
++
++static int iam_htree_lookup(struct iam_leaf *l, const struct iam_key *k)
++{
++        struct iam_container *c;
++        struct ext3_dir_entry_2 *scan;
++        struct ext3_dir_entry_2 *found;
++        __u32 hash;
++        int result;
++        int namelen;
++        int last = 1;
++        const char *name;
++
++        c = iam_leaf_container(l);
++        name = (const char *)k;
++        namelen = strlen(name);
++        hash = hashname(l, name, namelen);
++        found = NULL;
++        result = IAM_LOOKUP_OK;
++        for (scan = getstart(l); scan < getlast(l, namelen);
++             scan = entnext(scan)) {
++                if (match(namelen, name, scan)) {
++                        found = scan;
++                        result = IAM_LOOKUP_EXACT;
++                        break;
++                } else if (ent_is_live(scan)) {
++                        if (gethash(l, scan) <= hash)
++                                found = scan;
++                        else
++                                last = 0;
++                }
++        }
++        if (found == NULL) {
++                /*
++                 * @k is less than all hashes in the leaf.
++                 */
++                iam_htree_start(l);
++                result = IAM_LOOKUP_BEFORE;
++        } else {
++                l->il_at = (void *)found;
++                assert_corr(iam_leaf_at_rec(l));
++        }
++        if (last)
++                result |= IAM_LOOKUP_LAST;
++        return result;
++}
++
++static int iam_htree_ilookup(struct iam_leaf *l, const struct iam_ikey *ik)
++{
++        assert(0);
++        return IAM_LOOKUP_OK;
++}
++
++static void iam_htree_key_set(struct iam_leaf *l, const struct iam_key *k)
++{
++        assert_corr(iam_leaf_at_rec(l));
++        assert(0);
++}
++
++static int iam_htree_key_cmp(const struct iam_leaf *l, const struct iam_key *k)
++{
++        const char *name;
++        __u32 h0;
++        __u32 h1;
++
++        name = (const char *)k;
++
++        assert_corr(ent_is_live(getent(l)));
++
++        h0 = gethash(l, getent(l));
++        h1 = hashname(l, name, strlen(name));
++
++        return h0 < h1 ? -1 : (h0 == h1 ? 0 : +1);
++}
++
++static int iam_htree_key_eq(const struct iam_leaf *l, const struct iam_key *k)
++{
++        const char *name;
++
++        name = (const char *)k;
++        return match(strlen(name), name, getent(l));
++}
++
++static void iam_htree_rec_set(struct iam_leaf *l, const struct iam_rec *r)
++{
++        __u32 *ino;
++
++        ino = (void *)r;
++        getent(l)->inode = cpu_to_le32(*ino);
++}
++
++static void iam_htree_rec_get(const struct iam_leaf *l, struct iam_rec *r)
++{
++        __u32 *ino;
++
++        ino = (void *)r;
++        *ino = le32_to_cpu(getent(l)->inode);
++}
++
++static void iam_htree_rec_add(struct iam_leaf *leaf, const struct iam_key *k,
++                              const struct iam_rec *r)
++{
++        struct ext3_dir_entry_2 *scan;
++        struct inode        *dir;
++        const  char         *name;
++
++        __u32 *ino;
++        int    namelen;
++
++        assert_corr(iam_leaf_can_add(leaf, k, r));
++
++        dir = iam_leaf_container(leaf)->ic_object;
++        ino = (void *)r;
++        name = (const char *)k;
++        namelen = strlen(name);
++
++        scan = find_insertion_point(dir, leaf->il_bh, name, namelen);
++        assert_corr(!IS_ERR(scan));
++        scan = split_entry(dir, scan, *ino, EXT3_FT_UNKNOWN, name, namelen);
++        leaf->il_at = (void *)scan;
++}
++
++static void iam_htree_rec_del(struct iam_leaf *leaf, int shift)
++{
++        struct ext3_dir_entry_2 *orig;
++        struct ext3_dir_entry_2 *scan;
++        struct ext3_dir_entry_2 *prev;
++
++        assert_corr(iam_leaf_at_rec(leaf));
++
++        orig = getent(leaf);
++
++        if (shift)
++                iam_htree_next(leaf);
++
++        for (prev = NULL, scan = getstart(leaf); scan < orig;
++             prev = scan, scan = entnext(scan))
++                ;
++
++        assert_corr(scan == orig);
++        if (prev != NULL) {
++                prev->rec_len = cpu_to_le16(le16_to_cpu(prev->rec_len) +
++                                              le16_to_cpu(scan->rec_len));
++        } else {
++                assert_corr(scan == getstart(leaf));
++                scan->inode = 0;
++        }
++        iam_leaf_container(leaf)->ic_object->i_version ++;
++}
++
++static int iam_htree_can_add(const struct iam_leaf *leaf,
++                             const struct iam_key *k, const struct iam_rec *r)
++{
++        struct ext3_dir_entry_2 *scan;
++        int size;
++
++        size = recsize(strlen((const char *)k));
++        for (scan = getstart(leaf);
++             scan < gettop(leaf); scan = entnext(scan)) {
++                if (getfreespace(scan) >= size)
++                        return 1;
++        }
++        return 0;
++}
++
++static void iam_htree_init_new(struct iam_container *c, struct buffer_head *bh)
++{
++        /*
++         * Do nothing, all work is done by iam_htree_split().
++         */
++}
++
++static void iam_htree_split(struct iam_leaf *l, struct buffer_head **bh,
++                           iam_ptr_t new_blknr)
++{
++        __u32 delim_hash;
++        __u32 old_hash;
++        struct buffer_head *newbh = *bh;
++        struct iam_path *path;
++
++        old_hash = gethash(l, getent(l));
++        move_entries(iam_leaf_container(l)->ic_object,
++                     getipc(l)->ipc_hinfo, &l->il_bh, bh, &delim_hash);
++        /*
++         * Insert pointer to the new node (together with the least key in
++         * the node) into index node.
++         */
++        path = iam_leaf_path(l);
++        if (l->il_bh == newbh) {
++                /*
++                 * insertion point moves into new leaf.
++                 */
++                assert_corr(delim_hash >= old_hash);
++                l->il_curidx = new_blknr;
++                iam_htree_lookup(l, (void *)&old_hash);
++        }
++        iam_insert_key_lock(path,
++                            path->ip_frame, (void *)&delim_hash, new_blknr);
++}
++
++static struct iam_leaf_operations iam_htree_leaf_ops = {
++        .init           = iam_htree_init,
++        .init_new       = iam_htree_init_new,
++        .fini           = iam_htree_fini,
++        .start          = iam_htree_start,
++        .next           = iam_htree_next,
++        .key            = iam_htree_key,
++        .ikey           = iam_htree_ikey,
++        .rec            = iam_htree_rec,
++        .key_set        = iam_htree_key_set,
++        .key_cmp        = iam_htree_key_cmp,
++        .key_eq         = iam_htree_key_eq,
++        .key_size       = iam_htree_key_size,
++        .rec_set        = iam_htree_rec_set,
++        .rec_get        = iam_htree_rec_get,
++        .lookup         = iam_htree_lookup,
++        .ilookup        = iam_htree_ilookup,
++        .at_end         = iam_htree_at_end,
++        .rec_add        = iam_htree_rec_add,
++        .rec_del        = iam_htree_rec_del,
++        .can_add        = iam_htree_can_add,
++        .split          = iam_htree_split
++};
++
++/*
++ * Index operations.
++ */
++
++static __u32 iam_htree_root_ptr(struct iam_container *c)
++{
++      return 0;
++}
++
++static int iam_htree_node_check(struct iam_path *path, struct iam_frame *frame)
++{
++      /* XXX no checks yet */
++      return 0;
++}
++
++static int is_htree(struct super_block *sb,
++                    const struct dx_root *root, int silent)
++{
++        if (root->info.hash_version > DX_HASH_MAX) {
++                if (!silent)
++                        ext3_warning(sb, __FUNCTION__,
++                                     "Unrecognised inode hash code %d",
++                                     root->info.hash_version);
++                return -EIO;
++        }
++
++        if (root->info.unused_flags & 1) {
++                if (!silent)
++                        ext3_warning(sb, __FUNCTION__,
++                                     "Unimplemented inode hash flags: %#06x",
++                                     root->info.unused_flags);
++                return -EIO;
++        }
++
++        if (root->info.indirect_levels > DX_MAX_TREE_HEIGHT - 1) {
++                if (!silent)
++                        ext3_warning(sb, __FUNCTION__,
++                                     "Unimplemented inode hash depth: %#06x",
++                                     root->info.indirect_levels);
++                return -EIO;
++        }
++        return 0;
++}
++
++static int iam_htree_node_load(struct iam_path *path, struct iam_frame *frame)
++{
++      void *data;
++      struct iam_entry *entries;
++      struct super_block *sb;
++
++      data = frame->bh->b_data;
++      entries = dx_node_get_entries(path, frame);
++      sb = iam_path_obj(path)->i_sb;
++      if (frame == path->ip_frames) {
++              /* root node */
++              struct dx_root *root;
++              struct iam_path_compat *ipc;
++                int check;
++                const char *name;
++                int namelen;
++
++              root = data;
++              assert_corr(path->ip_data != NULL);
++              ipc = container_of(path->ip_data, struct iam_path_compat,
++                                 ipc_descr);
++
++                check = is_htree(sb, root, 0);
++                if (check != 0)
++                        return check;
++              path->ip_indirect = root->info.indirect_levels;
++
++              assert_corr((char *)entries == (((char *)&root->info) +
++                                                root->info.info_length));
++              assert_corr(dx_get_limit(entries) == dx_root_limit(path));
++
++              ipc->ipc_hinfo->hash_version = root->info.hash_version;
++              ipc->ipc_hinfo->seed = EXT3_SB(sb)->s_hash_seed;
++                name = NULL;
++              if (ipc->ipc_qstr) {
++                        name = ipc->ipc_qstr->name;
++                        namelen = ipc->ipc_qstr->len;
++                } else if (ipc->ipc_hinfo == &ipc->ipc_hinfo_area){
++                        name = (const char *)path->ip_key_target;
++                        namelen = strlen(name);
++                }
++                if (name != NULL)
++                        ext3fs_dirhash(name, namelen, ipc->ipc_hinfo);
++                if (path->ip_ikey_target == NULL) {
++                        path->ip_ikey_target = iam_path_ikey(path, 4);
++                        *(__u32 *)path->ip_ikey_target = ipc->ipc_hinfo->hash;
++                }
++      } else {
++              /* non-root index */
++              assert_corr(entries ==
++                            data + iam_path_descr(path)->id_node_gap);
++              assert_corr(dx_get_limit(entries) == dx_node_limit(path));
++      }
++      frame->entries = frame->at = entries;
++      return 0;
++}
++
++static int iam_htree_node_init(struct iam_container *c,
++                               struct buffer_head *bh, int root)
++{
++      struct dx_node *node;
++
++      assert_corr(!root);
++
++      node = (void *)bh->b_data;
++      node->fake.rec_len = cpu_to_le16(c->ic_object->i_sb->s_blocksize);
++      node->fake.inode = 0;
++      return 0;
++}
++
++static struct iam_entry *iam_htree_root_inc(struct iam_container *c,
++                                            struct iam_path *path,
++                                            struct iam_frame *frame)
++{
++        struct dx_root   *root;
++        struct iam_entry *entries;
++
++        entries = frame->entries;
++
++        dx_set_count(entries, 1);
++        root = (struct dx_root *) frame->bh->b_data;
++        root->info.indirect_levels++;
++
++        return entries;
++}
++
++static int iam_htree_ikeycmp(const struct iam_container *c,
++                             const struct iam_ikey *k1,
++                             const struct iam_ikey *k2)
++{
++      __u32 p1 = le32_to_cpu(*(__u32 *)k1);
++      __u32 p2 = le32_to_cpu(*(__u32 *)k2);
++
++      return p1 > p2 ? +1 : (p1 < p2 ? -1 : 0);
++}
++
++static struct iam_path_descr *iam_htree_ipd_alloc(const struct iam_container *c,
++                                                  void *area)
++{
++      struct iam_path_compat *ipc;
++
++      ipc = area;
++        memset(ipc, 0, sizeof *ipc);
++        iam_path_compat_init(ipc, c->ic_object);
++        return &ipc->ipc_descr;
++}
++
++static void iam_htree_ipd_free(struct iam_path_descr *ipd)
++{
++}
++
++static struct iam_operations iam_htree_ops = {
++        .id_root_ptr    = iam_htree_root_ptr,
++        .id_node_read   = iam_node_read,
++        .id_node_init   = iam_htree_node_init,
++        .id_node_check  = iam_htree_node_check,
++        .id_node_load   = iam_htree_node_load,
++        .id_ikeycmp     = iam_htree_ikeycmp,
++        .id_root_inc    = iam_htree_root_inc,
++        .id_ipd_alloc   = iam_htree_ipd_alloc,
++        .id_ipd_free    = iam_htree_ipd_free,
++        .id_name        = "htree"
++};
++
++/*
++ * Parameters describing iam compatibility mode in which existing ext3 htrees
++ * can be manipulated.
++ */
++struct iam_descr iam_htree_compat_param = {
++      .id_key_size  = EXT3_NAME_LEN,
++        .id_rec_size  = sizeof ((struct ext3_dir_entry_2 *)NULL)->inode,
++      .id_ikey_size = sizeof ((struct dx_map_entry *)NULL)->hash,
++      .id_ptr_size  = sizeof ((struct dx_map_entry *)NULL)->offs,
++      .id_node_gap  = offsetof(struct dx_node, entries),
++      .id_root_gap  = offsetof(struct dx_root, entries),
++      .id_ops       = &iam_htree_ops,
++      .id_leaf_ops  = &iam_htree_leaf_ops
++};
++EXPORT_SYMBOL(iam_htree_compat_param);
++
++static int iam_htree_guess(struct iam_container *c)
++{
++        int result;
++        struct buffer_head *bh;
++        const struct dx_root *root;
++
++        assert_corr(c->ic_object != NULL);
++
++        result = iam_node_read(c, iam_htree_root_ptr(c), NULL, &bh);
++        if (result == 0) {
++                root = (void *)bh->b_data;
++                result = is_htree(c->ic_object->i_sb, root, 1);
++                if (result == 0)
++                        c->ic_descr = &iam_htree_compat_param;
++                else
++                        result = -EBADF;
++                brelse(bh);
++        }
++        return result;
++}
++
++static struct iam_format iam_htree_format = {
++        .if_guess = iam_htree_guess
++};
++
++void iam_htree_format_init(void)
++{
++        iam_format_register(&iam_htree_format);
++}
diff --git a/ldiskfs/kernel_patches/patches/ext3-iam-uapi.patch b/ldiskfs/kernel_patches/patches/ext3-iam-uapi.patch
new file mode 100644 (file)
index 0000000..fd03c92
--- /dev/null
@@ -0,0 +1,1408 @@
+Index: iam/fs/ext3/Makefile
+===================================================================
+--- iam.orig/fs/ext3/Makefile
++++ iam/fs/ext3/Makefile
+@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o
+ ext3-y        := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+          ioctl.o namei.o super.o symlink.o hash.o resize.o \
+-         extents.o mballoc.o iam.o iam_lfix.o
++         extents.o mballoc.o iam.o iam_lfix.o iam_lvar.o iam_htree.o iam_uapi.o
+ ext3-$(CONFIG_EXT3_FS_XATTR)   += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
+Index: iam/fs/ext3/dir.c
+===================================================================
+--- iam.orig/fs/ext3/dir.c
++++ iam/fs/ext3/dir.c
+@@ -28,6 +28,7 @@
+ #include <linux/smp_lock.h>
+ #include <linux/slab.h>
+ #include <linux/rbtree.h>
++#include <linux/lustre_iam.h>
+ static unsigned char ext3_filetype_table[] = {
+       DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+@@ -61,6 +62,7 @@ static unsigned char get_dtype(struct su
+ }
+                              
++#if EXT3_INVARIANT_ON
+ int ext3_check_dir_entry (const char * function, struct inode * dir,
+                         struct ext3_dir_entry_2 * de,
+                         struct buffer_head * bh,
+@@ -90,6 +92,7 @@ int ext3_check_dir_entry (const char * f
+                       rlen, de->name_len);
+       return error_msg == NULL ? 1 : 0;
+ }
++#endif
+ static int ext3_readdir(struct file * filp,
+                        void * dirent, filldir_t filldir)
+@@ -305,12 +308,14 @@ static void free_rb_tree_fname(struct rb
+       root->rb_node = NULL;
+ }
++extern struct iam_private_info *ext3_iam_alloc_info(int flags);
++extern void ext3_iam_release_info(struct iam_private_info *info);
+ struct dir_private_info *create_dir_info(loff_t pos)
+ {
+       struct dir_private_info *p;
+-      p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
++      p = (void *)ext3_iam_alloc_info(GFP_KERNEL);
+       if (!p)
+               return NULL;
+       p->root.rb_node = NULL;
+@@ -326,6 +331,7 @@ struct dir_private_info *create_dir_info
+ void ext3_htree_free_dir_info(struct dir_private_info *p)
+ {
+       free_rb_tree_fname(&p->root);
++      ext3_iam_release_info((void *)p);
+       kfree(p);
+ }
+Index: iam/fs/ext3/file.c
+===================================================================
+--- iam.orig/fs/ext3/file.c
++++ iam/fs/ext3/file.c
+@@ -23,6 +23,7 @@
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/lustre_iam.h>
+ #include "xattr.h"
+ #include "acl.h"
+@@ -31,14 +32,18 @@
+  * from ext3_file_open: open gets called at every open, but release
+  * gets called only when /all/ the files are closed.
+  */
+-static int ext3_release_file (struct inode * inode, struct file * filp)
++static int ext3_release_file(struct inode * inode, struct file * filp)
+ {
+       /* if we are the last writer on the inode, drop the block reservation */
+       if ((filp->f_mode & FMODE_WRITE) &&
+                       (atomic_read(&inode->i_writecount) == 1))
+               ext3_discard_reservation(inode);
+-      if (is_dx(inode) && filp->private_data)
++      if (is_dx(inode) && filp->private_data) {
++              if (S_ISDIR(inode->i_mode))
+               ext3_htree_free_dir_info(filp->private_data);
++              else
++                      ext3_iam_release(filp, inode);
++      }
+       return 0;
+ }
+Index: iam/fs/ext3/iam-uapi.c
+===================================================================
+--- iam.orig/fs/ext3/iam-uapi.c
++++ iam/fs/ext3/iam-uapi.c
+@@ -0,0 +1,368 @@
++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++ * vim:expandtab:shiftwidth=8:tabstop=8:
++ *
++ *  iam_uapi.c
++ *  User-level interface to iam (ioctl based)
++ *
++ *  Copyright (c) 2006 Cluster File Systems, Inc.
++ *   Author: Nikita Danilov <nikita@clusterfs.com>
++ *
++ *   This file is part of the Lustre file system, http://www.lustre.org
++ *   Lustre is a trademark of Cluster File Systems, Inc.
++ *
++ *   You may have signed or agreed to another license before downloading
++ *   this software.  If so, you are bound by the terms and conditions
++ *   of that agreement, and the following does not apply to you.  See the
++ *   LICENSE file included with this distribution for more information.
++ *
++ *   If you did not agree to a different license, then this copy of Lustre
++ *   is open source software; you can redistribute it and/or modify it
++ *   under the terms of version 2 of the GNU General Public License as
++ *   published by the Free Software Foundation.
++ *
++ *   In either case, Lustre is distributed in the hope that it will be
++ *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
++ *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ *   license text for more details.
++ */
++
++#include <linux/types.h>
++#include <linux/jbd.h>
++/* ext3_error() */
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++
++#include <linux/lustre_iam.h>
++
++#include <libcfs/libcfs.h>
++#include <libcfs/kp30.h>
++
++struct iam_private_info {
++        struct dir_private_info ipi_dir; /* has to be first */
++        struct iam_container   ipi_bag;
++        struct iam_descr       ipi_descr;
++        struct iam_iterator    ipi_it;
++        struct iam_path_descr *ipi_ipd;
++        char                   ipi_ipd_area[DX_IPD_MAX_SIZE];
++};
++
++enum {
++        IAM_INSERT_CREDITS = 20
++};
++
++static struct iam_private_info *get_ipi(struct file *filp)
++{
++        return filp->private_data;
++}
++
++static int iam_uapi_it(int cmd, struct inode *inode,
++                       struct file *filp, struct iam_uapi_it *itop)
++{
++        struct iam_private_info *ipi;
++        struct iam_iterator     *it;
++        enum iam_it_state        st;
++        int result = 0;
++
++        ipi = get_ipi(filp);
++        it = &ipi->ipi_it;
++        st = it->ii_state;
++        switch (cmd) {
++        case IAM_IOC_IT_START:
++                result = iam_it_init(it, &ipi->ipi_bag,
++                                     IAM_IT_MOVE, ipi->ipi_ipd);
++                if (result == 0)
++                        result = iam_it_get(it, itop->iui_op.iul_key);
++                break;
++        case IAM_IOC_IT_NEXT:
++                if (st == IAM_IT_ATTACHED || st == IAM_IT_SKEWED)
++                        result = iam_it_next(it);
++                else
++                        result = -EBUSY;
++                break;
++        case IAM_IOC_IT_STOP:
++                iam_it_put(it);
++                iam_it_fini(it);
++                result = 0;
++                break;
++        }
++        st = it->ii_state;
++        if (st == IAM_IT_ATTACHED || st == IAM_IT_SKEWED)
++                memcpy(itop->iui_op.iul_key, iam_it_key_get(it),
++                       iam_it_key_size(it));
++        if (st == IAM_IT_ATTACHED)
++                iam_reccpy(&it->ii_path.ip_leaf, itop->iui_op.iul_rec);
++        itop->iui_state = st;
++        return result;
++}
++
++static int iam_uapi_op(int cmd, struct inode *inode,
++                       struct file *filp, struct iam_uapi_op *op)
++{
++        int result;
++        struct iam_private_info *ipi;
++
++        ipi = get_ipi(filp);
++        if (cmd == IAM_IOC_INSERT || cmd == IAM_IOC_DELETE) {
++                handle_t *h;
++
++                h = ext3_journal_start(inode, IAM_INSERT_CREDITS);
++                if (!IS_ERR(h)) {
++                        if (cmd == IAM_IOC_INSERT)
++                                result = iam_insert(h, &ipi->ipi_bag,
++                                                    op->iul_key,
++                                                    op->iul_rec, ipi->ipi_ipd);
++                        else
++                                result = iam_delete(h, &ipi->ipi_bag,
++                                                    op->iul_key, ipi->ipi_ipd);
++                        ext3_journal_stop(h);
++                } else {
++                        result = PTR_ERR(h);
++                        ext3_std_error(inode->i_sb, result);
++                }
++        } else
++                result = iam_lookup(&ipi->ipi_bag, op->iul_key,
++                                    op->iul_rec, ipi->ipi_ipd);
++        return result;
++}
++
++struct iam_private_info *ext3_iam_alloc_info(int flags)
++{
++        struct iam_private_info *info;
++
++        info = kmalloc(sizeof *info, flags);
++        if (info != NULL)
++                memset(info, 0, sizeof *info);
++        return info;
++}
++
++void ext3_iam_release_info(struct iam_private_info *info)
++{
++        iam_it_put(&info->ipi_it);
++        iam_it_fini(&info->ipi_it);
++        if (info->ipi_ipd != NULL)
++                info->ipi_bag.ic_descr->id_ops->id_ipd_free(info->ipi_ipd);
++        iam_container_fini(&info->ipi_bag);
++}
++
++void ext3_iam_release(struct file *filp, struct inode *inode)
++{
++        struct iam_private_info *info;
++
++        info = filp->private_data;
++        ext3_iam_release_info(info);
++
++        kfree(info);
++        EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
++}
++
++static int iam_uapi_init(struct inode *inode,
++                         struct file *filp, struct iam_uapi_info *ua)
++{
++        int result;
++        struct iam_private_info *info;
++
++        info = ext3_iam_alloc_info(GFP_KERNEL);
++        if (info != NULL) {
++                struct iam_container *bag;
++                struct iam_descr     *des;
++
++                bag = &info->ipi_bag;
++                des = &info->ipi_descr;
++                result = iam_container_init(bag, des, inode);
++                if (result == 0) {
++                        result = iam_container_setup(bag);
++                        if (result == 0) {
++                                /*
++                                 * Container setup might change ->ic_descr
++                                 */
++                                des = bag->ic_descr;
++                                info->ipi_ipd = des->id_ops->
++                                        id_ipd_alloc(bag, info->ipi_ipd_area);
++                                if (info->ipi_ipd != NULL) {
++                                        filp->private_data = info;
++                                        EXT3_I(inode)->i_flags |= EXT3_INDEX_FL;
++                                } else
++                                        result = -ENOMEM;
++                        }
++                }
++        } else
++                result = -ENOMEM;
++        return result;
++}
++
++
++static int getua(struct iam_uapi_info *ua, unsigned long arg)
++{
++        if (copy_from_user(ua, (struct iam_uapi_info __user *)arg, sizeof *ua))
++                return -EFAULT;
++        else
++                return 0;
++}
++
++static int putua(struct iam_uapi_info *ua, unsigned long arg)
++{
++        if (copy_to_user((struct iam_uapi_info __user *)arg, ua, sizeof *ua))
++                return -EFAULT;
++        else
++                return 0;
++}
++
++enum outop_t {
++        KEY   = 1 << 0,
++        REC   = 1 << 1,
++        STATE = 1 << 2
++};
++
++static int outop(struct iam_uapi_op *op, struct iam_uapi_op *uop,
++                 struct iam_descr *des, enum outop_t opt)
++{
++        int result;
++
++        if (((opt & REC) && copy_to_user((void __user *)uop->iul_rec,
++                                         op->iul_rec, des->id_rec_size)) ||
++            ((opt & KEY) && copy_to_user((void __user *)uop->iul_key,
++                                         op->iul_key, des->id_key_size)))
++                result = -EFAULT;
++        else
++                result = 0;
++        return result;
++}
++
++static void putop(struct iam_uapi_op *op)
++{
++        kfree(op->iul_key);
++        kfree(op->iul_rec);
++}
++
++static int getop(struct iam_uapi_op *op, struct iam_uapi_op *uop,
++                 struct iam_descr *des, unsigned long arg)
++{
++        int result;
++        int ks;
++        int rs;
++
++        ks = des->id_key_size;
++        rs = des->id_rec_size;
++        op->iul_key = kmalloc(ks, GFP_KERNEL);
++        op->iul_rec = kmalloc(rs, GFP_KERNEL);
++        if (!copy_from_user(uop,
++                            (struct iam_uapi_op __user *)arg, sizeof *uop) &&
++            op->iul_key != NULL && op->iul_rec != NULL &&
++            !copy_from_user(op->iul_key, (void __user *)uop->iul_key, ks) &&
++            !copy_from_user(op->iul_rec, (void __user *)uop->iul_rec, rs))
++                result = 0;
++        else {
++                result = -EFAULT;
++                putop(op);
++        }
++        return result;
++}
++
++static int outit(struct iam_uapi_it *it, struct iam_uapi_it *uit,
++                 struct iam_descr *des, enum outop_t opt, unsigned long arg)
++{
++        int result;
++
++        result = outop(&it->iui_op, &uit->iui_op, des, opt);
++        if (result == 0 && (opt&STATE))
++                result = put_user(it->iui_state, (int __user *) arg);
++        return result;
++}
++
++static void putit(struct iam_uapi_it *it)
++{
++        putop(&it->iui_op);
++}
++
++static int getit(struct iam_uapi_it *it, struct iam_uapi_it *uit,
++                 struct iam_descr *des, unsigned long arg)
++{
++        return getop(&it->iui_op, &uit->iui_op, des,
++                     (unsigned long)&((struct iam_uapi_it *)arg)->iui_op);
++}
++
++int iam_uapi_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
++                   unsigned long arg)
++{
++        int result;
++        struct iam_uapi_info ua;
++        struct iam_uapi_op   uop;
++        struct iam_uapi_op   op;
++        struct iam_uapi_it   uit;
++        struct iam_uapi_it   it;
++        enum outop_t opt;
++
++        if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) {
++                result = -EACCES;
++        } else if (cmd == IAM_IOC_POLYMORPH) {
++                /*
++                 * If polymorphing into directory, increase hard-link count.
++                 */
++                if (S_ISDIR((umode_t)arg) && !S_ISDIR(inode->i_mode))
++                        inode->i_nlink++;
++                else if (!S_ISDIR((umode_t)arg) && S_ISDIR(inode->i_mode))
++                        inode->i_nlink--;
++                inode->i_mode = (umode_t)arg;
++                mark_inode_dirty(inode);
++                result = 0;
++        } else if (cmd == IAM_IOC_INIT) {
++                if (filp->private_data == NULL) {
++                        result = getua(&ua, arg);
++                        if (result == 0)
++                                result = iam_uapi_init(inode, filp, &ua);
++                } else
++                        result = -EBUSY;
++        } else if (is_dx(inode) && filp->private_data != NULL) {
++                struct iam_descr *des;
++
++                switch (cmd) {
++                case IAM_IOC_IT_START:
++                case IAM_IOC_IT_NEXT:
++                        opt = KEY|REC|STATE;
++                        break;
++                case IAM_IOC_LOOKUP:
++                        opt = REC;
++                        break;
++                default:
++                        opt = 0;
++                        break;
++                }
++
++                des = get_ipi(filp)->ipi_bag.ic_descr;
++                if (cmd == IAM_IOC_GETINFO) {
++                        ua.iui_keysize = des->id_key_size;
++                        ua.iui_recsize = des->id_rec_size;
++                        ua.iui_ptrsize = des->id_ptr_size;
++                        ua.iui_height  = 0; /* not yet */
++                        memcpy(ua.iui_fmt_name, des->id_ops->id_name,
++                               ARRAY_SIZE(ua.iui_fmt_name));
++                        result = putua(&ua, arg);
++                } else if (cmd == IAM_IOC_INSERT || cmd == IAM_IOC_LOOKUP ||
++                           cmd == IAM_IOC_DELETE) {
++                        result = getop(&op, &uop, des, arg);
++                        if (result == 0) {
++                                int res2;
++                                result = iam_uapi_op(cmd, inode, filp, &op);
++
++                                res2 = outop(&op, &uop, des, opt);
++                                result = result ? : res2;
++                                putop(&op);
++                        }
++                } else if (cmd == IAM_IOC_IT_START || cmd == IAM_IOC_IT_NEXT ||
++                           cmd == IAM_IOC_IT_STOP) {
++                        result = getit(&it, &uit, des, arg);
++                        if (result == 0) {
++                                int res2;
++
++                                result = iam_uapi_it(cmd, inode, filp, &it);
++
++                                res2 = outit(&it, &uit, des, opt, arg);
++                                result = result ? : res2;
++                                putit(&it);
++                        }
++                } else
++                        result = -EINVAL;
++        } else
++                result = -ENOENT;
++        return result;
++}
+Index: iam/fs/ext3/ioctl.c
+===================================================================
+--- iam.orig/fs/ext3/ioctl.c
++++ iam/fs/ext3/ioctl.c
+@@ -250,6 +250,6 @@ flags_err:
+       default:
+-              return -ENOTTY;
++              return iam_uapi_ioctl(inode, filp, cmd, arg);
+       }
+ }
+Index: iam/include/linux/lustre_iam.h
+===================================================================
+--- iam.orig/include/linux/lustre_iam.h
++++ iam/include/linux/lustre_iam.h
+@@ -30,9 +30,6 @@
+ #ifndef __LINUX_LUSTRE_IAM_H__
+ #define __LINUX_LUSTRE_IAM_H__
+-/* handle_t, journal_start(), journal_stop() */
+-#include <linux/jbd.h>
+-
+ /*
+  *  linux/include/linux/lustre_iam.h
+  */
+@@ -57,14 +54,95 @@ enum {
+          *         [2] reserved for leaf node operations.
+          *
+          *         [3] reserved for index operations.
++         *
++         *         [4] reserved for path->ip_ikey_target
++         *
+          */
+-      DX_SCRATCH_KEYS    = 4,
++      DX_SCRATCH_KEYS    = 5,
+         /*
+          * Maximal format name length.
+          */
+         DX_FMT_NAME_LEN    = 16
+ };
++#ifdef __KERNEL__
++/* handle_t, journal_start(), journal_stop() */
++#include <linux/jbd.h>
++
++/*
++ * Debugging.
++ *
++ * Various debugging levels.
++ */
++
++#if 0
++/*
++ * Following macros are defined in config.h and are tunable through
++ * appropriate configure switches (indicated below).
++ */
++
++/*
++ * Compile basic assertions in. You want this most of the time.
++ *
++ * --{enable,disable}-ldiskfs-assert (on by default).
++ */
++#define EXT3_ASSERT (1)
++
++/*
++ * Compile heavier correctness checks in. You want this during development
++ * cycle.
++ *
++ * --{enable,disable}-ldiskfs-correctness (off by default).
++ */
++#define EXT3_CORRECTNESS (1)
++
++/*
++ * Compile heavy invariant checking in. You want this early during development
++ * or when chasing a bug.
++ *
++ * --{enable,disable}-ldiskfs-invariant (off by default).
++ */
++#define EXT3_INVARIANT (1)
++#endif
++
++#if defined(EXT3_ASSERT)
++#define EXT3_ASSERT_ON (1)
++#else
++#define EXT3_ASSERT_ON (0)
++#endif
++
++#if defined(EXT3_CORRECTNESS)
++#define EXT3_CORRECTNESS_ON (1)
++#else
++#define EXT3_CORRECTNESS_ON (0)
++#endif
++
++#if defined(EXT3_INVARIANT)
++#define EXT3_INVARIANT_ON (1)
++#else
++#define EXT3_INVARIANT_ON (0)
++#endif
++
++#ifndef assert
++#if EXT3_ASSERT_ON
++#define assert(test) J_ASSERT(test)
++#else
++#define assert(test) ((void)(test))
++#endif
++#endif
++
++#if EXT3_CORRECTNESS_ON
++#define assert_corr(test) J_ASSERT(test)
++#else
++#define assert_corr(test) do {;} while (0)
++#endif
++
++#if EXT3_INVARIANT_ON
++#define assert_inv(test) J_ASSERT(test)
++#else
++#define assert_inv(test) do {;} while (0)
++#endif
++
+ /*
+  * Entry within index tree node. Consists of a key immediately followed
+  * (without padding) by a pointer to the child node.
+@@ -86,14 +164,21 @@ struct iam_entry_compat {
+  */
+ struct iam_key;
+-/* Incomplete type use to refer to the records stored in iam containers. */
++/*
++ * Incomplete type use to refer to the records stored in iam containers.
++ */
+ struct iam_rec;
+-struct iam_cookie {
+-      struct iam_key *ic_key;
+-      struct iam_rec *ic_rec;
+-};
++/*
++ * Key in index node. Possibly compressed. Fixed size.
++ */
++struct iam_ikey;
++/*
++ * Scalar type into which certain iam_key's can be uniquely mapped. Used to
++ * support interfaces like readdir(), where iteration over index has to be
++ * re-startable.
++ */
+ typedef __u64 iam_ptr_t;
+ /*
+@@ -123,6 +208,31 @@ struct iam_leaf {
+       void               *il_descr_data;
+ };
++/*
++ * Return values of ->lookup() operation from struct iam_leaf_operations.
++ */
++enum iam_lookup_t {
++        /*
++         * lookup found a record with the key requested
++         */
++        IAM_LOOKUP_EXACT,
++        /*
++         * lookup positioned leaf on some record
++         */
++        IAM_LOOKUP_OK,
++        /*
++         * leaf was empty
++         */
++        IAM_LOOKUP_EMPTY,
++        /*
++         * lookup positioned leaf before first record
++         */
++        IAM_LOOKUP_BEFORE
++};
++
++/*
++ * Format-specific container operations. These are called by generic iam code.
++ */
+ struct iam_operations {
+       /*
+        * Returns pointer (in the same sense as pointer in index entry) to
+@@ -131,11 +241,15 @@ struct iam_operations {
+       __u32 (*id_root_ptr)(struct iam_container *c);
+       /*
+-       * Check validity and consistency of index node. This is called when
+-       * iam just loaded new node into frame.
++       * Check validity and consistency of index node.
+        */
+       int (*id_node_check)(struct iam_path *path, struct iam_frame *frame);
+       /*
++       * Copy some data from node header into frame. This is called when
++       * new node is loaded into frame.
++       */
++      int (*id_node_load)(struct iam_path *path, struct iam_frame *frame);
++      /*
+        * Initialize new node (stored in @bh) that is going to be added into
+        * tree.
+        */
+@@ -144,23 +258,33 @@ struct iam_operations {
+       int (*id_node_read)(struct iam_container *c, iam_ptr_t ptr,
+                           handle_t *h, struct buffer_head **bh);
+       /*
+-       * Key comparison function. Returns -1, 0, +1.
++       * Key comparison functions. Returns -1, 0, +1.
+        */
+-      int (*id_keycmp)(const struct iam_container *c,
+-                       const struct iam_key *k1, const struct iam_key *k2);
++      int (*id_ikeycmp)(const struct iam_container *c,
++                          const struct iam_ikey *k1,
++                          const struct iam_ikey *k2);
+       /*
+-       * Create new container.
+-       *
+-       * Newly created container has a root node and a single leaf. Leaf
+-       * contains single record with the smallest possible key.
++         * Modify root node when tree height increases.
+        */
+-      int (*id_create)(struct iam_container *c);
++      struct iam_entry *(*id_root_inc)(struct iam_container *c,
++                                         struct iam_path *path,
++                                         struct iam_frame *frame);
++
++        struct iam_path_descr *(*id_ipd_alloc)(const struct iam_container *c);
++        void (*id_ipd_free)(const struct iam_container *c,
++                            struct iam_path_descr *ipd);
+         /*
+          * Format name.
+          */
+         char id_name[DX_FMT_NAME_LEN];
+ };
++/*
++ * Another format-specific operation vector, consisting of methods to access
++ * leaf nodes. This is separated from struct iam_operations, because it is
++ * assumed that there will be many formats with different format of leaf
++ * nodes, yes the same struct iam_operations.
++ */
+ struct iam_leaf_operations {
+               /*
+                * leaf operations.
+@@ -186,7 +310,8 @@ struct iam_leaf_operations {
+         void (*start)(struct iam_leaf *l);
+               /* more leaf to the next entry. */
+         void (*next)(struct iam_leaf *l);
+-        /* return key of current leaf record. This method may return
++        /*
++         * return key of current leaf record. This method may return
+          * either pointer to the key stored in node, or copy key into
+          * @k buffer supplied by caller and return pointer to this
+          * buffer. The latter approach is used when keys in nodes are
+@@ -194,8 +319,10 @@ struct iam_leaf_operations {
+          * all).
+          *
+          * Caller should assume that returned pointer is only valid
+-         * while leaf node is pinned and locked.*/
+-        struct iam_key *(*key)(const struct iam_leaf *l, struct iam_key *k);
++         * while leaf node is pinned and locked.
++         */
++        struct iam_ikey *(*ikey)(const struct iam_leaf *l, struct iam_ikey *k);
++        struct iam_key *(*key)(const struct iam_leaf *l);
+         /* return pointer to entry body. Pointer is valid while
+            corresponding leaf node is locked and pinned. */
+         struct iam_rec *(*rec)(const struct iam_leaf *l);
+@@ -203,6 +330,9 @@ struct iam_leaf_operations {
+         void (*key_set)(struct iam_leaf *l, const struct iam_key *k);
+         void (*rec_set)(struct iam_leaf *l, const struct iam_rec *r);
++      int (*key_cmp)(const struct iam_leaf *l, const struct iam_key *k);
++
++        int (*key_size)(const struct iam_leaf *l);
+         /*
+          * Search leaf @l for a record with key @k or for a place
+          * where such record is to be inserted.
+@@ -210,6 +340,7 @@ struct iam_leaf_operations {
+          * Scratch keys from @path can be used.
+          */
+         int (*lookup)(struct iam_leaf *l, const struct iam_key *k);
++        int (*ilookup)(struct iam_leaf *l, const struct iam_ikey *ik);
+         int (*can_add)(const struct iam_leaf *l,
+                        const struct iam_key *k, const struct iam_rec *r);
+@@ -221,17 +352,15 @@ struct iam_leaf_operations {
+         /*
+          * remove rec for a leaf
+          */
+-        void (*rec_del)(struct iam_leaf *l);
++        void (*rec_del)(struct iam_leaf *l, int shift);
+         /*
+          * split leaf node, moving some entries into @bh (the latter currently
+          * is assumed to be empty).
+          */
+-        void (*split)(struct iam_leaf *l, struct buffer_head *bh);
++        void (*split)(struct iam_leaf *l, struct buffer_head **bh,
++                      iam_ptr_t newblknr);
+ };
+-struct iam_path *iam_leaf_path(const struct iam_leaf *leaf);
+-struct iam_container *iam_leaf_container(const struct iam_leaf *leaf);
+-
+ /*
+  * Parameters, describing a flavor of iam container.
+  */
+@@ -241,6 +370,10 @@ struct iam_descr {
+        */
+       size_t       id_key_size;
+       /*
++       * Size of a key in index nodes, in bytes.
++       */
++      size_t       id_ikey_size;
++      /*
+        * Size of a pointer to the next level (stored in index nodes), in
+        * bytes.
+        */
+@@ -264,6 +397,9 @@ struct iam_descr {
+         struct iam_leaf_operations      *id_leaf_ops;
+ };
++/*
++ * An instance of iam container.
++ */
+ struct iam_container {
+       /*
+        * Underlying flat file. IO against this object is issued to
+@@ -274,6 +410,10 @@ struct iam_container {
+        * container flavor.
+        */
+       struct iam_descr *ic_descr;
++        /*
++         * read-write lock protecting index consistency.
++         */
++        struct rw_semaphore ic_sem;
+ };
+ /*
+@@ -284,7 +424,7 @@ struct iam_path_descr {
+       /*
+        * Scratch-pad area for temporary keys.
+        */
+-      struct iam_key        *ipd_key_scratch[DX_SCRATCH_KEYS];
++      struct iam_ikey *ipd_key_scratch[DX_SCRATCH_KEYS];
+ };
+ /*
+@@ -316,6 +456,7 @@ struct iam_path {
+        * Key searched for.
+        */
+       const struct iam_key  *ip_key_target;
++      const struct iam_ikey *ip_ikey_target;
+       /*
+        * Description-specific data.
+        */
+@@ -334,6 +475,7 @@ struct iam_path_compat {
+       struct dx_hash_info  *ipc_hinfo;
+       struct dentry        *ipc_dentry;
+       struct iam_path_descr ipc_descr;
++        struct dx_hash_info   ipc_hinfo_area;
+ };
+ /*
+@@ -347,7 +489,9 @@ enum iam_it_state {
+       /* initial state */
+       IAM_IT_DETACHED,
+       /* iterator is above particular record in the container */
+-      IAM_IT_ATTACHED
++      IAM_IT_ATTACHED,
++        /* iterator is positioned before record  */
++        IAM_IT_SKEWED
+ };
+ /*
+@@ -355,7 +499,7 @@ enum iam_it_state {
+  */
+ enum iam_it_flags {
+       /*
+-       * this iterator will move (iam_it_{prev,next}() will be called on it)
++       * this iterator will move (iam_it_next() will be called on it)
+        */
+       IAM_IT_MOVE  = (1 << 0),
+       /*
+@@ -372,15 +516,26 @@ enum iam_it_flags {
+  * doesn't point to any particular record in this container.
+  *
+  * After successful call to iam_it_get() and until corresponding call to
+- * iam_it_put() iterator is in "attached" state (IAM_IT_ATTACHED).
++ * iam_it_put() iterator is in one of "active" states: IAM_IT_ATTACHED or
++ * IAM_IT_SKEWED.
+  *
+- * Attached iterator can move through records in a container (provided
++ * Active iterator can move through records in a container (provided
+  * IAM_IT_MOVE permission) in a key order, can get record and key values as it
+  * passes over them, and can modify container (provided IAM_IT_WRITE
+  * permission).
+  *
++ * Iteration may reach the end of container, at which point iterator switches
++ * into IAM_IT_DETACHED state.
++ *
+  * Concurrency: iterators are supposed to be local to thread. Interfaces below
+- * do no internal serialization.
++ * do no internal serialization of access to the iterator fields.
++ *
++ * When in non-detached state, iterator keeps some container nodes pinned in
++ * memory and locked (that locking may be implemented at the container
++ * granularity though). In particular, clients may assume that pointers to
++ * records and keys obtained through iterator interface as valid until
++ * iterator is detached (except that they may be invalidated by sub-sequent
++ * operations done through the same iterator).
+  *
+  */
+ struct iam_iterator {
+@@ -390,7 +545,8 @@ struct iam_iterator {
+       __u32                 ii_flags;
+       enum iam_it_state     ii_state;
+       /*
+-       * path to the record. Valid in IAM_IT_ATTACHED state.
++       * path to the record. Valid in IAM_IT_ATTACHED, and IAM_IT_SKEWED
++       * states.
+        */
+       struct iam_path       ii_path;
+ };
+@@ -405,133 +561,26 @@ void iam_path_compat_fini(struct iam_pat
+ struct iam_path_descr *iam_ipd_alloc(void *area, int keysize);
+ void iam_ipd_free(struct iam_path_descr *ipd);
+-/*
+- * Initialize iterator to IAM_IT_DETACHED state.
+- *
+- * postcondition: it_state(it) == IAM_IT_DETACHED
+- */
+ int  iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags,
+                struct iam_path_descr *pd);
+-/*
+- * Finalize iterator and release all resources.
+- *
+- * precondition: it_state(it) == IAM_IT_DETACHED
+- */
+ void iam_it_fini(struct iam_iterator *it);
+-
+-/*
+- * Attach iterator. After successful completion, @it points to record with the
+- * largest key not larger than @k. Semantics of ->id_create() method guarantee
+- * that such record will always be found.
+- *
+- * Return value: 0: positioned on existing record,
+- *             -ve: error.
+- *
+- * precondition:  it_state(it) == IAM_IT_DETACHED
+- * postcondition: ergo(result == 0,
+- *                     (it_state(it) == IAM_IT_ATTACHED &&
+- *                      it_keycmp(it, iam_it_key_get(it, *), k) < 0))
+- */
+ int iam_it_get(struct iam_iterator *it, const struct iam_key *k);
+-
+-/*
+- * Duplicates iterator.
+- *
+- * postcondition: it_state(dst) == it_state(src) &&
+- *                iam_it_container(dst) == iam_it_container(src) &&
+- *                dst->ii_flags = src->ii_flags &&
+- *                ergo(it_state(it) == IAM_IT_ATTACHED,
+- *                     iam_it_rec_get(dst) == iam_it_rec_get(src) &&
+- *                     iam_it_key_get(dst, *1) == iam_it_key_get(src, *2))
+- */
++int iam_it_get_at(struct iam_iterator *it, const struct iam_key *k);
+ void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src);
+-
+-/*
+- * Detach iterator. Does nothing it detached state.
+- *
+- * postcondition: it_state(it) == IAM_IT_DETACHED
+- */
+ void iam_it_put(struct iam_iterator *it);
+-
+-/*
+- * Move iterator one record right.
+- *
+- * Return value: 0: success,
+- *              +1: end of container reached
+- *             -ve: error
+- *
+- * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_MOVE
+- * postcondition: ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED)
+- */
+ int iam_it_next(struct iam_iterator *it);
+-
+-/*
+- * Return pointer to the record under iterator.
+- *
+- * precondition:  it_state(it) == IAM_IT_ATTACHED
+- * postcondition: it_state(it) == IAM_IT_ATTACHED
+- */
+ struct iam_rec *iam_it_rec_get(const struct iam_iterator *it);
+-
+-/*
+- * Replace contents of record under iterator.
+- *
+- * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
+- * postcondition: it_state(it) == IAM_IT_ATTACHED &&
+- *                ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...))
+- */
+-int iam_it_rec_set(handle_t *h, struct iam_iterator *it, struct iam_rec *r);
+-
+-/*
+- * Place key under iterator in @k, return @k
+- *
+- * precondition:  it_state(it) == IAM_IT_ATTACHED
+- * postcondition: it_state(it) == IAM_IT_ATTACHED
+- */
+-struct iam_key *iam_it_key_get(const struct iam_iterator *it,
+-                               struct iam_key *k);
+-
+-/*
+- * Insert new record with key @k and contents from @r, shifting records to the
+- * right.
+- *
+- * precondition:  it_state(it) == IAM_IT_ATTACHED &&
+- *                it->ii_flags&IAM_IT_WRITE &&
+- *                it_keycmp(it, iam_it_key_get(it, *), k) < 0
+- * postcondition: it_state(it) == IAM_IT_ATTACHED &&
+- *                ergo(result == 0,
+- *                     it_keycmp(it, iam_it_key_get(it, *), k) == 0 &&
+- *                     !memcmp(iam_it_rec_get(it), r, ...))
+- */
++int iam_it_rec_set(handle_t *h,
++                   struct iam_iterator *it, const struct iam_rec *r);
++struct iam_key *iam_it_key_get(const struct iam_iterator *it);
++int iam_it_key_size(const struct iam_iterator *it);
+ int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
+                     const struct iam_key *k, const struct iam_rec *r);
+-/*
+- * Delete record under iterator.
+- *
+- * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
+- * postcondition: it_state(it) == IAM_IT_ATTACHED
+- */
+ int iam_it_rec_delete(handle_t *h, struct iam_iterator *it);
+ typedef __u64 iam_pos_t;
+-/*
+- * Convert iterator to cookie.
+- *
+- * precondition:  it_state(it) == IAM_IT_ATTACHED &&
+- *                path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t)
+- * postcondition: it_state(it) == IAM_IT_ATTACHED
+- */
+ iam_pos_t iam_it_store(const struct iam_iterator *it);
+-
+-/*
+- * Restore iterator from cookie.
+- *
+- * precondition:  it_state(it) == IAM_IT_DETACHED && it->ii_flags&IAM_IT_MOVE &&
+- *                path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t)
+- * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED &&
+- *                                  iam_it_store(it) == pos)
+- */
+ int iam_it_load(struct iam_iterator *it, iam_pos_t pos);
+ int iam_lookup(struct iam_container *c, const struct iam_key *k,
+@@ -539,10 +588,10 @@ int iam_lookup(struct iam_container *c, 
+ int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k,
+              struct iam_path_descr *pd);
+ int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k,
+-             struct iam_rec *r, struct iam_path_descr *pd);
++             const struct iam_rec *r, struct iam_path_descr *pd);
+ int iam_insert(handle_t *handle, struct iam_container *c,
+                const struct iam_key *k,
+-             struct iam_rec *r, struct iam_path_descr *pd);
++             const struct iam_rec *r, struct iam_path_descr *pd);
+ /*
+  * Initialize container @c.
+  */
+@@ -558,10 +607,6 @@ void iam_container_fini(struct iam_conta
+  */
+ int iam_container_setup(struct iam_container *c);
+-#ifndef assert
+-#define assert(test) J_ASSERT(test)
+-#endif
+-
+ static inline struct iam_descr *iam_container_descr(struct iam_container *c)
+ {
+         return c->ic_descr;
+@@ -577,16 +622,65 @@ static inline struct inode *iam_path_obj
+       return p->ip_container->ic_object;
+ }
+-static inline void iam_keycpy(const struct iam_container *c,
+-                              struct iam_key *k1, const struct iam_key *k2)
++static inline void iam_ikeycpy(const struct iam_container *c,
++                               struct iam_ikey *k1, const struct iam_ikey *k2)
++{
++      memcpy(k1, k2, c->ic_descr->id_ikey_size);
++}
++
++static inline size_t iam_entry_size(struct iam_path *p)
++{
++      return iam_path_descr(p)->id_ikey_size + iam_path_descr(p)->id_ptr_size;
++}
++
++static inline struct iam_entry *iam_entry_shift(struct iam_path *p,
++                                              struct iam_entry *entry,
++                                              int shift)
++{
++      void *e = entry;
++      return e + shift * iam_entry_size(p);
++}
++
++static inline struct iam_ikey *iam_get_ikey(struct iam_path *p,
++                                            struct iam_entry *entry,
++                                            struct iam_ikey *key)
++{
++      return memcpy(key, entry, iam_path_descr(p)->id_ikey_size);
++}
++
++static inline struct iam_ikey *iam_ikey_at(struct iam_path *p,
++                                           struct iam_entry *entry)
++{
++      return (struct iam_ikey *)entry;
++}
++
++static inline ptrdiff_t iam_entry_diff(struct iam_path *p,
++                                     struct iam_entry *e1,
++                                     struct iam_entry *e2)
++{
++      ptrdiff_t diff;
++
++      diff = (void *)e1 - (void *)e2;
++      assert_corr(diff / iam_entry_size(p) * iam_entry_size(p) == diff);
++      return diff / iam_entry_size(p);
++}
++
++/*
++ * Helper for the frequent case, where key was already placed into @k1 by
++ * callback.
++ */
++static inline void iam_ikeycpy0(const struct iam_container *c,
++                                struct iam_ikey *k1, const struct iam_ikey *k2)
+ {
+-      memcpy(k1, k2, c->ic_descr->id_key_size);
++        if (k1 != k2)
++                iam_ikeycpy(c, k1, k2);
+ }
+-static inline int iam_keycmp(const struct iam_container *c,
+-                           const struct iam_key *k1, const struct iam_key *k2)
++static inline int iam_ikeycmp(const struct iam_container *c,
++                              const struct iam_ikey *k1,
++                              const struct iam_ikey *k2)
+ {
+-      return c->ic_descr->id_ops->id_keycmp(c, k1, k2);
++      return c->ic_descr->id_ops->id_ikeycmp(c, k1, k2);
+ }
+ static inline void iam_reccpy(const struct iam_path *p, struct iam_rec *rec_dst,
+@@ -600,11 +694,38 @@ static inline void *iam_entry_off(struct
+       return (void *)((char *)entry + off);
+ }
++/*
++ * Leaf helpers.
++ */
++
++static inline struct iam_path *iam_leaf_path(const struct iam_leaf *leaf)
++{
++        return leaf->il_path;
++}
++
++static inline struct iam_container *
++iam_leaf_container(const struct iam_leaf *leaf)
++{
++        return iam_leaf_path(leaf)->ip_container;
++}
++
++static inline struct iam_descr *iam_leaf_descr(const struct iam_leaf *leaf)
++{
++        return iam_leaf_container(leaf)->ic_descr;
++}
++
++static inline struct iam_leaf_operations *
++iam_leaf_ops(const struct iam_leaf *leaf)
++{
++        return iam_leaf_descr(leaf)->id_leaf_ops;
++}
++
++
+ /*XXX These stuff put here, just because they are used by iam.c and namei.c*/
+ static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry)
+ {
+       return le32_to_cpu(*(u32*)iam_entry_off(entry,
+-                                              iam_path_descr(p)->id_key_size))
++                                              iam_path_descr(p)->id_ikey_size))
+               & 0x00ffffff;
+ }
+@@ -612,21 +733,64 @@ static inline void dx_set_block(struct i
+                               struct iam_entry *entry, unsigned value)
+ {
+       *(u32*)iam_entry_off(entry,
+-                           iam_path_descr(p)->id_key_size) =
++                           iam_path_descr(p)->id_ikey_size) =
+               cpu_to_le32(value);
+ }
+-static inline void dx_set_key(struct iam_path *p, struct iam_entry *entry,
+-                              const struct iam_key *key)
++static inline void dx_set_ikey(struct iam_path *p, struct iam_entry *entry,
++                               const struct iam_ikey *key)
+ {
+-        iam_keycpy(p->ip_container, iam_entry_off(entry, 0), key);
++        iam_ikeycpy(p->ip_container, iam_entry_off(entry, 0), key);
+ }
++struct dx_map_entry
++{
++      u32 hash;
++      u32 offs;
++};
++
++struct fake_dirent {
++      __le32 inode;
++      __le16 rec_len;
++      u8 name_len;
++      u8 file_type;
++};
++
+ struct dx_countlimit {
+       __le16 limit;
+       __le16 count;
+ };
++/*
++ * dx_root_info is laid out so that if it should somehow get overlaid by a
++ * dirent the two low bits of the hash version will be zero.  Therefore, the
++ * hash version mod 4 should never be 0.  Sincerely, the paranoia department.
++ */
++
++struct dx_root {
++      struct fake_dirent dot;
++      char dot_name[4];
++      struct fake_dirent dotdot;
++      char dotdot_name[4];
++      struct dx_root_info
++      {
++              __le32 reserved_zero;
++              u8 hash_version;
++              u8 info_length; /* 8 */
++              u8 indirect_levels;
++              u8 unused_flags;
++      }
++      info;
++      struct {} entries[0];
++};
++
++struct dx_node
++{
++      struct fake_dirent fake;
++      struct {} entries[0];
++};
++
++
+ static inline unsigned dx_get_count(struct iam_entry *entries)
+ {
+       return le16_to_cpu(((struct dx_countlimit *) entries)->count);
+@@ -647,9 +811,21 @@ static inline unsigned dx_node_limit(str
+       struct iam_descr *param = iam_path_descr(p);
+       unsigned entry_space   = iam_path_obj(p)->i_sb->s_blocksize -
+               param->id_node_gap;
+-      return entry_space / (param->id_key_size + param->id_ptr_size);
++      return entry_space / (param->id_ikey_size + param->id_ptr_size);
++}
++
++static inline unsigned dx_root_limit(struct iam_path *p)
++{
++      struct iam_descr *param = iam_path_descr(p);
++      unsigned limit = iam_path_obj(p)->i_sb->s_blocksize -
++                param->id_root_gap;
++        limit /= (param->id_ikey_size + param->id_ptr_size);
++        if (limit == dx_node_limit(p))
++                limit--;
++      return limit;
+ }
++
+ static inline struct iam_entry *dx_get_entries(struct iam_path *path,
+                                              void *data, int root)
+ {
+@@ -665,7 +841,8 @@ static inline struct iam_entry *dx_node_
+                             frame->bh->b_data, frame == path->ip_frames);
+ }
+-static inline struct iam_key *iam_path_key(const struct iam_path *path, int nr)
++static inline struct iam_ikey *iam_path_ikey(const struct iam_path *path,
++                                             int nr)
+ {
+       assert(0 <= nr && nr < ARRAY_SIZE(path->ip_data->ipd_key_scratch));
+       return path->ip_data->ipd_key_scratch[nr];
+@@ -674,6 +851,7 @@ static inline struct iam_key *iam_path_k
+ int dx_lookup(struct iam_path *path);
+ void dx_insert_block(struct iam_path *path, struct iam_frame *frame,
+                    u32 hash, u32 block);
++int dx_index_is_compat(struct iam_path *path);
+ int ext3_htree_next_block(struct inode *dir, __u32 hash,
+                         struct iam_path *path, __u32 *start_hash);
+@@ -681,6 +859,20 @@ int ext3_htree_next_block(struct inode *
+ struct buffer_head *ext3_append(handle_t *handle, struct inode *inode,
+                               u32 *block, int *err);
+ int split_index_node(handle_t *handle, struct iam_path *path);
++struct ext3_dir_entry_2 *split_entry(struct inode *dir,
++                                   struct ext3_dir_entry_2 *de,
++                                   unsigned long ino, mode_t mode,
++                                   const char *name, int namelen);
++struct ext3_dir_entry_2 *find_insertion_point(struct inode *dir,
++                                            struct buffer_head *bh,
++                                            const char *name, int namelen);
++struct ext3_dir_entry_2 *move_entries(struct inode *dir,
++                                    struct dx_hash_info *hinfo,
++                                    struct buffer_head **bh1,
++                                    struct buffer_head **bh2,
++                                    __u32 *delim_hash);
++
++extern struct iam_descr iam_htree_compat_param;
+ /*
+  * external
+@@ -698,10 +890,12 @@ int iam_node_read(struct iam_container *
+                 handle_t *handle, struct buffer_head **bh);
+ void iam_insert_key(struct iam_path *path, struct iam_frame *frame,
+-                  const struct iam_key *key, iam_ptr_t ptr);
++                  const struct iam_ikey *key, iam_ptr_t ptr);
+ int  iam_leaf_at_end(const struct iam_leaf *l);
+ void iam_leaf_next(struct iam_leaf *folio);
++int iam_leaf_can_add(const struct iam_leaf *l,
++                     const struct iam_key *k, const struct iam_rec *r);
+ struct iam_path *iam_leaf_path(const struct iam_leaf *leaf);
+ struct iam_container *iam_leaf_container(const struct iam_leaf *leaf);
+@@ -709,14 +903,95 @@ struct iam_descr *iam_leaf_descr(const s
+ struct iam_leaf_operations *iam_leaf_ops(const struct iam_leaf *leaf);
++int iam_node_read(struct iam_container *c, iam_ptr_t ptr,
++                  handle_t *h, struct buffer_head **bh);
++
++/*
++ * Container format.
++ */
+ struct iam_format {
++        /*
++         * Method called to recognize container format. Should return true iff
++         * container @c conforms to this format. This method may do IO to read
++         * container pages.
++         *
++         * If container is recognized, this method sets operation vectors
++         * ->id_ops and ->id_leaf_ops in container description (c->ic_descr),
++         * and fills other description fields.
++         */
+         int (*if_guess)(struct iam_container *c);
++        /*
++         * Linkage into global list of container formats.
++         */
+         struct list_head if_linkage;
+ };
+ void iam_format_register(struct iam_format *fmt);
+ void iam_lfix_format_init(void);
++void iam_lvar_format_init(void);
++void iam_htree_format_init(void);
++
++struct iam_private_info;
++
++void ext3_iam_release(struct file *filp, struct inode *inode);
++
++int iam_uapi_ioctl(struct inode * inode, struct file * filp, unsigned int cmd,
++                   unsigned long arg);
++
++/* dir.c */
++#if EXT3_INVARIANT_ON
++extern int ext3_check_dir_entry(const char *, struct inode *,
++                              struct ext3_dir_entry_2 *,
++                              struct buffer_head *, unsigned long);
++#else
++static inline int ext3_check_dir_entry(const char * function,
++                                     struct inode * dir,
++                                     struct ext3_dir_entry_2 * de,
++                                     struct buffer_head * bh,
++                                     unsigned long offset)
++{
++      return 1;
++}
++#endif
++
++/* __KERNEL__ */
++#endif
++
++/*
++ * User level API. Copy exists in lustre/lustre/tests/iam_ut.c
++ */
++
++struct iam_uapi_info {
++        __u16 iui_keysize;
++        __u16 iui_recsize;
++        __u16 iui_ptrsize;
++        __u16 iui_height;
++        char  iui_fmt_name[DX_FMT_NAME_LEN];
++};
++
++struct iam_uapi_op {
++        void *iul_key;
++        void *iul_rec;
++};
++
++struct iam_uapi_it {
++        struct iam_uapi_op iui_op;
++        __u16              iui_state;
++};
++
++enum iam_ioctl_cmd {
++        IAM_IOC_INIT     = _IOW('i', 1, struct iam_uapi_info),
++        IAM_IOC_GETINFO  = _IOR('i', 2, struct iam_uapi_info),
++        IAM_IOC_INSERT   = _IOR('i', 3, struct iam_uapi_op),
++        IAM_IOC_LOOKUP   = _IOWR('i', 4, struct iam_uapi_op),
++        IAM_IOC_DELETE   = _IOR('i', 5, struct iam_uapi_op),
++        IAM_IOC_IT_START = _IOR('i', 6, struct iam_uapi_it),
++        IAM_IOC_IT_NEXT  = _IOW('i', 7, struct iam_uapi_it),
++        IAM_IOC_IT_STOP  = _IOR('i', 8, struct iam_uapi_it),
++
++        IAM_IOC_POLYMORPH = _IOR('i', 9, unsigned long)
++};
+ /* __LINUX_LUSTRE_IAM_H__ */
+ #endif
diff --git a/ldiskfs/kernel_patches/patches/ext3-orphans-delay.patch b/ldiskfs/kernel_patches/patches/ext3-orphans-delay.patch
new file mode 100644 (file)
index 0000000..d03d74c
--- /dev/null
@@ -0,0 +1,42 @@
+Index: iam/fs/ext3/super.c
+===================================================================
+--- iam.orig/fs/ext3/super.c
++++ iam/fs/ext3/super.c
+@@ -147,6 +147,8 @@ static void ext3_handle_error(struct sup
+       EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
+       es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
++      dump_stack();
++
+       if (sb->s_flags & MS_RDONLY)
+               return;
+@@ -1168,7 +1170,7 @@ static int ext3_check_descriptors (struc
+  * e2fsck was run on this filesystem, and it must have already done the orphan
+  * inode cleanup for us, so we can safely abort without any further action.
+  */
+-static void ext3_orphan_cleanup (struct super_block * sb,
++void ext3_orphan_cleanup (struct super_block * sb,
+                                struct ext3_super_block * es)
+ {
+       unsigned int s_flags = sb->s_flags;
+@@ -1256,7 +1258,9 @@ static void ext3_orphan_cleanup (struct 
+       }
+ #endif
+       sb->s_flags = s_flags; /* Restore MS_RDONLY status */
++        EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
+ }
++EXPORT_SYMBOL(ext3_orphan_cleanup);
+ #define log2(n) ffz(~(n))
+@@ -1682,8 +1686,7 @@ static int ext3_fill_super (struct super
+        * superblock lock.
+        */
+       EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
+-      ext3_orphan_cleanup(sb, es);
+-      EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
++
+       if (needs_recovery)
+               printk (KERN_INFO "EXT3-fs: recovery complete.\n");
+       ext3_mark_recovery_complete(sb, es);
diff --git a/ldiskfs/kernel_patches/patches/ext3-pdirops-2.6.9.patch b/ldiskfs/kernel_patches/patches/ext3-pdirops-2.6.9.patch
new file mode 100644 (file)
index 0000000..565ba60
--- /dev/null
@@ -0,0 +1,1247 @@
+Index: iam/fs/ext3/namei.c
+===================================================================
+--- iam.orig/fs/ext3/namei.c
++++ iam/fs/ext3/namei.c
+@@ -55,18 +55,20 @@ struct buffer_head *ext3_append(handle_t
+                                       u32 *block, int *err)
+ {
+       struct buffer_head *bh;
++      struct ext3_inode_info *ei = EXT3_I(inode);
++      /* with parallel dir operations all appends
++       * have to be serialized -bzzz */
++      down(&ei->i_append_sem);
+       *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+-      if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
++      bh = ext3_bread(handle, inode, *block, 1, err);
++      if (bh != NULL) {
+               inode->i_size += inode->i_sb->s_blocksize;
+-              EXT3_I(inode)->i_disksize = inode->i_size;
+-              *err = ext3_journal_get_write_access(handle, bh);
+-              if (*err != 0) {
+-                      brelse(bh);
+-                      bh = NULL;
+-              }
++              ei->i_disksize = inode->i_size;
+       }
++      up(&ei->i_append_sem);
++      
+       return bh;
+ }
+@@ -90,7 +92,7 @@ static void dx_set_count(struct iam_entr
+ static void dx_set_limit(struct iam_entry *entries, unsigned value);
+ static unsigned dx_root_limit(struct iam_path *p);
+ static unsigned dx_node_limit(struct iam_path *p);
+-static int dx_probe(struct dentry *dentry,
++static int dx_probe(struct qstr *name,
+                   struct inode *dir,
+                   struct dx_hash_info *hinfo,
+                   struct iam_path *path);
+@@ -104,7 +106,6 @@ static struct buffer_head * ext3_dx_find
+                      struct ext3_dir_entry_2 **res_dir, int *err);
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+                            struct inode *inode);
+-
+ static inline void dx_set_limit(struct iam_entry *entries, unsigned value)
+ {
+       ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
+@@ -138,23 +139,20 @@ int dx_node_check(struct iam_path *p, st
+               iam_get_ikey(p, e, iam_path_ikey(p, 1));
+               if (i > 0 &&
+                   iam_ikeycmp(c, iam_path_ikey(p, 0),
+-                              iam_path_ikey(p, 1)) > 0) {
+-                      BREAKPOINT();
++                              iam_path_ikey(p, 1)) > 0)
+                       return 0;
+-      }
+               blk = dx_get_block(p, e);
+-              if (inode->i_size < (blk + 1) * inode->i_sb->s_blocksize) {
+-                      BREAKPOINT();
++              /*
++               * Disable this check as it is racy.
++               */
++              if (0 && inode->i_size < (blk + 1) * inode->i_sb->s_blocksize)
+       return 0;
+-              }
+               /*
+                * By definition of a tree, no node points to the root.
+                */
+-              if (blk == root) {
+-                      BREAKPOINT();
++              if (blk == root)
+                       return 0;
+               }
+-      }
+       return 1;
+ }
+@@ -241,12 +239,241 @@ struct stats dx_show_entries(struct dx_h
+ }
+ #endif /* DX_DEBUG */
+-int dx_lookup(struct iam_path *path)
++/*
++ * Per-node tree locking.
++ *
++ *
++ *
++ *
++ *
++ *
++ *
++ *
++ *
++ *
++ *
++ */
++
++/* FIXME: this should be reworked using bb_spin_lock
++ * introduced in -mm tree
++ */
++#define BH_DXLock     25
++
++#define DX_DEBUG (1)
++
++#if DX_DEBUG
++static struct dx_lock_stats {
++      unsigned dls_bh_lock;
++      unsigned dls_bh_busy;
++      unsigned dls_bh_again;
++      unsigned dls_bh_full_again;
++} dx_lock_stats = { 0, };
++#define DX_DEVAL(x) x
++#else
++#define DX_DEVAL(x)
++#endif
++
++static inline void dx_lock_bh(struct buffer_head volatile *bh)
++{
++      DX_DEVAL(dx_lock_stats.dls_bh_lock++);
++#ifdef CONFIG_SMP
++        while (test_and_set_bit(BH_DXLock, &bh->b_state)) {
++              DX_DEVAL(dx_lock_stats.dls_bh_busy++);
++                while (test_bit(BH_DXLock, &bh->b_state))
++                        cpu_relax();
++        }
++#endif
++}
++
++static inline void dx_unlock_bh(struct buffer_head *bh)
++{
++#ifdef CONFIG_SMP
++        smp_mb__before_clear_bit();
++        clear_bit(BH_DXLock, &bh->b_state);
++#endif
++}
++
++/*
++ * this locking primitives are used to protect parts
++ * of dir's htree. protection unit is block: leaf or index
++ */
++struct dynlock_handle *dx_lock_htree(struct inode *dir, unsigned long value,
++                                   enum dynlock_type lt)
++{
++      return dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, lt, GFP_NOFS);
++}
++
++void dx_unlock_htree(struct inode *dir, struct dynlock_handle *lh)
++{
++      if (lh != NULL)
++              dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lh);
++}
++
++static void dx_unlock_array(struct inode *dir, struct dynlock_handle **lh)
++{
++      int i;
++
++      for (i = 0; i < DX_MAX_TREE_HEIGHT; ++i, ++lh) {
++              if (*lh != NULL) {
++                      dx_unlock_htree(dir, *lh);
++                      *lh = NULL;
++              }
++      }
++}
++
++/*
++ * dx_find_position
++ *
++ * search position of specified hash in index
++ *
++ */
++
++struct iam_entry *dx_find_position(struct iam_path *path,
++                                 struct iam_frame *frame)
++{
++      int count;
++      struct iam_entry *p;
++      struct iam_entry *q;
++      struct iam_entry *m;
++
++      count = dx_get_count(frame->entries);
++      assert_corr(count && count <= dx_get_limit(frame->entries));
++      p = iam_entry_shift(path, frame->entries,
++                          dx_index_is_compat(path) ? 1 : 2);
++      q = iam_entry_shift(path, frame->entries, count - 1);
++      while (p <= q) {
++              m = iam_entry_shift(path, p, iam_entry_diff(path, q, p) / 2);
++              if (iam_ikeycmp(path->ip_container, iam_ikey_at(path, m),
++                              path->ip_ikey_target) > 0)
++                      q = iam_entry_shift(path, m, -1);
++              else
++                      p = iam_entry_shift(path, m, +1);
++      }
++      return iam_entry_shift(path, p, -1);
++}
++
++static iam_ptr_t dx_find_ptr(struct iam_path *path, struct iam_frame *frame)
++{
++      return dx_get_block(path, dx_find_position(path, frame));
++}
++
++/*
++ * Fast check for frame consistency.
++ */
++static int dx_check_fast(struct iam_path *path, struct iam_frame *frame)
++{
++      struct iam_container *bag;
++      struct iam_entry *next;
++      struct iam_entry *last;
++      struct iam_entry *entries;
++      struct iam_entry *at;
++
++      bag     = path->ip_container;
++      at      = frame->at;
++      entries = frame->entries;
++      last    = iam_entry_shift(path, entries, dx_get_count(entries) - 1);
++
++      if (unlikely(at > last))
++              return -EAGAIN;
++
++      if (unlikely(dx_get_block(path, at) != frame->leaf))
++              return -EAGAIN;
++
++      if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, at),
++                               path->ip_ikey_target) > 0))
++              return -EAGAIN;
++
++      next = iam_entry_shift(path, at, +1);
++      if (next <= last) {
++              if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, next),
++                                       path->ip_ikey_target) <= 0))
++                      return -EAGAIN;
++      }
++      return 0;
++}
++
++/*
++ * returns 0 if path was unchanged, -EAGAIN otherwise.
++ */
++static int dx_check_path(struct iam_path *path, struct iam_frame *frame)
++{
++      int equal;
++
++      dx_lock_bh(frame->bh);
++      equal = dx_check_fast(path, frame) == 0 ||
++              frame->leaf == dx_find_ptr(path, frame);
++      DX_DEVAL(dx_lock_stats.dls_bh_again += !equal);
++      dx_unlock_bh(frame->bh);
++      
++      return equal ? 0 : -EAGAIN;
++}
++
++/*
++ * returns 0 if path was unchanged, -EAGAIN otherwise.
++ */
++static int dx_check_full_path(struct iam_path *path, int search)
++{
++      struct iam_frame *bottom;
++      struct iam_frame *scan;
++      int i;
++      int result;
++
++      do_corr(schedule());
++
++      for (bottom = path->ip_frames, i = 0;
++           i < DX_MAX_TREE_HEIGHT && bottom->bh != NULL; ++bottom, ++i) {
++              ; /* find last filled in frame */
++      }
++
++      /*
++       * Lock frames, bottom to top.
++       */
++      for (scan = bottom - 1; scan >= path->ip_frames; --scan)
++              dx_lock_bh(scan->bh);
++      /*
++       * Check them top to bottom.
++       */
++      result = 0;
++      for (scan = path->ip_frames; scan < bottom; ++scan) {
++              struct iam_entry *pos;
++
++              if (search) {
++                      if (dx_check_fast(path, scan) == 0)
++                              continue;
++
++                      pos = dx_find_position(path, scan);
++                      if (scan->leaf != dx_get_block(path, pos)) {
++                              result = -EAGAIN;
++                              break;
++                      }
++                      scan->at = pos;
++              } else {
++                      pos = iam_entry_shift(path, scan->entries,
++                                            dx_get_count(scan->entries) - 1);
++                      if (scan->at > pos ||
++                          scan->leaf != dx_get_block(path, scan->at)) {
++                              result = -EAGAIN;
++                              break;
++                      }
++              }
++      }
++
++      /*
++       * Unlock top to bottom.
++       */
++      for (scan = path->ip_frames; scan < bottom; ++scan)
++              dx_unlock_bh(scan->bh);
++      DX_DEVAL(dx_lock_stats.dls_bh_full_again += !!result);
++      do_corr(schedule());
++
++      return result;
++}
++
++static int dx_lookup_try(struct iam_path *path)
+ {
+       u32 ptr;
+       int err = 0;
+       int i;
+-      int delta;
+       struct iam_descr *param;
+       struct iam_frame *frame;
+@@ -255,20 +482,19 @@ int dx_lookup(struct iam_path *path)
+       param = iam_path_descr(path);
+       c = path->ip_container;
+       
+-      delta = dx_index_is_compat(path) ? 1 : 2;
+-
+-      for (frame = path->ip_frames, i = 0,
+                    ptr = param->id_ops->id_root_ptr(c);
+-           i <= path->ip_indirect;
+-           ptr = dx_get_block(path, frame->at), ++frame, ++i) {
+-              struct iam_entry *entries;
+-              struct iam_entry *p;
+-              struct iam_entry *q;
+-              struct iam_entry *m;
+-              unsigned count;
+-
++      for (frame = path->ip_frames, i = 0; i <= path->ip_indirect;
++           ++frame, ++i) {
+               err = param->id_ops->id_node_read(c, (iam_ptr_t)ptr, NULL,
+                                                 &frame->bh);
++              do_corr(schedule());
++
++              dx_lock_bh(frame->bh);
++              /*
++               * node must be initialized under bh lock because concurrent
++               * creation procedure may change it and dx_lookup_try() will
++               * see obsolete tree height. -bzzz
++               */
+               if (err != 0)
+                       break;
+@@ -283,53 +509,82 @@ int dx_lookup(struct iam_path *path)
+                       break;
+               assert_inv(dx_node_check(path, frame));
+-      
+-              entries = frame->entries;
+-              count = dx_get_count(entries);
+-              assert_corr(count && count <= dx_get_limit(entries));
+-              p = iam_entry_shift(path, entries, delta);
+-              q = iam_entry_shift(path, entries, count - 1);
+-              while (p <= q) {
+-                      m = iam_entry_shift(path,
+-                                         p, iam_entry_diff(path, q, p) / 2);
+-                      dxtrace(printk("."));
+-                      if (iam_ikeycmp(c, iam_ikey_at(path, m),
+-                                      path->ip_ikey_target) > 0)
+-                              q = iam_entry_shift(path, m, -1);
+-                      else
+-                              p = iam_entry_shift(path, m, +1);
++              /*
++               * splitting may change root index block and move hash we're
++               * looking for into another index block so, we have to check
++               * this situation and repeat from the beginning if path got changed
++               * -bzzz
++               */
++              if (i > 0) {
++                      err = dx_check_path(path, frame - 1);
++                      if (err != 0)
++                              break;
+               }
+-              frame->at = iam_entry_shift(path, p, -1);
+-              if (EXT3_INVARIANT_ON) { // linear search cross check
+-                      unsigned n = count - 1;
+-                      struct iam_entry *at;
++              frame->at = dx_find_position(path, frame);
++              frame->curidx = ptr;
++              frame->leaf = ptr = dx_get_block(path, frame->at);
+-                      at = entries;
+-                      while (n--) {
+-                              dxtrace(printk(","));
+-                              at = iam_entry_shift(path, at, +1);
+-                              if (iam_ikeycmp(c, iam_ikey_at(path, at),
+-                                             path->ip_ikey_target) > 0) {
+-                                      if (at != iam_entry_shift(path, frame->at, 1)) {
+-                                              BREAKPOINT();
+-                                              printk(KERN_EMERG "%i\n",
+-                                                     iam_ikeycmp(c, iam_ikey_at(path, at),
+-                                                            path->ip_ikey_target));
+-                                      }
+-                                      at = iam_entry_shift(path, at, -1);
+-                                      break;
+-                              }
+-                      }
+-                      assert_corr(at == frame->at);
+-              }
++              dx_unlock_bh(frame->bh);
++              do_corr(schedule());
+       }
+       if (err != 0)
+-              iam_path_fini(path);
++              dx_unlock_bh(frame->bh);
+       path->ip_frame = --frame;
+       return err;
+ }
++static int dx_lookup(struct iam_path *path)
++{
++      int err;
++      int i;
++
++      for (i = 0; i < DX_MAX_TREE_HEIGHT; ++ i)
++              assert(path->ip_frames[i].bh == NULL);
++
++      do {
++              err = dx_lookup_try(path);
++              do_corr(schedule());
++              if (err != 0)
++                      iam_path_fini(path);
++      } while (err == -EAGAIN);
++
++      return err;
++}
++
++/*
++ * Performs path lookup and returns with found leaf (if any) locked by htree
++ * lock.
++ */
++int dx_lookup_lock(struct iam_path *path,
++                 struct dynlock_handle **dl, enum dynlock_type lt)
++{
++      int result;
++      struct inode *dir;
++
++      dir = iam_path_obj(path);
++      while ((result = dx_lookup(path)) == 0) {
++              do_corr(schedule());
++              *dl = dx_lock_htree(dir, path->ip_frame->leaf, lt);
++              if (*dl == NULL) {
++                      iam_path_fini(path);
++                      result = -ENOMEM;
++                      break;
++              }
++              do_corr(schedule());
++              /*
++               * while locking leaf we just found may get split so we need
++               * to check this -bzzz
++               */
++              if (dx_check_full_path(path, 1) == 0)
++                      break;
++              dx_unlock_htree(dir, *dl);
++              *dl = NULL;
++              iam_path_fini(path);
++      }
++      return result;
++}
++
+ /*
+  * Probe for a directory leaf block to search.
+  *
+@@ -339,7 +594,7 @@ int dx_lookup(struct iam_path *path)
+  * check for this error code, and make sure it never gets reflected
+  * back to userspace.
+  */
+-static int dx_probe(struct dentry *dentry, struct inode *dir,
++static int dx_probe(struct qstr *name, struct inode *dir,
+                   struct dx_hash_info *hinfo, struct iam_path *path)
+ {
+       int err;
+@@ -347,7 +602,7 @@ static int dx_probe(struct dentry *dentr
+       
+       assert_corr(path->ip_data != NULL);
+       ipc = container_of(path->ip_data, struct iam_path_compat, ipc_descr);
+-      ipc->ipc_dentry = dentry;
++      ipc->ipc_qstr  = name;
+       ipc->ipc_hinfo = hinfo;
+       assert_corr(dx_index_is_compat(path));
+@@ -356,6 +611,7 @@ static int dx_probe(struct dentry *dentr
+       return err;
+ }
++
+ /*
+  * This function increments the frame pointer to search the next leaf
+  * block, and reads in the necessary intervening nodes if the search
+@@ -391,10 +647,16 @@ static int ext3_htree_advance(struct ino
+        * nodes need to be read.
+        */
+       while (1) {
++              do_corr(schedule());
++              dx_lock_bh(p->bh);
+               p->at = iam_entry_shift(path, p->at, +1);
+               if (p->at < iam_entry_shift(path, p->entries,
+-                                         dx_get_count(p->entries)))
++                                          dx_get_count(p->entries))) {
++                      p->leaf = dx_get_block(path, p->at);
++                      dx_unlock_bh(p->bh);
+                       break;
++              }
++              dx_unlock_bh(p->bh);
+               if (p == path->ip_frames)
+                       return 0;
+               num_frames++;
+@@ -425,25 +687,125 @@ static int ext3_htree_advance(struct ino
+        * block so no check is necessary
+        */
+       while (num_frames--) {
++              iam_ptr_t idx;
++
++              do_corr(schedule());
++              dx_lock_bh(p->bh);
++              idx = p->leaf = dx_get_block(path, p->at);
++              dx_unlock_bh(p->bh);
+               err = iam_path_descr(path)->id_ops->
+-                      id_node_read(path->ip_container,
+-                                                   (iam_ptr_t)dx_get_block(path, p->at),
+-                                                   NULL, &bh);
++                      id_node_read(path->ip_container, idx, NULL, &bh);
+               if (err != 0)
+                       return err; /* Failure */
+               ++p;
+-              brelse (p->bh);
++              brelse(p->bh);
++              assert_corr(p->bh != bh);
+               p->bh = bh;
+               p->entries = dx_node_get_entries(path, p);
+               p->at = iam_entry_shift(path, p->entries, !compat);
++              assert_corr(p->curidx != idx);
++              p->curidx = idx;
++              dx_lock_bh(p->bh);
++              assert_corr(p->leaf != dx_get_block(path, p->at));
++              p->leaf = dx_get_block(path, p->at);
++              dx_unlock_bh(p->bh);
+               assert_inv(dx_node_check(path, p));
+       }
+       return 1;
+ }
++int iam_index_lock(struct iam_path *path, struct dynlock_handle **lh)
++{
++      struct iam_frame *f;
++
++      for (f = path->ip_frame; f >= path->ip_frames; --f, ++lh) {
++              do_corr(schedule());
++              *lh = dx_lock_htree(iam_path_obj(path), f->curidx, DLT_READ);
++              if (*lh == NULL)
++                      return -ENOMEM;
++      }
++      return 0;
++}
++
++static int iam_index_advance(struct iam_path *path)
++{
++      return ext3_htree_advance(iam_path_obj(path), 0, path, NULL, 0);
++}
++
++/*
++ * Advance index part of @path to point to the next leaf. Returns 1 on
++ * success, 0, when end of container was reached. Leaf node is locked.
++ */
+ int iam_index_next(struct iam_container *c, struct iam_path *path)
+ {
+-      return ext3_htree_advance(c->ic_object, 0, path, NULL, 0);
++      iam_ptr_t cursor;
++      struct dynlock_handle *lh[DX_MAX_TREE_HEIGHT] = { 0, };
++      int result;
++      struct inode *object;
++
++      /*
++       * Locking for iam_index_next()... is to be described.
++       */
++
++      object = c->ic_object;
++      cursor = path->ip_frame->leaf;
++
++      while (1) {
++              result = iam_index_lock(path, lh);
++              do_corr(schedule());
++              if (result < 0)
++                      break;
++              
++              result = dx_check_full_path(path, 0);
++              if (result == 0 && cursor == path->ip_frame->leaf) {
++                      result = iam_index_advance(path);
++
++                      assert_corr(result == 0 ||
++                                  cursor != path->ip_frame->leaf);
++                      break;
++              }
++              do {
++                      dx_unlock_array(object, lh);
++
++                      iam_path_release(path);
++                      do_corr(schedule());
++
++                      result = dx_lookup(path);
++                      if (result < 0)
++                              break;
++
++                      while (path->ip_frame->leaf != cursor) {
++                              do_corr(schedule());
++
++                              result = iam_index_lock(path, lh);
++                              do_corr(schedule());
++                              if (result < 0)
++                                      break;
++
++                              result = dx_check_full_path(path, 0);
++                              if (result != 0)
++                                      break;
++
++                              result = iam_index_advance(path);
++                              if (result == 0) {
++                                      ext3_error(object->i_sb, __FUNCTION__,
++                                                 "cannot find cursor: %u\n",
++                                                 cursor);
++                                      result = -EIO;
++                              }
++                              if (result < 0)
++                                      break;
++                              result = dx_check_full_path(path, 0);
++                              if (result != 0)
++                                      break;
++                              dx_unlock_array(object, lh);
++                      }
++              } while (result == -EAGAIN);
++              if (result < 0)
++                      break;
++      }
++      dx_unlock_array(object, lh);
++      return result;
+ }
+ int ext3_htree_next_block(struct inode *dir, __u32 hash,
+@@ -649,14 +1011,29 @@ void iam_insert_key(struct iam_path *pat
+       struct iam_entry *new = iam_entry_shift(path, frame->at, +1);
+       int count = dx_get_count(entries);
++      /*
++       * Unfortunately we cannot assert this, as this function is sometimes
++       * called by VFS under i_sem and without pdirops lock.
++       */
++      assert_corr(1 || iam_frame_is_locked(path, frame));
+       assert_corr(count < dx_get_limit(entries));
+       assert_corr(frame->at < iam_entry_shift(path, entries, count));
++      assert_inv(dx_node_check(path, frame));
+       memmove(iam_entry_shift(path, new, 1), new,
+               (char *)iam_entry_shift(path, entries, count) - (char *)new);
+       dx_set_ikey(path, new, key);
+       dx_set_block(path, new, ptr);
+       dx_set_count(entries, count + 1);
++      assert_inv(dx_node_check(path, frame));
++}
++
++void iam_insert_key_lock(struct iam_path *path, struct iam_frame *frame,
++                       const struct iam_ikey *key, iam_ptr_t ptr)
++{
++      dx_lock_bh(frame->bh);
++      iam_insert_key(path, frame, key, ptr);
++      dx_unlock_bh(frame->bh);
+ }
+ void dx_insert_block(struct iam_path *path, struct iam_frame *frame,
+@@ -882,7 +1259,7 @@ static struct buffer_head * ext3_dx_find
+       sb = dir->i_sb;
+       /* NFS may look up ".." - look at dx_root directory block */
+       if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
+-              *err = dx_probe(dentry, NULL, &hinfo, path);
++              *err = dx_probe(&dentry->d_name, NULL, &hinfo, path);
+               if (*err != 0)
+                       return NULL;
+       } else {
+@@ -1114,7 +1491,7 @@ struct ext3_dir_entry_2 *move_entries(st
+       hash2 = map[split].hash;
+       continued = hash2 == map[split - 1].hash;
+       dxtrace(printk("Split block %i at %x, %i/%i\n",
+-              dx_get_block(frame->at), hash2, split, count - split));
++              frame->leaf, hash2, split, count - split));
+       /* Fancy dance to stay within two buffers */
+       de2 = dx_move_dirents(data1, data2, map + split, count - split);
+@@ -1484,16 +1861,38 @@ static int shift_entries(struct iam_path
+              (char *) iam_entry_shift(path, entries, count1),
+              count2 * iam_entry_size(path));
+-      dx_set_count(entries, count1);
+       dx_set_count(entries2, count2 + delta);
+       dx_set_limit(entries2, dx_node_limit(path));
+-      iam_insert_key(path, parent, pivot, newblock);
++      /*
++       * NOTE: very subtle piece of code. A competing dx_probe() may find 2nd
++       * level index in root index, then we insert new index here and set
++       * new count in that 2nd level index. so, dx_probe() may see 2nd level
++       * index w/o hash it looks for. the solution is to check root index
++       * after we locked the just-found 2nd level index -bzzz
++       */
++      iam_insert_key_lock(path, parent, pivot, newblock);
++
++      /*
++       * now old and new 2nd level index blocks contain all pointers, so
++       * dx_probe() may find it in the both.  it's OK -bzzz
++       */
++      dx_lock_bh(frame->bh);
++      dx_set_count(entries, count1);
++      dx_unlock_bh(frame->bh);
++
++      /*
++       * now old 2nd level index block points to the first half of leaves. it's
++       * important that dx_probe() must check root index block for changes
++       * under dx_lock_bh(frame->bh) -bzzz
++       */
++
+       return count1;
+ }
+ #ifdef CONFIG_EXT3_INDEX
+-int split_index_node(handle_t *handle, struct iam_path *path)
++int split_index_node(handle_t *handle, struct iam_path *path,
++                   struct dynlock_handle **lh)
+ {
+       struct iam_entry *entries;   /* old block contents */
+@@ -1501,6 +1900,8 @@ int split_index_node(handle_t *handle, s
+       struct iam_frame *frame, *safe;
+       struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0};
+       u32 newblock[DX_MAX_TREE_HEIGHT] = {0};
++      struct dynlock_handle *lock[DX_MAX_TREE_HEIGHT] = {NULL,};
++      struct dynlock_handle *new_lock[DX_MAX_TREE_HEIGHT] = {NULL,};
+       struct inode *dir = iam_path_obj(path);
+       struct iam_descr *descr;
+       int nr_splet;
+@@ -1523,12 +1924,14 @@ int split_index_node(handle_t *handle, s
+        *   - first allocate all necessary blocks
+        *
+        *   - insert pointers into them atomically.
+-       *
+-       * XXX nikita: this algorithm is *not* scalable, as it assumes that at
+-       * least nodes in the path are locked.
+        */
+-      /* Block full, should compress but for now just split */
++      /*
++       * Locking: leaf is already locked. htree-locks are acquired on all
++       * index nodes that require split bottom-to-top, on the "safe" node,
++       * and on all new nodes
++       */
++
+       dxtrace(printk("using %u of %u node entries\n",
+                      dx_get_count(entries), dx_get_limit(entries)));
+@@ -1536,6 +1939,7 @@ int split_index_node(handle_t *handle, s
+       for (nr_splet = 0; frame >= path->ip_frames &&
+            dx_get_count(frame->entries) == dx_get_limit(frame->entries);
+            --frame, ++nr_splet) {
++              do_corr(schedule());
+               if (nr_splet == DX_MAX_TREE_HEIGHT) {
+                       ext3_warning(dir->i_sb, __FUNCTION__,
+                                    "Directory index full!\n");
+@@ -1545,14 +1949,53 @@ int split_index_node(handle_t *handle, s
+       }
+       safe = frame;
+-      /* Go back down, allocating blocks, and adding blocks into
++
++      /*
++       * Lock all nodes, bottom to top.
++       */
++      for (frame = path->ip_frame, i = nr_splet; i >= 0; --i, --frame) {
++              do_corr(schedule());
++              lock[i] = dx_lock_htree(dir, frame->curidx, DLT_WRITE);
++              if (lock[i] == NULL) {
++                      err = -ENOMEM;
++                      goto cleanup;
++              }
++      }
++
++      /*
++       * Check for concurrent index modification.
++       */
++      err = dx_check_full_path(path, 1);
++      if (err)
++              goto cleanup;
++      /*
++       * And check that the same number of nodes is to be split.
++       */
++      for (i = 0, frame = path->ip_frame; frame >= path->ip_frames &&
++           dx_get_count(frame->entries) == dx_get_limit(frame->entries);
++           --frame, ++i) {
++              ;
++      }
++      if (i != nr_splet) {
++              err = -EAGAIN;
++              goto cleanup;
++      }
++
++      /* Go back down, allocating blocks, locking them, and adding into
+        * transaction... */
+       for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
+               bh_new[i] = ext3_append (handle, dir, &newblock[i], &err);
++              do_corr(schedule());
+               if (!bh_new[i] ||
+                   descr->id_ops->id_node_init(path->ip_container,
+                                               bh_new[i], 0) != 0)
+                       goto cleanup;
++              new_lock[i] = dx_lock_htree(dir, newblock[i], DLT_WRITE);
++              if (new_lock[i] == NULL) {
++                      err = -ENOMEM;
++                      goto cleanup;
++              }
++              do_corr(schedule());
+               BUFFER_TRACE(frame->bh, "get_write_access");
+               err = ext3_journal_get_write_access(handle, frame->bh);
+               if (err)
+@@ -1560,6 +2003,7 @@ int split_index_node(handle_t *handle, s
+       }
+       /* Add "safe" node to transaction too */
+       if (safe + 1 != path->ip_frames) {
++              do_corr(schedule());
+               err = ext3_journal_get_write_access(handle, safe->bh);
+               if (err)
+                       goto journal_error;
+@@ -1596,16 +2040,21 @@ int split_index_node(handle_t *handle, s
+                       assert_corr(i == 0);
++                      do_corr(schedule());
++
+                       frames = path->ip_frames;
+                       memcpy((char *) entries2, (char *) entries,
+                              count * iam_entry_size(path));
+                       dx_set_limit(entries2, dx_node_limit(path));
+                       /* Set up root */
++                      dx_lock_bh(frame->bh);
+                       next = descr->id_ops->id_root_inc(path->ip_container,
+                                                         path, frame);
+                       dx_set_block(path, next, newblock[0]);
++                      dx_unlock_bh(frame->bh);
++                      do_corr(schedule());
+                       /* Shift frames in the path */
+                       memmove(frames + 2, frames + 1,
+                               (sizeof path->ip_frames) - 2 * sizeof frames[0]);
+@@ -1621,10 +2070,12 @@ int split_index_node(handle_t *handle, s
+                       err = ext3_journal_get_write_access(handle, bh2);
+                       if (err)
+                               goto journal_error;
++                      do_corr(schedule());
+               } else {
+                       /* splitting non-root index node. */
+                       struct iam_frame *parent = frame - 1;
++                      do_corr(schedule());
+                       count = shift_entries(path, frame, count,
+                                             entries, entries2, newblock[i]);
+                       /* Which index block gets the new entry? */
+@@ -1634,7 +2085,11 @@ int split_index_node(handle_t *handle, s
+                               frame->at = iam_entry_shift(path, entries2,
+                                                           idx - count + d);
+                               frame->entries = entries = entries2;
++                              frame->curidx = newblock[i];
+                               swap(frame->bh, bh2);
++                              assert_corr(lock[i + 1] != NULL);
++                              assert_corr(new_lock[i] != NULL);
++                              swap(lock[i + 1], new_lock[i]);
+                               bh_new[i] = bh2;
+                               parent->at = iam_entry_shift(path,
+                                                            parent->at, +1);
+@@ -1647,20 +2102,25 @@ int split_index_node(handle_t *handle, s
+                       err = ext3_journal_dirty_metadata(handle, bh2);
+                       if (err)
+                               goto journal_error;
++                      do_corr(schedule());
+                       err = ext3_journal_dirty_metadata(handle, parent->bh);
+                       if (err)
+                               goto journal_error;
+               }
++              do_corr(schedule());
+               err = ext3_journal_dirty_metadata(handle, bh);
+               if (err)
+                       goto journal_error;
++      }
+               /*
+                * This function was called to make insertion of new leaf
+                * possible. Check that it fulfilled its obligations.
+                */
+               assert_corr(dx_get_count(path->ip_frame->entries) <
+                           dx_get_limit(path->ip_frame->entries));
+-              }
++      assert_corr(lock[nr_splet] != NULL);
++      *lh = lock[nr_splet];
++      lock[nr_splet] = NULL;
+       if (nr_splet > 0) {
+               /*
+                * Log ->i_size modification.
+@@ -1674,6 +2134,12 @@ journal_error:
+       ext3_std_error(dir->i_sb, err);
+ cleanup:
++      dx_unlock_array(dir, lock);
++      dx_unlock_array(dir, new_lock);
++
++      assert_corr(err || iam_frame_is_locked(path, path->ip_frame));
++
++      do_corr(schedule());
+       for (i = 0; i < ARRAY_SIZE(bh_new); ++i) {
+               if (bh_new[i] != NULL)
+                       brelse(bh_new[i]);
+@@ -1695,18 +2161,18 @@ static int ext3_dx_add_entry(handle_t *h
+       struct buffer_head * bh = NULL;
+       struct inode *dir = dentry->d_parent->d_inode;
+       struct ext3_dir_entry_2 *de;
++      struct dynlock_handle *dummy = NULL;
+       int err;
+       size_t isize;
+       iam_path_compat_init(&cpath, dir);
+       param = iam_path_descr(path);
+-      err = dx_probe(dentry, NULL, &hinfo, path);
++      err = dx_probe(&dentry->d_name, NULL, &hinfo, path);
+       if (err != 0)
+               return err;
+       frame = path->ip_frame;
+-      /* XXX nikita: global serialization! */
+       isize = dir->i_size;
+       err = param->id_ops->id_node_read(path->ip_container,
+@@ -1726,7 +2192,7 @@ static int ext3_dx_add_entry(handle_t *h
+               goto cleanup;
+       }
+       
+-      err = split_index_node(handle, path);
++      err = split_index_node(handle, path, &dummy);
+       if (err)
+               goto cleanup;   
+@@ -1742,6 +2208,7 @@ static int ext3_dx_add_entry(handle_t *h
+ journal_error:
+       ext3_std_error(dir->i_sb, err);
+ cleanup:
++      dx_unlock_htree(dir, dummy);
+       if (bh)
+               brelse(bh);
+ cleanup2:
+Index: iam/fs/ext3/super.c
+===================================================================
+--- iam.orig/fs/ext3/super.c
++++ iam/fs/ext3/super.c
+@@ -465,4 +465,8 @@ static struct inode *ext3_alloc_inode(st
+       ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
+       ei->vfs_inode.i_version = 1;
++
++      dynlock_init(&ei->i_htree_lock);
++      sema_init(&ei->i_rename_sem, 1);
++      sema_init(&ei->i_append_sem, 1);
+       memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent));
+Index: iam/include/linux/ext3_fs_i.h
+===================================================================
+--- iam.orig/include/linux/ext3_fs_i.h
++++ iam/include/linux/ext3_fs_i.h
+@@ -19,6 +19,7 @@
+ #include <linux/rwsem.h>
+ #include <linux/rbtree.h>
+ #include <linux/seqlock.h>
++#include <linux/dynlocks.h>
+ struct reserve_window {
+       __u32                   _rsv_start;     /* First byte reserved */
+@@ -127,6 +128,12 @@ struct ext3_inode_info {
+        * by other means, so we have truncate_sem.
+        */
+       struct semaphore truncate_sem;
++
++      /* following fields for parallel directory operations -bzzz */
++      struct dynlock   i_htree_lock;
++      struct semaphore i_append_sem;
++      struct semaphore i_rename_sem;
++
+       struct inode vfs_inode;
+       __u32 i_cached_extent[4];
+Index: iam/include/linux/lustre_iam.h
+===================================================================
+--- iam.orig/include/linux/lustre_iam.h
++++ iam/include/linux/lustre_iam.h
+@@ -39,6 +39,9 @@ enum {
+          * Maximal number of non-leaf levels in htree. In the stock ext3 this
+          * is 2.
+          */
++        /*
++         * XXX reduced back to 2 to make per-node locking work.
++         */
+       DX_MAX_TREE_HEIGHT = 5,
+         /*
+          * Scratch keys used by generic code for temporaries.
+@@ -62,7 +65,7 @@ enum {
+         /*
+          * Maximal format name length.
+          */
+-        DX_FMT_NAME_LEN    = 16
++        DX_FMT_NAME_LEN    = 16,
+ };
+ #ifdef __KERNEL__
+@@ -133,8 +136,10 @@ enum {
+ #if EXT3_CORRECTNESS_ON
+ #define assert_corr(test) J_ASSERT(test)
++#define do_corr(exp) exp
+ #else
+ #define assert_corr(test) do {;} while (0)
++#define do_corr(exp) do {;} while (0)
+ #endif
+ #if EXT3_INVARIANT_ON
+@@ -179,7 +184,7 @@ struct iam_ikey;
+  * support interfaces like readdir(), where iteration over index has to be
+  * re-startable.
+  */
+-typedef __u64 iam_ptr_t;
++typedef __u32 iam_ptr_t;
+ /*
+  * Index node traversed during tree lookup.
+@@ -188,6 +193,11 @@ struct iam_frame {
+       struct buffer_head *bh;    /* buffer holding node data */
+       struct iam_entry *entries; /* array of entries */
+       struct iam_entry *at;      /* target entry, found by binary search */
++      iam_ptr_t         leaf;    /* (logical) offset of child node found by
++                                    * binary search. */
++      iam_ptr_t         curidx;  /* (logical) offset of this node. Used to
++                                    * per-node locking to detect concurrent
++                                    * splits. */
+ };
+ /*
+@@ -205,6 +215,11 @@ struct iam_leaf {
+       struct buffer_head *il_bh;
+       struct iam_lentry  *il_entries;
+       struct iam_lentry  *il_at;
++        /*
++         * Lock on a leaf node.
++         */
++        struct dynlock_handle *il_lock;
++        iam_ptr_t              il_curidx; /* logical offset of leaf node. */
+       void               *il_descr_data;
+ };
+@@ -215,19 +230,23 @@ enum iam_lookup_t {
+         /*
+          * lookup found a record with the key requested
+          */
+-        IAM_LOOKUP_EXACT,
++        IAM_LOOKUP_EXACT  = 0,
+         /*
+          * lookup positioned leaf on some record
+          */
+-        IAM_LOOKUP_OK,
++        IAM_LOOKUP_OK     = 1,
+         /*
+          * leaf was empty
+          */
+-        IAM_LOOKUP_EMPTY,
++        IAM_LOOKUP_EMPTY  = 2,
+         /*
+          * lookup positioned leaf before first record
+          */
+-        IAM_LOOKUP_BEFORE
++        IAM_LOOKUP_BEFORE = 3,
++        /*
++         * Found hash may have a continuation in the next leaf.
++         */
++        IAM_LOOKUP_LAST   = 0x100
+ };
+ /*
+@@ -270,9 +289,9 @@ struct iam_operations {
+                                          struct iam_path *path,
+                                          struct iam_frame *frame);
+-        struct iam_path_descr *(*id_ipd_alloc)(const struct iam_container *c);
+-        void (*id_ipd_free)(const struct iam_container *c,
+-                            struct iam_path_descr *ipd);
++        struct iam_path_descr *(*id_ipd_alloc)(const struct iam_container *c,
++                                               void *area);
++        void (*id_ipd_free)(struct iam_path_descr *ipd);
+         /*
+          * Format name.
+          */
+@@ -329,8 +348,10 @@ struct iam_leaf_operations {
+         void (*key_set)(struct iam_leaf *l, const struct iam_key *k);
+         void (*rec_set)(struct iam_leaf *l, const struct iam_rec *r);
++        void (*rec_get)(const struct iam_leaf *l, struct iam_rec *r);
+       int (*key_cmp)(const struct iam_leaf *l, const struct iam_key *k);
++      int (*key_eq)(const struct iam_leaf *l, const struct iam_key *k);
+         int (*key_size)(const struct iam_leaf *l);
+         /*
+@@ -473,11 +494,23 @@ struct iam_path_compat {
+       struct iam_container ipc_container;
+       __u32                 ipc_scratch[DX_SCRATCH_KEYS];
+       struct dx_hash_info  *ipc_hinfo;
+-      struct dentry        *ipc_dentry;
++      struct qstr          *ipc_qstr;
+       struct iam_path_descr ipc_descr;
+         struct dx_hash_info   ipc_hinfo_area;
+ };
++#define const_max(p, q) ((p > q) ? p : q)
++
++enum {
++        DX_MAX_IKEY_SIZE   = 32, /* be generous */
++        /*
++         * Hack to avoid dynamic allocation and freeing of ipd.
++         */
++        DX_IPD_MAX_SIZE    = const_max(sizeof(struct iam_path_compat),
++                                       DX_MAX_IKEY_SIZE * DX_SCRATCH_KEYS +
++                                       sizeof(struct iam_path_descr))
++};
++
+ /*
+  * iam cursor (iterator) api.
+  */
+@@ -554,6 +587,7 @@ struct iam_iterator {
+ void iam_path_init(struct iam_path *path, struct iam_container *c,
+                  struct iam_path_descr *pd);
+ void iam_path_fini(struct iam_path *path);
++void iam_path_release(struct iam_path *path);
+ void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode);
+ void iam_path_compat_fini(struct iam_path_compat *path);
+@@ -683,12 +717,6 @@ static inline int iam_ikeycmp(const stru
+       return c->ic_descr->id_ops->id_ikeycmp(c, k1, k2);
+ }
+-static inline void iam_reccpy(const struct iam_path *p, struct iam_rec *rec_dst,
+-                            const struct iam_rec *rec_src)
+-{
+-      memcpy(rec_dst, rec_src, iam_path_descr(p)->id_rec_size);
+-}
+-
+ static inline void *iam_entry_off(struct iam_entry *entry, size_t off)
+ {
+       return (void *)((char *)entry + off);
+@@ -720,6 +748,11 @@ iam_leaf_ops(const struct iam_leaf *leaf
+         return iam_leaf_descr(leaf)->id_leaf_ops;
+ }
++static inline void iam_reccpy(const struct iam_leaf *leaf,
++                              struct iam_rec *rec_dst)
++{
++        iam_leaf_ops(leaf)->rec_get(leaf, rec_dst);
++}
+ /*XXX These stuff put here, just because they are used by iam.c and namei.c*/
+ static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry)
+@@ -848,7 +881,36 @@ static inline struct iam_ikey *iam_path_
+       return path->ip_data->ipd_key_scratch[nr];
+ }
+-int dx_lookup(struct iam_path *path);
++static inline struct dynlock *path_dynlock(struct iam_path *path)
++{
++        return &EXT3_I(iam_path_obj(path))->i_htree_lock;
++}
++
++static inline int iam_leaf_is_locked(const struct iam_leaf *leaf)
++{
++        int result;
++
++        result = dynlock_is_locked(path_dynlock(leaf->il_path),
++                                   leaf->il_curidx);
++        if (!result)
++                dump_stack();
++        return result;
++}
++
++static inline int iam_frame_is_locked(struct iam_path *path,
++                                      const struct iam_frame *frame)
++{
++        int result;
++
++        result = dynlock_is_locked(path_dynlock(path), frame->curidx);
++        if (!result)
++                dump_stack();
++        return result;
++}
++
++int dx_lookup_lock(struct iam_path *path,
++                 struct dynlock_handle **dl, enum dynlock_type lt);
++
+ void dx_insert_block(struct iam_path *path, struct iam_frame *frame,
+                    u32 hash, u32 block);
+ int dx_index_is_compat(struct iam_path *path);
+@@ -858,7 +920,8 @@ int ext3_htree_next_block(struct inode *
+ struct buffer_head *ext3_append(handle_t *handle, struct inode *inode,
+                               u32 *block, int *err);
+-int split_index_node(handle_t *handle, struct iam_path *path);
++int split_index_node(handle_t *handle, struct iam_path *path,
++                   struct dynlock_handle **lh);
+ struct ext3_dir_entry_2 *split_entry(struct inode *dir,
+                                    struct ext3_dir_entry_2 *de,
+                                    unsigned long ino, mode_t mode,
+@@ -874,6 +937,10 @@ struct ext3_dir_entry_2 *move_entries(st
+ extern struct iam_descr iam_htree_compat_param;
++struct dynlock_handle *dx_lock_htree(struct inode *dir, unsigned long value,
++                                   enum dynlock_type lt);
++void dx_unlock_htree(struct inode *dir, struct dynlock_handle *lh);
++
+ /*
+  * external
+  */
+@@ -889,7 +956,7 @@ int iam_read_leaf(struct iam_path *p);
+ int iam_node_read(struct iam_container *c, iam_ptr_t ptr,
+                 handle_t *handle, struct buffer_head **bh);
+-void iam_insert_key(struct iam_path *path, struct iam_frame *frame,
++void iam_insert_key_lock(struct iam_path *path, struct iam_frame *frame,
+                   const struct iam_ikey *key, iam_ptr_t ptr);
+ int  iam_leaf_at_end(const struct iam_leaf *l);
diff --git a/ldiskfs/kernel_patches/patches/ext3-tall-htree.patch b/ldiskfs/kernel_patches/patches/ext3-tall-htree.patch
new file mode 100644 (file)
index 0000000..5021759
--- /dev/null
@@ -0,0 +1,431 @@
+Index: linux-2.6.9/fs/ext3/namei.c
+===================================================================
+--- linux-2.6.9.orig/fs/ext3/namei.c   2006-04-23 22:35:38.000000000 +0800
++++ linux-2.6.9/fs/ext3/namei.c        2006-04-23 22:35:47.000000000 +0800
+@@ -48,6 +48,11 @@
+ #define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+ #define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
++/*
++ * Maximal number of non-leaf levels in htree. In the stock ext3 this is 2.
++ */
++#define DX_MAX_TREE_HEIGHT (5)
++
+ static struct buffer_head *ext3_append(handle_t *handle,
+                                       struct inode *inode,
+                                       u32 *block, int *err)
+@@ -75,7 +80,7 @@
+ #ifdef DX_DEBUG
+ #define dxtrace(command) command
+ #else
+-#define dxtrace(command) 
++#define dxtrace(command)
+ #endif
+ struct fake_dirent
+@@ -168,7 +173,7 @@
+ static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
+ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+                                struct dx_frame *frame,
+-                               struct dx_frame *frames, 
++                               struct dx_frame *frames,
+                                __u32 *start_hash);
+ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
+                      struct ext3_dir_entry_2 **res_dir, int *err);
+@@ -249,7 +254,7 @@
+ }
+ struct stats
+-{ 
++{
+       unsigned names;
+       unsigned space;
+       unsigned bcount;
+@@ -367,7 +372,7 @@
+               goto fail;
+       }
+-      if ((indirect = root->info.indirect_levels) > 1) {
++      if ((indirect = root->info.indirect_levels) > DX_MAX_TREE_HEIGHT - 1) {
+               ext3_warning(dir->i_sb, __FUNCTION__,
+                            "Unimplemented inode hash depth: %#06x",
+                            root->info.indirect_levels);
+@@ -436,12 +441,15 @@
+ static void dx_release (struct dx_frame *frames)
+ {
++      int height;
++
+       if (frames[0].bh == NULL)
+               return;
+-
+-      if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
+-              brelse(frames[1].bh);
+-      brelse(frames[0].bh);
++      height = ((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels;
++      for (; height >= 0; height--) {
++              assert(frames[height].bh != NULL);
++              brelse(frames[height].bh);
++      }
+ }
+ /*
+@@ -463,7 +471,7 @@
+  */
+ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+                                struct dx_frame *frame,
+-                               struct dx_frame *frames, 
++                               struct dx_frame *frames,
+                                __u32 *start_hash)
+ {
+       struct dx_frame *p;
+@@ -582,7 +590,7 @@
+ {
+       struct dx_hash_info hinfo;
+       struct ext3_dir_entry_2 *de;
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame;
+       struct inode *dir;
+       int block, err;
+       int count = 0;
+@@ -627,7 +635,7 @@
+               }
+               count += ret;
+               hashval = ~0;
+-              ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, 
++              ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
+                                           frame, frames, &hashval);
+               *next_hash = hashval;
+               if (ret < 0) {
+@@ -644,7 +652,7 @@
+                       break;
+       }
+       dx_release(frames);
+-      dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", 
++      dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
+                      count, *next_hash));
+       return count;
+ errout:
+@@ -918,7 +926,7 @@
+       struct super_block * sb;
+       struct dx_hash_info     hinfo;
+       u32 hash;
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame;
+       struct ext3_dir_entry_2 *de, *top;
+       struct buffer_head *bh;
+       unsigned long block;
+@@ -1037,7 +1045,7 @@
+               parent = ERR_PTR(-ENOMEM);
+       }
+       return parent;
+-} 
++}
+ #define S_SHIFT 12
+ static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = {
+@@ -1098,6 +1106,8 @@
+       return prev;
+ }
++/* Allocate new node, and split leaf node @bh into it, inserting new pointer
++ * into parent node identified by @frame */
+ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
+                       struct buffer_head **bh,struct dx_frame *frame,
+                       struct dx_hash_info *hinfo, int *error)
+@@ -1185,7 +1195,7 @@
+  * add_dirent_to_buf will attempt search the directory block for
+  * space.  It will return -ENOSPC if no space is available, and -EIO
+  * and -EEXIST if directory entry already exists.
+- * 
++ *
+  * NOTE!  bh is NOT released in the case where ENOSPC is returned.  In
+  * all other cases bh is released.
+  */
+@@ -1286,7 +1296,7 @@
+       int             namelen = dentry->d_name.len;
+       struct buffer_head *bh2;
+       struct dx_root  *root;
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame;
+       struct dx_entry *entries;
+       struct ext3_dir_entry_2 *de, *de2;
+       char            *data1, *top;
+@@ -1427,20 +1437,29 @@
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+                            struct inode *inode)
+ {
+-      struct dx_frame frames[2], *frame;
+-      struct dx_entry *entries, *at;
++      struct dx_frame frames[DX_MAX_TREE_HEIGHT] = {{0,},}, *frame, *safe;
++      struct dx_node *node2;
++      struct dx_entry *entries;   /* old block contents */
++      struct dx_entry *entries2;  /* new block contents */
+       struct dx_hash_info hinfo;
+       struct buffer_head * bh;
++      struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0};
+       struct inode *dir = dentry->d_parent->d_inode;
+       struct super_block * sb = dir->i_sb;
+       struct ext3_dir_entry_2 *de;
++      u32 newblock[DX_MAX_TREE_HEIGHT] = {0};
+       int err;
++      int nr_splet;
++      int i;
++      size_t isize;
+       frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
+       if (!frame)
+               return err;
+       entries = frame->entries;
+-      at = frame->at;
++
++      /* XXX nikita: global serialization! */
++      isize = dir->i_size;
+       if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
+               goto cleanup;
+@@ -1456,29 +1475,43 @@
+               goto cleanup;
+       }
++      /*
++       * Tall-tree handling: we might have to split multiple index blocks
++       * all the way up to tree root. Tricky point here is error handling:
++       * to avoid complicated undo/rollback we
++       *
++       *   - first allocate all necessary blocks
++       *
++       *   - insert pointers into them atomically.
++       *
++       * XXX nikita: this algorithm is *not* scalable, as it assumes that at
++       * least nodes in the path are locked.
++       */
++
+       /* Block full, should compress but for now just split */
+       dxtrace(printk("using %u of %u node entries\n",
+                      dx_get_count(entries), dx_get_limit(entries)));
+-      /* Need to split index? */
+-      if (dx_get_count(entries) == dx_get_limit(entries)) {
+-              u32 newblock;
+-              unsigned icount = dx_get_count(entries);
+-              int levels = frame - frames;
+-              struct dx_entry *entries2;
+-              struct dx_node *node2;
+-              struct buffer_head *bh2;
+-              if (levels && (dx_get_count(frames->entries) ==
+-                             dx_get_limit(frames->entries))) {
++      /* What levels need split? */
++      for (nr_splet = 0; frame >= frames &&
++           dx_get_count(frame->entries) == dx_get_limit(frame->entries);
++           --frame, ++nr_splet) {
++              if (nr_splet == DX_MAX_TREE_HEIGHT) {
+                       ext3_warning(sb, __FUNCTION__,
+                                    "Directory index full!\n");
+                       err = -ENOSPC;
+                       goto cleanup;
+               }
+-              bh2 = ext3_append (handle, dir, &newblock, &err);
+-              if (!(bh2))
++      }
++
++      safe = frame;
++      /* Go back down, allocating blocks, and adding blocks into
++       * transaction... */
++      for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
++              bh_new[i] = ext3_append (handle, dir, &newblock[i], &err);
++              if (!bh_new[i])
+                       goto cleanup;
+-              node2 = (struct dx_node *)(bh2->b_data);
++              node2 = (struct dx_node *)(bh_new[i]->b_data);
+               entries2 = node2->entries;
+               node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
+               node2->fake.inode = 0;
+@@ -1486,72 +1519,112 @@
+               err = ext3_journal_get_write_access(handle, frame->bh);
+               if (err)
+                       goto journal_error;
+-              if (levels) {
+-                      unsigned icount1 = icount/2, icount2 = icount - icount1;
+-                      unsigned hash2 = dx_get_hash(entries + icount1);
+-                      dxtrace(printk("Split index %i/%i\n", icount1, icount2));
+-
+-                      BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
+-                      err = ext3_journal_get_write_access(handle,
+-                                                           frames[0].bh);
++      }
++      /* Add "safe" node to transaction too */
++      if (safe + 1 != frames) {
++              err = ext3_journal_get_write_access(handle, safe->bh);
++              if (err)
++                      goto journal_error;
++      }
++
++      /* Go through nodes once more, inserting pointers */
++      for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
++              unsigned count;
++              int idx;
++              struct buffer_head *bh2;
++
++              entries = frame->entries;
++              count = dx_get_count(entries);
++              idx = frame->at - entries;
++
++              bh2 = bh_new[i];
++              node2 = (struct dx_node *)(bh2->b_data);
++              entries2 = node2->entries;
++
++              if (frame == frames) {
++                      /* splitting root node. Tricky point:
++                       *
++                       * In the "normal" B-tree we'd split root *and* add
++                       * new root to the tree with pointers to the old root
++                       * and its sibling (thus introducing two new nodes).
++                       *
++                       * In htree it's enough to add one node, because
++                       * capacity of the root node is smaller than that of
++                       * non-root one.
++                       */
++                      struct dx_root *root;
++                      u8 indirects;
++
++                      root = (struct dx_root *) frames->bh->b_data;
++                      indirects = root->info.indirect_levels;
++                      dxtrace(printk("Creating new root %d\n", indirects));
++                      memcpy((char *) entries2, (char *) entries,
++                             count * sizeof(struct dx_entry));
++                      dx_set_limit(entries2, dx_node_limit(dir));
++
++                      /* Set up root */
++                      dx_set_count(entries, 1);
++                      dx_set_block(entries + 0, newblock[i]);
++                      root->info.indirect_levels = indirects + 1;
++
++                      /* Shift frames in the path */
++                      memmove(frames + 2, frames + 1,
++                              (sizeof frames) - 2 * sizeof frames[0]);
++                      /* Add new access path frame */
++                      frames[1].at = entries2 + idx;
++                      frames[1].entries = entries = entries2;
++                      frames[1].bh = bh2;
++                      ++ frame;
++                      bh_new[i] = NULL; /* buffer head is "consumed" */
++                      err = ext3_journal_get_write_access(handle, bh2);
+                       if (err)
+                               goto journal_error;
+-
+-                      memcpy ((char *) entries2, (char *) (entries + icount1),
+-                              icount2 * sizeof(struct dx_entry));
+-                      dx_set_count (entries, icount1);
+-                      dx_set_count (entries2, icount2);
++              } else {
++                      /* splitting non-root index node. */
++                      unsigned count1 = count/2, count2 = count - count1;
++                      unsigned hash2 = dx_get_hash(entries + count1);
++                      dxtrace(printk("Split index %i/%i\n", count1, count2));
++
++                      memcpy ((char *) entries2, (char *) (entries + count1),
++                              count2 * sizeof(struct dx_entry));
++                      dx_set_count (entries, count1);
++                      dx_set_count (entries2, count2);
+                       dx_set_limit (entries2, dx_node_limit(dir));
+                       /* Which index block gets the new entry? */
+-                      if (at - entries >= icount1) {
+-                              frame->at = at = at - entries - icount1 + entries2;
++                      if (idx >= count1) {
++                              frame->at = entries2 + idx - count1;
+                               frame->entries = entries = entries2;
+                               swap(frame->bh, bh2);
++                              bh_new[i] = bh2;
+                       }
+-                      dx_insert_block (frames + 0, hash2, newblock);
+-                      dxtrace(dx_show_index ("node", frames[1].entries));
++                      dx_insert_block (frame - 1, hash2, newblock[i]);
++                      dxtrace(dx_show_index ("node", frame->entries));
+                       dxtrace(dx_show_index ("node",
+                              ((struct dx_node *) bh2->b_data)->entries));
+                       err = ext3_journal_dirty_metadata(handle, bh2);
+                       if (err)
+                               goto journal_error;
+-                      brelse (bh2);
+-              } else {
+-                      dxtrace(printk("Creating second level index...\n"));
+-                      memcpy((char *) entries2, (char *) entries,
+-                             icount * sizeof(struct dx_entry));
+-                      dx_set_limit(entries2, dx_node_limit(dir));
+-
+-                      /* Set up root */
+-                      dx_set_count(entries, 1);
+-                      dx_set_block(entries + 0, newblock);
+-                      ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
+-
+-                      /* Add new access path frame */
+-                      frame = frames + 1;
+-                      frame->at = at = at - entries + entries2;
+-                      frame->entries = entries = entries2;
+-                      frame->bh = bh2;
+-                      err = ext3_journal_get_write_access(handle,
+-                                                           frame->bh);
+-                      if (err)
+-                              goto journal_error;
+               }
+-              ext3_journal_dirty_metadata(handle, frames[0].bh);
+       }
+-      de = do_split(handle, dir, &bh, frame, &hinfo, &err);
++      de = do_split(handle, dir, &bh, --frame, &hinfo, &err);
+       if (!de)
+               goto cleanup;
+       err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+-      bh = NULL;
+-      goto cleanup;
++      goto cleanup2;
+ journal_error:
+       ext3_std_error(dir->i_sb, err);
+ cleanup:
+       if (bh)
+               brelse(bh);
++cleanup2:
++      for (i = 0; i < ARRAY_SIZE(bh_new); ++i) {
++              if (bh_new[i] != NULL)
++                      brelse(bh_new[i]);
++      }
++      if (err)
++              inode->i_size = isize;
+       dx_release(frames);
+       return err;
+ }
+@@ -1561,7 +1634,7 @@
+  * ext3_delete_entry deletes a directory entry by merging it with the
+  * previous entry
+  */
+-static int ext3_delete_entry (handle_t *handle, 
++static int ext3_delete_entry (handle_t *handle,
+                             struct inode * dir,
+                             struct ext3_dir_entry_2 * de_del,
+                             struct buffer_head * bh)
+@@ -1821,7 +1894,7 @@
+       de1 = (struct ext3_dir_entry_2 *)
+                       ((char *) de + le16_to_cpu(de->rec_len));
+       if (le32_to_cpu(de->inode) != inode->i_ino ||
+-                      !le32_to_cpu(de1->inode) || 
++                      !le32_to_cpu(de1->inode) ||
+                       strcmp (".", de->name) ||
+                       strcmp ("..", de1->name)) {
+               ext3_warning (inode->i_sb, "empty_dir",
+@@ -1891,7 +1964,7 @@
+        * being truncated, or files being unlinked. */
+       /* @@@ FIXME: Observation from aviro:
+-       * I think I can trigger J_ASSERT in ext3_orphan_add().  We block 
++       * I think I can trigger J_ASSERT in ext3_orphan_add().  We block
+        * here (on lock_super()), so race with ext3_link() which might bump
+        * ->i_nlink. For, say it, character device. Not a regular file,
+        * not a directory, not a symlink and ->i_nlink > 0.
+@@ -2415,4 +2488,4 @@
+       .removexattr    = generic_removexattr,
+ #endif
+       .permission     = ext3_permission,
+-}; 
++};
index f71e470..adba428 100644 (file)
@@ -91,17 +91,9 @@ diff -urp RH_2_6_9_42_0_3.orig/fs/ext3/ioctl.c RH_2_6_9_42_0_3/fs/ext3/ioctl.c
 diff -urp RH_2_6_9_42_0_3.orig/fs/ext3/namei.c RH_2_6_9_42_0_3/fs/ext3/namei.c
 --- RH_2_6_9_42_0_3.orig/fs/ext3/namei.c       2006-10-23 13:32:59.000000000 +0300
 +++ RH_2_6_9_42_0_3/fs/ext3/namei.c    2007-02-22 18:58:13.000000000 +0200
-@@ -97,6 +97,7 @@ struct dx_entry
-       __le32 block;
- };
-+
- /*
-  * dx_root_info is laid out so that if it should somehow get overlaid by a
-  * dirent the two low bits of the hash version will be zero.  Therefore, the
-@@ -141,6 +142,14 @@ struct dx_map_entry
-       u32 offs;
- };
+@@ -1624,6 +1633,28 @@ static int ext3_add_nondir(handle_t *han
+       return err;
+ }
  
 +#define LVFS_DENTRY_PARAM_MAGIC               20070216UL
 +struct lvfs_dentry_params
@@ -110,14 +102,7 @@ diff -urp RH_2_6_9_42_0_3.orig/fs/ext3/namei.c RH_2_6_9_42_0_3/fs/ext3/namei.c
 +      void            *p_ptr; 
 +      u32             magic;
 +};
-+
- #ifdef CONFIG_EXT3_INDEX
- static inline unsigned dx_get_block (struct dx_entry *entry);
- static void dx_set_block (struct dx_entry *entry, unsigned value);
-@@ -1624,6 +1633,20 @@ static int ext3_add_nondir(handle_t *han
-       return err;
- }
++ 
 +static struct inode * ext3_new_inode_wantedi(handle_t *handle, struct inode *dir,
 +                                              int mode, struct dentry *dentry)
 +{
index 7c2246f..a4cc364 100644 (file)
@@ -14,8 +14,19 @@ ext3-mballoc3-core.patch
 ext3-mballoc3-rhel4.patch 
 ext3-nlinks-2.6.9.patch
 ext3-ialloc-2.6.patch
-ext3-lookup-dotdot-2.6.9.patch
-ext3-sector_t-overflow-2.6.9-rhel4.patch
-ext3-check-jbd-errors-2.6.9.patch
+ext3-tall-htree.patch
+ext3-htree-path.patch
+ext3-htree-r5-hash.patch
+ext3-htree-path-ops.patch
+ext3-hash-selection.patch
+ext3-htree-comments.patch
+ext3-lookup-dotdot-2.6.9.patch 
+ext3-sector_t-overflow-2.6.9-rhel4.patch 
+ext3-check-jbd-errors-2.6.9.patch 
 ext3-uninit-2.6.9.patch
 ext3-nanosecond-2.6-rhel4.patch
+ext3-iam-ops.patch 
+ext3-iam-separate.patch 
+ext3-iam-uapi.patch 
+ext3-orphans-delay.patch
+ext3-pdirops-2.6.9.patch