--- /dev/null
+Index: linux-2.6.9/fs/ext3/hash.c
+===================================================================
+--- linux-2.6.9.orig/fs/ext3/hash.c 2006-04-23 22:39:01.000000000 +0800
++++ linux-2.6.9/fs/ext3/hash.c 2006-04-23 22:39:16.000000000 +0800
+@@ -127,6 +127,11 @@
+ return a;
+ }
+
++static __u32 dx_same_hash(const signed char *msg, int len)
++{
++ return 0xcafebabeUL; /* constant: every name hashes into the same bucket */
++}
++
+ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+ {
+ __u32 pad, val;
+@@ -220,6 +225,9 @@
+ case DX_HASH_R5:
+ hash = dx_r5_hash(name, len);
+ break;
++ case DX_HASH_SAME:
++ hash = dx_same_hash(name, len);
++ break;
+ default:
+ hinfo->hash = 0;
+ return -1;
+Index: linux-2.6.9/fs/ext3/super.c
+===================================================================
+--- linux-2.6.9.orig/fs/ext3/super.c 2006-08-17 09:26:01.000000000 +0300
++++ linux-2.6.9/fs/ext3/super.c 2006-08-17 09:31:22.000000000 +0300
+@@ -599,6 +599,7 @@ enum {
+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+ Opt_extents, Opt_noextents, Opt_extdebug,
+ Opt_mballoc, Opt_nomballoc, Opt_stripe,
++ Opt_hashfunc,
+ };
+
+ static match_table_t tokens = {
+@@ -655,6 +656,7 @@ static match_table_t tokens = {
+ {Opt_stripe, "stripe=%u"},
++ {Opt_hashfunc, "hash=%s"},
+ {Opt_err, NULL},
+ {Opt_resize, "resize"},
+ };
+
+ static unsigned long get_sb_block(void **data)
+@@ -679,6 +681,7 @@ static unsigned long get_sb_block(void *
+ return sb_block;
+ }
+
++int user_selected_hash_function = -1; /* -1: keep the filesystem default */
+ static int parse_options (char * options, struct super_block *sb,
+ unsigned long * inum, unsigned long *n_blocks_count, int is_remount)
+ {
+@@ -980,6 +983,22 @@ clear_qf_name:
+ return 0;
+ sbi->s_stripe = option;
+ break;
++ case Opt_hashfunc:
++ if (strncmp(args[0].from, "legacy", 6) == 0) {
++ user_selected_hash_function = DX_HASH_LEGACY;
++ } else if (strncmp(args[0].from, "half_md4", 8) == 0) {
++ user_selected_hash_function = DX_HASH_HALF_MD4;
++ } else if (strncmp(args[0].from, "tea", 3) == 0) {
++ user_selected_hash_function = DX_HASH_TEA;
++ } else if (strncmp(args[0].from, "r5", 2) == 0) {
++ user_selected_hash_function = DX_HASH_R5;
++ } else if (strncmp(args[0].from, "same", 4) == 0) {
++ user_selected_hash_function = DX_HASH_SAME;
++ } else {
++ printk(KERN_ERR "EXT3-fs: unrecognized hash function name\n");
++ return 0;
++ }
++ break;
+ default:
+ printk (KERN_ERR
+ "EXT3-fs: Unrecognized mount option \"%s\" "
+Index: linux-2.6.9/fs/ext3/namei.c
+===================================================================
+--- linux-2.6.9.orig/fs/ext3/namei.c 2006-04-23 22:39:02.000000000 +0800
++++ linux-2.6.9/fs/ext3/namei.c 2006-04-23 22:39:16.000000000 +0800
+@@ -365,10 +365,7 @@
+ struct htree_cookie *hc = cookie;
+
+ root = data;
+- if (root->info.hash_version != DX_HASH_TEA &&
+- root->info.hash_version != DX_HASH_HALF_MD4 &&
+- root->info.hash_version != DX_HASH_R5 &&
+- root->info.hash_version != DX_HASH_LEGACY) {
++ if (root->info.hash_version > DX_HASH_MAX) {
+ ext3_warning(sb, __FUNCTION__,
+ "Unrecognised inode hash code %d",
+ root->info.hash_version);
+@@ -1467,6 +1464,7 @@
+ * This converts a one block unindexed directory to a 3 block indexed
+ * directory, and adds the dentry to the indexed directory.
+ */
++extern int user_selected_hash_function;
+ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
+ struct inode *inode, struct buffer_head *bh)
+ {
+@@ -1522,7 +1520,9 @@
+ memset (&root->info, 0, sizeof(root->info));
+ root->info.info_length = sizeof(root->info);
+ root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+- root->info.hash_version = DX_HASH_R5;
++ if (user_selected_hash_function >= 0 &&
++ user_selected_hash_function <= DX_HASH_MAX)
++ root->info.hash_version = user_selected_hash_function;
+ entries = (void *)root->entries;
+ dx_set_block (&path, entries, 1);
+ dx_set_count (entries, 1);
+Index: linux-2.6.9/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.9.orig/include/linux/ext3_fs.h 2006-04-23 22:39:01.000000000 +0800
++++ linux-2.6.9/include/linux/ext3_fs.h 2006-04-23 22:39:16.000000000 +0800
+@@ -665,6 +665,8 @@
+ #define DX_HASH_HALF_MD4 1
+ #define DX_HASH_TEA 2
+ #define DX_HASH_R5 3
++#define DX_HASH_SAME 4
++#define DX_HASH_MAX 4
+
+ /* hash info structure used by the directory hash */
+ struct dx_hash_info
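
The strncmp() chain in the Opt_hashfunc case above is, in effect, a five-entry
name-to-version table. Below is a stand-alone C sketch of that mapping (not
part of the patch; parse_hash_name() is a hypothetical helper, and the numeric
values mirror the DX_HASH_* definitions added to ext3_fs.h):

#include <string.h>

static const struct {
	const char *name;
	int version;
} hash_names[] = {
	{ "legacy",   0 },	/* DX_HASH_LEGACY   */
	{ "half_md4", 1 },	/* DX_HASH_HALF_MD4 */
	{ "tea",      2 },	/* DX_HASH_TEA      */
	{ "r5",       3 },	/* DX_HASH_R5       */
	{ "same",     4 },	/* DX_HASH_SAME     */
};

/* Returns the DX_HASH_* value for @arg, or -1 to keep the default. */
static int parse_hash_name(const char *arg)
{
	size_t i;

	for (i = 0; i < sizeof hash_names / sizeof hash_names[0]; i++)
		if (strcmp(arg, hash_names[i].name) == 0)
			return hash_names[i].version;
	return -1;
}

One behavioural detail of the in-kernel version: it compares only a prefix
(strncmp() with the pattern's length), so "hash=teaxyz" would be accepted as
"tea"; the strcmp() above insists on an exact name.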
--- /dev/null
+Index: linux-2.6.9/fs/ext3/namei.c
+===================================================================
+--- linux-2.6.9.orig/fs/ext3/namei.c 2006-04-17 18:32:12.000000000 +0800
++++ linux-2.6.9/fs/ext3/namei.c 2006-04-23 21:40:41.000000000 +0800
+@@ -24,6 +24,65 @@
+ * Theodore Ts'o, 2002
+ */
+
++/*
++ * iam: big theory statement.
++ *
++ * iam (Index Access Module) is a module providing an abstraction of a
++ * persistent transactional container on top of a generalized ext3 htree.
++ *
++ * iam supports:
++ *
++ * - key, pointer, and record size specifiable per container.
++ *
++ * - trees taller than 2 index levels.
++ *
++ * - read/write to existing ext3 htree directories as iam containers.
++ *
++ * iam container is a tree, consisting of leaf nodes containing keys and
++ * records stored in this container, and index nodes, containing keys and
++ * pointers to leaf or index nodes.
++ *
++ * iam does not work with keys directly; instead it calls a user-supplied key
++ * comparison function (->id_keycmp()).
++ *
++ * Pointers are (currently) interpreted as logical offsets (measured in
++ * blocks) within the underlying flat file on top of which the iam tree lives.
++ *
++ * On-disk format:
++ *
++ * iam mostly tries to reuse existing htree formats.
++ *
++ * Format of index node:
++ *
++ * +-----+-------+-------+-------+------+-------+------------+
++ * | | count | | | | | |
++ * | gap | / | entry | entry | .... | entry | free space |
++ * | | limit | | | | | |
++ * +-----+-------+-------+-------+------+-------+------------+
++ *
++ * gap this part of node is never accessed by iam code. It
++ * exists for binary compatibility with ext3 htree (that,
++ * in turn, stores fake struct ext2_dirent for ext2
++ * compatibility), and to keep some unspecified per-node
++ * data. Gap can be different for root and non-root index
++ * nodes. Gap size can be specified for each container
++ * (gap of 0 is allowed).
++ *
++ * count/limit current number of entries in this node, and the maximal
++ * number of entries that can fit into node. count/limit
++ * has the same size as entry, and is itself counted in
++ * count.
++ *
++ * entry index entry: consists of a key immediately followed by
++ * a pointer to a child node. Size of a key and size of a
++ * pointer depend on the container. Entry has neither
++ * alignment nor padding.
++ *
++ * free space portion of the node to which new entries are added
++ *
++ * Entries in index node are sorted by their key value.
++ */
++
+ #include <linux/fs.h>
+ #include <linux/pagemap.h>
+ #include <linux/jbd.h>
+@@ -98,14 +170,6 @@
+ __le16 count;
+ };
+
+-struct dx_entry; /* incomplete type */
+-struct dx_key; /* incomplete type */
+-
+-struct dx_entry_compat {
+- __le32 hash;
+- __le32 block;
+-};
+-
+ /*
+ * dx_root_info is laid out so that if it should somehow get overlaid by a
+ * dirent the two low bits of the hash version will be zero. Therefore, the
+@@ -135,111 +199,507 @@
+ struct {} entries[0];
+ };
+
+-
+-struct dx_frame
+-{
+- struct buffer_head *bh;
+- struct dx_entry *entries;
+- struct dx_entry *at;
+-};
+-
+ struct dx_map_entry
+ {
+ u32 hash;
+ u32 offs;
+ };
+
+-struct dx_path;
+-struct dx_param {
+- size_t dpo_key_size;
+- size_t dpo_ptr_size;
+- size_t dpo_node_gap;
+- size_t dpo_root_gap;
+-
+- u32 (*dpo_root_ptr)(struct dx_path *path);
+- int (*dpo_node_check)(struct dx_path *path,
+- struct dx_frame *frame, void *cookie);
+- int (*dpo_node_init)(struct dx_path *path,
+- struct buffer_head *bh, int root);
+- int (*dpo_keycmp)(struct dx_path *path,
+- struct dx_key *k1, struct dx_key *k2);
++/*
++ * Entry within index tree node. Consists of a key immediately followed
++ * (without padding) by a pointer to the child node.
++ *
++ * Both key and pointer are of variable size, hence incomplete type.
++ */
++struct iam_entry;
++
++struct iam_entry_compat {
++ __le32 hash;
++ __le32 block;
++};
++
++/*
++ * Incomplete type used to refer to keys in iam container.
++ *
++ * As key size can be different from container to container, iam has to use
++ * incomplete type. Clients cast pointer to iam_key to real key type and back.
++ */
++struct iam_key;
++
++/* Incomplete type use to refer to the records stored in iam containers. */
++struct iam_rec;
++
++typedef __u64 iam_ptr_t;
++
++/*
++ * Index node traversed during tree lookup.
++ */
++struct iam_frame {
++ struct buffer_head *bh; /* buffer holding node data */
++ struct iam_entry *entries; /* array of entries */
++ struct iam_entry *at; /* target entry, found by binary search */
++};
++
++/* leaf node reached by tree lookup */
++struct iam_leaf {
++ struct buffer_head *bh;
++ struct iam_leaf_entry *entries;
++ struct iam_leaf_entry *at;
++};
++
++struct iam_path;
++struct iam_container;
++
++/*
++ * Parameters, describing a flavor of iam container.
++ */
++struct iam_descr {
++ /*
++ * Size of a key in this container, in bytes.
++ */
++ size_t id_key_size;
++ /*
++ * Size of a pointer to the next level (stored in index nodes), in
++ * bytes.
++ */
++ size_t id_ptr_size;
++ /*
++ * Size of a record (stored in leaf nodes), in bytes.
++ */
++ size_t id_rec_size;
++ /*
++ * Size of unused (by iam) space at the beginning of every non-root
++ * node, in bytes. Used for compatibility with ext3.
++ */
++ size_t id_node_gap;
++ /*
++ * Size of unused (by iam) space at the beginning of root node, in
++ * bytes. Used for compatibility with ext3.
++ */
++ size_t id_root_gap;
++
++ /*
++ * Returns pointer (in the same sense as pointer in index entry) to
++ * the root node.
++ */
++ __u32 (*id_root_ptr)(struct iam_container *c);
++
++ /*
++ * Check validity and consistency of index node. This is called when
++ * iam just loaded new node into frame.
++ */
++ int (*id_node_check)(struct iam_path *path, struct iam_frame *frame);
++ /*
++ * Initialize new node (stored in @bh) that is going to be added into
++ * tree.
++ */
++ int (*id_node_init)(struct iam_container *c,
++ struct buffer_head *bh, int root);
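++ /*
++ * Read the node at @ptr into @bh (transaction @h may be NULL).
++ */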
++ int (*id_node_read)(struct iam_container *c, iam_ptr_t ptr,
++ handle_t *h, struct buffer_head **bh);
++ /*
++ * Key comparison function. Returns -1, 0, +1.
++ */
++ int (*id_keycmp)(struct iam_container *c,
++ struct iam_key *k1, struct iam_key *k2);
++ /*
++ * Create new container.
++ *
++ * Newly created container has a root node and a single leaf. Leaf
++ * contains single record with the smallest possible key.
++ */
++ int (*id_create)(struct iam_container *c);
++ struct {
++ /*
++ * leaf operations.
++ */
++ /*
++ * returns true iff leaf is positioned at the last entry.
++ */
++ int (*at_end)(struct iam_container *c, struct iam_leaf *l);
++ /* position leaf at the first entry */
++ void (*start)(struct iam_container *c, struct iam_leaf *l);
++ /* move leaf to the next entry. */
++ void (*next)(struct iam_container *c, struct iam_leaf *l);
++ /* return key of current leaf record in @k */
++ void (*key)(struct iam_container *c, struct iam_leaf *l,
++ struct iam_key *k);
++ /* return pointer to entry body */
++ struct iam_rec *(*rec)(struct iam_container *c,
++ struct iam_leaf *l);
++ } id_leaf;
++};
++
++struct iam_container {
++ /*
++ * Underlying flat file. IO against this object is issued to
++ * read/write nodes.
++ */
++ struct inode *ic_object;
++ /*
++ * container flavor.
++ */
++ struct iam_descr *ic_descr;
++ /*
++ * pointer to flavor-specific per-container data.
++ */
++ void *ic_descr_data;
+ };
+
+ /*
+ * Structure to keep track of a path drilled through htree.
+ */
+-struct dx_path {
+- struct inode *dp_object;
+- struct dx_param *dp_param;
+- int dp_indirect;
+- struct dx_frame dp_frames[DX_MAX_TREE_HEIGHT];
+- struct dx_frame *dp_frame;
+- struct dx_key *dp_key_target;
+- struct dx_key *dp_key_scratch[DX_SCRATCH_KEYS];
+-};
+-
+-struct dx_path_compat {
+- struct dx_path dpc_path;
+- __u32 dpc_scrach[DX_SCRATCH_KEYS];
+-};
+-
+-static u32 htree_root_ptr(struct dx_path *p);
+-static int htree_node_check(struct dx_path *path,
+- struct dx_frame *frame, void *cookie);
+-static int htree_node_init(struct dx_path *path,
++struct iam_path {
++ /*
++ * Parent container.
++ */
++ struct iam_container *ip_container;
++ /*
++ * Number of index levels minus one.
++ */
++ int ip_indirect;
++ /*
++ * Nodes that top-to-bottom traversal passed through.
++ */
++ struct iam_frame ip_frames[DX_MAX_TREE_HEIGHT];
++ /*
++ * Last filled frame in ->ip_frames. Refers to the 'twig' node (one
++ * immediately above leaf).
++ */
++ struct iam_frame *ip_frame;
++ /*
++ * Leaf node: a child of ->ip_frame.
++ */
++ struct iam_leaf *ip_leaf;
++ /*
++ * Key searched for.
++ */
++ struct iam_key *ip_key_target;
++ /*
++ * Scratch-pad area for temporary keys.
++ */
++ struct iam_key *ip_key_scratch[DX_SCRATCH_KEYS];
++ /*
++ * pointer to flavor-specific per-path data.
++ */
++ void *ip_descr_data;
++};
++
++/*
++ * Helper structure for legacy htrees.
++ */
++struct iam_path_compat {
++ struct iam_path ipc_path;
++ struct iam_container ipc_container;
++ __u32 ipc_scrach[DX_SCRATCH_KEYS];
++};
++
++static u32 htree_root_ptr(struct iam_container *c);
++static int htree_node_check(struct iam_path *path, struct iam_frame *frame);
++static int htree_node_init(struct iam_container *c,
+ struct buffer_head *bh, int root);
+-static int htree_keycmp(struct dx_path *path,
+- struct dx_key *k1, struct dx_key *k2);
++static int htree_keycmp(struct iam_container *c,
++ struct iam_key *k1, struct iam_key *k2);
++static int htree_node_read(struct iam_container *c, iam_ptr_t ptr,
++ handle_t *h, struct buffer_head **bh);
++
++/*
++ * Parameters describing iam compatibility mode in which existing ext3 htrees
++ * can be manipulated.
++ */
++static struct iam_descr htree_compat_param = {
++ .id_key_size = sizeof ((struct dx_map_entry *)NULL)->hash,
++ .id_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs,
++ .id_node_gap = offsetof(struct dx_node, entries),
++ .id_root_gap = offsetof(struct dx_root, entries),
++
++ .id_root_ptr = htree_root_ptr,
++ .id_node_check = htree_node_check,
++ .id_node_init = htree_node_init,
++ .id_node_read = htree_node_read,
++ .id_keycmp = htree_keycmp
++};
+
+-static struct dx_param htree_compat_param = {
+- .dpo_key_size = sizeof ((struct dx_map_entry *)NULL)->hash,
+- .dpo_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs,
+- .dpo_node_gap = offsetof(struct dx_node, entries),
+- .dpo_root_gap = offsetof(struct dx_root, entries),
+-
+- .dpo_root_ptr = htree_root_ptr,
+- .dpo_node_check = htree_node_check,
+- .dpo_node_init = htree_node_init,
+- .dpo_keycmp = htree_keycmp
++/*
++ * Initialize container @c, acquires additional reference on @inode.
++ */
++int iam_container_init(struct iam_container *c,
++ struct iam_descr *descr, struct inode *inode);
++/*
++ * Finalize container @c, release all resources.
++ */
++void iam_container_fini(struct iam_container *c);
++
++/*
++ * Search container @c for record with key @k. If record is found, its data
++ * are moved into @r.
++ *
++ * Return values: +ve: found, 0: not-found, -ve: error
++ */
++int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r);
++/*
++ * Insert new record @r with key @k into container @c (within context of
++ * transaction @h.
++ *
++ * Return values: 0: success, -ve: error, including -EEXIST when record with
++ * given key is already present.
++ *
++ * postcondition: ergo(result == 0 || result == -EEXIST,
++ * iam_lookup(c, k, r2) > 0 &&
++ * !memcmp(r, r2, c->ic_descr->id_rec_size));
++ */
++int iam_insert(handle_t *h, struct iam_container *c,
++ struct iam_key *k, struct iam_rec *r);
++/*
++ * Replace existing record with key @k, or insert new one. New record data are
++ * in @r.
++ *
++ * Return values: 0: success, -ve: error.
++ *
++ * postcondition: ergo(result == 0, iam_lookup(c, k, r2) > 0 &&
++ * !memcmp(r, r2, c->ic_descr->id_rec_size));
++ */
++int iam_update(handle_t *h, struct iam_container *c,
++ struct iam_key *k, struct iam_rec *r);
++/*
++ * Delete existing record with key @k.
++ *
++ * Return values: 0: success, -ENOENT: not-found, -ve: other error.
++ *
++ * postcondition: ergo(result == 0 || result == -ENOENT,
++ * !iam_lookup(c, k, *));
++ */
++int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k);
++
++/*
++ * iam cursor (iterator) api.
++ */
++
++/*
++ * Flags controlling iterator functionality.
++ */
++enum iam_it_flags {
++ /*
++ * this iterator will move (iam_it_{prev,next}() will be called on it)
++ */
++ IAM_IT_MOVE = (1 << 0),
++ /*
++ * tree can be updated through this iterator.
++ */
++ IAM_IT_WRITE = (1 << 1)
+ };
+
++/*
++ * States of iterator state machine.
++ */
++enum iam_it_state {
++ /* initial state */
++ IAM_IT_DETACHED,
++ /* iterator is above particular record in the container */
++ IAM_IT_ATTACHED
++};
++
++/*
++ * Iterator.
++ *
++ * Immediately after call to iam_it_init() iterator is in "detached"
++ * (IAM_IT_DETACHED) state: it is associated with given parent container, but
++ * doesn't point to any particular record in this container.
++ *
++ * After successful call to iam_it_get() and until corresponding call to
++ * iam_it_put() iterator is in "attached" state (IAM_IT_ATTACHED).
++ *
++ * Attached iterator can move through records in a container (provided
++ * IAM_IT_MOVE permission) in a key order, can get record and key values as it
++ * passes over them, and can modify container (provided IAM_IT_WRITE
++ * permission).
++ *
++ * Concurrency: iterators are supposed to be local to thread. Interfaces below
++ * do no internal serialization.
++ *
++ */
++struct iam_iterator {
++ /*
++ * iterator flags, taken from enum iam_it_flags.
++ */
++ __u32 ii_flags;
++ enum iam_it_state ii_state;
++ /*
++ * path to the record. Valid in IAM_IT_ATTACHED state.
++ */
++ struct iam_path ii_path;
++};
++
++static inline struct iam_key *keycpy(struct iam_container *c,
++ struct iam_key *k1, struct iam_key *k2)
++{
++ return memcpy(k1, k2, c->ic_descr->id_key_size);
++}
++
++static inline int keycmp(struct iam_container *c,
++ struct iam_key *k1, struct iam_key *k2)
++{
++ return c->ic_descr->id_keycmp(c, k1, k2);
++}
++
++static struct iam_container *iam_it_container(struct iam_iterator *it)
++{
++ return it->ii_path.ip_container;
++}
++
++static inline int it_keycmp(struct iam_iterator *it,
++ struct iam_key *k1, struct iam_key *k2)
++{
++ return keycmp(iam_it_container(it), k1, k2);
++}
++
++/*
++ * Initialize iterator to IAM_IT_DETACHED state.
++ *
++ * postcondition: it_state(it) == IAM_IT_DETACHED
++ */
++int iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags);
++/*
++ * Finalize iterator and release all resources.
++ *
++ * precondition: it_state(it) == IAM_IT_DETACHED
++ */
++void iam_it_fini(struct iam_iterator *it);
++
++/*
++ * Attach iterator. After successful completion, @it points to record with the
++ * largest key not larger than @k. Semantics of ->id_create() method guarantee
++ * that such record will always be found.
++ *
++ * Return value: 0: positioned on existing record,
++ * -ve: error.
++ *
++ * precondition: it_state(it) == IAM_IT_DETACHED
++ * postcondition: ergo(result == 0,
++ * (it_state(it) == IAM_IT_ATTACHED &&
++ * it_keycmp(it, iam_it_key_get(it, *), k) <= 0))
++ */
++int iam_it_get(struct iam_iterator *it, struct iam_key *k);
++
++/*
++ * Duplicates iterator.
++ *
++ * postcondition: it_state(dst) == it_state(src) &&
++ * iam_it_container(dst) == iam_it_container(src) &&
++ * dst->ii_flags == src->ii_flags &&
++ * ergo(it_state(it) == IAM_IT_ATTACHED,
++ * iam_it_rec_get(dst) == iam_it_rec_get(src) &&
++ * iam_it_key_get(dst, *1) == iam_it_key_get(src, *2))
++ */
++void iam_it_dup(struct iam_iterator *dst, struct iam_iterator *src);
++
++/*
++ * Detach iterator. Does nothing in detached state.
++ *
++ * postcondition: it_state(it) == IAM_IT_DETACHED
++ */
++void iam_it_put(struct iam_iterator *it);
++
++/*
++ * Move iterator one record right.
++ *
++ * Return value: 0: success,
++ * +1: end of container reached
++ * -ve: error
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_MOVE
++ * postcondition: ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED)
++ */
++int iam_it_next(struct iam_iterator *it);
++
++/*
++ * Return pointer to the record under iterator.
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
++ */
++const struct iam_rec *iam_it_rec_get(struct iam_iterator *it);
++
++/*
++ * Replace contents of record under iterator.
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
++ * postcondition: it_state(it) == IAM_IT_ATTACHED &&
++ * ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...))
++ */
++int iam_it_rec_set(handle_t *h, struct iam_iterator *it, struct iam_rec *r);
++
++/*
++ * Place key under iterator in @k, return @k
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
++ */
++const struct iam_key *iam_it_key_get(struct iam_iterator *it,
++ struct iam_key *k);
++
++/*
++ * Insert new record with key @k and contents from @r, shifting records to the
++ * right.
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED &&
++ * it->ii_flags&IAM_IT_WRITE &&
++ * it_keycmp(it, iam_it_key_get(it, *), k) < 0
++ * postcondition: it_state(it) == IAM_IT_ATTACHED &&
++ * ergo(result == 0,
++ * it_keycmp(it, iam_it_key_get(it, *), k) == 0 &&
++ * !memcmp(iam_it_rec_get(it), r, ...))
++ */
++int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
++ struct iam_key *k, struct iam_rec *r);
++/*
++ * Delete record under iterator.
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
++ */
++int iam_it_rec_delete(handle_t *h, struct iam_iterator *it);
+
+ #ifdef CONFIG_EXT3_INDEX
+-static inline unsigned dx_get_block(struct dx_path *p, struct dx_entry *entry);
+-static void dx_set_block(struct dx_path *p,
+- struct dx_entry *entry, unsigned value);
+-static inline struct dx_key *dx_get_key(struct dx_path *p,
+- struct dx_entry *entry,
+- struct dx_key *key);
+-static void dx_set_key(struct dx_path *p, struct dx_entry *entry,
+- struct dx_key *key);
+-static unsigned dx_get_count(struct dx_entry *entries);
+-static unsigned dx_get_limit(struct dx_entry *entries);
+-static void dx_set_count(struct dx_entry *entries, unsigned value);
+-static void dx_set_limit(struct dx_entry *entries, unsigned value);
+-static unsigned dx_root_limit(struct dx_path *p);
+-static unsigned dx_node_limit(struct dx_path *p);
++static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry);
++static void dx_set_block(struct iam_path *p,
++ struct iam_entry *entry, unsigned value);
++static inline struct iam_key *dx_get_key(struct iam_path *p,
++ struct iam_entry *entry,
++ struct iam_key *key);
++static void dx_set_key(struct iam_path *p, struct iam_entry *entry,
++ struct iam_key *key);
++static unsigned dx_get_count(struct iam_entry *entries);
++static unsigned dx_get_limit(struct iam_entry *entries);
++static void dx_set_count(struct iam_entry *entries, unsigned value);
++static void dx_set_limit(struct iam_entry *entries, unsigned value);
++static unsigned dx_root_limit(struct iam_path *p);
++static unsigned dx_node_limit(struct iam_path *p);
+ static int dx_probe(struct dentry *dentry,
+ struct inode *dir,
+ struct dx_hash_info *hinfo,
+- struct dx_path *path);
++ struct iam_path *path);
+ static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
+ struct dx_hash_info *hinfo, struct dx_map_entry map[]);
+ static void dx_sort_map(struct dx_map_entry *map, unsigned count);
+ static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
+ struct dx_map_entry *offsets, int count);
+ static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+-static void dx_insert_block (struct dx_path *path,
+- struct dx_frame *frame, u32 hash, u32 block);
++static void dx_insert_block (struct iam_path *path,
++ struct iam_frame *frame, u32 hash, u32 block);
+ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+- struct dx_path *path, __u32 *start_hash);
++ struct iam_path *path, __u32 *start_hash);
+ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
+ struct ext3_dir_entry_2 **res_dir, int *err);
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode);
+
+-static inline void dx_path_init(struct dx_path *path, struct inode *inode);
+-static inline void dx_path_fini(struct dx_path *path);
++static inline void iam_path_init(struct iam_path *path,
++ struct iam_container *c);
++static inline void iam_path_fini(struct iam_path *path);
+
+
+ /*
+@@ -247,153 +713,154 @@
+ * Mask them off for now.
+ */
+
+-static inline void *entry_off(struct dx_entry *entry, ptrdiff_t off)
++static inline void *entry_off(struct iam_entry *entry, ptrdiff_t off)
+ {
+ return (void *)((char *)entry + off);
+ }
+
+-static inline size_t dx_entry_size(struct dx_path *p)
++static inline struct iam_descr *path_descr(struct iam_path *p)
+ {
+- return p->dp_param->dpo_key_size + p->dp_param->dpo_ptr_size;
++ return p->ip_container->ic_descr;
+ }
+
+-static inline struct dx_entry *dx_entry_shift(struct dx_path *p,
+- struct dx_entry *entry, int shift)
++static inline struct inode *path_obj(struct iam_path *p)
++{
++ return p->ip_container->ic_object;
++}
++
++static inline size_t iam_entry_size(struct iam_path *p)
++{
++ return path_descr(p)->id_key_size + path_descr(p)->id_ptr_size;
++}
++
++static inline struct iam_entry *iam_entry_shift(struct iam_path *p,
++ struct iam_entry *entry, int shift)
+ {
+ void *e = entry;
+- return e + shift * dx_entry_size(p);
++ return e + shift * iam_entry_size(p);
+ }
+
+-static inline ptrdiff_t dx_entry_diff(struct dx_path *p,
+- struct dx_entry *e1, struct dx_entry *e2)
++static inline ptrdiff_t iam_entry_diff(struct iam_path *p,
++ struct iam_entry *e1, struct iam_entry *e2)
+ {
+ ptrdiff_t diff;
+
+ diff = (void *)e1 - (void *)e2;
+- assert(diff / dx_entry_size(p) * dx_entry_size(p) == diff);
+- return diff / dx_entry_size(p);
++ assert(diff / iam_entry_size(p) * iam_entry_size(p) == diff);
++ return diff / iam_entry_size(p);
+ }
+
+-static inline unsigned dx_get_block(struct dx_path *p, struct dx_entry *entry)
++static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry)
+ {
+- return le32_to_cpu(*(u32 *)entry_off(entry, p->dp_param->dpo_key_size))
++ return le32_to_cpu(*(u32 *)entry_off(entry, path_descr(p)->id_key_size))
+ & 0x00ffffff;
+ }
+
+-static inline void dx_set_block(struct dx_path *p,
+- struct dx_entry *entry, unsigned value)
++static inline void dx_set_block(struct iam_path *p,
++ struct iam_entry *entry, unsigned value)
+ {
+- *(u32*)entry_off(entry, p->dp_param->dpo_key_size) = cpu_to_le32(value);
++ *(u32*)entry_off(entry,
++ path_descr(p)->id_key_size) = cpu_to_le32(value);
+ }
+
+-static inline struct dx_key *dx_get_key(struct dx_path *p,
+- struct dx_entry *entry,
+- struct dx_key *key)
++static inline struct iam_key *dx_get_key(struct iam_path *p,
++ struct iam_entry *entry,
++ struct iam_key *key)
+ {
+- memcpy(key, entry, p->dp_param->dpo_key_size);
++ memcpy(key, entry, path_descr(p)->id_key_size);
+ return key;
+ }
+
+-static inline struct dx_key *dx_key_at(struct dx_path *p,
+- struct dx_entry *entry)
++static inline struct iam_key *iam_key_at(struct iam_path *p,
++ struct iam_entry *entry)
+ {
+- return (struct dx_key *)entry;
++ return (struct iam_key *)entry;
+ }
+
+-static inline void dx_set_key(struct dx_path *p,
+- struct dx_entry *entry, struct dx_key *key)
++static inline void dx_set_key(struct iam_path *p,
++ struct iam_entry *entry, struct iam_key *key)
+ {
+- memcpy(entry, key, p->dp_param->dpo_key_size);
++ memcpy(entry, key, path_descr(p)->id_key_size);
+ }
+
+-static inline unsigned dx_get_count (struct dx_entry *entries)
++static inline unsigned dx_get_count (struct iam_entry *entries)
+ {
+ return le16_to_cpu(((struct dx_countlimit *) entries)->count);
+ }
+
+-static inline unsigned dx_get_limit (struct dx_entry *entries)
++static inline unsigned dx_get_limit (struct iam_entry *entries)
+ {
+ return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
+ }
+
+-static inline void dx_set_count (struct dx_entry *entries, unsigned value)
++static inline void dx_set_count (struct iam_entry *entries, unsigned value)
+ {
+ ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
+ }
+
+-static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
++static inline void dx_set_limit (struct iam_entry *entries, unsigned value)
+ {
+ ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
+ }
+
+-static inline unsigned dx_root_limit(struct dx_path *p)
++static inline unsigned dx_root_limit(struct iam_path *p)
+ {
+- struct dx_param *param = p->dp_param;
+- unsigned entry_space = p->dp_object->i_sb->s_blocksize -
+- param->dpo_root_gap;
+- return entry_space / (param->dpo_key_size + param->dpo_ptr_size);
++ struct iam_descr *param = path_descr(p);
++ unsigned entry_space = path_obj(p)->i_sb->s_blocksize -
++ param->id_root_gap;
++ return entry_space / (param->id_key_size + param->id_ptr_size);
+ }
+
+-static inline unsigned dx_node_limit(struct dx_path *p)
++static inline unsigned dx_node_limit(struct iam_path *p)
+ {
+- struct dx_param *param = p->dp_param;
+- unsigned entry_space = p->dp_object->i_sb->s_blocksize -
+- param->dpo_node_gap;
+- return entry_space / (param->dpo_key_size + param->dpo_ptr_size);
++ struct iam_descr *param = path_descr(p);
++ unsigned entry_space = path_obj(p)->i_sb->s_blocksize -
++ param->id_node_gap;
++ return entry_space / (param->id_key_size + param->id_ptr_size);
+ }
+
+-static inline int dx_index_is_compat(struct dx_path *path)
++static inline int dx_index_is_compat(struct iam_path *path)
+ {
+- return path->dp_param == &htree_compat_param;
++ return path_descr(path) == &htree_compat_param;
+ }
+
+-static struct dx_entry *dx_get_entries(struct dx_path *path, void *data,
++static struct iam_entry *dx_get_entries(struct iam_path *path, void *data,
+ int root)
+ {
+ return data +
+ (root ?
+- path->dp_param->dpo_root_gap : path->dp_param->dpo_node_gap);
++ path_descr(path)->id_root_gap : path_descr(path)->id_node_gap);
+ }
+
+-static struct dx_entry *dx_node_get_entries(struct dx_path *path,
+- struct dx_frame *frame)
++static struct iam_entry *dx_node_get_entries(struct iam_path *path,
++ struct iam_frame *frame)
+ {
+ return dx_get_entries(path,
+- frame->bh->b_data, frame == path->dp_frames);
+-}
+-
+-static inline struct dx_key *keycpy(struct dx_path *p,
+- struct dx_key *k1, struct dx_key *k2)
+-{
+- return memcpy(k1, k2, p->dp_param->dpo_key_size);
+-}
+-
+-static inline int keycmp(struct dx_path *p,
+- struct dx_key *k1, struct dx_key *k2)
+-{
+- return p->dp_param->dpo_keycmp(p, k1, k2);
++ frame->bh->b_data, frame == path->ip_frames);
+ }
+
+-static int dx_node_check(struct dx_path *p, struct dx_frame *f)
++static int dx_node_check(struct iam_path *p, struct iam_frame *f)
+ {
+- struct dx_entry *e;
++ struct iam_entry *e;
++ struct iam_container *c;
+ unsigned count;
+ unsigned i;
+
++ c = p->ip_container;
+ e = dx_node_get_entries(p, f);
+ count = dx_get_count(e);
+- e = dx_entry_shift(p, e, 1);
+- for (i = 0; i < count - 1; ++i, e = dx_entry_shift(p, e, 1)) {
+- keycpy(p, p->dp_key_scratch[0], p->dp_key_scratch[1]);
+- dx_get_key(p, e, p->dp_key_scratch[1]);
++ e = iam_entry_shift(p, e, 1);
++ for (i = 0; i < count - 1; ++i, e = iam_entry_shift(p, e, 1)) {
++ keycpy(c, p->ip_key_scratch[0], p->ip_key_scratch[1]);
++ dx_get_key(p, e, p->ip_key_scratch[1]);
+ if (i > 0 &&
+- keycmp(p, p->dp_key_scratch[0], p->dp_key_scratch[1]) > 0)
++ keycmp(c, p->ip_key_scratch[0], p->ip_key_scratch[1]) > 0)
+ return 0;
+ }
+ return 1;
+ }
+
+-static u32 htree_root_ptr(struct dx_path *path)
++static u32 htree_root_ptr(struct iam_container *c)
+ {
+ return 0;
+ }
+@@ -403,20 +870,19 @@
+ struct dentry *dentry;
+ };
+
+-static int htree_node_check(struct dx_path *path, struct dx_frame *frame,
+- void *cookie)
++static int htree_node_check(struct iam_path *path, struct iam_frame *frame)
+ {
+ void *data;
+- struct dx_entry *entries;
++ struct iam_entry *entries;
+ struct super_block *sb;
+
+ data = frame->bh->b_data;
+ entries = dx_node_get_entries(path, frame);
+- sb = path->dp_object->i_sb;
+- if (frame == path->dp_frames) {
++ sb = path_obj(path)->i_sb;
++ if (frame == path->ip_frames) {
+ /* root node */
+ struct dx_root *root;
+- struct htree_cookie *hc = cookie;
++ struct htree_cookie *hc = path->ip_descr_data;
+
+ root = data;
+ if (root->info.hash_version > DX_HASH_MAX) {
+@@ -433,8 +899,8 @@
+ return ERR_BAD_DX_DIR;
+ }
+
+- path->dp_indirect = root->info.indirect_levels;
+- if (path->dp_indirect > DX_MAX_TREE_HEIGHT - 1) {
++ path->ip_indirect = root->info.indirect_levels;
++ if (path->ip_indirect > DX_MAX_TREE_HEIGHT - 1) {
+ ext3_warning(sb, __FUNCTION__,
+ "Unimplemented inode hash depth: %#06x",
+ root->info.indirect_levels);
+@@ -450,17 +916,17 @@
+ if (hc->dentry)
+ ext3fs_dirhash(hc->dentry->d_name.name,
+ hc->dentry->d_name.len, hc->hinfo);
+- path->dp_key_target = (struct dx_key *)&hc->hinfo->hash;
++ path->ip_key_target = (struct iam_key *)&hc->hinfo->hash;
+ } else {
+ /* non-root index */
+- assert(entries == data + path->dp_param->dpo_node_gap);
++ assert(entries == data + path_descr(path)->id_node_gap);
+ assert(dx_get_limit(entries) == dx_node_limit(path));
+ }
+ frame->entries = frame->at = entries;
+ return 0;
+ }
+
+-static int htree_node_init(struct dx_path *path,
++static int htree_node_init(struct iam_container *c,
+ struct buffer_head *bh, int root)
+ {
+ struct dx_node *node;
+@@ -468,13 +934,24 @@
+ assert(!root);
+
+ node = (void *)bh->b_data;
+- node->fake.rec_len = cpu_to_le16(path->dp_object->i_sb->s_blocksize);
++ node->fake.rec_len = cpu_to_le16(c->ic_object->i_sb->s_blocksize);
+ node->fake.inode = 0;
+ return 0;
+ }
+
+-static int htree_keycmp(struct dx_path *path,
+- struct dx_key *k1, struct dx_key *k2)
++static int htree_node_read(struct iam_container *c, iam_ptr_t ptr,
++ handle_t *handle, struct buffer_head **bh)
++{
++ int result = 0;
++
++ *bh = ext3_bread(handle, c->ic_object, (int)ptr, 0, &result);
++ if (*bh == NULL)
++ result = -EIO;
++ return result;
++}
++
++static int htree_keycmp(struct iam_container *c,
++ struct iam_key *k1, struct iam_key *k2)
+ {
+ __u32 p1 = le32_to_cpu(*(__u32 *)k1);
+ __u32 p2 = le32_to_cpu(*(__u32 *)k2);
+@@ -486,7 +963,7 @@
+ * Debug
+ */
+ #ifdef DX_DEBUG
+-static void dx_show_index (char * label, struct dx_entry *entries)
++static void dx_show_index (char * label, struct iam_entry *entries)
+ {
+ int i, n = dx_get_count (entries);
+ printk("%s index ", label);
+@@ -535,7 +1012,7 @@
+ }
+
+ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
+- struct dx_entry *entries, int levels)
++ struct iam_entry *entries, int levels)
+ {
+ unsigned blocksize = dir->i_sb->s_blocksize;
+ unsigned count = dx_get_count (entries), names = 0, space = 0, i;
+@@ -565,32 +1042,33 @@
+ }
+ #endif /* DX_DEBUG */
+
+-static int dx_lookup(struct dx_path *path, void *cookie)
++static int dx_lookup(struct iam_path *path)
+ {
+ u32 ptr;
+- int err;
++ int err = 0;
+ int i;
+
+- struct dx_param *param;
+- struct dx_frame *frame;
+-
+- param = path->dp_param;
++ struct iam_descr *param;
++ struct iam_frame *frame;
++ struct iam_container *c;
+
+- for (frame = path->dp_frames, i = 0,
+- ptr = param->dpo_root_ptr(path); i <= path->dp_indirect;
++ param = path_descr(path);
++ c = path->ip_container;
++
++ for (frame = path->ip_frames, i = 0,
++ ptr = param->id_root_ptr(path->ip_container);
++ i <= path->ip_indirect;
+ ptr = dx_get_block(path, frame->at), ++frame, ++i) {
+- struct dx_entry *entries;
+- struct dx_entry *p;
+- struct dx_entry *q;
+- struct dx_entry *m;
++ struct iam_entry *entries;
++ struct iam_entry *p;
++ struct iam_entry *q;
++ struct iam_entry *m;
+ unsigned count;
+
+- frame->bh = ext3_bread(NULL, path->dp_object, ptr, 0, &err);
+- if (frame->bh == NULL) {
+- err = -EIO;
++ err = param->id_node_read(c, (iam_ptr_t)ptr, NULL, &frame->bh);
++ if (err != 0)
+ break;
+- }
+- err = param->dpo_node_check(path, frame, cookie);
++ err = param->id_node_check(path, frame);
+ if (err != 0)
+ break;
+
+@@ -599,37 +1077,37 @@
+ entries = frame->entries;
+ count = dx_get_count(entries);
+ assert(count && count <= dx_get_limit(entries));
+- p = dx_entry_shift(path, entries, 1);
+- q = dx_entry_shift(path, entries, count - 1);
++ p = iam_entry_shift(path, entries, 1);
++ q = iam_entry_shift(path, entries, count - 1);
+ while (p <= q) {
+- m = dx_entry_shift(path,
+- p, dx_entry_diff(path, q, p) / 2);
++ m = iam_entry_shift(path,
++ p, iam_entry_diff(path, q, p) / 2);
+ dxtrace(printk("."));
+- if (keycmp(path, dx_key_at(path, m),
+- path->dp_key_target) > 0)
+- q = dx_entry_shift(path, m, -1);
++ if (keycmp(c, iam_key_at(path, m),
++ path->ip_key_target) > 0)
++ q = iam_entry_shift(path, m, -1);
+ else
+- p = dx_entry_shift(path, m, +1);
++ p = iam_entry_shift(path, m, +1);
+ }
+
+- frame->at = dx_entry_shift(path, p, -1);
++ frame->at = iam_entry_shift(path, p, -1);
+ if (1) { // linear search cross check
+ unsigned n = count - 1;
+- struct dx_entry *at;
++ struct iam_entry *at;
+
+ at = entries;
+ while (n--) {
+ dxtrace(printk(","));
+- at = dx_entry_shift(path, at, +1);
+- if (keycmp(path, dx_key_at(path, at),
+- path->dp_key_target) > 0) {
+- if (at != dx_entry_shift(path, frame->at, 1)) {
++ at = iam_entry_shift(path, at, +1);
++ if (keycmp(c, iam_key_at(path, at),
++ path->ip_key_target) > 0) {
++ if (at != iam_entry_shift(path, frame->at, 1)) {
+ BREAKPOINT;
+ printk(KERN_EMERG "%i\n",
+- keycmp(path, dx_key_at(path, at),
+- path->dp_key_target));
++ keycmp(c, iam_key_at(path, at),
++ path->ip_key_target));
+ }
+- at = dx_entry_shift(path, at, -1);
++ at = iam_entry_shift(path, at, -1);
+ break;
+ }
+ }
+@@ -637,8 +1115,8 @@
+ }
+ }
+ if (err != 0)
+- dx_path_fini(path);
+- path->dp_frame = --frame;
++ iam_path_fini(path);
++ path->ip_frame = --frame;
+ return err;
+ }
+
+@@ -652,7 +1130,7 @@
+ * back to userspace.
+ */
+ static int dx_probe(struct dentry *dentry, struct inode *dir,
+- struct dx_hash_info *hinfo, struct dx_path *path)
++ struct dx_hash_info *hinfo, struct iam_path *path)
+ {
+ int err;
+ struct htree_cookie hc = {
+@@ -661,39 +1139,78 @@
+ };
+
+ assert(dx_index_is_compat(path));
+- err = dx_lookup(path, &hc);
+- assert(err != 0 || path->dp_frames[path->dp_indirect].bh != NULL);
++ path->ip_descr_data = &hc;
++ err = dx_lookup(path);
++ assert(err != 0 || path->ip_frames[path->ip_indirect].bh != NULL);
+ return err;
+ }
+
+-static inline void dx_path_init(struct dx_path *path, struct inode *inode)
++/*
++ * Initialize container @c, acquires additional reference on @inode.
++ */
++int iam_container_init(struct iam_container *c,
++ struct iam_descr *descr, struct inode *inode)
++{
++ memset(c, 0, sizeof *c);
++ c->ic_descr = descr;
++ c->ic_object = igrab(inode);
++ if (c->ic_object != NULL)
++ return 0;
++ else
++ return -ENOENT;
++}
++
++/*
++ * Finalize container @c, release all resources.
++ */
++void iam_container_fini(struct iam_container *c)
++{
++ if (c->ic_object != NULL) {
++ iput(c->ic_object);
++ c->ic_object = NULL;
++ }
++}
++
++static inline void iam_path_init(struct iam_path *path, struct iam_container *c)
+ {
+ memset(path, 0, sizeof *path);
+- path->dp_object = inode;
+- path->dp_frame = path->dp_frames;
++ path->ip_container = c;
++ path->ip_frame = path->ip_frames;
+ }
+
+-static inline void dx_path_fini(struct dx_path *path)
++static inline void iam_path_fini(struct iam_path *path)
+ {
+ int i;
+
+- for (i = 0; i < ARRAY_SIZE(path->dp_frames); i++) {
+- if (path->dp_frames[i].bh != NULL) {
+- brelse(path->dp_frames[i].bh);
+- path->dp_frames[i].bh = NULL;
++ for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) {
++ if (path->ip_frames[i].bh != NULL) {
++ brelse(path->ip_frames[i].bh);
++ path->ip_frames[i].bh = NULL;
+ }
+ }
+ }
+
+-static void dx_path_compat_init(struct dx_path_compat *path,
+- struct inode *inode)
++static void iam_path_compat_init(struct iam_path_compat *path,
++ struct inode *inode)
+ {
+ int i;
+- dx_path_init(&path->dpc_path, inode);
+- path->dpc_path.dp_param = &htree_compat_param;
+- for (i = 0; i < ARRAY_SIZE(path->dpc_path.dp_key_scratch); ++i)
+- path->dpc_path.dp_key_scratch[i] =
+- (struct dx_key *)&path->dpc_scrach[i];
++
++ iam_container_init(&path->ipc_container, &htree_compat_param, inode);
++ /*
++ * XXX hack allowing finalization of iam_path_compat with
++ * iam_path_fini().
++ */
++ iput(inode);
++ iam_path_init(&path->ipc_path, &path->ipc_container);
++ for (i = 0; i < ARRAY_SIZE(path->ipc_path.ip_key_scratch); ++i)
++ path->ipc_path.ip_key_scratch[i] =
++ (struct iam_key *)&path->ipc_scrach[i];
++}
++
++static void iam_path_compat_fini(struct iam_path_compat *path)
++{
++ iam_path_fini(&path->ipc_path);
++ iam_container_fini(&path->ipc_container);
+ }
+
+ /*
+@@ -714,16 +1231,16 @@
+ * hash of the next page.
+ */
+ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+- struct dx_path *path, __u32 *start_hash)
++ struct iam_path *path, __u32 *start_hash)
+ {
+- struct dx_frame *p;
++ struct iam_frame *p;
+ struct buffer_head *bh;
+ int err, num_frames = 0;
+ __u32 bhash;
+
+ assert(dx_index_is_compat(path));
+
+- p = path->dp_frame;
++ p = path->ip_frame;
+ /*
+ * Find the next leaf page by incrementing the frame pointer.
+ * If we run out of entries in the interior node, loop around and
+@@ -732,11 +1249,11 @@
+ * nodes need to be read.
+ */
+ while (1) {
+- p->at = dx_entry_shift(path, p->at, +1);
+- if (p->at < dx_entry_shift(path, p->entries,
++ p->at = iam_entry_shift(path, p->at, +1);
++ if (p->at < iam_entry_shift(path, p->entries,
+ dx_get_count(p->entries)))
+ break;
+- if (p == path->dp_frames)
++ if (p == path->ip_frames)
+ return 0;
+ num_frames++;
+ --p;
+@@ -749,7 +1266,7 @@
+ * desired contiuation hash. If it doesn't, return since
+ * there's no point to read in the successive index pages.
+ */
+- dx_get_key(path, p->at, (struct dx_key *)&bhash);
++ dx_get_key(path, p->at, (struct iam_key *)&bhash);
+ if (start_hash)
+ *start_hash = bhash;
+ if ((hash & 1) == 0) {
+@@ -761,8 +1278,10 @@
+ * block so no check is necessary
+ */
+ while (num_frames--) {
+- if (!(bh = ext3_bread(NULL, dir,
+- dx_get_block(path, p->at), 0, &err)))
++ err = path_descr(path)->id_node_read(path->ip_container,
++ (iam_ptr_t)dx_get_block(path, p->at),
++ NULL, &bh);
++ if (err != 0)
+ return err; /* Failure */
+ ++p;
+ brelse (p->bh);
+@@ -837,8 +1356,8 @@
+ {
+ struct dx_hash_info hinfo;
+ struct ext3_dir_entry_2 *de;
+- struct dx_path_compat cpath;
+- struct dx_path *path = &cpath.dpc_path;
++ struct iam_path_compat cpath;
++ struct iam_path *path = &cpath.ipc_path;
+ struct inode *dir;
+ int block, err;
+ int count = 0;
+@@ -848,7 +1367,7 @@
+ dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
+ start_minor_hash));
+ dir = dir_file->f_dentry->d_inode;
+- dx_path_compat_init(&cpath, dir);
++ iam_path_compat_init(&cpath, dir);
+ if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
+ hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+ hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+@@ -865,7 +1384,7 @@
+
+ /* Add '.' and '..' from the htree header */
+ if (!start_hash && !start_minor_hash) {
+- de = (struct ext3_dir_entry_2 *) path->dp_frames[0].bh->b_data;
++ de = (struct ext3_dir_entry_2 *) path->ip_frames[0].bh->b_data;
+ if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
+ goto errout;
+ de = ext3_next_entry(de);
+@@ -875,7 +1394,7 @@
+ }
+
+ while (1) {
+- block = dx_get_block(path, path->dp_frame->at);
++ block = dx_get_block(path, path->ip_frame->at);
+ ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
+ start_hash, start_minor_hash);
+ if (ret < 0) {
+@@ -900,12 +1419,12 @@
+ (count && ((hashval & 1) == 0)))
+ break;
+ }
+- dx_path_fini(path);
++ iam_path_fini(path);
+ dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
+ count, *next_hash));
+ return count;
+ errout:
+- dx_path_fini(path);
++ iam_path_fini(path);
+ return (err);
+ }
+
+@@ -964,18 +1483,18 @@
+ } while(more);
+ }
+
+-static void dx_insert_block(struct dx_path *path,
+- struct dx_frame *frame, u32 hash, u32 block)
++static void dx_insert_block(struct iam_path *path,
++ struct iam_frame *frame, u32 hash, u32 block)
+ {
+- struct dx_entry *entries = frame->entries;
+- struct dx_entry *old = frame->at, *new = dx_entry_shift(path, old, +1);
++ struct iam_entry *entries = frame->entries;
++ struct iam_entry *old = frame->at, *new = iam_entry_shift(path, old, +1);
+ int count = dx_get_count(entries);
+
+ assert(count < dx_get_limit(entries));
+- assert(old < dx_entry_shift(path, entries, count));
+- memmove(dx_entry_shift(path, new, 1), new,
+- (char *)dx_entry_shift(path, entries, count) - (char *)new);
+- dx_set_key(path, new, (struct dx_key *)&hash);
++ assert(old < iam_entry_shift(path, entries, count));
++ memmove(iam_entry_shift(path, new, 1), new,
++ (char *)iam_entry_shift(path, entries, count) - (char *)new);
++ dx_set_key(path, new, (struct iam_key *)&hash);
+ dx_set_block(path, new, block);
+ dx_set_count(entries, count + 1);
+ }
+@@ -1177,9 +1696,9 @@
+ struct super_block * sb;
+ struct dx_hash_info hinfo;
+ u32 hash;
+- struct dx_path_compat cpath;
+- struct dx_path *path = &cpath.dpc_path;
+- struct dx_entry_compat dummy_dot = {
++ struct iam_path_compat cpath;
++ struct iam_path *path = &cpath.ipc_path;
++ struct iam_entry_compat dummy_dot = {
+ .block = 0
+ };
+ struct ext3_dir_entry_2 *de, *top;
+@@ -1190,8 +1709,8 @@
+ const u8 *name = dentry->d_name.name;
+ struct inode *dir = dentry->d_parent->d_inode;
+
+- dx_path_compat_init(&cpath, dir);
+-
++ iam_path_compat_init(&cpath, dir);
++
+ sb = dir->i_sb;
+ /* NFS may look up ".." - look at dx_root directory block */
+ if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
+@@ -1199,13 +1718,15 @@
+ if (*err != 0)
+ return NULL;
+ } else {
+- path->dp_frame->bh = NULL; /* for dx_path_fini() */
+- path->dp_frame->at = (void *)&dummy_dot;/* hack for zero entry*/
++ path->ip_frame->bh = NULL; /* for iam_path_fini() */
++ path->ip_frame->at = (void *)&dummy_dot; /* hack for zero entry */
+ }
+ hash = hinfo.hash;
+ do {
+- block = dx_get_block(path, path->dp_frame->at);
+- if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
++ block = dx_get_block(path, path->ip_frame->at);
++ *err = path_descr(path)->id_node_read(path->ip_container, (iam_ptr_t)block,
++ NULL, &bh);
++ if (*err != 0)
+ goto errout;
+ de = (struct ext3_dir_entry_2 *) bh->b_data;
+ top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
+@@ -1220,7 +1741,7 @@
+ goto errout;
+ }
+ *res_dir = de;
+- dx_path_fini(path);
++ iam_path_fini(path);
+ return bh;
+ }
+ brelse (bh);
+@@ -1238,7 +1759,7 @@
+ *err = -ENOENT;
+ errout:
+ dxtrace(printk("%s not found\n", name));
+- dx_path_fini(path);
++ iam_path_fini(path);
+ return NULL;
+ }
+ #endif
+@@ -1363,11 +1884,11 @@
+
+ /* Allocate new node, and split leaf node @bh into it, inserting new pointer
+ * into parent node identified by @frame */
+-static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct dx_path *path,
+- struct buffer_head **bh,struct dx_frame *frame,
++static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct iam_path *path,
++ struct buffer_head **bh,struct iam_frame *frame,
+ struct dx_hash_info *hinfo, int *error)
+ {
+- struct inode *dir = path->dp_object;
++ struct inode *dir = path_obj(path);
+ unsigned blocksize = dir->i_sb->s_blocksize;
+ unsigned count, continued;
+ struct buffer_head *bh2;
+@@ -1553,9 +2074,9 @@
+ int namelen = dentry->d_name.len;
+ struct buffer_head *bh2;
+ struct dx_root *root;
+- struct dx_path_compat cpath;
+- struct dx_path *path = &cpath.dpc_path;
+- struct dx_entry *entries;
++ struct iam_path_compat cpath;
++ struct iam_path *path = &cpath.ipc_path;
++ struct iam_entry *entries;
+ struct ext3_dir_entry_2 *de, *de2;
+ char *data1, *top;
+ unsigned len;
+@@ -1565,7 +2086,7 @@
+ u32 block;
+ struct fake_dirent *fde;
+
+- dx_path_compat_init(&cpath, dir);
++ iam_path_compat_init(&cpath, dir);
+ blocksize = dir->i_sb->s_blocksize;
+ dxtrace(printk("Creating index\n"));
+ retval = ext3_journal_get_write_access(handle, bh);
+@@ -1612,12 +2133,12 @@
+ hinfo.hash_version = root->info.hash_version;
+ hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+ ext3fs_dirhash(name, namelen, &hinfo);
+- path->dp_frame->entries = entries;
+- path->dp_frame->at = entries;
+- path->dp_frame->bh = bh;
++ path->ip_frame->entries = entries;
++ path->ip_frame->at = entries;
++ path->ip_frame->bh = bh;
+ bh = bh2;
+- de = do_split(handle, path, &bh, path->dp_frame, &hinfo, &retval);
+- dx_path_fini(path);
++ de = do_split(handle, path, &bh, path->ip_frame, &hinfo, &retval);
++ iam_path_fini(path);
+ if (!de)
+ return retval;
+
+@@ -1698,12 +2219,12 @@
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+ {
+- struct dx_path_compat cpath;
+- struct dx_path *path = &cpath.dpc_path;
+- struct dx_param *param;
+- struct dx_frame *frame, *safe;
+- struct dx_entry *entries; /* old block contents */
+- struct dx_entry *entries2; /* new block contents */
++ struct iam_path_compat cpath;
++ struct iam_path *path = &cpath.ipc_path;
++ struct iam_descr *param;
++ struct iam_frame *frame, *safe;
++ struct iam_entry *entries; /* old block contents */
++ struct iam_entry *entries2; /* new block contents */
+ struct dx_hash_info hinfo;
+ struct buffer_head * bh;
+ struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0};
+@@ -1716,20 +2237,22 @@
+ int i;
+ size_t isize;
+
+- dx_path_compat_init(&cpath, dir);
+- param = path->dp_param;
++ iam_path_compat_init(&cpath, dir);
++ param = path_descr(path);
+
+ err = dx_probe(dentry, NULL, &hinfo, path);
+ if (err != 0)
+ return err;
+- frame = path->dp_frame;
++ frame = path->ip_frame;
+ entries = frame->entries;
+
+ /* XXX nikita: global serialization! */
+ isize = dir->i_size;
+
+- if (!(bh = ext3_bread(handle, dir,
+- dx_get_block(path, frame->at), 0, &err)))
++ err = param->id_node_read(path->ip_container,
++ (iam_ptr_t)dx_get_block(path,
++ frame->at), handle, &bh);
++ if (err != 0)
+ goto cleanup;
+
+ BUFFER_TRACE(bh, "get_write_access");
+@@ -1761,7 +2284,7 @@
+ dx_get_count(entries), dx_get_limit(entries)));
+
+ /* What levels need split? */
+- for (nr_splet = 0; frame >= path->dp_frames &&
++ for (nr_splet = 0; frame >= path->ip_frames &&
+ dx_get_count(frame->entries) == dx_get_limit(frame->entries);
+ --frame, ++nr_splet) {
+ if (nr_splet == DX_MAX_TREE_HEIGHT) {
+@@ -1778,7 +2301,7 @@
+ for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
+ bh_new[i] = ext3_append (handle, dir, &newblock[i], &err);
+ if (!bh_new[i] ||
+- param->dpo_node_init(path, bh_new[i], 0) != 0)
++ param->id_node_init(path->ip_container, bh_new[i], 0) != 0)
+ goto cleanup;
+ BUFFER_TRACE(frame->bh, "get_write_access");
+ err = ext3_journal_get_write_access(handle, frame->bh);
+@@ -1786,7 +2309,7 @@
+ goto journal_error;
+ }
+ /* Add "safe" node to transaction too */
+- if (safe + 1 != path->dp_frames) {
++ if (safe + 1 != path->ip_frames) {
+ err = ext3_journal_get_write_access(handle, safe->bh);
+ if (err)
+ goto journal_error;
+@@ -1800,12 +2323,12 @@
+
+ entries = frame->entries;
+ count = dx_get_count(entries);
+- idx = dx_entry_diff(path, frame->at, entries);
++ idx = iam_entry_diff(path, frame->at, entries);
+
+ bh2 = bh_new[i];
+ entries2 = dx_get_entries(path, bh2->b_data, 0);
+
+- if (frame == path->dp_frames) {
++ if (frame == path->ip_frames) {
+ /* splitting root node. Tricky point:
+ *
+ * In the "normal" B-tree we'd split root *and* add
+@@ -1818,14 +2341,14 @@
+ */
+ struct dx_root *root;
+ u8 indirects;
+- struct dx_frame *frames;
++ struct iam_frame *frames;
+
+- frames = path->dp_frames;
++ frames = path->ip_frames;
+ root = (struct dx_root *) frames->bh->b_data;
+ indirects = root->info.indirect_levels;
+ dxtrace(printk("Creating new root %d\n", indirects));
+ memcpy((char *) entries2, (char *) entries,
+- count * dx_entry_size(path));
++ count * iam_entry_size(path));
+ dx_set_limit(entries2, dx_node_limit(path));
+
+ /* Set up root */
+@@ -1835,9 +2358,9 @@
+
+ /* Shift frames in the path */
+ memmove(frames + 2, frames + 1,
+- (sizeof path->dp_frames) - 2 * sizeof frames[0]);
++ (sizeof path->ip_frames) - 2 * sizeof frames[0]);
+ /* Add new access path frame */
+- frames[1].at = dx_entry_shift(path, entries2, idx);
++ frames[1].at = iam_entry_shift(path, entries2, idx);
+ frames[1].entries = entries = entries2;
+ frames[1].bh = bh2;
+ assert(dx_node_check(path, frame));
+@@ -1853,22 +2376,22 @@
+ unsigned hash2;
+
+ dx_get_key(path,
+- dx_entry_shift(path, entries, count1),
+- (struct dx_key *)&hash2);
++ iam_entry_shift(path, entries, count1),
++ (struct iam_key *)&hash2);
+
+ dxtrace(printk("Split index %i/%i\n", count1, count2));
+
+ memcpy ((char *) entries2,
+- (char *) dx_entry_shift(path, entries, count1),
+- count2 * dx_entry_size(path));
++ (char *) iam_entry_shift(path, entries, count1),
++ count2 * iam_entry_size(path));
+ dx_set_count (entries, count1);
+ dx_set_count (entries2, count2);
+ dx_set_limit (entries2, dx_node_limit(path));
+
+ /* Which index block gets the new entry? */
+ if (idx >= count1) {
+- frame->at = dx_entry_shift(path, entries2,
+- idx - count1);
++ frame->at = iam_entry_shift(path, entries2,
++ idx - count1);
+ frame->entries = entries = entries2;
+ swap(frame->bh, bh2);
+ bh_new[i] = bh2;
+@@ -1903,7 +2426,7 @@
+ }
+ if (err)
+ inode->i_size = isize;
+- dx_path_fini(path);
++ iam_path_fini(path);
+ return err;
+ }
+ #endif
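
Since key and pointer sizes are container-defined, every entry access in the
patch above reduces to byte arithmetic scaled by the entry size. A stand-alone
userspace model of that arithmetic (a sketch with assumed names, mirroring
iam_entry_size(), iam_entry_shift() and dx_node_limit(); not the kernel code
itself):

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct descr {
	size_t key_size;	/* id_key_size */
	size_t ptr_size;	/* id_ptr_size */
	size_t node_gap;	/* id_node_gap */
};

static size_t entry_size(const struct descr *d)
{
	/* a key immediately followed by a pointer, no padding */
	return d->key_size + d->ptr_size;
}

static void *entry_shift(const struct descr *d, void *entry, int shift)
{
	return (char *)entry + shift * (int)entry_size(d);
}

static unsigned node_limit(const struct descr *d, unsigned blocksize)
{
	/* usable space after the compatibility gap, in whole entries */
	return (blocksize - d->node_gap) / entry_size(d);
}

int main(void)
{
	/* htree-compatible flavor: 4-byte hash key, 4-byte block pointer,
	 * 8-byte fake dirent gap at the start of non-root nodes */
	struct descr htree = { 4, 4, 8 };
	char block[4096];

	printf("entries per 4k node: %u\n", node_limit(&htree, sizeof block));
	assert(entry_shift(&htree, block, 3) == (void *)(block + 24));
	return 0;
}

With the compat geometry this prints 511, which is what dx_node_limit() yields
for a stock htree with 4096-byte blocks.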
--- /dev/null
+Index: iam-src/fs/ext3/namei.c
+===================================================================
+--- iam-src.orig/fs/ext3/namei.c 2006-02-15 18:31:48.000000000 +0300
++++ iam-src/fs/ext3/namei.c 2006-02-15 21:25:34.000000000 +0300
+@@ -51,7 +51,10 @@
+ /*
+ * Maximal number of non-leaf levels in htree. In the stock ext3 this is 2.
+ */
+-#define DX_MAX_TREE_HEIGHT (5)
++enum {
++ DX_MAX_TREE_HEIGHT = 5,
++ DX_SCRATCH_KEYS = 2
++};
+
+ static struct buffer_head *ext3_append(handle_t *handle,
+ struct inode *inode,
+@@ -83,22 +86,22 @@ static struct buffer_head *ext3_append(h
+ #define dxtrace(command)
+ #endif
+
+-struct fake_dirent
+-{
++struct fake_dirent {
+ __le32 inode;
+ __le16 rec_len;
+ u8 name_len;
+ u8 file_type;
+ };
+
+-struct dx_countlimit
+-{
++struct dx_countlimit {
+ __le16 limit;
+ __le16 count;
+ };
+
+-struct dx_entry
+-{
++struct dx_entry; /* incomplete type */
++struct dx_key; /* incomplete type */
++
++struct dx_entry_compat {
+ __le32 hash;
+ __le32 block;
+ };
+@@ -109,8 +112,7 @@ struct dx_entry
+ * hash version mod 4 should never be 0. Sincerely, the paranoia department.
+ */
+
+-struct dx_root
+-{
++struct dx_root {
+ struct fake_dirent dot;
+ char dot_name[4];
+ struct fake_dirent dotdot;
+@@ -124,13 +126,13 @@ struct dx_root
+ u8 unused_flags;
+ }
+ info;
+- struct dx_entry entries[0];
++ struct {} entries[0];
+ };
+
+ struct dx_node
+ {
+ struct fake_dirent fake;
+- struct dx_entry entries[0];
++ struct {} entries[0];
+ };
+
+
+@@ -147,38 +149,88 @@ struct dx_map_entry
+ u32 offs;
+ };
+
++struct dx_path;
++struct dx_param {
++ size_t dpo_key_size;
++ size_t dpo_ptr_size;
++ size_t dpo_node_gap;
++ size_t dpo_root_gap;
++
++ u32 (*dpo_root_ptr)(struct dx_path *path);
++ int (*dpo_node_check)(struct dx_path *path,
++ struct dx_frame *frame, void *cookie);
++ int (*dpo_node_init)(struct dx_path *path,
++ struct buffer_head *bh, int root);
++ int (*dpo_keycmp)(struct dx_path *path,
++ struct dx_key *k1, struct dx_key *k2);
++};
++
+ /*
+ * Structure to keep track of a path drilled through htree.
+ */
+ struct dx_path {
+- struct inode *dp_object;
+- struct dx_frame dp_frames[DX_MAX_TREE_HEIGHT];
+- struct dx_frame *dp_frame;
++ struct inode *dp_object;
++ struct dx_param *dp_param;
++ int dp_indirect;
++ struct dx_frame dp_frames[DX_MAX_TREE_HEIGHT];
++ struct dx_frame *dp_frame;
++ struct dx_key *dp_key_target;
++ struct dx_key *dp_key_scratch[DX_SCRATCH_KEYS];
++};
++
++struct dx_path_compat {
++ struct dx_path dpc_path;
++ __u32 dpc_scrach[DX_SCRATCH_KEYS];
+ };
+
++static u32 htree_root_ptr(struct dx_path *p);
++static int htree_node_check(struct dx_path *path,
++ struct dx_frame *frame, void *cookie);
++static int htree_node_init(struct dx_path *path,
++ struct buffer_head *bh, int root);
++static int htree_keycmp(struct dx_path *path,
++ struct dx_key *k1, struct dx_key *k2);
++
++static struct dx_param htree_compat_param = {
++ .dpo_key_size = sizeof ((struct dx_map_entry *)NULL)->hash,
++ .dpo_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs,
++ .dpo_node_gap = offsetof(struct dx_node, entries),
++ .dpo_root_gap = offsetof(struct dx_root, entries),
++
++ .dpo_root_ptr = htree_root_ptr,
++ .dpo_node_check = htree_node_check,
++ .dpo_node_init = htree_node_init,
++ .dpo_keycmp = htree_keycmp
++};
++
++
+ #ifdef CONFIG_EXT3_INDEX
+-static inline unsigned dx_get_block (struct dx_entry *entry);
+-static void dx_set_block (struct dx_entry *entry, unsigned value);
+-static inline unsigned dx_get_hash (struct dx_entry *entry);
+-static void dx_set_hash (struct dx_entry *entry, unsigned value);
+-static unsigned dx_get_count (struct dx_entry *entries);
+-static unsigned dx_get_limit (struct dx_entry *entries);
+-static void dx_set_count (struct dx_entry *entries, unsigned value);
+-static void dx_set_limit (struct dx_entry *entries, unsigned value);
+-static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
+-static unsigned dx_node_limit (struct inode *dir);
+-static struct dx_frame *dx_probe(struct dentry *dentry,
+- struct inode *dir,
+- struct dx_hash_info *hinfo,
+- struct dx_path *path,
+- int *err);
++static inline unsigned dx_get_block(struct dx_path *p, struct dx_entry *entry);
++static void dx_set_block(struct dx_path *p,
++ struct dx_entry *entry, unsigned value);
++static inline struct dx_key *dx_get_key(struct dx_path *p,
++ struct dx_entry *entry,
++ struct dx_key *key);
++static void dx_set_key(struct dx_path *p, struct dx_entry *entry,
++ struct dx_key *key);
++static unsigned dx_get_count(struct dx_entry *entries);
++static unsigned dx_get_limit(struct dx_entry *entries);
++static void dx_set_count(struct dx_entry *entries, unsigned value);
++static void dx_set_limit(struct dx_entry *entries, unsigned value);
++static unsigned dx_root_limit(struct dx_path *p);
++static unsigned dx_node_limit(struct dx_path *p);
++static int dx_probe(struct dentry *dentry,
++ struct inode *dir,
++ struct dx_hash_info *hinfo,
++ struct dx_path *path);
+ static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
+ struct dx_hash_info *hinfo, struct dx_map_entry map[]);
+ static void dx_sort_map(struct dx_map_entry *map, unsigned count);
+ static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
+ struct dx_map_entry *offsets, int count);
+ static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+-static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
++static void dx_insert_block (struct dx_path *path,
++ struct dx_frame *frame, u32 hash, u32 block);
+ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+ struct dx_path *path, __u32 *start_hash);
+ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
+@@ -186,29 +238,72 @@ static struct buffer_head * ext3_dx_find
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode);
+
++static inline void dx_path_init(struct dx_path *path, struct inode *inode);
++static inline void dx_path_fini(struct dx_path *path);
++
++
+ /*
+ * Future: use high four bits of block for coalesce-on-delete flags
+ * Mask them off for now.
+ */
+
+-static inline unsigned dx_get_block (struct dx_entry *entry)
++static inline void *entry_off(struct dx_entry *entry, ptrdiff_t off)
+ {
+- return le32_to_cpu(entry->block) & 0x00ffffff;
++ return (void *)((char *)entry + off);
+ }
+
+-static inline void dx_set_block (struct dx_entry *entry, unsigned value)
++static inline size_t dx_entry_size(struct dx_path *p)
+ {
+- entry->block = cpu_to_le32(value);
++ return p->dp_param->dpo_key_size + p->dp_param->dpo_ptr_size;
+ }
+
+-static inline unsigned dx_get_hash (struct dx_entry *entry)
++static inline struct dx_entry *dx_entry_shift(struct dx_path *p,
++ struct dx_entry *entry, int shift)
+ {
+- return le32_to_cpu(entry->hash);
++ void *e = entry;
++ return e + shift * dx_entry_size(p);
++}
++
++static inline ptrdiff_t dx_entry_diff(struct dx_path *p,
++ struct dx_entry *e1, struct dx_entry *e2)
++{
++ ptrdiff_t diff;
++
++ diff = (void *)e1 - (void *)e2;
++ assert(diff / dx_entry_size(p) * dx_entry_size(p) == diff);
++ return diff / dx_entry_size(p);
+ }
+
+-static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
++static inline unsigned dx_get_block(struct dx_path *p, struct dx_entry *entry)
+ {
+- entry->hash = cpu_to_le32(value);
++ return le32_to_cpu(*(u32 *)entry_off(entry, p->dp_param->dpo_key_size))
++ & 0x00ffffff;
++}
++
++static inline void dx_set_block(struct dx_path *p,
++ struct dx_entry *entry, unsigned value)
++{
++ *(u32*)entry_off(entry, p->dp_param->dpo_key_size) = cpu_to_le32(value);
++}
++
++static inline struct dx_key *dx_get_key(struct dx_path *p,
++ struct dx_entry *entry,
++ struct dx_key *key)
++{
++ memcpy(key, entry, p->dp_param->dpo_key_size);
++ return key;
++}
++
++static inline struct dx_key *dx_key_at(struct dx_path *p,
++ struct dx_entry *entry)
++{
++ return (struct dx_key *)entry;
++}
++
++static inline void dx_set_key(struct dx_path *p,
++ struct dx_entry *entry, struct dx_key *key)
++{
++ memcpy(entry, key, p->dp_param->dpo_key_size);
+ }
+
+ static inline unsigned dx_get_count (struct dx_entry *entries)
+@@ -231,17 +326,163 @@ static inline void dx_set_limit (struct
+ ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
+ }
+
+-static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
++static inline unsigned dx_root_limit(struct dx_path *p)
++{
++ struct dx_param *param = p->dp_param;
++ unsigned entry_space = p->dp_object->i_sb->s_blocksize -
++ param->dpo_root_gap;
++ return entry_space / (param->dpo_key_size + param->dpo_ptr_size);
++}
++
++static inline unsigned dx_node_limit(struct dx_path *p)
++{
++ struct dx_param *param = p->dp_param;
++ unsigned entry_space = p->dp_object->i_sb->s_blocksize -
++ param->dpo_node_gap;
++ return entry_space / (param->dpo_key_size + param->dpo_ptr_size);
++}
++
++static inline int dx_index_is_compat(struct dx_path *path)
++{
++ return path->dp_param == &htree_compat_param;
++}
++
++static struct dx_entry *dx_get_entries(struct dx_path *path, void *data,
++ int root)
++{
++ return data +
++ (root ?
++ path->dp_param->dpo_root_gap : path->dp_param->dpo_node_gap);
++}
++
++static struct dx_entry *dx_node_get_entries(struct dx_path *path,
++ struct dx_frame *frame)
++{
++ return dx_get_entries(path,
++ frame->bh->b_data, frame == path->dp_frames);
++}
++
++static inline struct dx_key *keycpy(struct dx_path *p,
++ struct dx_key *k1, struct dx_key *k2)
++{
++ return memcpy(k1, k2, p->dp_param->dpo_key_size);
++}
++
++static inline int keycmp(struct dx_path *p,
++ struct dx_key *k1, struct dx_key *k2)
++{
++ return p->dp_param->dpo_keycmp(p, k1, k2);
++}
++
++static int dx_node_check(struct dx_path *p, struct dx_frame *f)
++{
++ struct dx_entry *e;
++ unsigned count;
++ unsigned i;
++
++ e = dx_node_get_entries(p, f);
++ count = dx_get_count(e);
++ e = dx_entry_shift(p, e, 1);
++ for (i = 0; i < count - 1; ++i, e = dx_entry_shift(p, e, 1)) {
++ keycpy(p, p->dp_key_scratch[0], p->dp_key_scratch[1]);
++ dx_get_key(p, e, p->dp_key_scratch[1]);
++ if (i > 0 &&
++ keycmp(p, p->dp_key_scratch[0], p->dp_key_scratch[1]) > 0)
++ return 0;
++ }
++ return 1;
++}
++
++static u32 htree_root_ptr(struct dx_path *path)
++{
++ return 0;
++}
++
++struct htree_cookie {
++ struct dx_hash_info *hinfo;
++ struct dentry *dentry;
++};
++
++static int htree_node_check(struct dx_path *path, struct dx_frame *frame,
++ void *cookie)
++{
++ void *data;
++ struct dx_entry *entries;
++ struct super_block *sb;
++
++ data = frame->bh->b_data;
++ entries = dx_node_get_entries(path, frame);
++ sb = path->dp_object->i_sb;
++ if (frame == path->dp_frames) {
++ /* root node */
++ struct dx_root *root;
++ struct htree_cookie *hc = cookie;
++
++ root = data;
++ if (root->info.hash_version != DX_HASH_TEA &&
++ root->info.hash_version != DX_HASH_HALF_MD4 &&
++ root->info.hash_version != DX_HASH_R5 &&
++ root->info.hash_version != DX_HASH_LEGACY) {
++ ext3_warning(sb, __FUNCTION__,
++ "Unrecognised inode hash code %d",
++ root->info.hash_version);
++ return ERR_BAD_DX_DIR;
++ }
++
++ if (root->info.unused_flags & 1) {
++ ext3_warning(sb, __FUNCTION__,
++ "Unimplemented inode hash flags: %#06x",
++ root->info.unused_flags);
++ return ERR_BAD_DX_DIR;
++ }
++
++ path->dp_indirect = root->info.indirect_levels;
++ if (path->dp_indirect > DX_MAX_TREE_HEIGHT - 1) {
++ ext3_warning(sb, __FUNCTION__,
++ "Unimplemented inode hash depth: %#06x",
++ root->info.indirect_levels);
++ return ERR_BAD_DX_DIR;
++ }
++
++ assert((char *)entries == (((char *)&root->info) +
++ root->info.info_length));
++ assert(dx_get_limit(entries) == dx_root_limit(path));
++
++ hc->hinfo->hash_version = root->info.hash_version;
++ hc->hinfo->seed = EXT3_SB(sb)->s_hash_seed;
++ if (hc->dentry)
++ ext3fs_dirhash(hc->dentry->d_name.name,
++ hc->dentry->d_name.len, hc->hinfo);
++ path->dp_key_target = (struct dx_key *)&hc->hinfo->hash;
++ } else {
++ /* non-root index */
++ assert(entries == data + path->dp_param->dpo_node_gap);
++ assert(dx_get_limit(entries) == dx_node_limit(path));
++ }
++ frame->entries = frame->at = entries;
++ return 0;
++}
++
++static int htree_node_init(struct dx_path *path,
++ struct buffer_head *bh, int root)
+ {
+- unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
+- EXT3_DIR_REC_LEN(2) - infosize;
+- return 0? 20: entry_space / sizeof(struct dx_entry);
++ struct dx_node *node;
++
++ assert(!root);
++
++ node = (void *)bh->b_data;
++ node->fake.rec_len = cpu_to_le16(path->dp_object->i_sb->s_blocksize);
++ node->fake.inode = 0;
++ return 0;
+ }
+
+-static inline unsigned dx_node_limit (struct inode *dir)
++static int htree_keycmp(struct dx_path *path,
++ struct dx_key *k1, struct dx_key *k2)
+ {
+- unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
+- return 0? 22: entry_space / sizeof(struct dx_entry);
++ __u32 p1 = le32_to_cpu(*(__u32 *)k1);
++ __u32 p2 = le32_to_cpu(*(__u32 *)k2);
++
++ return p1 > p2 ? +1 : (p1 < p2 ? -1 : 0);
+ }
+
+ /*
+@@ -327,123 +568,105 @@ struct stats dx_show_entries(struct dx_h
+ }
+ #endif /* DX_DEBUG */
+
+-/*
+- * Probe for a directory leaf block to search.
+- *
+- * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
+- * error in the directory index, and the caller should fall back to
+- * searching the directory normally. The callers of dx_probe **MUST**
+- * check for this error code, and make sure it never gets reflected
+- * back to userspace.
+- */
+-static struct dx_frame *
+-dx_probe(struct dentry *dentry, struct inode *dir,
+- struct dx_hash_info *hinfo, struct dx_path *path, int *err)
+-{
+- unsigned count, indirect;
+- struct dx_entry *at, *entries, *p, *q, *m;
+- struct dx_root *root;
+- struct buffer_head *bh;
+- struct dx_frame *frame = path->dp_frames;
+- u32 hash;
++static int dx_lookup(struct dx_path *path, void *cookie)
++{
++ u32 ptr;
++ int err;
++ int i;
+
+- frame->bh = NULL;
+- if (dentry)
+- dir = dentry->d_parent->d_inode;
+- if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
+- goto fail;
+- root = (struct dx_root *) bh->b_data;
+- if (root->info.hash_version != DX_HASH_TEA &&
+- root->info.hash_version != DX_HASH_HALF_MD4 &&
+- root->info.hash_version != DX_HASH_R5 &&
+- root->info.hash_version != DX_HASH_LEGACY) {
+- ext3_warning(dir->i_sb, __FUNCTION__,
+- "Unrecognised inode hash code %d", root->info.hash_version);
+- brelse(bh);
+- *err = ERR_BAD_DX_DIR;
+- goto fail;
+- }
+- hinfo->hash_version = root->info.hash_version;
+- hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+- if (dentry)
+- ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
+- hash = hinfo->hash;
+-
+- if (root->info.unused_flags & 1) {
+- ext3_warning(dir->i_sb, __FUNCTION__,
+- "Unimplemented inode hash flags: %#06x",
+- root->info.unused_flags);
+- brelse(bh);
+- *err = ERR_BAD_DX_DIR;
+- goto fail;
+- }
++ struct dx_param *param;
++ struct dx_frame *frame;
+
+- if ((indirect = root->info.indirect_levels) > DX_MAX_TREE_HEIGHT - 1) {
+- ext3_warning(dir->i_sb, __FUNCTION__,
+- "Unimplemented inode hash depth: %#06x",
+- root->info.indirect_levels);
+- brelse(bh);
+- *err = ERR_BAD_DX_DIR;
+- goto fail;
+- }
++ param = path->dp_param;
+
+- entries = (struct dx_entry *) (((char *)&root->info) +
+- root->info.info_length);
+- assert(dx_get_limit(entries) == dx_root_limit(dir,
+- root->info.info_length));
+- dxtrace (printk("Look up %x", hash));
+- while (1)
+- {
++ for (frame = path->dp_frames, i = 0,
++ ptr = param->dpo_root_ptr(path); i <= path->dp_indirect;
++ ptr = dx_get_block(path, frame->at), ++frame, ++i) {
++ struct dx_entry *entries;
++ struct dx_entry *p;
++ struct dx_entry *q;
++ struct dx_entry *m;
++ unsigned count;
++
++ frame->bh = ext3_bread(NULL, path->dp_object, ptr, 0, &err);
++ if (frame->bh == NULL) {
++ err = -EIO;
++ break;
++ }
++ err = param->dpo_node_check(path, frame, cookie);
++ if (err != 0)
++ break;
++
++ assert(dx_node_check(path, frame));
++
++ entries = frame->entries;
+ count = dx_get_count(entries);
+- assert (count && count <= dx_get_limit(entries));
+- p = entries + 1;
+- q = entries + count - 1;
+- while (p <= q)
+- {
+- m = p + (q - p)/2;
++ assert(count && count <= dx_get_limit(entries));
++ p = dx_entry_shift(path, entries, 1);
++ q = dx_entry_shift(path, entries, count - 1);
++ while (p <= q) {
++ m = dx_entry_shift(path,
++ p, dx_entry_diff(path, q, p) / 2);
+ dxtrace(printk("."));
+- if (dx_get_hash(m) > hash)
+- q = m - 1;
++ if (keycmp(path, dx_key_at(path, m),
++ path->dp_key_target) > 0)
++ q = dx_entry_shift(path, m, -1);
+ else
+- p = m + 1;
++ p = dx_entry_shift(path, m, +1);
+ }
+
+- if (0) // linear search cross check
+- {
++ frame->at = dx_entry_shift(path, p, -1);
++ if (1) { // linear search cross check
+ unsigned n = count - 1;
++ struct dx_entry *at;
++
+ at = entries;
+- while (n--)
+- {
++ while (n--) {
+ dxtrace(printk(","));
+- if (dx_get_hash(++at) > hash)
+- {
+- at--;
++ at = dx_entry_shift(path, at, +1);
++ if (keycmp(path, dx_key_at(path, at),
++ path->dp_key_target) > 0) {
++ if (at != dx_entry_shift(path, frame->at, 1)) {
++ BREAKPOINT;
++ printk(KERN_EMERG "%i\n",
++ keycmp(path, dx_key_at(path, at),
++ path->dp_key_target));
++ }
++ at = dx_entry_shift(path, at, -1);
+ break;
+ }
+ }
+- assert (at == p - 1);
++ assert(at == frame->at);
+ }
+-
+- at = p - 1;
+- dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
+- frame->bh = bh;
+- frame->entries = entries;
+- frame->at = at;
+- if (!indirect--)
+- return path->dp_frame = frame;
+- if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
+- goto fail2;
+- at = entries = ((struct dx_node *) bh->b_data)->entries;
+- assert (dx_get_limit(entries) == dx_node_limit (dir));
+- frame++;
+- }
+-fail2:
+- while (frame >= path->dp_frames) {
+- brelse(frame->bh);
+- frame--;
+ }
+-fail:
+- return NULL;
++ if (err != 0)
++ dx_path_fini(path);
++ path->dp_frame = --frame;
++ return err;
++}
++
++/*
++ * Probe for a directory leaf block to search.
++ *
++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
++ * error in the directory index, and the caller should fall back to
++ * searching the directory normally. The callers of dx_probe **MUST**
++ * check for this error code, and make sure it never gets reflected
++ * back to userspace.
++ */
++static int dx_probe(struct dentry *dentry, struct inode *dir,
++ struct dx_hash_info *hinfo, struct dx_path *path)
++{
++ int err;
++ struct htree_cookie hc = {
++ .dentry = dentry,
++ .hinfo = hinfo
++ };
++
++ assert(dx_index_is_compat(path));
++ err = dx_lookup(path, &hc);
++ assert(err != 0 || path->dp_frames[path->dp_indirect].bh != NULL);
++ return err;
+ }
+
+ static inline void dx_path_init(struct dx_path *path, struct inode *inode)
+@@ -458,11 +681,24 @@ static inline void dx_path_fini(struct d
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(path->dp_frames); i++) {
+- if (path->dp_frames[i].bh != NULL)
++ if (path->dp_frames[i].bh != NULL) {
+ brelse(path->dp_frames[i].bh);
++ path->dp_frames[i].bh = NULL;
++ }
+ }
+ }
+
++static void dx_path_compat_init(struct dx_path_compat *path,
++ struct inode *inode)
++{
++ int i;
++ dx_path_init(&path->dpc_path, inode);
++ path->dpc_path.dp_param = &htree_compat_param;
++ for (i = 0; i < ARRAY_SIZE(path->dpc_path.dp_key_scratch); ++i)
++ path->dpc_path.dp_key_scratch[i] =
++ (struct dx_key *)&path->dpc_scrach[i];
++}
++
+ /*
+ * This function increments the frame pointer to search the next leaf
+ * block, and reads in the necessary intervening nodes if the search
+@@ -488,6 +724,8 @@ static int ext3_htree_next_block(struct
+ int err, num_frames = 0;
+ __u32 bhash;
+
++ assert(dx_index_is_compat(path));
++
+ p = path->dp_frame;
+ /*
+ * Find the next leaf page by incrementing the frame pointer.
+@@ -497,7 +735,9 @@ static int ext3_htree_next_block(struct
+ * nodes need to be read.
+ */
+ while (1) {
+- if (++(p->at) < p->entries + dx_get_count(p->entries))
++ p->at = dx_entry_shift(path, p->at, +1);
++ if (p->at < dx_entry_shift(path, p->entries,
++ dx_get_count(p->entries)))
+ break;
+ if (p == path->dp_frames)
+ return 0;
+@@ -512,7 +752,7 @@ static int ext3_htree_next_block(struct
+ * desired contiuation hash. If it doesn't, return since
+ * there's no point to read in the successive index pages.
+ */
+- bhash = dx_get_hash(p->at);
++ dx_get_key(path, p->at, (struct dx_key *)&bhash);
+ if (start_hash)
+ *start_hash = bhash;
+ if ((hash & 1) == 0) {
+@@ -524,12 +764,14 @@ static int ext3_htree_next_block(struct
+ * block so no check is necessary
+ */
+ while (num_frames--) {
+- if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), 0, &err)))
++ if (!(bh = ext3_bread(NULL, dir,
++ dx_get_block(path, p->at), 0, &err)))
+ return err; /* Failure */
+ ++p;
+ brelse (p->bh);
+ p->bh = bh;
+- p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
++ p->at = p->entries = dx_node_get_entries(path, p);
++ assert(dx_node_check(path, p));
+ }
+ return 1;
+ }
+@@ -598,7 +840,8 @@ int ext3_htree_fill_tree(struct file *di
+ {
+ struct dx_hash_info hinfo;
+ struct ext3_dir_entry_2 *de;
+- struct dx_path path;
++ struct dx_path_compat cpath;
++ struct dx_path *path = &cpath.dpc_path;
+ struct inode *dir;
+ int block, err;
+ int count = 0;
+@@ -608,7 +851,7 @@ int ext3_htree_fill_tree(struct file *di
+ dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
+ start_minor_hash));
+ dir = dir_file->f_dentry->d_inode;
+- dx_path_init(&path, dir);
++ dx_path_compat_init(&cpath, dir);
+ if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
+ hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+ hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+@@ -619,12 +862,13 @@ int ext3_htree_fill_tree(struct file *di
+ }
+ hinfo.hash = start_hash;
+ hinfo.minor_hash = 0;
+- if (!dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, &path, &err))
++ err = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, path);
++ if (err != 0)
+ return err;
+
+ /* Add '.' and '..' from the htree header */
+ if (!start_hash && !start_minor_hash) {
+- de = (struct ext3_dir_entry_2 *) path.dp_frames[0].bh->b_data;
++ de = (struct ext3_dir_entry_2 *) path->dp_frames[0].bh->b_data;
+ if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
+ goto errout;
+ de = ext3_next_entry(de);
+@@ -634,7 +878,7 @@ int ext3_htree_fill_tree(struct file *di
+ }
+
+ while (1) {
+- block = dx_get_block(path.dp_frame->at);
++ block = dx_get_block(path, path->dp_frame->at);
+ ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
+ start_hash, start_minor_hash);
+ if (ret < 0) {
+@@ -643,7 +887,8 @@ int ext3_htree_fill_tree(struct file *di
+ }
+ count += ret;
+ hashval = ~0;
+- ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, &path, &hashval);
++ ret = ext3_htree_next_block(dir,
++ HASH_NB_ALWAYS, path, &hashval);
+ *next_hash = hashval;
+ if (ret < 0) {
+ err = ret;
+@@ -658,12 +903,12 @@ int ext3_htree_fill_tree(struct file *di
+ (count && ((hashval & 1) == 0)))
+ break;
+ }
+- dx_path_fini(&path);
++ dx_path_fini(path);
+ dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
+ count, *next_hash));
+ return count;
+ errout:
+- dx_path_fini(&path);
++ dx_path_fini(path);
+ return (err);
+ }
+
+@@ -722,17 +967,19 @@ static void dx_sort_map (struct dx_map_e
+ } while(more);
+ }
+
+-static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
++static void dx_insert_block(struct dx_path *path,
++ struct dx_frame *frame, u32 hash, u32 block)
+ {
+ struct dx_entry *entries = frame->entries;
+- struct dx_entry *old = frame->at, *new = old + 1;
++ struct dx_entry *old = frame->at, *new = dx_entry_shift(path, old, +1);
+ int count = dx_get_count(entries);
+
+ assert(count < dx_get_limit(entries));
+- assert(old < entries + count);
+- memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
+- dx_set_hash(new, hash);
+- dx_set_block(new, block);
++ assert(old < dx_entry_shift(path, entries, count));
++ memmove(dx_entry_shift(path, new, 1), new,
++ (char *)dx_entry_shift(path, entries, count) - (char *)new);
++ dx_set_key(path, new, (struct dx_key *)&hash);
++ dx_set_block(path, new, block);
+ dx_set_count(entries, count + 1);
+ }
+ #endif
+@@ -933,8 +1180,11 @@ static struct buffer_head * ext3_dx_find
+ struct super_block * sb;
+ struct dx_hash_info hinfo;
+ u32 hash;
+- struct dx_path path;
+- struct dx_entry dummy_dot;
++ struct dx_path_compat cpath;
++ struct dx_path *path = &cpath.dpc_path;
++ struct dx_entry_compat dummy_dot = {
++ .block = 0
++ };
+ struct ext3_dir_entry_2 *de, *top;
+ struct buffer_head *bh;
+ unsigned long block;
+@@ -943,20 +1193,21 @@ static struct buffer_head * ext3_dx_find
+ const u8 *name = dentry->d_name.name;
+ struct inode *dir = dentry->d_parent->d_inode;
+
+- dx_path_init(&path, dir);
++ dx_path_compat_init(&cpath, dir);
++
+ sb = dir->i_sb;
+ /* NFS may look up ".." - look at dx_root directory block */
+ if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
+- if (!(dx_probe(dentry, NULL, &hinfo, &path, err)))
++ *err = dx_probe(dentry, NULL, &hinfo, path);
++ if (*err != 0)
+ return NULL;
+ } else {
+- path.dp_frame->bh = NULL; /* for dx_path_fini() */
+- path.dp_frame->at = &dummy_dot; /* hack for zero entry*/
+- dx_set_block(path.dp_frame->at, 0); /* dx_root block is 0 */
++ path->dp_frame->bh = NULL; /* for dx_path_fini() */
++ path->dp_frame->at = (void *)&dummy_dot;/* hack for zero entry*/
+ }
+ hash = hinfo.hash;
+ do {
+- block = dx_get_block(path.dp_frame->at);
++ block = dx_get_block(path, path->dp_frame->at);
+ if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
+ goto errout;
+ de = (struct ext3_dir_entry_2 *) bh->b_data;
+@@ -972,12 +1223,12 @@ static struct buffer_head * ext3_dx_find
+ goto errout;
+ }
+ *res_dir = de;
+- dx_path_fini(&path);
++ dx_path_fini(path);
+ return bh;
+ }
+ brelse (bh);
+ /* Check to see if we should continue to search */
+- retval = ext3_htree_next_block(dir, hash, &path, NULL);
++ retval = ext3_htree_next_block(dir, hash, path, NULL);
+ if (retval < 0) {
+ ext3_warning(sb, __FUNCTION__,
+ "error reading index page in directory #%lu",
+@@ -990,7 +1241,7 @@ static struct buffer_head * ext3_dx_find
+ *err = -ENOENT;
+ errout:
+ dxtrace(printk("%s not found\n", name));
+- dx_path_fini(&path);
++ dx_path_fini(path);
+ return NULL;
+ }
+ #endif
+@@ -1115,10 +1366,11 @@ static struct ext3_dir_entry_2* dx_pack_
+
+ /* Allocate new node, and split leaf node @bh into it, inserting new pointer
+ * into parent node identified by @frame */
+-static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
++static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct dx_path *path,
+ struct buffer_head **bh,struct dx_frame *frame,
+ struct dx_hash_info *hinfo, int *error)
+ {
++ struct inode *dir = path->dp_object;
+ unsigned blocksize = dir->i_sb->s_blocksize;
+ unsigned count, continued;
+ struct buffer_head *bh2;
+@@ -1180,7 +1432,7 @@ static struct ext3_dir_entry_2 *do_split
+ swap(*bh, bh2);
+ de = de2;
+ }
+- dx_insert_block (frame, hash2 + continued, newblock);
++ dx_insert_block(path, frame, hash2 + continued, newblock);
+ err = ext3_journal_dirty_metadata (handle, bh2);
+ if (err)
+ goto journal_error;
+@@ -1303,7 +1555,8 @@ static int make_indexed_dir(handle_t *ha
+ int namelen = dentry->d_name.len;
+ struct buffer_head *bh2;
+ struct dx_root *root;
+- struct dx_path path;
++ struct dx_path_compat cpath;
++ struct dx_path *path = &cpath.dpc_path;
+ struct dx_entry *entries;
+ struct ext3_dir_entry_2 *de, *de2;
+ char *data1, *top;
+@@ -1314,7 +1567,7 @@ static int make_indexed_dir(handle_t *ha
+ u32 block;
+ struct fake_dirent *fde;
+
+- dx_path_init(&path, dir);
++ dx_path_compat_init(&cpath, dir);
+ blocksize = dir->i_sb->s_blocksize;
+ dxtrace(printk("Creating index\n"));
+ retval = ext3_journal_get_write_access(handle, bh);
+@@ -1350,21 +1603,21 @@ static int make_indexed_dir(handle_t *ha
+ root->info.info_length = sizeof(root->info);
+ root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+ root->info.hash_version = DX_HASH_R5;
+- entries = root->entries;
+- dx_set_block (entries, 1);
++ entries = (void *)root->entries;
++ dx_set_block (path, entries, 1);
+ dx_set_count (entries, 1);
+- dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
++ dx_set_limit (entries, dx_root_limit(path));
+
+ /* Initialize as for dx_probe */
+ hinfo.hash_version = root->info.hash_version;
+ hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+ ext3fs_dirhash(name, namelen, &hinfo);
+- path.dp_frame->entries = entries;
+- path.dp_frame->at = entries;
+- path.dp_frame->bh = bh;
++ path->dp_frame->entries = entries;
++ path->dp_frame->at = entries;
++ path->dp_frame->bh = bh;
+ bh = bh2;
+- de = do_split(handle,dir, &bh, path.dp_frame, &hinfo, &retval);
+- dx_path_fini(&path);
++ de = do_split(handle, path, &bh, path->dp_frame, &hinfo, &retval);
++ dx_path_fini(path);
+ if (!de)
+ return retval;
+
+@@ -1445,9 +1698,10 @@ static int ext3_add_entry (handle_t *han
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+ {
+- struct dx_path path;
++ struct dx_path_compat cpath;
++ struct dx_path *path = &cpath.dpc_path;
++ struct dx_param *param;
+ struct dx_frame *frame, *safe;
+- struct dx_node *node2;
+ struct dx_entry *entries; /* old block contents */
+ struct dx_entry *entries2; /* new block contents */
+ struct dx_hash_info hinfo;
+@@ -1462,16 +1716,20 @@ static int ext3_dx_add_entry(handle_t *h
+ int i;
+ size_t isize;
+
+- dx_path_init(&path, dir);
+- if (!dx_probe(dentry, NULL, &hinfo, &path, &err))
++ dx_path_compat_init(&cpath, dir);
++ param = path->dp_param;
++
++ err = dx_probe(dentry, NULL, &hinfo, path);
++ if (err != 0)
+ return err;
+- frame = path.dp_frame;
++ frame = path->dp_frame;
+ entries = frame->entries;
+
+ /* XXX nikita: global serialization! */
+ isize = dir->i_size;
+
+- if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
++ if (!(bh = ext3_bread(handle, dir,
++ dx_get_block(path, frame->at), 0, &err)))
+ goto cleanup;
+
+ BUFFER_TRACE(bh, "get_write_access");
+@@ -1503,7 +1761,7 @@ static int ext3_dx_add_entry(handle_t *h
+ dx_get_count(entries), dx_get_limit(entries)));
+
+ /* What levels need split? */
+- for (nr_splet = 0; frame >= path.dp_frames &&
++ for (nr_splet = 0; frame >= path->dp_frames &&
+ dx_get_count(frame->entries) == dx_get_limit(frame->entries);
+ --frame, ++nr_splet) {
+ if (nr_splet == DX_MAX_TREE_HEIGHT) {
+@@ -1519,19 +1777,16 @@ static int ext3_dx_add_entry(handle_t *h
+ * transaction... */
+ for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
+ bh_new[i] = ext3_append (handle, dir, &newblock[i], &err);
+- if (!bh_new[i])
++ if (!bh_new[i] ||
++ param->dpo_node_init(path, bh_new[i], 0) != 0)
+ goto cleanup;
+- node2 = (struct dx_node *)(bh_new[i]->b_data);
+- entries2 = node2->entries;
+- node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
+- node2->fake.inode = 0;
+ BUFFER_TRACE(frame->bh, "get_write_access");
+ err = ext3_journal_get_write_access(handle, frame->bh);
+ if (err)
+ goto journal_error;
+ }
+ /* Add "safe" node to transaction too */
+- if (safe + 1 != path.dp_frames) {
++ if (safe + 1 != path->dp_frames) {
+ err = ext3_journal_get_write_access(handle, safe->bh);
+ if (err)
+ goto journal_error;
+@@ -1545,13 +1800,12 @@ static int ext3_dx_add_entry(handle_t *h
+
+ entries = frame->entries;
+ count = dx_get_count(entries);
+- idx = frame->at - entries;
++ idx = dx_entry_diff(path, frame->at, entries);
+
+ bh2 = bh_new[i];
+- node2 = (struct dx_node *)(bh2->b_data);
+- entries2 = node2->entries;
++ entries2 = dx_get_entries(path, bh2->b_data, 0);
+
+- if (frame == path.dp_frames) {
++ if (frame == path->dp_frames) {
+ /* splitting root node. Tricky point:
+ *
+ * In the "normal" B-tree we'd split root *and* add
+@@ -1566,27 +1820,29 @@ static int ext3_dx_add_entry(handle_t *h
+ u8 indirects;
+ struct dx_frame *frames;
+
+- frames = path.dp_frames;
++ frames = path->dp_frames;
+ root = (struct dx_root *) frames->bh->b_data;
+ indirects = root->info.indirect_levels;
+ dxtrace(printk("Creating new root %d\n", indirects));
+ memcpy((char *) entries2, (char *) entries,
+- count * sizeof(struct dx_entry));
+- dx_set_limit(entries2, dx_node_limit(dir));
++ count * dx_entry_size(path));
++ dx_set_limit(entries2, dx_node_limit(path));
+
+ /* Set up root */
+ dx_set_count(entries, 1);
+- dx_set_block(entries + 0, newblock[i]);
++ dx_set_block(path, entries, newblock[i]);
+ root->info.indirect_levels = indirects + 1;
+
+ /* Shift frames in the path */
+ memmove(frames + 2, frames + 1,
+- (sizeof path.dp_frames) - 2 * sizeof frames[0]);
++ (sizeof path->dp_frames) - 2 * sizeof frames[0]);
+ /* Add new access path frame */
+- frames[1].at = entries2 + idx;
++ frames[1].at = dx_entry_shift(path, entries2, idx);
+ frames[1].entries = entries = entries2;
+ frames[1].bh = bh2;
++ assert(dx_node_check(path, frame));
+ ++ frame;
++ assert(dx_node_check(path, frame));
+ bh_new[i] = NULL; /* buffer head is "consumed" */
+ err = ext3_journal_get_write_access(handle, bh2);
+ if (err)
+@@ -1594,23 +1850,32 @@ static int ext3_dx_add_entry(handle_t *h
+ } else {
+ /* splitting non-root index node. */
+ unsigned count1 = count/2, count2 = count - count1;
+- unsigned hash2 = dx_get_hash(entries + count1);
++ unsigned hash2;
++
++ dx_get_key(path,
++ dx_entry_shift(path, entries, count1),
++ (struct dx_key *)&hash2);
++
+ dxtrace(printk("Split index %i/%i\n", count1, count2));
+
+- memcpy ((char *) entries2, (char *) (entries + count1),
+- count2 * sizeof(struct dx_entry));
++ memcpy ((char *) entries2,
++ (char *) dx_entry_shift(path, entries, count1),
++ count2 * dx_entry_size(path));
+ dx_set_count (entries, count1);
+ dx_set_count (entries2, count2);
+- dx_set_limit (entries2, dx_node_limit(dir));
++ dx_set_limit (entries2, dx_node_limit(path));
+
+ /* Which index block gets the new entry? */
+ if (idx >= count1) {
+- frame->at = entries2 + idx - count1;
++ frame->at = dx_entry_shift(path, entries2,
++ idx - count1);
+ frame->entries = entries = entries2;
+ swap(frame->bh, bh2);
+ bh_new[i] = bh2;
+ }
+- dx_insert_block (frame - 1, hash2, newblock[i]);
++ dx_insert_block(path, frame - 1, hash2, newblock[i]);
++ assert(dx_node_check(path, frame));
++ assert(dx_node_check(path, frame - 1));
+ dxtrace(dx_show_index ("node", frame->entries));
+ dxtrace(dx_show_index ("node",
+ ((struct dx_node *) bh2->b_data)->entries));
+@@ -1619,9 +1884,10 @@ static int ext3_dx_add_entry(handle_t *h
+ goto journal_error;
+ }
+ }
+- de = do_split(handle, dir, &bh, --frame, &hinfo, &err);
++ de = do_split(handle, path, &bh, --frame, &hinfo, &err);
+ if (!de)
+ goto cleanup;
++ assert(dx_node_check(path, frame));
+ err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ goto cleanup2;
+
+@@ -1637,7 +1903,7 @@ cleanup2:
+ }
+ if (err)
+ inode->i_size = isize;
+- dx_path_fini(&path);
++ dx_path_fini(path);
+ return err;
+ }
+ #endif
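
The dx_lookup() introduced above is the heart of this patch: a single generic
descent loop that works for any node layout, because key size, comparison and
node checking are all routed through the `struct dx_param` operations vector.
A rough user-space analogue of its binary search (hypothetical names, plain
arrays instead of buffer heads, fixed 32-bit keys instead of opaque ones):

#include <stdio.h>

/* Comparator in the style of dpo_keycmp: returns -1, 0 or +1. */
static int u32_keycmp(unsigned k1, unsigned k2)
{
        return k1 > k2 ? +1 : (k1 < k2 ? -1 : 0);
}

/*
 * Mirror of the p/q/m loop in dx_lookup(): keys[0] is the implicit
 * "smallest" entry, keys[1..count-1] are sorted ascending. Returns
 * the index of the last entry whose key is <= target, i.e. the
 * child block to descend into.
 */
static unsigned find_child(const unsigned *keys, unsigned count,
                           unsigned target)
{
        unsigned p = 1, q = count - 1;

        while (p <= q) {
                unsigned m = p + (q - p) / 2;

                if (u32_keycmp(keys[m], target) > 0)
                        q = m - 1;
                else
                        p = m + 1;
        }
        return p - 1;
}

int main(void)
{
        unsigned keys[] = { 0, 10, 20, 30 };

        printf("%u %u %u\n",
               find_child(keys, 4, 5),    /* prints 0 */
               find_child(keys, 4, 20),   /* prints 2 */
               find_child(keys, 4, 99));  /* prints 3 */
        return 0;
}

With 32-bit keys this reduces to exactly the old hash comparison; the patch
obtains the same loop for arbitrary key sizes by replacing `keys[m]` with
dx_entry_shift()/dx_key_at() and `u32_keycmp` with the dpo_keycmp method.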
--- /dev/null
+Index: iam-src/fs/ext3/namei.c
+===================================================================
+--- iam-src.orig/fs/ext3/namei.c 2006-02-09 20:44:02.000000000 +0300
++++ iam-src/fs/ext3/namei.c 2006-02-10 18:23:32.000000000 +0300
+@@ -147,6 +147,15 @@ struct dx_map_entry
+ u32 offs;
+ };
+
++/*
++ * Structure to keep track of a path drilled through htree.
++ */
++struct dx_path {
++ struct inode *dp_object;
++ struct dx_frame dp_frames[DX_MAX_TREE_HEIGHT];
++ struct dx_frame *dp_frame;
++};
++
+ #ifdef CONFIG_EXT3_INDEX
+ static inline unsigned dx_get_block (struct dx_entry *entry);
+ static void dx_set_block (struct dx_entry *entry, unsigned value);
+@@ -161,9 +170,8 @@ static unsigned dx_node_limit (struct in
+ static struct dx_frame *dx_probe(struct dentry *dentry,
+ struct inode *dir,
+ struct dx_hash_info *hinfo,
+- struct dx_frame *frame,
++ struct dx_path *path,
+ int *err);
+-static void dx_release (struct dx_frame *frames);
+ static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
+ struct dx_hash_info *hinfo, struct dx_map_entry map[]);
+ static void dx_sort_map(struct dx_map_entry *map, unsigned count);
+@@ -172,9 +180,7 @@ static struct ext3_dir_entry_2 *dx_move_
+ static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+ static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
+ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+- struct dx_frame *frame,
+- struct dx_frame *frames,
+- __u32 *start_hash);
++ struct dx_path *path, __u32 *start_hash);
+ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
+ struct ext3_dir_entry_2 **res_dir, int *err);
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+@@ -332,13 +338,13 @@ struct stats dx_show_entries(struct dx_h
+ */
+ static struct dx_frame *
+ dx_probe(struct dentry *dentry, struct inode *dir,
+- struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
++ struct dx_hash_info *hinfo, struct dx_path *path, int *err)
+ {
+ unsigned count, indirect;
+ struct dx_entry *at, *entries, *p, *q, *m;
+ struct dx_root *root;
+ struct buffer_head *bh;
+- struct dx_frame *frame = frame_in;
++ struct dx_frame *frame = path->dp_frames;
+ u32 hash;
+
+ frame->bh = NULL;
+@@ -352,8 +358,7 @@ dx_probe(struct dentry *dentry, struct i
+ root->info.hash_version != DX_HASH_R5 &&
+ root->info.hash_version != DX_HASH_LEGACY) {
+ ext3_warning(dir->i_sb, __FUNCTION__,
+- "Unrecognised inode hash code %d",
+- root->info.hash_version);
++ "Unrecognised inode hash code %d", root->info.hash_version);
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
+@@ -424,7 +429,8 @@ dx_probe(struct dentry *dentry, struct i
+ frame->bh = bh;
+ frame->entries = entries;
+ frame->at = at;
+- if (!indirect--) return frame;
++ if (!indirect--)
++ return path->dp_frame = frame;
+ if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
+ goto fail2;
+ at = entries = ((struct dx_node *) bh->b_data)->entries;
+@@ -432,7 +438,7 @@ dx_probe(struct dentry *dentry, struct i
+ frame++;
+ }
+ fail2:
+- while (frame >= frame_in) {
++ while (frame >= path->dp_frames) {
+ brelse(frame->bh);
+ frame--;
+ }
+@@ -440,16 +446,20 @@ fail:
+ return NULL;
+ }
+
+-static void dx_release (struct dx_frame *frames)
++static inline void dx_path_init(struct dx_path *path, struct inode *inode)
+ {
+- int height;
++ memset(path, 0, sizeof *path);
++ path->dp_object = inode;
++ path->dp_frame = path->dp_frames;
++}
+
+- if (frames[0].bh == NULL)
+- return;
+- height = ((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels;
+- for (; height >= 0; height--) {
+- assert(frames[height].bh != NULL);
+- brelse(frames[height].bh);
++static inline void dx_path_fini(struct dx_path *path)
++{
++ int i;
++
++ for (i = 0; i < ARRAY_SIZE(path->dp_frames); i++) {
++ if (path->dp_frames[i].bh != NULL)
++ brelse(path->dp_frames[i].bh);
+ }
+ }
+
+@@ -471,16 +481,14 @@ static void dx_release (struct dx_frame
+ * hash of the next page.
+ */
+ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+- struct dx_frame *frame,
+- struct dx_frame *frames,
+- __u32 *start_hash)
++ struct dx_path *path, __u32 *start_hash)
+ {
+ struct dx_frame *p;
+ struct buffer_head *bh;
+ int err, num_frames = 0;
+ __u32 bhash;
+
+- p = frame;
++ p = path->dp_frame;
+ /*
+ * Find the next leaf page by incrementing the frame pointer.
+ * If we run out of entries in the interior node, loop around and
+@@ -491,10 +499,10 @@ static int ext3_htree_next_block(struct
+ while (1) {
+ if (++(p->at) < p->entries + dx_get_count(p->entries))
+ break;
+- if (p == frames)
++ if (p == path->dp_frames)
+ return 0;
+ num_frames++;
+- p--;
++ --p;
+ }
+
+ /*
+@@ -516,10 +524,9 @@ static int ext3_htree_next_block(struct
+ * block so no check is necessary
+ */
+ while (num_frames--) {
+- if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
+- 0, &err)))
++ if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), 0, &err)))
+ return err; /* Failure */
+- p++;
++ ++p;
+ brelse (p->bh);
+ p->bh = bh;
+ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
+@@ -591,7 +598,7 @@ int ext3_htree_fill_tree(struct file *di
+ {
+ struct dx_hash_info hinfo;
+ struct ext3_dir_entry_2 *de;
+- struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame;
++ struct dx_path path;
+ struct inode *dir;
+ int block, err;
+ int count = 0;
+@@ -601,6 +608,7 @@ int ext3_htree_fill_tree(struct file *di
+ dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
+ start_minor_hash));
+ dir = dir_file->f_dentry->d_inode;
++ dx_path_init(&path, dir);
+ if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
+ hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+ hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+@@ -611,13 +619,12 @@ int ext3_htree_fill_tree(struct file *di
+ }
+ hinfo.hash = start_hash;
+ hinfo.minor_hash = 0;
+- frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
+- if (!frame)
++ if (!dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, &path, &err))
+ return err;
+
+ /* Add '.' and '..' from the htree header */
+ if (!start_hash && !start_minor_hash) {
+- de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
++ de = (struct ext3_dir_entry_2 *) path.dp_frames[0].bh->b_data;
+ if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
+ goto errout;
+ de = ext3_next_entry(de);
+@@ -627,7 +634,7 @@ int ext3_htree_fill_tree(struct file *di
+ }
+
+ while (1) {
+- block = dx_get_block(frame->at);
++ block = dx_get_block(path.dp_frame->at);
+ ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
+ start_hash, start_minor_hash);
+ if (ret < 0) {
+@@ -636,8 +643,7 @@ int ext3_htree_fill_tree(struct file *di
+ }
+ count += ret;
+ hashval = ~0;
+- ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
+- frame, frames, &hashval);
++ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, &path, &hashval);
+ *next_hash = hashval;
+ if (ret < 0) {
+ err = ret;
+@@ -652,12 +658,12 @@ int ext3_htree_fill_tree(struct file *di
+ (count && ((hashval & 1) == 0)))
+ break;
+ }
+- dx_release(frames);
++ dx_path_fini(&path);
+ dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
+ count, *next_hash));
+ return count;
+ errout:
+- dx_release(frames);
++ dx_path_fini(&path);
+ return (err);
+ }
+
+@@ -927,7 +933,8 @@ static struct buffer_head * ext3_dx_find
+ struct super_block * sb;
+ struct dx_hash_info hinfo;
+ u32 hash;
+- struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame;
++ struct dx_path path;
++ struct dx_entry dummy_dot;
+ struct ext3_dir_entry_2 *de, *top;
+ struct buffer_head *bh;
+ unsigned long block;
+@@ -936,20 +943,20 @@ static struct buffer_head * ext3_dx_find
+ const u8 *name = dentry->d_name.name;
+ struct inode *dir = dentry->d_parent->d_inode;
+
++ dx_path_init(&path, dir);
+ sb = dir->i_sb;
+ /* NFS may look up ".." - look at dx_root directory block */
+ if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
+- if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err)))
++ if (!(dx_probe(dentry, NULL, &hinfo, &path, err)))
+ return NULL;
+ } else {
+- frame = frames;
+- frame->bh = NULL; /* for dx_release() */
+- frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
+- dx_set_block(frame->at, 0); /* dx_root block is 0 */
++ path.dp_frame->bh = NULL; /* for dx_path_fini() */
++ path.dp_frame->at = &dummy_dot; /* hack for zero entry*/
++ dx_set_block(path.dp_frame->at, 0); /* dx_root block is 0 */
+ }
+ hash = hinfo.hash;
+ do {
+- block = dx_get_block(frame->at);
++ block = dx_get_block(path.dp_frame->at);
+ if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
+ goto errout;
+ de = (struct ext3_dir_entry_2 *) bh->b_data;
+@@ -965,13 +972,12 @@ static struct buffer_head * ext3_dx_find
+ goto errout;
+ }
+ *res_dir = de;
+- dx_release (frames);
++ dx_path_fini(&path);
+ return bh;
+ }
+ brelse (bh);
+ /* Check to see if we should continue to search */
+- retval = ext3_htree_next_block(dir, hash, frame,
+- frames, NULL);
++ retval = ext3_htree_next_block(dir, hash, &path, NULL);
+ if (retval < 0) {
+ ext3_warning(sb, __FUNCTION__,
+ "error reading index page in directory #%lu",
+@@ -984,7 +990,7 @@ static struct buffer_head * ext3_dx_find
+ *err = -ENOENT;
+ errout:
+ dxtrace(printk("%s not found\n", name));
+- dx_release (frames);
++ dx_path_fini(&path);
+ return NULL;
+ }
+ #endif
+@@ -1297,7 +1303,7 @@ static int make_indexed_dir(handle_t *ha
+ int namelen = dentry->d_name.len;
+ struct buffer_head *bh2;
+ struct dx_root *root;
+- struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame;
++ struct dx_path path;
+ struct dx_entry *entries;
+ struct ext3_dir_entry_2 *de, *de2;
+ char *data1, *top;
+@@ -1308,6 +1314,7 @@ static int make_indexed_dir(handle_t *ha
+ u32 block;
+ struct fake_dirent *fde;
+
++ dx_path_init(&path, dir);
+ blocksize = dir->i_sb->s_blocksize;
+ dxtrace(printk("Creating index\n"));
+ retval = ext3_journal_get_write_access(handle, bh);
+@@ -1352,14 +1359,13 @@ static int make_indexed_dir(handle_t *ha
+ hinfo.hash_version = root->info.hash_version;
+ hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+ ext3fs_dirhash(name, namelen, &hinfo);
+- frame = frames;
+- frame->entries = entries;
+- frame->at = entries;
+- frame->bh = bh;
++ path.dp_frame->entries = entries;
++ path.dp_frame->at = entries;
++ path.dp_frame->bh = bh;
+ bh = bh2;
+- de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
+- dx_release (frames);
+- if (!(de))
++ de = do_split(handle,dir, &bh, path.dp_frame, &hinfo, &retval);
++ dx_path_fini(&path);
++ if (!de)
+ return retval;
+
+ return add_dirent_to_buf(handle, dentry, inode, de, bh);
+@@ -1439,7 +1445,8 @@ static int ext3_add_entry (handle_t *han
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+ {
+- struct dx_frame frames[DX_MAX_TREE_HEIGHT] = {{0,},}, *frame, *safe;
++ struct dx_path path;
++ struct dx_frame *frame, *safe;
+ struct dx_node *node2;
+ struct dx_entry *entries; /* old block contents */
+ struct dx_entry *entries2; /* new block contents */
+@@ -1455,9 +1462,10 @@ static int ext3_dx_add_entry(handle_t *h
+ int i;
+ size_t isize;
+
+- frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
+- if (!frame)
++ dx_path_init(&path, dir);
++ if (!dx_probe(dentry, NULL, &hinfo, &path, &err))
+ return err;
++ frame = path.dp_frame;
+ entries = frame->entries;
+
+ /* XXX nikita: global serialization! */
+@@ -1495,7 +1503,7 @@ static int ext3_dx_add_entry(handle_t *h
+ dx_get_count(entries), dx_get_limit(entries)));
+
+ /* What levels need split? */
+- for (nr_splet = 0; frame >= frames &&
++ for (nr_splet = 0; frame >= path.dp_frames &&
+ dx_get_count(frame->entries) == dx_get_limit(frame->entries);
+ --frame, ++nr_splet) {
+ if (nr_splet == DX_MAX_TREE_HEIGHT) {
+@@ -1523,7 +1531,7 @@ static int ext3_dx_add_entry(handle_t *h
+ goto journal_error;
+ }
+ /* Add "safe" node to transaction too */
+- if (safe + 1 != frames) {
++ if (safe + 1 != path.dp_frames) {
+ err = ext3_journal_get_write_access(handle, safe->bh);
+ if (err)
+ goto journal_error;
+@@ -1543,7 +1551,7 @@ static int ext3_dx_add_entry(handle_t *h
+ node2 = (struct dx_node *)(bh2->b_data);
+ entries2 = node2->entries;
+
+- if (frame == frames) {
++ if (frame == path.dp_frames) {
+ /* splitting root node. Tricky point:
+ *
+ * In the "normal" B-tree we'd split root *and* add
+@@ -1556,7 +1564,9 @@ static int ext3_dx_add_entry(handle_t *h
+ */
+ struct dx_root *root;
+ u8 indirects;
++ struct dx_frame *frames;
+
++ frames = path.dp_frames;
+ root = (struct dx_root *) frames->bh->b_data;
+ indirects = root->info.indirect_levels;
+ dxtrace(printk("Creating new root %d\n", indirects));
+@@ -1571,7 +1581,7 @@ static int ext3_dx_add_entry(handle_t *h
+
+ /* Shift frames in the path */
+ memmove(frames + 2, frames + 1,
+- (sizeof frames) - 2 * sizeof frames[0]);
++ (sizeof path.dp_frames) - 2 * sizeof frames[0]);
+ /* Add new access path frame */
+ frames[1].at = entries2 + idx;
+ frames[1].entries = entries = entries2;
+@@ -1627,7 +1637,7 @@ cleanup2:
+ }
+ if (err)
+ inode->i_size = isize;
+- dx_release(frames);
++ dx_path_fini(&path);
+ return err;
+ }
+ #endif
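
This earlier preparation patch replaces the on-stack
`dx_frame frames[DX_MAX_TREE_HEIGHT]` arrays with `struct dx_path`, and
dx_release() with dx_path_fini(). One practical difference is cleanup:
dx_release() trusted `indirect_levels` from the root block to know how many
buffers to drop, while dx_path_fini() simply walks every (zero-initialized)
frame. A small user-space mock of that pattern (invented names and a fake
refcounted buffer, purely illustrative):

#include <string.h>

enum { MAX_HEIGHT = 5 };

struct buf { int refcount; };              /* stand-in for buffer_head */

struct frame { struct buf *bh; };

struct path {
        struct frame frames[MAX_HEIGHT];
        struct frame *frame;               /* deepest frame reached */
};

static void path_init(struct path *p)
{
        memset(p, 0, sizeof *p);           /* every ->bh starts NULL */
        p->frame = p->frames;
}

static void path_fini(struct path *p)
{
        int i;

        /* Safe even after a lookup that failed halfway down the
         * tree: frames that were never reached still hold NULL.   */
        for (i = 0; i < MAX_HEIGHT; i++) {
                if (p->frames[i].bh != NULL) {
                        p->frames[i].bh->refcount--;   /* brelse() */
                        p->frames[i].bh = NULL;
                }
        }
}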
--- /dev/null
+Index: iam-src/fs/ext3/hash.c
+===================================================================
+--- iam-src.orig/fs/ext3/hash.c 2006-02-11 01:08:59.000000000 +0300
++++ iam-src/fs/ext3/hash.c 2006-02-11 20:46:22.000000000 +0300
+@@ -4,7 +4,7 @@
+ * Copyright (C) 2002 by Theodore Ts'o
+ *
+ * This file is released under the GPL v2.
+- *
++ *
+ * This file may be redistributed under the terms of the GNU Public
+ * License.
+ */
+@@ -115,6 +115,18 @@ static __u32 dx_hack_hash (const char *n
+ return (hash0 << 1);
+ }
+
++static __u32 dx_r5_hash(const signed char *msg, int len)
++{
++ __u32 a = 0;
++ while (len--) {
++ a += *msg << 4;
++ a += *msg >> 4;
++ a *= 11;
++ msg++;
++ }
++ return a;
++}
++
+ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+ {
+ __u32 pad, val;
+@@ -146,11 +158,11 @@ static void str2hashbuf(const char *msg,
+ * Returns the hash of a filename. If len is 0 and name is NULL, then
+ * this function can be used to test whether or not a hash version is
+ * supported.
+- *
++ *
+ * The seed is an 4 longword (32 bits) "secret" which can be used to
+ * uniquify a hash. If the seed is all zero's, then some default seed
+ * may be used.
+- *
++ *
+ * A particular hash version specifies whether or not the seed is
+ * represented, and whether or not the returned hash is 32 bits or 64
+ * bits. 32 bit hashes will return 0 for the minor hash.
+@@ -205,6 +217,9 @@ int ext3fs_dirhash(const char *name, int
+ hash = buf[0];
+ minor_hash = buf[1];
+ break;
++ case DX_HASH_R5:
++ hash = dx_r5_hash(name, len);
++ break;
+ default:
+ hinfo->hash = 0;
+ return -1;
+Index: iam-src/fs/ext3/namei.c
+===================================================================
+--- iam-src.orig/fs/ext3/namei.c 2006-02-11 01:09:12.000000000 +0300
++++ iam-src/fs/ext3/namei.c 2006-02-11 20:45:58.000000000 +0300
+@@ -370,6 +370,7 @@ dx_probe(struct dentry *dentry, struct i
+ root = (struct dx_root *) bh->b_data;
+ if (root->info.hash_version != DX_HASH_TEA &&
+ root->info.hash_version != DX_HASH_HALF_MD4 &&
++ root->info.hash_version != DX_HASH_R5 &&
+ root->info.hash_version != DX_HASH_LEGACY) {
+ ext3_warning(dir->i_sb, __FUNCTION__,
+ "Unrecognised inode hash code %d", root->info.hash_version);
+@@ -1363,6 +1364,7 @@ static int make_indexed_dir(handle_t *ha
+ memset (&root->info, 0, sizeof(root->info));
+ root->info.info_length = sizeof(root->info);
+ root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
++ root->info.hash_version = DX_HASH_R5;
+ entries = root->entries;
+ dx_set_block (entries, 1);
+ dx_set_count (entries, 1);
+Index: iam-src/include/linux/ext3_fs.h
+===================================================================
+--- iam-src.orig/include/linux/ext3_fs.h 2006-02-11 01:08:59.000000000 +0300
++++ iam-src/include/linux/ext3_fs.h 2006-02-11 20:45:58.000000000 +0300
+@@ -665,6 +665,7 @@ struct ext3_dir_entry_2 {
+ #define DX_HASH_LEGACY 0
+ #define DX_HASH_HALF_MD4 1
+ #define DX_HASH_TEA 2
++#define DX_HASH_R5 3
+
+ /* hash info structure used by the directory hash */
+ struct dx_hash_info
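
The function wired in above is the well-known reiserfs r5 name hash. A
user-space copy (same signed-char arithmetic as the patch) makes it easy to
check which hash value, and hence which htree bucket, a given name maps to:

#include <stdio.h>

/* User-space copy of the patch's dx_r5_hash(). */
static unsigned int r5_hash(const signed char *msg, int len)
{
        unsigned int a = 0;

        while (len--) {
                a += *msg << 4;
                a += *msg >> 4;
                a *= 11;
                msg++;
        }
        return a;
}

int main(void)
{
        const char name[] = "lost+found";

        printf("r5(\"%s\") = %#x\n", name,
               r5_hash((const signed char *)name,
                       (int)(sizeof name - 1)));
        return 0;
}

Unlike the TEA and half-MD4 cases in ext3fs_dirhash(), r5 never reads
hinfo->seed, so a given name hashes identically on every filesystem.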
--- /dev/null
+Index: iam/fs/ext3/namei.c
+===================================================================
+--- iam.orig/fs/ext3/namei.c
++++ iam/fs/ext3/namei.c
+@@ -82,13 +82,16 @@
+ *
+ * Entries in index node are sorted by their key value.
+ *
++ * Format of leaf node:
+ *
+- *
+- *
+- *
+- *
+- *
+- *
++ * +-----+-------+-------+-------+------+-------+------------+
++ * | | count | | | | | |
++ * | gap | / | leaf | leaf | .... | leaf | free space |
++ * | | limit | | | | | |
++ * +-----+-------+-------+-------+------+-------+------------+
++ *
++ * leaf For a leaf entry: a rec immediately followed by a key.
++ * The sizes of the key and the rec depend on the container.
+ *
+ *
+ *
+@@ -96,6 +99,7 @@
+ *
+ */
+
++#include <linux/module.h>
+ #include <linux/fs.h>
+ #include <linux/pagemap.h>
+ #include <linux/jbd.h>
+@@ -111,7 +115,7 @@
+ #include "xattr.h"
+ #include "iopen.h"
+ #include "acl.h"
+-
++#include <linux/lustre_iam.h>
+ /*
+ * define how far ahead to read directories while searching them.
+ */
+@@ -120,13 +124,6 @@
+ #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+ #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
+
+-/*
+- * Maximal number of non-leaf levels in htree. In the stock ext3 this is 2.
+- */
+-enum {
+- DX_MAX_TREE_HEIGHT = 5,
+- DX_SCRATCH_KEYS = 2
+-};
+
+ static struct buffer_head *ext3_append(handle_t *handle,
+ struct inode *inode,
+@@ -205,194 +202,6 @@ struct dx_map_entry
+ u32 offs;
+ };
+
+-/*
+- * Entry within index tree node. Consists of a key immediately followed
+- * (without padding) by a pointer to the child node.
+- *
+- * Both key and pointer are of variable size, hence incomplete type.
+- */
+-struct iam_entry;
+-
+-struct iam_entry_compat {
+- __le32 hash;
+- __le32 block;
+-};
+-
+-/*
+- * Incomplete type used to refer to keys in iam container.
+- *
+- * As key size can be different from container to container, iam has to use
+- * incomplete type. Clients cast pointer to iam_key to real key type and back.
+- */
+-struct iam_key;
+-
+-/* Incomplete type use to refer to the records stored in iam containers. */
+-struct iam_rec;
+-
+-typedef __u64 iam_ptr_t;
+-
+-/*
+- * Index node traversed during tree lookup.
+- */
+-struct iam_frame {
+- struct buffer_head *bh; /* buffer holding node data */
+- struct iam_entry *entries; /* array of entries */
+- struct iam_entry *at; /* target entry, found by binary search */
+-};
+-
+-/* leaf node reached by tree lookup */
+-struct iam_leaf {
+- struct buffer_head *bh;
+- struct iam_leaf_entry *entries;
+- struct iam_leaf_entry *at;
+-};
+-
+-struct iam_path;
+-struct iam_container;
+-
+-/*
+- * Parameters, describing a flavor of iam container.
+- */
+-struct iam_descr {
+- /*
+- * Size of a key in this container, in bytes.
+- */
+- size_t id_key_size;
+- /*
+- * Size of a pointer to the next level (stored in index nodes), in
+- * bytes.
+- */
+- size_t id_ptr_size;
+- /*
+- * Size of a record (stored in leaf nodes), in bytes.
+- */
+- size_t id_rec_size;
+- /*
+- * Size of unused (by iam) space at the beginning of every non-root
+- * node, in bytes. Used for compatibility with ext3.
+- */
+- size_t id_node_gap;
+- /*
+- * Size of unused (by iam) space at the beginning of root node, in
+- * bytes. Used for compatibility with ext3.
+- */
+- size_t id_root_gap;
+-
+- /*
+- * Returns pointer (in the same sense as pointer in index entry) to
+- * the root node.
+- */
+- __u32 (*id_root_ptr)(struct iam_container *c);
+-
+- /*
+- * Check validity and consistency of index node. This is called when
+- * iam just loaded new node into frame.
+- */
+- int (*id_node_check)(struct iam_path *path, struct iam_frame *frame);
+- /*
+- * Initialize new node (stored in @bh) that is going to be added into
+- * tree.
+- */
+- int (*id_node_init)(struct iam_container *c,
+- struct buffer_head *bh, int root);
+- int (*id_node_read)(struct iam_container *c, iam_ptr_t ptr,
+- handle_t *h, struct buffer_head **bh);
+- /*
+- * Key comparison function. Returns -1, 0, +1.
+- */
+- int (*id_keycmp)(struct iam_container *c,
+- struct iam_key *k1, struct iam_key *k2);
+- /*
+- * Create new container.
+- *
+- * Newly created container has a root node and a single leaf. Leaf
+- * contains single record with the smallest possible key.
+- */
+- int (*id_create)(struct iam_container *c);
+- struct {
+- /*
+- * leaf operations.
+- */
+- /*
+- * returns true iff leaf is positioned at the last entry.
+- */
+- int (*at_end)(struct iam_container *c, struct iam_leaf *l);
+- /* position leaf at the first entry */
+- void (*start)(struct iam_container *c, struct iam_leaf *l);
+- /* more leaf to the next entry. */
+- void (*next)(struct iam_container *c, struct iam_leaf *l);
+- /* return key of current leaf record in @k */
+- void (*key)(struct iam_container *c, struct iam_leaf *l,
+- struct iam_key *k);
+- /* return pointer to entry body */
+- struct iam_rec *(*rec)(struct iam_container *c,
+- struct iam_leaf *l);
+- } id_leaf;
+-};
+-
+-struct iam_container {
+- /*
+- * Underlying flat file. IO against this object is issued to
+- * read/write nodes.
+- */
+- struct inode *ic_object;
+- /*
+- * container flavor.
+- */
+- struct iam_descr *ic_descr;
+- /*
+- * pointer to flavor-specific per-container data.
+- */
+- void *ic_descr_data;
+-};
+-
+-/*
+- * Structure to keep track of a path drilled through htree.
+- */
+-struct iam_path {
+- /*
+- * Parent container.
+- */
+- struct iam_container *ip_container;
+- /*
+- * Number of index levels minus one.
+- */
+- int ip_indirect;
+- /*
+- * Nodes that top-to-bottom traversal passed through.
+- */
+- struct iam_frame ip_frames[DX_MAX_TREE_HEIGHT];
+- /*
+- * Last filled frame in ->ip_frames. Refers to the 'twig' node (one
+- * immediately above leaf).
+- */
+- struct iam_frame *ip_frame;
+- /*
+- * Leaf node: a child of ->ip_frame.
+- */
+- struct iam_leaf *ip_leaf;
+- /*
+- * Key searched for.
+- */
+- struct iam_key *ip_key_target;
+- /*
+- * Scratch-pad area for temporary keys.
+- */
+- struct iam_key *ip_key_scratch[DX_SCRATCH_KEYS];
+- /*
+- * pointer to flavor-specific per-container data.
+- */
+- void *ip_descr_data;
+-};
+-
+-/*
+- * Helper structure for legacy htrees.
+- */
+-struct iam_path_compat {
+- struct iam_path ipc_path;
+- struct iam_container ipc_container;
+- __u32 ipc_scrach[DX_SCRATCH_KEYS];
+-};
+
+ static u32 htree_root_ptr(struct iam_container *c);
+ static int htree_node_check(struct iam_path *path, struct iam_frame *frame);
+@@ -427,58 +236,7 @@ struct iam_descr;
+ struct iam_container;
+ struct iam_path;
+
+-/*
+- * Initialize container @c, acquires additional reference on @inode.
+- */
+-int iam_container_init(struct iam_container *c,
+- struct iam_descr *descr, struct inode *inode);
+-/*
+- * Finalize container @c, release all resources.
+- */
+-void iam_container_fini(struct iam_container *c);
+
+-/*
+- * Search container @c for record with key @k. If record is found, its data
+- * are moved into @r.
+- *
+- *
+- *
+- * Return values: +ve: found, 0: not-found, -ve: error
+- */
+-int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r);
+-/*
+- * Insert new record @r with key @k into container @c (within context of
+- * transaction @h.
+- *
+- * Return values: 0: success, -ve: error, including -EEXIST when record with
+- * given key is already present.
+- *
+- * postcondition: ergo(result == 0 || result == -EEXIST,
+- * iam_lookup(c, k, r2) > 0 &&
+- * !memcmp(r, r2, c->ic_descr->id_rec_size));
+- */
+-int iam_insert(handle_t *h, struct iam_container *c,
+- struct iam_key *k, struct iam_rec *r);
+-/*
+- * Replace existing record with key @k, or insert new one. New record data are
+- * in @r.
+- *
+- * Return values: 0: success, -ve: error.
+- *
+- * postcondition: ergo(result == 0, iam_lookup(c, k, r2) > 0 &&
+- * !memcmp(r, r2, c->ic_descr->id_rec_size));
+- */
+-int iam_update(handle_t *h, struct iam_container *c,
+- struct iam_key *k, struct iam_rec *r);
+-/*
+- * Delete existing record with key @k.
+- *
+- * Return values: 0: success, -ENOENT: not-found, -ve: other error.
+- *
+- * postcondition: ergo(result == 0 || result == -ENOENT,
+- * !iam_lookup(c, k, *));
+- */
+-int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k);
+
+ /*
+ * iam cursor (iterator) api.
+@@ -508,6 +266,11 @@ enum iam_it_state {
+ IAM_IT_ATTACHED
+ };
+
++struct htree_cookie {
++ struct dx_hash_info *hinfo;
++ struct dentry *dentry;
++};
++
+ /*
+ * Iterator.
+ *
+@@ -704,7 +467,7 @@ static int ext3_dx_add_entry(handle_t *h
+ struct inode *inode);
+
+ static inline void iam_path_init(struct iam_path *path,
+- struct iam_container *c);
++ struct iam_container *c, struct htree_cookie *hc);
+ static inline void iam_path_fini(struct iam_path *path);
+
+
+@@ -865,11 +628,6 @@ static u32 htree_root_ptr(struct iam_con
+ return 0;
+ }
+
+-struct htree_cookie {
+- struct dx_hash_info *hinfo;
+- struct dentry *dentry;
+-};
+-
+ static int htree_node_check(struct iam_path *path, struct iam_frame *frame)
+ {
+ void *data;
+@@ -1171,11 +929,13 @@ void iam_container_fini(struct iam_conta
+ }
+ }
+
+-static inline void iam_path_init(struct iam_path *path, struct iam_container *c)
++static inline void iam_path_init(struct iam_path *path, struct iam_container *c,
++ struct htree_cookie *hc)
+ {
+ memset(path, 0, sizeof *path);
+ path->ip_container = c;
+ path->ip_frame = path->ip_frames;
++ path->ip_descr_data = hc;
+ }
+
+ static inline void iam_path_fini(struct iam_path *path)
+@@ -1201,7 +961,7 @@ static void iam_path_compat_init(struct
+ * iam_path_fini().
+ */
+ iput(inode);
+- iam_path_init(&path->ipc_path, &path->ipc_container);
++ iam_path_init(&path->ipc_path, &path->ipc_container, NULL);
+ for (i = 0; i < ARRAY_SIZE(path->ipc_path.ip_key_scratch); ++i)
+ path->ipc_path.ip_key_scratch[i] =
+ (struct iam_key *)&path->ipc_scrach[i];
+@@ -1213,6 +973,425 @@ static void iam_path_compat_fini(struct
+ iam_container_fini(&path->ipc_container);
+ }
+
++static int iam_leaf_init(struct iam_path *path, struct iam_leaf *leaf)
++{
++ int block, err;
++ struct buffer_head *bh;
++
++ block = dx_get_block(path, path->ip_frame->at);
++ err = path_descr(path)->id_node_read(path->ip_container, block,
++ NULL, &bh);
++ if (err)
++ return err;
++
++ leaf->bh = bh;
++ leaf->entries = (struct iam_leaf_entry *)bh->b_data;
++ return 0;
++}
++
++static void iam_leaf_fini(struct iam_leaf *leaf)
++{
++ if (leaf->bh)
++ brelse(leaf->bh);
++}
++
++/*
++ * Search container @c for record with key @k. If record is found, its data
++ * are moved into @r.
++ *
++ * Return values: +ve: found, 0: not-found, -ve: error
++ */
++
++int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r)
++{
++ struct dx_hash_info hinfo;
++ struct iam_path_compat cpath;
++ struct iam_path *path = &cpath.ipc_path;
++ struct htree_cookie hc = {
++ .hinfo = &hinfo
++ };
++ int err, i;
++
++ iam_path_init(path, c, &hc);
++ for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
++ path->ip_key_scratch[i] =
++ (struct iam_key *)&cpath.ipc_scrach[i];
++ err = dx_lookup(path);
++ if (err)
++ goto errout;
++ do {
++ struct iam_leaf leaf;
++ err = iam_leaf_init(path, &leaf);
++ if (err)
++ goto errout;
++
++ for (path_descr(path)->id_leaf.start(c, &leaf);
++ !path_descr(path)->id_leaf.at_end(c, &leaf);
++ path_descr(path)->id_leaf.next(c, &leaf)) {
++ struct iam_key *key;
++
++ key = kmalloc(path_descr(path)->id_key_size, GFP_KERNEL);
++ if (key == NULL) {
++ err = -ENOMEM;
++ iam_leaf_fini(&leaf);
++ goto errout;
++ }
++ path_descr(path)->id_leaf.key(c, &leaf, key);
++ if (keycmp(c, k, key) == 0) {
++ /* found: release the scratch key and return +ve,
++ * per the contract documented above */
++ memcpy(r, path_descr(path)->id_leaf.rec(c, &leaf),
++ path_descr(path)->id_rec_size);
++ kfree(key);
++ iam_path_fini(path);
++ iam_leaf_fini(&leaf);
++ return 1;
++ }
++ kfree(key);
++ }
++
++ iam_leaf_fini(&leaf);
++ /* Check to see if we should continue to search */
++ err = ext3_htree_next_block(c->ic_object, hinfo.hash, path, NULL);
++ if (err < 0)
++ goto errout;
++ } while (err == 1);
++errout:
++ iam_path_fini(path);
++ return err;
++}
++EXPORT_SYMBOL(iam_lookup);
++
++static inline size_t iam_leaf_entry_size(struct iam_path *p)
++{
++ return path_descr(p)->id_rec_size + path_descr(p)->id_key_size;
++}
++
++static inline ptrdiff_t iam_leaf_entry_diff(struct iam_path *p,
++ struct iam_leaf_entry *e1, struct iam_leaf_entry *e2)
++{
++ ptrdiff_t diff;
++
++ diff = (void *)e1 - (void *)e2;
++ assert(diff / iam_leaf_entry_size(p) * iam_leaf_entry_size(p) == diff);
++ return diff / iam_leaf_entry_size(p);
++}
++
++static inline struct iam_leaf_entry*
++iam_leaf_entry_shift(struct iam_path *p, struct iam_leaf_entry *entry, int shift)
++{
++ void *e = entry;
++ return e + shift * iam_leaf_entry_size(p);
++}
++
++static inline struct iam_key *
++dx_leaf_get_key(struct iam_path *p, struct iam_leaf_entry *e, struct iam_key *key)
++{
++ memcpy(key, e, path_descr(p)->id_key_size);
++ return key;
++}
++
++static inline struct iam_key *
++iam_leaf_key_at(struct iam_path *p, struct iam_leaf_entry *entry)
++{
++ void *e = entry;
++ return e + path_descr(p)->id_rec_size;
++}
++
++static inline struct iam_leaf_entry *
++iam_leaf_entry_at(struct iam_path *p, struct iam_leaf_entry *entry)
++{
++ return entry;
++}
++
++static int iam_leaf_lookup(struct iam_path *path, struct iam_leaf *leaf,
++ struct iam_key *k)
++{
++ struct iam_leaf_entry *p, *q, *m;
++ struct iam_leaf_entry *entries = leaf->entries;
++ int count = dx_get_count((struct iam_entry *)entries);
++
++ p = iam_leaf_entry_shift(path, entries, 1);
++ q = iam_leaf_entry_shift(path, entries, count - 1);
++ while (p <= q) {
++ m = iam_leaf_entry_shift(path,
++ p, iam_leaf_entry_diff(path, q, p) / 2);
++ dxtrace(printk("."));
++ if (keycmp(path->ip_container, iam_leaf_key_at(path, m),
++ path->ip_key_target) > 0)
++ q = iam_leaf_entry_shift(path, m, -1);
++ else
++ p = iam_leaf_entry_shift(path, m, +1);
++ }
++ leaf->at = q;
++ return 0;
++}
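++
++/*
++ * The loop above is the usual "rightmost entry with key <= target" binary
++ * search; a self-contained sketch over a plain array (illustrative only;
++ * entry 0 holds the count/limit header, so the search starts at 1):
++ *
++ *     lo = 1; hi = count - 1; pos = 0;
++ *     while (lo <= hi) {
++ *             mid = lo + (hi - lo) / 2;
++ *             if (key(mid) <= target) {
++ *                     pos = mid;
++ *                     lo = mid + 1;
++ *             } else
++ *                     hi = mid - 1;
++ *     }
++ *
++ * On exit pos indexes the rightmost entry not greater than the target.
++ */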
++
++/* XXX What kind of lock should protect this entry? -- WangDi */
++static int iam_leaf_insert(handle_t *handle, struct iam_path *path,
++ struct iam_key *k, struct iam_rec *r)
++{
++ struct iam_leaf leaf;
++ struct iam_leaf_entry *p, *q;
++ int err, count;
++
++ err = iam_leaf_init(path, &leaf);
++ if (err)
++ return err;
++ path_descr(path)->id_leaf.start(path->ip_container, &leaf);
++ count = dx_get_count((struct iam_entry *)leaf.entries);
++ if (count >= dx_get_limit((struct iam_entry *)leaf.entries)) {
++ err = -ENOSPC;
++ goto errout;
++ }
++
++ err = iam_leaf_lookup(path, &leaf, k);
++ if (err)
++ goto errout;
++
++ /* insert the key/record pair into the leaf entries */
++ p = iam_leaf_entry_shift(path, leaf.at, 1);
++ q = iam_leaf_entry_shift(path, leaf.entries, count - 1);
++ while (q < p) {
++ memcpy(iam_leaf_entry_shift(path, q, 1), q, iam_leaf_entry_size(path));
++ q = iam_leaf_entry_shift(path, q, -1);
++ }
++ memcpy(iam_leaf_entry_at(path, p), r, path_descr(path)->id_rec_size);
++ memcpy(iam_leaf_key_at(path, p), k, path_descr(path)->id_key_size);
++
++ dx_set_count((struct iam_entry*)leaf.entries, count + 1);
++ err = ext3_journal_dirty_metadata(handle, leaf.bh);
++ if (err)
++ ext3_std_error(path->ip_container->ic_object->i_sb, err);
++errout:
++ iam_leaf_fini(&leaf);
++ return err;
++}
++
++static int split_leaf_node(handle_t *handle, struct iam_path *path)
++{
++ struct inode *dir = path_obj(path);
++ unsigned continued = 0;
++ struct buffer_head *bh2;
++ u32 newblock, hash_split;
++ char *data2;
++ struct iam_leaf leaf;
++ unsigned split;
++ int err;
++
++ bh2 = ext3_append (handle, dir, &newblock, &err);
++ if (!(bh2)) {
++ err = -ENOSPC;
++ goto errout;
++ }
++ err = iam_leaf_init(path, &leaf);
++ if (err) {
++ brelse(bh2);
++ goto errout;
++ }
++
++ BUFFER_TRACE(leaf.bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, leaf.bh);
++ if (err) {
++ journal_error:
++ iam_leaf_fini(&leaf);
++ brelse(bh2);
++ ext3_std_error(dir->i_sb, err);
++ err = -EIO;
++ goto errout;
++ }
++ data2 = bh2->b_data;
++ split = dx_get_count((struct iam_entry*)leaf.entries)/2;
++ hash_split = *(__u32*)iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split));
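++ /*
++ * If the key at the split point equals its left neighbour, a single
++ * hash value spans both halves; note it so the new block is indexed
++ * under hash_split + 1 and lookups for that hash visit both blocks.
++ */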
++ if (keycmp(path->ip_container, iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split)),
++ iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split -1))) == 0)
++ continued = 1;
++
++ memcpy(iam_leaf_entry_shift(path, (struct iam_leaf_entry *)data2, 1),
++ iam_leaf_entry_shift(path, leaf.entries, split),
++ split * iam_leaf_entry_size(path));
++
++ /* Which block gets the new entry? */
++ dx_insert_block(path, path->ip_frame, hash_split + continued, newblock);
++ err = ext3_journal_dirty_metadata (handle, bh2);
++ if (err)
++ goto journal_error;
++ err = ext3_journal_dirty_metadata (handle, leaf.bh);
++ if (err)
++ goto journal_error;
++ brelse (bh2);
++ iam_leaf_fini(&leaf);
++errout:
++ return err;
++}
++
++static int split_index_node(handle_t *handle, struct iam_path *path);
++/*
++ * Insert new record @r with key @k into container @c (within context of
++ * transaction @h).
++ *
++ * Return values: 0: success, -ve: error, including -EEXIST when record with
++ * given key is already present.
++ *
++ * postcondition: ergo(result == 0 || result == -EEXIST,
++ * iam_lookup(c, k, r2) > 0 &&
++ * !memcmp(r, r2, c->ic_descr->id_rec_size));
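++ *
++ * (ergo(a, b) denotes logical implication: !a || b.)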
++ */
++int iam_insert(handle_t *handle, struct iam_container *c, struct iam_key *k,
++ struct iam_rec *r)
++{
++ struct dx_hash_info hinfo;
++ struct iam_path_compat cpath;
++ struct iam_path *path = &cpath.ipc_path;
++ struct htree_cookie hc = {
++ .hinfo = &hinfo
++ };
++ int err, i;
++
++ iam_path_init(path, c, &hc);
++ for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
++ path->ip_key_scratch[i] =
++ (struct iam_key *)&cpath.ipc_scrach[i];
++ err = dx_lookup(path);
++ if (err)
++ goto errout;
++
++ err = iam_leaf_insert(handle, path, k, r);
++
++ if (err != -ENOSPC)
++ goto errout;
++
++ err = split_index_node(handle, path);
++ if (err)
++ goto errout;
++
++ err = split_leaf_node(handle, path);
++ if (err)
++ goto errout;
++
++ err = iam_leaf_insert(handle, path, k, r);
++errout:
++ iam_path_fini(path);
++ return err;
++}
++EXPORT_SYMBOL(iam_insert);
++
++static int iam_leaf_delete(handle_t *handle, struct iam_path *path,
++ struct iam_key *k)
++{
++ struct iam_leaf leaf;
++ struct iam_leaf_entry *p, *q;
++ int err, count;
++
++ err = iam_leaf_init(path, &leaf);
++ if (err)
++ return err;
++
++ err = iam_leaf_lookup(path, &leaf, k);
++ if (err)
++ goto errout;
++
++ count = dx_get_count((struct iam_entry*)leaf.entries);
++ /* remove the entry with key @k from the leaf entries */
++ p = iam_leaf_entry_shift(path, leaf.at, 1);
++ q = iam_leaf_entry_shift(path, leaf.entries, count - 1);
++ while (p < q) {
++ memcpy(p, iam_leaf_entry_shift(path, p, 1), iam_leaf_entry_size(path));
++ p = iam_leaf_entry_shift(path, p, 1);
++ }
++ dx_set_count((struct iam_entry*)leaf.entries, count - 1);
++
++ err = ext3_journal_dirty_metadata(handle, leaf.bh);
++ if (err)
++ ext3_std_error(path_obj(path)->i_sb, err);
++errout:
++ iam_leaf_fini(&leaf);
++ return err;
++}
++
++/*
++ * Delete existing record with key @k.
++ *
++ * Return values: 0: success, -ENOENT: not-found, -ve: other error.
++ *
++ * postcondition: ergo(result == 0 || result == -ENOENT,
++ * !iam_lookup(c, k, *));
++ */
++int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k)
++{
++ struct dx_hash_info hinfo;
++ struct iam_path_compat cpath;
++ struct iam_path *path = &cpath.ipc_path;
++ struct htree_cookie hc = {
++ .hinfo = &hinfo
++ };
++ int err, i;
++
++ iam_path_init(path, c, &hc);
++ for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
++ path->ip_key_scratch[i] =
++ (struct iam_key *)&cpath.ipc_scrach[i];
++ err = dx_lookup(path);
++ if (err)
++ goto errout;
++
++ err = iam_leaf_delete(h, path, k);
++errout:
++ iam_path_fini(path);
++ return err;
++}
++EXPORT_SYMBOL(iam_delete);
++
++static int iam_leaf_update(handle_t *handle, struct iam_path *path,
++ struct iam_key *k, struct iam_rec *r)
++{
++ struct iam_leaf leaf;
++ int err;
++
++ err = iam_leaf_init(path, &leaf);
++ if (err)
++ return err;
++
++ err = iam_leaf_lookup(path, &leaf, k);
++ if (err)
++ goto errout;
++
++ memcpy(iam_leaf_entry_at(path, leaf.at), r, path_descr(path)->id_rec_size);
++ memcpy(iam_leaf_key_at(path, leaf.at), k, path_descr(path)->id_key_size);
++
++ err = ext3_journal_dirty_metadata(handle, leaf.bh);
++ if (err)
++ ext3_std_error(path_obj(path)->i_sb, err);
++errout:
++ iam_leaf_fini(&leaf);
++ return err;
++}
++
++/*
++ * Replace existing record with key @k, or insert new one. New record data are
++ * in @r.
++ *
++ * Return values: 0: success, -ve: error.
++ *
++ * postcondition: ergo(result == 0, iam_lookup(c, k, r2) > 0 &&
++ * !memcmp(r, r2, c->ic_descr->id_rec_size));
++ */
++int iam_update(handle_t *h, struct iam_container *c,
++ struct iam_key *k, struct iam_rec *r)
++{
++ struct dx_hash_info hinfo;
++ struct iam_path_compat cpath;
++ struct iam_path *path = &cpath.ipc_path;
++ struct htree_cookie hc = {
++ .hinfo = &hinfo
++ };
++ int err, i;
++
++ iam_path_init(path, c, &hc);
++ for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
++ path->ip_key_scratch[i] =
++ (struct iam_key *)&cpath.ipc_scrach[i];
++ err = dx_lookup(path);
++ if (err)
++ goto errout;
++
++ err = iam_leaf_update(h, path, k, r);
++errout:
++ iam_path_fini(path);
++ return err;
++}
++EXPORT_SYMBOL(iam_update);
++
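++/*
++ * Taken together, a typical call sequence for the container API above is
++ * (sketch; error handling elided, @c, @h, @k and @r are caller-supplied):
++ *
++ *     iam_container_init(c, descr, inode);
++ *     err = iam_insert(h, c, k, r);    0, or -EEXIST if @k is present
++ *     err = iam_lookup(c, k, r);       +ve when found, 0 when not
++ *     err = iam_update(h, c, k, r);
++ *     err = iam_delete(h, c, k);       -ENOENT when absent
++ *     iam_container_fini(c);
++ */
++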
+ /*
+ * This function increments the frame pointer to search the next leaf
+ * block, and reads in the necessary intervening nodes if the search
+@@ -2245,59 +2424,21 @@ static int ext3_add_entry (handle_t *han
+ }
+
+ #ifdef CONFIG_EXT3_INDEX
+-/*
+- * Returns 0 for success, or a negative error value
+- */
+-static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+- struct inode *inode)
++static int split_index_node(handle_t *handle, struct iam_path *path)
+ {
+- struct iam_path_compat cpath;
+- struct iam_path *path = &cpath.ipc_path;
+- struct iam_descr *param;
+- struct iam_frame *frame, *safe;
++
+ struct iam_entry *entries; /* old block contents */
+ struct iam_entry *entries2; /* new block contents */
+- struct dx_hash_info hinfo;
+- struct buffer_head * bh;
++ struct iam_frame *frame, *safe;
+ struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0};
+- struct inode *dir = dentry->d_parent->d_inode;
+- struct super_block * sb = dir->i_sb;
+- struct ext3_dir_entry_2 *de;
+ u32 newblock[DX_MAX_TREE_HEIGHT] = {0};
+- int err;
++ struct inode *dir = path_obj(path);
+ int nr_splet;
+- int i;
+- size_t isize;
+-
+- iam_path_compat_init(&cpath, dir);
+- param = path_descr(path);
++ int i, err;
+
+- err = dx_probe(dentry, NULL, &hinfo, path);
+- if (err != 0)
+- return err;
+ frame = path->ip_frame;
+ entries = frame->entries;
+
+- /* XXX nikita: global serialization! */
+- isize = dir->i_size;
+-
+- err = param->id_node_read(path->ip_container,
+- (iam_ptr_t)dx_get_block(path,
+- frame->at), handle, &bh);
+- if (err != 0)
+- goto cleanup;
+-
+- BUFFER_TRACE(bh, "get_write_access");
+- err = ext3_journal_get_write_access(handle, bh);
+- if (err)
+- goto journal_error;
+-
+- err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+- if (err != -ENOSPC) {
+- bh = NULL;
+- goto cleanup;
+- }
+-
+ /*
+ * Tall-tree handling: we might have to split multiple index blocks
+ * all the way up to tree root. Tricky point here is error handling:
+@@ -2320,7 +2461,7 @@ static int ext3_dx_add_entry(handle_t *h
+ dx_get_count(frame->entries) == dx_get_limit(frame->entries);
+ --frame, ++nr_splet) {
+ if (nr_splet == DX_MAX_TREE_HEIGHT) {
+- ext3_warning(sb, __FUNCTION__,
++ ext3_warning(dir->i_sb, __FUNCTION__,
+ "Directory index full!\n");
+ err = -ENOSPC;
+ goto cleanup;
+@@ -2333,7 +2474,7 @@ static int ext3_dx_add_entry(handle_t *h
+ for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
+ bh_new[i] = ext3_append (handle, dir, &newblock[i], &err);
+ if (!bh_new[i] ||
+- param->id_node_init(path->ip_container, bh_new[i], 0) != 0)
++ path_descr(path)->id_node_init(path->ip_container, bh_new[i], 0) != 0)
+ goto cleanup;
+ BUFFER_TRACE(frame->bh, "get_write_access");
+ err = ext3_journal_get_write_access(handle, frame->bh);
+@@ -2439,9 +2580,71 @@ static int ext3_dx_add_entry(handle_t *h
+ goto journal_error;
+ }
+ }
++ goto cleanup;
++journal_error:
++ ext3_std_error(dir->i_sb, err);
++
++cleanup:
++ for (i = 0; i < ARRAY_SIZE(bh_new); ++i) {
++ if (bh_new[i] != NULL)
++ brelse(bh_new[i]);
++ }
++ return err;
++}
++
++/*
++ * Returns 0 for success, or a negative error value
++ */
++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
++ struct inode *inode)
++{
++ struct iam_path_compat cpath;
++ struct iam_path *path = &cpath.ipc_path;
++ struct iam_descr *param;
++ struct iam_frame *frame;
++ struct dx_hash_info hinfo;
++ struct buffer_head * bh = NULL;
++ struct inode *dir = dentry->d_parent->d_inode;
++ struct ext3_dir_entry_2 *de;
++ int err;
++ size_t isize;
++
++ iam_path_compat_init(&cpath, dir);
++ param = path_descr(path);
++
++ err = dx_probe(dentry, NULL, &hinfo, path);
++ if (err != 0)
++ return err;
++ frame = path->ip_frame;
++
++ /* XXX nikita: global serialization! */
++ isize = dir->i_size;
++
++ err = param->id_node_read(path->ip_container, (iam_ptr_t)dx_get_block(path, frame->at),
++ handle, &bh);
++ if (err != 0)
++ goto cleanup;
++
++ BUFFER_TRACE(bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, bh);
++ if (err)
++ goto journal_error;
++
++ err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
++ if (err != -ENOSPC) {
++ bh = NULL;
++ goto cleanup;
++ }
++
++ err = split_index_node(handle, path);
++ if (err)
++ goto cleanup;
++
++ /* split the leaf block and locate the slot for the new dirent */
+ de = do_split(handle, path, &bh, --frame, &hinfo, &err);
+ if (!de)
+ goto cleanup;
++
+ assert(dx_node_check(path, frame));
+ err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ goto cleanup2;
+@@ -2452,10 +2655,6 @@ cleanup:
+ if (bh)
+ brelse(bh);
+ cleanup2:
+- for (i = 0; i < ARRAY_SIZE(bh_new); ++i) {
+- if (bh_new[i] != NULL)
+- brelse(bh_new[i]);
+- }
+ if (err)
+ inode->i_size = isize;
+ iam_path_fini(path);
+Index: iam/include/linux/lustre_iam.h
+===================================================================
+--- iam.orig/include/linux/lustre_iam.h
++++ iam/include/linux/lustre_iam.h
+@@ -0,0 +1,212 @@
++/*
++ * Maximal number of non-leaf levels in htree. In the stock ext3 this is 2.
++ */
++enum {
++ DX_MAX_TREE_HEIGHT = 5,
++ DX_SCRATCH_KEYS = 2
++};
++
++/*
++ * Entry within index tree node. Consists of a key immediately followed
++ * (without padding) by a pointer to the child node.
++ *
++ * Both key and pointer are of variable size, hence incomplete type.
++ */
++struct iam_entry;
++
++struct iam_entry_compat {
++ __le32 hash;
++ __le32 block;
++};
++
++/*
++ * Incomplete type used to refer to keys in iam container.
++ *
++ * As key size can be different from container to container, iam has to use
++ * incomplete type. Clients cast pointer to iam_key to real key type and back.
++ */
++struct iam_key;
++
++/* Incomplete type use to refer to the records stored in iam containers. */
++struct iam_rec;
++
++typedef __u64 iam_ptr_t;
++
++/*
++ * Index node traversed during tree lookup.
++ */
++struct iam_frame {
++ struct buffer_head *bh; /* buffer holding node data */
++ struct iam_entry *entries; /* array of entries */
++ struct iam_entry *at; /* target entry, found by binary search */
++};
++
++/* leaf node reached by tree lookup */
++#define iam_leaf_entry iam_rec
++struct iam_leaf {
++ struct buffer_head *bh;
++ struct iam_leaf_entry *entries;
++ struct iam_leaf_entry *at;
++};
++
++struct iam_path;
++struct iam_container;
++
++/*
++ * Parameters, describing a flavor of iam container.
++ */
++struct iam_descr {
++ /*
++ * Size of a key in this container, in bytes.
++ */
++ size_t id_key_size;
++ /*
++ * Size of a pointer to the next level (stored in index nodes), in
++ * bytes.
++ */
++ size_t id_ptr_size;
++ /*
++ * Size of a record (stored in leaf nodes), in bytes.
++ */
++ size_t id_rec_size;
++ /*
++ * Size of unused (by iam) space at the beginning of every non-root
++ * node, in bytes. Used for compatibility with ext3.
++ */
++ size_t id_node_gap;
++ /*
++ * Size of unused (by iam) space at the beginning of root node, in
++ * bytes. Used for compatibility with ext3.
++ */
++ size_t id_root_gap;
++
++ /*
++ * Returns pointer (in the same sense as pointer in index entry) to
++ * the root node.
++ */
++ __u32 (*id_root_ptr)(struct iam_container *c);
++
++ /*
++ * Check validity and consistency of index node. This is called when
++ * iam just loaded new node into frame.
++ */
++ int (*id_node_check)(struct iam_path *path, struct iam_frame *frame);
++ /*
++ * Initialize new node (stored in @bh) that is going to be added into
++ * tree.
++ */
++ int (*id_node_init)(struct iam_container *c,
++ struct buffer_head *bh, int root);
++ int (*id_node_read)(struct iam_container *c, iam_ptr_t ptr,
++ handle_t *h, struct buffer_head **bh);
++ /*
++ * Key comparison function. Returns -1, 0, +1.
++ */
++ int (*id_keycmp)(struct iam_container *c,
++ struct iam_key *k1, struct iam_key *k2);
++ /*
++ * Create new container.
++ *
++ * Newly created container has a root node and a single leaf. Leaf
++ * contains single record with the smallest possible key.
++ */
++ int (*id_create)(struct iam_container *c);
++ struct {
++ /*
++ * leaf operations.
++ */
++ /*
++ * returns true iff leaf is positioned at the last entry.
++ */
++ int (*at_end)(struct iam_container *c, struct iam_leaf *l);
++ /* position leaf at the first entry */
++ void (*start)(struct iam_container *c, struct iam_leaf *l);
++ /* move leaf to the next entry. */
++ void (*next)(struct iam_container *c, struct iam_leaf *l);
++ /* return key of current leaf record in @k */
++ void (*key)(struct iam_container *c, struct iam_leaf *l,
++ struct iam_key *k);
++ /* return pointer to entry body */
++ struct iam_rec *(*rec)(struct iam_container *c,
++ struct iam_leaf *l);
++ } id_leaf;
++};
++
++struct iam_container {
++ /*
++ * Underlying flat file. IO against this object is issued to
++ * read/write nodes.
++ */
++ struct inode *ic_object;
++ /*
++ * container flavor.
++ */
++ struct iam_descr *ic_descr;
++ /*
++ * pointer to flavor-specific per-container data.
++ */
++ void *ic_descr_data;
++};
++
++/*
++ * Structure to keep track of a path drilled through htree.
++ */
++struct iam_path {
++ /*
++ * Parent container.
++ */
++ struct iam_container *ip_container;
++ /*
++ * Number of index levels minus one.
++ */
++ int ip_indirect;
++ /*
++ * Nodes that top-to-bottom traversal passed through.
++ */
++ struct iam_frame ip_frames[DX_MAX_TREE_HEIGHT];
++ /*
++ * Last filled frame in ->ip_frames. Refers to the 'twig' node (one
++ * immediately above leaf).
++ */
++ struct iam_frame *ip_frame;
++ /*
++ * Leaf node: a child of ->ip_frame.
++ */
++ struct iam_leaf *ip_leaf;
++ /*
++ * Key searched for.
++ */
++ struct iam_key *ip_key_target;
++ /*
++ * Scratch-pad area for temporary keys.
++ */
++ struct iam_key *ip_key_scratch[DX_SCRATCH_KEYS];
++ /*
++ * pointer to flavor-specific per-container data.
++ */
++ void *ip_descr_data;
++};
++
++/*
++ * Helper structure for legacy htrees.
++ */
++struct iam_path_compat {
++ struct iam_path ipc_path;
++ struct iam_container ipc_container;
++ __u32 ipc_scrach[DX_SCRATCH_KEYS];
++};
++
++int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r);
++int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k);
++int iam_update(handle_t *h, struct iam_container *c, struct iam_key *k, struct iam_rec *r);
++int iam_insert(handle_t *handle, struct iam_container *c, struct iam_key *k, struct iam_rec *r);
++/*
++ * Initialize container @c, acquires additional reference on @inode.
++ */
++int iam_container_init(struct iam_container *c,
++ struct iam_descr *descr, struct inode *inode);
++/*
++ * Finalize container @c, release all resources.
++ */
++void iam_container_fini(struct iam_container *c);
++
--- /dev/null
+Index: iam/include/linux/ext3_fs.h
+===================================================================
+--- iam.orig/include/linux/ext3_fs.h 2007-05-23 11:18:17.000000000 +0800
++++ iam/include/linux/ext3_fs.h 2007-05-23 11:18:20.000000000 +0800
+@@ -758,9 +758,7 @@
+ extern void rsv_window_add(struct super_block *sb, struct reserve_window_node *rsv);
+
+ /* dir.c */
+-extern int ext3_check_dir_entry(const char *, struct inode *,
+- struct ext3_dir_entry_2 *,
+- struct buffer_head *, unsigned long);
++
+ extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
+ __u32 minor_hash,
+ struct ext3_dir_entry_2 *dirent);
+Index: iam/include/linux/lustre_iam.h
+===================================================================
+--- iam.orig/include/linux/lustre_iam.h 2007-05-23 11:18:18.000000000 +0800
++++ iam/include/linux/lustre_iam.h 2007-05-23 11:18:20.000000000 +0800
+@@ -1,9 +1,68 @@
++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++ * vim:expandtab:shiftwidth=8:tabstop=8:
++ *
++ * lustre_iam.h
++ * Top-level entry points into osd module
++ *
++ * Copyright (c) 2006 Cluster File Systems, Inc.
++ * Author: Wang Di <wangdi@clusterfs.com>
++ * Author: Nikita Danilov <nikita@clusterfs.com>
++ *
++ * This file is part of the Lustre file system, http://www.lustre.org
++ * Lustre is a trademark of Cluster File Systems, Inc.
++ *
++ * You may have signed or agreed to another license before downloading
++ * this software. If so, you are bound by the terms and conditions
++ * of that agreement, and the following does not apply to you. See the
++ * LICENSE file included with this distribution for more information.
++ *
++ * If you did not agree to a different license, then this copy of Lustre
++ * is open source software; you can redistribute it and/or modify it
++ * under the terms of version 2 of the GNU General Public License as
++ * published by the Free Software Foundation.
++ *
++ * In either case, Lustre is distributed in the hope that it will be
++ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
++ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * license text for more details.
++ */
++
++#ifndef __LINUX_LUSTRE_IAM_H__
++#define __LINUX_LUSTRE_IAM_H__
++
++/* handle_t, journal_start(), journal_stop() */
++#include <linux/jbd.h>
++
+ /*
+- * Maximal number of non-leaf levels in htree. In the stock ext3 this is 2.
++ * linux/include/linux/lustre_iam.h
+ */
++
+ enum {
++ /*
++ * Maximal number of non-leaf levels in htree. In the stock ext3 this
++ * is 2.
++ */
+ DX_MAX_TREE_HEIGHT = 5,
+- DX_SCRATCH_KEYS = 2
++ /*
++ * Scratch keys used by generic code for temporaries.
++ *
++ * Allocation:
++ *
++ * [0] reserved for assertions and as a staging area for
++ * record keys immediately used for key comparisons.
++ *
++ * [1] reserved for record key, stored during iteration over
++ * node records (see dx_node_check()).
++ *
++ * [2] reserved for leaf node operations.
++ *
++ * [3] reserved for index operations.
++ */
++ DX_SCRATCH_KEYS = 4,
++ /*
++ * Maximal format name length.
++ */
++ DX_FMT_NAME_LEN = 16
+ };
+
+ /*
+@@ -30,6 +89,11 @@
+ /* Incomplete type use to refer to the records stored in iam containers. */
+ struct iam_rec;
+
++struct iam_cookie {
++ struct iam_key *ic_key;
++ struct iam_rec *ic_rec;
++};
++
+ typedef __u64 iam_ptr_t;
+
+ /*
+@@ -41,45 +105,25 @@
+ struct iam_entry *at; /* target entry, found by binary search */
+ };
+
+-/* leaf node reached by tree lookup */
+-#define iam_leaf_entry iam_rec
+-struct iam_leaf {
+- struct buffer_head *bh;
+- struct iam_leaf_entry *entries;
+- struct iam_leaf_entry *at;
+-};
++/*
++ * Opaque entry in the leaf node.
++ */
++struct iam_lentry;
+
+ struct iam_path;
+ struct iam_container;
+
+-/*
+- * Parameters, describing a flavor of iam container.
+- */
+-struct iam_descr {
+- /*
+- * Size of a key in this container, in bytes.
+- */
+- size_t id_key_size;
+- /*
+- * Size of a pointer to the next level (stored in index nodes), in
+- * bytes.
+- */
+- size_t id_ptr_size;
+- /*
+- * Size of a record (stored in leaf nodes), in bytes.
+- */
+- size_t id_rec_size;
+- /*
+- * Size of unused (by iam) space at the beginning of every non-root
+- * node, in bytes. Used for compatibility with ext3.
+- */
+- size_t id_node_gap;
+- /*
+- * Size of unused (by iam) space at the beginning of root node, in
+- * bytes. Used for compatibility with ext3.
+- */
+- size_t id_root_gap;
+
++/* leaf node reached by tree lookup */
++struct iam_leaf {
++ struct iam_path *il_path;
++ struct buffer_head *il_bh;
++ struct iam_lentry *il_entries;
++ struct iam_lentry *il_at;
++ void *il_descr_data;
++};
++
++struct iam_operations {
+ /*
+ * Returns pointer (in the same sense as pointer in index entry) to
+ * the root node.
+@@ -102,8 +146,8 @@
+ /*
+ * Key comparison function. Returns -1, 0, +1.
+ */
+- int (*id_keycmp)(struct iam_container *c,
+- struct iam_key *k1, struct iam_key *k2);
++ int (*id_keycmp)(const struct iam_container *c,
++ const struct iam_key *k1, const struct iam_key *k2);
+ /*
+ * Create new container.
+ *
+@@ -111,25 +155,113 @@
+ * contains single record with the smallest possible key.
+ */
+ int (*id_create)(struct iam_container *c);
+- struct {
++ /*
++ * Format name.
++ */
++ char id_name[DX_FMT_NAME_LEN];
++};
++
++struct iam_leaf_operations {
+ /*
+ * leaf operations.
+ */
++
++ /*
++ * initialize just loaded leaf node.
++ */
++ int (*init)(struct iam_leaf *p);
++ /*
++ * Format new node.
++ */
++ void (*init_new)(struct iam_container *c, struct buffer_head *bh);
++ /*
++ * Release resources.
++ */
++ void (*fini)(struct iam_leaf *l);
+ /*
+ * returns true iff leaf is positioned at the last entry.
+ */
+- int (*at_end)(struct iam_container *c, struct iam_leaf *l);
++ int (*at_end)(const struct iam_leaf *l);
+ /* position leaf at the first entry */
+- void (*start)(struct iam_container *c, struct iam_leaf *l);
++ void (*start)(struct iam_leaf *l);
+ /* move leaf to the next entry. */
+- void (*next)(struct iam_container *c, struct iam_leaf *l);
+- /* return key of current leaf record in @k */
+- void (*key)(struct iam_container *c, struct iam_leaf *l,
+- struct iam_key *k);
+- /* return pointer to entry body */
+- struct iam_rec *(*rec)(struct iam_container *c,
+- struct iam_leaf *l);
+- } id_leaf;
++ void (*next)(struct iam_leaf *l);
++ /* return key of current leaf record. This method may return
++ * either pointer to the key stored in node, or copy key into
++ * @k buffer supplied by caller and return pointer to this
++ * buffer. The latter approach is used when keys in nodes are
++ * not stored in plain form (e.g., htree doesn't store keys at
++ * all).
++ *
++ * Caller should assume that returned pointer is only valid
++ * while leaf node is pinned and locked.*/
++ struct iam_key *(*key)(const struct iam_leaf *l, struct iam_key *k);
++ /* return pointer to entry body. Pointer is valid while
++ corresponding leaf node is locked and pinned. */
++ struct iam_rec *(*rec)(const struct iam_leaf *l);
++
++ void (*key_set)(struct iam_leaf *l, const struct iam_key *k);
++ void (*rec_set)(struct iam_leaf *l, const struct iam_rec *r);
++
++ /*
++ * Search leaf @l for a record with key @k or for a place
++ * where such record is to be inserted.
++ *
++ * Scratch keys from @path can be used.
++ */
++ int (*lookup)(struct iam_leaf *l, const struct iam_key *k);
++
++ int (*can_add)(const struct iam_leaf *l,
++ const struct iam_key *k, const struct iam_rec *r);
++ /*
++ * add rec for a leaf
++ */
++ void (*rec_add)(struct iam_leaf *l,
++ const struct iam_key *k, const struct iam_rec *r);
++ /*
++ * remove rec for a leaf
++ */
++ void (*rec_del)(struct iam_leaf *l);
++ /*
++ * split leaf node, moving some entries into @bh (the latter currently
++ * is assumed to be empty).
++ */
++ void (*split)(struct iam_leaf *l, struct buffer_head *bh);
++};
++
++struct iam_path *iam_leaf_path(const struct iam_leaf *leaf);
++struct iam_container *iam_leaf_container(const struct iam_leaf *leaf);
++
++/*
++ * Parameters, describing a flavor of iam container.
++ */
++struct iam_descr {
++ /*
++ * Size of a key in this container, in bytes.
++ */
++ size_t id_key_size;
++ /*
++ * Size of a pointer to the next level (stored in index nodes), in
++ * bytes.
++ */
++ size_t id_ptr_size;
++ /*
++ * Size of a record (stored in leaf nodes), in bytes.
++ */
++ size_t id_rec_size;
++ /*
++ * Size of unused (by iam) space at the beginning of every non-root
++ * node, in bytes. Used for compatibility with ext3.
++ */
++ size_t id_node_gap;
++ /*
++ * Size of unused (by iam) space at the beginning of root node, in
++ * bytes. Used for compatibility with ext3.
++ */
++ size_t id_root_gap;
++
++ struct iam_operations *id_ops;
++ struct iam_leaf_operations *id_leaf_ops;
+ };
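++
++/*
++ * A hypothetical flavor definition, for illustration only (the names and
++ * field values below are not from this patch):
++ *
++ *     static struct iam_descr my_fix_descr = {
++ *             .id_key_size = 8,
++ *             .id_ptr_size = 4,
++ *             .id_rec_size = 16,
++ *             .id_node_gap = 0,
++ *             .id_root_gap = 32,
++ *             .id_ops      = &my_ops,
++ *             .id_leaf_ops = &my_leaf_ops,
++ *     };
++ */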
+
+ struct iam_container {
+@@ -142,10 +274,17 @@
+ * container flavor.
+ */
+ struct iam_descr *ic_descr;
++};
++
++/*
++ * description-specific part of iam_path. This is usually embedded into larger
++ * structure.
++ */
++struct iam_path_descr {
+ /*
+- * pointer to flavor-specific per-container data.
++ * Scratch-pad area for temporary keys.
+ */
+- void *ic_descr_data;
++ struct iam_key *ipd_key_scratch[DX_SCRATCH_KEYS];
+ };
+
+ /*
+@@ -172,36 +311,240 @@
+ /*
+ * Leaf node: a child of ->ip_frame.
+ */
+- struct iam_leaf *ip_leaf;
++ struct iam_leaf ip_leaf;
+ /*
+ * Key searched for.
+ */
+- struct iam_key *ip_key_target;
+- /*
+- * Scratch-pad area for temporary keys.
+- */
+- struct iam_key *ip_key_scratch[DX_SCRATCH_KEYS];
++ const struct iam_key *ip_key_target;
+ /*
+- * pointer to flavor-specific per-container data.
++ * Description-specific data.
+ */
+- void *ip_descr_data;
++ struct iam_path_descr *ip_data;
+ };
+
++struct dx_hash_info;
++
+ /*
+ * Helper structure for legacy htrees.
+ */
+ struct iam_path_compat {
+ struct iam_path ipc_path;
+ struct iam_container ipc_container;
+- __u32 ipc_scrach[DX_SCRATCH_KEYS];
++ __u32 ipc_scratch[DX_SCRATCH_KEYS];
++ struct dx_hash_info *ipc_hinfo;
++ struct dentry *ipc_dentry;
++ struct iam_path_descr ipc_descr;
++};
++
++/*
++ * iam cursor (iterator) api.
++ */
++
++/*
++ * States of iterator state machine.
++ */
++enum iam_it_state {
++ /* initial state */
++ IAM_IT_DETACHED,
++ /* iterator is above particular record in the container */
++ IAM_IT_ATTACHED
+ };
+
+-int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r);
+-int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k);
+-int iam_update(handle_t *h, struct iam_container *c, struct iam_key *k, struct iam_rec *r);
+-int iam_insert(handle_t *handle, struct iam_container *c, struct iam_key *k, struct iam_rec *r);
+ /*
+- * Initialize container @c, acquires additional reference on @inode.
++ * Flags controlling iterator functionality.
++ */
++enum iam_it_flags {
++ /*
++ * this iterator will move (iam_it_{prev,next}() will be called on it)
++ */
++ IAM_IT_MOVE = (1 << 0),
++ /*
++ * tree can be updated through this iterator.
++ */
++ IAM_IT_WRITE = (1 << 1)
++};
++
++/*
++ * Iterator.
++ *
++ * Immediately after call to iam_it_init() iterator is in "detached"
++ * (IAM_IT_DETACHED) state: it is associated with given parent container, but
++ * doesn't point to any particular record in this container.
++ *
++ * After successful call to iam_it_get() and until corresponding call to
++ * iam_it_put() iterator is in "attached" state (IAM_IT_ATTACHED).
++ *
++ * Attached iterator can move through records in a container (provided
++ * IAM_IT_MOVE permission) in a key order, can get record and key values as it
++ * passes over them, and can modify container (provided IAM_IT_WRITE
++ * permission).
++ *
++ * Concurrency: iterators are supposed to be local to thread. Interfaces below
++ * do no internal serialization.
++ */
++struct iam_iterator {
++ /*
++ * iterator flags, taken from enum iam_it_flags.
++ */
++ __u32 ii_flags;
++ enum iam_it_state ii_state;
++ /*
++ * path to the record. Valid in IAM_IT_ATTACHED state.
++ */
++ struct iam_path ii_path;
++};
++
++void iam_path_init(struct iam_path *path, struct iam_container *c,
++ struct iam_path_descr *pd);
++void iam_path_fini(struct iam_path *path);
++
++void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode);
++void iam_path_compat_fini(struct iam_path_compat *path);
++
++struct iam_path_descr *iam_ipd_alloc(void *area, int keysize);
++void iam_ipd_free(struct iam_path_descr *ipd);
++
++/*
++ * Initialize iterator to IAM_IT_DETACHED state.
++ *
++ * postcondition: it_state(it) == IAM_IT_DETACHED
++ */
++int iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags,
++ struct iam_path_descr *pd);
++/*
++ * Finalize iterator and release all resources.
++ *
++ * precondition: it_state(it) == IAM_IT_DETACHED
++ */
++void iam_it_fini(struct iam_iterator *it);
++
++/*
++ * Attach iterator. After successful completion, @it points to record with the
++ * largest key not larger than @k. Semantics of ->id_create() method guarantee
++ * that such record will always be found.
++ *
++ * Return value: 0: positioned on existing record,
++ * -ve: error.
++ *
++ * precondition: it_state(it) == IAM_IT_DETACHED
++ * postcondition: ergo(result == 0,
++ * (it_state(it) == IAM_IT_ATTACHED &&
++ * it_keycmp(it, iam_it_key_get(it, *), k) <= 0))
++ */
++int iam_it_get(struct iam_iterator *it, const struct iam_key *k);
++
++/*
++ * Duplicates iterator.
++ *
++ * postcondition: it_state(dst) == it_state(src) &&
++ * iam_it_container(dst) == iam_it_container(src) &&
++ * dst->ii_flags = src->ii_flags &&
++ * ergo(it_state(it) == IAM_IT_ATTACHED,
++ * iam_it_rec_get(dst) == iam_it_rec_get(src) &&
++ * iam_it_key_get(dst, *1) == iam_it_key_get(src, *2))
++ */
++void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src);
++
++/*
++ * Detach iterator. Does nothing in detached state.
++ *
++ * postcondition: it_state(it) == IAM_IT_DETACHED
++ */
++void iam_it_put(struct iam_iterator *it);
++
++/*
++ * Move iterator one record right.
++ *
++ * Return value: 0: success,
++ * +1: end of container reached
++ * -ve: error
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_MOVE
++ * postcondition: ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED)
++ */
++int iam_it_next(struct iam_iterator *it);
++
++/*
++ * Return pointer to the record under iterator.
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
++ */
++struct iam_rec *iam_it_rec_get(const struct iam_iterator *it);
++
++/*
++ * Replace contents of record under iterator.
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
++ * postcondition: it_state(it) == IAM_IT_ATTACHED &&
++ * ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...))
++ */
++int iam_it_rec_set(handle_t *h, struct iam_iterator *it, struct iam_rec *r);
++
++/*
++ * Place key under iterator in @k, return @k
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
++ */
++struct iam_key *iam_it_key_get(const struct iam_iterator *it,
++ struct iam_key *k);
++
++/*
++ * Insert new record with key @k and contents from @r, shifting records to the
++ * right.
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED &&
++ * it->ii_flags&IAM_IT_WRITE &&
++ * it_keycmp(it, iam_it_key_get(it, *), k) < 0
++ * postcondition: it_state(it) == IAM_IT_ATTACHED &&
++ * ergo(result == 0,
++ * it_keycmp(it, iam_it_key_get(it, *), k) == 0 &&
++ * !memcmp(iam_it_rec_get(it), r, ...))
++ */
++int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
++ const struct iam_key *k, const struct iam_rec *r);
++/*
++ * Delete record under iterator.
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
++ */
++int iam_it_rec_delete(handle_t *h, struct iam_iterator *it);
++
++typedef __u64 iam_pos_t;
++
++/*
++ * Convert iterator to cookie.
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED &&
++ * path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t)
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
++ */
++iam_pos_t iam_it_store(const struct iam_iterator *it);
++
++/*
++ * Restore iterator from cookie.
++ *
++ * precondition: it_state(it) == IAM_IT_DETACHED && it->ii_flags&IAM_IT_MOVE &&
++ * path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t)
++ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED &&
++ * iam_it_store(it) == pos)
++ */
++int iam_it_load(struct iam_iterator *it, iam_pos_t pos);
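++
++/*
++ * Typical read-only scan with the iterator API above (sketch; error
++ * handling elided):
++ *
++ *     iam_it_init(&it, c, IAM_IT_MOVE, pd);
++ *     if (iam_it_get(&it, k) == 0) {
++ *             do {
++ *                     rec = iam_it_rec_get(&it);
++ *                     ...consume rec...
++ *             } while (iam_it_next(&it) == 0);
++ *             iam_it_put(&it);
++ *     }
++ *     iam_it_fini(&it);
++ */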
++
++int iam_lookup(struct iam_container *c, const struct iam_key *k,
++ struct iam_rec *r, struct iam_path_descr *pd);
++int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k,
++ struct iam_path_descr *pd);
++int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k,
++ struct iam_rec *r, struct iam_path_descr *pd);
++int iam_insert(handle_t *handle, struct iam_container *c,
++ const struct iam_key *k,
++ struct iam_rec *r, struct iam_path_descr *pd);
++/*
++ * Initialize container @c.
+ */
+ int iam_container_init(struct iam_container *c,
+ struct iam_descr *descr, struct inode *inode);
+@@ -210,3 +553,170 @@
+ */
+ void iam_container_fini(struct iam_container *c);
+
++/*
++ * Determine container format.
++ */
++int iam_container_setup(struct iam_container *c);
++
++#ifndef assert
++#define assert(test) J_ASSERT(test)
++#endif
++
++static inline struct iam_descr *iam_container_descr(struct iam_container *c)
++{
++ return c->ic_descr;
++}
++
++static inline struct iam_descr *iam_path_descr(const struct iam_path *p)
++{
++ return p->ip_container->ic_descr;
++}
++
++static inline struct inode *iam_path_obj(struct iam_path *p)
++{
++ return p->ip_container->ic_object;
++}
++
++static inline void iam_keycpy(const struct iam_container *c,
++ struct iam_key *k1, const struct iam_key *k2)
++{
++ memcpy(k1, k2, c->ic_descr->id_key_size);
++}
++
++static inline int iam_keycmp(const struct iam_container *c,
++ const struct iam_key *k1, const struct iam_key *k2)
++{
++ return c->ic_descr->id_ops->id_keycmp(c, k1, k2);
++}
++
++static inline void iam_reccpy(const struct iam_path *p, struct iam_rec *rec_dst,
++ const struct iam_rec *rec_src)
++{
++ memcpy(rec_dst, rec_src, iam_path_descr(p)->id_rec_size);
++}
++
++static inline void *iam_entry_off(struct iam_entry *entry, size_t off)
++{
++ return (void *)((char *)entry + off);
++}
++
++/* XXX This stuff is put here just because it is used by both iam.c and namei.c. */
++static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry)
++{
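++ /* only the low 24 bits of the on-disk field carry the block number;
++ * the top byte is masked off and left unused by iam */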
++ return le32_to_cpu(*(u32*)iam_entry_off(entry,
++ iam_path_descr(p)->id_key_size))
++ & 0x00ffffff;
++}
++
++static inline void dx_set_block(struct iam_path *p,
++ struct iam_entry *entry, unsigned value)
++{
++ *(u32*)iam_entry_off(entry,
++ iam_path_descr(p)->id_key_size) =
++ cpu_to_le32(value);
++}
++
++static inline void dx_set_key(struct iam_path *p, struct iam_entry *entry,
++ const struct iam_key *key)
++{
++ iam_keycpy(p->ip_container, iam_entry_off(entry, 0), key);
++}
++
++struct dx_countlimit {
++ __le16 limit;
++ __le16 count;
++};
++
++static inline unsigned dx_get_count(struct iam_entry *entries)
++{
++ return le16_to_cpu(((struct dx_countlimit *) entries)->count);
++}
++
++static inline unsigned dx_get_limit(struct iam_entry *entries)
++{
++ return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
++}
++
++static inline void dx_set_count(struct iam_entry *entries, unsigned value)
++{
++ ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
++}
++
++static inline unsigned dx_node_limit(struct iam_path *p)
++{
++ struct iam_descr *param = iam_path_descr(p);
++ unsigned entry_space = iam_path_obj(p)->i_sb->s_blocksize -
++ param->id_node_gap;
++ return entry_space / (param->id_key_size + param->id_ptr_size);
++}
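++
++/*
++ * Worked example with illustrative figures (not taken from this patch):
++ * for a 4096-byte block, id_node_gap == 0, 8-byte keys and 4-byte
++ * pointers, dx_node_limit() yields 4096 / 12 = 341 entries per node,
++ * one of which is the count/limit header.
++ */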
++
++static inline struct iam_entry *dx_get_entries(struct iam_path *path,
++ void *data, int root)
++{
++ struct iam_descr *param = iam_path_descr(path);
++ return data + (root ? param->id_root_gap : param->id_node_gap);
++}
++
++static inline struct iam_entry *dx_node_get_entries(struct iam_path *path,
++ struct iam_frame *frame)
++{
++ return dx_get_entries(path,
++ frame->bh->b_data, frame == path->ip_frames);
++}
++
++static inline struct iam_key *iam_path_key(const struct iam_path *path, int nr)
++{
++ assert(0 <= nr && nr < ARRAY_SIZE(path->ip_data->ipd_key_scratch));
++ return path->ip_data->ipd_key_scratch[nr];
++}
++
++int dx_lookup(struct iam_path *path);
++void dx_insert_block(struct iam_path *path, struct iam_frame *frame,
++ u32 hash, u32 block);
++
++int ext3_htree_next_block(struct inode *dir, __u32 hash,
++ struct iam_path *path, __u32 *start_hash);
++
++struct buffer_head *ext3_append(handle_t *handle, struct inode *inode,
++ u32 *block, int *err);
++int split_index_node(handle_t *handle, struct iam_path *path);
++
++/*
++ * external
++ */
++void iam_container_write_lock(struct iam_container *c);
++void iam_container_write_unlock(struct iam_container *c);
++
++void iam_container_read_lock(struct iam_container *c);
++void iam_container_read_unlock(struct iam_container *c);
++
++int iam_index_next(struct iam_container *c, struct iam_path *p);
++int iam_read_leaf(struct iam_path *p);
++
++int iam_node_read(struct iam_container *c, iam_ptr_t ptr,
++ handle_t *handle, struct buffer_head **bh);
++
++void iam_insert_key(struct iam_path *path, struct iam_frame *frame,
++ const struct iam_key *key, iam_ptr_t ptr);
++
++int iam_leaf_at_end(const struct iam_leaf *l);
++void iam_leaf_next(struct iam_leaf *folio);
++
++struct iam_path *iam_leaf_path(const struct iam_leaf *leaf);
++struct iam_container *iam_leaf_container(const struct iam_leaf *leaf);
++struct iam_descr *iam_leaf_descr(const struct iam_leaf *leaf);
++struct iam_leaf_operations *iam_leaf_ops(const struct iam_leaf *leaf);
++
++
++struct iam_format {
++ int (*if_guess)(struct iam_container *c);
++ struct list_head if_linkage;
++};
++
++void iam_format_register(struct iam_format *fmt);
++
++void iam_lfix_format_init(void);
++
++#endif /* __LINUX_LUSTRE_IAM_H__ */
+Index: iam/fs/ext3/iam.c
+===================================================================
+--- iam.orig/fs/ext3/iam.c 2007-05-23 09:56:30.476305206 +0800
++++ iam/fs/ext3/iam.c 2007-05-23 11:18:20.000000000 +0800
+@@ -0,0 +1,1436 @@
++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++ * vim:expandtab:shiftwidth=8:tabstop=8:
++ *
++ * iam.c
++ * Top-level entry points into iam module
++ *
++ * Copyright (c) 2006 Cluster File Systems, Inc.
++ * Author: Wang Di <wangdi@clusterfs.com>
++ * Author: Nikita Danilov <nikita@clusterfs.com>
++ *
++ * This file is part of the Lustre file system, http://www.lustre.org
++ * Lustre is a trademark of Cluster File Systems, Inc.
++ *
++ * You may have signed or agreed to another license before downloading
++ * this software. If so, you are bound by the terms and conditions
++ * of that agreement, and the following does not apply to you. See the
++ * LICENSE file included with this distribution for more information.
++ *
++ * If you did not agree to a different license, then this copy of Lustre
++ * is open source software; you can redistribute it and/or modify it
++ * under the terms of version 2 of the GNU General Public License as
++ * published by the Free Software Foundation.
++ *
++ * In either case, Lustre is distributed in the hope that it will be
++ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
++ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * license text for more details.
++ */
++
++/*
++ * iam: big theory statement.
++ *
++ * iam (Index Access Module) is a module providing abstraction of persistent
++ * transactional container on top of generalized ext3 htree.
++ *
++ * iam supports:
++ *
++ * - key, pointer, and record size specifiable per container.
++ *
++ * - trees taller than 2 index levels.
++ *
++ * - read/write to existing ext3 htree directories as iam containers.
++ *
++ * iam container is a tree, consisting of leaf nodes containing keys and
++ * records stored in this container, and index nodes, containing keys and
++ * pointers to leaf or index nodes.
++ *
++ * iam does not work with keys directly, instead it calls user-supplied key
++ * comparison function (->id_keycmp()).
++ *
++ * Pointers are (currently) interpreted as logical offsets (measured in
++ * blocksful) within underlying flat file on top of which iam tree lives.
++ *
++ * On-disk format:
++ *
++ * iam mostly tries to reuse existing htree formats.
++ *
++ * Format of index node:
++ *
++ * +-----+-------+-------+-------+------+-------+------------+
++ * | | count | | | | | |
++ * | gap | / | entry | entry | .... | entry | free space |
++ * | | limit | | | | | |
++ * +-----+-------+-------+-------+------+-------+------------+
++ *
++ * gap this part of node is never accessed by iam code. It
++ * exists for binary compatibility with ext3 htree (that,
++ * in turn, stores fake struct ext2_dirent for ext2
++ * compatibility), and to keep some unspecified per-node
++ * data. Gap can be different for root and non-root index
++ * nodes. Gap size can be specified for each container
++ * (gap of 0 is allowed).
++ *
++ * count/limit current number of entries in this node, and the maximal
++ * number of entries that can fit into node. count/limit
++ * has the same size as entry, and is itself counted in
++ * count.
++ *
++ * entry index entry: consists of a key immediately followed by
++ * a pointer to a child node. Size of a key and size of a
++ * pointer depends on container. Entry has neither
++ * alignment nor padding.
++ *
++ * free space portion of the node to which new entries are added
++ *
++ * Entries in index node are sorted by their key value.
++ *
++ * Format of a leaf node is not specified. Generic iam code accesses leaf
++ * nodes through the ->id_leaf_ops methods in struct iam_descr.
++ *
++ */
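++
++/*
++ * Illustration (not part of the patch): because entries have neither
++ * alignment nor padding, the i-th entry of an index node is located by
++ * plain pointer arithmetic:
++ *
++ *     size_t esize = descr->id_key_size + descr->id_ptr_size;
++ *     struct iam_entry *e = (void *)entries + i * esize;
++ *
++ * Entry 0 overlays the count/limit pair, so real entries start at i == 1.
++ */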
++
++#include <linux/module.h>
++#include <linux/fs.h>
++#include <linux/pagemap.h>
++#include <linux/jbd.h>
++#include <linux/time.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/fcntl.h>
++#include <linux/stat.h>
++#include <linux/string.h>
++#include <linux/quotaops.h>
++#include <linux/buffer_head.h>
++#include <linux/smp_lock.h>
++#include <linux/lustre_iam.h>
++
++#include <libcfs/libcfs.h>
++#include <libcfs/kp30.h>
++
++#include "xattr.h"
++#include "iopen.h"
++#include "acl.h"
++
++/*
++ * List of all registered formats.
++ *
++ * No locking. Callers synchronize.
++ */
++static LIST_HEAD(iam_formats);
++
++void iam_format_register(struct iam_format *fmt)
++{
++ list_add(&fmt->if_linkage, &iam_formats);
++}
++EXPORT_SYMBOL(iam_format_register);
++
++/*
++ * Determine format of given container. This is done by scanning list of
++ * registered formats and calling ->if_guess() method of each in turn.
++ */
++static int iam_format_guess(struct iam_container *c)
++{
++ int result;
++ struct iam_format *fmt;
++
++ /*
++ * XXX temporary initialization hook.
++ */
++ {
++ static int initialized = 0;
++
++ if (!initialized) {
++ /*
++ * Keep that order: htree should be registered first,
++ * so that iam_htree_guess() runs last.
++ */
++ iam_htree_format_init();
++ iam_lvar_format_init();
++ iam_lfix_format_init();
++ initialized = 1;
++ }
++ }
++
++ result = -ENOENT;
++ list_for_each_entry(fmt, &iam_formats, if_linkage) {
++ result = fmt->if_guess(c);
++ if (result == 0)
++ break;
++ }
++ return result;
++}
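++
++/*
++ * A format plugs itself in by filling a struct iam_format and registering
++ * it; hypothetical example (names illustrative only). ->if_guess() returns
++ * 0 when it recognizes the container:
++ *
++ *     static int my_guess(struct iam_container *c)
++ *     {
++ *             return my_root_magic_ok(c) ? 0 : -ENOENT;
++ *     }
++ *     static struct iam_format my_format = {
++ *             .if_guess = my_guess,
++ *     };
++ *
++ *     iam_format_register(&my_format);
++ */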
++
++/*
++ * Initialize container @c.
++ */
++int iam_container_init(struct iam_container *c,
++ struct iam_descr *descr, struct inode *inode)
++{
++ memset(c, 0, sizeof *c);
++ c->ic_descr = descr;
++ c->ic_object = inode;
++ init_rwsem(&c->ic_sem);
++ return 0;
++}
++EXPORT_SYMBOL(iam_container_init);
++
++/*
++ * Determine container format.
++ */
++int iam_container_setup(struct iam_container *c)
++{
++ return iam_format_guess(c);
++}
++EXPORT_SYMBOL(iam_container_setup);
++
++/*
++ * Finalize container @c, release all resources.
++ */
++void iam_container_fini(struct iam_container *c)
++{
++}
++EXPORT_SYMBOL(iam_container_fini);
++
++void iam_path_init(struct iam_path *path, struct iam_container *c,
++ struct iam_path_descr *pd)
++{
++ memset(path, 0, sizeof *path);
++ path->ip_container = c;
++ path->ip_frame = path->ip_frames;
++ path->ip_data = pd;
++ path->ip_leaf.il_path = path;
++}
++
++static void iam_leaf_fini(struct iam_leaf *leaf);
++
++void iam_path_release(struct iam_path *path)
++{
++ int i;
++
++ for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) {
++ if (path->ip_frames[i].bh != NULL) {
++ brelse(path->ip_frames[i].bh);
++ path->ip_frames[i].bh = NULL;
++ }
++ }
++}
++
++void iam_path_fini(struct iam_path *path)
++{
++ iam_leaf_fini(&path->ip_leaf);
++ iam_path_release(path);
++}
++
++void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode)
++{
++ int i;
++
++ path->ipc_hinfo = &path->ipc_hinfo_area;
++ for (i = 0; i < ARRAY_SIZE(path->ipc_scratch); ++i)
++ path->ipc_descr.ipd_key_scratch[i] =
++ (struct iam_ikey *)&path->ipc_scratch[i];
++
++ iam_container_init(&path->ipc_container,
++ &iam_htree_compat_param, inode);
++ iam_path_init(&path->ipc_path, &path->ipc_container, &path->ipc_descr);
++}
++
++void iam_path_compat_fini(struct iam_path_compat *path)
++{
++ iam_path_fini(&path->ipc_path);
++ iam_container_fini(&path->ipc_container);
++}
++
++/*
++ * Helper function initializing iam_path_descr and its key scratch area.
++ */
++struct iam_path_descr *iam_ipd_alloc(void *area, int keysize)
++{
++ struct iam_path_descr *ipd;
++ void *karea;
++ int i;
++
++ ipd = area;
++ karea = ipd + 1;
++ for (i = 0; i < ARRAY_SIZE(ipd->ipd_key_scratch); ++i, karea += keysize)
++ ipd->ipd_key_scratch[i] = karea;
++ return ipd;
++}
++EXPORT_SYMBOL(iam_ipd_alloc);
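++
++/*
++ * Typical use of iam_ipd_alloc() (sketch): the caller supplies one flat
++ * area large enough for the descriptor followed by DX_SCRATCH_KEYS keys
++ * of the container's key size:
++ *
++ *     char area[sizeof(struct iam_path_descr) + DX_SCRATCH_KEYS * keysize];
++ *     struct iam_path_descr *ipd = iam_ipd_alloc(area, keysize);
++ *     ...
++ *     iam_ipd_free(ipd);
++ */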
++
++void iam_ipd_free(struct iam_path_descr *ipd)
++{
++}
++EXPORT_SYMBOL(iam_ipd_free);
++
++int iam_node_read(struct iam_container *c, iam_ptr_t ptr,
++ handle_t *h, struct buffer_head **bh)
++{
++ int result = 0;
++
++ *bh = ext3_bread(h, c->ic_object, (int)ptr, 0, &result);
++ if (*bh == NULL)
++ result = -EIO;
++ return result;
++}
++
++/*
++ * Return pointer to current leaf record. Pointer is valid while corresponding
++ * leaf node is locked and pinned.
++ */
++static struct iam_rec *iam_leaf_rec(const struct iam_leaf *leaf)
++{
++ return iam_leaf_ops(leaf)->rec(leaf);
++}
++
++/*
++ * Return pointer to the current leaf key. This function returns pointer to
++ * the key stored in node.
++ *
++ * Caller should assume that returned pointer is only valid while leaf node is
++ * pinned and locked.
++ */
++static struct iam_key *iam_leaf_key(const struct iam_leaf *leaf)
++{
++ return iam_leaf_ops(leaf)->key(leaf);
++}
++
++static int iam_leaf_key_size(const struct iam_leaf *leaf)
++{
++ return iam_leaf_ops(leaf)->key_size(leaf);
++}
++
++static struct iam_ikey *iam_leaf_ikey(const struct iam_leaf *leaf,
++ struct iam_ikey *key)
++{
++ return iam_leaf_ops(leaf)->ikey(leaf, key);
++}
++
++static int iam_leaf_keycmp(const struct iam_leaf *leaf,
++ const struct iam_key *key)
++{
++ return iam_leaf_ops(leaf)->key_cmp(leaf, key);
++}
++
++static int iam_leaf_keyeq(const struct iam_leaf *leaf,
++ const struct iam_key *key)
++{
++ return iam_leaf_ops(leaf)->key_eq(leaf, key);
++}
++
++#if EXT3_INVARIANT_ON
++static int iam_leaf_check(struct iam_leaf *leaf);
++extern int dx_node_check(struct iam_path *p, struct iam_frame *f);
++
++static int iam_path_check(struct iam_path *p)
++{
++ int i;
++ int result;
++ struct iam_frame *f;
++ struct iam_descr *param;
++
++ result = 1;
++ param = iam_path_descr(p);
++ for (i = 0; result && i < ARRAY_SIZE(p->ip_frames); ++i) {
++ f = &p->ip_frames[i];
++ if (f->bh != NULL) {
++ result = dx_node_check(p, f);
++ if (result)
++ result = !param->id_ops->id_node_check(p, f);
++ }
++ }
++ if (result && p->ip_leaf.il_bh != NULL)
++ result = iam_leaf_check(&p->ip_leaf);
++ if (result == 0) {
++ ext3_std_error(iam_path_obj(p)->i_sb, result);
++ }
++ return result;
++}
++#endif
++
++static int iam_leaf_load(struct iam_path *path)
++{
++ iam_ptr_t block;
++ int err;
++ struct iam_container *c;
++ struct buffer_head *bh;
++ struct iam_leaf *leaf;
++ struct iam_descr *descr;
++
++ c = path->ip_container;
++ leaf = &path->ip_leaf;
++ descr = iam_path_descr(path);
++ block = path->ip_frame->leaf;
++ if (block == 0) {
++ /* XXX bug 11027 */
++ printk(KERN_EMERG "wrong leaf: %lu %d [%p %p %p]\n",
++ (long unsigned)path->ip_frame->leaf,
++ dx_get_count(dx_node_get_entries(path, path->ip_frame)),
++ path->ip_frames[0].bh, path->ip_frames[1].bh,
++ path->ip_frames[2].bh);
++ }
++ err = descr->id_ops->id_node_read(c, block, NULL, &bh);
++ if (err == 0) {
++ leaf->il_bh = bh;
++ leaf->il_curidx = block;
++ err = iam_leaf_ops(leaf)->init(leaf);
++ assert_inv(ergo(err == 0, iam_leaf_check(leaf)));
++ }
++ return err;
++}
++
++static void iam_leaf_unlock(struct iam_leaf *leaf)
++{
++ if (leaf->il_lock != NULL) {
++ dx_unlock_htree(iam_leaf_container(leaf)->ic_object,
++ leaf->il_lock);
++ do_corr(schedule());
++ leaf->il_lock = NULL;
++ }
++}
++
++static void iam_leaf_fini(struct iam_leaf *leaf)
++{
++ if (leaf->il_path != NULL) {
++ iam_leaf_unlock(leaf);
++ assert_inv(ergo(leaf->il_bh != NULL, iam_leaf_check(leaf)));
++ iam_leaf_ops(leaf)->fini(leaf);
++ if (leaf->il_bh) {
++ brelse(leaf->il_bh);
++ leaf->il_bh = NULL;
++ leaf->il_curidx = 0;
++ }
++ }
++}
++
++static void iam_leaf_start(struct iam_leaf *folio)
++{
++ iam_leaf_ops(folio)->start(folio);
++}
++
++void iam_leaf_next(struct iam_leaf *folio)
++{
++ iam_leaf_ops(folio)->next(folio);
++}
++
++static void iam_leaf_rec_add(struct iam_leaf *leaf, const struct iam_key *key,
++ const struct iam_rec *rec)
++{
++ iam_leaf_ops(leaf)->rec_add(leaf, key, rec);
++}
++
++static void iam_rec_del(struct iam_leaf *leaf, int shift)
++{
++ iam_leaf_ops(leaf)->rec_del(leaf, shift);
++}
++
++int iam_leaf_at_end(const struct iam_leaf *leaf)
++{
++ return iam_leaf_ops(leaf)->at_end(leaf);
++}
++
++void iam_leaf_split(struct iam_leaf *l, struct buffer_head **bh, iam_ptr_t nr)
++{
++ iam_leaf_ops(l)->split(l, bh, nr);
++}
++
++int iam_leaf_can_add(const struct iam_leaf *l,
++ const struct iam_key *k, const struct iam_rec *r)
++{
++ return iam_leaf_ops(l)->can_add(l, k, r);
++}
++
++#if EXT3_INVARIANT_ON
++static int iam_leaf_check(struct iam_leaf *leaf)
++{
++ return 1;
++#if 0
++ struct iam_lentry *orig;
++ struct iam_path *path;
++ struct iam_container *bag;
++ struct iam_ikey *k0;
++ struct iam_ikey *k1;
++ int result;
++ int first;
++
++ orig = leaf->il_at;
++ path = iam_leaf_path(leaf);
++ bag = iam_leaf_container(leaf);
++
++ result = iam_leaf_ops(leaf)->init(leaf);
++ if (result != 0)
++ return result;
++
++ first = 1;
++ iam_leaf_start(leaf);
++ k0 = iam_path_ikey(path, 0);
++ k1 = iam_path_ikey(path, 1);
++ while (!iam_leaf_at_end(leaf)) {
++ iam_ikeycpy(bag, k0, k1);
++ iam_ikeycpy(bag, k1, iam_leaf_ikey(leaf, k1));
++ if (!first && iam_ikeycmp(bag, k0, k1) > 0) {
++ return 0;
++ }
++ first = 0;
++ iam_leaf_next(leaf);
++ }
++ leaf->il_at = orig;
++ return 1;
++#endif
++}
++#endif
++
++static int iam_txn_dirty(handle_t *handle,
++ struct iam_path *path, struct buffer_head *bh)
++{
++ int result;
++
++ result = ext3_journal_dirty_metadata(handle, bh);
++ if (result != 0)
++ ext3_std_error(iam_path_obj(path)->i_sb, result);
++ return result;
++}
++
++static int iam_txn_add(handle_t *handle,
++ struct iam_path *path, struct buffer_head *bh)
++{
++ int result;
++
++ result = ext3_journal_get_write_access(handle, bh);
++ if (result != 0)
++ ext3_std_error(iam_path_obj(path)->i_sb, result);
++ return result;
++}
++
++/***********************************************************************/
++/* iterator interface */
++/***********************************************************************/
++
++static enum iam_it_state it_state(const struct iam_iterator *it)
++{
++ return it->ii_state;
++}
++
++/*
++ * Helper function returning the container the iterator operates on.
++ */
++static struct iam_container *iam_it_container(const struct iam_iterator *it)
++{
++ return it->ii_path.ip_container;
++}
++
++static inline int it_keycmp(const struct iam_iterator *it,
++ const struct iam_key *k)
++{
++ return iam_leaf_keycmp(&it->ii_path.ip_leaf, k);
++}
++
++static inline int it_keyeq(const struct iam_iterator *it,
++ const struct iam_key *k)
++{
++ return iam_leaf_keyeq(&it->ii_path.ip_leaf, k);
++}
++
++static int it_ikeycmp(const struct iam_iterator *it, const struct iam_ikey *ik)
++{
++ return iam_ikeycmp(it->ii_path.ip_container,
++ iam_leaf_ikey(&it->ii_path.ip_leaf,
++ iam_path_ikey(&it->ii_path, 0)), ik);
++}
++
++static inline int it_at_rec(const struct iam_iterator *it)
++{
++ return !iam_leaf_at_end(&it->ii_path.ip_leaf);
++}
++
++static inline int it_before(const struct iam_iterator *it)
++{
++ return it_state(it) == IAM_IT_SKEWED && it_at_rec(it);
++}
++
++/*
++ * Helper wrapper around iam_it_get(): returns 0 (success) only when record
++ * with exactly the same key as asked is found.
++ */
++static int iam_it_get_exact(struct iam_iterator *it, const struct iam_key *k)
++{
++ int result;
++
++ result = iam_it_get(it, k);
++ if (result > 0)
++ result = 0;
++ else if (result == 0)
++ /*
++ * Return -ENOENT if the cursor is located above a record with a key
++ * different from the one specified, or in an empty leaf.
++ *
++ * XXX returning -ENOENT only works if iam_it_get() never
++ * returns -ENOENT as a legitimate error.
++ */
++ result = -ENOENT;
++ return result;
++}
++
++void iam_container_write_lock(struct iam_container *ic)
++{
++ down_write(&ic->ic_sem);
++}
++
++void iam_container_write_unlock(struct iam_container *ic)
++{
++ up_write(&ic->ic_sem);
++}
++
++void iam_container_read_lock(struct iam_container *ic)
++{
++ down_read(&ic->ic_sem);
++}
++
++void iam_container_read_unlock(struct iam_container *ic)
++{
++ up_read(&ic->ic_sem);
++}
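++
++/*
++ * Usage sketch (illustration only): ->ic_sem is a plain rwsem, so one
++ * plausible locking discipline is for traversals to take the read side and
++ * for operations changing tree topology to take the write side:
++ *
++ * iam_container_read_lock(c);
++ * ... lookups and iteration ...
++ * iam_container_read_unlock(c);
++ */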
++
++/*
++ * Initialize iterator to IAM_IT_DETACHED state.
++ *
++ * postcondition: it_state(it) == IAM_IT_DETACHED
++ */
++int iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags,
++ struct iam_path_descr *pd)
++{
++ memset(it, 0, sizeof *it);
++ it->ii_flags = flags;
++ it->ii_state = IAM_IT_DETACHED;
++ iam_path_init(&it->ii_path, c, pd);
++ return 0;
++}
++EXPORT_SYMBOL(iam_it_init);
++
++/*
++ * Finalize iterator and release all resources.
++ *
++ * precondition: it_state(it) == IAM_IT_DETACHED
++ */
++void iam_it_fini(struct iam_iterator *it)
++{
++ assert_corr(it_state(it) == IAM_IT_DETACHED);
++ iam_path_fini(&it->ii_path);
++}
++EXPORT_SYMBOL(iam_it_fini);
++
++/*
++ * Performs tree top-to-bottom traversal starting from root, and loads leaf
++ * node.
++ */
++static int iam_path_lookup(struct iam_path *path, int index)
++{
++ struct iam_container *c;
++ struct iam_descr *descr;
++ struct iam_leaf *leaf;
++ int result;
++
++ c = path->ip_container;
++ leaf = &path->ip_leaf;
++ descr = iam_path_descr(path);
++ result = dx_lookup_lock(path, &leaf->il_lock, DLT_WRITE);
++ assert_inv(iam_path_check(path));
++ do_corr(schedule());
++ if (result == 0) {
++ result = iam_leaf_load(path);
++ assert_inv(ergo(result == 0, iam_leaf_check(leaf)));
++ if (result == 0) {
++ do_corr(schedule());
++ if (index)
++ result = iam_leaf_ops(leaf)->
++ ilookup(leaf, path->ip_ikey_target);
++ else
++ result = iam_leaf_ops(leaf)->
++ lookup(leaf, path->ip_key_target);
++ do_corr(schedule());
++ }
++ if (result < 0)
++ iam_leaf_unlock(leaf);
++ }
++ return result;
++}
++
++/*
++ * Common part of iam_it_{i,}get().
++ */
++static int __iam_it_get(struct iam_iterator *it, int index)
++{
++ int result;
++ assert_corr(it_state(it) == IAM_IT_DETACHED);
++
++ result = iam_path_lookup(&it->ii_path, index);
++ if (result >= 0) {
++ int collision;
++
++ collision = result & IAM_LOOKUP_LAST;
++ switch (result & ~IAM_LOOKUP_LAST) {
++ case IAM_LOOKUP_EXACT:
++ result = +1;
++ it->ii_state = IAM_IT_ATTACHED;
++ break;
++ case IAM_LOOKUP_OK:
++ result = 0;
++ it->ii_state = IAM_IT_ATTACHED;
++ break;
++ case IAM_LOOKUP_BEFORE:
++ case IAM_LOOKUP_EMPTY:
++ result = 0;
++ it->ii_state = IAM_IT_SKEWED;
++ break;
++ default:
++ assert(0);
++ }
++ result |= collision;
++ }
++ /*
++ * See iam_it_get_exact() for explanation.
++ */
++ assert_corr(result != -ENOENT);
++ return result;
++}
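++
++/*
++ * Note on the encoding above (illustration only): IAM_LOOKUP_LAST is kept
++ * as a flag bit on top of the positioning outcome, so a caller can do
++ *
++ * result = __iam_it_get(it, 0);
++ * if (result >= 0 && (result & IAM_LOOKUP_LAST))
++ * ... the matching hash may continue in the next leaf ...
++ * result &= ~IAM_LOOKUP_LAST;
++ *
++ * iam_it_get() below handles exactly this case through iam_it_collision().
++ */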
++
++/*
++ * The correct hash was found, but not the same key: iterate through the
++ * hash collision chain, looking for the correct record.
++ */
++static int iam_it_collision(struct iam_iterator *it)
++{
++ int result;
++
++ assert(ergo(it_at_rec(it), !it_keyeq(it, it->ii_path.ip_key_target)));
++
++ while ((result = iam_it_next(it)) == 0) {
++ do_corr(schedule());
++ if (it_ikeycmp(it, it->ii_path.ip_ikey_target) != 0)
++ return -ENOENT;
++ if (it_keyeq(it, it->ii_path.ip_key_target))
++ return 0;
++ }
++ return result;
++}
++
++/*
++ * Attach iterator. After successful completion, @it points to the record
++ * with the largest key not larger than @k.
++ *
++ * Return value: 0: positioned on existing record,
++ * +ve: exact position found,
++ * -ve: error.
++ *
++ * precondition: it_state(it) == IAM_IT_DETACHED
++ * postcondition: ergo(result == 0 && it_state(it) == IAM_IT_ATTACHED,
++ * it_keycmp(it, k) <= 0)
++ */
++int iam_it_get(struct iam_iterator *it, const struct iam_key *k)
++{
++ int result;
++ assert_corr(it_state(it) == IAM_IT_DETACHED);
++
++ it->ii_path.ip_ikey_target = NULL;
++ it->ii_path.ip_key_target = k;
++
++ result = __iam_it_get(it, 0);
++
++ if (result == IAM_LOOKUP_LAST) {
++ result = iam_it_collision(it);
++ if (result != 0) {
++ iam_it_put(it);
++ iam_it_fini(it);
++ result = __iam_it_get(it, 0);
++ } else
++ result = +1;
++ }
++ if (result > 0)
++ result &= ~IAM_LOOKUP_LAST;
++
++ assert_corr(ergo(result > 0, it_keycmp(it, k) == 0));
++ assert_corr(ergo(result == 0 && it_state(it) == IAM_IT_ATTACHED,
++ it_keycmp(it, k) <= 0));
++ return result;
++}
++EXPORT_SYMBOL(iam_it_get);
++
++/*
++ * Attach iterator by index key.
++ */
++static int iam_it_iget(struct iam_iterator *it, const struct iam_ikey *k)
++{
++ assert_corr(it_state(it) == IAM_IT_DETACHED);
++
++ it->ii_path.ip_ikey_target = k;
++ return __iam_it_get(it, 1) & ~IAM_LOOKUP_LAST;
++}
++
++/*
++ * Attach iterator, and ensure it points to a record (not skewed).
++ *
++ * Return value: 0: positioned on existing record,
++ * +ve: exact position found,
++ * -ve: error.
++ *
++ * precondition: it_state(it) == IAM_IT_DETACHED &&
++ * !(it->ii_flags&IAM_IT_WRITE)
++ * postcondition: ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED)
++ */
++int iam_it_get_at(struct iam_iterator *it, const struct iam_key *k)
++{
++ int result;
++ assert_corr(it_state(it) == IAM_IT_DETACHED &&
++ !(it->ii_flags&IAM_IT_WRITE));
++ result = iam_it_get(it, k);
++ if (result == 0) {
++ if (it_state(it) != IAM_IT_ATTACHED) {
++ assert_corr(it_state(it) == IAM_IT_SKEWED);
++ result = iam_it_next(it);
++ }
++ }
++ assert_corr(ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED));
++ return result;
++}
++EXPORT_SYMBOL(iam_it_get_at);
++
++/*
++ * Duplicates iterator.
++ *
++ * postcondition: it_state(dst) == it_state(src) &&
++ * iam_it_container(dst) == iam_it_container(src) &&
++ * dst->ii_flags == src->ii_flags &&
++ * ergo(it_state(src) == IAM_IT_ATTACHED,
++ * iam_it_rec_get(dst) == iam_it_rec_get(src) &&
++ * iam_it_key_get(dst) == iam_it_key_get(src))
++ */
++void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src)
++{
++ dst->ii_flags = src->ii_flags;
++ dst->ii_state = src->ii_state;
++ /* XXX not yet. iam_path_dup(&dst->ii_path, &src->ii_path); */
++ /*
++ * XXX: duplicate lock.
++ */
++ assert_corr(it_state(dst) == it_state(src));
++ assert_corr(iam_it_container(dst) == iam_it_container(src));
++ assert_corr(dst->ii_flags == src->ii_flags);
++ assert_corr(ergo(it_state(src) == IAM_IT_ATTACHED,
++ iam_it_rec_get(dst) == iam_it_rec_get(src) &&
++ iam_it_key_get(dst) == iam_it_key_get(src)));
++}
++
++/*
++ * Detach iterator. Does nothing in detached state.
++ *
++ * postcondition: it_state(it) == IAM_IT_DETACHED
++ */
++void iam_it_put(struct iam_iterator *it)
++{
++ if (it->ii_state != IAM_IT_DETACHED) {
++ it->ii_state = IAM_IT_DETACHED;
++ iam_leaf_fini(&it->ii_path.ip_leaf);
++ }
++}
++EXPORT_SYMBOL(iam_it_put);
++
++static struct iam_ikey *iam_it_ikey_get(const struct iam_iterator *it,
++ struct iam_ikey *ikey);
++/*
++ * Move iterator one record right.
++ *
++ * Return value: 0: success,
++ * +1: end of container reached
++ * -ve: error
++ *
++ * precondition: (it_state(it) == IAM_IT_ATTACHED ||
++ * it_state(it) == IAM_IT_SKEWED) && it->ii_flags&IAM_IT_MOVE
++ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED) &&
++ * ergo(result > 0, it_state(it) == IAM_IT_DETACHED)
++ */
++int iam_it_next(struct iam_iterator *it)
++{
++ int result;
++ struct iam_path *path;
++ struct iam_leaf *leaf;
++ struct inode *obj;
++ do_corr(struct iam_ikey *ik_orig);
++
++ /* assert_corr(it->ii_flags&IAM_IT_MOVE); */
++ assert_corr(it_state(it) == IAM_IT_ATTACHED ||
++ it_state(it) == IAM_IT_SKEWED);
++
++ path = &it->ii_path;
++ leaf = &path->ip_leaf;
++ obj = iam_path_obj(path);
++
++ assert_corr(iam_leaf_is_locked(leaf));
++
++ result = 0;
++ do_corr(ik_orig = it_at_rec(it) ?
++ iam_it_ikey_get(it, iam_path_ikey(path, 2)) : NULL);
++ if (it_before(it)) {
++ assert_corr(!iam_leaf_at_end(leaf));
++ it->ii_state = IAM_IT_ATTACHED;
++ } else {
++ if (!iam_leaf_at_end(leaf))
++ /* advance within leaf node */
++ iam_leaf_next(leaf);
++ /*
++ * multiple iterations may be necessary due to empty leaves.
++ */
++ while (result == 0 && iam_leaf_at_end(leaf)) {
++ do_corr(schedule());
++ /* advance index portion of the path */
++ result = iam_index_next(iam_it_container(it), path);
++ assert_corr(iam_leaf_is_locked(leaf));
++ if (result == 1) {
++ struct dynlock_handle *lh;
++ lh = dx_lock_htree(obj, path->ip_frame->leaf,
++ DLT_WRITE);
++ if (lh != NULL) {
++ iam_leaf_fini(leaf);
++ leaf->il_lock = lh;
++ result = iam_leaf_load(path);
++ if (result == 0)
++ iam_leaf_start(leaf);
++ } else
++ result = -ENOMEM;
++ } else if (result == 0)
++ /* end of container reached */
++ result = +1;
++ if (result != 0)
++ iam_it_put(it);
++ }
++ if (result == 0)
++ it->ii_state = IAM_IT_ATTACHED;
++ }
++ assert_corr(ergo(result == 0, it_state(it) == IAM_IT_ATTACHED));
++ assert_corr(ergo(result > 0, it_state(it) == IAM_IT_DETACHED));
++ assert_corr(ergo(result == 0 && ik_orig != NULL,
++ it_ikeycmp(it, ik_orig) >= 0));
++ return result;
++}
++EXPORT_SYMBOL(iam_it_next);
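++
++/*
++ * A minimal iteration sketch (illustration only): scan records starting
++ * from key @k with a movable iterator:
++ *
++ * struct iam_iterator it;
++ * int result;
++ *
++ * iam_it_init(&it, c, IAM_IT_MOVE, pd);
++ * result = iam_it_get_at(&it, k);
++ * while (result >= 0) {
++ * handle(iam_it_key_get(&it), iam_it_rec_get(&it));
++ * result = iam_it_next(&it);
++ * if (result != 0)
++ * break;
++ * }
++ * iam_it_put(&it);
++ * iam_it_fini(&it);
++ *
++ * A +1 from iam_it_next() means the end of the container was reached, a
++ * negative value is an error; handle() is a placeholder for caller code.
++ */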
++
++/*
++ * Return pointer to the record under iterator.
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED && it_at_rec(it)
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
++ */
++struct iam_rec *iam_it_rec_get(const struct iam_iterator *it)
++{
++ assert_corr(it_state(it) == IAM_IT_ATTACHED);
++ assert_corr(it_at_rec(it));
++ return iam_leaf_rec(&it->ii_path.ip_leaf);
++}
++EXPORT_SYMBOL(iam_it_rec_get);
++
++static void iam_it_reccpy(struct iam_iterator *it, const struct iam_rec *r)
++{
++ struct iam_leaf *folio;
++
++ folio = &it->ii_path.ip_leaf;
++ iam_leaf_ops(folio)->rec_set(folio, r);
++}
++
++/*
++ * Replace contents of record under iterator.
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED &&
++ * it->ii_flags&IAM_IT_WRITE
++ * postcondition: it_state(it) == IAM_IT_ATTACHED &&
++ * ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...))
++ */
++int iam_it_rec_set(handle_t *h,
++ struct iam_iterator *it, const struct iam_rec *r)
++{
++ int result;
++ struct iam_path *path;
++ struct buffer_head *bh;
++
++ assert_corr(it_state(it) == IAM_IT_ATTACHED &&
++ it->ii_flags&IAM_IT_WRITE);
++ assert_corr(it_at_rec(it));
++
++ path = &it->ii_path;
++ bh = path->ip_leaf.il_bh;
++ result = iam_txn_add(h, path, bh);
++ if (result == 0) {
++ iam_it_reccpy(it, r);
++ result = iam_txn_dirty(h, path, bh);
++ }
++ return result;
++}
++EXPORT_SYMBOL(iam_it_rec_set);
++
++/*
++ * Return pointer to the index key under iterator.
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED ||
++ * it_state(it) == IAM_IT_SKEWED
++ */
++static struct iam_ikey *iam_it_ikey_get(const struct iam_iterator *it,
++ struct iam_ikey *ikey)
++{
++ assert_corr(it_state(it) == IAM_IT_ATTACHED ||
++ it_state(it) == IAM_IT_SKEWED);
++ assert_corr(it_at_rec(it));
++ return iam_leaf_ikey(&it->ii_path.ip_leaf, ikey);
++}
++
++/*
++ * Return pointer to the key under iterator.
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED ||
++ * it_state(it) == IAM_IT_SKEWED
++ */
++struct iam_key *iam_it_key_get(const struct iam_iterator *it)
++{
++ assert_corr(it_state(it) == IAM_IT_ATTACHED ||
++ it_state(it) == IAM_IT_SKEWED);
++ assert_corr(it_at_rec(it));
++ return iam_leaf_key(&it->ii_path.ip_leaf);
++}
++EXPORT_SYMBOL(iam_it_key_get);
++
++/*
++ * Return size of key under iterator (in bytes)
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED ||
++ * it_state(it) == IAM_IT_SKEWED
++ */
++int iam_it_key_size(const struct iam_iterator *it)
++{
++ assert_corr(it_state(it) == IAM_IT_ATTACHED ||
++ it_state(it) == IAM_IT_SKEWED);
++ assert_corr(it_at_rec(it));
++ return iam_leaf_key_size(&it->ii_path.ip_leaf);
++}
++EXPORT_SYMBOL(iam_it_key_size);
++
++/*
++ * Insertion of a new record. Interaction with jbd during the non-trivial
++ * case (when a split happens) is as follows:
++ *
++ * - the new leaf node is added to the transaction by ext3_append();
++ *
++ * - the old leaf node is added to the transaction by iam_add_rec();
++ *
++ * - the leaf where the insertion point ends up is marked dirty by
++ * iam_add_rec();
++ *
++ * - the leaf without the insertion point is marked dirty (as @new_leaf) by
++ * iam_new_leaf();
++ *
++ * - split index nodes are added to the transaction and marked dirty by
++ * split_index_node();
++ *
++ * - the "safe" index node, which is not split, but into which the new
++ * pointer is inserted, is added to the transaction and marked dirty by
++ * split_index_node();
++ *
++ * - the index node where the pointer to the new leaf is inserted is added
++ * to the transaction by split_index_node() and marked dirty by
++ * iam_add_rec();
++ *
++ * - the inode is marked dirty by iam_add_rec().
++ */
++
++static int iam_new_leaf(handle_t *handle, struct iam_leaf *leaf)
++{
++ int err;
++ iam_ptr_t blknr;
++ struct buffer_head *new_leaf;
++ struct buffer_head *old_leaf;
++ struct iam_container *c;
++ struct inode *obj;
++ struct iam_path *path;
++
++ assert_inv(iam_leaf_check(leaf));
++
++ c = iam_leaf_container(leaf);
++ path = leaf->il_path;
++
++ obj = c->ic_object;
++ new_leaf = ext3_append(handle, obj, (__u32 *)&blknr, &err);
++ do_corr(schedule());
++ if (new_leaf != NULL) {
++ struct dynlock_handle *lh;
++
++ lh = dx_lock_htree(obj, blknr, DLT_WRITE);
++ do_corr(schedule());
++ if (lh != NULL) {
++ iam_leaf_ops(leaf)->init_new(c, new_leaf);
++ do_corr(schedule());
++ old_leaf = leaf->il_bh;
++ iam_leaf_split(leaf, &new_leaf, blknr);
++ if (old_leaf != leaf->il_bh) {
++ /*
++ * Switched to the new leaf.
++ */
++ iam_leaf_unlock(leaf);
++ leaf->il_lock = lh;
++ path->ip_frame->leaf = blknr;
++ } else
++ dx_unlock_htree(obj, lh);
++ do_corr(schedule());
++ err = iam_txn_dirty(handle, path, new_leaf);
++ brelse(new_leaf);
++ if (err == 0)
++ err = ext3_mark_inode_dirty(handle, obj);
++ do_corr(schedule());
++ } else
++ err = -ENOMEM;
++ }
++ assert_inv(iam_leaf_check(leaf));
++ assert_inv(iam_leaf_check(&iam_leaf_path(leaf)->ip_leaf));
++ assert_inv(iam_path_check(iam_leaf_path(leaf)));
++ return err;
++}
++
++static int iam_add_rec(handle_t *handle, struct iam_iterator *it,
++ struct iam_path *path,
++ const struct iam_key *k, const struct iam_rec *r)
++{
++ int err;
++ struct iam_leaf *leaf;
++
++ leaf = &path->ip_leaf;
++ assert_inv(iam_leaf_check(leaf));
++ assert_inv(iam_path_check(path));
++ err = iam_txn_add(handle, path, leaf->il_bh);
++ if (err == 0) {
++ do_corr(schedule());
++ if (!iam_leaf_can_add(leaf, k, r)) {
++ struct dynlock_handle *lh = NULL;
++
++ do {
++ assert_corr(lh == NULL);
++ do_corr(schedule());
++ err = split_index_node(handle, path, &lh);
++ if (err == -EAGAIN) {
++ assert_corr(lh == NULL);
++
++ iam_path_fini(path);
++ it->ii_state = IAM_IT_DETACHED;
++
++ do_corr(schedule());
++ err = iam_it_get_exact(it, k);
++ if (err == -ENOENT)
++ err = +1; /* repeat split */
++ else if (err == 0)
++ err = -EEXIST;
++ }
++ } while (err > 0);
++ assert_inv(iam_path_check(path));
++ if (err == 0) {
++ assert_corr(lh != NULL);
++ do_corr(schedule());
++ err = iam_new_leaf(handle, leaf);
++ if (err == 0)
++ err = iam_txn_dirty(handle, path,
++ path->ip_frame->bh);
++ }
++ dx_unlock_htree(iam_path_obj(path), lh);
++ do_corr(schedule());
++ }
++ if (err == 0) {
++ iam_leaf_rec_add(leaf, k, r);
++ err = iam_txn_dirty(handle, path, leaf->il_bh);
++ }
++ }
++ assert_inv(iam_leaf_check(leaf));
++ assert_inv(iam_leaf_check(&path->ip_leaf));
++ assert_inv(iam_path_check(path));
++ return err;
++}
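++
++/*
++ * Note (illustration only) on the retry protocol above: split_index_node()
++ * returns -EAGAIN when it had to drop its locks, invalidating the path. The
++ * loop then re-attaches the iterator with iam_it_get_exact():
++ *
++ * -ENOENT: the key is still absent, err becomes +1 and the split is retried;
++ * 0: the key appeared meanwhile, so the insert fails with -EEXIST.
++ */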
++
++/*
++ * Insert new record with key @k and contents from @r, shifting records to the
++ * right. On success, iterator is positioned on the newly inserted record.
++ *
++ * precondition: it->ii_flags&IAM_IT_WRITE &&
++ * (it_state(it) == IAM_IT_ATTACHED ||
++ * it_state(it) == IAM_IT_SKEWED) &&
++ * ergo(it_state(it) == IAM_IT_ATTACHED,
++ * it_keycmp(it, k) <= 0) &&
++ * ergo(it_before(it), it_keycmp(it, k) > 0));
++ * postcondition: ergo(result == 0,
++ * it_state(it) == IAM_IT_ATTACHED &&
++ * it_keycmp(it, k) == 0 &&
++ * !memcmp(iam_it_rec_get(it), r, ...))
++ */
++int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
++ const struct iam_key *k, const struct iam_rec *r)
++{
++ int result;
++ struct iam_path *path;
++
++ path = &it->ii_path;
++
++ assert_corr(it->ii_flags&IAM_IT_WRITE);
++ assert_corr(it_state(it) == IAM_IT_ATTACHED ||
++ it_state(it) == IAM_IT_SKEWED);
++ assert_corr(ergo(it_state(it) == IAM_IT_ATTACHED,
++ it_keycmp(it, k) <= 0));
++ assert_corr(ergo(it_before(it), it_keycmp(it, k) > 0));
++ result = iam_add_rec(h, it, path, k, r);
++ if (result == 0)
++ it->ii_state = IAM_IT_ATTACHED;
++ assert_corr(ergo(result == 0,
++ it_state(it) == IAM_IT_ATTACHED &&
++ it_keycmp(it, k) == 0));
++ return result;
++}
++EXPORT_SYMBOL(iam_it_rec_insert);
++
++/*
++ * Delete record under iterator.
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED &&
++ * it->ii_flags&IAM_IT_WRITE &&
++ * it_at_rec(it)
++ * postcondition: it_state(it) == IAM_IT_ATTACHED ||
++ * it_state(it) == IAM_IT_DETACHED
++ */
++int iam_it_rec_delete(handle_t *h, struct iam_iterator *it)
++{
++ int result;
++ struct iam_leaf *leaf;
++ struct iam_path *path;
++
++ assert_corr(it_state(it) == IAM_IT_ATTACHED &&
++ it->ii_flags&IAM_IT_WRITE);
++ assert_corr(it_at_rec(it));
++
++ path = &it->ii_path;
++ leaf = &path->ip_leaf;
++
++ assert_inv(iam_leaf_check(leaf));
++ assert_inv(iam_path_check(path));
++
++ result = iam_txn_add(h, path, leaf->il_bh);
++ /*
++ * no compaction for now.
++ */
++ if (result == 0) {
++ iam_rec_del(leaf, it->ii_flags&IAM_IT_MOVE);
++ result = iam_txn_dirty(h, path, leaf->il_bh);
++ if (result == 0 && iam_leaf_at_end(leaf) &&
++ it->ii_flags&IAM_IT_MOVE) {
++ result = iam_it_next(it);
++ if (result > 0)
++ result = 0;
++ }
++ }
++ assert_inv(iam_leaf_check(leaf));
++ assert_inv(iam_path_check(path));
++ assert_corr(it_state(it) == IAM_IT_ATTACHED ||
++ it_state(it) == IAM_IT_DETACHED);
++ return result;
++}
++EXPORT_SYMBOL(iam_it_rec_delete);
++
++/*
++ * Convert iterator to cookie.
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED &&
++ * iam_it_container(it)->ic_descr->id_ikey_size <= sizeof(iam_pos_t)
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
++ */
++iam_pos_t iam_it_store(const struct iam_iterator *it)
++{
++ iam_pos_t result;
++
++ assert_corr(it_state(it) == IAM_IT_ATTACHED);
++ assert_corr(it_at_rec(it));
++ assert_corr(iam_it_container(it)->ic_descr->id_ikey_size <=
++ sizeof result);
++
++ result = 0;
++ return *(iam_pos_t *)iam_it_ikey_get(it, (void *)&result);
++}
++EXPORT_SYMBOL(iam_it_store);
++
++/*
++ * Restore iterator from cookie.
++ *
++ * precondition: it_state(it) == IAM_IT_DETACHED && it->ii_flags&IAM_IT_MOVE &&
++ * iam_it_container(it)->ic_descr->id_ikey_size <= sizeof(iam_pos_t)
++ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED &&
++ * iam_it_store(it) == pos)
++ */
++int iam_it_load(struct iam_iterator *it, iam_pos_t pos)
++{
++ assert_corr(it_state(it) == IAM_IT_DETACHED &&
++ it->ii_flags&IAM_IT_MOVE);
++ assert_corr(iam_it_container(it)->ic_descr->id_ikey_size <= sizeof pos);
++ return iam_it_iget(it, (struct iam_ikey *)&pos);
++}
++EXPORT_SYMBOL(iam_it_load);
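++
++/*
++ * Cookie round-trip sketch (illustration only), assuming the container's
++ * index keys fit into iam_pos_t:
++ *
++ * iam_pos_t pos;
++ *
++ * pos = iam_it_store(&it);
++ * iam_it_put(&it);
++ * ... later ...
++ * result = iam_it_load(&it, pos);
++ *
++ * iam_it_load() re-attaches at the stored index key, which is how a
++ * readdir-style cursor would survive between calls.
++ */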
++
++/***********************************************************************/
++/* invariants */
++/***********************************************************************/
++
++static inline int ptr_inside(void *base, size_t size, void *ptr)
++{
++ return (base <= ptr) && (ptr < base + size);
++}
++
++int iam_frame_invariant(struct iam_frame *f)
++{
++ return
++ (f->bh != NULL &&
++ f->bh->b_data != NULL &&
++ ptr_inside(f->bh->b_data, f->bh->b_size, f->entries) &&
++ ptr_inside(f->bh->b_data, f->bh->b_size, f->at) &&
++ f->entries <= f->at);
++}
++
++int iam_leaf_invariant(struct iam_leaf *l)
++{
++ return
++ l->il_bh != NULL &&
++ l->il_bh->b_data != NULL &&
++ ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_entries) &&
++ ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_at) &&
++ l->il_entries <= l->il_at;
++}
++
++int iam_path_invariant(struct iam_path *p)
++{
++ int i;
++
++ if (p->ip_container == NULL ||
++ p->ip_indirect < 0 || p->ip_indirect > DX_MAX_TREE_HEIGHT - 1 ||
++ p->ip_frame != p->ip_frames + p->ip_indirect ||
++ !iam_leaf_invariant(&p->ip_leaf))
++ return 0;
++ for (i = 0; i < ARRAY_SIZE(p->ip_frames); ++i) {
++ if (i <= p->ip_indirect) {
++ if (!iam_frame_invariant(&p->ip_frames[i]))
++ return 0;
++ }
++ }
++ return 1;
++}
++
++int iam_it_invariant(struct iam_iterator *it)
++{
++ return
++ (it->ii_state == IAM_IT_DETACHED ||
++ it->ii_state == IAM_IT_ATTACHED ||
++ it->ii_state == IAM_IT_SKEWED) &&
++ !(it->ii_flags & ~(IAM_IT_MOVE | IAM_IT_WRITE)) &&
++ ergo(it->ii_state == IAM_IT_ATTACHED ||
++ it->ii_state == IAM_IT_SKEWED,
++ iam_path_invariant(&it->ii_path) &&
++ equi(it_at_rec(it), it->ii_state == IAM_IT_SKEWED));
++}
++
++/*
++ * Search container @c for record with key @k. If the record is found, its
++ * data are copied into @r.
++ *
++ * Return values: 0: found, -ENOENT: not-found, -ve: error
++ */
++int iam_lookup(struct iam_container *c, const struct iam_key *k,
++ struct iam_rec *r, struct iam_path_descr *pd)
++{
++ struct iam_iterator it;
++ int result;
++
++ iam_it_init(&it, c, 0, pd);
++
++ result = iam_it_get_exact(&it, k);
++ if (result == 0)
++ /*
++ * record with required key found, copy it into user buffer
++ */
++ iam_reccpy(&it.ii_path.ip_leaf, r);
++ iam_it_put(&it);
++ iam_it_fini(&it);
++ return result;
++}
++EXPORT_SYMBOL(iam_lookup);
++
++/*
++ * Insert new record @r with key @k into container @c (within context of
++ * transaction @h).
++ *
++ * Return values: 0: success, -ve: error, including -EEXIST when record with
++ * given key is already present.
++ *
++ * postcondition: ergo(result == 0 || result == -EEXIST,
++ * iam_lookup(c, k, r2) == 0)
++ */
++int iam_insert(handle_t *h, struct iam_container *c, const struct iam_key *k,
++ const struct iam_rec *r, struct iam_path_descr *pd)
++{
++ struct iam_iterator it;
++ int result;
++
++ iam_it_init(&it, c, IAM_IT_WRITE, pd);
++
++ result = iam_it_get_exact(&it, k);
++ if (result == -ENOENT)
++ result = iam_it_rec_insert(h, &it, k, r);
++ else if (result == 0)
++ result = -EEXIST;
++ iam_it_put(&it);
++ iam_it_fini(&it);
++ return result;
++}
++EXPORT_SYMBOL(iam_insert);
++
++/*
++ * Update record with the key @k in container @c (within context of
++ * transaction @h), new record is given by @r.
++ *
++ * Return values: 0: success, -ve: error, including -ENOENT if no record with
++ * the given key found.
++ */
++int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k,
++ const struct iam_rec *r, struct iam_path_descr *pd)
++{
++ struct iam_iterator it;
++ int result;
++
++ iam_it_init(&it, c, IAM_IT_WRITE, pd);
++
++ result = iam_it_get_exact(&it, k);
++ if (result == 0)
++ result = iam_it_rec_set(h, &it, r);
++ iam_it_put(&it);
++ iam_it_fini(&it);
++ return result;
++}
++EXPORT_SYMBOL(iam_update);
++
++/*
++ * Delete existing record with key @k.
++ *
++ * Return values: 0: success, -ENOENT: not-found, -ve: other error.
++ *
++ * postcondition: ergo(result == 0 || result == -ENOENT,
++ * iam_lookup(c, k, *) == -ENOENT)
++ */
++int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k,
++ struct iam_path_descr *pd)
++{
++ struct iam_iterator it;
++ int result;
++
++ iam_it_init(&it, c, IAM_IT_WRITE, pd);
++
++ result = iam_it_get_exact(&it, k);
++ if (result == 0)
++ result = iam_it_rec_delete(h, &it);
++ iam_it_put(&it);
++ iam_it_fini(&it);
++ return result;
++}
++EXPORT_SYMBOL(iam_delete);
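++
++/*
++ * Combined usage sketch for the wrappers above (illustration only): an
++ * insert-or-update sequence within a single journal handle:
++ *
++ * result = iam_insert(handle, c, k, r, pd);
++ * if (result == -EEXIST)
++ * result = iam_update(handle, c, k, r, pd);
++ *
++ * Each wrapper opens and closes its own iterator, so callers supply only
++ * the container, the key and record buffers, and a path descriptor.
++ */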
++
+Index: iam/fs/ext3/namei.c
+===================================================================
+--- iam.orig/fs/ext3/namei.c 2007-05-23 11:18:18.000000000 +0800
++++ iam/fs/ext3/namei.c 2007-05-23 11:18:20.000000000 +0800
+@@ -24,81 +24,6 @@
+ * Theodore Ts'o, 2002
+ */
+
+-/*
+- * iam: big theory statement.
+- *
+- * iam (Index Access Module) is a module providing abstraction of persistent
+- * transactional container on top of generalized ext3 htree.
+- *
+- * iam supports:
+- *
+- * - key, pointer, and record size specifiable per container.
+- *
+- * - trees taller than 2 index levels.
+- *
+- * - read/write to existing ext3 htree directories as iam containers.
+- *
+- * iam container is a tree, consisting of leaf nodes containing keys and
+- * records stored in this container, and index nodes, containing keys and
+- * pointers to leaf or index nodes.
+- *
+- * iam does not work with keys directly, instead it calls user-supplied key
+- * comparison function (->dpo_keycmp()).
+- *
+- * Pointers are (currently) interpreted as logical offsets (measured in
+- * blocksful) within underlying flat file on top of which iam tree lives.
+- *
+- * On-disk format:
+- *
+- * iam mostly tries to reuse existing htree formats.
+- *
+- * Format of index node:
+- *
+- * +-----+-------+-------+-------+------+-------+------------+
+- * | | count | | | | | |
+- * | gap | / | entry | entry | .... | entry | free space |
+- * | | limit | | | | | |
+- * +-----+-------+-------+-------+------+-------+------------+
+- *
+- * gap this part of node is never accessed by iam code. It
+- * exists for binary compatibility with ext3 htree (that,
+- * in turn, stores fake struct ext2_dirent for ext2
+- * compatibility), and to keep some unspecified per-node
+- * data. Gap can be different for root and non-root index
+- * nodes. Gap size can be specified for each container
+- * (gap of 0 is allowed).
+- *
+- * count/limit current number of entries in this node, and the maximal
+- * number of entries that can fit into node. count/limit
+- * has the same size as entry, and is itself counted in
+- * count.
+- *
+- * entry index entry: consists of a key immediately followed by
+- * a pointer to a child node. Size of a key and size of a
+- * pointer depends on container. Entry has neither
+- * alignment nor padding.
+- *
+- * free space portion of node new entries are added to
+- *
+- * Entries in index node are sorted by their key value.
+- *
+- * Format of leaf node:
+- *
+- * +-----+-------+-------+-------+------+-------+------------+
+- * | | count | | | | | |
+- * | gap | / | leaf | leaf | .... | leaf | free space |
+- * | | limit | | | | | |
+- * +-----+-------+-------+-------+------+-------+------------+
+-
+- * leaf For leaf entry: consists of a rec immediately followd by
+- * a key. size of a key and size of a rec depends on container.
+- *
+- *
+- *
+- *
+- *
+- */
+-
+ #include <linux/module.h>
+ #include <linux/fs.h>
+ #include <linux/pagemap.h>
+@@ -112,10 +37,10 @@
+ #include <linux/quotaops.h>
+ #include <linux/buffer_head.h>
+ #include <linux/smp_lock.h>
++#include <linux/lustre_iam.h>
+ #include "xattr.h"
+ #include "iopen.h"
+ #include "acl.h"
+-#include <linux/lustre_iam.h>
+ /*
+ * define how far ahead to read directories while searching them.
+ */
+@@ -125,7 +50,7 @@
+ #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
+
+
+-static struct buffer_head *ext3_append(handle_t *handle,
++struct buffer_head *ext3_append(handle_t *handle,
+ struct inode *inode,
+ u32 *block, int *err)
+ {
+@@ -136,14 +61,15 @@
+ if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
+ inode->i_size += inode->i_sb->s_blocksize;
+ EXT3_I(inode)->i_disksize = inode->i_size;
+- ext3_journal_get_write_access(handle,bh);
++ *err = ext3_journal_get_write_access(handle, bh);
++ if (*err != 0) {
++ brelse(bh);
++ bh = NULL;
++ }
+ }
+ return bh;
+ }
+
+-#ifndef assert
+-#define assert(test) J_ASSERT(test)
+-#endif
+
+ #ifndef swap
+ #define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
+@@ -155,293 +81,10 @@
+ #define dxtrace(command)
+ #endif
+
+-struct fake_dirent {
+- __le32 inode;
+- __le16 rec_len;
+- u8 name_len;
+- u8 file_type;
+-};
+-
+-struct dx_countlimit {
+- __le16 limit;
+- __le16 count;
+-};
+-
+-/*
+- * dx_root_info is laid out so that if it should somehow get overlaid by a
+- * dirent the two low bits of the hash version will be zero. Therefore, the
+- * hash version mod 4 should never be 0. Sincerely, the paranoia department.
+- */
+-
+-struct dx_root {
+- struct fake_dirent dot;
+- char dot_name[4];
+- struct fake_dirent dotdot;
+- char dotdot_name[4];
+- struct dx_root_info
+- {
+- __le32 reserved_zero;
+- u8 hash_version;
+- u8 info_length; /* 8 */
+- u8 indirect_levels;
+- u8 unused_flags;
+- }
+- info;
+- struct {} entries[0];
+-};
+-
+-struct dx_node
+-{
+- struct fake_dirent fake;
+- struct {} entries[0];
+-};
+-
+-struct dx_map_entry
+-{
+- u32 hash;
+- u32 offs;
+-};
+-
+-
+-static u32 htree_root_ptr(struct iam_container *c);
+-static int htree_node_check(struct iam_path *path, struct iam_frame *frame);
+-static int htree_node_init(struct iam_container *c,
+- struct buffer_head *bh, int root);
+-static int htree_keycmp(struct iam_container *c,
+- struct iam_key *k1, struct iam_key *k2);
+-static int htree_node_read(struct iam_container *c, iam_ptr_t ptr,
+- handle_t *h, struct buffer_head **bh);
+-
+-/*
+- * Parameters describing iam compatibility mode in which existing ext3 htrees
+- * can be manipulated.
+- */
+-static struct iam_descr htree_compat_param = {
+- .id_key_size = sizeof ((struct dx_map_entry *)NULL)->hash,
+- .id_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs,
+- .id_node_gap = offsetof(struct dx_node, entries),
+- .id_root_gap = offsetof(struct dx_root, entries),
+-
+- .id_root_ptr = htree_root_ptr,
+- .id_node_check = htree_node_check,
+- .id_node_init = htree_node_init,
+- .id_node_read = htree_node_read,
+- .id_keycmp = htree_keycmp
+-};
+-
+-
+-struct iam_key;
+-struct iam_rec;
+-struct iam_descr;
+-struct iam_container;
+-struct iam_path;
+-
+-
+-
+-/*
+- * iam cursor (iterator) api.
+- */
+-
+-/*
+- * Flags controlling iterator functionality.
+- */
+-enum iam_it_flags {
+- /*
+- * this iterator will move (iam_it_{prev,next}() will be called on it)
+- */
+- IAM_IT_MOVE = (1 << 0),
+- /*
+- * tree can be updated through this iterator.
+- */
+- IAM_IT_WRITE = (1 << 1)
+-};
+-
+-/*
+- * States of iterator state machine.
+- */
+-enum iam_it_state {
+- /* initial state */
+- IAM_IT_DETACHED,
+- /* iterator is above particular record in the container */
+- IAM_IT_ATTACHED
+-};
+-
+-struct htree_cookie {
+- struct dx_hash_info *hinfo;
+- struct dentry *dentry;
+-};
+-
+-/*
+- * Iterator.
+- *
+- * Immediately after call to iam_it_init() iterator is in "detached"
+- * (IAM_IT_DETACHED) state: it is associated with given parent container, but
+- * doesn't point to any particular record in this container.
+- *
+- * After successful call to iam_it_get() and until corresponding call to
+- * iam_it_put() iterator is in "attached" state (IAM_IT_ATTACHED).
+- *
+- * Attached iterator can move through records in a container (provided
+- * IAM_IT_MOVE permission) in a key order, can get record and key values as it
+- * passes over them, and can modify container (provided IAM_IT_WRITE
+- * permission).
+- *
+- * Concurrency: iterators are supposed to be local to thread. Interfaces below
+- * do no internal serialization.
+- *
+- */
+-struct iam_iterator {
+- /*
+- * iterator flags, taken from enum iam_it_flags.
+- */
+- __u32 ii_flags;
+- enum iam_it_state ii_state;
+- /*
+- * path to the record. Valid in IAM_IT_ATTACHED state.
+- */
+- struct iam_path ii_path;
+-};
+-
+-static inline struct iam_key *keycpy(struct iam_container *c,
+- struct iam_key *k1, struct iam_key *k2)
+-{
+- return memcpy(k1, k2, c->ic_descr->id_key_size);
+-}
+-
+-static inline int keycmp(struct iam_container *c,
+- struct iam_key *k1, struct iam_key *k2)
+-{
+- return c->ic_descr->id_keycmp(c, k1, k2);
+-}
+-
+-static struct iam_container *iam_it_container(struct iam_iterator *it)
+-{
+- return it->ii_path.ip_container;
+-}
+-
+-static inline int it_keycmp(struct iam_iterator *it,
+- struct iam_key *k1, struct iam_key *k2)
+-{
+- return keycmp(iam_it_container(it), k1, k2);
+-}
+-
+-/*
+- * Initialize iterator to IAM_IT_DETACHED state.
+- *
+- * postcondition: it_state(it) == IAM_IT_DETACHED
+- */
+-int iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags);
+-/*
+- * Finalize iterator and release all resources.
+- *
+- * precondition: it_state(it) == IAM_IT_DETACHED
+- */
+-void iam_it_fini(struct iam_iterator *it);
+-
+-/*
+- * Attach iterator. After successful completion, @it points to record with the
+- * largest key not larger than @k. Semantics of ->id_create() method guarantee
+- * that such record will always be found.
+- *
+- * Return value: 0: positioned on existing record,
+- * -ve: error.
+- *
+- * precondition: it_state(it) == IAM_IT_DETACHED
+- * postcondition: ergo(result == 0,
+- * (it_state(it) == IAM_IT_ATTACHED &&
+- * it_keycmp(it, iam_it_key_get(it, *), k) < 0))
+- */
+-int iam_it_get(struct iam_iterator *it, struct iam_key *k);
+-
+-/*
+- * Duplicates iterator.
+- *
+- * postcondition: it_state(dst) == it_state(src) &&
+- * iam_it_container(dst) == iam_it_container(src) &&
+- * dst->ii_flags = src->ii_flags &&
+- * ergo(it_state(it) == IAM_IT_ATTACHED,
+- * iam_it_rec_get(dst) == iam_it_rec_get(src) &&
+- * iam_it_key_get(dst, *1) == iam_it_key_get(src, *2))
+- */
+-void iam_it_dup(struct iam_iterator *dst, struct iam_iterator *src);
+-
+-/*
+- * Detach iterator. Does nothing it detached state.
+- *
+- * postcondition: it_state(it) == IAM_IT_DETACHED
+- */
+-void iam_it_put(struct iam_iterator *it);
+-
+-/*
+- * Move iterator one record right.
+- *
+- * Return value: 0: success,
+- * +1: end of container reached
+- * -ve: error
+- *
+- * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_MOVE
+- * postcondition: ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED)
+- */
+-int iam_it_next(struct iam_iterator *it);
+-
+-/*
+- * Return pointer to the record under iterator.
+- *
+- * precondition: it_state(it) == IAM_IT_ATTACHED
+- * postcondition: it_state(it) == IAM_IT_ATTACHED
+- */
+-const struct iam_rec *iam_it_rec_get(struct iam_iterator *it);
+-
+-/*
+- * Replace contents of record under iterator.
+- *
+- * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
+- * postcondition: it_state(it) == IAM_IT_ATTACHED &&
+- * ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...))
+- */
+-int iam_it_rec_set(handle_t *h, struct iam_iterator *it, struct iam_rec *r);
+-
+-/*
+- * Place key under iterator in @k, return @k
+- *
+- * precondition: it_state(it) == IAM_IT_ATTACHED
+- * postcondition: it_state(it) == IAM_IT_ATTACHED
+- */
+-const struct iam_key *iam_it_key_get(struct iam_iterator *it,
+- struct iam_key *k);
+-
+-/*
+- * Insert new record with key @k and contents from @r, shifting records to the
+- * right.
+- *
+- * precondition: it_state(it) == IAM_IT_ATTACHED &&
+- * it->ii_flags&IAM_IT_WRITE &&
+- * it_keycmp(it, iam_it_key_get(it, *), k) < 0
+- * postcondition: it_state(it) == IAM_IT_ATTACHED &&
+- * ergo(result == 0,
+- * it_keycmp(it, iam_it_key_get(it, *), k) == 0 &&
+- * !memcmp(iam_it_rec_get(it), r, ...))
+- */
+-int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
+- struct iam_key *k, struct iam_rec *r);
+-/*
+- * Delete record under iterator.
+- *
+- * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
+- * postcondition: it_state(it) == IAM_IT_ATTACHED
+- */
+-int iam_it_rec_delete(handle_t *h, struct iam_iterator *it);
+-
+ #ifdef CONFIG_EXT3_INDEX
+ static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry);
+ static void dx_set_block(struct iam_path *p,
+ struct iam_entry *entry, unsigned value);
+-static inline struct iam_key *dx_get_key(struct iam_path *p,
+- struct iam_entry *entry,
+- struct iam_key *key);
+-static void dx_set_key(struct iam_path *p, struct iam_entry *entry,
+- struct iam_key *key);
+-static unsigned dx_get_count(struct iam_entry *entries);
+ static unsigned dx_get_limit(struct iam_entry *entries);
+ static void dx_set_count(struct iam_entry *entries, unsigned value);
+ static void dx_set_limit(struct iam_entry *entries, unsigned value);
+@@ -457,264 +100,62 @@
+ static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
+ struct dx_map_entry *offsets, int count);
+ static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+-static void dx_insert_block (struct iam_path *path,
+- struct iam_frame *frame, u32 hash, u32 block);
+-static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+- struct iam_path *path, __u32 *start_hash);
+ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
+ struct ext3_dir_entry_2 **res_dir, int *err);
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode);
+
+-static inline void iam_path_init(struct iam_path *path,
+- struct iam_container *c, struct htree_cookie *hc);
+-static inline void iam_path_fini(struct iam_path *path);
+-
+-
+-/*
+- * Future: use high four bits of block for coalesce-on-delete flags
+- * Mask them off for now.
+- */
+-
+-static inline void *entry_off(struct iam_entry *entry, ptrdiff_t off)
+-{
+- return (void *)((char *)entry + off);
+-}
+-
+-static inline struct iam_descr *path_descr(struct iam_path *p)
+-{
+- return p->ip_container->ic_descr;
+-}
+-
+-static inline struct inode *path_obj(struct iam_path *p)
+-{
+- return p->ip_container->ic_object;
+-}
+-
+-static inline size_t iam_entry_size(struct iam_path *p)
+-{
+- return path_descr(p)->id_key_size + path_descr(p)->id_ptr_size;
+-}
+-
+-static inline struct iam_entry *iam_entry_shift(struct iam_path *p,
+- struct iam_entry *entry, int shift)
+-{
+- void *e = entry;
+- return e + shift * iam_entry_size(p);
+-}
+-
+-static inline ptrdiff_t iam_entry_diff(struct iam_path *p,
+- struct iam_entry *e1, struct iam_entry *e2)
+-{
+- ptrdiff_t diff;
+-
+- diff = (void *)e1 - (void *)e2;
+- assert(diff / iam_entry_size(p) * iam_entry_size(p) == diff);
+- return diff / iam_entry_size(p);
+-}
+-
+-static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry)
+-{
+- return le32_to_cpu(*(u32 *)entry_off(entry, path_descr(p)->id_key_size))
+- & 0x00ffffff;
+-}
+-
+-static inline void dx_set_block(struct iam_path *p,
+- struct iam_entry *entry, unsigned value)
+-{
+- *(u32*)entry_off(entry,
+- path_descr(p)->id_key_size) = cpu_to_le32(value);
+-}
+-
+-static inline struct iam_key *dx_get_key(struct iam_path *p,
+- struct iam_entry *entry,
+- struct iam_key *key)
+-{
+- memcpy(key, entry, path_descr(p)->id_key_size);
+- return key;
+-}
+-
+-static inline struct iam_key *iam_key_at(struct iam_path *p,
+- struct iam_entry *entry)
+-{
+- return (struct iam_key *)entry;
+-}
+-
+-static inline void dx_set_key(struct iam_path *p,
+- struct iam_entry *entry, struct iam_key *key)
+-{
+- memcpy(entry, key, path_descr(p)->id_key_size);
+-}
+-
+-static inline unsigned dx_get_count (struct iam_entry *entries)
+-{
+- return le16_to_cpu(((struct dx_countlimit *) entries)->count);
+-}
+-
+-static inline unsigned dx_get_limit (struct iam_entry *entries)
+-{
+- return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
+-}
+-
+-static inline void dx_set_count (struct iam_entry *entries, unsigned value)
+-{
+- ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
+-}
+-
+-static inline void dx_set_limit (struct iam_entry *entries, unsigned value)
++static inline void dx_set_limit(struct iam_entry *entries, unsigned value)
+ {
+ ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
+ }
+
+-static inline unsigned dx_root_limit(struct iam_path *p)
++int dx_index_is_compat(struct iam_path *path)
+ {
+- struct iam_descr *param = path_descr(p);
+- unsigned entry_space = path_obj(p)->i_sb->s_blocksize -
+- param->id_root_gap;
+- return entry_space / (param->id_key_size + param->id_ptr_size);
++ return iam_path_descr(path) == &iam_htree_compat_param;
+ }
+
+-static inline unsigned dx_node_limit(struct iam_path *p)
+-{
+- struct iam_descr *param = path_descr(p);
+- unsigned entry_space = path_obj(p)->i_sb->s_blocksize -
+- param->id_node_gap;
+- return entry_space / (param->id_key_size + param->id_ptr_size);
+-}
+-
+-static inline int dx_index_is_compat(struct iam_path *path)
+-{
+- return path_descr(path) == &htree_compat_param;
+-}
+-
+-static struct iam_entry *dx_get_entries(struct iam_path *path, void *data,
+- int root)
+-{
+- return data +
+- (root ?
+- path_descr(path)->id_root_gap : path_descr(path)->id_node_gap);
+-}
+-
+-static struct iam_entry *dx_node_get_entries(struct iam_path *path,
+- struct iam_frame *frame)
+-{
+- return dx_get_entries(path,
+- frame->bh->b_data, frame == path->ip_frames);
+-}
+
+-static int dx_node_check(struct iam_path *p, struct iam_frame *f)
++int dx_node_check(struct iam_path *p, struct iam_frame *f)
+ {
+ struct iam_entry *e;
+ struct iam_container *c;
+ unsigned count;
+ unsigned i;
++ iam_ptr_t blk;
++ iam_ptr_t root;
++ struct inode *inode;
+
+ c = p->ip_container;
+ e = dx_node_get_entries(p, f);
+ count = dx_get_count(e);
+ e = iam_entry_shift(p, e, 1);
++ root = iam_path_descr(p)->id_ops->id_root_ptr(c);
++
++ inode = iam_path_obj(p);
+ for (i = 0; i < count - 1; ++i, e = iam_entry_shift(p, e, 1)) {
+- keycpy(c, p->ip_key_scratch[0], p->ip_key_scratch[1]);
+- dx_get_key(p, e, p->ip_key_scratch[1]);
++ iam_ikeycpy(c, iam_path_ikey(p, 0), iam_path_ikey(p, 1));
++ iam_get_ikey(p, e, iam_path_ikey(p, 1));
+ if (i > 0 &&
+- keycmp(c, p->ip_key_scratch[0], p->ip_key_scratch[1]) > 0)
++ iam_ikeycmp(c, iam_path_ikey(p, 0),
++ iam_path_ikey(p, 1)) > 0) {
++ BREAKPOINT();
+ return 0;
+ }
+- return 1;
+-}
+-
+-static u32 htree_root_ptr(struct iam_container *c)
+-{
++ blk = dx_get_block(p, e);
++ if (inode->i_size < (blk + 1) * inode->i_sb->s_blocksize) {
++ BREAKPOINT();
+ return 0;
+-}
+-
+-static int htree_node_check(struct iam_path *path, struct iam_frame *frame)
+-{
+- void *data;
+- struct iam_entry *entries;
+- struct super_block *sb;
+-
+- data = frame->bh->b_data;
+- entries = dx_node_get_entries(path, frame);
+- sb = path_obj(path)->i_sb;
+- if (frame == path->ip_frames) {
+- /* root node */
+- struct dx_root *root;
+- struct htree_cookie *hc = path->ip_descr_data;
+-
+- root = data;
+- if (root->info.hash_version > DX_HASH_MAX) {
+- ext3_warning(sb, __FUNCTION__,
+- "Unrecognised inode hash code %d",
+- root->info.hash_version);
+- return ERR_BAD_DX_DIR;
+ }
+-
+- if (root->info.unused_flags & 1) {
+- ext3_warning(sb, __FUNCTION__,
+- "Unimplemented inode hash flags: %#06x",
+- root->info.unused_flags);
+- return ERR_BAD_DX_DIR;
+- }
+-
+- path->ip_indirect = root->info.indirect_levels;
+- if (path->ip_indirect > DX_MAX_TREE_HEIGHT - 1) {
+- ext3_warning(sb, __FUNCTION__,
+- "Unimplemented inode hash depth: %#06x",
+- root->info.indirect_levels);
+- return ERR_BAD_DX_DIR;
++ /*
++ * By definition of a tree, no node points to the root.
++ */
++ if (blk == root) {
++ BREAKPOINT();
++ return 0;
+ }
+-
+- assert((char *)entries == (((char *)&root->info) +
+- root->info.info_length));
+- assert(dx_get_limit(entries) == dx_root_limit(path));
+-
+- hc->hinfo->hash_version = root->info.hash_version;
+- hc->hinfo->seed = EXT3_SB(sb)->s_hash_seed;
+- if (hc->dentry)
+- ext3fs_dirhash(hc->dentry->d_name.name,
+- hc->dentry->d_name.len, hc->hinfo);
+- path->ip_key_target = (struct iam_key *)&hc->hinfo->hash;
+- } else {
+- /* non-root index */
+- assert(entries == data + path_descr(path)->id_node_gap);
+- assert(dx_get_limit(entries) == dx_node_limit(path));
+ }
+- frame->entries = frame->at = entries;
+- return 0;
+-}
+-
+-static int htree_node_init(struct iam_container *c,
+- struct buffer_head *bh, int root)
+-{
+- struct dx_node *node;
+-
+- assert(!root);
+-
+- node = (void *)bh->b_data;
+- node->fake.rec_len = cpu_to_le16(c->ic_object->i_sb->s_blocksize);
+- node->fake.inode = 0;
+- return 0;
+-}
+-
+-static int htree_node_read(struct iam_container *c, iam_ptr_t ptr,
+- handle_t *handle, struct buffer_head **bh)
+-{
+- int result = 0;
+-
+- *bh = ext3_bread(handle, c->ic_object, (int)ptr, 0, &result);
+- if (*bh == NULL)
+- result = -EIO;
+- return result;
+-}
+-
+-static int htree_keycmp(struct iam_container *c,
+- struct iam_key *k1, struct iam_key *k2)
+-{
+- __u32 p1 = le32_to_cpu(*(__u32 *)k1);
+- __u32 p2 = le32_to_cpu(*(__u32 *)k2);
+-
+- return p1 > p2 ? +1 : (p1 < p2 ? -1 : 0);
++ return 1;
+ }
+
+ /*
+@@ -797,601 +238,124 @@
+ printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ",
+ names, space/bcount,(space/bcount)*100/blocksize);
+ return (struct stats) { names, space, bcount};
+-}
+-#endif /* DX_DEBUG */
+-
+-static int dx_lookup(struct iam_path *path)
+-{
+- u32 ptr;
+- int err = 0;
+- int i;
+-
+- struct iam_descr *param;
+- struct iam_frame *frame;
+- struct iam_container *c;
+-
+- param = path_descr(path);
+- c = path->ip_container;
+-
+- for (frame = path->ip_frames, i = 0,
+- ptr = param->id_root_ptr(path->ip_container);
+- i <= path->ip_indirect;
+- ptr = dx_get_block(path, frame->at), ++frame, ++i) {
+- struct iam_entry *entries;
+- struct iam_entry *p;
+- struct iam_entry *q;
+- struct iam_entry *m;
+- unsigned count;
+-
+- err = param->id_node_read(c, (iam_ptr_t)ptr, NULL, &frame->bh);
+- if (err != 0)
+- break;
+- err = param->id_node_check(path, frame);
+- if (err != 0)
+- break;
+-
+- assert(dx_node_check(path, frame));
+-
+- entries = frame->entries;
+- count = dx_get_count(entries);
+- assert(count && count <= dx_get_limit(entries));
+- p = iam_entry_shift(path, entries, 1);
+- q = iam_entry_shift(path, entries, count - 1);
+- while (p <= q) {
+- m = iam_entry_shift(path,
+- p, iam_entry_diff(path, q, p) / 2);
+- dxtrace(printk("."));
+- if (keycmp(c, iam_key_at(path, m),
+- path->ip_key_target) > 0)
+- q = iam_entry_shift(path, m, -1);
+- else
+- p = iam_entry_shift(path, m, +1);
+- }
+-
+- frame->at = iam_entry_shift(path, p, -1);
+- if (1) { // linear search cross check
+- unsigned n = count - 1;
+- struct iam_entry *at;
+-
+- at = entries;
+- while (n--) {
+- dxtrace(printk(","));
+- at = iam_entry_shift(path, at, +1);
+- if (keycmp(c, iam_key_at(path, at),
+- path->ip_key_target) > 0) {
+- if (at != iam_entry_shift(path, frame->at, 1)) {
+- BREAKPOINT;
+- printk(KERN_EMERG "%i\n",
+- keycmp(c, iam_key_at(path, at),
+- path->ip_key_target));
+- }
+- at = iam_entry_shift(path, at, -1);
+- break;
+- }
+- }
+- assert(at == frame->at);
+- }
+- }
+- if (err != 0)
+- iam_path_fini(path);
+- path->ip_frame = --frame;
+- return err;
+-}
+-
+-/*
+- * Probe for a directory leaf block to search.
+- *
+- * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
+- * error in the directory index, and the caller should fall back to
+- * searching the directory normally. The callers of dx_probe **MUST**
+- * check for this error code, and make sure it never gets reflected
+- * back to userspace.
+- */
+-static int dx_probe(struct dentry *dentry, struct inode *dir,
+- struct dx_hash_info *hinfo, struct iam_path *path)
+-{
+- int err;
+- struct htree_cookie hc = {
+- .dentry = dentry,
+- .hinfo = hinfo
+- };
+-
+- assert(dx_index_is_compat(path));
+- path->ip_descr_data = &hc;
+- err = dx_lookup(path);
+- assert(err != 0 || path->ip_frames[path->ip_indirect].bh != NULL);
+- return err;
+-}
+-
+-/*
+- * Initialize container @c, acquires additional reference on @inode.
+- */
+-int iam_container_init(struct iam_container *c,
+- struct iam_descr *descr, struct inode *inode)
+-{
+- memset(c, 0, sizeof *c);
+- c->ic_descr = descr;
+- c->ic_object = igrab(inode);
+- if (c->ic_object != NULL)
+- return 0;
+- else
+- return -ENOENT;
+-}
+-
+-/*
+- * Finalize container @c, release all resources.
+- */
+-void iam_container_fini(struct iam_container *c)
+-{
+- if (c->ic_object != NULL) {
+- iput(c->ic_object);
+- c->ic_object = NULL;
+- }
+-}
+-
+-static inline void iam_path_init(struct iam_path *path, struct iam_container *c,
+- struct htree_cookie *hc)
+-{
+- memset(path, 0, sizeof *path);
+- path->ip_container = c;
+- path->ip_frame = path->ip_frames;
+- path->ip_descr_data = hc;
+-}
+-
+-static inline void iam_path_fini(struct iam_path *path)
+-{
+- int i;
+-
+- for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) {
+- if (path->ip_frames[i].bh != NULL) {
+- brelse(path->ip_frames[i].bh);
+- path->ip_frames[i].bh = NULL;
+- }
+- }
+-}
+-
+-static void iam_path_compat_init(struct iam_path_compat *path,
+- struct inode *inode)
+-{
+- int i;
+-
+- iam_container_init(&path->ipc_container, &htree_compat_param, inode);
+- /*
+- * XXX hack allowing finalization of iam_path_compat with
+- * iam_path_fini().
+- */
+- iput(inode);
+- iam_path_init(&path->ipc_path, &path->ipc_container, NULL);
+- for (i = 0; i < ARRAY_SIZE(path->ipc_path.ip_key_scratch); ++i)
+- path->ipc_path.ip_key_scratch[i] =
+- (struct iam_key *)&path->ipc_scrach[i];
+-}
+-
+-static void iam_path_compat_fini(struct iam_path_compat *path)
+-{
+- iam_path_fini(&path->ipc_path);
+- iam_container_fini(&path->ipc_container);
+-}
+-
+-static int iam_leaf_init(struct iam_path *path, struct iam_leaf *leaf)
+-{
+- int block, err;
+- struct buffer_head *bh;
+-
+- block = dx_get_block(path, path->ip_frame->at);
+- err = path_descr(path)->id_node_read(path->ip_container, block,
+- NULL, &bh);
+- if (err)
+- return err;
+-
+- leaf->bh = bh;
+- leaf->entries = (struct iam_leaf_entry *)bh->b_data;
+- return 0;
+-}
+-
+-static void iam_leaf_fini(struct iam_leaf *leaf)
+-{
+- if (leaf->bh)
+- brelse(leaf->bh);
+-}
+-
+-/*
+- * Search container @c for record with key @k. If record is found, its data
+- * are moved into @r.
+- *
+- *
+- *
+- * Return values: +ve: found, 0: not-found, -ve: error
+- */
+-
+-int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r)
+-{
+- struct dx_hash_info hinfo;
+- struct iam_path_compat cpath;
+- struct iam_path *path = &cpath.ipc_path;
+- struct htree_cookie hc = {
+- .hinfo = &hinfo
+- };
+- int err, i;
+-
+- iam_path_init(path, c, &hc);
+- for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
+- path->ip_key_scratch[i] =
+- (struct iam_key *)&cpath.ipc_scrach[i];
+- err = dx_lookup(path);
+- do {
+- struct iam_leaf leaf;
+- err = iam_leaf_init(path, &leaf);
+- if (err)
+- goto errout;
+-
+- for (path_descr(path)->id_leaf.start(c, &leaf);
+- !path_descr(path)->id_leaf.at_end(c, &leaf);
+- path_descr(path)->id_leaf.next(c, &leaf)) {
+- struct iam_key *key;
+-
+- key = kmalloc(path_descr(path)->id_key_size, GFP_KERNEL);
+- path_descr(path)->id_leaf.key(c, &leaf, key);
+- if (keycmp(c, k, key) == 0) {
+- memcpy(r, path_descr(path)->id_leaf.rec(c, &leaf),
+- path_descr(path)->id_rec_size);
+- iam_path_fini(path);
+- iam_leaf_fini(&leaf);
+- return 0;
+- }
+- }
+-
+- iam_leaf_fini(&leaf);
+- /* Check to see if we should continue to search */
+- err = ext3_htree_next_block(c->ic_object, hinfo.hash, path, NULL);
+- if (err < 0)
+- goto errout;
+- } while (err == 1);
+-errout:
+- iam_path_fini(path);
+- return(err);
+-}
+-EXPORT_SYMBOL(iam_lookup);
+-
+-static inline size_t iam_leaf_entry_size(struct iam_path *p)
+-{
+- return path_descr(p)->id_rec_size + path_descr(p)->id_key_size;
+-}
+-
+-static inline ptrdiff_t iam_leaf_entry_diff(struct iam_path *p,
+- struct iam_leaf_entry *e1, struct iam_leaf_entry *e2)
+-{
+- ptrdiff_t diff;
+-
+- diff = (void *)e1 - (void *)e2;
+- assert(diff / iam_leaf_entry_size(p) * iam_leaf_entry_size(p) == diff);
+- return diff / iam_leaf_entry_size(p);
+-}
+-
+-static inline struct iam_leaf_entry*
+-iam_leaf_entry_shift(struct iam_path *p, struct iam_leaf_entry *entry, int shift)
+-{
+- void *e = entry;
+- return e + shift * iam_leaf_entry_size(p);
+-}
+-
+-static inline struct iam_key *
+-dx_leaf_get_key(struct iam_path *p, struct iam_leaf_entry *e, struct iam_key *key)
+-{
+- memcpy(key, e, path_descr(p)->id_key_size);
+- return key;
+-}
+-
+-static inline struct iam_key *
+-iam_leaf_key_at(struct iam_path *p, struct iam_leaf_entry *entry)
+-{
+- void *e = entry;
+- return e + path_descr(p)->id_rec_size;
+-}
+-static inline struct iam_leaf_entry *
+-iam_leaf_entry_at(struct iam_path *p, struct iam_leaf_entry *entry)
+-{
+- return entry;
+-}
+-
+-static int iam_leaf_lookup(struct iam_path *path, struct iam_leaf *leaf,
+- struct iam_key *k)
+-{
+- struct iam_leaf_entry *p, *q, *m;
+- struct iam_leaf_entry *entries = leaf->entries;
+- int count = dx_get_count((struct iam_entry *)entries);
+-
+- p = iam_leaf_entry_shift(path, entries, 1);
+- q = iam_leaf_entry_shift(path, entries, count - 1);
+- while (p <= q) {
+- m = iam_leaf_entry_shift(path,
+- p, iam_leaf_entry_diff(path, q, p) / 2);
+- dxtrace(printk("."));
+- if (keycmp(path->ip_container, iam_leaf_key_at(path, m),
+- path->ip_key_target) > 0)
+- q = iam_leaf_entry_shift(path, m, -1);
+- else
+- p = iam_leaf_entry_shift(path, m, +1);
+- }
+- leaf->at = q;
+- return 0;
+-}
+-
+-/*XXX what kind of lock should this entry be locked: WangDi */
+-static int iam_leaf_insert(handle_t *handle, struct iam_path *path,
+- struct iam_key *k, struct iam_rec *r)
+-{
+- struct iam_leaf leaf;
+- struct iam_leaf_entry *p, *q;
+- int err, count;
+-
+- err = iam_leaf_init(path, &leaf);
+- if (err)
+- goto errout;
+- path_descr(path)->id_leaf.start(path->ip_container, &leaf);
+- count = dx_get_count((struct iam_entry *)leaf.entries);
+- if (dx_get_count((struct iam_entry *)leaf.entries) >=
+- dx_get_limit((struct iam_entry *)leaf.entries)){
+- err = -ENOSPC;
+- goto errout;
+- }
+-
+- err = iam_leaf_lookup(path, &leaf, k);
+- if (err)
+- goto errout;
+-
+- /*insert the k/r to leaf entries*/
+- p = iam_leaf_entry_shift(path, leaf.at, 1);
+- q = iam_leaf_entry_shift(path, leaf.entries, count - 1);
+- while (q < p) {
+- memcpy(iam_leaf_entry_shift(path, q, 1), q, iam_leaf_entry_size(path));
+- q = iam_leaf_entry_shift(path, q, -1);
+- }
+- memcpy(iam_leaf_entry_at(path, p), r, path_descr(path)->id_rec_size);
+- memcpy(iam_leaf_key_at(path, p), k, path_descr(path)->id_key_size);
+-
+- dx_set_count((struct iam_entry*)leaf.entries, count + 1);
+- err = ext3_journal_dirty_metadata(handle, leaf.bh);
+- if (err)
+- ext3_std_error(path->ip_container->ic_object->i_sb, err);
+-errout:
+- iam_leaf_fini(&leaf);
+- return err;
+-}
+-
+-static int split_leaf_node(handle_t *handle, struct iam_path *path)
+-{
+- struct inode *dir = path_obj(path);
+- unsigned continued = 0;
+- struct buffer_head *bh2;
+- u32 newblock, hash_split;
+- char *data2;
+- struct iam_leaf leaf;
+- unsigned split;
+- int err;
+-
+- bh2 = ext3_append (handle, dir, &newblock, &err);
+- if (!(bh2)) {
+- err = -ENOSPC;
+- goto errout;
+- }
+- err = iam_leaf_init(path, &leaf);
+- if (err)
+- goto errout;
+-
+- BUFFER_TRACE(leaf.bh, "get_write_access");
+- err = ext3_journal_get_write_access(handle, leaf.bh);
+- if (err) {
+- journal_error:
+- iam_leaf_fini(&leaf);
+- brelse(bh2);
+- ext3_std_error(dir->i_sb, err);
+- err = -EIO;
+- goto errout;
+- }
+- data2 = bh2->b_data;
+- split = dx_get_count((struct iam_entry*)leaf.entries)/2;
+- hash_split = *(__u32*)iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split));
+- if (keycmp(path->ip_container, iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split)),
+- iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split -1))) == 0)
+- continued = 1;
+-
+- memcpy(iam_leaf_entry_shift(path, (struct iam_leaf_entry *)data2, 1),
+- iam_leaf_entry_shift(path, leaf.entries, split),
+- split * iam_leaf_entry_size(path));
+-
+- /* Which block gets the new entry? */
+- dx_insert_block(path, path->ip_frame, hash_split + continued, newblock);
+- err = ext3_journal_dirty_metadata (handle, bh2);
+- if (err)
+- goto journal_error;
+- err = ext3_journal_dirty_metadata (handle, leaf.bh);
+- if (err)
+- goto journal_error;
+- brelse (bh2);
+- iam_leaf_fini(&leaf);
+-errout:
+- return err;
+-}
+-
+-static int split_index_node(handle_t *handle, struct iam_path *path);
+-/*
+- * Insert new record @r with key @k into container @c (within context of
+- * transaction @h.
+- *
+- * Return values: 0: success, -ve: error, including -EEXIST when record with
+- * given key is already present.
+- *
+- * postcondition: ergo(result == 0 || result == -EEXIST,
+- * iam_lookup(c, k, r2) > 0 &&
+- * !memcmp(r, r2, c->ic_descr->id_rec_size));
+- */
+-int iam_insert(handle_t *handle, struct iam_container *c, struct iam_key *k,
+- struct iam_rec *r)
++}
++#endif /* DX_DEBUG */
++
++int dx_lookup(struct iam_path *path)
+ {
+- struct dx_hash_info hinfo;
+- struct iam_path_compat cpath;
+- struct iam_path *path = &cpath.ipc_path;
+- struct htree_cookie hc = {
+- .hinfo = &hinfo
+- };
+- int err, i;
++ u32 ptr;
++ int err = 0;
++ int i;
++ int delta;
+
+- iam_path_init(path, c, &hc);
+- for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
+- path->ip_key_scratch[i] =
+- (struct iam_key *)&cpath.ipc_scrach[i];
+- err = dx_lookup(path);
+- if (err)
+- goto errout;
++ struct iam_descr *param;
++ struct iam_frame *frame;
++ struct iam_container *c;
+
+- err = iam_leaf_insert(handle, path, k, r);
++ param = iam_path_descr(path);
++ c = path->ip_container;
+
+- if (err != -ENOSPC)
+- goto errout;
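++ /*
++ * Binary search below skips the count/limit slot(s) at the start
++ * of the entries array: compat htree nodes keep an implicit least
++ * entry at index 0, while iam formats store an explicit least key
++ * at index 1, so the search starts at index 1 or 2 respectively.
++ */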
++ delta = dx_index_is_compat(path) ? 1 : 2;
+
+- err = split_index_node(handle, path);
+- if (err)
+- goto errout;
++ for (frame = path->ip_frames, i = 0,
++ ptr = param->id_ops->id_root_ptr(c);
++ i <= path->ip_indirect;
++ ptr = dx_get_block(path, frame->at), ++frame, ++i) {
++ struct iam_entry *entries;
++ struct iam_entry *p;
++ struct iam_entry *q;
++ struct iam_entry *m;
++ unsigned count;
+
+- err = split_leaf_node(handle, path);
+- if (err)
+- goto errout;
+-
+- err = iam_leaf_insert(handle, path, k, r);
+-errout:
+- iam_path_fini(path);
+- return(err);
+-}
++ err = param->id_ops->id_node_read(c, (iam_ptr_t)ptr, NULL,
++ &frame->bh);
++ if (err != 0)
++ break;
+
+-EXPORT_SYMBOL(iam_insert);
+-static int iam_leaf_delete(handle_t *handle, struct iam_path *path,
+- struct iam_key *k)
+-{
+- struct iam_leaf leaf;
+- struct iam_leaf_entry *p, *q;
+- int err, count;
++ if (EXT3_INVARIANT_ON) {
++ err = param->id_ops->id_node_check(path, frame);
++ if (err != 0)
++ break;
++ }
+
+- err = iam_leaf_init(path, &leaf);
+- if (err)
+- goto errout;
++ err = param->id_ops->id_node_load(path, frame);
++ if (err != 0)
++ break;
++
++ assert_inv(dx_node_check(path, frame));
+
+- err = iam_leaf_lookup(path, &leaf, k);
+- if (err)
+- goto errout;
++ entries = frame->entries;
++ count = dx_get_count(entries);
++ assert_corr(count && count <= dx_get_limit(entries));
++ p = iam_entry_shift(path, entries, delta);
++ q = iam_entry_shift(path, entries, count - 1);
++ while (p <= q) {
++ m = iam_entry_shift(path,
++ p, iam_entry_diff(path, q, p) / 2);
++ dxtrace(printk("."));
++ if (iam_ikeycmp(c, iam_ikey_at(path, m),
++ path->ip_ikey_target) > 0)
++ q = iam_entry_shift(path, m, -1);
++ else
++ p = iam_entry_shift(path, m, +1);
++ }
+
+- count = dx_get_count((struct iam_entry*)leaf.entries);
+- /*delete the k to leaf entries*/
+- p = iam_leaf_entry_shift(path, leaf.at, 1);
+- q = iam_leaf_entry_shift(path, leaf.entries, count - 1);
+- while (p < q) {
+- memcpy(p, iam_leaf_entry_shift(path, p, 1), iam_leaf_entry_size(path));
+- p = iam_leaf_entry_shift(path, p, 1);
+- }
+- dx_set_count((struct iam_entry*)leaf.entries, count - 1);
++ frame->at = iam_entry_shift(path, p, -1);
++ if (EXT3_INVARIANT_ON) { /* linear search cross-check */
++ unsigned n = count - 1;
++ struct iam_entry *at;
+
+- err = ext3_journal_dirty_metadata(handle, leaf.bh);
+- if (err)
+- ext3_std_error(path_obj(path)->i_sb, err);
+-errout:
+- iam_leaf_fini(&leaf);
++ at = entries;
++ while (n--) {
++ dxtrace(printk(","));
++ at = iam_entry_shift(path, at, +1);
++ if (iam_ikeycmp(c, iam_ikey_at(path, at),
++ path->ip_ikey_target) > 0) {
++ if (at != iam_entry_shift(path, frame->at, 1)) {
++ BREAKPOINT();
++ printk(KERN_EMERG "%i\n",
++ iam_ikeycmp(c, iam_ikey_at(path, at),
++ path->ip_ikey_target));
++ }
++ at = iam_entry_shift(path, at, -1);
++ break;
++ }
++ }
++ assert_corr(at == frame->at);
++ }
++ }
++ if (err != 0)
++ iam_path_fini(path);
++ path->ip_frame = --frame;
+ return err;
+ }
+
+ /*
+- * Delete existing record with key @k.
+- *
+- * Return values: 0: success, -ENOENT: not-found, -ve: other error.
++ * Probe for a directory leaf block to search.
+ *
+- * postcondition: ergo(result == 0 || result == -ENOENT,
+- * !iam_lookup(c, k, *));
++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
++ * error in the directory index, and the caller should fall back to
++ * searching the directory normally. The callers of dx_probe **MUST**
++ * check for this error code, and make sure it never gets reflected
++ * back to userspace.
+ */
+-int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k)
+-{
+- struct dx_hash_info hinfo;
+- struct iam_path_compat cpath;
+- struct iam_path *path = &cpath.ipc_path;
+- struct htree_cookie hc = {
+- .hinfo = &hinfo
+- };
+- int err, i;
+-
+- iam_path_init(path, c, &hc);
+- for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
+- path->ip_key_scratch[i] =
+- (struct iam_key *)&cpath.ipc_scrach[i];
+- err = dx_lookup(path);
+- if (err)
+- goto errout;
+-
+- err = iam_leaf_delete(h, path, k);
+-errout:
+- iam_path_fini(path);
+- return err;
+-}
+-
+-EXPORT_SYMBOL(iam_delete);
+-
+-static int iam_leaf_update(handle_t *handle, struct iam_path *path,
+- struct iam_key *k, struct iam_rec *r)
++static int dx_probe(struct dentry *dentry, struct inode *dir,
++ struct dx_hash_info *hinfo, struct iam_path *path)
+ {
+- struct iam_leaf leaf;
+ int err;
+-
+- err = iam_leaf_init(path, &leaf);
+- if (err)
+- goto errout;
++ struct iam_path_compat *ipc;
+
+- err = iam_leaf_lookup(path, &leaf, k);
+- if (err)
+- goto errout;
+-
+- memcpy(iam_leaf_entry_at(path, leaf.at), r, path_descr(path)->id_rec_size);
+- memcpy(iam_leaf_key_at(path, leaf.at), k, path_descr(path)->id_key_size);
++ assert_corr(path->ip_data != NULL);
++ ipc = container_of(path->ip_data, struct iam_path_compat, ipc_descr);
++ ipc->ipc_dentry = dentry;
++ ipc->ipc_hinfo = hinfo;
+
+- err = ext3_journal_dirty_metadata(handle, leaf.bh);
+- if (err)
+- ext3_std_error(path_obj(path)->i_sb, err);
+-errout:
+- iam_leaf_fini(&leaf);
+- return err;
+-}
+-/*
+- * Replace existing record with key @k, or insert new one. New record data are
+- * in @r.
+- *
+- * Return values: 0: success, -ve: error.
+- *
+- * postcondition: ergo(result == 0, iam_lookup(c, k, r2) > 0 &&
+- * !memcmp(r, r2, c->ic_descr->id_rec_size));
+- */
+-int iam_update(handle_t *h, struct iam_container *c,
+- struct iam_key *k, struct iam_rec *r)
+-{
+- struct dx_hash_info hinfo;
+- struct iam_path_compat cpath;
+- struct iam_path *path = &cpath.ipc_path;
+- struct htree_cookie hc = {
+- .hinfo = &hinfo
+- };
+- int err, i;
+-
+- iam_path_init(path, c, &hc);
+- for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
+- path->ip_key_scratch[i] =
+- (struct iam_key *)&cpath.ipc_scrach[i];
++ assert_corr(dx_index_is_compat(path));
+ err = dx_lookup(path);
+- if (err)
+- goto errout;
+-
+- err = iam_leaf_update(h, path, k, r);
+-errout:
+- iam_path_fini(path);
++ assert_corr(err != 0 || path->ip_frames[path->ip_indirect].bh != NULL);
+ return err;
+ }
+
+-EXPORT_SYMBOL(iam_update);
+-
+ /*
+ * This function increments the frame pointer to search the next leaf
+ * block, and reads in the necessary intervening nodes if the search
+@@ -1409,16 +373,15 @@
+ * If start_hash is non-null, it will be filled in with the starting
+ * hash of the next page.
+ */
+-static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+- struct iam_path *path, __u32 *start_hash)
++static int ext3_htree_advance(struct inode *dir, __u32 hash,
++ struct iam_path *path, __u32 *start_hash,
++ int compat)
+ {
+ struct iam_frame *p;
+ struct buffer_head *bh;
+ int err, num_frames = 0;
+ __u32 bhash;
+
+- assert(dx_index_is_compat(path));
+-
+ p = path->ip_frame;
+ /*
+ * Find the next leaf page by incrementing the frame pointer.
+@@ -1438,6 +401,10 @@
+ --p;
+ }
+
++ if (compat) {
++ /*
++ * Htree hash magic.
++ */
+ /*
+ * If the hash is 1, then continue only if the next page has a
+ * continuation hash of any value. This is used for readdir
+@@ -1445,19 +412,21 @@
+ * desired continuation hash. If it doesn't, return since
+ * there's no point to read in the successive index pages.
+ */
+- dx_get_key(path, p->at, (struct iam_key *)&bhash);
++ iam_get_ikey(path, p->at, (struct iam_ikey *)&bhash);
+ if (start_hash)
+ *start_hash = bhash;
+ if ((hash & 1) == 0) {
+ if ((bhash & ~1) != hash)
+ return 0;
+ }
++ }
+ /*
+ * If the hash is HASH_NB_ALWAYS, we always go to the next
+ * block so no check is necessary
+ */
+ while (num_frames--) {
+- err = path_descr(path)->id_node_read(path->ip_container,
++ err = iam_path_descr(path)->id_ops->
++ id_node_read(path->ip_container,
+ (iam_ptr_t)dx_get_block(path, p->at),
+ NULL, &bh);
+ if (err != 0)
+@@ -1465,12 +434,23 @@
+ ++p;
+ brelse (p->bh);
+ p->bh = bh;
+- p->at = p->entries = dx_node_get_entries(path, p);
+- assert(dx_node_check(path, p));
++ p->entries = dx_node_get_entries(path, p);
++ p->at = iam_entry_shift(path, p->entries, !compat);
++ assert_inv(dx_node_check(path, p));
+ }
+ return 1;
+ }
+
++int iam_index_next(struct iam_container *c, struct iam_path *path)
++{
++ return ext3_htree_advance(c->ic_object, 0, path, NULL, 0);
++}
++
++int ext3_htree_next_block(struct inode *dir, __u32 hash,
++ struct iam_path *path, __u32 *start_hash)
++{
++ return ext3_htree_advance(dir, hash, path, start_hash, 1);
++}
+
+ /*
+ * p is at least 6 bytes before the end of page
+@@ -1662,21 +642,30 @@
+ } while(more);
+ }
+
+-static void dx_insert_block(struct iam_path *path,
+- struct iam_frame *frame, u32 hash, u32 block)
++void iam_insert_key(struct iam_path *path, struct iam_frame *frame,
++ const struct iam_ikey *key, iam_ptr_t ptr)
+ {
+ struct iam_entry *entries = frame->entries;
+- struct iam_entry *old = frame->at, *new = iam_entry_shift(path, old, +1);
++ struct iam_entry *new = iam_entry_shift(path, frame->at, +1);
+ int count = dx_get_count(entries);
+
+- assert(count < dx_get_limit(entries));
+- assert(old < iam_entry_shift(path, entries, count));
++ assert_corr(count < dx_get_limit(entries));
++ assert_corr(frame->at < iam_entry_shift(path, entries, count));
++
+ memmove(iam_entry_shift(path, new, 1), new,
+ (char *)iam_entry_shift(path, entries, count) - (char *)new);
+- dx_set_key(path, new, (struct iam_key *)&hash);
+- dx_set_block(path, new, block);
++ dx_set_ikey(path, new, key);
++ dx_set_block(path, new, ptr);
+ dx_set_count(entries, count + 1);
+ }
++
++void dx_insert_block(struct iam_path *path, struct iam_frame *frame,
++ u32 hash, u32 block)
++{
++ assert_corr(dx_index_is_compat(path));
++ iam_insert_key(path, frame, (struct iam_ikey *)&hash, block);
++}
++
+ #endif
+
+
+@@ -1903,7 +892,8 @@
+ hash = hinfo.hash;
+ do {
+ block = dx_get_block(path, path->ip_frame->at);
+- *err = path_descr(path)->id_node_read(path->ip_container, (iam_ptr_t)block,
++ *err = iam_path_descr(path)->id_ops->id_node_read(path->ip_container,
++ (iam_ptr_t)block,
+ NULL, &bh);
+ if (*err != 0)
+ goto errout;
+@@ -2093,22 +1083,69 @@
+ return prev;
+ }
+
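++/*
++ * Move the upper half of the directory entries from the block in *bh1
++ * into the empty block in *bh2, choosing the split point from the hash
++ * map. On return *bh1 refers to the block that should receive the new
++ * name, *delim_hash holds the delimiting hash for insertion into the
++ * parent index (with the low collision bit set when the split fell
++ * inside a run of equal hashes), and the returned dirent marks the
++ * insertion point.
++ */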
++struct ext3_dir_entry_2 *move_entries(struct inode *dir,
++ struct dx_hash_info *hinfo,
++ struct buffer_head **bh1,
++ struct buffer_head **bh2,
++ __u32 *delim_hash)
++{
++ char *data1;
++ char *data2;
++ unsigned blocksize = dir->i_sb->s_blocksize;
++ unsigned count;
++ unsigned continued;
++ unsigned split;
++ u32 hash2;
++
++ struct dx_map_entry *map;
++ struct ext3_dir_entry_2 *de1;
++ struct ext3_dir_entry_2 *de2;
++
++ data1 = (*bh1)->b_data;
++ data2 = (*bh2)->b_data;
++
++ /* create the map at the end of the data2 block */
++ map = (struct dx_map_entry *) (data2 + blocksize);
++ count = dx_make_map((struct ext3_dir_entry_2 *) data1,
++ blocksize, hinfo, map);
++ map -= count;
++ split = count / 2; /* need to adjust to actual middle */
++ dx_sort_map(map, count);
++ hash2 = map[split].hash;
++ continued = hash2 == map[split - 1].hash;
++ dxtrace(printk("Split block %i at %x, %i/%i\n",
++ dx_get_block(frame->at), hash2, split, count - split));
++
++ /* Fancy dance to stay within two buffers */
++ de2 = dx_move_dirents(data1, data2, map + split, count - split);
++ de1 = dx_pack_dirents(data1, blocksize);
++ de1->rec_len = cpu_to_le16(data1 + blocksize - (char *) de1);
++ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
++ dxtrace(dx_show_leaf(hinfo,
++ (struct ext3_dir_entry_2 *) data1, blocksize, 1));
++ dxtrace(dx_show_leaf(hinfo,
++ (struct ext3_dir_entry_2 *) data2, blocksize, 1));
++
++ /* Which block gets the new entry? */
++ if (hinfo->hash >= hash2) {
++ swap(*bh1, *bh2);
++ de1 = de2;
++ }
++ *delim_hash = hash2 + continued;
++ return de1;
++}
++
+ /* Allocate new node, and split leaf node @bh into it, inserting new pointer
+ * into parent node identified by @frame */
+ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct iam_path *path,
+ struct buffer_head **bh,struct iam_frame *frame,
+ struct dx_hash_info *hinfo, int *error)
+ {
+- struct inode *dir = path_obj(path);
+- unsigned blocksize = dir->i_sb->s_blocksize;
+- unsigned count, continued;
++ struct inode *dir = iam_path_obj(path);
+ struct buffer_head *bh2;
+ u32 newblock;
+ u32 hash2;
+- struct dx_map_entry *map;
+- char *data1 = (*bh)->b_data, *data2;
+- unsigned split;
+- struct ext3_dir_entry_2 *de = NULL, *de2;
++ struct ext3_dir_entry_2 *de = NULL;
+ int err;
+
+ bh2 = ext3_append (handle, dir, &newblock, error);
+@@ -2133,35 +1170,9 @@
+ if (err)
+ goto journal_error;
+
+- data2 = bh2->b_data;
+-
+- /* create map in the end of data2 block */
+- map = (struct dx_map_entry *) (data2 + blocksize);
+- count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
+- blocksize, hinfo, map);
+- map -= count;
+- split = count/2; // need to adjust to actual middle
+- dx_sort_map (map, count);
+- hash2 = map[split].hash;
+- continued = hash2 == map[split - 1].hash;
+- dxtrace(printk("Split block %i at %x, %i/%i\n",
+- dx_get_block(frame->at), hash2, split, count-split));
+-
+- /* Fancy dance to stay within two buffers */
+- de2 = dx_move_dirents(data1, data2, map + split, count - split);
+- de = dx_pack_dirents(data1,blocksize);
+- de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
+- de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
+- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
+- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
++ de = move_entries(dir, hinfo, bh, &bh2, &hash2);
+
+- /* Which block gets the new entry? */
+- if (hinfo->hash >= hash2)
+- {
+- swap(*bh, bh2);
+- de = de2;
+- }
+- dx_insert_block(path, frame, hash2 + continued, newblock);
++ dx_insert_block(path, frame, hash2, newblock);
+ err = ext3_journal_dirty_metadata (handle, bh2);
+ if (err)
+ goto journal_error;
+@@ -2175,6 +1186,63 @@
+ }
+ #endif
+
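++/*
++ * Scan the leaf block @bh of directory @dir for room to insert an entry
++ * of @namelen bytes. Returns the dirent that has enough free space,
++ * ERR_PTR(-EEXIST) if the name is already present, ERR_PTR(-EIO) on a
++ * corrupted entry, or ERR_PTR(-ENOSPC) when the block is full.
++ */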
++struct ext3_dir_entry_2 *find_insertion_point(struct inode *dir,
++ struct buffer_head *bh,
++ const char *name, int namelen)
++{
++ struct ext3_dir_entry_2 *de;
++ char *top;
++ unsigned long offset;
++ int nlen;
++ int rlen;
++ int reclen;
++
++ reclen = EXT3_DIR_REC_LEN(namelen);
++ de = (struct ext3_dir_entry_2 *)bh->b_data;
++ top = bh->b_data + dir->i_sb->s_blocksize - reclen;
++ offset = 0;
++ while ((char *) de <= top) {
++ if (!ext3_check_dir_entry("ext3_add_entry",
++ dir, de, bh, offset))
++ return ERR_PTR(-EIO);
++ if (ext3_match(namelen, name, de))
++ return ERR_PTR(-EEXIST);
++ nlen = EXT3_DIR_REC_LEN(de->name_len);
++ rlen = le16_to_cpu(de->rec_len);
++ if ((de->inode? rlen - nlen: rlen) >= reclen)
++ return de;
++ de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
++ offset += rlen;
++ }
++ return ERR_PTR(-ENOSPC);
++}
++
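++/*
++ * Carve a new dirent out of @de: if @de is in use, trim its rec_len to
++ * its actual size and turn the tail into the new entry; then fill the
++ * result in with @ino, @mode and @name. The caller must have verified,
++ * e.g. via find_insertion_point(), that the free space is sufficient.
++ */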
++struct ext3_dir_entry_2 *split_entry(struct inode *dir,
++ struct ext3_dir_entry_2 *de,
++ unsigned long ino, mode_t mode,
++ const char *name, int namelen)
++{
++ int nlen;
++ int rlen;
++
++ nlen = EXT3_DIR_REC_LEN(de->name_len);
++ rlen = le16_to_cpu(de->rec_len);
++ if (de->inode) {
++ struct ext3_dir_entry_2 *de1;
++
++ de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
++ de1->rec_len = cpu_to_le16(rlen - nlen);
++ de->rec_len = cpu_to_le16(nlen);
++ de = de1;
++ }
++ de->file_type = EXT3_FT_UNKNOWN;
++ de->inode = cpu_to_le32(ino);
++ if (ino != 0)
++ ext3_set_de_type(dir->i_sb, de, mode);
++ de->name_len = namelen;
++ memcpy(de->name, name, namelen);
++ return de;
++}
+
+ /*
+ * Add a new entry into a directory (leaf) block. If de is non-NULL,
+@@ -2194,34 +1262,16 @@
+ struct inode *dir = dentry->d_parent->d_inode;
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+- unsigned long offset = 0;
+- unsigned short reclen;
+- int nlen, rlen, err;
+- char *top;
++ int err;
+
+- reclen = EXT3_DIR_REC_LEN(namelen);
+ if (!de) {
+- de = (struct ext3_dir_entry_2 *)bh->b_data;
+- top = bh->b_data + dir->i_sb->s_blocksize - reclen;
+- while ((char *) de <= top) {
+- if (!ext3_check_dir_entry("ext3_add_entry", dir, de,
+- bh, offset)) {
+- brelse (bh);
+- return -EIO;
+- }
+- if (ext3_match (namelen, name, de)) {
+- brelse (bh);
+- return -EEXIST;
+- }
+- nlen = EXT3_DIR_REC_LEN(de->name_len);
+- rlen = le16_to_cpu(de->rec_len);
+- if ((de->inode? rlen - nlen: rlen) >= reclen)
+- break;
+- de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
+- offset += rlen;
++ de = find_insertion_point(dir, bh, name, namelen);
++ if (IS_ERR(de)) {
++ err = PTR_ERR(de);
++ if (err != -ENOSPC)
++ brelse(bh);
++ return err;
+ }
+- if ((char *) de > top)
+- return -ENOSPC;
+ }
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext3_journal_get_write_access(handle, bh);
+@@ -2232,22 +1282,9 @@
+ }
+
+ /* By now the buffer is marked for journaling */
+- nlen = EXT3_DIR_REC_LEN(de->name_len);
+- rlen = le16_to_cpu(de->rec_len);
+- if (de->inode) {
+- struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
+- de1->rec_len = cpu_to_le16(rlen - nlen);
+- de->rec_len = cpu_to_le16(nlen);
+- de = de1;
+- }
+- de->file_type = EXT3_FT_UNKNOWN;
+- if (inode) {
+- de->inode = cpu_to_le32(inode->i_ino);
+- ext3_set_de_type(dir->i_sb, de, inode->i_mode);
+- } else
+- de->inode = 0;
+- de->name_len = namelen;
+- memcpy (de->name, name, namelen);
++
++ split_entry(dir, de, inode ? inode->i_ino : 0,
++ inode ? inode->i_mode : 0, name, namelen);
+ /*
+ * XXX shouldn't update any times until successful
+ * completion of syscall, but too many callers depend
+@@ -2423,8 +1460,40 @@
+ return add_dirent_to_buf(handle, dentry, inode, de, bh);
+ }
+
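++/*
++ * Move the upper half of the entries of the index node in @frame into
++ * the freshly allocated node at @newblock (whose entry array starts at
++ * @entries2) and insert the pivot ikey into the parent frame. For
++ * non-compat iam formats, @delta reserves one extra slot in the new
++ * node for the explicit left-most key. Returns the number of entries
++ * left in the old node.
++ */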
++static int shift_entries(struct iam_path *path,
++ struct iam_frame *frame, unsigned count,
++ struct iam_entry *entries, struct iam_entry *entries2,
++ u32 newblock)
++{
++ unsigned count1;
++ unsigned count2;
++ int delta;
++
++ struct iam_frame *parent = frame - 1;
++ struct iam_ikey *pivot = iam_path_ikey(path, 3);
++
++ delta = dx_index_is_compat(path) ? 0 : +1;
++
++ count1 = count/2 + delta;
++ count2 = count - count1;
++ iam_get_ikey(path, iam_entry_shift(path, entries, count1), pivot);
++
++ dxtrace(printk("Split index %i/%i\n", count1, count2));
++
++ memcpy((char *) iam_entry_shift(path, entries2, delta),
++ (char *) iam_entry_shift(path, entries, count1),
++ count2 * iam_entry_size(path));
++
++ dx_set_count(entries, count1);
++ dx_set_count(entries2, count2 + delta);
++ dx_set_limit(entries2, dx_node_limit(path));
++
++ iam_insert_key(path, parent, pivot, newblock);
++ return count1;
++}
++
+ #ifdef CONFIG_EXT3_INDEX
+-static int split_index_node(handle_t *handle, struct iam_path *path)
++int split_index_node(handle_t *handle, struct iam_path *path)
+ {
+
+ struct iam_entry *entries; /* old block contents */
+@@ -2432,10 +1501,17 @@
+ struct iam_frame *frame, *safe;
+ struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0};
+ u32 newblock[DX_MAX_TREE_HEIGHT] = {0};
+- struct inode *dir = path_obj(path);
++ struct inode *dir = iam_path_obj(path);
++ struct iam_descr *descr;
+ int nr_splet;
+ int i, err;
+
++ descr = iam_path_descr(path);
++ /*
++ * The node-splitting algorithm below depends on the root having
++ * strictly smaller entry capacity than interior nodes.
++ */
++ assert_corr(dx_root_limit(path) < dx_node_limit(path));
++
+ frame = path->ip_frame;
+ entries = frame->entries;
+
+@@ -2474,7 +1550,8 @@
+ for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
+ bh_new[i] = ext3_append (handle, dir, &newblock[i], &err);
+ if (!bh_new[i] ||
+- path_descr(path)->id_node_init(path->ip_container, bh_new[i], 0) != 0)
++ descr->id_ops->id_node_init(path->ip_container,
++ bh_new[i], 0) != 0)
+ goto cleanup;
+ BUFFER_TRACE(frame->bh, "get_write_access");
+ err = ext3_journal_get_write_access(handle, frame->bh);
+@@ -2493,6 +1570,7 @@
+ unsigned count;
+ int idx;
+ struct buffer_head *bh2;
++ struct buffer_head *bh;
+
+ entries = frame->entries;
+ count = dx_get_count(entries);
+@@ -2501,6 +1579,7 @@
+ bh2 = bh_new[i];
+ entries2 = dx_get_entries(path, bh2->b_data, 0);
+
++ bh = frame->bh;
+ if (frame == path->ip_frames) {
+ /* splitting root node. Tricky point:
+ *
+@@ -2512,22 +1591,20 @@
+ * capacity of the root node is smaller than that of
+ * non-root one.
+ */
+- struct dx_root *root;
+- u8 indirects;
+ struct iam_frame *frames;
++ struct iam_entry *next;
++
++ assert_corr(i == 0);
+
+ frames = path->ip_frames;
+- root = (struct dx_root *) frames->bh->b_data;
+- indirects = root->info.indirect_levels;
+- dxtrace(printk("Creating new root %d\n", indirects));
+ memcpy((char *) entries2, (char *) entries,
+ count * iam_entry_size(path));
+ dx_set_limit(entries2, dx_node_limit(path));
+
+ /* Set up root */
+- dx_set_count(entries, 1);
+- dx_set_block(path, entries, newblock[i]);
+- root->info.indirect_levels = indirects + 1;
++ next = descr->id_ops->id_root_inc(path->ip_container,
++ path, frame);
++ dx_set_block(path, next, newblock[0]);
+
+ /* Shift frames in the path */
+ memmove(frames + 2, frames + 1,
+@@ -2536,49 +1613,61 @@
+ frames[1].at = iam_entry_shift(path, entries2, idx);
+ frames[1].entries = entries = entries2;
+ frames[1].bh = bh2;
+- assert(dx_node_check(path, frame));
++ assert_inv(dx_node_check(path, frame));
++ ++ path->ip_frame;
+ ++ frame;
+- assert(dx_node_check(path, frame));
+- bh_new[i] = NULL; /* buffer head is "consumed" */
++ assert_inv(dx_node_check(path, frame));
++ bh_new[0] = NULL; /* buffer head is "consumed" */
+ err = ext3_journal_get_write_access(handle, bh2);
+ if (err)
+ goto journal_error;
+ } else {
+ /* splitting non-root index node. */
+- unsigned count1 = count/2, count2 = count - count1;
+- unsigned hash2;
+-
+- dx_get_key(path,
+- iam_entry_shift(path, entries, count1),
+- (struct iam_key *)&hash2);
+-
+- dxtrace(printk("Split index %i/%i\n", count1, count2));
+-
+- memcpy ((char *) entries2,
+- (char *) iam_entry_shift(path, entries, count1),
+- count2 * iam_entry_size(path));
+- dx_set_count (entries, count1);
+- dx_set_count (entries2, count2);
+- dx_set_limit (entries2, dx_node_limit(path));
++ struct iam_frame *parent = frame - 1;
+
++ count = shift_entries(path, frame, count,
++ entries, entries2, newblock[i]);
+ /* Which index block gets the new entry? */
+- if (idx >= count1) {
++ if (idx >= count) {
++ int d = dx_index_is_compat(path) ? 0 : +1;
++
+ frame->at = iam_entry_shift(path, entries2,
+- idx - count1);
++ idx - count + d);
+ frame->entries = entries = entries2;
+ swap(frame->bh, bh2);
+ bh_new[i] = bh2;
++ parent->at = iam_entry_shift(path,
++ parent->at, +1);
+ }
+- dx_insert_block(path, frame - 1, hash2, newblock[i]);
+- assert(dx_node_check(path, frame));
+- assert(dx_node_check(path, frame - 1));
++ assert_inv(dx_node_check(path, frame));
++ assert_inv(dx_node_check(path, parent));
+ dxtrace(dx_show_index ("node", frame->entries));
+ dxtrace(dx_show_index ("node",
+ ((struct dx_node *) bh2->b_data)->entries));
+ err = ext3_journal_dirty_metadata(handle, bh2);
+ if (err)
+ goto journal_error;
++ err = ext3_journal_dirty_metadata(handle, parent->bh);
++ if (err)
++ goto journal_error;
++ }
++ err = ext3_journal_dirty_metadata(handle, bh);
++ if (err)
++ goto journal_error;
++ /*
++ * This function was called to make insertion of new leaf
++ * possible. Check that it fulfilled its obligations.
++ */
++ assert_corr(dx_get_count(path->ip_frame->entries) <
++ dx_get_limit(path->ip_frame->entries));
+ }
++ if (nr_splet > 0) {
++ /*
++ * Log ->i_size modification.
++ */
++ err = ext3_mark_inode_dirty(handle, dir);
++ if (err)
++ goto journal_error;
+ }
+ goto cleanup;
+ journal_error:
+@@ -2610,7 +1699,7 @@
+ size_t isize;
+
+ iam_path_compat_init(&cpath, dir);
+- param = path_descr(path);
++ param = iam_path_descr(path);
+
+ err = dx_probe(dentry, NULL, &hinfo, path);
+ if (err != 0)
+@@ -2620,7 +1709,8 @@
+ /* XXX nikita: global serialization! */
+ isize = dir->i_size;
+
+- err = param->id_node_read(path->ip_container, (iam_ptr_t)dx_get_block(path, frame->at),
++ err = param->id_ops->id_node_read(path->ip_container,
++ (iam_ptr_t)dx_get_block(path, frame->at),
+ handle, &bh);
+ if (err != 0)
+ goto cleanup;
+@@ -2641,11 +1731,11 @@
+ goto cleanup;
+
+ /*copy split inode too*/
+- de = do_split(handle, path, &bh, --frame, &hinfo, &err);
++ de = do_split(handle, path, &bh, path->ip_frame, &hinfo, &err);
+ if (!de)
+ goto cleanup;
+
+- assert(dx_node_check(path, frame));
++ assert_inv(dx_node_check(path, frame));
+ err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ goto cleanup2;
+
+@@ -2752,6 +1842,26 @@
+ return ext3_new_inode(handle, dir, mode, inum);
+ }
+
++struct inode *ext3_create_inode(handle_t *handle, struct inode * dir, int mode)
++{
++ struct inode *inode;
++
++ inode = ext3_new_inode(handle, dir, mode, 0);
++ if (!IS_ERR(inode)) {
++ if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode)) {
++#ifdef CONFIG_LDISKFS_FS_XATTR
++ inode->i_op = &ext3_special_inode_operations;
++#endif
++ } else {
++ inode->i_op = &ext3_file_inode_operations;
++ inode->i_fop = &ext3_file_operations;
++ ext3_set_aops(inode);
++ }
++ }
++ return inode;
++}
++EXPORT_SYMBOL(ext3_create_inode);
++
+ /*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
+Index: iam/fs/ext3/Makefile
+===================================================================
+--- iam.orig/fs/ext3/Makefile 2007-05-23 11:18:11.000000000 +0800
++++ iam/fs/ext3/Makefile 2007-05-23 11:18:20.000000000 +0800
+@@ -6,7 +6,7 @@
+
+ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+ ioctl.o namei.o super.o symlink.o hash.o resize.o \
+- extents.o mballoc.o
++ extents.o mballoc.o iam.o iam_lfix.o
+
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
+Index: iam/fs/ext3/iam_lvar.c
+===================================================================
+--- iam.orig/fs/ext3/iam_lvar.c 2007-05-23 09:56:30.476305206 +0800
++++ iam/fs/ext3/iam_lvar.c 2007-05-23 11:19:15.000000000 +0800
+@@ -0,0 +1,1080 @@
++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++ * vim:expandtab:shiftwidth=8:tabstop=8:
++ *
++ * iam_lvar.c
++ * implementation of iam format for fixed size records, variable sized keys.
++ *
++ * Copyright (c) 2006 Cluster File Systems, Inc.
++ * Author: Nikita Danilov <nikita@clusterfs.com>
++ *
++ * This file is part of the Lustre file system, http://www.lustre.org
++ * Lustre is a trademark of Cluster File Systems, Inc.
++ *
++ * You may have signed or agreed to another license before downloading
++ * this software. If so, you are bound by the terms and conditions
++ * of that agreement, and the following does not apply to you. See the
++ * LICENSE file included with this distribution for more information.
++ *
++ * If you did not agree to a different license, then this copy of Lustre
++ * is open source software; you can redistribute it and/or modify it
++ * under the terms of version 2 of the GNU General Public License as
++ * published by the Free Software Foundation.
++ *
++ * In either case, Lustre is distributed in the hope that it will be
++ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
++ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * license text for more details.
++ */
++
++#include <linux/types.h>
++#include <linux/jbd.h>
++/* ext3_error() */
++#include <linux/ext3_fs.h>
++
++#include <linux/lustre_iam.h>
++
++#include <libcfs/libcfs.h>
++#include <libcfs/kp30.h>
++
++/*
++ * Leaf operations.
++ */
++
++enum {
++ IAM_LVAR_LEAF_MAGIC = 0x1973 /* This is duplicated in
++ * lustre/utils/create_iam.c */
++};
++
++/* This is duplicated in lustre/utils/create_iam.c */
++struct lvar_leaf_header {
++ __le16 vlh_magic; /* magic number IAM_LVAR_LEAF_MAGIC */
++ __le16 vlh_used; /* used bytes, including header */
++};
++
++/*
++ * Format of leaf entry:
++ *
++ * __le32 hash
++ * __le16 keysize
++ * u8 key[keysize]
++ * u8 record[rec_size]
++ *
++ * Entries are ordered by hash value.
++ */
++
++/* This is duplicated in lustre/utils/create_iam.c */
++typedef __u32 lvar_hash_t;
++
++/* This is duplicated in lustre/utils/create_iam.c */
++struct lvar_leaf_entry {
++ __le32 vle_hash;
++ __le16 vle_keysize;
++ u8 vle_key[0];
++};
++
++#define PDIFF(ptr0, ptr1) (((char *)(ptr0)) - ((char *)(ptr1)))
++
++
++static inline int blocksize(const struct iam_leaf *leaf)
++{
++ return iam_leaf_container(leaf)->ic_object->i_sb->s_blocksize;
++}
++
++static inline const char *kchar(const struct iam_key *key)
++{
++ return (void *)key;
++}
++
++static inline struct iam_lentry *lvar_lentry(const struct lvar_leaf_entry *ent)
++{
++ return (struct iam_lentry *)ent;
++}
++
++static inline struct lvar_leaf_entry *lentry_lvar(const struct iam_lentry *lent)
++{
++ return (struct lvar_leaf_entry *)lent;
++}
++
++
++static inline int e_keysize(const struct lvar_leaf_entry *ent)
++{
++ return le16_to_cpu(ent->vle_keysize);
++}
++
++/* This is duplicated in lustre/utils/create_iam.c */
++enum {
++ LVAR_PAD = 4,
++ LVAR_ROUND = LVAR_PAD - 1
++};
++
++static inline int getsize(const struct iam_leaf *leaf, int namelen, int recsize)
++{
++ CLASSERT(!(LVAR_PAD & (LVAR_PAD - 1)));
++
++ return (offsetof(struct lvar_leaf_entry, vle_key) +
++ namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND;
++}
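++/*
++ * Example: with vle_key at offset 6, a 3-byte name and an 8-byte record
++ * give 6 + 3 + 8 = 17 bytes, rounded up to 20 by the LVAR_PAD alignment.
++ */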
++
++static inline int rec_size(const struct iam_rec *rec)
++{
++ return *(const char *)rec;
++}
++
++static inline struct iam_rec *e_rec(const struct lvar_leaf_entry *ent)
++{
++ return ((void *)ent) +
++ offsetof(struct lvar_leaf_entry, vle_key) + e_keysize(ent);
++}
++
++static inline int e_size(const struct iam_leaf *leaf,
++ const struct lvar_leaf_entry *ent)
++{
++ return getsize(leaf, e_keysize(ent), rec_size(e_rec(ent)));
++}
++
++static inline char *e_char(const struct lvar_leaf_entry *ent)
++{
++ return (char *)&ent->vle_key;
++}
++
++static inline struct iam_key *e_key(const struct lvar_leaf_entry *ent)
++{
++ return (struct iam_key *)e_char(ent);
++}
++
++static inline lvar_hash_t e_hash(const struct lvar_leaf_entry *ent)
++{
++ return le32_to_cpu(ent->vle_hash);
++}
++
++static void e_print(const struct lvar_leaf_entry *ent)
++{
++ printk(" %p %8.8x \"%*.*s\"\n", ent, e_hash(ent),
++ e_keysize(ent), e_keysize(ent), e_char(ent));
++}
++#if 0
++static int e_check(const struct iam_leaf *leaf,
++ const struct lvar_leaf_entry *ent)
++{
++ const void *point = ent;
++ const void *start = leaf->il_bh->b_data;
++ return
++ start + sizeof(struct lvar_leaf_header) <= point &&
++ point + e_size(leaf, ent) < start + blocksize(leaf);
++}
++#endif
++
++static inline struct lvar_leaf_entry *e_next(const struct iam_leaf *leaf,
++ const struct lvar_leaf_entry *ent)
++{
++ return ((void *)ent) + e_size(leaf, ent);
++}
++
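++/*
++ * Compile-time selection of the name hash used by the lvar format;
++ * exactly one of the flags below is expected to be non-zero (TEA in
++ * this version).
++ */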
++#define LVAR_HASH_SANDWICH (0)
++#define LVAR_HASH_TEA (1)
++#define LVAR_HASH_R5 (0)
++#define LVAR_HASH_PREFIX (0)
++
++static __u32 hash_build0(const char *name, int namelen)
++{
++ __u32 result;
++
++ if (namelen == 0)
++ return 0;
++ if (strncmp(name, ".", 1) == 0 && namelen == 1)
++ return 1;
++ if (strncmp(name, "..", 2) == 0 && namelen == 2)
++ return 2;
++
++ if (LVAR_HASH_PREFIX) {
++ result = 0;
++ strncpy((void *)&result,
++ name, min(namelen, (int)sizeof result));
++ } else {
++ struct dx_hash_info hinfo;
++
++ if (LVAR_HASH_TEA)
++ hinfo.hash_version = DX_HASH_TEA;
++ else
++ hinfo.hash_version = DX_HASH_R5;
++ hinfo.seed = 0;
++ ext3fs_dirhash(name, namelen, &hinfo);
++ result = hinfo.hash;
++ if (LVAR_HASH_SANDWICH) {
++ __u32 result2;
++
++ hinfo.hash_version = DX_HASH_TEA;
++ hinfo.seed = 0;
++ ext3fs_dirhash(name, namelen, &hinfo);
++ result2 = hinfo.hash;
++ result = (0xfc000000 & result2) | (0x03ffffff & result);
++ }
++ }
++ return result;
++}
++
++enum {
++ HASH_GRAY_AREA = 1024,
++ MAX_HASH_SIZE = 0x7fffffffUL
++};
++
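++/*
++ * The raw hash is shifted left by one bit, keeping the least
++ * significant bit free as a collision marker for index keys; values
++ * landing in the topmost HASH_GRAY_AREA of the range are folded back
++ * into small hashes, apparently to reserve the top of the hash space
++ * for iteration sentinels.
++ */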
++static __u32 hash_build(const char *name, int namelen)
++{
++ __u32 hash;
++
++ hash = (hash_build0(name, namelen) << 1) & MAX_HASH_SIZE;
++ if (hash > MAX_HASH_SIZE - HASH_GRAY_AREA)
++ hash &= HASH_GRAY_AREA - 1;
++ return hash;
++}
++
++static inline lvar_hash_t get_hash(const struct iam_container *bag,
++ const char *name, int namelen)
++{
++ return hash_build(name, namelen);
++}
++
++static inline int e_eq(const struct lvar_leaf_entry *ent,
++ const char *name, int namelen)
++{
++ return namelen == e_keysize(ent) && !memcmp(e_char(ent), name, namelen);
++}
++
++static inline int e_cmp(const struct iam_leaf *leaf,
++ const struct lvar_leaf_entry *ent, lvar_hash_t hash)
++{
++ lvar_hash_t ehash;
++
++ ehash = e_hash(ent);
++ return ehash == hash ? 0 : (ehash < hash ? -1 : +1);
++}
++
++static struct lvar_leaf_header *n_head(const struct iam_leaf *l)
++{
++ return (struct lvar_leaf_header *)l->il_bh->b_data;
++}
++
++static int h_used(const struct lvar_leaf_header *hdr)
++{
++ return le16_to_cpu(hdr->vlh_used);
++}
++
++static void h_used_adj(const struct iam_leaf *leaf,
++ struct lvar_leaf_header *hdr, int adj)
++{
++ int used;
++
++ used = h_used(hdr) + adj;
++ assert_corr(sizeof *hdr <= used && used <= blocksize(leaf));
++ hdr->vlh_used = cpu_to_le16(used);
++}
++
++static struct lvar_leaf_entry *n_start(const struct iam_leaf *leaf)
++{
++ return (void *)leaf->il_bh->b_data + sizeof(struct lvar_leaf_header);
++}
++
++static struct lvar_leaf_entry *n_end(const struct iam_leaf *l)
++{
++ return (void *)l->il_bh->b_data + h_used(n_head(l));
++}
++
++static struct lvar_leaf_entry *n_cur(const struct iam_leaf *l)
++{
++ return lentry_lvar(l->il_at);
++}
++
++void n_print(const struct iam_leaf *l)
++{
++ struct lvar_leaf_entry *scan;
++
++ printk(KERN_EMERG "used: %d\n", h_used(n_head(l)));
++ for (scan = n_start(l); scan < n_end(l); scan = e_next(l, scan))
++ e_print(scan);
++}
++
++#if EXT3_CORRECTNESS_ON
++static int n_at_rec(const struct iam_leaf *folio)
++{
++ return
++ n_start(folio) <= lentry_lvar(folio->il_at) &&
++ lentry_lvar(folio->il_at) < n_end(folio);
++}
++
++#if EXT3_INVARIANT_ON
++static int n_invariant(const struct iam_leaf *leaf)
++{
++ struct iam_path *path;
++ struct lvar_leaf_entry *scan;
++ struct lvar_leaf_entry *end;
++ lvar_hash_t hash;
++ lvar_hash_t nexthash;
++ lvar_hash_t starthash;
++
++ end = n_end(leaf);
++ hash = 0;
++ path = leaf->il_path;
++
++ if (h_used(n_head(leaf)) > blocksize(leaf))
++ return 0;
++
++ /*
++ * Delimiting key in the parent index node. Clear least bit to account
++ * for hash collision marker.
++ */
++ starthash = *(lvar_hash_t *)iam_ikey_at(path, path->ip_frame->at) & ~1;
++ for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
++ nexthash = e_hash(scan);
++ if (nexthash != get_hash(iam_leaf_container(leaf),
++ e_char(scan), e_keysize(scan))) {
++ BREAKPOINT();
++ return 0;
++ }
++ if (0 && nexthash < starthash) {
++ /*
++ * Unfortunately this useful invariant cannot be
++ * reliably checked as parent node is nor necessarily
++ * locked.
++ */
++ n_print(leaf);
++ printk("%#x < %#x\n", nexthash, starthash);
++ dump_stack();
++ return 0;
++ }
++ if (nexthash < hash) {
++ BREAKPOINT();
++ return 0;
++ }
++ hash = nexthash;
++ }
++ if (scan != end) {
++ BREAKPOINT();
++ return 0;
++ }
++ return 1;
++}
++/* EXT3_INVARIANT_ON */
++#endif
++
++/* EXT3_CORRECTNESS_ON */
++#endif
++
++static struct iam_ikey *lvar_ikey(const struct iam_leaf *l,
++ struct iam_ikey *key)
++{
++ lvar_hash_t *hash;
++
++ assert_corr(n_at_rec(l));
++
++ hash = (void *)key;
++ *hash = e_hash(n_cur(l));
++ return key;
++}
++
++static struct iam_key *lvar_key(const struct iam_leaf *l)
++{
++ return e_key(n_cur(l));
++}
++
++static int lvar_key_size(const struct iam_leaf *l)
++{
++ return e_keysize(n_cur(l));
++}
++
++static void lvar_start(struct iam_leaf *l)
++{
++ l->il_at = lvar_lentry(n_start(l));
++}
++
++static int lvar_init(struct iam_leaf *l)
++{
++ int result;
++ int used;
++ struct lvar_leaf_header *head;
++
++ assert_corr(l->il_bh != NULL);
++
++ head = n_head(l);
++ used = h_used(head);
++ if (head->vlh_magic == cpu_to_le16(IAM_LVAR_LEAF_MAGIC) &&
++ used <= blocksize(l)) {
++ l->il_at = l->il_entries = lvar_lentry(n_start(l));
++ result = 0;
++ } else {
++ struct inode *obj;
++
++ obj = iam_leaf_container(l)->ic_object;
++ ext3_error(obj->i_sb, __FUNCTION__,
++ "Wrong magic in node %llu (#%lu): %#x != %#x or "
++ "wrong used: %i",
++ (unsigned long long)l->il_bh->b_blocknr, obj->i_ino,
++ le16_to_cpu(head->vlh_magic), IAM_LVAR_LEAF_MAGIC,
++ used);
++ result = -EIO;
++ }
++ return result;
++}
++
++static void lvar_fini(struct iam_leaf *l)
++{
++ l->il_entries = l->il_at = NULL;
++}
++
++struct iam_rec *lvar_rec(const struct iam_leaf *l)
++{
++ assert_corr(n_at_rec(l));
++ return e_rec(n_cur(l));
++}
++
++static void lvar_next(struct iam_leaf *l)
++{
++ assert_corr(n_at_rec(l));
++ assert_corr(iam_leaf_is_locked(l));
++ l->il_at = lvar_lentry(e_next(l, n_cur(l)));
++}
++
++static int lvar_lookup(struct iam_leaf *leaf, const struct iam_key *k)
++{
++ struct lvar_leaf_entry *found;
++ struct lvar_leaf_entry *scan;
++ struct lvar_leaf_entry *end;
++ int result;
++ const char *name;
++ int namelen;
++ int found_equal;
++ lvar_hash_t hash;
++ int last;
++
++ assert_inv(n_invariant(leaf));
++ end = n_end(leaf);
++
++ name = kchar(k);
++ namelen = strlen(name);
++ hash = get_hash(iam_leaf_container(leaf), name, namelen);
++ found = NULL;
++ found_equal = 0;
++ last = 1;
++
++ for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
++ lvar_hash_t scan_hash;
++
++ scan_hash = e_hash(scan);
++ if (scan_hash < hash)
++ found = scan;
++ else if (scan_hash == hash) {
++ if (e_eq(scan, name, namelen)) {
++ /*
++ * perfect match
++ */
++ leaf->il_at = lvar_lentry(scan);
++ return IAM_LOOKUP_EXACT;
++ } else if (!found_equal) {
++ found = scan;
++ found_equal = 1;
++ }
++ } else {
++ last = 0;
++ break;
++ }
++ }
++ if (found == NULL) {
++ /*
++ * @k is less than all hashes in the leaf.
++ */
++ lvar_start(leaf);
++ result = IAM_LOOKUP_BEFORE;
++ } else {
++ leaf->il_at = lvar_lentry(found);
++ result = IAM_LOOKUP_OK;
++ assert_corr(n_at_rec(leaf));
++ }
++ if (last)
++ result |= IAM_LOOKUP_LAST;
++ assert_inv(n_invariant(leaf));
++
++ return result;
++}
++
++static int lvar_ilookup(struct iam_leaf *leaf, const struct iam_ikey *ik)
++{
++ struct lvar_leaf_entry *scan;
++ struct lvar_leaf_entry *end;
++ lvar_hash_t hash;
++
++ assert_inv(n_invariant(leaf));
++ end = n_end(leaf);
++ hash = *(const lvar_hash_t *)ik;
++
++ lvar_start(leaf);
++ for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
++ lvar_hash_t scan_hash;
++
++ scan_hash = e_hash(scan);
++ if (scan_hash > hash)
++ return scan == n_start(leaf) ?
++ IAM_LOOKUP_BEFORE : IAM_LOOKUP_OK;
++ leaf->il_at = lvar_lentry(scan);
++ if (scan_hash == hash)
++ return IAM_LOOKUP_EXACT;
++ }
++ assert_inv(n_invariant(leaf));
++ /*
++ * @ik is greater than any key in the node. Return last record in the
++ * node.
++ */
++ return IAM_LOOKUP_OK;
++}
++
++static void __lvar_key_set(struct iam_leaf *l, const struct iam_key *k)
++{
++ memcpy(e_key(n_cur(l)), k, e_keysize(n_cur(l)));
++}
++
++static void lvar_key_set(struct iam_leaf *l, const struct iam_key *k)
++{
++ assert_corr(n_at_rec(l));
++ assert_corr(strlen(kchar(k)) == e_keysize(n_cur(l)));
++ assert_corr(iam_leaf_is_locked(l));
++ __lvar_key_set(l, k);
++ assert_inv(n_invariant(l));
++}
++
++static int lvar_key_cmp(const struct iam_leaf *l, const struct iam_key *k)
++{
++ lvar_hash_t hash;
++ const char *name;
++
++ name = kchar(k);
++
++ hash = get_hash(iam_leaf_container(l), name, strlen(name));
++ return e_cmp(l, n_cur(l), hash);
++}
++
++static int lvar_key_eq(const struct iam_leaf *l, const struct iam_key *k)
++{
++ const char *name;
++
++ name = kchar(k);
++ return e_eq(n_cur(l), name, strlen(name));
++}
++
++static void __lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r)
++{
++ memcpy(e_rec(n_cur(l)), r, rec_size(r));
++}
++
++static void lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r)
++{
++ assert_corr(n_at_rec(l));
++ assert_corr(iam_leaf_is_locked(l));
++ __lvar_rec_set(l, r);
++ assert_inv(n_invariant(l));
++}
++
++static void lvar_rec_get(const struct iam_leaf *l, struct iam_rec *r)
++{
++ struct iam_rec *rec;
++
++ rec = e_rec(n_cur(l));
++ assert_corr(n_at_rec(l));
++ assert_corr(iam_leaf_is_locked(l));
++ memcpy(r, rec, rec_size(rec));
++ assert_inv(n_invariant(l));
++}
++
++static int lvar_can_add(const struct iam_leaf *l,
++ const struct iam_key *k, const struct iam_rec *r)
++{
++ assert_corr(iam_leaf_is_locked(l));
++ return
++ h_used(n_head(l)) +
++ getsize(l, strlen(kchar(k)), rec_size(r)) <= blocksize(l);
++}
++
++static int lvar_at_end(const struct iam_leaf *folio)
++{
++ assert_corr(iam_leaf_is_locked(folio));
++ return n_cur(folio) == n_end(folio);
++}
++
++static void lvar_rec_add(struct iam_leaf *leaf,
++ const struct iam_key *k, const struct iam_rec *r)
++{
++ const char *key;
++ int ksize;
++ int shift;
++ void *end;
++ void *start;
++ ptrdiff_t diff;
++
++ assert_corr(lvar_can_add(leaf, k, r));
++ assert_inv(n_invariant(leaf));
++ assert_corr(iam_leaf_is_locked(leaf));
++
++ key = kchar(k);
++ ksize = strlen(key);
++ shift = getsize(leaf, ksize, rec_size(r));
++
++ if (!lvar_at_end(leaf)) {
++ assert_corr(n_cur(leaf) < n_end(leaf));
++ end = n_end(leaf);
++ if (lvar_key_cmp(leaf, k) <= 0)
++ lvar_next(leaf);
++ else
++ /*
++ * Another exceptional case: insertion with the key
++ * less than least key in the leaf.
++ */
++ assert_corr(leaf->il_at == leaf->il_entries);
++
++ start = leaf->il_at;
++ diff = PDIFF(end, start);
++ assert_corr(diff >= 0);
++ memmove(start + shift, start, diff);
++ }
++ h_used_adj(leaf, n_head(leaf), shift);
++ n_cur(leaf)->vle_keysize = cpu_to_le16(ksize);
++ n_cur(leaf)->vle_hash = cpu_to_le32(get_hash(iam_leaf_container(leaf),
++ key, ksize));
++ __lvar_key_set(leaf, k);
++ __lvar_rec_set(leaf, r);
++ assert_corr(n_at_rec(leaf));
++ assert_inv(n_invariant(leaf));
++}
++
++static void lvar_rec_del(struct iam_leaf *leaf, int shift)
++{
++ void *next;
++ void *end;
++ int nob;
++
++ assert_corr(n_at_rec(leaf));
++ assert_inv(n_invariant(leaf));
++ assert_corr(iam_leaf_is_locked(leaf));
++
++ end = n_end(leaf);
++ next = e_next(leaf, n_cur(leaf));
++ nob = e_size(leaf, n_cur(leaf));
++ memmove(leaf->il_at, next, end - next);
++ h_used_adj(leaf, n_head(leaf), -nob);
++ assert_inv(n_invariant(leaf));
++}
++
++static void lvar_init_new(struct iam_container *c, struct buffer_head *bh)
++{
++ struct lvar_leaf_header *hdr;
++
++ hdr = (struct lvar_leaf_header *)bh->b_data;
++ hdr->vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC);
++ hdr->vlh_used = sizeof *hdr;
++}
++
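++/*
++ * Find the split point of a leaf: returns the first entry whose byte
++ * offset exceeds half of the block, and sets *prev to the last entry
++ * that stays in the original leaf.
++ */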
++static struct lvar_leaf_entry *find_pivot(const struct iam_leaf *leaf,
++ struct lvar_leaf_entry **prev)
++{
++ void *scan;
++ void *start;
++ int threshold;
++
++ *prev = NULL;
++ threshold = blocksize(leaf) / 2;
++ for (scan = start = n_start(leaf); scan - start <= threshold;
++ *prev = scan, scan = e_next(leaf, scan)) {
++ ;
++ }
++ return scan;
++}
++
++static void lvar_split(struct iam_leaf *leaf, struct buffer_head **bh,
++ iam_ptr_t new_blknr)
++{
++ struct lvar_leaf_entry *first_to_move;
++ struct lvar_leaf_entry *last_to_stay;
++ struct iam_path *path;
++ struct lvar_leaf_header *hdr;
++ struct buffer_head *new_leaf;
++
++ ptrdiff_t tomove;
++ lvar_hash_t hash;
++
++ assert_inv(n_invariant(leaf));
++ assert_corr(iam_leaf_is_locked(leaf));
++
++ new_leaf = *bh;
++ path = iam_leaf_path(leaf);
++
++ hdr = (void *)new_leaf->b_data;
++
++ first_to_move = find_pivot(leaf, &last_to_stay);
++ assert_corr(last_to_stay != NULL);
++ assert_corr(e_next(leaf, last_to_stay) == first_to_move);
++
++ hash = e_hash(first_to_move);
++ if (hash == e_hash(last_to_stay))
++ /*
++ * Duplicate hash.
++ */
++ hash |= 1;
++
++ tomove = PDIFF(n_end(leaf), first_to_move);
++ memmove(hdr + 1, first_to_move, tomove);
++
++ h_used_adj(leaf, hdr, tomove);
++ h_used_adj(leaf, n_head(leaf), -tomove);
++
++ assert_corr(n_end(leaf) == first_to_move);
++
++ if (n_cur(leaf) >= first_to_move) {
++ /*
++ * insertion point moves into new leaf.
++ */
++ ptrdiff_t shift;
++ int result;
++
++ shift = PDIFF(leaf->il_at, first_to_move);
++ *bh = leaf->il_bh;
++ leaf->il_bh = new_leaf;
++ leaf->il_curidx = new_blknr;
++
++ assert_corr(iam_leaf_is_locked(leaf));
++ result = lvar_init(leaf);
++ /*
++ * init cannot fail, as node was just initialized.
++ */
++ assert_corr(result == 0);
++ leaf->il_at = ((void *)leaf->il_at) + shift;
++ }
++ /*
++ * Insert pointer to the new node (together with the least key in
++ * the node) into index node.
++ */
++ iam_insert_key_lock(path, path->ip_frame, (struct iam_ikey *)&hash,
++ new_blknr);
++ assert_corr(n_cur(leaf) < n_end(leaf));
++ assert_inv(n_invariant(leaf));
++}
++
++static struct iam_leaf_operations lvar_leaf_ops = {
++ .init = lvar_init,
++ .init_new = lvar_init_new,
++ .fini = lvar_fini,
++ .start = lvar_start,
++ .next = lvar_next,
++ .key = lvar_key,
++ .ikey = lvar_ikey,
++ .rec = lvar_rec,
++ .key_set = lvar_key_set,
++ .key_cmp = lvar_key_cmp,
++ .key_eq = lvar_key_eq,
++ .key_size = lvar_key_size,
++ .rec_set = lvar_rec_set,
++ .rec_get = lvar_rec_get,
++ .lookup = lvar_lookup,
++ .ilookup = lvar_ilookup,
++ .at_end = lvar_at_end,
++ .rec_add = lvar_rec_add,
++ .rec_del = lvar_rec_del,
++ .can_add = lvar_can_add,
++ .split = lvar_split
++};
++
++/*
++ * Index operations.
++ */
++
++enum {
++ /* This is duplicated in lustre/utils/create_iam.c */
++ /* egrep -i '^o?x?[olabcdef]*$' /usr/share/dict/words */
++ IAM_LVAR_ROOT_MAGIC = 0xb01dface
++};
++
++/* This is duplicated in lustre/utils/create_iam.c */
++struct lvar_root {
++ __le32 vr_magic;
++ __le16 vr_recsize;
++ __le16 vr_ptrsize;
++ u8 vr_indirect_levels;
++ u8 vr_padding0;
++ __le16 vr_padding1;
++};
++
++static __u32 lvar_root_ptr(struct iam_container *c)
++{
++ return 0;
++}
++
++static int lvar_node_init(struct iam_container *c, struct buffer_head *bh,
++ int root)
++{
++ return 0;
++}
++
++static struct iam_entry *lvar_root_inc(struct iam_container *c,
++ struct iam_path *path,
++ struct iam_frame *frame)
++{
++ struct lvar_root *root;
++ struct iam_entry *entries;
++
++ assert_corr(iam_frame_is_locked(path, frame));
++ entries = frame->entries;
++
++ dx_set_count(entries, 2);
++ assert_corr(dx_get_limit(entries) == dx_root_limit(path));
++
++ root = (void *)frame->bh->b_data;
++ assert_corr(le32_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC);
++ root->vr_indirect_levels ++;
++ frame->at = entries = iam_entry_shift(path, entries, 1);
++ memset(iam_ikey_at(path, entries), 0,
++ iam_path_descr(path)->id_ikey_size);
++ return entries;
++}
++
++static int lvar_node_check(struct iam_path *path, struct iam_frame *frame)
++{
++ unsigned count;
++ unsigned limit;
++ unsigned limit_correct;
++ struct iam_entry *entries;
++
++ entries = dx_node_get_entries(path, frame);
++
++ if (frame == path->ip_frames) {
++ struct lvar_root *root;
++
++ root = (void *)frame->bh->b_data;
++ if (le32_to_cpu(root->vr_magic) != IAM_LVAR_ROOT_MAGIC)
++ return -EIO;
++ limit_correct = dx_root_limit(path);
++ } else
++ limit_correct = dx_node_limit(path);
++ count = dx_get_count(entries);
++ limit = dx_get_limit(entries);
++ if (count > limit)
++ return -EIO;
++ if (limit != limit_correct)
++ return -EIO;
++ return 0;
++}
++
++static int lvar_node_load(struct iam_path *path, struct iam_frame *frame)
++{
++ struct iam_entry *entries;
++ void *data;
++ entries = dx_node_get_entries(path, frame);
++
++ data = frame->bh->b_data;
++
++ if (frame == path->ip_frames) {
++ struct lvar_root *root;
++ const char *name;
++
++ root = data;
++ name = kchar(path->ip_key_target);
++ path->ip_indirect = root->vr_indirect_levels;
++ if (path->ip_ikey_target == NULL) {
++ path->ip_ikey_target = iam_path_ikey(path, 4);
++ *(lvar_hash_t *)path->ip_ikey_target =
++ get_hash(path->ip_container, name,
++ strlen(name));
++ }
++ }
++ frame->entries = frame->at = entries;
++ return 0;
++}
++
++static int lvar_ikeycmp(const struct iam_container *c,
++ const struct iam_ikey *k1, const struct iam_ikey *k2)
++{
++ lvar_hash_t p1 = le32_to_cpu(*(lvar_hash_t *)k1);
++ lvar_hash_t p2 = le32_to_cpu(*(lvar_hash_t *)k2);
++
++ return p1 > p2 ? +1 : (p1 < p2 ? -1 : 0);
++}
++
++static struct iam_path_descr *lvar_ipd_alloc(const struct iam_container *c,
++ void *area)
++{
++ return iam_ipd_alloc(area, c->ic_descr->id_ikey_size);
++}
++
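++/*
++ * Entry capacity of a node whose header occupies @rootgap bytes. The
++ * result is decremented when it would otherwise equal the plain node
++ * capacity, keeping dx_root_limit() strictly below dx_node_limit() as
++ * split_index_node() asserts.
++ */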
++static int root_limit(int rootgap, int blocksize, int size)
++{
++ int limit;
++ int nlimit;
++
++ limit = (blocksize - rootgap) / size;
++ nlimit = blocksize / size;
++ if (limit == nlimit)
++ limit--;
++ return limit;
++}
++
++static int lvar_root_limit(int blocksize, int size)
++{
++ return root_limit(sizeof(struct lvar_root), blocksize, size);
++}
++
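++/*
++ * Format the root block of a new container: the lvar_root header,
++ * followed by a dx_countlimit occupying the first entry slot, followed
++ * by a single (key = 0, ptr = 1) entry pointing at the leaf in block 1.
++ */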
++static void lvar_root(void *buf,
++ int blocksize, int keysize, int ptrsize, int recsize)
++{
++ struct lvar_root *root;
++ struct dx_countlimit *limit;
++ void *entry;
++ int isize;
++
++ isize = sizeof(lvar_hash_t) + ptrsize;
++ root = buf;
++ *root = (typeof(*root)) {
++ .vr_magic = cpu_to_le32(IAM_LVAR_ROOT_MAGIC),
++ .vr_recsize = cpu_to_le16(recsize),
++ .vr_ptrsize = cpu_to_le16(ptrsize),
++ .vr_indirect_levels = 0
++ };
++
++ limit = (void *)(root + 1);
++ *limit = (typeof(*limit)){
++ /*
++ * limit itself + one pointer to the leaf.
++ */
++ .count = cpu_to_le16(2),
++ .limit = cpu_to_le16(lvar_root_limit(blocksize,
++ sizeof(lvar_hash_t) + ptrsize))
++ };
++
++ entry = root + 1;
++ /*
++ * Skip over @limit.
++ */
++ entry += isize;
++
++ /*
++ * Entry format is <key> followed by <ptr>. In the minimal tree
++ * consisting of a root and single node, <key> is a minimal possible
++ * key.
++ */
++ *(lvar_hash_t *)entry = 0;
++ entry += sizeof(lvar_hash_t);
++ /* now @entry points to <ptr> */
++ if (ptrsize == 4)
++ *(u_int32_t *)entry = cpu_to_le32(1);
++ else
++ *(u_int64_t *)entry = cpu_to_le64(1);
++}
++
++static int lvar_esize(int namelen, int recsize)
++{
++ return (offsetof(struct lvar_leaf_entry, vle_key) +
++ namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND;
++}
++
++static void lvar_leaf(void *buf,
++ int blocksize, int keysize, int ptrsize, int recsize)
++{
++ struct lvar_leaf_header *head;
++ struct lvar_leaf_entry *entry;
++
++ /* form leaf */
++ head = buf;
++ *head = (typeof(*head)) {
++ .vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC),
++ .vlh_used = cpu_to_le16(sizeof *head + lvar_esize(0, recsize))
++ };
++ entry = (void *)(head + 1);
++ *entry = (typeof(*entry)) {
++ .vle_hash = 0,
++ .vle_keysize = 0
++ };
++ memset(e_rec(entry), 0, recsize);
++ *(char *)e_rec(entry) = recsize;
++}
++
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++
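++/*
++ * Create a minimal lvar container in @obj under transaction @handle:
++ * block 0 receives the root and block 1 an empty leaf holding a single
++ * zero-key sentinel entry.
++ */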
++int iam_lvar_create(struct inode *obj,
++ int keysize, int ptrsize, int recsize, handle_t *handle)
++{
++ struct buffer_head *root_node;
++ struct buffer_head *leaf_node;
++ struct super_block *sb;
++
++ u32 blknr;
++ int result;
++ unsigned long bsize;
++
++ assert_corr(obj->i_size == 0);
++
++ sb = obj->i_sb;
++ bsize = sb->s_blocksize;
++ root_node = ext3_append(handle, obj, &blknr, &result);
++ leaf_node = ext3_append(handle, obj, &blknr, &result);
++ if (root_node != NULL && leaf_node != NULL) {
++ lvar_root(root_node->b_data, bsize, keysize, ptrsize, recsize);
++ lvar_leaf(leaf_node->b_data, bsize, keysize, ptrsize, recsize);
++ ext3_mark_inode_dirty(handle, obj);
++ result = ext3_journal_dirty_metadata(handle, root_node);
++ if (result == 0)
++ result = ext3_journal_dirty_metadata(handle, leaf_node);
++ if (result != 0)
++ ext3_std_error(sb, result);
++ }
++ brelse(leaf_node);
++ brelse(root_node);
++ return result;
++}
++EXPORT_SYMBOL(iam_lvar_create);
++
++static struct iam_operations lvar_ops = {
++ .id_root_ptr = lvar_root_ptr,
++ .id_node_read = iam_node_read,
++ .id_node_init = lvar_node_init,
++ .id_node_check = lvar_node_check,
++ .id_node_load = lvar_node_load,
++ .id_ikeycmp = lvar_ikeycmp,
++ .id_root_inc = lvar_root_inc,
++ .id_ipd_alloc = lvar_ipd_alloc,
++ .id_ipd_free = iam_ipd_free,
++ .id_name = "lvar"
++};
++
++static int lvar_guess(struct iam_container *c)
++{
++ int result;
++ struct buffer_head *bh;
++ const struct lvar_root *root;
++
++ assert_corr(c->ic_object != NULL);
++
++ result = iam_node_read(c, lvar_root_ptr(c), NULL, &bh);
++ if (result == 0) {
++ root = (void *)bh->b_data;
++ if (le32_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC) {
++ struct iam_descr *descr;
++
++ descr = c->ic_descr;
++ descr->id_key_size = EXT3_NAME_LEN;
++ descr->id_ikey_size = sizeof (lvar_hash_t);
++ descr->id_rec_size = le16_to_cpu(root->vr_recsize);
++ descr->id_ptr_size = le16_to_cpu(root->vr_ptrsize);
++ descr->id_root_gap = sizeof *root;
++ descr->id_node_gap = 0;
++ descr->id_ops = &lvar_ops;
++ descr->id_leaf_ops = &lvar_leaf_ops;
++ } else
++ result = -EBADF;
++ brelse(bh);
++ }
++ return result;
++}
++
++static struct iam_format lvar_format = {
++ .if_guess = lvar_guess
++};
++
++void iam_lvar_format_init(void)
++{
++ iam_format_register(&lvar_format);
++}
++
+Index: iam/fs/ext3/iam_lfix.c
+===================================================================
+--- iam.orig/fs/ext3/iam_lfix.c 2007-05-23 09:56:30.476305206 +0800
++++ iam/fs/ext3/iam_lfix.c 2007-05-23 11:18:20.000000000 +0800
+@@ -0,0 +1,735 @@
++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++ * vim:expandtab:shiftwidth=8:tabstop=8:
++ *
++ * iam_lfix.c
++ * implementation of iam format for fixed size records.
++ *
++ * Copyright (c) 2006 Cluster File Systems, Inc.
++ * Author: Wang Di <wangdi@clusterfs.com>
++ * Author: Nikita Danilov <nikita@clusterfs.com>
++ *
++ * This file is part of the Lustre file system, http://www.lustre.org
++ * Lustre is a trademark of Cluster File Systems, Inc.
++ *
++ * You may have signed or agreed to another license before downloading
++ * this software. If so, you are bound by the terms and conditions
++ * of that agreement, and the following does not apply to you. See the
++ * LICENSE file included with this distribution for more information.
++ *
++ * If you did not agree to a different license, then this copy of Lustre
++ * is open source software; you can redistribute it and/or modify it
++ * under the terms of version 2 of the GNU General Public License as
++ * published by the Free Software Foundation.
++ *
++ * In either case, Lustre is distributed in the hope that it will be
++ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
++ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * license text for more details.
++ */
++
++#include <linux/types.h>
++#include <linux/jbd.h>
++/* ext3_error() */
++#include <linux/ext3_fs.h>
++
++#include <linux/lustre_iam.h>
++
++#include <libcfs/libcfs.h>
++#include <libcfs/kp30.h>
++
++/*
++ * Leaf operations.
++ */
++
++enum {
++ IAM_LEAF_HEADER_MAGIC = 0x1976 /* This is duplicated in
++ * lustre/utils/create_iam.c */
++};
++
++/* This is duplicated in lustre/utils/create_iam.c */
++struct iam_leaf_head {
++ __le16 ill_magic;
++ __le16 ill_count;
++};
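++
++/*
++ * An lfix leaf block is a struct iam_leaf_head followed by ill_count
++ * fixed-size entries, each a key immediately followed by a record
++ * (see iam_lfix_entry_size() below).
++ */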
++
++static inline int iam_lfix_entry_size(const struct iam_leaf *l)
++{
++ return iam_leaf_descr(l)->id_key_size + iam_leaf_descr(l)->id_rec_size;
++}
++
++static inline struct iam_lentry *
++iam_lfix_shift(const struct iam_leaf *l, struct iam_lentry *entry, int shift)
++{
++ return (void *)entry + shift * iam_lfix_entry_size(l);
++}
++
++static inline struct iam_key *iam_leaf_key_at(struct iam_lentry *entry)
++{
++ return (struct iam_key *)entry;
++}
++
++static inline int lfix_keycmp(const struct iam_container *c,
++ const struct iam_key *k1,
++ const struct iam_key *k2)
++{
++ return memcmp(k1, k2, c->ic_descr->id_key_size);
++}
++
++static struct iam_leaf_head *iam_get_head(const struct iam_leaf *l)
++{
++ return (struct iam_leaf_head *)l->il_bh->b_data;
++}
++
++static struct iam_lentry *iam_entries(const struct buffer_head *bh)
++{
++ return (void *)bh->b_data + sizeof(struct iam_leaf_head);
++}
++
++static struct iam_lentry *iam_get_lentries(const struct iam_leaf *l)
++{
++ return iam_entries(l->il_bh);
++}
++
++static int leaf_count_limit(const struct iam_leaf *leaf)
++{
++ int free_space;
++
++ free_space = iam_leaf_container(leaf)->ic_object->i_sb->s_blocksize;
++ free_space -= sizeof(struct iam_leaf_head);
++ return free_space / iam_lfix_entry_size(leaf);
++}
++
++static int lentry_count_get(const struct iam_leaf *leaf)
++{
++ return le16_to_cpu(iam_get_head(leaf)->ill_count);
++}
++
++static void lentry_count_set(struct iam_leaf *leaf, unsigned count)
++{
++ assert_corr(count <= leaf_count_limit(leaf));
++ iam_get_head(leaf)->ill_count = cpu_to_le16(count);
++}
++
++static struct iam_lentry *iam_lfix_get_end(const struct iam_leaf *l);
++
++#if EXT3_CORRECTNESS_ON || EXT3_INVARIANT_ON
++static int iam_leaf_at_rec(const struct iam_leaf *folio)
++{
++ return
++ iam_get_lentries(folio) <= folio->il_at &&
++ folio->il_at < iam_lfix_get_end(folio);
++}
++#endif
++
++static struct iam_ikey *iam_lfix_ikey(const struct iam_leaf *l,
++ struct iam_ikey *key)
++{
++ void *ie = l->il_at;
++ assert_corr(iam_leaf_at_rec(l));
++ return (struct iam_ikey*)ie;
++}
++
++static struct iam_key *iam_lfix_key(const struct iam_leaf *l)
++{
++ void *ie = l->il_at;
++ assert_corr(iam_leaf_at_rec(l));
++ return (struct iam_key*)ie;
++}
++
++static int iam_lfix_key_size(const struct iam_leaf *l)
++{
++ return iam_leaf_descr(l)->id_key_size;
++}
++
++static void iam_lfix_start(struct iam_leaf *l)
++{
++ l->il_at = iam_get_lentries(l);
++}
++
++static inline ptrdiff_t iam_lfix_diff(const struct iam_leaf *l,
++ const struct iam_lentry *e1,
++ const struct iam_lentry *e2)
++{
++ ptrdiff_t diff;
++ int esize;
++
++ esize = iam_lfix_entry_size(l);
++ diff = (void *)e1 - (void *)e2;
++ assert_corr(diff / esize * esize == diff);
++ return diff / esize;
++}
++
++static int iam_lfix_init(struct iam_leaf *l)
++{
++ int result;
++ struct iam_leaf_head *ill;
++ int count;
++
++ assert_corr(l->il_bh != NULL);
++
++ ill = iam_get_head(l);
++ count = le16_to_cpu(ill->ill_count);
++ if (ill->ill_magic == cpu_to_le16(IAM_LEAF_HEADER_MAGIC) &&
++ 0 <= count && count <= leaf_count_limit(l)) {
++ l->il_at = l->il_entries = iam_get_lentries(l);
++ result = 0;
++ } else {
++ struct inode *obj;
++
++ obj = iam_leaf_container(l)->ic_object;
++ ext3_error(obj->i_sb, __FUNCTION__,
++ "Wrong magic in node %llu (#%lu): %#x != %#x or "
++ "wrong count: %i (%i)",
++ (unsigned long long)l->il_bh->b_blocknr, obj->i_ino,
++ le16_to_cpu(ill->ill_magic), IAM_LEAF_HEADER_MAGIC,
++ count, leaf_count_limit(l));
++ result = -EIO;
++ }
++ return result;
++}
++
++static void iam_lfix_fini(struct iam_leaf *l)
++{
++ l->il_entries = l->il_at = NULL;
++}
++
++static struct iam_lentry *iam_lfix_get_end(const struct iam_leaf *l)
++{
++ int count = lentry_count_get(l);
++ struct iam_lentry *ile = iam_lfix_shift(l, l->il_entries, count);
++
++ return ile;
++}
++
++struct iam_rec *iam_lfix_rec(const struct iam_leaf *l)
++{
++ void *e = l->il_at;
++ assert_corr(iam_leaf_at_rec(l));
++ return e + iam_leaf_descr(l)->id_key_size;
++}
++
++static void iam_lfix_next(struct iam_leaf *l)
++{
++ assert_corr(iam_leaf_at_rec(l));
++ l->il_at = iam_lfix_shift(l, l->il_at, 1);
++}
++
++/*
++ * Bug chasing.
++ */
++int lfix_dump = 0;
++EXPORT_SYMBOL(lfix_dump);
++
++static char hdigit(char ch)
++{
++ static char d[] = "0123456789abcdef";
++ return d[ch & 0xf];
++}
++
++static char *hex(char ch, char *area)
++{
++ area[0] = hdigit(ch >> 4);
++ area[1] = hdigit(ch);
++ area[2] = 0;
++ return area;
++}
++
++static void l_print(struct iam_leaf *leaf, struct iam_lentry *entry)
++{
++ int i;
++ char *area;
++ char h[3];
++
++ area = (char *)entry;
++ printk(KERN_EMERG "[");
++ for (i = iam_lfix_key_size(leaf); i > 0; --i, ++area)
++ printk("%s", hex(*area, h));
++ printk("]-(");
++ for (i = iam_leaf_descr(leaf)->id_rec_size; i > 0; --i, ++area)
++ printk("%s", hex(*area, h));
++ printk(")\n");
++}
++
++static void lfix_print(struct iam_leaf *leaf)
++{
++ struct iam_lentry *entry;
++ int count;
++ int i;
++
++ entry = leaf->il_entries;
++ count = lentry_count_get(leaf);
++ printk(KERN_EMERG "lfix: %p %p %d\n", leaf, leaf->il_at, count);
++ for (i = 0; i < count; ++i, entry = iam_lfix_shift(leaf, entry, 1))
++ l_print(leaf, entry);
++}
++
++static int iam_lfix_lookup(struct iam_leaf *l, const struct iam_key *k)
++{
++ struct iam_lentry *p, *q, *m, *t;
++ struct iam_container *c;
++ int count;
++ int result;
++
++ count = lentry_count_get(l);
++ if (count == 0)
++ return IAM_LOOKUP_EMPTY;
++
++ result = IAM_LOOKUP_OK;
++ c = iam_leaf_container(l);
++
++ p = l->il_entries;
++ q = iam_lfix_shift(l, p, count - 1);
++ if (lfix_keycmp(c, k, iam_leaf_key_at(p)) < 0) {
++ /*
++ * @k is less than the least key in the leaf
++ */
++ l->il_at = p;
++ result = IAM_LOOKUP_BEFORE;
++ } else if (lfix_keycmp(c, iam_leaf_key_at(q), k) <= 0) {
++ l->il_at = q;
++ } else {
++ /*
++ * Binary search, maintaining the invariant
++ * key(p) <= @k < key(q); cf. Dijkstra's EWD1293.
++ */
++ while (iam_lfix_shift(l, p, 1) != q) {
++ m = iam_lfix_shift(l, p, iam_lfix_diff(l, q, p) / 2);
++ assert_corr(p < m && m < q);
++ if (lfix_keycmp(c, iam_leaf_key_at(m), k) <= 0)
++ p = m;
++ else
++ q = m;
++ }
++ assert_corr(lfix_keycmp(c, iam_leaf_key_at(p), k) <= 0 &&
++ lfix_keycmp(c, k, iam_leaf_key_at(q)) < 0);
++ /*
++ * skip over records with duplicate keys.
++ */
++ while (p > l->il_entries) {
++ t = iam_lfix_shift(l, p, -1);
++ if (lfix_keycmp(c, iam_leaf_key_at(t), k) == 0)
++ p = t;
++ else
++ break;
++ }
++ l->il_at = p;
++ }
++ assert_corr(iam_leaf_at_rec(l));
++
++ if (lfix_keycmp(c, iam_leaf_key_at(l->il_at), k) == 0)
++ result = IAM_LOOKUP_EXACT;
++
++ if (lfix_dump)
++ lfix_print(l);
++
++ return result;
++}
++
++static int iam_lfix_ilookup(struct iam_leaf *l, const struct iam_ikey *ik)
++{
++ assert(0);
++ return IAM_LOOKUP_OK;
++}
++
++static void iam_lfix_key_set(struct iam_leaf *l, const struct iam_key *k)
++{
++ assert_corr(iam_leaf_at_rec(l));
++ memcpy(iam_leaf_key_at(l->il_at), k, iam_leaf_descr(l)->id_key_size);
++}
++
++static int iam_lfix_key_cmp(const struct iam_leaf *l, const struct iam_key *k)
++{
++ return lfix_keycmp(iam_leaf_container(l), iam_leaf_key_at(l->il_at), k);
++}
++
++static int iam_lfix_key_eq(const struct iam_leaf *l, const struct iam_key *k)
++{
++ return !lfix_keycmp(iam_leaf_container(l),
++ iam_leaf_key_at(l->il_at), k);
++}
++
++static void iam_lfix_rec_set(struct iam_leaf *l, const struct iam_rec *r)
++{
++ assert_corr(iam_leaf_at_rec(l));
++ memcpy(iam_lfix_rec(l), r, iam_leaf_descr(l)->id_rec_size);
++}
++
++static void iam_lfix_rec_get(const struct iam_leaf *l, struct iam_rec *r)
++{
++ assert_corr(iam_leaf_at_rec(l));
++ memcpy(r, iam_lfix_rec(l), iam_leaf_descr(l)->id_rec_size);
++}
++
++static void iam_lfix_rec_add(struct iam_leaf *leaf,
++ const struct iam_key *k, const struct iam_rec *r)
++{
++ struct iam_lentry *end;
++ struct iam_lentry *cur;
++ struct iam_lentry *start;
++ ptrdiff_t diff;
++ int count;
++
++ assert_corr(iam_leaf_can_add(leaf, k, r));
++
++ count = lentry_count_get(leaf);
++ /*
++ * This branch handles two exceptional cases:
++ *
++ * - leaf positioned beyond last record, and
++ *
++ * - empty leaf.
++ */
++ if (!iam_leaf_at_end(leaf)) {
++ end = iam_lfix_get_end(leaf);
++ cur = leaf->il_at;
++ if (lfix_keycmp(iam_leaf_container(leaf),
++ k, iam_leaf_key_at(cur)) >= 0)
++ iam_lfix_next(leaf);
++ else
++ /*
++ * Another exceptional case: insertion with a key
++ * less than the least key in the leaf.
++ */
++ assert_corr(cur == leaf->il_entries);
++
++ start = leaf->il_at;
++ diff = (void *)end - (void *)start;
++ assert_corr(diff >= 0);
++ memmove(iam_lfix_shift(leaf, start, 1), start, diff);
++ }
++ lentry_count_set(leaf, count + 1);
++ iam_lfix_key_set(leaf, k);
++ iam_lfix_rec_set(leaf, r);
++ assert_corr(iam_leaf_at_rec(leaf));
++}
++
++static void iam_lfix_rec_del(struct iam_leaf *leaf, int shift)
++{
++ struct iam_lentry *next, *end;
++ int count;
++ ptrdiff_t diff;
++
++ assert_corr(iam_leaf_at_rec(leaf));
++
++ count = lentry_count_get(leaf);
++ end = iam_lfix_get_end(leaf);
++ next = iam_lfix_shift(leaf, leaf->il_at, 1);
++ diff = (void *)end - (void *)next;
++ memmove(leaf->il_at, next, diff);
++
++ lentry_count_set(leaf, count - 1);
++}
++
++static int iam_lfix_can_add(const struct iam_leaf *l,
++ const struct iam_key *k, const struct iam_rec *r)
++{
++ return lentry_count_get(l) < leaf_count_limit(l);
++}
++
++static int iam_lfix_at_end(const struct iam_leaf *folio)
++{
++ return folio->il_at == iam_lfix_get_end(folio);
++}
++
++static void iam_lfix_init_new(struct iam_container *c, struct buffer_head *bh)
++{
++ struct iam_leaf_head *hdr;
++
++ hdr = (struct iam_leaf_head*)bh->b_data;
++ hdr->ill_magic = cpu_to_le16(IAM_LEAF_HEADER_MAGIC);
++ hdr->ill_count = cpu_to_le16(0);
++}
++
++static void iam_lfix_split(struct iam_leaf *l, struct buffer_head **bh,
++ iam_ptr_t new_blknr)
++{
++ struct iam_path *path;
++ struct iam_leaf_head *hdr;
++ const struct iam_ikey *pivot;
++ struct buffer_head *new_leaf;
++
++ unsigned count;
++ unsigned split;
++
++ void *start;
++ void *finis;
++
++ new_leaf = *bh;
++ path = iam_leaf_path(l);
++
++ hdr = (void *)new_leaf->b_data;
++
++ count = lentry_count_get(l);
++ split = count / 2;
++
++ start = iam_lfix_shift(l, iam_get_lentries(l), split);
++ finis = iam_lfix_shift(l, iam_get_lentries(l), count);
++
++ pivot = (const struct iam_ikey *)iam_leaf_key_at(start);
++
++ memmove(iam_entries(new_leaf), start, finis - start);
++ hdr->ill_count = cpu_to_le16(count - split);
++ lentry_count_set(l, split);
++ if ((void *)l->il_at >= start) {
++ /*
++ * insertion point moves into new leaf.
++ */
++ int shift;
++ int result;
++
++ shift = iam_lfix_diff(l, l->il_at, start);
++ *bh = l->il_bh;
++ l->il_bh = new_leaf;
++ l->il_curidx = new_blknr;
++ result = iam_lfix_init(l);
++ /*
++ * init cannot fail, as node was just initialized.
++ */
++ assert_corr(result == 0);
++ l->il_at = iam_lfix_shift(l, iam_get_lentries(l), shift);
++ }
++ /*
++ * Insert pointer to the new node (together with the least key in
++ * the node) into index node.
++ */
++ iam_insert_key_lock(path, path->ip_frame, pivot, new_blknr);
++}
++
++static struct iam_leaf_operations iam_lfix_leaf_ops = {
++ .init = iam_lfix_init,
++ .init_new = iam_lfix_init_new,
++ .fini = iam_lfix_fini,
++ .start = iam_lfix_start,
++ .next = iam_lfix_next,
++ .key = iam_lfix_key,
++ .ikey = iam_lfix_ikey,
++ .rec = iam_lfix_rec,
++ .key_set = iam_lfix_key_set,
++ .key_cmp = iam_lfix_key_cmp,
++ .key_eq = iam_lfix_key_eq,
++ .key_size = iam_lfix_key_size,
++ .rec_set = iam_lfix_rec_set,
++ .rec_get = iam_lfix_rec_get,
++ .lookup = iam_lfix_lookup,
++ .ilookup = iam_lfix_ilookup,
++ .at_end = iam_lfix_at_end,
++ .rec_add = iam_lfix_rec_add,
++ .rec_del = iam_lfix_rec_del,
++ .can_add = iam_lfix_can_add,
++ .split = iam_lfix_split
++};
++
++/*
++ * Index operations.
++ */
++
++enum {
++ /* This is duplicated in lustre/utils/create_iam.c */
++ /*
++ * Then shalt thou see the dew-BEDABBLED wretch
++ * Turn, and return, indenting with the way;
++ * Each envious brier his weary legs doth scratch,
++ * Each shadow makes him stop, each murmur stay:
++ * For misery is trodden on by many,
++ * And being low never relieved by any.
++ */
++ IAM_LFIX_ROOT_MAGIC = 0xbedabb1edULL /* d01efull */
++};
++
++/* This is duplicated in lustre/utils/create_iam.c */
++struct iam_lfix_root {
++ __le64 ilr_magic;
++ __le16 ilr_keysize;
++ __le16 ilr_recsize;
++ __le16 ilr_ptrsize;
++ u8 ilr_indirect_levels;
++ u8 ilr_padding;
++};
++
++static __u32 iam_lfix_root_ptr(struct iam_container *c)
++{
++ return 0;
++}
++
++static int iam_lfix_node_init(struct iam_container *c, struct buffer_head *bh,
++ int root)
++{
++ return 0;
++}
++
++static struct iam_entry *iam_lfix_root_inc(struct iam_container *c,
++ struct iam_path *path,
++ struct iam_frame *frame)
++{
++ struct iam_lfix_root *root;
++ struct iam_entry *entries;
++
++ entries = frame->entries;
++
++ dx_set_count(entries, 2);
++ assert_corr(dx_get_limit(entries) == dx_root_limit(path));
++
++ root = (void *)frame->bh->b_data;
++ assert_corr(le64_to_cpu(root->ilr_magic) == IAM_LFIX_ROOT_MAGIC);
++ root->ilr_indirect_levels++;
++ frame->at = entries = iam_entry_shift(path, entries, 1);
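++ /*
++ * Zero the ikey of the sole real entry: an all-zero key is the
++ * least possible one, so the entry covers the whole key space.
++ */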
++ memset(iam_ikey_at(path, entries), 0,
++ iam_path_descr(path)->id_ikey_size);
++ return entries;
++}
++
++static int iam_lfix_node_check(struct iam_path *path, struct iam_frame *frame)
++{
++ unsigned count;
++ unsigned limit;
++ unsigned limit_correct;
++ struct iam_entry *entries;
++
++ entries = dx_node_get_entries(path, frame);
++
++ if (frame == path->ip_frames) {
++ struct iam_lfix_root *root;
++
++ root = (void *)frame->bh->b_data;
++ if (le64_to_cpu(root->ilr_magic) != IAM_LFIX_ROOT_MAGIC) {
++ return -EIO;
++ }
++ limit_correct = dx_root_limit(path);
++ } else
++ limit_correct = dx_node_limit(path);
++ count = dx_get_count(entries);
++ limit = dx_get_limit(entries);
++ if (count > limit) {
++ return -EIO;
++ }
++ if (limit != limit_correct) {
++ return -EIO;
++ }
++ return 0;
++}
++
++static int iam_lfix_node_load(struct iam_path *path, struct iam_frame *frame)
++{
++ struct iam_entry *entries;
++ void *data;
++ entries = dx_node_get_entries(path, frame);
++
++ data = frame->bh->b_data;
++
++ if (frame == path->ip_frames) {
++ struct iam_lfix_root *root;
++
++ root = data;
++ path->ip_indirect = root->ilr_indirect_levels;
++ if (path->ip_ikey_target == NULL)
++ path->ip_ikey_target =
++ (struct iam_ikey *)path->ip_key_target;
++ }
++ frame->entries = frame->at = entries;
++ return 0;
++}
++
++static int iam_lfix_ikeycmp(const struct iam_container *c,
++ const struct iam_ikey *k1,
++ const struct iam_ikey *k2)
++{
++ return memcmp(k1, k2, c->ic_descr->id_ikey_size);
++}
++
++static struct iam_path_descr *iam_lfix_ipd_alloc(const struct iam_container *c,
++ void *area)
++{
++ return iam_ipd_alloc(area, c->ic_descr->id_ikey_size);
++}
++
++static struct iam_operations iam_lfix_ops = {
++ .id_root_ptr = iam_lfix_root_ptr,
++ .id_node_read = iam_node_read,
++ .id_node_init = iam_lfix_node_init,
++ .id_node_check = iam_lfix_node_check,
++ .id_node_load = iam_lfix_node_load,
++ .id_ikeycmp = iam_lfix_ikeycmp,
++ .id_root_inc = iam_lfix_root_inc,
++ .id_ipd_alloc = iam_lfix_ipd_alloc,
++ .id_ipd_free = iam_ipd_free,
++ .id_name = "lfix"
++};
++
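++/*
++ * Format-guessing hook: recognize an lfix container by the magic in
++ * its root block and, on a match, fill the container descriptor in
++ * from the root header.
++ */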
++static int iam_lfix_guess(struct iam_container *c)
++{
++ int result;
++ struct buffer_head *bh;
++ const struct iam_lfix_root *root;
++
++ assert_corr(c->ic_object != NULL);
++
++ result = iam_node_read(c, iam_lfix_root_ptr(c), NULL, &bh);
++ if (result == 0) {
++ root = (void *)bh->b_data;
++ if (le64_to_cpu(root->ilr_magic) == IAM_LFIX_ROOT_MAGIC) {
++ struct iam_descr *descr;
++
++ descr = c->ic_descr;
++ descr->id_key_size = le16_to_cpu(root->ilr_keysize);
++ descr->id_ikey_size = le16_to_cpu(root->ilr_keysize);
++ descr->id_rec_size = le16_to_cpu(root->ilr_recsize);
++ descr->id_ptr_size = le16_to_cpu(root->ilr_ptrsize);
++ descr->id_root_gap = sizeof(struct iam_lfix_root);
++ descr->id_node_gap = 0;
++ descr->id_ops = &iam_lfix_ops;
++ descr->id_leaf_ops = &iam_lfix_leaf_ops;
++ } else
++ result = -EBADF;
++ brelse(bh);
++ }
++ return result;
++}
++
++static struct iam_format iam_lfix_format = {
++ .if_guess = iam_lfix_guess
++};
++
++void iam_lfix_format_init(void)
++{
++ iam_format_register(&iam_lfix_format);
++}
++
++/*
++ * Debugging aid.
++ */
++
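++/*
++ * The structures below mirror the on-disk layout of root, index and
++ * leaf blocks for the key/record/pointer sizes defined here, assuming
++ * a 4096-byte block. They are never instantiated; they only give a
++ * debugger types to cast block contents to.
++ */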
++#define KEYSIZE (8)
++#define RECSIZE (8)
++#define PTRSIZE (4)
++
++#define LFIX_ROOT_RECNO \
++ ((4096 - sizeof(struct iam_lfix_root)) / (KEYSIZE + PTRSIZE))
++
++#define LFIX_INDEX_RECNO (4096 / (KEYSIZE + PTRSIZE))
++
++#define LFIX_LEAF_RECNO \
++ ((4096 - sizeof(struct iam_leaf_head)) / (KEYSIZE + RECSIZE))
++
++struct lfix_root {
++ struct iam_lfix_root lr_root;
++ struct {
++ char key[KEYSIZE];
++ char ptr[PTRSIZE];
++ } lr_entry[LFIX_ROOT_RECNO];
++};
++
++struct lfix_index {
++ struct dx_countlimit li_cl;
++ char li_padding[KEYSIZE + PTRSIZE - sizeof(struct dx_countlimit)];
++ struct {
++ char key[KEYSIZE];
++ char ptr[PTRSIZE];
++ } li_entry[LFIX_INDEX_RECNO - 1];
++};
++
++struct lfix_leaf {
++ struct iam_leaf_head ll_head;
++ struct {
++ char key[KEYSIZE];
++ char rec[RECSIZE];
++ } ll_entry[LFIX_LEAF_RECNO];
++};
+Index: iam/fs/ext3/iam_htree.c
+===================================================================
+--- iam.orig/fs/ext3/iam_htree.c 2007-05-23 09:56:30.476305206 +0800
++++ iam/fs/ext3/iam_htree.c 2007-05-23 11:18:20.000000000 +0800
+@@ -0,0 +1,687 @@
++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++ * vim:expandtab:shiftwidth=8:tabstop=8:
++ *
++ * iam_htree.c
++ * implementation of iam format for ext3/htree.
++ *
++ * Copyright (c) 2006 Cluster File Systems, Inc.
++ * Author: Nikita Danilov <nikita@clusterfs.com>
++ *
++ * This file is part of the Lustre file system, http://www.lustre.org
++ * Lustre is a trademark of Cluster File Systems, Inc.
++ *
++ * You may have signed or agreed to another license before downloading
++ * this software. If so, you are bound by the terms and conditions
++ * of that agreement, and the following does not apply to you. See the
++ * LICENSE file included with this distribution for more information.
++ *
++ * If you did not agree to a different license, then this copy of Lustre
++ * is open source software; you can redistribute it and/or modify it
++ * under the terms of version 2 of the GNU General Public License as
++ * published by the Free Software Foundation.
++ *
++ * In either case, Lustre is distributed in the hope that it will be
++ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
++ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * license text for more details.
++ */
++
++#include <linux/types.h>
++#include <linux/jbd.h>
++/* ext3_error(), EXT3_DIR_ROUND() */
++#include <linux/ext3_fs.h>
++
++#include <linux/lustre_iam.h>
++
++#include <libcfs/libcfs.h>
++#include <libcfs/kp30.h>
++
++static inline struct ext3_dir_entry_2 *dent(struct iam_lentry *ent)
++{
++ return (struct ext3_dir_entry_2 *)ent;
++}
++
++static inline struct iam_path_compat *getipc(const struct iam_leaf *folio)
++{
++ struct iam_path *path;
++
++ path = iam_leaf_path(folio);
++ assert_corr(dx_index_is_compat(path));
++ assert_corr(path->ip_data != NULL);
++ return container_of(path->ip_data, struct iam_path_compat, ipc_descr);
++}
++
++static inline struct ext3_dir_entry_2 *getent(const struct iam_leaf *folio)
++{
++ return dent(folio->il_at);
++}
++
++static __u32 hashname(const struct iam_leaf *folio,
++ const char *name, int namelen)
++{
++ int result;
++ struct dx_hash_info *hinfo;
++
++ hinfo = getipc(folio)->ipc_hinfo;
++ assert_corr(hinfo != NULL);
++ result = ext3fs_dirhash(name, namelen, hinfo);
++ assert_corr(result == 0);
++ return hinfo->hash;
++}
++
++static __u32 gethash(const struct iam_leaf *folio,
++ const struct ext3_dir_entry_2 *ent)
++{
++ return hashname(folio, ent->name, ent->name_len);
++}
++
++static inline size_t recsize(size_t namelen)
++{
++ return EXT3_DIR_REC_LEN(namelen);
++}
++
++static struct ext3_dir_entry_2 *getlast(const struct iam_leaf *folio, int namelen)
++{
++ return
++ (void *)folio->il_bh->b_data +
++ iam_leaf_container(folio)->ic_object->i_sb->s_blocksize -
++ recsize(namelen);
++}
++
++static struct ext3_dir_entry_2 *gettop(const struct iam_leaf *folio)
++{
++ return getlast(folio, 0);
++}
++
++static inline int ent_is_live(const struct ext3_dir_entry_2 *ent)
++{
++ return ent->inode != 0;
++}
++
++static struct ext3_dir_entry_2 *entnext(const struct ext3_dir_entry_2 *ent)
++{
++ return (void *)ent + le16_to_cpu(ent->rec_len);
++}
++
++static struct ext3_dir_entry_2 *skipdead(struct ext3_dir_entry_2 *ent)
++{
++ if (!ent_is_live(ent))
++ ent = entnext(ent);
++ /*
++ * There can be no more than one dead entry in a row.
++ */
++ return ent;
++}
++
++static struct ext3_dir_entry_2 *getstart(const struct iam_leaf *folio)
++{
++ return (void *)folio->il_bh->b_data;
++}
++
++static int getfreespace(const struct ext3_dir_entry_2 *ent)
++{
++ int free;
++
++ free = le16_to_cpu(ent->rec_len);
++ if (ent_is_live(ent))
++ free -= recsize(ent->name_len);
++ assert_corr(free >= 0);
++ return free;
++}
++
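++/*
++ * Total order on live directory entries: compare by hash first, with
++ * ties broken by position within the block, so that iteration in this
++ * order visits every entry exactly once even when hashes collide.
++ */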
++static int entcmp(const struct iam_leaf *folio,
++ const struct ext3_dir_entry_2 *e0, const struct ext3_dir_entry_2 *e1)
++{
++ __u32 hash0;
++ __u32 hash1;
++
++ assert_corr(ent_is_live(e0));
++ assert_corr(ent_is_live(e1));
++
++ hash0 = gethash(folio, e0);
++ hash1 = gethash(folio, e1);
++ if (hash0 < hash1)
++ return -1;
++ else if (hash0 > hash1)
++ return +1;
++ else if (e0 < e1)
++ return -1;
++ else if (e0 > e1)
++ return +1;
++ else
++ return 0;
++}
++
++#if EXT3_CORRECTNESS_ON || EXT3_INVARIANT_ON
++static int iam_leaf_at_rec(const struct iam_leaf *folio)
++{
++ struct ext3_dir_entry_2 *ent;
++
++ ent = getent(folio);
++ return getstart(folio) <= ent &&
++ ent < gettop(folio) && ent_is_live(ent);
++}
++#endif
++
++/*
++ * Leaf operations.
++ */
++
++static struct iam_ikey *iam_htree_ikey(const struct iam_leaf *l,
++ struct iam_ikey *key)
++{
++ __u32 *hash;
++ assert_corr(iam_leaf_at_rec(l));
++
++ hash = (void *)key;
++ *hash = gethash(l, getent(l));
++ return key;
++}
++
++static struct iam_key *iam_htree_key(const struct iam_leaf *l)
++{
++ assert_corr(iam_leaf_at_rec(l));
++
++ return (struct iam_key *)&getent(l)->name;
++}
++
++static int iam_htree_key_size(const struct iam_leaf *l)
++{
++ assert_corr(iam_leaf_at_rec(l));
++
++ return getent(l)->name_len;
++}
++
++static void iam_htree_start(struct iam_leaf *l)
++{
++ l->il_at = (void *)skipdead(getstart(l));
++}
++
++static int iam_htree_init(struct iam_leaf *l)
++{
++ assert_corr(l->il_bh != NULL);
++
++ l->il_at = l->il_entries = (void *)getstart(l);
++ return 0;
++}
++
++static void iam_htree_fini(struct iam_leaf *l)
++{
++ l->il_entries = l->il_at = NULL;
++}
++
++struct iam_rec *iam_htree_rec(const struct iam_leaf *l)
++{
++ assert_corr(iam_leaf_at_rec(l));
++ return (void *)&getent(l)->inode;
++}
++
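++/*
++ * Advance to the entry that follows the current one in (hash, offset)
++ * order. Entries within an htree leaf are not sorted, so each step
++ * rescans the whole block; a full leaf traversal is therefore
++ * quadratic, the price of keeping the on-disk ext3 format unchanged.
++ */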
++static void iam_htree_next(struct iam_leaf *l)
++{
++ struct ext3_dir_entry_2 *scan;
++ struct ext3_dir_entry_2 *found;
++
++ assert_corr(iam_leaf_at_rec(l));
++ found = NULL;
++ for (scan = getstart(l); scan < gettop(l); scan = entnext(scan)) {
++ if (scan != getent(l) && ent_is_live(scan) &&
++ entcmp(l, getent(l), scan) < 0 &&
++ (found == NULL || entcmp(l, scan, found) < 0))
++ found = scan;
++ }
++ assert_corr(ergo(found != NULL,
++ gethash(l, getent(l)) <= gethash(l, found)));
++ l->il_at = (void *)(found ? : gettop(l));
++}
++
++static int iam_htree_at_end(const struct iam_leaf *folio)
++{
++ return getent(folio) >= gettop(folio);
++}
++
++
++static inline int match(int len, const char *const name,
++ struct ext3_dir_entry_2 *de)
++{
++ if (len != de->name_len)
++ return 0;
++ if (!de->inode)
++ return 0;
++ return !memcmp(name, de->name, len);
++}
++
++static int iam_htree_lookup(struct iam_leaf *l, const struct iam_key *k)
++{
++ struct iam_container *c;
++ struct ext3_dir_entry_2 *scan;
++ struct ext3_dir_entry_2 *found;
++ __u32 hash;
++ int result;
++ int namelen;
++ int last = 1;
++ const char *name;
++
++ c = iam_leaf_container(l);
++ name = (const char *)k;
++ namelen = strlen(name);
++ hash = hashname(l, name, namelen);
++ found = NULL;
++ result = IAM_LOOKUP_OK;
++ for (scan = getstart(l); scan < getlast(l, namelen);
++ scan = entnext(scan)) {
++ if (match(namelen, name, scan)) {
++ found = scan;
++ result = IAM_LOOKUP_EXACT;
++ break;
++ } else if (ent_is_live(scan)) {
++ if (gethash(l, scan) <= hash)
++ found = scan;
++ else
++ last = 0;
++ }
++ }
++ if (found == NULL) {
++ /*
++ * @k is less than all hashes in the leaf.
++ */
++ iam_htree_start(l);
++ result = IAM_LOOKUP_BEFORE;
++ } else {
++ l->il_at = (void *)found;
++ assert_corr(iam_leaf_at_rec(l));
++ }
++ if (last)
++ result |= IAM_LOOKUP_LAST;
++ return result;
++}
++
++static int iam_htree_ilookup(struct iam_leaf *l, const struct iam_ikey *ik)
++{
++ assert(0);
++ return IAM_LOOKUP_OK;
++}
++
++static void iam_htree_key_set(struct iam_leaf *l, const struct iam_key *k)
++{
++ assert_corr(iam_leaf_at_rec(l));
++ assert(0);
++}
++
++static int iam_htree_key_cmp(const struct iam_leaf *l, const struct iam_key *k)
++{
++ const char *name;
++ __u32 h0;
++ __u32 h1;
++
++ name = (const char *)k;
++
++ assert_corr(ent_is_live(getent(l)));
++
++ h0 = gethash(l, getent(l));
++ h1 = hashname(l, name, strlen(name));
++
++ return h0 < h1 ? -1 : (h0 == h1 ? 0 : +1);
++}
++
++static int iam_htree_key_eq(const struct iam_leaf *l, const struct iam_key *k)
++{
++ const char *name;
++
++ name = (const char *)k;
++ return match(strlen(name), name, getent(l));
++}
++
++static void iam_htree_rec_set(struct iam_leaf *l, const struct iam_rec *r)
++{
++ __u32 *ino;
++
++ ino = (void *)r;
++ getent(l)->inode = cpu_to_le32(*ino);
++}
++
++static void iam_htree_rec_get(const struct iam_leaf *l, struct iam_rec *r)
++{
++ __u32 *ino;
++
++ ino = (void *)r;
++ *ino = le32_to_cpu(getent(l)->inode);
++}
++
++static void iam_htree_rec_add(struct iam_leaf *leaf, const struct iam_key *k,
++ const struct iam_rec *r)
++{
++ struct ext3_dir_entry_2 *scan;
++ struct inode *dir;
++ const char *name;
++
++ __u32 *ino;
++ int namelen;
++
++ assert_corr(iam_leaf_can_add(leaf, k, r));
++
++ dir = iam_leaf_container(leaf)->ic_object;
++ ino = (void *)r;
++ name = (const char *)k;
++ namelen = strlen(name);
++
++ scan = find_insertion_point(dir, leaf->il_bh, name, namelen);
++ assert_corr(!IS_ERR(scan));
++ scan = split_entry(dir, scan, *ino, EXT3_FT_UNKNOWN, name, namelen);
++ leaf->il_at = (void *)scan;
++}
++
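++/*
++ * Delete the current entry: merge its space into the rec_len of the
++ * preceding entry or, when deleting the very first entry in the
++ * block, mark it dead by zeroing its inode number (the usual ext3
++ * directory convention).
++ */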
++static void iam_htree_rec_del(struct iam_leaf *leaf, int shift)
++{
++ struct ext3_dir_entry_2 *orig;
++ struct ext3_dir_entry_2 *scan;
++ struct ext3_dir_entry_2 *prev;
++
++ assert_corr(iam_leaf_at_rec(leaf));
++
++ orig = getent(leaf);
++
++ if (shift)
++ iam_htree_next(leaf);
++
++ for (prev = NULL, scan = getstart(leaf); scan < orig;
++ prev = scan, scan = entnext(scan))
++ ;
++
++ assert_corr(scan == orig);
++ if (prev != NULL) {
++ prev->rec_len = cpu_to_le16(le16_to_cpu(prev->rec_len) +
++ le16_to_cpu(scan->rec_len));
++ } else {
++ assert_corr(scan == getstart(leaf));
++ scan->inode = 0;
++ }
++ iam_leaf_container(leaf)->ic_object->i_version++;
++}
++
++static int iam_htree_can_add(const struct iam_leaf *leaf,
++ const struct iam_key *k, const struct iam_rec *r)
++{
++ struct ext3_dir_entry_2 *scan;
++ int size;
++
++ size = recsize(strlen((const char *)k));
++ for (scan = getstart(leaf);
++ scan < gettop(leaf); scan = entnext(scan)) {
++ if (getfreespace(scan) >= size)
++ return 1;
++ }
++ return 0;
++}
++
++static void iam_htree_init_new(struct iam_container *c, struct buffer_head *bh)
++{
++ /*
++ * Do nothing, all work is done by iam_htree_split().
++ */
++}
++
++static void iam_htree_split(struct iam_leaf *l, struct buffer_head **bh,
++ iam_ptr_t new_blknr)
++{
++ __u32 delim_hash;
++ __u32 old_hash;
++ struct buffer_head *newbh = *bh;
++ struct iam_path *path;
++
++ old_hash = gethash(l, getent(l));
++ move_entries(iam_leaf_container(l)->ic_object,
++ getipc(l)->ipc_hinfo, &l->il_bh, bh, &delim_hash);
++ /*
++ * Insert pointer to the new node (together with the least key in
++ * the node) into index node.
++ */
++ path = iam_leaf_path(l);
++ if (l->il_bh == newbh) {
++ /*
++ * insertion point moves into new leaf.
++ */
++ assert_corr(delim_hash >= old_hash);
++ l->il_curidx = new_blknr;
++ iam_htree_lookup(l, (void *)&old_hash);
++ }
++ iam_insert_key_lock(path,
++ path->ip_frame, (void *)&delim_hash, new_blknr);
++}
++
++static struct iam_leaf_operations iam_htree_leaf_ops = {
++ .init = iam_htree_init,
++ .init_new = iam_htree_init_new,
++ .fini = iam_htree_fini,
++ .start = iam_htree_start,
++ .next = iam_htree_next,
++ .key = iam_htree_key,
++ .ikey = iam_htree_ikey,
++ .rec = iam_htree_rec,
++ .key_set = iam_htree_key_set,
++ .key_cmp = iam_htree_key_cmp,
++ .key_eq = iam_htree_key_eq,
++ .key_size = iam_htree_key_size,
++ .rec_set = iam_htree_rec_set,
++ .rec_get = iam_htree_rec_get,
++ .lookup = iam_htree_lookup,
++ .ilookup = iam_htree_ilookup,
++ .at_end = iam_htree_at_end,
++ .rec_add = iam_htree_rec_add,
++ .rec_del = iam_htree_rec_del,
++ .can_add = iam_htree_can_add,
++ .split = iam_htree_split
++};
++
++/*
++ * Index operations.
++ */
++
++static __u32 iam_htree_root_ptr(struct iam_container *c)
++{
++ return 0;
++}
++
++static int iam_htree_node_check(struct iam_path *path, struct iam_frame *frame)
++{
++ /* XXX no checks yet */
++ return 0;
++}
++
++static int is_htree(struct super_block *sb,
++ const struct dx_root *root, int silent)
++{
++ if (root->info.hash_version > DX_HASH_MAX) {
++ if (!silent)
++ ext3_warning(sb, __FUNCTION__,
++ "Unrecognised inode hash code %d",
++ root->info.hash_version);
++ return -EIO;
++ }
++
++ if (root->info.unused_flags & 1) {
++ if (!silent)
++ ext3_warning(sb, __FUNCTION__,
++ "Unimplemented inode hash flags: %#06x",
++ root->info.unused_flags);
++ return -EIO;
++ }
++
++ if (root->info.indirect_levels > DX_MAX_TREE_HEIGHT - 1) {
++ if (!silent)
++ ext3_warning(sb, __FUNCTION__,
++ "Unimplemented inode hash depth: %#06x",
++ root->info.indirect_levels);
++ return -EIO;
++ }
++ return 0;
++}
++
++static int iam_htree_node_load(struct iam_path *path, struct iam_frame *frame)
++{
++ void *data;
++ struct iam_entry *entries;
++ struct super_block *sb;
++
++ data = frame->bh->b_data;
++ entries = dx_node_get_entries(path, frame);
++ sb = iam_path_obj(path)->i_sb;
++ if (frame == path->ip_frames) {
++ /* root node */
++ struct dx_root *root;
++ struct iam_path_compat *ipc;
++ int check;
++ const char *name;
++ int namelen;
++
++ root = data;
++ assert_corr(path->ip_data != NULL);
++ ipc = container_of(path->ip_data, struct iam_path_compat,
++ ipc_descr);
++
++ check = is_htree(sb, root, 0);
++ if (check != 0)
++ return check;
++ path->ip_indirect = root->info.indirect_levels;
++
++ assert_corr((char *)entries == (((char *)&root->info) +
++ root->info.info_length));
++ assert_corr(dx_get_limit(entries) == dx_root_limit(path));
++
++ ipc->ipc_hinfo->hash_version = root->info.hash_version;
++ ipc->ipc_hinfo->seed = EXT3_SB(sb)->s_hash_seed;
++ name = NULL;
++ if (ipc->ipc_qstr) {
++ name = ipc->ipc_qstr->name;
++ namelen = ipc->ipc_qstr->len;
++ } else if (ipc->ipc_hinfo == &ipc->ipc_hinfo_area) {
++ name = (const char *)path->ip_key_target;
++ namelen = strlen(name);
++ }
++ if (name != NULL)
++ ext3fs_dirhash(name, namelen, ipc->ipc_hinfo);
++ if (path->ip_ikey_target == NULL) {
++ path->ip_ikey_target = iam_path_ikey(path, 4);
++ *(__u32 *)path->ip_ikey_target = ipc->ipc_hinfo->hash;
++ }
++ } else {
++ /* non-root index */
++ assert_corr(entries ==
++ data + iam_path_descr(path)->id_node_gap);
++ assert_corr(dx_get_limit(entries) == dx_node_limit(path));
++ }
++ frame->entries = frame->at = entries;
++ return 0;
++}
++
++static int iam_htree_node_init(struct iam_container *c,
++ struct buffer_head *bh, int root)
++{
++ struct dx_node *node;
++
++ assert_corr(!root);
++
++ node = (void *)bh->b_data;
++ node->fake.rec_len = cpu_to_le16(c->ic_object->i_sb->s_blocksize);
++ node->fake.inode = 0;
++ return 0;
++}
++
++static struct iam_entry *iam_htree_root_inc(struct iam_container *c,
++ struct iam_path *path,
++ struct iam_frame *frame)
++{
++ struct dx_root *root;
++ struct iam_entry *entries;
++
++ entries = frame->entries;
++
++ dx_set_count(entries, 1);
++ root = (struct dx_root *) frame->bh->b_data;
++ root->info.indirect_levels++;
++
++ return entries;
++}
++
++static int iam_htree_ikeycmp(const struct iam_container *c,
++ const struct iam_ikey *k1,
++ const struct iam_ikey *k2)
++{
++ __u32 p1 = le32_to_cpu(*(__u32 *)k1);
++ __u32 p2 = le32_to_cpu(*(__u32 *)k2);
++
++ return p1 > p2 ? +1 : (p1 < p2 ? -1 : 0);
++}
++
++static struct iam_path_descr *iam_htree_ipd_alloc(const struct iam_container *c,
++ void *area)
++{
++ struct iam_path_compat *ipc;
++
++ ipc = area;
++ memset(ipc, 0, sizeof *ipc);
++ iam_path_compat_init(ipc, c->ic_object);
++ return &ipc->ipc_descr;
++}
++
++static void iam_htree_ipd_free(struct iam_path_descr *ipd)
++{
++}
++
++static struct iam_operations iam_htree_ops = {
++ .id_root_ptr = iam_htree_root_ptr,
++ .id_node_read = iam_node_read,
++ .id_node_init = iam_htree_node_init,
++ .id_node_check = iam_htree_node_check,
++ .id_node_load = iam_htree_node_load,
++ .id_ikeycmp = iam_htree_ikeycmp,
++ .id_root_inc = iam_htree_root_inc,
++ .id_ipd_alloc = iam_htree_ipd_alloc,
++ .id_ipd_free = iam_htree_ipd_free,
++ .id_name = "htree"
++};
++
++/*
++ * Parameters describing iam compatibility mode in which existing ext3 htrees
++ * can be manipulated.
++ */
++struct iam_descr iam_htree_compat_param = {
++ .id_key_size = EXT3_NAME_LEN,
++ .id_rec_size = sizeof ((struct ext3_dir_entry_2 *)NULL)->inode,
++ .id_ikey_size = sizeof ((struct dx_map_entry *)NULL)->hash,
++ .id_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs,
++ .id_node_gap = offsetof(struct dx_node, entries),
++ .id_root_gap = offsetof(struct dx_root, entries),
++ .id_ops = &iam_htree_ops,
++ .id_leaf_ops = &iam_htree_leaf_ops
++};
++EXPORT_SYMBOL(iam_htree_compat_param);
++
++static int iam_htree_guess(struct iam_container *c)
++{
++ int result;
++ struct buffer_head *bh;
++ const struct dx_root *root;
++
++ assert_corr(c->ic_object != NULL);
++
++ result = iam_node_read(c, iam_htree_root_ptr(c), NULL, &bh);
++ if (result == 0) {
++ root = (void *)bh->b_data;
++ result = is_htree(c->ic_object->i_sb, root, 1);
++ if (result == 0)
++ c->ic_descr = &iam_htree_compat_param;
++ else
++ result = -EBADF;
++ brelse(bh);
++ }
++ return result;
++}
++
++static struct iam_format iam_htree_format = {
++ .if_guess = iam_htree_guess
++};
++
++void iam_htree_format_init(void)
++{
++ iam_format_register(&iam_htree_format);
++}
--- /dev/null
+Index: iam/fs/ext3/Makefile
+===================================================================
+--- iam.orig/fs/ext3/Makefile
++++ iam/fs/ext3/Makefile
+@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o
+
+ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+ ioctl.o namei.o super.o symlink.o hash.o resize.o \
+- extents.o mballoc.o iam.o iam_lfix.o
++ extents.o mballoc.o iam.o iam_lfix.o iam_lvar.o iam_htree.o iam_uapi.o
+
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
+Index: iam/fs/ext3/dir.c
+===================================================================
+--- iam.orig/fs/ext3/dir.c
++++ iam/fs/ext3/dir.c
+@@ -28,6 +28,7 @@
+ #include <linux/smp_lock.h>
+ #include <linux/slab.h>
+ #include <linux/rbtree.h>
++#include <linux/lustre_iam.h>
+
+ static unsigned char ext3_filetype_table[] = {
+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+@@ -61,6 +62,7 @@ static unsigned char get_dtype(struct su
+ }
+
+
++#if EXT3_INVARIANT_ON
+ int ext3_check_dir_entry (const char * function, struct inode * dir,
+ struct ext3_dir_entry_2 * de,
+ struct buffer_head * bh,
+@@ -90,6 +92,7 @@ int ext3_check_dir_entry (const char * f
+ rlen, de->name_len);
+ return error_msg == NULL ? 1 : 0;
+ }
++#endif
+
+ static int ext3_readdir(struct file * filp,
+ void * dirent, filldir_t filldir)
+@@ -305,12 +308,14 @@ static void free_rb_tree_fname(struct rb
+ root->rb_node = NULL;
+ }
+
++extern struct iam_private_info *ext3_iam_alloc_info(int flags);
++extern void ext3_iam_release_info(struct iam_private_info *info);
+
+ struct dir_private_info *create_dir_info(loff_t pos)
+ {
+ struct dir_private_info *p;
+
+- p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
++ p = (void *)ext3_iam_alloc_info(GFP_KERNEL);
+ if (!p)
+ return NULL;
+ p->root.rb_node = NULL;
+@@ -326,6 +331,7 @@ struct dir_private_info *create_dir_info
+ void ext3_htree_free_dir_info(struct dir_private_info *p)
+ {
+ free_rb_tree_fname(&p->root);
++ ext3_iam_release_info((void *)p);
+ kfree(p);
+ }
+
+Index: iam/fs/ext3/file.c
+===================================================================
+--- iam.orig/fs/ext3/file.c
++++ iam/fs/ext3/file.c
+@@ -23,6 +23,7 @@
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/lustre_iam.h>
+ #include "xattr.h"
+ #include "acl.h"
+
+@@ -31,14 +32,18 @@
+ * from ext3_file_open: open gets called at every open, but release
+ * gets called only when /all/ the files are closed.
+ */
+-static int ext3_release_file (struct inode * inode, struct file * filp)
++static int ext3_release_file(struct inode * inode, struct file * filp)
+ {
+ /* if we are the last writer on the inode, drop the block reservation */
+ if ((filp->f_mode & FMODE_WRITE) &&
+ (atomic_read(&inode->i_writecount) == 1))
+ ext3_discard_reservation(inode);
+- if (is_dx(inode) && filp->private_data)
++ if (is_dx(inode) && filp->private_data) {
++ if (S_ISDIR(inode->i_mode))
+ ext3_htree_free_dir_info(filp->private_data);
++ else
++ ext3_iam_release(filp, inode);
++ }
+
+ return 0;
+ }
+Index: iam/fs/ext3/iam-uapi.c
+===================================================================
+--- iam.orig/fs/ext3/iam-uapi.c
++++ iam/fs/ext3/iam-uapi.c
+@@ -0,0 +1,368 @@
++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++ * vim:expandtab:shiftwidth=8:tabstop=8:
++ *
++ * iam_uapi.c
++ * User-level interface to iam (ioctl based)
++ *
++ * Copyright (c) 2006 Cluster File Systems, Inc.
++ * Author: Nikita Danilov <nikita@clusterfs.com>
++ *
++ * This file is part of the Lustre file system, http://www.lustre.org
++ * Lustre is a trademark of Cluster File Systems, Inc.
++ *
++ * You may have signed or agreed to another license before downloading
++ * this software. If so, you are bound by the terms and conditions
++ * of that agreement, and the following does not apply to you. See the
++ * LICENSE file included with this distribution for more information.
++ *
++ * If you did not agree to a different license, then this copy of Lustre
++ * is open source software; you can redistribute it and/or modify it
++ * under the terms of version 2 of the GNU General Public License as
++ * published by the Free Software Foundation.
++ *
++ * In either case, Lustre is distributed in the hope that it will be
++ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
++ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * license text for more details.
++ */
++
++#include <linux/types.h>
++#include <linux/jbd.h>
++/* ext3_error() */
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++
++#include <linux/lustre_iam.h>
++
++#include <libcfs/libcfs.h>
++#include <libcfs/kp30.h>
++
++struct iam_private_info {
++ struct dir_private_info ipi_dir; /* has to be first */
++ struct iam_container ipi_bag;
++ struct iam_descr ipi_descr;
++ struct iam_iterator ipi_it;
++ struct iam_path_descr *ipi_ipd;
++ char ipi_ipd_area[DX_IPD_MAX_SIZE];
++};
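++
++/*
++ * ipi_dir has to be first so that this structure can be handed to the
++ * generic ext3 directory code (see ext3_iam_alloc_info() usage in
++ * fs/ext3/dir.c) as a plain struct dir_private_info.
++ */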
++
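++/*
++ * Journal credits reserved for a single insert or delete; presumably
++ * sized to cover the blocks dirtied by a worst-case leaf/index split.
++ */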
++enum {
++ IAM_INSERT_CREDITS = 20
++};
++
++static struct iam_private_info *get_ipi(struct file *filp)
++{
++ return filp->private_data;
++}
++
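++/*
++ * Iterator ioctls: IAM_IOC_IT_START attaches the iterator at the key
++ * supplied by user space, IAM_IOC_IT_NEXT advances it, and
++ * IAM_IOC_IT_STOP detaches it. The key (and, when a record is
++ * available, the record) at the resulting position is copied back
++ * through @itop.
++ */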
++static int iam_uapi_it(int cmd, struct inode *inode,
++ struct file *filp, struct iam_uapi_it *itop)
++{
++ struct iam_private_info *ipi;
++ struct iam_iterator *it;
++ enum iam_it_state st;
++ int result = 0;
++
++ ipi = get_ipi(filp);
++ it = &ipi->ipi_it;
++ st = it->ii_state;
++ switch (cmd) {
++ case IAM_IOC_IT_START:
++ result = iam_it_init(it, &ipi->ipi_bag,
++ IAM_IT_MOVE, ipi->ipi_ipd);
++ if (result == 0)
++ result = iam_it_get(it, itop->iui_op.iul_key);
++ break;
++ case IAM_IOC_IT_NEXT:
++ if (st == IAM_IT_ATTACHED || st == IAM_IT_SKEWED)
++ result = iam_it_next(it);
++ else
++ result = -EBUSY;
++ break;
++ case IAM_IOC_IT_STOP:
++ iam_it_put(it);
++ iam_it_fini(it);
++ result = 0;
++ break;
++ }
++ st = it->ii_state;
++ if (st == IAM_IT_ATTACHED || st == IAM_IT_SKEWED)
++ memcpy(itop->iui_op.iul_key, iam_it_key_get(it),
++ iam_it_key_size(it));
++ if (st == IAM_IT_ATTACHED)
++ iam_reccpy(&it->ii_path.ip_leaf, itop->iui_op.iul_rec);
++ itop->iui_state = st;
++ return result;
++}
++
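++/*
++ * Record-level operations: IAM_IOC_INSERT and IAM_IOC_DELETE run under
++ * a freshly started journal handle with IAM_INSERT_CREDITS credits;
++ * IAM_IOC_LOOKUP (the remaining case) needs no transaction.
++ */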
++static int iam_uapi_op(int cmd, struct inode *inode,
++ struct file *filp, struct iam_uapi_op *op)
++{
++ int result;
++ struct iam_private_info *ipi;
++
++ ipi = get_ipi(filp);
++ if (cmd == IAM_IOC_INSERT || cmd == IAM_IOC_DELETE) {
++ handle_t *h;
++
++ h = ext3_journal_start(inode, IAM_INSERT_CREDITS);
++ if (!IS_ERR(h)) {
++ if (cmd == IAM_IOC_INSERT)
++ result = iam_insert(h, &ipi->ipi_bag,
++ op->iul_key,
++ op->iul_rec, ipi->ipi_ipd);
++ else
++ result = iam_delete(h, &ipi->ipi_bag,
++ op->iul_key, ipi->ipi_ipd);
++ ext3_journal_stop(h);
++ } else {
++ result = PTR_ERR(h);
++ ext3_std_error(inode->i_sb, result);
++ }
++ } else
++ result = iam_lookup(&ipi->ipi_bag, op->iul_key,
++ op->iul_rec, ipi->ipi_ipd);
++ return result;
++}
++
++struct iam_private_info *ext3_iam_alloc_info(int flags)
++{
++ struct iam_private_info *info;
++
++ info = kmalloc(sizeof *info, flags);
++ if (info != NULL)
++ memset(info, 0, sizeof *info);
++ return info;
++}
++
++void ext3_iam_release_info(struct iam_private_info *info)
++{
++ iam_it_put(&info->ipi_it);
++ iam_it_fini(&info->ipi_it);
++ if (info->ipi_ipd != NULL)
++ info->ipi_bag.ic_descr->id_ops->id_ipd_free(info->ipi_ipd);
++ iam_container_fini(&info->ipi_bag);
++}
++
++void ext3_iam_release(struct file *filp, struct inode *inode)
++{
++ struct iam_private_info *info;
++
++ info = filp->private_data;
++ ext3_iam_release_info(info);
++
++ kfree(info);
++ EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
++}
++
++static int iam_uapi_init(struct inode *inode,
++ struct file *filp, struct iam_uapi_info *ua)
++{
++ int result;
++ struct iam_private_info *info;
++
++ info = ext3_iam_alloc_info(GFP_KERNEL);
++ if (info != NULL) {
++ struct iam_container *bag;
++ struct iam_descr *des;
++
++ bag = &info->ipi_bag;
++ des = &info->ipi_descr;
++ result = iam_container_init(bag, des, inode);
++ if (result == 0) {
++ result = iam_container_setup(bag);
++ if (result == 0) {
++ /*
++ * Container setup might change ->ic_descr
++ */
++ des = bag->ic_descr;
++ info->ipi_ipd = des->id_ops->
++ id_ipd_alloc(bag, info->ipi_ipd_area);
++ if (info->ipi_ipd != NULL) {
++ filp->private_data = info;
++ EXT3_I(inode)->i_flags |= EXT3_INDEX_FL;
++ } else
++ result = -ENOMEM;
++ }
++ }
++ } else
++ result = -ENOMEM;
++ return result;
++}
++
++
++static int getua(struct iam_uapi_info *ua, unsigned long arg)
++{
++ if (copy_from_user(ua, (struct iam_uapi_info __user *)arg, sizeof *ua))
++ return -EFAULT;
++ else
++ return 0;
++}
++
++static int putua(struct iam_uapi_info *ua, unsigned long arg)
++{
++ if (copy_to_user((struct iam_uapi_info __user *)arg, ua, sizeof *ua))
++ return -EFAULT;
++ else
++ return 0;
++}
++
++enum outop_t {
++ KEY = 1 << 0,
++ REC = 1 << 1,
++ STATE = 1 << 2
++};
++
++static int outop(struct iam_uapi_op *op, struct iam_uapi_op *uop,
++ struct iam_descr *des, enum outop_t opt)
++{
++ int result;
++
++ if (((opt & REC) && copy_to_user((void __user *)uop->iul_rec,
++ op->iul_rec, des->id_rec_size)) ||
++ ((opt & KEY) && copy_to_user((void __user *)uop->iul_key,
++ op->iul_key, des->id_key_size)))
++ result = -EFAULT;
++ else
++ result = 0;
++ return result;
++}
++
++static void putop(struct iam_uapi_op *op)
++{
++ kfree(op->iul_key);
++ kfree(op->iul_rec);
++}
++
++static int getop(struct iam_uapi_op *op, struct iam_uapi_op *uop,
++ struct iam_descr *des, unsigned long arg)
++{
++ int result;
++ int ks;
++ int rs;
++
++ ks = des->id_key_size;
++ rs = des->id_rec_size;
++ op->iul_key = kmalloc(ks, GFP_KERNEL);
++ op->iul_rec = kmalloc(rs, GFP_KERNEL);
++ if (!copy_from_user(uop,
++ (struct iam_uapi_op __user *)arg, sizeof *uop) &&
++ op->iul_key != NULL && op->iul_rec != NULL &&
++ !copy_from_user(op->iul_key, (void __user *)uop->iul_key, ks) &&
++ !copy_from_user(op->iul_rec, (void __user *)uop->iul_rec, rs))
++ result = 0;
++ else {
++ result = -EFAULT;
++ putop(op);
++ }
++ return result;
++}
++
++static int outit(struct iam_uapi_it *it, struct iam_uapi_it *uit,
++ struct iam_descr *des, enum outop_t opt, unsigned long arg)
++{
++ int result;
++
++ result = outop(&it->iui_op, &uit->iui_op, des, opt);
++ if (result == 0 && (opt&STATE))
++ result = put_user(it->iui_state, (int __user *) arg);
++ return result;
++}
++
++static void putit(struct iam_uapi_it *it)
++{
++ putop(&it->iui_op);
++}
++
++static int getit(struct iam_uapi_it *it, struct iam_uapi_it *uit,
++ struct iam_descr *des, unsigned long arg)
++{
++ return getop(&it->iui_op, &uit->iui_op, des,
++ (unsigned long)&((struct iam_uapi_it *)arg)->iui_op);
++}
++
++int iam_uapi_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
++ unsigned long arg)
++{
++ int result;
++ struct iam_uapi_info ua;
++ struct iam_uapi_op uop;
++ struct iam_uapi_op op;
++ struct iam_uapi_it uit;
++ struct iam_uapi_it it;
++ enum outop_t opt;
++
++ if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) {
++ result = -EACCES;
++ } else if (cmd == IAM_IOC_POLYMORPH) {
++ /*
++ * If polymorphing into directory, increase hard-link count.
++ */
++ if (S_ISDIR((umode_t)arg) && !S_ISDIR(inode->i_mode))
++ inode->i_nlink++;
++ else if (!S_ISDIR((umode_t)arg) && S_ISDIR(inode->i_mode))
++ inode->i_nlink--;
++ inode->i_mode = (umode_t)arg;
++ mark_inode_dirty(inode);
++ result = 0;
++ } else if (cmd == IAM_IOC_INIT) {
++ if (filp->private_data == NULL) {
++ result = getua(&ua, arg);
++ if (result == 0)
++ result = iam_uapi_init(inode, filp, &ua);
++ } else
++ result = -EBUSY;
++ } else if (is_dx(inode) && filp->private_data != NULL) {
++ struct iam_descr *des;
++
++ switch (cmd) {
++ case IAM_IOC_IT_START:
++ case IAM_IOC_IT_NEXT:
++ opt = KEY|REC|STATE;
++ break;
++ case IAM_IOC_LOOKUP:
++ opt = REC;
++ break;
++ default:
++ opt = 0;
++ break;
++ }
++
++ des = get_ipi(filp)->ipi_bag.ic_descr;
++ if (cmd == IAM_IOC_GETINFO) {
++ ua.iui_keysize = des->id_key_size;
++ ua.iui_recsize = des->id_rec_size;
++ ua.iui_ptrsize = des->id_ptr_size;
++ ua.iui_height = 0; /* not yet */
++ memcpy(ua.iui_fmt_name, des->id_ops->id_name,
++ ARRAY_SIZE(ua.iui_fmt_name));
++ result = putua(&ua, arg);
++ } else if (cmd == IAM_IOC_INSERT || cmd == IAM_IOC_LOOKUP ||
++ cmd == IAM_IOC_DELETE) {
++ result = getop(&op, &uop, des, arg);
++ if (result == 0) {
++ int res2;
++ result = iam_uapi_op(cmd, inode, filp, &op);
++
++ res2 = outop(&op, &uop, des, opt);
++ result = result ? : res2;
++ putop(&op);
++ }
++ } else if (cmd == IAM_IOC_IT_START || cmd == IAM_IOC_IT_NEXT ||
++ cmd == IAM_IOC_IT_STOP) {
++ result = getit(&it, &uit, des, arg);
++ if (result == 0) {
++ int res2;
++
++ result = iam_uapi_it(cmd, inode, filp, &it);
++
++ res2 = outit(&it, &uit, des, opt, arg);
++ result = result ? : res2;
++ putit(&it);
++ }
++ } else
++ result = -EINVAL;
++ } else
++ result = -ENOENT;
++ return result;
++}
+Index: iam/fs/ext3/ioctl.c
+===================================================================
+--- iam.orig/fs/ext3/ioctl.c
++++ iam/fs/ext3/ioctl.c
+@@ -250,6 +250,6 @@ flags_err:
+
+
+ default:
+- return -ENOTTY;
++ return iam_uapi_ioctl(inode, filp, cmd, arg);
+ }
+ }
+Index: iam/include/linux/lustre_iam.h
+===================================================================
+--- iam.orig/include/linux/lustre_iam.h
++++ iam/include/linux/lustre_iam.h
+@@ -30,9 +30,6 @@
+ #ifndef __LINUX_LUSTRE_IAM_H__
+ #define __LINUX_LUSTRE_IAM_H__
+
+-/* handle_t, journal_start(), journal_stop() */
+-#include <linux/jbd.h>
+-
+ /*
+ * linux/include/linux/lustre_iam.h
+ */
+@@ -57,14 +54,95 @@ enum {
+ * [2] reserved for leaf node operations.
+ *
+ * [3] reserved for index operations.
++ *
++ * [4] reserved for path->ip_ikey_target
++ *
+ */
+- DX_SCRATCH_KEYS = 4,
++ DX_SCRATCH_KEYS = 5,
+ /*
+ * Maximal format name length.
+ */
+ DX_FMT_NAME_LEN = 16
+ };
+
++#ifdef __KERNEL__
++/* handle_t, journal_start(), journal_stop() */
++#include <linux/jbd.h>
++
++/*
++ * Debugging.
++ *
++ * Various debugging levels.
++ */
++
++#if 0
++/*
++ * Following macros are defined in config.h and are tunable through
++ * appropriate configure switches (indicated below).
++ */
++
++/*
++ * Compile basic assertions in. You want this most of the time.
++ *
++ * --{enable,disable}-ldiskfs-assert (on by default).
++ */
++#define EXT3_ASSERT (1)
++
++/*
++ * Compile heavier correctness checks in. You want this during development
++ * cycle.
++ *
++ * --{enable,disable}-ldiskfs-correctness (off by default).
++ */
++#define EXT3_CORRECTNESS (1)
++
++/*
++ * Compile heavy invariant checking in. You want this early during development
++ * or when chasing a bug.
++ *
++ * --{enable,disable}-ldiskfs-invariant (off by default).
++ */
++#define EXT3_INVARIANT (1)
++#endif
++
++#if defined(EXT3_ASSERT)
++#define EXT3_ASSERT_ON (1)
++#else
++#define EXT3_ASSERT_ON (0)
++#endif
++
++#if defined(EXT3_CORRECTNESS)
++#define EXT3_CORRECTNESS_ON (1)
++#else
++#define EXT3_CORRECTNESS_ON (0)
++#endif
++
++#if defined(EXT3_INVARIANT)
++#define EXT3_INVARIANT_ON (1)
++#else
++#define EXT3_INVARIANT_ON (0)
++#endif
++
++#ifndef assert
++#if EXT3_ASSERT_ON
++#define assert(test) J_ASSERT(test)
++#else
++#define assert(test) ((void)(test))
++#endif
++#endif
++
++#if EXT3_CORRECTNESS_ON
++#define assert_corr(test) J_ASSERT(test)
++#else
++#define assert_corr(test) do {;} while (0)
++#endif
++
++#if EXT3_INVARIANT_ON
++#define assert_inv(test) J_ASSERT(test)
++#else
++#define assert_inv(test) do {;} while (0)
++#endif
++
+ /*
+ * Entry within index tree node. Consists of a key immediately followed
+ * (without padding) by a pointer to the child node.
+@@ -86,14 +164,21 @@ struct iam_entry_compat {
+ */
+ struct iam_key;
+
+-/* Incomplete type use to refer to the records stored in iam containers. */
++/*
++ * Incomplete type used to refer to the records stored in iam containers.
++ */
+ struct iam_rec;
+
+-struct iam_cookie {
+- struct iam_key *ic_key;
+- struct iam_rec *ic_rec;
+-};
++/*
++ * Key in index node. Possibly compressed. Fixed size.
++ */
++struct iam_ikey;
+
++/*
++ * Scalar type into which certain iam_key's can be uniquely mapped. Used to
++ * support interfaces like readdir(), where iteration over index has to be
++ * re-startable.
++ */
+ typedef __u64 iam_ptr_t;
+
+ /*
+@@ -123,6 +208,31 @@ struct iam_leaf {
+ void *il_descr_data;
+ };
+
++/*
++ * Return values of ->lookup() operation from struct iam_leaf_operations.
++ */
++enum iam_lookup_t {
++ /*
++ * lookup found a record with the key requested
++ */
++ IAM_LOOKUP_EXACT,
++ /*
++ * lookup positioned leaf on some record
++ */
++ IAM_LOOKUP_OK,
++ /*
++ * leaf was empty
++ */
++ IAM_LOOKUP_EMPTY,
++ /*
++ * lookup positioned leaf before first record
++ */
++ IAM_LOOKUP_BEFORE
++};
++
++/*
++ * Format-specific container operations. These are called by generic iam code.
++ */
+ struct iam_operations {
+ /*
+ * Returns pointer (in the same sense as pointer in index entry) to
+@@ -131,11 +241,15 @@ struct iam_operations {
+ __u32 (*id_root_ptr)(struct iam_container *c);
+
+ /*
+- * Check validity and consistency of index node. This is called when
+- * iam just loaded new node into frame.
++ * Check validity and consistency of index node.
+ */
+ int (*id_node_check)(struct iam_path *path, struct iam_frame *frame);
+ /*
++ * Copy some data from node header into frame. This is called when
++ * new node is loaded into frame.
++ */
++ int (*id_node_load)(struct iam_path *path, struct iam_frame *frame);
++ /*
+ * Initialize new node (stored in @bh) that is going to be added into
+ * tree.
+ */
+@@ -144,23 +258,33 @@ struct iam_operations {
+ int (*id_node_read)(struct iam_container *c, iam_ptr_t ptr,
+ handle_t *h, struct buffer_head **bh);
+ /*
+- * Key comparison function. Returns -1, 0, +1.
++ * Key comparison functions. Returns -1, 0, +1.
+ */
+- int (*id_keycmp)(const struct iam_container *c,
+- const struct iam_key *k1, const struct iam_key *k2);
++ int (*id_ikeycmp)(const struct iam_container *c,
++ const struct iam_ikey *k1,
++ const struct iam_ikey *k2);
+ /*
+- * Create new container.
+- *
+- * Newly created container has a root node and a single leaf. Leaf
+- * contains single record with the smallest possible key.
++ * Modify root node when tree height increases.
+ */
+- int (*id_create)(struct iam_container *c);
++ struct iam_entry *(*id_root_inc)(struct iam_container *c,
++ struct iam_path *path,
++ struct iam_frame *frame);
++
++ struct iam_path_descr *(*id_ipd_alloc)(const struct iam_container *c);
++ void (*id_ipd_free)(const struct iam_container *c,
++ struct iam_path_descr *ipd);
+ /*
+ * Format name.
+ */
+ char id_name[DX_FMT_NAME_LEN];
+ };
+
++/*
++ * Another format-specific operation vector, consisting of methods to access
++ * leaf nodes. This is separated from struct iam_operations, because it is
++ * assumed that there will be many formats with differing leaf node
++ * layouts that nevertheless share the same struct iam_operations.
++ */
+ struct iam_leaf_operations {
+ /*
+ * leaf operations.
+@@ -186,7 +310,8 @@ struct iam_leaf_operations {
+ void (*start)(struct iam_leaf *l);
+ /* more leaf to the next entry. */
+ void (*next)(struct iam_leaf *l);
+- /* return key of current leaf record. This method may return
++ /*
++ * return key of current leaf record. This method may return
+ * either pointer to the key stored in node, or copy key into
+ * @k buffer supplied by caller and return pointer to this
+ * buffer. The latter approach is used when keys in nodes are
+@@ -194,8 +319,10 @@ struct iam_leaf_operations {
+ * all).
+ *
+ * Caller should assume that returned pointer is only valid
+- * while leaf node is pinned and locked.*/
+- struct iam_key *(*key)(const struct iam_leaf *l, struct iam_key *k);
++ * while leaf node is pinned and locked.
++ */
++ struct iam_ikey *(*ikey)(const struct iam_leaf *l, struct iam_ikey *k);
++ struct iam_key *(*key)(const struct iam_leaf *l);
+ /* return pointer to entry body. Pointer is valid while
+ corresponding leaf node is locked and pinned. */
+ struct iam_rec *(*rec)(const struct iam_leaf *l);
+@@ -203,6 +330,9 @@ struct iam_leaf_operations {
+ void (*key_set)(struct iam_leaf *l, const struct iam_key *k);
+ void (*rec_set)(struct iam_leaf *l, const struct iam_rec *r);
+
++ int (*key_cmp)(const struct iam_leaf *l, const struct iam_key *k);
++
++ int (*key_size)(const struct iam_leaf *l);
+ /*
+ * Search leaf @l for a record with key @k or for a place
+ * where such record is to be inserted.
+@@ -210,6 +340,7 @@ struct iam_leaf_operations {
+ * Scratch keys from @path can be used.
+ */
+ int (*lookup)(struct iam_leaf *l, const struct iam_key *k);
++ int (*ilookup)(struct iam_leaf *l, const struct iam_ikey *ik);
+
+ int (*can_add)(const struct iam_leaf *l,
+ const struct iam_key *k, const struct iam_rec *r);
+@@ -221,17 +352,15 @@ struct iam_leaf_operations {
+ /*
+ * remove rec for a leaf
+ */
+- void (*rec_del)(struct iam_leaf *l);
++ void (*rec_del)(struct iam_leaf *l, int shift);
+ /*
+ * split leaf node, moving some entries into @bh (the latter currently
+ * is assumed to be empty).
+ */
+- void (*split)(struct iam_leaf *l, struct buffer_head *bh);
++ void (*split)(struct iam_leaf *l, struct buffer_head **bh,
++ iam_ptr_t newblknr);
+ };
+
+-struct iam_path *iam_leaf_path(const struct iam_leaf *leaf);
+-struct iam_container *iam_leaf_container(const struct iam_leaf *leaf);
+-
+ /*
+ * Parameters, describing a flavor of iam container.
+ */
+@@ -241,6 +370,10 @@ struct iam_descr {
+ */
+ size_t id_key_size;
+ /*
++ * Size of a key in index nodes, in bytes.
++ */
++ size_t id_ikey_size;
++ /*
+ * Size of a pointer to the next level (stored in index nodes), in
+ * bytes.
+ */
+@@ -264,6 +397,9 @@ struct iam_descr {
+ struct iam_leaf_operations *id_leaf_ops;
+ };
+
++/*
++ * An instance of iam container.
++ */
+ struct iam_container {
+ /*
+ * Underlying flat file. IO against this object is issued to
+@@ -274,6 +410,10 @@ struct iam_container {
+ * container flavor.
+ */
+ struct iam_descr *ic_descr;
++ /*
++ * read-write lock protecting index consistency.
++ */
++ struct rw_semaphore ic_sem;
+ };
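A hedged sketch of how ic_sem is meant to be used, per the comment above (the helper and the exact discipline are assumptions, not taken from the patch): readers of the index take it shared, tree-modifying operations take it exclusive.

/* illustrative only: bracket a read-only traversal with ic_sem */
static void iam_container_read_lock_example(struct iam_container *c)
{
        down_read(&c->ic_sem);
        /* ... look up keys; index shape cannot change here ... */
        up_read(&c->ic_sem);
        /* a writer would use down_write()/up_write() instead */
}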
+
+ /*
+@@ -284,7 +424,7 @@ struct iam_path_descr {
+ /*
+ * Scratch-pad area for temporary keys.
+ */
+- struct iam_key *ipd_key_scratch[DX_SCRATCH_KEYS];
++ struct iam_ikey *ipd_key_scratch[DX_SCRATCH_KEYS];
+ };
+
+ /*
+@@ -316,6 +456,7 @@ struct iam_path {
+ * Key searched for.
+ */
+ const struct iam_key *ip_key_target;
++ const struct iam_ikey *ip_ikey_target;
+ /*
+ * Description-specific data.
+ */
+@@ -334,6 +475,7 @@ struct iam_path_compat {
+ struct dx_hash_info *ipc_hinfo;
+ struct dentry *ipc_dentry;
+ struct iam_path_descr ipc_descr;
++ struct dx_hash_info ipc_hinfo_area;
+ };
+
+ /*
+@@ -347,7 +489,9 @@ enum iam_it_state {
+ /* initial state */
+ IAM_IT_DETACHED,
+ /* iterator is above particular record in the container */
+- IAM_IT_ATTACHED
++ IAM_IT_ATTACHED,
++ /* iterator is positioned before record */
++ IAM_IT_SKEWED
+ };
+
+ /*
+@@ -355,7 +499,7 @@ enum iam_it_state {
+ */
+ enum iam_it_flags {
+ /*
+- * this iterator will move (iam_it_{prev,next}() will be called on it)
++ * this iterator will move (iam_it_next() will be called on it)
+ */
+ IAM_IT_MOVE = (1 << 0),
+ /*
+@@ -372,15 +516,26 @@ enum iam_it_flags {
+ * doesn't point to any particular record in this container.
+ *
+ * After successful call to iam_it_get() and until corresponding call to
+- * iam_it_put() iterator is in "attached" state (IAM_IT_ATTACHED).
++ * iam_it_put() iterator is in one of "active" states: IAM_IT_ATTACHED or
++ * IAM_IT_SKEWED.
+ *
+- * Attached iterator can move through records in a container (provided
++ * Active iterator can move through records in a container (provided
+ * IAM_IT_MOVE permission) in a key order, can get record and key values as it
+ * passes over them, and can modify container (provided IAM_IT_WRITE
+ * permission).
+ *
++ * Iteration may reach the end of container, at which point iterator switches
++ * into IAM_IT_DETACHED state.
++ *
+ * Concurrency: iterators are supposed to be local to thread. Interfaces below
+- * do no internal serialization.
++ * do no internal serialization of access to the iterator fields.
++ *
++ * When in non-detached state, iterator keeps some container nodes pinned in
++ * memory and locked (that locking may be implemented at the container
++ * granularity though). In particular, clients may assume that pointers to
++ * records and keys obtained through the iterator interface are valid until
++ * the iterator is detached (except that they may be invalidated by
++ * subsequent operations done through the same iterator).
+ *
+ */
+ struct iam_iterator {
+@@ -390,7 +545,8 @@ struct iam_iterator {
+ __u32 ii_flags;
+ enum iam_it_state ii_state;
+ /*
+- * path to the record. Valid in IAM_IT_ATTACHED state.
++ * path to the record. Valid in IAM_IT_ATTACHED and IAM_IT_SKEWED
++ * states.
+ */
+ struct iam_path ii_path;
+ };
+@@ -405,133 +561,26 @@ void iam_path_compat_fini(struct iam_pat
+ struct iam_path_descr *iam_ipd_alloc(void *area, int keysize);
+ void iam_ipd_free(struct iam_path_descr *ipd);
+
+-/*
+- * Initialize iterator to IAM_IT_DETACHED state.
+- *
+- * postcondition: it_state(it) == IAM_IT_DETACHED
+- */
+ int iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags,
+ struct iam_path_descr *pd);
+-/*
+- * Finalize iterator and release all resources.
+- *
+- * precondition: it_state(it) == IAM_IT_DETACHED
+- */
+ void iam_it_fini(struct iam_iterator *it);
+-
+-/*
+- * Attach iterator. After successful completion, @it points to record with the
+- * largest key not larger than @k. Semantics of ->id_create() method guarantee
+- * that such record will always be found.
+- *
+- * Return value: 0: positioned on existing record,
+- * -ve: error.
+- *
+- * precondition: it_state(it) == IAM_IT_DETACHED
+- * postcondition: ergo(result == 0,
+- * (it_state(it) == IAM_IT_ATTACHED &&
+- * it_keycmp(it, iam_it_key_get(it, *), k) < 0))
+- */
+ int iam_it_get(struct iam_iterator *it, const struct iam_key *k);
+-
+-/*
+- * Duplicates iterator.
+- *
+- * postcondition: it_state(dst) == it_state(src) &&
+- * iam_it_container(dst) == iam_it_container(src) &&
+- * dst->ii_flags = src->ii_flags &&
+- * ergo(it_state(it) == IAM_IT_ATTACHED,
+- * iam_it_rec_get(dst) == iam_it_rec_get(src) &&
+- * iam_it_key_get(dst, *1) == iam_it_key_get(src, *2))
+- */
++int iam_it_get_at(struct iam_iterator *it, const struct iam_key *k);
+ void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src);
+-
+-/*
+- * Detach iterator. Does nothing it detached state.
+- *
+- * postcondition: it_state(it) == IAM_IT_DETACHED
+- */
+ void iam_it_put(struct iam_iterator *it);
+-
+-/*
+- * Move iterator one record right.
+- *
+- * Return value: 0: success,
+- * +1: end of container reached
+- * -ve: error
+- *
+- * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_MOVE
+- * postcondition: ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED)
+- */
+ int iam_it_next(struct iam_iterator *it);
+-
+-/*
+- * Return pointer to the record under iterator.
+- *
+- * precondition: it_state(it) == IAM_IT_ATTACHED
+- * postcondition: it_state(it) == IAM_IT_ATTACHED
+- */
+ struct iam_rec *iam_it_rec_get(const struct iam_iterator *it);
+-
+-/*
+- * Replace contents of record under iterator.
+- *
+- * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
+- * postcondition: it_state(it) == IAM_IT_ATTACHED &&
+- * ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...))
+- */
+-int iam_it_rec_set(handle_t *h, struct iam_iterator *it, struct iam_rec *r);
+-
+-/*
+- * Place key under iterator in @k, return @k
+- *
+- * precondition: it_state(it) == IAM_IT_ATTACHED
+- * postcondition: it_state(it) == IAM_IT_ATTACHED
+- */
+-struct iam_key *iam_it_key_get(const struct iam_iterator *it,
+- struct iam_key *k);
+-
+-/*
+- * Insert new record with key @k and contents from @r, shifting records to the
+- * right.
+- *
+- * precondition: it_state(it) == IAM_IT_ATTACHED &&
+- * it->ii_flags&IAM_IT_WRITE &&
+- * it_keycmp(it, iam_it_key_get(it, *), k) < 0
+- * postcondition: it_state(it) == IAM_IT_ATTACHED &&
+- * ergo(result == 0,
+- * it_keycmp(it, iam_it_key_get(it, *), k) == 0 &&
+- * !memcmp(iam_it_rec_get(it), r, ...))
+- */
++int iam_it_rec_set(handle_t *h,
++ struct iam_iterator *it, const struct iam_rec *r);
++struct iam_key *iam_it_key_get(const struct iam_iterator *it);
++int iam_it_key_size(const struct iam_iterator *it);
+ int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
+ const struct iam_key *k, const struct iam_rec *r);
+-/*
+- * Delete record under iterator.
+- *
+- * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
+- * postcondition: it_state(it) == IAM_IT_ATTACHED
+- */
+ int iam_it_rec_delete(handle_t *h, struct iam_iterator *it);
+
+ typedef __u64 iam_pos_t;
+
+-/*
+- * Convert iterator to cookie.
+- *
+- * precondition: it_state(it) == IAM_IT_ATTACHED &&
+- * path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t)
+- * postcondition: it_state(it) == IAM_IT_ATTACHED
+- */
+ iam_pos_t iam_it_store(const struct iam_iterator *it);
+-
+-/*
+- * Restore iterator from cookie.
+- *
+- * precondition: it_state(it) == IAM_IT_DETACHED && it->ii_flags&IAM_IT_MOVE &&
+- * path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t)
+- * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED &&
+- * iam_it_store(it) == pos)
+- */
+ int iam_it_load(struct iam_iterator *it, iam_pos_t pos);
+
+ int iam_lookup(struct iam_container *c, const struct iam_key *k,
+@@ -539,10 +588,10 @@ int iam_lookup(struct iam_container *c,
+ int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k,
+ struct iam_path_descr *pd);
+ int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k,
+- struct iam_rec *r, struct iam_path_descr *pd);
++ const struct iam_rec *r, struct iam_path_descr *pd);
+ int iam_insert(handle_t *handle, struct iam_container *c,
+ const struct iam_key *k,
+- struct iam_rec *r, struct iam_path_descr *pd);
++ const struct iam_rec *r, struct iam_path_descr *pd);
+ /*
+ * Initialize container @c.
+ */
+@@ -558,10 +607,6 @@ void iam_container_fini(struct iam_conta
+ */
+ int iam_container_setup(struct iam_container *c);
+
+-#ifndef assert
+-#define assert(test) J_ASSERT(test)
+-#endif
+-
+ static inline struct iam_descr *iam_container_descr(struct iam_container *c)
+ {
+ return c->ic_descr;
+@@ -577,16 +622,65 @@ static inline struct inode *iam_path_obj
+ return p->ip_container->ic_object;
+ }
+
+-static inline void iam_keycpy(const struct iam_container *c,
+- struct iam_key *k1, const struct iam_key *k2)
++static inline void iam_ikeycpy(const struct iam_container *c,
++ struct iam_ikey *k1, const struct iam_ikey *k2)
++{
++ memcpy(k1, k2, c->ic_descr->id_ikey_size);
++}
++
++static inline size_t iam_entry_size(struct iam_path *p)
++{
++ return iam_path_descr(p)->id_ikey_size + iam_path_descr(p)->id_ptr_size;
++}
++
++static inline struct iam_entry *iam_entry_shift(struct iam_path *p,
++ struct iam_entry *entry,
++ int shift)
++{
++ void *e = entry;
++ return e + shift * iam_entry_size(p);
++}
++
++static inline struct iam_ikey *iam_get_ikey(struct iam_path *p,
++ struct iam_entry *entry,
++ struct iam_ikey *key)
++{
++ return memcpy(key, entry, iam_path_descr(p)->id_ikey_size);
++}
++
++static inline struct iam_ikey *iam_ikey_at(struct iam_path *p,
++ struct iam_entry *entry)
++{
++ return (struct iam_ikey *)entry;
++}
++
++static inline ptrdiff_t iam_entry_diff(struct iam_path *p,
++ struct iam_entry *e1,
++ struct iam_entry *e2)
++{
++ ptrdiff_t diff;
++
++ diff = (void *)e1 - (void *)e2;
++ assert_corr(diff / iam_entry_size(p) * iam_entry_size(p) == diff);
++ return diff / iam_entry_size(p);
++}
++
++/*
++ * Helper for the frequent case, where key was already placed into @k1 by
++ * callback.
++ */
++static inline void iam_ikeycpy0(const struct iam_container *c,
++ struct iam_ikey *k1, const struct iam_ikey *k2)
+ {
+- memcpy(k1, k2, c->ic_descr->id_key_size);
++ if (k1 != k2)
++ iam_ikeycpy(c, k1, k2);
+ }
+
+-static inline int iam_keycmp(const struct iam_container *c,
+- const struct iam_key *k1, const struct iam_key *k2)
++static inline int iam_ikeycmp(const struct iam_container *c,
++ const struct iam_ikey *k1,
++ const struct iam_ikey *k2)
+ {
+- return c->ic_descr->id_ops->id_keycmp(c, k1, k2);
++ return c->ic_descr->id_ops->id_ikeycmp(c, k1, k2);
+ }
+
+ static inline void iam_reccpy(const struct iam_path *p, struct iam_rec *rec_dst,
+@@ -600,11 +694,38 @@ static inline void *iam_entry_off(struct
+ return (void *)((char *)entry + off);
+ }
+
++/*
++ * Leaf helpers.
++ */
++
++static inline struct iam_path *iam_leaf_path(const struct iam_leaf *leaf)
++{
++ return leaf->il_path;
++}
++
++static inline struct iam_container *
++iam_leaf_container(const struct iam_leaf *leaf)
++{
++ return iam_leaf_path(leaf)->ip_container;
++}
++
++static inline struct iam_descr *iam_leaf_descr(const struct iam_leaf *leaf)
++{
++ return iam_leaf_container(leaf)->ic_descr;
++}
++
++static inline struct iam_leaf_operations *
++iam_leaf_ops(const struct iam_leaf *leaf)
++{
++ return iam_leaf_descr(leaf)->id_leaf_ops;
++}
++
++
+ /*XXX This stuff is put here just because it is used by iam.c and namei.c*/
+ static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry)
+ {
+ return le32_to_cpu(*(u32*)iam_entry_off(entry,
+- iam_path_descr(p)->id_key_size))
++ iam_path_descr(p)->id_ikey_size))
+ & 0x00ffffff;
+ }
+
+@@ -612,21 +733,64 @@ static inline void dx_set_block(struct i
+ struct iam_entry *entry, unsigned value)
+ {
+ *(u32*)iam_entry_off(entry,
+- iam_path_descr(p)->id_key_size) =
++ iam_path_descr(p)->id_ikey_size) =
+ cpu_to_le32(value);
+ }
+
+-static inline void dx_set_key(struct iam_path *p, struct iam_entry *entry,
+- const struct iam_key *key)
++static inline void dx_set_ikey(struct iam_path *p, struct iam_entry *entry,
++ const struct iam_ikey *key)
+ {
+- iam_keycpy(p->ip_container, iam_entry_off(entry, 0), key);
++ iam_ikeycpy(p->ip_container, iam_entry_off(entry, 0), key);
+ }
+
++struct dx_map_entry
++{
++ u32 hash;
++ u32 offs;
++};
++
++struct fake_dirent {
++ __le32 inode;
++ __le16 rec_len;
++ u8 name_len;
++ u8 file_type;
++};
++
+ struct dx_countlimit {
+ __le16 limit;
+ __le16 count;
+ };
+
++/*
++ * dx_root_info is laid out so that if it should somehow get overlaid by a
++ * dirent the two low bits of the hash version will be zero. Therefore, the
++ * hash version mod 4 should never be 0. Sincerely, the paranoia department.
++ */
++
++struct dx_root {
++ struct fake_dirent dot;
++ char dot_name[4];
++ struct fake_dirent dotdot;
++ char dotdot_name[4];
++ struct dx_root_info
++ {
++ __le32 reserved_zero;
++ u8 hash_version;
++ u8 info_length; /* 8 */
++ u8 indirect_levels;
++ u8 unused_flags;
++ }
++ info;
++ struct {} entries[0];
++};
++
++struct dx_node
++{
++ struct fake_dirent fake;
++ struct {} entries[0];
++};
++
++
+ static inline unsigned dx_get_count(struct iam_entry *entries)
+ {
+ return le16_to_cpu(((struct dx_countlimit *) entries)->count);
+@@ -647,9 +811,21 @@ static inline unsigned dx_node_limit(str
+ struct iam_descr *param = iam_path_descr(p);
+ unsigned entry_space = iam_path_obj(p)->i_sb->s_blocksize -
+ param->id_node_gap;
+- return entry_space / (param->id_key_size + param->id_ptr_size);
++ return entry_space / (param->id_ikey_size + param->id_ptr_size);
++}
++
++static inline unsigned dx_root_limit(struct iam_path *p)
++{
++ struct iam_descr *param = iam_path_descr(p);
++ unsigned limit = iam_path_obj(p)->i_sb->s_blocksize -
++ param->id_root_gap;
++ limit /= (param->id_ikey_size + param->id_ptr_size);
++ if (limit == dx_node_limit(p))
++ limit--;
++ return limit;
+ }
+
++
+ static inline struct iam_entry *dx_get_entries(struct iam_path *path,
+ void *data, int root)
+ {
+@@ -665,7 +841,8 @@ static inline struct iam_entry *dx_node_
+ frame->bh->b_data, frame == path->ip_frames);
+ }
+
+-static inline struct iam_key *iam_path_key(const struct iam_path *path, int nr)
++static inline struct iam_ikey *iam_path_ikey(const struct iam_path *path,
++ int nr)
+ {
+ assert(0 <= nr && nr < ARRAY_SIZE(path->ip_data->ipd_key_scratch));
+ return path->ip_data->ipd_key_scratch[nr];
+@@ -674,6 +851,7 @@ static inline struct iam_key *iam_path_k
+ int dx_lookup(struct iam_path *path);
+ void dx_insert_block(struct iam_path *path, struct iam_frame *frame,
+ u32 hash, u32 block);
++int dx_index_is_compat(struct iam_path *path);
+
+ int ext3_htree_next_block(struct inode *dir, __u32 hash,
+ struct iam_path *path, __u32 *start_hash);
+@@ -681,6 +859,20 @@ int ext3_htree_next_block(struct inode *
+ struct buffer_head *ext3_append(handle_t *handle, struct inode *inode,
+ u32 *block, int *err);
+ int split_index_node(handle_t *handle, struct iam_path *path);
++struct ext3_dir_entry_2 *split_entry(struct inode *dir,
++ struct ext3_dir_entry_2 *de,
++ unsigned long ino, mode_t mode,
++ const char *name, int namelen);
++struct ext3_dir_entry_2 *find_insertion_point(struct inode *dir,
++ struct buffer_head *bh,
++ const char *name, int namelen);
++struct ext3_dir_entry_2 *move_entries(struct inode *dir,
++ struct dx_hash_info *hinfo,
++ struct buffer_head **bh1,
++ struct buffer_head **bh2,
++ __u32 *delim_hash);
++
++extern struct iam_descr iam_htree_compat_param;
+
+ /*
+ * external
+@@ -698,10 +890,12 @@ int iam_node_read(struct iam_container *
+ handle_t *handle, struct buffer_head **bh);
+
+ void iam_insert_key(struct iam_path *path, struct iam_frame *frame,
+- const struct iam_key *key, iam_ptr_t ptr);
++ const struct iam_ikey *key, iam_ptr_t ptr);
+
+ int iam_leaf_at_end(const struct iam_leaf *l);
+ void iam_leaf_next(struct iam_leaf *folio);
++int iam_leaf_can_add(const struct iam_leaf *l,
++ const struct iam_key *k, const struct iam_rec *r);
+
+ struct iam_path *iam_leaf_path(const struct iam_leaf *leaf);
+ struct iam_container *iam_leaf_container(const struct iam_leaf *leaf);
+@@ -709,14 +903,95 @@ struct iam_descr *iam_leaf_descr(const s
+ struct iam_leaf_operations *iam_leaf_ops(const struct iam_leaf *leaf);
+
+
++int iam_node_read(struct iam_container *c, iam_ptr_t ptr,
++ handle_t *h, struct buffer_head **bh);
++
++/*
++ * Container format.
++ */
+ struct iam_format {
++ /*
++ * Method called to recognize container format. Should return true iff
++ * container @c conforms to this format. This method may do IO to read
++ * container pages.
++ *
++ * If container is recognized, this method sets operation vectors
++ * ->id_ops and ->id_leaf_ops in container description (c->ic_descr),
++ * and fills other description fields.
++ */
+ int (*if_guess)(struct iam_container *c);
++ /*
++ * Linkage into global list of container formats.
++ */
+ struct list_head if_linkage;
+ };
+
+ void iam_format_register(struct iam_format *fmt);
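To make the registration contract concrete, a format module might plug in as follows (the iam_myfmt_* names are hypothetical; only struct iam_format and iam_format_register() come from this header):

/* illustrative only: a minimal format module */
static int iam_myfmt_guess(struct iam_container *c)
{
        /*
         * read the root block, check a magic value, and on success fill
         * c->ic_descr (->id_ops, ->id_leaf_ops, key/rec/ptr sizes)
         */
        return 0; /* 0: not recognized, in this stub */
}

static struct iam_format iam_myfmt_format = {
        .if_guess = iam_myfmt_guess,
};

static void iam_myfmt_format_init(void)
{
        iam_format_register(&iam_myfmt_format);
}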
+
+ void iam_lfix_format_init(void);
++void iam_lvar_format_init(void);
++void iam_htree_format_init(void);
++
++struct iam_private_info;
++
++void ext3_iam_release(struct file *filp, struct inode *inode);
++
++int iam_uapi_ioctl(struct inode * inode, struct file * filp, unsigned int cmd,
++ unsigned long arg);
++
++/* dir.c */
++#if EXT3_INVARIANT_ON
++extern int ext3_check_dir_entry(const char *, struct inode *,
++ struct ext3_dir_entry_2 *,
++ struct buffer_head *, unsigned long);
++#else
++static inline int ext3_check_dir_entry(const char * function,
++ struct inode * dir,
++ struct ext3_dir_entry_2 * de,
++ struct buffer_head * bh,
++ unsigned long offset)
++{
++ return 1;
++}
++#endif
++
++/* __KERNEL__ */
++#endif
++
++/*
++ * User level API. Copy exists in lustre/lustre/tests/iam_ut.c
++ */
++
++struct iam_uapi_info {
++ __u16 iui_keysize;
++ __u16 iui_recsize;
++ __u16 iui_ptrsize;
++ __u16 iui_height;
++ char iui_fmt_name[DX_FMT_NAME_LEN];
++};
++
++struct iam_uapi_op {
++ void *iul_key;
++ void *iul_rec;
++};
++
++struct iam_uapi_it {
++ struct iam_uapi_op iui_op;
++ __u16 iui_state;
++};
++
++enum iam_ioctl_cmd {
++ IAM_IOC_INIT = _IOW('i', 1, struct iam_uapi_info),
++ IAM_IOC_GETINFO = _IOR('i', 2, struct iam_uapi_info),
++ IAM_IOC_INSERT = _IOR('i', 3, struct iam_uapi_op),
++ IAM_IOC_LOOKUP = _IOWR('i', 4, struct iam_uapi_op),
++ IAM_IOC_DELETE = _IOR('i', 5, struct iam_uapi_op),
++ IAM_IOC_IT_START = _IOR('i', 6, struct iam_uapi_it),
++ IAM_IOC_IT_NEXT = _IOW('i', 7, struct iam_uapi_it),
++ IAM_IOC_IT_STOP = _IOR('i', 8, struct iam_uapi_it),
++
++ IAM_IOC_POLYMORPH = _IOR('i', 9, unsigned long)
++};
+
+ /* __LINUX_LUSTRE_IAM_H__ */
+ #endif
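Taken together, the iterator calls declared above combine into the following walk — a minimal sketch, not part of the patch; error handling is abbreviated and the container, path descriptor, and start key are assumed to be set up elsewhere:

/* illustrative only: visit records in key order, starting from @start */
static int iam_walk_example(struct iam_container *c,
                            struct iam_path_descr *pd,
                            const struct iam_key *start)
{
        struct iam_iterator it;
        int result;

        result = iam_it_init(&it, c, IAM_IT_MOVE, pd);
        if (result != 0)
                return result;

        result = iam_it_get(&it, start);
        while (result == 0) {
                struct iam_rec *rec;

                rec = iam_it_rec_get(&it);
                (void)rec; /* consume @rec: valid while the leaf is pinned */
                result = iam_it_next(&it); /* +1 at end of container */
        }
        iam_it_put(&it);
        iam_it_fini(&it);
        return result < 0 ? result : 0;
}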
--- /dev/null
+Index: iam/fs/ext3/super.c
+===================================================================
+--- iam.orig/fs/ext3/super.c
++++ iam/fs/ext3/super.c
+@@ -147,6 +147,8 @@ static void ext3_handle_error(struct sup
+ EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
+ es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
+
++ dump_stack();
++
+ if (sb->s_flags & MS_RDONLY)
+ return;
+
+@@ -1168,7 +1170,7 @@ static int ext3_check_descriptors (struc
+ * e2fsck was run on this filesystem, and it must have already done the orphan
+ * inode cleanup for us, so we can safely abort without any further action.
+ */
+-static void ext3_orphan_cleanup (struct super_block * sb,
++void ext3_orphan_cleanup (struct super_block * sb,
+ struct ext3_super_block * es)
+ {
+ unsigned int s_flags = sb->s_flags;
+@@ -1256,7 +1258,9 @@ static void ext3_orphan_cleanup (struct
+ }
+ #endif
+ sb->s_flags = s_flags; /* Restore MS_RDONLY status */
++ EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
+ }
++EXPORT_SYMBOL(ext3_orphan_cleanup);
+
+ #define log2(n) ffz(~(n))
+
+@@ -1682,8 +1686,7 @@ static int ext3_fill_super (struct super
+ * superblock lock.
+ */
+ EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
+- ext3_orphan_cleanup(sb, es);
+- EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
++
+ if (needs_recovery)
+ printk (KERN_INFO "EXT3-fs: recovery complete.\n");
+ ext3_mark_recovery_complete(sb, es);
--- /dev/null
+Index: iam/fs/ext3/namei.c
+===================================================================
+--- iam.orig/fs/ext3/namei.c
++++ iam/fs/ext3/namei.c
+@@ -55,18 +55,20 @@ struct buffer_head *ext3_append(handle_t
+ u32 *block, int *err)
+ {
+ struct buffer_head *bh;
++ struct ext3_inode_info *ei = EXT3_I(inode);
+
++ /* with parallel dir operations all appends
++ * have to be serialized -bzzz */
++ down(&ei->i_append_sem);
+ *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+
+- if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
++ bh = ext3_bread(handle, inode, *block, 1, err);
++ if (bh != NULL) {
+ inode->i_size += inode->i_sb->s_blocksize;
+- EXT3_I(inode)->i_disksize = inode->i_size;
+- *err = ext3_journal_get_write_access(handle, bh);
+- if (*err != 0) {
+- brelse(bh);
+- bh = NULL;
+- }
++ ei->i_disksize = inode->i_size;
+ }
++ up(&ei->i_append_sem);
++
+ return bh;
+ }
+
+@@ -90,7 +92,7 @@ static void dx_set_count(struct iam_entr
+ static void dx_set_limit(struct iam_entry *entries, unsigned value);
+ static unsigned dx_root_limit(struct iam_path *p);
+ static unsigned dx_node_limit(struct iam_path *p);
+-static int dx_probe(struct dentry *dentry,
++static int dx_probe(struct qstr *name,
+ struct inode *dir,
+ struct dx_hash_info *hinfo,
+ struct iam_path *path);
+@@ -104,7 +106,6 @@ static struct buffer_head * ext3_dx_find
+ struct ext3_dir_entry_2 **res_dir, int *err);
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode);
+-
+ static inline void dx_set_limit(struct iam_entry *entries, unsigned value)
+ {
+ ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
+@@ -138,23 +139,20 @@ int dx_node_check(struct iam_path *p, st
+ iam_get_ikey(p, e, iam_path_ikey(p, 1));
+ if (i > 0 &&
+ iam_ikeycmp(c, iam_path_ikey(p, 0),
+- iam_path_ikey(p, 1)) > 0) {
+- BREAKPOINT();
++ iam_path_ikey(p, 1)) > 0)
+ return 0;
+- }
+ blk = dx_get_block(p, e);
+- if (inode->i_size < (blk + 1) * inode->i_sb->s_blocksize) {
+- BREAKPOINT();
++ /*
++ * Disable this check as it is racy.
++ */
++ if (0 && inode->i_size < (blk + 1) * inode->i_sb->s_blocksize)
+ return 0;
+- }
+ /*
+ * By definition of a tree, no node points to the root.
+ */
+- if (blk == root) {
+- BREAKPOINT();
++ if (blk == root)
+ return 0;
+ }
+- }
+ return 1;
+ }
+
+@@ -241,12 +239,241 @@ struct stats dx_show_entries(struct dx_h
+ }
+ #endif /* DX_DEBUG */
+
+-int dx_lookup(struct iam_path *path)
++/*
++ * Per-node tree locking.
++ */
++
++/* FIXME: this should be reworked using bb_spin_lock
++ * introduced in -mm tree
++ */
++#define BH_DXLock 25
++
++#define DX_DEBUG (1)
++
++#if DX_DEBUG
++static struct dx_lock_stats {
++ unsigned dls_bh_lock;
++ unsigned dls_bh_busy;
++ unsigned dls_bh_again;
++ unsigned dls_bh_full_again;
++} dx_lock_stats = { 0, };
++#define DX_DEVAL(x) x
++#else
++#define DX_DEVAL(x)
++#endif
++
++static inline void dx_lock_bh(struct buffer_head volatile *bh)
++{
++ DX_DEVAL(dx_lock_stats.dls_bh_lock++);
++#ifdef CONFIG_SMP
++ while (test_and_set_bit(BH_DXLock, &bh->b_state)) {
++ DX_DEVAL(dx_lock_stats.dls_bh_busy++);
++ while (test_bit(BH_DXLock, &bh->b_state))
++ cpu_relax();
++ }
++#endif
++}
++
++static inline void dx_unlock_bh(struct buffer_head *bh)
++{
++#ifdef CONFIG_SMP
++ smp_mb__before_clear_bit();
++ clear_bit(BH_DXLock, &bh->b_state);
++#endif
++}
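As a usage sketch (the helper below is hypothetical), the lock pair brackets short accesses to a single index block — for example, snapshotting the child pointer while a concurrent split may be rewriting the node:

/* illustrative only: read the child pointer under the per-block bit-lock */
static inline iam_ptr_t dx_get_block_locked(struct iam_path *path,
                                            struct iam_frame *frame)
{
        iam_ptr_t result;

        dx_lock_bh(frame->bh);
        result = dx_get_block(path, frame->at);
        dx_unlock_bh(frame->bh);
        return result;
}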
++
++/*
++ * these locking primitives are used to protect parts of a directory's
++ * htree; the unit of protection is a single block: a leaf or an index node
++ */
++struct dynlock_handle *dx_lock_htree(struct inode *dir, unsigned long value,
++ enum dynlock_type lt)
++{
++ return dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, lt, GFP_NOFS);
++}
++
++void dx_unlock_htree(struct inode *dir, struct dynlock_handle *lh)
++{
++ if (lh != NULL)
++ dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lh);
++}
++
++static void dx_unlock_array(struct inode *dir, struct dynlock_handle **lh)
++{
++ int i;
++
++ for (i = 0; i < DX_MAX_TREE_HEIGHT; ++i, ++lh) {
++ if (*lh != NULL) {
++ dx_unlock_htree(dir, *lh);
++ *lh = NULL;
++ }
++ }
++}
++
++/*
++ * dx_find_position
++ *
++ * binary search for the position of the specified hash in an index node
++ *
++ */
++
++struct iam_entry *dx_find_position(struct iam_path *path,
++ struct iam_frame *frame)
++{
++ int count;
++ struct iam_entry *p;
++ struct iam_entry *q;
++ struct iam_entry *m;
++
++ count = dx_get_count(frame->entries);
++ assert_corr(count && count <= dx_get_limit(frame->entries));
++ p = iam_entry_shift(path, frame->entries,
++ dx_index_is_compat(path) ? 1 : 2);
++ q = iam_entry_shift(path, frame->entries, count - 1);
++ while (p <= q) {
++ m = iam_entry_shift(path, p, iam_entry_diff(path, q, p) / 2);
++ if (iam_ikeycmp(path->ip_container, iam_ikey_at(path, m),
++ path->ip_ikey_target) > 0)
++ q = iam_entry_shift(path, m, -1);
++ else
++ p = iam_entry_shift(path, m, +1);
++ }
++ return iam_entry_shift(path, p, -1);
++}
++
++static iam_ptr_t dx_find_ptr(struct iam_path *path, struct iam_frame *frame)
++{
++ return dx_get_block(path, dx_find_position(path, frame));
++}
++
++/*
++ * Fast check for frame consistency.
++ */
++static int dx_check_fast(struct iam_path *path, struct iam_frame *frame)
++{
++ struct iam_container *bag;
++ struct iam_entry *next;
++ struct iam_entry *last;
++ struct iam_entry *entries;
++ struct iam_entry *at;
++
++ bag = path->ip_container;
++ at = frame->at;
++ entries = frame->entries;
++ last = iam_entry_shift(path, entries, dx_get_count(entries) - 1);
++
++ if (unlikely(at > last))
++ return -EAGAIN;
++
++ if (unlikely(dx_get_block(path, at) != frame->leaf))
++ return -EAGAIN;
++
++ if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, at),
++ path->ip_ikey_target) > 0))
++ return -EAGAIN;
++
++ next = iam_entry_shift(path, at, +1);
++ if (next <= last) {
++ if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, next),
++ path->ip_ikey_target) <= 0))
++ return -EAGAIN;
++ }
++ return 0;
++}
++
++/*
++ * returns 0 if path was unchanged, -EAGAIN otherwise.
++ */
++static int dx_check_path(struct iam_path *path, struct iam_frame *frame)
++{
++ int equal;
++
++ dx_lock_bh(frame->bh);
++ equal = dx_check_fast(path, frame) == 0 ||
++ frame->leaf == dx_find_ptr(path, frame);
++ DX_DEVAL(dx_lock_stats.dls_bh_again += !equal);
++ dx_unlock_bh(frame->bh);
++
++ return equal ? 0 : -EAGAIN;
++}
++
++/*
++ * returns 0 if path was unchanged, -EAGAIN otherwise.
++ */
++static int dx_check_full_path(struct iam_path *path, int search)
++{
++ struct iam_frame *bottom;
++ struct iam_frame *scan;
++ int i;
++ int result;
++
++ do_corr(schedule());
++
++ for (bottom = path->ip_frames, i = 0;
++ i < DX_MAX_TREE_HEIGHT && bottom->bh != NULL; ++bottom, ++i) {
++ ; /* find last filled in frame */
++ }
++
++ /*
++ * Lock frames, bottom to top.
++ */
++ for (scan = bottom - 1; scan >= path->ip_frames; --scan)
++ dx_lock_bh(scan->bh);
++ /*
++ * Check them top to bottom.
++ */
++ result = 0;
++ for (scan = path->ip_frames; scan < bottom; ++scan) {
++ struct iam_entry *pos;
++
++ if (search) {
++ if (dx_check_fast(path, scan) == 0)
++ continue;
++
++ pos = dx_find_position(path, scan);
++ if (scan->leaf != dx_get_block(path, pos)) {
++ result = -EAGAIN;
++ break;
++ }
++ scan->at = pos;
++ } else {
++ pos = iam_entry_shift(path, scan->entries,
++ dx_get_count(scan->entries) - 1);
++ if (scan->at > pos ||
++ scan->leaf != dx_get_block(path, scan->at)) {
++ result = -EAGAIN;
++ break;
++ }
++ }
++ }
++
++ /*
++ * Unlock top to bottom.
++ */
++ for (scan = path->ip_frames; scan < bottom; ++scan)
++ dx_unlock_bh(scan->bh);
++ DX_DEVAL(dx_lock_stats.dls_bh_full_again += !!result);
++ do_corr(schedule());
++
++ return result;
++}
++
++static int dx_lookup_try(struct iam_path *path)
+ {
+ u32 ptr;
+ int err = 0;
+ int i;
+- int delta;
+
+ struct iam_descr *param;
+ struct iam_frame *frame;
+@@ -255,20 +482,19 @@ int dx_lookup(struct iam_path *path)
+ param = iam_path_descr(path);
+ c = path->ip_container;
+
+- delta = dx_index_is_compat(path) ? 1 : 2;
+-
+- for (frame = path->ip_frames, i = 0,
+ ptr = param->id_ops->id_root_ptr(c);
+- i <= path->ip_indirect;
+- ptr = dx_get_block(path, frame->at), ++frame, ++i) {
+- struct iam_entry *entries;
+- struct iam_entry *p;
+- struct iam_entry *q;
+- struct iam_entry *m;
+- unsigned count;
+-
++ for (frame = path->ip_frames, i = 0; i <= path->ip_indirect;
++ ++frame, ++i) {
+ err = param->id_ops->id_node_read(c, (iam_ptr_t)ptr, NULL,
+ &frame->bh);
++ do_corr(schedule());
++
++ dx_lock_bh(frame->bh);
++ /*
++ * node must be initialized under bh lock because concurrent
++ * creation procedure may change it and dx_lookup_try() will
++ * see obsolete tree height. -bzzz
++ */
+ if (err != 0)
+ break;
+
+@@ -283,53 +509,82 @@ int dx_lookup(struct iam_path *path)
+ break;
+
+ assert_inv(dx_node_check(path, frame));
+-
+- entries = frame->entries;
+- count = dx_get_count(entries);
+- assert_corr(count && count <= dx_get_limit(entries));
+- p = iam_entry_shift(path, entries, delta);
+- q = iam_entry_shift(path, entries, count - 1);
+- while (p <= q) {
+- m = iam_entry_shift(path,
+- p, iam_entry_diff(path, q, p) / 2);
+- dxtrace(printk("."));
+- if (iam_ikeycmp(c, iam_ikey_at(path, m),
+- path->ip_ikey_target) > 0)
+- q = iam_entry_shift(path, m, -1);
+- else
+- p = iam_entry_shift(path, m, +1);
++ /*
++ * splitting may change the root index block and move the hash we're
++ * looking for into another index block, so we have to check for
++ * this situation and repeat from the beginning if the path changed
++ * -bzzz
++ */
++ if (i > 0) {
++ err = dx_check_path(path, frame - 1);
++ if (err != 0)
++ break;
+ }
+
+- frame->at = iam_entry_shift(path, p, -1);
+- if (EXT3_INVARIANT_ON) { // linear search cross check
+- unsigned n = count - 1;
+- struct iam_entry *at;
++ frame->at = dx_find_position(path, frame);
++ frame->curidx = ptr;
++ frame->leaf = ptr = dx_get_block(path, frame->at);
+
+- at = entries;
+- while (n--) {
+- dxtrace(printk(","));
+- at = iam_entry_shift(path, at, +1);
+- if (iam_ikeycmp(c, iam_ikey_at(path, at),
+- path->ip_ikey_target) > 0) {
+- if (at != iam_entry_shift(path, frame->at, 1)) {
+- BREAKPOINT();
+- printk(KERN_EMERG "%i\n",
+- iam_ikeycmp(c, iam_ikey_at(path, at),
+- path->ip_ikey_target));
+- }
+- at = iam_entry_shift(path, at, -1);
+- break;
+- }
+- }
+- assert_corr(at == frame->at);
+- }
++ dx_unlock_bh(frame->bh);
++ do_corr(schedule());
+ }
+ if (err != 0)
+- iam_path_fini(path);
++ dx_unlock_bh(frame->bh);
+ path->ip_frame = --frame;
+ return err;
+ }
+
++static int dx_lookup(struct iam_path *path)
++{
++ int err;
++ int i;
++
++ for (i = 0; i < DX_MAX_TREE_HEIGHT; ++ i)
++ assert(path->ip_frames[i].bh == NULL);
++
++ do {
++ err = dx_lookup_try(path);
++ do_corr(schedule());
++ if (err != 0)
++ iam_path_fini(path);
++ } while (err == -EAGAIN);
++
++ return err;
++}
++
++/*
++ * Performs path lookup and returns with found leaf (if any) locked by htree
++ * lock.
++ */
++int dx_lookup_lock(struct iam_path *path,
++ struct dynlock_handle **dl, enum dynlock_type lt)
++{
++ int result;
++ struct inode *dir;
++
++ dir = iam_path_obj(path);
++ while ((result = dx_lookup(path)) == 0) {
++ do_corr(schedule());
++ *dl = dx_lock_htree(dir, path->ip_frame->leaf, lt);
++ if (*dl == NULL) {
++ iam_path_fini(path);
++ result = -ENOMEM;
++ break;
++ }
++ do_corr(schedule());
++ /*
++ * the leaf we just found may get split while we are locking it, so
++ * we need to check for this -bzzz
++ */
++ if (dx_check_full_path(path, 1) == 0)
++ break;
++ dx_unlock_htree(dir, *dl);
++ *dl = NULL;
++ iam_path_fini(path);
++ }
++ return result;
++}
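A hypothetical caller (not taken from the patch) would use it like this, holding the returned handle across the leaf access:

/* illustrative only: look up the leaf for @path's target key and read it */
static int dx_read_leaf_example(struct iam_path *path)
{
        struct dynlock_handle *dl = NULL;
        int result;

        result = dx_lookup_lock(path, &dl, DLT_READ);
        if (result == 0) {
                /* ... path->ip_frame->leaf is stable while @dl is held ... */
                dx_unlock_htree(iam_path_obj(path), dl);
                iam_path_fini(path);
        }
        return result;
}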
++
+ /*
+ * Probe for a directory leaf block to search.
+ *
+@@ -339,7 +594,7 @@ int dx_lookup(struct iam_path *path)
+ * check for this error code, and make sure it never gets reflected
+ * back to userspace.
+ */
+-static int dx_probe(struct dentry *dentry, struct inode *dir,
++static int dx_probe(struct qstr *name, struct inode *dir,
+ struct dx_hash_info *hinfo, struct iam_path *path)
+ {
+ int err;
+@@ -347,7 +602,7 @@ static int dx_probe(struct dentry *dentr
+
+ assert_corr(path->ip_data != NULL);
+ ipc = container_of(path->ip_data, struct iam_path_compat, ipc_descr);
+- ipc->ipc_dentry = dentry;
++ ipc->ipc_qstr = name;
+ ipc->ipc_hinfo = hinfo;
+
+ assert_corr(dx_index_is_compat(path));
+@@ -356,6 +611,7 @@ static int dx_probe(struct dentry *dentr
+ return err;
+ }
+
++
+ /*
+ * This function increments the frame pointer to search the next leaf
+ * block, and reads in the necessary intervening nodes if the search
+@@ -391,10 +647,16 @@ static int ext3_htree_advance(struct ino
+ * nodes need to be read.
+ */
+ while (1) {
++ do_corr(schedule());
++ dx_lock_bh(p->bh);
+ p->at = iam_entry_shift(path, p->at, +1);
+ if (p->at < iam_entry_shift(path, p->entries,
+- dx_get_count(p->entries)))
++ dx_get_count(p->entries))) {
++ p->leaf = dx_get_block(path, p->at);
++ dx_unlock_bh(p->bh);
+ break;
++ }
++ dx_unlock_bh(p->bh);
+ if (p == path->ip_frames)
+ return 0;
+ num_frames++;
+@@ -425,25 +687,125 @@ static int ext3_htree_advance(struct ino
+ * block so no check is necessary
+ */
+ while (num_frames--) {
++ iam_ptr_t idx;
++
++ do_corr(schedule());
++ dx_lock_bh(p->bh);
++ idx = p->leaf = dx_get_block(path, p->at);
++ dx_unlock_bh(p->bh);
+ err = iam_path_descr(path)->id_ops->
+- id_node_read(path->ip_container,
+- (iam_ptr_t)dx_get_block(path, p->at),
+- NULL, &bh);
++ id_node_read(path->ip_container, idx, NULL, &bh);
+ if (err != 0)
+ return err; /* Failure */
+ ++p;
+- brelse (p->bh);
++ brelse(p->bh);
++ assert_corr(p->bh != bh);
+ p->bh = bh;
+ p->entries = dx_node_get_entries(path, p);
+ p->at = iam_entry_shift(path, p->entries, !compat);
++ assert_corr(p->curidx != idx);
++ p->curidx = idx;
++ dx_lock_bh(p->bh);
++ assert_corr(p->leaf != dx_get_block(path, p->at));
++ p->leaf = dx_get_block(path, p->at);
++ dx_unlock_bh(p->bh);
+ assert_inv(dx_node_check(path, p));
+ }
+ return 1;
+ }
+
++int iam_index_lock(struct iam_path *path, struct dynlock_handle **lh)
++{
++ struct iam_frame *f;
++
++ for (f = path->ip_frame; f >= path->ip_frames; --f, ++lh) {
++ do_corr(schedule());
++ *lh = dx_lock_htree(iam_path_obj(path), f->curidx, DLT_READ);
++ if (*lh == NULL)
++ return -ENOMEM;
++ }
++ return 0;
++}
++
++static int iam_index_advance(struct iam_path *path)
++{
++ return ext3_htree_advance(iam_path_obj(path), 0, path, NULL, 0);
++}
++
++/*
++ * Advance index part of @path to point to the next leaf. Returns 1 on
++ * success, 0 when the end of the container was reached. Leaf node is locked.
++ */
+ int iam_index_next(struct iam_container *c, struct iam_path *path)
+ {
+- return ext3_htree_advance(c->ic_object, 0, path, NULL, 0);
++ iam_ptr_t cursor;
++ struct dynlock_handle *lh[DX_MAX_TREE_HEIGHT] = { 0, };
++ int result;
++ struct inode *object;
++
++ /*
++ * Locking for iam_index_next()... is to be described.
++ */
++
++ object = c->ic_object;
++ cursor = path->ip_frame->leaf;
++
++ while (1) {
++ result = iam_index_lock(path, lh);
++ do_corr(schedule());
++ if (result < 0)
++ break;
++
++ result = dx_check_full_path(path, 0);
++ if (result == 0 && cursor == path->ip_frame->leaf) {
++ result = iam_index_advance(path);
++
++ assert_corr(result == 0 ||
++ cursor != path->ip_frame->leaf);
++ break;
++ }
++ do {
++ dx_unlock_array(object, lh);
++
++ iam_path_release(path);
++ do_corr(schedule());
++
++ result = dx_lookup(path);
++ if (result < 0)
++ break;
++
++ while (path->ip_frame->leaf != cursor) {
++ do_corr(schedule());
++
++ result = iam_index_lock(path, lh);
++ do_corr(schedule());
++ if (result < 0)
++ break;
++
++ result = dx_check_full_path(path, 0);
++ if (result != 0)
++ break;
++
++ result = iam_index_advance(path);
++ if (result == 0) {
++ ext3_error(object->i_sb, __FUNCTION__,
++ "cannot find cursor: %u\n",
++ cursor);
++ result = -EIO;
++ }
++ if (result < 0)
++ break;
++ result = dx_check_full_path(path, 0);
++ if (result != 0)
++ break;
++ dx_unlock_array(object, lh);
++ }
++ } while (result == -EAGAIN);
++ if (result < 0)
++ break;
++ }
++ dx_unlock_array(object, lh);
++ return result;
+ }
+
+ int ext3_htree_next_block(struct inode *dir, __u32 hash,
+@@ -649,14 +1011,29 @@ void iam_insert_key(struct iam_path *pat
+ struct iam_entry *new = iam_entry_shift(path, frame->at, +1);
+ int count = dx_get_count(entries);
+
++ /*
++ * Unfortunately we cannot assert this, as this function is sometimes
++ * called by VFS under i_sem and without pdirops lock.
++ */
++ assert_corr(1 || iam_frame_is_locked(path, frame));
+ assert_corr(count < dx_get_limit(entries));
+ assert_corr(frame->at < iam_entry_shift(path, entries, count));
++ assert_inv(dx_node_check(path, frame));
+
+ memmove(iam_entry_shift(path, new, 1), new,
+ (char *)iam_entry_shift(path, entries, count) - (char *)new);
+ dx_set_ikey(path, new, key);
+ dx_set_block(path, new, ptr);
+ dx_set_count(entries, count + 1);
++ assert_inv(dx_node_check(path, frame));
++}
++
++void iam_insert_key_lock(struct iam_path *path, struct iam_frame *frame,
++ const struct iam_ikey *key, iam_ptr_t ptr)
++{
++ dx_lock_bh(frame->bh);
++ iam_insert_key(path, frame, key, ptr);
++ dx_unlock_bh(frame->bh);
+ }
+
+ void dx_insert_block(struct iam_path *path, struct iam_frame *frame,
+@@ -882,7 +1259,7 @@ static struct buffer_head * ext3_dx_find
+ sb = dir->i_sb;
+ /* NFS may look up ".." - look at dx_root directory block */
+ if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
+- *err = dx_probe(dentry, NULL, &hinfo, path);
++ *err = dx_probe(&dentry->d_name, NULL, &hinfo, path);
+ if (*err != 0)
+ return NULL;
+ } else {
+@@ -1114,7 +1491,7 @@ struct ext3_dir_entry_2 *move_entries(st
+ hash2 = map[split].hash;
+ continued = hash2 == map[split - 1].hash;
+ dxtrace(printk("Split block %i at %x, %i/%i\n",
+- dx_get_block(frame->at), hash2, split, count - split));
++ frame->leaf, hash2, split, count - split));
+
+ /* Fancy dance to stay within two buffers */
+ de2 = dx_move_dirents(data1, data2, map + split, count - split);
+@@ -1484,16 +1861,38 @@ static int shift_entries(struct iam_path
+ (char *) iam_entry_shift(path, entries, count1),
+ count2 * iam_entry_size(path));
+
+- dx_set_count(entries, count1);
+ dx_set_count(entries2, count2 + delta);
+ dx_set_limit(entries2, dx_node_limit(path));
+
+- iam_insert_key(path, parent, pivot, newblock);
++ /*
++ * NOTE: very subtle piece of code. A competing dx_probe() may find the
++ * 2nd level index in the root index; then we insert a new index here
++ * and set the new count in that 2nd level index. So dx_probe() may see
++ * a 2nd level index without the hash it looks for. The solution is to
++ * check the root index after we have locked the just-found 2nd level
++ * index -bzzz
++ */
++ iam_insert_key_lock(path, parent, pivot, newblock);
++
++ /*
++ * now the old and new 2nd level index blocks contain all the pointers,
++ * so dx_probe() may find a hash in either of them. That's OK -bzzz
++ */
++ dx_lock_bh(frame->bh);
++ dx_set_count(entries, count1);
++ dx_unlock_bh(frame->bh);
++
++ /*
++ * now the old 2nd level index block points to the first half of the
++ * leaves. It is important that dx_probe() checks the root index block
++ * for changes under dx_lock_bh(frame->bh) -bzzz
++ */
++
+ return count1;
+ }
+
+ #ifdef CONFIG_EXT3_INDEX
+-int split_index_node(handle_t *handle, struct iam_path *path)
++int split_index_node(handle_t *handle, struct iam_path *path,
++ struct dynlock_handle **lh)
+ {
+
+ struct iam_entry *entries; /* old block contents */
+@@ -1501,6 +1900,8 @@ int split_index_node(handle_t *handle, s
+ struct iam_frame *frame, *safe;
+ struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0};
+ u32 newblock[DX_MAX_TREE_HEIGHT] = {0};
++ struct dynlock_handle *lock[DX_MAX_TREE_HEIGHT] = {NULL,};
++ struct dynlock_handle *new_lock[DX_MAX_TREE_HEIGHT] = {NULL,};
+ struct inode *dir = iam_path_obj(path);
+ struct iam_descr *descr;
+ int nr_splet;
+@@ -1523,12 +1924,14 @@ int split_index_node(handle_t *handle, s
+ * - first allocate all necessary blocks
+ *
+ * - insert pointers into them atomically.
+- *
+- * XXX nikita: this algorithm is *not* scalable, as it assumes that at
+- * least nodes in the path are locked.
+ */
+
+- /* Block full, should compress but for now just split */
++ /*
++ * Locking: the leaf is already locked. htree-locks are acquired,
++ * bottom-to-top, on all index nodes that require a split, on the
++ * "safe" node, and on all new nodes.
++ */
++
+ dxtrace(printk("using %u of %u node entries\n",
+ dx_get_count(entries), dx_get_limit(entries)));
+
+@@ -1536,6 +1939,7 @@ int split_index_node(handle_t *handle, s
+ for (nr_splet = 0; frame >= path->ip_frames &&
+ dx_get_count(frame->entries) == dx_get_limit(frame->entries);
+ --frame, ++nr_splet) {
++ do_corr(schedule());
+ if (nr_splet == DX_MAX_TREE_HEIGHT) {
+ ext3_warning(dir->i_sb, __FUNCTION__,
+ "Directory index full!\n");
+@@ -1545,14 +1949,53 @@ int split_index_node(handle_t *handle, s
+ }
+
+ safe = frame;
+- /* Go back down, allocating blocks, and adding blocks into
++
++ /*
++ * Lock all nodes, bottom to top.
++ */
++ for (frame = path->ip_frame, i = nr_splet; i >= 0; --i, --frame) {
++ do_corr(schedule());
++ lock[i] = dx_lock_htree(dir, frame->curidx, DLT_WRITE);
++ if (lock[i] == NULL) {
++ err = -ENOMEM;
++ goto cleanup;
++ }
++ }
++
++ /*
++ * Check for concurrent index modification.
++ */
++ err = dx_check_full_path(path, 1);
++ if (err)
++ goto cleanup;
++ /*
++ * And check that the same number of nodes is to be split.
++ */
++ for (i = 0, frame = path->ip_frame; frame >= path->ip_frames &&
++ dx_get_count(frame->entries) == dx_get_limit(frame->entries);
++ --frame, ++i) {
++ ;
++ }
++ if (i != nr_splet) {
++ err = -EAGAIN;
++ goto cleanup;
++ }
++
++ /* Go back down, allocating blocks, locking them, and adding into
+ * transaction... */
+ for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
+ bh_new[i] = ext3_append (handle, dir, &newblock[i], &err);
++ do_corr(schedule());
+ if (!bh_new[i] ||
+ descr->id_ops->id_node_init(path->ip_container,
+ bh_new[i], 0) != 0)
+ goto cleanup;
++ new_lock[i] = dx_lock_htree(dir, newblock[i], DLT_WRITE);
++ if (new_lock[i] == NULL) {
++ err = -ENOMEM;
++ goto cleanup;
++ }
++ do_corr(schedule());
+ BUFFER_TRACE(frame->bh, "get_write_access");
+ err = ext3_journal_get_write_access(handle, frame->bh);
+ if (err)
+@@ -1560,6 +2003,7 @@ int split_index_node(handle_t *handle, s
+ }
+ /* Add "safe" node to transaction too */
+ if (safe + 1 != path->ip_frames) {
++ do_corr(schedule());
+ err = ext3_journal_get_write_access(handle, safe->bh);
+ if (err)
+ goto journal_error;
+@@ -1596,16 +2040,21 @@ int split_index_node(handle_t *handle, s
+
+ assert_corr(i == 0);
+
++ do_corr(schedule());
++
+ frames = path->ip_frames;
+ memcpy((char *) entries2, (char *) entries,
+ count * iam_entry_size(path));
+ dx_set_limit(entries2, dx_node_limit(path));
+
+ /* Set up root */
++ dx_lock_bh(frame->bh);
+ next = descr->id_ops->id_root_inc(path->ip_container,
+ path, frame);
+ dx_set_block(path, next, newblock[0]);
++ dx_unlock_bh(frame->bh);
+
++ do_corr(schedule());
+ /* Shift frames in the path */
+ memmove(frames + 2, frames + 1,
+ (sizeof path->ip_frames) - 2 * sizeof frames[0]);
+@@ -1621,10 +2070,12 @@ int split_index_node(handle_t *handle, s
+ err = ext3_journal_get_write_access(handle, bh2);
+ if (err)
+ goto journal_error;
++ do_corr(schedule());
+ } else {
+ /* splitting non-root index node. */
+ struct iam_frame *parent = frame - 1;
+
++ do_corr(schedule());
+ count = shift_entries(path, frame, count,
+ entries, entries2, newblock[i]);
+ /* Which index block gets the new entry? */
+@@ -1634,7 +2085,11 @@ int split_index_node(handle_t *handle, s
+ frame->at = iam_entry_shift(path, entries2,
+ idx - count + d);
+ frame->entries = entries = entries2;
++ frame->curidx = newblock[i];
+ swap(frame->bh, bh2);
++ assert_corr(lock[i + 1] != NULL);
++ assert_corr(new_lock[i] != NULL);
++ swap(lock[i + 1], new_lock[i]);
+ bh_new[i] = bh2;
+ parent->at = iam_entry_shift(path,
+ parent->at, +1);
+@@ -1647,20 +2102,25 @@ int split_index_node(handle_t *handle, s
+ err = ext3_journal_dirty_metadata(handle, bh2);
+ if (err)
+ goto journal_error;
++ do_corr(schedule());
+ err = ext3_journal_dirty_metadata(handle, parent->bh);
+ if (err)
+ goto journal_error;
+ }
++ do_corr(schedule());
+ err = ext3_journal_dirty_metadata(handle, bh);
+ if (err)
+ goto journal_error;
++ }
+ /*
+ * This function was called to make insertion of new leaf
+ * possible. Check that it fulfilled its obligations.
+ */
+ assert_corr(dx_get_count(path->ip_frame->entries) <
+ dx_get_limit(path->ip_frame->entries));
+- }
++ assert_corr(lock[nr_splet] != NULL);
++ *lh = lock[nr_splet];
++ lock[nr_splet] = NULL;
+ if (nr_splet > 0) {
+ /*
+ * Log ->i_size modification.
+@@ -1674,6 +2134,12 @@ journal_error:
+ ext3_std_error(dir->i_sb, err);
+
+ cleanup:
++ dx_unlock_array(dir, lock);
++ dx_unlock_array(dir, new_lock);
++
++ assert_corr(err || iam_frame_is_locked(path, path->ip_frame));
++
++ do_corr(schedule());
+ for (i = 0; i < ARRAY_SIZE(bh_new); ++i) {
+ if (bh_new[i] != NULL)
+ brelse(bh_new[i]);
+@@ -1695,18 +2161,18 @@ static int ext3_dx_add_entry(handle_t *h
+ struct buffer_head * bh = NULL;
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct ext3_dir_entry_2 *de;
++ struct dynlock_handle *dummy = NULL;
+ int err;
+ size_t isize;
+
+ iam_path_compat_init(&cpath, dir);
+ param = iam_path_descr(path);
+
+- err = dx_probe(dentry, NULL, &hinfo, path);
++ err = dx_probe(&dentry->d_name, NULL, &hinfo, path);
+ if (err != 0)
+ return err;
+ frame = path->ip_frame;
+
+- /* XXX nikita: global serialization! */
+ isize = dir->i_size;
+
+ err = param->id_ops->id_node_read(path->ip_container,
+@@ -1726,7 +2192,7 @@ static int ext3_dx_add_entry(handle_t *h
+ goto cleanup;
+ }
+
+- err = split_index_node(handle, path);
++ err = split_index_node(handle, path, &dummy);
+ if (err)
+ goto cleanup;
+
+@@ -1742,6 +2208,7 @@ static int ext3_dx_add_entry(handle_t *h
+ journal_error:
+ ext3_std_error(dir->i_sb, err);
+ cleanup:
++ dx_unlock_htree(dir, dummy);
+ if (bh)
+ brelse(bh);
+ cleanup2:
+Index: iam/fs/ext3/super.c
+===================================================================
+--- iam.orig/fs/ext3/super.c
++++ iam/fs/ext3/super.c
+@@ -465,4 +465,8 @@ static struct inode *ext3_alloc_inode(st
+ ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
+ ei->vfs_inode.i_version = 1;
++
++ dynlock_init(&ei->i_htree_lock);
++ sema_init(&ei->i_rename_sem, 1);
++ sema_init(&ei->i_append_sem, 1);
+
+ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent));
+Index: iam/include/linux/ext3_fs_i.h
+===================================================================
+--- iam.orig/include/linux/ext3_fs_i.h
++++ iam/include/linux/ext3_fs_i.h
+@@ -19,6 +19,7 @@
+ #include <linux/rwsem.h>
+ #include <linux/rbtree.h>
+ #include <linux/seqlock.h>
++#include <linux/dynlocks.h>
+
+ struct reserve_window {
+ __u32 _rsv_start; /* First byte reserved */
+@@ -127,6 +128,12 @@ struct ext3_inode_info {
+ * by other means, so we have truncate_sem.
+ */
+ struct semaphore truncate_sem;
++
++ /* following fields for parallel directory operations -bzzz */
++ struct dynlock i_htree_lock;
++ struct semaphore i_append_sem;
++ struct semaphore i_rename_sem;
++
+ struct inode vfs_inode;
+
+ __u32 i_cached_extent[4];
+Index: iam/include/linux/lustre_iam.h
+===================================================================
+--- iam.orig/include/linux/lustre_iam.h
++++ iam/include/linux/lustre_iam.h
+@@ -39,6 +39,9 @@ enum {
+ * Maximal number of non-leaf levels in htree. In the stock ext3 this
+ * is 2.
+ */
++ /*
++ * XXX reduced back to 2 to make per-node locking work.
++ */
+ DX_MAX_TREE_HEIGHT = 5,
+ /*
+ * Scratch keys used by generic code for temporaries.
+@@ -62,7 +65,7 @@ enum {
+ /*
+ * Maximal format name length.
+ */
+- DX_FMT_NAME_LEN = 16
++ DX_FMT_NAME_LEN = 16,
+ };
+
+ #ifdef __KERNEL__
+@@ -133,8 +136,10 @@ enum {
+
+ #if EXT3_CORRECTNESS_ON
+ #define assert_corr(test) J_ASSERT(test)
++#define do_corr(exp) exp
+ #else
+ #define assert_corr(test) do {;} while (0)
++#define do_corr(exp) do {;} while (0)
+ #endif
+
+ #if EXT3_INVARIANT_ON
+@@ -179,7 +184,7 @@ struct iam_ikey;
+ * support interfaces like readdir(), where iteration over index has to be
+ * re-startable.
+ */
+-typedef __u64 iam_ptr_t;
++typedef __u32 iam_ptr_t;
+
+ /*
+ * Index node traversed during tree lookup.
+@@ -188,6 +193,11 @@ struct iam_frame {
+ struct buffer_head *bh; /* buffer holding node data */
+ struct iam_entry *entries; /* array of entries */
+ struct iam_entry *at; /* target entry, found by binary search */
++ iam_ptr_t leaf; /* (logical) offset of child node found by
++ * binary search. */
++ iam_ptr_t curidx; /* (logical) offset of this node. Used by
++ * per-node locking to detect concurrent
++ * splits. */
+ };
+
+ /*
+@@ -205,6 +215,11 @@ struct iam_leaf {
+ struct buffer_head *il_bh;
+ struct iam_lentry *il_entries;
+ struct iam_lentry *il_at;
++ /*
++ * Lock on a leaf node.
++ */
++ struct dynlock_handle *il_lock;
++ iam_ptr_t il_curidx; /* logical offset of leaf node. */
+ void *il_descr_data;
+ };
+
+@@ -215,19 +230,23 @@ enum iam_lookup_t {
+ /*
+ * lookup found a record with the key requested
+ */
+- IAM_LOOKUP_EXACT,
++ IAM_LOOKUP_EXACT = 0,
+ /*
+ * lookup positioned leaf on some record
+ */
+- IAM_LOOKUP_OK,
++ IAM_LOOKUP_OK = 1,
+ /*
+ * leaf was empty
+ */
+- IAM_LOOKUP_EMPTY,
++ IAM_LOOKUP_EMPTY = 2,
+ /*
+ * lookup positioned leaf before first record
+ */
+- IAM_LOOKUP_BEFORE
++ IAM_LOOKUP_BEFORE = 3,
++ /*
++ * Found hash may have a continuation in the next leaf.
++ */
++ IAM_LOOKUP_LAST = 0x100
+ };
+
+ /*
+@@ -270,9 +289,9 @@ struct iam_operations {
+ struct iam_path *path,
+ struct iam_frame *frame);
+
+- struct iam_path_descr *(*id_ipd_alloc)(const struct iam_container *c);
+- void (*id_ipd_free)(const struct iam_container *c,
+- struct iam_path_descr *ipd);
++ struct iam_path_descr *(*id_ipd_alloc)(const struct iam_container *c,
++ void *area);
++ void (*id_ipd_free)(struct iam_path_descr *ipd);
+ /*
+ * Format name.
+ */
+@@ -329,8 +348,10 @@ struct iam_leaf_operations {
+
+ void (*key_set)(struct iam_leaf *l, const struct iam_key *k);
+ void (*rec_set)(struct iam_leaf *l, const struct iam_rec *r);
++ void (*rec_get)(const struct iam_leaf *l, struct iam_rec *r);
+
+ int (*key_cmp)(const struct iam_leaf *l, const struct iam_key *k);
++ int (*key_eq)(const struct iam_leaf *l, const struct iam_key *k);
+
+ int (*key_size)(const struct iam_leaf *l);
+ /*
+@@ -473,11 +494,23 @@ struct iam_path_compat {
+ struct iam_container ipc_container;
+ __u32 ipc_scratch[DX_SCRATCH_KEYS];
+ struct dx_hash_info *ipc_hinfo;
+- struct dentry *ipc_dentry;
++ struct qstr *ipc_qstr;
+ struct iam_path_descr ipc_descr;
+ struct dx_hash_info ipc_hinfo_area;
+ };
+
++#define const_max(p, q) (((p) > (q)) ? (p) : (q))
++
++enum {
++ DX_MAX_IKEY_SIZE = 32, /* be generous */
++ /*
++ * Hack to avoid dynamic allocation and freeing of ipd.
++ */
++ DX_IPD_MAX_SIZE = const_max(sizeof(struct iam_path_compat),
++ DX_MAX_IKEY_SIZE * DX_SCRATCH_KEYS +
++ sizeof(struct iam_path_descr))
++};
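With the size bound above known at compile time, a caller can carry the ipd in automatic storage and avoid allocation. A minimal sketch (the helper is hypothetical), using the id_ipd_alloc()/id_ipd_free() signatures from this patch:

/* illustrative only: stack-allocated ipd area handed to the format */
static void ipd_area_example(struct iam_container *c)
{
        char area[DX_IPD_MAX_SIZE];
        struct iam_path_descr *ipd;

        ipd = c->ic_descr->id_ops->id_ipd_alloc(c, area);
        if (ipd != NULL) {
                /* ... iam_path_init(path, c, ipd), lookups, etc. ... */
                c->ic_descr->id_ops->id_ipd_free(ipd);
        }
}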
++
+ /*
+ * iam cursor (iterator) api.
+ */
+@@ -554,6 +587,7 @@ struct iam_iterator {
+ void iam_path_init(struct iam_path *path, struct iam_container *c,
+ struct iam_path_descr *pd);
+ void iam_path_fini(struct iam_path *path);
++void iam_path_release(struct iam_path *path);
+
+ void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode);
+ void iam_path_compat_fini(struct iam_path_compat *path);
+@@ -683,12 +717,6 @@ static inline int iam_ikeycmp(const stru
+ return c->ic_descr->id_ops->id_ikeycmp(c, k1, k2);
+ }
+
+-static inline void iam_reccpy(const struct iam_path *p, struct iam_rec *rec_dst,
+- const struct iam_rec *rec_src)
+-{
+- memcpy(rec_dst, rec_src, iam_path_descr(p)->id_rec_size);
+-}
+-
+ static inline void *iam_entry_off(struct iam_entry *entry, size_t off)
+ {
+ return (void *)((char *)entry + off);
+@@ -720,6 +748,11 @@ iam_leaf_ops(const struct iam_leaf *leaf
+ return iam_leaf_descr(leaf)->id_leaf_ops;
+ }
+
++static inline void iam_reccpy(const struct iam_leaf *leaf,
++ struct iam_rec *rec_dst)
++{
++ iam_leaf_ops(leaf)->rec_get(leaf, rec_dst);
++}
+
+ /*XXX This stuff is put here just because it is used by iam.c and namei.c*/
+ static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry)
+@@ -848,7 +881,36 @@ static inline struct iam_ikey *iam_path_
+ return path->ip_data->ipd_key_scratch[nr];
+ }
+
+-int dx_lookup(struct iam_path *path);
++static inline struct dynlock *path_dynlock(struct iam_path *path)
++{
++ return &EXT3_I(iam_path_obj(path))->i_htree_lock;
++}
++
++static inline int iam_leaf_is_locked(const struct iam_leaf *leaf)
++{
++ int result;
++
++ result = dynlock_is_locked(path_dynlock(leaf->il_path),
++ leaf->il_curidx);
++ if (!result)
++ dump_stack();
++ return result;
++}
++
++static inline int iam_frame_is_locked(struct iam_path *path,
++ const struct iam_frame *frame)
++{
++ int result;
++
++ result = dynlock_is_locked(path_dynlock(path), frame->curidx);
++ if (!result)
++ dump_stack();
++ return result;
++}
++
++int dx_lookup_lock(struct iam_path *path,
++ struct dynlock_handle **dl, enum dynlock_type lt);
++
+ void dx_insert_block(struct iam_path *path, struct iam_frame *frame,
+ u32 hash, u32 block);
+ int dx_index_is_compat(struct iam_path *path);
+@@ -858,7 +920,8 @@ int ext3_htree_next_block(struct inode *
+
+ struct buffer_head *ext3_append(handle_t *handle, struct inode *inode,
+ u32 *block, int *err);
+-int split_index_node(handle_t *handle, struct iam_path *path);
++int split_index_node(handle_t *handle, struct iam_path *path,
++ struct dynlock_handle **lh);
+ struct ext3_dir_entry_2 *split_entry(struct inode *dir,
+ struct ext3_dir_entry_2 *de,
+ unsigned long ino, mode_t mode,
+@@ -874,6 +937,10 @@ struct ext3_dir_entry_2 *move_entries(st
+
+ extern struct iam_descr iam_htree_compat_param;
+
++struct dynlock_handle *dx_lock_htree(struct inode *dir, unsigned long value,
++ enum dynlock_type lt);
++void dx_unlock_htree(struct inode *dir, struct dynlock_handle *lh);
++
+ /*
+ * external
+ */
+@@ -889,7 +956,7 @@ int iam_read_leaf(struct iam_path *p);
+ int iam_node_read(struct iam_container *c, iam_ptr_t ptr,
+ handle_t *handle, struct buffer_head **bh);
+
+-void iam_insert_key(struct iam_path *path, struct iam_frame *frame,
++void iam_insert_key_lock(struct iam_path *path, struct iam_frame *frame,
+ const struct iam_ikey *key, iam_ptr_t ptr);
+
+ int iam_leaf_at_end(const struct iam_leaf *l);
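For illustration only (not patch content): iam_leaf_is_locked() and iam_frame_is_locked() above are debugging predicates meant to sit inside assertions; they report whether the per-node dynlock is held and dump a stack trace on violation instead of failing silently. A rough user-space analogue of the pattern, with glibc backtrace() standing in for dump_stack(), might be:

#include <assert.h>
#include <execinfo.h>

struct node {
        int locked; /* maintained by lock/unlock wrappers (omitted) */
};

/* returns lock state; dumps a backtrace when the invariant fails */
static int node_is_locked(const struct node *n)
{
        if (!n->locked) {
                void *frames[16];
                int depth = backtrace(frames, 16);

                backtrace_symbols_fd(frames, depth, 2);
        }
        return n->locked;
}

static void node_modify(struct node *n)
{
        assert(node_is_locked(n)); /* caller must hold the lock */
        /* ... mutate node ... */
}

int main(void)
{
        struct node n = { .locked = 1 };

        node_modify(&n); /* passes; with .locked = 0 it would abort */
        return 0;
}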
--- /dev/null
+Index: linux-2.6.9/fs/ext3/namei.c
+===================================================================
+--- linux-2.6.9.orig/fs/ext3/namei.c 2006-04-23 22:35:38.000000000 +0800
++++ linux-2.6.9/fs/ext3/namei.c 2006-04-23 22:35:47.000000000 +0800
+@@ -48,6 +48,11 @@
+ #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+ #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
+
++/*
++ * Maximal number of non-leaf levels in htree. In the stock ext3 this is 2.
++ */
++#define DX_MAX_TREE_HEIGHT (5)
++
+ static struct buffer_head *ext3_append(handle_t *handle,
+ struct inode *inode,
+ u32 *block, int *err)
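For scale (illustration only, not patch content): a dx_entry is 8 bytes (hash plus block), so a 4 KiB index block holds roughly 512 entries, slightly fewer in the root, which also carries dx_root_info and the "." and ".." dirents. Two non-leaf levels therefore address about 512^2 leaf blocks; each extra level multiplies that by another factor of ~512:

#include <stdio.h>

int main(void)
{
        unsigned long long fanout = 4096 / 8; /* ~dx_entries per node */
        unsigned long long leaves = 1;
        int height;

        for (height = 1; height <= 5; height++) {
                leaves *= fanout;
                printf("height %d: ~%llu leaf blocks\n", height, leaves);
        }
        return 0;
}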
+@@ -75,7 +80,7 @@
+ #ifdef DX_DEBUG
+ #define dxtrace(command) command
+ #else
+-#define dxtrace(command)
++#define dxtrace(command)
+ #endif
+
+ struct fake_dirent
+@@ -168,7 +173,7 @@
+ static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
+ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+ struct dx_frame *frame,
+- struct dx_frame *frames,
++ struct dx_frame *frames,
+ __u32 *start_hash);
+ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
+ struct ext3_dir_entry_2 **res_dir, int *err);
+@@ -249,7 +254,7 @@
+ }
+
+ struct stats
+-{
++{
+ unsigned names;
+ unsigned space;
+ unsigned bcount;
+@@ -367,7 +372,7 @@
+ goto fail;
+ }
+
+- if ((indirect = root->info.indirect_levels) > 1) {
++ if ((indirect = root->info.indirect_levels) > DX_MAX_TREE_HEIGHT - 1) {
+ ext3_warning(dir->i_sb, __FUNCTION__,
+ "Unimplemented inode hash depth: %#06x",
+ root->info.indirect_levels);
+@@ -436,12 +441,15 @@
+
+ static void dx_release (struct dx_frame *frames)
+ {
++ int height;
++
+ if (frames[0].bh == NULL)
+ return;
+-
+- if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
+- brelse(frames[1].bh);
+- brelse(frames[0].bh);
++ height = ((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels;
++ for (; height >= 0; height--) {
++ assert(frames[height].bh != NULL);
++ brelse(frames[height].bh);
++ }
+ }
+
+ /*
+@@ -463,7 +471,7 @@
+ */
+ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+ struct dx_frame *frame,
+- struct dx_frame *frames,
++ struct dx_frame *frames,
+ __u32 *start_hash)
+ {
+ struct dx_frame *p;
+@@ -582,7 +590,7 @@
+ {
+ struct dx_hash_info hinfo;
+ struct ext3_dir_entry_2 *de;
+- struct dx_frame frames[2], *frame;
++ struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame;
+ struct inode *dir;
+ int block, err;
+ int count = 0;
+@@ -627,7 +635,7 @@
+ }
+ count += ret;
+ hashval = ~0;
+- ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
++ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
+ frame, frames, &hashval);
+ *next_hash = hashval;
+ if (ret < 0) {
+@@ -644,7 +652,7 @@
+ break;
+ }
+ dx_release(frames);
+- dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
++ dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
+ count, *next_hash));
+ return count;
+ errout:
+@@ -918,7 +926,7 @@
+ struct super_block * sb;
+ struct dx_hash_info hinfo;
+ u32 hash;
+- struct dx_frame frames[2], *frame;
++ struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame;
+ struct ext3_dir_entry_2 *de, *top;
+ struct buffer_head *bh;
+ unsigned long block;
+@@ -1037,7 +1045,7 @@
+ parent = ERR_PTR(-ENOMEM);
+ }
+ return parent;
+-}
++}
+
+ #define S_SHIFT 12
+ static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = {
+@@ -1098,6 +1106,8 @@
+ return prev;
+ }
+
++/* Allocate a new node and split the leaf node @bh into it, inserting a new
++ * pointer into the parent node identified by @frame. */
+ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
+ struct buffer_head **bh,struct dx_frame *frame,
+ struct dx_hash_info *hinfo, int *error)
+@@ -1185,7 +1195,7 @@
+ * add_dirent_to_buf will attempt search the directory block for
+ * space. It will return -ENOSPC if no space is available, and -EIO
+ * and -EEXIST if directory entry already exists.
+- *
++ *
+ * NOTE! bh is NOT released in the case where ENOSPC is returned. In
+ * all other cases bh is released.
+ */
+@@ -1286,7 +1296,7 @@
+ int namelen = dentry->d_name.len;
+ struct buffer_head *bh2;
+ struct dx_root *root;
+- struct dx_frame frames[2], *frame;
++ struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame;
+ struct dx_entry *entries;
+ struct ext3_dir_entry_2 *de, *de2;
+ char *data1, *top;
+@@ -1427,20 +1437,29 @@
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+ {
+- struct dx_frame frames[2], *frame;
+- struct dx_entry *entries, *at;
++ struct dx_frame frames[DX_MAX_TREE_HEIGHT] = {{0,},}, *frame, *safe;
++ struct dx_node *node2;
++ struct dx_entry *entries; /* old block contents */
++ struct dx_entry *entries2; /* new block contents */
+ struct dx_hash_info hinfo;
+ struct buffer_head * bh;
++ struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0};
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct super_block * sb = dir->i_sb;
+ struct ext3_dir_entry_2 *de;
++ u32 newblock[DX_MAX_TREE_HEIGHT] = {0};
+ int err;
++ int nr_splet;
++ int i;
++	loff_t isize;
+
+ frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
+ if (!frame)
+ return err;
+ entries = frame->entries;
+- at = frame->at;
++
++ /* XXX nikita: global serialization! */
++ isize = dir->i_size;
+
+ if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
+ goto cleanup;
+@@ -1456,29 +1475,43 @@
+ goto cleanup;
+ }
+
++ /*
++ * Tall-tree handling: we might have to split multiple index blocks
++ * all the way up to tree root. Tricky point here is error handling:
++ * to avoid complicated undo/rollback we
++ *
++ * - first allocate all necessary blocks
++ *
++ * - insert pointers into them atomically.
++ *
++ * XXX nikita: this algorithm is *not* scalable, as it assumes that
++ * at least the nodes on the path are locked.
++ */
++
+ /* Block full, should compress but for now just split */
+ dxtrace(printk("using %u of %u node entries\n",
+ dx_get_count(entries), dx_get_limit(entries)));
+- /* Need to split index? */
+- if (dx_get_count(entries) == dx_get_limit(entries)) {
+- u32 newblock;
+- unsigned icount = dx_get_count(entries);
+- int levels = frame - frames;
+- struct dx_entry *entries2;
+- struct dx_node *node2;
+- struct buffer_head *bh2;
+
+- if (levels && (dx_get_count(frames->entries) ==
+- dx_get_limit(frames->entries))) {
++ /* What levels need split? */
++ for (nr_splet = 0; frame >= frames &&
++ dx_get_count(frame->entries) == dx_get_limit(frame->entries);
++ --frame, ++nr_splet) {
++ if (nr_splet == DX_MAX_TREE_HEIGHT) {
+ ext3_warning(sb, __FUNCTION__,
+ "Directory index full!\n");
+ err = -ENOSPC;
+ goto cleanup;
+ }
+- bh2 = ext3_append (handle, dir, &newblock, &err);
+- if (!(bh2))
++ }
++
++ safe = frame;
++ /* Go back down, allocating blocks, and adding blocks into
++ * transaction... */
++ for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
++ bh_new[i] = ext3_append (handle, dir, &newblock[i], &err);
++ if (!bh_new[i])
+ goto cleanup;
+- node2 = (struct dx_node *)(bh2->b_data);
++ node2 = (struct dx_node *)(bh_new[i]->b_data);
+ entries2 = node2->entries;
+ node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
+ node2->fake.inode = 0;
+@@ -1486,72 +1519,112 @@
+ err = ext3_journal_get_write_access(handle, frame->bh);
+ if (err)
+ goto journal_error;
+- if (levels) {
+- unsigned icount1 = icount/2, icount2 = icount - icount1;
+- unsigned hash2 = dx_get_hash(entries + icount1);
+- dxtrace(printk("Split index %i/%i\n", icount1, icount2));
+-
+- BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
+- err = ext3_journal_get_write_access(handle,
+- frames[0].bh);
++ }
++ /* Add "safe" node to transaction too */
++ if (safe + 1 != frames) {
++ err = ext3_journal_get_write_access(handle, safe->bh);
++ if (err)
++ goto journal_error;
++ }
++
++ /* Go through nodes once more, inserting pointers */
++ for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
++ unsigned count;
++ int idx;
++ struct buffer_head *bh2;
++
++ entries = frame->entries;
++ count = dx_get_count(entries);
++ idx = frame->at - entries;
++
++ bh2 = bh_new[i];
++ node2 = (struct dx_node *)(bh2->b_data);
++ entries2 = node2->entries;
++
++ if (frame == frames) {
++ /* splitting root node. Tricky point:
++ *
++ * In the "normal" B-tree we'd split root *and* add
++ * new root to the tree with pointers to the old root
++ * and its sibling (thus introducing two new nodes).
++ *
++ * In htree it's enough to add one node, because
++ * capacity of the root node is smaller than that of
++ * non-root one.
++ */
++ struct dx_root *root;
++ u8 indirects;
++
++ root = (struct dx_root *) frames->bh->b_data;
++ indirects = root->info.indirect_levels;
++ dxtrace(printk("Creating new root %d\n", indirects));
++ memcpy((char *) entries2, (char *) entries,
++ count * sizeof(struct dx_entry));
++ dx_set_limit(entries2, dx_node_limit(dir));
++
++ /* Set up root */
++ dx_set_count(entries, 1);
++ dx_set_block(entries + 0, newblock[i]);
++ root->info.indirect_levels = indirects + 1;
++
++ /* Shift frames in the path */
++ memmove(frames + 2, frames + 1,
++ (sizeof frames) - 2 * sizeof frames[0]);
++ /* Add new access path frame */
++ frames[1].at = entries2 + idx;
++ frames[1].entries = entries = entries2;
++ frames[1].bh = bh2;
++ ++frame;
++ bh_new[i] = NULL; /* buffer head is "consumed" */
++ err = ext3_journal_get_write_access(handle, bh2);
+ if (err)
+ goto journal_error;
+-
+- memcpy ((char *) entries2, (char *) (entries + icount1),
+- icount2 * sizeof(struct dx_entry));
+- dx_set_count (entries, icount1);
+- dx_set_count (entries2, icount2);
++ } else {
++ /* splitting non-root index node. */
++ unsigned count1 = count/2, count2 = count - count1;
++ unsigned hash2 = dx_get_hash(entries + count1);
++ dxtrace(printk("Split index %i/%i\n", count1, count2));
++
++ memcpy ((char *) entries2, (char *) (entries + count1),
++ count2 * sizeof(struct dx_entry));
++ dx_set_count (entries, count1);
++ dx_set_count (entries2, count2);
+ dx_set_limit (entries2, dx_node_limit(dir));
+
+ /* Which index block gets the new entry? */
+- if (at - entries >= icount1) {
+- frame->at = at = at - entries - icount1 + entries2;
++ if (idx >= count1) {
++ frame->at = entries2 + idx - count1;
+ frame->entries = entries = entries2;
+ swap(frame->bh, bh2);
++ bh_new[i] = bh2;
+ }
+- dx_insert_block (frames + 0, hash2, newblock);
+- dxtrace(dx_show_index ("node", frames[1].entries));
++ dx_insert_block (frame - 1, hash2, newblock[i]);
++ dxtrace(dx_show_index ("node", frame->entries));
+ dxtrace(dx_show_index ("node",
+ ((struct dx_node *) bh2->b_data)->entries));
+ err = ext3_journal_dirty_metadata(handle, bh2);
+ if (err)
+ goto journal_error;
+- brelse (bh2);
+- } else {
+- dxtrace(printk("Creating second level index...\n"));
+- memcpy((char *) entries2, (char *) entries,
+- icount * sizeof(struct dx_entry));
+- dx_set_limit(entries2, dx_node_limit(dir));
+-
+- /* Set up root */
+- dx_set_count(entries, 1);
+- dx_set_block(entries + 0, newblock);
+- ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
+-
+- /* Add new access path frame */
+- frame = frames + 1;
+- frame->at = at = at - entries + entries2;
+- frame->entries = entries = entries2;
+- frame->bh = bh2;
+- err = ext3_journal_get_write_access(handle,
+- frame->bh);
+- if (err)
+- goto journal_error;
+ }
+- ext3_journal_dirty_metadata(handle, frames[0].bh);
+ }
+- de = do_split(handle, dir, &bh, frame, &hinfo, &err);
++ de = do_split(handle, dir, &bh, --frame, &hinfo, &err);
+ if (!de)
+ goto cleanup;
+ err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+- bh = NULL;
+- goto cleanup;
++ goto cleanup2;
+
+ journal_error:
+ ext3_std_error(dir->i_sb, err);
+ cleanup:
+ if (bh)
+ brelse(bh);
++cleanup2:
++ for (i = 0; i < ARRAY_SIZE(bh_new); ++i) {
++ if (bh_new[i] != NULL)
++ brelse(bh_new[i]);
++ }
++ if (err)
++ dir->i_size = isize;
+ dx_release(frames);
+ return err;
+ }
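The error-handling strategy spelled out in the XXX comment above (first allocate every block the split may need, then insert pointers in a phase that cannot fail, so no undo of visible tree state is ever required) reduces to a small toy. This is a sketch of the idea only; struct toy_node, the 8-level cap, and the halving rule are invented, and the real code must additionally journal each buffer:

#include <stdlib.h>

struct toy_node {
        int keys;
        struct toy_node *sibling;
};

/* split every full node on a path (depth <= 8 assumed), all-or-nothing */
static int split_path(struct toy_node **path, int depth)
{
        struct toy_node *fresh[8] = { NULL };
        int i;

        /* phase 1: allocate everything we might need; on failure
         * nothing visible has changed yet, so undo is trivial */
        for (i = 0; i < depth; i++) {
                fresh[i] = calloc(1, sizeof(*fresh[i]));
                if (fresh[i] == NULL)
                        goto fail;
        }

        /* phase 2: wire the new nodes in; no step here can fail */
        for (i = 0; i < depth; i++) {
                fresh[i]->keys = path[i]->keys / 2;
                path[i]->keys -= fresh[i]->keys;
                fresh[i]->sibling = path[i]->sibling;
                path[i]->sibling = fresh[i];
        }
        return 0;
fail:
        while (i-- > 0)
                free(fresh[i]);
        return -1;
}

int main(void)
{
        struct toy_node a = { 8, NULL }, b = { 8, NULL };
        struct toy_node *path[] = { &a, &b };

        return split_path(path, 2);
}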
+@@ -1561,7 +1634,7 @@
+ * ext3_delete_entry deletes a directory entry by merging it with the
+ * previous entry
+ */
+-static int ext3_delete_entry (handle_t *handle,
++static int ext3_delete_entry (handle_t *handle,
+ struct inode * dir,
+ struct ext3_dir_entry_2 * de_del,
+ struct buffer_head * bh)
+@@ -1821,7 +1894,7 @@
+ de1 = (struct ext3_dir_entry_2 *)
+ ((char *) de + le16_to_cpu(de->rec_len));
+ if (le32_to_cpu(de->inode) != inode->i_ino ||
+- !le32_to_cpu(de1->inode) ||
++ !le32_to_cpu(de1->inode) ||
+ strcmp (".", de->name) ||
+ strcmp ("..", de1->name)) {
+ ext3_warning (inode->i_sb, "empty_dir",
+@@ -1891,7 +1964,7 @@
+ * being truncated, or files being unlinked. */
+
+ /* @@@ FIXME: Observation from aviro:
+- * I think I can trigger J_ASSERT in ext3_orphan_add(). We block
++ * I think I can trigger J_ASSERT in ext3_orphan_add(). We block
+ * here (on lock_super()), so race with ext3_link() which might bump
+ * ->i_nlink. For, say it, character device. Not a regular file,
+ * not a directory, not a symlink and ->i_nlink > 0.
+@@ -2415,4 +2488,4 @@
+ .removexattr = generic_removexattr,
+ #endif
+ .permission = ext3_permission,
+-};
++};
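Illustration only (not patch content): the root-split comment in the hunk above relies on the root having a smaller entry limit than an interior node, because the root block also stores the "." and ".." dirents plus dx_root_info; the root's entries therefore always fit into one freshly allocated node. Rough numbers (overheads approximated, not the exact EXT3_DIR_REC_LEN() values):

#include <stdio.h>

int main(void)
{
        unsigned bs = 4096, entry = 8;
        unsigned root_overhead = 12 + 12 + 8 + 8; /* ".", "..", info: approx */
        unsigned node_overhead = 8;               /* fake dirent header */
        unsigned root_limit = (bs - root_overhead) / entry;
        unsigned node_limit = (bs - node_overhead) / entry;

        /* root_limit < node_limit, so one new node always suffices */
        printf("root %u vs node %u entries\n", root_limit, node_limit);
        return 0;
}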
diff -urp RH_2_6_9_42_0_3.orig/fs/ext3/namei.c RH_2_6_9_42_0_3/fs/ext3/namei.c
--- RH_2_6_9_42_0_3.orig/fs/ext3/namei.c 2006-10-23 13:32:59.000000000 +0300
+++ RH_2_6_9_42_0_3/fs/ext3/namei.c 2007-02-22 18:58:13.000000000 +0200
-@@ -97,6 +97,7 @@ struct dx_entry
- __le32 block;
- };
-
-+
- /*
- * dx_root_info is laid out so that if it should somehow get overlaid by a
- * dirent the two low bits of the hash version will be zero. Therefore, the
-@@ -141,6 +142,14 @@ struct dx_map_entry
- u32 offs;
- };
-
-+
-+#define LVFS_DENTRY_PARAM_MAGIC 20070216UL
-+struct lvfs_dentry_params
-+{
-+ void *p_ptr;
-+ u32 magic;
-+};
-+
- #ifdef CONFIG_EXT3_INDEX
- static inline unsigned dx_get_block (struct dx_entry *entry);
- static void dx_set_block (struct dx_entry *entry, unsigned value);
-@@ -1624,6 +1633,20 @@ static int ext3_add_nondir(handle_t *han
- return err;
- }
-
+@@ -1624,6 +1633,28 @@ static int ext3_add_nondir(handle_t *han
+ return err;
+ }
+
++#define LVFS_DENTRY_PARAM_MAGIC 20070216UL
++struct lvfs_dentry_params
++{
++ void *p_ptr;
++ u32 magic;
++};
++
++static struct inode * ext3_new_inode_wantedi(handle_t *handle, struct inode *dir,
++ int mode, struct dentry *dentry)
++{
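Only the head of ext3_new_inode_wantedi() survives above, but the interface is visible: creation parameters ride on the dentry, tagged with LVFS_DENTRY_PARAM_MAGIC so the filesystem can tell a genuine parameter block from unrelated d_fsdata. A hedged sketch of the caller side (hypothetical function, kernel context; the payload meaning of p_ptr is a guess):

/* hypothetical caller, not from the patch */
static int create_with_params(struct inode *dir, struct dentry *dentry,
                              int mode, void *wanted_cookie)
{
        struct lvfs_dentry_params param = {
                .p_ptr = wanted_cookie,          /* payload meaning guessed */
                .magic = LVFS_DENTRY_PARAM_MAGIC,
        };
        int err;

        dentry->d_fsdata = &param;               /* consumed by the fs */
        err = dir->i_op->create(dir, dentry, mode, NULL);
        dentry->d_fsdata = NULL;
        return err;
}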
ext3-mballoc3-rhel4.patch
ext3-nlinks-2.6.9.patch
ext3-ialloc-2.6.patch
-ext3-lookup-dotdot-2.6.9.patch
-ext3-sector_t-overflow-2.6.9-rhel4.patch
-ext3-check-jbd-errors-2.6.9.patch
+ext3-tall-htree.patch
+ext3-htree-path.patch
+ext3-htree-r5-hash.patch
+ext3-htree-path-ops.patch
+ext3-hash-selection.patch
+ext3-htree-comments.patch
+ext3-lookup-dotdot-2.6.9.patch
+ext3-sector_t-overflow-2.6.9-rhel4.patch
+ext3-check-jbd-errors-2.6.9.patch
ext3-uninit-2.6.9.patch
ext3-nanosecond-2.6-rhel4.patch
+ext3-iam-ops.patch
+ext3-iam-separate.patch
+ext3-iam-uapi.patch
+ext3-orphans-delay.patch
+ext3-pdirops-2.6.9.patch