iam:
author     nikita <nikita>    Mon, 29 May 2006 21:10:37 +0000 (21:10 +0000)
committer  nikita <nikita>    Mon, 29 May 2006 21:10:37 +0000 (21:10 +0000)
  - separate code better,

  - finish iam_lfix format,

  - add format guessing: self-contained iam file that keeps (in the 0th page)
    its own meta-information: key and record size, etc. (a sketch follows
    this list).

  - user-level tool ./lustre/utils/create_iam.c to create lfix iam files.
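
  A minimal user-space sketch (not part of this change) of how format
  guessing could read the self-describing meta-information kept in the 0th
  page of an lfix iam file. The struct layout, field names and magic value
  below are hypothetical; the authoritative on-disk format is defined by the
  ext3-iam kernel patch and the user-level creation tool.

    #include <stdio.h>
    #include <stdint.h>

    struct iam_lfix_meta {          /* hypothetical 0th-page header */
            uint32_t ilm_magic;     /* identifies an lfix container */
            uint16_t ilm_keysize;   /* key size, in bytes */
            uint16_t ilm_recsize;   /* record size, in bytes */
            uint16_t ilm_ptrsize;   /* block pointer size, in bytes */
            uint16_t ilm_blocksize; /* container block size, in bytes */
    };

    #define IAM_LFIX_MAGIC 0x1a3c177f /* placeholder value */

    /* read the header from page 0 and check the magic */
    static int iam_lfix_guess(FILE *f, struct iam_lfix_meta *meta)
    {
            if (fseek(f, 0, SEEK_SET) != 0 ||
                fread(meta, sizeof(*meta), 1, f) != 1)
                    return -1;
            return meta->ilm_magic == IAM_LFIX_MAGIC ? 0 : -1;
    }

    int main(int argc, char **argv)
    {
            struct iam_lfix_meta meta;
            FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;

            if (f == NULL || iam_lfix_guess(f, &meta) != 0) {
                    fprintf(stderr, "not an lfix iam file\n");
                    return 1;
            }
            printf("keysize=%u recsize=%u ptrsize=%u\n",
                   (unsigned)meta.ilm_keysize, (unsigned)meta.ilm_recsize,
                   (unsigned)meta.ilm_ptrsize);
            fclose(f);
            return 0;
    }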

osd:

  - interface to iam isolating upper layers from knowledge of iam details
    (see the sketch below).
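
  A kernel-context sketch (assuming the dt_object.h declarations added in
  this change) of how an upper layer would use the new indexing interface
  without knowing anything about iam; it mirrors the usage pattern in
  lustre/fld/fld_iam.c. The ex_* names are illustrative only.

    static const struct dt_index_features ex_features = {
            .dif_flags       = DT_IND_UPDATE,
            .dif_keysize_min = sizeof(__u64),
            .dif_keysize_max = sizeof(__u64),
            .dif_recsize_min = sizeof(__u64),
            .dif_recsize_max = sizeof(__u64)
    };

    static int ex_index_setup(const struct lu_context *ctx,
                              struct dt_device *dt, struct dt_object *obj,
                              struct dt_index_cookie **cookie)
    {
            int rc;

            /* describe the required key/record geometry to the osd layer */
            *cookie = dt->dd_ops->dt_index_init(ctx, &ex_features);
            if (IS_ERR(*cookie))
                    return PTR_ERR(*cookie);

            /* verify the object is an index and install ->do_index_ops */
            rc = obj->do_ops->do_object_index_try(ctx, obj,
                                                  &ex_features, *cookie);
            if (rc != 0)
                    dt->dd_ops->dt_index_fini(ctx, *cookie);
            return rc;
    }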

fld:

  - adjust to changed indexing interface.

15 files changed:
lustre/fld/fld_handle.c
lustre/fld/fld_iam.c
lustre/fld/fld_internal.h
lustre/include/dt_object.h
lustre/include/lu_object.h
lustre/include/lustre_fid.h
lustre/kernel_patches/patches/ext3-iam-separate.patch
lustre/mdd/mdd_handler.c
lustre/mdt/mdt_handler.c
lustre/obdclass/dt_object.c
lustre/obdclass/lu_object.c
lustre/osd/osd_handler.c
lustre/osd/osd_internal.h
lustre/utils/Makefile.am
lustre/utils/create_iam.c

index 769646c..9c845d5 100644 (file)
@@ -362,17 +362,20 @@ int fld_server_init(const struct lu_context *ctx, struct fld *fld,
         INIT_LIST_HEAD(&fld_list_head.fld_list);
         spin_lock_init(&fld_list_head.fld_lock);
 
-        fld_iam_init(ctx, fld);
-
-        fld->fld_service =
-                ptlrpc_init_svc_conf(&fld_conf, fld_req_handle,
-                                     LUSTRE_FLD0_NAME,
-                                     fld->fld_proc_entry, NULL);
-        if (fld->fld_service != NULL)
-                result = ptlrpc_start_threads(NULL, fld->fld_service,
-                                              LUSTRE_FLD0_NAME);
-        else
-                result = -ENOMEM;
+        result = fld_iam_init(ctx, fld);
+
+        if (result == 0) {
+                fld->fld_service =
+                        ptlrpc_init_svc_conf(&fld_conf, fld_req_handle,
+                                             LUSTRE_FLD0_NAME,
+                                             fld->fld_proc_entry, NULL);
+                if (fld->fld_service != NULL)
+                        result = ptlrpc_start_threads(NULL, fld->fld_service,
+                                                      LUSTRE_FLD0_NAME);
+                else
+                        result = -ENOMEM;
+        }
+
         if (result != 0)
                 fld_server_fini(ctx, fld);
         return result;
@@ -396,9 +399,11 @@ void fld_server_fini(const struct lu_context *ctx, struct fld *fld)
                 OBD_FREE_PTR(fld);
         }
         spin_unlock(&fld_list_head.fld_lock);
-        lu_device_put(&fld->fld_dt->dd_lu_dev);
-        fld_iam_fini(ctx, fld);
-        fld->fld_dt = NULL;
+        if (fld->fld_dt != NULL) {
+                lu_device_put(&fld->fld_dt->dd_lu_dev);
+                fld_iam_fini(ctx, fld);
+                fld->fld_dt = NULL;
+        }
 }
 EXPORT_SYMBOL(fld_server_fini);
 
@@ -413,7 +418,7 @@ static int fld_handle(const struct lu_context *ctx,
                 rc = fld_handle_insert(ctx, fld, mf->mf_seq, mf->mf_mds);
                 break;
         case FLD_DELETE:
-                rc = fld_handle_delete(ctx, fld, mf->mf_seq, mf->mf_mds);
+                rc = fld_handle_delete(ctx, fld, mf->mf_seq);
                 break;
         case FLD_GET:
                 rc = fld_handle_lookup(ctx, fld, mf->mf_seq, &mf->mf_mds);
index 92c1fe8..270f1e8 100644 (file)
 #include <md_object.h>
 #include <lustre_mdc.h>
 #include <lustre_fid.h>
-#include <linux/lustre_iam.h>
 #include "fld_internal.h"
 
 
-struct iam_descr fld_param = {
-        .id_key_size = sizeof ((struct lu_fid *)0)->f_seq,
-        .id_ptr_size = 4, /* 32 bit block numbers for now */
-        .id_rec_size = sizeof(mdsno_t),
-        .id_node_gap = 0, /* no gaps in index nodes */
-        .id_root_gap = sizeof(struct iam_root),
-        .id_ops      = &generic_iam_ops,
-        .id_leaf_ops = &iam_lfix_leaf_ops
+static const struct dt_index_features fld_index_features = {
+        .dif_flags       = DT_IND_UPDATE,
+        .dif_keysize_min = sizeof(fidseq_t),
+        .dif_keysize_max = sizeof(fidseq_t),
+        .dif_recsize_min = sizeof(mdsno_t),
+        .dif_recsize_max = sizeof(mdsno_t)
 };
+
 /*
  * number of blocks to reserve for particular operations. Should be function
  * of ... something. Stub for now.
@@ -65,14 +63,54 @@ enum {
         FLD_TXN_INDEX_DELETE_CREDITS  = 10
 };
 
-static int fld_keycmp(const struct iam_container *c, const struct iam_key *k1,
-                      const struct iam_key *k2)
+struct fld_thread_info {
+        __u64 fti_key;
+        __u64 fti_rec;
+};
+
+static void *fld_key_init(const struct lu_context *ctx,
+                          struct lu_context_key *key)
+{
+        struct fld_thread_info *info;
+
+        OBD_ALLOC_PTR(info);
+        if (info == NULL)
+                info = ERR_PTR(-ENOMEM);
+        return info;
+}
+
+static void fld_key_fini(const struct lu_context *ctx,
+                         struct lu_context_key *key, void *data)
 {
-        __u64 p1 = le64_to_cpu(*(__u32 *)k1);
-        __u64 p2 = le64_to_cpu(*(__u32 *)k2);
+        struct fld_thread_info *info = data;
+        OBD_FREE_PTR(info);
+}
+
+static int fld_key_registered = 0;
 
-        return p1 > p2 ? +1 : (p1 < p2 ? -1 : 0);
+static struct lu_context_key fld_thread_key = {
+        .lct_init = fld_key_init,
+        .lct_fini = fld_key_fini
+};
 
+static struct dt_key *fld_key(const struct lu_context *ctx,
+                              const fidseq_t seq_num)
+{
+        struct fld_thread_info *info = lu_context_key_get(ctx, &fld_thread_key);
+        LASSERT(info != NULL);
+
+        info->fti_key = cpu_to_be64(seq_num);
+        return (void *)&info->fti_key;
+}
+
+static struct dt_rec *fld_rec(const struct lu_context *ctx,
+                              const mdsno_t mds_num)
+{
+        struct fld_thread_info *info = lu_context_key_get(ctx, &fld_thread_key);
+        LASSERT(info != NULL);
+
+        info->fti_rec = cpu_to_be64(mds_num);
+        return (void *)&info->fti_rec;
 }
 
 int fld_handle_insert(const struct lu_context *ctx, struct fld *fld,
@@ -91,15 +129,15 @@ int fld_handle_insert(const struct lu_context *ctx, struct fld *fld,
         th = dt->dd_ops->dt_trans_start(ctx, dt, &txn);
 
         rc = dt_obj->do_index_ops->dio_insert(ctx, dt_obj,
-                                              (struct dt_rec*)(&mds_num),
-                                              (struct dt_key*)(&seq_num), th);
+                                              fld_rec(ctx, mds_num),
+                                              fld_key(ctx, seq_num), th);
         dt->dd_ops->dt_trans_stop(ctx, th);
 
         RETURN(rc);
 }
 
 int fld_handle_delete(const struct lu_context *ctx, struct fld *fld,
-                      fidseq_t seq_num, mdsno_t mds_num)
+                      fidseq_t seq_num)
 {
         struct dt_device *dt = fld->fld_dt;
         struct dt_object *dt_obj = fld->fld_obj;
@@ -111,8 +149,7 @@ int fld_handle_delete(const struct lu_context *ctx, struct fld *fld,
         txn.tp_credits = FLD_TXN_INDEX_DELETE_CREDITS;
         th = dt->dd_ops->dt_trans_start(ctx, dt, &txn);
         rc = dt_obj->do_index_ops->dio_delete(ctx, dt_obj,
-                                              (struct dt_rec*)(&mds_num),
-                                              (struct dt_key*)(&seq_num), th);
+                                              fld_key(ctx, seq_num), th);
         dt->dd_ops->dt_trans_stop(ctx, th);
 
         RETURN(rc);
@@ -121,52 +158,73 @@ int fld_handle_delete(const struct lu_context *ctx, struct fld *fld,
 int fld_handle_lookup(const struct lu_context *ctx,
                       struct fld *fld, fidseq_t seq_num, mdsno_t *mds_num)
 {
+        int result;
 
         struct dt_object *dt_obj = fld->fld_obj;
+        struct dt_rec    *rec = fld_rec(ctx, 0);
 
-        return dt_obj->do_index_ops->dio_lookup(ctx, dt_obj,
-                                             (struct dt_rec*)(&mds_num),
-                                             (struct dt_key*)(&seq_num));
+        result = dt_obj->do_index_ops->dio_lookup(ctx, dt_obj, rec,
+                                                  fld_key(ctx, seq_num));
+        if (result == 0)
+                *mds_num = be64_to_cpu(*(__u64 *)rec);
+        return result;
 }
 
 int fld_iam_init(const struct lu_context *ctx, struct fld *fld)
 {
         struct dt_device *dt = fld->fld_dt;
         struct dt_object *dt_obj;
-        struct iam_container *ic = NULL;
         int rc;
 
         ENTRY;
 
+        if (fld_key_registered == 0) {
+                rc = lu_context_key_register(&fld_thread_key);
+                if (rc != 0)
+                        return rc;
+        }
+        fld_key_registered++;
+
+        /*
+         * lu_context_key has to be registered before threads are started,
+         * check this.
+         */
+        LASSERT(fld->fld_service == NULL);
+
+        fld->fld_cookie = dt->dd_ops->dt_index_init(ctx, &fld_index_features);
+        if (IS_ERR(fld->fld_cookie) != 0)
+                return PTR_ERR(fld->fld_cookie);
+
         dt_obj = dt_store_open(ctx, dt, "fld", &fld->fld_fid);
         if (!IS_ERR(dt_obj)) {
                 fld->fld_obj = dt_obj;
-                if (dt_obj->do_index_ops != NULL) {
-                        /* XXX nikita: disable for now */
-                        /* rc = dt_obj->do_index_ops->dio_init(ctx, dt_obj,
-                                                            ic, &fld_param); */
-                        fld_param.id_ops->id_keycmp = fld_keycmp;
-                } else {
+                rc = dt_obj->do_ops->do_object_index_try(ctx, dt_obj,
+                                                         &fld_index_features,
+                                                         fld->fld_cookie);
+                if (rc == 0)
+                        LASSERT(dt_obj->do_index_ops != NULL);
+                else
                         CERROR("fld is not an index!\n");
-                        rc = -EINVAL;
-                }
         } else {
                 CERROR("Cannot find fld obj %lu \n", PTR_ERR(dt_obj));
                 rc = PTR_ERR(dt_obj);
         }
 
-
         RETURN(rc);
 }
 
 void fld_iam_fini(const struct lu_context *ctx, struct fld *fld)
 {
-        struct dt_object *dt_obj = fld->fld_obj;
-
-        /* XXX nikita: disable for now */
-        /* dt_obj->do_index_ops->dio_fini(ctx, dt_obj); */
-        /*XXX Should put object here,
-          lu_object_put(fld->fld_obj->do_lu);
-         *but no ctxt in this func, FIX later*/
-        fld->fld_obj = NULL;
+        if (!IS_ERR(fld->fld_cookie) && fld->fld_cookie != NULL) {
+                fld->fld_dt->dd_ops->dt_index_fini(ctx, fld->fld_cookie);
+                fld->fld_cookie = NULL;
+        }
+        if (fld->fld_obj != NULL) {
+                lu_object_put(ctx, &fld->fld_obj->do_lu);
+                fld->fld_obj = NULL;
+        }
+        if (fld_key_registered > 0) {
+                if (-- fld_key_registered == 0)
+                        lu_context_key_degister(&fld_thread_key);
+        }
 }
index 9b7a46e..12fc01b 100644 (file)
@@ -67,10 +67,11 @@ enum fld_op {
 int fld_handle_insert(const struct lu_context *ctx,
                       struct fld *fld, fidseq_t seq_num, mdsno_t mdsno);
 int fld_handle_delete(const struct lu_context *ctx,
-                      struct fld *fld, fidseq_t seq_num, mdsno_t mdsno);
+                      struct fld *fld, fidseq_t seq_num);
 int fld_handle_lookup(const struct lu_context *ctx,
                       struct fld *fld, fidseq_t seq_num, mdsno_t *mds);
 
 int fld_iam_init(const struct lu_context *ctx, struct fld *fld);
 void fld_iam_fini(const struct lu_context *ctx, struct fld *fld);
+
 #endif
index 21d8666..4770f1c 100644 (file)
@@ -51,6 +51,8 @@ struct thandle;
 struct txn_param;
 struct dt_device;
 struct dt_object;
+struct dt_index_features;
+struct dt_index_cookie;
 
 /*
  * Lock mode for DT objects.
@@ -94,9 +96,50 @@ struct dt_device_operations {
          */
         int   (*dt_root_get)(const struct lu_context *ctx,
                              struct dt_device *dev, struct lu_fid *f);
+        /*
+         * This method has to be called by any module that is going to use
+         * indexing capabilities of dt interface.
+         */
+        struct dt_index_cookie *(*dt_index_init)
+                (const struct lu_context *, const struct dt_index_features *);
+        /*
+         * Dual to ->dt_index_init().
+         */
+        void (*dt_index_fini)(const struct lu_context *ctx,
+                              struct dt_index_cookie *cookie);
+};
+
+struct dt_index_features {
+        /* required feature flags from enum dt_index_flags */
+        __u32 dif_flags;
+        /* minimal required key size */
+        size_t dif_keysize_min;
+        /* maximal required key size, 0 if no limit */
+        size_t dif_keysize_max;
+        /* minimal required record size */
+        size_t dif_recsize_min;
+        /* maximal required record size, 0 if no limit */
+        size_t dif_recsize_max;
+};
+
+enum dt_index_flags {
+        /* index supports variable sized keys */
+        DT_IND_VARKEY = 1 << 0,
+        /* index supports variable sized records */
+        DT_IND_VARREC = 1 << 1,
+        /* index can be modified */
+        DT_IND_UPDATE = 1 << 2,
+        /* index supports records with non-unique (duplicate) keys */
+        DT_IND_NONUNQ = 1 << 3
 };
 
 /*
+ * Features, required from index to support file system directories (mapping
+ * names to fids).
+ */
+extern const struct dt_index_features dt_directory_features;
+
+/*
  * Per-dt-object operations.
  */
 struct dt_object_operations {
@@ -163,6 +206,21 @@ struct dt_object_operations {
          */
         int   (*do_object_destroy)(const struct lu_context *ctxt,
                                    struct dt_object *dt, struct thandle *th);
+        /*
+         * Announce that this object is going to be used as an index. This
+         * operation check that object supports indexing operations and
+         * installs appropriate dt_index_operations vector on success.
+         *
+         * Also probes for features. Operation is successful if all required
+         * features are supported. In this case, value of @cookie key is used
+         * as an opaque datum with format required by the underlying indexing
+         * implementation. Value of this key has to be allocated through
+         * ->dt_index_cookie().
+         */
+        int   (*do_object_index_try)(const struct lu_context *ctxt,
+                                     struct dt_object *dt,
+                                     const struct dt_index_features *feat,
+                                     struct dt_index_cookie *cookie);
 };
 
 /*
@@ -196,36 +254,6 @@ struct dt_rec;
  */
 struct dt_key;
 
-struct dt_index_features {
-        /* required feature flags from enum dt_index_flags */
-        __u32 dif_flags;
-        /* minimal required key size */
-        size_t dif_keysize_min;
-        /* maximal required key size, 0 if no limit */
-        size_t dif_keysize_max;
-        /* minimal required record size */
-        size_t dif_recsize_min;
-        /* maximal required record size, 0 if no limit */
-        size_t dif_recsize_max;
-};
-
-enum dt_index_flags {
-        /* index supports variable sized keys */
-        DT_IND_VARKEY = 1 << 0,
-        /* index supports variable sized records */
-        DT_IND_VARREC = 1 << 1,
-        /* index can be modified */
-        DT_IND_UPDATE = 1 << 2,
-        /* index supports records with non-unique (duplicate) keys */
-        DT_IND_NONUNQ = 1 << 3
-};
-
-/*
- * Features, required from index to support file system directories (mapping
- * names to fids).
- */
-extern const struct dt_index_features dt_directory_features;
-
 /*
  * Per-dt-object operations on object as index.
  */
@@ -245,14 +273,7 @@ struct dt_index_operations {
          * precondition: lu_object_exists(ctxt, &dt->do_lu);
          */
         int (*dio_delete)(const struct lu_context *ctxt, struct dt_object *dt,
-                          const struct dt_rec *rec, const struct dt_key *key,
-                          struct thandle *handle);
-        /*
-         * Features probing. Returns 1 if this index supports all features in
-         * @feat, -ve on error, 0 otherwise.
-         */
-        int (*dio_probe)(const struct lu_context *ctxt, struct dt_object *dt,
-                         const struct dt_index_features *feat);
+                          const struct dt_key *key, struct thandle *handle);
 };
 
 struct dt_device {
index 017d0f0..4d3ebd7 100644 (file)
@@ -718,13 +718,15 @@ struct lu_context_key {
          * Value constructor. This is called when new value is created for a
          * context. Returns pointer to new value of error pointer.
          */
-        void  *(*lct_init)(const struct lu_context *ctx);
+        void  *(*lct_init)(const struct lu_context *ctx,
+                           struct lu_context_key *key);
         /*
          * Value destructor. Called when context with previously allocated
          * value of this slot is destroyed. @data is a value that was returned
          * by a matching call to ->lct_init().
          */
-        void   (*lct_fini)(const struct lu_context *ctx, void *data);
+        void   (*lct_fini)(const struct lu_context *ctx,
+                           struct lu_context_key *key, void *data);
         /*
          * Internal implementation detail: index within ->lc_value[] reserved
          * for this key.
index 2b2ea26..840fcd7 100644 (file)
@@ -82,6 +82,7 @@ struct fld {
         struct dt_device        *fld_dt;
         struct dt_object        *fld_obj;
         struct lu_fid            fld_fid; /* used during initialization */
+        struct dt_index_cookie  *fld_cookie;
 };
 
 int  fld_server_init(const struct lu_context *ctx, struct fld *fld,
index 0d4403b..46b89af 100644 (file)
-Index: linux-stage/fs/ext3/namei.c
+Index: iam/fs/ext3/Makefile
 ===================================================================
---- linux-stage.orig/fs/ext3/namei.c   2006-05-29 13:01:21.000000000 +0800
-+++ linux-stage/fs/ext3/namei.c        2006-05-29 13:01:22.000000000 +0800
-@@ -24,81 +24,6 @@
-  *    Theodore Ts'o, 2002
-  */
--/*
-- * iam: big theory statement.
-- *
-- * iam (Index Access Module) is a module providing abstraction of persistent
-- * transactional container on top of generalized ext3 htree.
-- *
-- * iam supports:
-- *
-- *     - key, pointer, and record size specifiable per container.
-- *
-- *     - trees taller than 2 index levels.
-- *
-- *     - read/write to existing ext3 htree directories as iam containers.
-- *
-- * iam container is a tree, consisting of leaf nodes containing keys and
-- * records stored in this container, and index nodes, containing keys and
-- * pointers to leaf or index nodes.
-- *
-- * iam does not work with keys directly, instead it calls user-supplied key
-- * comparison function (->dpo_keycmp()).
-- *
-- * Pointers are (currently) interpreted as logical offsets (measured in
-- * blocksful) within underlying flat file on top of which iam tree lives.
-- *
-- * On-disk format:
-- *
-- * iam mostly tries to reuse existing htree formats.
-- *
-- * Format of index node:
-- *
-- * +-----+-------+-------+-------+------+-------+------------+
-- * |     | count |       |       |      |       |            |
-- * | gap |   /   | entry | entry | .... | entry | free space |
-- * |     | limit |       |       |      |       |            |
-- * +-----+-------+-------+-------+------+-------+------------+
-- *
-- *       gap           this part of node is never accessed by iam code. It
-- *                     exists for binary compatibility with ext3 htree (that,
-- *                     in turn, stores fake struct ext2_dirent for ext2
-- *                     compatibility), and to keep some unspecified per-node
-- *                     data. Gap can be different for root and non-root index
-- *                     nodes. Gap size can be specified for each container
-- *                     (gap of 0 is allowed).
-- *
-- *       count/limit   current number of entries in this node, and the maximal
-- *                     number of entries that can fit into node. count/limit
-- *                     has the same size as entry, and is itself counted in
-- *                     count.
-- *
-- *       entry         index entry: consists of a key immediately followed by
-- *                     a pointer to a child node. Size of a key and size of a
-- *                     pointer depends on container. Entry has neither
-- *                     alignment nor padding.
-- *
-- *       free space    portion of node new entries are added to
-- *
-- * Entries in index node are sorted by their key value.
-- *
-- * Format of leaf node:
-- *
-- * +-----+-------+-------+-------+------+-------+------------+
-- * |     | count |       |       |      |       |            |
-- * | gap |   /   | leaf  | leaf  | .... | leaf  | free space |
-- * |     | limit |       |       |      |       |            |
-- * +-----+-------+-------+-------+------+-------+------------+
--
-- *       leaf          For leaf entry: consists of a rec immediately followd by 
-- *                     a key. size of a key and size of a rec depends on container.  
-- *
-- *
-- *
-- *
-- *
-- */
--
- #include <linux/module.h>
- #include <linux/fs.h>
- #include <linux/pagemap.h>
-@@ -112,10 +37,10 @@
- #include <linux/quotaops.h>
- #include <linux/buffer_head.h>
- #include <linux/smp_lock.h>
-+#include <linux/lustre_iam.h>
- #include "xattr.h"
- #include "iopen.h"
- #include "acl.h"
--#include <linux/lustre_iam.h>
- /*
-  * define how far ahead to read directories while searching them.
-  */
-@@ -125,9 +50,9 @@
- #define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
--static struct buffer_head *ext3_append(handle_t *handle,
--                                      struct inode *inode,
--                                      u32 *block, int *err)
-+struct buffer_head *ext3_append(handle_t *handle,
-+                              struct inode *inode,
-+                              u32 *block, int *err)
- {
-       struct buffer_head *bh;
-@@ -136,14 +61,15 @@
-       if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
-               inode->i_size += inode->i_sb->s_blocksize;
-               EXT3_I(inode)->i_disksize = inode->i_size;
--              ext3_journal_get_write_access(handle,bh);
-+              *err = ext3_journal_get_write_access(handle, bh);
-+              if (err != 0) {
-+                      brelse(bh);
-+                      bh = NULL;
-+              }
-       }
-       return bh;
- }
--#ifndef assert
--#define assert(test) J_ASSERT(test)
--#endif
- #ifndef swap
- #define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
-@@ -162,10 +88,6 @@
-       u8 file_type;
- };
--struct dx_countlimit {
--      __le16 limit;
--      __le16 count;
--};
- /*
-  * dx_root_info is laid out so that if it should somehow get overlaid by a
-@@ -203,245 +125,10 @@
- };
+--- iam.orig/fs/ext3/Makefile  2006-05-27 19:58:43.000000000 +0400
++++ iam/fs/ext3/Makefile       2006-05-27 20:03:07.000000000 +0400
+@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o
  
+ ext3-y        := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+          ioctl.o namei.o super.o symlink.o hash.o resize.o \
+-         extents.o mballoc.o
++         extents.o mballoc.o iam.o iam_lfix.o
  
--static u32 htree_root_ptr(struct iam_container *c);
--static int htree_node_check(struct iam_path *path, struct iam_frame *frame);
--static int htree_node_init(struct iam_container *c,
--                         struct buffer_head *bh, int root);
--static int htree_keycmp(struct iam_container *c,
--                      struct iam_key *k1, struct iam_key *k2);
--static int htree_node_read(struct iam_container *c, iam_ptr_t ptr,
--                         handle_t *h, struct buffer_head **bh);
--
--/*
-- * Parameters describing iam compatibility mode in which existing ext3 htrees
-- * can be manipulated.
-- */
--static struct iam_descr htree_compat_param = {
--      .id_key_size = sizeof ((struct dx_map_entry *)NULL)->hash,
--      .id_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs,
--      .id_node_gap = offsetof(struct dx_node, entries),
--      .id_root_gap = offsetof(struct dx_root, entries),
--
--      .id_root_ptr   = htree_root_ptr,
--      .id_node_check = htree_node_check,
--      .id_node_init  = htree_node_init,
--      .id_node_read  = htree_node_read,
--      .id_keycmp     = htree_keycmp
--};
--
--
--struct iam_key;
--struct iam_rec;
--struct iam_descr;
--struct iam_container;
--struct iam_path;
--
--
--
--/*
-- * iam cursor (iterator) api.
-- */
--
--/*
-- * Flags controlling iterator functionality.
-- */
--enum iam_it_flags {
--      /*
--       * this iterator will move (iam_it_{prev,next}() will be called on it)
--       */
--      IAM_IT_MOVE  = (1 << 0),
--      /*
--       * tree can be updated through this iterator.
--       */
--      IAM_IT_WRITE = (1 << 1)
--};
--
--/*
-- * States of iterator state machine.
-- */
--enum iam_it_state {
--      /* initial state */
--      IAM_IT_DETACHED,
--      /* iterator is above particular record in the container */
--      IAM_IT_ATTACHED
--};
--
--struct htree_cookie {
--      struct dx_hash_info *hinfo;
--      struct dentry       *dentry;
--};
--
--/*
-- * Iterator.
-- *
-- * Immediately after call to iam_it_init() iterator is in "detached"
-- * (IAM_IT_DETACHED) state: it is associated with given parent container, but
-- * doesn't point to any particular record in this container.
-- *
-- * After successful call to iam_it_get() and until corresponding call to
-- * iam_it_put() iterator is in "attached" state (IAM_IT_ATTACHED).
-- *
-- * Attached iterator can move through records in a container (provided
-- * IAM_IT_MOVE permission) in a key order, can get record and key values as it
-- * passes over them, and can modify container (provided IAM_IT_WRITE
-- * permission).
-- *
-- * Concurrency: iterators are supposed to be local to thread. Interfaces below
-- * do no internal serialization.
-- *
-- */
--struct iam_iterator {
--      /*
--       * iterator flags, taken from enum iam_it_flags.
--       */
--      __u32                 ii_flags;
--      enum iam_it_state     ii_state;
--      /*
--       * path to the record. Valid in IAM_IT_ATTACHED state.
--       */
--      struct iam_path       ii_path;
--};
--
--static inline struct iam_key *keycpy(struct iam_container *c,
--                                   struct iam_key *k1, struct iam_key *k2)
--{
--      return memcpy(k1, k2, c->ic_descr->id_key_size);
--}
--
--static inline int keycmp(struct iam_container *c,
--                       struct iam_key *k1, struct iam_key *k2)
--{
--      return c->ic_descr->id_keycmp(c, k1, k2);
--}
--
--static struct iam_container *iam_it_container(struct iam_iterator *it)
--{
--      return it->ii_path.ip_container;
--}
--
--static inline int it_keycmp(struct iam_iterator *it,
--                          struct iam_key *k1, struct iam_key *k2)
--{
--      return keycmp(iam_it_container(it), k1, k2);
--}
--
--/*
-- * Initialize iterator to IAM_IT_DETACHED state.
-- *
-- * postcondition: it_state(it) == IAM_IT_DETACHED
-- */
--int  iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags);
--/*
-- * Finalize iterator and release all resources.
-- *
-- * precondition: it_state(it) == IAM_IT_DETACHED
-- */
--void iam_it_fini(struct iam_iterator *it);
--
--/*
-- * Attach iterator. After successful completion, @it points to record with the
-- * largest key not larger than @k. Semantics of ->id_create() method guarantee
-- * that such record will always be found.
-- *
-- * Return value: 0: positioned on existing record,
-- *             -ve: error.
-- *
-- * precondition:  it_state(it) == IAM_IT_DETACHED
-- * postcondition: ergo(result == 0,
-- *                     (it_state(it) == IAM_IT_ATTACHED &&
-- *                      it_keycmp(it, iam_it_key_get(it, *), k) < 0))
-- */
--int iam_it_get(struct iam_iterator *it, struct iam_key *k);
--
--/*
-- * Duplicates iterator.
-- *
-- * postcondition: it_state(dst) == it_state(src) &&
-- *                iam_it_container(dst) == iam_it_container(src) &&
-- *                dst->ii_flags = src->ii_flags &&
-- *                ergo(it_state(it) == IAM_IT_ATTACHED,
-- *                     iam_it_rec_get(dst) == iam_it_rec_get(src) &&
-- *                     iam_it_key_get(dst, *1) == iam_it_key_get(src, *2))
-- */
--void iam_it_dup(struct iam_iterator *dst, struct iam_iterator *src);
--
--/*
-- * Detach iterator. Does nothing it detached state.
-- *
-- * postcondition: it_state(it) == IAM_IT_DETACHED
-- */
--void iam_it_put(struct iam_iterator *it);
--
--/*
-- * Move iterator one record right.
-- *
-- * Return value: 0: success,
-- *              +1: end of container reached
-- *             -ve: error
-- *
-- * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_MOVE
-- * postcondition: ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED)
-- */
--int iam_it_next(struct iam_iterator *it);
--
--/*
-- * Return pointer to the record under iterator.
-- *
-- * precondition:  it_state(it) == IAM_IT_ATTACHED
-- * postcondition: it_state(it) == IAM_IT_ATTACHED
-- */
--const struct iam_rec *iam_it_rec_get(struct iam_iterator *it);
--
--/*
-- * Replace contents of record under iterator.
-- *
-- * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
-- * postcondition: it_state(it) == IAM_IT_ATTACHED &&
-- *                ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...))
-- */
--int iam_it_rec_set(handle_t *h, struct iam_iterator *it, struct iam_rec *r);
--
--/*
-- * Place key under iterator in @k, return @k
-- *
-- * precondition:  it_state(it) == IAM_IT_ATTACHED
-- * postcondition: it_state(it) == IAM_IT_ATTACHED
-- */
--const struct iam_key *iam_it_key_get(struct iam_iterator *it,
--                                   struct iam_key *k);
--
--/*
-- * Insert new record with key @k and contents from @r, shifting records to the
-- * right.
-- *
-- * precondition:  it_state(it) == IAM_IT_ATTACHED &&
-- *                it->ii_flags&IAM_IT_WRITE &&
-- *                it_keycmp(it, iam_it_key_get(it, *), k) < 0
-- * postcondition: it_state(it) == IAM_IT_ATTACHED &&
-- *                ergo(result == 0,
-- *                     it_keycmp(it, iam_it_key_get(it, *), k) == 0 &&
-- *                     !memcmp(iam_it_rec_get(it), r, ...))
-- */
--int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
--                    struct iam_key *k, struct iam_rec *r);
--/*
-- * Delete record under iterator.
-- *
-- * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
-- * postcondition: it_state(it) == IAM_IT_ATTACHED
-- */
--int iam_it_rec_delete(handle_t *h, struct iam_iterator *it);
--
- #ifdef CONFIG_EXT3_INDEX
- static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry);
- static void dx_set_block(struct iam_path *p,
-                        struct iam_entry *entry, unsigned value);
--static inline struct iam_key *dx_get_key(struct iam_path *p,
--                                      struct iam_entry *entry,
--                                      struct iam_key *key);
--static void dx_set_key(struct iam_path *p, struct iam_entry *entry,
--                     struct iam_key *key);
--static unsigned dx_get_count(struct iam_entry *entries);
- static unsigned dx_get_limit(struct iam_entry *entries);
- static void dx_set_count(struct iam_entry *entries, unsigned value);
- static void dx_set_limit(struct iam_entry *entries, unsigned value);
-@@ -457,80 +144,29 @@
- static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
-               struct dx_map_entry *offsets, int count);
- static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
--static void dx_insert_block (struct iam_path *path,
--                           struct iam_frame *frame, u32 hash, u32 block);
--static int ext3_htree_next_block(struct inode *dir, __u32 hash,
--                               struct iam_path *path, __u32 *start_hash);
- static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
-                      struct ext3_dir_entry_2 **res_dir, int *err);
- static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
-                            struct inode *inode);
--static inline void iam_path_init(struct iam_path *path,
--                               struct iam_container *c, struct htree_cookie *hc);
--static inline void iam_path_fini(struct iam_path *path);
--
--
--/*
-- * Future: use high four bits of block for coalesce-on-delete flags
-- * Mask them off for now.
-- */
--
--static inline void *entry_off(struct iam_entry *entry, ptrdiff_t off)
--{
--      return (void *)((char *)entry + off);
--}
--
--static inline struct iam_descr *path_descr(struct iam_path *p)
--{
--      return p->ip_container->ic_descr;
--}
--
--static inline struct inode *path_obj(struct iam_path *p)
--{
--      return p->ip_container->ic_object;
--}
--
- static inline size_t iam_entry_size(struct iam_path *p)
- {
--      return path_descr(p)->id_key_size + path_descr(p)->id_ptr_size;
-+      return iam_path_descr(p)->id_key_size + iam_path_descr(p)->id_ptr_size;
- }
- static inline struct iam_entry *iam_entry_shift(struct iam_path *p,
--                                            struct iam_entry *entry, int shift)
-+                                              struct iam_entry *entry,
-+                                              int shift)
- {
-       void *e = entry;
-       return e + shift * iam_entry_size(p);
- }
--static inline ptrdiff_t iam_entry_diff(struct iam_path *p,
--                                    struct iam_entry *e1, struct iam_entry *e2)
--{
--      ptrdiff_t diff;
--
--      diff = (void *)e1 - (void *)e2;
--      assert(diff / iam_entry_size(p) * iam_entry_size(p) == diff);
--      return diff / iam_entry_size(p);
--}
--
--static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry)
--{
--      return le32_to_cpu(*(u32 *)entry_off(entry, path_descr(p)->id_key_size))
--              & 0x00ffffff;
--}
--
--static inline void dx_set_block(struct iam_path *p,
--                              struct iam_entry *entry, unsigned value)
-+static inline struct iam_key *iam_get_key(struct iam_path *p,
-+                                        struct iam_entry *entry,
-+                                        struct iam_key *key)
- {
--      *(u32*)entry_off(entry,
--                       path_descr(p)->id_key_size) = cpu_to_le32(value);
--}
--
--static inline struct iam_key *dx_get_key(struct iam_path *p,
--                                      struct iam_entry *entry,
--                                      struct iam_key *key)
--{
--      memcpy(key, entry, path_descr(p)->id_key_size);
-+      memcpy(key, entry, iam_path_descr(p)->id_key_size);
-       return key;
- }
-@@ -540,68 +176,70 @@
-       return (struct iam_key *)entry;
- }
--static inline void dx_set_key(struct iam_path *p,
--                            struct iam_entry *entry, struct iam_key *key)
--{
--      memcpy(entry, key, path_descr(p)->id_key_size);
--}
--
--static inline unsigned dx_get_count (struct iam_entry *entries)
--{
--      return le16_to_cpu(((struct dx_countlimit *) entries)->count);
--}
--
--static inline unsigned dx_get_limit (struct iam_entry *entries)
-+static inline ptrdiff_t iam_entry_diff(struct iam_path *p,
-+                                     struct iam_entry *e1,
-+                                     struct iam_entry *e2)
- {
--      return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
--}
-+      ptrdiff_t diff;
--static inline void dx_set_count (struct iam_entry *entries, unsigned value)
--{
--      ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
-+      diff = (void *)e1 - (void *)e2;
-+      assert(diff / iam_entry_size(p) * iam_entry_size(p) == diff);
-+      return diff / iam_entry_size(p);
- }
--static inline void dx_set_limit (struct iam_entry *entries, unsigned value)
-+static inline void dx_set_limit(struct iam_entry *entries, unsigned value)
- {
-       ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
- }
- static inline unsigned dx_root_limit(struct iam_path *p)
- {
--      struct iam_descr *param = path_descr(p);
--      unsigned entry_space = path_obj(p)->i_sb->s_blocksize -
-+      struct iam_descr *param = iam_path_descr(p);
-+      unsigned entry_space = iam_path_obj(p)->i_sb->s_blocksize -
-               param->id_root_gap;
-       return entry_space / (param->id_key_size + param->id_ptr_size);
- }
--static inline unsigned dx_node_limit(struct iam_path *p)
--{
--      struct iam_descr *param = path_descr(p);
--      unsigned entry_space   = path_obj(p)->i_sb->s_blocksize -
--              param->id_node_gap;
--      return entry_space / (param->id_key_size + param->id_ptr_size);
--}
-+/*
-+ * Two iam_descr's are provided:
-+ *
-+ *    - htree_compat_param that supports legacy ext3-htree indices;
-+ *    - fixed_rec_param that supports containers with records of fixed size.
-+ *
-+ */
--static inline int dx_index_is_compat(struct iam_path *path)
--{
--      return path_descr(path) == &htree_compat_param;
--}
-+static u32 htree_root_ptr(struct iam_container *c);
-+static int htree_node_check(struct iam_path *path, struct iam_frame *frame);
-+static int htree_node_init(struct iam_container *c, struct buffer_head *bh, int root);
-+static int htree_keycmp(const struct iam_container *c,
-+                      const struct iam_key *k1, const struct iam_key *k2);
--static struct iam_entry *dx_get_entries(struct iam_path *path, void *data,
--                                     int root)
--{
--      return data +
--              (root ?
--               path_descr(path)->id_root_gap : path_descr(path)->id_node_gap);
--}
-+struct iam_operations htree_operation = {
-+      .id_root_ptr   = htree_root_ptr,
-+      .id_node_check = htree_node_check,
-+      .id_node_init  = htree_node_init,
-+      .id_node_read  = iam_node_read,
-+      .id_keycmp     = htree_keycmp
-+};
-+
-+/*
-+ * Parameters describing iam compatibility mode in which existing ext3 htrees
-+ * can be manipulated.
-+ */
-+struct iam_descr htree_compat_param = {
-+      .id_key_size = sizeof ((struct dx_map_entry *)NULL)->hash,
-+      .id_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs,
-+      .id_node_gap = offsetof(struct dx_node, entries),
-+      .id_root_gap = offsetof(struct dx_root, entries),
-+      .id_ops      = &htree_operation
-+};
--static struct iam_entry *dx_node_get_entries(struct iam_path *path,
--                                          struct iam_frame *frame)
-+static inline int dx_index_is_compat(struct iam_path *path)
- {
--      return dx_get_entries(path,
--                            frame->bh->b_data, frame == path->ip_frames);
-+      return iam_path_descr(path) == &htree_compat_param;
- }
-+
- static int dx_node_check(struct iam_path *p, struct iam_frame *f)
- {
-       struct iam_entry     *e;
-@@ -614,10 +252,10 @@
-       count = dx_get_count(e);
-       e = iam_entry_shift(p, e, 1);
-       for (i = 0; i < count - 1; ++i, e = iam_entry_shift(p, e, 1)) {
--              keycpy(c, p->ip_key_scratch[0], p->ip_key_scratch[1]);
--              dx_get_key(p, e, p->ip_key_scratch[1]);
-+              iam_keycpy(c, iam_path_key(p, 0), iam_path_key(p, 1));
-+              iam_get_key(p, e, iam_path_key(p, 1));
-               if (i > 0 &&
--                  keycmp(c, p->ip_key_scratch[0], p->ip_key_scratch[1]) > 0)
-+                  iam_keycmp(c, iam_path_key(p, 0), iam_path_key(p, 1)) > 0)
-                       return 0;
-       }
-       return 1;
-@@ -636,13 +274,17 @@
-       data = frame->bh->b_data;
-       entries = dx_node_get_entries(path, frame);
--      sb = path_obj(path)->i_sb;
-+      sb = iam_path_obj(path)->i_sb;
-       if (frame == path->ip_frames) {
-               /* root node */
-               struct dx_root *root;
--              struct htree_cookie *hc = path->ip_descr_data;
-+              struct iam_path_compat *ipc;
-               root = data;
-+              assert(path->ip_data != NULL);
-+              ipc = container_of(path->ip_data, struct iam_path_compat,
-+                                 ipc_descr);
-+
-               if (root->info.hash_version > DX_HASH_MAX) {
-                       ext3_warning(sb, __FUNCTION__,
-                                    "Unrecognised inode hash code %d",
-@@ -669,15 +311,16 @@
-                                          root->info.info_length));
-               assert(dx_get_limit(entries) == dx_root_limit(path));
--              hc->hinfo->hash_version = root->info.hash_version;
--              hc->hinfo->seed = EXT3_SB(sb)->s_hash_seed;
--              if (hc->dentry)
--                      ext3fs_dirhash(hc->dentry->d_name.name,
--                                     hc->dentry->d_name.len, hc->hinfo);
--              path->ip_key_target = (struct iam_key *)&hc->hinfo->hash;
-+              ipc->ipc_hinfo->hash_version = root->info.hash_version;
-+              ipc->ipc_hinfo->seed = EXT3_SB(sb)->s_hash_seed;
-+              if (ipc->ipc_dentry)
-+                      ext3fs_dirhash(ipc->ipc_dentry->d_name.name,
-+                                     ipc->ipc_dentry->d_name.len,
-+                                     ipc->ipc_hinfo);
-+              path->ip_key_target = (struct iam_key *)&ipc->ipc_hinfo->hash;
-       } else {
-               /* non-root index */
--              assert(entries == data + path_descr(path)->id_node_gap);
-+              assert(entries == data + iam_path_descr(path)->id_node_gap);
-               assert(dx_get_limit(entries) == dx_node_limit(path));
-       }
-       frame->entries = frame->at = entries;
-@@ -697,8 +340,8 @@
-       return 0;
- }
--static int htree_node_read(struct iam_container *c, iam_ptr_t ptr,
--                         handle_t *handle, struct buffer_head **bh)
-+int iam_node_read(struct iam_container *c, iam_ptr_t ptr,
-+                handle_t *handle, struct buffer_head **bh)
- {
-       int result = 0;
-@@ -708,8 +351,8 @@
-       return result;
- }
--static int htree_keycmp(struct iam_container *c,
--                      struct iam_key *k1, struct iam_key *k2)
-+static int htree_keycmp(const struct iam_container *c,
-+                      const struct iam_key *k1, const struct iam_key *k2)
- {
-       __u32 p1 = le32_to_cpu(*(__u32 *)k1);
-       __u32 p2 = le32_to_cpu(*(__u32 *)k2);
-@@ -800,7 +443,7 @@
- }
- #endif /* DX_DEBUG */
--static int dx_lookup(struct iam_path *path)
-+int dx_lookup(struct iam_path *path)
- {
-       u32 ptr;
-       int err = 0;
-@@ -810,11 +453,11 @@
-       struct iam_frame *frame;
-       struct iam_container *c;
--      param = path_descr(path);
-+      param = iam_path_descr(path);
-       c = path->ip_container;
-       
-       for (frame = path->ip_frames, i = 0,
--                   ptr = param->id_root_ptr(path->ip_container);
-+                   ptr = param->id_ops->id_root_ptr(path->ip_container);
-            i <= path->ip_indirect;
-            ptr = dx_get_block(path, frame->at), ++frame, ++i) {
-               struct iam_entry *entries;
-@@ -823,10 +466,11 @@
-               struct iam_entry *m;
-               unsigned count;
--              err = param->id_node_read(c, (iam_ptr_t)ptr, NULL, &frame->bh);
-+              err = param->id_ops->id_node_read(c, (iam_ptr_t)ptr, NULL,
-+                                                &frame->bh);
-               if (err != 0)
-                       break;
--              err = param->id_node_check(path, frame);
-+              err = param->id_ops->id_node_check(path, frame);
-               if (err != 0)
-                       break;
-@@ -841,8 +485,8 @@
-                       m = iam_entry_shift(path,
-                                          p, iam_entry_diff(path, q, p) / 2);
-                       dxtrace(printk("."));
--                      if (keycmp(c, iam_key_at(path, m),
--                                 path->ip_key_target) > 0)
-+                      if (iam_keycmp(c, iam_key_at(path, m),
-+                                     path->ip_key_target) > 0)
-                               q = iam_entry_shift(path, m, -1);
-                       else
-                               p = iam_entry_shift(path, m, +1);
-@@ -857,12 +501,12 @@
-                       while (n--) {
-                               dxtrace(printk(","));
-                               at = iam_entry_shift(path, at, +1);
--                              if (keycmp(c, iam_key_at(path, at),
--                                         path->ip_key_target) > 0) {
-+                              if (iam_keycmp(c, iam_key_at(path, at),
-+                                             path->ip_key_target) > 0) {
-                                       if (at != iam_entry_shift(path, frame->at, 1)) {
-                                               BREAKPOINT;
-                                               printk(KERN_EMERG "%i\n",
--                                                     keycmp(c, iam_key_at(path, at),
-+                                                     iam_keycmp(c, iam_key_at(path, at),
-                                                             path->ip_key_target));
-                                       }
-                                       at = iam_entry_shift(path, at, -1);
-@@ -891,508 +535,20 @@
-                   struct dx_hash_info *hinfo, struct iam_path *path)
- {
-       int err;
--      struct htree_cookie hc = {
--              .dentry = dentry,
--              .hinfo  = hinfo
--      };
-+      struct iam_path_compat *ipc;
-+
-+      assert(path->ip_data != NULL);
-+      ipc = container_of(path->ip_data, struct iam_path_compat, ipc_descr);
-+      ipc->ipc_dentry = dentry;
-+      ipc->ipc_hinfo = hinfo;
-       assert(dx_index_is_compat(path));
--      path->ip_descr_data = &hc;
-       err = dx_lookup(path);
-       assert(err != 0 || path->ip_frames[path->ip_indirect].bh != NULL);
-       return err;
- }
- /*
-- * Initialize container @c, acquires additional reference on @inode.
-- */
--int iam_container_init(struct iam_container *c,
--                     struct iam_descr *descr, struct inode *inode)
--{
--      memset(c, 0, sizeof *c);
--      c->ic_descr  = descr;
--      c->ic_object = igrab(inode);
--      if (c->ic_object != NULL)
--              return 0;
--      else
--              return -ENOENT;
--}
--
--/*
-- * Finalize container @c, release all resources.
-- */
--void iam_container_fini(struct iam_container *c)
--{
--      if (c->ic_object != NULL) {
--              iput(c->ic_object);
--              c->ic_object = NULL;
--      }
--}
--
--static inline void iam_path_init(struct iam_path *path, struct iam_container *c, 
--                               struct htree_cookie *hc)
--{
--      memset(path, 0, sizeof *path);
--      path->ip_container = c;
--      path->ip_frame = path->ip_frames;
--      path->ip_descr_data = hc;
--}
--
--static inline void iam_path_fini(struct iam_path *path)
--{
--      int i;
--
--      for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) {
--              if (path->ip_frames[i].bh != NULL) {
--                      brelse(path->ip_frames[i].bh);
--                      path->ip_frames[i].bh = NULL;
--              }
--      }
--}
--
--static void iam_path_compat_init(struct iam_path_compat *path,
--                               struct inode *inode)
--{
--      int i;
--
--      iam_container_init(&path->ipc_container, &htree_compat_param, inode);
--      /*
--       * XXX hack allowing finalization of iam_path_compat with
--       * iam_path_fini().
--       */
--      iput(inode);
--      iam_path_init(&path->ipc_path, &path->ipc_container, NULL);
--      for (i = 0; i < ARRAY_SIZE(path->ipc_path.ip_key_scratch); ++i)
--              path->ipc_path.ip_key_scratch[i] =
--                      (struct iam_key *)&path->ipc_scrach[i];
--}
--
--static void iam_path_compat_fini(struct iam_path_compat *path)
--{
--      iam_path_fini(&path->ipc_path);
--      iam_container_fini(&path->ipc_container);
--}
--
--static int iam_leaf_init(struct iam_path *path, struct iam_leaf *leaf)
--{
--      int block, err;
--      struct buffer_head *bh;
--      
--      block = dx_get_block(path, path->ip_frame->at);
--      err = path_descr(path)->id_node_read(path->ip_container, block, 
--                                           NULL, &bh);
--      if (err)
--              return err;
--
--      leaf->bh = bh;
--      leaf->entries = (struct iam_leaf_entry *)bh->b_data;
--      return 0;
--}
--
--static void iam_leaf_fini(struct iam_leaf *leaf)
--{
--      if (leaf->bh)
--              brelse(leaf->bh);
--}
--
--/*
-- * Search container @c for record with key @k. If record is found, its data
-- * are moved into @r.
-- *
-- *
-- *
-- * Return values: +ve: found, 0: not-found, -ve: error
-- */
--
--int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r)
--{
--      struct dx_hash_info     hinfo;
--      struct iam_path_compat cpath;
--      struct iam_path *path = &cpath.ipc_path;
--      struct htree_cookie hc = {
--              .hinfo  = &hinfo
--      };
--      int err, i;
--
--      iam_path_init(path, c, &hc);
--      for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
--              path->ip_key_scratch[i] =
--                      (struct iam_key *)&cpath.ipc_scrach[i];
--      err = dx_lookup(path);
--      do {
--              struct iam_leaf leaf;
--              err = iam_leaf_init(path, &leaf);
--              if (err)
--                      goto errout;
--
--              for (path_descr(path)->id_leaf.start(c, &leaf);
--                   !path_descr(path)->id_leaf.at_end(c, &leaf);
--                   path_descr(path)->id_leaf.next(c, &leaf)) {
--                      struct iam_key *key;
--
--                      key = kmalloc(path_descr(path)->id_key_size, GFP_KERNEL);
--                      path_descr(path)->id_leaf.key(c, &leaf, key);
--                      if (keycmp(c, k, key) == 0) {
--                              memcpy(r, path_descr(path)->id_leaf.rec(c, &leaf),
--                                     path_descr(path)->id_rec_size);
--                              iam_path_fini(path);
--                              iam_leaf_fini(&leaf);
--                              return 0;
--                      }
--              }
--
--              iam_leaf_fini(&leaf);
--              /* Check to see if we should continue to search */
--              err = ext3_htree_next_block(c->ic_object, hinfo.hash, path, NULL);
--              if (err < 0)
--                      goto errout;
--      } while (err == 1);
--errout:
--      iam_path_fini(path);
--      return(err);
--}
--EXPORT_SYMBOL(iam_lookup);
--
--static inline size_t iam_leaf_entry_size(struct iam_path *p)
--{
--      return path_descr(p)->id_rec_size + path_descr(p)->id_key_size;
--}
--
--static inline ptrdiff_t iam_leaf_entry_diff(struct iam_path *p,
--                                    struct iam_leaf_entry *e1, struct iam_leaf_entry *e2)
--{
--      ptrdiff_t diff;
--
--      diff = (void *)e1 - (void *)e2;
--      assert(diff / iam_leaf_entry_size(p) * iam_leaf_entry_size(p) == diff);
--      return diff / iam_leaf_entry_size(p);
--}
--
--static inline struct iam_leaf_entry* 
--iam_leaf_entry_shift(struct iam_path *p, struct iam_leaf_entry *entry, int shift)
--{
--      void *e = entry;
--      return e + shift * iam_leaf_entry_size(p);
--}
--
--static inline struct iam_key *
--dx_leaf_get_key(struct iam_path *p, struct iam_leaf_entry *e, struct iam_key *key)
--{
--      memcpy(key, e, path_descr(p)->id_key_size);
--      return key;
--}
--
--static inline struct iam_key *
--iam_leaf_key_at(struct iam_path *p, struct iam_leaf_entry *entry)
--{
--      void *e = entry;
--      return e + path_descr(p)->id_rec_size;
--}
--static inline struct iam_leaf_entry *
--iam_leaf_entry_at(struct iam_path *p, struct iam_leaf_entry *entry)
--{
--      return entry; 
--}
--
--static int iam_leaf_lookup(struct iam_path *path, struct iam_leaf *leaf, 
--                         struct iam_key *k)
--{
--      struct iam_leaf_entry *p, *q, *m;
--      struct iam_leaf_entry *entries = leaf->entries;
--      int count = dx_get_count((struct iam_entry *)entries);
--      
--      p = iam_leaf_entry_shift(path, entries, 1);
--      q = iam_leaf_entry_shift(path, entries, count - 1);
--      while (p <= q) {
--              m = iam_leaf_entry_shift(path,
--                                 p, iam_leaf_entry_diff(path, q, p) / 2);
--              dxtrace(printk("."));
--              if (keycmp(path->ip_container, iam_leaf_key_at(path, m),
--                         path->ip_key_target) > 0)
--                      q = iam_leaf_entry_shift(path, m, -1);
--              else
--                      p = iam_leaf_entry_shift(path, m, +1);
--      }
--      leaf->at = q; 
--      return 0;
--}
--
--/*XXX what kind of lock should this entry be locked: WangDi */
--static int iam_leaf_insert(handle_t *handle, struct iam_path *path, 
--                         struct iam_key *k, struct iam_rec *r)
--{
--      struct iam_leaf leaf;
--      struct iam_leaf_entry *p, *q;
--      int err, count;
--
--      err = iam_leaf_init(path, &leaf);
--      if (err)
--              goto errout;
--      path_descr(path)->id_leaf.start(path->ip_container, &leaf);
--      count = dx_get_count((struct iam_entry *)leaf.entries);
--      if (dx_get_count((struct iam_entry *)leaf.entries) >= 
--          dx_get_limit((struct iam_entry *)leaf.entries)){
--              err = -ENOSPC;
--              goto errout;
--      }
--
--      err = iam_leaf_lookup(path, &leaf, k);
--      if (err)
--              goto errout;
--      
--      /*insert the k/r to leaf entries*/
--      p = iam_leaf_entry_shift(path, leaf.at, 1);
--      q = iam_leaf_entry_shift(path, leaf.entries, count - 1);
--      while (q < p) {
--              memcpy(iam_leaf_entry_shift(path, q, 1), q, iam_leaf_entry_size(path));
--              q = iam_leaf_entry_shift(path, q, -1);  
--      }
--      memcpy(iam_leaf_entry_at(path, p), r, path_descr(path)->id_rec_size);
--      memcpy(iam_leaf_key_at(path, p), k, path_descr(path)->id_key_size);
--
--      dx_set_count((struct iam_entry*)leaf.entries, count + 1);
--      err = ext3_journal_dirty_metadata(handle, leaf.bh);
--      if (err)
--              ext3_std_error(path->ip_container->ic_object->i_sb, err);
--errout:       
--      iam_leaf_fini(&leaf);
--      return err;
--} 
--
--static int split_leaf_node(handle_t *handle, struct iam_path *path)
--{
--      struct inode *dir = path_obj(path);
--      unsigned continued = 0;
--      struct buffer_head *bh2;
--      u32 newblock, hash_split;
--      char *data2;
--      struct iam_leaf leaf;
--      unsigned split;
--      int     err;
--
--      bh2 = ext3_append (handle, dir, &newblock, &err);
--      if (!(bh2)) {
--              err = -ENOSPC;
--              goto errout;
--      }
--      err = iam_leaf_init(path, &leaf);
--      if (err)
--              goto errout;
--
--      BUFFER_TRACE(leaf.bh, "get_write_access");
--      err = ext3_journal_get_write_access(handle, leaf.bh);
--      if (err) {
--      journal_error:
--              iam_leaf_fini(&leaf);
--              brelse(bh2);
--              ext3_std_error(dir->i_sb, err);
--              err = -EIO;
--              goto errout;
--      }
--      data2 = bh2->b_data;
--      split = dx_get_count((struct iam_entry*)leaf.entries)/2;
--      hash_split = *(__u32*)iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split));
--      if (keycmp(path->ip_container, iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split)),
--                 iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split -1))) == 0)
--              continued = 1;
--
--      memcpy(iam_leaf_entry_shift(path, (struct iam_leaf_entry *)data2, 1),
--             iam_leaf_entry_shift(path, leaf.entries, split),
--             split * iam_leaf_entry_size(path));
-- 
--      /* Which block gets the new entry? */
--      dx_insert_block(path, path->ip_frame, hash_split + continued, newblock);
--      err = ext3_journal_dirty_metadata (handle, bh2);
--      if (err)
--              goto journal_error;
--      err = ext3_journal_dirty_metadata (handle, leaf.bh);
--      if (err)
--              goto journal_error;
--      brelse (bh2);
--      iam_leaf_fini(&leaf);
--errout:
--      return err;
--}
--
--static int split_index_node(handle_t *handle, struct iam_path *path);
--/*
-- * Insert new record @r with key @k into container @c (within context of
-- * transaction @h.
-- *
-- * Return values: 0: success, -ve: error, including -EEXIST when record with
-- * given key is already present.
-- *
-- * postcondition: ergo(result == 0 || result == -EEXIST,
-- *                                  iam_lookup(c, k, r2) > 0 &&
-- *                                  !memcmp(r, r2, c->ic_descr->id_rec_size));
-- */
--int iam_insert(handle_t *handle, struct iam_container *c, struct iam_key *k, 
--             struct iam_rec *r)
--{
--      struct dx_hash_info     hinfo;
--      struct iam_path_compat cpath;
--      struct iam_path *path = &cpath.ipc_path;
--      struct htree_cookie hc = {
--              .hinfo  = &hinfo
--      };
--      int err, i;
--
--      iam_path_init(path, c, &hc);
--      for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
--              path->ip_key_scratch[i] =
--                      (struct iam_key *)&cpath.ipc_scrach[i];
--      err = dx_lookup(path);
--      if (err)
--              goto errout; 
--
--      err = iam_leaf_insert(handle, path, k, r);
--      
--      if (err != -ENOSPC) 
--              goto errout;    
--
--      err = split_index_node(handle, path);
--      if (err)
--              goto errout;    
--
--      err = split_leaf_node(handle, path);
--      if (err)
--              goto errout;
--      
--      err = iam_leaf_insert(handle, path, k, r);
--errout:
--      iam_path_fini(path);
--      return(err);
--}
--
--EXPORT_SYMBOL(iam_insert);
--static int iam_leaf_delete(handle_t *handle, struct iam_path *path, 
--                         struct iam_key *k)
--{
--      struct iam_leaf leaf;
--      struct iam_leaf_entry *p, *q;
--      int err, count;
--
--      err = iam_leaf_init(path, &leaf);
--      if (err)
--              goto errout;
--      
--      err = iam_leaf_lookup(path, &leaf, k);
--      if (err)
--              goto errout;
--
--      count = dx_get_count((struct iam_entry*)leaf.entries);
--      /*delete the k to leaf entries*/
--      p = iam_leaf_entry_shift(path, leaf.at, 1);
--      q = iam_leaf_entry_shift(path, leaf.entries, count - 1);
--      while (p < q) {
--              memcpy(p, iam_leaf_entry_shift(path, p, 1), iam_leaf_entry_size(path));
--              p = iam_leaf_entry_shift(path, p, 1);
--      }
--      dx_set_count((struct iam_entry*)leaf.entries, count - 1);
--
--      err = ext3_journal_dirty_metadata(handle, leaf.bh);
--      if (err)
--              ext3_std_error(path_obj(path)->i_sb, err);
--errout:       
--      iam_leaf_fini(&leaf);
--      return err;
--}
--
--/*
-- * Delete existing record with key @k.
-- *
-- * Return values: 0: success, -ENOENT: not-found, -ve: other error.
-- *
-- * postcondition: ergo(result == 0 || result == -ENOENT,
-- *                                 !iam_lookup(c, k, *));
-- */
--int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k)
--{
--      struct dx_hash_info     hinfo;
--      struct iam_path_compat cpath;
--      struct iam_path *path = &cpath.ipc_path;
--      struct htree_cookie hc = {
--              .hinfo  = &hinfo
--      };
--      int err, i;
--
--      iam_path_init(path, c, &hc);
--      for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
--              path->ip_key_scratch[i] =
--                      (struct iam_key *)&cpath.ipc_scrach[i];
--      err = dx_lookup(path);
--      if (err)
--              goto errout; 
--
--      err = iam_leaf_delete(h, path, k);
--errout:
--      iam_path_fini(path);
--      return err;
--}
--
--EXPORT_SYMBOL(iam_delete);
--
--static int iam_leaf_update(handle_t *handle, struct iam_path *path, 
--                         struct iam_key *k, struct iam_rec *r)
--{
--      struct iam_leaf leaf;
--      int err;
--
--      err = iam_leaf_init(path, &leaf);
--      if (err)
--              goto errout;
--      
--      err = iam_leaf_lookup(path, &leaf, k);
--      if (err)
--              goto errout;
--
--      memcpy(iam_leaf_entry_at(path, leaf.at), r, path_descr(path)->id_rec_size);
--      memcpy(iam_leaf_key_at(path, leaf.at), k, path_descr(path)->id_key_size);
--
--      err = ext3_journal_dirty_metadata(handle, leaf.bh);
--      if (err)
--              ext3_std_error(path_obj(path)->i_sb, err);
--errout:       
--      iam_leaf_fini(&leaf);
--      return err;
--}
--/*
-- * Replace existing record with key @k, or insert new one. New record data are
-- * in @r.
-- *
-- * Return values: 0: success, -ve: error.
-- *
-- * postcondition: ergo(result == 0, iam_lookup(c, k, r2) > 0 &&
-- *                                  !memcmp(r, r2, c->ic_descr->id_rec_size));
-- */
--int iam_update(handle_t *h, struct iam_container *c,
--             struct iam_key *k, struct iam_rec *r)
--{
--      struct dx_hash_info     hinfo;
--      struct iam_path_compat cpath;
--      struct iam_path *path = &cpath.ipc_path;
--      struct htree_cookie hc = {
--              .hinfo  = &hinfo
--      };
--      int err, i;
--      
--      iam_path_init(path, c, &hc);
--      for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
--              path->ip_key_scratch[i] =
--                      (struct iam_key *)&cpath.ipc_scrach[i];
--      err = dx_lookup(path);
--      if (err)
--              goto errout; 
--
--      err = iam_leaf_update(h, path, k, r);
--errout:
--      iam_path_fini(path);
--      return err;
--}
--
--EXPORT_SYMBOL(iam_update);
--
--/*
-  * This function increments the frame pointer to search the next leaf
-  * block, and reads in the necessary intervening nodes if the search
-  * should be necessary.  Whether or not the search is necessary is
-@@ -1409,16 +565,15 @@
-  * If start_hash is non-null, it will be filled in with the starting
-  * hash of the next page.
-  */
--static int ext3_htree_next_block(struct inode *dir, __u32 hash,
--                               struct iam_path *path, __u32 *start_hash)
-+static int ext3_htree_advance(struct inode *dir, __u32 hash,
-+                            struct iam_path *path, __u32 *start_hash,
-+                            int compat)
- {
-       struct iam_frame *p;
-       struct buffer_head *bh;
-       int err, num_frames = 0;
-       __u32 bhash;
--      assert(dx_index_is_compat(path));
--
-       p = path->ip_frame;
-       /*
-        * Find the next leaf page by incrementing the frame pointer.
-@@ -1438,28 +593,34 @@
-               --p;
-       }
--      /*
--       * If the hash is 1, then continue only if the next page has a
--       * continuation hash of any value.  This is used for readdir
--       * handling.  Otherwise, check to see if the hash matches the
--       * desired contiuation hash.  If it doesn't, return since
--       * there's no point to read in the successive index pages.
--       */
--      dx_get_key(path, p->at, (struct iam_key *)&bhash);
--      if (start_hash)
--              *start_hash = bhash;
--      if ((hash & 1) == 0) {
--              if ((bhash & ~1) != hash)
--                      return 0;
-+      if (compat) {
-+              /*
-+               * Htree hash magic.
-+               */
-+              /*
-+               * If the hash is 1, then continue only if the next page has a
-+               * continuation hash of any value.  This is used for readdir
-+               * handling.  Otherwise, check to see if the hash matches the
-+               * desired contiuation hash.  If it doesn't, return since
-+               * there's no point to read in the successive index pages.
-+               */
-+              iam_get_key(path, p->at, (struct iam_key *)&bhash);
-+              if (start_hash)
-+                      *start_hash = bhash;
-+              if ((hash & 1) == 0) {
-+                      if ((bhash & ~1) != hash)
-+                              return 0;
-+              }
-       }
-       /*
-        * If the hash is HASH_NB_ALWAYS, we always go to the next
-        * block so no check is necessary
-        */
-       while (num_frames--) {
--              err = path_descr(path)->id_node_read(path->ip_container,
--                                                   (iam_ptr_t)dx_get_block(path, p->at),
--                                                   NULL, &bh);
-+              err = iam_path_descr(path)->id_ops->
-+                      id_node_read(path->ip_container,
-+                                   (iam_ptr_t)dx_get_block(path, p->at),
-+                                   NULL, &bh);
-               if (err != 0)
-                       return err; /* Failure */
-               ++p;
-@@ -1471,6 +632,16 @@
-       return 1;
- }
-+int iam_index_next(struct iam_container *c, struct iam_path *path)
-+{
-+      return ext3_htree_advance(c->ic_object, 0, path, NULL, 0);
-+}
-+
-+int ext3_htree_next_block(struct inode *dir, __u32 hash,
-+                        struct iam_path *path, __u32 *start_hash)
-+{
-+      return ext3_htree_advance(dir, hash, path, start_hash, 1);
-+}
- /*
-  * p is at least 6 bytes before the end of page
-@@ -1662,21 +833,30 @@
-       } while(more);
- }
--static void dx_insert_block(struct iam_path *path,
--                          struct iam_frame *frame, u32 hash, u32 block)
-+void iam_insert_key(struct iam_path *path, struct iam_frame *frame,
-+                  const struct iam_key *key, iam_ptr_t ptr)
- {
-       struct iam_entry *entries = frame->entries;
--      struct iam_entry *old = frame->at, *new = iam_entry_shift(path, old, +1);
-+      struct iam_entry *new = iam_entry_shift(path, frame->at, +1);
-       int count = dx_get_count(entries);
-       assert(count < dx_get_limit(entries));
--      assert(old < iam_entry_shift(path, entries, count));
-+      assert(frame->at < iam_entry_shift(path, entries, count));
-+
-       memmove(iam_entry_shift(path, new, 1), new,
-               (char *)iam_entry_shift(path, entries, count) - (char *)new);
--      dx_set_key(path, new, (struct iam_key *)&hash);
--      dx_set_block(path, new, block);
-+      dx_set_key(path, new, key);
-+      dx_set_block(path, new, ptr);
-       dx_set_count(entries, count + 1);
- }
-+
-+void dx_insert_block(struct iam_path *path, struct iam_frame *frame,
-+                   u32 hash, u32 block)
-+{
-+      assert(dx_index_is_compat(path));
-+      iam_insert_key(path, frame, (struct iam_key *)&hash, block);
-+}
-+
- #endif
-@@ -1897,14 +1077,15 @@
-               if (*err != 0)
-                       return NULL;
-       } else {
--              path->ip_frame->bh = NULL;              /* for iam_path_fini() */
-+              path->ip_frame->bh = NULL;      /* for iam_path_fini() */
-               path->ip_frame->at = (void *)&dummy_dot;/* hack for zero entry*/
-       }
-       hash = hinfo.hash;
-       do {
-               block = dx_get_block(path, path->ip_frame->at);
--              *err = path_descr(path)->id_node_read(path->ip_container, (iam_ptr_t)block,
--                                                   NULL, &bh);
-+              *err = iam_path_descr(path)->id_ops->id_node_read(path->ip_container,
-+                                                        (iam_ptr_t)block,
-+                                                        NULL, &bh);
-               if (*err != 0)
-                       goto errout;
-               de = (struct ext3_dir_entry_2 *) bh->b_data;
-@@ -2067,7 +1248,7 @@
-                       struct buffer_head **bh,struct iam_frame *frame,
-                       struct dx_hash_info *hinfo, int *error)
- {
--      struct inode *dir = path_obj(path);
-+      struct inode *dir = iam_path_obj(path);
-       unsigned blocksize = dir->i_sb->s_blocksize;
-       unsigned count, continued;
-       struct buffer_head *bh2;
-@@ -2392,15 +1573,15 @@
- }
- #ifdef CONFIG_EXT3_INDEX
--static int split_index_node(handle_t *handle, struct iam_path *path)
--{ 
-+int split_index_node(handle_t *handle, struct iam_path *path)
-+{
-       struct iam_entry *entries;   /* old block contents */
-       struct iam_entry *entries2;  /* new block contents */
-       struct iam_frame *frame, *safe;
-       struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0};
-       u32 newblock[DX_MAX_TREE_HEIGHT] = {0};
--      struct inode *dir = path_obj(path);
-+      struct inode *dir = iam_path_obj(path);
-       int nr_splet;
-       int i, err;
-@@ -2442,7 +1623,8 @@
-       for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
-               bh_new[i] = ext3_append (handle, dir, &newblock[i], &err);
-               if (!bh_new[i] ||
--                  path_descr(path)->id_node_init(path->ip_container, bh_new[i], 0) != 0)
-+                iam_path_descr(path)->id_ops->id_node_init(path->ip_container,
-+                                                     bh_new[i], 0) != 0)
-                       goto cleanup;
-               BUFFER_TRACE(frame->bh, "get_write_access");
-               err = ext3_journal_get_write_access(handle, frame->bh);
-@@ -2516,9 +1698,9 @@
-                       unsigned count1 = count/2, count2 = count - count1;
-                       unsigned hash2;
--                      dx_get_key(path,
--                                 iam_entry_shift(path, entries, count1),
--                                 (struct iam_key *)&hash2);
-+                      iam_get_key(path,
-+                                  iam_entry_shift(path, entries, count1),
-+                                  (struct iam_key *)&hash2);
-                       dxtrace(printk("Split index %i/%i\n", count1, count2));
-@@ -2578,7 +1760,7 @@
-       size_t isize;
-       iam_path_compat_init(&cpath, dir);
--      param = path_descr(path);
-+      param = iam_path_descr(path);
-       err = dx_probe(dentry, NULL, &hinfo, path);
-       if (err != 0)
-@@ -2588,8 +1770,9 @@
-       /* XXX nikita: global serialization! */
-       isize = dir->i_size;
--      err = param->id_node_read(path->ip_container, (iam_ptr_t)dx_get_block(path, frame->at), 
--                                handle, &bh);
-+      err = param->id_ops->id_node_read(path->ip_container,
-+                      (iam_ptr_t)dx_get_block(path, frame->at),
-+                      handle, &bh);
-       if (err != 0)
-               goto cleanup;
-@@ -2724,12 +1907,12 @@
-  * is so far negative - it has no inode.
-  *
-  * If the create succeeds, we fill in the inode information
-- * with d_instantiate(). 
-+ * with d_instantiate().
-  */
- static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
-               struct nameidata *nd)
- {
--      handle_t *handle; 
-+      handle_t *handle;
-       struct inode * inode;
-       int err, retries = 0;
-Index: linux-stage/fs/ext3/iam.c
+ ext3-$(CONFIG_EXT3_FS_XATTR)   += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
+Index: iam/fs/ext3/iam.c
 ===================================================================
---- linux-stage.orig/fs/ext3/iam.c     2006-05-29 18:23:53.597737944 +0800
-+++ linux-stage/fs/ext3/iam.c  2006-05-29 13:01:22.000000000 +0800
-@@ -0,0 +1,990 @@
+--- iam.orig/fs/ext3/iam.c     2004-04-06 17:27:52.000000000 +0400
++++ iam/fs/ext3/iam.c  2006-05-29 22:49:31.000000000 +0400
+@@ -0,0 +1,1021 @@
 +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 + * vim:expandtab:shiftwidth=8:tabstop=8:
 + *
@@ -1589,1222 +123,2884 @@ Index: linux-stage/fs/ext3/iam.c
 +#include <linux/smp_lock.h>
 +#include <linux/lustre_iam.h>
 +
-+#include <libcfs/libcfs.h>
-+#include <libcfs/kp30.h>
++#include <libcfs/libcfs.h>
++#include <libcfs/kp30.h>
++
++#include "xattr.h"
++#include "iopen.h"
++#include "acl.h"
++
++/*
++ * List of all registered formats.
++ *
++ * No locking. Callers synchronize.
++ */
++static LIST_HEAD(iam_formats);
++
++void iam_format_register(struct iam_format *fmt)
++{
++        list_add(&fmt->if_linkage, &iam_formats);
++}
++EXPORT_SYMBOL(iam_format_register);
++
++static int iam_format_guess(struct iam_container *c)
++{
++        int result;
++        struct iam_format *fmt;
++
++        /*
++         * XXX temporary initialization hook.
++         */
++        {
++                static int initialized = 0;
++
++                if (!initialized) {
++                        iam_lfix_format_init();
++                        initialized = 1;
++                }
++        }
++
++        result = -ENOENT;
++        list_for_each_entry(fmt, &iam_formats, if_linkage) {
++                result = fmt->if_guess(c);
++                if (result == 0)
++                        break;
++        }
++        return result;
++}
++
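++/*
++ * Editorial sketch (not part of the original patch): roughly how a format
++ * module such as lfix is expected to plug into the registry above.  The
++ * "example_" names and the guess stub are hypothetical; only
++ * iam_format_register() and the ->if_guess() hook come from this file.
++ */
++static int example_format_guess(struct iam_container *c)
++{
++        /* a real format would inspect the 0-th block of c->ic_object here */
++        return -ENOENT;
++}
++
++static struct iam_format example_format = {
++        .if_guess = example_format_guess
++};
++
++void example_format_init(void)
++{
++        iam_format_register(&example_format);
++}
++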
++/*
++ * Initialize container @c, acquires additional reference on @inode.
++ */
++int iam_container_init(struct iam_container *c,
++                     struct iam_descr *descr, struct inode *inode)
++{
++      memset(c, 0, sizeof *c);
++      c->ic_descr  = descr;
++      c->ic_object = igrab(inode);
++      if (c->ic_object != NULL)
++              return 0;
++      else
++              return -ENOENT;
++}
++EXPORT_SYMBOL(iam_container_init);
++
++/*
++ * Determine container format.
++ */
++int iam_container_setup(struct iam_container *c)
++{
++        return iam_format_guess(c);
++}
++EXPORT_SYMBOL(iam_container_setup);
++
++/*
++ * Finalize container @c, release all resources.
++ */
++void iam_container_fini(struct iam_container *c)
++{
++      if (c->ic_object != NULL) {
++              iput(c->ic_object);
++              c->ic_object = NULL;
++      }
++}
++EXPORT_SYMBOL(iam_container_fini);
++
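++/*
++ * Editorial sketch (not part of the original patch): the intended container
++ * life cycle, assuming the caller supplies a pinned inode and a descriptor
++ * (for a self-describing lfix file the format is then verified from the
++ * 0-th block by iam_container_setup()).  "example_container_open" is a
++ * hypothetical name.
++ */
++static int example_container_open(struct iam_container *c,
++                                  struct iam_descr *descr, struct inode *inode)
++{
++        int result;
++
++        result = iam_container_init(c, descr, inode);
++        if (result == 0) {
++                result = iam_container_setup(c);
++                if (result != 0)
++                        iam_container_fini(c);
++        }
++        return result;
++}
++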
++void iam_path_init(struct iam_path *path, struct iam_container *c,
++                   struct iam_path_descr *pd)
++{
++      memset(path, 0, sizeof *path);
++      path->ip_container = c;
++      path->ip_frame = path->ip_frames;
++      path->ip_data = pd;
++}
++
++static void iam_leaf_fini(struct iam_leaf *leaf);
++
++void iam_path_fini(struct iam_path *path)
++{
++      int i;
++
++      iam_leaf_fini(&path->ip_leaf);
++      for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) {
++              if (path->ip_frames[i].bh != NULL) {
++                      brelse(path->ip_frames[i].bh);
++                      path->ip_frames[i].bh = NULL;
++              }
++      }
++}
++
++extern struct iam_descr htree_compat_param;
++
++void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode)
++{
++      int i;
++
++      for (i = 0; i < ARRAY_SIZE(path->ipc_scratch); ++i)
++              path->ipc_descr.ipd_key_scratch[i] =
++                      (struct iam_key *)&path->ipc_scratch[i];
++
++      iam_container_init(&path->ipc_container, &htree_compat_param, inode);
++      /*
++       * XXX hack allowing finalization of iam_path_compat with
++       * iam_path_fini().
++       */
++      iput(inode);
++      iam_path_init(&path->ipc_path, &path->ipc_container, &path->ipc_descr);
++}
++
++void iam_path_compat_fini(struct iam_path_compat *path)
++{
++      iam_path_fini(&path->ipc_path);
++      iam_container_fini(&path->ipc_container);
++}
++
++/*
++ * Helper function allocating iam_path_descr and initializing its key scratch
++ * area.
++ */
++struct iam_path_descr *iam_ipd_alloc(int keysize)
++{
++        struct iam_path_descr *ipd;
++        void *karea;
++        int i;
++
++        ipd = kmalloc(ARRAY_SIZE(ipd->ipd_key_scratch) * keysize +
++                      sizeof *ipd, GFP_KERNEL);
++        if (ipd != NULL) {
++                karea = ipd + 1;
++                for (i = 0; i < ARRAY_SIZE(ipd->ipd_key_scratch);
++                     ++i, karea += keysize)
++                        ipd->ipd_key_scratch[i] = karea;
++        }
++        return ipd;
++}
++EXPORT_SYMBOL(iam_ipd_alloc);
++
++void iam_ipd_free(struct iam_path_descr *ipd)
++{
++        kfree(ipd);
++}
++EXPORT_SYMBOL(iam_ipd_free);
++
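++/*
++ * Editorial sketch (not part of the original patch): the alloc/free pattern
++ * that callers of the lookup and iterator interfaces below are expected to
++ * follow; "example_with_ipd" is a hypothetical name.
++ */
++static int example_with_ipd(struct iam_container *c)
++{
++        struct iam_path_descr *ipd;
++
++        ipd = iam_ipd_alloc(c->ic_descr->id_key_size);
++        if (ipd == NULL)
++                return -ENOMEM;
++        /* ... pass ipd to iam_lookup()/iam_it_init() calls ... */
++        iam_ipd_free(ipd);
++        return 0;
++}
++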
++/*
++ * Leaf helpers.
++ */
++
++struct iam_path *iam_leaf_path(const struct iam_leaf *leaf)
++{
++        return leaf->il_path;
++}
++
++struct iam_container *iam_leaf_container(const struct iam_leaf *leaf)
++{
++        return iam_leaf_path(leaf)->ip_container;
++}
++
++struct iam_descr *iam_leaf_descr(const struct iam_leaf *leaf)
++{
++        return iam_leaf_container(leaf)->ic_descr;
++}
++
++struct iam_leaf_operations *iam_leaf_ops(const struct iam_leaf *leaf)
++{
++        return iam_leaf_descr(leaf)->id_leaf_ops;
++}
++
++/*
++ * Return pointer to current leaf record. Pointer is valid while corresponding
++ * leaf node is locked and pinned.
++ */
++static struct iam_rec *iam_leaf_rec(const struct iam_leaf *leaf)
++{
++      return iam_leaf_ops(leaf)->rec(leaf);
++}
++
++/*
++ * Return pointer to the current leaf key. This function may return either
++ * pointer to the key stored in node, or copy key into @key buffer supplied by
++ * caller and return pointer to this buffer. The latter approach is used when
++ * keys in nodes are not stored in plain form (e.g., htree doesn't store keys
++ * at all).
++ *
++ * Caller should assume that returned pointer is only valid while leaf node is
++ * pinned and locked.
++ */
++static struct iam_key *iam_leaf_key(const struct iam_leaf *leaf,
++                                    struct iam_key *key)
++{
++      return iam_leaf_ops(leaf)->key(leaf, key);
++}
++
++static int iam_leaf_load(struct iam_path *path)
++{
++      int block;
++      int err;
++      struct iam_container *c;
++      struct buffer_head   *bh;
++      struct iam_leaf      *leaf;
++      struct iam_descr     *descr;
++      
++      c     = path->ip_container;
++      leaf  = &path->ip_leaf;
++      descr = iam_path_descr(path);
++      block = dx_get_block(path, path->ip_frame->at);
++      err   = descr->id_ops->id_node_read(c, block, NULL, &bh);
++      if (err == 0) {
++              leaf->il_bh = bh;
++                leaf->il_path = path;
++              err = iam_leaf_ops(leaf)->init(leaf);
++      }
++      return err;
++}
++
++static void iam_leaf_fini(struct iam_leaf *leaf)
++{
++      iam_leaf_ops(leaf)->fini(leaf);
++      if (leaf->il_bh) {
++              brelse(leaf->il_bh);
++              leaf->il_bh = NULL;
++      }
++}
++
++static void iam_leaf_start(struct iam_leaf *folio)
++{
++      iam_leaf_ops(folio)->start(folio);
++}
++
++void iam_leaf_next(struct iam_leaf *folio)
++{
++      iam_leaf_ops(folio)->next(folio);
++}
++
++static void iam_rec_add(struct iam_leaf *leaf, struct iam_key *key,
++                        struct iam_rec *rec)
++{
++        iam_leaf_ops(leaf)->rec_add(leaf, key, rec);
++}
++
++static void iam_rec_del(struct iam_leaf *leaf)
++{
++        iam_leaf_ops(leaf)->rec_del(leaf);
++}
++
++int iam_leaf_at_end(const struct iam_leaf *leaf)
++{
++        return iam_leaf_ops(leaf)->at_end(leaf);
++}
++
++void iam_leaf_split(struct iam_leaf *l, struct buffer_head *bh)
++{
++        iam_leaf_ops(l)->split(l, bh);
++}
++
++static int iam_leaf_can_add(const struct iam_leaf *l,
++                            const struct iam_key *k, const struct iam_rec *r)
++{
++        return iam_leaf_ops(l)->can_add(l, k, r);
++}
++
++/***********************************************************************/
++/* iterator interface                                                  */
++/***********************************************************************/
 +
-+#include "xattr.h"
-+#include "iopen.h"
-+#include "acl.h"
++static enum iam_it_state it_state(const struct iam_iterator *it)
++{
++        return it->ii_state;
++}
 +
++/*
++ * Helper function returning scratch key.
++ */
++static struct iam_key *it_scratch_key(const struct iam_iterator *it, int n)
++{
++        return iam_path_key(&it->ii_path, n);
++}
 +
-+static __u32 iam_root_ptr(struct iam_container *c)
++static struct iam_container *iam_it_container(const struct iam_iterator *it)
 +{
-+        return 0;
++      return it->ii_path.ip_container;
 +}
 +
-+static int iam_node_init(struct iam_container *c, struct buffer_head *bh,
-+                        int root)
++static inline int it_keycmp(const struct iam_iterator *it,
++                          const struct iam_key *k1, const struct iam_key *k2)
 +{
-+        return 0;
++      return iam_keycmp(iam_it_container(it), k1, k2);
 +}
 +
-+static int iam_node_check(struct iam_path *path, struct iam_frame *frame)
++/*
++ * Helper wrapper around iam_it_get(): returns 0 (success) only when record
++ * with exactly the same key as asked is found.
++ */
++static int iam_it_get_exact(struct iam_iterator *it, const struct iam_key *k)
 +{
-+        struct iam_entry *entries;
-+        void *data;
-+        entries = dx_node_get_entries(path, frame);
++        int result;
 +
-+        data = frame->bh->b_data;
++        result = iam_it_get(it, k);
++        if (result == 0 &&
++            (it_keycmp(it, k, iam_it_key_get(it, it_scratch_key(it, 1))) != 0))
++                /*
++                 * Return -ENOENT if the cursor is located above a record
++                 * with a key different from the one specified.
++                 *
++                 * XXX returning -ENOENT only works if iam_it_get never
++                 * returns -ENOENT as a legitimate error.
++                 */
++                result = -ENOENT;
++        return result;
++}
 +
-+        if (frame == path->ip_frames) {
-+               struct iam_root *root;
++void iam_container_write_lock(struct iam_container *ic)
++{
++      down(&ic->ic_object->i_sem);
++}
 +
-+               root = data;
-+               path->ip_indirect = root->info.indirect_levels;
-+        }
-+        frame->entries = frame->at = entries;
-+      return 0;
++void iam_container_write_unlock(struct iam_container *ic)
++{
++      up(&ic->ic_object->i_sem);
 +}
 +
-+static int iam_node_create(struct iam_container *c)
++void iam_container_read_lock(struct iam_container *ic)
 +{
-+        return 0;
++      down(&ic->ic_object->i_sem);
 +}
 +
-+struct iam_operations generic_iam_ops = {
-+        .id_root_ptr    = iam_root_ptr,
-+        .id_node_read   = iam_node_read,
-+        .id_node_init   = iam_node_init,
-+        .id_node_check  = iam_node_check,
-+        .id_create      = iam_node_create,
-+};
-+EXPORT_SYMBOL(generic_iam_ops);
++void iam_container_read_unlock(struct iam_container *ic)
++{
++      up(&ic->ic_object->i_sem);
++}
 +
-+static inline void iam_reccpy(struct iam_path *p, struct iam_rec *rec_dst,
-+                            struct iam_rec *rec_src)
++static void iam_it_lock(struct iam_iterator *it)
 +{
-+      memcpy(rec_dst, rec_src, iam_path_descr(p)->id_rec_size);
++        if (it->ii_flags&IAM_IT_WRITE)
++                iam_container_write_lock(iam_it_container(it));
++        else
++                iam_container_read_lock(iam_it_container(it));
 +}
 +
-+/*
-+ * Initialize container @c, acquires additional reference on @inode.
-+ */
-+int iam_container_init(struct iam_container *c,
-+                     struct iam_descr *descr, struct inode *inode)
++static void iam_it_unlock(struct iam_iterator *it)
 +{
-+      memset(c, 0, sizeof *c);
-+      c->ic_descr  = descr;
-+      c->ic_object = igrab(inode);
-+      if (c->ic_object != NULL)
-+              return 0;
++      if (it->ii_flags&IAM_IT_WRITE)
++              iam_container_write_unlock(iam_it_container(it));
 +      else
-+              return -ENOENT;
++              iam_container_read_unlock(iam_it_container(it));
 +}
-+EXPORT_SYMBOL(iam_container_init);
 +
 +/*
-+ * Finalize container @c, release all resources.
++ * Initialize iterator to IAM_IT_DETACHED state.
++ *
++ * postcondition: it_state(it) == IAM_IT_DETACHED
 + */
-+void iam_container_fini(struct iam_container *c)
++int  iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags,
++               struct iam_path_descr *pd)
 +{
-+      if (c->ic_object != NULL) {
-+              iput(c->ic_object);
-+              c->ic_object = NULL;
-+      }
++      memset(it, 0, sizeof *it);
++      it->ii_flags  = flags;
++      it->ii_state  = IAM_IT_DETACHED;
++      iam_path_init(&it->ii_path, c, pd);
++      return 0;
 +}
-+EXPORT_SYMBOL(iam_container_fini);
 +
-+void iam_path_init(struct iam_path *path, struct iam_container *c,
-+                struct iam_path_descr *pd)
++/*
++ * Finalize iterator and release all resources.
++ *
++ * precondition: it_state(it) == IAM_IT_DETACHED
++ */
++void iam_it_fini(struct iam_iterator *it)
 +{
-+      memset(path, 0, sizeof *path);
-+      path->ip_container = c;
-+      path->ip_frame = path->ip_frames;
-+      path->ip_data = pd;
++      assert(it_state(it) == IAM_IT_DETACHED);
++      iam_path_fini(&it->ii_path);
 +}
 +
-+static void iam_leaf_fini(struct iam_leaf *leaf);
-+
-+void iam_path_fini(struct iam_path *path)
++int iam_path_lookup(struct iam_path *path)
 +{
-+      int i;
-+
-+      iam_leaf_fini(&path->ip_leaf);
-+      for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) {
-+              if (path->ip_frames[i].bh != NULL) {
-+                      brelse(path->ip_frames[i].bh);
-+                      path->ip_frames[i].bh = NULL;
-+              }
++      struct iam_container *c;
++      struct iam_descr *descr;
++      struct iam_leaf  *leaf;
++      int result;
++      
++      c = path->ip_container;
++      leaf = &path->ip_leaf;
++      descr = iam_path_descr(path);
++      result = dx_lookup(path);
++      if (result == 0) {
++              result = iam_leaf_load(path);
++              if (result == 0)
++                      result = iam_leaf_ops(leaf)->lookup(leaf,
++                                                            path->ip_key_target);
 +      }
++      return result;
 +}
 +
-+extern struct iam_descr htree_compat_param;
-+
-+void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode)
++/*
++ * Attach iterator. After successful completion, @it points to the record
++ * with the largest key not larger than @k.
++ *
++ * Return value: 0: positioned on existing record,
++ *             -ve: error.
++ *
++ * precondition:  it_state(it) == IAM_IT_DETACHED
++ * postcondition: ergo(result == 0,
++ *                     (it_state(it) == IAM_IT_ATTACHED &&
++ *                      it_keycmp(it, iam_it_key_get(it, *), k) <= 0))
++ */
++int iam_it_get(struct iam_iterator *it, const struct iam_key *k)
 +{
-+      int i;
++        int result;
++        assert(it_state(it) == IAM_IT_DETACHED);
 +
-+      for (i = 0; i < ARRAY_SIZE(path->ipc_scratch); ++i)
-+              path->ipc_descr.ipd_key_scratch[i] =
-+                      (struct iam_key *)&path->ipc_scratch[i];
++        it->ii_path.ip_key_target = k;
++        iam_it_lock(it);
++        result = iam_path_lookup(&it->ii_path);
++        if (result == 0 || result == -ENOENT)
++                it->ii_state = IAM_IT_ATTACHED;
++        else
++                iam_it_unlock(it);
++      assert(ergo(result == 0,
++                    it_keycmp(it,
++                              iam_it_key_get(it, it_scratch_key(it, 0)),
++                            k) <= 0));
++        return result;
++}
 +
-+      iam_container_init(&path->ipc_container, &htree_compat_param, inode);
-+      /*
-+       * XXX hack allowing finalization of iam_path_compat with
-+       * iam_path_fini().
-+       */
-+      iput(inode);
-+      iam_path_init(&path->ipc_path, &path->ipc_container, &path->ipc_descr);
++/*
++ * Duplicates iterator.
++ *
++ * postcondition: it_state(dst) == it_state(src) &&
++ *                iam_it_container(dst) == iam_it_container(src) &&
++ *                dst->ii_flags == src->ii_flags &&
++ *                ergo(it_state(src) == IAM_IT_ATTACHED,
++ *                     iam_it_rec_get(dst) == iam_it_rec_get(src) &&
++ *                     iam_it_key_get(dst, *1) == iam_it_key_get(src, *2))
++ */
++void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src)
++{
++        dst->ii_flags     = src->ii_flags;
++        dst->ii_state     = src->ii_state;
++        /* XXX not yet. iam_path_dup(&dst->ii_path, &src->ii_path); */
++        /*
++         * XXX: duplicate lock.
++         */
++      assert(it_state(dst) == it_state(src));
++      assert(iam_it_container(dst) == iam_it_container(src));
++      assert(dst->ii_flags == src->ii_flags);
++      assert(ergo(it_state(src) == IAM_IT_ATTACHED,
++                  iam_it_rec_get(dst) == iam_it_rec_get(src) &&
++                  iam_it_key_get(dst, it_scratch_key(dst, 0)) ==
++                  iam_it_key_get(src, it_scratch_key(src, 0))));
++
++}
++/*
++ * Detach iterator. Does nothing in detached state.
++ *
++ * postcondition: it_state(it) == IAM_IT_DETACHED
++ */
++void iam_it_put(struct iam_iterator *it)
++{
++        if (it->ii_state == IAM_IT_ATTACHED) {
++                it->ii_state = IAM_IT_DETACHED;
++              iam_leaf_fini(&it->ii_path.ip_leaf);
++                iam_it_unlock(it);
++        }
 +}
 +
-+void iam_path_compat_fini(struct iam_path_compat *path)
++/*
++ * Move iterator one record right.
++ *
++ * Return value: 0: success,
++ *              +1: end of container reached
++ *             -ve: error
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_MOVE
++ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED)
++ */
++int iam_it_next(struct iam_iterator *it)
 +{
-+      iam_path_fini(&path->ipc_path);
-+      iam_container_fini(&path->ipc_container);
++        int result;
++        struct iam_container *c;
++        struct iam_path      *path;
++        struct iam_leaf      *leaf;
++
++        assert(it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_MOVE);
++
++        c    = iam_it_container(it);
++        path = &it->ii_path;
++        leaf = &path->ip_leaf;
++
++        if (iam_leaf_at_end(leaf)) {
++                /* advance index portion of the path */
++                result = iam_index_next(c, path);
++                if (result == 1) {
++                        result = iam_leaf_load(path);
++                        if (result == 0)
++                                iam_leaf_start(leaf);
++                } else if (result == 0)
++                        /* end of container reached */
++                        result = +1;
++                if (result < 0)
++                        iam_it_put(it);
++        } else {
++                /* advance within leaf node */
++                iam_leaf_next(leaf);
++                result = 0;
++        }
++        assert(ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED));
++        return result;
 +}
 +
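++/*
++ * Editorial sketch (not part of the original patch): a minimal read-only
++ * walk from the position selected by iam_it_get(&it, k) to the end of the
++ * container, built on the iterator calls above.  Assumes @c and @ipd were
++ * set up by the caller; "example_scan" is a hypothetical name.
++ */
++static int example_scan(struct iam_container *c, const struct iam_key *k,
++                        struct iam_path_descr *ipd)
++{
++        struct iam_iterator it;
++        int result;
++
++        iam_it_init(&it, c, IAM_IT_MOVE, ipd);
++        result = iam_it_get(&it, k);
++        while (result == 0) {
++                /* iam_it_rec_get(&it) and iam_it_key_get(&it, ...) are
++                 * valid while the iterator stays attached */
++                result = iam_it_next(&it);
++        }
++        iam_it_put(&it);
++        iam_it_fini(&it);
++        /* +1 means the end of the container was reached */
++        return result;
++}
++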
 +/*
-+ * Leaf helpers.
++ * Return pointer to the record under iterator.
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
 + */
-+
-+struct iam_path *iam_leaf_path(const struct iam_leaf *leaf)
++struct iam_rec *iam_it_rec_get(const struct iam_iterator *it)
 +{
-+        return leaf->il_path;
++        assert(it_state(it) == IAM_IT_ATTACHED);
++        return iam_leaf_rec(&it->ii_path.ip_leaf);
 +}
 +
-+struct iam_container *iam_leaf_container(const struct iam_leaf *leaf)
++static void iam_it_reccpy(struct iam_iterator *it, const struct iam_rec *r)
 +{
-+        return iam_leaf_path(leaf)->ip_container;
-+}
++        struct iam_leaf *folio;
 +
-+struct iam_descr *iam_leaf_descr(const struct iam_leaf *leaf)
-+{
-+        return iam_leaf_container(leaf)->ic_descr;
++        folio = &it->ii_path.ip_leaf;
++        iam_leaf_ops(folio)->rec_set(folio, r);
 +}
 +
-+struct iam_leaf_operations *iam_leaf_ops(const struct iam_leaf *leaf)
++static void iam_it_keycpy(struct iam_iterator *it, const struct iam_key *k)
 +{
-+        return iam_leaf_descr(leaf)->id_leaf_ops;
-+}
++        struct iam_leaf *folio;
 +
-+/*
-+ * Return pointer to current leaf record. Pointer is valid while corresponding
-+ * leaf node is locked and pinned.
-+ */
-+struct iam_rec *iam_leaf_rec(struct iam_leaf *leaf)
-+{
-+      return iam_leaf_ops(leaf)->rec(leaf);
++        folio = &it->ii_path.ip_leaf;
++        iam_leaf_ops(folio)->key_set(folio, k);
 +}
 +
++
 +/*
-+ * Return pointer to the current leaf key. This function may return either
-+ * pointer to the key stored in node, or copy key into @key buffer supplied by
-+ * caller and return pointer to this buffer. The latter approach is used when
-+ * keys in nodes are not stored in plain form (e.g., htree doesn't store keys
-+ * at all).
++ * Replace contents of record under iterator.
 + *
-+ * Caller should assume that returned pointer is only valid while leaf node is
-+ * pinned and locked.
++ * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
++ * postcondition: it_state(it) == IAM_IT_ATTACHED &&
++ *                ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...))
 + */
-+struct iam_key *iam_leaf_key(struct iam_leaf *leaf, struct iam_key *key)
++int iam_it_rec_set(handle_t *h, struct iam_iterator *it, struct iam_rec *r)
 +{
-+      return iam_leaf_ops(leaf)->key(leaf, key);
-+}
++        int result;
 +
-+static int iam_leaf_load(struct iam_path *path)
-+{
-+      int block;
-+      int err;
-+      struct iam_container *c;
-+      struct buffer_head   *bh;
-+      struct iam_leaf      *leaf;
-+      struct iam_descr     *descr;
-+      
-+      c     = path->ip_container;
-+      leaf  = &path->ip_leaf;
-+      descr = iam_path_descr(path);
-+      block = dx_get_block(path, path->ip_frame->at);
-+      err   = descr->id_ops->id_node_read(c, block, NULL, &bh);
-+      if (err == 0) {
-+              leaf->il_bh = bh;
-+                leaf->il_path = path;
-+              err = iam_leaf_ops(leaf)->init(leaf);
-+      }
-+      return err;
-+}
++        assert(it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE);
 +
-+static void iam_leaf_fini(struct iam_leaf *leaf)
-+{
-+      iam_leaf_ops(leaf)->fini(leaf);
-+      if (leaf->il_bh) {
-+              brelse(leaf->il_bh);
-+              leaf->il_bh = NULL;
-+      }
++        result = ext3_journal_get_write_access(h, it->ii_path.ip_leaf.il_bh);
++        if (result == 0)
++                iam_it_reccpy(it, r);
++        return result;
 +}
 +
-+static void iam_leaf_start(struct iam_leaf *folio)
++/*
++ * Return pointer to the key under iterator.
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
++ */
++struct iam_key *iam_it_key_get(const struct iam_iterator *it, struct iam_key *k)
 +{
-+      iam_leaf_ops(folio)->start(folio);
++        assert(it_state(it) == IAM_IT_ATTACHED);
++        return iam_leaf_key(&it->ii_path.ip_leaf, k);
 +}
 +
-+void iam_leaf_next(struct iam_leaf *folio)
++static int iam_leaf_rec_add(handle_t *handle, struct iam_path *path)
 +{
-+      iam_leaf_ops(folio)->next(folio);
-+}
++        int err;
 +
-+static void iam_rec_add(struct iam_leaf *leaf, struct iam_key *key,
-+                        struct iam_rec *rec)
-+{
-+        iam_leaf_ops(leaf)->rec_add(leaf, key, rec);
++        err = ext3_journal_get_write_access(handle, path->ip_leaf.il_bh);
++        if (err)
++                goto journal_error;
++        iam_rec_add(&path->ip_leaf, NULL, NULL);
++      err = ext3_journal_dirty_metadata(handle, path->ip_leaf.il_bh);
++journal_error:
++      if (err)
++                ext3_std_error(iam_path_obj(path)->i_sb, err);
++      return err;
 +}
 +
-+static void iam_rec_del(struct iam_leaf *leaf)
++static int iam_new_leaf(handle_t *handle, struct iam_leaf *leaf)
 +{
-+        iam_leaf_ops(leaf)->rec_del(leaf);
-+}
++        int err;
++        int err2;
++        u32 blknr; /* XXX 32bit block size */
++        struct buffer_head   *new_leaf;
++        struct iam_container *c;
 +
-+int iam_leaf_at_end(const struct iam_leaf *leaf)
-+{
-+        return iam_leaf_ops(leaf)->at_end(leaf);
-+}
++        c = iam_leaf_container(leaf);
++        err = ext3_journal_get_write_access(handle, leaf->il_bh);
++        if (err == 0) {
++                struct inode *obj;
 +
-+void iam_leaf_split(struct iam_leaf *l, struct buffer_head *bh)
-+{
-+        iam_leaf_ops(l)->split(l, bh);
++                obj = c->ic_object;
++                new_leaf = ext3_append(handle, c->ic_object, &blknr, &err);
++                if (new_leaf != NULL) {
++                        iam_leaf_ops(leaf)->init_new(c, new_leaf);
++                        iam_leaf_ops(leaf)->split(leaf, new_leaf);
++                        err = ext3_journal_dirty_metadata(handle, new_leaf);
++                        err2 = ext3_journal_dirty_metadata(handle, leaf->il_bh);
++                        err = err ? : err2;
++                        if (err)
++                                ext3_std_error(obj->i_sb, err);
++                        brelse(new_leaf);
++                }
++        }
++        return err;
 +}
 +
-+static int iam_leaf_can_add(struct iam_leaf *l,
-+                            struct iam_key *k, struct iam_rec *r)
++static int iam_add_rec(handle_t *handle, struct iam_path *path,
++                       const struct iam_key *k, const struct iam_rec *r)
 +{
-+        return iam_leaf_ops(l)->can_add(l, k, r);
-+}
-+
-+/***********************************************************************/
-+/* iterator interface                                                  */
-+/***********************************************************************/
++      int err;
 +
-+static enum iam_it_state it_state(const struct iam_iterator *it)
-+{
-+        return it->ii_state;
++      if (iam_leaf_can_add(&path->ip_leaf, k, r)) {
++              err = iam_leaf_rec_add(handle, path);
++      } else {
++              err = split_index_node(handle, path);
++              if (err == 0) {
++                        err = iam_new_leaf(handle, &path->ip_leaf);
++                      if (err == 0)
++                              err = iam_leaf_rec_add(handle, path);
++              }
++      }
++      return err;
 +}
 +
 +/*
-+ * Helper function returning scratch key.
++ * Insert new record with key @k and contents from @r, shifting records to the
++ * right.
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED &&
++ *                it->ii_flags&IAM_IT_WRITE &&
++ *                it_keycmp(it, iam_it_key_get(it, *), k) < 0
++ * postcondition: it_state(it) == IAM_IT_ATTACHED &&
++ *                ergo(result == 0,
++ *                     it_keycmp(it, iam_it_key_get(it, *), k) == 0 &&
++ *                     !memcmp(iam_it_rec_get(it), r, ...))
 + */
-+static struct iam_key *it_scratch_key(struct iam_iterator *it, int n)
++int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
++                      const struct iam_key *k, const struct iam_rec *r)
 +{
-+        return iam_path_key(&it->ii_path, n);
-+}
++        int result;
 +
-+static struct iam_container *iam_it_container(const struct iam_iterator *it)
-+{
-+      return it->ii_path.ip_container;
++        assert(it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE);
++#if 0
++        /* XXX remove this assert temporarily: if il_at points to the header,
++         * this assert might have some problems. */
++        assert(it_keycmp(it, iam_it_key_get(it, it_scratch_key(it, 0)), k) < 0);
++#endif
++      result = iam_add_rec(h, &it->ii_path, k, r);
++      if (result == 0) {
++              /* place record and key into freed space. Leaf node is already
++               * in transaction. */
++              iam_it_reccpy(it, r);
++                iam_it_keycpy(it, k);
++                iam_keycpy(it->ii_path.ip_container, it_scratch_key(it, 0), k);
++                /*
++               * XXX TBD.
++               */
++        }
++        assert(it_state(it) == IAM_IT_ATTACHED);
++        assert(ergo(result == 0,
++                    it_keycmp(it,
++                              iam_it_key_get(it,
++                                             it_scratch_key(it, 0)), k) == 0 &&
++                    !memcmp(iam_it_rec_get(it), r,
++                            iam_it_container(it)->ic_descr->id_rec_size)));
++        return result;
 +}
 +
-+static inline int it_keycmp(const struct iam_iterator *it,
-+                          const struct iam_key *k1, const struct iam_key *k2)
++static int iam_leaf_rec_remove(handle_t *handle, struct iam_leaf *leaf)
 +{
-+      return iam_keycmp(iam_it_container(it), k1, k2);
++      int err;
++
++        iam_rec_del(leaf);
++      err = ext3_journal_dirty_metadata(handle, leaf->il_bh);
++      if (err)
++              ext3_std_error(iam_path_obj(iam_leaf_path(leaf))->i_sb, err);
++      return err;
 +}
 +
 +/*
-+ * Helper wrapper around iam_it_get(): returns 0 (success) only when record
-+ * with exactly the same key as asked is found.
++ * Delete record under iterator.
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
 + */
-+static int iam_it_get_exact(struct iam_iterator *it, struct iam_key *k)
++int iam_it_rec_delete(handle_t *h, struct iam_iterator *it)
 +{
 +        int result;
 +
-+        result = iam_it_get(it, k);
-+        if (result == 0 &&
-+            (it_keycmp(it, k, iam_it_key_get(it, it_scratch_key(it, 1))) != 0))
-+                /*
-+                 * Return -ENOENT if cursor is located above record with a key
-+                 * different from one specified.
-+                 *
-+                 * XXX returning -ENOENT only works if iam_it_get never
-+                 * returns -ENOENT as a legitimate error.
-+                 */
-+                result = -ENOENT;
-+        return result;
++        assert(it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE);
++
++        result = ext3_journal_get_write_access(h, it->ii_path.ip_leaf.il_bh);
++        /*
++         * no compaction for now.
++         */
++        if (result == 0)
++                iam_leaf_rec_remove(h, &it->ii_path.ip_leaf);
++
++      return result;
 +}
 +
-+void iam_container_write_lock(struct iam_container *ic)
++/*
++ * Convert iterator to cookie.
++ *
++ * precondition:  it_state(it) == IAM_IT_ATTACHED &&
++ *                iam_path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t)
++ * postcondition: it_state(it) == IAM_IT_ATTACHED
++ */
++iam_pos_t iam_it_store(const struct iam_iterator *it)
 +{
-+      down(&ic->ic_object->i_sem);
++        iam_pos_t result;
++
++        assert(it_state(it) == IAM_IT_ATTACHED);
++        assert(iam_it_container(it)->ic_descr->id_key_size <= sizeof result);
++
++        result = 0;
++        iam_it_key_get(it, (struct iam_key *)&result);
++        return result;
 +}
 +
-+void iam_container_write_unlock(struct iam_container *ic)
++/*
++ * Restore iterator from cookie.
++ *
++ * precondition:  it_state(it) == IAM_IT_DETACHED && it->ii_flags&IAM_IT_MOVE &&
++ *                iam_path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t)
++ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED &&
++ *                                  iam_it_store(it) == pos)
++ */
++int iam_it_load(struct iam_iterator *it, iam_pos_t pos)
 +{
-+      up(&ic->ic_object->i_sem);
++        assert(it_state(it) == IAM_IT_DETACHED && it->ii_flags&IAM_IT_MOVE);
++        assert(iam_it_container(it)->ic_descr->id_key_size <= sizeof pos);
++        return iam_it_get(it, (struct iam_key *)&pos);
 +}
 +
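++/*
++ * Editorial sketch (not part of the original patch): how a readdir-style
++ * user might suspend and later resume iteration through the cookie calls
++ * above.  Assumes @it was initialized with IAM_IT_MOVE, is currently
++ * attached, and that keys fit into iam_pos_t; "example_suspend_resume" is
++ * a hypothetical name.
++ */
++static int example_suspend_resume(struct iam_iterator *it, iam_pos_t *pos)
++{
++        /* remember the current position and drop locks/pins... */
++        *pos = iam_it_store(it);
++        iam_it_put(it);
++        /* ...then come back to the saved position later */
++        return iam_it_load(it, *pos);
++}
++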
-+void iam_container_read_lock(struct iam_container *ic)
++/***********************************************************************/
++/* invariants                                                          */
++/***********************************************************************/
++
++static inline int ptr_inside(void *base, size_t size, void *ptr)
 +{
-+      down(&ic->ic_object->i_sem);
++        return (base <= ptr) && (ptr < base + size);
 +}
 +
-+void iam_container_read_unlock(struct iam_container *ic)
++int iam_frame_invariant(struct iam_frame *f)
 +{
-+      up(&ic->ic_object->i_sem);
++        return
++                (f->bh != NULL &&
++                f->bh->b_data != NULL &&
++                ptr_inside(f->bh->b_data, f->bh->b_size, f->entries) &&
++                ptr_inside(f->bh->b_data, f->bh->b_size, f->at) &&
++                f->entries <= f->at);
 +}
-+
-+static void iam_it_lock(struct iam_iterator *it)
++int iam_leaf_invariant(struct iam_leaf *l)
 +{
-+        if (it->ii_flags&IAM_IT_WRITE)
-+                iam_container_write_lock(iam_it_container(it));
-+        else
-+                iam_container_read_lock(iam_it_container(it));
++        return
++                l->il_bh != NULL &&
++                l->il_bh->b_data != NULL &&
++                ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_entries) &&
++                ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_at) &&
++                l->il_entries <= l->il_at;
 +}
 +
-+static void iam_it_unlock(struct iam_iterator *it)
++int iam_path_invariant(struct iam_path *p)
 +{
-+      if (it->ii_flags&IAM_IT_WRITE)
-+              iam_container_write_unlock(iam_it_container(it));
-+      else
-+              iam_container_read_unlock(iam_it_container(it));
++        int i;
++
++        if (p->ip_container == NULL ||
++            p->ip_indirect < 0 || p->ip_indirect > DX_MAX_TREE_HEIGHT - 1 ||
++            p->ip_frame != p->ip_frames + p->ip_indirect ||
++            !iam_leaf_invariant(&p->ip_leaf))
++                return 0;
++        for (i = 0; i < ARRAY_SIZE(p->ip_frames); ++i) {
++                if (i <= p->ip_indirect) {
++                        if (!iam_frame_invariant(&p->ip_frames[i]))
++                                return 0;
++                }
++        }
++        return 1;
 +}
 +
-+/*
-+ * Initialize iterator to IAM_IT_DETACHED state.
-+ *
-+ * postcondition: it_state(it) == IAM_IT_DETACHED
-+ */
-+int  iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags,
-+               struct iam_path_descr *pd)
++int iam_it_invariant(struct iam_iterator *it)
 +{
-+      memset(it, 0, sizeof *it);
-+      it->ii_flags  = flags;
-+      it->ii_state  = IAM_IT_DETACHED;
-+      iam_path_init(&it->ii_path, c, pd);
-+      return 0;
++        return
++                (it->ii_state == IAM_IT_DETACHED ||
++                 it->ii_state == IAM_IT_ATTACHED) &&
++                !(it->ii_flags & ~(IAM_IT_MOVE | IAM_IT_WRITE)) &&
++                ergo(it->ii_state == IAM_IT_ATTACHED,
++                     iam_path_invariant(&it->ii_path));
 +}
 +
 +/*
-+ * Finalize iterator and release all resources.
++ * Search container @c for record with key @k. If record is found, its data
++ * are moved into @r.
 + *
-+ * precondition: it_state(it) == IAM_IT_DETACHED
++ *
++ * Return values: +ve: found, 0: not-found, -ve: error
 + */
-+void iam_it_fini(struct iam_iterator *it)
++int iam_lookup(struct iam_container *c, const struct iam_key *k,
++               struct iam_rec *r, struct iam_path_descr *pd)
 +{
-+      assert(it_state(it) == IAM_IT_DETACHED);
-+      iam_path_fini(&it->ii_path);
-+}
++        struct iam_iterator it;
++        int result;
 +
-+int iam_path_lookup(struct iam_path *path)
-+{
-+      struct iam_container *c;
-+      struct iam_descr *descr;
-+      struct iam_leaf  *leaf;
-+      int result;
-+      
-+      c = path->ip_container;
-+      leaf = &path->ip_leaf;
-+      descr = iam_path_descr(path);
-+      result = dx_lookup(path);
-+      if (result == 0) {
-+              result = iam_leaf_load(path);
-+              if (result == 0)
-+                      result = iam_leaf_ops(leaf)->lookup(leaf,
-+                                                            path->ip_key_target);
-+      }
-+      return result;
++        iam_it_init(&it, c, 0, pd);
++
++        result = iam_it_get_exact(&it, k);
++        if (result == 0)
++                /*
++                 * record with required key found, copy it into user buffer
++                 */
++                iam_reccpy(&it.ii_path, r, iam_it_rec_get(&it));
++        iam_it_put(&it);
++        iam_it_fini(&it);
++        return result;
 +}
++EXPORT_SYMBOL(iam_lookup);
 +
 +/*
-+ * Attach iterator. After successful completion, @it points to record with
-+ * smallest key not larger than @k.
++ * Insert new record @r with key @k into container @c (within context of
++ * transaction @h).
 + *
-+ * Return value: 0: positioned on existing record,
-+ *             -ve: error.
++ * Return values: 0: success, -ve: error, including -EEXIST when record with
++ * given key is already present.
 + *
-+ * precondition:  it_state(it) == IAM_IT_DETACHED
-+ * postcondition: ergo(result == 0,
-+ *                     (it_state(it) == IAM_IT_ATTACHED &&
-+ *                      it_keycmp(it, iam_it_key_get(it, *), k) < 0))
++ * postcondition: ergo(result == 0 || result == -EEXIST,
++ *                                  iam_lookup(c, k, r2) > 0 &&
++ *                                  !memcmp(r, r2, c->ic_descr->id_rec_size));
 + */
-+int iam_it_get(struct iam_iterator *it, struct iam_key *k)
++int iam_insert(handle_t *h, struct iam_container *c, const struct iam_key *k,
++               struct iam_rec *r, struct iam_path_descr *pd)
 +{
++        struct iam_iterator it;
 +        int result;
-+        assert(it_state(it) == IAM_IT_DETACHED);
 +
-+        it->ii_path.ip_key_target = k;
-+        iam_it_lock(it);
-+        result = iam_path_lookup(&it->ii_path);
-+        if (result == 0 || result == -ENOENT)
-+                it->ii_state = IAM_IT_ATTACHED;
-+        else
-+                iam_it_unlock(it);
-+      assert(ergo(result == 0,
-+                    it_keycmp(it,
-+                              iam_it_key_get(it, it_scratch_key(it, 0)),
-+                            k) <= 0));
++        iam_it_init(&it, c, IAM_IT_WRITE, pd);
++
++        result = iam_it_get_exact(&it, k);
++        if (result == -ENOENT)
++                result = iam_it_rec_insert(h, &it, k, r);
++        else if (result == 0)
++                result = -EEXIST;
++        iam_it_put(&it);
++        iam_it_fini(&it);
 +        return result;
 +}
++EXPORT_SYMBOL(iam_insert);
 +
-+/*
-+ * Duplicates iterator.
-+ *
-+ * postcondition: it_state(dst) == it_state(src) &&
-+ *                iam_it_container(dst) == iam_it_container(src) &&
-+ *                dst->ii_flags = src->ii_flags &&
-+ *                ergo(it_state(src) == IAM_IT_ATTACHED,
-+ *                     iam_it_rec_get(dst) == iam_it_rec_get(src) &&
-+ *                     iam_it_key_get(dst, *1) == iam_it_key_get(src, *2))
-+ */
-+void iam_it_dup(struct iam_iterator *dst, struct iam_iterator *src)
++int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k,
++               struct iam_rec *r, struct iam_path_descr *pd)
 +{
-+        dst->ii_flags     = src->ii_flags;
-+        dst->ii_state     = src->ii_state;
-+        /* XXX not yet. iam_path_dup(&dst->ii_path, &src->ii_path); */
-+        /*
-+         * XXX: duplicate lock.
-+         */
-+      assert(it_state(dst) == it_state(src));
-+      assert(iam_it_container(dst) == iam_it_container(src));
-+      assert(dst->ii_flags = src->ii_flags);
-+      assert(ergo(it_state(src) == IAM_IT_ATTACHED,
-+                  iam_it_rec_get(dst) == iam_it_rec_get(src) &&
-+                  iam_it_key_get(dst, it_scratch_key(dst, 0)) ==
-+                  iam_it_key_get(src, it_scratch_key(src, 0))));
++        struct iam_iterator it;
++        int result;
++
++        iam_it_init(&it, c, IAM_IT_WRITE, pd);
 +
++        result = iam_it_get_exact(&it, k);
++        if (result == 0)
++                iam_it_rec_set(h, &it, r);
++        iam_it_put(&it);
++        iam_it_fini(&it);
++        return result;
 +}
++EXPORT_SYMBOL(iam_update);
++
 +/*
-+ * Detach iterator. Does nothing it detached state.
++ * Delete existing record with key @k.
 + *
-+ * postcondition: it_state(it) == IAM_IT_DETACHED
++ * Return values: 0: success, -ENOENT: not-found, -ve: other error.
++ *
++ * postcondition: ergo(result == 0 || result == -ENOENT,
++ *                                 !iam_lookup(c, k, *));
 + */
-+void iam_it_put(struct iam_iterator *it)
++int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k,
++             struct iam_path_descr *pd)
 +{
-+        if (it->ii_state == IAM_IT_ATTACHED) {
-+                it->ii_state = IAM_IT_DETACHED;
-+              iam_leaf_fini(&it->ii_path.ip_leaf);
-+                iam_it_unlock(it);
-+        }
++        struct iam_iterator it;
++        int result;
++
++        iam_it_init(&it, c, IAM_IT_WRITE, pd);
++
++        result = iam_it_get_exact(&it, k);
++        if (result == 0)
++                iam_it_rec_delete(h, &it);
++        iam_it_put(&it);
++        iam_it_fini(&it);
++        return result;
 +}
++EXPORT_SYMBOL(iam_delete);
 +
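++/*
++ * Editorial sketch (not part of the original patch): an insert/lookup/delete
++ * round trip through the high-level calls above, for a container whose keys
++ * and records are plain 64-bit values.  The fixed sizes and "example_" names
++ * are hypothetical; the casts to iam_key/iam_rec follow the style used
++ * elsewhere in this file.
++ */
++static int example_roundtrip(handle_t *h, struct iam_container *c)
++{
++        struct iam_path_descr *ipd;
++        __u64 key = 42;
++        __u64 rec = 0xcafe;
++        int result;
++
++        ipd = iam_ipd_alloc(c->ic_descr->id_key_size);
++        if (ipd == NULL)
++                return -ENOMEM;
++        result = iam_insert(h, c, (const struct iam_key *)&key,
++                            (struct iam_rec *)&rec, ipd);
++        if (result == 0) {
++                /* data of the matching record, if any, are copied into rec */
++                result = iam_lookup(c, (const struct iam_key *)&key,
++                                    (struct iam_rec *)&rec, ipd);
++                if (result >= 0)
++                        result = iam_delete(h, c,
++                                            (const struct iam_key *)&key, ipd);
++        }
++        iam_ipd_free(ipd);
++        return result;
++}
++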
-+/*
-+ * Move iterator one record right.
+Index: iam/fs/ext3/iam_lfix.c
+===================================================================
+--- iam.orig/fs/ext3/iam_lfix.c        2004-04-06 17:27:52.000000000 +0400
++++ iam/fs/ext3/iam_lfix.c     2006-05-29 23:50:12.000000000 +0400
+@@ -0,0 +1,445 @@
++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++ * vim:expandtab:shiftwidth=8:tabstop=8:
++ *
++ *  iam_lfix.c
++ *  implementation of iam format for fixed size records.
++ *
++ *  Copyright (c) 2006 Cluster File Systems, Inc.
++ *   Author: Wang Di <wangdi@clusterfs.com>
++ *   Author: Nikita Danilov <nikita@clusterfs.com>
++ *
++ *   This file is part of the Lustre file system, http://www.lustre.org
++ *   Lustre is a trademark of Cluster File Systems, Inc.
 + *
-+ * Return value: 0: success,
-+ *              +1: end of container reached
-+ *             -ve: error
++ *   You may have signed or agreed to another license before downloading
++ *   this software.  If so, you are bound by the terms and conditions
++ *   of that agreement, and the following does not apply to you.  See the
++ *   LICENSE file included with this distribution for more information.
 + *
-+ * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_MOVE
-+ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED)
++ *   If you did not agree to a different license, then this copy of Lustre
++ *   is open source software; you can redistribute it and/or modify it
++ *   under the terms of version 2 of the GNU General Public License as
++ *   published by the Free Software Foundation.
++ *
++ *   In either case, Lustre is distributed in the hope that it will be
++ *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
++ *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ *   license text for more details.
 + */
-+int iam_it_next(struct iam_iterator *it)
-+{
-+        int result;
-+        struct iam_container *c;
-+        struct iam_path      *path;
-+        struct iam_leaf      *leaf;
 +
-+        assert(it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_MOVE);
++#include <linux/types.h>
++#include <linux/jbd.h>
++/* ext3_error() */
++#include <linux/ext3_fs.h>
 +
-+        c    = iam_it_container(it);
-+        path = &it->ii_path;
-+        leaf = &path->ip_leaf;
++#include <linux/lustre_iam.h>
 +
-+        if (iam_leaf_at_end(leaf)) {
-+                /* advance index portion of the path */
-+                result = iam_index_next(c, path);
-+                if (result == 1) {
-+                        result = iam_leaf_load(path);
-+                        if (result == 0)
-+                                iam_leaf_start(leaf);
-+                } else if (result == 0)
-+                        /* end of container reached */
-+                        result = +1;
-+                if (result < 0)
-+                        iam_it_put(it);
-+        } else {
-+                /* advance within leaf node */
-+                iam_leaf_next(leaf);
-+                result = 0;
-+        }
-+        assert(ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED));
-+        return result;
-+}
++#include <libcfs/libcfs.h>
++#include <libcfs/kp30.h>
 +
 +/*
-+ * Return pointer to the record under iterator.
-+ *
-+ * precondition:  it_state(it) == IAM_IT_ATTACHED
-+ * postcondition: it_state(it) == IAM_IT_ATTACHED
++ * Leaf operations.
 + */
-+struct iam_rec *iam_it_rec_get(struct iam_iterator *it)
++
++enum {
++        IAM_LEAF_HEADER_MAGIC = 0x1976 /* This is duplicated in
++                                        * lustre/utils/create_iam.c */
++};
++
++/* This is duplicated in lustre/utils/create_iam.c */
++struct iam_leaf_head {
++        __le16 ill_magic;
++        __le16 ill_count;
++};
++
++static inline int iam_lfix_entry_size(const struct iam_leaf *l)
 +{
-+        assert(it_state(it) == IAM_IT_ATTACHED);
-+        return iam_leaf_rec(&it->ii_path.ip_leaf);
++        return iam_leaf_descr(l)->id_key_size + iam_leaf_descr(l)->id_rec_size;
 +}
 +
-+static void iam_it_reccpy(struct iam_iterator *it, struct iam_rec *r)
++static inline struct iam_lentry *
++iam_lfix_shift(const struct iam_leaf *l, struct iam_lentry *entry, int shift)
 +{
-+        memcpy(iam_leaf_rec(&it->ii_path.ip_leaf), r,
-+               iam_it_container(it)->ic_descr->id_rec_size);
++        return (void *)entry + shift * iam_lfix_entry_size(l);
 +}
 +
-+static void iam_it_keycpy(struct iam_iterator *it, struct iam_key *k)
++static inline struct iam_key *iam_leaf_key_at(struct iam_lentry *entry)
 +{
-+        memcpy(iam_leaf_key(&it->ii_path.ip_leaf, NULL), k,
-+                iam_it_container(it)->ic_descr->id_key_size);
++        return (struct iam_key *)entry;
 +}
 +
++static struct iam_lentry *iam_entries(const struct buffer_head *bh)
++{
++        return (void *)bh->b_data + sizeof(struct iam_leaf_head);
++}
 +
-+/*
-+ * Replace contents of record under iterator.
-+ *
-+ * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
-+ * postcondition: it_state(it) == IAM_IT_ATTACHED &&
-+ *                ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...))
-+ */
-+int iam_it_rec_set(handle_t *h, struct iam_iterator *it, struct iam_rec *r)
++static struct iam_lentry *iam_get_lentries(const struct iam_leaf *l)
 +{
-+        int result;
++        return iam_entries(l->il_bh);
++}
 +
-+        assert(it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE);
++static int lentry_count_get(const struct iam_leaf *leaf)
++{
++        struct iam_lentry *lentry = leaf->il_entries;
++        return le16_to_cpu(((struct iam_leaf_head *)lentry)->ill_count);
++}
 +
-+        result = ext3_journal_get_write_access(h, it->ii_path.ip_leaf.il_bh);
-+        if (result == 0)
-+                iam_it_reccpy(it, r);
-+        return result;
++static void lentry_count_set(struct iam_leaf *leaf, unsigned count)
++{
++        struct iam_lentry *lentry = leaf->il_entries;
++        ((struct iam_leaf_head *)lentry)->ill_count = cpu_to_le16(count);
 +}
 +
-+/*
-+ * Return pointer to the key under iterator.
-+ *
-+ * precondition:  it_state(it) == IAM_IT_ATTACHED
-+ * postcondition: it_state(it) == IAM_IT_ATTACHED
++/*
++ * Return the key at the current leaf position.  lfix stores keys verbatim
++ * ("flat"), so the entry itself can be returned; a format that does not
++ * store keys explicitly would have to decode the key into the supplied
++ * buffer instead.
++ */
-+struct iam_key *iam_it_key_get(struct iam_iterator *it, struct iam_key *k)
++struct iam_key *iam_lfix_key(const struct iam_leaf *l, struct iam_key *key)
 +{
-+        assert(it_state(it) == IAM_IT_ATTACHED);
-+        return iam_leaf_key(&it->ii_path.ip_leaf, k);
++        void *ie = l->il_at;
++        return (struct iam_key*)ie;
 +}
 +
-+static int iam_leaf_rec_add(handle_t *handle, struct iam_path *path)
++static void iam_lfix_start(struct iam_leaf *l)
 +{
-+        int err;
++        l->il_at = iam_get_lentries(l);
++}
 +
-+        err = ext3_journal_get_write_access(handle, path->ip_leaf.il_bh);
-+        if (err)
-+                goto journal_error;
-+        iam_rec_add(&path->ip_leaf, NULL, NULL);
-+      err = ext3_journal_dirty_metadata(handle, path->ip_leaf.il_bh);
-+journal_error:
-+      if (err)
-+                ext3_std_error(iam_path_obj(path)->i_sb, err);
-+      return err;
++static inline ptrdiff_t iam_lfix_diff(const struct iam_leaf *l,
++                                      const struct iam_lentry *e1,
++                                      const struct iam_lentry *e2)
++{
++        ptrdiff_t diff;
++        int esize;
++
++        esize = iam_lfix_entry_size(l);
++        diff = (void *)e1 - (void *)e2;
++        assert(diff / esize * esize == diff);
++        return diff / esize;
 +}
 +
-+static int iam_new_leaf(handle_t *handle, struct iam_leaf *leaf)
++static int iam_lfix_init(struct iam_leaf *l)
 +{
-+        int err;
-+        int err2;
-+        u32 blknr; /* XXX 32bit block size */
-+        struct buffer_head   *new_leaf;
-+        struct iam_container *c;
++        int result;
++        struct iam_leaf_head *ill;
 +
-+        c = iam_leaf_container(leaf);
-+        err = ext3_journal_get_write_access(handle, leaf->il_bh);
-+        if (err == 0) {
++        assert(l->il_bh != NULL);
++
++        ill = (struct iam_leaf_head*)l->il_bh->b_data;
++        if (ill->ill_magic == le16_to_cpu(IAM_LEAF_HEADER_MAGIC)) {
++                l->il_at = l->il_entries = iam_get_lentries(l);
++                result = 0;
++        } else {
 +                struct inode *obj;
 +
-+                obj = c->ic_object;
-+                new_leaf = ext3_append(handle, c->ic_object, &blknr, &err);
-+                if (new_leaf != NULL) {
-+                        iam_leaf_ops(leaf)->init_new(c, new_leaf);
-+                        iam_leaf_ops(leaf)->split(leaf, new_leaf);
-+                        err = ext3_journal_dirty_metadata(handle, new_leaf);
-+                        err2 = ext3_journal_dirty_metadata(handle, leaf->il_bh);
-+                        err = err ? : err2;
-+                        if (err)
-+                                ext3_std_error(obj->i_sb, err);
-+                        brelse(new_leaf);
-+                }
++                obj = iam_leaf_container(l)->ic_object;
++                ext3_error(obj->i_sb, __FUNCTION__,
++                           "Wrong magic in node %llu (#%lu): %#x != %#x\n",
++                           l->il_bh->b_blocknr, obj->i_ino,
++                           ill->ill_magic, le16_to_cpu(IAM_LEAF_HEADER_MAGIC));
++                result = -EIO;
 +        }
-+        return err;
++        return result;
 +}
 +
-+int iam_add_rec(handle_t *handle, struct iam_path *path,
-+                struct iam_key *k, struct iam_rec *r)
++static void iam_lfix_fini(struct iam_leaf *l)
 +{
-+      int err;
++        l->il_entries = l->il_at = NULL;
++        return;
++}
 +
-+      if (iam_leaf_can_add(&path->ip_leaf, k, r)) {
-+              err = iam_leaf_rec_add(handle, path);
-+      } else {
-+              err = split_index_node(handle, path);
-+              if (err == 0) {
-+                        err = iam_new_leaf(handle, &path->ip_leaf);
-+                      if (err == 0)
-+                              err = iam_leaf_rec_add(handle, path);
-+              }
-+      }
-+      return err;
++static struct iam_lentry *iam_lfix_get_end(const struct iam_leaf *l)
++{
++        int count = lentry_count_get(l);
++        struct iam_lentry *ile = iam_lfix_shift(l, l->il_entries, count);
++
++        return ile;
 +}
 +
-+/*
-+ * Insert new record with key @k and contents from @r, shifting records to the
-+ * right.
-+ *
-+ * precondition:  it_state(it) == IAM_IT_ATTACHED &&
-+ *                it->ii_flags&IAM_IT_WRITE &&
-+ *                it_keycmp(it, iam_it_key_get(it, *), k) < 0
-+ * postcondition: it_state(it) == IAM_IT_ATTACHED &&
-+ *                ergo(result == 0,
-+ *                     it_keycmp(it, iam_it_key_get(it, *), k) == 0 &&
-+ *                     !memcmp(iam_it_rec_get(it), r, ...))
-+ */
-+int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
-+                      struct iam_key *k, struct iam_rec *r)
++struct iam_rec *iam_lfix_rec(const struct iam_leaf *l)
 +{
-+        int result;
++        void *e = l->il_at;
++        return e + iam_leaf_descr(l)->id_key_size;
++}
 +
-+        assert(it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE);
-+#if 0
-+        /*XXX remove this assert temporarily, since if il_at points to the header,
-+         * this assert might have some problems*/
-+        assert(it_keycmp(it, iam_it_key_get(it, it_scratch_key(it, 0)), k) < 0);
-+#endif
-+      result = iam_add_rec(h, &it->ii_path, k, r);
-+      if (result == 0) {
-+              /* place record and key info freed space. Leaf node is already
-+               * in transaction. */
-+              iam_it_reccpy(it, r);
-+                iam_it_keycpy(it, k);
-+                iam_keycpy(it->ii_path.ip_container, it_scratch_key(it, 0), k);
-+                /*
-+               * XXX TBD.
-+               */
++static void iam_lfix_next(struct iam_leaf *l)
++{
++        assert(!iam_leaf_at_end(l));
++        l->il_at = iam_lfix_shift(l, l->il_at, 1);
++}
++
++static int iam_lfix_lookup(struct iam_leaf *l, const struct iam_key *k)
++{
++        struct iam_lentry *p, *q, *m;
++        struct iam_container *c;
++        int count;
++
++        count = lentry_count_get(l);
++        c = iam_leaf_container(l);
++
++        p = iam_lfix_shift(l, l->il_entries, 1);
++        q = iam_lfix_shift(l, l->il_entries, count - 1);
++
++        while (p <= q) {
++                m = iam_lfix_shift(l, p, iam_lfix_diff(l, q, p) / 2);
++                if (iam_keycmp(c, iam_leaf_key_at(m), k) > 0)
++                        q = iam_lfix_shift(l, m, -1);
++                else
++                        p = iam_lfix_shift(l, m, +1);
 +        }
-+        assert(it_state(it) == IAM_IT_ATTACHED);
-+        assert(ergo(result == 0,
-+                    it_keycmp(it,
-+                              iam_it_key_get(it,
-+                                             it_scratch_key(it, 0)), k) == 0 &&
-+                    !memcmp(iam_it_rec_get(it), r,
-+                            iam_it_container(it)->ic_descr->id_rec_size)));
-+        return result;
++        l->il_at = iam_lfix_shift(l, p, -1);
++        iam_keycpy(c, iam_path_key(iam_leaf_path(l), 0), iam_leaf_key_at(q));
++
++        if (l->il_at == l->il_entries ||
++            iam_keycmp(c, iam_leaf_key_at(q), k) != 0)
++                return -ENOENT;
++        else
++                return 0;
 +}
 +
-+static int iam_leaf_rec_remove(handle_t *handle, struct iam_leaf *leaf)
++static void iam_lfix_rec_add(struct iam_leaf *leaf,
++                             const struct iam_key *k, const struct iam_rec *r)
 +{
-+      int err;
++        struct iam_lentry *end, *next, *cur, *nnext;
++        ptrdiff_t diff;
++        int count;
 +
-+        iam_rec_del(leaf);
-+      err = ext3_journal_dirty_metadata(handle, leaf->il_bh);
-+      if (err)
-+              ext3_std_error(iam_path_obj(iam_leaf_path(leaf))->i_sb, err);
-+      return err;
++        count = lentry_count_get(leaf);
++        end = iam_lfix_get_end(leaf);
++        cur = leaf->il_at;
++        if (cur != end) {
++                next = iam_lfix_shift(leaf, cur, 1);
++                if (next != end) {
++                        nnext = iam_lfix_shift(leaf, next, 1);
++                        diff = (void *)end - (void *)next;
++                        memmove(nnext, next, diff);
++                }
++                iam_lfix_next(leaf);
++        }
++        lentry_count_set(leaf, count + 1);
 +}
 +
-+/*
-+ * Delete record under iterator.
-+ *
-+ * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
-+ * postcondition: it_state(it) == IAM_IT_ATTACHED
-+ */
-+int iam_it_rec_delete(handle_t *h, struct iam_iterator *it)
++static void iam_lfix_rec_del(struct iam_leaf *leaf)
 +{
-+        int result;
-+
-+        assert(it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE);
++        struct iam_lentry *next, *end;
++        int count;
++        ptrdiff_t diff;
 +
-+        result = ext3_journal_get_write_access(h, it->ii_path.ip_leaf.il_bh);
-+        /*
-+         * no compaction for now.
-+         */
-+        if (result == 0)
-+                iam_leaf_rec_remove(h, &it->ii_path.ip_leaf);
++        count = lentry_count_get(leaf);
++        end = iam_lfix_get_end(leaf);
++        next = iam_lfix_shift(leaf, leaf->il_at, 1);
++        diff = (void *)end - (void *)next;
++        memmove(leaf->il_at, next, diff);
 +
-+      return result;
++        lentry_count_set(leaf, count - 1);
 +}
 +
-+/*
-+ * Convert iterator to cookie.
-+ *
-+ * precondition:  it_state(it) == IAM_IT_ATTACHED &&
-+ *                iam_path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t)
-+ * postcondition: it_state(it) == IAM_IT_ATTACHED
-+ */
-+iam_pos_t iam_it_store(struct iam_iterator *it)
++static int iam_lfix_can_add(const struct iam_leaf *l,
++                            const struct iam_key *k, const struct iam_rec *r)
 +{
-+        iam_pos_t result;
++        struct iam_lentry *end;
++        int block_size = iam_leaf_container(l)->ic_object->i_sb->s_blocksize;
++        unsigned long left, entry_size;
 +
-+        assert(it_state(it) == IAM_IT_ATTACHED);
-+        assert(iam_it_container(it)->ic_descr->id_key_size <= sizeof result);
++        end = iam_lfix_get_end(l);
 +
-+        result = 0;
-+        iam_it_key_get(it, (struct iam_key *)&result);
-+        return result;
-+}
++        left = block_size - iam_leaf_descr(l)->id_node_gap;
 +
-+/*
-+ * Restore iterator from cookie.
-+ *
-+ * precondition:  it_state(it) == IAM_IT_DETACHED && it->ii_flags&IAM_IT_MOVE &&
-+ *                iam_path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t)
-+ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED &&
-+ *                                  iam_it_store(it) == pos)
-+ */
-+int iam_it_load(struct iam_iterator *it, iam_pos_t pos)
-+{
-+        assert(it_state(it) == IAM_IT_DETACHED && it->ii_flags&IAM_IT_MOVE);
-+        assert(iam_it_container(it)->ic_descr->id_key_size <= sizeof pos);
-+        return iam_it_get(it, (struct iam_key *)&pos);
-+}
++        left -= (unsigned long)((void*)end - (void*)l->il_entries);
 +
-+/***********************************************************************/
-+/* invariants                                                          */
-+/***********************************************************************/
++        entry_size = iam_lfix_entry_size(l);
 +
-+static inline int ptr_inside(void *base, size_t size, void *ptr)
-+{
-+        return (base <= ptr) && (ptr < base + size);
++        if (left >= entry_size)
++                return 1;
++
++        return 0;
 +}
 +
-+int iam_frame_invariant(struct iam_frame *f)
++static int iam_lfix_at_end(const struct iam_leaf *folio)
 +{
-+        return
-+                (f->bh != NULL &&
-+                f->bh->b_data != NULL &&
-+                ptr_inside(f->bh->b_data, f->bh->b_size, f->entries) &&
-+                ptr_inside(f->bh->b_data, f->bh->b_size, f->at) &&
-+                f->entries <= f->at);
++        struct iam_lentry *ile = iam_lfix_get_end(folio);
++
++        return (folio->il_at == ile);
 +}
-+int iam_leaf_invariant(struct iam_leaf *l)
++
++static void iam_lfix_init_new(struct iam_container *c, struct buffer_head *bh)
 +{
-+        return
-+                l->il_bh != NULL &&
-+                l->il_bh->b_data != NULL &&
-+                ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_entries) &&
-+                ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_at) &&
-+                l->il_entries <= l->il_at;
++        struct iam_leaf_head *hdr;
++
++        hdr = (struct iam_leaf_head*)bh->b_data;
++        hdr->ill_magic = cpu_to_le16(IAM_LEAF_HEADER_MAGIC);
++        hdr->ill_count = cpu_to_le16(0);
 +}
 +
-+int iam_path_invariant(struct iam_path *p)
++static void iam_lfix_split(struct iam_leaf *l, struct buffer_head *bh)
 +{
-+        int i;
++        struct iam_path      *path;
++        struct iam_leaf_head *hdr;
++        const struct iam_key *pivot;
 +
-+        if (p->ip_container == NULL ||
-+            p->ip_indirect < 0 || p->ip_indirect > DX_MAX_TREE_HEIGHT - 1 ||
-+            p->ip_frame != p->ip_frames + p->ip_indirect ||
-+            !iam_leaf_invariant(&p->ip_leaf))
-+                return 0;
-+        for (i = 0; i < ARRAY_SIZE(p->ip_frames); ++i) {
-+                if (i <= p->ip_indirect) {
-+                        if (!iam_frame_invariant(&p->ip_frames[i]))
-+                                return 0;
-+                }
-+        }
-+        return 1;
++        unsigned count;
++        unsigned split;
++
++        void *start;
++        void *finis;
++
++        path = iam_leaf_path(l);
++
++        hdr = (void *)bh->b_data;
++
++        count = lentry_count_get(l);
++        split = count / 2;
++
++        start = iam_lfix_shift(l, iam_get_lentries(l), split);
++        finis = iam_lfix_shift(l, iam_get_lentries(l), count);
++
++        pivot = iam_leaf_key_at(start);
++
++        memmove(iam_entries(bh), start, finis - start);
++        hdr->ill_count = cpu_to_le16(count - split);
++        lentry_count_set(l, split);
++        /*
++         * Insert pointer to the new node (together with the smallest key in
++         * the node) into index node.
++         */
++        iam_insert_key(path, path->ip_frame, pivot, bh->b_blocknr);
 +}
 +
-+int iam_it_invariant(struct iam_iterator *it)
++static void iam_lfix_key_set(struct iam_leaf *l, const struct iam_key *k)
 +{
-+        return
-+                (it->ii_state == IAM_IT_DETACHED ||
-+                 it->ii_state == IAM_IT_ATTACHED) &&
-+                !(it->ii_flags & ~(IAM_IT_MOVE | IAM_IT_WRITE)) &&
-+                ergo(it->ii_state == IAM_IT_ATTACHED,
-+                     iam_path_invariant(&it->ii_path));
++        iam_keycpy(iam_leaf_container(l), iam_leaf_key_at(l->il_at), k);
 +}
 +
-+/*
-+ * Search container @c for record with key @k. If record is found, its data
-+ * are moved into @r.
-+ *
-+ *
-+ *
-+ * Return values: +ve: found, 0: not-found, -ve: error
-+ */
-+int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r,
-+             struct iam_path_descr *pd)
++static void iam_lfix_rec_set(struct iam_leaf *l, const struct iam_rec *r)
 +{
-+        struct iam_iterator it;
-+        int result;
-+
-+        iam_it_init(&it, c, 0, pd);
-+
-+        result = iam_it_get_exact(&it, k);
-+        if (result == 0)
-+                /*
-+                 * record with required key found, copy it into user buffer
-+                 */
-+                iam_reccpy(&it.ii_path, r, iam_it_rec_get(&it));
-+        iam_it_put(&it);
-+        iam_it_fini(&it);
-+        return result;
++        iam_reccpy(iam_leaf_path(l), iam_lfix_rec(l), r);
 +}
-+EXPORT_SYMBOL(iam_lookup);
++
++static struct iam_leaf_operations iam_lfix_leaf_ops = {
++        .init           = iam_lfix_init,
++        .init_new       = iam_lfix_init_new,
++        .fini           = iam_lfix_fini,
++        .start          = iam_lfix_start,
++        .next           = iam_lfix_next,
++        .key            = iam_lfix_key,
++        .rec            = iam_lfix_rec,
++        .key_set        = iam_lfix_key_set,
++        .rec_set        = iam_lfix_rec_set,
++        .lookup         = iam_lfix_lookup,
++        .at_end         = iam_lfix_at_end,
++        .rec_add        = iam_lfix_rec_add,
++        .rec_del        = iam_lfix_rec_del,
++        .can_add        = iam_lfix_can_add,
++        .split          = iam_lfix_split
++};
 +
 +/*
-+ * Insert new record @r with key @k into container @c (within context of
-+ * transaction @h.
-+ *
-+ * Return values: 0: success, -ve: error, including -EEXIST when record with
-+ * given key is already present.
-+ *
-+ * postcondition: ergo(result == 0 || result == -EEXIST,
-+ *                                  iam_lookup(c, k, r2) > 0 &&
-+ *                                  !memcmp(r, r2, c->ic_descr->id_rec_size));
++ * Index operations.
 + */
-+int iam_insert(handle_t *h, struct iam_container *c,
-+               struct iam_key *k, struct iam_rec *r, struct iam_path_descr *pd)
-+{
-+        struct iam_iterator it;
-+        int result;
 +
-+        iam_it_init(&it, c, IAM_IT_WRITE, pd);
++enum {
++        /* This is duplicated in lustre/utils/create_iam.c */
++        /*
++         * Then shalt thou see the dew-BEDABBLED wretch
++         * Turn, and return, indenting with the way;
++         * Each envious brier his weary legs doth scratch,
++         * Each shadow makes him stop, each murmur stay:
++         * For misery is trodden on by many,
++         * And being low never relieved by any.
++         */
++        IAM_LFIX_ROOT_MAGIC = 0xbedabb1edULL // d01efull
++};
 +
-+        result = iam_it_get_exact(&it, k);
-+        if (result == -ENOENT)
-+                result = iam_it_rec_insert(h, &it, k, r);
-+        else if (result == 0)
-+                result = -EEXIST;
-+        iam_it_put(&it);
-+        iam_it_fini(&it);
-+        return result;
++/* This is duplicated in lustre/utils/create_iam.c */
++struct iam_lfix_root {
++        __le64  ilr_magic;
++        __le16  ilr_keysize;
++        __le16  ilr_recsize;
++        __le16  ilr_indirect_levels;
++        __le16  ilr_padding;
++};
++
++static __u32 iam_lfix_root_ptr(struct iam_container *c)
++{
++        return 0;
 +}
-+EXPORT_SYMBOL(iam_insert);
 +
-+int iam_update(handle_t *h, struct iam_container *c,
-+               struct iam_key *k, struct iam_rec *r, struct iam_path_descr *pd)
++static int iam_lfix_node_init(struct iam_container *c, struct buffer_head *bh,
++                              int root)
 +{
-+        struct iam_iterator it;
-+        int result;
++        return 0;
++}
 +
-+        iam_it_init(&it, c, IAM_IT_WRITE, pd);
++static int iam_lfix_node_check(struct iam_path *path, struct iam_frame *frame)
++{
++        struct iam_entry *entries;
++        void *data;
++        entries = dx_node_get_entries(path, frame);
 +
-+        result = iam_it_get_exact(&it, k);
-+        if (result == 0)
-+                iam_it_rec_set(h, &it, r);
-+        iam_it_put(&it);
-+        iam_it_fini(&it);
-+        return result;
++        data = frame->bh->b_data;
++
++        if (frame == path->ip_frames) {
++                struct iam_lfix_root *root;
++
++                root = data;
++                path->ip_indirect = le16_to_cpu(root->ilr_indirect_levels);
++        }
++        frame->entries = frame->at = entries;
++        return 0;
 +}
-+EXPORT_SYMBOL(iam_update);
 +
-+/*
-+ * Delete existing record with key @k.
-+ *
-+ * Return values: 0: success, -ENOENT: not-found, -ve: other error.
-+ *
-+ * postcondition: ergo(result == 0 || result == -ENOENT,
-+ *                                 !iam_lookup(c, k, *));
-+ */
-+int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k,
-+             struct iam_path_descr *pd)
++static int iam_lfix_node_create(struct iam_container *c)
 +{
-+        struct iam_iterator it;
-+        int result;
++        return 0;
++}
 +
-+        iam_it_init(&it, c, IAM_IT_WRITE, pd);
++static int iam_lfix_keycmp(const struct iam_container *c,
++                           const struct iam_key *k1, const struct iam_key *k2)
++{
++        return memcmp(k1, k2, c->ic_descr->id_key_size);
++}
 +
-+        result = iam_it_get_exact(&it, k);
-+        if (result == 0)
-+                iam_it_rec_delete(h, &it);
-+        iam_it_put(&it);
-+        iam_it_fini(&it);
++static struct iam_operations iam_lfix_ops = {
++        .id_root_ptr    = iam_lfix_root_ptr,
++        .id_node_read   = iam_node_read,
++        .id_node_init   = iam_lfix_node_init,
++        .id_node_check  = iam_lfix_node_check,
++        .id_create      = iam_lfix_node_create,
++        .id_keycmp      = iam_lfix_keycmp
++};
++
++static int iam_lfix_guess(struct iam_container *c)
++{
++        int result;
++        struct buffer_head *bh;
++        const struct iam_lfix_root *root;
++
++        assert(c->ic_object != NULL);
++
++        result = iam_node_read(c, iam_lfix_root_ptr(c), NULL, &bh);
++        if (result == 0) {
++                root = (void *)bh->b_data;
++                if (le64_to_cpu(root->ilr_magic) == IAM_LFIX_ROOT_MAGIC) {
++                        struct iam_descr *descr;
++
++                        descr = c->ic_descr;
++                        descr->id_key_size = le16_to_cpu(root->ilr_keysize);
++                        descr->id_rec_size = le16_to_cpu(root->ilr_recsize);
++                        descr->id_root_gap = sizeof(struct iam_lfix_root);
++                        descr->id_node_gap = 0;
++                        descr->id_ops      = &iam_lfix_ops;
++                        descr->id_leaf_ops = &iam_lfix_leaf_ops;
++                } else
++                        result = -EBADF;
++        }
 +        return result;
 +}
-+EXPORT_SYMBOL(iam_delete);
 +
-Index: linux-stage/fs/ext3/Makefile
++static struct iam_format iam_lfix_format = {
++        .if_guess = iam_lfix_guess
++};
++
++void iam_lfix_format_init(void)
++{
++        iam_format_register(&iam_lfix_format);
++}
+Index: iam/fs/ext3/namei.c
 ===================================================================
---- linux-stage.orig/fs/ext3/Makefile  2006-05-29 13:01:21.000000000 +0800
-+++ linux-stage/fs/ext3/Makefile       2006-05-29 13:01:22.000000000 +0800
-@@ -6,7 +6,7 @@
+--- iam.orig/fs/ext3/namei.c   2006-05-27 19:58:44.000000000 +0400
++++ iam/fs/ext3/namei.c        2006-05-29 19:44:45.000000000 +0400
+@@ -24,81 +24,6 @@
+  *    Theodore Ts'o, 2002
+  */
+-/*
+- * iam: big theory statement.
+- *
+- * iam (Index Access Module) is a module providing abstraction of persistent
+- * transactional container on top of generalized ext3 htree.
+- *
+- * iam supports:
+- *
+- *     - key, pointer, and record size specifiable per container.
+- *
+- *     - trees taller than 2 index levels.
+- *
+- *     - read/write to existing ext3 htree directories as iam containers.
+- *
+- * iam container is a tree, consisting of leaf nodes containing keys and
+- * records stored in this container, and index nodes, containing keys and
+- * pointers to leaf or index nodes.
+- *
+- * iam does not work with keys directly, instead it calls user-supplied key
+- * comparison function (->dpo_keycmp()).
+- *
+- * Pointers are (currently) interpreted as logical offsets (measured in
+- * blocksful) within underlying flat file on top of which iam tree lives.
+- *
+- * On-disk format:
+- *
+- * iam mostly tries to reuse existing htree formats.
+- *
+- * Format of index node:
+- *
+- * +-----+-------+-------+-------+------+-------+------------+
+- * |     | count |       |       |      |       |            |
+- * | gap |   /   | entry | entry | .... | entry | free space |
+- * |     | limit |       |       |      |       |            |
+- * +-----+-------+-------+-------+------+-------+------------+
+- *
+- *       gap           this part of node is never accessed by iam code. It
+- *                     exists for binary compatibility with ext3 htree (that,
+- *                     in turn, stores fake struct ext2_dirent for ext2
+- *                     compatibility), and to keep some unspecified per-node
+- *                     data. Gap can be different for root and non-root index
+- *                     nodes. Gap size can be specified for each container
+- *                     (gap of 0 is allowed).
+- *
+- *       count/limit   current number of entries in this node, and the maximal
+- *                     number of entries that can fit into node. count/limit
+- *                     has the same size as entry, and is itself counted in
+- *                     count.
+- *
+- *       entry         index entry: consists of a key immediately followed by
+- *                     a pointer to a child node. Size of a key and size of a
+- *                     pointer depends on container. Entry has neither
+- *                     alignment nor padding.
+- *
+- *       free space    portion of node new entries are added to
+- *
+- * Entries in index node are sorted by their key value.
+- *
+- * Format of leaf node:
+- *
+- * +-----+-------+-------+-------+------+-------+------------+
+- * |     | count |       |       |      |       |            |
+- * | gap |   /   | leaf  | leaf  | .... | leaf  | free space |
+- * |     | limit |       |       |      |       |            |
+- * +-----+-------+-------+-------+------+-------+------------+
+-
+- *       leaf          For leaf entry: consists of a rec immediately followd by 
+- *                     a key. size of a key and size of a rec depends on container.  
+- *
+- *
+- *
+- *
+- *
+- */
+-
+ #include <linux/module.h>
+ #include <linux/fs.h>
+ #include <linux/pagemap.h>
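
The "big theory statement" moved out of namei.c above describes the index-node layout: a per-node gap, a count/limit pair occupying the first entry slot, then sorted key+pointer entries with no alignment or padding. A sketch of walking such a node under those assumptions (index_node_walk is a hypothetical illustration, not part of the patch; the real accessors are iam_entry_shift()/dx_get_count() below):

        static void index_node_walk(void *node, size_t gap,
                                    size_t key_size, size_t ptr_size)
        {
                void *entries = node + gap;             /* skip the per-node gap    */
                struct dx_countlimit *cl = entries;     /* count/limit uses slot 0  */
                unsigned count = le16_to_cpu(cl->count);
                size_t esize = key_size + ptr_size;     /* entry: key, then pointer */
                unsigned i;

                for (i = 1; i < count; ++i) {           /* slot 0 is count/limit    */
                        void *key = entries + i * esize;
                        void *ptr = key + key_size;
                        /* ... compare key against the target, descend via ptr ... */
                        (void)ptr;
                }
        }
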
+@@ -112,10 +37,10 @@
+ #include <linux/quotaops.h>
+ #include <linux/buffer_head.h>
+ #include <linux/smp_lock.h>
++#include <linux/lustre_iam.h>
+ #include "xattr.h"
+ #include "iopen.h"
+ #include "acl.h"
+-#include <linux/lustre_iam.h>
+ /*
+  * define how far ahead to read directories while searching them.
+  */
+@@ -125,9 +50,9 @@
+ #define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
+-static struct buffer_head *ext3_append(handle_t *handle,
+-                                      struct inode *inode,
+-                                      u32 *block, int *err)
++struct buffer_head *ext3_append(handle_t *handle,
++                              struct inode *inode,
++                              u32 *block, int *err)
+ {
+       struct buffer_head *bh;
+@@ -136,14 +61,15 @@ static struct buffer_head *ext3_append(h
+       if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
+               inode->i_size += inode->i_sb->s_blocksize;
+               EXT3_I(inode)->i_disksize = inode->i_size;
+-              ext3_journal_get_write_access(handle,bh);
++              *err = ext3_journal_get_write_access(handle, bh);
++              if (*err != 0) {
++                      brelse(bh);
++                      bh = NULL;
++              }
+       }
+       return bh;
+ }
+-#ifndef assert
+-#define assert(test) J_ASSERT(test)
+-#endif
+ #ifndef swap
+ #define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
+@@ -162,10 +88,6 @@ struct fake_dirent {
+       u8 file_type;
+ };
+-struct dx_countlimit {
+-      __le16 limit;
+-      __le16 count;
+-};
+ /*
+  * dx_root_info is laid out so that if it should somehow get overlaid by a
+@@ -203,245 +125,10 @@ struct dx_map_entry
+ };
+-static u32 htree_root_ptr(struct iam_container *c);
+-static int htree_node_check(struct iam_path *path, struct iam_frame *frame);
+-static int htree_node_init(struct iam_container *c,
+-                         struct buffer_head *bh, int root);
+-static int htree_keycmp(struct iam_container *c,
+-                      struct iam_key *k1, struct iam_key *k2);
+-static int htree_node_read(struct iam_container *c, iam_ptr_t ptr,
+-                         handle_t *h, struct buffer_head **bh);
+-
+-/*
+- * Parameters describing iam compatibility mode in which existing ext3 htrees
+- * can be manipulated.
+- */
+-static struct iam_descr htree_compat_param = {
+-      .id_key_size = sizeof ((struct dx_map_entry *)NULL)->hash,
+-      .id_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs,
+-      .id_node_gap = offsetof(struct dx_node, entries),
+-      .id_root_gap = offsetof(struct dx_root, entries),
+-
+-      .id_root_ptr   = htree_root_ptr,
+-      .id_node_check = htree_node_check,
+-      .id_node_init  = htree_node_init,
+-      .id_node_read  = htree_node_read,
+-      .id_keycmp     = htree_keycmp
+-};
+-
+-
+-struct iam_key;
+-struct iam_rec;
+-struct iam_descr;
+-struct iam_container;
+-struct iam_path;
+-
+-
+-
+-/*
+- * iam cursor (iterator) api.
+- */
+-
+-/*
+- * Flags controlling iterator functionality.
+- */
+-enum iam_it_flags {
+-      /*
+-       * this iterator will move (iam_it_{prev,next}() will be called on it)
+-       */
+-      IAM_IT_MOVE  = (1 << 0),
+-      /*
+-       * tree can be updated through this iterator.
+-       */
+-      IAM_IT_WRITE = (1 << 1)
+-};
+-
+-/*
+- * States of iterator state machine.
+- */
+-enum iam_it_state {
+-      /* initial state */
+-      IAM_IT_DETACHED,
+-      /* iterator is above particular record in the container */
+-      IAM_IT_ATTACHED
+-};
+-
+-struct htree_cookie {
+-      struct dx_hash_info *hinfo;
+-      struct dentry       *dentry;
+-};
+-
+-/*
+- * Iterator.
+- *
+- * Immediately after call to iam_it_init() iterator is in "detached"
+- * (IAM_IT_DETACHED) state: it is associated with given parent container, but
+- * doesn't point to any particular record in this container.
+- *
+- * After successful call to iam_it_get() and until corresponding call to
+- * iam_it_put() iterator is in "attached" state (IAM_IT_ATTACHED).
+- *
+- * Attached iterator can move through records in a container (provided
+- * IAM_IT_MOVE permission) in a key order, can get record and key values as it
+- * passes over them, and can modify container (provided IAM_IT_WRITE
+- * permission).
+- *
+- * Concurrency: iterators are supposed to be local to thread. Interfaces below
+- * do no internal serialization.
+- *
+- */
+-struct iam_iterator {
+-      /*
+-       * iterator flags, taken from enum iam_it_flags.
+-       */
+-      __u32                 ii_flags;
+-      enum iam_it_state     ii_state;
+-      /*
+-       * path to the record. Valid in IAM_IT_ATTACHED state.
+-       */
+-      struct iam_path       ii_path;
+-};
+-
+-static inline struct iam_key *keycpy(struct iam_container *c,
+-                                   struct iam_key *k1, struct iam_key *k2)
+-{
+-      return memcpy(k1, k2, c->ic_descr->id_key_size);
+-}
+-
+-static inline int keycmp(struct iam_container *c,
+-                       struct iam_key *k1, struct iam_key *k2)
+-{
+-      return c->ic_descr->id_keycmp(c, k1, k2);
+-}
+-
+-static struct iam_container *iam_it_container(struct iam_iterator *it)
+-{
+-      return it->ii_path.ip_container;
+-}
+-
+-static inline int it_keycmp(struct iam_iterator *it,
+-                          struct iam_key *k1, struct iam_key *k2)
+-{
+-      return keycmp(iam_it_container(it), k1, k2);
+-}
+-
+-/*
+- * Initialize iterator to IAM_IT_DETACHED state.
+- *
+- * postcondition: it_state(it) == IAM_IT_DETACHED
+- */
+-int  iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags);
+-/*
+- * Finalize iterator and release all resources.
+- *
+- * precondition: it_state(it) == IAM_IT_DETACHED
+- */
+-void iam_it_fini(struct iam_iterator *it);
+-
+-/*
+- * Attach iterator. After successful completion, @it points to record with the
+- * largest key not larger than @k. Semantics of ->id_create() method guarantee
+- * that such record will always be found.
+- *
+- * Return value: 0: positioned on existing record,
+- *             -ve: error.
+- *
+- * precondition:  it_state(it) == IAM_IT_DETACHED
+- * postcondition: ergo(result == 0,
+- *                     (it_state(it) == IAM_IT_ATTACHED &&
+- *                      it_keycmp(it, iam_it_key_get(it, *), k) < 0))
+- */
+-int iam_it_get(struct iam_iterator *it, struct iam_key *k);
+-
+-/*
+- * Duplicates iterator.
+- *
+- * postcondition: it_state(dst) == it_state(src) &&
+- *                iam_it_container(dst) == iam_it_container(src) &&
+- *                dst->ii_flags = src->ii_flags &&
+- *                ergo(it_state(it) == IAM_IT_ATTACHED,
+- *                     iam_it_rec_get(dst) == iam_it_rec_get(src) &&
+- *                     iam_it_key_get(dst, *1) == iam_it_key_get(src, *2))
+- */
+-void iam_it_dup(struct iam_iterator *dst, struct iam_iterator *src);
+-
+-/*
+- * Detach iterator. Does nothing it detached state.
+- *
+- * postcondition: it_state(it) == IAM_IT_DETACHED
+- */
+-void iam_it_put(struct iam_iterator *it);
+-
+-/*
+- * Move iterator one record right.
+- *
+- * Return value: 0: success,
+- *              +1: end of container reached
+- *             -ve: error
+- *
+- * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_MOVE
+- * postcondition: ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED)
+- */
+-int iam_it_next(struct iam_iterator *it);
+-
+-/*
+- * Return pointer to the record under iterator.
+- *
+- * precondition:  it_state(it) == IAM_IT_ATTACHED
+- * postcondition: it_state(it) == IAM_IT_ATTACHED
+- */
+-const struct iam_rec *iam_it_rec_get(struct iam_iterator *it);
+-
+-/*
+- * Replace contents of record under iterator.
+- *
+- * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
+- * postcondition: it_state(it) == IAM_IT_ATTACHED &&
+- *                ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...))
+- */
+-int iam_it_rec_set(handle_t *h, struct iam_iterator *it, struct iam_rec *r);
+-
+-/*
+- * Place key under iterator in @k, return @k
+- *
+- * precondition:  it_state(it) == IAM_IT_ATTACHED
+- * postcondition: it_state(it) == IAM_IT_ATTACHED
+- */
+-const struct iam_key *iam_it_key_get(struct iam_iterator *it,
+-                                   struct iam_key *k);
+-
+-/*
+- * Insert new record with key @k and contents from @r, shifting records to the
+- * right.
+- *
+- * precondition:  it_state(it) == IAM_IT_ATTACHED &&
+- *                it->ii_flags&IAM_IT_WRITE &&
+- *                it_keycmp(it, iam_it_key_get(it, *), k) < 0
+- * postcondition: it_state(it) == IAM_IT_ATTACHED &&
+- *                ergo(result == 0,
+- *                     it_keycmp(it, iam_it_key_get(it, *), k) == 0 &&
+- *                     !memcmp(iam_it_rec_get(it), r, ...))
+- */
+-int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
+-                    struct iam_key *k, struct iam_rec *r);
+-/*
+- * Delete record under iterator.
+- *
+- * precondition:  it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE
+- * postcondition: it_state(it) == IAM_IT_ATTACHED
+- */
+-int iam_it_rec_delete(handle_t *h, struct iam_iterator *it);
+-
+ #ifdef CONFIG_EXT3_INDEX
+ static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry);
+ static void dx_set_block(struct iam_path *p,
+                        struct iam_entry *entry, unsigned value);
+-static inline struct iam_key *dx_get_key(struct iam_path *p,
+-                                      struct iam_entry *entry,
+-                                      struct iam_key *key);
+-static void dx_set_key(struct iam_path *p, struct iam_entry *entry,
+-                     struct iam_key *key);
+-static unsigned dx_get_count(struct iam_entry *entries);
+ static unsigned dx_get_limit(struct iam_entry *entries);
+ static void dx_set_count(struct iam_entry *entries, unsigned value);
+ static void dx_set_limit(struct iam_entry *entries, unsigned value);
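
The iterator API documentation removed from namei.c above spells out the cursor life cycle: iam_it_init() → iam_it_get() → iam_it_next()/iam_it_rec_get() → iam_it_put() → iam_it_fini(). A forward-scan sketch consistent with that contract, assuming an initialized container c and path descriptor pd (iam_it_init() now also takes the descriptor, as in the iam.c changes earlier in this patch); iam_scan_from is a hypothetical name:

        static int iam_scan_from(struct iam_container *c, struct iam_key *k,
                                 struct iam_path_descr *pd)
        {
                struct iam_iterator it;
                int result;

                iam_it_init(&it, c, IAM_IT_MOVE, pd);
                result = iam_it_get(&it, k);       /* largest key not above @k */
                while (result == 0) {
                        /* consume iam_it_rec_get(&it) / iam_it_key_get(&it, ...) */
                        result = iam_it_next(&it); /* +1: end of container      */
                }
                iam_it_put(&it);
                iam_it_fini(&it);
                return result > 0 ? 0 : result;
        }
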
+@@ -457,80 +144,29 @@ static void dx_sort_map(struct dx_map_en
+ static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
+               struct dx_map_entry *offsets, int count);
+ static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+-static void dx_insert_block (struct iam_path *path,
+-                           struct iam_frame *frame, u32 hash, u32 block);
+-static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+-                               struct iam_path *path, __u32 *start_hash);
+ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
+                      struct ext3_dir_entry_2 **res_dir, int *err);
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+                            struct inode *inode);
+-static inline void iam_path_init(struct iam_path *path,
+-                               struct iam_container *c, struct htree_cookie *hc);
+-static inline void iam_path_fini(struct iam_path *path);
+-
+-
+-/*
+- * Future: use high four bits of block for coalesce-on-delete flags
+- * Mask them off for now.
+- */
+-
+-static inline void *entry_off(struct iam_entry *entry, ptrdiff_t off)
+-{
+-      return (void *)((char *)entry + off);
+-}
+-
+-static inline struct iam_descr *path_descr(struct iam_path *p)
+-{
+-      return p->ip_container->ic_descr;
+-}
+-
+-static inline struct inode *path_obj(struct iam_path *p)
+-{
+-      return p->ip_container->ic_object;
+-}
+-
+ static inline size_t iam_entry_size(struct iam_path *p)
+ {
+-      return path_descr(p)->id_key_size + path_descr(p)->id_ptr_size;
++      return iam_path_descr(p)->id_key_size + iam_path_descr(p)->id_ptr_size;
+ }
+ static inline struct iam_entry *iam_entry_shift(struct iam_path *p,
+-                                            struct iam_entry *entry, int shift)
++                                              struct iam_entry *entry,
++                                              int shift)
+ {
+       void *e = entry;
+       return e + shift * iam_entry_size(p);
+ }
+-static inline ptrdiff_t iam_entry_diff(struct iam_path *p,
+-                                    struct iam_entry *e1, struct iam_entry *e2)
+-{
+-      ptrdiff_t diff;
+-
+-      diff = (void *)e1 - (void *)e2;
+-      assert(diff / iam_entry_size(p) * iam_entry_size(p) == diff);
+-      return diff / iam_entry_size(p);
+-}
+-
+-static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry)
+-{
+-      return le32_to_cpu(*(u32 *)entry_off(entry, path_descr(p)->id_key_size))
+-              & 0x00ffffff;
+-}
+-
+-static inline void dx_set_block(struct iam_path *p,
+-                              struct iam_entry *entry, unsigned value)
+-{
+-      *(u32*)entry_off(entry,
+-                       path_descr(p)->id_key_size) = cpu_to_le32(value);
+-}
+-
+-static inline struct iam_key *dx_get_key(struct iam_path *p,
+-                                      struct iam_entry *entry,
+-                                      struct iam_key *key)
++static inline struct iam_key *iam_get_key(struct iam_path *p,
++                                        struct iam_entry *entry,
++                                        struct iam_key *key)
+ {
+-      memcpy(key, entry, path_descr(p)->id_key_size);
++      memcpy(key, entry, iam_path_descr(p)->id_key_size);
+       return key;
+ }
+@@ -540,68 +176,70 @@ static inline struct iam_key *iam_key_at
+       return (struct iam_key *)entry;
+ }
+-static inline void dx_set_key(struct iam_path *p,
+-                            struct iam_entry *entry, struct iam_key *key)
+-{
+-      memcpy(entry, key, path_descr(p)->id_key_size);
+-}
+-
+-static inline unsigned dx_get_count (struct iam_entry *entries)
+-{
+-      return le16_to_cpu(((struct dx_countlimit *) entries)->count);
+-}
+-
+-static inline unsigned dx_get_limit (struct iam_entry *entries)
++static inline ptrdiff_t iam_entry_diff(struct iam_path *p,
++                                     struct iam_entry *e1,
++                                     struct iam_entry *e2)
+ {
+-      return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
+-}
++      ptrdiff_t diff;
  
- ext3-y        := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
-          ioctl.o namei.o super.o symlink.o hash.o resize.o \
--         extents.o mballoc.o
-+         extents.o mballoc.o iam.o iam_lfix.o
+-static inline void dx_set_count (struct iam_entry *entries, unsigned value)
+-{
+-      ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
++      diff = (void *)e1 - (void *)e2;
++      assert(diff / iam_entry_size(p) * iam_entry_size(p) == diff);
++      return diff / iam_entry_size(p);
+ }
  
- ext3-$(CONFIG_EXT3_FS_XATTR)   += xattr.o xattr_user.o xattr_trusted.o
- ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
-Index: linux-stage/fs/ext3/iam_lfix.c
-===================================================================
---- linux-stage.orig/fs/ext3/iam_lfix.c        2006-05-29 18:23:53.597737944 +0800
-+++ linux-stage/fs/ext3/iam_lfix.c     2006-05-29 18:04:05.000000000 +0800
-@@ -0,0 +1,310 @@
-+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
-+ * vim:expandtab:shiftwidth=8:tabstop=8:
-+ *
-+ *  iam_lfix.c
-+ *  implementation of iam format for fixed size records.
-+ *
-+ *  Copyright (c) 2006 Cluster File Systems, Inc.
-+ *   Author: Wang Di <wangdi@clusterfs.com>
-+ *   Author: Nikita Danilov <nikita@clusterfs.com>
-+ *
-+ *   This file is part of the Lustre file system, http://www.lustre.org
-+ *   Lustre is a trademark of Cluster File Systems, Inc.
-+ *
-+ *   You may have signed or agreed to another license before downloading
-+ *   this software.  If so, you are bound by the terms and conditions
-+ *   of that agreement, and the following does not apply to you.  See the
-+ *   LICENSE file included with this distribution for more information.
+-static inline void dx_set_limit (struct iam_entry *entries, unsigned value)
++static inline void dx_set_limit(struct iam_entry *entries, unsigned value)
+ {
+       ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
+ }
+ static inline unsigned dx_root_limit(struct iam_path *p)
+ {
+-      struct iam_descr *param = path_descr(p);
+-      unsigned entry_space = path_obj(p)->i_sb->s_blocksize -
++      struct iam_descr *param = iam_path_descr(p);
++      unsigned entry_space = iam_path_obj(p)->i_sb->s_blocksize -
+               param->id_root_gap;
+       return entry_space / (param->id_key_size + param->id_ptr_size);
+ }
+-static inline unsigned dx_node_limit(struct iam_path *p)
+-{
+-      struct iam_descr *param = path_descr(p);
+-      unsigned entry_space   = path_obj(p)->i_sb->s_blocksize -
+-              param->id_node_gap;
+-      return entry_space / (param->id_key_size + param->id_ptr_size);
+-}
++/*
++ * Two iam_descr's are provided:
 + *
-+ *   If you did not agree to a different license, then this copy of Lustre
-+ *   is open source software; you can redistribute it and/or modify it
-+ *   under the terms of version 2 of the GNU General Public License as
-+ *   published by the Free Software Foundation.
++ *    - htree_compat_param that supports legacy ext3-htree indices;
++ *    - fixed_rec_param that supports containers with records of fixed size.
 + *
-+ *   In either case, Lustre is distributed in the hope that it will be
-+ *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
-+ *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-+ *   license text for more details.
-+ */
-+
-+#include <linux/types.h>
-+#include <linux/jbd.h>
-+/* ext3_error() */
-+#include <linux/ext3_fs.h>
-+
-+#include <linux/lustre_iam.h>
-+
-+#include <libcfs/libcfs.h>
-+#include <libcfs/kp30.h>
-+
-+static inline int iam_lfix_entry_size(const struct iam_leaf *l)
-+{
-+      return iam_leaf_descr(l)->id_key_size + iam_leaf_descr(l)->id_rec_size;
-+}
-+
-+static inline struct iam_lentry *
-+iam_lfix_shift(const struct iam_leaf *l, struct iam_lentry *entry, int shift)
-+{
-+              void *e = entry;
-+      return e + shift * iam_lfix_entry_size(l);
-+}
-+
-+static inline const struct iam_key *
-+iam_leaf_key_at(const struct iam_container *c, const struct iam_lentry *entry)
-+{
-+        return (const struct iam_key *)entry;
-+}
-+
-+static struct iam_lentry *iam_entries(const struct buffer_head *bh)
-+{
-+        return (void *)bh->b_data + sizeof(struct iam_leaf_head);
-+}
-+
-+static struct iam_lentry *iam_get_lentries(const struct iam_leaf *l)
-+{
-+        return iam_entries(l->il_bh);
-+}
-+
-+static int lentry_count_get(const struct iam_leaf *leaf)
-+{
-+        struct iam_lentry *lentry = leaf->il_entries;
-+        return le16_to_cpu(((struct iam_leaf_head *)lentry)->ill_count);
-+}
-+
-+static void lentry_count_set(struct iam_leaf *leaf, unsigned count)
-+{
-+        struct iam_lentry *lentry = leaf->il_entries;
-+      ((struct iam_leaf_head *)lentry)->ill_count = cpu_to_le16(count);
-+}
-+
-+/*This func is for flat key, for those keys,
-+ *which are not stored explicitly
-+ *it would be decrypt in the key buffer
 + */
-+struct iam_key *iam_lfix_key(struct iam_leaf *l, struct iam_key *key)
-+{
-+        void *ie = l->il_at;
-+        return (struct iam_key*)ie;
-+}
-+
-+static void iam_lfix_start(struct iam_leaf *l)
-+{
-+        l->il_at = iam_get_lentries(l);
-+}
-+
-+static inline ptrdiff_t iam_lfix_diff(struct iam_leaf *l, struct iam_lentry *e1,
-+                                    struct iam_lentry *e2)
-+{
-+      ptrdiff_t diff;
-+        int esize;
-+
-+        esize = iam_lfix_entry_size(l);
-+      diff = (void *)e1 - (void *)e2;
-+      assert(diff / esize * esize == diff);
-+      return diff / esize;
-+}
-+
-+static int iam_lfix_init(struct iam_leaf *l)
-+{
-+        int result;
-+        struct iam_leaf_head *ill;
-+
-+        assert(l->il_bh != NULL);
-+
-+        ill = (struct iam_leaf_head*)l->il_bh->b_data;
-+        if (ill->ill_magic == le16_to_cpu(IAM_LEAF_HEADER_MAGIC)) {
-+                l->il_at = l->il_entries = iam_get_lentries(l);
-+                result = 0;
-+        } else {
-+                struct inode *obj;
-+
-+                obj = iam_leaf_container(l)->ic_object;
-+                ext3_error(obj->i_sb, __FUNCTION__,
-+                           "Wrong magic in node %llu (#%lu): %#x != %#x\n",
-+                           l->il_bh->b_blocknr, obj->i_ino,
-+                           ill->ill_magic, le16_to_cpu(IAM_LEAF_HEADER_MAGIC));
-+                result = -EIO;
-+        }
-+        return result;
-+}
-+
-+static void iam_lfix_fini(struct iam_leaf *l)
-+{
-+        l->il_entries = l->il_at = NULL;
-+        return;
-+}
-+
-+static struct iam_lentry *iam_lfix_get_end(const struct iam_leaf *l)
-+{
-+        int count = lentry_count_get(l);
-+        struct iam_lentry *ile = iam_lfix_shift(l, l->il_entries, count);
-+
-+        return ile;
-+}
-+
-+struct iam_rec *iam_lfix_rec(struct iam_leaf *l)
-+{
-+        void *e = l->il_at;
-+        return e + iam_leaf_descr(l)->id_key_size;
-+}
-+
-+static void iam_lfix_next(struct iam_leaf *l)
-+{
-+        assert(!iam_leaf_at_end(l));
-+        l->il_at = iam_lfix_shift(l, l->il_at, 1);
-+}
-+
-+static int iam_lfix_lookup(struct iam_leaf *l, struct iam_key *k)
-+{
-+        struct iam_lentry *p, *q, *m;
-+        struct iam_container *c;
-+        int count;
-+
-+        count = lentry_count_get(l);
-+        c = iam_leaf_container(l);
-+
-+        p = iam_get_lentries(l);
-+        q = iam_lfix_shift(l, l->il_entries, count);
-+
-+        while (p <= q) {
-+                m = iam_lfix_shift(l, p, iam_lfix_diff(l, q, p) / 2);
-+                if (iam_keycmp(c, iam_leaf_key_at(c, m), k) > 0)
-+                        q = iam_lfix_shift(l, m, -1);
-+                else
-+                        p = iam_lfix_shift(l, m, +1);
-+        }
-+        assert(p != iam_get_lentries(l));
-+        
-+        l->il_at = iam_lfix_shift(l, p, -1);
-+        iam_keycpy(c, iam_path_key(iam_leaf_path(l), 0), iam_leaf_key_at(c, q));
-+        if (l->il_at <= l->il_entries ||
-+            iam_keycmp(c, iam_leaf_key_at(c, q), k) != 0)
-+                return -ENOENT;
-+        else
-+                return 0;
-+}
-+
-+static void iam_lfix_rec_add(struct iam_leaf *leaf,
-+                             struct iam_key *k, struct iam_rec *r)
-+{
-+        struct iam_lentry *end, *next, *cur, *nnext;
-+        ptrdiff_t diff;
-+        int count;
-+
-+        count = lentry_count_get(leaf);
-+        end = iam_lfix_get_end(leaf);
-+        cur = leaf->il_at;
-+        if (cur != end) {
-+                next = iam_lfix_shift(leaf, cur, 1);
-+                if (next != end) {
-+                        nnext = iam_lfix_shift(leaf, next, 1);
-+                        diff = (void *)end - (void *)next;
-+                        memmove(nnext, next, diff);
-+                }
-+                iam_lfix_next(leaf);
-+        }
-+        lentry_count_set(leaf, count + 1);
-+}
-+
-+static void iam_lfix_rec_del(struct iam_leaf *leaf)
-+{
-+      struct iam_lentry *next, *end;
-+      int count;
-+      ptrdiff_t diff;
-+
-+        count = lentry_count_get(leaf);
-+        end = iam_lfix_get_end(leaf);
-+        next = iam_lfix_shift(leaf, leaf->il_at, 1);
-+        diff = (void *)end - (void *)next;
-+        memmove(leaf->il_at, next, diff);
-+
-+      lentry_count_set(leaf, count - 1);
-+}
-+
-+static int iam_lfix_can_add(struct iam_leaf *l,
-+                            struct iam_key *k, struct iam_rec *r)
-+{
-+        struct iam_lentry *end;
-+        int block_size = iam_leaf_container(l)->ic_object->i_sb->s_blocksize;
-+        unsigned long left, entry_size;
-+
-+        end = iam_lfix_get_end(l);
-+
-+        left = block_size - iam_leaf_descr(l)->id_node_gap;
+-static inline int dx_index_is_compat(struct iam_path *path)
+-{
+-      return path_descr(path) == &htree_compat_param;
+-}
++static u32 htree_root_ptr(struct iam_container *c);
++static int htree_node_check(struct iam_path *path, struct iam_frame *frame);
++static int htree_node_init(struct iam_container *c, struct buffer_head *bh, int root);
++static int htree_keycmp(const struct iam_container *c,
++                      const struct iam_key *k1, const struct iam_key *k2);
+-static struct iam_entry *dx_get_entries(struct iam_path *path, void *data,
+-                                     int root)
+-{
+-      return data +
+-              (root ?
+-               path_descr(path)->id_root_gap : path_descr(path)->id_node_gap);
+-}
++struct iam_operations htree_operation = {
++      .id_root_ptr   = htree_root_ptr,
++      .id_node_check = htree_node_check,
++      .id_node_init  = htree_node_init,
++      .id_node_read  = iam_node_read,
++      .id_keycmp     = htree_keycmp
++};
 +
-+        left -= (unsigned long)((void*)end - (void*)l->il_entries);
++/*
++ * Parameters describing iam compatibility mode in which existing ext3 htrees
++ * can be manipulated.
++ */
++struct iam_descr htree_compat_param = {
++      .id_key_size = sizeof ((struct dx_map_entry *)NULL)->hash,
++      .id_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs,
++      .id_node_gap = offsetof(struct dx_node, entries),
++      .id_root_gap = offsetof(struct dx_root, entries),
++      .id_ops      = &htree_operation
++};
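The htree_compat_param descriptor above is what lets the reorganized iam code drive an unmodified ext3 htree: the key and pointer sizes come straight from struct dx_map_entry, and the two gap fields record how many header bytes to skip before the entry array begins in the root block versus an interior block. As a minimal user-space sketch (not part of the patch; the demo_* names are hypothetical), the gaps are used like this:

#include <stddef.h>

struct demo_descr {
        size_t id_root_gap;     /* header bytes in the root index block   */
        size_t id_node_gap;     /* header bytes in a non-root index block */
};

/* analogous to dx_get_entries(): the entry array starts after the header */
void *demo_entries(const struct demo_descr *d, void *block_data, int is_root)
{
        return (char *)block_data + (is_root ? d->id_root_gap : d->id_node_gap);
}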
+-static struct iam_entry *dx_node_get_entries(struct iam_path *path,
+-                                          struct iam_frame *frame)
++static inline int dx_index_is_compat(struct iam_path *path)
+ {
+-      return dx_get_entries(path,
+-                            frame->bh->b_data, frame == path->ip_frames);
++      return iam_path_descr(path) == &htree_compat_param;
+ }
 +
-+        entry_size = iam_lfix_entry_size(l);
+ static int dx_node_check(struct iam_path *p, struct iam_frame *f)
+ {
+       struct iam_entry     *e;
+@@ -614,10 +252,10 @@ static int dx_node_check(struct iam_path
+       count = dx_get_count(e);
+       e = iam_entry_shift(p, e, 1);
+       for (i = 0; i < count - 1; ++i, e = iam_entry_shift(p, e, 1)) {
+-              keycpy(c, p->ip_key_scratch[0], p->ip_key_scratch[1]);
+-              dx_get_key(p, e, p->ip_key_scratch[1]);
++              iam_keycpy(c, iam_path_key(p, 0), iam_path_key(p, 1));
++              iam_get_key(p, e, iam_path_key(p, 1));
+               if (i > 0 &&
+-                  keycmp(c, p->ip_key_scratch[0], p->ip_key_scratch[1]) > 0)
++                  iam_keycmp(c, iam_path_key(p, 0), iam_path_key(p, 1)) > 0)
+                       return 0;
+       }
+       return 1;
+@@ -636,13 +274,17 @@ static int htree_node_check(struct iam_p
+       data = frame->bh->b_data;
+       entries = dx_node_get_entries(path, frame);
+-      sb = path_obj(path)->i_sb;
++      sb = iam_path_obj(path)->i_sb;
+       if (frame == path->ip_frames) {
+               /* root node */
+               struct dx_root *root;
+-              struct htree_cookie *hc = path->ip_descr_data;
++              struct iam_path_compat *ipc;
+               root = data;
++              assert(path->ip_data != NULL);
++              ipc = container_of(path->ip_data, struct iam_path_compat,
++                                 ipc_descr);
 +
-+        if (left >= entry_size)
-+                return 1;
+               if (root->info.hash_version > DX_HASH_MAX) {
+                       ext3_warning(sb, __FUNCTION__,
+                                    "Unrecognised inode hash code %d",
+@@ -669,15 +311,17 @@ static int htree_node_check(struct iam_p
+                                          root->info.info_length));
+               assert(dx_get_limit(entries) == dx_root_limit(path));
+-              hc->hinfo->hash_version = root->info.hash_version;
+-              hc->hinfo->seed = EXT3_SB(sb)->s_hash_seed;
+-              if (hc->dentry)
+-                      ext3fs_dirhash(hc->dentry->d_name.name,
+-                                     hc->dentry->d_name.len, hc->hinfo);
+-              path->ip_key_target = (struct iam_key *)&hc->hinfo->hash;
++              ipc->ipc_hinfo->hash_version = root->info.hash_version;
++              ipc->ipc_hinfo->seed = EXT3_SB(sb)->s_hash_seed;
++              if (ipc->ipc_dentry)
++                      ext3fs_dirhash(ipc->ipc_dentry->d_name.name,
++                                     ipc->ipc_dentry->d_name.len,
++                                     ipc->ipc_hinfo);
++              path->ip_key_target =
++                      (const struct iam_key *)&ipc->ipc_hinfo->hash;
+       } else {
+               /* non-root index */
+-              assert(entries == data + path_descr(path)->id_node_gap);
++              assert(entries == data + iam_path_descr(path)->id_node_gap);
+               assert(dx_get_limit(entries) == dx_node_limit(path));
+       }
+       frame->entries = frame->at = entries;
+@@ -697,8 +341,8 @@ static int htree_node_init(struct iam_co
+       return 0;
+ }
+-static int htree_node_read(struct iam_container *c, iam_ptr_t ptr,
+-                         handle_t *handle, struct buffer_head **bh)
++int iam_node_read(struct iam_container *c, iam_ptr_t ptr,
++                handle_t *handle, struct buffer_head **bh)
+ {
+       int result = 0;
+@@ -708,8 +352,8 @@ static int htree_node_read(struct iam_co
+       return result;
+ }
+-static int htree_keycmp(struct iam_container *c,
+-                      struct iam_key *k1, struct iam_key *k2)
++static int htree_keycmp(const struct iam_container *c,
++                      const struct iam_key *k1, const struct iam_key *k2)
+ {
+       __u32 p1 = le32_to_cpu(*(__u32 *)k1);
+       __u32 p2 = le32_to_cpu(*(__u32 *)k2);
+@@ -800,7 +444,7 @@ struct stats dx_show_entries(struct dx_h
+ }
+ #endif /* DX_DEBUG */
+-static int dx_lookup(struct iam_path *path)
++int dx_lookup(struct iam_path *path)
+ {
+       u32 ptr;
+       int err = 0;
+@@ -810,11 +454,11 @@ static int dx_lookup(struct iam_path *pa
+       struct iam_frame *frame;
+       struct iam_container *c;
+-      param = path_descr(path);
++      param = iam_path_descr(path);
+       c = path->ip_container;
+       
+       for (frame = path->ip_frames, i = 0,
+-                   ptr = param->id_root_ptr(path->ip_container);
++                   ptr = param->id_ops->id_root_ptr(c);
+            i <= path->ip_indirect;
+            ptr = dx_get_block(path, frame->at), ++frame, ++i) {
+               struct iam_entry *entries;
+@@ -823,10 +467,11 @@ static int dx_lookup(struct iam_path *pa
+               struct iam_entry *m;
+               unsigned count;
+-              err = param->id_node_read(c, (iam_ptr_t)ptr, NULL, &frame->bh);
++              err = param->id_ops->id_node_read(c, (iam_ptr_t)ptr, NULL,
++                                                &frame->bh);
+               if (err != 0)
+                       break;
+-              err = param->id_node_check(path, frame);
++              err = param->id_ops->id_node_check(path, frame);
+               if (err != 0)
+                       break;
+@@ -837,12 +482,27 @@ static int dx_lookup(struct iam_path *pa
+               assert(count && count <= dx_get_limit(entries));
+               p = iam_entry_shift(path, entries, 1);
+               q = iam_entry_shift(path, entries, count - 1);
++              /*
++               * Sanity check: target key is larger or equal to the leftmost
++               * key in the node.
++               */
++              if (iam_keycmp(c,
++                             iam_key_at(path, p), path->ip_key_target) < 0) {
++                      struct inode *obj;
++
++                      obj = c->ic_object;
++                      ext3_error(obj->i_sb, __FUNCTION__,
++                                 "corrupted search tree #%lu", obj->i_ino);
++                      err = -EIO;
++                      break;
++
++              }
+               while (p <= q) {
+                       m = iam_entry_shift(path,
+                                          p, iam_entry_diff(path, q, p) / 2);
+                       dxtrace(printk("."));
+-                      if (keycmp(c, iam_key_at(path, m),
+-                                 path->ip_key_target) > 0)
++                      if (iam_keycmp(c, iam_key_at(path, m),
++                                     path->ip_key_target) > 0)
+                               q = iam_entry_shift(path, m, -1);
+                       else
+                               p = iam_entry_shift(path, m, +1);
+@@ -857,12 +517,12 @@ static int dx_lookup(struct iam_path *pa
+                       while (n--) {
+                               dxtrace(printk(","));
+                               at = iam_entry_shift(path, at, +1);
+-                              if (keycmp(c, iam_key_at(path, at),
+-                                         path->ip_key_target) > 0) {
++                              if (iam_keycmp(c, iam_key_at(path, at),
++                                             path->ip_key_target) > 0) {
+                                       if (at != iam_entry_shift(path, frame->at, 1)) {
+                                               BREAKPOINT;
+                                               printk(KERN_EMERG "%i\n",
+-                                                     keycmp(c, iam_key_at(path, at),
++                                                     iam_keycmp(c, iam_key_at(path, at),
+                                                             path->ip_key_target));
+                                       }
+                                       at = iam_entry_shift(path, at, -1);
+@@ -891,508 +551,20 @@ static int dx_probe(struct dentry *dentr
+                   struct dx_hash_info *hinfo, struct iam_path *path)
+ {
+       int err;
+-      struct htree_cookie hc = {
+-              .dentry = dentry,
+-              .hinfo  = hinfo
+-      };
++      struct iam_path_compat *ipc;
 +
-+        return 0;
++      assert(path->ip_data != NULL);
++      ipc = container_of(path->ip_data, struct iam_path_compat, ipc_descr);
++      ipc->ipc_dentry = dentry;
++      ipc->ipc_hinfo = hinfo;
+       assert(dx_index_is_compat(path));
+-      path->ip_descr_data = &hc;
+       err = dx_lookup(path);
+       assert(err != 0 || path->ip_frames[path->ip_indirect].bh != NULL);
+       return err;
+ }
+ /*
+- * Initialize container @c, acquires additional reference on @inode.
+- */
+-int iam_container_init(struct iam_container *c,
+-                     struct iam_descr *descr, struct inode *inode)
+-{
+-      memset(c, 0, sizeof *c);
+-      c->ic_descr  = descr;
+-      c->ic_object = igrab(inode);
+-      if (c->ic_object != NULL)
+-              return 0;
+-      else
+-              return -ENOENT;
+-}
+-
+-/*
+- * Finalize container @c, release all resources.
+- */
+-void iam_container_fini(struct iam_container *c)
+-{
+-      if (c->ic_object != NULL) {
+-              iput(c->ic_object);
+-              c->ic_object = NULL;
+-      }
+-}
+-
+-static inline void iam_path_init(struct iam_path *path, struct iam_container *c, 
+-                               struct htree_cookie *hc)
+-{
+-      memset(path, 0, sizeof *path);
+-      path->ip_container = c;
+-      path->ip_frame = path->ip_frames;
+-      path->ip_descr_data = hc;
+-}
+-
+-static inline void iam_path_fini(struct iam_path *path)
+-{
+-      int i;
+-
+-      for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) {
+-              if (path->ip_frames[i].bh != NULL) {
+-                      brelse(path->ip_frames[i].bh);
+-                      path->ip_frames[i].bh = NULL;
+-              }
+-      }
+-}
+-
+-static void iam_path_compat_init(struct iam_path_compat *path,
+-                               struct inode *inode)
+-{
+-      int i;
+-
+-      iam_container_init(&path->ipc_container, &htree_compat_param, inode);
+-      /*
+-       * XXX hack allowing finalization of iam_path_compat with
+-       * iam_path_fini().
+-       */
+-      iput(inode);
+-      iam_path_init(&path->ipc_path, &path->ipc_container, NULL);
+-      for (i = 0; i < ARRAY_SIZE(path->ipc_path.ip_key_scratch); ++i)
+-              path->ipc_path.ip_key_scratch[i] =
+-                      (struct iam_key *)&path->ipc_scrach[i];
+-}
+-
+-static void iam_path_compat_fini(struct iam_path_compat *path)
+-{
+-      iam_path_fini(&path->ipc_path);
+-      iam_container_fini(&path->ipc_container);
+-}
+-
+-static int iam_leaf_init(struct iam_path *path, struct iam_leaf *leaf)
+-{
+-      int block, err;
+-      struct buffer_head *bh;
+-      
+-      block = dx_get_block(path, path->ip_frame->at);
+-      err = path_descr(path)->id_node_read(path->ip_container, block, 
+-                                           NULL, &bh);
+-      if (err)
+-              return err;
+-
+-      leaf->bh = bh;
+-      leaf->entries = (struct iam_leaf_entry *)bh->b_data;
+-      return 0;
+-}
+-
+-static void iam_leaf_fini(struct iam_leaf *leaf)
+-{
+-      if (leaf->bh)
+-              brelse(leaf->bh);
+-}
+-
+-/*
+- * Search container @c for record with key @k. If record is found, its data
+- * are moved into @r.
+- *
+- *
+- *
+- * Return values: +ve: found, 0: not-found, -ve: error
+- */
+-
+-int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r)
+-{
+-      struct dx_hash_info     hinfo;
+-      struct iam_path_compat cpath;
+-      struct iam_path *path = &cpath.ipc_path;
+-      struct htree_cookie hc = {
+-              .hinfo  = &hinfo
+-      };
+-      int err, i;
+-
+-      iam_path_init(path, c, &hc);
+-      for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
+-              path->ip_key_scratch[i] =
+-                      (struct iam_key *)&cpath.ipc_scrach[i];
+-      err = dx_lookup(path);
+-      do {
+-              struct iam_leaf leaf;
+-              err = iam_leaf_init(path, &leaf);
+-              if (err)
+-                      goto errout;
+-
+-              for (path_descr(path)->id_leaf.start(c, &leaf);
+-                   !path_descr(path)->id_leaf.at_end(c, &leaf);
+-                   path_descr(path)->id_leaf.next(c, &leaf)) {
+-                      struct iam_key *key;
+-
+-                      key = kmalloc(path_descr(path)->id_key_size, GFP_KERNEL);
+-                      path_descr(path)->id_leaf.key(c, &leaf, key);
+-                      if (keycmp(c, k, key) == 0) {
+-                              memcpy(r, path_descr(path)->id_leaf.rec(c, &leaf),
+-                                     path_descr(path)->id_rec_size);
+-                              iam_path_fini(path);
+-                              iam_leaf_fini(&leaf);
+-                              return 0;
+-                      }
+-              }
+-
+-              iam_leaf_fini(&leaf);
+-              /* Check to see if we should continue to search */
+-              err = ext3_htree_next_block(c->ic_object, hinfo.hash, path, NULL);
+-              if (err < 0)
+-                      goto errout;
+-      } while (err == 1);
+-errout:
+-      iam_path_fini(path);
+-      return(err);
+-}
+-EXPORT_SYMBOL(iam_lookup);
+-
+-static inline size_t iam_leaf_entry_size(struct iam_path *p)
+-{
+-      return path_descr(p)->id_rec_size + path_descr(p)->id_key_size;
+-}
+-
+-static inline ptrdiff_t iam_leaf_entry_diff(struct iam_path *p,
+-                                    struct iam_leaf_entry *e1, struct iam_leaf_entry *e2)
+-{
+-      ptrdiff_t diff;
+-
+-      diff = (void *)e1 - (void *)e2;
+-      assert(diff / iam_leaf_entry_size(p) * iam_leaf_entry_size(p) == diff);
+-      return diff / iam_leaf_entry_size(p);
+-}
+-
+-static inline struct iam_leaf_entry* 
+-iam_leaf_entry_shift(struct iam_path *p, struct iam_leaf_entry *entry, int shift)
+-{
+-      void *e = entry;
+-      return e + shift * iam_leaf_entry_size(p);
+-}
+-
+-static inline struct iam_key *
+-dx_leaf_get_key(struct iam_path *p, struct iam_leaf_entry *e, struct iam_key *key)
+-{
+-      memcpy(key, e, path_descr(p)->id_key_size);
+-      return key;
+-}
+-
+-static inline struct iam_key *
+-iam_leaf_key_at(struct iam_path *p, struct iam_leaf_entry *entry)
+-{
+-      void *e = entry;
+-      return e + path_descr(p)->id_rec_size;
+-}
+-static inline struct iam_leaf_entry *
+-iam_leaf_entry_at(struct iam_path *p, struct iam_leaf_entry *entry)
+-{
+-      return entry; 
+-}
+-
+-static int iam_leaf_lookup(struct iam_path *path, struct iam_leaf *leaf, 
+-                         struct iam_key *k)
+-{
+-      struct iam_leaf_entry *p, *q, *m;
+-      struct iam_leaf_entry *entries = leaf->entries;
+-      int count = dx_get_count((struct iam_entry *)entries);
+-      
+-      p = iam_leaf_entry_shift(path, entries, 1);
+-      q = iam_leaf_entry_shift(path, entries, count - 1);
+-      while (p <= q) {
+-              m = iam_leaf_entry_shift(path,
+-                                 p, iam_leaf_entry_diff(path, q, p) / 2);
+-              dxtrace(printk("."));
+-              if (keycmp(path->ip_container, iam_leaf_key_at(path, m),
+-                         path->ip_key_target) > 0)
+-                      q = iam_leaf_entry_shift(path, m, -1);
+-              else
+-                      p = iam_leaf_entry_shift(path, m, +1);
+-      }
+-      leaf->at = q; 
+-      return 0;
+-}
+-
+-/*XXX what kind of lock should this entry be locked: WangDi */
+-static int iam_leaf_insert(handle_t *handle, struct iam_path *path, 
+-                         struct iam_key *k, struct iam_rec *r)
+-{
+-      struct iam_leaf leaf;
+-      struct iam_leaf_entry *p, *q;
+-      int err, count;
+-
+-      err = iam_leaf_init(path, &leaf);
+-      if (err)
+-              goto errout;
+-      path_descr(path)->id_leaf.start(path->ip_container, &leaf);
+-      count = dx_get_count((struct iam_entry *)leaf.entries);
+-      if (dx_get_count((struct iam_entry *)leaf.entries) >= 
+-          dx_get_limit((struct iam_entry *)leaf.entries)){
+-              err = -ENOSPC;
+-              goto errout;
+-      }
+-
+-      err = iam_leaf_lookup(path, &leaf, k);
+-      if (err)
+-              goto errout;
+-      
+-      /*insert the k/r to leaf entries*/
+-      p = iam_leaf_entry_shift(path, leaf.at, 1);
+-      q = iam_leaf_entry_shift(path, leaf.entries, count - 1);
+-      while (q < p) {
+-              memcpy(iam_leaf_entry_shift(path, q, 1), q, iam_leaf_entry_size(path));
+-              q = iam_leaf_entry_shift(path, q, -1);  
+-      }
+-      memcpy(iam_leaf_entry_at(path, p), r, path_descr(path)->id_rec_size);
+-      memcpy(iam_leaf_key_at(path, p), k, path_descr(path)->id_key_size);
+-
+-      dx_set_count((struct iam_entry*)leaf.entries, count + 1);
+-      err = ext3_journal_dirty_metadata(handle, leaf.bh);
+-      if (err)
+-              ext3_std_error(path->ip_container->ic_object->i_sb, err);
+-errout:       
+-      iam_leaf_fini(&leaf);
+-      return err;
+-} 
+-
+-static int split_leaf_node(handle_t *handle, struct iam_path *path)
+-{
+-      struct inode *dir = path_obj(path);
+-      unsigned continued = 0;
+-      struct buffer_head *bh2;
+-      u32 newblock, hash_split;
+-      char *data2;
+-      struct iam_leaf leaf;
+-      unsigned split;
+-      int     err;
+-
+-      bh2 = ext3_append (handle, dir, &newblock, &err);
+-      if (!(bh2)) {
+-              err = -ENOSPC;
+-              goto errout;
+-      }
+-      err = iam_leaf_init(path, &leaf);
+-      if (err)
+-              goto errout;
+-
+-      BUFFER_TRACE(leaf.bh, "get_write_access");
+-      err = ext3_journal_get_write_access(handle, leaf.bh);
+-      if (err) {
+-      journal_error:
+-              iam_leaf_fini(&leaf);
+-              brelse(bh2);
+-              ext3_std_error(dir->i_sb, err);
+-              err = -EIO;
+-              goto errout;
+-      }
+-      data2 = bh2->b_data;
+-      split = dx_get_count((struct iam_entry*)leaf.entries)/2;
+-      hash_split = *(__u32*)iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split));
+-      if (keycmp(path->ip_container, iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split)),
+-                 iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split -1))) == 0)
+-              continued = 1;
+-
+-      memcpy(iam_leaf_entry_shift(path, (struct iam_leaf_entry *)data2, 1),
+-             iam_leaf_entry_shift(path, leaf.entries, split),
+-             split * iam_leaf_entry_size(path));
+- 
+-      /* Which block gets the new entry? */
+-      dx_insert_block(path, path->ip_frame, hash_split + continued, newblock);
+-      err = ext3_journal_dirty_metadata (handle, bh2);
+-      if (err)
+-              goto journal_error;
+-      err = ext3_journal_dirty_metadata (handle, leaf.bh);
+-      if (err)
+-              goto journal_error;
+-      brelse (bh2);
+-      iam_leaf_fini(&leaf);
+-errout:
+-      return err;
+-}
+-
+-static int split_index_node(handle_t *handle, struct iam_path *path);
+-/*
+- * Insert new record @r with key @k into container @c (within context of
+- * transaction @h.
+- *
+- * Return values: 0: success, -ve: error, including -EEXIST when record with
+- * given key is already present.
+- *
+- * postcondition: ergo(result == 0 || result == -EEXIST,
+- *                                  iam_lookup(c, k, r2) > 0 &&
+- *                                  !memcmp(r, r2, c->ic_descr->id_rec_size));
+- */
+-int iam_insert(handle_t *handle, struct iam_container *c, struct iam_key *k, 
+-             struct iam_rec *r)
+-{
+-      struct dx_hash_info     hinfo;
+-      struct iam_path_compat cpath;
+-      struct iam_path *path = &cpath.ipc_path;
+-      struct htree_cookie hc = {
+-              .hinfo  = &hinfo
+-      };
+-      int err, i;
+-
+-      iam_path_init(path, c, &hc);
+-      for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
+-              path->ip_key_scratch[i] =
+-                      (struct iam_key *)&cpath.ipc_scrach[i];
+-      err = dx_lookup(path);
+-      if (err)
+-              goto errout; 
+-
+-      err = iam_leaf_insert(handle, path, k, r);
+-      
+-      if (err != -ENOSPC) 
+-              goto errout;    
+-
+-      err = split_index_node(handle, path);
+-      if (err)
+-              goto errout;    
+-
+-      err = split_leaf_node(handle, path);
+-      if (err)
+-              goto errout;
+-      
+-      err = iam_leaf_insert(handle, path, k, r);
+-errout:
+-      iam_path_fini(path);
+-      return(err);
+-}
+-
+-EXPORT_SYMBOL(iam_insert);
+-static int iam_leaf_delete(handle_t *handle, struct iam_path *path, 
+-                         struct iam_key *k)
+-{
+-      struct iam_leaf leaf;
+-      struct iam_leaf_entry *p, *q;
+-      int err, count;
+-
+-      err = iam_leaf_init(path, &leaf);
+-      if (err)
+-              goto errout;
+-      
+-      err = iam_leaf_lookup(path, &leaf, k);
+-      if (err)
+-              goto errout;
+-
+-      count = dx_get_count((struct iam_entry*)leaf.entries);
+-      /*delete the k to leaf entries*/
+-      p = iam_leaf_entry_shift(path, leaf.at, 1);
+-      q = iam_leaf_entry_shift(path, leaf.entries, count - 1);
+-      while (p < q) {
+-              memcpy(p, iam_leaf_entry_shift(path, p, 1), iam_leaf_entry_size(path));
+-              p = iam_leaf_entry_shift(path, p, 1);
+-      }
+-      dx_set_count((struct iam_entry*)leaf.entries, count - 1);
+-
+-      err = ext3_journal_dirty_metadata(handle, leaf.bh);
+-      if (err)
+-              ext3_std_error(path_obj(path)->i_sb, err);
+-errout:       
+-      iam_leaf_fini(&leaf);
+-      return err;
+-}
+-
+-/*
+- * Delete existing record with key @k.
+- *
+- * Return values: 0: success, -ENOENT: not-found, -ve: other error.
+- *
+- * postcondition: ergo(result == 0 || result == -ENOENT,
+- *                                 !iam_lookup(c, k, *));
+- */
+-int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k)
+-{
+-      struct dx_hash_info     hinfo;
+-      struct iam_path_compat cpath;
+-      struct iam_path *path = &cpath.ipc_path;
+-      struct htree_cookie hc = {
+-              .hinfo  = &hinfo
+-      };
+-      int err, i;
+-
+-      iam_path_init(path, c, &hc);
+-      for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
+-              path->ip_key_scratch[i] =
+-                      (struct iam_key *)&cpath.ipc_scrach[i];
+-      err = dx_lookup(path);
+-      if (err)
+-              goto errout; 
+-
+-      err = iam_leaf_delete(h, path, k);
+-errout:
+-      iam_path_fini(path);
+-      return err;
+-}
+-
+-EXPORT_SYMBOL(iam_delete);
+-
+-static int iam_leaf_update(handle_t *handle, struct iam_path *path, 
+-                         struct iam_key *k, struct iam_rec *r)
+-{
+-      struct iam_leaf leaf;
+-      int err;
+-
+-      err = iam_leaf_init(path, &leaf);
+-      if (err)
+-              goto errout;
+-      
+-      err = iam_leaf_lookup(path, &leaf, k);
+-      if (err)
+-              goto errout;
+-
+-      memcpy(iam_leaf_entry_at(path, leaf.at), r, path_descr(path)->id_rec_size);
+-      memcpy(iam_leaf_key_at(path, leaf.at), k, path_descr(path)->id_key_size);
+-
+-      err = ext3_journal_dirty_metadata(handle, leaf.bh);
+-      if (err)
+-              ext3_std_error(path_obj(path)->i_sb, err);
+-errout:       
+-      iam_leaf_fini(&leaf);
+-      return err;
+-}
+-/*
+- * Replace existing record with key @k, or insert new one. New record data are
+- * in @r.
+- *
+- * Return values: 0: success, -ve: error.
+- *
+- * postcondition: ergo(result == 0, iam_lookup(c, k, r2) > 0 &&
+- *                                  !memcmp(r, r2, c->ic_descr->id_rec_size));
+- */
+-int iam_update(handle_t *h, struct iam_container *c,
+-             struct iam_key *k, struct iam_rec *r)
+-{
+-      struct dx_hash_info     hinfo;
+-      struct iam_path_compat cpath;
+-      struct iam_path *path = &cpath.ipc_path;
+-      struct htree_cookie hc = {
+-              .hinfo  = &hinfo
+-      };
+-      int err, i;
+-      
+-      iam_path_init(path, c, &hc);
+-      for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
+-              path->ip_key_scratch[i] =
+-                      (struct iam_key *)&cpath.ipc_scrach[i];
+-      err = dx_lookup(path);
+-      if (err)
+-              goto errout; 
+-
+-      err = iam_leaf_update(h, path, k, r);
+-errout:
+-      iam_path_fini(path);
+-      return err;
+-}
+-
+-EXPORT_SYMBOL(iam_update);
+-
+-/*
+  * This function increments the frame pointer to search the next leaf
+  * block, and reads in the necessary intervening nodes if the search
+  * should be necessary.  Whether or not the search is necessary is
+@@ -1409,16 +581,15 @@ EXPORT_SYMBOL(iam_update);
+  * If start_hash is non-null, it will be filled in with the starting
+  * hash of the next page.
+  */
+-static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+-                               struct iam_path *path, __u32 *start_hash)
++static int ext3_htree_advance(struct inode *dir, __u32 hash,
++                            struct iam_path *path, __u32 *start_hash,
++                            int compat)
+ {
+       struct iam_frame *p;
+       struct buffer_head *bh;
+       int err, num_frames = 0;
+       __u32 bhash;
+-      assert(dx_index_is_compat(path));
+-
+       p = path->ip_frame;
+       /*
+        * Find the next leaf page by incrementing the frame pointer.
+@@ -1438,28 +609,34 @@ static int ext3_htree_next_block(struct 
+               --p;
+       }
+-      /*
+-       * If the hash is 1, then continue only if the next page has a
+-       * continuation hash of any value.  This is used for readdir
+-       * handling.  Otherwise, check to see if the hash matches the
+-       * desired contiuation hash.  If it doesn't, return since
+-       * there's no point to read in the successive index pages.
+-       */
+-      dx_get_key(path, p->at, (struct iam_key *)&bhash);
+-      if (start_hash)
+-              *start_hash = bhash;
+-      if ((hash & 1) == 0) {
+-              if ((bhash & ~1) != hash)
+-                      return 0;
++      if (compat) {
++              /*
++               * Htree hash magic.
++               */
++              /*
++               * If the hash is 1, then continue only if the next page has a
++               * continuation hash of any value.  This is used for readdir
++               * handling.  Otherwise, check to see if the hash matches the
++               * desired continuation hash.  If it doesn't, return since
++               * there's no point in reading the successive index pages.
++               */
++              iam_get_key(path, p->at, (struct iam_key *)&bhash);
++              if (start_hash)
++                      *start_hash = bhash;
++              if ((hash & 1) == 0) {
++                      if ((bhash & ~1) != hash)
++                              return 0;
++              }
+       }
+       /*
+        * If the hash is HASH_NB_ALWAYS, we always go to the next
+        * block so no check is necessary
+        */
+       while (num_frames--) {
+-              err = path_descr(path)->id_node_read(path->ip_container,
+-                                                   (iam_ptr_t)dx_get_block(path, p->at),
+-                                                   NULL, &bh);
++              err = iam_path_descr(path)->id_ops->
++                      id_node_read(path->ip_container,
++                                   (iam_ptr_t)dx_get_block(path, p->at),
++                                   NULL, &bh);
+               if (err != 0)
+                       return err; /* Failure */
+               ++p;
+@@ -1471,6 +648,16 @@ static int ext3_htree_next_block(struct 
+       return 1;
+ }
++int iam_index_next(struct iam_container *c, struct iam_path *path)
++{
++      return ext3_htree_advance(c->ic_object, 0, path, NULL, 0);
 +}
 +
-+static int iam_lfix_at_end(const struct iam_leaf *folio)
++int ext3_htree_next_block(struct inode *dir, __u32 hash,
++                        struct iam_path *path, __u32 *start_hash)
 +{
-+        struct iam_lentry *ile = iam_lfix_get_end(folio);
-+
-+        return (folio->il_at == ile);
++      return ext3_htree_advance(dir, hash, path, start_hash, 1);
 +}
+ /*
+  * p is at least 6 bytes before the end of page
+@@ -1662,21 +849,30 @@ static void dx_sort_map (struct dx_map_e
+       } while(more);
+ }
+-static void dx_insert_block(struct iam_path *path,
+-                          struct iam_frame *frame, u32 hash, u32 block)
++void iam_insert_key(struct iam_path *path, struct iam_frame *frame,
++                  const struct iam_key *key, iam_ptr_t ptr)
+ {
+       struct iam_entry *entries = frame->entries;
+-      struct iam_entry *old = frame->at, *new = iam_entry_shift(path, old, +1);
++      struct iam_entry *new = iam_entry_shift(path, frame->at, +1);
+       int count = dx_get_count(entries);
+       assert(count < dx_get_limit(entries));
+-      assert(old < iam_entry_shift(path, entries, count));
++      assert(frame->at < iam_entry_shift(path, entries, count));
 +
-+static void iam_lfix_init_new(struct iam_container *c, struct buffer_head *bh)
-+{
-+        struct iam_leaf_head *hdr;
+       memmove(iam_entry_shift(path, new, 1), new,
+               (char *)iam_entry_shift(path, entries, count) - (char *)new);
+-      dx_set_key(path, new, (struct iam_key *)&hash);
+-      dx_set_block(path, new, block);
++      dx_set_key(path, new, key);
++      dx_set_block(path, new, ptr);
+       dx_set_count(entries, count + 1);
+ }
 +
-+        hdr = (struct iam_leaf_head*)bh->b_data;
-+        hdr->ill_magic = cpu_to_le16(IAM_LEAF_HEADER_MAGIC);
-+        hdr->ill_count = cpu_to_le16(0);
++void dx_insert_block(struct iam_path *path, struct iam_frame *frame,
++                   u32 hash, u32 block)
++{
++      assert(dx_index_is_compat(path));
++      iam_insert_key(path, frame, (struct iam_key *)&hash, block);
 +}
 +
-+static void iam_lfix_split(struct iam_leaf *l, struct buffer_head *bh)
+ #endif
+@@ -1897,14 +1093,15 @@ static struct buffer_head * ext3_dx_find
+               if (*err != 0)
+                       return NULL;
+       } else {
+-              path->ip_frame->bh = NULL;              /* for iam_path_fini() */
++              path->ip_frame->bh = NULL;      /* for iam_path_fini() */
+               path->ip_frame->at = (void *)&dummy_dot;/* hack for zero entry*/
+       }
+       hash = hinfo.hash;
+       do {
+               block = dx_get_block(path, path->ip_frame->at);
+-              *err = path_descr(path)->id_node_read(path->ip_container, (iam_ptr_t)block,
+-                                                   NULL, &bh);
++              *err = iam_path_descr(path)->id_ops->id_node_read(path->ip_container,
++                                                        (iam_ptr_t)block,
++                                                        NULL, &bh);
+               if (*err != 0)
+                       goto errout;
+               de = (struct ext3_dir_entry_2 *) bh->b_data;
+@@ -2067,7 +1264,7 @@ static struct ext3_dir_entry_2 *do_split
+                       struct buffer_head **bh,struct iam_frame *frame,
+                       struct dx_hash_info *hinfo, int *error)
+ {
+-      struct inode *dir = path_obj(path);
++      struct inode *dir = iam_path_obj(path);
+       unsigned blocksize = dir->i_sb->s_blocksize;
+       unsigned count, continued;
+       struct buffer_head *bh2;
+@@ -2392,18 +1589,25 @@ static int ext3_add_entry (handle_t *han
+ }
+ #ifdef CONFIG_EXT3_INDEX
+-static int split_index_node(handle_t *handle, struct iam_path *path)
+-{ 
++int split_index_node(handle_t *handle, struct iam_path *path)
 +{
-+        struct iam_path      *path;
-+      struct iam_leaf_head *hdr;
-+        const struct iam_key *pivot;
-+
-+      unsigned count;
-+      unsigned split;
-+
-+      void *start;
-+      void *finis;
-+
-+        path = iam_leaf_path(l);
-+
-+      hdr = (void *)bh->b_data;
-+
-+        count = lentry_count_get(l);
-+        split = count / 2;
-+
-+        start = iam_lfix_shift(l, iam_get_lentries(l), split);
-+        finis = iam_lfix_shift(l, iam_get_lentries(l), count);
-+
-+        pivot = iam_leaf_key_at(iam_leaf_container(l), start);
-+
-+        memmove(iam_entries(bh), start, finis - start);
-+        hdr->ill_count = count - split;
-+        lentry_count_set(l, split);
-+        /*
-+         * Insert pointer to the new node (together with the smallest key in
-+         * the node) into index node.
-+         */
-+        iam_insert_key(path, path->ip_frame, pivot, bh->b_blocknr);
-+}
+       struct iam_entry *entries;   /* old block contents */
+       struct iam_entry *entries2;  /* new block contents */
+       struct iam_frame *frame, *safe;
+       struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0};
+       u32 newblock[DX_MAX_TREE_HEIGHT] = {0};
+-      struct inode *dir = path_obj(path);
++      struct inode *dir = iam_path_obj(path);
++      struct iam_descr *descr;
+       int nr_splet;
+       int i, err;
++      descr = iam_path_descr(path);
++      /*
++       * Algorithm below depends on this.
++       */
++      assert(descr->id_node_gap < descr->id_root_gap);
 +
-+struct iam_leaf_operations iam_lfix_leaf_ops = {
-+        .init           = iam_lfix_init,
-+        .init_new       = iam_lfix_init_new,
-+        .fini           = iam_lfix_fini,
-+        .start          = iam_lfix_start,
-+        .next           = iam_lfix_next,
-+        .key            = iam_lfix_key,
-+        .rec            = iam_lfix_rec,
-+        .lookup         = iam_lfix_lookup,
-+        .at_end         = iam_lfix_at_end,
-+        .rec_add        = iam_lfix_rec_add,
-+        .rec_del        = iam_lfix_rec_del,
-+        .can_add        = iam_lfix_can_add,
-+        .split          = iam_lfix_split
-+};
-+EXPORT_SYMBOL(iam_lfix_leaf_ops);
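The iam_lfix_* routines collected into iam_lfix_leaf_ops above implement the fixed-size-entry ("lfix") leaf format: a leaf block carries a small header with an entry count followed by back-to-back fixed-size entries, each holding a key and a record; lookup is a binary search for the last entry whose key does not exceed the target, and a split moves the upper half of the entries to a new block and promotes that block's first key into the parent index node. The following is only a compact user-space model of that layout and lookup, with made-up sizes and names, not the patch's code:

#include <stdint.h>
#include <string.h>

enum { KEY_SIZE = 8, REC_SIZE = 16, ENTRY_SIZE = KEY_SIZE + REC_SIZE };

struct lfix_leaf {
        uint16_t count;                  /* header abbreviated to a bare count  */
        unsigned char entries[4096 - 2]; /* count * ENTRY_SIZE bytes are in use */
};

/* in this model an entry stores the key first, then the record */
static const unsigned char *entry_at(const struct lfix_leaf *l, int i)
{
        return l->entries + (size_t)i * ENTRY_SIZE;
}

/*
 * Binary search for the last entry whose key is not greater than k
 * (bytewise comparison); returns its index, or -1 when every stored key
 * is greater than k.
 */
int lfix_lookup(const struct lfix_leaf *l, const unsigned char *k)
{
        int lo = 0, hi = l->count - 1, pos = -1;

        while (lo <= hi) {
                int mid = lo + (hi - lo) / 2;

                if (memcmp(entry_at(l, mid), k, KEY_SIZE) <= 0) {
                        pos = mid;          /* candidate: key <= k */
                        lo = mid + 1;
                } else {
                        hi = mid - 1;
                }
        }
        return pos;
}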
-Index: linux-stage/include/linux/lustre_iam.h
+       frame = path->ip_frame;
+       entries = frame->entries;
+@@ -2442,7 +1646,8 @@ static int split_index_node(handle_t *ha
+       for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
+               bh_new[i] = ext3_append (handle, dir, &newblock[i], &err);
+               if (!bh_new[i] ||
+-                  path_descr(path)->id_node_init(path->ip_container, bh_new[i], 0) != 0)
++                  descr->id_ops->id_node_init(path->ip_container,
++                                              bh_new[i], 0) != 0)
+                       goto cleanup;
+               BUFFER_TRACE(frame->bh, "get_write_access");
+               err = ext3_journal_get_write_access(handle, frame->bh);
+@@ -2516,9 +1721,9 @@ static int split_index_node(handle_t *ha
+                       unsigned count1 = count/2, count2 = count - count1;
+                       unsigned hash2;
+-                      dx_get_key(path,
+-                                 iam_entry_shift(path, entries, count1),
+-                                 (struct iam_key *)&hash2);
++                      iam_get_key(path,
++                                  iam_entry_shift(path, entries, count1),
++                                  (struct iam_key *)&hash2);
+                       dxtrace(printk("Split index %i/%i\n", count1, count2));
+@@ -2578,7 +1783,7 @@ static int ext3_dx_add_entry(handle_t *h
+       size_t isize;
+       iam_path_compat_init(&cpath, dir);
+-      param = path_descr(path);
++      param = iam_path_descr(path);
+       err = dx_probe(dentry, NULL, &hinfo, path);
+       if (err != 0)
+@@ -2588,8 +1793,9 @@ static int ext3_dx_add_entry(handle_t *h
+       /* XXX nikita: global serialization! */
+       isize = dir->i_size;
+-      err = param->id_node_read(path->ip_container, (iam_ptr_t)dx_get_block(path, frame->at), 
+-                                handle, &bh);
++      err = param->id_ops->id_node_read(path->ip_container,
++                      (iam_ptr_t)dx_get_block(path, frame->at),
++                      handle, &bh);
+       if (err != 0)
+               goto cleanup;
+@@ -2724,12 +1930,12 @@ static struct inode * ext3_new_inode_wan
+  * is so far negative - it has no inode.
+  *
+  * If the create succeeds, we fill in the inode information
+- * with d_instantiate(). 
++ * with d_instantiate().
+  */
+ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
+               struct nameidata *nd)
+ {
+-      handle_t *handle; 
++      handle_t *handle;
+       struct inode * inode;
+       int err, retries = 0;
+Index: iam/include/linux/lustre_iam.h
 ===================================================================
---- linux-stage.orig/include/linux/lustre_iam.h        2006-05-29 13:01:21.000000000 +0800
-+++ linux-stage/include/linux/lustre_iam.h     2006-05-29 13:01:22.000000000 +0800
-@@ -1,9 +1,61 @@
+--- iam.orig/include/linux/lustre_iam.h        2006-05-27 19:58:44.000000000 +0400
++++ iam/include/linux/lustre_iam.h     2006-05-29 22:41:51.000000000 +0400
+@@ -1,9 +1,64 @@
 +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 + * vim:expandtab:shiftwidth=8:tabstop=8:
 + *
@@ -2837,6 +3033,9 @@ Index: linux-stage/include/linux/lustre_iam.h
 +#ifndef __LINUX_LUSTRE_IAM_H__
 +#define __LINUX_LUSTRE_IAM_H__
 +
++/* handle_t, journal_start(), journal_stop() */
++#include <linux/jbd.h>
++
  /*
 - * Maximal number of non-leaf levels in htree. In the stock ext3 this is 2.
 + *  linux/include/linux/lustre_iam.h
@@ -2868,7 +3067,7 @@ Index: linux-stage/include/linux/lustre_iam.h
  };
  
  /*
-@@ -30,6 +82,11 @@
+@@ -30,6 +85,11 @@ struct iam_key;
  /* Incomplete type use to refer to the records stored in iam containers. */
  struct iam_rec;
  
@@ -2880,7 +3079,7 @@ Index: linux-stage/include/linux/lustre_iam.h
  typedef __u64 iam_ptr_t;
  
  /*
-@@ -41,45 +98,25 @@
+@@ -41,45 +101,25 @@ struct iam_frame {
        struct iam_entry *at;      /* target entry, found by binary search */
  };
  
@@ -2940,7 +3139,7 @@ Index: linux-stage/include/linux/lustre_iam.h
        /*
         * Returns pointer (in the same sense as pointer in index entry) to
         * the root node.
-@@ -102,8 +139,8 @@
+@@ -102,8 +142,8 @@ struct iam_descr {
        /*
         * Key comparison function. Returns -1, 0, +1.
         */
@@ -2951,7 +3150,7 @@ Index: linux-stage/include/linux/lustre_iam.h
        /*
         * Create new container.
         *
-@@ -111,25 +148,120 @@
+@@ -111,25 +151,109 @@ struct iam_descr {
         * contains single record with the smallest possible key.
         */
        int (*id_create)(struct iam_container *c);
@@ -3010,10 +3209,13 @@ Index: linux-stage/include/linux/lustre_iam.h
 +         *
 +         * Caller should assume that returned pointer is only valid
 +         * while leaf node is pinned and locked.*/
-+        struct iam_key *(*key)(struct iam_leaf *l, struct iam_key *k);
++        struct iam_key *(*key)(const struct iam_leaf *l, struct iam_key *k);
 +        /* return pointer to entry body. Pointer is valid while
 +           corresponding leaf node is locked and pinned. */
-+        struct iam_rec *(*rec)(struct iam_leaf *l);
++        struct iam_rec *(*rec)(const struct iam_leaf *l);
++
++        void (*key_set)(struct iam_leaf *l, const struct iam_key *k);
++        void (*rec_set)(struct iam_leaf *l, const struct iam_rec *r);
 +
 +        /*
 +         * Search leaf @l for a record with key @k or for a place
@@ -3021,15 +3223,15 @@ Index: linux-stage/include/linux/lustre_iam.h
 +         *
 +         * Scratch keys from @path can be used.
 +         */
-+        int (*lookup)(struct iam_leaf *l, struct iam_key *k);
++        int (*lookup)(struct iam_leaf *l, const struct iam_key *k);
 +
-+        int (*can_add)(struct iam_leaf *l,
-+                       struct iam_key *k, struct iam_rec *r);
++        int (*can_add)(const struct iam_leaf *l,
++                       const struct iam_key *k, const struct iam_rec *r);
 +        /*
 +         * add rec for a leaf
 +         */
 +        void (*rec_add)(struct iam_leaf *l,
-+                        struct iam_key *k, struct iam_rec *r);
++                        const struct iam_key *k, const struct iam_rec *r);
 +        /*
 +         * remove rec for a leaf
 +         */
@@ -3044,20 +3246,6 @@ Index: linux-stage/include/linux/lustre_iam.h
 +struct iam_path *iam_leaf_path(const struct iam_leaf *leaf);
 +struct iam_container *iam_leaf_container(const struct iam_leaf *leaf);
 +
-+struct iam_root {
-+        struct iam_root_info {
-+               u8 indirect_levels;
-+               u8 pad[3];
-+       } info;
-+       struct {} entries[0];
-+};
-+
-+#define IAM_LEAF_HEADER_MAGIC 0x1976
-+struct iam_leaf_head {
-+      __le16 ill_magic;
-+      __le16 ill_count;
-+};
-+
 +/*
 + * Parameters, describing a flavor of iam container.
 + */
@@ -3091,25 +3279,27 @@ Index: linux-stage/include/linux/lustre_iam.h
  };
  
  struct iam_container {
-@@ -149,6 +281,17 @@
- };
- /*
+@@ -142,10 +266,17 @@ struct iam_container {
+        * container flavor.
+        */
+       struct iam_descr *ic_descr;
++};
++
++/*
 + * description-specific part of iam_path. This is usually embedded into larger
 + * structure.
 + */
 +struct iam_path_descr {
-+      /*
+       /*
+-       * pointer to flavor-specific per-container data.
 +       * Scratch-pad area for temporary keys.
-+       */
+        */
+-      void             *ic_descr_data;
 +      struct iam_key        *ipd_key_scratch[DX_SCRATCH_KEYS];
-+};
-+
-+/*
-  * Structure to keep track of a path drilled through htree.
-  */
- struct iam_path {
-@@ -172,34 +315,232 @@
+ };
+ /*
+@@ -172,34 +303,238 @@ struct iam_path {
        /*
         * Leaf node: a child of ->ip_frame.
         */
@@ -3118,12 +3308,13 @@ Index: linux-stage/include/linux/lustre_iam.h
        /*
         * Key searched for.
         */
-       struct iam_key        *ip_key_target;
-       /*
+-      struct iam_key        *ip_key_target;
+-      /*
 -       * Scratch-pad area for temporary keys.
 -       */
 -      struct iam_key        *ip_key_scratch[DX_SCRATCH_KEYS];
--      /*
++      const struct iam_key  *ip_key_target;
+       /*
 -       * pointer to flavor-specific per-container data.
 +       * Description-specific data.
         */
@@ -3146,12 +3337,8 @@ Index: linux-stage/include/linux/lustre_iam.h
 +      struct dx_hash_info  *ipc_hinfo;
 +      struct dentry        *ipc_dentry;
 +      struct iam_path_descr ipc_descr;
- };
--int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r);
--int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k);
--int iam_update(handle_t *h, struct iam_container *c, struct iam_key *k, struct iam_rec *r);
--int iam_insert(handle_t *handle, struct iam_container *c, struct iam_key *k, struct iam_rec *r);
++};
++
 +/*
 + * iam cursor (iterator) api.
 + */
@@ -3164,8 +3351,12 @@ Index: linux-stage/include/linux/lustre_iam.h
 +      IAM_IT_DETACHED,
 +      /* iterator is above particular record in the container */
 +      IAM_IT_ATTACHED
-+};
-+
+ };
+-int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r);
+-int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k);
+-int iam_update(handle_t *h, struct iam_container *c, struct iam_key *k, struct iam_rec *r);
+-int iam_insert(handle_t *handle, struct iam_container *c, struct iam_key *k, struct iam_rec *r);
 +/*
 + * Flags controlling iterator functionality.
 + */
@@ -3217,6 +3408,10 @@ Index: linux-stage/include/linux/lustre_iam.h
 +
 +void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode);
 +void iam_path_compat_fini(struct iam_path_compat *path);
++
++struct iam_path_descr *iam_ipd_alloc(int keysize);
++void iam_ipd_free(struct iam_path_descr *ipd);
++
 +/*
 + * Initialize iterator to IAM_IT_DETACHED state.
 + *
@@ -3244,7 +3439,7 @@ Index: linux-stage/include/linux/lustre_iam.h
 + *                     (it_state(it) == IAM_IT_ATTACHED &&
 + *                      it_keycmp(it, iam_it_key_get(it, *), k) < 0))
 + */
-+int iam_it_get(struct iam_iterator *it, struct iam_key *k);
++int iam_it_get(struct iam_iterator *it, const struct iam_key *k);
 +
 +/*
 + * Duplicates iterator.
@@ -3256,7 +3451,7 @@ Index: linux-stage/include/linux/lustre_iam.h
 + *                     iam_it_rec_get(dst) == iam_it_rec_get(src) &&
 + *                     iam_it_key_get(dst, *1) == iam_it_key_get(src, *2))
 + */
-+void iam_it_dup(struct iam_iterator *dst, struct iam_iterator *src);
++void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src);
 +
 +/*
 + * Detach iterator. Does nothing it detached state.
@@ -3283,7 +3478,7 @@ Index: linux-stage/include/linux/lustre_iam.h
 + * precondition:  it_state(it) == IAM_IT_ATTACHED
 + * postcondition: it_state(it) == IAM_IT_ATTACHED
 + */
-+struct iam_rec *iam_it_rec_get(struct iam_iterator *it);
++struct iam_rec *iam_it_rec_get(const struct iam_iterator *it);
 +
 +/*
 + * Replace contents of record under iterator.
@@ -3300,7 +3495,8 @@ Index: linux-stage/include/linux/lustre_iam.h
 + * precondition:  it_state(it) == IAM_IT_ATTACHED
 + * postcondition: it_state(it) == IAM_IT_ATTACHED
 + */
-+struct iam_key *iam_it_key_get(struct iam_iterator *it, struct iam_key *k);
++struct iam_key *iam_it_key_get(const struct iam_iterator *it,
++                               struct iam_key *k);
 +
 +/*
 + * Insert new record with key @k and contents from @r, shifting records to the
@@ -3315,7 +3511,7 @@ Index: linux-stage/include/linux/lustre_iam.h
 + *                     !memcmp(iam_it_rec_get(it), r, ...))
 + */
 +int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
-+                    struct iam_key *k, struct iam_rec *r);
++                    const struct iam_key *k, const struct iam_rec *r);
 +/*
 + * Delete record under iterator.
 + *
@@ -3333,7 +3529,7 @@ Index: linux-stage/include/linux/lustre_iam.h
 + *                path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t)
 + * postcondition: it_state(it) == IAM_IT_ATTACHED
 + */
-+iam_pos_t iam_it_store(struct iam_iterator *it);
++iam_pos_t iam_it_store(const struct iam_iterator *it);
 +
 +/*
 + * Restore iterator from cookie.
@@ -3345,21 +3541,27 @@ Index: linux-stage/include/linux/lustre_iam.h
 + */
 +int iam_it_load(struct iam_iterator *it, iam_pos_t pos);
 +
-+int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r,
++int iam_lookup(struct iam_container *c, const struct iam_key *k,
++               struct iam_rec *r, struct iam_path_descr *pd);
++int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k,
 +             struct iam_path_descr *pd);
-+int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k,
-+             struct iam_path_descr *pd);
-+int iam_update(handle_t *h, struct iam_container *c, struct iam_key *k,
++int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k,
 +             struct iam_rec *r, struct iam_path_descr *pd);
-+int iam_insert(handle_t *handle, struct iam_container *c, struct iam_key *k,
++int iam_insert(handle_t *handle, struct iam_container *c,
++               const struct iam_key *k,
 +             struct iam_rec *r, struct iam_path_descr *pd);
  /*
   * Initialize container @c, acquires additional reference on @inode.
   */
-@@ -210,3 +551,155 @@
+@@ -210,3 +545,170 @@ int iam_container_init(struct iam_contai
   */
  void iam_container_fini(struct iam_container *c);
  
++/*
++ * Determine container format.
++ */
++int iam_container_setup(struct iam_container *c);
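The declarations above outline the intended life cycle of a container: iam_container_init() binds it to an inode, iam_container_setup() determines the on-disk format, iam_ipd_alloc()/iam_ipd_free() provide the scratch-key area that the reworked iam_lookup()/iam_insert()/iam_delete()/iam_update() wrappers now take explicitly, and iam_container_fini() drops the inode reference. A hedged sketch of a single lookup built only from those declarations follows; demo_lookup() is hypothetical, error handling is compressed, and how a caller-supplied descriptor interacts with iam_container_setup() is an assumption, not something the patch states.

static int demo_lookup(struct inode *inode, struct iam_descr *descr,
                       const struct iam_key *key, struct iam_rec *rec)
{
        struct iam_container cont;
        struct iam_path_descr *ipd;
        int result;

        result = iam_container_init(&cont, descr, inode);
        if (result != 0)
                return result;
        /*
         * If the format were not known up front, this is presumably where
         * iam_container_setup() would be called to guess it.
         */
        ipd = iam_ipd_alloc(descr->id_key_size);
        if (ipd != NULL) {
                result = iam_lookup(&cont, key, rec, ipd);
                iam_ipd_free(ipd);
        } else
                result = -ENOMEM;
        iam_container_fini(&cont);
        return result;
}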
++
 +#ifndef assert
 +#define assert(test) J_ASSERT(test)
 +#endif
@@ -3369,7 +3571,7 @@ Index: linux-stage/include/linux/lustre_iam.h
 +        return c->ic_descr;
 +}
 +
-+static inline struct iam_descr *iam_path_descr(struct iam_path *p)
++static inline struct iam_descr *iam_path_descr(const struct iam_path *p)
 +{
 +      return p->ip_container->ic_descr;
 +}
@@ -3391,6 +3593,12 @@ Index: linux-stage/include/linux/lustre_iam.h
 +      return c->ic_descr->id_ops->id_keycmp(c, k1, k2);
 +}
 +
++static inline void iam_reccpy(const struct iam_path *p, struct iam_rec *rec_dst,
++                            const struct iam_rec *rec_src)
++{
++      memcpy(rec_dst, rec_src, iam_path_descr(p)->id_rec_size);
++}
++
 +static inline void *iam_entry_off(struct iam_entry *entry, size_t off)
 +{
 +      return (void *)((char *)entry + off);
@@ -3461,7 +3669,7 @@ Index: linux-stage/include/linux/lustre_iam.h
 +                            frame->bh->b_data, frame == path->ip_frames);
 +}
 +
-+static inline struct iam_key *iam_path_key(struct iam_path *path, int nr)
++static inline struct iam_key *iam_path_key(const struct iam_path *path, int nr)
 +{
 +      assert(0 <= nr && nr < ARRAY_SIZE(path->ip_data->ipd_key_scratch));
 +      return path->ip_data->ipd_key_scratch[nr];
@@ -3478,10 +3686,6 @@ Index: linux-stage/include/linux/lustre_iam.h
 +                              u32 *block, int *err);
 +int split_index_node(handle_t *handle, struct iam_path *path);
 +
-+extern struct iam_leaf_operations lfix_leaf_ops;
-+extern struct iam_operations generic_iam_ops;
-+
-+
 +/*
 + * external
 + */
@@ -3508,7 +3712,15 @@ Index: linux-stage/include/linux/lustre_iam.h
 +struct iam_descr *iam_leaf_descr(const struct iam_leaf *leaf);
 +struct iam_leaf_operations *iam_leaf_ops(const struct iam_leaf *leaf);
 +
-+extern struct iam_leaf_operations iam_lfix_leaf_ops;
++
++struct iam_format {
++        int (*if_guess)(struct iam_container *c);
++        struct list_head if_linkage;
++};
++
++void iam_format_register(struct iam_format *fmt);
++
++void iam_lfix_format_init(void);
 +
 +/* __LINUX_LUSTRE_IAM_H__ */
 +#endif
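The struct iam_format / iam_format_register() declarations near the end of the header are the hook for self-describing containers: each on-disk flavor registers an if_guess() callback (iam_lfix_format_init() above being the case shipped with this change), and iam_container_setup() presumably walks the registered formats until one recognizes the container. A sketch of the shape such a format module could take follows; it is not the patch's code, demo_guess() is hypothetical, and how if_guess() signals "mine"/"not mine" is an assumption.

static int demo_guess(struct iam_container *c)
{
        /*
         * A real callback would read the container's 0-th block through
         * c->ic_object and check a magic/meta-data header, claiming the
         * container only when the layout matches.
         */
        return 0;
}

static struct iam_format demo_format = {
        .if_guess = demo_guess,
};

static void demo_format_init(void)
{
        /* called once at initialization, like iam_lfix_format_init() */
        iam_format_register(&demo_format);
}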
index ea9a3bb..2976c85 100644 (file)
@@ -605,7 +605,6 @@ __mdd_index_delete(const struct lu_context *ctxt, struct mdd_device *mdd,
         mdd_lock2(ctxt, pobj, obj);
 
         rc = next->do_index_ops->dio_delete(ctxt, next,
-                                            (const struct dt_rec *)mdd_object_getfid(obj),
                                             (struct dt_key *)name, handle);
         mdd_unlock2(ctxt, pobj, obj);
 
@@ -940,7 +939,8 @@ static struct lu_device_type mdd_device_type = {
         .ldt_ops  = &mdd_device_type_ops
 };
 
-static void *mdd_key_init(const struct lu_context *ctx)
+static void *mdd_key_init(const struct lu_context *ctx,
+                          struct lu_context_key *key)
 {
         struct mdd_thread_info *info;
 
@@ -950,7 +950,8 @@ static void *mdd_key_init(const struct lu_context *ctx)
         return info;
 }
 
-static void mdd_key_fini(const struct lu_context *ctx, void *data)
+static void mdd_key_fini(const struct lu_context *ctx,
+                         struct lu_context_key *key, void *data)
 {
         struct mdd_thread_info *info = data;
         OBD_FREE_PTR(info);
index 4dd3d63..bd5333f 100644 (file)
@@ -1971,7 +1971,8 @@ static struct lu_device *mdt_device_alloc(const struct lu_context *ctx,
  * context key constructor/destructor
  */
 
-static void *mdt_thread_init(const struct lu_context *ctx)
+static void *mdt_thread_init(const struct lu_context *ctx,
+                             struct lu_context_key *key)
 {
         struct mdt_thread_info *info;
 
@@ -1987,7 +1988,8 @@ static void *mdt_thread_init(const struct lu_context *ctx)
         return info;
 }
 
-static void mdt_thread_fini(const struct lu_context *ctx, void *data)
+static void mdt_thread_fini(const struct lu_context *ctx,
+                            struct lu_context_key *key, void *data)
 {
         struct mdt_thread_info *info = data;
         OBD_FREE_PTR(info);
index 10a5b23..5b67287 100644 (file)
@@ -187,3 +187,7 @@ struct dt_object *dt_store_open(const struct lu_context *ctx,
         return child;
 }
 EXPORT_SYMBOL(dt_store_open);
+
+const struct dt_index_features dt_directory_features;
+EXPORT_SYMBOL(dt_directory_features);
+
index 9a57cd3..dd661f1 100644 (file)
@@ -609,7 +609,7 @@ static void keys_fini(struct lu_context *ctx)
                                 LASSERT(key->lct_fini != NULL);
                                 LASSERT(key->lct_used > 1);
 
-                                key->lct_fini(ctx, ctx->lc_value[i]);
+                                key->lct_fini(ctx, key, ctx->lc_value[i]);
                                 key->lct_used--;
                                 ctx->lc_value[i] = NULL;
                         }
@@ -637,7 +637,7 @@ static int keys_init(struct lu_context *ctx)
                                 LASSERT(key->lct_init != NULL);
                                 LASSERT(key->lct_index == i);
 
-                                value = key->lct_init(ctx);
+                                value = key->lct_init(ctx, key);
                                 if (IS_ERR(value)) {
                                         keys_fini(ctx);
                                         return PTR_ERR(value);
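The hunks above thread the owning struct lu_context_key through the constructor and destructor callbacks: lct_init() and lct_fini() now receive the key, which is what lets a callback recover per-key parameters with container_of(), as osd_index_key_init() does further down. A sketch of a key definition using the new signatures, modeled on the mdd/mdt/osd updates in this commit, is shown below; demo_thread_info and demo_thread_key are illustrative, and registration of the key is omitted.

struct demo_thread_info {
        int dti_scratch;
};

static void *demo_key_init(const struct lu_context *ctx,
                           struct lu_context_key *key)
{
        struct demo_thread_info *info;

        OBD_ALLOC_PTR(info);
        if (info == NULL)
                info = ERR_PTR(-ENOMEM);
        return info;
}

static void demo_key_fini(const struct lu_context *ctx,
                          struct lu_context_key *key, void *data)
{
        struct demo_thread_info *info = data;

        OBD_FREE_PTR(info);
}

static struct lu_context_key demo_thread_key = {
        .lct_init = demo_key_init,
        .lct_fini = demo_key_fini
};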
index d7a8feb..f59a395 100644 (file)
@@ -85,8 +85,10 @@ static int   osd_object_print  (const struct lu_context *ctx,
                                 struct seq_file *f, const struct lu_object *o);
 static void  osd_device_free   (const struct lu_context *ctx,
                                 struct lu_device *m);
-static void *osd_key_init      (const struct lu_context *ctx);
-static void  osd_key_fini      (const struct lu_context *ctx, void *data);
+static void *osd_key_init      (const struct lu_context *ctx,
+                                struct lu_context_key *key);
+static void  osd_key_fini      (const struct lu_context *ctx,
+                                struct lu_context_key *key, void *data);
 static int   osd_has_index     (struct osd_object *obj);
 static void  osd_object_init0  (struct osd_object *obj);
 static int   osd_device_init   (const struct lu_context *ctx,
@@ -108,9 +110,22 @@ static int   osd_index_insert  (const struct lu_context *ctxt,
                                 const struct dt_rec *rec,
                                 const struct dt_key *key,
                                 struct thandle *handle);
+static int   osd_index_delete  (const struct lu_context *ctxt,
+                                struct dt_object *dt, const struct dt_key *key,
+                                struct thandle *handle);
 static int   osd_index_probe   (const struct lu_context *ctxt,
-                                struct dt_object *dt,
+                                struct osd_object *o,
                                 const struct dt_index_features *feat);
+static int   osd_index_try     (const struct lu_context *ctx,
+                                struct dt_object *dt,
+                                const struct dt_index_features *feat,
+                                struct dt_index_cookie *cookie);
+static void *osd_index_key_init(const struct lu_context *ctx,
+                                struct lu_context_key *key);
+static void  osd_index_key_fini(const struct lu_context *ctx,
+                                struct lu_context_key *key, void *data);
+static void  osd_index_fini    (const struct lu_context *ctx,
+                                struct dt_index_cookie *cookie);
 
 static struct osd_object  *osd_obj          (const struct lu_object *o);
 static struct osd_device  *osd_dev          (const struct lu_device *d);
@@ -132,6 +147,9 @@ static struct inode       *osd_iget         (struct osd_thread_info *info,
 static struct super_block *osd_sb           (const struct osd_device *dev);
 static journal_t          *osd_journal      (const struct osd_device *dev);
 
+static struct dt_index_cookie *osd_index_init(const struct lu_context *ctx,
+                                        const struct dt_index_features *feat);
+
 static struct lu_device_type_operations osd_device_type_ops;
 static struct lu_device_type            osd_device_type;
 static struct lu_object_operations      osd_lu_obj_ops;
@@ -210,7 +228,7 @@ static void osd_object_init0(struct osd_object *obj)
 {
         LASSERT(obj->oo_inode != NULL);
 
-        if (osd_has_index(obj))
+        if (S_ISDIR(obj->oo_inode->i_mode))
                 obj->oo_dt.do_index_ops = &osd_index_ops;
         else
                 obj->oo_dt.do_body_ops = &osd_body_ops;
@@ -240,8 +258,11 @@ static void osd_object_delete(const struct lu_context *ctx, struct lu_object *l)
 {
         struct osd_object *o = osd_obj(l);
 
-        if (o->oo_inode != NULL)
+        if (o->oo_inode != NULL) {
+                if (o->oo_container.ic_object == o->oo_inode)
+                        iam_container_fini(&o->oo_container);
                 iput(o->oo_inode);
+        }
 }
 
 static int osd_inode_unlinked(const struct inode *inode)
@@ -388,6 +409,8 @@ static struct dt_device_operations osd_dt_ops = {
         .dt_statfs      = osd_statfs,
         .dt_trans_start = osd_trans_start,
         .dt_trans_stop  = osd_trans_stop,
+        .dt_index_init  = osd_index_init,
+        .dt_index_fini  = osd_index_fini
 };
 
 static void osd_object_lock(const struct lu_context *ctx, struct dt_object *dt,
@@ -571,37 +594,81 @@ static int osd_object_create(const struct lu_context *ctx, struct dt_object *dt,
 }
 
 static struct dt_object_operations osd_obj_ops = {
-        .do_object_lock   = osd_object_lock,
-        .do_object_unlock = osd_object_unlock,
-        .do_attr_get      = osd_attr_get,
-        .do_object_create = osd_object_create,
+        .do_object_lock      = osd_object_lock,
+        .do_object_unlock    = osd_object_unlock,
+        .do_attr_get         = osd_attr_get,
+        .do_object_create    = osd_object_create,
+        .do_object_index_try = osd_index_try
 };
 
 static struct dt_body_operations osd_body_ops = {
 };
-#if 0
-static int osd_index_delete(struct lu_context *ctxt, struct dt_object *dt,
-                            const struct lu_fid *fid, const char *name,
-                            struct thandle *th)
-{
-        struct osd_object *osj = osd_dt_obj(dt);
-        struct osd_thandle *oh;
-        struct iam_path_descr ipd;
-        int rc;
 
-        ENTRY;
-        oh = container_of0(th, struct osd_thandle, ot_super);
-        LASSERT(oh->ot_handle != NULL);
-        if (osj->oo_container)
-                rc = iam_delete(oh->ot_handle, osj->oo_container, name, &ipd);
-
-        RETURN(rc);
-}
-#endif
 /*
  * Index operations.
  */
 
+struct osd_index_cookie {
+        struct lu_context_key oic_key;
+        int                   oic_size;
+};
+
+static void *osd_index_key_init(const struct lu_context *ctx,
+                                struct lu_context_key *key)
+{
+        struct osd_index_cookie *cookie;
+        void *area;
+
+        cookie = container_of(key, struct osd_index_cookie, oic_key);
+        area = iam_ipd_alloc(cookie->oic_size);
+        if (area == NULL)
+                area = ERR_PTR(-ENOMEM);
+        return area;
+}
+
+static void osd_index_key_fini(const struct lu_context *ctx,
+                               struct lu_context_key *key, void *data)
+{
+        iam_ipd_free(data);
+}
+
+static struct dt_index_cookie *osd_index_init(const struct lu_context *ctx,
+                                          const struct dt_index_features *feat)
+{
+        struct osd_index_cookie *cookie;
+
+        OBD_ALLOC_PTR(cookie);
+        if (cookie != NULL) {
+                int result;
+                int keysize;
+
+                keysize = feat->dif_keysize_max;
+                /*
+                 * XXX Variable keysize is not yet supported.
+                 */
+                LASSERT((feat->dif_flags & (DT_IND_VARKEY|DT_IND_VARREC|
+                                            DT_IND_NONUNQ)) == 0);
+                LASSERT(keysize > 0);
+                cookie->oic_key.lct_init = osd_index_key_init;
+                cookie->oic_key.lct_fini = osd_index_key_fini;
+                cookie->oic_size = keysize;
+                result = lu_context_key_register(&cookie->oic_key);
+                if (result != 0) {
+                        OBD_FREE_PTR(cookie);
+                        cookie = ERR_PTR(result);
+                }
+        } else
+                cookie = ERR_PTR(-ENOMEM);
+        return (struct dt_index_cookie *)cookie;
+}
+
+static void osd_index_fini(const struct lu_context *ctx,
+                           struct dt_index_cookie *dcook)
+{
+        struct osd_index_cookie *cookie = (void *)dcook;
+
+        lu_context_key_degister(&cookie->oic_key);
+        OBD_FREE_PTR(cookie);
+}
+
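
Editorial analogue (assumption, not how lu_context keys are implemented): osd_index_init() above registers one lu_context_key per index, and its init/fini callbacks hand every thread context a private iam_path_descr sized from the index key size. The same "one scratch buffer per thread per index" behaviour can be modelled in user space with POSIX thread-specific data; all names below are hypothetical:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Plays the role of osd_index_cookie: one key plus a per-index size. */
struct scratch_cookie {
        pthread_key_t sc_key;
        size_t        sc_size;
};

static int scratch_cookie_init(struct scratch_cookie *c, size_t keysize)
{
        c->sc_size = keysize;
        /* free() acts as the per-thread destructor (cf. osd_index_key_fini) */
        return pthread_key_create(&c->sc_key, free);
}

/* Lazily allocate the calling thread's scratch (cf. osd_index_key_init). */
static void *scratch_get(struct scratch_cookie *c)
{
        void *area = pthread_getspecific(c->sc_key);

        if (area == NULL) {
                area = malloc(c->sc_size);
                if (area != NULL && pthread_setspecific(c->sc_key, area) != 0) {
                        free(area);
                        area = NULL;
                }
        }
        return area;
}

int main(void)
{
        struct scratch_cookie c;
        void *area;

        if (scratch_cookie_init(&c, 8) != 0)
                return 1;
        area = scratch_get(&c);
        printf("per-thread scratch at %p\n", area);
        free(area);
        pthread_key_delete(c.sc_key);
        return 0;
}
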
 #if OI_IN_MEMORY
 
 /*
@@ -641,14 +708,66 @@ static int osd_build_fid(struct osd_device *osd,
         return result;
 }
 
-/*
- * XXX temporary stub stuff for ipd
- */
-static void ipd_init(struct iam_path_descr *ipd, __u64 *key)
+static int osd_index_probe(const struct lu_context *ctxt, struct osd_object *o,
+                           const struct dt_index_features *feat)
 {
-        int i;
-        for (i = 0; i < DX_SCRATCH_KEYS; i++, key++)
-                ipd->ipd_key_scratch[i] = key;
+        if (feat == &dt_directory_features)
+                return 1;
+        else
+                return 0; /* nothing yet is supported */
+}
+
+static int osd_index_try(const struct lu_context *ctx, struct dt_object *dt,
+                         const struct dt_index_features *feat,
+                         struct dt_index_cookie *dcook)
+{
+        int result;
+        struct osd_object *obj = osd_dt_obj(dt);
+        struct osd_index_cookie *cookie = (void *)dcook;
+
+        LASSERT(lu_object_exists(ctx, &dt->do_lu));
+
+        if (!osd_has_index(obj)) {
+                result = iam_container_init(&obj->oo_container,
+                                            &obj->oo_descr, obj->oo_inode);
+                if (result == 0) {
+                        result = iam_container_setup(&obj->oo_container);
+                        if (result == 0) {
+                                result = osd_index_probe(ctx, obj, feat);
+                                if (result == 0) {
+                                        dt->do_index_ops = &osd_index_ops;
+                                        obj->oo_cookie_key = &cookie->oic_key;
+                                }
+                        }
+                }
+        } else
+                result = 0;
+        return result;
+}
+
+static int osd_index_delete(const struct lu_context *ctxt, struct dt_object *dt,
+                            const struct dt_key *key, struct thandle *handle)
+{
+        struct osd_object     *obj = osd_dt_obj(dt);
+        struct osd_thandle    *oh;
+        struct iam_path_descr *ipd;
+        int rc;
+
+        ENTRY;
+
+        LASSERT(lu_object_exists(ctxt, &dt->do_lu));
+        LASSERT(obj->oo_container.ic_object == obj->oo_inode);
+
+        ipd = lu_context_key_get(ctxt, obj->oo_cookie_key);
+        LASSERT(ipd != NULL);
+
+        oh = container_of0(handle, struct osd_thandle, ot_super);
+        LASSERT(oh->ot_handle != NULL);
+
+        rc = iam_delete(oh->ot_handle, &obj->oo_container,
+                        (const struct iam_key *)key, ipd);
+
+        RETURN(rc);
 }
 
 /*
@@ -658,21 +777,24 @@ static void ipd_init(struct iam_path_descr *ipd, __u64 *key)
 static int osd_index_lookup(const struct lu_context *ctxt, struct dt_object *dt,
                             struct dt_rec *rec, const struct dt_key *key)
 {
-#if 0
-        struct osd_object *osj = osd_dt_obj(dt);
-        struct osd_thandle *oh;
-        struct iam_path_descr ipd;
-        __u64 scratch_key[DX_SCRATCH_KEYS];
-        int rc = 0;
+        struct osd_object     *obj = osd_dt_obj(dt);
+if (!S_ISDIR(obj->oo_inode->i_mode)) {
+        struct iam_path_descr *ipd;
+        int rc;
 
         ENTRY;
 
-        ipd_init(&ipd, scratch_key);
-        if (osj->oo_container)
-                rc = iam_lookup(osj->oo_container, key, rec, &ipd);
+        LASSERT(lu_object_exists(ctxt, &dt->do_lu));
+        LASSERT(obj->oo_container.ic_object == obj->oo_inode);
+
+        ipd = lu_context_key_get(ctxt, obj->oo_cookie_key);
+        LASSERT(ipd != NULL);
+
+        rc = iam_lookup(&obj->oo_container, (const struct iam_key *)key,
+                        (struct iam_rec *)rec, ipd);
 
         RETURN(rc);
-#else
+} else {
         struct osd_object      *obj  = osd_dt_obj(dt);
         struct osd_device      *osd  = osd_obj2dev(obj);
         struct osd_thread_info *info = lu_context_key_get(ctxt, &osd_key);
@@ -728,7 +850,7 @@ static int osd_index_lookup(const struct lu_context *ctxt, struct dt_object *dt,
                 result = -ENOMEM;
         dput(parent);
         return result;
-#endif
+}
 }
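
Stand-alone model (illustration only; hypothetical names, and key comparison by memcmp purely for the demo): in the non-directory branch above the object is an lfix iam container, i.e. an index of fixed-size keys and fixed-size records, and dt_key/dt_rec are opaque pointers cast straight to iam_key/iam_rec. A rough user-space model of a fixed-size (key, record) lookup within one block; the real iam code additionally walks the tree through the root and index blocks:

#include <stdio.h>
#include <string.h>

/*
 * One block holds 'count' entries, each keysize key bytes immediately
 * followed by recsize record bytes (the fixed-size layout that "lfix"
 * stands for, as far as this patch shows).
 */
static const void *lfix_lookup(const void *entries, int count,
                               size_t keysize, size_t recsize,
                               const void *key)
{
        int lo = 0, hi = count - 1;

        while (lo <= hi) {
                int mid = (lo + hi) / 2;
                const char *ent = (const char *)entries +
                                  mid * (keysize + recsize);
                int cmp = memcmp(key, ent, keysize);

                if (cmp == 0)
                        return ent + keysize;   /* record follows the key */
                if (cmp < 0)
                        hi = mid - 1;
                else
                        lo = mid + 1;
        }
        return NULL;
}

int main(void)
{
        /* two entries: key ...01 -> rec 0xAA, key ...02 -> rec 0xBB */
        unsigned char block[2][16] = {
                { [7] = 1, [8] = 0xAA },
                { [7] = 2, [8] = 0xBB },
        };
        unsigned char key[8] = { [7] = 2 };
        const unsigned char *rec = lfix_lookup(block, 2, 8, 8, key);

        printf("rec[0] = %#x\n", rec != NULL ? rec[0] : 0);
        return 0;
}
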
 
 static int osd_add_rec(struct osd_thread_info *info, struct osd_device *dev,
@@ -769,23 +891,28 @@ static int osd_index_insert(const struct lu_context *ctx, struct dt_object *dt,
                             const struct dt_rec *rec, const struct dt_key *key,
                             struct thandle *th)
 {
-#if 0
-        struct osd_object *osj = osd_dt_obj(dt);
-        struct osd_thandle *oh;
-        struct iam_path_descr ipd;
-        __u64 scratch_key[DX_SCRATCH_KEYS];
-        int rc = 0;
+        struct osd_object     *obj = osd_dt_obj(dt);
+if (!S_ISDIR(obj->oo_inode->i_mode)) {
+        struct osd_thandle    *oh;
+        struct iam_path_descr *ipd;
+        int rc;
 
         ENTRY;
 
-        ipd_init(&ipd, scratch_key);
+        LASSERT(lu_object_exists(ctx, &dt->do_lu));
+        LASSERT(obj->oo_container.ic_object == obj->oo_inode);
+
+        ipd = lu_context_key_get(ctx, obj->oo_cookie_key);
+        LASSERT(ipd != NULL);
+
         oh = container_of0(th, struct osd_thandle, ot_super);
         LASSERT(oh->ot_handle != NULL);
-        if (osj->oo_container)
-                rc = iam_insert(oh->ot_handle, osj->oo_container, key, rec, &ipd);
+        rc = iam_insert(oh->ot_handle, &obj->oo_container,
+                        (const struct iam_key *)key,
+                        (struct iam_rec *)rec, ipd);
 
         RETURN(rc);
-#else
+} else {
         const struct lu_fid *fid  = (const struct lu_fid *)rec;
         const char          *name = (const char *)key;
 
@@ -820,71 +947,13 @@ static int osd_index_insert(const struct lu_context *ctx, struct dt_object *dt,
         } else
                 result = PTR_ERR(luch);
         return result;
-#endif
 }
-
-const struct dt_index_features dt_directory_features;
-
-static int osd_index_probe(const struct lu_context *ctxt, struct dt_object *dt,
-                           const struct dt_index_features *feat)
-{
-        struct osd_object *obj = osd_dt_obj(dt);
-
-        if (feat == &dt_directory_features)
-                return 1;
-        else
-                return 0; /* nothing yet is supported */
-}
-#define osd_fld_name "fld_iam"
-static int osd_index_init(const struct lu_context *ctx,
-                          struct dt_object *dt_obj, void *container,
-                          void *param)
-{
-        struct osd_object *obj = osd_dt_obj(dt_obj);
-        struct osd_device *osd = osd_obj2dev(obj);
-
-        /*
-         * XXX nikita: not yet.
-         */
-        return 0;
-
-        if (!obj->oo_container) {
-                /*stub for fld_iam*/
-                struct dentry *dentry;
-
-                dentry = osd_open(osd_sb(osd)->s_root, osd_fld_name,
-                                  S_IFREG);
-                if (IS_ERR(dentry)) {
-                        CERROR("can not open %s, rc = %ld\n", osd_fld_name,
-                                PTR_ERR(dentry));
-                        return (PTR_ERR(dentry));
-                }
-                obj->oo_inode = dentry->d_inode;
-                obj->oo_dt.do_index_ops = &osd_index_ops;
-                iam_container_init(container, param, obj->oo_inode);
-                obj->oo_container = container;
-                dput(dentry);
-        }
-
-        return 0;
-}
-
-static int osd_index_fini(struct dt_object *dt_obj)
-{
-        struct osd_object *obj = osd_dt_obj(dt_obj);
-
-        if (obj->oo_container) {
-                iam_container_fini(obj->oo_container);
-                OBD_FREE(obj->oo_container, sizeof(struct iam_container));
-                obj->oo_container = NULL;
-        }
-        return 0;
 }
 
 static struct dt_index_operations osd_index_ops = {
         .dio_lookup = osd_index_lookup,
         .dio_insert = osd_index_insert,
-        .dio_probe  = osd_index_probe,
+        .dio_delete = osd_index_delete
 };
 
 /*
@@ -905,7 +974,8 @@ static struct lu_context_key osd_key = {
         .lct_fini = osd_key_fini
 };
 
-static void *osd_key_init(const struct lu_context *ctx)
+static void *osd_key_init(const struct lu_context *ctx,
+                          struct lu_context_key *key)
 {
         struct osd_thread_info *info;
 
@@ -915,7 +985,8 @@ static void *osd_key_init(const struct lu_context *ctx)
         return info;
 }
 
-static void osd_key_fini(const struct lu_context *ctx, void *data)
+static void osd_key_fini(const struct lu_context *ctx,
+                         struct lu_context_key *key, void *data)
 {
         struct osd_thread_info *info = data;
         OBD_FREE_PTR(info);
@@ -1252,7 +1323,7 @@ static journal_t *osd_journal(const struct osd_device *dev)
 
 static int osd_has_index(struct osd_object *obj)
 {
-        return S_ISDIR(obj->oo_inode->i_mode);
+        return obj->oo_dt.do_index_ops != NULL;
 }
 
 static struct lu_object_operations osd_lu_obj_ops = {
index 2fcffaa..1ad1ec9 100644 (file)
@@ -33,6 +33,7 @@
 
 /* struct rw_semaphore */
 #include <linux/rwsem.h>
+#include <linux/lustre_iam.h>
 
 #include <dt_object.h>
 #include "osd_oi.h"
@@ -41,14 +42,16 @@ struct inode;
 struct dentry;
 
 struct osd_object {
-        struct dt_object     oo_dt;
+        struct dt_object       oo_dt;
         /*
          * Inode for file system object represented by this osd_object. This
          * inode is pinned for the whole duration of lu_object life.
          */
-        struct inode        *oo_inode;
-        struct rw_semaphore  oo_sem;
-        void                *oo_container;
+        struct inode          *oo_inode;
+        struct rw_semaphore    oo_sem;
+        struct iam_container   oo_container;
+        struct iam_descr       oo_descr;
+        struct lu_context_key *oo_cookie_key;
 };
 
 /*
@@ -82,7 +85,7 @@ struct osd_thread_info {
          * XXX temporary: for ->i_op calls.
          */
         struct qstr         oti_str;
-        void                *scratch_key;        
+        void                *scratch_key;
 };
 
 #endif /* __KERNEL__ */
index eb43617..3b53350 100644 (file)
@@ -17,7 +17,7 @@ rootsbin_PROGRAMS = mount.lustre
 sbin_PROGRAMS = lctl obdio obdbarrier lload wirecheck wiretest \
        mount_lustre mkfs_lustre mkfs.lustre \
        tunefs_lustre tunefs.lustre l_getgroups
-bin_PROGRAMS = lfs llog_reader
+bin_PROGRAMS = lfs llog_reader create_iam
 lib_LIBRARIES = liblustreapi.a
 sbin_SCRIPTS = $(sbin_scripts)
 bin_SCRIPTS = $(bin_scripts)
@@ -44,6 +44,7 @@ wiretest_SOURCES = wiretest.c
 
 obdio_SOURCES = obdio.c obdiolib.c obdiolib.h
 obdbarrier_SOURCES = obdbarrier.c obdiolib.c obdiolib.h
+create_iam_SOURCES = create_iam.c
 
 llog_reader_SOURCES = llog_reader.c
 llog_reader_LDADD := $(LIBPTLCTL)
index 51a3bc8..8f514c1 100644 (file)
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  create_iam.c
+ *  User-level tool for creation of iam files.
+ *
+ *  Copyright (c) 2006 Cluster File Systems, Inc.
+ *   Author: Wang Di <wangdi@clusterfs.com>
+ *   Author: Nikita Danilov <nikita@clusterfs.com>
+ *
+ *   This file is part of the Lustre file system, http://www.lustre.org
+ *   Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ *   You may have signed or agreed to another license before downloading
+ *   this software.  If so, you are bound by the terms and conditions
+ *   of that agreement, and the following does not apply to you.  See the
+ *   LICENSE file included with this distribution for more information.
+ *
+ *   If you did not agree to a different license, then this copy of Lustre
+ *   is open source software; you can redistribute it and/or modify it
+ *   under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   In either case, Lustre is distributed in the hope that it will be
+ *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   license text for more details.
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
 #include <stdio.h>
 #include <fcntl.h>
 #include <string.h>
 #include <errno.h>
 
-void usage(char *prog)
+#include <sys/types.h>
+
+#ifdef HAVE_ENDIAN_H
+#include <endian.h>
+#endif
+
+#include <libcfs/libcfs.h>
+
+void usage(void)
 {
-        printf("usage: %s [-s] filename\n", prog);
+        printf("usage: create_iam "
+               "[-h] [-k <keysize>] [-r <recsize>] [-b <blocksize>] [-p <ptrsize>] [-v]\n");
 }
 
-struct iam_root {
-        struct iam_root_info {
-                unsigned char indirect_levels;
-                unsigned char pad[3]; 
-        } info;
-        struct {} entries[0];
+enum {
+        IAM_LFIX_ROOT_MAGIC = 0xbedabb1edULL
 };
 
-struct iam_entry {
-       unsigned long long ie_key;
-       unsigned long      ie_index;
+struct iam_lfix_root {
+        u_int64_t  ilr_magic;
+        u_int16_t  ilr_keysize;
+        u_int16_t  ilr_recsize;
+        u_int16_t  ilr_indirect_levels;
+        u_int16_t  ilr_padding;
 };
 
-struct leaf_header {
-       unsigned short   lh_magic;
-       unsigned short   lh_count;
+enum {
+        IAM_LEAF_HEADER_MAGIC = 0x1976
 };
 
-struct leaf_entry {
-       unsigned long long le_key;
-       unsigned long long le_rec;
+struct iam_leaf_head {
+        u_int16_t ill_magic;
+        u_int16_t ill_count;
 };
 
 #define LEAF_HEAD_MAGIC 0x1976
 int main(int argc, char **argv)
 {
-       struct iam_root root;
-       struct iam_entry ie;
-       struct leaf_header header;
-       struct leaf_entry le;
-       char buf[4096];
-       int fd, rc, file_arg = 1;
-
-        memset(buf, 0, 4096);
-
-        if (argc < 2 || argc > 3) {
-                usage(argv[0]);
-                exit(1);
+        int rc;
+        int opt;
+        int blocksize = 4096;
+        int keysize   = 8;
+        int recsize   = 8;
+        int ptrsize   = 4;
+        int verbose   = 0;
+        void *buf;
+        struct iam_lfix_root *root;
+        struct iam_leaf_head *head;
+        void *entry;
+
+        do {
+                opt = getopt(argc, argv, "hb:k:r:p:v");
+                switch (opt) {
+                case 'v':
+                        verbose++;
+                case -1:
+                        break;
+                case 'b':
+                        blocksize = atoi(optarg);
+                        break;
+                case 'k':
+                        keysize = atoi(optarg);
+                        break;
+                case 'r':
+                        recsize = atoi(optarg);
+                        break;
+                case 'p':
+                        ptrsize = atoi(optarg);
+                        break;
+                case '?':
+                default:
+                        fprintf(stderr, "Unable to parse options.\n");
+                case 'h':
+                        usage();
+                        return 0;
+                }
+        } while (opt != -1);
+
+        if (ptrsize != 4 && ptrsize != 8) {
+                fprintf(stderr, "Invalid ptrsize (%i). "
+                        "Only 4 and 8 are supported\n", ptrsize);
+                return 1;
+        }
+
+        if (blocksize <= 100 || keysize < 1 || recsize < 0) {
+                fprintf(stderr, "Record, key or block size too small\n");
+                return 1;
         }
 
-       fd = open(argv[file_arg], O_RDWR | O_TRUNC | O_CREAT, 0644);
-        if (fd == -1) {
-                printf("Error opening %s %s\n", argv[1], strerror(errno));
-                exit(1);
+        if (keysize + recsize + sizeof(struct iam_leaf_head) > blocksize / 3) {
+                fprintf(stderr, "Record and key too large for the block size\n");
+                return 1;
+        }
+        if (verbose > 0) {
+                fprintf(stderr,
+                        "key: %i, rec: %i, ptr: %i, block: %i\n",
+                        keysize, recsize, ptrsize, blocksize);
+        }
+        buf = malloc(blocksize);
+        if (buf == NULL) {
+                fprintf(stderr, "Unable to allocate %i bytes\n", blocksize);
+                return 1;
+        }
+
+        root = memset(buf, 0, blocksize);
+
+        *root = (struct iam_lfix_root) {
+                .ilr_magic           = cpu_to_le64(IAM_LFIX_ROOT_MAGIC),
+                .ilr_keysize         = cpu_to_le16(keysize),
+                .ilr_recsize         = cpu_to_le16(recsize),
+                .ilr_indirect_levels = cpu_to_le16(1)
+        };
+        entry = root + 1;
+        /*
+         * Entry format is <key> followed by <ptr>. In the minimal tree
+         * consisting of a root and single node, <key> is a minimal possible
+         * key.
+         *
+         * XXX: this key is hard-coded to be a sequence of 0's.
+         */
+        entry += keysize;
+        /* now @entry points to <ptr> */
+        if (ptrsize == 4)
+                *(u_int32_t *)entry = cpu_to_le32(1);
+        else
+                *(u_int64_t *)entry = cpu_to_le64(1);
+        rc = write(1, buf, blocksize);
+        if (rc != blocksize) {
+                fprintf(stderr, "Unable to write root node: %m (%i)\n", rc);
+                return 1;
+        }
+
+        /* form leaf */
+        head = memset(buf, 0, blocksize);
+        *head = (struct iam_leaf_head) {
+                .ill_magic = cpu_to_le16(IAM_LEAF_HEADER_MAGIC),
+                /*
+                 * Leaf contains an entry with the smallest possible key
+                 * (created by zeroing).
+                 */
+                .ill_count = cpu_to_le16(1),
+        };
+
+        rc = write(1, buf, blocksize);
+        if (rc != blocksize) {
+                fprintf(stderr, "Unable to write leaf node: %m (%i)\n", rc);
+                return 1;
         }
-       
-       /*create the root entry*/
-       memset(buf, 0, 4096);
-       root.info.indirect_levels = 0;
-       memcpy(buf, &root, sizeof(struct iam_root));
-
-       /*insert the dx_limit compatiable structure to make 
-        *iam compatiable with dx code*/        
-       header.lh_count = 2;
-       
-       memcpy (buf + sizeof(struct iam_root), &header,
-               sizeof(struct iam_entry));
-       
-       ie.ie_key = 0x0;
-       ie.ie_index = 1;
-
-       memcpy (buf + sizeof(struct iam_root) + sizeof(struct iam_entry), &ie,
-               sizeof(struct iam_entry));
-       rc = write(fd, buf, sizeof(buf));
-       if (rc < 0) {
-               printf("Error Writing %s %s \n", argv[1], strerror(errno));
-               close(fd);
-               exit(rc);
-       }
-       
-       /*create the first index entry*/        
-       memset(buf, 0, 4096);
-       header.lh_magic = LEAF_HEAD_MAGIC;
-       header.lh_count = 1; 
-       memcpy(buf, &header, sizeof(struct leaf_header));
-
-       /*insert the lowest key of the leaf*/
-       le.le_key = 0; /*tmp assume 0 is the lowest key of the leaf*/ 
-       le.le_rec = 0;
-
-       memcpy(buf + sizeof(struct leaf_header), &le, 
-              sizeof(struct leaf_entry)); 
-       rc = write(fd, buf, sizeof(buf));
-       if (rc < 0) {
-               printf("Error Writing %s %s \n", argv[1], strerror(errno));
-               close(fd);
-               exit(rc);
-       }
-       close(fd);
-       exit(0);
+        return 0;
 }
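
Supplementary sketch (editorial, not part of the patch): create_iam writes two blocks to stdout, a root block that starts with struct iam_lfix_root and carries one (key, ptr) entry, followed by a leaf block that starts with struct iam_leaf_head and one zeroed (key, record) entry, so a file is normally produced with a redirection such as './create_iam -k 8 -r 8 > iam_file'. A small reader that sanity-checks the root block of such a file; the struct definition is copied from the tool above, the program name is hypothetical, and a little-endian host is assumed so the cpu_to_le*() fields can be read back directly:

#include <stdio.h>
#include <sys/types.h>

#define IAM_LFIX_ROOT_MAGIC 0xbedabb1edULL

struct iam_lfix_root {
        u_int64_t  ilr_magic;
        u_int16_t  ilr_keysize;
        u_int16_t  ilr_recsize;
        u_int16_t  ilr_indirect_levels;
        u_int16_t  ilr_padding;
};

int main(int argc, char **argv)
{
        struct iam_lfix_root root;
        FILE *f;

        if (argc != 2) {
                fprintf(stderr, "usage: check_iam <file>\n");
                return 1;
        }
        f = fopen(argv[1], "r");
        if (f == NULL) {
                perror(argv[1]);
                return 1;
        }
        if (fread(&root, sizeof root, 1, f) != 1) {
                fprintf(stderr, "cannot read root block\n");
                fclose(f);
                return 1;
        }
        fclose(f);
        /* fields were stored with cpu_to_le*(); a LE host reads them back */
        if (root.ilr_magic != IAM_LFIX_ROOT_MAGIC) {
                fprintf(stderr, "bad magic %#llx\n",
                        (unsigned long long)root.ilr_magic);
                return 1;
        }
        printf("lfix root: keysize %u, recsize %u, indirect levels %u\n",
               (unsigned)root.ilr_keysize, (unsigned)root.ilr_recsize,
               (unsigned)root.ilr_indirect_levels);
        return 0;
}
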