b=18504
author deshmukh <deshmukh>
Tue, 7 Jul 2009 06:56:06 +0000 (06:56 +0000)
committer deshmukh <deshmukh>
Tue, 7 Jul 2009 06:56:06 +0000 (06:56 +0000)
i=adilger
i=pravin
i=girish

Moved IAM code from ldiskfs to OSD.

lustre/obdclass/hash.c
lustre/osd/Makefile.in
lustre/osd/autoMakefile.am
lustre/osd/osd_handler.c
lustre/osd/osd_iam.c [new file with mode: 0644]
lustre/osd/osd_iam.h [new file with mode: 0644]
lustre/osd/osd_iam_lfix.c [new file with mode: 0644]
lustre/osd/osd_iam_lvar.c [new file with mode: 0644]
lustre/osd/osd_internal.h

diff --git a/lustre/obdclass/hash.c b/lustre/obdclass/hash.c
index 5165e7c..26e9e9e 100644
@@ -23,6 +23,9 @@
 #endif
 
 #define DELTA 0x9E3779B9
+#define DX_HASH_R5      98
+#define DX_HASH_SAME    99
+
 
 static void TEA_transform(__u32 buf[4], __u32 const in[])
 {
diff --git a/lustre/osd/Makefile.in b/lustre/osd/Makefile.in
index 45ee501..ad99827 100644
@@ -1,5 +1,6 @@
 MODULES := osd
-osd-objs := osd_handler.o osd_oi.o osd_igif.o osd_lproc.o
+osd-objs := osd_handler.o osd_oi.o osd_igif.o osd_lproc.o osd_iam.o \
+            osd_iam_lfix.o osd_iam_lvar.o
 
 EXTRA_PRE_CFLAGS := -I@LINUX@/fs -I@LDISKFS_DIR@ -I@LDISKFS_DIR@/ldiskfs
 
diff --git a/lustre/osd/autoMakefile.am b/lustre/osd/autoMakefile.am
index 8e88a77..8cd8bf9 100644
@@ -38,5 +38,6 @@ if MODULES
 modulefs_DATA = osd$(KMODEXT)
 endif
 
-MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
-DIST_SOURCES := $(osd-objs:%.o=%.c) osd_internal.h osd_oi.h osd_igif.h
+MOSTLYCLEANFILES := @MOSTLYCLEANFILES@
+DIST_SOURCES := $(osd-objs:%.o=%.c) osd_internal.h osd_oi.h osd_igif.h \
+                osd_iam.h
diff --git a/lustre/osd/osd_handler.c b/lustre/osd/osd_handler.c
index 7235a3c..2c639c7 100644
@@ -77,7 +77,6 @@
 
 /* fid_is_local() */
 #include <lustre_fid.h>
-#include <linux/lustre_iam.h>
 
 #include "osd_internal.h"
 #include "osd_igif.h"
@@ -1361,25 +1360,6 @@ static int osd_create_post(struct osd_thread_info *info, struct osd_object *obj,
         return 0;
 }
 
-extern struct inode *ldiskfs_create_inode(handle_t *handle,
-                                          struct inode * dir, int mode);
-extern int ldiskfs_add_entry(handle_t *handle, struct dentry *dentry,
-                             struct inode *inode);
-extern int ldiskfs_delete_entry(handle_t *handle,
-                                struct inode * dir,
-                                struct ldiskfs_dir_entry_2 * de_del,
-                                struct buffer_head * bh);
-extern struct buffer_head * ldiskfs_find_entry(struct dentry *dentry,
-                                               struct ldiskfs_dir_entry_2
-                                               ** res_dir);
-extern int ldiskfs_add_dot_dotdot(handle_t *handle, struct inode *dir,
-                                  struct inode *inode);
-
-extern int ldiskfs_xattr_set_handle(handle_t *handle, struct inode *inode,
-                                    int name_index, const char *name,
-                                    const void *value, size_t value_len,
-                                    int flags);
-
 static struct dentry * osd_child_dentry_get(const struct lu_env *env,
                                             struct osd_object *obj,
                                             const char *name,
@@ -1446,13 +1426,6 @@ static int osd_mkfile(struct osd_thread_info *info, struct osd_object *obj,
         return result;
 }
 
-
-extern int iam_lvar_create(struct inode *obj, int keysize, int ptrsize,
-                           int recsize, handle_t *handle);
-
-extern int iam_lfix_create(struct inode *obj, int keysize, int ptrsize,
-                           int recsize, handle_t *handle);
-
 enum {
         OSD_NAME_LEN = 255
 };
@@ -2167,7 +2140,6 @@ static int osd_iam_index_probe(const struct lu_env *env, struct osd_object *o,
                            const struct dt_index_features *feat)
 {
         struct iam_descr *descr;
-        struct dt_object *dt = &o->oo_dt;
 
         if (osd_object_is_root(o))
                 return feat == &dt_directory_features;
@@ -2178,19 +2150,8 @@ static int osd_iam_index_probe(const struct lu_env *env, struct osd_object *o,
         if (feat == &dt_directory_features) {
                 if (descr->id_rec_size == sizeof(struct lu_fid_pack))
                         return 1;
-
-                if (descr == &iam_htree_compat_param) {
-                        /* if it is a HTREE dir then there is good chance that,
-                         * we dealing with ext3 directory here with no FIDs. */
-
-                        if (descr->id_rec_size ==
-                            sizeof ((struct ldiskfs_dir_entry_2 *)NULL)->inode) {
-
-                                dt->do_index_ops = &osd_index_ea_ops;
-                                return 1;
-                        }
-                }
-                return 0;
+                else
+                        return 0;
         } else {
                 return
                         feat->dif_keysize_min <= descr->id_key_size &&
@@ -3719,9 +3680,6 @@ static int osd_process_config(const struct lu_env *env,
         RETURN(err);
 }
 
-extern void ldiskfs_orphan_cleanup (struct super_block * sb,
-                                    struct ldiskfs_super_block * es);
-
 static int osd_recovery_complete(const struct lu_env *env,
                                  struct lu_device *d)
 {
diff --git a/lustre/osd/osd_iam.c b/lustre/osd/osd_iam.c
new file mode 100644
index 0000000..9f1cd26
--- /dev/null
@@ -0,0 +1,2297 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see [sun.com URL with a
+ * copy of GPLv2].
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * osd_iam.c
+ * Top-level entry points into iam module
+ *
+ * Author: Wang Di <wangdi@clusterfs.com>
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+/*
+ * iam: big theory statement.
+ *
+ * iam (Index Access Module) is a module providing abstraction of persistent
+ * transactional container on top of generalized ldiskfs htree.
+ *
+ * iam supports:
+ *
+ *     - key, pointer, and record size specifiable per container.
+ *
+ *     - trees taller than 2 index levels.
+ *
+ *     - read/write to existing ldiskfs htree directories as iam containers.
+ *
+ * iam container is a tree, consisting of leaf nodes containing keys and
+ * records stored in this container, and index nodes, containing keys and
+ * pointers to leaf or index nodes.
+ *
+ * iam does not work with keys directly, instead it calls user-supplied key
+ * comparison function (->dpo_keycmp()).
+ *
+ * Pointers are (currently) interpreted as logical offsets (measured in
+ * blocks) within the underlying flat file on top of which the iam tree
+ * lives.
+ *
+ * On-disk format:
+ *
+ * iam mostly tries to reuse existing htree formats.
+ *
+ * Format of index node:
+ *
+ * +-----+-------+-------+-------+------+-------+------------+
+ * |     | count |       |       |      |       |            |
+ * | gap |   /   | entry | entry | .... | entry | free space |
+ * |     | limit |       |       |      |       |            |
+ * +-----+-------+-------+-------+------+-------+------------+
+ *
+ *       gap           this part of node is never accessed by iam code. It
+ *                     exists for binary compatibility with ldiskfs htree (that,
+ *                     in turn, stores fake struct ext2_dirent for ext2
+ *                     compatibility), and to keep some unspecified per-node
+ *                     data. Gap can be different for root and non-root index
+ *                     nodes. Gap size can be specified for each container
+ *                     (gap of 0 is allowed).
+ *
+ *       count/limit   current number of entries in this node, and the maximal
+ *                     number of entries that can fit into node. count/limit
+ *                     has the same size as entry, and is itself counted in
+ *                     count.
+ *
+ *       entry         index entry: consists of a key immediately followed by
+ *                     a pointer to a child node. Size of a key and size of a
+ *                     pointer depends on container. Entry has neither
+ *                     alignment nor padding.
+ *
+ *       free space    portion of the node to which new entries are added
+ *
+ * Entries in index node are sorted by their key value.
+ *
+ * Format of a leaf node is not specified. Generic iam code accesses leaf
+ * nodes through ->id_leaf methods in struct iam_descr.
+ *
+ */
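+
+/*
+ * Editor's sketch of the index-node layout described above, assuming a
+ * container with 8-byte keys and 4-byte block pointers (both sizes are
+ * per-container, so the entry type below is purely illustrative). The
+ * count/limit pair occupies the first entry slot:
+ *
+ *     struct dx_countlimit {       // overlays the first entry slot,
+ *             __le16 limit;        // as used by dx_set_limit() below
+ *             __le16 count;
+ *     };
+ *
+ *     struct example_entry {       // hypothetical 12-byte entry
+ *             __u8   key[8];       // key bytes, no alignment or padding
+ *             __le32 ptr;          // block number of the child node
+ *     };
+ */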
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/jbd.h>
+#include <linux/time.h>
+#include <linux/ldiskfs_fs.h>
+#include <linux/ldiskfs_jbd.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/smp_lock.h>
+#include "osd_internal.h"
+
+#include "xattr.h"
+#include "iopen.h"
+#include "acl.h"
+
+/*
+ * List of all registered formats.
+ *
+ * No locking. Callers synchronize.
+ */
+static LIST_HEAD(iam_formats);
+
+void iam_format_register(struct iam_format *fmt)
+{
+        list_add(&fmt->if_linkage, &iam_formats);
+}
+EXPORT_SYMBOL(iam_format_register);
+
+/*
+ * Determine format of given container. This is done by scanning list of
+ * registered formats and calling ->if_guess() method of each in turn.
+ */
+static int iam_format_guess(struct iam_container *c)
+{
+        int result;
+        struct iam_format *fmt;
+
+        /*
+         * XXX temporary initialization hook.
+         */
+        {
+                static int initialized = 0;
+
+                if (!initialized) {
+                        iam_lvar_format_init();
+                        iam_lfix_format_init();
+                        initialized = 1;
+                }
+        }
+
+        result = -ENOENT;
+        list_for_each_entry(fmt, &iam_formats, if_linkage) {
+                result = fmt->if_guess(c);
+                if (result == 0)
+                        break;
+        }
+        return result;
+}
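+
+/*
+ * Editor's sketch (hedged): how a node format would plug into the
+ * guessing loop above. Only the ->if_guess() hook and ->if_linkage are
+ * assumed here, since those are the fields iam_format_guess() uses.
+ */
+#if 0
+static int example_format_guess(struct iam_container *c)
+{
+        /* inspect the root block of c->ic_object; return 0 on a match,
+         * -ENOENT (or another error) otherwise */
+        return -ENOENT;
+}
+
+static struct iam_format example_format = {
+        .if_guess = example_format_guess
+};
+
+void example_format_init(void)
+{
+        iam_format_register(&example_format);
+}
+#endif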
+
+/*
+ * Initialize container @c.
+ */
+int iam_container_init(struct iam_container *c,
+                       struct iam_descr *descr, struct inode *inode)
+{
+        memset(c, 0, sizeof *c);
+        c->ic_descr  = descr;
+        c->ic_object = inode;
+        init_rwsem(&c->ic_sem);
+        return 0;
+}
+EXPORT_SYMBOL(iam_container_init);
+
+/*
+ * Determine container format.
+ */
+int iam_container_setup(struct iam_container *c)
+{
+        return iam_format_guess(c);
+}
+EXPORT_SYMBOL(iam_container_setup);
+
+/*
+ * Finalize container @c, release all resources.
+ */
+void iam_container_fini(struct iam_container *c)
+{
+}
+EXPORT_SYMBOL(iam_container_fini);
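+
+/*
+ * Editor's sketch of the container life-cycle built from the helpers
+ * above (fragment; error handling abbreviated, @descr and @inode as for
+ * iam_container_init()):
+ */
+#if 0
+        struct iam_container bag;
+        int result;
+
+        result = iam_container_init(&bag, descr, inode);
+        if (result == 0) {
+                result = iam_container_setup(&bag);
+                if (result == 0) {
+                        /* ... use the container ... */
+                        iam_container_fini(&bag);
+                }
+        }
+#endif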
+
+void iam_path_init(struct iam_path *path, struct iam_container *c,
+                   struct iam_path_descr *pd)
+{
+        memset(path, 0, sizeof *path);
+        path->ip_container = c;
+        path->ip_frame = path->ip_frames;
+        path->ip_data = pd;
+        path->ip_leaf.il_path = path;
+}
+
+static void iam_leaf_fini(struct iam_leaf *leaf);
+
+void iam_path_release(struct iam_path *path)
+{
+        int i;
+
+        for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) {
+                if (path->ip_frames[i].bh != NULL) {
+                        brelse(path->ip_frames[i].bh);
+                        path->ip_frames[i].bh = NULL;
+                }
+        }
+}
+
+void iam_path_fini(struct iam_path *path)
+{
+        iam_leaf_fini(&path->ip_leaf);
+        iam_path_release(path);
+}
+
+
+void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode)
+{
+        int i;
+
+        path->ipc_hinfo = &path->ipc_hinfo_area;
+        for (i = 0; i < ARRAY_SIZE(path->ipc_scratch); ++i)
+                path->ipc_descr.ipd_key_scratch[i] =
+                        (struct iam_ikey *)&path->ipc_scratch[i];
+
+        iam_path_init(&path->ipc_path, &path->ipc_container, &path->ipc_descr);
+}
+
+void iam_path_compat_fini(struct iam_path_compat *path)
+{
+        iam_path_fini(&path->ipc_path);
+}
+
+/*
+ * Helper function initializing iam_path_descr and its key scratch area.
+ */
+struct iam_path_descr *iam_ipd_alloc(void *area, int keysize)
+{
+        struct iam_path_descr *ipd;
+        void *karea;
+        int i;
+
+        ipd = area;
+        karea = ipd + 1;
+        for (i = 0; i < ARRAY_SIZE(ipd->ipd_key_scratch); ++i, karea += keysize)
+                ipd->ipd_key_scratch[i] = karea;
+        return ipd;
+}
+EXPORT_SYMBOL(iam_ipd_alloc);
+
+void iam_ipd_free(struct iam_path_descr *ipd)
+{
+}
+EXPORT_SYMBOL(iam_ipd_free);
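+
+/*
+ * Editor's note: iam_ipd_alloc() does not allocate memory; @area must
+ * already be large enough for the descriptor plus one scratch buffer per
+ * ipd_key_scratch[] slot. A sizing sketch (NR_SCRATCH stands for
+ * ARRAY_SIZE(ipd->ipd_key_scratch) and is named here purely for
+ * illustration):
+ */
+#if 0
+        const int keysize = 8;
+        char area[sizeof(struct iam_path_descr) + NR_SCRATCH * keysize];
+        struct iam_path_descr *ipd;
+
+        ipd = iam_ipd_alloc(area, keysize);
+        /* ... lookups using ipd ... */
+        iam_ipd_free(ipd);
+#endif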
+
+int iam_node_read(struct iam_container *c, iam_ptr_t ptr,
+                  handle_t *h, struct buffer_head **bh)
+{
+        int result = 0;
+
+        *bh = ldiskfs_bread(h, c->ic_object, (int)ptr, 0, &result);
+        if (*bh == NULL)
+                result = -EIO;
+        return result;
+}
+
+/*
+ * Return pointer to current leaf record. Pointer is valid while corresponding
+ * leaf node is locked and pinned.
+ */
+static struct iam_rec *iam_leaf_rec(const struct iam_leaf *leaf)
+{
+        return iam_leaf_ops(leaf)->rec(leaf);
+}
+
+/*
+ * Return pointer to the current leaf key. This function returns pointer to
+ * the key stored in node.
+ *
+ * Caller should assume that returned pointer is only valid while leaf node is
+ * pinned and locked.
+ */
+static struct iam_key *iam_leaf_key(const struct iam_leaf *leaf)
+{
+        return iam_leaf_ops(leaf)->key(leaf);
+}
+
+static int iam_leaf_key_size(const struct iam_leaf *leaf)
+{
+        return iam_leaf_ops(leaf)->key_size(leaf);
+}
+
+static struct iam_ikey *iam_leaf_ikey(const struct iam_leaf *leaf,
+                                      struct iam_ikey *key)
+{
+        return iam_leaf_ops(leaf)->ikey(leaf, key);
+}
+
+static int iam_leaf_keycmp(const struct iam_leaf *leaf,
+                           const struct iam_key *key)
+{
+        return iam_leaf_ops(leaf)->key_cmp(leaf, key);
+}
+
+static int iam_leaf_keyeq(const struct iam_leaf *leaf,
+                          const struct iam_key *key)
+{
+        return iam_leaf_ops(leaf)->key_eq(leaf, key);
+}
+
+#if LDISKFS_INVARIANT_ON
+static int iam_leaf_check(struct iam_leaf *leaf);
+extern int dx_node_check(struct iam_path *p, struct iam_frame *f);
+
+static int iam_path_check(struct iam_path *p)
+{
+        int i;
+        int result;
+        struct iam_frame *f;
+        struct iam_descr *param;
+
+        result = 1;
+        param = iam_path_descr(p);
+        for (i = 0; result && i < ARRAY_SIZE(p->ip_frames); ++i) {
+                f = &p->ip_frames[i];
+                if (f->bh != NULL) {
+                        result = dx_node_check(p, f);
+                        if (result)
+                                result = !param->id_ops->id_node_check(p, f);
+                }
+        }
+        if (result && p->ip_leaf.il_bh != NULL)
+                result = iam_leaf_check(&p->ip_leaf);
+        if (result == 0) {
+                ldiskfs_std_error(iam_path_obj(p)->i_sb, result);
+        }
+        return result;
+}
+#endif
+
+static int iam_leaf_load(struct iam_path *path)
+{
+        iam_ptr_t block;
+        int err;
+        struct iam_container *c;
+        struct buffer_head   *bh;
+        struct iam_leaf      *leaf;
+        struct iam_descr     *descr;
+
+        c     = path->ip_container;
+        leaf  = &path->ip_leaf;
+        descr = iam_path_descr(path);
+        block = path->ip_frame->leaf;
+        if (block == 0) {
+                /* XXX bug 11027 */
+                printk(KERN_EMERG "wrong leaf: %lu %d [%p %p %p]\n",
+                       (long unsigned)path->ip_frame->leaf,
+                       dx_get_count(dx_node_get_entries(path, path->ip_frame)),
+                       path->ip_frames[0].bh, path->ip_frames[1].bh,
+                       path->ip_frames[2].bh);
+        }
+        err   = descr->id_ops->id_node_read(c, block, NULL, &bh);
+        if (err == 0) {
+                leaf->il_bh = bh;
+                leaf->il_curidx = block;
+                err = iam_leaf_ops(leaf)->init(leaf);
+                assert_inv(ergo(err == 0, iam_leaf_check(leaf)));
+        }
+        return err;
+}
+
+static void iam_unlock_htree(struct inode *dir, struct dynlock_handle *lh)
+{
+        if (lh != NULL)
+                dynlock_unlock(&LDISKFS_I(dir)->i_htree_lock, lh);
+}
+
+
+static void iam_leaf_unlock(struct iam_leaf *leaf)
+{
+        if (leaf->il_lock != NULL) {
+                iam_unlock_htree(iam_leaf_container(leaf)->ic_object,
+                                leaf->il_lock);
+                do_corr(schedule());
+                leaf->il_lock = NULL;
+        }
+}
+
+static void iam_leaf_fini(struct iam_leaf *leaf)
+{
+        if (leaf->il_path != NULL) {
+                iam_leaf_unlock(leaf);
+                assert_inv(ergo(leaf->il_bh != NULL, iam_leaf_check(leaf)));
+                iam_leaf_ops(leaf)->fini(leaf);
+                if (leaf->il_bh) {
+                        brelse(leaf->il_bh);
+                        leaf->il_bh = NULL;
+                        leaf->il_curidx = 0;
+                }
+        }
+}
+
+static void iam_leaf_start(struct iam_leaf *folio)
+{
+        iam_leaf_ops(folio)->start(folio);
+}
+
+void iam_leaf_next(struct iam_leaf *folio)
+{
+        iam_leaf_ops(folio)->next(folio);
+}
+
+static void iam_leaf_rec_add(struct iam_leaf *leaf, const struct iam_key *key,
+                             const struct iam_rec *rec)
+{
+        iam_leaf_ops(leaf)->rec_add(leaf, key, rec);
+}
+
+static void iam_rec_del(struct iam_leaf *leaf, int shift)
+{
+        iam_leaf_ops(leaf)->rec_del(leaf, shift);
+}
+
+int iam_leaf_at_end(const struct iam_leaf *leaf)
+{
+        return iam_leaf_ops(leaf)->at_end(leaf);
+}
+
+void iam_leaf_split(struct iam_leaf *l, struct buffer_head **bh, iam_ptr_t nr)
+{
+        iam_leaf_ops(l)->split(l, bh, nr);
+}
+
+int iam_leaf_can_add(const struct iam_leaf *l,
+                     const struct iam_key *k, const struct iam_rec *r)
+{
+        return iam_leaf_ops(l)->can_add(l, k, r);
+}
+
+#if LDISKFS_INVARIANT_ON
+static int iam_leaf_check(struct iam_leaf *leaf)
+{
+        return 1;
+#if 0
+        struct iam_lentry    *orig;
+        struct iam_path      *path;
+        struct iam_container *bag;
+        struct iam_ikey       *k0;
+        struct iam_ikey       *k1;
+        int result;
+        int first;
+
+        orig = leaf->il_at;
+        path = iam_leaf_path(leaf);
+        bag  = iam_leaf_container(leaf);
+
+        result = iam_leaf_ops(leaf)->init(leaf);
+        if (result != 0)
+                return result;
+
+        first = 1;
+        iam_leaf_start(leaf);
+        k0 = iam_path_ikey(path, 0);
+        k1 = iam_path_ikey(path, 1);
+        while (!iam_leaf_at_end(leaf)) {
+                iam_ikeycpy(bag, k0, k1);
+                iam_ikeycpy(bag, k1, iam_leaf_ikey(leaf, k1));
+                if (!first && iam_ikeycmp(bag, k0, k1) > 0) {
+                        return 0;
+                }
+                first = 0;
+                iam_leaf_next(leaf);
+        }
+        leaf->il_at = orig;
+        return 1;
+#endif
+}
+#endif
+
+static int iam_txn_dirty(handle_t *handle,
+                         struct iam_path *path, struct buffer_head *bh)
+{
+        int result;
+
+        result = ldiskfs_journal_dirty_metadata(handle, bh);
+        if (result != 0)
+                ldiskfs_std_error(iam_path_obj(path)->i_sb, result);
+        return result;
+}
+
+static int iam_txn_add(handle_t *handle,
+                       struct iam_path *path, struct buffer_head *bh)
+{
+        int result;
+
+        result = ldiskfs_journal_get_write_access(handle, bh);
+        if (result != 0)
+                ldiskfs_std_error(iam_path_obj(path)->i_sb, result);
+        return result;
+}
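+
+/*
+ * Editor's note: together these two helpers implement the usual jbd
+ * write pattern used throughout this file: declare write intent on a
+ * buffer, modify it, then mark it dirty under the same handle. Sketch
+ * (compare iam_it_rec_set() below):
+ */
+#if 0
+        result = iam_txn_add(handle, path, bh);
+        if (result == 0) {
+                /* ... modify bh->b_data ... */
+                result = iam_txn_dirty(handle, path, bh);
+        }
+#endif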
+
+/***********************************************************************/
+/* iterator interface                                                  */
+/***********************************************************************/
+
+static enum iam_it_state it_state(const struct iam_iterator *it)
+{
+        return it->ii_state;
+}
+
+/*
+ * Helper function returning the container an iterator is attached to.
+ */
+static struct iam_container *iam_it_container(const struct iam_iterator *it)
+{
+        return it->ii_path.ip_container;
+}
+
+static inline int it_keycmp(const struct iam_iterator *it,
+                            const struct iam_key *k)
+{
+        return iam_leaf_keycmp(&it->ii_path.ip_leaf, k);
+}
+
+static inline int it_keyeq(const struct iam_iterator *it,
+                           const struct iam_key *k)
+{
+        return iam_leaf_keyeq(&it->ii_path.ip_leaf, k);
+}
+
+static int it_ikeycmp(const struct iam_iterator *it, const struct iam_ikey *ik)
+{
+        return iam_ikeycmp(it->ii_path.ip_container,
+                           iam_leaf_ikey(&it->ii_path.ip_leaf,
+                                         iam_path_ikey(&it->ii_path, 0)), ik);
+}
+
+static inline int it_at_rec(const struct iam_iterator *it)
+{
+        return !iam_leaf_at_end(&it->ii_path.ip_leaf);
+}
+
+static inline int it_before(const struct iam_iterator *it)
+{
+        return it_state(it) == IAM_IT_SKEWED && it_at_rec(it);
+}
+
+/*
+ * Helper wrapper around iam_it_get(): returns 0 (success) only when a record
+ * with exactly the key asked for is found.
+ */
+static int iam_it_get_exact(struct iam_iterator *it, const struct iam_key *k)
+{
+        int result;
+
+        result = iam_it_get(it, k);
+        if (result > 0)
+                result = 0;
+        else if (result == 0)
+                /*
+                 * Return -ENOENT if cursor is located above record with a key
+                 * different from one specified, or in the empty leaf.
+                 *
+                 * XXX returning -ENOENT only works if iam_it_get() never
+                 * returns -ENOENT as a legitimate error.
+                 */
+                result = -ENOENT;
+        return result;
+}
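+
+/*
+ * Editor's sketch of how the exact-match wrapper is meant to be used
+ * (hedged; compare iam_it_get() below for the inexact variant):
+ */
+#if 0
+        result = iam_it_get_exact(&it, k);
+        if (result == 0)
+                /* cursor is positioned exactly at key @k */;
+        else if (result == -ENOENT)
+                /* no record with key @k exists */;
+#endif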
+
+void iam_container_write_lock(struct iam_container *ic)
+{
+        down_write(&ic->ic_sem);
+}
+
+void iam_container_write_unlock(struct iam_container *ic)
+{
+        up_write(&ic->ic_sem);
+}
+
+void iam_container_read_lock(struct iam_container *ic)
+{
+        down_read(&ic->ic_sem);
+}
+
+void iam_container_read_unlock(struct iam_container *ic)
+{
+        up_read(&ic->ic_sem);
+}
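+
+/*
+ * Editor's note: callers bracket read-only traversals with the read
+ * lock and modifications with the write lock; the rwsem serializes
+ * whole-container operations. Sketch:
+ */
+#if 0
+        iam_container_read_lock(bag);
+        /* ... iterator-based lookup ... */
+        iam_container_read_unlock(bag);
+#endif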
+
+/*
+ * Initialize iterator to IAM_IT_DETACHED state.
+ *
+ * postcondition: it_state(it) == IAM_IT_DETACHED
+ */
+int  iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags,
+                 struct iam_path_descr *pd)
+{
+        memset(it, 0, sizeof *it);
+        it->ii_flags  = flags;
+        it->ii_state  = IAM_IT_DETACHED;
+        iam_path_init(&it->ii_path, c, pd);
+        return 0;
+}
+EXPORT_SYMBOL(iam_it_init);
+
+/*
+ * Finalize iterator and release all resources.
+ *
+ * precondition: it_state(it) == IAM_IT_DETACHED
+ */
+void iam_it_fini(struct iam_iterator *it)
+{
+        assert_corr(it_state(it) == IAM_IT_DETACHED);
+        iam_path_fini(&it->ii_path);
+}
+EXPORT_SYMBOL(iam_it_fini);
+
+/*
+ * These locking primitives are used to protect parts of a directory's
+ * htree. The unit of protection is a block: a leaf or an index node.
+ */
+struct dynlock_handle *iam_lock_htree(struct inode *dir, unsigned long value,
+                                     enum dynlock_type lt)
+{
+        return dynlock_lock(&LDISKFS_I(dir)->i_htree_lock, value, lt, GFP_NOFS);
+}
+
+
+
+int iam_index_lock(struct iam_path *path, struct dynlock_handle **lh)
+{
+        struct iam_frame *f;
+
+        for (f = path->ip_frame; f >= path->ip_frames; --f, ++lh) {
+                do_corr(schedule());
+                *lh = iam_lock_htree(iam_path_obj(path), f->curidx, DLT_READ);
+                if (*lh == NULL)
+                        return -ENOMEM;
+        }
+        return 0;
+}
+
+/*
+ * Fast check for frame consistency.
+ */
+static int iam_check_fast(struct iam_path *path, struct iam_frame *frame)
+{
+        struct iam_container *bag;
+        struct iam_entry *next;
+        struct iam_entry *last;
+        struct iam_entry *entries;
+        struct iam_entry *at;
+
+        bag     = path->ip_container;
+        at      = frame->at;
+        entries = frame->entries;
+        last    = iam_entry_shift(path, entries, dx_get_count(entries) - 1);
+
+        if (unlikely(at > last))
+                return -EAGAIN;
+
+        if (unlikely(dx_get_block(path, at) != frame->leaf))
+                return -EAGAIN;
+
+        if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, at),
+                                 path->ip_ikey_target) > 0))
+                return -EAGAIN;
+
+        next = iam_entry_shift(path, at, +1);
+        if (next <= last) {
+                if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, next),
+                                         path->ip_ikey_target) <= 0))
+                        return -EAGAIN;
+        }
+        return 0;
+}
+
+int dx_index_is_compat(struct iam_path *path)
+{
+        return iam_path_descr(path) == NULL;
+}
+
+/*
+ * iam_find_position
+ *
+ * Search the position of the specified hash in an index node.
+ *
+ */
+
+struct iam_entry *iam_find_position(struct iam_path *path,
+                                   struct iam_frame *frame)
+{
+        int count;
+        struct iam_entry *p;
+        struct iam_entry *q;
+        struct iam_entry *m;
+
+        count = dx_get_count(frame->entries);
+        assert_corr(count && count <= dx_get_limit(frame->entries));
+        p = iam_entry_shift(path, frame->entries,
+                            dx_index_is_compat(path) ? 1 : 2);
+        q = iam_entry_shift(path, frame->entries, count - 1);
+        while (p <= q) {
+                m = iam_entry_shift(path, p, iam_entry_diff(path, q, p) / 2);
+                if (iam_ikeycmp(path->ip_container, iam_ikey_at(path, m),
+                                path->ip_ikey_target) > 0)
+                        q = iam_entry_shift(path, m, -1);
+                else
+                        p = iam_entry_shift(path, m, +1);
+        }
+        return iam_entry_shift(path, p, -1);
+}
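+
+/*
+ * Editor's note: the loop above is a plain binary search; the leftmost
+ * slot it skips acts as a "minus infinity" key for the leftmost child.
+ * The final -1 shift therefore yields the rightmost entry whose key does
+ * not exceed ip_ikey_target, i.e. the child that may hold the target.
+ */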
+
+
+
+static iam_ptr_t iam_find_ptr(struct iam_path *path, struct iam_frame *frame)
+{
+        return dx_get_block(path, iam_find_position(path, frame));
+}
+
+void iam_insert_key(struct iam_path *path, struct iam_frame *frame,
+                    const struct iam_ikey *key, iam_ptr_t ptr)
+{
+        struct iam_entry *entries = frame->entries;
+        struct iam_entry *new = iam_entry_shift(path, frame->at, +1);
+        int count = dx_get_count(entries);
+
+        /*
+         * Unfortunately we cannot assert this, as this function is sometimes
+         * called by VFS under i_sem and without pdirops lock.
+         */
+        assert_corr(1 || iam_frame_is_locked(path, frame));
+        assert_corr(count < dx_get_limit(entries));
+        assert_corr(frame->at < iam_entry_shift(path, entries, count));
+        assert_inv(dx_node_check(path, frame));
+
+        memmove(iam_entry_shift(path, new, 1), new,
+                (char *)iam_entry_shift(path, entries, count) - (char *)new);
+        dx_set_ikey(path, new, key);
+        dx_set_block(path, new, ptr);
+        dx_set_count(entries, count + 1);
+        assert_inv(dx_node_check(path, frame));
+}
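+
+/*
+ * Editor's example: with count == 3 and frame->at pointing at the entry
+ * in slot 1, the memmove above shifts slot 2 right by one, the new
+ * key/pointer land in slot 2, and count becomes 4.
+ */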
+
+void iam_insert_key_lock(struct iam_path *path, struct iam_frame *frame,
+                         const struct iam_ikey *key, iam_ptr_t ptr)
+{
+        iam_lock_bh(frame->bh);
+        iam_insert_key(path, frame, key, ptr);
+        iam_unlock_bh(frame->bh);
+}
+/*
+ * returns 0 if path was unchanged, -EAGAIN otherwise.
+ */
+static int iam_check_path(struct iam_path *path, struct iam_frame *frame)
+{
+        int equal;
+
+        iam_lock_bh(frame->bh);
+        equal = iam_check_fast(path, frame) == 0 ||
+                frame->leaf == iam_find_ptr(path, frame);
+        DX_DEVAL(iam_lock_stats.dls_bh_again += !equal);
+        iam_unlock_bh(frame->bh);
+
+        return equal ? 0 : -EAGAIN;
+}
+
+static int iam_lookup_try(struct iam_path *path)
+{
+        u32 ptr;
+        int err = 0;
+        int i;
+
+        struct iam_descr *param;
+        struct iam_frame *frame;
+        struct iam_container *c;
+
+        param = iam_path_descr(path);
+        c = path->ip_container;
+
+        ptr = param->id_ops->id_root_ptr(c);
+        for (frame = path->ip_frames, i = 0; i <= path->ip_indirect;
+             ++frame, ++i) {
+                err = param->id_ops->id_node_read(c, (iam_ptr_t)ptr, NULL,
+                                                  &frame->bh);
+                do_corr(schedule());
+
+                iam_lock_bh(frame->bh);
+                /*
+                 * node must be initialized under bh lock because concurrent
+                 * creation procedure may change it and iam_lookup_try() will
+                 * see obsolete tree height. -bzzz
+                 */
+                if (err != 0)
+                        break;
+
+                if (LDISKFS_INVARIANT_ON) {
+                        err = param->id_ops->id_node_check(path, frame);
+                        if (err != 0)
+                                break;
+                }
+
+                err = param->id_ops->id_node_load(path, frame);
+                if (err != 0)
+                        break;
+
+                assert_inv(dx_node_check(path, frame));
+                /*
+                 * splitting may change the root index block and move the hash
+                 * we are looking for into another index block, so we have to
+                 * check for this situation and repeat from the beginning if
+                 * the path got changed
+                 * -bzzz
+                 */
+                if (i > 0) {
+                        err = iam_check_path(path, frame - 1);
+                        if (err != 0)
+                                break;
+                }
+
+                frame->at = iam_find_position(path, frame);
+                frame->curidx = ptr;
+                frame->leaf = ptr = dx_get_block(path, frame->at);
+
+                iam_unlock_bh(frame->bh);
+                do_corr(schedule());
+        }
+        if (err != 0)
+                iam_unlock_bh(frame->bh);
+        path->ip_frame = --frame;
+        return err;
+}
+
+static int __iam_path_lookup(struct iam_path *path)
+{
+        int err;
+        int i;
+
+        for (i = 0; i < DX_MAX_TREE_HEIGHT; ++ i)
+                assert(path->ip_frames[i].bh == NULL);
+
+        do {
+                err = iam_lookup_try(path);
+                do_corr(schedule());
+                if (err != 0)
+                        iam_path_fini(path);
+        } while (err == -EAGAIN);
+
+        return err;
+}
+
+/*
+ * returns 0 if path was unchanged, -EAGAIN otherwise.
+ */
+static int iam_check_full_path(struct iam_path *path, int search)
+{
+        struct iam_frame *bottom;
+        struct iam_frame *scan;
+        int i;
+        int result;
+
+        do_corr(schedule());
+
+        for (bottom = path->ip_frames, i = 0;
+             i < DX_MAX_TREE_HEIGHT && bottom->bh != NULL; ++bottom, ++i) {
+                ; /* find last filled in frame */
+        }
+
+        /*
+         * Lock frames, bottom to top.
+         */
+        for (scan = bottom - 1; scan >= path->ip_frames; --scan)
+                iam_lock_bh(scan->bh);
+        /*
+         * Check them top to bottom.
+         */
+        result = 0;
+        for (scan = path->ip_frames; scan < bottom; ++scan) {
+                struct iam_entry *pos;
+
+                if (search) {
+                        if (iam_check_fast(path, scan) == 0)
+                                continue;
+
+                        pos = iam_find_position(path, scan);
+                        if (scan->leaf != dx_get_block(path, pos)) {
+                                result = -EAGAIN;
+                                break;
+                        }
+                        scan->at = pos;
+                } else {
+                        pos = iam_entry_shift(path, scan->entries,
+                                              dx_get_count(scan->entries) - 1);
+                        if (scan->at > pos ||
+                            scan->leaf != dx_get_block(path, scan->at)) {
+                                result = -EAGAIN;
+                                break;
+                        }
+                }
+        }
+
+        /*
+         * Unlock top to bottom.
+         */
+        for (scan = path->ip_frames; scan < bottom; ++scan)
+                iam_unlock_bh(scan->bh);
+        DX_DEVAL(iam_lock_stats.dls_bh_full_again += !!result);
+        do_corr(schedule());
+
+        return result;
+}
+
+
+/*
+ * Performs path lookup and returns with found leaf (if any) locked by htree
+ * lock.
+ */
+int iam_lookup_lock(struct iam_path *path,
+                   struct dynlock_handle **dl, enum dynlock_type lt)
+{
+        int result;
+        struct inode *dir;
+
+        dir = iam_path_obj(path);
+        while ((result = __iam_path_lookup(path)) == 0) {
+                do_corr(schedule());
+                *dl = iam_lock_htree(dir, path->ip_frame->leaf, lt);
+                if (*dl == NULL) {
+                        iam_path_fini(path);
+                        result = -ENOMEM;
+                        break;
+                }
+                do_corr(schedule());
+                /*
+                 * while we were locking it, the leaf we just found may have
+                 * been split, so we need to check for this -bzzz
+                 */
+                if (iam_check_full_path(path, 1) == 0)
+                        break;
+                iam_unlock_htree(dir, *dl);
+                *dl = NULL;
+                iam_path_fini(path);
+        }
+        return result;
+}
+/*
+ * Performs tree top-to-bottom traversal starting from root, and loads leaf
+ * node.
+ */
+static int iam_path_lookup(struct iam_path *path, int index)
+{
+        struct iam_container *c;
+        struct iam_descr *descr;
+        struct iam_leaf  *leaf;
+        int result;
+
+        c = path->ip_container;
+        leaf = &path->ip_leaf;
+        descr = iam_path_descr(path);
+        result = iam_lookup_lock(path, &leaf->il_lock, DLT_WRITE);
+        assert_inv(iam_path_check(path));
+        do_corr(schedule());
+        if (result == 0) {
+                result = iam_leaf_load(path);
+                assert_inv(ergo(result == 0, iam_leaf_check(leaf)));
+                if (result == 0) {
+                        do_corr(schedule());
+                        if (index)
+                                result = iam_leaf_ops(leaf)->
+                                        ilookup(leaf, path->ip_ikey_target);
+                        else
+                                result = iam_leaf_ops(leaf)->
+                                        lookup(leaf, path->ip_key_target);
+                        do_corr(schedule());
+                }
+                if (result < 0)
+                        iam_leaf_unlock(leaf);
+        }
+        return result;
+}
+
+/*
+ * Common part of iam_it_{i,}get().
+ */
+static int __iam_it_get(struct iam_iterator *it, int index)
+{
+        int result;
+        assert_corr(it_state(it) == IAM_IT_DETACHED);
+
+        result = iam_path_lookup(&it->ii_path, index);
+        if (result >= 0) {
+                int collision;
+
+                collision = result & IAM_LOOKUP_LAST;
+                switch (result & ~IAM_LOOKUP_LAST) {
+                case IAM_LOOKUP_EXACT:
+                        result = +1;
+                        it->ii_state = IAM_IT_ATTACHED;
+                        break;
+                case IAM_LOOKUP_OK:
+                        result = 0;
+                        it->ii_state = IAM_IT_ATTACHED;
+                        break;
+                case IAM_LOOKUP_BEFORE:
+                case IAM_LOOKUP_EMPTY:
+                        result = 0;
+                        it->ii_state = IAM_IT_SKEWED;
+                        break;
+                default:
+                        assert(0);
+                }
+                result |= collision;
+        }
+        /*
+         * See iam_it_get_exact() for explanation.
+         */
+        assert_corr(result != -ENOENT);
+        return result;
+}
+
+/*
+ * The correct hash, but not the same key, was found: iterate through the
+ * hash collision chain, looking for the correct record.
+ */
+static int iam_it_collision(struct iam_iterator *it)
+{
+        int result;
+
+        assert(ergo(it_at_rec(it), !it_keyeq(it, it->ii_path.ip_key_target)));
+
+        while ((result = iam_it_next(it)) == 0) {
+                do_corr(schedule());
+                if (it_ikeycmp(it, it->ii_path.ip_ikey_target) != 0)
+                        return -ENOENT;
+                if (it_keyeq(it, it->ii_path.ip_key_target))
+                        return 0;
+        }
+        return result;
+}
+
+/*
+ * Attach iterator. After successful completion, @it points to the record
+ * with the least key not larger than @k.
+ *
+ * Return value: 0: positioned on existing record,
+ *             +ve: exact position found,
+ *             -ve: error.
+ *
+ * precondition:  it_state(it) == IAM_IT_DETACHED
+ * postcondition: ergo(result == 0 && it_state(it) == IAM_IT_ATTACHED,
+ *                     it_keycmp(it, k) <= 0)
+ */
+int iam_it_get(struct iam_iterator *it, const struct iam_key *k)
+{
+        int result;
+        assert_corr(it_state(it) == IAM_IT_DETACHED);
+
+        it->ii_path.ip_ikey_target = NULL;
+        it->ii_path.ip_key_target  = k;
+
+        result = __iam_it_get(it, 0);
+
+        if (result == IAM_LOOKUP_LAST) {
+                result = iam_it_collision(it);
+                if (result != 0) {
+                        iam_it_put(it);
+                        iam_it_fini(it);
+                        result = __iam_it_get(it, 0);
+                } else
+                        result = +1;
+        }
+        if (result > 0)
+                result &= ~IAM_LOOKUP_LAST;
+
+        assert_corr(ergo(result > 0, it_keycmp(it, k) == 0));
+        assert_corr(ergo(result == 0 && it_state(it) == IAM_IT_ATTACHED,
+                         it_keycmp(it, k) <= 0));
+        return result;
+}
+EXPORT_SYMBOL(iam_it_get);
+
+/*
+ * Attach iterator by index key.
+ */
+static int iam_it_iget(struct iam_iterator *it, const struct iam_ikey *k)
+{
+        assert_corr(it_state(it) == IAM_IT_DETACHED);
+
+        it->ii_path.ip_ikey_target = k;
+        return __iam_it_get(it, 1) & ~IAM_LOOKUP_LAST;
+}
+
+/*
+ * Attach iterator, and ensure it points to a record (not skewed).
+ *
+ * Return value: 0: positioned on existing record,
+ *             +ve: exact position found,
+ *             -ve: error.
+ *
+ * precondition:  it_state(it) == IAM_IT_DETACHED &&
+ *                !(it->ii_flags&IAM_IT_WRITE)
+ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED)
+ */
+int iam_it_get_at(struct iam_iterator *it, const struct iam_key *k)
+{
+        int result;
+        assert_corr(it_state(it) == IAM_IT_DETACHED &&
+                    !(it->ii_flags&IAM_IT_WRITE));
+        result = iam_it_get(it, k);
+        if (result == 0) {
+                if (it_state(it) != IAM_IT_ATTACHED) {
+                        assert_corr(it_state(it) == IAM_IT_SKEWED);
+                        result = iam_it_next(it);
+                }
+        }
+        assert_corr(ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED));
+        return result;
+}
+EXPORT_SYMBOL(iam_it_get_at);
+
+/*
+ * Duplicates iterator.
+ *
+ * postcondition: it_state(dst) == it_state(src) &&
+ *                iam_it_container(dst) == iam_it_container(src) &&
+ *                dst->ii_flags == src->ii_flags &&
+ *                ergo(it_state(src) == IAM_IT_ATTACHED,
+ *                     iam_it_rec_get(dst) == iam_it_rec_get(src) &&
+ *                     iam_it_key_get(dst) == iam_it_key_get(src))
+ */
+void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src)
+{
+        dst->ii_flags     = src->ii_flags;
+        dst->ii_state     = src->ii_state;
+        /* XXX not yet. iam_path_dup(&dst->ii_path, &src->ii_path); */
+        /*
+         * XXX: duplicate lock.
+         */
+        assert_corr(it_state(dst) == it_state(src));
+        assert_corr(iam_it_container(dst) == iam_it_container(src));
+        assert_corr(dst->ii_flags == src->ii_flags);
+        assert_corr(ergo(it_state(src) == IAM_IT_ATTACHED,
+                    iam_it_rec_get(dst) == iam_it_rec_get(src) &&
+                    iam_it_key_get(dst) == iam_it_key_get(src)));
+
+}
+
+/*
+ * Detach iterator. Does nothing if already in detached state.
+ *
+ * postcondition: it_state(it) == IAM_IT_DETACHED
+ */
+void iam_it_put(struct iam_iterator *it)
+{
+        if (it->ii_state != IAM_IT_DETACHED) {
+                it->ii_state = IAM_IT_DETACHED;
+                iam_leaf_fini(&it->ii_path.ip_leaf);
+        }
+}
+EXPORT_SYMBOL(iam_it_put);
+
+static struct iam_ikey *iam_it_ikey_get(const struct iam_iterator *it,
+                                        struct iam_ikey *ikey);
+
+
+/*
+ * This function increments the frame pointer to search the next leaf
+ * block, and reads in the necessary intervening nodes if the search
+ * should be necessary.  Whether or not the search is necessary is
+ * controlled by the hash parameter.  If the hash value is even, then
+ * the search is only continued if the next block starts with that
+ * hash value.  This is used if we are searching for a specific file.
+ *
+ * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
+ *
+ * This function returns 1 if the caller should continue to search,
+ * or 0 if it should not.  If there is an error reading one of the
+ * index blocks, it will return a negative error code.
+ *
+ * If start_hash is non-null, it will be filled in with the starting
+ * hash of the next page.
+ */
+static int iam_htree_advance(struct inode *dir, __u32 hash,
+                              struct iam_path *path, __u32 *start_hash,
+                              int compat)
+{
+        struct iam_frame *p;
+        struct buffer_head *bh;
+        int err, num_frames = 0;
+        __u32 bhash;
+
+        p = path->ip_frame;
+        /*
+         * Find the next leaf page by incrementing the frame pointer.
+         * If we run out of entries in the interior node, loop around and
+         * increment pointer in the parent node.  When we break out of
+         * this loop, num_frames indicates the number of interior
+         * nodes need to be read.
+         */
+        while (1) {
+                do_corr(schedule());
+                iam_lock_bh(p->bh);
+                p->at = iam_entry_shift(path, p->at, +1);
+                if (p->at < iam_entry_shift(path, p->entries,
+                                            dx_get_count(p->entries))) {
+                        p->leaf = dx_get_block(path, p->at);
+                        iam_unlock_bh(p->bh);
+                        break;
+                }
+                iam_unlock_bh(p->bh);
+                if (p == path->ip_frames)
+                        return 0;
+                num_frames++;
+                --p;
+        }
+
+        if (compat) {
+                /*
+                 * Htree hash magic.
+                 *
+                 * If the hash is 1, then continue only if the next page has
+                 * a continuation hash of any value.  This is used for
+                 * readdir handling.  Otherwise, check to see if the hash
+                 * matches the desired continuation hash.  If it doesn't,
+                 * return, since there's no point in reading the successive
+                 * index pages.
+                 */
+                dx_get_ikey(path, p->at, (struct iam_ikey *)&bhash);
+                if (start_hash)
+                        *start_hash = bhash;
+                if ((hash & 1) == 0) {
+                        if ((bhash & ~1) != hash)
+                                return 0;
+                }
+        }
+        /*
+         * If the hash is HASH_NB_ALWAYS, we always go to the next
+         * block so no check is necessary
+         */
+        while (num_frames--) {
+                iam_ptr_t idx;
+
+                do_corr(schedule());
+                iam_lock_bh(p->bh);
+                idx = p->leaf = dx_get_block(path, p->at);
+                iam_unlock_bh(p->bh);
+                err = iam_path_descr(path)->id_ops->
+                        id_node_read(path->ip_container, idx, NULL, &bh);
+                if (err != 0)
+                        return err; /* Failure */
+                ++p;
+                brelse(p->bh);
+                assert_corr(p->bh != bh);
+                p->bh = bh;
+                p->entries = dx_node_get_entries(path, p);
+                p->at = iam_entry_shift(path, p->entries, !compat);
+                assert_corr(p->curidx != idx);
+                p->curidx = idx;
+                iam_lock_bh(p->bh);
+                assert_corr(p->leaf != dx_get_block(path, p->at));
+                p->leaf = dx_get_block(path, p->at);
+                iam_unlock_bh(p->bh);
+                assert_inv(dx_node_check(path, p));
+        }
+        return 1;
+}
+
+
+static inline int iam_index_advance(struct iam_path *path)
+{
+        return iam_htree_advance(iam_path_obj(path), 0, path, NULL, 0);
+}
+
+static void iam_unlock_array(struct inode *dir, struct dynlock_handle **lh)
+{
+        int i;
+
+        for (i = 0; i < DX_MAX_TREE_HEIGHT; ++i, ++lh) {
+                if (*lh != NULL) {
+                        iam_unlock_htree(dir, *lh);
+                        *lh = NULL;
+                }
+        }
+}
+/*
+ * Advance the index part of @path to point to the next leaf. Returns 1 on
+ * success, 0 when the end of the container is reached. The leaf node is
+ * locked.
+ */
+int iam_index_next(struct iam_container *c, struct iam_path *path)
+{
+        iam_ptr_t cursor;
+        struct dynlock_handle *lh[DX_MAX_TREE_HEIGHT] = { 0, };
+        int result;
+        struct inode *object;
+
+        /*
+         * Locking for iam_index_next()... is to be described.
+         */
+
+        object = c->ic_object;
+        cursor = path->ip_frame->leaf;
+
+        while (1) {
+                result = iam_index_lock(path, lh);
+                do_corr(schedule());
+                if (result < 0)
+                        break;
+
+                result = iam_check_full_path(path, 0);
+                if (result == 0 && cursor == path->ip_frame->leaf) {
+                        result = iam_index_advance(path);
+
+                        assert_corr(result == 0 ||
+                                    cursor != path->ip_frame->leaf);
+                        break;
+                }
+                do {
+                        iam_unlock_array(object, lh);
+
+                        iam_path_release(path);
+                        do_corr(schedule());
+
+                        result = __iam_path_lookup(path);
+                        if (result < 0)
+                                break;
+
+                        while (path->ip_frame->leaf != cursor) {
+                                do_corr(schedule());
+
+                                result = iam_index_lock(path, lh);
+                                do_corr(schedule());
+                                if (result < 0)
+                                        break;
+
+                                result = iam_check_full_path(path, 0);
+                                if (result != 0)
+                                        break;
+
+                                result = iam_index_advance(path);
+                                if (result == 0) {
+                                        CERROR("cannot find cursor : %u\n",
+                                                cursor);
+                                        result = -EIO;
+                                }
+                                if (result < 0)
+                                        break;
+                                result = iam_check_full_path(path, 0);
+                                if (result != 0)
+                                        break;
+                                iam_unlock_array(object, lh);
+                        }
+                } while (result == -EAGAIN);
+                if (result < 0)
+                        break;
+        }
+        iam_unlock_array(object, lh);
+        return result;
+}
+
+/*
+ * Move iterator one record right.
+ *
+ * Return value: 0: success,
+ *              +1: end of container reached
+ *             -ve: error
+ *
+ * precondition:  (it_state(it) == IAM_IT_ATTACHED ||
+ *                 it_state(it) == IAM_IT_SKEWED) && it->ii_flags&IAM_IT_MOVE
+ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED) &&
+ *                ergo(result >  0, it_state(it) == IAM_IT_DETACHED)
+ */
+int iam_it_next(struct iam_iterator *it)
+{
+        int result;
+        struct iam_path      *path;
+        struct iam_leaf      *leaf;
+        struct inode         *obj;
+        do_corr(struct iam_ikey *ik_orig);
+
+        /* assert_corr(it->ii_flags&IAM_IT_MOVE); */
+        assert_corr(it_state(it) == IAM_IT_ATTACHED ||
+                    it_state(it) == IAM_IT_SKEWED);
+
+        path = &it->ii_path;
+        leaf = &path->ip_leaf;
+        obj  = iam_path_obj(path);
+
+        assert_corr(iam_leaf_is_locked(leaf));
+
+        result = 0;
+        do_corr(ik_orig = it_at_rec(it) ?
+                iam_it_ikey_get(it, iam_path_ikey(path, 2)) : NULL);
+        if (it_before(it)) {
+                assert_corr(!iam_leaf_at_end(leaf));
+                it->ii_state = IAM_IT_ATTACHED;
+        } else {
+                if (!iam_leaf_at_end(leaf))
+                        /* advance within leaf node */
+                        iam_leaf_next(leaf);
+                /*
+                 * multiple iterations may be necessary due to empty leaves.
+                 */
+                while (result == 0 && iam_leaf_at_end(leaf)) {
+                        do_corr(schedule());
+                        /* advance index portion of the path */
+                        result = iam_index_next(iam_it_container(it), path);
+                        assert_corr(iam_leaf_is_locked(leaf));
+                        if (result == 1) {
+                                struct dynlock_handle *lh;
+                                lh = iam_lock_htree(obj, path->ip_frame->leaf,
+                                                   DLT_WRITE);
+                                if (lh != NULL) {
+                                        iam_leaf_fini(leaf);
+                                        leaf->il_lock = lh;
+                                        result = iam_leaf_load(path);
+                                        if (result == 0)
+                                                iam_leaf_start(leaf);
+                                } else
+                                        result = -ENOMEM;
+                        } else if (result == 0)
+                                /* end of container reached */
+                                result = +1;
+                        if (result != 0)
+                                iam_it_put(it);
+                }
+                if (result == 0)
+                        it->ii_state = IAM_IT_ATTACHED;
+        }
+        assert_corr(ergo(result == 0, it_state(it) == IAM_IT_ATTACHED));
+        assert_corr(ergo(result >  0, it_state(it) == IAM_IT_DETACHED));
+        assert_corr(ergo(result == 0 && ik_orig != NULL,
+                         it_ikeycmp(it, ik_orig) >= 0));
+        return result;
+}
+EXPORT_SYMBOL(iam_it_next);
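+
+/*
+ * Editor's sketch of the canonical iteration loop built from the
+ * iterator primitives above (fragment; error handling and the
+ * empty-container case are abbreviated, @c, @pd and @k are as described
+ * for iam_it_init() and iam_it_get()):
+ */
+#if 0
+        struct iam_iterator it;
+        int result;
+
+        iam_it_init(&it, c, IAM_IT_MOVE, pd);
+        result = iam_it_get_at(&it, k);
+        while (result >= 0 && it_state(&it) == IAM_IT_ATTACHED) {
+                /* iam_it_rec_get()/iam_it_key_get() are valid here */
+                result = iam_it_next(&it);
+        }
+        iam_it_put(&it);
+        iam_it_fini(&it);
+#endif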
+
+/*
+ * Return pointer to the record under iterator.
+ *
+ * precondition:  it_state(it) == IAM_IT_ATTACHED && it_at_rec(it)
+ * postcondition: it_state(it) == IAM_IT_ATTACHED
+ */
+struct iam_rec *iam_it_rec_get(const struct iam_iterator *it)
+{
+        assert_corr(it_state(it) == IAM_IT_ATTACHED);
+        assert_corr(it_at_rec(it));
+        return iam_leaf_rec(&it->ii_path.ip_leaf);
+}
+EXPORT_SYMBOL(iam_it_rec_get);
+
+static void iam_it_reccpy(struct iam_iterator *it, const struct iam_rec *r)
+{
+        struct iam_leaf *folio;
+
+        folio = &it->ii_path.ip_leaf;
+        iam_leaf_ops(folio)->rec_set(folio, r);
+}
+
+/*
+ * Replace contents of record under iterator.
+ *
+ * precondition:  it_state(it) == IAM_IT_ATTACHED &&
+ *                it->ii_flags&IAM_IT_WRITE
+ * postcondition: it_state(it) == IAM_IT_ATTACHED &&
+ *                ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...))
+ */
+int iam_it_rec_set(handle_t *h,
+                   struct iam_iterator *it, const struct iam_rec *r)
+{
+        int result;
+        struct iam_path *path;
+        struct buffer_head *bh;
+
+        assert_corr(it_state(it) == IAM_IT_ATTACHED &&
+                    it->ii_flags&IAM_IT_WRITE);
+        assert_corr(it_at_rec(it));
+
+        path = &it->ii_path;
+        bh   = path->ip_leaf.il_bh;
+        result = iam_txn_add(h, path, bh);
+        if (result == 0) {
+                iam_it_reccpy(it, r);
+                result = iam_txn_dirty(h, path, bh);
+        }
+        return result;
+}
+EXPORT_SYMBOL(iam_it_rec_set);
+
+/*
+ * Return pointer to the index key under iterator.
+ *
+ * precondition:  it_state(it) == IAM_IT_ATTACHED ||
+ *                it_state(it) == IAM_IT_SKEWED
+ */
+static struct iam_ikey *iam_it_ikey_get(const struct iam_iterator *it,
+                                        struct iam_ikey *ikey)
+{
+        assert_corr(it_state(it) == IAM_IT_ATTACHED ||
+                    it_state(it) == IAM_IT_SKEWED);
+        assert_corr(it_at_rec(it));
+        return iam_leaf_ikey(&it->ii_path.ip_leaf, ikey);
+}
+
+/*
+ * Return pointer to the key under iterator.
+ *
+ * precondition:  it_state(it) == IAM_IT_ATTACHED ||
+ *                it_state(it) == IAM_IT_SKEWED
+ */
+struct iam_key *iam_it_key_get(const struct iam_iterator *it)
+{
+        assert_corr(it_state(it) == IAM_IT_ATTACHED ||
+                    it_state(it) == IAM_IT_SKEWED);
+        assert_corr(it_at_rec(it));
+        return iam_leaf_key(&it->ii_path.ip_leaf);
+}
+EXPORT_SYMBOL(iam_it_key_get);
+
+/*
+ * Return size of key under iterator (in bytes)
+ *
+ * precondition:  it_state(it) == IAM_IT_ATTACHED ||
+ *                it_state(it) == IAM_IT_SKEWED
+ */
+int iam_it_key_size(const struct iam_iterator *it)
+{
+        assert_corr(it_state(it) == IAM_IT_ATTACHED ||
+                    it_state(it) == IAM_IT_SKEWED);
+        assert_corr(it_at_rec(it));
+        return iam_leaf_key_size(&it->ii_path.ip_leaf);
+}
+EXPORT_SYMBOL(iam_it_key_size);
+
+/*
+ * Insertion of a new record. Interaction with jbd during the non-trivial
+ * case (when a split happens) is as follows:
+ *
+ *  - the new leaf node is added to the transaction by ldiskfs_append();
+ *
+ *  - the old leaf node is added to the transaction by iam_add_rec();
+ *
+ *  - the leaf in which the insertion point ends up is marked dirty by
+ *  iam_add_rec();
+ *
+ *  - the leaf without the insertion point is marked dirty (as @new_leaf)
+ *  by iam_new_leaf();
+ *
+ *  - split index nodes are added to the transaction and marked dirty by
+ *  split_index_node();
+ *
+ *  - the "safe" index node, which is not split but into which a new pointer
+ *  is inserted, is added to the transaction and marked dirty by
+ *  split_index_node();
+ *
+ *  - the index node into which the pointer to the new leaf is inserted is
+ *  added to the transaction by split_index_node() and marked dirty by
+ *  iam_add_rec();
+ *
+ *  - the inode is marked dirty by iam_add_rec().
+ *
+ */
+
+static int iam_new_leaf(handle_t *handle, struct iam_leaf *leaf)
+{
+        int err;
+        iam_ptr_t blknr;
+        struct buffer_head   *new_leaf;
+        struct buffer_head   *old_leaf;
+        struct iam_container *c;
+        struct inode         *obj;
+        struct iam_path      *path;
+
+        assert_inv(iam_leaf_check(leaf));
+
+        c = iam_leaf_container(leaf);
+        path = leaf->il_path;
+
+        obj = c->ic_object;
+        new_leaf = ldiskfs_append(handle, obj, (__u32 *)&blknr, &err);
+        do_corr(schedule());
+        if (new_leaf != NULL) {
+                struct dynlock_handle *lh;
+
+                lh = iam_lock_htree(obj, blknr, DLT_WRITE);
+                do_corr(schedule());
+                if (lh != NULL) {
+                        iam_leaf_ops(leaf)->init_new(c, new_leaf);
+                        do_corr(schedule());
+                        old_leaf = leaf->il_bh;
+                        iam_leaf_split(leaf, &new_leaf, blknr);
+                        if (old_leaf != leaf->il_bh) {
+                                /*
+                                 * Switched to the new leaf.
+                                 */
+                                iam_leaf_unlock(leaf);
+                                leaf->il_lock = lh;
+                                path->ip_frame->leaf = blknr;
+                        } else
+                                iam_unlock_htree(obj, lh);
+                        do_corr(schedule());
+                        err = iam_txn_dirty(handle, path, new_leaf);
+                        brelse(new_leaf);
+                        if (err == 0)
+                                err = ldiskfs_mark_inode_dirty(handle, obj);
+                        do_corr(schedule());
+                } else
+                        err = -ENOMEM;
+        }
+        assert_inv(iam_leaf_check(leaf));
+        assert_inv(iam_leaf_check(&iam_leaf_path(leaf)->ip_leaf));
+        assert_inv(iam_path_check(iam_leaf_path(leaf)));
+        return err;
+}
+
+static inline void dx_set_limit(struct iam_entry *entries, unsigned value)
+{
+        ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
+}
+
+static int iam_shift_entries(struct iam_path *path,
+                         struct iam_frame *frame, unsigned count,
+                         struct iam_entry *entries, struct iam_entry *entries2,
+                         u32 newblock)
+{
+        unsigned count1;
+        unsigned count2;
+        int delta;
+
+        struct iam_frame *parent = frame - 1;
+        struct iam_ikey *pivot = iam_path_ikey(path, 3);
+
+        delta = dx_index_is_compat(path) ? 0 : +1;
+
+        count1 = count/2 + delta;
+        count2 = count - count1;
+        dx_get_ikey(path, iam_entry_shift(path, entries, count1), pivot);
+
+        dxtrace(printk("Split index %i/%i\n", count1, count2));
+
+        memcpy((char *) iam_entry_shift(path, entries2, delta),
+               (char *) iam_entry_shift(path, entries, count1),
+               count2 * iam_entry_size(path));
+
+        dx_set_count(entries2, count2 + delta);
+        dx_set_limit(entries2, dx_node_limit(path));
+
+        /*
+         * NOTE: a very subtle piece of code: a competing dx_probe() may find
+         * the 2nd level index in the root index; then we insert the new
+         * index here and set the new count in that 2nd level index. So
+         * dx_probe() may see the 2nd level index without the hash it looks
+         * for. The solution is to check the root index after we have locked
+         * the just-found 2nd level index -bzzz
+         */
+        iam_insert_key_lock(path, parent, pivot, newblock);
+
+        /*
+         * now the old and the new 2nd level index blocks contain all
+         * pointers, so dx_probe() may find it in both of them. That's
+         * OK -bzzz
+         */
+        iam_lock_bh(frame->bh);
+        dx_set_count(entries, count1);
+        iam_unlock_bh(frame->bh);
+
+        /*
+         * now the old 2nd level index block points to the first half of the
+         * leaves. It is important that dx_probe() check the root index block
+         * for changes under dx_lock_bh(frame->bh) -bzzz
+         */
+
+        return count1;
+}
+
+
+int split_index_node(handle_t *handle, struct iam_path *path,
+                     struct dynlock_handle **lh)
+{
+
+        struct iam_entry *entries;   /* old block contents */
+        struct iam_entry *entries2;  /* new block contents */
+        struct iam_frame *frame, *safe;
+        struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0};
+        u32 newblock[DX_MAX_TREE_HEIGHT] = {0};
+        struct dynlock_handle *lock[DX_MAX_TREE_HEIGHT] = {NULL,};
+        struct dynlock_handle *new_lock[DX_MAX_TREE_HEIGHT] = {NULL,};
+        struct inode *dir = iam_path_obj(path);
+        struct iam_descr *descr;
+        int nr_splet;
+        int i, err;
+
+        descr = iam_path_descr(path);
+        /*
+         * Algorithm below depends on this.
+         */
+        assert_corr(dx_root_limit(path) < dx_node_limit(path));
+
+        frame = path->ip_frame;
+        entries = frame->entries;
+
+        /*
+         * Tall-tree handling: we might have to split multiple index blocks
+         * all the way up to the tree root. The tricky point here is error
+         * handling: to avoid complicated undo/rollback we
+         *
+         *   - first allocate all necessary blocks, and
+         *
+         *   - then insert pointers into them atomically.
+         */
+
+        /*
+         * Locking: the leaf is already locked. htree-locks are acquired on
+         * all index nodes that require a split, bottom-to-top, on the "safe"
+         * node, and on all new nodes.
+         */
+
+        dxtrace(printk("using %u of %u node entries\n",
+                       dx_get_count(entries), dx_get_limit(entries)));
+
+        /* What levels need split? */
+        for (nr_splet = 0; frame >= path->ip_frames &&
+             dx_get_count(frame->entries) == dx_get_limit(frame->entries);
+             --frame, ++nr_splet) {
+                do_corr(schedule());
+                if (nr_splet == DX_MAX_TREE_HEIGHT) {
+                        /*
+                        CWARN(dir->i_sb, __FUNCTION__,
+                                     "Directory index full!\n");
+                                     */
+                        err = -ENOSPC;
+                        goto cleanup;
+                }
+        }
+
+        safe = frame;
+
+        /*
+         * Lock all nodes, bottom to top.
+         */
+        for (frame = path->ip_frame, i = nr_splet; i >= 0; --i, --frame) {
+                do_corr(schedule());
+                lock[i] = iam_lock_htree(dir, frame->curidx, DLT_WRITE);
+                if (lock[i] == NULL) {
+                        err = -ENOMEM;
+                        goto cleanup;
+                }
+        }
+
+        /*
+         * Check for concurrent index modification.
+         */
+        err = iam_check_full_path(path, 1);
+        if (err)
+                goto cleanup;
+        /*
+         * And check that the same number of nodes is to be split.
+         */
+        for (i = 0, frame = path->ip_frame; frame >= path->ip_frames &&
+             dx_get_count(frame->entries) == dx_get_limit(frame->entries);
+             --frame, ++i) {
+                ;
+        }
+        if (i != nr_splet) {
+                err = -EAGAIN;
+                goto cleanup;
+        }
+
+        /* Go back down, allocating blocks, locking them, and adding into
+         * transaction... */
+        for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
+                bh_new[i] = ldiskfs_append(handle, dir, &newblock[i], &err);
+                do_corr(schedule());
+                if (!bh_new[i] ||
+                    descr->id_ops->id_node_init(path->ip_container,
+                                                bh_new[i], 0) != 0)
+                        goto cleanup;
+                new_lock[i] = iam_lock_htree(dir, newblock[i], DLT_WRITE);
+                if (new_lock[i] == NULL) {
+                        err = -ENOMEM;
+                        goto cleanup;
+                }
+                do_corr(schedule());
+                BUFFER_TRACE(frame->bh, "get_write_access");
+                err = ldiskfs_journal_get_write_access(handle, frame->bh);
+                if (err)
+                        goto journal_error;
+        }
+        /* Add "safe" node to transaction too */
+        if (safe + 1 != path->ip_frames) {
+                do_corr(schedule());
+                err = ldiskfs_journal_get_write_access(handle, safe->bh);
+                if (err)
+                        goto journal_error;
+        }
+
+        /* Go through nodes once more, inserting pointers */
+        for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
+                unsigned count;
+                int idx;
+                struct buffer_head *bh2;
+                struct buffer_head *bh;
+
+                entries = frame->entries;
+                count = dx_get_count(entries);
+                idx = iam_entry_diff(path, frame->at, entries);
+
+                bh2 = bh_new[i];
+                entries2 = dx_get_entries(path, bh2->b_data, 0);
+
+                bh = frame->bh;
+                if (frame == path->ip_frames) {
+                        /* splitting root node. Tricky point:
+                         *
+                         * In the "normal" B-tree we'd split root *and* add
+                         * new root to the tree with pointers to the old root
+                         * and its sibling (thus introducing two new nodes).
+                         *
+                         * In htree it's enough to add one node, because
+                         * capacity of the root node is smaller than that of
+                         * non-root one.
+                         */
+                        struct iam_frame *frames;
+                        struct iam_entry *next;
+
+                        assert_corr(i == 0);
+
+                        do_corr(schedule());
+
+                        frames = path->ip_frames;
+                        memcpy((char *) entries2, (char *) entries,
+                               count * iam_entry_size(path));
+                        dx_set_limit(entries2, dx_node_limit(path));
+
+                        /* Set up root */
+                        iam_lock_bh(frame->bh);
+                        next = descr->id_ops->id_root_inc(path->ip_container,
+                                                          path, frame);
+                        dx_set_block(path, next, newblock[0]);
+                        iam_unlock_bh(frame->bh);
+
+                        do_corr(schedule());
+                        /* Shift frames in the path */
+                        memmove(frames + 2, frames + 1,
+                                (sizeof path->ip_frames) - 2 * sizeof frames[0]);
+                        /* Add new access path frame */
+                        frames[1].at = iam_entry_shift(path, entries2, idx);
+                        frames[1].entries = entries = entries2;
+                        frames[1].bh = bh2;
+                        assert_inv(dx_node_check(path, frame));
+                        ++ path->ip_frame;
+                        ++ frame;
+                        assert_inv(dx_node_check(path, frame));
+                        bh_new[0] = NULL; /* buffer head is "consumed" */
+                        err = ldiskfs_journal_get_write_access(handle, bh2);
+                        if (err)
+                                goto journal_error;
+                        do_corr(schedule());
+                } else {
+                        /* splitting non-root index node. */
+                        struct iam_frame *parent = frame - 1;
+
+                        do_corr(schedule());
+                        count = iam_shift_entries(path, frame, count,
+                                              entries, entries2, newblock[i]);
+                        /* Which index block gets the new entry? */
+                        if (idx >= count) {
+                                int d = dx_index_is_compat(path) ? 0 : +1;
+
+                                frame->at = iam_entry_shift(path, entries2,
+                                                            idx - count + d);
+                                frame->entries = entries = entries2;
+                                frame->curidx = newblock[i];
+                                swap(frame->bh, bh2);
+                                assert_corr(lock[i + 1] != NULL);
+                                assert_corr(new_lock[i] != NULL);
+                                swap(lock[i + 1], new_lock[i]);
+                                bh_new[i] = bh2;
+                                parent->at = iam_entry_shift(path,
+                                                             parent->at, +1);
+                        }
+                        assert_inv(dx_node_check(path, frame));
+                        assert_inv(dx_node_check(path, parent));
+                        dxtrace(dx_show_index("node", frame->entries));
+                        dxtrace(dx_show_index("node",
+                               ((struct dx_node *) bh2->b_data)->entries));
+                        err = ldiskfs_journal_dirty_metadata(handle, bh2);
+                        if (err)
+                                goto journal_error;
+                        do_corr(schedule());
+                        err = ldiskfs_journal_dirty_metadata(handle, parent->bh);
+                        if (err)
+                                goto journal_error;
+                }
+                do_corr(schedule());
+                err = ldiskfs_journal_dirty_metadata(handle, bh);
+                if (err)
+                        goto journal_error;
+        }
+        /*
+         * This function was called to make insertion of new leaf possible.
+         * Check that it fulfilled its obligations.
+         */
+        assert_corr(dx_get_count(path->ip_frame->entries) <
+                    dx_get_limit(path->ip_frame->entries));
+        assert_corr(lock[nr_splet] != NULL);
+        *lh = lock[nr_splet];
+        lock[nr_splet] = NULL;
+        if (nr_splet > 0) {
+                /*
+                 * Log ->i_size modification.
+                 */
+                err = ldiskfs_mark_inode_dirty(handle, dir);
+                if (err)
+                        goto journal_error;
+        }
+        goto cleanup;
+journal_error:
+        ldiskfs_std_error(dir->i_sb, err);
+
+cleanup:
+        iam_unlock_array(dir, lock);
+        iam_unlock_array(dir, new_lock);
+
+        assert_corr(err || iam_frame_is_locked(path, path->ip_frame));
+
+        do_corr(schedule());
+        for (i = 0; i < ARRAY_SIZE(bh_new); ++i) {
+                if (bh_new[i] != NULL)
+                        brelse(bh_new[i]);
+        }
+        return err;
+}
+
+static int iam_add_rec(handle_t *handle, struct iam_iterator *it,
+                       struct iam_path *path,
+                       const struct iam_key *k, const struct iam_rec *r)
+{
+        int err;
+        struct iam_leaf *leaf;
+
+        leaf = &path->ip_leaf;
+        assert_inv(iam_leaf_check(leaf));
+        assert_inv(iam_path_check(path));
+        err = iam_txn_add(handle, path, leaf->il_bh);
+        if (err == 0) {
+                do_corr(schedule());
+                if (!iam_leaf_can_add(leaf, k, r)) {
+                        struct dynlock_handle *lh = NULL;
+
+                        do {
+                                assert_corr(lh == NULL);
+                                do_corr(schedule());
+                                err = split_index_node(handle, path, &lh);
+                                if (err == -EAGAIN) {
+                                        assert_corr(lh == NULL);
+
+                                        iam_path_fini(path);
+                                        it->ii_state = IAM_IT_DETACHED;
+
+                                        do_corr(schedule());
+                                        err = iam_it_get_exact(it, k);
+                                        if (err == -ENOENT)
+                                                err = +1; /* repeat split */
+                                        else if (err == 0)
+                                                err = -EEXIST;
+                                }
+                        } while (err > 0);
+                        assert_inv(iam_path_check(path));
+                        if (err == 0) {
+                                assert_corr(lh != NULL);
+                                do_corr(schedule());
+                                err = iam_new_leaf(handle, leaf);
+                                if (err == 0)
+                                        err = iam_txn_dirty(handle, path,
+                                                            path->ip_frame->bh);
+                        }
+                        iam_unlock_htree(iam_path_obj(path), lh);
+                        do_corr(schedule());
+                }
+                if (err == 0) {
+                        iam_leaf_rec_add(leaf, k, r);
+                        err = iam_txn_dirty(handle, path, leaf->il_bh);
+                }
+        }
+        assert_inv(iam_leaf_check(leaf));
+        assert_inv(iam_leaf_check(&path->ip_leaf));
+        assert_inv(iam_path_check(path));
+        return err;
+}
+
+/*
+ * Insert new record with key @k and contents from @r, shifting records to the
+ * right. On success, iterator is positioned on the newly inserted record.
+ *
+ * precondition: it->ii_flags&IAM_IT_WRITE &&
+ *               (it_state(it) == IAM_IT_ATTACHED ||
+ *                it_state(it) == IAM_IT_SKEWED) &&
+ *               ergo(it_state(it) == IAM_IT_ATTACHED,
+ *                    it_keycmp(it, k) <= 0) &&
+ *               ergo(it_before(it), it_keycmp(it, k) > 0));
+ * postcondition: ergo(result == 0,
+ *                     it_state(it) == IAM_IT_ATTACHED &&
+ *                     it_keycmp(it, k) == 0 &&
+ *                     !memcmp(iam_it_rec_get(it), r, ...))
+ */
+int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
+                      const struct iam_key *k, const struct iam_rec *r)
+{
+        int result;
+        struct iam_path *path;
+
+        path = &it->ii_path;
+
+        assert_corr(it->ii_flags&IAM_IT_WRITE);
+        assert_corr(it_state(it) == IAM_IT_ATTACHED ||
+                    it_state(it) == IAM_IT_SKEWED);
+        assert_corr(ergo(it_state(it) == IAM_IT_ATTACHED,
+                         it_keycmp(it, k) <= 0));
+        assert_corr(ergo(it_before(it), it_keycmp(it, k) > 0));
+        result = iam_add_rec(h, it, path, k, r);
+        if (result == 0)
+                it->ii_state = IAM_IT_ATTACHED;
+        assert_corr(ergo(result == 0,
+                         it_state(it) == IAM_IT_ATTACHED &&
+                         it_keycmp(it, k) == 0));
+        return result;
+}
+EXPORT_SYMBOL(iam_it_rec_insert);
+
+/*
+ * Delete record under iterator.
+ *
+ * precondition:  it_state(it) == IAM_IT_ATTACHED &&
+ *                it->ii_flags&IAM_IT_WRITE &&
+ *                it_at_rec(it)
+ * postcondition: it_state(it) == IAM_IT_ATTACHED ||
+ *                it_state(it) == IAM_IT_DETACHED
+ */
+int iam_it_rec_delete(handle_t *h, struct iam_iterator *it)
+{
+        int result;
+        struct iam_leaf *leaf;
+        struct iam_path *path;
+
+        assert_corr(it_state(it) == IAM_IT_ATTACHED &&
+                    it->ii_flags&IAM_IT_WRITE);
+        assert_corr(it_at_rec(it));
+
+        path = &it->ii_path;
+        leaf = &path->ip_leaf;
+
+        assert_inv(iam_leaf_check(leaf));
+        assert_inv(iam_path_check(path));
+
+        result = iam_txn_add(h, path, leaf->il_bh);
+        /*
+         * no compaction for now.
+         */
+        if (result == 0) {
+                iam_rec_del(leaf, it->ii_flags&IAM_IT_MOVE);
+                result = iam_txn_dirty(h, path, leaf->il_bh);
+                if (result == 0 && iam_leaf_at_end(leaf) &&
+                    it->ii_flags&IAM_IT_MOVE) {
+                        result = iam_it_next(it);
+                        if (result > 0)
+                                result = 0;
+                }
+        }
+        assert_inv(iam_leaf_check(leaf));
+        assert_inv(iam_path_check(path));
+        assert_corr(it_state(it) == IAM_IT_ATTACHED ||
+                    it_state(it) == IAM_IT_DETACHED);
+        return result;
+}
+EXPORT_SYMBOL(iam_it_rec_delete);
+
+/*
+ * Convert iterator to cookie.
+ *
+ * precondition:  it_state(it) == IAM_IT_ATTACHED &&
+ *                iam_path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t)
+ * postcondition: it_state(it) == IAM_IT_ATTACHED
+ */
+iam_pos_t iam_it_store(const struct iam_iterator *it)
+{
+        iam_pos_t result;
+
+        assert_corr(it_state(it) == IAM_IT_ATTACHED);
+        assert_corr(it_at_rec(it));
+        assert_corr(iam_it_container(it)->ic_descr->id_ikey_size <=
+                    sizeof result);
+
+        result = 0;
+        return *(iam_pos_t *)iam_it_ikey_get(it, (void *)&result);
+}
+EXPORT_SYMBOL(iam_it_store);
+
+/*
+ * Restore iterator from cookie.
+ *
+ * precondition:  it_state(it) == IAM_IT_DETACHED && it->ii_flags&IAM_IT_MOVE &&
+ *                iam_path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t)
+ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED &&
+ *                                  iam_it_store(it) == pos)
+ */
+int iam_it_load(struct iam_iterator *it, iam_pos_t pos)
+{
+        assert_corr(it_state(it) == IAM_IT_DETACHED &&
+                    it->ii_flags&IAM_IT_MOVE);
+        assert_corr(iam_it_container(it)->ic_descr->id_ikey_size <= sizeof pos);
+        return iam_it_iget(it, (struct iam_ikey *)&pos);
+}
+EXPORT_SYMBOL(iam_it_load);
+
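+/*
+ * Illustrative use of the cookie interface (a sketch; error handling
+ * elided): a readdir()-style caller can save its position and later resume
+ * iteration from the saved record:
+ *
+ *      iam_pos_t pos = iam_it_store(it);
+ *      ... iterator detached, arbitrary amount of time passes ...
+ *      if (iam_it_load(it, pos) >= 0)
+ *              ... continue iterating with iam_it_next() ...
+ */
+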
+/***********************************************************************/
+/* invariants                                                          */
+/***********************************************************************/
+
+static inline int ptr_inside(void *base, size_t size, void *ptr)
+{
+        return (base <= ptr) && (ptr < base + size);
+}
+
+int iam_frame_invariant(struct iam_frame *f)
+{
+        return
+                (f->bh != NULL &&
+                f->bh->b_data != NULL &&
+                ptr_inside(f->bh->b_data, f->bh->b_size, f->entries) &&
+                ptr_inside(f->bh->b_data, f->bh->b_size, f->at) &&
+                f->entries <= f->at);
+}
+int iam_leaf_invariant(struct iam_leaf *l)
+{
+        return
+                l->il_bh != NULL &&
+                l->il_bh->b_data != NULL &&
+                ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_entries) &&
+                ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_at) &&
+                l->il_entries <= l->il_at;
+}
+
+int iam_path_invariant(struct iam_path *p)
+{
+        int i;
+
+        if (p->ip_container == NULL ||
+            p->ip_indirect < 0 || p->ip_indirect > DX_MAX_TREE_HEIGHT - 1 ||
+            p->ip_frame != p->ip_frames + p->ip_indirect ||
+            !iam_leaf_invariant(&p->ip_leaf))
+                return 0;
+        for (i = 0; i < ARRAY_SIZE(p->ip_frames); ++i) {
+                if (i <= p->ip_indirect) {
+                        if (!iam_frame_invariant(&p->ip_frames[i]))
+                                return 0;
+                }
+        }
+        return 1;
+}
+
+int iam_it_invariant(struct iam_iterator *it)
+{
+        return
+                (it->ii_state == IAM_IT_DETACHED ||
+                 it->ii_state == IAM_IT_ATTACHED ||
+                 it->ii_state == IAM_IT_SKEWED) &&
+                !(it->ii_flags & ~(IAM_IT_MOVE | IAM_IT_WRITE)) &&
+                ergo(it->ii_state == IAM_IT_ATTACHED ||
+                     it->ii_state == IAM_IT_SKEWED,
+                     iam_path_invariant(&it->ii_path) &&
+                     equi(it_at_rec(it), it->ii_state == IAM_IT_SKEWED));
+}
+
+/*
+ * Search container @c for record with key @k. If record is found, its data
+ * are moved into @r.
+ *
+ * Return values: 0: found, -ENOENT: not-found, -ve: error
+ */
+int iam_lookup(struct iam_container *c, const struct iam_key *k,
+               struct iam_rec *r, struct iam_path_descr *pd)
+{
+        struct iam_iterator it;
+        int result;
+
+        iam_it_init(&it, c, 0, pd);
+
+        result = iam_it_get_exact(&it, k);
+        if (result == 0)
+                /*
+                 * record with required key found, copy it into user buffer
+                 */
+                iam_reccpy(&it.ii_path.ip_leaf, r);
+        iam_it_put(&it);
+        iam_it_fini(&it);
+        return result;
+}
+EXPORT_SYMBOL(iam_lookup);
+
+/*
+ * Insert new record @r with key @k into container @c (within context of
+ * transaction @h).
+ *
+ * Return values: 0: success, -ve: error, including -EEXIST when record with
+ * given key is already present.
+ *
+ * postcondition: ergo(result == 0 || result == -EEXIST,
+ *                     iam_lookup(c, k, r2) == 0)
+ */
+int iam_insert(handle_t *h, struct iam_container *c, const struct iam_key *k,
+               const struct iam_rec *r, struct iam_path_descr *pd)
+{
+        struct iam_iterator it;
+        int result;
+
+        iam_it_init(&it, c, IAM_IT_WRITE, pd);
+
+        result = iam_it_get_exact(&it, k);
+        if (result == -ENOENT)
+                result = iam_it_rec_insert(h, &it, k, r);
+        else if (result == 0)
+                result = -EEXIST;
+        iam_it_put(&it);
+        iam_it_fini(&it);
+        return result;
+}
+EXPORT_SYMBOL(iam_insert);
+
+/*
+ * Update record with the key @k in container @c (within context of
+ * transaction @h), new record is given by @r.
+ *
+ * Return values: 0: success, -ve: error, including -ENOENT if no record with
+ * the given key found.
+ */
+int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k,
+               const struct iam_rec *r, struct iam_path_descr *pd)
+{
+        struct iam_iterator it;
+        int result;
+
+        iam_it_init(&it, c, IAM_IT_WRITE, pd);
+
+        result = iam_it_get_exact(&it, k);
+        if (result == 0)
+                iam_it_rec_set(h, &it, r);
+        iam_it_put(&it);
+        iam_it_fini(&it);
+        return result;
+}
+EXPORT_SYMBOL(iam_update);
+
+/*
+ * Delete existing record with key @k.
+ *
+ * Return values: 0: success, -ENOENT: not-found, -ve: other error.
+ *
+ * postcondition: ergo(result == 0 || result == -ENOENT,
+ *                     iam_lookup(c, k, *) == -ENOENT)
+ */
+int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k,
+               struct iam_path_descr *pd)
+{
+        struct iam_iterator it;
+        int result;
+
+        iam_it_init(&it, c, IAM_IT_WRITE, pd);
+
+        result = iam_it_get_exact(&it, k);
+        if (result == 0)
+                iam_it_rec_delete(h, &it);
+        iam_it_put(&it);
+        iam_it_fini(&it);
+        return result;
+}
+EXPORT_SYMBOL(iam_delete);
+
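+/*
+ * Worked example with hypothetical numbers: for blocksize = 4096,
+ * rootgap = 32 and size = 8, limit = (4096 - 32) / 8 = 508 while
+ * nlimit = 4096 / 8 = 512; they differ, so 508 is returned. With
+ * rootgap = 0 both quotients would be 512, and the limit is decremented
+ * to 511 to keep the root strictly smaller than a regular node.
+ */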
+int iam_root_limit(int rootgap, int blocksize, int size)
+{
+        int limit;
+        int nlimit;
+
+        limit = (blocksize - rootgap) / size;
+        nlimit = blocksize / size;
+        if (limit == nlimit)
+                limit--;
+        return limit;
+}
diff --git a/lustre/osd/osd_iam.h b/lustre/osd/osd_iam.h
new file mode 100644 (file)
index 0000000..b7c53cb
--- /dev/null
@@ -0,0 +1,1145 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see [sun.com URL with a
+ * copy of GPLv2].
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * osd_iam.h
+ * Top-level entry points into osd module
+ *
+ * Author: Wang Di <wangdi@clusterfs.com>
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#ifndef __LINUX_LUSTRE_IAM_H__
+#define __LINUX_LUSTRE_IAM_H__
+
+#include <linux/module.h>
+#include <asm/unaligned.h>
+#include <linux/dynlocks.h>
+
+/*
+ *  lustre/osd/osd_iam.h
+ */
+#ifndef CLASSERT
+#define CLASSERT(cond) do {switch(42) {case (cond): case 0: break;}} while (0)
+#endif
+/* implication */
+#define ergo(a, b) (!(a) || (b))
+/* logical equivalence */
+#define equi(a, b) (!!(a) == !!(b))
+
+enum {
+        /*
+         * Maximal number of non-leaf levels in htree. In the stock ldiskfs this
+         * is 2.
+         */
+        /*
+         * XXX reduced back to 2 to make per-node locking work.
+         */
+        DX_MAX_TREE_HEIGHT = 5,
+        /*
+         * Scratch keys used by generic code for temporaries.
+         *
+         * Allocation:
+         *
+         *         [0] reserved for assertions and as a staging area for
+         *         record keys immediately used for key comparisons.
+         *
+         *         [1] reserved for record key, stored during iteration over
+         *         node records (see dx_node_check()).
+         *
+         *         [2] reserved for leaf node operations.
+         *
+         *         [3] reserved for index operations.
+         *
+         *         [4] reserved for path->ip_ikey_target
+         *
+         */
+        DX_SCRATCH_KEYS    = 5,
+        /*
+         * Maximal format name length.
+         */
+        DX_FMT_NAME_LEN    = 16,
+};
+
+#ifdef __KERNEL__
+/* handle_t, journal_start(), journal_stop() */
+#include <linux/jbd.h>
+
+/*
+ * Debugging.
+ *
+ * Various debugging levels.
+ */
+
+#if 0
+/*
+ * Following macros are defined in config.h and are tunable through
+ * appropriate configure switches (indicated below).
+ */
+
+/*
+ * Compile basic assertions in. You want this most of the time.
+ *
+ * --{enable,disable}-ldiskfs-assert (on by default).
+ */
+#define LDISKFS_ASSERT (1)
+
+/*
+ * Compile heavier correctness checks in. You want this during development
+ * cycle.
+ *
+ * --{enable,disable}-ldiskfs-correctness (off by default).
+ */
+#define LDISKFS_CORRECTNESS (1)
+
+/*
+ * Compile heavy invariant checking in. You want this early during development
+ * or when chasing a bug.
+ *
+ * --{enable,disable}-ldiskfs-invariant (off by default).
+ */
+#define LDISKFS_INVARIANT (1)
+#endif
+
+#if defined(LDISKFS_ASSERT)
+#define LDISKFS_ASSERT_ON (1)
+#else
+#define LDISKFS_ASSERT_ON (0)
+#endif
+
+#if defined(LDISKFS_CORRECTNESS)
+#define LDISKFS_CORRECTNESS_ON (1)
+#else
+#define LDISKFS_CORRECTNESS_ON (0)
+#endif
+
+#if defined(LDISKFS_INVARIANT)
+#define LDISKFS_INVARIANT_ON (1)
+#else
+#define LDISKFS_INVARIANT_ON (0)
+#endif
+
+#ifndef assert
+#if LDISKFS_ASSERT_ON
+#define assert(test) J_ASSERT(test)
+#else
+#define assert(test) ((void)(test))
+#endif
+#endif
+
+#if LDISKFS_CORRECTNESS_ON
+#define assert_corr(test) J_ASSERT(test)
+#define do_corr(exp) exp
+#else
+#define assert_corr(test) do {;} while (0)
+#define do_corr(exp) do {;} while (0)
+#endif
+
+#if LDISKFS_INVARIANT_ON
+#define assert_inv(test) J_ASSERT(test)
+#else
+#define assert_inv(test) do {;} while (0)
+#endif
+
+/*
+ * Entry within index tree node. Consists of a key immediately followed
+ * (without padding) by a pointer to the child node.
+ *
+ * Both key and pointer are of variable size, hence incomplete type.
+ */
+struct iam_entry;
+
+struct iam_entry_compat {
+        __le32 hash;
+        __le32 block;
+};
+
+/*
+ * Incomplete type used to refer to keys in iam container.
+ *
+ * As key size can be different from container to container, iam has to use
+ * incomplete type. Clients cast pointer to iam_key to real key type and back.
+ */
+struct iam_key;
+
+/*
+ * Incomplete type used to refer to the records stored in iam containers.
+ */
+struct iam_rec;
+
+/*
+ * Key in index node. Possibly compressed. Fixed size.
+ */
+struct iam_ikey;
+
+/*
+ * Scalar type into which certain iam_key's can be uniquely mapped. Used to
+ * support interfaces like readdir(), where iteration over index has to be
+ * re-startable.
+ */
+typedef __u32 iam_ptr_t;
+
+/*
+ * Index node traversed during tree lookup.
+ */
+struct iam_frame {
+        struct buffer_head *bh;    /* buffer holding node data */
+        struct iam_entry *entries; /* array of entries */
+        struct iam_entry *at;      /* target entry, found by binary search */
+        iam_ptr_t         leaf;    /* (logical) offset of child node found by
+                                    * binary search. */
+        iam_ptr_t         curidx;  /* (logical) offset of this node. Used by
+                                    * per-node locking to detect concurrent
+                                    * splits. */
+};
+
+/*
+ * Opaque entry in the leaf node.
+ */
+struct iam_lentry;
+
+struct iam_path;
+struct iam_container;
+
+
+/* leaf node reached by tree lookup */
+struct iam_leaf {
+        struct iam_path    *il_path;
+        struct buffer_head *il_bh;
+        struct iam_lentry  *il_entries;
+        struct iam_lentry  *il_at;
+        /*
+         * Lock on a leaf node.
+         */
+        struct dynlock_handle *il_lock;
+        iam_ptr_t              il_curidx; /* logical offset of leaf node. */
+        void               *il_descr_data;
+};
+
+/*
+ * Return values of ->lookup() operation from struct iam_leaf_operations.
+ */
+enum iam_lookup_t {
+        /*
+         * lookup found a record with the key requested
+         */
+        IAM_LOOKUP_EXACT  = 0,
+        /*
+         * lookup positioned leaf on some record
+         */
+        IAM_LOOKUP_OK     = 1,
+        /*
+         * leaf was empty
+         */
+        IAM_LOOKUP_EMPTY  = 2,
+        /*
+         * lookup positioned leaf before first record
+         */
+        IAM_LOOKUP_BEFORE = 3,
+        /*
+         * Found hash may have a continuation in the next leaf.
+         */
+        IAM_LOOKUP_LAST   = 0x100
+};
+
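+/*
+ * Note that IAM_LOOKUP_LAST is a separate bit (0x100) rather than the next
+ * small integer, presumably so that it can be or-ed into one of the
+ * positional values above when the found hash may continue in the next
+ * leaf.
+ */
+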
+/*
+ * Format-specific container operations. These are called by generic iam code.
+ */
+struct iam_operations {
+        /*
+         * Returns pointer (in the same sense as pointer in index entry) to
+         * the root node.
+         */
+        __u32 (*id_root_ptr)(struct iam_container *c);
+
+        /*
+         * Check validity and consistency of index node.
+         */
+        int (*id_node_check)(struct iam_path *path, struct iam_frame *frame);
+        /*
+         * Copy some data from node header into frame. This is called when
+         * new node is loaded into frame.
+         */
+        int (*id_node_load)(struct iam_path *path, struct iam_frame *frame);
+        /*
+         * Initialize new node (stored in @bh) that is going to be added into
+         * tree.
+         */
+        int (*id_node_init)(struct iam_container *c,
+                            struct buffer_head *bh, int root);
+        int (*id_node_read)(struct iam_container *c, iam_ptr_t ptr,
+                            handle_t *h, struct buffer_head **bh);
+        /*
+         * Key comparison functions. Returns -1, 0, +1.
+         */
+        int (*id_ikeycmp)(const struct iam_container *c,
+                          const struct iam_ikey *k1,
+                          const struct iam_ikey *k2);
+        /*
+         * Modify root node when tree height increases.
+         */
+        struct iam_entry *(*id_root_inc)(struct iam_container *c,
+                                         struct iam_path *path,
+                                         struct iam_frame *frame);
+
+        struct iam_path_descr *(*id_ipd_alloc)(const struct iam_container *c,
+                                               void *area);
+        void (*id_ipd_free)(struct iam_path_descr *ipd);
+        /*
+         * Format name.
+         */
+        char id_name[DX_FMT_NAME_LEN];
+};
+
+/*
+ * Another format-specific operation vector, consisting of methods to access
+ * leaf nodes. This is separated from struct iam_operations, because it is
+ * assumed that there will be many formats with different leaf node layouts,
+ * yet the same struct iam_operations.
+ */
+struct iam_leaf_operations {
+        /*
+         * leaf operations.
+         */
+
+        /*
+         * initialize just loaded leaf node.
+         */
+        int (*init)(struct iam_leaf *p);
+        /*
+         * Format new node.
+         */
+        void (*init_new)(struct iam_container *c, struct buffer_head *bh);
+        /*
+         * Release resources.
+         */
+        void (*fini)(struct iam_leaf *l);
+        /*
+         * returns true iff leaf is positioned at the last entry.
+         */
+        int (*at_end)(const struct iam_leaf *l);
+        /* position leaf at the first entry */
+        void (*start)(struct iam_leaf *l);
+        /* move leaf to the next entry. */
+        void (*next)(struct iam_leaf *l);
+        /*
+         * return key of current leaf record. This method may return
+         * either pointer to the key stored in node, or copy key into
+         * @k buffer supplied by caller and return pointer to this
+         * buffer. The latter approach is used when keys in nodes are
+         * not stored in plain form (e.g., htree doesn't store keys at
+         * all).
+         *
+         * Caller should assume that returned pointer is only valid
+         * while leaf node is pinned and locked.
+         */
+        struct iam_ikey *(*ikey)(const struct iam_leaf *l, struct iam_ikey *k);
+        struct iam_key *(*key)(const struct iam_leaf *l);
+        /* return pointer to entry body. Pointer is valid while
+           corresponding leaf node is locked and pinned. */
+        struct iam_rec *(*rec)(const struct iam_leaf *l);
+
+        void (*key_set)(struct iam_leaf *l, const struct iam_key *k);
+        void (*rec_set)(struct iam_leaf *l, const struct iam_rec *r);
+        void (*rec_get)(const struct iam_leaf *l, struct iam_rec *r);
+
+        int (*key_cmp)(const struct iam_leaf *l, const struct iam_key *k);
+        int (*key_eq)(const struct iam_leaf *l, const struct iam_key *k);
+
+        int (*key_size)(const struct iam_leaf *l);
+        /*
+         * Search leaf @l for a record with key @k or for a place
+         * where such record is to be inserted.
+         *
+         * Scratch keys from @path can be used.
+         */
+        int (*lookup)(struct iam_leaf *l, const struct iam_key *k);
+        int (*ilookup)(struct iam_leaf *l, const struct iam_ikey *ik);
+
+        int (*can_add)(const struct iam_leaf *l,
+                       const struct iam_key *k, const struct iam_rec *r);
+        /*
+         * add rec for a leaf
+         */
+        void (*rec_add)(struct iam_leaf *l,
+                        const struct iam_key *k, const struct iam_rec *r);
+        /*
+         * remove rec for a leaf
+         */
+        void (*rec_del)(struct iam_leaf *l, int shift);
+        /*
+         * split leaf node, moving some entries into @bh (the latter currently
+         * is assumed to be empty).
+         */
+        void (*split)(struct iam_leaf *l, struct buffer_head **bh,
+                      iam_ptr_t newblknr);
+};
+
+/*
+ * Parameters, describing a flavor of iam container.
+ */
+struct iam_descr {
+        /*
+         * Size of a key in this container, in bytes.
+         */
+        size_t       id_key_size;
+        /*
+         * Size of a key in index nodes, in bytes.
+         */
+        size_t       id_ikey_size;
+        /*
+         * Size of a pointer to the next level (stored in index nodes), in
+         * bytes.
+         */
+        size_t       id_ptr_size;
+        /*
+         * Size of a record (stored in leaf nodes), in bytes.
+         */
+        size_t       id_rec_size;
+        /*
+         * Size of unused (by iam) space at the beginning of every non-root
+         * node, in bytes. Used for compatibility with ldiskfs.
+         */
+        size_t       id_node_gap;
+        /*
+         * Size of unused (by iam) space at the beginning of root node, in
+         * bytes. Used for compatibility with ldiskfs.
+         */
+        size_t       id_root_gap;
+
+        struct iam_operations           *id_ops;
+        struct iam_leaf_operations      *id_leaf_ops;
+};
+
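+/*
+ * A hypothetical fixed-format flavor might fill the descriptor roughly as
+ * follows (all names and sizes here are illustrative, not taken from an
+ * actual format):
+ *
+ *      struct iam_descr my_fmt_param = {
+ *              .id_key_size  = 8,
+ *              .id_ikey_size = 8,
+ *              .id_ptr_size  = 4,
+ *              .id_rec_size  = 16,
+ *              .id_node_gap  = 0,
+ *              .id_root_gap  = 32,
+ *              .id_ops       = &my_fmt_ops,
+ *              .id_leaf_ops  = &my_fmt_leaf_ops,
+ *      };
+ */
+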
+/*
+ * An instance of iam container.
+ */
+struct iam_container {
+        /*
+         * Underlying flat file. IO against this object is issued to
+         * read/write nodes.
+         */
+        struct inode     *ic_object;
+        /*
+         * container flavor.
+         */
+        struct iam_descr *ic_descr;
+        /*
+         * read-write lock protecting index consistency.
+         */
+        struct rw_semaphore ic_sem;
+};
+
+/*
+ * Description-specific part of iam_path. This is usually embedded into a
+ * larger structure.
+ */
+struct iam_path_descr {
+        /*
+         * Scratch-pad area for temporary keys.
+         */
+        struct iam_ikey *ipd_key_scratch[DX_SCRATCH_KEYS];
+};
+
+/*
+ * Structure to keep track of a path drilled through htree.
+ */
+struct iam_path {
+        /*
+         * Parent container.
+         */
+        struct iam_container  *ip_container;
+        /*
+         * Number of index levels minus one.
+         */
+        int                    ip_indirect;
+        /*
+         * Nodes that top-to-bottom traversal passed through.
+         */
+        struct iam_frame       ip_frames[DX_MAX_TREE_HEIGHT];
+        /*
+         * Last filled frame in ->ip_frames. Refers to the 'twig' node (one
+         * immediately above leaf).
+         */
+        struct iam_frame      *ip_frame;
+        /*
+         * Leaf node: a child of ->ip_frame.
+         */
+        struct iam_leaf        ip_leaf;
+        /*
+         * Key searched for.
+         */
+        const struct iam_key  *ip_key_target;
+        const struct iam_ikey *ip_ikey_target;
+        /*
+         * Description-specific data.
+         */
+        struct iam_path_descr *ip_data;
+};
+
+struct ldiskfs_dx_hash_info;
+
+/*
+ * Helper structure for legacy htrees.
+ */
+struct iam_path_compat {
+        struct iam_path      ipc_path;
+        struct iam_container ipc_container;
+        __u32                 ipc_scratch[DX_SCRATCH_KEYS];
+        struct ldiskfs_dx_hash_info  *ipc_hinfo;
+        struct qstr          *ipc_qstr;
+        struct iam_path_descr ipc_descr;
+        struct ldiskfs_dx_hash_info   ipc_hinfo_area;
+};
+
+#define const_max(p, q) (((p) > (q)) ? (p) : (q))
+
+enum {
+        DX_MAX_IKEY_SIZE   = 32, /* be generous */
+        /*
+         * Hack to avoid dynamic allocation and freeing of ipd.
+         */
+        DX_IPD_MAX_SIZE    = const_max(sizeof(struct iam_path_compat),
+                                       DX_MAX_IKEY_SIZE * DX_SCRATCH_KEYS +
+                                       sizeof(struct iam_path_descr))
+};
+
+/*
+ * iam cursor (iterator) api.
+ */
+
+/*
+ * States of iterator state machine.
+ */
+enum iam_it_state {
+        /* initial state */
+        IAM_IT_DETACHED,
+        /* iterator is above particular record in the container */
+        IAM_IT_ATTACHED,
+        /* iterator is positioned before record  */
+        IAM_IT_SKEWED
+};
+
+/*
+ * Flags controlling iterator functionality.
+ */
+enum iam_it_flags {
+        /*
+         * this iterator will move (iam_it_next() will be called on it)
+         */
+        IAM_IT_MOVE  = (1 << 0),
+        /*
+         * tree can be updated through this iterator.
+         */
+        IAM_IT_WRITE = (1 << 1)
+};
+
+/*
+ * Iterator.
+ *
+ * Immediately after call to iam_it_init() iterator is in "detached"
+ * (IAM_IT_DETACHED) state: it is associated with given parent container, but
+ * doesn't point to any particular record in this container.
+ *
+ * After successful call to iam_it_get() and until corresponding call to
+ * iam_it_put() iterator is in one of "active" states: IAM_IT_ATTACHED or
+ * IAM_IT_SKEWED.
+ *
+ * An active iterator can move through records in a container (provided
+ * IAM_IT_MOVE permission) in key order, can get record and key values as it
+ * passes over them, and can modify the container (provided IAM_IT_WRITE
+ * permission).
+ *
+ * Iteration may reach the end of the container, at which point the iterator
+ * switches into the IAM_IT_DETACHED state.
+ *
+ * Concurrency: iterators are supposed to be local to a thread. Interfaces
+ * below do no internal serialization of access to the iterator fields.
+ *
+ * When in a non-detached state, the iterator keeps some container nodes
+ * pinned in memory and locked (that locking may be implemented at the
+ * container granularity though). In particular, clients may assume that
+ * pointers to records and keys obtained through the iterator interface
+ * remain valid until the iterator is detached (except that they may be
+ * invalidated by subsequent operations done through the same iterator).
+ *
+ */
+struct iam_iterator {
+        /*
+         * iterator flags, taken from enum iam_it_flags.
+         */
+        __u32                 ii_flags;
+        enum iam_it_state     ii_state;
+        /*
+         * path to the record. Valid in IAM_IT_ATTACHED, and IAM_IT_SKEWED
+         * states.
+         */
+        struct iam_path       ii_path;
+};
+
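+/*
+ * Typical read-only scan over a container, using only the interfaces
+ * declared below (an illustrative sketch; error handling elided):
+ *
+ *      struct iam_iterator it;
+ *
+ *      iam_it_init(&it, c, IAM_IT_MOVE, pd);
+ *      if (iam_it_get(&it, k) >= 0) {
+ *              do {
+ *                      ... consume iam_it_rec_get(&it) ...
+ *              } while (iam_it_next(&it) == 0);
+ *      }
+ *      iam_it_put(&it);
+ *      iam_it_fini(&it);
+ */
+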
+void iam_path_init(struct iam_path *path, struct iam_container *c,
+                   struct iam_path_descr *pd);
+void iam_path_fini(struct iam_path *path);
+void iam_path_release(struct iam_path *path);
+
+void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode);
+void iam_path_compat_fini(struct iam_path_compat *path);
+
+struct iam_path_descr *iam_ipd_alloc(void *area, int keysize);
+void iam_ipd_free(struct iam_path_descr *ipd);
+
+int  iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags,
+                 struct iam_path_descr *pd);
+void iam_it_fini(struct iam_iterator *it);
+int iam_it_get(struct iam_iterator *it, const struct iam_key *k);
+int iam_it_get_at(struct iam_iterator *it, const struct iam_key *k);
+void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src);
+void iam_it_put(struct iam_iterator *it);
+int iam_it_next(struct iam_iterator *it);
+struct iam_rec *iam_it_rec_get(const struct iam_iterator *it);
+int iam_it_rec_set(handle_t *h,
+                   struct iam_iterator *it, const struct iam_rec *r);
+struct iam_key *iam_it_key_get(const struct iam_iterator *it);
+int iam_it_key_size(const struct iam_iterator *it);
+int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
+                      const struct iam_key *k, const struct iam_rec *r);
+int iam_it_rec_delete(handle_t *h, struct iam_iterator *it);
+
+typedef __u64 iam_pos_t;
+
+iam_pos_t iam_it_store(const struct iam_iterator *it);
+int iam_it_load(struct iam_iterator *it, iam_pos_t pos);
+
+int iam_lookup(struct iam_container *c, const struct iam_key *k,
+               struct iam_rec *r, struct iam_path_descr *pd);
+int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k,
+               struct iam_path_descr *pd);
+int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k,
+               const struct iam_rec *r, struct iam_path_descr *pd);
+int iam_insert(handle_t *handle, struct iam_container *c,
+               const struct iam_key *k,
+               const struct iam_rec *r, struct iam_path_descr *pd);
+/*
+ * Initialize container @c.
+ */
+int iam_container_init(struct iam_container *c,
+                       struct iam_descr *descr, struct inode *inode);
+/*
+ * Finalize container @c, release all resources.
+ */
+void iam_container_fini(struct iam_container *c);
+
+/*
+ * Determine container format.
+ */
+int iam_container_setup(struct iam_container *c);
+
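+/*
+ * Typical container life-cycle (illustrative sketch): bind the container to
+ * an inode, detect its on-disk format, use it through the functions above,
+ * then release it:
+ *
+ *      struct iam_container c;
+ *
+ *      iam_container_init(&c, descr, inode);
+ *      iam_container_setup(&c);
+ *      ... iam_lookup() / iam_insert() / iam_update() / iam_delete() ...
+ *      iam_container_fini(&c);
+ */
+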
+static inline struct iam_descr *iam_container_descr(struct iam_container *c)
+{
+        return c->ic_descr;
+}
+
+static inline struct iam_descr *iam_path_descr(const struct iam_path *p)
+{
+        return p->ip_container->ic_descr;
+}
+
+static inline struct inode *iam_path_obj(struct iam_path *p)
+{
+        return p->ip_container->ic_object;
+}
+
+static inline void iam_ikeycpy(const struct iam_container *c,
+                               struct iam_ikey *k1, const struct iam_ikey *k2)
+{
+        memcpy(k1, k2, c->ic_descr->id_ikey_size);
+}
+
+static inline size_t iam_entry_size(struct iam_path *p)
+{
+        return iam_path_descr(p)->id_ikey_size + iam_path_descr(p)->id_ptr_size;
+}
+
+static inline struct iam_entry *iam_entry_shift(struct iam_path *p,
+                                                struct iam_entry *entry,
+                                                int shift)
+{
+        void *e = entry;
+        return e + shift * iam_entry_size(p);
+}
+
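+/*
+ * For example, with hypothetical sizes id_ikey_size = 8 and id_ptr_size = 4,
+ * iam_entry_size() is 12, and iam_entry_shift(p, e, n) moves n * 12 bytes
+ * forward through the entry array (backward for negative n).
+ */
+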
+static inline struct iam_ikey *dx_get_ikey(struct iam_path *p,
+                                            struct iam_entry *entry,
+                                            struct iam_ikey *key)
+{
+        return memcpy(key, entry, iam_path_descr(p)->id_ikey_size);
+}
+
+static inline struct iam_ikey *iam_ikey_at(struct iam_path *p,
+                                           struct iam_entry *entry)
+{
+        return (struct iam_ikey *)entry;
+}
+
+static inline ptrdiff_t iam_entry_diff(struct iam_path *p,
+                                       struct iam_entry *e1,
+                                       struct iam_entry *e2)
+{
+        ptrdiff_t diff;
+
+        diff = (void *)e1 - (void *)e2;
+        assert_corr(diff / iam_entry_size(p) * iam_entry_size(p) == diff);
+        return diff / iam_entry_size(p);
+}
+
+/*
+ * Helper for the frequent case, where key was already placed into @k1 by
+ * callback.
+ */
+static inline void iam_ikeycpy0(const struct iam_container *c,
+                                struct iam_ikey *k1, const struct iam_ikey *k2)
+{
+        if (k1 != k2)
+                iam_ikeycpy(c, k1, k2);
+}
+
+static inline int iam_ikeycmp(const struct iam_container *c,
+                              const struct iam_ikey *k1,
+                              const struct iam_ikey *k2)
+{
+        return c->ic_descr->id_ops->id_ikeycmp(c, k1, k2);
+}
+
+static inline void *iam_entry_off(struct iam_entry *entry, size_t off)
+{
+        return (void *)((char *)entry + off);
+}
+
+/*
+ * Leaf helpers.
+ */
+
+static inline struct iam_path *iam_leaf_path(const struct iam_leaf *leaf)
+{
+        return leaf->il_path;
+}
+
+static inline struct iam_container *
+iam_leaf_container(const struct iam_leaf *leaf)
+{
+        return iam_leaf_path(leaf)->ip_container;
+}
+
+static inline struct iam_descr *iam_leaf_descr(const struct iam_leaf *leaf)
+{
+        return iam_leaf_container(leaf)->ic_descr;
+}
+
+static inline struct iam_leaf_operations *
+iam_leaf_ops(const struct iam_leaf *leaf)
+{
+        return iam_leaf_descr(leaf)->id_leaf_ops;
+}
+
+static inline void iam_reccpy(const struct iam_leaf *leaf,
+                              struct iam_rec *rec_dst)
+{
+        iam_leaf_ops(leaf)->rec_get(leaf, rec_dst);
+}
+
+/* XXX This stuff is put here just because it is used by iam.c. */
+static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry)
+{
+        u32 *addr;
+
+        addr = iam_entry_off(entry, iam_path_descr(p)->id_ikey_size);
+        return le32_to_cpu(get_unaligned(addr)) & 0x00ffffff;
+}
+
+static inline void dx_set_block(struct iam_path *p,
+                                struct iam_entry *entry, unsigned value)
+{
+        u32 *addr;
+
+        addr = iam_entry_off(entry, iam_path_descr(p)->id_ikey_size);
+        put_unaligned(cpu_to_le32(value), addr);
+}
+
+static inline void dx_set_ikey(struct iam_path *p, struct iam_entry *entry,
+                               const struct iam_ikey *key)
+{
+        iam_ikeycpy(p->ip_container, iam_entry_off(entry, 0), key);
+}
+
+struct dx_map_entry
+{
+        u32 hash;
+        u32 offs;
+};
+
+struct fake_dirent {
+        __le32 inode;
+        __le16 rec_len;
+        u8 name_len;
+        u8 file_type;
+};
+
+struct dx_countlimit {
+        __le16 limit;
+        __le16 count;
+};
+
+/*
+ * dx_root_info is laid out so that if it should somehow get overlaid by a
+ * dirent the two low bits of the hash version will be zero.  Therefore, the
+ * hash version mod 4 should never be 0.  Sincerely, the paranoia department.
+ */
+
+struct dx_root {
+        struct fake_dirent dot;
+        char dot_name[4];
+        struct fake_dirent dotdot;
+        char dotdot_name[4];
+        struct dx_root_info
+        {
+                __le32 reserved_zero;
+                u8 hash_version;
+                u8 info_length; /* 8 */
+                u8 indirect_levels;
+                u8 unused_flags;
+        }
+        info;
+        struct {} entries[0];
+};
+
+struct dx_node
+{
+        struct fake_dirent fake;
+        struct {} entries[0];
+};
+
+
+static inline unsigned dx_get_count(struct iam_entry *entries)
+{
+        return le16_to_cpu(((struct dx_countlimit *) entries)->count);
+}
+
+static inline unsigned dx_get_limit(struct iam_entry *entries)
+{
+        return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
+}
+
+static inline void dx_set_count(struct iam_entry *entries, unsigned value)
+{
+        ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
+}
+
+static inline unsigned dx_node_limit(struct iam_path *p)
+{
+        struct iam_descr *param = iam_path_descr(p);
+        unsigned entry_space   = iam_path_obj(p)->i_sb->s_blocksize -
+                param->id_node_gap;
+        return entry_space / (param->id_ikey_size + param->id_ptr_size);
+}
+
+static inline unsigned dx_root_limit(struct iam_path *p)
+{
+        struct iam_descr *param = iam_path_descr(p);
+        unsigned limit = iam_path_obj(p)->i_sb->s_blocksize -
+                param->id_root_gap;
+        limit /= (param->id_ikey_size + param->id_ptr_size);
+        if (limit == dx_node_limit(p))
+                limit--;
+        return limit;
+}
+
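+/*
+ * For example, with a hypothetical 4096-byte block, id_ikey_size = 8,
+ * id_ptr_size = 4 and id_node_gap = 16, dx_node_limit() is
+ * (4096 - 16) / 12 = 340 entries. If id_root_gap yields the same quotient,
+ * dx_root_limit() decrements it by one, preserving the dx_root_limit() <
+ * dx_node_limit() invariant that split_index_node() asserts.
+ */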
+
+static inline struct iam_entry *dx_get_entries(struct iam_path *path,
+                                               void *data, int root)
+{
+        struct iam_descr *param = iam_path_descr(path);
+        return data + (root ? param->id_root_gap : param->id_node_gap);
+}
+
+
+static inline struct iam_entry *dx_node_get_entries(struct iam_path *path,
+                                                    struct iam_frame *frame)
+{
+        return dx_get_entries(path,
+                              frame->bh->b_data, frame == path->ip_frames);
+}
+
+static inline struct iam_ikey *iam_path_ikey(const struct iam_path *path,
+                                             int nr)
+{
+        assert(0 <= nr && nr < ARRAY_SIZE(path->ip_data->ipd_key_scratch));
+        return path->ip_data->ipd_key_scratch[nr];
+}
+
+static inline struct dynlock *path_dynlock(struct iam_path *path)
+{
+        return &LDISKFS_I(iam_path_obj(path))->i_htree_lock;
+}
+
+static inline int iam_leaf_is_locked(const struct iam_leaf *leaf)
+{
+        int result;
+
+        result = dynlock_is_locked(path_dynlock(leaf->il_path),
+                                   leaf->il_curidx);
+        if (!result)
+                dump_stack();
+        return result;
+}
+
+static inline int iam_frame_is_locked(struct iam_path *path,
+                                      const struct iam_frame *frame)
+{
+        int result;
+
+        result = dynlock_is_locked(path_dynlock(path), frame->curidx);
+        if (!result)
+                dump_stack();
+        return result;
+}
+
+int dx_lookup_lock(struct iam_path *path,
+                   struct dynlock_handle **dl, enum dynlock_type lt);
+
+void dx_insert_block(struct iam_path *path, struct iam_frame *frame,
+                     u32 hash, u32 block);
+int dx_index_is_compat(struct iam_path *path);
+
+int ldiskfs_htree_next_block(struct inode *dir, __u32 hash,
+                          struct iam_path *path, __u32 *start_hash);
+
+struct buffer_head *ldiskfs_append(handle_t *handle, struct inode *inode,
+                                u32 *block, int *err);
+int split_index_node(handle_t *handle, struct iam_path *path,
+                     struct dynlock_handle **lh);
+struct ldiskfs_dir_entry_2 *split_entry(struct inode *dir,
+                                     struct ldiskfs_dir_entry_2 *de,
+                                     unsigned long ino, mode_t mode,
+                                     const char *name, int namelen);
+struct ldiskfs_dir_entry_2 *find_insertion_point(struct inode *dir,
+                                              struct buffer_head *bh,
+                                              const char *name, int namelen);
+struct ldiskfs_dir_entry_2 *move_entries(struct inode *dir,
+                                      struct ldiskfs_dx_hash_info *hinfo,
+                                      struct buffer_head **bh1,
+                                      struct buffer_head **bh2,
+                                      __u32 *delim_hash);
+
+extern struct iam_descr iam_htree_compat_param;
+
+struct dynlock_handle *dx_lock_htree(struct inode *dir, unsigned long value,
+                                     enum dynlock_type lt);
+void dx_unlock_htree(struct inode *dir, struct dynlock_handle *lh);
+
+/*
+ * external
+ */
+void iam_container_write_lock(struct iam_container *c);
+void iam_container_write_unlock(struct iam_container *c);
+
+void iam_container_read_lock(struct iam_container *c);
+void iam_container_read_unlock(struct iam_container *c);
+
+int iam_index_next(struct iam_container *c, struct iam_path *p);
+int iam_read_leaf(struct iam_path *p);
+
+int iam_node_read(struct iam_container *c, iam_ptr_t ptr,
+                  handle_t *handle, struct buffer_head **bh);
+int iam_lvar_create(struct inode *obj, int keysize, int ptrsize,
+                    int recsize, handle_t *handle);
+int iam_lfix_create(struct inode *obj, int keysize, int ptrsize,
+                    int recsize, handle_t *handle);
+
+#ifndef swap
+#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
+#endif
+
+#ifdef DX_DEBUG
+#define dxtrace(command) command
+#else
+#define dxtrace(command)
+#endif
+
+#define BH_DXLock        25
+#define DX_DEBUG (1)
+#if DX_DEBUG
+static struct iam_lock_stats {
+        unsigned dls_bh_lock;
+        unsigned dls_bh_busy;
+        unsigned dls_bh_again;
+        unsigned dls_bh_full_again;
+} iam_lock_stats = { 0, };
+#define DX_DEVAL(x) x
+#else
+#define DX_DEVAL(x)
+#endif
+
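+/*
+ * iam_lock_bh()/iam_unlock_bh() implement a simple bit spin-lock on a
+ * private buffer_head state bit (BH_DXLock): spin with cpu_relax()
+ * until the bit is acquired, and pair clear_bit() with a memory barrier
+ * on release. The locking is compiled out on !CONFIG_SMP.
+ */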
+static inline void iam_lock_bh(struct buffer_head volatile *bh)
+{
+        DX_DEVAL(iam_lock_stats.dls_bh_lock++);
+#ifdef CONFIG_SMP
+        while (test_and_set_bit(BH_DXLock, &bh->b_state)) {
+                DX_DEVAL(iam_lock_stats.dls_bh_busy++);
+                while (test_bit(BH_DXLock, &bh->b_state))
+                        cpu_relax();
+        }
+#endif
+}
+
+static inline void iam_unlock_bh(struct buffer_head *bh)
+{
+#ifdef CONFIG_SMP
+        smp_mb__before_clear_bit();
+        clear_bit(BH_DXLock, &bh->b_state);
+#endif
+}
+
+
+void iam_insert_key(struct iam_path *path, struct iam_frame *frame,
+                    const struct iam_ikey *key, iam_ptr_t ptr);
+
+void iam_insert_key_lock(struct iam_path *path, struct iam_frame *frame,
+                         const struct iam_ikey *key, iam_ptr_t ptr);
+
+
+int  iam_leaf_at_end(const struct iam_leaf *l);
+void iam_leaf_next(struct iam_leaf *folio);
+int iam_leaf_can_add(const struct iam_leaf *l,
+                     const struct iam_key *k, const struct iam_rec *r);
+
+struct iam_path *iam_leaf_path(const struct iam_leaf *leaf);
+struct iam_container *iam_leaf_container(const struct iam_leaf *leaf);
+struct iam_descr *iam_leaf_descr(const struct iam_leaf *leaf);
+struct iam_leaf_operations *iam_leaf_ops(const struct iam_leaf *leaf);
+
+
+int iam_node_read(struct iam_container *c, iam_ptr_t ptr,
+                  handle_t *h, struct buffer_head **bh);
+
+/*
+ * Container format.
+ */
+struct iam_format {
+        /*
+         * Method called to recognize the container format. It should return
+         * true iff container @c conforms to this format. This method may do
+         * IO to read container pages.
+         *
+         * If the container is recognized, this method sets the operation
+         * vectors ->id_ops and ->id_leaf_ops in the container description
+         * (c->ic_descr), and fills in the other description fields.
+         */
+        int (*if_guess)(struct iam_container *c);
+        /*
+         * Linkage into global list of container formats.
+         */
+        struct list_head if_linkage;
+};
+
+void iam_format_register(struct iam_format *fmt);
+int iam_root_limit(int rootgap, int blocksize, int size);
+
+void iam_lfix_format_init(void);
+void iam_lvar_format_init(void);
+void iam_htree_format_init(void);
+
+struct iam_private_info;
+
+void ldiskfs_iam_release(struct file *filp, struct inode *inode);
+
+int iam_uapi_ioctl(struct inode * inode, struct file * filp, unsigned int cmd,
+                   unsigned long arg);
+
+/* dir.c 
+#if LDISKFS_INVARIANT_ON
+extern int ldiskfs_check_dir_entry(const char *, struct inode *,
+                                struct ldiskfs_dir_entry_2 *,
+                                struct buffer_head *, unsigned long);
+#else
+static inline int ldiskfs_check_dir_entry(const char * function,
+                                       struct inode * dir,
+                                       struct ldiskfs_dir_entry_2 * de,
+                                       struct buffer_head * bh,
+                                       unsigned long offset)
+{
+        return 1;
+}
+#endif
+*/
+
+/* __KERNEL__ */
+#endif
+
+/*
+ * User level API. Copy exists in lustre/lustre/tests/iam_ut.c
+ */
+
+struct iam_uapi_info {
+        __u16 iui_keysize;
+        __u16 iui_recsize;
+        __u16 iui_ptrsize;
+        __u16 iui_height;
+        char  iui_fmt_name[DX_FMT_NAME_LEN];
+};
+
+struct iam_uapi_op {
+        void *iul_key;
+        void *iul_rec;
+};
+
+struct iam_uapi_it {
+        struct iam_uapi_op iui_op;
+        __u16              iui_state;
+};
+
+enum iam_ioctl_cmd {
+        IAM_IOC_INIT     = _IOW('i', 1, struct iam_uapi_info),
+        IAM_IOC_GETINFO  = _IOR('i', 2, struct iam_uapi_info),
+        IAM_IOC_INSERT   = _IOR('i', 3, struct iam_uapi_op),
+        IAM_IOC_LOOKUP   = _IOWR('i', 4, struct iam_uapi_op),
+        IAM_IOC_DELETE   = _IOR('i', 5, struct iam_uapi_op),
+        IAM_IOC_IT_START = _IOR('i', 6, struct iam_uapi_it),
+        IAM_IOC_IT_NEXT  = _IOW('i', 7, struct iam_uapi_it),
+        IAM_IOC_IT_STOP  = _IOR('i', 8, struct iam_uapi_it),
+
+        IAM_IOC_POLYMORPH = _IOR('i', 9, unsigned long)
+};
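+
+/*
+ * A minimal sketch of driving this interface from user level (purely
+ * illustrative: @fd, @key and @rec are assumed to be set up by the
+ * caller, with buffers sized from the IAM_IOC_GETINFO reply; see the
+ * iam_ut.c copy mentioned above for real usage):
+ *
+ *        struct iam_uapi_info ui;
+ *        struct iam_uapi_op   op = { .iul_key = key, .iul_rec = rec };
+ *
+ *        if (ioctl(fd, IAM_IOC_INIT, &ui) == 0 &&
+ *            ioctl(fd, IAM_IOC_GETINFO, &ui) == 0)
+ *                ioctl(fd, IAM_IOC_INSERT, &op);
+ */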
+
+/* __LINUX_LUSTRE_IAM_H__ */
+#endif
diff --git a/lustre/osd/osd_iam_lfix.c b/lustre/osd/osd_iam_lfix.c
new file mode 100644 (file)
index 0000000..22a60e2
--- /dev/null
@@ -0,0 +1,852 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see [sun.com URL with a
+ * copy of GPLv2].
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * iam_lfix.c
+ * Implementation of the iam format for fixed-size records.
+ *
+ * Author: Wang Di <wangdi@clusterfs.com>
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#include <linux/types.h>
+#include <linux/jbd.h>
+/* ldiskfs_error() */
+#include <linux/ldiskfs_fs.h>
+#include "osd_internal.h"
+
+/*
+ * Leaf operations.
+ */
+
+enum {
+        IAM_LEAF_HEADER_MAGIC = 0x1976 /* This is duplicated in
+                                        * lustre/utils/create_iam.c */
+};
+
+/* This is duplicated in lustre/utils/create_iam.c */
+struct iam_leaf_head {
+        __le16 ill_magic;
+        __le16 ill_count;
+};
+
+static inline int iam_lfix_entry_size(const struct iam_leaf *l)
+{
+        return iam_leaf_descr(l)->id_key_size + iam_leaf_descr(l)->id_rec_size;
+}
+
+static inline struct iam_lentry *
+iam_lfix_shift(const struct iam_leaf *l, struct iam_lentry *entry, int shift)
+{
+        return (void *)entry + shift * iam_lfix_entry_size(l);
+}
+
+static inline struct iam_key *iam_leaf_key_at(struct iam_lentry *entry)
+{
+        return (struct iam_key *)entry;
+}
+
+static inline int lfix_keycmp(const struct iam_container *c,
+                              const struct iam_key *k1,
+                              const struct iam_key *k2)
+{
+        return memcmp(k1, k2, c->ic_descr->id_key_size);
+}
+
+static struct iam_leaf_head *iam_get_head(const struct iam_leaf *l)
+{
+        return (struct iam_leaf_head *)l->il_bh->b_data;
+}
+
+static struct iam_lentry *iam_entries(const struct buffer_head *bh)
+{
+        return (void *)bh->b_data + sizeof(struct iam_leaf_head);
+}
+
+static struct iam_lentry *iam_get_lentries(const struct iam_leaf *l)
+{
+        return iam_entries(l->il_bh);
+}
+
+static int leaf_count_limit(const struct iam_leaf *leaf)
+{
+        int free_space;
+
+        free_space = iam_leaf_container(leaf)->ic_object->i_sb->s_blocksize;
+        free_space -= sizeof(struct iam_leaf_head);
+        return free_space / iam_lfix_entry_size(leaf);
+}
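+
+/*
+ * E.g. with a 4096-byte block, 8-byte keys and 8-byte records this is
+ * (4096 - 4) / 16 = 255 entries per leaf (struct iam_leaf_head is
+ * 4 bytes), matching LFIX_LEAF_RECNO below.
+ */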
+
+static int lentry_count_get(const struct iam_leaf *leaf)
+{
+        return le16_to_cpu(iam_get_head(leaf)->ill_count);
+}
+
+static void lentry_count_set(struct iam_leaf *leaf, unsigned count)
+{
+        assert_corr(0 <= count && count <= leaf_count_limit(leaf));
+        iam_get_head(leaf)->ill_count = cpu_to_le16(count);
+}
+
+static struct iam_lentry *iam_lfix_get_end(const struct iam_leaf *l);
+
+#if LDISKFS_CORRECTNESS_ON || LDISKFS_INVARIANT_ON
+static int iam_leaf_at_rec(const struct iam_leaf *folio)
+{
+        return
+                iam_get_lentries(folio) <= folio->il_at &&
+                folio->il_at < iam_lfix_get_end(folio);
+}
+#endif
+
+static struct iam_ikey *iam_lfix_ikey(const struct iam_leaf *l,
+                                      struct iam_ikey *key)
+{
+        void *ie = l->il_at;
+        assert_corr(iam_leaf_at_rec(l));
+        return (struct iam_ikey*)ie;
+}
+
+static struct iam_key *iam_lfix_key(const struct iam_leaf *l)
+{
+        void *ie = l->il_at;
+        assert_corr(iam_leaf_at_rec(l));
+        return (struct iam_key*)ie;
+}
+
+static int iam_lfix_key_size(const struct iam_leaf *l)
+{
+        return iam_leaf_descr(l)->id_key_size;
+}
+
+static void iam_lfix_start(struct iam_leaf *l)
+{
+        l->il_at = iam_get_lentries(l);
+}
+
+static inline ptrdiff_t iam_lfix_diff(const struct iam_leaf *l,
+                                      const struct iam_lentry *e1,
+                                      const struct iam_lentry *e2)
+{
+        ptrdiff_t diff;
+        int esize;
+
+        esize = iam_lfix_entry_size(l);
+        diff = (void *)e1 - (void *)e2;
+        assert_corr(diff / esize * esize == diff);
+        return diff / esize;
+}
+
+static int iam_lfix_init(struct iam_leaf *l)
+{
+        int result;
+        struct iam_leaf_head *ill;
+        int count;
+
+        assert_corr(l->il_bh != NULL);
+
+        ill = iam_get_head(l);
+        count = le16_to_cpu(ill->ill_count);
+        if (ill->ill_magic == le16_to_cpu(IAM_LEAF_HEADER_MAGIC) &&
+            0 <= count && count <= leaf_count_limit(l)) {
+                l->il_at = l->il_entries = iam_get_lentries(l);
+                result = 0;
+        } else {
+                struct inode *obj;
+
+                obj = iam_leaf_container(l)->ic_object;
+                CERROR("Wrong magic in node %llu (#%lu): %#x != %#x or "
+                       "wrong count: %i (%i)\n",
+                       (unsigned long long)l->il_bh->b_blocknr, obj->i_ino,
+                       ill->ill_magic, le16_to_cpu(IAM_LEAF_HEADER_MAGIC),
+                       count, leaf_count_limit(l));
+                result = -EIO;
+        }
+        return result;
+}
+
+static void iam_lfix_fini(struct iam_leaf *l)
+{
+        l->il_entries = l->il_at = NULL;
+}
+
+static struct iam_lentry *iam_lfix_get_end(const struct iam_leaf *l)
+{
+        int count = lentry_count_get(l);
+        struct iam_lentry *ile = iam_lfix_shift(l, l->il_entries, count);
+
+        return ile;
+}
+
+struct iam_rec *iam_lfix_rec(const struct iam_leaf *l)
+{
+        void *e = l->il_at;
+        assert_corr(iam_leaf_at_rec(l));
+        return e + iam_leaf_descr(l)->id_key_size;
+}
+
+static void iam_lfix_next(struct iam_leaf *l)
+{
+        assert_corr(iam_leaf_at_rec(l));
+        l->il_at = iam_lfix_shift(l, l->il_at, 1);
+}
+
+/*
+ * Bug chasing.
+ */
+int lfix_dump = 0;
+EXPORT_SYMBOL(lfix_dump);
+
+static char hdigit(char ch)
+{
+        static char d[] = "0123456789abcdef";
+        return d[ch & 0xf];
+}
+
+static char *hex(char ch, char *area)
+{
+        area[0] = hdigit(ch >> 4);
+        area[1] = hdigit(ch);
+        area[2] = 0;
+        return area;
+}
+
+static void l_print(struct iam_leaf *leaf, struct iam_lentry *entry)
+{
+        int i;
+        char *area;
+        char h[3];
+
+        area = (char *)entry;
+        printk(KERN_EMERG "[");
+        for (i = iam_lfix_key_size(leaf); i > 0; --i, ++area)
+                printk("%s", hex(*area, h));
+        printk("]-(");
+        for (i = iam_leaf_descr(leaf)->id_rec_size; i > 0; --i, ++area)
+                printk("%s", hex(*area, h));
+        printk(")\n");
+}
+
+static void lfix_print(struct iam_leaf *leaf)
+{
+        struct iam_lentry *entry;
+        int count;
+        int i;
+
+        entry = leaf->il_entries;
+        count = lentry_count_get(leaf);
+        printk(KERN_EMERG "lfix: %p %p %d\n", leaf, leaf->il_at, count);
+        for (i = 0; i < count; ++i, entry = iam_lfix_shift(leaf, entry, 1))
+                l_print(leaf, entry);
+}
+
+static int iam_lfix_lookup(struct iam_leaf *l, const struct iam_key *k)
+{
+        struct iam_lentry *p, *q, *m, *t;
+        struct iam_container *c;
+        int count;
+        int result;
+
+        count = lentry_count_get(l);
+        if (count == 0)
+                return IAM_LOOKUP_EMPTY;
+
+        result = IAM_LOOKUP_OK;
+        c = iam_leaf_container(l);
+
+        p = l->il_entries;
+        q = iam_lfix_shift(l, p, count - 1);
+        if (lfix_keycmp(c, k, iam_leaf_key_at(p)) < 0) {
+                /*
+                 * @k is less than the least key in the leaf
+                 */
+                l->il_at = p;
+                result = IAM_LOOKUP_BEFORE;
+        } else if (lfix_keycmp(c, iam_leaf_key_at(q), k) <= 0) {
+                l->il_at = q;
+        } else {
+                /*
+                 * Binary search in the style of Dijkstra's EWD1293:
+                 * maintain the invariant key(p) <= @k < key(q) while
+                 * halving the interval [p, q].
+                 */
+                while (iam_lfix_shift(l, p, 1) != q) {
+                        m = iam_lfix_shift(l, p, iam_lfix_diff(l, q, p) / 2);
+                        assert_corr(p < m && m < q);
+                        if (lfix_keycmp(c, iam_leaf_key_at(m), k) <= 0)
+                                p = m;
+                        else
+                                q = m;
+                }
+                assert_corr(lfix_keycmp(c, iam_leaf_key_at(p), k) <= 0 &&
+                            lfix_keycmp(c, k, iam_leaf_key_at(q)) < 0);
+                /*
+                 * skip over records with duplicate keys.
+                 */
+                while (p > l->il_entries) {
+                        t = iam_lfix_shift(l, p, -1);
+                        if (lfix_keycmp(c, iam_leaf_key_at(t), k) == 0)
+                                p = t;
+                        else
+                                break;
+                }
+                l->il_at = p;
+        }
+        assert_corr(iam_leaf_at_rec(l));
+
+        if (lfix_keycmp(c, iam_leaf_key_at(l->il_at), k) == 0)
+                result = IAM_LOOKUP_EXACT;
+
+        if (lfix_dump)
+                lfix_print(l);
+
+        return result;
+}
+
+static int iam_lfix_ilookup(struct iam_leaf *l, const struct iam_ikey *ik)
+{
+        assert(0);
+        return IAM_LOOKUP_OK;
+}
+
+static void iam_lfix_key_set(struct iam_leaf *l, const struct iam_key *k)
+{
+        assert_corr(iam_leaf_at_rec(l));
+        memcpy(iam_leaf_key_at(l->il_at), k, iam_leaf_descr(l)->id_key_size);
+}
+
+static int iam_lfix_key_cmp(const struct iam_leaf *l, const struct iam_key *k)
+{
+        return lfix_keycmp(iam_leaf_container(l), iam_leaf_key_at(l->il_at), k);
+}
+
+static int iam_lfix_key_eq(const struct iam_leaf *l, const struct iam_key *k)
+{
+        return !lfix_keycmp(iam_leaf_container(l),
+                            iam_leaf_key_at(l->il_at), k);
+}
+
+static void iam_lfix_rec_set(struct iam_leaf *l, const struct iam_rec *r)
+{
+        assert_corr(iam_leaf_at_rec(l));
+        memcpy(iam_lfix_rec(l), r, iam_leaf_descr(l)->id_rec_size);
+}
+
+static void iam_lfix_rec_get(const struct iam_leaf *l, struct iam_rec *r)
+{
+        assert_corr(iam_leaf_at_rec(l));
+        memcpy(r, iam_lfix_rec(l), iam_leaf_descr(l)->id_rec_size);
+}
+
+static void iam_lfix_rec_add(struct iam_leaf *leaf,
+                             const struct iam_key *k, const struct iam_rec *r)
+{
+        struct iam_lentry *end;
+        struct iam_lentry *cur;
+        struct iam_lentry *start;
+        ptrdiff_t diff;
+        int count;
+
+        assert_corr(iam_leaf_can_add(leaf, k, r));
+
+        count = lentry_count_get(leaf);
+        /*
+         * This branch handles two exceptional cases:
+         *
+         *   - leaf positioned beyond last record, and
+         *
+         *   - empty leaf.
+         */
+        if (!iam_leaf_at_end(leaf)) {
+                end   = iam_lfix_get_end(leaf);
+                cur   = leaf->il_at;
+                if (lfix_keycmp(iam_leaf_container(leaf),
+                               k, iam_leaf_key_at(cur)) >= 0)
+                        iam_lfix_next(leaf);
+                else
+                        /*
+                         * Another exceptional case: insertion with a key
+                         * less than the least key in the leaf.
+                         */
+                        assert_corr(cur == leaf->il_entries);
+
+                start = leaf->il_at;
+                diff  = (void *)end - (void *)start;
+                assert_corr(diff >= 0);
+                memmove(iam_lfix_shift(leaf, start, 1), start, diff);
+        }
+        lentry_count_set(leaf, count + 1);
+        iam_lfix_key_set(leaf, k);
+        iam_lfix_rec_set(leaf, r);
+        assert_corr(iam_leaf_at_rec(leaf));
+}
+
+static void iam_lfix_rec_del(struct iam_leaf *leaf, int shift)
+{
+        struct iam_lentry *next, *end;
+        int count;
+        ptrdiff_t diff;
+
+        assert_corr(iam_leaf_at_rec(leaf));
+
+        count = lentry_count_get(leaf);
+        end = iam_lfix_get_end(leaf);
+        next = iam_lfix_shift(leaf, leaf->il_at, 1);
+        diff = (void *)end - (void *)next;
+        memmove(leaf->il_at, next, diff);
+
+        lentry_count_set(leaf, count - 1);
+}
+
+static int iam_lfix_can_add(const struct iam_leaf *l,
+                            const struct iam_key *k, const struct iam_rec *r)
+{
+        return lentry_count_get(l) < leaf_count_limit(l);
+}
+
+static int iam_lfix_at_end(const struct iam_leaf *folio)
+{
+        return folio->il_at == iam_lfix_get_end(folio);
+}
+
+static void iam_lfix_init_new(struct iam_container *c, struct buffer_head *bh)
+{
+        struct iam_leaf_head *hdr;
+
+        hdr = (struct iam_leaf_head*)bh->b_data;
+        hdr->ill_magic = cpu_to_le16(IAM_LEAF_HEADER_MAGIC);
+        hdr->ill_count = cpu_to_le16(0);
+}
+
+static void iam_lfix_split(struct iam_leaf *l, struct buffer_head **bh,
+                           iam_ptr_t new_blknr)
+{
+        struct iam_path       *path;
+        struct iam_leaf_head  *hdr;
+        const struct iam_ikey *pivot;
+        struct buffer_head    *new_leaf;
+
+        unsigned count;
+        unsigned split;
+
+        void *start;
+        void *finis;
+
+        new_leaf = *bh;
+        path = iam_leaf_path(l);
+
+        hdr = (void *)new_leaf->b_data;
+
+        count = lentry_count_get(l);
+        split = count / 2;
+
+        start = iam_lfix_shift(l, iam_get_lentries(l), split);
+        finis = iam_lfix_shift(l, iam_get_lentries(l), count);
+
+        pivot = (const struct iam_ikey *)iam_leaf_key_at(start);
+
+        memmove(iam_entries(new_leaf), start, finis - start);
+        hdr->ill_count = cpu_to_le16(count - split);
+        lentry_count_set(l, split);
+        if ((void *)l->il_at >= start) {
+                /*
+                 * insertion point moves into new leaf.
+                 */
+                int shift;
+                int result;
+
+                shift = iam_lfix_diff(l, l->il_at, start);
+                *bh = l->il_bh;
+                l->il_bh = new_leaf;
+                l->il_curidx = new_blknr;
+                result = iam_lfix_init(l);
+                /*
+                 * init cannot fail, as node was just initialized.
+                 */
+                assert_corr(result == 0);
+                l->il_at = iam_lfix_shift(l, iam_get_lentries(l), shift);
+        }
+        /*
+         * Insert pointer to the new node (together with the least key in
+         * the node) into index node.
+         */
+        iam_insert_key_lock(path, path->ip_frame, pivot, new_blknr);
+}
+
+static struct iam_leaf_operations iam_lfix_leaf_ops = {
+        .init           = iam_lfix_init,
+        .init_new       = iam_lfix_init_new,
+        .fini           = iam_lfix_fini,
+        .start          = iam_lfix_start,
+        .next           = iam_lfix_next,
+        .key            = iam_lfix_key,
+        .ikey           = iam_lfix_ikey,
+        .rec            = iam_lfix_rec,
+        .key_set        = iam_lfix_key_set,
+        .key_cmp        = iam_lfix_key_cmp,
+        .key_eq         = iam_lfix_key_eq,
+        .key_size       = iam_lfix_key_size,
+        .rec_set        = iam_lfix_rec_set,
+        .rec_get        = iam_lfix_rec_get,
+        .lookup         = iam_lfix_lookup,
+        .ilookup        = iam_lfix_ilookup,
+        .at_end         = iam_lfix_at_end,
+        .rec_add        = iam_lfix_rec_add,
+        .rec_del        = iam_lfix_rec_del,
+        .can_add        = iam_lfix_can_add,
+        .split          = iam_lfix_split
+};
+
+/*
+ * Index operations.
+ */
+
+enum {
+        /* This is duplicated in lustre/utils/create_iam.c */
+        /*
+         * Then shalt thou see the dew-BEDABBLED wretch
+         * Turn, and return, indenting with the way;
+         * Each envious brier his weary legs doth scratch,
+         * Each shadow makes him stop, each murmur stay:
+         * For misery is trodden on by many,
+         * And being low never relieved by any.
+         */
+        IAM_LFIX_ROOT_MAGIC = 0xbedabb1edULL // d01efull
+};
+
+/* This is duplicated in lustre/utils/create_iam.c */
+struct iam_lfix_root {
+        __le64  ilr_magic;
+        __le16  ilr_keysize;
+        __le16  ilr_recsize;
+        __le16  ilr_ptrsize;
+        u8      ilr_indirect_levels;
+        u8      ilr_padding;
+};
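+
+/*
+ * On-disk layout of an lfix root block, as assembled by lfix_root()
+ * below: the iam_lfix_root header, then a dx_countlimit occupying the
+ * first (key, ptr) slot, then the remaining (key, ptr) index entries.
+ */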
+
+static __u32 iam_lfix_root_ptr(struct iam_container *c)
+{
+        return 0;
+}
+
+static int iam_lfix_node_init(struct iam_container *c, struct buffer_head *bh,
+                              int root)
+{
+        return 0;
+}
+
+static struct iam_entry *iam_lfix_root_inc(struct iam_container *c,
+                                           struct iam_path *path,
+                                           struct iam_frame *frame)
+{
+        struct iam_lfix_root *root;
+        struct iam_entry     *entries;
+
+        entries = frame->entries;
+
+        dx_set_count(entries, 2);
+        assert_corr(dx_get_limit(entries) == dx_root_limit(path));
+
+        root = (void *)frame->bh->b_data;
+        assert_corr(le64_to_cpu(root->ilr_magic) == IAM_LFIX_ROOT_MAGIC);
+        root->ilr_indirect_levels++;
+        frame->at = entries = iam_entry_shift(path, entries, 1);
+        memset(iam_ikey_at(path, entries), 0,
+               iam_path_descr(path)->id_ikey_size);
+        return entries;
+}
+
+static int iam_lfix_node_check(struct iam_path *path, struct iam_frame *frame)
+{
+        unsigned count;
+        unsigned limit;
+        unsigned limit_correct;
+        struct iam_entry *entries;
+
+        entries = dx_node_get_entries(path, frame);
+
+        if (frame == path->ip_frames) {
+                struct iam_lfix_root *root;
+
+                root = (void *)frame->bh->b_data;
+                if (le64_to_cpu(root->ilr_magic) != IAM_LFIX_ROOT_MAGIC) {
+                        return -EIO;
+                }
+                limit_correct = dx_root_limit(path);
+        } else
+                limit_correct = dx_node_limit(path);
+        count = dx_get_count(entries);
+        limit = dx_get_limit(entries);
+        if (count > limit) {
+                return -EIO;
+        }
+        if (limit != limit_correct) {
+                return -EIO;
+        }
+        return 0;
+}
+
+static int iam_lfix_node_load(struct iam_path *path, struct iam_frame *frame)
+{
+        struct iam_entry *entries;
+        void *data;
+        entries = dx_node_get_entries(path, frame);
+
+        data = frame->bh->b_data;
+
+        if (frame == path->ip_frames) {
+                struct iam_lfix_root *root;
+
+                root = data;
+                path->ip_indirect = root->ilr_indirect_levels;
+                if (path->ip_ikey_target == NULL)
+                        path->ip_ikey_target =
+                                (struct iam_ikey *)path->ip_key_target;
+        }
+        frame->entries = frame->at = entries;
+        return 0;
+}
+
+static int iam_lfix_ikeycmp(const struct iam_container *c,
+                            const struct iam_ikey *k1,
+                            const struct iam_ikey *k2)
+{
+        return memcmp(k1, k2, c->ic_descr->id_ikey_size);
+}
+
+static struct iam_path_descr *iam_lfix_ipd_alloc(const struct iam_container *c,
+                                                 void *area)
+{
+        return iam_ipd_alloc(area, c->ic_descr->id_ikey_size);
+}
+
+static struct iam_operations iam_lfix_ops = {
+        .id_root_ptr    = iam_lfix_root_ptr,
+        .id_node_read   = iam_node_read,
+        .id_node_init   = iam_lfix_node_init,
+        .id_node_check  = iam_lfix_node_check,
+        .id_node_load   = iam_lfix_node_load,
+        .id_ikeycmp     = iam_lfix_ikeycmp,
+        .id_root_inc    = iam_lfix_root_inc,
+        .id_ipd_alloc   = iam_lfix_ipd_alloc,
+        .id_ipd_free    = iam_ipd_free,
+        .id_name        = "lfix"
+};
+
+static int iam_lfix_guess(struct iam_container *c)
+{
+        int result;
+        struct buffer_head *bh;
+        const struct iam_lfix_root *root;
+
+        assert_corr(c->ic_object != NULL);
+
+        result = iam_node_read(c, iam_lfix_root_ptr(c), NULL, &bh);
+        if (result == 0) {
+                root = (void *)bh->b_data;
+                if (le64_to_cpu(root->ilr_magic) == IAM_LFIX_ROOT_MAGIC) {
+                        struct iam_descr *descr;
+
+                        descr = c->ic_descr;
+                        descr->id_key_size  = le16_to_cpu(root->ilr_keysize);
+                        descr->id_ikey_size = le16_to_cpu(root->ilr_keysize);
+                        descr->id_rec_size  = le16_to_cpu(root->ilr_recsize);
+                        descr->id_ptr_size  = le16_to_cpu(root->ilr_ptrsize);
+                        descr->id_root_gap  = sizeof(struct iam_lfix_root);
+                        descr->id_node_gap  = 0;
+                        descr->id_ops       = &iam_lfix_ops;
+                        descr->id_leaf_ops  = &iam_lfix_leaf_ops;
+                } else
+                        result = -EBADF;
+                brelse(bh);
+        }
+        return result;
+}
+
+static struct iam_format iam_lfix_format = {
+        .if_guess = iam_lfix_guess
+};
+
+void iam_lfix_format_init(void)
+{
+        iam_format_register(&iam_lfix_format);
+}
+
+/*
+ * Debugging aid.
+ */
+
+#define KEYSIZE (8)
+#define RECSIZE (8)
+#define PTRSIZE (4)
+
+#define LFIX_ROOT_RECNO \
+        ((4096 - sizeof(struct iam_lfix_root)) / (KEYSIZE + PTRSIZE))
+
+#define LFIX_INDEX_RECNO (4096 / (KEYSIZE + PTRSIZE))
+
+#define LFIX_LEAF_RECNO \
+        ((4096 - sizeof(struct iam_leaf_head)) / (KEYSIZE + RECSIZE))
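+
+/*
+ * With these sizes and a 4096-byte block: LFIX_ROOT_RECNO is
+ * (4096 - 16) / 12 = 340, LFIX_INDEX_RECNO is 4096 / 12 = 341, and
+ * LFIX_LEAF_RECNO is (4096 - 4) / 16 = 255 (struct iam_lfix_root is
+ * 16 bytes, struct iam_leaf_head 4 bytes).
+ */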
+
+struct lfix_root {
+        struct iam_lfix_root lr_root;
+        struct {
+                char key[KEYSIZE];
+                char ptr[PTRSIZE];
+        } lr_entry[LFIX_ROOT_RECNO];
+};
+
+struct lfix_index {
+        struct dx_countlimit li_cl;
+        char   li_padding[KEYSIZE + PTRSIZE - sizeof(struct dx_countlimit)];
+        struct {
+                char key[KEYSIZE];
+                char ptr[PTRSIZE];
+        } li_entry[LFIX_INDEX_RECNO - 1];
+};
+
+struct lfix_leaf {
+        struct iam_leaf_head ll_head;
+        struct {
+                char key[KEYSIZE];
+                char rec[RECSIZE];
+        } ll_entry[LFIX_LEAF_RECNO];
+};
+
+#define STORE_UNALIGNED(val, dst)                       \
+({                                                      \
+        typeof(val) __val = (val);                      \
+        CLASSERT(sizeof(val) == sizeof(*(dst)));        \
+        memcpy(dst, &__val, sizeof(*(dst)));            \
+})
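+
+/*
+ * STORE_UNALIGNED() copies via memcpy() because index entries are
+ * densely packed (key immediately followed by ptr), so the destination
+ * is not necessarily naturally aligned for a direct store.
+ */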
+
+#include <linux/jbd.h>
+#include <linux/ldiskfs_fs.h>
+#include <linux/ldiskfs_jbd.h>
+
+static void lfix_root(void *buf,
+                      int blocksize, int keysize, int ptrsize, int recsize)
+{
+        struct iam_lfix_root *root;
+        struct dx_countlimit *limit;
+        void                 *entry;
+
+        root = buf;
+        *root = (typeof(*root)) {
+                .ilr_magic           = cpu_to_le64(IAM_LFIX_ROOT_MAGIC),
+                .ilr_keysize         = cpu_to_le16(keysize),
+                .ilr_recsize         = cpu_to_le16(recsize),
+                .ilr_ptrsize         = cpu_to_le16(ptrsize),
+                .ilr_indirect_levels = 0
+        };
+
+        limit = (void *)(root + 1);
+        *limit = (typeof(*limit)){
+                /*
+                 * limit itself + one pointer to the leaf.
+                 */
+                .count = cpu_to_le16(2),
+                .limit = iam_root_limit(sizeof(struct iam_lfix_root),
+                                        blocksize, keysize + ptrsize)
+        };
+
+        entry = root + 1;
+        /*
+         * Skip over @limit.
+         */
+        entry += keysize + ptrsize;
+
+        /*
+         * Entry format is <key> followed by <ptr>. In the minimal tree
+         * consisting of a root and a single node, <key> is the minimal
+         * possible key.
+         *
+         * XXX: this key is hard-coded to be a sequence of 0's.
+         */
+
+        entry += keysize;
+        /* now @entry points to <ptr> */
+        if (ptrsize == 4)
+                STORE_UNALIGNED(cpu_to_le32(1), (u_int32_t *)entry);
+        else
+                STORE_UNALIGNED(cpu_to_le64(1), (u_int64_t *)entry);
+}
+
+static void lfix_leaf(void *buf,
+                      int blocksize, int keysize, int ptrsize, int recsize)
+{
+        struct iam_leaf_head *head;
+
+        /* form leaf */
+        head = buf;
+        *head = (struct iam_leaf_head) {
+                .ill_magic = cpu_to_le16(IAM_LEAF_HEADER_MAGIC),
+                /*
+                 * Leaf contains an entry with the smallest possible key
+                 * (created by zeroing).
+                 */
+                .ill_count = cpu_to_le16(1),
+        };
+}
+
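+/*
+ * Create the minimal lfix tree: block 0 is the root, whose single index
+ * entry (the all-zeroes minimal key) points to block 1, a leaf
+ * pre-seeded with one zeroed entry; see lfix_root() and lfix_leaf()
+ * above.
+ */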
+int iam_lfix_create(struct inode *obj,
+                    int keysize, int ptrsize, int recsize, handle_t *handle)
+{
+        struct buffer_head *root_node;
+        struct buffer_head *leaf_node;
+        struct super_block *sb;
+
+        u32 blknr;
+        int result;
+        unsigned long bsize;
+
+        assert_corr(obj->i_size == 0);
+
+        sb = obj->i_sb;
+        bsize = sb->s_blocksize;
+        root_node = ldiskfs_append(handle, obj, &blknr, &result);
+        leaf_node = ldiskfs_append(handle, obj, &blknr, &result);
+        if (root_node != NULL && leaf_node != NULL) {
+                lfix_root(root_node->b_data, bsize, keysize, ptrsize, recsize);
+                lfix_leaf(leaf_node->b_data, bsize, keysize, ptrsize, recsize);
+                ldiskfs_mark_inode_dirty(handle, obj);
+                result = ldiskfs_journal_dirty_metadata(handle, root_node);
+                if (result == 0)
+                        result = ldiskfs_journal_dirty_metadata(handle, leaf_node);
+                if (result != 0)
+                        ldiskfs_std_error(sb, result);
+        }
+        brelse(leaf_node);
+        brelse(root_node);
+        return result;
+}
+EXPORT_SYMBOL(iam_lfix_create);
diff --git a/lustre/osd/osd_iam_lvar.c b/lustre/osd/osd_iam_lvar.c
new file mode 100644 (file)
index 0000000..49a77d7
--- /dev/null
@@ -0,0 +1,1072 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see [sun.com URL with a
+ * copy of GPLv2].
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * iam_lvar.c
+ *
+ * Implementation of the iam format for fixed-size records and variable-size keys.
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#include <linux/types.h>
+#include <linux/jbd.h>
+/* ldiskfs_error() */
+#include <linux/ldiskfs_fs.h>
+#include "osd_internal.h"
+
+/*
+ * Leaf operations.
+ */
+
+enum {
+        IAM_LVAR_LEAF_MAGIC = 0x1973 /* This is duplicated in
+                                      * lustre/utils/create_iam.c */
+};
+
+/* This is duplicated in lustre/utils/create_iam.c */
+struct lvar_leaf_header {
+        __le16 vlh_magic; /* magic number IAM_LVAR_LEAF_MAGIC */
+        __le16 vlh_used;  /* used bytes, including header */
+};
+
+/*
+ * Format of leaf entry:
+ *
+ * __le16 keysize
+ *     u8 key[keysize]
+ *     u8 record[rec_size]
+ *
+ * Entries are ordered in key order.
+ */
+
+/* This is duplicated in lustre/utils/create_iam.c */
+typedef __u32 lvar_hash_t;
+
+/* This is duplicated in lustre/utils/create_iam.c */
+struct lvar_leaf_entry {
+        __le32 vle_hash;
+        __le16 vle_keysize;
+        u8     vle_key[0];
+};
+
+#define PDIFF(ptr0, ptr1) (((char *)(ptr0)) - ((char *)(ptr1)))
+
+
+static inline int blocksize(const struct iam_leaf *leaf)
+{
+        return iam_leaf_container(leaf)->ic_object->i_sb->s_blocksize;
+}
+
+static inline const char *kchar(const struct iam_key *key)
+{
+        return (void *)key;
+}
+
+static inline struct iam_lentry *lvar_lentry(const struct lvar_leaf_entry *ent)
+{
+        return (struct iam_lentry *)ent;
+}
+
+static inline struct lvar_leaf_entry *lentry_lvar(const struct iam_lentry *lent)
+{
+        return (struct lvar_leaf_entry *)lent;
+}
+
+
+static inline int e_keysize(const struct lvar_leaf_entry *ent)
+{
+        return le16_to_cpu(ent->vle_keysize);
+}
+
+/* This is duplicated in lustre/utils/create_iam.c */
+enum {
+        LVAR_PAD   = 4,
+        LVAR_ROUND = LVAR_PAD - 1
+};
+
+static inline int getsize(const struct iam_leaf *leaf, int namelen, int recsize)
+{
+        CLASSERT(!(LVAR_PAD & (LVAR_PAD - 1)));
+
+        return (offsetof(struct lvar_leaf_entry, vle_key) +
+                namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND;
+}
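+
+/*
+ * E.g. a 3-byte name with an 8-byte record takes
+ * (6 + 3 + 8 + 3) & ~3 = 20 bytes: a 6-byte lvar_leaf_entry header,
+ * name and record, rounded up to the LVAR_PAD boundary.
+ */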
+
+static inline int rec_size(const struct iam_rec *rec)
+{
+        return *(const char *)rec;
+}
+
+static inline struct iam_rec *e_rec(const struct lvar_leaf_entry *ent)
+{
+        return ((void *)ent) +
+                offsetof(struct lvar_leaf_entry, vle_key) + e_keysize(ent);
+}
+
+static inline int e_size(const struct iam_leaf *leaf,
+                         const struct lvar_leaf_entry *ent)
+{
+        return getsize(leaf, e_keysize(ent), rec_size(e_rec(ent)));
+}
+
+static inline char *e_char(const struct lvar_leaf_entry *ent)
+{
+        return (char *)&ent->vle_key;
+}
+
+static inline struct iam_key *e_key(const struct lvar_leaf_entry *ent)
+{
+        return (struct iam_key *)e_char(ent);
+}
+
+static inline lvar_hash_t e_hash(const struct lvar_leaf_entry *ent)
+{
+        return le32_to_cpu(ent->vle_hash);
+}
+
+static void e_print(const struct lvar_leaf_entry *ent)
+{
+        printk("        %p %8.8x \"%*.*s\"\n", ent, e_hash(ent),
+               e_keysize(ent), e_keysize(ent), e_char(ent));
+}
+#if 0
+static int e_check(const struct iam_leaf *leaf,
+                   const struct lvar_leaf_entry *ent)
+{
+        const void *point = ent;
+        const void *start = leaf->il_bh->b_data;
+        return
+                start + sizeof(struct lvar_leaf_header) <= point &&
+                point + e_size(leaf, ent) < start + blocksize(leaf);
+}
+#endif
+
+static inline struct lvar_leaf_entry *e_next(const struct iam_leaf *leaf,
+                                             const struct lvar_leaf_entry *ent)
+{
+        return ((void *)ent) + e_size(leaf, ent);
+}
+
+#define LVAR_HASH_SANDWICH  (0)
+#define LVAR_HASH_TEA       (1)
+#define LVAR_HASH_R5        (0)
+#define LVAR_HASH_PREFIX    (0)
+
+static __u32 hash_build0(const char *name, int namelen)
+{
+        __u32 result;
+
+        if (namelen == 0)
+                return 0;
+        if (strncmp(name, ".", 1) == 0 && namelen == 1)
+                return 1;
+        if (strncmp(name, "..", 2) == 0 && namelen == 2)
+                return 2;
+
+        if (LVAR_HASH_PREFIX) {
+                result = 0;
+                strncpy((void *)&result,
+                        name, min(namelen, (int)sizeof result));
+        } else {
+                struct ldiskfs_dx_hash_info hinfo;
+
+                if (LVAR_HASH_TEA)
+                        hinfo.hash_version = LDISKFS_DX_HASH_TEA;
+                else
+                        hinfo.hash_version = LDISKFS_DX_HASH_R5;
+                hinfo.seed = 0;
+                ldiskfsfs_dirhash(name, namelen, &hinfo);
+                result = hinfo.hash;
+                if (LVAR_HASH_SANDWICH) {
+                        __u32 result2;
+
+                        hinfo.hash_version = LDISKFS_DX_HASH_TEA;
+                        hinfo.seed = 0;
+                        ldiskfsfs_dirhash(name, namelen, &hinfo);
+                        result2 = hinfo.hash;
+                        result = (0xfc000000 & result2) | (0x03ffffff & result);
+                }
+        }
+        return result;
+}
+
+enum {
+        HASH_GRAY_AREA = 1024,
+        HASH_MAX_SIZE  = 0x7fffffffUL
+};
+
+static __u32 hash_build(const char *name, int namelen)
+{
+        __u32 hash;
+
+        hash = (hash_build0(name, namelen) << 1) & HASH_MAX_SIZE;
+        if (hash > HASH_MAX_SIZE - HASH_GRAY_AREA)
+                hash &= HASH_GRAY_AREA - 1;
+        return hash;
+}
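+
+/*
+ * hash_build() leaves the least significant bit clear: it is reserved
+ * as a hash-collision marker (set in lvar_split(), masked off when
+ * reading delimiting keys in n_invariant()). Hashes falling into the
+ * topmost HASH_GRAY_AREA of the range are remapped downwards, keeping
+ * the top of the hash space free.
+ */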
+
+static inline lvar_hash_t get_hash(const struct iam_container *bag,
+                                   const char *name, int namelen)
+{
+        return hash_build(name, namelen);
+}
+
+static inline int e_eq(const struct lvar_leaf_entry *ent,
+                       const char *name, int namelen)
+{
+        return namelen == e_keysize(ent) && !memcmp(e_char(ent), name, namelen);
+}
+
+static inline int e_cmp(const struct iam_leaf *leaf,
+                        const struct lvar_leaf_entry *ent, lvar_hash_t hash)
+{
+        lvar_hash_t ehash;
+
+        ehash = e_hash(ent);
+        return ehash == hash ? 0 : (ehash < hash ? -1 : +1);
+}
+
+static struct lvar_leaf_header *n_head(const struct iam_leaf *l)
+{
+        return (struct lvar_leaf_header *)l->il_bh->b_data;
+}
+
+static int h_used(const struct lvar_leaf_header *hdr)
+{
+        return le16_to_cpu(hdr->vlh_used);
+}
+
+static void h_used_adj(const struct iam_leaf *leaf,
+                       struct lvar_leaf_header *hdr, int adj)
+{
+        int used;
+
+        used = h_used(hdr) + adj;
+        assert_corr(sizeof *hdr <= used && used <= blocksize(leaf));
+        hdr->vlh_used = cpu_to_le16(used);
+}
+
+static struct lvar_leaf_entry *n_start(const struct iam_leaf *leaf)
+{
+        return (void *)leaf->il_bh->b_data + sizeof(struct lvar_leaf_header);
+}
+
+static struct lvar_leaf_entry *n_end(const struct iam_leaf *l)
+{
+        return (void *)l->il_bh->b_data + h_used(n_head(l));
+}
+
+static struct lvar_leaf_entry *n_cur(const struct iam_leaf *l)
+{
+        return lentry_lvar(l->il_at);
+}
+
+void n_print(const struct iam_leaf *l)
+{
+        struct lvar_leaf_entry *scan;
+
+        printk(KERN_EMERG "used: %d\n", h_used(n_head(l)));
+        for (scan = n_start(l); scan < n_end(l); scan = e_next(l, scan))
+                e_print(scan);
+}
+
+#if LDISKFS_CORRECTNESS_ON
+static int n_at_rec(const struct iam_leaf *folio)
+{
+        return
+                n_start(folio) <= lentry_lvar(folio->il_at) &&
+                lentry_lvar(folio->il_at) < n_end(folio);
+}
+
+#if LDISKFS_INVARIANT_ON
+static int n_invariant(const struct iam_leaf *leaf)
+{
+        struct iam_path        *path;
+        struct lvar_leaf_entry *scan;
+        struct lvar_leaf_entry *end;
+        lvar_hash_t             hash;
+        lvar_hash_t             nexthash;
+        lvar_hash_t             starthash;
+
+        end  = n_end(leaf);
+        hash = 0;
+        path = leaf->il_path;
+
+        if (h_used(n_head(leaf)) > blocksize(leaf))
+                return 0;
+
+        /*
+         * Delimiting key in the parent index node. Clear least bit to account
+         * for hash collision marker.
+         */
+        starthash = *(lvar_hash_t *)iam_ikey_at(path, path->ip_frame->at) & ~1;
+        for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
+                nexthash = e_hash(scan);
+                if (nexthash != get_hash(iam_leaf_container(leaf),
+                                         e_char(scan), e_keysize(scan))) {
+                        BREAKPOINT();
+                        return 0;
+                }
+                if (0 && nexthash < starthash) {
+                        /*
+                         * Unfortunately this useful invariant cannot be
+                         * reliably checked as the parent node is not necessarily
+                         * locked.
+                         */
+                        n_print(leaf);
+                        printk("%#x < %#x\n", nexthash, starthash);
+                        dump_stack();
+                        return 0;
+                }
+                if (nexthash < hash) {
+                        BREAKPOINT();
+                        return 0;
+                }
+                hash = nexthash;
+        }
+        if (scan != end) {
+                BREAKPOINT();
+                return 0;
+        }
+        return 1;
+}
+/* LDISKFS_INVARIANT_ON */
+#endif
+
+/* LDISKFS_CORRECTNESS_ON */
+#endif
+
+static struct iam_ikey *lvar_ikey(const struct iam_leaf *l,
+                                  struct iam_ikey *key)
+{
+        lvar_hash_t *hash;
+
+        assert_corr(n_at_rec(l));
+
+        hash = (void *)key;
+        *hash = e_hash(n_cur(l));
+        return key;
+}
+
+static struct iam_key *lvar_key(const struct iam_leaf *l)
+{
+        return e_key(n_cur(l));
+}
+
+static int lvar_key_size(const struct iam_leaf *l)
+{
+        return e_keysize(n_cur(l));
+}
+
+static void lvar_start(struct iam_leaf *l)
+{
+        l->il_at = lvar_lentry(n_start(l));
+}
+
+static int lvar_init(struct iam_leaf *l)
+{
+        int result;
+        int used;
+        struct lvar_leaf_header *head;
+
+        assert_corr(l->il_bh != NULL);
+
+        head = n_head(l);
+        used = h_used(head);
+        if (head->vlh_magic == le16_to_cpu(IAM_LVAR_LEAF_MAGIC) &&
+            used <= blocksize(l)) {
+                l->il_at = l->il_entries = lvar_lentry(n_start(l));
+                result = 0;
+        } else {
+                struct inode *obj;
+
+                obj = iam_leaf_container(l)->ic_object;
+                CERROR("Wrong magic in node %llu (#%lu): %#x != %#x or "
+                       "wrong used: %i\n",
+                       (unsigned long long)l->il_bh->b_blocknr, obj->i_ino,
+                       head->vlh_magic, le16_to_cpu(IAM_LVAR_LEAF_MAGIC),
+                       used);
+                result = -EIO;
+        }
+        return result;
+}
+
+static void lvar_fini(struct iam_leaf *l)
+{
+        l->il_entries = l->il_at = NULL;
+}
+
+struct iam_rec *lvar_rec(const struct iam_leaf *l)
+{
+        assert_corr(n_at_rec(l));
+        return e_rec(n_cur(l));
+}
+
+static void lvar_next(struct iam_leaf *l)
+{
+        assert_corr(n_at_rec(l));
+        assert_corr(iam_leaf_is_locked(l));
+        l->il_at = lvar_lentry(e_next(l, n_cur(l)));
+}
+
+static int lvar_lookup(struct iam_leaf *leaf, const struct iam_key *k)
+{
+        struct lvar_leaf_entry *found;
+        struct lvar_leaf_entry *scan;
+        struct lvar_leaf_entry *end;
+        int                     result;
+        const char             *name;
+        int                     namelen;
+        int                     found_equal;
+        lvar_hash_t             hash;
+        int                     last;
+
+        assert_inv(n_invariant(leaf));
+        end = n_end(leaf);
+
+        name = kchar(k);
+        namelen = strlen(name);
+        hash = get_hash(iam_leaf_container(leaf), name, namelen);
+        found = NULL;
+        found_equal = 0;
+        last = 1;
+
+        for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
+                lvar_hash_t scan_hash;
+
+                scan_hash = e_hash(scan);
+                if (scan_hash < hash)
+                        found = scan;
+                else if (scan_hash == hash) {
+                        if (e_eq(scan, name, namelen)) {
+                                /*
+                                 * perfect match
+                                 */
+                                leaf->il_at = lvar_lentry(scan);
+                                return IAM_LOOKUP_EXACT;
+                        } else if (!found_equal) {
+                                found = scan;
+                                found_equal = 1;
+                        }
+                } else {
+                        last = 0;
+                        break;
+                }
+        }
+        if (found == NULL) {
+                /*
+                 * @k is less than all hashes in the leaf.
+                 */
+                lvar_start(leaf);
+                result = IAM_LOOKUP_BEFORE;
+        } else {
+                leaf->il_at = lvar_lentry(found);
+                result = IAM_LOOKUP_OK;
+                assert_corr(n_at_rec(leaf));
+        }
+        if (last)
+                result |= IAM_LOOKUP_LAST;
+        assert_inv(n_invariant(leaf));
+
+        return result;
+}
+
+static int lvar_ilookup(struct iam_leaf *leaf, const struct iam_ikey *ik)
+{
+        struct lvar_leaf_entry *scan;
+        struct lvar_leaf_entry *end;
+        lvar_hash_t             hash;
+
+        assert_inv(n_invariant(leaf));
+        end  = n_end(leaf);
+        hash = *(const lvar_hash_t *)ik;
+
+        lvar_start(leaf);
+        for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
+                lvar_hash_t scan_hash;
+
+                scan_hash = e_hash(scan);
+                if (scan_hash > hash)
+                        return scan == n_start(leaf) ?
+                                IAM_LOOKUP_BEFORE : IAM_LOOKUP_OK;
+                leaf->il_at = lvar_lentry(scan);
+                if (scan_hash == hash)
+                        return IAM_LOOKUP_EXACT;
+        }
+        assert_inv(n_invariant(leaf));
+        /*
+         * @ik is greater than any key in the node. Return last record in the
+         * node.
+         */
+        return IAM_LOOKUP_OK;
+}
+
+static void __lvar_key_set(struct iam_leaf *l, const struct iam_key *k)
+{
+        memcpy(e_key(n_cur(l)), k, e_keysize(n_cur(l)));
+}
+
+static void lvar_key_set(struct iam_leaf *l, const struct iam_key *k)
+{
+        assert_corr(n_at_rec(l));
+        assert_corr(strlen(kchar(k)) == e_keysize(n_cur(l)));
+        assert_corr(iam_leaf_is_locked(l));
+        __lvar_key_set(l, k);
+        assert_inv(n_invariant(l));
+}
+
+static int lvar_key_cmp(const struct iam_leaf *l, const struct iam_key *k)
+{
+        lvar_hash_t hash;
+        const char *name;
+
+        name = kchar(k);
+
+        hash = get_hash(iam_leaf_container(l), name, strlen(name));
+        return e_cmp(l, n_cur(l), hash);
+}
+
+static int lvar_key_eq(const struct iam_leaf *l, const struct iam_key *k)
+{
+        const char *name;
+
+        name = kchar(k);
+        return e_eq(n_cur(l), name, strlen(name));
+}
+
+static void __lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r)
+{
+        memcpy(e_rec(n_cur(l)), r, rec_size(r));
+}
+
+static void lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r)
+{
+        assert_corr(n_at_rec(l));
+        assert_corr(iam_leaf_is_locked(l));
+        __lvar_rec_set(l, r);
+        assert_inv(n_invariant(l));
+}
+
+static void lvar_rec_get(const struct iam_leaf *l, struct iam_rec *r)
+{
+        struct iam_rec *rec;
+
+        rec = e_rec(n_cur(l));
+        assert_corr(n_at_rec(l));
+        assert_corr(iam_leaf_is_locked(l));
+        memcpy(r, rec, rec_size(rec));
+        assert_inv(n_invariant(l));
+}
+
+static int lvar_can_add(const struct iam_leaf *l,
+                        const struct iam_key *k, const struct iam_rec *r)
+{
+        assert_corr(iam_leaf_is_locked(l));
+        return
+                h_used(n_head(l)) +
+                getsize(l, strlen(kchar(k)), rec_size(r)) <= blocksize(l);
+}
+
+static int lvar_at_end(const struct iam_leaf *folio)
+{
+        assert_corr(iam_leaf_is_locked(folio));
+        return n_cur(folio) == n_end(folio);
+}
+
+static void lvar_rec_add(struct iam_leaf *leaf,
+                         const struct iam_key *k, const struct iam_rec *r)
+{
+        const char *key;
+        int   ksize;
+        int   shift;
+        void *end;
+        void *start;
+        ptrdiff_t diff;
+
+        assert_corr(lvar_can_add(leaf, k, r));
+        assert_inv(n_invariant(leaf));
+        assert_corr(iam_leaf_is_locked(leaf));
+
+        key   = kchar(k);
+        ksize = strlen(key);
+        shift = getsize(leaf, ksize, rec_size(r));
+
+        if (!lvar_at_end(leaf)) {
+                assert_corr(n_cur(leaf) < n_end(leaf));
+                end = n_end(leaf);
+                if (lvar_key_cmp(leaf, k) <= 0)
+                        lvar_next(leaf);
+                else
+                        /*
+                         * Another exceptional case: insertion with a key
+                         * less than the least key in the leaf.
+                         */
+                        assert_corr(leaf->il_at == leaf->il_entries);
+
+                start = leaf->il_at;
+                diff  = PDIFF(end, start);
+                assert_corr(diff >= 0);
+                memmove(start + shift, start, diff);
+        }
+        h_used_adj(leaf, n_head(leaf), shift);
+        n_cur(leaf)->vle_keysize = cpu_to_le16(ksize);
+        n_cur(leaf)->vle_hash = cpu_to_le32(get_hash(iam_leaf_container(leaf),
+                                                     key, ksize));
+        __lvar_key_set(leaf, k);
+        __lvar_rec_set(leaf, r);
+        assert_corr(n_at_rec(leaf));
+        assert_inv(n_invariant(leaf));
+}
+
+static void lvar_rec_del(struct iam_leaf *leaf, int shift)
+{
+        void *next;
+        void *end;
+        int nob;
+
+        assert_corr(n_at_rec(leaf));
+        assert_inv(n_invariant(leaf));
+        assert_corr(iam_leaf_is_locked(leaf));
+
+        end  = n_end(leaf);
+        next = e_next(leaf, n_cur(leaf));
+        nob  = e_size(leaf, n_cur(leaf));
+        memmove(leaf->il_at, next, end - next);
+        h_used_adj(leaf, n_head(leaf), -nob);
+        assert_inv(n_invariant(leaf));
+}
+
+static void lvar_init_new(struct iam_container *c, struct buffer_head *bh)
+{
+        struct lvar_leaf_header *hdr;
+
+        hdr = (struct lvar_leaf_header *)bh->b_data;
+        hdr->vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC);
+        hdr->vlh_used  = cpu_to_le16(sizeof *hdr);
+}
+
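+/*
+ * Return the first entry past the half-block mark, i.e. the first
+ * entry that a split moves to the new leaf; *prev is set to the last
+ * entry that stays behind.
+ */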
+static struct lvar_leaf_entry *find_pivot(const struct iam_leaf *leaf,
+                                          struct lvar_leaf_entry **prev)
+{
+        void *scan;
+        void *start;
+        int threshold;
+
+        *prev = NULL;
+        threshold = blocksize(leaf) / 2;
+        for (scan = start = n_start(leaf); scan - start <= threshold;
+             *prev = scan, scan = e_next(leaf, scan)) {
+                ;
+        }
+        return scan;
+}
+
+static void lvar_split(struct iam_leaf *leaf, struct buffer_head **bh,
+                       iam_ptr_t new_blknr)
+{
+        struct lvar_leaf_entry  *first_to_move;
+        struct lvar_leaf_entry  *last_to_stay;
+        struct iam_path         *path;
+        struct lvar_leaf_header *hdr;
+        struct buffer_head      *new_leaf;
+
+        ptrdiff_t   tomove;
+        lvar_hash_t hash;
+
+        assert_inv(n_invariant(leaf));
+        assert_corr(iam_leaf_is_locked(leaf));
+
+        new_leaf = *bh;
+        path = iam_leaf_path(leaf);
+
+        hdr = (void *)new_leaf->b_data;
+
+        first_to_move = find_pivot(leaf, &last_to_stay);
+        assert_corr(last_to_stay != NULL);
+        assert_corr(e_next(leaf, last_to_stay) == first_to_move);
+
+        hash = e_hash(first_to_move);
+        if (hash == e_hash(last_to_stay))
+                /*
+                 * Duplicate hash.
+                 */
+                hash |= 1;
+
+        tomove = PDIFF(n_end(leaf), first_to_move);
+        memmove(hdr + 1, first_to_move, tomove);
+
+        h_used_adj(leaf, hdr, tomove);
+        h_used_adj(leaf, n_head(leaf), -tomove);
+
+        assert_corr(n_end(leaf) == first_to_move);
+
+        if (n_cur(leaf) >= first_to_move) {
+                /*
+                 * insertion point moves into new leaf.
+                 */
+                ptrdiff_t shift;
+                int result;
+
+                shift = PDIFF(leaf->il_at, first_to_move);
+                *bh = leaf->il_bh;
+                leaf->il_bh = new_leaf;
+                leaf->il_curidx = new_blknr;
+
+                assert_corr(iam_leaf_is_locked(leaf));
+                result = lvar_init(leaf);
+                /*
+                 * init cannot fail, as node was just initialized.
+                 */
+                assert_corr(result == 0);
+                leaf->il_at = ((void *)leaf->il_at) + shift;
+        }
+        /*
+         * Insert pointer to the new node (together with the least key in
+         * the node) into index node.
+         */
+        iam_insert_key_lock(path, path->ip_frame, (struct iam_ikey *)&hash,
+                            new_blknr);
+        assert_corr(n_cur(leaf) < n_end(leaf));
+        assert_inv(n_invariant(leaf));
+}
+
+static struct iam_leaf_operations lvar_leaf_ops = {
+        .init           = lvar_init,
+        .init_new       = lvar_init_new,
+        .fini           = lvar_fini,
+        .start          = lvar_start,
+        .next           = lvar_next,
+        .key            = lvar_key,
+        .ikey           = lvar_ikey,
+        .rec            = lvar_rec,
+        .key_set        = lvar_key_set,
+        .key_cmp        = lvar_key_cmp,
+        .key_eq         = lvar_key_eq,
+        .key_size       = lvar_key_size,
+        .rec_set        = lvar_rec_set,
+        .rec_get        = lvar_rec_get,
+        .lookup         = lvar_lookup,
+        .ilookup        = lvar_ilookup,
+        .at_end         = lvar_at_end,
+        .rec_add        = lvar_rec_add,
+        .rec_del        = lvar_rec_del,
+        .can_add        = lvar_can_add,
+        .split          = lvar_split
+};
+
+/*
+ * Index operations.
+ */
+
+enum {
+        /* This is duplicated in lustre/utils/create_iam.c */
+        /* egrep -i '^o?x?[olabcdef]*$' /usr/share/dict/words */
+        IAM_LVAR_ROOT_MAGIC = 0xb01dface
+};
+
+/* This is duplicated in lustre/utils/create_iam.c */
+struct lvar_root {
+        __le32 vr_magic;
+        __le16 vr_recsize;
+        __le16 vr_ptrsize;
+        u8     vr_indirect_levels;
+        u8     vr_padding0;
+        __le16 vr_padding1;
+};
+
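+/*
+ * The root of an lvar tree always lives in block 0 of the container
+ * object.
+ */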
+static __u32 lvar_root_ptr(struct iam_container *c)
+{
+        return 0;
+}
+
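+/*
+ * Nothing to do: lvar blocks are fully formatted elsewhere (see
+ * lvar_root(), lvar_leaf() and lvar_init_new()), so this hook is a no-op.
+ */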
+static int lvar_node_init(struct iam_container *c, struct buffer_head *bh,
+                          int root)
+{
+        return 0;
+}
+
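+/*
+ * Grow the tree by one level. By the time this is called the old root
+ * entries are assumed to have been copied into a freshly allocated node;
+ * the root is reset to the count/limit pair plus a single pointer with a
+ * minimal ikey, and the indirect-level counter is bumped.
+ */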
+static struct iam_entry *lvar_root_inc(struct iam_container *c,
+                                       struct iam_path *path,
+                                       struct iam_frame *frame)
+{
+        struct lvar_root *root;
+        struct iam_entry *entries;
+
+        assert_corr(iam_frame_is_locked(path, frame));
+        entries = frame->entries;
+
+        dx_set_count(entries, 2);
+        assert_corr(dx_get_limit(entries) == dx_root_limit(path));
+
+        root = (void *)frame->bh->b_data;
+        assert_corr(le32_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC);
+        root->vr_indirect_levels++;
+        frame->at = entries = iam_entry_shift(path, entries, 1);
+        memset(iam_ikey_at(path, entries), 0,
+               iam_path_descr(path)->id_ikey_size);
+        return entries;
+}
+
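+/*
+ * Consistency checks for a node read from disk: the root must carry the
+ * lvar magic, and the stored count/limit pair must agree with the limit
+ * implied by the tree geometry.
+ */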
+static int lvar_node_check(struct iam_path *path, struct iam_frame *frame)
+{
+        unsigned count;
+        unsigned limit;
+        unsigned limit_correct;
+        struct iam_entry *entries;
+
+        entries = dx_node_get_entries(path, frame);
+
+        if (frame == path->ip_frames) {
+                struct lvar_root *root;
+
+                root = (void *)frame->bh->b_data;
+                if (le32_to_cpu(root->vr_magic) != IAM_LVAR_ROOT_MAGIC)
+                        return -EIO;
+                limit_correct = dx_root_limit(path);
+        } else
+                limit_correct = dx_node_limit(path);
+        count = dx_get_count(entries);
+        limit = dx_get_limit(entries);
+        if (count > limit)
+                return -EIO;
+        if (limit != limit_correct)
+                return -EIO;
+        return 0;
+}
+
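+/*
+ * Per-node setup during tree descent. For the root this also picks up the
+ * number of indirect levels and, on the first descent, computes the ikey
+ * (name hash) that the lookup will compare against.
+ */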
+static int lvar_node_load(struct iam_path *path, struct iam_frame *frame)
+{
+        struct iam_entry *entries;
+        void *data;
+
+        entries = dx_node_get_entries(path, frame);
+        data = frame->bh->b_data;
+
+        if (frame == path->ip_frames) {
+                struct lvar_root *root;
+
+                root = data;
+                path->ip_indirect = root->vr_indirect_levels;
+                if (path->ip_ikey_target == NULL) {
+                        const char *name;
+
+                        name = kchar(path->ip_key_target);
+                        path->ip_ikey_target = iam_path_ikey(path, 4);
+                        *(lvar_hash_t *)path->ip_ikey_target =
+                                get_hash(path->ip_container, name,
+                                         strlen(name));
+                }
+        }
+        frame->entries = frame->at = entries;
+        return 0;
+}
+
+static int lvar_ikeycmp(const struct iam_container *c,
+                        const struct iam_ikey *k1, const struct iam_ikey *k2)
+{
+        lvar_hash_t p1 = le32_to_cpu(*(lvar_hash_t *)k1);
+        lvar_hash_t p2 = le32_to_cpu(*(lvar_hash_t *)k2);
+
+        return p1 > p2 ? +1 : (p1 < p2 ? -1 : 0);
+}
+
+static struct iam_path_descr *lvar_ipd_alloc(const struct iam_container *c,
+                                             void *area)
+{
+        return iam_ipd_alloc(area, c->ic_descr->id_ikey_size);
+}
+
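+/*
+ * Format @buf as an lvar root block: the lvar_root header, a dx_countlimit
+ * occupying the first entry slot, and a single entry mapping the minimal
+ * hash 0 to leaf block 1.
+ */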
+static void lvar_root(void *buf,
+                      int blocksize, int keysize, int ptrsize, int recsize)
+{
+        struct lvar_root *root;
+        struct dx_countlimit *limit;
+        void                 *entry;
+        int isize;
+
+        isize = sizeof(lvar_hash_t) + ptrsize;
+        root = buf;
+        *root = (typeof(*root)) {
+                .vr_magic            = cpu_to_le32(IAM_LVAR_ROOT_MAGIC),
+                .vr_recsize          = cpu_to_le16(recsize),
+                .vr_ptrsize          = cpu_to_le16(ptrsize),
+                .vr_indirect_levels  = 0
+        };
+
+        limit = (void *)(root + 1);
+        *limit = (typeof(*limit)){
+                /*
+                 * limit itself + one pointer to the leaf. Stored
+                 * little-endian, like ->count.
+                 */
+                .count = cpu_to_le16(2),
+                .limit = cpu_to_le16(iam_root_limit(sizeof(struct lvar_root),
+                                                    blocksize, isize))
+        };
+
+        entry = root + 1;
+        /*
+         * Skip over the count/limit pair, which occupies the slot of the
+         * first entry (hence a full isize, not just sizeof *limit).
+         */
+        entry += isize;
+
+        /*
+         * Entry format is <key> followed by <ptr>. In the minimal tree
+         * consisting of a root and single node, <key> is a minimal possible
+         * key.
+         */
+        *(lvar_hash_t *)entry = 0;
+        entry += sizeof(lvar_hash_t);
+        /* now @entry points to <ptr> */
+        if (ptrsize == 4)
+                *(u32 *)entry = cpu_to_le32(1);
+        else
+                *(u64 *)entry = cpu_to_le64(1);
+}
+
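+/*
+ * On-disk size of an entry with a @namelen-byte name and a @recsize-byte
+ * record, rounded up to the entry alignment (LVAR_ROUND is taken to be the
+ * alignment mask, i.e. alignment - 1).
+ */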
+static int lvar_esize(int namelen, int recsize)
+{
+        return (offsetof(struct lvar_leaf_entry, vle_key) +
+                namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND;
+}
+
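+/*
+ * Format @buf as an lvar leaf: the leaf header followed by one sentinel
+ * entry with hash 0, an empty key, and a record cleared to zero except for
+ * its first byte, which stores the record size.
+ */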
+static void lvar_leaf(void *buf,
+                      int blocksize, int keysize, int ptrsize, int recsize)
+{
+        struct lvar_leaf_header *head;
+        struct lvar_leaf_entry  *entry;
+
+        /* form leaf */
+        head = buf;
+        *head = (typeof(*head)) {
+                .vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC),
+                .vlh_used  = cpu_to_le16(sizeof *head + lvar_esize(0, recsize))
+        };
+        entry = (void *)(head + 1);
+        *entry = (typeof(*entry)) {
+                .vle_hash    = 0,
+                .vle_keysize = 0
+        };
+        memset(e_rec(entry), 0, recsize);
+        *(char *)e_rec(entry) = recsize;
+}
+
+#include <linux/jbd.h>
+#include <linux/ldiskfs_fs.h>
+#include <linux/ldiskfs_jbd.h>
+
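+/*
+ * Create the minimal lvar tree on @obj: block 0 becomes the root with a
+ * single pointer, block 1 the only leaf. Both blocks are formatted in
+ * memory and dirtied through the journal @handle.
+ */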
+int iam_lvar_create(struct inode *obj,
+                    int keysize, int ptrsize, int recsize, handle_t *handle)
+{
+        struct buffer_head *root_node;
+        struct buffer_head *leaf_node;
+        struct super_block *sb;
+
+        u32 blknr;
+        int result;
+        unsigned long bsize;
+
+        assert_corr(obj->i_size == 0);
+
+        sb = obj->i_sb;
+        bsize = sb->s_blocksize;
+        root_node = ldiskfs_append(handle, obj, &blknr, &result);
+        /* do not clobber the root append error with a second allocation */
+        leaf_node = root_node != NULL ?
+                ldiskfs_append(handle, obj, &blknr, &result) : NULL;
+        if (root_node != NULL && leaf_node != NULL) {
+                lvar_root(root_node->b_data, bsize, keysize, ptrsize, recsize);
+                lvar_leaf(leaf_node->b_data, bsize, keysize, ptrsize, recsize);
+                ldiskfs_mark_inode_dirty(handle, obj);
+                result = ldiskfs_journal_dirty_metadata(handle, root_node);
+                if (result == 0)
+                        result = ldiskfs_journal_dirty_metadata(handle, leaf_node);
+                if (result != 0)
+                        ldiskfs_std_error(sb, result);
+        }
+        brelse(leaf_node);
+        brelse(root_node);
+        return result;
+}
+EXPORT_SYMBOL(iam_lvar_create);
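+
+/*
+ * Usage sketch (illustrative, not part of this patch): a caller that is
+ * already inside a journalled transaction can format an empty object as an
+ * lvar container with, e.g.,
+ *
+ *      rc = iam_lvar_create(inode, LDISKFS_NAME_LEN, 4, recsize, handle);
+ *
+ * where keysize bounds the name length, ptrsize (4 or 8) is the block
+ * pointer size, and recsize is the fixed record size the index stores.
+ */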
+
+static struct iam_operations lvar_ops = {
+        .id_root_ptr    = lvar_root_ptr,
+        .id_node_read   = iam_node_read,
+        .id_node_init   = lvar_node_init,
+        .id_node_check  = lvar_node_check,
+        .id_node_load   = lvar_node_load,
+        .id_ikeycmp     = lvar_ikeycmp,
+        .id_root_inc    = lvar_root_inc,
+        .id_ipd_alloc   = lvar_ipd_alloc,
+        .id_ipd_free    = iam_ipd_free,
+        .id_name        = "lvar"
+};
+
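+/*
+ * Format auto-detection hook: read block 0 of the container object and, if
+ * it carries the lvar root magic, fill the container descriptor in from
+ * the on-disk geometry.
+ */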
+static int lvar_guess(struct iam_container *c)
+{
+        int result;
+        struct buffer_head *bh;
+        const struct lvar_root *root;
+
+        assert_corr(c->ic_object != NULL);
+
+        result = iam_node_read(c, lvar_root_ptr(c), NULL, &bh);
+        if (result == 0) {
+                root = (void *)bh->b_data;
+                if (le32_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC) {
+                        struct iam_descr *descr;
+
+                        descr = c->ic_descr;
+                        descr->id_key_size  = LDISKFS_NAME_LEN;
+                        descr->id_ikey_size = sizeof (lvar_hash_t);
+                        descr->id_rec_size  = le16_to_cpu(root->vr_recsize);
+                        descr->id_ptr_size  = le16_to_cpu(root->vr_ptrsize);
+                        descr->id_root_gap  = sizeof *root;
+                        descr->id_node_gap  = 0;
+                        descr->id_ops       = &lvar_ops;
+                        descr->id_leaf_ops  = &lvar_leaf_ops;
+                } else
+                        result = -EBADF;
+                brelse(bh);
+        }
+        return result;
+}
+
+static struct iam_format lvar_format = {
+        .if_guess = lvar_guess
+};
+
+void iam_lvar_format_init(void)
+{
+        iam_format_register(&lvar_format);
+}
+
index 40f9ee4..14910aa 100644 (file)
@@ -53,7 +53,6 @@
 #include <linux/ldiskfs_fs.h>
 /* struct dentry */
 #include <linux/dcache.h>
-#include <linux/lustre_iam.h>
 /* struct dirent64 */
 #include <linux/dirent.h>
 
@@ -65,6 +64,7 @@
 
 #include <dt_object.h>
 #include "osd_oi.h"
+#include "osd_iam.h"
 
 struct inode;