#endif
#define DELTA 0x9E3779B9
+#define DX_HASH_R5 98
+#define DX_HASH_SAME 99
+
static void TEA_transform(__u32 buf[4], __u32 const in[])
{
MODULES := osd
-osd-objs := osd_handler.o osd_oi.o osd_igif.o osd_lproc.o
+osd-objs := osd_handler.o osd_oi.o osd_igif.o osd_lproc.o osd_iam.o \
+ osd_iam_lfix.o osd_iam_lvar.o
EXTRA_PRE_CFLAGS := -I@LINUX@/fs -I@LDISKFS_DIR@ -I@LDISKFS_DIR@/ldiskfs
modulefs_DATA = osd$(KMODEXT)
endif
-MOSTLYCLEANFILES := @MOSTLYCLEANFILES@
-DIST_SOURCES := $(osd-objs:%.o=%.c) osd_internal.h osd_oi.h osd_igif.h
+MOSTLYCLEANFILES := @MOSTLYCLEANFILES@
+DIST_SOURCES := $(osd-objs:%.o=%.c) osd_internal.h osd_oi.h osd_igif.h \
+ osd_iam.h
/* fid_is_local() */
#include <lustre_fid.h>
-#include <linux/lustre_iam.h>
#include "osd_internal.h"
#include "osd_igif.h"
return 0;
}
-extern struct inode *ldiskfs_create_inode(handle_t *handle,
- struct inode * dir, int mode);
-extern int ldiskfs_add_entry(handle_t *handle, struct dentry *dentry,
- struct inode *inode);
-extern int ldiskfs_delete_entry(handle_t *handle,
- struct inode * dir,
- struct ldiskfs_dir_entry_2 * de_del,
- struct buffer_head * bh);
-extern struct buffer_head * ldiskfs_find_entry(struct dentry *dentry,
- struct ldiskfs_dir_entry_2
- ** res_dir);
-extern int ldiskfs_add_dot_dotdot(handle_t *handle, struct inode *dir,
- struct inode *inode);
-
-extern int ldiskfs_xattr_set_handle(handle_t *handle, struct inode *inode,
- int name_index, const char *name,
- const void *value, size_t value_len,
- int flags);
-
static struct dentry * osd_child_dentry_get(const struct lu_env *env,
struct osd_object *obj,
const char *name,
return result;
}
-
-extern int iam_lvar_create(struct inode *obj, int keysize, int ptrsize,
- int recsize, handle_t *handle);
-
-extern int iam_lfix_create(struct inode *obj, int keysize, int ptrsize,
- int recsize, handle_t *handle);
-
enum {
OSD_NAME_LEN = 255
};
const struct dt_index_features *feat)
{
struct iam_descr *descr;
- struct dt_object *dt = &o->oo_dt;
if (osd_object_is_root(o))
return feat == &dt_directory_features;
if (feat == &dt_directory_features) {
if (descr->id_rec_size == sizeof(struct lu_fid_pack))
return 1;
-
- if (descr == &iam_htree_compat_param) {
- /* if it is a HTREE dir then there is good chance that,
- * we dealing with ext3 directory here with no FIDs. */
-
- if (descr->id_rec_size ==
- sizeof ((struct ldiskfs_dir_entry_2 *)NULL)->inode) {
-
- dt->do_index_ops = &osd_index_ea_ops;
- return 1;
- }
- }
- return 0;
+ else
+ return 0;
} else {
return
feat->dif_keysize_min <= descr->id_key_size &&
RETURN(err);
}
-extern void ldiskfs_orphan_cleanup (struct super_block * sb,
- struct ldiskfs_super_block * es);
-
static int osd_recovery_complete(const struct lu_env *env,
struct lu_device *d)
{
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * iam.c
+ * Top-level entry points into iam module
+ *
+ * Author: Wang Di <wangdi@clusterfs.com>
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+/*
+ * iam: big theory statement.
+ *
+ * iam (Index Access Module) is a module providing abstraction of persistent
+ * transactional container on top of generalized ldiskfs htree.
+ *
+ * iam supports:
+ *
+ * - key, pointer, and record size specifiable per container.
+ *
+ * - trees taller than 2 index levels.
+ *
+ * - read/write to existing ldiskfs htree directories as iam containers.
+ *
+ * iam container is a tree, consisting of leaf nodes containing keys and
+ * records stored in this container, and index nodes, containing keys and
+ * pointers to leaf or index nodes.
+ *
+ * iam does not work with keys directly, instead it calls user-supplied key
+ * comparison function (->dpo_keycmp()).
+ *
+ * Pointers are (currently) interpreted as logical offsets (measured in
+ * blocksful) within underlying flat file on top of which iam tree lives.
+ *
+ * On-disk format:
+ *
+ * iam mostly tries to reuse existing htree formats.
+ *
+ * Format of index node:
+ *
+ * +-----+-------+-------+-------+------+-------+------------+
+ * | | count | | | | | |
+ * | gap | / | entry | entry | .... | entry | free space |
+ * | | limit | | | | | |
+ * +-----+-------+-------+-------+------+-------+------------+
+ *
+ * gap this part of node is never accessed by iam code. It
+ * exists for binary compatibility with ldiskfs htree (that,
+ * in turn, stores fake struct ext2_dirent for ext2
+ * compatibility), and to keep some unspecified per-node
+ * data. Gap can be different for root and non-root index
+ * nodes. Gap size can be specified for each container
+ * (gap of 0 is allowed).
+ *
+ * count/limit current number of entries in this node, and the maximal
+ * number of entries that can fit into node. count/limit
+ * has the same size as entry, and is itself counted in
+ * count.
+ *
+ * entry index entry: consists of a key immediately followed by
+ * a pointer to a child node. The sizes of the key and the
+ * pointer depend on the container. An entry has neither
+ * alignment nor padding.
+ *
+ * free space portion of the node to which new entries are added
+ *
+ * Entries in index node are sorted by their key value.
+ *
+ * Format of a leaf node is not specified. Generic iam code accesses leaf
+ * nodes through ->id_leaf methods in struct iam_descr.
+ *
+ */
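+
+/*
+ * Illustration (a sketch, not part of the format definition above): with
+ * fixed-size keys and pointers, the count/limit pair occupies the first
+ * entry slot, and an index entry is just a key immediately followed by a
+ * block pointer:
+ *
+ *     struct dx_countlimit {
+ *             __le16 limit;
+ *             __le16 count;
+ *     };
+ *
+ *     entry: [key bytes][ptr bytes]   (no alignment, no padding)
+ */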
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/jbd.h>
+#include <linux/time.h>
+#include <linux/ldiskfs_fs.h>
+#include <linux/ldiskfs_jbd.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/smp_lock.h>
+#include "osd_internal.h"
+
+#include "xattr.h"
+#include "iopen.h"
+#include "acl.h"
+
+/*
+ * List of all registered formats.
+ *
+ * No locking. Callers synchronize.
+ */
+static LIST_HEAD(iam_formats);
+
+void iam_format_register(struct iam_format *fmt)
+{
+ list_add(&fmt->if_linkage, &iam_formats);
+}
+EXPORT_SYMBOL(iam_format_register);
+
+/*
+ * Determine format of given container. This is done by scanning list of
+ * registered formats and calling ->if_guess() method of each in turn.
+ */
+static int iam_format_guess(struct iam_container *c)
+{
+ int result;
+ struct iam_format *fmt;
+
+ /*
+ * XXX temporary initialization hook.
+ */
+ {
+ static int initialized = 0;
+
+ if (!initialized) {
+ iam_lvar_format_init();
+ iam_lfix_format_init();
+ initialized = 1;
+ }
+ }
+
+ result = -ENOENT;
+ list_for_each_entry(fmt, &iam_formats, if_linkage) {
+ result = fmt->if_guess(c);
+ if (result == 0)
+ break;
+ }
+ return result;
+}
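+
+/*
+ * Example (a sketch; "my_format" and "my_guess" are hypothetical): a
+ * backend registers itself once and is then probed through its
+ * ->if_guess() method, which returns 0 when the container matches:
+ *
+ *     static struct iam_format my_format = {
+ *             .if_guess = my_guess,
+ *     };
+ *     iam_format_register(&my_format);
+ */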
+
+/*
+ * Initialize container @c.
+ */
+int iam_container_init(struct iam_container *c,
+ struct iam_descr *descr, struct inode *inode)
+{
+ memset(c, 0, sizeof *c);
+ c->ic_descr = descr;
+ c->ic_object = inode;
+ init_rwsem(&c->ic_sem);
+ return 0;
+}
+EXPORT_SYMBOL(iam_container_init);
+
+/*
+ * Determine container format.
+ */
+int iam_container_setup(struct iam_container *c)
+{
+ return iam_format_guess(c);
+}
+EXPORT_SYMBOL(iam_container_setup);
+
+/*
+ * Finalize container @c, release all resources.
+ */
+void iam_container_fini(struct iam_container *c)
+{
+}
+EXPORT_SYMBOL(iam_container_fini);
+
+void iam_path_init(struct iam_path *path, struct iam_container *c,
+ struct iam_path_descr *pd)
+{
+ memset(path, 0, sizeof *path);
+ path->ip_container = c;
+ path->ip_frame = path->ip_frames;
+ path->ip_data = pd;
+ path->ip_leaf.il_path = path;
+}
+
+static void iam_leaf_fini(struct iam_leaf *leaf);
+
+void iam_path_release(struct iam_path *path)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) {
+ if (path->ip_frames[i].bh != NULL) {
+ brelse(path->ip_frames[i].bh);
+ path->ip_frames[i].bh = NULL;
+ }
+ }
+}
+
+void iam_path_fini(struct iam_path *path)
+{
+ iam_leaf_fini(&path->ip_leaf);
+ iam_path_release(path);
+}
+
+
+void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode)
+{
+ int i;
+
+ path->ipc_hinfo = &path->ipc_hinfo_area;
+ for (i = 0; i < ARRAY_SIZE(path->ipc_scratch); ++i)
+ path->ipc_descr.ipd_key_scratch[i] =
+ (struct iam_ikey *)&path->ipc_scratch[i];
+
+ iam_path_init(&path->ipc_path, &path->ipc_container, &path->ipc_descr);
+}
+
+void iam_path_compat_fini(struct iam_path_compat *path)
+{
+ iam_path_fini(&path->ipc_path);
+}
+
+/*
+ * Helper function initializing iam_path_descr and its key scratch area.
+ */
+struct iam_path_descr *iam_ipd_alloc(void *area, int keysize)
+{
+ struct iam_path_descr *ipd;
+ void *karea;
+ int i;
+
+ ipd = area;
+ karea = ipd + 1;
+ for (i = 0; i < ARRAY_SIZE(ipd->ipd_key_scratch); ++i, karea += keysize)
+ ipd->ipd_key_scratch[i] = karea;
+ return ipd;
+}
+EXPORT_SYMBOL(iam_ipd_alloc);
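+
+/*
+ * Typical usage (a sketch; @area is a caller-owned buffer): reserve room
+ * for the descriptor itself plus one scratch key per slot, then build
+ * the descriptor in place:
+ *
+ *     size = sizeof(struct iam_path_descr) +
+ *            ARRAY_SIZE(ipd->ipd_key_scratch) * keysize;
+ *     ipd = iam_ipd_alloc(area, keysize);
+ *     ...
+ *     iam_ipd_free(ipd);
+ */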
+
+void iam_ipd_free(struct iam_path_descr *ipd)
+{
+}
+EXPORT_SYMBOL(iam_ipd_free);
+
+int iam_node_read(struct iam_container *c, iam_ptr_t ptr,
+ handle_t *h, struct buffer_head **bh)
+{
+ int result = 0;
+
+ *bh = ldiskfs_bread(h, c->ic_object, (int)ptr, 0, &result);
+ if (*bh == NULL)
+ result = -EIO;
+ return result;
+}
+
+/*
+ * Return pointer to current leaf record. Pointer is valid while corresponding
+ * leaf node is locked and pinned.
+ */
+static struct iam_rec *iam_leaf_rec(const struct iam_leaf *leaf)
+{
+ return iam_leaf_ops(leaf)->rec(leaf);
+}
+
+/*
+ * Return pointer to the current leaf key. This function returns pointer to
+ * the key stored in node.
+ *
+ * Caller should assume that returned pointer is only valid while leaf node is
+ * pinned and locked.
+ */
+static struct iam_key *iam_leaf_key(const struct iam_leaf *leaf)
+{
+ return iam_leaf_ops(leaf)->key(leaf);
+}
+
+static int iam_leaf_key_size(const struct iam_leaf *leaf)
+{
+ return iam_leaf_ops(leaf)->key_size(leaf);
+}
+
+static struct iam_ikey *iam_leaf_ikey(const struct iam_leaf *leaf,
+ struct iam_ikey *key)
+{
+ return iam_leaf_ops(leaf)->ikey(leaf, key);
+}
+
+static int iam_leaf_keycmp(const struct iam_leaf *leaf,
+ const struct iam_key *key)
+{
+ return iam_leaf_ops(leaf)->key_cmp(leaf, key);
+}
+
+static int iam_leaf_keyeq(const struct iam_leaf *leaf,
+ const struct iam_key *key)
+{
+ return iam_leaf_ops(leaf)->key_eq(leaf, key);
+}
+
+#if LDISKFS_INVARIANT_ON
+static int iam_leaf_check(struct iam_leaf *leaf);
+extern int dx_node_check(struct iam_path *p, struct iam_frame *f);
+
+static int iam_path_check(struct iam_path *p)
+{
+ int i;
+ int result;
+ struct iam_frame *f;
+ struct iam_descr *param;
+
+ result = 1;
+ param = iam_path_descr(p);
+ for (i = 0; result && i < ARRAY_SIZE(p->ip_frames); ++i) {
+ f = &p->ip_frames[i];
+ if (f->bh != NULL) {
+ result = dx_node_check(p, f);
+ if (result)
+ result = !param->id_ops->id_node_check(p, f);
+ }
+ }
+ if (result && p->ip_leaf.il_bh != NULL)
+ result = iam_leaf_check(&p->ip_leaf);
+ if (result == 0) {
+ ldiskfs_std_error(iam_path_obj(p)->i_sb, result);
+ }
+ return result;
+}
+#endif
+
+static int iam_leaf_load(struct iam_path *path)
+{
+ iam_ptr_t block;
+ int err;
+ struct iam_container *c;
+ struct buffer_head *bh;
+ struct iam_leaf *leaf;
+ struct iam_descr *descr;
+
+ c = path->ip_container;
+ leaf = &path->ip_leaf;
+ descr = iam_path_descr(path);
+ block = path->ip_frame->leaf;
+ if (block == 0) {
+ /* XXX bug 11027 */
+ printk(KERN_EMERG "wrong leaf: %lu %d [%p %p %p]\n",
+ (long unsigned)path->ip_frame->leaf,
+ dx_get_count(dx_node_get_entries(path, path->ip_frame)),
+ path->ip_frames[0].bh, path->ip_frames[1].bh,
+ path->ip_frames[2].bh);
+ }
+ err = descr->id_ops->id_node_read(c, block, NULL, &bh);
+ if (err == 0) {
+ leaf->il_bh = bh;
+ leaf->il_curidx = block;
+ err = iam_leaf_ops(leaf)->init(leaf);
+ assert_inv(ergo(err == 0, iam_leaf_check(leaf)));
+ }
+ return err;
+}
+
+static void iam_unlock_htree(struct inode *dir, struct dynlock_handle *lh)
+{
+ if (lh != NULL)
+ dynlock_unlock(&LDISKFS_I(dir)->i_htree_lock, lh);
+}
+
+
+static void iam_leaf_unlock(struct iam_leaf *leaf)
+{
+ if (leaf->il_lock != NULL) {
+ iam_unlock_htree(iam_leaf_container(leaf)->ic_object,
+ leaf->il_lock);
+ do_corr(schedule());
+ leaf->il_lock = NULL;
+ }
+}
+
+static void iam_leaf_fini(struct iam_leaf *leaf)
+{
+ if (leaf->il_path != NULL) {
+ iam_leaf_unlock(leaf);
+ assert_inv(ergo(leaf->il_bh != NULL, iam_leaf_check(leaf)));
+ iam_leaf_ops(leaf)->fini(leaf);
+ if (leaf->il_bh) {
+ brelse(leaf->il_bh);
+ leaf->il_bh = NULL;
+ leaf->il_curidx = 0;
+ }
+ }
+}
+
+static void iam_leaf_start(struct iam_leaf *folio)
+{
+ iam_leaf_ops(folio)->start(folio);
+}
+
+void iam_leaf_next(struct iam_leaf *folio)
+{
+ iam_leaf_ops(folio)->next(folio);
+}
+
+static void iam_leaf_rec_add(struct iam_leaf *leaf, const struct iam_key *key,
+ const struct iam_rec *rec)
+{
+ iam_leaf_ops(leaf)->rec_add(leaf, key, rec);
+}
+
+static void iam_rec_del(struct iam_leaf *leaf, int shift)
+{
+ iam_leaf_ops(leaf)->rec_del(leaf, shift);
+}
+
+int iam_leaf_at_end(const struct iam_leaf *leaf)
+{
+ return iam_leaf_ops(leaf)->at_end(leaf);
+}
+
+void iam_leaf_split(struct iam_leaf *l, struct buffer_head **bh, iam_ptr_t nr)
+{
+ iam_leaf_ops(l)->split(l, bh, nr);
+}
+
+int iam_leaf_can_add(const struct iam_leaf *l,
+ const struct iam_key *k, const struct iam_rec *r)
+{
+ return iam_leaf_ops(l)->can_add(l, k, r);
+}
+
+#if LDISKFS_INVARIANT_ON
+static int iam_leaf_check(struct iam_leaf *leaf)
+{
+ return 1;
+#if 0
+ struct iam_lentry *orig;
+ struct iam_path *path;
+ struct iam_container *bag;
+ struct iam_ikey *k0;
+ struct iam_ikey *k1;
+ int result;
+ int first;
+
+ orig = leaf->il_at;
+ path = iam_leaf_path(leaf);
+ bag = iam_leaf_container(leaf);
+
+ result = iam_leaf_ops(leaf)->init(leaf);
+ if (result != 0)
+ return result;
+
+ first = 1;
+ iam_leaf_start(leaf);
+ k0 = iam_path_ikey(path, 0);
+ k1 = iam_path_ikey(path, 1);
+ while (!iam_leaf_at_end(leaf)) {
+ iam_ikeycpy(bag, k0, k1);
+ iam_ikeycpy(bag, k1, iam_leaf_ikey(leaf, k1));
+ if (!first && iam_ikeycmp(bag, k0, k1) > 0) {
+ return 0;
+ }
+ first = 0;
+ iam_leaf_next(leaf);
+ }
+ leaf->il_at = orig;
+ return 1;
+#endif
+}
+#endif
+
+static int iam_txn_dirty(handle_t *handle,
+ struct iam_path *path, struct buffer_head *bh)
+{
+ int result;
+
+ result = ldiskfs_journal_dirty_metadata(handle, bh);
+ if (result != 0)
+ ldiskfs_std_error(iam_path_obj(path)->i_sb, result);
+ return result;
+}
+
+static int iam_txn_add(handle_t *handle,
+ struct iam_path *path, struct buffer_head *bh)
+{
+ int result;
+
+ result = ldiskfs_journal_get_write_access(handle, bh);
+ if (result != 0)
+ ldiskfs_std_error(iam_path_obj(path)->i_sb, result);
+ return result;
+}
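+
+/*
+ * Together these helpers implement the jbd pattern used throughout this
+ * file: join a buffer to the transaction before modifying it, then mark
+ * it dirty afterwards (see, e.g., iam_it_rec_set()):
+ *
+ *     result = iam_txn_add(handle, path, bh);
+ *     if (result == 0) {
+ *             ... modify bh contents ...
+ *             result = iam_txn_dirty(handle, path, bh);
+ *     }
+ */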
+
+/***********************************************************************/
+/* iterator interface */
+/***********************************************************************/
+
+static enum iam_it_state it_state(const struct iam_iterator *it)
+{
+ return it->ii_state;
+}
+
+/*
+ * Helper function returning iterator's container.
+ */
+static struct iam_container *iam_it_container(const struct iam_iterator *it)
+{
+ return it->ii_path.ip_container;
+}
+
+static inline int it_keycmp(const struct iam_iterator *it,
+ const struct iam_key *k)
+{
+ return iam_leaf_keycmp(&it->ii_path.ip_leaf, k);
+}
+
+static inline int it_keyeq(const struct iam_iterator *it,
+ const struct iam_key *k)
+{
+ return iam_leaf_keyeq(&it->ii_path.ip_leaf, k);
+}
+
+static int it_ikeycmp(const struct iam_iterator *it, const struct iam_ikey *ik)
+{
+ return iam_ikeycmp(it->ii_path.ip_container,
+ iam_leaf_ikey(&it->ii_path.ip_leaf,
+ iam_path_ikey(&it->ii_path, 0)), ik);
+}
+
+static inline int it_at_rec(const struct iam_iterator *it)
+{
+ return !iam_leaf_at_end(&it->ii_path.ip_leaf);
+}
+
+static inline int it_before(const struct iam_iterator *it)
+{
+ return it_state(it) == IAM_IT_SKEWED && it_at_rec(it);
+}
+
+/*
+ * Helper wrapper around iam_it_get(): returns 0 (success) only when a
+ * record with exactly the key asked for is found.
+ */
+static int iam_it_get_exact(struct iam_iterator *it, const struct iam_key *k)
+{
+ int result;
+
+ result = iam_it_get(it, k);
+ if (result > 0)
+ result = 0;
+ else if (result == 0)
+ /*
+ * Return -ENOENT if cursor is located above record with a key
+ * different from one specified, or in the empty leaf.
+ *
+ * XXX returning -ENOENT only works if iam_it_get() never
+ * returns -ENOENT as a legitimate error.
+ */
+ result = -ENOENT;
+ return result;
+}
+
+void iam_container_write_lock(struct iam_container *ic)
+{
+ down_write(&ic->ic_sem);
+}
+
+void iam_container_write_unlock(struct iam_container *ic)
+{
+ up_write(&ic->ic_sem);
+}
+
+void iam_container_read_lock(struct iam_container *ic)
+{
+ down_read(&ic->ic_sem);
+}
+
+void iam_container_read_unlock(struct iam_container *ic)
+{
+ up_read(&ic->ic_sem);
+}
+
+/*
+ * Initialize iterator to IAM_IT_DETACHED state.
+ *
+ * postcondition: it_state(it) == IAM_IT_DETACHED
+ */
+int iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags,
+ struct iam_path_descr *pd)
+{
+ memset(it, 0, sizeof *it);
+ it->ii_flags = flags;
+ it->ii_state = IAM_IT_DETACHED;
+ iam_path_init(&it->ii_path, c, pd);
+ return 0;
+}
+EXPORT_SYMBOL(iam_it_init);
+
+/*
+ * Finalize iterator and release all resources.
+ *
+ * precondition: it_state(it) == IAM_IT_DETACHED
+ */
+void iam_it_fini(struct iam_iterator *it)
+{
+ assert_corr(it_state(it) == IAM_IT_DETACHED);
+ iam_path_fini(&it->ii_path);
+}
+EXPORT_SYMBOL(iam_it_fini);
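+
+/*
+ * Typical iterator life-cycle (a sketch; see iam_lookup() and
+ * iam_insert() below for complete examples): attach at a key, use the
+ * record accessors, detach, finalize:
+ *
+ *     iam_it_init(&it, c, flags, pd);
+ *     result = iam_it_get(&it, k);
+ *     ... iam_it_rec_get(&it), iam_it_next(&it) ...
+ *     iam_it_put(&it);
+ *     iam_it_fini(&it);
+ */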
+
+/*
+ * These locking primitives are used to protect parts of a directory's
+ * htree. The protection unit is a block: a leaf or an index node.
+ */
+struct dynlock_handle *iam_lock_htree(struct inode *dir, unsigned long value,
+ enum dynlock_type lt)
+{
+ return dynlock_lock(&LDISKFS_I(dir)->i_htree_lock, value, lt, GFP_NOFS);
+}
+
+
+
+int iam_index_lock(struct iam_path *path, struct dynlock_handle **lh)
+{
+ struct iam_frame *f;
+
+ for (f = path->ip_frame; f >= path->ip_frames; --f, ++lh) {
+ do_corr(schedule());
+ *lh = iam_lock_htree(iam_path_obj(path), f->curidx, DLT_READ);
+ if (*lh == NULL)
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+/*
+ * Fast check for frame consistency.
+ */
+static int iam_check_fast(struct iam_path *path, struct iam_frame *frame)
+{
+ struct iam_container *bag;
+ struct iam_entry *next;
+ struct iam_entry *last;
+ struct iam_entry *entries;
+ struct iam_entry *at;
+
+ bag = path->ip_container;
+ at = frame->at;
+ entries = frame->entries;
+ last = iam_entry_shift(path, entries, dx_get_count(entries) - 1);
+
+ if (unlikely(at > last))
+ return -EAGAIN;
+
+ if (unlikely(dx_get_block(path, at) != frame->leaf))
+ return -EAGAIN;
+
+ if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, at),
+ path->ip_ikey_target) > 0))
+ return -EAGAIN;
+
+ next = iam_entry_shift(path, at, +1);
+ if (next <= last) {
+ if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, next),
+ path->ip_ikey_target) <= 0))
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+int dx_index_is_compat(struct iam_path *path)
+{
+ return iam_path_descr(path) == NULL;
+}
+
+/*
+ * iam_find_position
+ *
+ * Search for the position of the specified hash (index key) within an
+ * index node.
+ */
+
+struct iam_entry *iam_find_position(struct iam_path *path,
+ struct iam_frame *frame)
+{
+ int count;
+ struct iam_entry *p;
+ struct iam_entry *q;
+ struct iam_entry *m;
+
+ count = dx_get_count(frame->entries);
+ assert_corr(count && count <= dx_get_limit(frame->entries));
+ p = iam_entry_shift(path, frame->entries,
+ dx_index_is_compat(path) ? 1 : 2);
+ q = iam_entry_shift(path, frame->entries, count - 1);
+ while (p <= q) {
+ m = iam_entry_shift(path, p, iam_entry_diff(path, q, p) / 2);
+ if (iam_ikeycmp(path->ip_container, iam_ikey_at(path, m),
+ path->ip_ikey_target) > 0)
+ q = iam_entry_shift(path, m, -1);
+ else
+ p = iam_entry_shift(path, m, +1);
+ }
+ return iam_entry_shift(path, p, -1);
+}
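+
+/*
+ * Worked example (illustrative): for an index node with keys [5, 9, 12]
+ * and a target key of 10, the binary search above narrows p/q until p
+ * points at the entry with key 12, the first key greater than the
+ * target; the entry before it, with key 9, the last entry not exceeding
+ * the target, is returned.
+ */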
+
+
+
+static iam_ptr_t iam_find_ptr(struct iam_path *path, struct iam_frame *frame)
+{
+ return dx_get_block(path, iam_find_position(path, frame));
+}
+
+void iam_insert_key(struct iam_path *path, struct iam_frame *frame,
+ const struct iam_ikey *key, iam_ptr_t ptr)
+{
+ struct iam_entry *entries = frame->entries;
+ struct iam_entry *new = iam_entry_shift(path, frame->at, +1);
+ int count = dx_get_count(entries);
+
+ /*
+ * Unfortunately we cannot assert this, as this function is sometimes
+ * called by VFS under i_sem and without pdirops lock.
+ */
+ assert_corr(1 || iam_frame_is_locked(path, frame));
+ assert_corr(count < dx_get_limit(entries));
+ assert_corr(frame->at < iam_entry_shift(path, entries, count));
+ assert_inv(dx_node_check(path, frame));
+
+ memmove(iam_entry_shift(path, new, 1), new,
+ (char *)iam_entry_shift(path, entries, count) - (char *)new);
+ dx_set_ikey(path, new, key);
+ dx_set_block(path, new, ptr);
+ dx_set_count(entries, count + 1);
+ assert_inv(dx_node_check(path, frame));
+}
+
+void iam_insert_key_lock(struct iam_path *path, struct iam_frame *frame,
+ const struct iam_ikey *key, iam_ptr_t ptr)
+{
+ iam_lock_bh(frame->bh);
+ iam_insert_key(path, frame, key, ptr);
+ iam_unlock_bh(frame->bh);
+}
+/*
+ * returns 0 if path was unchanged, -EAGAIN otherwise.
+ */
+static int iam_check_path(struct iam_path *path, struct iam_frame *frame)
+{
+ int equal;
+
+ iam_lock_bh(frame->bh);
+ equal = iam_check_fast(path, frame) == 0 ||
+ frame->leaf == iam_find_ptr(path, frame);
+ DX_DEVAL(iam_lock_stats.dls_bh_again += !equal);
+ iam_unlock_bh(frame->bh);
+
+ return equal ? 0 : -EAGAIN;
+}
+
+static int iam_lookup_try(struct iam_path *path)
+{
+ u32 ptr;
+ int err = 0;
+ int i;
+
+ struct iam_descr *param;
+ struct iam_frame *frame;
+ struct iam_container *c;
+
+ param = iam_path_descr(path);
+ c = path->ip_container;
+
+ ptr = param->id_ops->id_root_ptr(c);
+ for (frame = path->ip_frames, i = 0; i <= path->ip_indirect;
+ ++frame, ++i) {
+ err = param->id_ops->id_node_read(c, (iam_ptr_t)ptr, NULL,
+ &frame->bh);
+ do_corr(schedule());
+
+ iam_lock_bh(frame->bh);
+ /*
+ * node must be initialized under bh lock because a concurrent
+ * creation procedure may change it and iam_lookup_try() would
+ * see an obsolete tree height. -bzzz
+ */
+ if (err != 0)
+ break;
+
+ if (LDISKFS_INVARIANT_ON) {
+ err = param->id_ops->id_node_check(path, frame);
+ if (err != 0)
+ break;
+ }
+
+ err = param->id_ops->id_node_load(path, frame);
+ if (err != 0)
+ break;
+
+ assert_inv(dx_node_check(path, frame));
+ /*
+ * splitting may change the root index block and move the hash
+ * we are looking for into another index block, so we have to
+ * check for this situation and repeat from the beginning if
+ * the path has changed. -bzzz
+ */
+ if (i > 0) {
+ err = iam_check_path(path, frame - 1);
+ if (err != 0)
+ break;
+ }
+
+ frame->at = iam_find_position(path, frame);
+ frame->curidx = ptr;
+ frame->leaf = ptr = dx_get_block(path, frame->at);
+
+ iam_unlock_bh(frame->bh);
+ do_corr(schedule());
+ }
+ if (err != 0)
+ iam_unlock_bh(frame->bh);
+ path->ip_frame = --frame;
+ return err;
+}
+
+static int __iam_path_lookup(struct iam_path *path)
+{
+ int err;
+ int i;
+
+ for (i = 0; i < DX_MAX_TREE_HEIGHT; ++ i)
+ assert(path->ip_frames[i].bh == NULL);
+
+ do {
+ err = iam_lookup_try(path);
+ do_corr(schedule());
+ if (err != 0)
+ iam_path_fini(path);
+ } while (err == -EAGAIN);
+
+ return err;
+}
+
+/*
+ * returns 0 if path was unchanged, -EAGAIN otherwise.
+ */
+static int iam_check_full_path(struct iam_path *path, int search)
+{
+ struct iam_frame *bottom;
+ struct iam_frame *scan;
+ int i;
+ int result;
+
+ do_corr(schedule());
+
+ for (bottom = path->ip_frames, i = 0;
+ i < DX_MAX_TREE_HEIGHT && bottom->bh != NULL; ++bottom, ++i) {
+ ; /* find last filled in frame */
+ }
+
+ /*
+ * Lock frames, bottom to top.
+ */
+ for (scan = bottom - 1; scan >= path->ip_frames; --scan)
+ iam_lock_bh(scan->bh);
+ /*
+ * Check them top to bottom.
+ */
+ result = 0;
+ for (scan = path->ip_frames; scan < bottom; ++scan) {
+ struct iam_entry *pos;
+
+ if (search) {
+ if (iam_check_fast(path, scan) == 0)
+ continue;
+
+ pos = iam_find_position(path, scan);
+ if (scan->leaf != dx_get_block(path, pos)) {
+ result = -EAGAIN;
+ break;
+ }
+ scan->at = pos;
+ } else {
+ pos = iam_entry_shift(path, scan->entries,
+ dx_get_count(scan->entries) - 1);
+ if (scan->at > pos ||
+ scan->leaf != dx_get_block(path, scan->at)) {
+ result = -EAGAIN;
+ break;
+ }
+ }
+ }
+
+ /*
+ * Unlock top to bottom.
+ */
+ for (scan = path->ip_frames; scan < bottom; ++scan)
+ iam_unlock_bh(scan->bh);
+ DX_DEVAL(iam_lock_stats.dls_bh_full_again += !!result);
+ do_corr(schedule());
+
+ return result;
+}
+
+
+/*
+ * Performs path lookup and returns with found leaf (if any) locked by htree
+ * lock.
+ */
+int iam_lookup_lock(struct iam_path *path,
+ struct dynlock_handle **dl, enum dynlock_type lt)
+{
+ int result;
+ struct inode *dir;
+
+ dir = iam_path_obj(path);
+ while ((result = __iam_path_lookup(path)) == 0) {
+ do_corr(schedule());
+ *dl = iam_lock_htree(dir, path->ip_frame->leaf, lt);
+ if (*dl == NULL) {
+ iam_path_fini(path);
+ result = -ENOMEM;
+ break;
+ }
+ do_corr(schedule());
+ /*
+ * the leaf we just found may get split while we were locking
+ * it, so we need to check for this. -bzzz
+ */
+ if (iam_check_full_path(path, 1) == 0)
+ break;
+ iam_unlock_htree(dir, *dl);
+ *dl = NULL;
+ iam_path_fini(path);
+ }
+ return result;
+}
+/*
+ * Performs tree top-to-bottom traversal starting from root, and loads leaf
+ * node.
+ */
+static int iam_path_lookup(struct iam_path *path, int index)
+{
+ struct iam_container *c;
+ struct iam_descr *descr;
+ struct iam_leaf *leaf;
+ int result;
+
+ c = path->ip_container;
+ leaf = &path->ip_leaf;
+ descr = iam_path_descr(path);
+ result = iam_lookup_lock(path, &leaf->il_lock, DLT_WRITE);
+ assert_inv(iam_path_check(path));
+ do_corr(schedule());
+ if (result == 0) {
+ result = iam_leaf_load(path);
+ assert_inv(ergo(result == 0, iam_leaf_check(leaf)));
+ if (result == 0) {
+ do_corr(schedule());
+ if (index)
+ result = iam_leaf_ops(leaf)->
+ ilookup(leaf, path->ip_ikey_target);
+ else
+ result = iam_leaf_ops(leaf)->
+ lookup(leaf, path->ip_key_target);
+ do_corr(schedule());
+ }
+ if (result < 0)
+ iam_leaf_unlock(leaf);
+ }
+ return result;
+}
+
+/*
+ * Common part of iam_it_{i,}get().
+ */
+static int __iam_it_get(struct iam_iterator *it, int index)
+{
+ int result;
+ assert_corr(it_state(it) == IAM_IT_DETACHED);
+
+ result = iam_path_lookup(&it->ii_path, index);
+ if (result >= 0) {
+ int collision;
+
+ collision = result & IAM_LOOKUP_LAST;
+ switch (result & ~IAM_LOOKUP_LAST) {
+ case IAM_LOOKUP_EXACT:
+ result = +1;
+ it->ii_state = IAM_IT_ATTACHED;
+ break;
+ case IAM_LOOKUP_OK:
+ result = 0;
+ it->ii_state = IAM_IT_ATTACHED;
+ break;
+ case IAM_LOOKUP_BEFORE:
+ case IAM_LOOKUP_EMPTY:
+ result = 0;
+ it->ii_state = IAM_IT_SKEWED;
+ break;
+ default:
+ assert(0);
+ }
+ result |= collision;
+ }
+ /*
+ * See iam_it_get_exact() for explanation.
+ */
+ assert_corr(result != -ENOENT);
+ return result;
+}
+
+/*
+ * The correct hash was found, but not the same key: iterate through the
+ * hash collision chain, looking for the correct record.
+ */
+static int iam_it_collision(struct iam_iterator *it)
+{
+ int result;
+
+ assert(ergo(it_at_rec(it), !it_keyeq(it, it->ii_path.ip_key_target)));
+
+ while ((result = iam_it_next(it)) == 0) {
+ do_corr(schedule());
+ if (it_ikeycmp(it, it->ii_path.ip_ikey_target) != 0)
+ return -ENOENT;
+ if (it_keyeq(it, it->ii_path.ip_key_target))
+ return 0;
+ }
+ return result;
+}
+
+/*
+ * Attach iterator. After successful completion, @it points to the record
+ * with the largest key not larger than @k.
+ *
+ * Return value: 0: positioned on existing record,
+ * +ve: exact position found,
+ * -ve: error.
+ *
+ * precondition: it_state(it) == IAM_IT_DETACHED
+ * postcondition: ergo(result == 0 && it_state(it) == IAM_IT_ATTACHED,
+ * it_keycmp(it, k) <= 0)
+ */
+int iam_it_get(struct iam_iterator *it, const struct iam_key *k)
+{
+ int result;
+ assert_corr(it_state(it) == IAM_IT_DETACHED);
+
+ it->ii_path.ip_ikey_target = NULL;
+ it->ii_path.ip_key_target = k;
+
+ result = __iam_it_get(it, 0);
+
+ if (result == IAM_LOOKUP_LAST) {
+ result = iam_it_collision(it);
+ if (result != 0) {
+ iam_it_put(it);
+ iam_it_fini(it);
+ result = __iam_it_get(it, 0);
+ } else
+ result = +1;
+ }
+ if (result > 0)
+ result &= ~IAM_LOOKUP_LAST;
+
+ assert_corr(ergo(result > 0, it_keycmp(it, k) == 0));
+ assert_corr(ergo(result == 0 && it_state(it) == IAM_IT_ATTACHED,
+ it_keycmp(it, k) <= 0));
+ return result;
+}
+EXPORT_SYMBOL(iam_it_get);
+
+/*
+ * Attach iterator by index key.
+ */
+static int iam_it_iget(struct iam_iterator *it, const struct iam_ikey *k)
+{
+ assert_corr(it_state(it) == IAM_IT_DETACHED);
+
+ it->ii_path.ip_ikey_target = k;
+ return __iam_it_get(it, 1) & ~IAM_LOOKUP_LAST;
+}
+
+/*
+ * Attach iterator, and ensure it points to the record (not skewed).
+ *
+ * Return value: 0: positioned on existing record,
+ * +ve: exact position found,
+ * -ve: error.
+ *
+ * precondition: it_state(it) == IAM_IT_DETACHED &&
+ * !(it->ii_flags&IAM_IT_WRITE)
+ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED)
+ */
+int iam_it_get_at(struct iam_iterator *it, const struct iam_key *k)
+{
+ int result;
+ assert_corr(it_state(it) == IAM_IT_DETACHED &&
+ !(it->ii_flags&IAM_IT_WRITE));
+ result = iam_it_get(it, k);
+ if (result == 0) {
+ if (it_state(it) != IAM_IT_ATTACHED) {
+ assert_corr(it_state(it) == IAM_IT_SKEWED);
+ result = iam_it_next(it);
+ }
+ }
+ assert_corr(ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED));
+ return result;
+}
+EXPORT_SYMBOL(iam_it_get_at);
+
+/*
+ * Duplicates iterator.
+ *
+ * postcondition: it_state(dst) == it_state(src) &&
+ * iam_it_container(dst) == iam_it_container(src) &&
+ * dst->ii_flags = src->ii_flags &&
+ * ergo(it_state(src) == IAM_IT_ATTACHED,
+ * iam_it_rec_get(dst) == iam_it_rec_get(src) &&
+ * iam_it_key_get(dst) == iam_it_key_get(src))
+ */
+void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src)
+{
+ dst->ii_flags = src->ii_flags;
+ dst->ii_state = src->ii_state;
+ /* XXX not yet. iam_path_dup(&dst->ii_path, &src->ii_path); */
+ /*
+ * XXX: duplicate lock.
+ */
+ assert_corr(it_state(dst) == it_state(src));
+ assert_corr(iam_it_container(dst) == iam_it_container(src));
+ assert_corr(dst->ii_flags == src->ii_flags);
+ assert_corr(ergo(it_state(src) == IAM_IT_ATTACHED,
+ iam_it_rec_get(dst) == iam_it_rec_get(src) &&
+ iam_it_key_get(dst) == iam_it_key_get(src)));
+
+}
+
+/*
+ * Detach iterator. Does nothing in detached state.
+ *
+ * postcondition: it_state(it) == IAM_IT_DETACHED
+ */
+void iam_it_put(struct iam_iterator *it)
+{
+ if (it->ii_state != IAM_IT_DETACHED) {
+ it->ii_state = IAM_IT_DETACHED;
+ iam_leaf_fini(&it->ii_path.ip_leaf);
+ }
+}
+EXPORT_SYMBOL(iam_it_put);
+
+static struct iam_ikey *iam_it_ikey_get(const struct iam_iterator *it,
+ struct iam_ikey *ikey);
+
+
+/*
+ * This function increments the frame pointer to search the next leaf
+ * block, and reads in the necessary intervening nodes if the search
+ * requires it. Whether or not the search is necessary is
+ * controlled by the hash parameter. If the hash value is even, then
+ * the search is only continued if the next block starts with that
+ * hash value. This is used if we are searching for a specific file.
+ *
+ * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
+ *
+ * This function returns 1 if the caller should continue to search,
+ * or 0 if it should not. If there is an error reading one of the
+ * index blocks, it will return a negative error code.
+ *
+ * If start_hash is non-null, it will be filled in with the starting
+ * hash of the next page.
+ */
+static int iam_htree_advance(struct inode *dir, __u32 hash,
+ struct iam_path *path, __u32 *start_hash,
+ int compat)
+{
+ struct iam_frame *p;
+ struct buffer_head *bh;
+ int err, num_frames = 0;
+ __u32 bhash;
+
+ p = path->ip_frame;
+ /*
+ * Find the next leaf page by incrementing the frame pointer.
+ * If we run out of entries in the interior node, loop around and
+ * increment pointer in the parent node. When we break out of
+ * this loop, num_frames indicates the number of interior
+ * nodes that need to be read.
+ */
+ while (1) {
+ do_corr(schedule());
+ iam_lock_bh(p->bh);
+ p->at = iam_entry_shift(path, p->at, +1);
+ if (p->at < iam_entry_shift(path, p->entries,
+ dx_get_count(p->entries))) {
+ p->leaf = dx_get_block(path, p->at);
+ iam_unlock_bh(p->bh);
+ break;
+ }
+ iam_unlock_bh(p->bh);
+ if (p == path->ip_frames)
+ return 0;
+ num_frames++;
+ --p;
+ }
+
+ if (compat) {
+ /*
+ * Htree hash magic.
+ */
+ /*
+ * If the hash is 1, then continue only if the next page has a
+ * continuation hash of any value. This is used for readdir
+ * handling. Otherwise, check to see if the hash matches the
+ * desired continuation hash. If it doesn't, return since
+ * there's no point in reading the successive index pages.
+ */
+ dx_get_ikey(path, p->at, (struct iam_ikey *)&bhash);
+ if (start_hash)
+ *start_hash = bhash;
+ if ((hash & 1) == 0) {
+ if ((bhash & ~1) != hash)
+ return 0;
+ }
+ }
+ /*
+ * If the hash is HASH_NB_ALWAYS, we always go to the next
+ * block, so no check is necessary.
+ */
+ while (num_frames--) {
+ iam_ptr_t idx;
+
+ do_corr(schedule());
+ iam_lock_bh(p->bh);
+ idx = p->leaf = dx_get_block(path, p->at);
+ iam_unlock_bh(p->bh);
+ err = iam_path_descr(path)->id_ops->
+ id_node_read(path->ip_container, idx, NULL, &bh);
+ if (err != 0)
+ return err; /* Failure */
+ ++p;
+ brelse(p->bh);
+ assert_corr(p->bh != bh);
+ p->bh = bh;
+ p->entries = dx_node_get_entries(path, p);
+ p->at = iam_entry_shift(path, p->entries, !compat);
+ assert_corr(p->curidx != idx);
+ p->curidx = idx;
+ iam_lock_bh(p->bh);
+ assert_corr(p->leaf != dx_get_block(path, p->at));
+ p->leaf = dx_get_block(path, p->at);
+ iam_unlock_bh(p->bh);
+ assert_inv(dx_node_check(path, p));
+ }
+ return 1;
+}
+
+
+static inline int iam_index_advance(struct iam_path *path)
+{
+ return iam_htree_advance(iam_path_obj(path), 0, path, NULL, 0);
+}
+
+static void iam_unlock_array(struct inode *dir, struct dynlock_handle **lh)
+{
+ int i;
+
+ for (i = 0; i < DX_MAX_TREE_HEIGHT; ++i, ++lh) {
+ if (*lh != NULL) {
+ iam_unlock_htree(dir, *lh);
+ *lh = NULL;
+ }
+ }
+}
+/*
+ * Advance index part of @path to point to the next leaf. Returns 1 on
+ * success, 0 when the end of the container was reached. Leaf node is
+ * locked.
+ */
+int iam_index_next(struct iam_container *c, struct iam_path *path)
+{
+ iam_ptr_t cursor;
+ struct dynlock_handle *lh[DX_MAX_TREE_HEIGHT] = { 0, };
+ int result;
+ struct inode *object;
+
+ /*
+ * Locking for iam_index_next()... is to be described.
+ */
+
+ object = c->ic_object;
+ cursor = path->ip_frame->leaf;
+
+ while (1) {
+ result = iam_index_lock(path, lh);
+ do_corr(schedule());
+ if (result < 0)
+ break;
+
+ result = iam_check_full_path(path, 0);
+ if (result == 0 && cursor == path->ip_frame->leaf) {
+ result = iam_index_advance(path);
+
+ assert_corr(result == 0 ||
+ cursor != path->ip_frame->leaf);
+ break;
+ }
+ do {
+ iam_unlock_array(object, lh);
+
+ iam_path_release(path);
+ do_corr(schedule());
+
+ result = __iam_path_lookup(path);
+ if (result < 0)
+ break;
+
+ while (path->ip_frame->leaf != cursor) {
+ do_corr(schedule());
+
+ result = iam_index_lock(path, lh);
+ do_corr(schedule());
+ if (result < 0)
+ break;
+
+ result = iam_check_full_path(path, 0);
+ if (result != 0)
+ break;
+
+ result = iam_index_advance(path);
+ if (result == 0) {
+ CERROR("cannot find cursor : %u\n",
+ cursor);
+ result = -EIO;
+ }
+ if (result < 0)
+ break;
+ result = iam_check_full_path(path, 0);
+ if (result != 0)
+ break;
+ iam_unlock_array(object, lh);
+ }
+ } while (result == -EAGAIN);
+ if (result < 0)
+ break;
+ }
+ iam_unlock_array(object, lh);
+ return result;
+}
+
+/*
+ * Move iterator one record right.
+ *
+ * Return value: 0: success,
+ * +1: end of container reached
+ * -ve: error
+ *
+ * precondition: (it_state(it) == IAM_IT_ATTACHED ||
+ * it_state(it) == IAM_IT_SKEWED) && it->ii_flags&IAM_IT_MOVE
+ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED) &&
+ * ergo(result > 0, it_state(it) == IAM_IT_DETACHED)
+ */
+int iam_it_next(struct iam_iterator *it)
+{
+ int result;
+ struct iam_path *path;
+ struct iam_leaf *leaf;
+ struct inode *obj;
+ do_corr(struct iam_ikey *ik_orig);
+
+ /* assert_corr(it->ii_flags&IAM_IT_MOVE); */
+ assert_corr(it_state(it) == IAM_IT_ATTACHED ||
+ it_state(it) == IAM_IT_SKEWED);
+
+ path = &it->ii_path;
+ leaf = &path->ip_leaf;
+ obj = iam_path_obj(path);
+
+ assert_corr(iam_leaf_is_locked(leaf));
+
+ result = 0;
+ do_corr(ik_orig = it_at_rec(it) ?
+ iam_it_ikey_get(it, iam_path_ikey(path, 2)) : NULL);
+ if (it_before(it)) {
+ assert_corr(!iam_leaf_at_end(leaf));
+ it->ii_state = IAM_IT_ATTACHED;
+ } else {
+ if (!iam_leaf_at_end(leaf))
+ /* advance within leaf node */
+ iam_leaf_next(leaf);
+ /*
+ * multiple iterations may be necessary due to empty leaves.
+ */
+ while (result == 0 && iam_leaf_at_end(leaf)) {
+ do_corr(schedule());
+ /* advance index portion of the path */
+ result = iam_index_next(iam_it_container(it), path);
+ assert_corr(iam_leaf_is_locked(leaf));
+ if (result == 1) {
+ struct dynlock_handle *lh;
+ lh = iam_lock_htree(obj, path->ip_frame->leaf,
+ DLT_WRITE);
+ if (lh != NULL) {
+ iam_leaf_fini(leaf);
+ leaf->il_lock = lh;
+ result = iam_leaf_load(path);
+ if (result == 0)
+ iam_leaf_start(leaf);
+ } else
+ result = -ENOMEM;
+ } else if (result == 0)
+ /* end of container reached */
+ result = +1;
+ if (result != 0)
+ iam_it_put(it);
+ }
+ if (result == 0)
+ it->ii_state = IAM_IT_ATTACHED;
+ }
+ assert_corr(ergo(result == 0, it_state(it) == IAM_IT_ATTACHED));
+ assert_corr(ergo(result > 0, it_state(it) == IAM_IT_DETACHED));
+ assert_corr(ergo(result == 0 && ik_orig != NULL,
+ it_ikeycmp(it, ik_orig) >= 0));
+ return result;
+}
+EXPORT_SYMBOL(iam_it_next);
+
+/*
+ * Return pointer to the record under iterator.
+ *
+ * precondition: it_state(it) == IAM_IT_ATTACHED && it_at_rec(it)
+ * postcondition: it_state(it) == IAM_IT_ATTACHED
+ */
+struct iam_rec *iam_it_rec_get(const struct iam_iterator *it)
+{
+ assert_corr(it_state(it) == IAM_IT_ATTACHED);
+ assert_corr(it_at_rec(it));
+ return iam_leaf_rec(&it->ii_path.ip_leaf);
+}
+EXPORT_SYMBOL(iam_it_rec_get);
+
+static void iam_it_reccpy(struct iam_iterator *it, const struct iam_rec *r)
+{
+ struct iam_leaf *folio;
+
+ folio = &it->ii_path.ip_leaf;
+ iam_leaf_ops(folio)->rec_set(folio, r);
+}
+
+/*
+ * Replace contents of record under iterator.
+ *
+ * precondition: it_state(it) == IAM_IT_ATTACHED &&
+ * it->ii_flags&IAM_IT_WRITE
+ * postcondition: it_state(it) == IAM_IT_ATTACHED &&
+ * ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...))
+ */
+int iam_it_rec_set(handle_t *h,
+ struct iam_iterator *it, const struct iam_rec *r)
+{
+ int result;
+ struct iam_path *path;
+ struct buffer_head *bh;
+
+ assert_corr(it_state(it) == IAM_IT_ATTACHED &&
+ it->ii_flags&IAM_IT_WRITE);
+ assert_corr(it_at_rec(it));
+
+ path = &it->ii_path;
+ bh = path->ip_leaf.il_bh;
+ result = iam_txn_add(h, path, bh);
+ if (result == 0) {
+ iam_it_reccpy(it, r);
+ result = iam_txn_dirty(h, path, bh);
+ }
+ return result;
+}
+EXPORT_SYMBOL(iam_it_rec_set);
+
+/*
+ * Return pointer to the index key under iterator.
+ *
+ * precondition: it_state(it) == IAM_IT_ATTACHED ||
+ * it_state(it) == IAM_IT_SKEWED
+ */
+static struct iam_ikey *iam_it_ikey_get(const struct iam_iterator *it,
+ struct iam_ikey *ikey)
+{
+ assert_corr(it_state(it) == IAM_IT_ATTACHED ||
+ it_state(it) == IAM_IT_SKEWED);
+ assert_corr(it_at_rec(it));
+ return iam_leaf_ikey(&it->ii_path.ip_leaf, ikey);
+}
+
+/*
+ * Return pointer to the key under iterator.
+ *
+ * precondition: it_state(it) == IAM_IT_ATTACHED ||
+ * it_state(it) == IAM_IT_SKEWED
+ */
+struct iam_key *iam_it_key_get(const struct iam_iterator *it)
+{
+ assert_corr(it_state(it) == IAM_IT_ATTACHED ||
+ it_state(it) == IAM_IT_SKEWED);
+ assert_corr(it_at_rec(it));
+ return iam_leaf_key(&it->ii_path.ip_leaf);
+}
+EXPORT_SYMBOL(iam_it_key_get);
+
+/*
+ * Return size of key under iterator (in bytes)
+ *
+ * precondition: it_state(it) == IAM_IT_ATTACHED ||
+ * it_state(it) == IAM_IT_SKEWED
+ */
+int iam_it_key_size(const struct iam_iterator *it)
+{
+ assert_corr(it_state(it) == IAM_IT_ATTACHED ||
+ it_state(it) == IAM_IT_SKEWED);
+ assert_corr(it_at_rec(it));
+ return iam_leaf_key_size(&it->ii_path.ip_leaf);
+}
+EXPORT_SYMBOL(iam_it_key_size);
+
+/*
+ * Insertion of new record. Interaction with jbd during non-trivial case (when
+ * split happens) is as following:
+ *
+ * - new leaf node is involved into transaction by ldiskfs_append();
+ *
+ * - old leaf node is involved into transaction by iam_add_rec();
+ *
+ * - the leaf where the insertion point ends up is marked dirty by
+ * iam_add_rec();
+ *
+ * - leaf without insertion point is marked dirty (as @new_leaf) by
+ * iam_new_leaf();
+ *
+ * - split index nodes are involved into transaction and marked dirty by
+ * split_index_node().
+ *
+ * - "safe" index node, which is no split, but where new pointer is inserted
+ * is involved into transaction and marked dirty by split_index_node().
+ *
+ * - index node where pointer to new leaf is inserted is involved into
+ * transaction by split_index_node() and marked dirty by iam_add_rec().
+ *
+ * - inode is marked dirty by iam_add_rec().
+ *
+ */
+
+static int iam_new_leaf(handle_t *handle, struct iam_leaf *leaf)
+{
+ int err;
+ iam_ptr_t blknr;
+ struct buffer_head *new_leaf;
+ struct buffer_head *old_leaf;
+ struct iam_container *c;
+ struct inode *obj;
+ struct iam_path *path;
+
+ assert_inv(iam_leaf_check(leaf));
+
+ c = iam_leaf_container(leaf);
+ path = leaf->il_path;
+
+ obj = c->ic_object;
+ new_leaf = ldiskfs_append(handle, obj, (__u32 *)&blknr, &err);
+ do_corr(schedule());
+ if (new_leaf != NULL) {
+ struct dynlock_handle *lh;
+
+ lh = iam_lock_htree(obj, blknr, DLT_WRITE);
+ do_corr(schedule());
+ if (lh != NULL) {
+ iam_leaf_ops(leaf)->init_new(c, new_leaf);
+ do_corr(schedule());
+ old_leaf = leaf->il_bh;
+ iam_leaf_split(leaf, &new_leaf, blknr);
+ if (old_leaf != leaf->il_bh) {
+ /*
+ * Switched to the new leaf.
+ */
+ iam_leaf_unlock(leaf);
+ leaf->il_lock = lh;
+ path->ip_frame->leaf = blknr;
+ } else
+ iam_unlock_htree(obj, lh);
+ do_corr(schedule());
+ err = iam_txn_dirty(handle, path, new_leaf);
+ brelse(new_leaf);
+ if (err == 0)
+ err = ldiskfs_mark_inode_dirty(handle, obj);
+ do_corr(schedule());
+ } else
+ err = -ENOMEM;
+ }
+ assert_inv(iam_leaf_check(leaf));
+ assert_inv(iam_leaf_check(&iam_leaf_path(leaf)->ip_leaf));
+ assert_inv(iam_path_check(iam_leaf_path(leaf)));
+ return err;
+}
+
+static inline void dx_set_limit(struct iam_entry *entries, unsigned value)
+{
+ ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
+}
+
+static int iam_shift_entries(struct iam_path *path,
+ struct iam_frame *frame, unsigned count,
+ struct iam_entry *entries, struct iam_entry *entries2,
+ u32 newblock)
+{
+ unsigned count1;
+ unsigned count2;
+ int delta;
+
+ struct iam_frame *parent = frame - 1;
+ struct iam_ikey *pivot = iam_path_ikey(path, 3);
+
+ delta = dx_index_is_compat(path) ? 0 : +1;
+
+ count1 = count/2 + delta;
+ count2 = count - count1;
+ dx_get_ikey(path, iam_entry_shift(path, entries, count1), pivot);
+
+ dxtrace(printk("Split index %i/%i\n", count1, count2));
+
+ memcpy((char *) iam_entry_shift(path, entries2, delta),
+ (char *) iam_entry_shift(path, entries, count1),
+ count2 * iam_entry_size(path));
+
+ dx_set_count(entries2, count2 + delta);
+ dx_set_limit(entries2, dx_node_limit(path));
+
+ /*
+ * NOTE: very subtle piece of code. A competing dx_probe() may find
+ * the 2nd level index in the root index; then we insert the new
+ * index here and set the new count in that 2nd level index. So
+ * dx_probe() may see the 2nd level index without the hash it looks
+ * for. The solution is to check the root index after we lock the
+ * just-found 2nd level index. -bzzz
+ */
+ iam_insert_key_lock(path, parent, pivot, newblock);
+
+ /*
+ * now old and new 2nd level index blocks contain all pointers, so
+ * dx_probe() may find it in both. That's OK. -bzzz
+ */
+ iam_lock_bh(frame->bh);
+ dx_set_count(entries, count1);
+ iam_unlock_bh(frame->bh);
+
+ /*
+ * now the old 2nd level index block points to the first half of
+ * the leaves. It's important that dx_probe() checks the root index
+ * block for changes under dx_lock_bh(frame->bh). -bzzz
+ */
+
+ return count1;
+}
+
+
+int split_index_node(handle_t *handle, struct iam_path *path,
+ struct dynlock_handle **lh)
+{
+
+ struct iam_entry *entries; /* old block contents */
+ struct iam_entry *entries2; /* new block contents */
+ struct iam_frame *frame, *safe;
+ struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0};
+ u32 newblock[DX_MAX_TREE_HEIGHT] = {0};
+ struct dynlock_handle *lock[DX_MAX_TREE_HEIGHT] = {NULL,};
+ struct dynlock_handle *new_lock[DX_MAX_TREE_HEIGHT] = {NULL,};
+ struct inode *dir = iam_path_obj(path);
+ struct iam_descr *descr;
+ int nr_splet;
+ int i, err;
+
+ descr = iam_path_descr(path);
+ /*
+ * Algorithm below depends on this.
+ */
+ assert_corr(dx_root_limit(path) < dx_node_limit(path));
+
+ frame = path->ip_frame;
+ entries = frame->entries;
+
+ /*
+ * Tall-tree handling: we might have to split multiple index blocks
+ * all the way up to tree root. Tricky point here is error handling:
+ * to avoid complicated undo/rollback we
+ *
+ * - first allocate all necessary blocks
+ *
+ * - insert pointers into them atomically.
+ */
+
+ /*
+ * Locking: leaf is already locked. htree-locks are acquired on all
+ * index nodes that require split bottom-to-top, on the "safe" node,
+ * and on all new nodes
+ */
+
+ dxtrace(printk("using %u of %u node entries\n",
+ dx_get_count(entries), dx_get_limit(entries)));
+
+ /* What levels need split? */
+ for (nr_splet = 0; frame >= path->ip_frames &&
+ dx_get_count(frame->entries) == dx_get_limit(frame->entries);
+ --frame, ++nr_splet) {
+ do_corr(schedule());
+ if (nr_splet == DX_MAX_TREE_HEIGHT) {
+ /*
+ CWARN(dir->i_sb, __FUNCTION__,
+ "Directory index full!\n");
+ */
+ err = -ENOSPC;
+ goto cleanup;
+ }
+ }
+
+ safe = frame;
+
+ /*
+ * Lock all nodes, bottom to top.
+ */
+ for (frame = path->ip_frame, i = nr_splet; i >= 0; --i, --frame) {
+ do_corr(schedule());
+ lock[i] = iam_lock_htree(dir, frame->curidx, DLT_WRITE);
+ if (lock[i] == NULL) {
+ err = -ENOMEM;
+ goto cleanup;
+ }
+ }
+
+ /*
+ * Check for concurrent index modification.
+ */
+ err = iam_check_full_path(path, 1);
+ if (err)
+ goto cleanup;
+ /*
+ * And check that the same number of nodes is to be split.
+ */
+ for (i = 0, frame = path->ip_frame; frame >= path->ip_frames &&
+ dx_get_count(frame->entries) == dx_get_limit(frame->entries);
+ --frame, ++i) {
+ ;
+ }
+ if (i != nr_splet) {
+ err = -EAGAIN;
+ goto cleanup;
+ }
+
+ /* Go back down, allocating blocks, locking them, and adding into
+ * transaction... */
+ for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
+ bh_new[i] = ldiskfs_append (handle, dir, &newblock[i], &err);
+ do_corr(schedule());
+ if (!bh_new[i] ||
+ descr->id_ops->id_node_init(path->ip_container,
+ bh_new[i], 0) != 0)
+ goto cleanup;
+ new_lock[i] = iam_lock_htree(dir, newblock[i], DLT_WRITE);
+ if (new_lock[i] == NULL) {
+ err = -ENOMEM;
+ goto cleanup;
+ }
+ do_corr(schedule());
+ BUFFER_TRACE(frame->bh, "get_write_access");
+ err = ldiskfs_journal_get_write_access(handle, frame->bh);
+ if (err)
+ goto journal_error;
+ }
+ /* Add "safe" node to transaction too */
+ if (safe + 1 != path->ip_frames) {
+ do_corr(schedule());
+ err = ldiskfs_journal_get_write_access(handle, safe->bh);
+ if (err)
+ goto journal_error;
+ }
+
+ /* Go through nodes once more, inserting pointers */
+ for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
+ unsigned count;
+ int idx;
+ struct buffer_head *bh2;
+ struct buffer_head *bh;
+
+ entries = frame->entries;
+ count = dx_get_count(entries);
+ idx = iam_entry_diff(path, frame->at, entries);
+
+ bh2 = bh_new[i];
+ entries2 = dx_get_entries(path, bh2->b_data, 0);
+
+ bh = frame->bh;
+ if (frame == path->ip_frames) {
+ /* splitting root node. Tricky point:
+ *
+ * In the "normal" B-tree we'd split root *and* add
+ * new root to the tree with pointers to the old root
+ * and its sibling (thus introducing two new nodes).
+ *
+ * In htree it's enough to add one node, because
+ * capacity of the root node is smaller than that of
+ * non-root one.
+ */
+ struct iam_frame *frames;
+ struct iam_entry *next;
+
+ assert_corr(i == 0);
+
+ do_corr(schedule());
+
+ frames = path->ip_frames;
+ memcpy((char *) entries2, (char *) entries,
+ count * iam_entry_size(path));
+ dx_set_limit(entries2, dx_node_limit(path));
+
+ /* Set up root */
+ iam_lock_bh(frame->bh);
+ next = descr->id_ops->id_root_inc(path->ip_container,
+ path, frame);
+ dx_set_block(path, next, newblock[0]);
+ iam_unlock_bh(frame->bh);
+
+ do_corr(schedule());
+ /* Shift frames in the path */
+ memmove(frames + 2, frames + 1,
+ (sizeof path->ip_frames) - 2 * sizeof frames[0]);
+ /* Add new access path frame */
+ frames[1].at = iam_entry_shift(path, entries2, idx);
+ frames[1].entries = entries = entries2;
+ frames[1].bh = bh2;
+ assert_inv(dx_node_check(path, frame));
+ ++ path->ip_frame;
+ ++ frame;
+ assert_inv(dx_node_check(path, frame));
+ bh_new[0] = NULL; /* buffer head is "consumed" */
+ err = ldiskfs_journal_get_write_access(handle, bh2);
+ if (err)
+ goto journal_error;
+ do_corr(schedule());
+ } else {
+ /* splitting non-root index node. */
+ struct iam_frame *parent = frame - 1;
+
+ do_corr(schedule());
+ count = iam_shift_entries(path, frame, count,
+ entries, entries2, newblock[i]);
+ /* Which index block gets the new entry? */
+ if (idx >= count) {
+ int d = dx_index_is_compat(path) ? 0 : +1;
+
+ frame->at = iam_entry_shift(path, entries2,
+ idx - count + d);
+ frame->entries = entries = entries2;
+ frame->curidx = newblock[i];
+ swap(frame->bh, bh2);
+ assert_corr(lock[i + 1] != NULL);
+ assert_corr(new_lock[i] != NULL);
+ swap(lock[i + 1], new_lock[i]);
+ bh_new[i] = bh2;
+ parent->at = iam_entry_shift(path,
+ parent->at, +1);
+ }
+ assert_inv(dx_node_check(path, frame));
+ assert_inv(dx_node_check(path, parent));
+ dxtrace(dx_show_index ("node", frame->entries));
+ dxtrace(dx_show_index ("node",
+ ((struct dx_node *) bh2->b_data)->entries));
+ err = ldiskfs_journal_dirty_metadata(handle, bh2);
+ if (err)
+ goto journal_error;
+ do_corr(schedule());
+ err = ldiskfs_journal_dirty_metadata(handle, parent->bh);
+ if (err)
+ goto journal_error;
+ }
+ do_corr(schedule());
+ err = ldiskfs_journal_dirty_metadata(handle, bh);
+ if (err)
+ goto journal_error;
+ }
+ /*
+ * This function was called to make insertion of new leaf
+ * possible. Check that it fulfilled its obligations.
+ */
+ assert_corr(dx_get_count(path->ip_frame->entries) <
+ dx_get_limit(path->ip_frame->entries));
+ assert_corr(lock[nr_splet] != NULL);
+ *lh = lock[nr_splet];
+ lock[nr_splet] = NULL;
+ if (nr_splet > 0) {
+ /*
+ * Log ->i_size modification.
+ */
+ err = ldiskfs_mark_inode_dirty(handle, dir);
+ if (err)
+ goto journal_error;
+ }
+ goto cleanup;
+journal_error:
+ ldiskfs_std_error(dir->i_sb, err);
+
+cleanup:
+ iam_unlock_array(dir, lock);
+ iam_unlock_array(dir, new_lock);
+
+ assert_corr(err || iam_frame_is_locked(path, path->ip_frame));
+
+ do_corr(schedule());
+ for (i = 0; i < ARRAY_SIZE(bh_new); ++i) {
+ if (bh_new[i] != NULL)
+ brelse(bh_new[i]);
+ }
+ return err;
+}
+
+static int iam_add_rec(handle_t *handle, struct iam_iterator *it,
+ struct iam_path *path,
+ const struct iam_key *k, const struct iam_rec *r)
+{
+ int err;
+ struct iam_leaf *leaf;
+
+ leaf = &path->ip_leaf;
+ assert_inv(iam_leaf_check(leaf));
+ assert_inv(iam_path_check(path));
+ err = iam_txn_add(handle, path, leaf->il_bh);
+ if (err == 0) {
+ do_corr(schedule());
+ if (!iam_leaf_can_add(leaf, k, r)) {
+ struct dynlock_handle *lh = NULL;
+
+ do {
+ assert_corr(lh == NULL);
+ do_corr(schedule());
+ err = split_index_node(handle, path, &lh);
+ if (err == -EAGAIN) {
+ assert_corr(lh == NULL);
+
+ iam_path_fini(path);
+ it->ii_state = IAM_IT_DETACHED;
+
+ do_corr(schedule());
+ err = iam_it_get_exact(it, k);
+ if (err == -ENOENT)
+ err = +1; /* repeat split */
+ else if (err == 0)
+ err = -EEXIST;
+ }
+ } while (err > 0);
+ assert_inv(iam_path_check(path));
+ if (err == 0) {
+ assert_corr(lh != NULL);
+ do_corr(schedule());
+ err = iam_new_leaf(handle, leaf);
+ if (err == 0)
+ err = iam_txn_dirty(handle, path,
+ path->ip_frame->bh);
+ }
+ iam_unlock_htree(iam_path_obj(path), lh);
+ do_corr(schedule());
+ }
+ if (err == 0) {
+ iam_leaf_rec_add(leaf, k, r);
+ err = iam_txn_dirty(handle, path, leaf->il_bh);
+ }
+ }
+ assert_inv(iam_leaf_check(leaf));
+ assert_inv(iam_leaf_check(&path->ip_leaf));
+ assert_inv(iam_path_check(path));
+ return err;
+}
+
+/*
+ * Insert new record with key @k and contents from @r, shifting records to the
+ * right. On success, iterator is positioned on the newly inserted record.
+ *
+ * precondition: it->ii_flags&IAM_IT_WRITE &&
+ * (it_state(it) == IAM_IT_ATTACHED ||
+ * it_state(it) == IAM_IT_SKEWED) &&
+ * ergo(it_state(it) == IAM_IT_ATTACHED,
+ * it_keycmp(it, k) <= 0) &&
+ * ergo(it_before(it), it_keycmp(it, k) > 0));
+ * postcondition: ergo(result == 0,
+ * it_state(it) == IAM_IT_ATTACHED &&
+ * it_keycmp(it, k) == 0 &&
+ * !memcmp(iam_it_rec_get(it), r, ...))
+ */
+int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
+ const struct iam_key *k, const struct iam_rec *r)
+{
+ int result;
+ struct iam_path *path;
+
+ path = &it->ii_path;
+
+ assert_corr(it->ii_flags&IAM_IT_WRITE);
+ assert_corr(it_state(it) == IAM_IT_ATTACHED ||
+ it_state(it) == IAM_IT_SKEWED);
+ assert_corr(ergo(it_state(it) == IAM_IT_ATTACHED,
+ it_keycmp(it, k) <= 0));
+ assert_corr(ergo(it_before(it), it_keycmp(it, k) > 0));
+ result = iam_add_rec(h, it, path, k, r);
+ if (result == 0)
+ it->ii_state = IAM_IT_ATTACHED;
+ assert_corr(ergo(result == 0,
+ it_state(it) == IAM_IT_ATTACHED &&
+ it_keycmp(it, k) == 0));
+ return result;
+}
+EXPORT_SYMBOL(iam_it_rec_insert);
+
+/*
+ * Delete record under iterator.
+ *
+ * precondition: it_state(it) == IAM_IT_ATTACHED &&
+ * it->ii_flags&IAM_IT_WRITE &&
+ * it_at_rec(it)
+ * postcondition: it_state(it) == IAM_IT_ATTACHED ||
+ * it_state(it) == IAM_IT_DETACHED
+ */
+int iam_it_rec_delete(handle_t *h, struct iam_iterator *it)
+{
+ int result;
+ struct iam_leaf *leaf;
+ struct iam_path *path;
+
+ assert_corr(it_state(it) == IAM_IT_ATTACHED &&
+ it->ii_flags&IAM_IT_WRITE);
+ assert_corr(it_at_rec(it));
+
+ path = &it->ii_path;
+ leaf = &path->ip_leaf;
+
+ assert_inv(iam_leaf_check(leaf));
+ assert_inv(iam_path_check(path));
+
+ result = iam_txn_add(h, path, leaf->il_bh);
+ /*
+ * no compaction for now.
+ */
+ if (result == 0) {
+ iam_rec_del(leaf, it->ii_flags&IAM_IT_MOVE);
+ result = iam_txn_dirty(h, path, leaf->il_bh);
+ if (result == 0 && iam_leaf_at_end(leaf) &&
+ it->ii_flags&IAM_IT_MOVE) {
+ result = iam_it_next(it);
+ if (result > 0)
+ result = 0;
+ }
+ }
+ assert_inv(iam_leaf_check(leaf));
+ assert_inv(iam_path_check(path));
+ assert_corr(it_state(it) == IAM_IT_ATTACHED ||
+ it_state(it) == IAM_IT_DETACHED);
+ return result;
+}
+EXPORT_SYMBOL(iam_it_rec_delete);
+
+/*
+ * Convert iterator to cookie.
+ *
+ * precondition: it_state(it) == IAM_IT_ATTACHED &&
+ * iam_path_descr(it->ii_path)->id_ikey_size <= sizeof(iam_pos_t)
+ * postcondition: it_state(it) == IAM_IT_ATTACHED
+ */
+iam_pos_t iam_it_store(const struct iam_iterator *it)
+{
+ iam_pos_t result;
+
+ assert_corr(it_state(it) == IAM_IT_ATTACHED);
+ assert_corr(it_at_rec(it));
+ assert_corr(iam_it_container(it)->ic_descr->id_ikey_size <=
+ sizeof result);
+
+ result = 0;
+ return *(iam_pos_t *)iam_it_ikey_get(it, (void *)&result);
+}
+EXPORT_SYMBOL(iam_it_store);
+
+/*
+ * Restore iterator from cookie.
+ *
+ * precondition: it_state(it) == IAM_IT_DETACHED && it->ii_flags&IAM_IT_MOVE &&
+ * iam_path_descr(&it->ii_path)->id_ikey_size <= sizeof(iam_pos_t)
+ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED &&
+ * iam_it_store(it) == pos)
+ */
+int iam_it_load(struct iam_iterator *it, iam_pos_t pos)
+{
+ assert_corr(it_state(it) == IAM_IT_DETACHED &&
+ it->ii_flags&IAM_IT_MOVE);
+ assert_corr(iam_it_container(it)->ic_descr->id_ikey_size <= sizeof pos);
+ return iam_it_iget(it, (struct iam_ikey *)&pos);
+}
+EXPORT_SYMBOL(iam_it_load);
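+
+/*
+ * Usage sketch for the cookie interface above (illustrative only): a
+ * readdir()-style caller can drop all pins and locks between calls by
+ * storing the position and re-attaching later. @it is assumed to be an
+ * attached IAM_IT_MOVE iterator; @rec is a hypothetical local.
+ *
+ *	iam_pos_t pos;
+ *
+ *	pos = iam_it_store(it);
+ *	iam_it_put(it);
+ *	... return to caller, re-enter later ...
+ *	if (iam_it_load(it, pos) == 0)
+ *		rec = iam_it_rec_get(it);
+ */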
+
+/***********************************************************************/
+/* invariants */
+/***********************************************************************/
+
+static inline int ptr_inside(void *base, size_t size, void *ptr)
+{
+ return (base <= ptr) && (ptr < base + size);
+}
+
+int iam_frame_invariant(struct iam_frame *f)
+{
+ return
+ (f->bh != NULL &&
+ f->bh->b_data != NULL &&
+ ptr_inside(f->bh->b_data, f->bh->b_size, f->entries) &&
+ ptr_inside(f->bh->b_data, f->bh->b_size, f->at) &&
+ f->entries <= f->at);
+}
+int iam_leaf_invariant(struct iam_leaf *l)
+{
+ return
+ l->il_bh != NULL &&
+ l->il_bh->b_data != NULL &&
+ ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_entries) &&
+ ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_at) &&
+ l->il_entries <= l->il_at;
+}
+
+int iam_path_invariant(struct iam_path *p)
+{
+ int i;
+
+ if (p->ip_container == NULL ||
+ p->ip_indirect < 0 || p->ip_indirect > DX_MAX_TREE_HEIGHT - 1 ||
+ p->ip_frame != p->ip_frames + p->ip_indirect ||
+ !iam_leaf_invariant(&p->ip_leaf))
+ return 0;
+ for (i = 0; i < ARRAY_SIZE(p->ip_frames); ++i) {
+ if (i <= p->ip_indirect) {
+ if (!iam_frame_invariant(&p->ip_frames[i]))
+ return 0;
+ }
+ }
+ return 1;
+}
+
+int iam_it_invariant(struct iam_iterator *it)
+{
+ return
+ (it->ii_state == IAM_IT_DETACHED ||
+ it->ii_state == IAM_IT_ATTACHED ||
+ it->ii_state == IAM_IT_SKEWED) &&
+ !(it->ii_flags & ~(IAM_IT_MOVE | IAM_IT_WRITE)) &&
+ ergo(it->ii_state == IAM_IT_ATTACHED ||
+ it->ii_state == IAM_IT_SKEWED,
+ iam_path_invariant(&it->ii_path) &&
+ equi(it_at_rec(it), it->ii_state == IAM_IT_SKEWED));
+}
+
+/*
+ * Search container @c for record with key @k. If record is found, its data
+ * are moved into @r.
+ *
+ * Return values: 0: found, -ENOENT: not-found, -ve: error
+ */
+int iam_lookup(struct iam_container *c, const struct iam_key *k,
+ struct iam_rec *r, struct iam_path_descr *pd)
+{
+ struct iam_iterator it;
+ int result;
+
+ iam_it_init(&it, c, 0, pd);
+
+ result = iam_it_get_exact(&it, k);
+ if (result == 0)
+ /*
+ * record with required key found, copy it into user buffer
+ */
+ iam_reccpy(&it.ii_path.ip_leaf, r);
+ iam_it_put(&it);
+ iam_it_fini(&it);
+ return result;
+}
+EXPORT_SYMBOL(iam_lookup);
+
+/*
+ * Insert new record @r with key @k into container @c (within context of
+ * transaction @h).
+ *
+ * Return values: 0: success, -ve: error, including -EEXIST when record with
+ * given key is already present.
+ *
+ * postcondition: ergo(result == 0 || result == -EEXIST,
+ * iam_lookup(c, k, r2) == 0);
+ */
+int iam_insert(handle_t *h, struct iam_container *c, const struct iam_key *k,
+ const struct iam_rec *r, struct iam_path_descr *pd)
+{
+ struct iam_iterator it;
+ int result;
+
+ iam_it_init(&it, c, IAM_IT_WRITE, pd);
+
+ result = iam_it_get_exact(&it, k);
+ if (result == -ENOENT)
+ result = iam_it_rec_insert(h, &it, k, r);
+ else if (result == 0)
+ result = -EEXIST;
+ iam_it_put(&it);
+ iam_it_fini(&it);
+ return result;
+}
+EXPORT_SYMBOL(iam_insert);
+
+/*
+ * Update record with the key @k in container @c (within context of
+ * transaction @h), new record is given by @r.
+ *
+ * Return values: 0: success, -ve: error, including -ENOENT if no record with
+ * the given key found.
+ */
+int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k,
+ const struct iam_rec *r, struct iam_path_descr *pd)
+{
+ struct iam_iterator it;
+ int result;
+
+ iam_it_init(&it, c, IAM_IT_WRITE, pd);
+
+ result = iam_it_get_exact(&it, k);
+ if (result == 0)
+ iam_it_rec_set(h, &it, r);
+ iam_it_put(&it);
+ iam_it_fini(&it);
+ return result;
+}
+EXPORT_SYMBOL(iam_update);
+
+/*
+ * Delete existing record with key @k.
+ *
+ * Return values: 0: success, -ENOENT: not-found, -ve: other error.
+ *
+ * postcondition: ergo(result == 0 || result == -ENOENT,
+ * iam_lookup(c, k, *) == -ENOENT);
+ */
+int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k,
+ struct iam_path_descr *pd)
+{
+ struct iam_iterator it;
+ int result;
+
+ iam_it_init(&it, c, IAM_IT_WRITE, pd);
+
+ result = iam_it_get_exact(&it, k);
+ if (result == 0)
+ iam_it_rec_delete(h, &it);
+ iam_it_put(&it);
+ iam_it_fini(&it);
+ return result;
+}
+EXPORT_SYMBOL(iam_delete);
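+
+/*
+ * A sketch of typical use of the flat wrappers above (illustrative only;
+ * @c is an initialized container, @handle an open transaction, and the
+ * key and record buffers are format-specific):
+ *
+ *	char area[DX_IPD_MAX_SIZE];
+ *	struct iam_path_descr *ipd;
+ *
+ *	ipd = c->ic_descr->id_ops->id_ipd_alloc(c, area);
+ *	result = iam_insert(handle, c, k, r, ipd);
+ *	if (result == -EEXIST)
+ *		result = iam_update(handle, c, k, r, ipd);
+ *	c->ic_descr->id_ops->id_ipd_free(ipd);
+ */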
+
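+/*
+ * Number of entries that fit into a root node, where the first @rootgap
+ * bytes are taken by a format-specific header. The result is decremented
+ * when it would otherwise equal the plain node limit, keeping root and
+ * node limits distinct. A worked example, assuming a 4096-byte block, a
+ * 16-byte root gap and 12-byte entries:
+ *
+ *	limit  = (4096 - 16) / 12 = 340
+ *	nlimit =  4096 / 12       = 341
+ *
+ * limit != nlimit, so 340 is returned unchanged.
+ */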
+int iam_root_limit(int rootgap, int blocksize, int size)
+{
+ int limit;
+ int nlimit;
+
+ limit = (blocksize - rootgap) / size;
+ nlimit = blocksize / size;
+ if (limit == nlimit)
+ limit--;
+ return limit;
+}
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see [sun.com URL with a
+ * copy of GPLv2].
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * osd_iam.h
+ * Top-level declarations for the iam (index access module) interface
+ *
+ * Author: Wang Di <wangdi@clusterfs.com>
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#ifndef __LINUX_LUSTRE_IAM_H__
+#define __LINUX_LUSTRE_IAM_H__
+
+#include <linux/module.h>
+#include <asm/unaligned.h>
+#include <linux/dynlocks.h>
+
+/*
+ * linux/include/linux/osd_iam.h
+ */
+#ifndef CLASSERT
+#define CLASSERT(cond) do {switch(42) {case (cond): case 0: break;}} while (0)
+#endif
+/* implication */
+#define ergo(a, b) (!(a) || (b))
+/* logical equivalence */
+#define equi(a, b) (!!(a) == !!(b))
+
+enum {
+ /*
+ * Maximal number of non-leaf levels in htree. In the stock ldiskfs this
+ * is 2.
+ */
+ /*
+ * XXX reduced back to 2 to make per-node locking work.
+ */
+ DX_MAX_TREE_HEIGHT = 5,
+ /*
+ * Scratch keys used by generic code for temporaries.
+ *
+ * Allocation:
+ *
+ * [0] reserved for assertions and as a staging area for
+ * record keys immediately used for key comparisons.
+ *
+ * [1] reserved for record key, stored during iteration over
+ * node records (see dx_node_check()).
+ *
+ * [2] reserved for leaf node operations.
+ *
+ * [3] reserved for index operations.
+ *
+ * [4] reserved for path->ip_ikey_target
+ *
+ */
+ DX_SCRATCH_KEYS = 5,
+ /*
+ * Maximal format name length.
+ */
+ DX_FMT_NAME_LEN = 16,
+};
+
+#ifdef __KERNEL__
+/* handle_t, journal_start(), journal_stop() */
+#include <linux/jbd.h>
+
+/*
+ * Debugging.
+ *
+ * Various debugging levels.
+ */
+
+#if 0
+/*
+ * Following macros are defined in config.h and are tunable through
+ * appropriate configure switches (indicated below).
+ */
+
+/*
+ * Compile basic assertions in. You want this most of the time.
+ *
+ * --{enable,disable}-ldiskfs-assert (on by default).
+ */
+#define LDISKFS_ASSERT (1)
+
+/*
+ * Compile heavier correctness checks in. You want this during development
+ * cycle.
+ *
+ * --{enable,disable}-ldiskfs-correctness (off by default).
+ */
+#define LDISKFS_CORRECTNESS (1)
+
+/*
+ * Compile heavy invariant checking in. You want this early during development
+ * or when chasing a bug.
+ *
+ * --{enable,disable}-ldiskfs-invariant (off by default).
+ */
+#define LDISKFS_INVARIANT (1)
+#endif
+
+#if defined(LDISKFS_ASSERT)
+#define LDISKFS_ASSERT_ON (1)
+#else
+#define LDISKFS_ASSERT_ON (0)
+#endif
+
+#if defined(LDISKFS_CORRECTNESS)
+#define LDISKFS_CORRECTNESS_ON (1)
+#else
+#define LDISKFS_CORRECTNESS_ON (0)
+#endif
+
+#if defined(LDISKFS_INVARIANT)
+#define LDISKFS_INVARIANT_ON (1)
+#else
+#define LDISKFS_INVARIANT_ON (0)
+#endif
+
+#ifndef assert
+#if LDISKFS_ASSERT_ON
+#define assert(test) J_ASSERT(test)
+#else
+#define assert(test) ((void)(test))
+#endif
+#endif
+
+#if LDISKFS_CORRECTNESS_ON
+#define assert_corr(test) J_ASSERT(test)
+#define do_corr(exp) exp
+#else
+#define assert_corr(test) do {;} while (0)
+#define do_corr(exp) do {;} while (0)
+#endif
+
+#if LDISKFS_INVARIANT_ON
+#define assert_inv(test) J_ASSERT(test)
+#else
+#define assert_inv(test) do {;} while (0)
+#endif
+
+/*
+ * Entry within index tree node. Consists of a key immediately followed
+ * (without padding) by a pointer to the child node.
+ *
+ * Both key and pointer are of variable size, hence incomplete type.
+ */
+struct iam_entry;
+
+struct iam_entry_compat {
+ __le32 hash;
+ __le32 block;
+};
+
+/*
+ * Incomplete type used to refer to keys in iam container.
+ *
+ * As key size can be different from container to container, iam has to use
+ * incomplete type. Clients cast pointer to iam_key to real key type and back.
+ */
+struct iam_key;
+
+/*
+ * Incomplete type used to refer to the records stored in iam containers.
+ */
+struct iam_rec;
+
+/*
+ * Key in index node. Possibly compressed. Fixed size.
+ */
+struct iam_ikey;
+
+/*
+ * Pointer to a node in an iam container: the logical block number of the
+ * node within the underlying object.
+ */
+typedef __u32 iam_ptr_t;
+
+/*
+ * Index node traversed during tree lookup.
+ */
+struct iam_frame {
+ struct buffer_head *bh; /* buffer holding node data */
+ struct iam_entry *entries; /* array of entries */
+ struct iam_entry *at; /* target entry, found by binary search */
+ iam_ptr_t leaf; /* (logical) offset of child node found by
+ * binary search. */
+ iam_ptr_t curidx; /* (logical) offset of this node. Used by
+ * per-node locking to detect concurrent
+ * splits. */
+};
+
+/*
+ * Opaque entry in the leaf node.
+ */
+struct iam_lentry;
+
+struct iam_path;
+struct iam_container;
+
+
+/* leaf node reached by tree lookup */
+struct iam_leaf {
+ struct iam_path *il_path;
+ struct buffer_head *il_bh;
+ struct iam_lentry *il_entries;
+ struct iam_lentry *il_at;
+ /*
+ * Lock on a leaf node.
+ */
+ struct dynlock_handle *il_lock;
+ iam_ptr_t il_curidx; /* logical offset of leaf node. */
+ void *il_descr_data;
+};
+
+/*
+ * Return values of ->lookup() operation from struct iam_leaf_operations.
+ */
+enum iam_lookup_t {
+ /*
+ * lookup found a record with the key requested
+ */
+ IAM_LOOKUP_EXACT = 0,
+ /*
+ * lookup positioned leaf on some record
+ */
+ IAM_LOOKUP_OK = 1,
+ /*
+ * leaf was empty
+ */
+ IAM_LOOKUP_EMPTY = 2,
+ /*
+ * lookup positioned leaf before first record
+ */
+ IAM_LOOKUP_BEFORE = 3,
+ /*
+ * Found hash may have a continuation in the next leaf.
+ */
+ IAM_LOOKUP_LAST = 0x100
+};
+
+/*
+ * Format-specific container operations. These are called by generic iam code.
+ */
+struct iam_operations {
+ /*
+ * Returns pointer (in the same sense as pointer in index entry) to
+ * the root node.
+ */
+ __u32 (*id_root_ptr)(struct iam_container *c);
+
+ /*
+ * Check validity and consistency of index node.
+ */
+ int (*id_node_check)(struct iam_path *path, struct iam_frame *frame);
+ /*
+ * Copy some data from node header into frame. This is called when
+ * new node is loaded into frame.
+ */
+ int (*id_node_load)(struct iam_path *path, struct iam_frame *frame);
+ /*
+ * Initialize new node (stored in @bh) that is going to be added into
+ * tree.
+ */
+ int (*id_node_init)(struct iam_container *c,
+ struct buffer_head *bh, int root);
+ int (*id_node_read)(struct iam_container *c, iam_ptr_t ptr,
+ handle_t *h, struct buffer_head **bh);
+ /*
+ * Key comparison functions. Returns -1, 0, +1.
+ */
+ int (*id_ikeycmp)(const struct iam_container *c,
+ const struct iam_ikey *k1,
+ const struct iam_ikey *k2);
+ /*
+ * Modify root node when tree height increases.
+ */
+ struct iam_entry *(*id_root_inc)(struct iam_container *c,
+ struct iam_path *path,
+ struct iam_frame *frame);
+
+ struct iam_path_descr *(*id_ipd_alloc)(const struct iam_container *c,
+ void *area);
+ void (*id_ipd_free)(struct iam_path_descr *ipd);
+ /*
+ * Format name.
+ */
+ char id_name[DX_FMT_NAME_LEN];
+};
+
+/*
+ * Another format-specific operation vector, consisting of methods to access
+ * leaf nodes. This is separated from struct iam_operations, because it is
+ * assumed that there will be many formats that differ in the layout of their
+ * leaf nodes, yet share the same struct iam_operations.
+ */
+struct iam_leaf_operations {
+ /*
+ * leaf operations.
+ */
+
+ /*
+ * initialize just loaded leaf node.
+ */
+ int (*init)(struct iam_leaf *p);
+ /*
+ * Format new node.
+ */
+ void (*init_new)(struct iam_container *c, struct buffer_head *bh);
+ /*
+ * Release resources.
+ */
+ void (*fini)(struct iam_leaf *l);
+ /*
+ * returns true iff leaf is positioned at the last entry.
+ */
+ int (*at_end)(const struct iam_leaf *l);
+ /* position leaf at the first entry */
+ void (*start)(struct iam_leaf *l);
+ /* move leaf to the next entry. */
+ void (*next)(struct iam_leaf *l);
+ /*
+ * return key of current leaf record. This method may return
+ * either pointer to the key stored in node, or copy key into
+ * @k buffer supplied by caller and return pointer to this
+ * buffer. The latter approach is used when keys in nodes are
+ * not stored in plain form (e.g., htree doesn't store keys at
+ * all).
+ *
+ * Caller should assume that returned pointer is only valid
+ * while leaf node is pinned and locked.
+ */
+ struct iam_ikey *(*ikey)(const struct iam_leaf *l, struct iam_ikey *k);
+ struct iam_key *(*key)(const struct iam_leaf *l);
+ /* return pointer to entry body. Pointer is valid while
+ corresponding leaf node is locked and pinned. */
+ struct iam_rec *(*rec)(const struct iam_leaf *l);
+
+ void (*key_set)(struct iam_leaf *l, const struct iam_key *k);
+ void (*rec_set)(struct iam_leaf *l, const struct iam_rec *r);
+ void (*rec_get)(const struct iam_leaf *l, struct iam_rec *r);
+
+ int (*key_cmp)(const struct iam_leaf *l, const struct iam_key *k);
+ int (*key_eq)(const struct iam_leaf *l, const struct iam_key *k);
+
+ int (*key_size)(const struct iam_leaf *l);
+ /*
+ * Search leaf @l for a record with key @k or for a place
+ * where such record is to be inserted.
+ *
+ * Scratch keys from @path can be used.
+ */
+ int (*lookup)(struct iam_leaf *l, const struct iam_key *k);
+ int (*ilookup)(struct iam_leaf *l, const struct iam_ikey *ik);
+
+ int (*can_add)(const struct iam_leaf *l,
+ const struct iam_key *k, const struct iam_rec *r);
+ /*
+ * add rec for a leaf
+ */
+ void (*rec_add)(struct iam_leaf *l,
+ const struct iam_key *k, const struct iam_rec *r);
+ /*
+ * remove rec for a leaf
+ */
+ void (*rec_del)(struct iam_leaf *l, int shift);
+ /*
+ * split leaf node, moving some entries into @bh (the latter currently
+ * is assumed to be empty).
+ */
+ void (*split)(struct iam_leaf *l, struct buffer_head **bh,
+ iam_ptr_t newblknr);
+};
+
+/*
+ * Parameters, describing a flavor of iam container.
+ */
+struct iam_descr {
+ /*
+ * Size of a key in this container, in bytes.
+ */
+ size_t id_key_size;
+ /*
+ * Size of a key in index nodes, in bytes.
+ */
+ size_t id_ikey_size;
+ /*
+ * Size of a pointer to the next level (stored in index nodes), in
+ * bytes.
+ */
+ size_t id_ptr_size;
+ /*
+ * Size of a record (stored in leaf nodes), in bytes.
+ */
+ size_t id_rec_size;
+ /*
+ * Size of unused (by iam) space at the beginning of every non-root
+ * node, in bytes. Used for compatibility with ldiskfs.
+ */
+ size_t id_node_gap;
+ /*
+ * Size of unused (by iam) space at the beginning of root node, in
+ * bytes. Used for compatibility with ldiskfs.
+ */
+ size_t id_root_gap;
+
+ struct iam_operations *id_ops;
+ struct iam_leaf_operations *id_leaf_ops;
+};
+
+/*
+ * An instance of iam container.
+ */
+struct iam_container {
+ /*
+ * Underlying flat file. IO against this object is issued to
+ * read/write nodes.
+ */
+ struct inode *ic_object;
+ /*
+ * container flavor.
+ */
+ struct iam_descr *ic_descr;
+ /*
+ * read-write lock protecting index consistency.
+ */
+ struct rw_semaphore ic_sem;
+};
+
+/*
+ * description-specific part of iam_path. This is usually embedded into larger
+ * structure.
+ */
+struct iam_path_descr {
+ /*
+ * Scratch-pad area for temporary keys.
+ */
+ struct iam_ikey *ipd_key_scratch[DX_SCRATCH_KEYS];
+};
+
+/*
+ * Structure to keep track of a path drilled through htree.
+ */
+struct iam_path {
+ /*
+ * Parent container.
+ */
+ struct iam_container *ip_container;
+ /*
+ * Number of index levels minus one.
+ */
+ int ip_indirect;
+ /*
+ * Nodes that top-to-bottom traversal passed through.
+ */
+ struct iam_frame ip_frames[DX_MAX_TREE_HEIGHT];
+ /*
+ * Last filled frame in ->ip_frames. Refers to the 'twig' node (one
+ * immediately above leaf).
+ */
+ struct iam_frame *ip_frame;
+ /*
+ * Leaf node: a child of ->ip_frame.
+ */
+ struct iam_leaf ip_leaf;
+ /*
+ * Key searched for.
+ */
+ const struct iam_key *ip_key_target;
+ const struct iam_ikey *ip_ikey_target;
+ /*
+ * Description-specific data.
+ */
+ struct iam_path_descr *ip_data;
+};
+
+struct ldiskfs_dx_hash_info;
+
+/*
+ * Helper structure for legacy htrees.
+ */
+struct iam_path_compat {
+ struct iam_path ipc_path;
+ struct iam_container ipc_container;
+ __u32 ipc_scratch[DX_SCRATCH_KEYS];
+ struct ldiskfs_dx_hash_info *ipc_hinfo;
+ struct qstr *ipc_qstr;
+ struct iam_path_descr ipc_descr;
+ struct ldiskfs_dx_hash_info ipc_hinfo_area;
+};
+
+#define const_max(p, q) (((p) > (q)) ? (p) : (q))
+
+enum {
+ DX_MAX_IKEY_SIZE = 32, /* be generous */
+ /*
+ * Hack to avoid dynamic allocation and freeing of ipd.
+ */
+ DX_IPD_MAX_SIZE = const_max(sizeof(struct iam_path_compat),
+ DX_MAX_IKEY_SIZE * DX_SCRATCH_KEYS +
+ sizeof(struct iam_path_descr))
+};
+
+/*
+ * iam cursor (iterator) api.
+ */
+
+/*
+ * States of iterator state machine.
+ */
+enum iam_it_state {
+ /* initial state */
+ IAM_IT_DETACHED,
+ /* iterator is above particular record in the container */
+ IAM_IT_ATTACHED,
+ /* iterator is positioned before record */
+ IAM_IT_SKEWED
+};
+
+/*
+ * Flags controlling iterator functionality.
+ */
+enum iam_it_flags {
+ /*
+ * this iterator will move (iam_it_next() will be called on it)
+ */
+ IAM_IT_MOVE = (1 << 0),
+ /*
+ * tree can be updated through this iterator.
+ */
+ IAM_IT_WRITE = (1 << 1)
+};
+
+/*
+ * Iterator.
+ *
+ * Immediately after call to iam_it_init() iterator is in "detached"
+ * (IAM_IT_DETACHED) state: it is associated with given parent container, but
+ * doesn't point to any particular record in this container.
+ *
+ * After successful call to iam_it_get() and until corresponding call to
+ * iam_it_put() iterator is in one of "active" states: IAM_IT_ATTACHED or
+ * IAM_IT_SKEWED.
+ *
+ * Active iterator can move through records in a container (provided
+ * IAM_IT_MOVE permission) in key order, can get record and key values as it
+ * passes over them, and can modify container (provided IAM_IT_WRITE
+ * permission).
+ *
+ * Iteration may reach the end of container, at which point iterator switches
+ * into IAM_IT_DETACHED state.
+ *
+ * Concurrency: iterators are supposed to be local to a thread. Interfaces
+ * below do no internal serialization of access to the iterator fields.
+ *
+ * When in non-detached state, an iterator keeps some container nodes pinned
+ * in memory and locked (that locking may be implemented at the container
+ * granularity though). In particular, clients may assume that pointers to
+ * records and keys obtained through the iterator interface remain valid
+ * until the iterator is detached (except that they may be invalidated by
+ * subsequent operations done through the same iterator).
+ *
+ */
+struct iam_iterator {
+ /*
+ * iterator flags, taken from enum iam_it_flags.
+ */
+ __u32 ii_flags;
+ enum iam_it_state ii_state;
+ /*
+ * path to the record. Valid in IAM_IT_ATTACHED, and IAM_IT_SKEWED
+ * states.
+ */
+ struct iam_path ii_path;
+};
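+
+/*
+ * A sketch of the canonical iteration loop implied by the state machine
+ * above (error handling elided; @c, @k and @pd are assumed to be set up
+ * by the caller, and process() is a placeholder):
+ *
+ *	struct iam_iterator it;
+ *
+ *	iam_it_init(&it, c, IAM_IT_MOVE, pd);
+ *	result = iam_it_get(&it, k);
+ *	while (result >= 0 && it_state(&it) == IAM_IT_ATTACHED) {
+ *		process(iam_it_key_get(&it), iam_it_rec_get(&it));
+ *		result = iam_it_next(&it);
+ *	}
+ *	iam_it_put(&it);
+ *	iam_it_fini(&it);
+ */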
+
+void iam_path_init(struct iam_path *path, struct iam_container *c,
+ struct iam_path_descr *pd);
+void iam_path_fini(struct iam_path *path);
+void iam_path_release(struct iam_path *path);
+
+void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode);
+void iam_path_compat_fini(struct iam_path_compat *path);
+
+struct iam_path_descr *iam_ipd_alloc(void *area, int keysize);
+void iam_ipd_free(struct iam_path_descr *ipd);
+
+int iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags,
+ struct iam_path_descr *pd);
+void iam_it_fini(struct iam_iterator *it);
+int iam_it_get(struct iam_iterator *it, const struct iam_key *k);
+int iam_it_get_at(struct iam_iterator *it, const struct iam_key *k);
+void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src);
+void iam_it_put(struct iam_iterator *it);
+int iam_it_next(struct iam_iterator *it);
+struct iam_rec *iam_it_rec_get(const struct iam_iterator *it);
+int iam_it_rec_set(handle_t *h,
+ struct iam_iterator *it, const struct iam_rec *r);
+struct iam_key *iam_it_key_get(const struct iam_iterator *it);
+int iam_it_key_size(const struct iam_iterator *it);
+int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
+ const struct iam_key *k, const struct iam_rec *r);
+int iam_it_rec_delete(handle_t *h, struct iam_iterator *it);
+
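+/*
+ * Scalar type into which certain iam keys can be uniquely mapped. Used to
+ * support interfaces like readdir(), where iteration over the index has to
+ * be re-startable.
+ */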
+typedef __u64 iam_pos_t;
+
+iam_pos_t iam_it_store(const struct iam_iterator *it);
+int iam_it_load(struct iam_iterator *it, iam_pos_t pos);
+
+int iam_lookup(struct iam_container *c, const struct iam_key *k,
+ struct iam_rec *r, struct iam_path_descr *pd);
+int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k,
+ struct iam_path_descr *pd);
+int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k,
+ const struct iam_rec *r, struct iam_path_descr *pd);
+int iam_insert(handle_t *handle, struct iam_container *c,
+ const struct iam_key *k,
+ const struct iam_rec *r, struct iam_path_descr *pd);
+/*
+ * Initialize container @c.
+ */
+int iam_container_init(struct iam_container *c,
+ struct iam_descr *descr, struct inode *inode);
+/*
+ * Finalize container @c, release all resources.
+ */
+void iam_container_fini(struct iam_container *c);
+
+/*
+ * Determine container format.
+ */
+int iam_container_setup(struct iam_container *c);
+
+static inline struct iam_descr *iam_container_descr(struct iam_container *c)
+{
+ return c->ic_descr;
+}
+
+static inline struct iam_descr *iam_path_descr(const struct iam_path *p)
+{
+ return p->ip_container->ic_descr;
+}
+
+static inline struct inode *iam_path_obj(struct iam_path *p)
+{
+ return p->ip_container->ic_object;
+}
+
+static inline void iam_ikeycpy(const struct iam_container *c,
+ struct iam_ikey *k1, const struct iam_ikey *k2)
+{
+ memcpy(k1, k2, c->ic_descr->id_ikey_size);
+}
+
+static inline size_t iam_entry_size(struct iam_path *p)
+{
+ return iam_path_descr(p)->id_ikey_size + iam_path_descr(p)->id_ptr_size;
+}
+
+static inline struct iam_entry *iam_entry_shift(struct iam_path *p,
+ struct iam_entry *entry,
+ int shift)
+{
+ void *e = entry;
+ return e + shift * iam_entry_size(p);
+}
+
+static inline struct iam_ikey *dx_get_ikey(struct iam_path *p,
+ struct iam_entry *entry,
+ struct iam_ikey *key)
+{
+ return memcpy(key, entry, iam_path_descr(p)->id_ikey_size);
+}
+
+static inline struct iam_ikey *iam_ikey_at(struct iam_path *p,
+ struct iam_entry *entry)
+{
+ return (struct iam_ikey *)entry;
+}
+
+static inline ptrdiff_t iam_entry_diff(struct iam_path *p,
+ struct iam_entry *e1,
+ struct iam_entry *e2)
+{
+ ptrdiff_t diff;
+
+ diff = (void *)e1 - (void *)e2;
+ assert_corr(diff / iam_entry_size(p) * iam_entry_size(p) == diff);
+ return diff / iam_entry_size(p);
+}
+
+/*
+ * Helper for the frequent case, where key was already placed into @k1 by
+ * callback.
+ */
+static inline void iam_ikeycpy0(const struct iam_container *c,
+ struct iam_ikey *k1, const struct iam_ikey *k2)
+{
+ if (k1 != k2)
+ iam_ikeycpy(c, k1, k2);
+}
+
+static inline int iam_ikeycmp(const struct iam_container *c,
+ const struct iam_ikey *k1,
+ const struct iam_ikey *k2)
+{
+ return c->ic_descr->id_ops->id_ikeycmp(c, k1, k2);
+}
+
+static inline void *iam_entry_off(struct iam_entry *entry, size_t off)
+{
+ return (void *)((char *)entry + off);
+}
+
+/*
+ * Leaf helpers.
+ */
+
+static inline struct iam_path *iam_leaf_path(const struct iam_leaf *leaf)
+{
+ return leaf->il_path;
+}
+
+static inline struct iam_container *
+iam_leaf_container(const struct iam_leaf *leaf)
+{
+ return iam_leaf_path(leaf)->ip_container;
+}
+
+static inline struct iam_descr *iam_leaf_descr(const struct iam_leaf *leaf)
+{
+ return iam_leaf_container(leaf)->ic_descr;
+}
+
+static inline struct iam_leaf_operations *
+iam_leaf_ops(const struct iam_leaf *leaf)
+{
+ return iam_leaf_descr(leaf)->id_leaf_ops;
+}
+
+static inline void iam_reccpy(const struct iam_leaf *leaf,
+ struct iam_rec *rec_dst)
+{
+ iam_leaf_ops(leaf)->rec_get(leaf, rec_dst);
+}
+
+/* XXX This stuff is put here just because it is used by iam.c */
+static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry)
+{
+ u32 *addr;
+
+ addr = iam_entry_off(entry, iam_path_descr(p)->id_ikey_size);
+ return le32_to_cpu(get_unaligned(addr)) & 0x00ffffff;
+}
+
+static inline void dx_set_block(struct iam_path *p,
+ struct iam_entry *entry, unsigned value)
+{
+ u32 *addr;
+
+ addr = iam_entry_off(entry, iam_path_descr(p)->id_ikey_size);
+ put_unaligned(cpu_to_le32(value), addr);
+}
+
+static inline void dx_set_ikey(struct iam_path *p, struct iam_entry *entry,
+ const struct iam_ikey *key)
+{
+ iam_ikeycpy(p->ip_container, iam_entry_off(entry, 0), key);
+}
+
+struct dx_map_entry
+{
+ u32 hash;
+ u32 offs;
+};
+
+struct fake_dirent {
+ __le32 inode;
+ __le16 rec_len;
+ u8 name_len;
+ u8 file_type;
+};
+
+struct dx_countlimit {
+ __le16 limit;
+ __le16 count;
+};
+
+/*
+ * dx_root_info is laid out so that if it should somehow get overlaid by a
+ * dirent the two low bits of the hash version will be zero. Therefore, the
+ * hash version mod 4 should never be 0. Sincerely, the paranoia department.
+ */
+
+struct dx_root {
+ struct fake_dirent dot;
+ char dot_name[4];
+ struct fake_dirent dotdot;
+ char dotdot_name[4];
+ struct dx_root_info
+ {
+ __le32 reserved_zero;
+ u8 hash_version;
+ u8 info_length; /* 8 */
+ u8 indirect_levels;
+ u8 unused_flags;
+ }
+ info;
+ struct {} entries[0];
+};
+
+struct dx_node
+{
+ struct fake_dirent fake;
+ struct {} entries[0];
+};
+
+
+static inline unsigned dx_get_count(struct iam_entry *entries)
+{
+ return le16_to_cpu(((struct dx_countlimit *) entries)->count);
+}
+
+static inline unsigned dx_get_limit(struct iam_entry *entries)
+{
+ return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
+}
+
+static inline void dx_set_count(struct iam_entry *entries, unsigned value)
+{
+ ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
+}
+
+static inline unsigned dx_node_limit(struct iam_path *p)
+{
+ struct iam_descr *param = iam_path_descr(p);
+ unsigned entry_space = iam_path_obj(p)->i_sb->s_blocksize -
+ param->id_node_gap;
+ return entry_space / (param->id_ikey_size + param->id_ptr_size);
+}
+
+static inline unsigned dx_root_limit(struct iam_path *p)
+{
+ struct iam_descr *param = iam_path_descr(p);
+ unsigned limit = iam_path_obj(p)->i_sb->s_blocksize -
+ param->id_root_gap;
+ limit /= (param->id_ikey_size + param->id_ptr_size);
+ if (limit == dx_node_limit(p))
+ limit--;
+ return limit;
+}
+
+
+static inline struct iam_entry *dx_get_entries(struct iam_path *path,
+ void *data, int root)
+{
+ struct iam_descr *param = iam_path_descr(path);
+ return data + (root ? param->id_root_gap : param->id_node_gap);
+}
+
+
+static inline struct iam_entry *dx_node_get_entries(struct iam_path *path,
+ struct iam_frame *frame)
+{
+ return dx_get_entries(path,
+ frame->bh->b_data, frame == path->ip_frames);
+}
+
+static inline struct iam_ikey *iam_path_ikey(const struct iam_path *path,
+ int nr)
+{
+ assert(0 <= nr && nr < ARRAY_SIZE(path->ip_data->ipd_key_scratch));
+ return path->ip_data->ipd_key_scratch[nr];
+}
+
+static inline struct dynlock *path_dynlock(struct iam_path *path)
+{
+ return &LDISKFS_I(iam_path_obj(path))->i_htree_lock;
+}
+
+static inline int iam_leaf_is_locked(const struct iam_leaf *leaf)
+{
+ int result;
+
+ result = dynlock_is_locked(path_dynlock(leaf->il_path),
+ leaf->il_curidx);
+ if (!result)
+ dump_stack();
+ return result;
+}
+
+static inline int iam_frame_is_locked(struct iam_path *path,
+ const struct iam_frame *frame)
+{
+ int result;
+
+ result = dynlock_is_locked(path_dynlock(path), frame->curidx);
+ if (!result)
+ dump_stack();
+ return result;
+}
+
+int dx_lookup_lock(struct iam_path *path,
+ struct dynlock_handle **dl, enum dynlock_type lt);
+
+void dx_insert_block(struct iam_path *path, struct iam_frame *frame,
+ u32 hash, u32 block);
+int dx_index_is_compat(struct iam_path *path);
+
+int ldiskfs_htree_next_block(struct inode *dir, __u32 hash,
+ struct iam_path *path, __u32 *start_hash);
+
+struct buffer_head *ldiskfs_append(handle_t *handle, struct inode *inode,
+ u32 *block, int *err);
+int split_index_node(handle_t *handle, struct iam_path *path,
+ struct dynlock_handle **lh);
+struct ldiskfs_dir_entry_2 *split_entry(struct inode *dir,
+ struct ldiskfs_dir_entry_2 *de,
+ unsigned long ino, mode_t mode,
+ const char *name, int namelen);
+struct ldiskfs_dir_entry_2 *find_insertion_point(struct inode *dir,
+ struct buffer_head *bh,
+ const char *name, int namelen);
+struct ldiskfs_dir_entry_2 *move_entries(struct inode *dir,
+ struct ldiskfs_dx_hash_info *hinfo,
+ struct buffer_head **bh1,
+ struct buffer_head **bh2,
+ __u32 *delim_hash);
+
+extern struct iam_descr iam_htree_compat_param;
+
+struct dynlock_handle *dx_lock_htree(struct inode *dir, unsigned long value,
+ enum dynlock_type lt);
+void dx_unlock_htree(struct inode *dir, struct dynlock_handle *lh);
+
+/*
+ * external
+ */
+void iam_container_write_lock(struct iam_container *c);
+void iam_container_write_unlock(struct iam_container *c);
+
+void iam_container_read_lock(struct iam_container *c);
+void iam_container_read_unlock(struct iam_container *c);
+
+int iam_index_next(struct iam_container *c, struct iam_path *p);
+int iam_read_leaf(struct iam_path *p);
+
+int iam_node_read(struct iam_container *c, iam_ptr_t ptr,
+ handle_t *handle, struct buffer_head **bh);
+int iam_lvar_create(struct inode *obj, int keysize, int ptrsize,
+ int recsize, handle_t *handle);
+int iam_lfix_create(struct inode *obj, int keysize, int ptrsize,
+ int recsize, handle_t *handle);
+
+#ifndef swap
+#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
+#endif
+
+#define BH_DXLock 25
+#define DX_DEBUG (1)
+
+#if DX_DEBUG
+#define dxtrace(command) command
+#else
+#define dxtrace(command)
+#endif
+
+#if DX_DEBUG
+static struct iam_lock_stats {
+ unsigned dls_bh_lock;
+ unsigned dls_bh_busy;
+ unsigned dls_bh_again;
+ unsigned dls_bh_full_again;
+} iam_lock_stats = { 0, };
+#define DX_DEVAL(x) x
+#else
+#define DX_DEVAL(x)
+#endif
+
+static inline void iam_lock_bh(struct buffer_head volatile *bh)
+{
+ DX_DEVAL(iam_lock_stats.dls_bh_lock++);
+#ifdef CONFIG_SMP
+ while (test_and_set_bit(BH_DXLock, &bh->b_state)) {
+ DX_DEVAL(iam_lock_stats.dls_bh_busy++);
+ while (test_bit(BH_DXLock, &bh->b_state))
+ cpu_relax();
+ }
+#endif
+}
+
+static inline void iam_unlock_bh(struct buffer_head *bh)
+{
+#ifdef CONFIG_SMP
+ smp_mb__before_clear_bit();
+ clear_bit(BH_DXLock, &bh->b_state);
+#endif
+}
+
+
+void iam_insert_key(struct iam_path *path, struct iam_frame *frame,
+ const struct iam_ikey *key, iam_ptr_t ptr);
+
+void iam_insert_key_lock(struct iam_path *path, struct iam_frame *frame,
+ const struct iam_ikey *key, iam_ptr_t ptr);
+
+
+int iam_leaf_at_end(const struct iam_leaf *l);
+void iam_leaf_next(struct iam_leaf *folio);
+int iam_leaf_can_add(const struct iam_leaf *l,
+ const struct iam_key *k, const struct iam_rec *r);
+
+struct iam_path *iam_leaf_path(const struct iam_leaf *leaf);
+struct iam_container *iam_leaf_container(const struct iam_leaf *leaf);
+struct iam_descr *iam_leaf_descr(const struct iam_leaf *leaf);
+struct iam_leaf_operations *iam_leaf_ops(const struct iam_leaf *leaf);
+
+
+int iam_node_read(struct iam_container *c, iam_ptr_t ptr,
+ handle_t *h, struct buffer_head **bh);
+
+/*
+ * Container format.
+ */
+struct iam_format {
+ /*
+ * Method called to recognize container format. Should return true iff
+ * container @c conforms to this format. This method may do IO to read
+ * container pages.
+ *
+ * If container is recognized, this method sets operation vectors
+ * ->id_ops and ->id_leaf_ops in container description (c->ic_descr),
+ * and fills other description fields.
+ */
+ int (*if_guess)(struct iam_container *c);
+ /*
+ * Linkage into global list of container formats.
+ */
+ struct list_head if_linkage;
+};
+
+void iam_format_register(struct iam_format *fmt);
+int iam_root_limit(int rootgap, int blocksize, int size);
+
+void iam_lfix_format_init(void);
+void iam_lvar_format_init(void);
+void iam_htree_format_init(void);
+
+struct iam_private_info;
+
+void ldiskfs_iam_release(struct file *filp, struct inode *inode);
+
+int iam_uapi_ioctl(struct inode * inode, struct file * filp, unsigned int cmd,
+ unsigned long arg);
+
+/* dir.c
+#if LDISKFS_INVARIANT_ON
+extern int ldiskfs_check_dir_entry(const char *, struct inode *,
+ struct ldiskfs_dir_entry_2 *,
+ struct buffer_head *, unsigned long);
+#else
+static inline int ldiskfs_check_dir_entry(const char * function,
+ struct inode * dir,
+ struct ldiskfs_dir_entry_2 * de,
+ struct buffer_head * bh,
+ unsigned long offset)
+{
+ return 1;
+}
+#endif
+*/
+
+/* __KERNEL__ */
+#endif
+
+/*
+ * User level API. Copy exists in lustre/lustre/tests/iam_ut.c
+ */
+
+struct iam_uapi_info {
+ __u16 iui_keysize;
+ __u16 iui_recsize;
+ __u16 iui_ptrsize;
+ __u16 iui_height;
+ char iui_fmt_name[DX_FMT_NAME_LEN];
+};
+
+struct iam_uapi_op {
+ void *iul_key;
+ void *iul_rec;
+};
+
+struct iam_uapi_it {
+ struct iam_uapi_op iui_op;
+ __u16 iui_state;
+};
+
+enum iam_ioctl_cmd {
+ IAM_IOC_INIT = _IOW('i', 1, struct iam_uapi_info),
+ IAM_IOC_GETINFO = _IOR('i', 2, struct iam_uapi_info),
+ IAM_IOC_INSERT = _IOR('i', 3, struct iam_uapi_op),
+ IAM_IOC_LOOKUP = _IOWR('i', 4, struct iam_uapi_op),
+ IAM_IOC_DELETE = _IOR('i', 5, struct iam_uapi_op),
+ IAM_IOC_IT_START = _IOR('i', 6, struct iam_uapi_it),
+ IAM_IOC_IT_NEXT = _IOW('i', 7, struct iam_uapi_it),
+ IAM_IOC_IT_STOP = _IOR('i', 8, struct iam_uapi_it),
+
+ IAM_IOC_POLYMORPH = _IOR('i', 9, unsigned long)
+};
+
+/* __LINUX_LUSTRE_IAM_H__ */
+#endif
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see [sun.com URL with a
+ * copy of GPLv2].
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * iam_lfix.c
+ * implementation of iam format for fixed-size records.
+ *
+ * Author: Wang Di <wangdi@clusterfs.com>
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#include <linux/types.h>
+#include <linux/jbd.h>
+/* ldiskfs_error() */
+#include <linux/ldiskfs_fs.h>
+#include "osd_internal.h"
+
+/*
+ * Leaf operations.
+ */
+
+enum {
+ IAM_LEAF_HEADER_MAGIC = 0x1976 /* This is duplicated in
+ * lustre/utils/create_iam.c */
+};
+
+/* This is duplicated in lustre/utils/create_iam.c */
+struct iam_leaf_head {
+ __le16 ill_magic;
+ __le16 ill_count;
+};
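+
+/*
+ * An lfix leaf block thus looks like this (all sizes fixed per container):
+ *
+ *	+---------------+-----------+-----------+-- --+-----------+
+ *	| iam_leaf_head | key | rec | key | rec | ... | key | rec |
+ *	+---------------+-----------+-----------+-- --+-----------+
+ *
+ * ill_count entries of id_key_size + id_rec_size bytes each, stored
+ * back-to-back in increasing key order.
+ */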
+
+static inline int iam_lfix_entry_size(const struct iam_leaf *l)
+{
+ return iam_leaf_descr(l)->id_key_size + iam_leaf_descr(l)->id_rec_size;
+}
+
+static inline struct iam_lentry *
+iam_lfix_shift(const struct iam_leaf *l, struct iam_lentry *entry, int shift)
+{
+ return (void *)entry + shift * iam_lfix_entry_size(l);
+}
+
+static inline struct iam_key *iam_leaf_key_at(struct iam_lentry *entry)
+{
+ return (struct iam_key *)entry;
+}
+
+static inline int lfix_keycmp(const struct iam_container *c,
+ const struct iam_key *k1,
+ const struct iam_key *k2)
+{
+ return memcmp(k1, k2, c->ic_descr->id_key_size);
+}
+
+static struct iam_leaf_head *iam_get_head(const struct iam_leaf *l)
+{
+ return (struct iam_leaf_head *)l->il_bh->b_data;
+}
+
+static struct iam_lentry *iam_entries(const struct buffer_head *bh)
+{
+ return (void *)bh->b_data + sizeof(struct iam_leaf_head);
+}
+
+static struct iam_lentry *iam_get_lentries(const struct iam_leaf *l)
+{
+ return iam_entries(l->il_bh);
+}
+
+static int leaf_count_limit(const struct iam_leaf *leaf)
+{
+ int free_space;
+
+ free_space = iam_leaf_container(leaf)->ic_object->i_sb->s_blocksize;
+ free_space -= sizeof(struct iam_leaf_head);
+ return free_space / iam_lfix_entry_size(leaf);
+}
+
+static int lentry_count_get(const struct iam_leaf *leaf)
+{
+ return le16_to_cpu(iam_get_head(leaf)->ill_count);
+}
+
+static void lentry_count_set(struct iam_leaf *leaf, unsigned count)
+{
+ assert_corr(0 <= count && count <= leaf_count_limit(leaf));
+ iam_get_head(leaf)->ill_count = cpu_to_le16(count);
+}
+
+static struct iam_lentry *iam_lfix_get_end(const struct iam_leaf *l);
+
+#if LDISKFS_CORRECTNESS_ON || LDISKFS_INVARIANT_ON
+static int iam_leaf_at_rec(const struct iam_leaf *folio)
+{
+ return
+ iam_get_lentries(folio) <= folio->il_at &&
+ folio->il_at < iam_lfix_get_end(folio);
+}
+#endif
+
+static struct iam_ikey *iam_lfix_ikey(const struct iam_leaf *l,
+ struct iam_ikey *key)
+{
+ void *ie = l->il_at;
+ assert_corr(iam_leaf_at_rec(l));
+ return (struct iam_ikey*)ie;
+}
+
+static struct iam_key *iam_lfix_key(const struct iam_leaf *l)
+{
+ void *ie = l->il_at;
+ assert_corr(iam_leaf_at_rec(l));
+ return (struct iam_key*)ie;
+}
+
+static int iam_lfix_key_size(const struct iam_leaf *l)
+{
+ return iam_leaf_descr(l)->id_key_size;
+}
+
+static void iam_lfix_start(struct iam_leaf *l)
+{
+ l->il_at = iam_get_lentries(l);
+}
+
+static inline ptrdiff_t iam_lfix_diff(const struct iam_leaf *l,
+ const struct iam_lentry *e1,
+ const struct iam_lentry *e2)
+{
+ ptrdiff_t diff;
+ int esize;
+
+ esize = iam_lfix_entry_size(l);
+ diff = (void *)e1 - (void *)e2;
+ assert_corr(diff / esize * esize == diff);
+ return diff / esize;
+}
+
+static int iam_lfix_init(struct iam_leaf *l)
+{
+ int result;
+ struct iam_leaf_head *ill;
+ int count;
+
+ assert_corr(l->il_bh != NULL);
+
+ ill = iam_get_head(l);
+ count = le16_to_cpu(ill->ill_count);
+ if (ill->ill_magic == cpu_to_le16(IAM_LEAF_HEADER_MAGIC) &&
+ 0 <= count && count <= leaf_count_limit(l)) {
+ l->il_at = l->il_entries = iam_get_lentries(l);
+ result = 0;
+ } else {
+ struct inode *obj;
+
+ obj = iam_leaf_container(l)->ic_object;
+ CERROR("Wrong magic in node %llu (#%lu): %#x != %#x or "
+ "wrong count: %i (%i)",
+ (unsigned long long)l->il_bh->b_blocknr, obj->i_ino,
+ ill->ill_magic, le16_to_cpu(IAM_LEAF_HEADER_MAGIC),
+ count, leaf_count_limit(l));
+ result = -EIO;
+ }
+ return result;
+}
+
+static void iam_lfix_fini(struct iam_leaf *l)
+{
+ l->il_entries = l->il_at = NULL;
+}
+
+static struct iam_lentry *iam_lfix_get_end(const struct iam_leaf *l)
+{
+ int count = lentry_count_get(l);
+ struct iam_lentry *ile = iam_lfix_shift(l, l->il_entries, count);
+
+ return ile;
+}
+
+struct iam_rec *iam_lfix_rec(const struct iam_leaf *l)
+{
+ void *e = l->il_at;
+ assert_corr(iam_leaf_at_rec(l));
+ return e + iam_leaf_descr(l)->id_key_size;
+}
+
+static void iam_lfix_next(struct iam_leaf *l)
+{
+ assert_corr(iam_leaf_at_rec(l));
+ l->il_at = iam_lfix_shift(l, l->il_at, 1);
+}
+
+/*
+ * Bug chasing.
+ */
+int lfix_dump = 0;
+EXPORT_SYMBOL(lfix_dump);
+
+static char hdigit(char ch)
+{
+ static char d[] = "0123456789abcdef";
+ return d[ch & 0xf];
+}
+
+static char *hex(char ch, char *area)
+{
+ area[0] = hdigit(ch >> 4);
+ area[1] = hdigit(ch);
+ area[2] = 0;
+ return area;
+}
+
+static void l_print(struct iam_leaf *leaf, struct iam_lentry *entry)
+{
+ int i;
+ char *area;
+ char h[3];
+
+ area = (char *)entry;
+ printk(KERN_EMERG "[");
+ for (i = iam_lfix_key_size(leaf); i > 0; --i, ++area)
+ printk("%s", hex(*area, h));
+ printk("]-(");
+ for (i = iam_leaf_descr(leaf)->id_rec_size; i > 0; --i, ++area)
+ printk("%s", hex(*area, h));
+ printk(")\n");
+}
+
+static void lfix_print(struct iam_leaf *leaf)
+{
+ struct iam_lentry *entry;
+ int count;
+ int i;
+
+ entry = leaf->il_entries;
+ count = lentry_count_get(leaf);
+ printk(KERN_EMERG "lfix: %p %p %d\n", leaf, leaf->il_at, count);
+ for (i = 0; i < count; ++i, entry = iam_lfix_shift(leaf, entry, 1))
+ l_print(leaf, entry);
+}
+
+static int iam_lfix_lookup(struct iam_leaf *l, const struct iam_key *k)
+{
+ struct iam_lentry *p, *q, *m, *t;
+ struct iam_container *c;
+ int count;
+ int result;
+
+ count = lentry_count_get(l);
+ if (count == 0)
+ return IAM_LOOKUP_EMPTY;
+
+ result = IAM_LOOKUP_OK;
+ c = iam_leaf_container(l);
+
+ p = l->il_entries;
+ q = iam_lfix_shift(l, p, count - 1);
+ if (lfix_keycmp(c, k, iam_leaf_key_at(p)) < 0) {
+ /*
+ * @k is less than the least key in the leaf
+ */
+ l->il_at = p;
+ result = IAM_LOOKUP_BEFORE;
+ } else if (lfix_keycmp(c, iam_leaf_key_at(q), k) <= 0) {
+ l->il_at = q;
+ } else {
+ /*
+ * EWD1293
+ */
+ while (iam_lfix_shift(l, p, 1) != q) {
+ m = iam_lfix_shift(l, p, iam_lfix_diff(l, q, p) / 2);
+ assert_corr(p < m && m < q);
+ if (lfix_keycmp(c, iam_leaf_key_at(m), k) <= 0)
+ p = m;
+ else
+ q = m;
+ }
+ assert_corr(lfix_keycmp(c, iam_leaf_key_at(p), k) <= 0 &&
+ lfix_keycmp(c, k, iam_leaf_key_at(q)) < 0);
+ /*
+ * skip over records with duplicate keys.
+ */
+ while (p > l->il_entries) {
+ t = iam_lfix_shift(l, p, -1);
+ if (lfix_keycmp(c, iam_leaf_key_at(t), k) == 0)
+ p = t;
+ else
+ break;
+ }
+ l->il_at = p;
+ }
+ assert_corr(iam_leaf_at_rec(l));
+
+ if (lfix_keycmp(c, iam_leaf_key_at(l->il_at), k) == 0)
+ result = IAM_LOOKUP_EXACT;
+
+ if (lfix_dump)
+ lfix_print(l);
+
+ return result;
+}
+
+static int iam_lfix_ilookup(struct iam_leaf *l, const struct iam_ikey *ik)
+{
+ assert(0);
+ return IAM_LOOKUP_OK;
+}
+
+static void iam_lfix_key_set(struct iam_leaf *l, const struct iam_key *k)
+{
+ assert_corr(iam_leaf_at_rec(l));
+ memcpy(iam_leaf_key_at(l->il_at), k, iam_leaf_descr(l)->id_key_size);
+}
+
+static int iam_lfix_key_cmp(const struct iam_leaf *l, const struct iam_key *k)
+{
+ return lfix_keycmp(iam_leaf_container(l), iam_leaf_key_at(l->il_at), k);
+}
+
+static int iam_lfix_key_eq(const struct iam_leaf *l, const struct iam_key *k)
+{
+ return !lfix_keycmp(iam_leaf_container(l),
+ iam_leaf_key_at(l->il_at), k);
+}
+
+static void iam_lfix_rec_set(struct iam_leaf *l, const struct iam_rec *r)
+{
+ assert_corr(iam_leaf_at_rec(l));
+ memcpy(iam_lfix_rec(l), r, iam_leaf_descr(l)->id_rec_size);
+}
+
+static void iam_lfix_rec_get(const struct iam_leaf *l, struct iam_rec *r)
+{
+ assert_corr(iam_leaf_at_rec(l));
+ memcpy(r, iam_lfix_rec(l), iam_leaf_descr(l)->id_rec_size);
+}
+
+static void iam_lfix_rec_add(struct iam_leaf *leaf,
+ const struct iam_key *k, const struct iam_rec *r)
+{
+ struct iam_lentry *end;
+ struct iam_lentry *cur;
+ struct iam_lentry *start;
+ ptrdiff_t diff;
+ int count;
+
+ assert_corr(iam_leaf_can_add(leaf, k, r));
+
+ count = lentry_count_get(leaf);
+ /*
+ * Shift the leaf tail to make room, unless the leaf is positioned
+ * beyond its last record or is empty; in those two exceptional
+ * cases the new record is simply written at the current position.
+ */
+ if (!iam_leaf_at_end(leaf)) {
+ end = iam_lfix_get_end(leaf);
+ cur = leaf->il_at;
+ if (lfix_keycmp(iam_leaf_container(leaf),
+ k, iam_leaf_key_at(cur)) >= 0)
+ iam_lfix_next(leaf);
+ else
+ /*
+ * Another exceptional case: insertion with the key
+ * less than least key in the leaf.
+ */
+ assert_corr(cur == leaf->il_entries);
+
+ start = leaf->il_at;
+ diff = (void *)end - (void *)start;
+ assert_corr(diff >= 0);
+ memmove(iam_lfix_shift(leaf, start, 1), start, diff);
+ }
+ lentry_count_set(leaf, count + 1);
+ iam_lfix_key_set(leaf, k);
+ iam_lfix_rec_set(leaf, r);
+ assert_corr(iam_leaf_at_rec(leaf));
+}
+
+static void iam_lfix_rec_del(struct iam_leaf *leaf, int shift)
+{
+ struct iam_lentry *next, *end;
+ int count;
+ ptrdiff_t diff;
+
+ assert_corr(iam_leaf_at_rec(leaf));
+
+ count = lentry_count_get(leaf);
+ end = iam_lfix_get_end(leaf);
+ next = iam_lfix_shift(leaf, leaf->il_at, 1);
+ diff = (void *)end - (void *)next;
+ memmove(leaf->il_at, next, diff);
+
+ lentry_count_set(leaf, count - 1);
+}
+
+static int iam_lfix_can_add(const struct iam_leaf *l,
+ const struct iam_key *k, const struct iam_rec *r)
+{
+ return lentry_count_get(l) < leaf_count_limit(l);
+}
+
+static int iam_lfix_at_end(const struct iam_leaf *folio)
+{
+ return folio->il_at == iam_lfix_get_end(folio);
+}
+
+static void iam_lfix_init_new(struct iam_container *c, struct buffer_head *bh)
+{
+ struct iam_leaf_head *hdr;
+
+ hdr = (struct iam_leaf_head*)bh->b_data;
+ hdr->ill_magic = cpu_to_le16(IAM_LEAF_HEADER_MAGIC);
+ hdr->ill_count = cpu_to_le16(0);
+}
+
+static void iam_lfix_split(struct iam_leaf *l, struct buffer_head **bh,
+ iam_ptr_t new_blknr)
+{
+ struct iam_path *path;
+ struct iam_leaf_head *hdr;
+ const struct iam_ikey *pivot;
+ struct buffer_head *new_leaf;
+
+ unsigned count;
+ unsigned split;
+
+ void *start;
+ void *finis;
+
+ new_leaf = *bh;
+ path = iam_leaf_path(l);
+
+ hdr = (void *)new_leaf->b_data;
+
+ count = lentry_count_get(l);
+ split = count / 2;
+
+ start = iam_lfix_shift(l, iam_get_lentries(l), split);
+ finis = iam_lfix_shift(l, iam_get_lentries(l), count);
+
+ pivot = (const struct iam_ikey *)iam_leaf_key_at(start);
+
+ memmove(iam_entries(new_leaf), start, finis - start);
+ hdr->ill_count = cpu_to_le16(count - split);
+ lentry_count_set(l, split);
+ if ((void *)l->il_at >= start) {
+ /*
+ * insertion point moves into new leaf.
+ */
+ int shift;
+ int result;
+
+ shift = iam_lfix_diff(l, l->il_at, start);
+ *bh = l->il_bh;
+ l->il_bh = new_leaf;
+ l->il_curidx = new_blknr;
+ result = iam_lfix_init(l);
+ /*
+ * init cannot fail, as node was just initialized.
+ */
+ assert_corr(result == 0);
+ l->il_at = iam_lfix_shift(l, iam_get_lentries(l), shift);
+ }
+ /*
+ * Insert pointer to the new node (together with the least key in
+ * the node) into index node.
+ */
+ iam_insert_key_lock(path, path->ip_frame, pivot, new_blknr);
+}
+
+static struct iam_leaf_operations iam_lfix_leaf_ops = {
+ .init = iam_lfix_init,
+ .init_new = iam_lfix_init_new,
+ .fini = iam_lfix_fini,
+ .start = iam_lfix_start,
+ .next = iam_lfix_next,
+ .key = iam_lfix_key,
+ .ikey = iam_lfix_ikey,
+ .rec = iam_lfix_rec,
+ .key_set = iam_lfix_key_set,
+ .key_cmp = iam_lfix_key_cmp,
+ .key_eq = iam_lfix_key_eq,
+ .key_size = iam_lfix_key_size,
+ .rec_set = iam_lfix_rec_set,
+ .rec_get = iam_lfix_rec_get,
+ .lookup = iam_lfix_lookup,
+ .ilookup = iam_lfix_ilookup,
+ .at_end = iam_lfix_at_end,
+ .rec_add = iam_lfix_rec_add,
+ .rec_del = iam_lfix_rec_del,
+ .can_add = iam_lfix_can_add,
+ .split = iam_lfix_split
+};
+
+/*
+ * Index operations.
+ */
+
+enum {
+ /* This is duplicated in lustre/utils/create_iam.c */
+ /*
+ * Then shalt thou see the dew-BEDABBLED wretch
+ * Turn, and return, indenting with the way;
+ * Each envious brier his weary legs doth scratch,
+ * Each shadow makes him stop, each murmur stay:
+ * For misery is trodden on by many,
+ * And being low never relieved by any.
+ */
+ IAM_LFIX_ROOT_MAGIC = 0xbedabb1edULL /* d01efull */
+};
+
+/* This is duplicated in lustre/utils/create_iam.c */
+struct iam_lfix_root {
+ __le64 ilr_magic;
+ __le16 ilr_keysize;
+ __le16 ilr_recsize;
+ __le16 ilr_ptrsize;
+ u8 ilr_indirect_levels;
+ u8 ilr_padding;
+};
+
+static __u32 iam_lfix_root_ptr(struct iam_container *c)
+{
+ return 0;
+}
+
+static int iam_lfix_node_init(struct iam_container *c, struct buffer_head *bh,
+ int root)
+{
+ return 0;
+}
+
+static struct iam_entry *iam_lfix_root_inc(struct iam_container *c,
+ struct iam_path *path,
+ struct iam_frame *frame)
+{
+ struct iam_lfix_root *root;
+ struct iam_entry *entries;
+
+ entries = frame->entries;
+
+ dx_set_count(entries, 2);
+ assert_corr(dx_get_limit(entries) == dx_root_limit(path));
+
+ root = (void *)frame->bh->b_data;
+ assert_corr(le64_to_cpu(root->ilr_magic) == IAM_LFIX_ROOT_MAGIC);
+ root->ilr_indirect_levels++;
+ frame->at = entries = iam_entry_shift(path, entries, 1);
+ memset(iam_ikey_at(path, entries), 0,
+ iam_path_descr(path)->id_ikey_size);
+ return entries;
+}
+
+static int iam_lfix_node_check(struct iam_path *path, struct iam_frame *frame)
+{
+ unsigned count;
+ unsigned limit;
+ unsigned limit_correct;
+ struct iam_entry *entries;
+
+ entries = dx_node_get_entries(path, frame);
+
+ if (frame == path->ip_frames) {
+ struct iam_lfix_root *root;
+
+ root = (void *)frame->bh->b_data;
+ if (le64_to_cpu(root->ilr_magic) != IAM_LFIX_ROOT_MAGIC) {
+ return -EIO;
+ }
+ limit_correct = dx_root_limit(path);
+ } else
+ limit_correct = dx_node_limit(path);
+ count = dx_get_count(entries);
+ limit = dx_get_limit(entries);
+ if (count > limit) {
+ return -EIO;
+ }
+ if (limit != limit_correct) {
+ return -EIO;
+ }
+ return 0;
+}
+
+static int iam_lfix_node_load(struct iam_path *path, struct iam_frame *frame)
+{
+ struct iam_entry *entries;
+ void *data;
+ entries = dx_node_get_entries(path, frame);
+
+ data = frame->bh->b_data;
+
+ if (frame == path->ip_frames) {
+ struct iam_lfix_root *root;
+
+ root = data;
+ path->ip_indirect = root->ilr_indirect_levels;
+ if (path->ip_ikey_target == NULL)
+ path->ip_ikey_target =
+ (struct iam_ikey *)path->ip_key_target;
+ }
+ frame->entries = frame->at = entries;
+ return 0;
+}
+
+static int iam_lfix_ikeycmp(const struct iam_container *c,
+ const struct iam_ikey *k1,
+ const struct iam_ikey *k2)
+{
+ return memcmp(k1, k2, c->ic_descr->id_ikey_size);
+}
+
+static struct iam_path_descr *iam_lfix_ipd_alloc(const struct iam_container *c,
+ void *area)
+{
+ return iam_ipd_alloc(area, c->ic_descr->id_ikey_size);
+}
+
+static struct iam_operations iam_lfix_ops = {
+ .id_root_ptr = iam_lfix_root_ptr,
+ .id_node_read = iam_node_read,
+ .id_node_init = iam_lfix_node_init,
+ .id_node_check = iam_lfix_node_check,
+ .id_node_load = iam_lfix_node_load,
+ .id_ikeycmp = iam_lfix_ikeycmp,
+ .id_root_inc = iam_lfix_root_inc,
+ .id_ipd_alloc = iam_lfix_ipd_alloc,
+ .id_ipd_free = iam_ipd_free,
+ .id_name = "lfix"
+};
+
+static int iam_lfix_guess(struct iam_container *c)
+{
+ int result;
+ struct buffer_head *bh;
+ const struct iam_lfix_root *root;
+
+ assert_corr(c->ic_object != NULL);
+
+ result = iam_node_read(c, iam_lfix_root_ptr(c), NULL, &bh);
+ if (result == 0) {
+ root = (void *)bh->b_data;
+ if (le64_to_cpu(root->ilr_magic) == IAM_LFIX_ROOT_MAGIC) {
+ struct iam_descr *descr;
+
+ descr = c->ic_descr;
+ descr->id_key_size = le16_to_cpu(root->ilr_keysize);
+ descr->id_ikey_size = le16_to_cpu(root->ilr_keysize);
+ descr->id_rec_size = le16_to_cpu(root->ilr_recsize);
+ descr->id_ptr_size = le16_to_cpu(root->ilr_ptrsize);
+ descr->id_root_gap = sizeof(struct iam_lfix_root);
+ descr->id_node_gap = 0;
+ descr->id_ops = &iam_lfix_ops;
+ descr->id_leaf_ops = &iam_lfix_leaf_ops;
+ } else
+ result = -EBADF;
+ brelse(bh);
+ }
+ return result;
+}
+
+static struct iam_format iam_lfix_format = {
+ .if_guess = iam_lfix_guess
+};
+
+void iam_lfix_format_init(void)
+{
+ iam_format_register(&iam_lfix_format);
+}
+
+/*
+ * Debugging aid.
+ */
+
+#define KEYSIZE (8)
+#define RECSIZE (8)
+#define PTRSIZE (4)
+
+#define LFIX_ROOT_RECNO \
+ ((4096 - sizeof(struct iam_lfix_root)) / (KEYSIZE + PTRSIZE))
+
+#define LFIX_INDEX_RECNO (4096 / (KEYSIZE + PTRSIZE))
+
+#define LFIX_LEAF_RECNO \
+ ((4096 - sizeof(struct iam_leaf_head)) / (KEYSIZE + RECSIZE))
+
+struct lfix_root {
+ struct iam_lfix_root lr_root;
+ struct {
+ char key[KEYSIZE];
+ char ptr[PTRSIZE];
+ } lr_entry[LFIX_ROOT_RECNO];
+};
+
+struct lfix_index {
+ struct dx_countlimit li_cl;
+ char li_padding[KEYSIZE + PTRSIZE - sizeof(struct dx_countlimit)];
+ struct {
+ char key[KEYSIZE];
+ char ptr[PTRSIZE];
+ } li_entry[LFIX_INDEX_RECNO - 1];
+};
+
+struct lfix_leaf {
+ struct iam_leaf_head ll_head;
+ struct {
+ char key[KEYSIZE];
+ char rec[RECSIZE];
+ } ll_entry[LFIX_LEAF_RECNO];
+};
+
+#define STORE_UNALIGNED(val, dst) \
+({ \
+ typeof(val) __val = (val); \
+ CLASSERT(sizeof(val) == sizeof(*(dst))); \
+ memcpy(dst, &__val, sizeof(*(dst))); \
+})
+
+#include <linux/jbd.h>
+#include <linux/ldiskfs_fs.h>
+#include <linux/ldiskfs_jbd.h>
+
+static void lfix_root(void *buf,
+ int blocksize, int keysize, int ptrsize, int recsize)
+{
+ struct iam_lfix_root *root;
+ struct dx_countlimit *limit;
+ void *entry;
+
+ root = buf;
+ *root = (typeof(*root)) {
+ .ilr_magic = cpu_to_le64(IAM_LFIX_ROOT_MAGIC),
+ .ilr_keysize = cpu_to_le16(keysize),
+ .ilr_recsize = cpu_to_le16(recsize),
+ .ilr_ptrsize = cpu_to_le16(ptrsize),
+ .ilr_indirect_levels = 0
+ };
+
+ limit = (void *)(root + 1);
+ *limit = (typeof(*limit)){
+ /*
+ * limit itself + one pointer to the leaf.
+ */
+ .count = cpu_to_le16(2),
+ .limit = cpu_to_le16(iam_root_limit(sizeof(struct iam_lfix_root),
+ blocksize, keysize + ptrsize))
+ };
+
+ entry = root + 1;
+ /*
+ * Skip over @limit.
+ */
+ entry += keysize + ptrsize;
+
+ /*
+ * Entry format is <key> followed by <ptr>. In the minimal tree
+ * consisting of a root and single node, <key> is a minimal possible
+ * key.
+ *
+ * XXX: this key is hard-coded to be a sequence of 0's.
+ */
+
+ entry += keysize;
+ /* now @entry points to <ptr> */
+ if (ptrsize == 4)
+ STORE_UNALIGNED(cpu_to_le32(1), (u_int32_t *)entry);
+ else
+ STORE_UNALIGNED(cpu_to_le64(1), (u_int64_t *)entry);
+}
+
+static void lfix_leaf(void *buf,
+ int blocksize, int keysize, int ptrsize, int recsize)
+{
+ struct iam_leaf_head *head;
+
+ /* form leaf */
+ head = buf;
+ *head = (struct iam_leaf_head) {
+ .ill_magic = cpu_to_le16(IAM_LEAF_HEADER_MAGIC),
+ /*
+ * Leaf contains an entry with the smallest possible key
+ * (created by zeroing).
+ */
+ .ill_count = cpu_to_le16(1),
+ };
+}
+
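+/*
+ * Create the minimal lfix tree: a root block whose single entry points at
+ * a leaf block holding one all-zero key. A hypothetical caller, inside an
+ * already running transaction and using the debugging geometry below,
+ * would do:
+ *
+ *	result = iam_lfix_create(inode, KEYSIZE, PTRSIZE, RECSIZE, handle);
+ */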
+int iam_lfix_create(struct inode *obj,
+ int keysize, int ptrsize, int recsize, handle_t *handle)
+{
+ struct buffer_head *root_node;
+ struct buffer_head *leaf_node;
+ struct super_block *sb;
+
+ u32 blknr;
+ int result;
+ unsigned long bsize;
+
+ assert_corr(obj->i_size == 0);
+
+ sb = obj->i_sb;
+ bsize = sb->s_blocksize;
+ root_node = ldiskfs_append(handle, obj, &blknr, &result);
+ leaf_node = ldiskfs_append(handle, obj, &blknr, &result);
+ if (root_node != NULL && leaf_node != NULL) {
+ lfix_root(root_node->b_data, bsize, keysize, ptrsize, recsize);
+ lfix_leaf(leaf_node->b_data, bsize, keysize, ptrsize, recsize);
+ ldiskfs_mark_inode_dirty(handle, obj);
+ result = ldiskfs_journal_dirty_metadata(handle, root_node);
+ if (result == 0)
+ result = ldiskfs_journal_dirty_metadata(handle, leaf_node);
+ if (result != 0)
+ ldiskfs_std_error(sb, result);
+ }
+ brelse(leaf_node);
+ brelse(root_node);
+ return result;
+}
+EXPORT_SYMBOL(iam_lfix_create);
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see [sun.com URL with a
+ * copy of GPLv2].
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * iam_lvar.c
+ *
+ * Implementation of the iam format for fixed-size records and
+ * variable-sized keys.
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#include <linux/types.h>
+#include <linux/jbd.h>
+/* ldiskfs_error() */
+#include <linux/ldiskfs_fs.h>
+#include "osd_internal.h"
+
+/*
+ * Leaf operations.
+ */
+
+enum {
+ IAM_LVAR_LEAF_MAGIC = 0x1973 /* This is duplicated in
+ * lustre/utils/create_iam.c */
+};
+
+/* This is duplicated in lustre/utils/create_iam.c */
+struct lvar_leaf_header {
+ __le16 vlh_magic; /* magic number IAM_LVAR_LEAF_MAGIC */
+ __le16 vlh_used; /* used bytes, including header */
+};
+
+/*
+ * Format of a leaf entry:
+ *
+ * __le32 hash
+ * __le16 keysize
+ * u8 key[keysize]
+ * u8 record[rec_size]
+ *
+ * Entries are ordered by hash.
+ */
+
+/* This is duplicated in lustre/utils/create_iam.c */
+typedef __u32 lvar_hash_t;
+
+/* This is duplicated in lustre/utils/create_iam.c */
+struct lvar_leaf_entry {
+ __le32 vle_hash;
+ __le16 vle_keysize;
+ u8 vle_key[0];
+};
+
+#define PDIFF(ptr0, ptr1) (((char *)(ptr0)) - ((char *)(ptr1)))
+
+
+static inline int blocksize(const struct iam_leaf *leaf)
+{
+ return iam_leaf_container(leaf)->ic_object->i_sb->s_blocksize;
+}
+
+static inline const char *kchar(const struct iam_key *key)
+{
+ return (void *)key;
+}
+
+static inline struct iam_lentry *lvar_lentry(const struct lvar_leaf_entry *ent)
+{
+ return (struct iam_lentry *)ent;
+}
+
+static inline struct lvar_leaf_entry *lentry_lvar(const struct iam_lentry *lent)
+{
+ return (struct lvar_leaf_entry *)lent;
+}
+
+
+static inline int e_keysize(const struct lvar_leaf_entry *ent)
+{
+ return le16_to_cpu(ent->vle_keysize);
+}
+
+/* This is duplicated in lustre/utils/create_iam.c */
+enum {
+ LVAR_PAD = 4,
+ LVAR_ROUND = LVAR_PAD - 1
+};
+
+static inline int getsize(const struct iam_leaf *leaf, int namelen, int recsize)
+{
+ CLASSERT(!(LVAR_PAD & (LVAR_PAD - 1)));
+
+ return (offsetof(struct lvar_leaf_entry, vle_key) +
+ namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND;
+}
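+
+/*
+ * For example, with LVAR_PAD == 4 a 5-byte name and a 17-byte record give
+ * 6 (entry header) + 5 + 17 = 28 bytes; a 6-byte name would give 29,
+ * rounded up to 32.
+ */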
+
+static inline int rec_size(const struct iam_rec *rec)
+{
+ return *(const char *)rec;
+}
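+
+/*
+ * Note: records in an lvar leaf are self-sizing: the first byte of a
+ * record holds its total length in bytes (see lvar_leaf() below, which
+ * seeds the initial record this way).
+ */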
+
+static inline struct iam_rec *e_rec(const struct lvar_leaf_entry *ent)
+{
+ return ((void *)ent) +
+ offsetof(struct lvar_leaf_entry, vle_key) + e_keysize(ent);
+}
+
+static inline int e_size(const struct iam_leaf *leaf,
+ const struct lvar_leaf_entry *ent)
+{
+ return getsize(leaf, e_keysize(ent), rec_size(e_rec(ent)));
+}
+
+static inline char *e_char(const struct lvar_leaf_entry *ent)
+{
+ return (char *)&ent->vle_key;
+}
+
+static inline struct iam_key *e_key(const struct lvar_leaf_entry *ent)
+{
+ return (struct iam_key *)e_char(ent);
+}
+
+static inline lvar_hash_t e_hash(const struct lvar_leaf_entry *ent)
+{
+ return le32_to_cpu(ent->vle_hash);
+}
+
+static void e_print(const struct lvar_leaf_entry *ent)
+{
+ printk(" %p %8.8x \"%*.*s\"\n", ent, e_hash(ent),
+ e_keysize(ent), e_keysize(ent), e_char(ent));
+}
+#if 0
+static int e_check(const struct iam_leaf *leaf,
+ const struct lvar_leaf_entry *ent)
+{
+ const void *point = ent;
+ const void *start = leaf->il_bh->b_data;
+ return
+ start + sizeof(struct lvar_leaf_header) <= point &&
+ point + e_size(leaf, ent) < start + blocksize(leaf);
+}
+#endif
+
+static inline struct lvar_leaf_entry *e_next(const struct iam_leaf *leaf,
+ const struct lvar_leaf_entry *ent)
+{
+ return ((void *)ent) + e_size(leaf, ent);
+}
+
+#define LVAR_HASH_SANDWICH (0)
+#define LVAR_HASH_TEA (1)
+#define LVAR_HASH_R5 (0)
+#define LVAR_HASH_PREFIX (0)
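+
+/*
+ * Compile-time selection of the name hash: LVAR_HASH_PREFIX takes the
+ * name bytes themselves; otherwise LVAR_HASH_TEA selects the TEA dirhash
+ * over R5. LVAR_HASH_SANDWICH additionally splices the top 6 bits of a
+ * TEA hash onto the low 26 bits of the primary one.
+ */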
+
+static __u32 hash_build0(const char *name, int namelen)
+{
+ __u32 result;
+
+ if (namelen == 0)
+ return 0;
+ if (strncmp(name, ".", 1) == 0 && namelen == 1)
+ return 1;
+ if (strncmp(name, "..", 2) == 0 && namelen == 2)
+ return 2;
+
+ if (LVAR_HASH_PREFIX) {
+ result = 0;
+ strncpy((void *)&result,
+ name, min(namelen, (int)sizeof result));
+ } else {
+ struct ldiskfs_dx_hash_info hinfo;
+
+ if (LVAR_HASH_TEA)
+ hinfo.hash_version = LDISKFS_DX_HASH_TEA;
+ else
+ hinfo.hash_version = LDISKFS_DX_HASH_R5;
+ hinfo.seed = 0;
+ ldiskfsfs_dirhash(name, namelen, &hinfo);
+ result = hinfo.hash;
+ if (LVAR_HASH_SANDWICH) {
+ __u32 result2;
+
+ hinfo.hash_version = LDISKFS_DX_HASH_TEA;
+ hinfo.seed = 0;
+ ldiskfsfs_dirhash(name, namelen, &hinfo);
+ result2 = hinfo.hash;
+ result = (0xfc000000 & result2) | (0x03ffffff & result);
+ }
+ }
+ return result;
+}
+
+enum {
+ HASH_GRAY_AREA = 1024,
+ HASH_MAX_SIZE = 0x7fffffffUL
+};
+
+static __u32 hash_build(const char *name, int namelen)
+{
+ __u32 hash;
+
+ hash = (hash_build0(name, namelen) << 1) & HASH_MAX_SIZE;
+ if (hash > HASH_MAX_SIZE - HASH_GRAY_AREA)
+ hash &= HASH_GRAY_AREA - 1;
+ return hash;
+}
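+
+/*
+ * hash_build() shifts the raw hash left by one, reserving the least
+ * significant bit as the hash collision marker used in index delimiting
+ * keys (see n_invariant() and lvar_split()), and folds hashes that land
+ * within HASH_GRAY_AREA of HASH_MAX_SIZE back to the bottom of the range,
+ * presumably so that a marker increment can never overflow the 31-bit
+ * hash space.
+ */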
+
+static inline lvar_hash_t get_hash(const struct iam_container *bag,
+ const char *name, int namelen)
+{
+ return hash_build(name, namelen);
+}
+
+static inline int e_eq(const struct lvar_leaf_entry *ent,
+ const char *name, int namelen)
+{
+ return namelen == e_keysize(ent) && !memcmp(e_char(ent), name, namelen);
+}
+
+static inline int e_cmp(const struct iam_leaf *leaf,
+ const struct lvar_leaf_entry *ent, lvar_hash_t hash)
+{
+ lvar_hash_t ehash;
+
+ ehash = e_hash(ent);
+ return ehash == hash ? 0 : (ehash < hash ? -1 : +1);
+}
+
+static struct lvar_leaf_header *n_head(const struct iam_leaf *l)
+{
+ return (struct lvar_leaf_header *)l->il_bh->b_data;
+}
+
+static int h_used(const struct lvar_leaf_header *hdr)
+{
+ return le16_to_cpu(hdr->vlh_used);
+}
+
+static void h_used_adj(const struct iam_leaf *leaf,
+ struct lvar_leaf_header *hdr, int adj)
+{
+ int used;
+
+ used = h_used(hdr) + adj;
+ assert_corr(sizeof *hdr <= used && used <= blocksize(leaf));
+ hdr->vlh_used = cpu_to_le16(used);
+}
+
+static struct lvar_leaf_entry *n_start(const struct iam_leaf *leaf)
+{
+ return (void *)leaf->il_bh->b_data + sizeof(struct lvar_leaf_header);
+}
+
+static struct lvar_leaf_entry *n_end(const struct iam_leaf *l)
+{
+ return (void *)l->il_bh->b_data + h_used(n_head(l));
+}
+
+static struct lvar_leaf_entry *n_cur(const struct iam_leaf *l)
+{
+ return lentry_lvar(l->il_at);
+}
+
+void n_print(const struct iam_leaf *l)
+{
+ struct lvar_leaf_entry *scan;
+
+ printk(KERN_EMERG "used: %d\n", h_used(n_head(l)));
+ for (scan = n_start(l); scan < n_end(l); scan = e_next(l, scan))
+ e_print(scan);
+}
+
+#if LDISKFS_CORRECTNESS_ON
+static int n_at_rec(const struct iam_leaf *folio)
+{
+ return
+ n_start(folio) <= lentry_lvar(folio->il_at) &&
+ lentry_lvar(folio->il_at) < n_end(folio);
+}
+
+#if LDISKFS_INVARIANT_ON
+static int n_invariant(const struct iam_leaf *leaf)
+{
+ struct iam_path *path;
+ struct lvar_leaf_entry *scan;
+ struct lvar_leaf_entry *end;
+ lvar_hash_t hash;
+ lvar_hash_t nexthash;
+ lvar_hash_t starthash;
+
+ end = n_end(leaf);
+ hash = 0;
+ path = leaf->il_path;
+
+ if (h_used(n_head(leaf)) > blocksize(leaf))
+ return 0;
+
+ /*
+ * Delimiting key in the parent index node. Clear least bit to account
+ * for hash collision marker.
+ */
+ starthash = *(lvar_hash_t *)iam_ikey_at(path, path->ip_frame->at) & ~1;
+ for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
+ nexthash = e_hash(scan);
+ if (nexthash != get_hash(iam_leaf_container(leaf),
+ e_char(scan), e_keysize(scan))) {
+ BREAKPOINT();
+ return 0;
+ }
+ if (0 && nexthash < starthash) {
+ /*
+ * Unfortunately this useful invariant cannot be
+ * reliably checked, as the parent node is not
+ * necessarily locked.
+ */
+ n_print(leaf);
+ printk("%#x < %#x\n", nexthash, starthash);
+ dump_stack();
+ return 0;
+ }
+ if (nexthash < hash) {
+ BREAKPOINT();
+ return 0;
+ }
+ hash = nexthash;
+ }
+ if (scan != end) {
+ BREAKPOINT();
+ return 0;
+ }
+ return 1;
+}
+/* LDISKFS_INVARIANT_ON */
+#endif
+
+/* LDISKFS_CORRECTNESS_ON */
+#endif
+
+static struct iam_ikey *lvar_ikey(const struct iam_leaf *l,
+ struct iam_ikey *key)
+{
+ lvar_hash_t *hash;
+
+ assert_corr(n_at_rec(l));
+
+ hash = (void *)key;
+ *hash = e_hash(n_cur(l));
+ return key;
+}
+
+static struct iam_key *lvar_key(const struct iam_leaf *l)
+{
+ return e_key(n_cur(l));
+}
+
+static int lvar_key_size(const struct iam_leaf *l)
+{
+ return e_keysize(n_cur(l));
+}
+
+static void lvar_start(struct iam_leaf *l)
+{
+ l->il_at = lvar_lentry(n_start(l));
+}
+
+static int lvar_init(struct iam_leaf *l)
+{
+ int result;
+ int used;
+ struct lvar_leaf_header *head;
+
+ assert_corr(l->il_bh != NULL);
+
+ head = n_head(l);
+ used = h_used(head);
+ if (head->vlh_magic == cpu_to_le16(IAM_LVAR_LEAF_MAGIC) &&
+ used <= blocksize(l)) {
+ l->il_at = l->il_entries = lvar_lentry(n_start(l));
+ result = 0;
+ } else {
+ struct inode *obj;
+
+ obj = iam_leaf_container(l)->ic_object;
+ CERROR("Wrong magic in node %llu (#%lu): %#x != %#x or "
+ "wrong used: %i",
+ (unsigned long long)l->il_bh->b_blocknr, obj->i_ino,
+ head->vlh_magic, le16_to_cpu(IAM_LVAR_LEAF_MAGIC),
+ used);
+ result = -EIO;
+ }
+ return result;
+}
+
+static void lvar_fini(struct iam_leaf *l)
+{
+ l->il_entries = l->il_at = NULL;
+}
+
+struct iam_rec *lvar_rec(const struct iam_leaf *l)
+{
+ assert_corr(n_at_rec(l));
+ return e_rec(n_cur(l));
+}
+
+static void lvar_next(struct iam_leaf *l)
+{
+ assert_corr(n_at_rec(l));
+ assert_corr(iam_leaf_is_locked(l));
+ l->il_at = lvar_lentry(e_next(l, n_cur(l)));
+}
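+
+/*
+ * Position the leaf cursor for the NUL-terminated name @k: an exact name
+ * match returns IAM_LOOKUP_EXACT; otherwise the cursor is left at the best
+ * insertion hint: the last entry hashing below @k, or the first entry
+ * hashing equal to it (IAM_LOOKUP_OK), or the leaf start when every hash
+ * is greater (IAM_LOOKUP_BEFORE). IAM_LOOKUP_LAST is or-ed in when the
+ * scan reached the end of the node.
+ */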
+
+static int lvar_lookup(struct iam_leaf *leaf, const struct iam_key *k)
+{
+ struct lvar_leaf_entry *found;
+ struct lvar_leaf_entry *scan;
+ struct lvar_leaf_entry *end;
+ int result;
+ const char *name;
+ int namelen;
+ int found_equal;
+ lvar_hash_t hash;
+ int last;
+
+ assert_inv(n_invariant(leaf));
+ end = n_end(leaf);
+
+ name = kchar(k);
+ namelen = strlen(name);
+ hash = get_hash(iam_leaf_container(leaf), name, namelen);
+ found = NULL;
+ found_equal = 0;
+ last = 1;
+
+ for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
+ lvar_hash_t scan_hash;
+
+ scan_hash = e_hash(scan);
+ if (scan_hash < hash)
+ found = scan;
+ else if (scan_hash == hash) {
+ if (e_eq(scan, name, namelen)) {
+ /*
+ * perfect match
+ */
+ leaf->il_at = lvar_lentry(scan);
+ return IAM_LOOKUP_EXACT;
+ } else if (!found_equal) {
+ found = scan;
+ found_equal = 1;
+ }
+ } else {
+ last = 0;
+ break;
+ }
+ }
+ if (found == NULL) {
+ /*
+ * @k is less than all hashes in the leaf.
+ */
+ lvar_start(leaf);
+ result = IAM_LOOKUP_BEFORE;
+ } else {
+ leaf->il_at = lvar_lentry(found);
+ result = IAM_LOOKUP_OK;
+ assert_corr(n_at_rec(leaf));
+ }
+ if (last)
+ result |= IAM_LOOKUP_LAST;
+ assert_inv(n_invariant(leaf));
+
+ return result;
+}
+
+static int lvar_ilookup(struct iam_leaf *leaf, const struct iam_ikey *ik)
+{
+ struct lvar_leaf_entry *scan;
+ struct lvar_leaf_entry *end;
+ lvar_hash_t hash;
+
+ assert_inv(n_invariant(leaf));
+ end = n_end(leaf);
+ hash = *(const lvar_hash_t *)ik;
+
+ lvar_start(leaf);
+ for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
+ lvar_hash_t scan_hash;
+
+ scan_hash = e_hash(scan);
+ if (scan_hash > hash)
+ return scan == n_start(leaf) ?
+ IAM_LOOKUP_BEFORE : IAM_LOOKUP_OK;
+ leaf->il_at = lvar_lentry(scan);
+ if (scan_hash == hash)
+ return IAM_LOOKUP_EXACT;
+ }
+ assert_inv(n_invariant(leaf));
+ /*
+ * @ik is greater than any key in the node. Return last record in the
+ * node.
+ */
+ return IAM_LOOKUP_OK;
+}
+
+static void __lvar_key_set(struct iam_leaf *l, const struct iam_key *k)
+{
+ memcpy(e_key(n_cur(l)), k, e_keysize(n_cur(l)));
+}
+
+static void lvar_key_set(struct iam_leaf *l, const struct iam_key *k)
+{
+ assert_corr(n_at_rec(l));
+ assert_corr(strlen(kchar(k)) == e_keysize(n_cur(l)));
+ assert_corr(iam_leaf_is_locked(l));
+ __lvar_key_set(l, k);
+ assert_inv(n_invariant(l));
+}
+
+static int lvar_key_cmp(const struct iam_leaf *l, const struct iam_key *k)
+{
+ lvar_hash_t hash;
+ const char *name;
+
+ name = kchar(k);
+
+ hash = get_hash(iam_leaf_container(l), name, strlen(name));
+ return e_cmp(l, n_cur(l), hash);
+}
+
+static int lvar_key_eq(const struct iam_leaf *l, const struct iam_key *k)
+{
+ const char *name;
+
+ name = kchar(k);
+ return e_eq(n_cur(l), name, strlen(name));
+}
+
+static void __lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r)
+{
+ memcpy(e_rec(n_cur(l)), r, rec_size(r));
+}
+
+static void lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r)
+{
+ assert_corr(n_at_rec(l));
+ assert_corr(iam_leaf_is_locked(l));
+ __lvar_rec_set(l, r);
+ assert_inv(n_invariant(l));
+}
+
+static void lvar_rec_get(const struct iam_leaf *l, struct iam_rec *r)
+{
+ struct iam_rec *rec;
+
+ rec = e_rec(n_cur(l));
+ assert_corr(n_at_rec(l));
+ assert_corr(iam_leaf_is_locked(l));
+ memcpy(r, rec, rec_size(rec));
+ assert_inv(n_invariant(l));
+}
+
+static int lvar_can_add(const struct iam_leaf *l,
+ const struct iam_key *k, const struct iam_rec *r)
+{
+ assert_corr(iam_leaf_is_locked(l));
+ return
+ h_used(n_head(l)) +
+ getsize(l, strlen(kchar(k)), rec_size(r)) <= blocksize(l);
+}
+
+static int lvar_at_end(const struct iam_leaf *folio)
+{
+ assert_corr(iam_leaf_is_locked(folio));
+ return n_cur(folio) == n_end(folio);
+}
+
+static void lvar_rec_add(struct iam_leaf *leaf,
+ const struct iam_key *k, const struct iam_rec *r)
+{
+ const char *key;
+ int ksize;
+ int shift;
+ void *end;
+ void *start;
+ ptrdiff_t diff;
+
+ assert_corr(lvar_can_add(leaf, k, r));
+ assert_inv(n_invariant(leaf));
+ assert_corr(iam_leaf_is_locked(leaf));
+
+ key = kchar(k);
+ ksize = strlen(key);
+ shift = getsize(leaf, ksize, rec_size(r));
+
+ if (!lvar_at_end(leaf)) {
+ assert_corr(n_cur(leaf) < n_end(leaf));
+ end = n_end(leaf);
+ if (lvar_key_cmp(leaf, k) <= 0)
+ lvar_next(leaf);
+ else
+ /*
+ * Another exceptional case: insertion with a key
+ * less than the least key in the leaf.
+ */
+ assert_corr(leaf->il_at == leaf->il_entries);
+
+ start = leaf->il_at;
+ diff = PDIFF(end, start);
+ assert_corr(diff >= 0);
+ memmove(start + shift, start, diff);
+ }
+ h_used_adj(leaf, n_head(leaf), shift);
+ n_cur(leaf)->vle_keysize = cpu_to_le16(ksize);
+ n_cur(leaf)->vle_hash = cpu_to_le32(get_hash(iam_leaf_container(leaf),
+ key, ksize));
+ __lvar_key_set(leaf, k);
+ __lvar_rec_set(leaf, r);
+ assert_corr(n_at_rec(leaf));
+ assert_inv(n_invariant(leaf));
+}
+
+static void lvar_rec_del(struct iam_leaf *leaf, int shift)
+{
+ void *next;
+ void *end;
+ int nob;
+
+ assert_corr(n_at_rec(leaf));
+ assert_inv(n_invariant(leaf));
+ assert_corr(iam_leaf_is_locked(leaf));
+
+ end = n_end(leaf);
+ next = e_next(leaf, n_cur(leaf));
+ nob = e_size(leaf, n_cur(leaf));
+ memmove(leaf->il_at, next, end - next);
+ h_used_adj(leaf, n_head(leaf), -nob);
+ assert_inv(n_invariant(leaf));
+}
+
+static void lvar_init_new(struct iam_container *c, struct buffer_head *bh)
+{
+ struct lvar_leaf_header *hdr;
+
+ hdr = (struct lvar_leaf_header *)bh->b_data;
+ hdr->vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC);
+ hdr->vlh_used = cpu_to_le16(sizeof *hdr);
+}
+
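+/*
+ * Find the split point of a leaf: returns the first entry starting beyond
+ * the half-block threshold (the first entry to move to the new node),
+ * with *prev set to the last entry that stays behind.
+ */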
+static struct lvar_leaf_entry *find_pivot(const struct iam_leaf *leaf,
+ struct lvar_leaf_entry **prev)
+{
+ void *scan;
+ void *start;
+ int threshold;
+
+ *prev = NULL;
+ threshold = blocksize(leaf) / 2;
+ for (scan = start = n_start(leaf); scan - start <= threshold;
+ *prev = scan, scan = e_next(leaf, scan)) {
+ ;
+ }
+ return scan;
+}
+
+static void lvar_split(struct iam_leaf *leaf, struct buffer_head **bh,
+ iam_ptr_t new_blknr)
+{
+ struct lvar_leaf_entry *first_to_move;
+ struct lvar_leaf_entry *last_to_stay;
+ struct iam_path *path;
+ struct lvar_leaf_header *hdr;
+ struct buffer_head *new_leaf;
+
+ ptrdiff_t tomove;
+ lvar_hash_t hash;
+
+ assert_inv(n_invariant(leaf));
+ assert_corr(iam_leaf_is_locked(leaf));
+
+ new_leaf = *bh;
+ path = iam_leaf_path(leaf);
+
+ hdr = (void *)new_leaf->b_data;
+
+ first_to_move = find_pivot(leaf, &last_to_stay);
+ assert_corr(last_to_stay != NULL);
+ assert_corr(e_next(leaf, last_to_stay) == first_to_move);
+
+ hash = e_hash(first_to_move);
+ if (hash == e_hash(last_to_stay))
+ /*
+ * Duplicate hash on both sides of the split: tag the
+ * new delimiting key with the collision-marker bit
+ * (the least significant bit, which n_invariant()
+ * strips), flagging that entries with this hash may
+ * exist in both nodes.
+ */
+ hash |= 1;
+
+ tomove = PDIFF(n_end(leaf), first_to_move);
+ memmove(hdr + 1, first_to_move, tomove);
+
+ h_used_adj(leaf, hdr, tomove);
+ h_used_adj(leaf, n_head(leaf), -tomove);
+
+ assert_corr(n_end(leaf) == first_to_move);
+
+ if (n_cur(leaf) >= first_to_move) {
+ /*
+ * Insertion point moves into the new leaf.
+ */
+ ptrdiff_t shift;
+ int result;
+
+ shift = PDIFF(leaf->il_at, first_to_move);
+ *bh = leaf->il_bh;
+ leaf->il_bh = new_leaf;
+ leaf->il_curidx = new_blknr;
+
+ assert_corr(iam_leaf_is_locked(leaf));
+ result = lvar_init(leaf);
+ /*
+ * lvar_init() cannot fail, as the node was just initialized.
+ */
+ assert_corr(result == 0);
+ leaf->il_at = ((void *)leaf->il_at) + shift;
+ }
+ /*
+ * Insert pointer to the new node (together with the least key in
+ * the node) into index node.
+ */
+ iam_insert_key_lock(path, path->ip_frame, (struct iam_ikey *)&hash,
+ new_blknr);
+ assert_corr(n_cur(leaf) < n_end(leaf));
+ assert_inv(n_invariant(leaf));
+}
+
+static struct iam_leaf_operations lvar_leaf_ops = {
+ .init = lvar_init,
+ .init_new = lvar_init_new,
+ .fini = lvar_fini,
+ .start = lvar_start,
+ .next = lvar_next,
+ .key = lvar_key,
+ .ikey = lvar_ikey,
+ .rec = lvar_rec,
+ .key_set = lvar_key_set,
+ .key_cmp = lvar_key_cmp,
+ .key_eq = lvar_key_eq,
+ .key_size = lvar_key_size,
+ .rec_set = lvar_rec_set,
+ .rec_get = lvar_rec_get,
+ .lookup = lvar_lookup,
+ .ilookup = lvar_ilookup,
+ .at_end = lvar_at_end,
+ .rec_add = lvar_rec_add,
+ .rec_del = lvar_rec_del,
+ .can_add = lvar_can_add,
+ .split = lvar_split
+};
+
+/*
+ * Index operations.
+ */
+
+enum {
+ /* This is duplicated in lustre/utils/create_iam.c */
+ /* egrep -i '^o?x?[olabcdef]*$' /usr/share/dict/words */
+ IAM_LVAR_ROOT_MAGIC = 0xb01dface
+};
+
+/* This is duplicated in lustre/utils/create_iam.c */
+struct lvar_root {
+ __le32 vr_magic;
+ __le16 vr_recsize;
+ __le16 vr_ptrsize;
+ u8 vr_indirect_levels;
+ u8 vr_padding0;
+ __le16 vr_padding1;
+};
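+
+/*
+ * An lvar index block mirrors the lfix root layout: the header above is
+ * followed by a dx_countlimit and then (hash, ptr) pairs, where the hash
+ * (lvar_hash_t) plays the role of a fixed-size key; see lvar_root() below.
+ */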
+
+static __u32 lvar_root_ptr(struct iam_container *c)
+{
+ return 0;
+}
+
+static int lvar_node_init(struct iam_container *c, struct buffer_head *bh,
+ int root)
+{
+ return 0;
+}
+
+static struct iam_entry *lvar_root_inc(struct iam_container *c,
+ struct iam_path *path,
+ struct iam_frame *frame)
+{
+ struct lvar_root *root;
+ struct iam_entry *entries;
+
+ assert_corr(iam_frame_is_locked(path, frame));
+ entries = frame->entries;
+
+ dx_set_count(entries, 2);
+ assert_corr(dx_get_limit(entries) == dx_root_limit(path));
+
+ root = (void *)frame->bh->b_data;
+ assert_corr(le32_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC);
+ root->vr_indirect_levels++;
+ frame->at = entries = iam_entry_shift(path, entries, 1);
+ memset(iam_ikey_at(path, entries), 0,
+ iam_path_descr(path)->id_ikey_size);
+ return entries;
+}
+
+static int lvar_node_check(struct iam_path *path, struct iam_frame *frame)
+{
+ unsigned count;
+ unsigned limit;
+ unsigned limit_correct;
+ struct iam_entry *entries;
+
+ entries = dx_node_get_entries(path, frame);
+
+ if (frame == path->ip_frames) {
+ struct lvar_root *root;
+
+ root = (void *)frame->bh->b_data;
+ if (le32_to_cpu(root->vr_magic) != IAM_LVAR_ROOT_MAGIC)
+ return -EIO;
+ limit_correct = dx_root_limit(path);
+ } else
+ limit_correct = dx_node_limit(path);
+ count = dx_get_count(entries);
+ limit = dx_get_limit(entries);
+ if (count > limit)
+ return -EIO;
+ if (limit != limit_correct)
+ return -EIO;
+ return 0;
+}
+
+static int lvar_node_load(struct iam_path *path, struct iam_frame *frame)
+{
+ struct iam_entry *entries;
+ void *data;
+ entries = dx_node_get_entries(path, frame);
+
+ data = frame->bh->b_data;
+
+ if (frame == path->ip_frames) {
+ struct lvar_root *root;
+ const char *name;
+
+ root = data;
+ name = kchar(path->ip_key_target);
+ path->ip_indirect = root->vr_indirect_levels;
+ if (path->ip_ikey_target == NULL) {
+ path->ip_ikey_target = iam_path_ikey(path, 4);
+ *(lvar_hash_t *)path->ip_ikey_target =
+ get_hash(path->ip_container, name,
+ strlen(name));
+ }
+ }
+ frame->entries = frame->at = entries;
+ return 0;
+}
+
+static int lvar_ikeycmp(const struct iam_container *c,
+ const struct iam_ikey *k1, const struct iam_ikey *k2)
+{
+ lvar_hash_t p1 = le32_to_cpu(*(lvar_hash_t *)k1);
+ lvar_hash_t p2 = le32_to_cpu(*(lvar_hash_t *)k2);
+
+ return p1 > p2 ? +1 : (p1 < p2 ? -1 : 0);
+}
+
+static struct iam_path_descr *lvar_ipd_alloc(const struct iam_container *c,
+ void *area)
+{
+ return iam_ipd_alloc(area, c->ic_descr->id_ikey_size);
+}
+
+static void lvar_root(void *buf,
+ int blocksize, int keysize, int ptrsize, int recsize)
+{
+ struct lvar_root *root;
+ struct dx_countlimit *limit;
+ void *entry;
+ int isize;
+
+ isize = sizeof(lvar_hash_t) + ptrsize;
+ root = buf;
+ *root = (typeof(*root)) {
+ .vr_magic = cpu_to_le32(IAM_LVAR_ROOT_MAGIC),
+ .vr_recsize = cpu_to_le16(recsize),
+ .vr_ptrsize = cpu_to_le16(ptrsize),
+ .vr_indirect_levels = 0
+ };
+
+ limit = (void *)(root + 1);
+ *limit = (typeof(*limit)){
+ /*
+ * limit itself + one pointer to the leaf.
+ */
+ .count = cpu_to_le16(2),
+ .limit = cpu_to_le16(iam_root_limit(sizeof(struct lvar_root),
+ blocksize, isize))
+ };
+
+ entry = root + 1;
+ /*
+ * Skip over @limit.
+ */
+ entry += isize;
+
+ /*
+ * Entry format is <key> followed by <ptr>. In the minimal tree
+ * consisting of a root and single node, <key> is a minimal possible
+ * key.
+ */
+ *(lvar_hash_t *)entry = 0;
+ entry += sizeof(lvar_hash_t);
+ /* now @entry points to <ptr> */
+ if (ptrsize == 4) {
+ u_int32_t ptr = cpu_to_le32(1);
+ memcpy(entry, &ptr, sizeof ptr); /* may be unaligned */
+ } else {
+ u_int64_t ptr = cpu_to_le64(1);
+ memcpy(entry, &ptr, sizeof ptr); /* may be unaligned */
+ }
+}
+
+static int lvar_esize(int namelen, int recsize)
+{
+ return (offsetof(struct lvar_leaf_entry, vle_key) +
+ namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND;
+}
+
+static void lvar_leaf(void *buf,
+ int blocksize, int keysize, int ptrsize, int recsize)
+{
+ struct lvar_leaf_header *head;
+ struct lvar_leaf_entry *entry;
+
+ /* form leaf */
+ head = buf;
+ *head = (typeof(*head)) {
+ .vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC),
+ .vlh_used = cpu_to_le16(sizeof *head + lvar_esize(0, recsize))
+ };
+ entry = (void *)(head + 1);
+ *entry = (typeof(*entry)) {
+ .vle_hash = 0,
+ .vle_keysize = 0
+ };
+ memset(e_rec(entry), 0, recsize);
+ *(char *)e_rec(entry) = recsize;
+}
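+
+/*
+ * The fresh leaf thus holds a single sentinel entry: hash 0, an empty
+ * (zero-length) key, and a zeroed record whose first byte records its own
+ * size, keeping the self-sizing convention used by rec_size().
+ */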
+
+#include <linux/jbd.h>
+#include <linux/ldiskfs_fs.h>
+#include <linux/ldiskfs_jbd.h>
+
+int iam_lvar_create(struct inode *obj,
+ int keysize, int ptrsize, int recsize, handle_t *handle)
+{
+ struct buffer_head *root_node;
+ struct buffer_head *leaf_node;
+ struct super_block *sb;
+
+ u32 blknr;
+ int result;
+ unsigned long bsize;
+
+ assert_corr(obj->i_size == 0);
+
+ sb = obj->i_sb;
+ bsize = sb->s_blocksize;
+ root_node = ldiskfs_append(handle, obj, &blknr, &result);
+ leaf_node = ldiskfs_append(handle, obj, &blknr, &result);
+ if (root_node != NULL && leaf_node != NULL) {
+ lvar_root(root_node->b_data, bsize, keysize, ptrsize, recsize);
+ lvar_leaf(leaf_node->b_data, bsize, keysize, ptrsize, recsize);
+ ldiskfs_mark_inode_dirty(handle, obj);
+ result = ldiskfs_journal_dirty_metadata(handle, root_node);
+ if (result == 0)
+ result = ldiskfs_journal_dirty_metadata(handle, leaf_node);
+ if (result != 0)
+ ldiskfs_std_error(sb, result);
+ }
+ brelse(leaf_node);
+ brelse(root_node);
+ return result;
+}
+EXPORT_SYMBOL(iam_lvar_create);
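+
+/*
+ * Example (hypothetical caller), symmetrical to iam_lfix_create(): seed an
+ * empty lvar container for names up to LDISKFS_NAME_LEN bytes, 4-byte
+ * block pointers and 16-byte records:
+ *
+ * rc = iam_lvar_create(inode, LDISKFS_NAME_LEN, 4, 16, handle);
+ */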
+
+static struct iam_operations lvar_ops = {
+ .id_root_ptr = lvar_root_ptr,
+ .id_node_read = iam_node_read,
+ .id_node_init = lvar_node_init,
+ .id_node_check = lvar_node_check,
+ .id_node_load = lvar_node_load,
+ .id_ikeycmp = lvar_ikeycmp,
+ .id_root_inc = lvar_root_inc,
+ .id_ipd_alloc = lvar_ipd_alloc,
+ .id_ipd_free = iam_ipd_free,
+ .id_name = "lvar"
+};
+
+static int lvar_guess(struct iam_container *c)
+{
+ int result;
+ struct buffer_head *bh;
+ const struct lvar_root *root;
+
+ assert_corr(c->ic_object != NULL);
+
+ result = iam_node_read(c, lvar_root_ptr(c), NULL, &bh);
+ if (result == 0) {
+ root = (void *)bh->b_data;
+ if (le32_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC) {
+ struct iam_descr *descr;
+
+ descr = c->ic_descr;
+ descr->id_key_size = LDISKFS_NAME_LEN;
+ descr->id_ikey_size = sizeof (lvar_hash_t);
+ descr->id_rec_size = le16_to_cpu(root->vr_recsize);
+ descr->id_ptr_size = le16_to_cpu(root->vr_ptrsize);
+ descr->id_root_gap = sizeof *root;
+ descr->id_node_gap = 0;
+ descr->id_ops = &lvar_ops;
+ descr->id_leaf_ops = &lvar_leaf_ops;
+ } else
+ result = -EBADF;
+ brelse(bh);
+ }
+ return result;
+}
+
+static struct iam_format lvar_format = {
+ .if_guess = lvar_guess
+};
+
+void iam_lvar_format_init(void)
+{
+ iam_format_register(&lvar_format);
+}
+
#include <linux/ldiskfs_fs.h>
/* struct dentry */
#include <linux/dcache.h>
-#include <linux/lustre_iam.h>
/* struct dirent64 */
#include <linux/dirent.h>
#include <dt_object.h>
#include "osd_oi.h"
+#include "osd_iam.h"
struct inode;