From: deshmukh Date: Tue, 7 Jul 2009 06:56:06 +0000 (+0000) Subject: b=18504 X-Git-Tag: v1_9_220~51 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=c3e10ade1eec36d7ed4068922f2b915dae124aac b=18504 i=adilger i=pravin i=girish Moved IAM code from ldiskfs to OSD. --- diff --git a/lustre/obdclass/hash.c b/lustre/obdclass/hash.c index 5165e7c..26e9e9e 100644 --- a/lustre/obdclass/hash.c +++ b/lustre/obdclass/hash.c @@ -23,6 +23,9 @@ #endif #define DELTA 0x9E3779B9 +#define DX_HASH_R5 98 +#define DX_HASH_SAME 99 + static void TEA_transform(__u32 buf[4], __u32 const in[]) { diff --git a/lustre/osd/Makefile.in b/lustre/osd/Makefile.in index 45ee501..ad99827 100644 --- a/lustre/osd/Makefile.in +++ b/lustre/osd/Makefile.in @@ -1,5 +1,6 @@ MODULES := osd -osd-objs := osd_handler.o osd_oi.o osd_igif.o osd_lproc.o +osd-objs := osd_handler.o osd_oi.o osd_igif.o osd_lproc.o osd_iam.o \ + osd_iam_lfix.o osd_iam_lvar.o EXTRA_PRE_CFLAGS := -I@LINUX@/fs -I@LDISKFS_DIR@ -I@LDISKFS_DIR@/ldiskfs diff --git a/lustre/osd/autoMakefile.am b/lustre/osd/autoMakefile.am index 8e88a77..8cd8bf9 100644 --- a/lustre/osd/autoMakefile.am +++ b/lustre/osd/autoMakefile.am @@ -38,5 +38,6 @@ if MODULES modulefs_DATA = osd$(KMODEXT) endif -MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -DIST_SOURCES := $(osd-objs:%.o=%.c) osd_internal.h osd_oi.h osd_igif.h +MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ +DIST_SOURCES := $(osd-objs:%.o=%.c) osd_internal.h osd_oi.h osd_igif.h \ + osd_iam.h diff --git a/lustre/osd/osd_handler.c b/lustre/osd/osd_handler.c index 7235a3c..2c639c7 100644 --- a/lustre/osd/osd_handler.c +++ b/lustre/osd/osd_handler.c @@ -77,7 +77,6 @@ /* fid_is_local() */ #include -#include #include "osd_internal.h" #include "osd_igif.h" @@ -1361,25 +1360,6 @@ static int osd_create_post(struct osd_thread_info *info, struct osd_object *obj, return 0; } -extern struct inode *ldiskfs_create_inode(handle_t *handle, - struct inode * dir, int mode); -extern int ldiskfs_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode); -extern int ldiskfs_delete_entry(handle_t *handle, - struct inode * dir, - struct ldiskfs_dir_entry_2 * de_del, - struct buffer_head * bh); -extern struct buffer_head * ldiskfs_find_entry(struct dentry *dentry, - struct ldiskfs_dir_entry_2 - ** res_dir); -extern int ldiskfs_add_dot_dotdot(handle_t *handle, struct inode *dir, - struct inode *inode); - -extern int ldiskfs_xattr_set_handle(handle_t *handle, struct inode *inode, - int name_index, const char *name, - const void *value, size_t value_len, - int flags); - static struct dentry * osd_child_dentry_get(const struct lu_env *env, struct osd_object *obj, const char *name, @@ -1446,13 +1426,6 @@ static int osd_mkfile(struct osd_thread_info *info, struct osd_object *obj, return result; } - -extern int iam_lvar_create(struct inode *obj, int keysize, int ptrsize, - int recsize, handle_t *handle); - -extern int iam_lfix_create(struct inode *obj, int keysize, int ptrsize, - int recsize, handle_t *handle); - enum { OSD_NAME_LEN = 255 }; @@ -2167,7 +2140,6 @@ static int osd_iam_index_probe(const struct lu_env *env, struct osd_object *o, const struct dt_index_features *feat) { struct iam_descr *descr; - struct dt_object *dt = &o->oo_dt; if (osd_object_is_root(o)) return feat == &dt_directory_features; @@ -2178,19 +2150,8 @@ static int osd_iam_index_probe(const struct lu_env *env, struct osd_object *o, if (feat == &dt_directory_features) { if (descr->id_rec_size == sizeof(struct lu_fid_pack)) return 1; - - if (descr == 
&iam_htree_compat_param) { - /* if it is a HTREE dir then there is good chance that, - * we dealing with ext3 directory here with no FIDs. */ - - if (descr->id_rec_size == - sizeof ((struct ldiskfs_dir_entry_2 *)NULL)->inode) { - - dt->do_index_ops = &osd_index_ea_ops; - return 1; - } - } - return 0; + else + return 0; } else { return feat->dif_keysize_min <= descr->id_key_size && @@ -3719,9 +3680,6 @@ static int osd_process_config(const struct lu_env *env, RETURN(err); } -extern void ldiskfs_orphan_cleanup (struct super_block * sb, - struct ldiskfs_super_block * es); - static int osd_recovery_complete(const struct lu_env *env, struct lu_device *d) { diff --git a/lustre/osd/osd_iam.c b/lustre/osd/osd_iam.c new file mode 100644 index 0000000..9f1cd26 --- /dev/null +++ b/lustre/osd/osd_iam.c @@ -0,0 +1,2297 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see [sun.com URL with a + * copy of GPLv2]. + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * iam.c + * Top-level entry points into iam module + * + * Author: Wang Di + * Author: Nikita Danilov + */ + +/* + * iam: big theory statement. + * + * iam (Index Access Module) is a module providing abstraction of persistent + * transactional container on top of generalized ldiskfs htree. + * + * iam supports: + * + * - key, pointer, and record size specifiable per container. + * + * - trees taller than 2 index levels. + * + * - read/write to existing ldiskfs htree directories as iam containers. + * + * iam container is a tree, consisting of leaf nodes containing keys and + * records stored in this container, and index nodes, containing keys and + * pointers to leaf or index nodes. + * + * iam does not work with keys directly, instead it calls user-supplied key + * comparison function (->dpo_keycmp()). + * + * Pointers are (currently) interpreted as logical offsets (measured in + * blocksful) within underlying flat file on top of which iam tree lives. + * + * On-disk format: + * + * iam mostly tries to reuse existing htree formats. + * + * Format of index node: + * + * +-----+-------+-------+-------+------+-------+------------+ + * | | count | | | | | | + * | gap | / | entry | entry | .... | entry | free space | + * | | limit | | | | | | + * +-----+-------+-------+-------+------+-------+------------+ + * + * gap this part of node is never accessed by iam code. 
It + * exists for binary compatibility with ldiskfs htree (that, + * in turn, stores fake struct ext2_dirent for ext2 + * compatibility), and to keep some unspecified per-node + * data. Gap can be different for root and non-root index + * nodes. Gap size can be specified for each container + * (gap of 0 is allowed). + * + * count/limit current number of entries in this node, and the maximal + * number of entries that can fit into node. count/limit + * has the same size as entry, and is itself counted in + * count. + * + * entry index entry: consists of a key immediately followed by + * a pointer to a child node. Size of a key and size of a + * pointer depends on container. Entry has neither + * alignment nor padding. + * + * free space portion of node new entries are added to + * + * Entries in index node are sorted by their key value. + * + * Format of a leaf node is not specified. Generic iam code accesses leaf + * nodes through ->id_leaf methods in struct iam_descr. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "osd_internal.h" + +#include "xattr.h" +#include "iopen.h" +#include "acl.h" + +/* + * List of all registered formats. + * + * No locking. Callers synchronize. + */ +static LIST_HEAD(iam_formats); + +void iam_format_register(struct iam_format *fmt) +{ + list_add(&fmt->if_linkage, &iam_formats); +} +EXPORT_SYMBOL(iam_format_register); + +/* + * Determine format of given container. This is done by scanning list of + * registered formats and calling ->if_guess() method of each in turn. + */ +static int iam_format_guess(struct iam_container *c) +{ + int result; + struct iam_format *fmt; + + /* + * XXX temporary initialization hook. + */ + { + static int initialized = 0; + + if (!initialized) { + iam_lvar_format_init(); + iam_lfix_format_init(); + initialized = 1; + } + } + + result = -ENOENT; + list_for_each_entry(fmt, &iam_formats, if_linkage) { + result = fmt->if_guess(c); + if (result == 0) + break; + } + return result; +} + +/* + * Initialize container @c. + */ +int iam_container_init(struct iam_container *c, + struct iam_descr *descr, struct inode *inode) +{ + memset(c, 0, sizeof *c); + c->ic_descr = descr; + c->ic_object = inode; + init_rwsem(&c->ic_sem); + return 0; +} +EXPORT_SYMBOL(iam_container_init); + +/* + * Determine container format. + */ +int iam_container_setup(struct iam_container *c) +{ + return iam_format_guess(c); +} +EXPORT_SYMBOL(iam_container_setup); + +/* + * Finalize container @c, release all resources. 
+ */ +void iam_container_fini(struct iam_container *c) +{ +} +EXPORT_SYMBOL(iam_container_fini); + +void iam_path_init(struct iam_path *path, struct iam_container *c, + struct iam_path_descr *pd) +{ + memset(path, 0, sizeof *path); + path->ip_container = c; + path->ip_frame = path->ip_frames; + path->ip_data = pd; + path->ip_leaf.il_path = path; +} + +static void iam_leaf_fini(struct iam_leaf *leaf); + +void iam_path_release(struct iam_path *path) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) { + if (path->ip_frames[i].bh != NULL) { + brelse(path->ip_frames[i].bh); + path->ip_frames[i].bh = NULL; + } + } +} + +void iam_path_fini(struct iam_path *path) +{ + iam_leaf_fini(&path->ip_leaf); + iam_path_release(path); +} + + +void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode) +{ + int i; + + path->ipc_hinfo = &path->ipc_hinfo_area; + for (i = 0; i < ARRAY_SIZE(path->ipc_scratch); ++i) + path->ipc_descr.ipd_key_scratch[i] = + (struct iam_ikey *)&path->ipc_scratch[i]; + + iam_path_init(&path->ipc_path, &path->ipc_container, &path->ipc_descr); +} + +void iam_path_compat_fini(struct iam_path_compat *path) +{ + iam_path_fini(&path->ipc_path); +} + +/* + * Helper function initializing iam_path_descr and its key scratch area. + */ +struct iam_path_descr *iam_ipd_alloc(void *area, int keysize) +{ + struct iam_path_descr *ipd; + void *karea; + int i; + + ipd = area; + karea = ipd + 1; + for (i = 0; i < ARRAY_SIZE(ipd->ipd_key_scratch); ++i, karea += keysize) + ipd->ipd_key_scratch[i] = karea; + return ipd; +} +EXPORT_SYMBOL(iam_ipd_alloc); + +void iam_ipd_free(struct iam_path_descr *ipd) +{ +} +EXPORT_SYMBOL(iam_ipd_free); + +int iam_node_read(struct iam_container *c, iam_ptr_t ptr, + handle_t *h, struct buffer_head **bh) +{ + int result = 0; + + *bh = ldiskfs_bread(h, c->ic_object, (int)ptr, 0, &result); + if (*bh == NULL) + result = -EIO; + return result; +} + +/* + * Return pointer to current leaf record. Pointer is valid while corresponding + * leaf node is locked and pinned. + */ +static struct iam_rec *iam_leaf_rec(const struct iam_leaf *leaf) +{ + return iam_leaf_ops(leaf)->rec(leaf); +} + +/* + * Return pointer to the current leaf key. This function returns pointer to + * the key stored in node. + * + * Caller should assume that returned pointer is only valid while leaf node is + * pinned and locked. 
+ */ +static struct iam_key *iam_leaf_key(const struct iam_leaf *leaf) +{ + return iam_leaf_ops(leaf)->key(leaf); +} + +static int iam_leaf_key_size(const struct iam_leaf *leaf) +{ + return iam_leaf_ops(leaf)->key_size(leaf); +} + +static struct iam_ikey *iam_leaf_ikey(const struct iam_leaf *leaf, + struct iam_ikey *key) +{ + return iam_leaf_ops(leaf)->ikey(leaf, key); +} + +static int iam_leaf_keycmp(const struct iam_leaf *leaf, + const struct iam_key *key) +{ + return iam_leaf_ops(leaf)->key_cmp(leaf, key); +} + +static int iam_leaf_keyeq(const struct iam_leaf *leaf, + const struct iam_key *key) +{ + return iam_leaf_ops(leaf)->key_eq(leaf, key); +} + +#if LDISKFS_INVARIANT_ON +static int iam_leaf_check(struct iam_leaf *leaf); +extern int dx_node_check(struct iam_path *p, struct iam_frame *f); + +static int iam_path_check(struct iam_path *p) +{ + int i; + int result; + struct iam_frame *f; + struct iam_descr *param; + + result = 1; + param = iam_path_descr(p); + for (i = 0; result && i < ARRAY_SIZE(p->ip_frames); ++i) { + f = &p->ip_frames[i]; + if (f->bh != NULL) { + result = dx_node_check(p, f); + if (result) + result = !param->id_ops->id_node_check(p, f); + } + } + if (result && p->ip_leaf.il_bh != NULL) + result = iam_leaf_check(&p->ip_leaf); + if (result == 0) { + ldiskfs_std_error(iam_path_obj(p)->i_sb, result); + } + return result; +} +#endif + +static int iam_leaf_load(struct iam_path *path) +{ + iam_ptr_t block; + int err; + struct iam_container *c; + struct buffer_head *bh; + struct iam_leaf *leaf; + struct iam_descr *descr; + + c = path->ip_container; + leaf = &path->ip_leaf; + descr = iam_path_descr(path); + block = path->ip_frame->leaf; + if (block == 0) { + /* XXX bug 11027 */ + printk(KERN_EMERG "wrong leaf: %lu %d [%p %p %p]\n", + (long unsigned)path->ip_frame->leaf, + dx_get_count(dx_node_get_entries(path, path->ip_frame)), + path->ip_frames[0].bh, path->ip_frames[1].bh, + path->ip_frames[2].bh); + } + err = descr->id_ops->id_node_read(c, block, NULL, &bh); + if (err == 0) { + leaf->il_bh = bh; + leaf->il_curidx = block; + err = iam_leaf_ops(leaf)->init(leaf); + assert_inv(ergo(err == 0, iam_leaf_check(leaf))); + } + return err; +} + +static void iam_unlock_htree(struct inode *dir, struct dynlock_handle *lh) +{ + if (lh != NULL) + dynlock_unlock(&LDISKFS_I(dir)->i_htree_lock, lh); +} + + +static void iam_leaf_unlock(struct iam_leaf *leaf) +{ + if (leaf->il_lock != NULL) { + iam_unlock_htree(iam_leaf_container(leaf)->ic_object, + leaf->il_lock); + do_corr(schedule()); + leaf->il_lock = NULL; + } +} + +static void iam_leaf_fini(struct iam_leaf *leaf) +{ + if (leaf->il_path != NULL) { + iam_leaf_unlock(leaf); + assert_inv(ergo(leaf->il_bh != NULL, iam_leaf_check(leaf))); + iam_leaf_ops(leaf)->fini(leaf); + if (leaf->il_bh) { + brelse(leaf->il_bh); + leaf->il_bh = NULL; + leaf->il_curidx = 0; + } + } +} + +static void iam_leaf_start(struct iam_leaf *folio) +{ + iam_leaf_ops(folio)->start(folio); +} + +void iam_leaf_next(struct iam_leaf *folio) +{ + iam_leaf_ops(folio)->next(folio); +} + +static void iam_leaf_rec_add(struct iam_leaf *leaf, const struct iam_key *key, + const struct iam_rec *rec) +{ + iam_leaf_ops(leaf)->rec_add(leaf, key, rec); +} + +static void iam_rec_del(struct iam_leaf *leaf, int shift) +{ + iam_leaf_ops(leaf)->rec_del(leaf, shift); +} + +int iam_leaf_at_end(const struct iam_leaf *leaf) +{ + return iam_leaf_ops(leaf)->at_end(leaf); +} + +void iam_leaf_split(struct iam_leaf *l, struct buffer_head **bh, iam_ptr_t nr) +{ + iam_leaf_ops(l)->split(l, bh, nr); +} 
+ +int iam_leaf_can_add(const struct iam_leaf *l, + const struct iam_key *k, const struct iam_rec *r) +{ + return iam_leaf_ops(l)->can_add(l, k, r); +} + +#if LDISKFS_INVARIANT_ON +static int iam_leaf_check(struct iam_leaf *leaf) +{ + return 1; +#if 0 + struct iam_lentry *orig; + struct iam_path *path; + struct iam_container *bag; + struct iam_ikey *k0; + struct iam_ikey *k1; + int result; + int first; + + orig = leaf->il_at; + path = iam_leaf_path(leaf); + bag = iam_leaf_container(leaf); + + result = iam_leaf_ops(leaf)->init(leaf); + if (result != 0) + return result; + + first = 1; + iam_leaf_start(leaf); + k0 = iam_path_ikey(path, 0); + k1 = iam_path_ikey(path, 1); + while (!iam_leaf_at_end(leaf)) { + iam_ikeycpy(bag, k0, k1); + iam_ikeycpy(bag, k1, iam_leaf_ikey(leaf, k1)); + if (!first && iam_ikeycmp(bag, k0, k1) > 0) { + return 0; + } + first = 0; + iam_leaf_next(leaf); + } + leaf->il_at = orig; + return 1; +#endif +} +#endif + +static int iam_txn_dirty(handle_t *handle, + struct iam_path *path, struct buffer_head *bh) +{ + int result; + + result = ldiskfs_journal_dirty_metadata(handle, bh); + if (result != 0) + ldiskfs_std_error(iam_path_obj(path)->i_sb, result); + return result; +} + +static int iam_txn_add(handle_t *handle, + struct iam_path *path, struct buffer_head *bh) +{ + int result; + + result = ldiskfs_journal_get_write_access(handle, bh); + if (result != 0) + ldiskfs_std_error(iam_path_obj(path)->i_sb, result); + return result; +} + +/***********************************************************************/ +/* iterator interface */ +/***********************************************************************/ + +static enum iam_it_state it_state(const struct iam_iterator *it) +{ + return it->ii_state; +} + +/* + * Helper function returning scratch key. + */ +static struct iam_container *iam_it_container(const struct iam_iterator *it) +{ + return it->ii_path.ip_container; +} + +static inline int it_keycmp(const struct iam_iterator *it, + const struct iam_key *k) +{ + return iam_leaf_keycmp(&it->ii_path.ip_leaf, k); +} + +static inline int it_keyeq(const struct iam_iterator *it, + const struct iam_key *k) +{ + return iam_leaf_keyeq(&it->ii_path.ip_leaf, k); +} + +static int it_ikeycmp(const struct iam_iterator *it, const struct iam_ikey *ik) +{ + return iam_ikeycmp(it->ii_path.ip_container, + iam_leaf_ikey(&it->ii_path.ip_leaf, + iam_path_ikey(&it->ii_path, 0)), ik); +} + +static inline int it_at_rec(const struct iam_iterator *it) +{ + return !iam_leaf_at_end(&it->ii_path.ip_leaf); +} + +static inline int it_before(const struct iam_iterator *it) +{ + return it_state(it) == IAM_IT_SKEWED && it_at_rec(it); +} + +/* + * Helper wrapper around iam_it_get(): returns 0 (success) only when record + * with exactly the same key as asked is found. + */ +static int iam_it_get_exact(struct iam_iterator *it, const struct iam_key *k) +{ + int result; + + result = iam_it_get(it, k); + if (result > 0) + result = 0; + else if (result == 0) + /* + * Return -ENOENT if cursor is located above record with a key + * different from one specified, or in the empty leaf. + * + * XXX returning -ENOENT only works if iam_it_get() never + * returns -ENOENT as a legitimate error. 
+ */ + result = -ENOENT; + return result; +} + +void iam_container_write_lock(struct iam_container *ic) +{ + down_write(&ic->ic_sem); +} + +void iam_container_write_unlock(struct iam_container *ic) +{ + up_write(&ic->ic_sem); +} + +void iam_container_read_lock(struct iam_container *ic) +{ + down_read(&ic->ic_sem); +} + +void iam_container_read_unlock(struct iam_container *ic) +{ + up_read(&ic->ic_sem); +} + +/* + * Initialize iterator to IAM_IT_DETACHED state. + * + * postcondition: it_state(it) == IAM_IT_DETACHED + */ +int iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags, + struct iam_path_descr *pd) +{ + memset(it, 0, sizeof *it); + it->ii_flags = flags; + it->ii_state = IAM_IT_DETACHED; + iam_path_init(&it->ii_path, c, pd); + return 0; +} +EXPORT_SYMBOL(iam_it_init); + +/* + * Finalize iterator and release all resources. + * + * precondition: it_state(it) == IAM_IT_DETACHED + */ +void iam_it_fini(struct iam_iterator *it) +{ + assert_corr(it_state(it) == IAM_IT_DETACHED); + iam_path_fini(&it->ii_path); +} +EXPORT_SYMBOL(iam_it_fini); + +/* + * this locking primitives are used to protect parts + * of dir's htree. protection unit is block: leaf or index + */ +struct dynlock_handle *iam_lock_htree(struct inode *dir, unsigned long value, + enum dynlock_type lt) +{ + return dynlock_lock(&LDISKFS_I(dir)->i_htree_lock, value, lt, GFP_NOFS); +} + + + +int iam_index_lock(struct iam_path *path, struct dynlock_handle **lh) +{ + struct iam_frame *f; + + for (f = path->ip_frame; f >= path->ip_frames; --f, ++lh) { + do_corr(schedule()); + *lh = iam_lock_htree(iam_path_obj(path), f->curidx, DLT_READ); + if (*lh == NULL) + return -ENOMEM; + } + return 0; +} + +/* + * Fast check for frame consistency. + */ +static int iam_check_fast(struct iam_path *path, struct iam_frame *frame) +{ + struct iam_container *bag; + struct iam_entry *next; + struct iam_entry *last; + struct iam_entry *entries; + struct iam_entry *at; + + bag = path->ip_container; + at = frame->at; + entries = frame->entries; + last = iam_entry_shift(path, entries, dx_get_count(entries) - 1); + + if (unlikely(at > last)) + return -EAGAIN; + + if (unlikely(dx_get_block(path, at) != frame->leaf)) + return -EAGAIN; + + if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, at), + path->ip_ikey_target) > 0)) + return -EAGAIN; + + next = iam_entry_shift(path, at, +1); + if (next <= last) { + if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, next), + path->ip_ikey_target) <= 0)) + return -EAGAIN; + } + return 0; +} + +int dx_index_is_compat(struct iam_path *path) +{ + return iam_path_descr(path) == NULL; +} + +/* + * dx_find_position + * + * search position of specified hash in index + * + */ + +struct iam_entry *iam_find_position(struct iam_path *path, + struct iam_frame *frame) +{ + int count; + struct iam_entry *p; + struct iam_entry *q; + struct iam_entry *m; + + count = dx_get_count(frame->entries); + assert_corr(count && count <= dx_get_limit(frame->entries)); + p = iam_entry_shift(path, frame->entries, + dx_index_is_compat(path) ? 
1 : 2); + q = iam_entry_shift(path, frame->entries, count - 1); + while (p <= q) { + m = iam_entry_shift(path, p, iam_entry_diff(path, q, p) / 2); + if (iam_ikeycmp(path->ip_container, iam_ikey_at(path, m), + path->ip_ikey_target) > 0) + q = iam_entry_shift(path, m, -1); + else + p = iam_entry_shift(path, m, +1); + } + return iam_entry_shift(path, p, -1); +} + + + +static iam_ptr_t iam_find_ptr(struct iam_path *path, struct iam_frame *frame) +{ + return dx_get_block(path, iam_find_position(path, frame)); +} + +void iam_insert_key(struct iam_path *path, struct iam_frame *frame, + const struct iam_ikey *key, iam_ptr_t ptr) +{ + struct iam_entry *entries = frame->entries; + struct iam_entry *new = iam_entry_shift(path, frame->at, +1); + int count = dx_get_count(entries); + + /* + * Unfortunately we cannot assert this, as this function is sometimes + * called by VFS under i_sem and without pdirops lock. + */ + assert_corr(1 || iam_frame_is_locked(path, frame)); + assert_corr(count < dx_get_limit(entries)); + assert_corr(frame->at < iam_entry_shift(path, entries, count)); + assert_inv(dx_node_check(path, frame)); + + memmove(iam_entry_shift(path, new, 1), new, + (char *)iam_entry_shift(path, entries, count) - (char *)new); + dx_set_ikey(path, new, key); + dx_set_block(path, new, ptr); + dx_set_count(entries, count + 1); + assert_inv(dx_node_check(path, frame)); +} + +void iam_insert_key_lock(struct iam_path *path, struct iam_frame *frame, + const struct iam_ikey *key, iam_ptr_t ptr) +{ + iam_lock_bh(frame->bh); + iam_insert_key(path, frame, key, ptr); + iam_unlock_bh(frame->bh); +} +/* + * returns 0 if path was unchanged, -EAGAIN otherwise. + */ +static int iam_check_path(struct iam_path *path, struct iam_frame *frame) +{ + int equal; + + iam_lock_bh(frame->bh); + equal = iam_check_fast(path, frame) == 0 || + frame->leaf == iam_find_ptr(path, frame); + DX_DEVAL(iam_lock_stats.dls_bh_again += !equal); + iam_unlock_bh(frame->bh); + + return equal ? 0 : -EAGAIN; +} + +static int iam_lookup_try(struct iam_path *path) +{ + u32 ptr; + int err = 0; + int i; + + struct iam_descr *param; + struct iam_frame *frame; + struct iam_container *c; + + param = iam_path_descr(path); + c = path->ip_container; + + ptr = param->id_ops->id_root_ptr(c); + for (frame = path->ip_frames, i = 0; i <= path->ip_indirect; + ++frame, ++i) { + err = param->id_ops->id_node_read(c, (iam_ptr_t)ptr, NULL, + &frame->bh); + do_corr(schedule()); + + iam_lock_bh(frame->bh); + /* + * node must be initialized under bh lock because concurrent + * creation procedure may change it and iam_lookup_try() will + * see obsolete tree height. 
-bzzz + */ + if (err != 0) + break; + + if (LDISKFS_INVARIANT_ON) { + err = param->id_ops->id_node_check(path, frame); + if (err != 0) + break; + } + + err = param->id_ops->id_node_load(path, frame); + if (err != 0) + break; + + assert_inv(dx_node_check(path, frame)); + /* + * splitting may change root index block and move hash we're + * looking for into another index block so, we have to check + * this situation and repeat from begining if path got changed + * -bzzz + */ + if (i > 0) { + err = iam_check_path(path, frame - 1); + if (err != 0) + break; + } + + frame->at = iam_find_position(path, frame); + frame->curidx = ptr; + frame->leaf = ptr = dx_get_block(path, frame->at); + + iam_unlock_bh(frame->bh); + do_corr(schedule()); + } + if (err != 0) + iam_unlock_bh(frame->bh); + path->ip_frame = --frame; + return err; +} + +static int __iam_path_lookup(struct iam_path *path) +{ + int err; + int i; + + for (i = 0; i < DX_MAX_TREE_HEIGHT; ++ i) + assert(path->ip_frames[i].bh == NULL); + + do { + err = iam_lookup_try(path); + do_corr(schedule()); + if (err != 0) + iam_path_fini(path); + } while (err == -EAGAIN); + + return err; +} + +/* + * returns 0 if path was unchanged, -EAGAIN otherwise. + */ +static int iam_check_full_path(struct iam_path *path, int search) +{ + struct iam_frame *bottom; + struct iam_frame *scan; + int i; + int result; + + do_corr(schedule()); + + for (bottom = path->ip_frames, i = 0; + i < DX_MAX_TREE_HEIGHT && bottom->bh != NULL; ++bottom, ++i) { + ; /* find last filled in frame */ + } + + /* + * Lock frames, bottom to top. + */ + for (scan = bottom - 1; scan >= path->ip_frames; --scan) + iam_lock_bh(scan->bh); + /* + * Check them top to bottom. + */ + result = 0; + for (scan = path->ip_frames; scan < bottom; ++scan) { + struct iam_entry *pos; + + if (search) { + if (iam_check_fast(path, scan) == 0) + continue; + + pos = iam_find_position(path, scan); + if (scan->leaf != dx_get_block(path, pos)) { + result = -EAGAIN; + break; + } + scan->at = pos; + } else { + pos = iam_entry_shift(path, scan->entries, + dx_get_count(scan->entries) - 1); + if (scan->at > pos || + scan->leaf != dx_get_block(path, scan->at)) { + result = -EAGAIN; + break; + } + } + } + + /* + * Unlock top to bottom. + */ + for (scan = path->ip_frames; scan < bottom; ++scan) + iam_unlock_bh(scan->bh); + DX_DEVAL(iam_lock_stats.dls_bh_full_again += !!result); + do_corr(schedule()); + + return result; +} + + +/* + * Performs path lookup and returns with found leaf (if any) locked by htree + * lock. + */ +int iam_lookup_lock(struct iam_path *path, + struct dynlock_handle **dl, enum dynlock_type lt) +{ + int result; + struct inode *dir; + + dir = iam_path_obj(path); + while ((result = __iam_path_lookup(path)) == 0) { + do_corr(schedule()); + *dl = iam_lock_htree(dir, path->ip_frame->leaf, lt); + if (*dl == NULL) { + iam_path_fini(path); + result = -ENOMEM; + break; + } + do_corr(schedule()); + /* + * while locking leaf we just found may get split so we need + * to check this -bzzz + */ + if (iam_check_full_path(path, 1) == 0) + break; + iam_unlock_htree(dir, *dl); + *dl = NULL; + iam_path_fini(path); + } + return result; +} +/* + * Performs tree top-to-bottom traversal starting from root, and loads leaf + * node. 
+ */ +static int iam_path_lookup(struct iam_path *path, int index) +{ + struct iam_container *c; + struct iam_descr *descr; + struct iam_leaf *leaf; + int result; + + c = path->ip_container; + leaf = &path->ip_leaf; + descr = iam_path_descr(path); + result = iam_lookup_lock(path, &leaf->il_lock, DLT_WRITE); + assert_inv(iam_path_check(path)); + do_corr(schedule()); + if (result == 0) { + result = iam_leaf_load(path); + assert_inv(ergo(result == 0, iam_leaf_check(leaf))); + if (result == 0) { + do_corr(schedule()); + if (index) + result = iam_leaf_ops(leaf)-> + ilookup(leaf, path->ip_ikey_target); + else + result = iam_leaf_ops(leaf)-> + lookup(leaf, path->ip_key_target); + do_corr(schedule()); + } + if (result < 0) + iam_leaf_unlock(leaf); + } + return result; +} + +/* + * Common part of iam_it_{i,}get(). + */ +static int __iam_it_get(struct iam_iterator *it, int index) +{ + int result; + assert_corr(it_state(it) == IAM_IT_DETACHED); + + result = iam_path_lookup(&it->ii_path, index); + if (result >= 0) { + int collision; + + collision = result & IAM_LOOKUP_LAST; + switch (result & ~IAM_LOOKUP_LAST) { + case IAM_LOOKUP_EXACT: + result = +1; + it->ii_state = IAM_IT_ATTACHED; + break; + case IAM_LOOKUP_OK: + result = 0; + it->ii_state = IAM_IT_ATTACHED; + break; + case IAM_LOOKUP_BEFORE: + case IAM_LOOKUP_EMPTY: + result = 0; + it->ii_state = IAM_IT_SKEWED; + break; + default: + assert(0); + } + result |= collision; + } + /* + * See iam_it_get_exact() for explanation. + */ + assert_corr(result != -ENOENT); + return result; +} + +/* + * Correct hash, but not the same key was found, iterate through hash + * collision chain, looking for correct record. + */ +static int iam_it_collision(struct iam_iterator *it) +{ + int result; + + assert(ergo(it_at_rec(it), !it_keyeq(it, it->ii_path.ip_key_target))); + + while ((result = iam_it_next(it)) == 0) { + do_corr(schedule()); + if (it_ikeycmp(it, it->ii_path.ip_ikey_target) != 0) + return -ENOENT; + if (it_keyeq(it, it->ii_path.ip_key_target)) + return 0; + } + return result; +} + +/* + * Attach iterator. After successful completion, @it points to record with + * least key not larger than @k. + * + * Return value: 0: positioned on existing record, + * +ve: exact position found, + * -ve: error. + * + * precondition: it_state(it) == IAM_IT_DETACHED + * postcondition: ergo(result == 0 && it_state(it) == IAM_IT_ATTACHED, + * it_keycmp(it, k) <= 0) + */ +int iam_it_get(struct iam_iterator *it, const struct iam_key *k) +{ + int result; + assert_corr(it_state(it) == IAM_IT_DETACHED); + + it->ii_path.ip_ikey_target = NULL; + it->ii_path.ip_key_target = k; + + result = __iam_it_get(it, 0); + + if (result == IAM_LOOKUP_LAST) { + result = iam_it_collision(it); + if (result != 0) { + iam_it_put(it); + iam_it_fini(it); + result = __iam_it_get(it, 0); + } else + result = +1; + } + if (result > 0) + result &= ~IAM_LOOKUP_LAST; + + assert_corr(ergo(result > 0, it_keycmp(it, k) == 0)); + assert_corr(ergo(result == 0 && it_state(it) == IAM_IT_ATTACHED, + it_keycmp(it, k) <= 0)); + return result; +} +EXPORT_SYMBOL(iam_it_get); + +/* + * Attach iterator by index key. + */ +static int iam_it_iget(struct iam_iterator *it, const struct iam_ikey *k) +{ + assert_corr(it_state(it) == IAM_IT_DETACHED); + + it->ii_path.ip_ikey_target = k; + return __iam_it_get(it, 1) & ~IAM_LOOKUP_LAST; +} + +/* + * Attach iterator, and assure it points to the record (not skewed). + * + * Return value: 0: positioned on existing record, + * +ve: exact position found, + * -ve: error. 
+ * + * precondition: it_state(it) == IAM_IT_DETACHED && + * !(it->ii_flags&IAM_IT_WRITE) + * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED) + */ +int iam_it_get_at(struct iam_iterator *it, const struct iam_key *k) +{ + int result; + assert_corr(it_state(it) == IAM_IT_DETACHED && + !(it->ii_flags&IAM_IT_WRITE)); + result = iam_it_get(it, k); + if (result == 0) { + if (it_state(it) != IAM_IT_ATTACHED) { + assert_corr(it_state(it) == IAM_IT_SKEWED); + result = iam_it_next(it); + } + } + assert_corr(ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED)); + return result; +} +EXPORT_SYMBOL(iam_it_get_at); + +/* + * Duplicates iterator. + * + * postcondition: it_state(dst) == it_state(src) && + * iam_it_container(dst) == iam_it_container(src) && + * dst->ii_flags = src->ii_flags && + * ergo(it_state(src) == IAM_IT_ATTACHED, + * iam_it_rec_get(dst) == iam_it_rec_get(src) && + * iam_it_key_get(dst) == iam_it_key_get(src)) + */ +void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src) +{ + dst->ii_flags = src->ii_flags; + dst->ii_state = src->ii_state; + /* XXX not yet. iam_path_dup(&dst->ii_path, &src->ii_path); */ + /* + * XXX: duplicate lock. + */ + assert_corr(it_state(dst) == it_state(src)); + assert_corr(iam_it_container(dst) == iam_it_container(src)); + assert_corr(dst->ii_flags = src->ii_flags); + assert_corr(ergo(it_state(src) == IAM_IT_ATTACHED, + iam_it_rec_get(dst) == iam_it_rec_get(src) && + iam_it_key_get(dst) == iam_it_key_get(src))); + +} + +/* + * Detach iterator. Does nothing it detached state. + * + * postcondition: it_state(it) == IAM_IT_DETACHED + */ +void iam_it_put(struct iam_iterator *it) +{ + if (it->ii_state != IAM_IT_DETACHED) { + it->ii_state = IAM_IT_DETACHED; + iam_leaf_fini(&it->ii_path.ip_leaf); + } +} +EXPORT_SYMBOL(iam_it_put); + +static struct iam_ikey *iam_it_ikey_get(const struct iam_iterator *it, + struct iam_ikey *ikey); + + +/* + * This function increments the frame pointer to search the next leaf + * block, and reads in the necessary intervening nodes if the search + * should be necessary. Whether or not the search is necessary is + * controlled by the hash parameter. If the hash value is even, then + * the search is only continued if the next block starts with that + * hash value. This is used if we are searching for a specific file. + * + * If the hash value is HASH_NB_ALWAYS, then always go to the next block. + * + * This function returns 1 if the caller should continue to search, + * or 0 if it should not. If there is an error reading one of the + * index blocks, it will a negative error code. + * + * If start_hash is non-null, it will be filled in with the starting + * hash of the next page. + */ +static int iam_htree_advance(struct inode *dir, __u32 hash, + struct iam_path *path, __u32 *start_hash, + int compat) +{ + struct iam_frame *p; + struct buffer_head *bh; + int err, num_frames = 0; + __u32 bhash; + + p = path->ip_frame; + /* + * Find the next leaf page by incrementing the frame pointer. + * If we run out of entries in the interior node, loop around and + * increment pointer in the parent node. When we break out of + * this loop, num_frames indicates the number of interior + * nodes need to be read. 
+ */ + while (1) { + do_corr(schedule()); + iam_lock_bh(p->bh); + p->at = iam_entry_shift(path, p->at, +1); + if (p->at < iam_entry_shift(path, p->entries, + dx_get_count(p->entries))) { + p->leaf = dx_get_block(path, p->at); + iam_unlock_bh(p->bh); + break; + } + iam_unlock_bh(p->bh); + if (p == path->ip_frames) + return 0; + num_frames++; + --p; + } + + if (compat) { + /* + * Htree hash magic. + */ + /* + * If the hash is 1, then continue only if the next page has a + * continuation hash of any value. This is used for readdir + * handling. Otherwise, check to see if the hash matches the + * desired contiuation hash. If it doesn't, return since + * there's no point to read in the successive index pages. + */ + dx_get_ikey(path, p->at, (struct iam_ikey *)&bhash); + if (start_hash) + *start_hash = bhash; + if ((hash & 1) == 0) { + if ((bhash & ~1) != hash) + return 0; + } + } + /* + * If the hash is HASH_NB_ALWAYS, we always go to the next + * block so no check is necessary + */ + while (num_frames--) { + iam_ptr_t idx; + + do_corr(schedule()); + iam_lock_bh(p->bh); + idx = p->leaf = dx_get_block(path, p->at); + iam_unlock_bh(p->bh); + err = iam_path_descr(path)->id_ops-> + id_node_read(path->ip_container, idx, NULL, &bh); + if (err != 0) + return err; /* Failure */ + ++p; + brelse(p->bh); + assert_corr(p->bh != bh); + p->bh = bh; + p->entries = dx_node_get_entries(path, p); + p->at = iam_entry_shift(path, p->entries, !compat); + assert_corr(p->curidx != idx); + p->curidx = idx; + iam_lock_bh(p->bh); + assert_corr(p->leaf != dx_get_block(path, p->at)); + p->leaf = dx_get_block(path, p->at); + iam_unlock_bh(p->bh); + assert_inv(dx_node_check(path, p)); + } + return 1; +} + + +static inline int iam_index_advance(struct iam_path *path) +{ + return iam_htree_advance(iam_path_obj(path), 0, path, NULL, 0); +} + +static void iam_unlock_array(struct inode *dir, struct dynlock_handle **lh) +{ + int i; + + for (i = 0; i < DX_MAX_TREE_HEIGHT; ++i, ++lh) { + if (*lh != NULL) { + iam_unlock_htree(dir, *lh); + *lh = NULL; + } + } +} +/* + * Advance index part of @path to point to the next leaf. Returns 1 on + * success, 0, when end of container was reached. Leaf node is locked. + */ +int iam_index_next(struct iam_container *c, struct iam_path *path) +{ + iam_ptr_t cursor; + struct dynlock_handle *lh[DX_MAX_TREE_HEIGHT] = { 0, }; + int result; + struct inode *object; + + /* + * Locking for iam_index_next()... is to be described. 
+ */ + + object = c->ic_object; + cursor = path->ip_frame->leaf; + + while (1) { + result = iam_index_lock(path, lh); + do_corr(schedule()); + if (result < 0) + break; + + result = iam_check_full_path(path, 0); + if (result == 0 && cursor == path->ip_frame->leaf) { + result = iam_index_advance(path); + + assert_corr(result == 0 || + cursor != path->ip_frame->leaf); + break; + } + do { + iam_unlock_array(object, lh); + + iam_path_release(path); + do_corr(schedule()); + + result = __iam_path_lookup(path); + if (result < 0) + break; + + while (path->ip_frame->leaf != cursor) { + do_corr(schedule()); + + result = iam_index_lock(path, lh); + do_corr(schedule()); + if (result < 0) + break; + + result = iam_check_full_path(path, 0); + if (result != 0) + break; + + result = iam_index_advance(path); + if (result == 0) { + CERROR("cannot find cursor : %u\n", + cursor); + result = -EIO; + } + if (result < 0) + break; + result = iam_check_full_path(path, 0); + if (result != 0) + break; + iam_unlock_array(object, lh); + } + } while (result == -EAGAIN); + if (result < 0) + break; + } + iam_unlock_array(object, lh); + return result; +} + +/* + * Move iterator one record right. + * + * Return value: 0: success, + * +1: end of container reached + * -ve: error + * + * precondition: (it_state(it) == IAM_IT_ATTACHED || + * it_state(it) == IAM_IT_SKEWED) && it->ii_flags&IAM_IT_MOVE + * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED) && + * ergo(result > 0, it_state(it) == IAM_IT_DETACHED) + */ +int iam_it_next(struct iam_iterator *it) +{ + int result; + struct iam_path *path; + struct iam_leaf *leaf; + struct inode *obj; + do_corr(struct iam_ikey *ik_orig); + + /* assert_corr(it->ii_flags&IAM_IT_MOVE); */ + assert_corr(it_state(it) == IAM_IT_ATTACHED || + it_state(it) == IAM_IT_SKEWED); + + path = &it->ii_path; + leaf = &path->ip_leaf; + obj = iam_path_obj(path); + + assert_corr(iam_leaf_is_locked(leaf)); + + result = 0; + do_corr(ik_orig = it_at_rec(it) ? + iam_it_ikey_get(it, iam_path_ikey(path, 2)) : NULL); + if (it_before(it)) { + assert_corr(!iam_leaf_at_end(leaf)); + it->ii_state = IAM_IT_ATTACHED; + } else { + if (!iam_leaf_at_end(leaf)) + /* advance within leaf node */ + iam_leaf_next(leaf); + /* + * multiple iterations may be necessary due to empty leaves. + */ + while (result == 0 && iam_leaf_at_end(leaf)) { + do_corr(schedule()); + /* advance index portion of the path */ + result = iam_index_next(iam_it_container(it), path); + assert_corr(iam_leaf_is_locked(leaf)); + if (result == 1) { + struct dynlock_handle *lh; + lh = iam_lock_htree(obj, path->ip_frame->leaf, + DLT_WRITE); + if (lh != NULL) { + iam_leaf_fini(leaf); + leaf->il_lock = lh; + result = iam_leaf_load(path); + if (result == 0) + iam_leaf_start(leaf); + } else + result = -ENOMEM; + } else if (result == 0) + /* end of container reached */ + result = +1; + if (result != 0) + iam_it_put(it); + } + if (result == 0) + it->ii_state = IAM_IT_ATTACHED; + } + assert_corr(ergo(result == 0, it_state(it) == IAM_IT_ATTACHED)); + assert_corr(ergo(result > 0, it_state(it) == IAM_IT_DETACHED)); + assert_corr(ergo(result == 0 && ik_orig != NULL, + it_ikeycmp(it, ik_orig) >= 0)); + return result; +} +EXPORT_SYMBOL(iam_it_next); + +/* + * Return pointer to the record under iterator. 
+ * + * precondition: it_state(it) == IAM_IT_ATTACHED && it_at_rec(it) + * postcondition: it_state(it) == IAM_IT_ATTACHED + */ +struct iam_rec *iam_it_rec_get(const struct iam_iterator *it) +{ + assert_corr(it_state(it) == IAM_IT_ATTACHED); + assert_corr(it_at_rec(it)); + return iam_leaf_rec(&it->ii_path.ip_leaf); +} +EXPORT_SYMBOL(iam_it_rec_get); + +static void iam_it_reccpy(struct iam_iterator *it, const struct iam_rec *r) +{ + struct iam_leaf *folio; + + folio = &it->ii_path.ip_leaf; + iam_leaf_ops(folio)->rec_set(folio, r); +} + +/* + * Replace contents of record under iterator. + * + * precondition: it_state(it) == IAM_IT_ATTACHED && + * it->ii_flags&IAM_IT_WRITE + * postcondition: it_state(it) == IAM_IT_ATTACHED && + * ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...)) + */ +int iam_it_rec_set(handle_t *h, + struct iam_iterator *it, const struct iam_rec *r) +{ + int result; + struct iam_path *path; + struct buffer_head *bh; + + assert_corr(it_state(it) == IAM_IT_ATTACHED && + it->ii_flags&IAM_IT_WRITE); + assert_corr(it_at_rec(it)); + + path = &it->ii_path; + bh = path->ip_leaf.il_bh; + result = iam_txn_add(h, path, bh); + if (result == 0) { + iam_it_reccpy(it, r); + result = iam_txn_dirty(h, path, bh); + } + return result; +} +EXPORT_SYMBOL(iam_it_rec_set); + +/* + * Return pointer to the index key under iterator. + * + * precondition: it_state(it) == IAM_IT_ATTACHED || + * it_state(it) == IAM_IT_SKEWED + */ +static struct iam_ikey *iam_it_ikey_get(const struct iam_iterator *it, + struct iam_ikey *ikey) +{ + assert_corr(it_state(it) == IAM_IT_ATTACHED || + it_state(it) == IAM_IT_SKEWED); + assert_corr(it_at_rec(it)); + return iam_leaf_ikey(&it->ii_path.ip_leaf, ikey); +} + +/* + * Return pointer to the key under iterator. + * + * precondition: it_state(it) == IAM_IT_ATTACHED || + * it_state(it) == IAM_IT_SKEWED + */ +struct iam_key *iam_it_key_get(const struct iam_iterator *it) +{ + assert_corr(it_state(it) == IAM_IT_ATTACHED || + it_state(it) == IAM_IT_SKEWED); + assert_corr(it_at_rec(it)); + return iam_leaf_key(&it->ii_path.ip_leaf); +} +EXPORT_SYMBOL(iam_it_key_get); + +/* + * Return size of key under iterator (in bytes) + * + * precondition: it_state(it) == IAM_IT_ATTACHED || + * it_state(it) == IAM_IT_SKEWED + */ +int iam_it_key_size(const struct iam_iterator *it) +{ + assert_corr(it_state(it) == IAM_IT_ATTACHED || + it_state(it) == IAM_IT_SKEWED); + assert_corr(it_at_rec(it)); + return iam_leaf_key_size(&it->ii_path.ip_leaf); +} +EXPORT_SYMBOL(iam_it_key_size); + +/* + * Insertion of new record. Interaction with jbd during non-trivial case (when + * split happens) is as following: + * + * - new leaf node is involved into transaction by ldiskfs_append(); + * + * - old leaf node is involved into transaction by iam_add_rec(); + * + * - leaf where insertion point ends in, is marked dirty by iam_add_rec(); + * + * - leaf without insertion point is marked dirty (as @new_leaf) by + * iam_new_leaf(); + * + * - split index nodes are involved into transaction and marked dirty by + * split_index_node(). + * + * - "safe" index node, which is no split, but where new pointer is inserted + * is involved into transaction and marked dirty by split_index_node(). + * + * - index node where pointer to new leaf is inserted is involved into + * transaction by split_index_node() and marked dirty by iam_add_rec(). + * + * - inode is marked dirty by iam_add_rec(). 
+ * + */ + +static int iam_new_leaf(handle_t *handle, struct iam_leaf *leaf) +{ + int err; + iam_ptr_t blknr; + struct buffer_head *new_leaf; + struct buffer_head *old_leaf; + struct iam_container *c; + struct inode *obj; + struct iam_path *path; + + assert_inv(iam_leaf_check(leaf)); + + c = iam_leaf_container(leaf); + path = leaf->il_path; + + obj = c->ic_object; + new_leaf = ldiskfs_append(handle, obj, (__u32 *)&blknr, &err); + do_corr(schedule()); + if (new_leaf != NULL) { + struct dynlock_handle *lh; + + lh = iam_lock_htree(obj, blknr, DLT_WRITE); + do_corr(schedule()); + if (lh != NULL) { + iam_leaf_ops(leaf)->init_new(c, new_leaf); + do_corr(schedule()); + old_leaf = leaf->il_bh; + iam_leaf_split(leaf, &new_leaf, blknr); + if (old_leaf != leaf->il_bh) { + /* + * Switched to the new leaf. + */ + iam_leaf_unlock(leaf); + leaf->il_lock = lh; + path->ip_frame->leaf = blknr; + } else + iam_unlock_htree(obj, lh); + do_corr(schedule()); + err = iam_txn_dirty(handle, path, new_leaf); + brelse(new_leaf); + if (err == 0) + err = ldiskfs_mark_inode_dirty(handle, obj); + do_corr(schedule()); + } else + err = -ENOMEM; + } + assert_inv(iam_leaf_check(leaf)); + assert_inv(iam_leaf_check(&iam_leaf_path(leaf)->ip_leaf)); + assert_inv(iam_path_check(iam_leaf_path(leaf))); + return err; +} + +static inline void dx_set_limit(struct iam_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); +} + +static int iam_shift_entries(struct iam_path *path, + struct iam_frame *frame, unsigned count, + struct iam_entry *entries, struct iam_entry *entries2, + u32 newblock) +{ + unsigned count1; + unsigned count2; + int delta; + + struct iam_frame *parent = frame - 1; + struct iam_ikey *pivot = iam_path_ikey(path, 3); + + delta = dx_index_is_compat(path) ? 0 : +1; + + count1 = count/2 + delta; + count2 = count - count1; + dx_get_ikey(path, iam_entry_shift(path, entries, count1), pivot); + + dxtrace(printk("Split index %i/%i\n", count1, count2)); + + memcpy((char *) iam_entry_shift(path, entries2, delta), + (char *) iam_entry_shift(path, entries, count1), + count2 * iam_entry_size(path)); + + dx_set_count(entries2, count2 + delta); + dx_set_limit(entries2, dx_node_limit(path)); + + /* + * NOTE: very subtle piece of code competing dx_probe() may find 2nd + * level index in root index, then we insert new index here and set + * new count in that 2nd level index. so, dx_probe() may see 2nd level + * index w/o hash it looks for. the solution is to check root index + * after we locked just founded 2nd level index -bzzz + */ + iam_insert_key_lock(path, parent, pivot, newblock); + + /* + * now old and new 2nd level index blocks contain all pointers, so + * dx_probe() may find it in the both. it's OK -bzzz + */ + iam_lock_bh(frame->bh); + dx_set_count(entries, count1); + iam_unlock_bh(frame->bh); + + /* + * now old 2nd level index block points to first half of leafs. 
it's + * importand that dx_probe() must check root index block for changes + * under dx_lock_bh(frame->bh) -bzzz + */ + + return count1; +} + + +int split_index_node(handle_t *handle, struct iam_path *path, + struct dynlock_handle **lh) +{ + + struct iam_entry *entries; /* old block contents */ + struct iam_entry *entries2; /* new block contents */ + struct iam_frame *frame, *safe; + struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0}; + u32 newblock[DX_MAX_TREE_HEIGHT] = {0}; + struct dynlock_handle *lock[DX_MAX_TREE_HEIGHT] = {NULL,}; + struct dynlock_handle *new_lock[DX_MAX_TREE_HEIGHT] = {NULL,}; + struct inode *dir = iam_path_obj(path); + struct iam_descr *descr; + int nr_splet; + int i, err; + + descr = iam_path_descr(path); + /* + * Algorithm below depends on this. + */ + assert_corr(dx_root_limit(path) < dx_node_limit(path)); + + frame = path->ip_frame; + entries = frame->entries; + + /* + * Tall-tree handling: we might have to split multiple index blocks + * all the way up to tree root. Tricky point here is error handling: + * to avoid complicated undo/rollback we + * + * - first allocate all necessary blocks + * + * - insert pointers into them atomically. + */ + + /* + * Locking: leaf is already locked. htree-locks are acquired on all + * index nodes that require split bottom-to-top, on the "safe" node, + * and on all new nodes + */ + + dxtrace(printk("using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); + + /* What levels need split? */ + for (nr_splet = 0; frame >= path->ip_frames && + dx_get_count(frame->entries) == dx_get_limit(frame->entries); + --frame, ++nr_splet) { + do_corr(schedule()); + if (nr_splet == DX_MAX_TREE_HEIGHT) { + /* + CWARN(dir->i_sb, __FUNCTION__, + "Directory index full!\n"); + */ + err = -ENOSPC; + goto cleanup; + } + } + + safe = frame; + + /* + * Lock all nodes, bottom to top. + */ + for (frame = path->ip_frame, i = nr_splet; i >= 0; --i, --frame) { + do_corr(schedule()); + lock[i] = iam_lock_htree(dir, frame->curidx, DLT_WRITE); + if (lock[i] == NULL) { + err = -ENOMEM; + goto cleanup; + } + } + + /* + * Check for concurrent index modification. + */ + err = iam_check_full_path(path, 1); + if (err) + goto cleanup; + /* + * And check that the same number of nodes is to be split. + */ + for (i = 0, frame = path->ip_frame; frame >= path->ip_frames && + dx_get_count(frame->entries) == dx_get_limit(frame->entries); + --frame, ++i) { + ; + } + if (i != nr_splet) { + err = -EAGAIN; + goto cleanup; + } + + /* Go back down, allocating blocks, locking them, and adding into + * transaction... 
*/ + for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) { + bh_new[i] = ldiskfs_append (handle, dir, &newblock[i], &err); + do_corr(schedule()); + if (!bh_new[i] || + descr->id_ops->id_node_init(path->ip_container, + bh_new[i], 0) != 0) + goto cleanup; + new_lock[i] = iam_lock_htree(dir, newblock[i], DLT_WRITE); + if (new_lock[i] == NULL) { + err = -ENOMEM; + goto cleanup; + } + do_corr(schedule()); + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ldiskfs_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + } + /* Add "safe" node to transaction too */ + if (safe + 1 != path->ip_frames) { + do_corr(schedule()); + err = ldiskfs_journal_get_write_access(handle, safe->bh); + if (err) + goto journal_error; + } + + /* Go through nodes once more, inserting pointers */ + for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) { + unsigned count; + int idx; + struct buffer_head *bh2; + struct buffer_head *bh; + + entries = frame->entries; + count = dx_get_count(entries); + idx = iam_entry_diff(path, frame->at, entries); + + bh2 = bh_new[i]; + entries2 = dx_get_entries(path, bh2->b_data, 0); + + bh = frame->bh; + if (frame == path->ip_frames) { + /* splitting root node. Tricky point: + * + * In the "normal" B-tree we'd split root *and* add + * new root to the tree with pointers to the old root + * and its sibling (thus introducing two new nodes). + * + * In htree it's enough to add one node, because + * capacity of the root node is smaller than that of + * non-root one. + */ + struct iam_frame *frames; + struct iam_entry *next; + + assert_corr(i == 0); + + do_corr(schedule()); + + frames = path->ip_frames; + memcpy((char *) entries2, (char *) entries, + count * iam_entry_size(path)); + dx_set_limit(entries2, dx_node_limit(path)); + + /* Set up root */ + iam_lock_bh(frame->bh); + next = descr->id_ops->id_root_inc(path->ip_container, + path, frame); + dx_set_block(path, next, newblock[0]); + iam_unlock_bh(frame->bh); + + do_corr(schedule()); + /* Shift frames in the path */ + memmove(frames + 2, frames + 1, + (sizeof path->ip_frames) - 2 * sizeof frames[0]); + /* Add new access path frame */ + frames[1].at = iam_entry_shift(path, entries2, idx); + frames[1].entries = entries = entries2; + frames[1].bh = bh2; + assert_inv(dx_node_check(path, frame)); + ++ path->ip_frame; + ++ frame; + assert_inv(dx_node_check(path, frame)); + bh_new[0] = NULL; /* buffer head is "consumed" */ + err = ldiskfs_journal_get_write_access(handle, bh2); + if (err) + goto journal_error; + do_corr(schedule()); + } else { + /* splitting non-root index node. */ + struct iam_frame *parent = frame - 1; + + do_corr(schedule()); + count = iam_shift_entries(path, frame, count, + entries, entries2, newblock[i]); + /* Which index block gets the new entry? */ + if (idx >= count) { + int d = dx_index_is_compat(path) ? 
0 : +1; + + frame->at = iam_entry_shift(path, entries2, + idx - count + d); + frame->entries = entries = entries2; + frame->curidx = newblock[i]; + swap(frame->bh, bh2); + assert_corr(lock[i + 1] != NULL); + assert_corr(new_lock[i] != NULL); + swap(lock[i + 1], new_lock[i]); + bh_new[i] = bh2; + parent->at = iam_entry_shift(path, + parent->at, +1); + } + assert_inv(dx_node_check(path, frame)); + assert_inv(dx_node_check(path, parent)); + dxtrace(dx_show_index ("node", frame->entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); + err = ldiskfs_journal_dirty_metadata(handle, bh2); + if (err) + goto journal_error; + do_corr(schedule()); + err = ldiskfs_journal_dirty_metadata(handle, parent->bh); + if (err) + goto journal_error; + } + do_corr(schedule()); + err = ldiskfs_journal_dirty_metadata(handle, bh); + if (err) + goto journal_error; + } + /* + * This function was called to make insertion of new leaf + * possible. Check that it fulfilled its obligations. + */ + assert_corr(dx_get_count(path->ip_frame->entries) < + dx_get_limit(path->ip_frame->entries)); + assert_corr(lock[nr_splet] != NULL); + *lh = lock[nr_splet]; + lock[nr_splet] = NULL; + if (nr_splet > 0) { + /* + * Log ->i_size modification. + */ + err = ldiskfs_mark_inode_dirty(handle, dir); + if (err) + goto journal_error; + } + goto cleanup; +journal_error: + ldiskfs_std_error(dir->i_sb, err); + +cleanup: + iam_unlock_array(dir, lock); + iam_unlock_array(dir, new_lock); + + assert_corr(err || iam_frame_is_locked(path, path->ip_frame)); + + do_corr(schedule()); + for (i = 0; i < ARRAY_SIZE(bh_new); ++i) { + if (bh_new[i] != NULL) + brelse(bh_new[i]); + } + return err; +} + +static int iam_add_rec(handle_t *handle, struct iam_iterator *it, + struct iam_path *path, + const struct iam_key *k, const struct iam_rec *r) +{ + int err; + struct iam_leaf *leaf; + + leaf = &path->ip_leaf; + assert_inv(iam_leaf_check(leaf)); + assert_inv(iam_path_check(path)); + err = iam_txn_add(handle, path, leaf->il_bh); + if (err == 0) { + do_corr(schedule()); + if (!iam_leaf_can_add(leaf, k, r)) { + struct dynlock_handle *lh = NULL; + + do { + assert_corr(lh == NULL); + do_corr(schedule()); + err = split_index_node(handle, path, &lh); + if (err == -EAGAIN) { + assert_corr(lh == NULL); + + iam_path_fini(path); + it->ii_state = IAM_IT_DETACHED; + + do_corr(schedule()); + err = iam_it_get_exact(it, k); + if (err == -ENOENT) + err = +1; /* repeat split */ + else if (err == 0) + err = -EEXIST; + } + } while (err > 0); + assert_inv(iam_path_check(path)); + if (err == 0) { + assert_corr(lh != NULL); + do_corr(schedule()); + err = iam_new_leaf(handle, leaf); + if (err == 0) + err = iam_txn_dirty(handle, path, + path->ip_frame->bh); + } + iam_unlock_htree(iam_path_obj(path), lh); + do_corr(schedule()); + } + if (err == 0) { + iam_leaf_rec_add(leaf, k, r); + err = iam_txn_dirty(handle, path, leaf->il_bh); + } + } + assert_inv(iam_leaf_check(leaf)); + assert_inv(iam_leaf_check(&path->ip_leaf)); + assert_inv(iam_path_check(path)); + return err; +} + +/* + * Insert new record with key @k and contents from @r, shifting records to the + * right. On success, iterator is positioned on the newly inserted record. 
+ * + * precondition: it->ii_flags&IAM_IT_WRITE && + * (it_state(it) == IAM_IT_ATTACHED || + * it_state(it) == IAM_IT_SKEWED) && + * ergo(it_state(it) == IAM_IT_ATTACHED, + * it_keycmp(it, k) <= 0) && + * ergo(it_before(it), it_keycmp(it, k) > 0)); + * postcondition: ergo(result == 0, + * it_state(it) == IAM_IT_ATTACHED && + * it_keycmp(it, k) == 0 && + * !memcmp(iam_it_rec_get(it), r, ...)) + */ +int iam_it_rec_insert(handle_t *h, struct iam_iterator *it, + const struct iam_key *k, const struct iam_rec *r) +{ + int result; + struct iam_path *path; + + path = &it->ii_path; + + assert_corr(it->ii_flags&IAM_IT_WRITE); + assert_corr(it_state(it) == IAM_IT_ATTACHED || + it_state(it) == IAM_IT_SKEWED); + assert_corr(ergo(it_state(it) == IAM_IT_ATTACHED, + it_keycmp(it, k) <= 0)); + assert_corr(ergo(it_before(it), it_keycmp(it, k) > 0)); + result = iam_add_rec(h, it, path, k, r); + if (result == 0) + it->ii_state = IAM_IT_ATTACHED; + assert_corr(ergo(result == 0, + it_state(it) == IAM_IT_ATTACHED && + it_keycmp(it, k) == 0)); + return result; +} +EXPORT_SYMBOL(iam_it_rec_insert); + +/* + * Delete record under iterator. + * + * precondition: it_state(it) == IAM_IT_ATTACHED && + * it->ii_flags&IAM_IT_WRITE && + * it_at_rec(it) + * postcondition: it_state(it) == IAM_IT_ATTACHED || + * it_state(it) == IAM_IT_DETACHED + */ +int iam_it_rec_delete(handle_t *h, struct iam_iterator *it) +{ + int result; + struct iam_leaf *leaf; + struct iam_path *path; + + assert_corr(it_state(it) == IAM_IT_ATTACHED && + it->ii_flags&IAM_IT_WRITE); + assert_corr(it_at_rec(it)); + + path = &it->ii_path; + leaf = &path->ip_leaf; + + assert_inv(iam_leaf_check(leaf)); + assert_inv(iam_path_check(path)); + + result = iam_txn_add(h, path, leaf->il_bh); + /* + * no compaction for now. + */ + if (result == 0) { + iam_rec_del(leaf, it->ii_flags&IAM_IT_MOVE); + result = iam_txn_dirty(h, path, leaf->il_bh); + if (result == 0 && iam_leaf_at_end(leaf) && + it->ii_flags&IAM_IT_MOVE) { + result = iam_it_next(it); + if (result > 0) + result = 0; + } + } + assert_inv(iam_leaf_check(leaf)); + assert_inv(iam_path_check(path)); + assert_corr(it_state(it) == IAM_IT_ATTACHED || + it_state(it) == IAM_IT_DETACHED); + return result; +} +EXPORT_SYMBOL(iam_it_rec_delete); + +/* + * Convert iterator to cookie. + * + * precondition: it_state(it) == IAM_IT_ATTACHED && + * iam_path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t) + * postcondition: it_state(it) == IAM_IT_ATTACHED + */ +iam_pos_t iam_it_store(const struct iam_iterator *it) +{ + iam_pos_t result; + + assert_corr(it_state(it) == IAM_IT_ATTACHED); + assert_corr(it_at_rec(it)); + assert_corr(iam_it_container(it)->ic_descr->id_ikey_size <= + sizeof result); + + result = 0; + return *(iam_pos_t *)iam_it_ikey_get(it, (void *)&result); +} +EXPORT_SYMBOL(iam_it_store); + +/* + * Restore iterator from cookie. 
+ * + * precondition: it_state(it) == IAM_IT_DETACHED && it->ii_flags&IAM_IT_MOVE && + * iam_path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t) + * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED && + * iam_it_store(it) == pos) + */ +int iam_it_load(struct iam_iterator *it, iam_pos_t pos) +{ + assert_corr(it_state(it) == IAM_IT_DETACHED && + it->ii_flags&IAM_IT_MOVE); + assert_corr(iam_it_container(it)->ic_descr->id_ikey_size <= sizeof pos); + return iam_it_iget(it, (struct iam_ikey *)&pos); +} +EXPORT_SYMBOL(iam_it_load); + +/***********************************************************************/ +/* invariants */ +/***********************************************************************/ + +static inline int ptr_inside(void *base, size_t size, void *ptr) +{ + return (base <= ptr) && (ptr < base + size); +} + +int iam_frame_invariant(struct iam_frame *f) +{ + return + (f->bh != NULL && + f->bh->b_data != NULL && + ptr_inside(f->bh->b_data, f->bh->b_size, f->entries) && + ptr_inside(f->bh->b_data, f->bh->b_size, f->at) && + f->entries <= f->at); +} +int iam_leaf_invariant(struct iam_leaf *l) +{ + return + l->il_bh != NULL && + l->il_bh->b_data != NULL && + ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_entries) && + ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_at) && + l->il_entries <= l->il_at; +} + +int iam_path_invariant(struct iam_path *p) +{ + int i; + + if (p->ip_container == NULL || + p->ip_indirect < 0 || p->ip_indirect > DX_MAX_TREE_HEIGHT - 1 || + p->ip_frame != p->ip_frames + p->ip_indirect || + !iam_leaf_invariant(&p->ip_leaf)) + return 0; + for (i = 0; i < ARRAY_SIZE(p->ip_frames); ++i) { + if (i <= p->ip_indirect) { + if (!iam_frame_invariant(&p->ip_frames[i])) + return 0; + } + } + return 1; +} + +int iam_it_invariant(struct iam_iterator *it) +{ + return + (it->ii_state == IAM_IT_DETACHED || + it->ii_state == IAM_IT_ATTACHED || + it->ii_state == IAM_IT_SKEWED) && + !(it->ii_flags & ~(IAM_IT_MOVE | IAM_IT_WRITE)) && + ergo(it->ii_state == IAM_IT_ATTACHED || + it->ii_state == IAM_IT_SKEWED, + iam_path_invariant(&it->ii_path) && + equi(it_at_rec(it), it->ii_state == IAM_IT_SKEWED)); +} + +/* + * Search container @c for record with key @k. If record is found, its data + * are moved into @r. + * + * Return values: 0: found, -ENOENT: not-found, -ve: error + */ +int iam_lookup(struct iam_container *c, const struct iam_key *k, + struct iam_rec *r, struct iam_path_descr *pd) +{ + struct iam_iterator it; + int result; + + iam_it_init(&it, c, 0, pd); + + result = iam_it_get_exact(&it, k); + if (result == 0) + /* + * record with required key found, copy it into user buffer + */ + iam_reccpy(&it.ii_path.ip_leaf, r); + iam_it_put(&it); + iam_it_fini(&it); + return result; +} +EXPORT_SYMBOL(iam_lookup); + +/* + * Insert new record @r with key @k into container @c (within context of + * transaction @h). + * + * Return values: 0: success, -ve: error, including -EEXIST when record with + * given key is already present. 
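 *
 * Caller-side sketch (illustrative; @area is a caller-provided buffer of
 * at least DX_IPD_MAX_SIZE bytes and @h a journal handle obtained by the
 * caller):
 *
 *     struct iam_path_descr *ipd;
 *
 *     ipd = c->ic_descr->id_ops->id_ipd_alloc(c, area);
 *     result = iam_insert(h, c, k, r, ipd);
 *     c->ic_descr->id_ops->id_ipd_free(ipd);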
+ * + * postcondition: ergo(result == 0 || result == -EEXIST, + * iam_lookup(c, k, r2) > 0; + */ +int iam_insert(handle_t *h, struct iam_container *c, const struct iam_key *k, + const struct iam_rec *r, struct iam_path_descr *pd) +{ + struct iam_iterator it; + int result; + + iam_it_init(&it, c, IAM_IT_WRITE, pd); + + result = iam_it_get_exact(&it, k); + if (result == -ENOENT) + result = iam_it_rec_insert(h, &it, k, r); + else if (result == 0) + result = -EEXIST; + iam_it_put(&it); + iam_it_fini(&it); + return result; +} +EXPORT_SYMBOL(iam_insert); + +/* + * Update record with the key @k in container @c (within context of + * transaction @h), new record is given by @r. + * + * Return values: 0: success, -ve: error, including -ENOENT if no record with + * the given key found. + */ +int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k, + const struct iam_rec *r, struct iam_path_descr *pd) +{ + struct iam_iterator it; + int result; + + iam_it_init(&it, c, IAM_IT_WRITE, pd); + + result = iam_it_get_exact(&it, k); + if (result == 0) + iam_it_rec_set(h, &it, r); + iam_it_put(&it); + iam_it_fini(&it); + return result; +} +EXPORT_SYMBOL(iam_update); + +/* + * Delete existing record with key @k. + * + * Return values: 0: success, -ENOENT: not-found, -ve: other error. + * + * postcondition: ergo(result == 0 || result == -ENOENT, + * !iam_lookup(c, k, *)); + */ +int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k, + struct iam_path_descr *pd) +{ + struct iam_iterator it; + int result; + + iam_it_init(&it, c, IAM_IT_WRITE, pd); + + result = iam_it_get_exact(&it, k); + if (result == 0) + iam_it_rec_delete(h, &it); + iam_it_put(&it); + iam_it_fini(&it); + return result; +} +EXPORT_SYMBOL(iam_delete); + +int iam_root_limit(int rootgap, int blocksize, int size) +{ + int limit; + int nlimit; + + limit = (blocksize - rootgap) / size; + nlimit = blocksize / size; + if (limit == nlimit) + limit--; + return limit; +} diff --git a/lustre/osd/osd_iam.h b/lustre/osd/osd_iam.h new file mode 100644 index 0000000..b7c53cb --- /dev/null +++ b/lustre/osd/osd_iam.h @@ -0,0 +1,1145 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see [sun.com URL with a + * copy of GPLv2]. + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * osd_iam.c + * Top-level entry points into osd module + * + * Author: Wang Di + * Author: Nikita Danilov + */ + +#ifndef __LINUX_LUSTRE_IAM_H__ +#define __LINUX_LUSTRE_IAM_H__ + +#include +#include +#include + +/* + * linux/include/linux/osd_iam.h + */ +#ifndef CLASSERT +#define CLASSERT(cond) do {switch(42) {case (cond): case 0: break;}} while (0) +#endif +/* implication */ +#define ergo(a, b) (!(a) || (b)) +/* logical equivalence */ +#define equi(a, b) (!!(a) == !!(b)) + +enum { + /* + * Maximal number of non-leaf levels in htree. In the stock ldiskfs this + * is 2. + */ + /* + * XXX reduced back to 2 to make per-node locking work. + */ + DX_MAX_TREE_HEIGHT = 5, + /* + * Scratch keys used by generic code for temporaries. + * + * Allocation: + * + * [0] reserved for assertions and as a staging area for + * record keys immediately used for key comparisons. + * + * [1] reserved for record key, stored during iteration over + * node records (see dx_node_check()). + * + * [2] reserved for leaf node operations. + * + * [3] reserved for index operations. + * + * [4] reserved for path->ip_ikey_target + * + */ + DX_SCRATCH_KEYS = 5, + /* + * Maximal format name length. + */ + DX_FMT_NAME_LEN = 16, +}; + +#ifdef __KERNEL__ +/* handle_t, journal_start(), journal_stop() */ +#include + +/* + * Debugging. + * + * Various debugging levels. + */ + +#if 0 +/* + * Following macros are defined in config.h and are tunable through + * appropriate configure switches (indicated below). + */ + +/* + * Compile basic assertions in. You want this most of the time. + * + * --{enable,disable}-ldiskfs-assert (on by default). + */ +#define LDISKFS_ASSERT (1) + +/* + * Compile heavier correctness checks in. You want this during development + * cycle. + * + * --{enable,disable}-ldiskfs-correctness (off by default). + */ +#define LDISKFS_CORRECTNESS (1) + +/* + * Compile heavy invariant checking in. You want this early during development + * or when chasing a bug. + * + * --{enable,disable}-ldiskfs-invariant (off by default). + */ +#define LDISKFS_INVARIANT (1) +#endif + +#if defined(LDISKFS_ASSERT) +#define LDISKFS_ASSERT_ON (1) +#else +#define LDISKFS_ASSERT_ON (0) +#endif + +#if defined(LDISKFS_CORRECTNESS) +#define LDISKFS_CORRECTNESS_ON (1) +#else +#define LDISKFS_CORRECTNESS_ON (0) +#endif + +#if defined(LDISKFS_INVARIANT) +#define LDISKFS_INVARIANT_ON (1) +#else +#define LDISKFS_INVARIANT_ON (0) +#endif + +#ifndef assert +#if LDISKFS_ASSERT_ON +#define assert(test) J_ASSERT(test) +#else +#define assert(test) ((void)(test)) +#endif +#endif + +#if LDISKFS_CORRECTNESS_ON +#define assert_corr(test) J_ASSERT(test) +#define do_corr(exp) exp +#else +#define assert_corr(test) do {;} while (0) +#define do_corr(exp) do {;} while (0) +#endif + +#if LDISKFS_INVARIANT_ON +#define assert_inv(test) J_ASSERT(test) +#else +#define assert_inv(test) do {;} while (0) +#endif + +/* + * Entry within index tree node. Consists of a key immediately followed + * (without padding) by a pointer to the child node. + * + * Both key and pointer are of variable size, hence incomplete type. + */ +struct iam_entry; + +struct iam_entry_compat { + __le32 hash; + __le32 block; +}; + +/* + * Incomplete type used to refer to keys in iam container. + * + * As key size can be different from container to container, iam has to use + * incomplete type. Clients cast pointer to iam_key to real key type and back. + */ +struct iam_key; + +/* + * Incomplete type use to refer to the records stored in iam containers. 
+ */ +struct iam_rec; + +/* + * Key in index node. Possibly compressed. Fixed size. + */ +struct iam_ikey; + +/* + * Scalar type into which certain iam_key's can be uniquely mapped. Used to + * support interfaces like readdir(), where iteration over index has to be + * re-startable. + */ +typedef __u32 iam_ptr_t; + +/* + * Index node traversed during tree lookup. + */ +struct iam_frame { + struct buffer_head *bh; /* buffer holding node data */ + struct iam_entry *entries; /* array of entries */ + struct iam_entry *at; /* target entry, found by binary search */ + iam_ptr_t leaf; /* (logical) offset of child node found by + * binary search. */ + iam_ptr_t curidx; /* (logical) offset of this node. Used to + * per-node locking to detect concurrent + * splits. */ +}; + +/* + * Opaque entry in the leaf node. + */ +struct iam_lentry; + +struct iam_path; +struct iam_container; + + +/* leaf node reached by tree lookup */ +struct iam_leaf { + struct iam_path *il_path; + struct buffer_head *il_bh; + struct iam_lentry *il_entries; + struct iam_lentry *il_at; + /* + * Lock on a leaf node. + */ + struct dynlock_handle *il_lock; + iam_ptr_t il_curidx; /* logical offset of leaf node. */ + void *il_descr_data; +}; + +/* + * Return values of ->lookup() operation from struct iam_leaf_operations. + */ +enum iam_lookup_t { + /* + * lookup found a record with the key requested + */ + IAM_LOOKUP_EXACT = 0, + /* + * lookup positioned leaf on some record + */ + IAM_LOOKUP_OK = 1, + /* + * leaf was empty + */ + IAM_LOOKUP_EMPTY = 2, + /* + * lookup positioned leaf before first record + */ + IAM_LOOKUP_BEFORE = 3, + /* + * Found hash may have a continuation in the next leaf. + */ + IAM_LOOKUP_LAST = 0x100 +}; + +/* + * Format-specific container operations. These are called by generic iam code. + */ +struct iam_operations { + /* + * Returns pointer (in the same sense as pointer in index entry) to + * the root node. + */ + __u32 (*id_root_ptr)(struct iam_container *c); + + /* + * Check validity and consistency of index node. + */ + int (*id_node_check)(struct iam_path *path, struct iam_frame *frame); + /* + * Copy some data from node header into frame. This is called when + * new node is loaded into frame. + */ + int (*id_node_load)(struct iam_path *path, struct iam_frame *frame); + /* + * Initialize new node (stored in @bh) that is going to be added into + * tree. + */ + int (*id_node_init)(struct iam_container *c, + struct buffer_head *bh, int root); + int (*id_node_read)(struct iam_container *c, iam_ptr_t ptr, + handle_t *h, struct buffer_head **bh); + /* + * Key comparison functions. Returns -1, 0, +1. + */ + int (*id_ikeycmp)(const struct iam_container *c, + const struct iam_ikey *k1, + const struct iam_ikey *k2); + /* + * Modify root node when tree height increases. + */ + struct iam_entry *(*id_root_inc)(struct iam_container *c, + struct iam_path *path, + struct iam_frame *frame); + + struct iam_path_descr *(*id_ipd_alloc)(const struct iam_container *c, + void *area); + void (*id_ipd_free)(struct iam_path_descr *ipd); + /* + * Format name. + */ + char id_name[DX_FMT_NAME_LEN]; +}; + +/* + * Another format-specific operation vector, consisting of methods to access + * leaf nodes. This is separated from struct iam_operations, because it is + * assumed that there will be many formats with different format of leaf + * nodes, yes the same struct iam_operations. + */ +struct iam_leaf_operations { + /* + * leaf operations. + */ + + /* + * initialize just loaded leaf node. 
+ */ + int (*init)(struct iam_leaf *p); + /* + * Format new node. + */ + void (*init_new)(struct iam_container *c, struct buffer_head *bh); + /* + * Release resources. + */ + void (*fini)(struct iam_leaf *l); + /* + * returns true iff leaf is positioned at the last entry. + */ + int (*at_end)(const struct iam_leaf *l); + /* position leaf at the first entry */ + void (*start)(struct iam_leaf *l); + /* more leaf to the next entry. */ + void (*next)(struct iam_leaf *l); + /* + * return key of current leaf record. This method may return + * either pointer to the key stored in node, or copy key into + * @k buffer supplied by caller and return pointer to this + * buffer. The latter approach is used when keys in nodes are + * not stored in plain form (e.g., htree doesn't store keys at + * all). + * + * Caller should assume that returned pointer is only valid + * while leaf node is pinned and locked. + */ + struct iam_ikey *(*ikey)(const struct iam_leaf *l, struct iam_ikey *k); + struct iam_key *(*key)(const struct iam_leaf *l); + /* return pointer to entry body. Pointer is valid while + corresponding leaf node is locked and pinned. */ + struct iam_rec *(*rec)(const struct iam_leaf *l); + + void (*key_set)(struct iam_leaf *l, const struct iam_key *k); + void (*rec_set)(struct iam_leaf *l, const struct iam_rec *r); + void (*rec_get)(const struct iam_leaf *l, struct iam_rec *r); + + int (*key_cmp)(const struct iam_leaf *l, const struct iam_key *k); + int (*key_eq)(const struct iam_leaf *l, const struct iam_key *k); + + int (*key_size)(const struct iam_leaf *l); + /* + * Search leaf @l for a record with key @k or for a place + * where such record is to be inserted. + * + * Scratch keys from @path can be used. + */ + int (*lookup)(struct iam_leaf *l, const struct iam_key *k); + int (*ilookup)(struct iam_leaf *l, const struct iam_ikey *ik); + + int (*can_add)(const struct iam_leaf *l, + const struct iam_key *k, const struct iam_rec *r); + /* + * add rec for a leaf + */ + void (*rec_add)(struct iam_leaf *l, + const struct iam_key *k, const struct iam_rec *r); + /* + * remove rec for a leaf + */ + void (*rec_del)(struct iam_leaf *l, int shift); + /* + * split leaf node, moving some entries into @bh (the latter currently + * is assumed to be empty). + */ + void (*split)(struct iam_leaf *l, struct buffer_head **bh, + iam_ptr_t newblknr); +}; + +/* + * Parameters, describing a flavor of iam container. + */ +struct iam_descr { + /* + * Size of a key in this container, in bytes. + */ + size_t id_key_size; + /* + * Size of a key in index nodes, in bytes. + */ + size_t id_ikey_size; + /* + * Size of a pointer to the next level (stored in index nodes), in + * bytes. + */ + size_t id_ptr_size; + /* + * Size of a record (stored in leaf nodes), in bytes. + */ + size_t id_rec_size; + /* + * Size of unused (by iam) space at the beginning of every non-root + * node, in bytes. Used for compatibility with ldiskfs. + */ + size_t id_node_gap; + /* + * Size of unused (by iam) space at the beginning of root node, in + * bytes. Used for compatibility with ldiskfs. + */ + size_t id_root_gap; + + struct iam_operations *id_ops; + struct iam_leaf_operations *id_leaf_ops; +}; + +/* + * An instance of iam container. + */ +struct iam_container { + /* + * Underlying flat file. IO against this object is issued to + * read/write nodes. + */ + struct inode *ic_object; + /* + * container flavor. + */ + struct iam_descr *ic_descr; + /* + * read-write lock protecting index consistency. 
+ */ + struct rw_semaphore ic_sem; +}; + +/* + * description-specific part of iam_path. This is usually embedded into larger + * structure. + */ +struct iam_path_descr { + /* + * Scratch-pad area for temporary keys. + */ + struct iam_ikey *ipd_key_scratch[DX_SCRATCH_KEYS]; +}; + +/* + * Structure to keep track of a path drilled through htree. + */ +struct iam_path { + /* + * Parent container. + */ + struct iam_container *ip_container; + /* + * Number of index levels minus one. + */ + int ip_indirect; + /* + * Nodes that top-to-bottom traversal passed through. + */ + struct iam_frame ip_frames[DX_MAX_TREE_HEIGHT]; + /* + * Last filled frame in ->ip_frames. Refers to the 'twig' node (one + * immediately above leaf). + */ + struct iam_frame *ip_frame; + /* + * Leaf node: a child of ->ip_frame. + */ + struct iam_leaf ip_leaf; + /* + * Key searched for. + */ + const struct iam_key *ip_key_target; + const struct iam_ikey *ip_ikey_target; + /* + * Description-specific data. + */ + struct iam_path_descr *ip_data; +}; + +struct ldiskfs_dx_hash_info; + +/* + * Helper structure for legacy htrees. + */ +struct iam_path_compat { + struct iam_path ipc_path; + struct iam_container ipc_container; + __u32 ipc_scratch[DX_SCRATCH_KEYS]; + struct ldiskfs_dx_hash_info *ipc_hinfo; + struct qstr *ipc_qstr; + struct iam_path_descr ipc_descr; + struct ldiskfs_dx_hash_info ipc_hinfo_area; +}; + +#define const_max(p, q) ((p > q) ? p : q) + +enum { + DX_MAX_IKEY_SIZE = 32, /* be generous */ + /* + * Hack to avoid dynamic allocation and freeing of ipd. + */ + DX_IPD_MAX_SIZE = const_max(sizeof(struct iam_path_compat), + DX_MAX_IKEY_SIZE * DX_SCRATCH_KEYS + + sizeof(struct iam_path_descr)) +}; + +/* + * iam cursor (iterator) api. + */ + +/* + * States of iterator state machine. + */ +enum iam_it_state { + /* initial state */ + IAM_IT_DETACHED, + /* iterator is above particular record in the container */ + IAM_IT_ATTACHED, + /* iterator is positioned before record */ + IAM_IT_SKEWED +}; + +/* + * Flags controlling iterator functionality. + */ +enum iam_it_flags { + /* + * this iterator will move (iam_it_next() will be called on it) + */ + IAM_IT_MOVE = (1 << 0), + /* + * tree can be updated through this iterator. + */ + IAM_IT_WRITE = (1 << 1) +}; + +/* + * Iterator. + * + * Immediately after call to iam_it_init() iterator is in "detached" + * (IAM_IT_DETACHED) state: it is associated with given parent container, but + * doesn't point to any particular record in this container. + * + * After successful call to iam_it_get() and until corresponding call to + * iam_it_put() iterator is in one of "active" states: IAM_IT_ATTACHED or + * IAM_IT_SKEWED. + * + * Active iterator can move through records in a container (provided + * IAM_IT_MOVE permission) in a key order, can get record and key values as it + * passes over them, and can modify container (provided IAM_IT_WRITE + * permission). + * + * Iteration may reach the end of container, at which point iterator switches + * into IAM_IT_DETACHED state. + * + * Concurrency: iterators are supposed to be local to thread. Interfaces below + * do no internal serialization of access to the iterator fields. + * + * When in non-detached state, iterator keeps some container nodes pinned in + * memory and locked (that locking may be implemented at the container + * granularity though). 
In particular, clients may assume that pointers to + * records and keys obtained through iterator interface as valid until + * iterator is detached (except that they may be invalidated by sub-sequent + * operations done through the same iterator). + * + */ +struct iam_iterator { + /* + * iterator flags, taken from enum iam_it_flags. + */ + __u32 ii_flags; + enum iam_it_state ii_state; + /* + * path to the record. Valid in IAM_IT_ATTACHED, and IAM_IT_SKEWED + * states. + */ + struct iam_path ii_path; +}; + +void iam_path_init(struct iam_path *path, struct iam_container *c, + struct iam_path_descr *pd); +void iam_path_fini(struct iam_path *path); +void iam_path_release(struct iam_path *path); + +void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode); +void iam_path_compat_fini(struct iam_path_compat *path); + +struct iam_path_descr *iam_ipd_alloc(void *area, int keysize); +void iam_ipd_free(struct iam_path_descr *ipd); + +int iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags, + struct iam_path_descr *pd); +void iam_it_fini(struct iam_iterator *it); +int iam_it_get(struct iam_iterator *it, const struct iam_key *k); +int iam_it_get_at(struct iam_iterator *it, const struct iam_key *k); +void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src); +void iam_it_put(struct iam_iterator *it); +int iam_it_next(struct iam_iterator *it); +struct iam_rec *iam_it_rec_get(const struct iam_iterator *it); +int iam_it_rec_set(handle_t *h, + struct iam_iterator *it, const struct iam_rec *r); +struct iam_key *iam_it_key_get(const struct iam_iterator *it); +int iam_it_key_size(const struct iam_iterator *it); +int iam_it_rec_insert(handle_t *h, struct iam_iterator *it, + const struct iam_key *k, const struct iam_rec *r); +int iam_it_rec_delete(handle_t *h, struct iam_iterator *it); + +typedef __u64 iam_pos_t; + +iam_pos_t iam_it_store(const struct iam_iterator *it); +int iam_it_load(struct iam_iterator *it, iam_pos_t pos); + +int iam_lookup(struct iam_container *c, const struct iam_key *k, + struct iam_rec *r, struct iam_path_descr *pd); +int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k, + struct iam_path_descr *pd); +int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k, + const struct iam_rec *r, struct iam_path_descr *pd); +int iam_insert(handle_t *handle, struct iam_container *c, + const struct iam_key *k, + const struct iam_rec *r, struct iam_path_descr *pd); +/* + * Initialize container @c. + */ +int iam_container_init(struct iam_container *c, + struct iam_descr *descr, struct inode *inode); +/* + * Finalize container @c, release all resources. + */ +void iam_container_fini(struct iam_container *c); + +/* + * Determine container format. 
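 *
 * A typical container life-cycle, as a sketch (error handling omitted;
 * @descr and @inode are supplied by the caller, and the init-then-setup
 * ordering is an assumption based on the declarations above):
 *
 *     iam_container_init(&c, descr, inode);
 *     iam_container_setup(&c);
 *     ...iam_lookup()/iam_insert()/iterator traffic...
 *     iam_container_fini(&c);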
+ */ +int iam_container_setup(struct iam_container *c); + +static inline struct iam_descr *iam_container_descr(struct iam_container *c) +{ + return c->ic_descr; +} + +static inline struct iam_descr *iam_path_descr(const struct iam_path *p) +{ + return p->ip_container->ic_descr; +} + +static inline struct inode *iam_path_obj(struct iam_path *p) +{ + return p->ip_container->ic_object; +} + +static inline void iam_ikeycpy(const struct iam_container *c, + struct iam_ikey *k1, const struct iam_ikey *k2) +{ + memcpy(k1, k2, c->ic_descr->id_ikey_size); +} + +static inline size_t iam_entry_size(struct iam_path *p) +{ + return iam_path_descr(p)->id_ikey_size + iam_path_descr(p)->id_ptr_size; +} + +static inline struct iam_entry *iam_entry_shift(struct iam_path *p, + struct iam_entry *entry, + int shift) +{ + void *e = entry; + return e + shift * iam_entry_size(p); +} + +static inline struct iam_ikey *dx_get_ikey(struct iam_path *p, + struct iam_entry *entry, + struct iam_ikey *key) +{ + return memcpy(key, entry, iam_path_descr(p)->id_ikey_size); +} + +static inline struct iam_ikey *iam_ikey_at(struct iam_path *p, + struct iam_entry *entry) +{ + return (struct iam_ikey *)entry; +} + +static inline ptrdiff_t iam_entry_diff(struct iam_path *p, + struct iam_entry *e1, + struct iam_entry *e2) +{ + ptrdiff_t diff; + + diff = (void *)e1 - (void *)e2; + assert_corr(diff / iam_entry_size(p) * iam_entry_size(p) == diff); + return diff / iam_entry_size(p); +} + +/* + * Helper for the frequent case, where key was already placed into @k1 by + * callback. + */ +static inline void iam_ikeycpy0(const struct iam_container *c, + struct iam_ikey *k1, const struct iam_ikey *k2) +{ + if (k1 != k2) + iam_ikeycpy(c, k1, k2); +} + +static inline int iam_ikeycmp(const struct iam_container *c, + const struct iam_ikey *k1, + const struct iam_ikey *k2) +{ + return c->ic_descr->id_ops->id_ikeycmp(c, k1, k2); +} + +static inline void *iam_entry_off(struct iam_entry *entry, size_t off) +{ + return (void *)((char *)entry + off); +} + +/* + * Leaf helpers. 
+ */ + +static inline struct iam_path *iam_leaf_path(const struct iam_leaf *leaf) +{ + return leaf->il_path; +} + +static inline struct iam_container * +iam_leaf_container(const struct iam_leaf *leaf) +{ + return iam_leaf_path(leaf)->ip_container; +} + +static inline struct iam_descr *iam_leaf_descr(const struct iam_leaf *leaf) +{ + return iam_leaf_container(leaf)->ic_descr; +} + +static inline struct iam_leaf_operations * +iam_leaf_ops(const struct iam_leaf *leaf) +{ + return iam_leaf_descr(leaf)->id_leaf_ops; +} + +static inline void iam_reccpy(const struct iam_leaf *leaf, + struct iam_rec *rec_dst) +{ + iam_leaf_ops(leaf)->rec_get(leaf, rec_dst); +} + +/*XXX These stuff put here, just because they are used by iam.c */ +static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry) +{ + u32 *addr; + + addr = iam_entry_off(entry, iam_path_descr(p)->id_ikey_size); + return le32_to_cpu(get_unaligned(addr)) & 0x00ffffff; +} + +static inline void dx_set_block(struct iam_path *p, + struct iam_entry *entry, unsigned value) +{ + u32 *addr; + + addr = iam_entry_off(entry, iam_path_descr(p)->id_ikey_size); + put_unaligned(cpu_to_le32(value), addr); +} + +static inline void dx_set_ikey(struct iam_path *p, struct iam_entry *entry, + const struct iam_ikey *key) +{ + iam_ikeycpy(p->ip_container, iam_entry_off(entry, 0), key); +} + +struct dx_map_entry +{ + u32 hash; + u32 offs; +}; + +struct fake_dirent { + __le32 inode; + __le16 rec_len; + u8 name_len; + u8 file_type; +}; + +struct dx_countlimit { + __le16 limit; + __le16 count; +}; + +/* + * dx_root_info is laid out so that if it should somehow get overlaid by a + * dirent the two low bits of the hash version will be zero. Therefore, the + * hash version mod 4 should never be 0. Sincerely, the paranoia department. + */ + +struct dx_root { + struct fake_dirent dot; + char dot_name[4]; + struct fake_dirent dotdot; + char dotdot_name[4]; + struct dx_root_info + { + __le32 reserved_zero; + u8 hash_version; + u8 info_length; /* 8 */ + u8 indirect_levels; + u8 unused_flags; + } + info; + struct {} entries[0]; +}; + +struct dx_node +{ + struct fake_dirent fake; + struct {} entries[0]; +}; + + +static inline unsigned dx_get_count(struct iam_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->count); +} + +static inline unsigned dx_get_limit(struct iam_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->limit); +} + +static inline void dx_set_count(struct iam_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); +} + +static inline unsigned dx_node_limit(struct iam_path *p) +{ + struct iam_descr *param = iam_path_descr(p); + unsigned entry_space = iam_path_obj(p)->i_sb->s_blocksize - + param->id_node_gap; + return entry_space / (param->id_ikey_size + param->id_ptr_size); +} + +static inline unsigned dx_root_limit(struct iam_path *p) +{ + struct iam_descr *param = iam_path_descr(p); + unsigned limit = iam_path_obj(p)->i_sb->s_blocksize - + param->id_root_gap; + limit /= (param->id_ikey_size + param->id_ptr_size); + if (limit == dx_node_limit(p)) + limit--; + return limit; +} + + +static inline struct iam_entry *dx_get_entries(struct iam_path *path, + void *data, int root) +{ + struct iam_descr *param = iam_path_descr(path); + return data + (root ? 
param->id_root_gap : param->id_node_gap); +} + + +static inline struct iam_entry *dx_node_get_entries(struct iam_path *path, + struct iam_frame *frame) +{ + return dx_get_entries(path, + frame->bh->b_data, frame == path->ip_frames); +} + +static inline struct iam_ikey *iam_path_ikey(const struct iam_path *path, + int nr) +{ + assert(0 <= nr && nr < ARRAY_SIZE(path->ip_data->ipd_key_scratch)); + return path->ip_data->ipd_key_scratch[nr]; +} + +static inline struct dynlock *path_dynlock(struct iam_path *path) +{ + return &LDISKFS_I(iam_path_obj(path))->i_htree_lock; +} + +static inline int iam_leaf_is_locked(const struct iam_leaf *leaf) +{ + int result; + + result = dynlock_is_locked(path_dynlock(leaf->il_path), + leaf->il_curidx); + if (!result) + dump_stack(); + return result; +} + +static inline int iam_frame_is_locked(struct iam_path *path, + const struct iam_frame *frame) +{ + int result; + + result = dynlock_is_locked(path_dynlock(path), frame->curidx); + if (!result) + dump_stack(); + return result; +} + +int dx_lookup_lock(struct iam_path *path, + struct dynlock_handle **dl, enum dynlock_type lt); + +void dx_insert_block(struct iam_path *path, struct iam_frame *frame, + u32 hash, u32 block); +int dx_index_is_compat(struct iam_path *path); + +int ldiskfs_htree_next_block(struct inode *dir, __u32 hash, + struct iam_path *path, __u32 *start_hash); + +struct buffer_head *ldiskfs_append(handle_t *handle, struct inode *inode, + u32 *block, int *err); +int split_index_node(handle_t *handle, struct iam_path *path, + struct dynlock_handle **lh); +struct ldiskfs_dir_entry_2 *split_entry(struct inode *dir, + struct ldiskfs_dir_entry_2 *de, + unsigned long ino, mode_t mode, + const char *name, int namelen); +struct ldiskfs_dir_entry_2 *find_insertion_point(struct inode *dir, + struct buffer_head *bh, + const char *name, int namelen); +struct ldiskfs_dir_entry_2 *move_entries(struct inode *dir, + struct ldiskfs_dx_hash_info *hinfo, + struct buffer_head **bh1, + struct buffer_head **bh2, + __u32 *delim_hash); + +extern struct iam_descr iam_htree_compat_param; + +struct dynlock_handle *dx_lock_htree(struct inode *dir, unsigned long value, + enum dynlock_type lt); +void dx_unlock_htree(struct inode *dir, struct dynlock_handle *lh); + +/* + * external + */ +void iam_container_write_lock(struct iam_container *c); +void iam_container_write_unlock(struct iam_container *c); + +void iam_container_read_lock(struct iam_container *c); +void iam_container_read_unlock(struct iam_container *c); + +int iam_index_next(struct iam_container *c, struct iam_path *p); +int iam_read_leaf(struct iam_path *p); + +int iam_node_read(struct iam_container *c, iam_ptr_t ptr, + handle_t *handle, struct buffer_head **bh); +int iam_lvar_create(struct inode *obj, int keysize, int ptrsize, + int recsize, handle_t *handle); +int iam_lfix_create(struct inode *obj, int keysize, int ptrsize, + int recsize, handle_t *handle); + +#ifndef swap +#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) +#endif + +#ifdef DX_DEBUG +#define dxtrace(command) command +#else +#define dxtrace(command) +#endif + +#define BH_DXLock 25 +#define DX_DEBUG (1) +#if DX_DEBUG +static struct iam_lock_stats { + unsigned dls_bh_lock; + unsigned dls_bh_busy; + unsigned dls_bh_again; + unsigned dls_bh_full_again; +} iam_lock_stats = { 0, }; +#define DX_DEVAL(x) x +#else +#define DX_DEVAL(x) +#endif + +static inline void iam_lock_bh(struct buffer_head volatile *bh) +{ + DX_DEVAL(iam_lock_stats.dls_bh_lock++); +#ifdef CONFIG_SMP + while 
(test_and_set_bit(BH_DXLock, &bh->b_state)) { + DX_DEVAL(iam_lock_stats.dls_bh_busy++); + while (test_bit(BH_DXLock, &bh->b_state)) + cpu_relax(); + } +#endif +} + +static inline void iam_unlock_bh(struct buffer_head *bh) +{ +#ifdef CONFIG_SMP + smp_mb__before_clear_bit(); + clear_bit(BH_DXLock, &bh->b_state); +#endif +} + + +void iam_insert_key(struct iam_path *path, struct iam_frame *frame, + const struct iam_ikey *key, iam_ptr_t ptr); + +void iam_insert_key_lock(struct iam_path *path, struct iam_frame *frame, + const struct iam_ikey *key, iam_ptr_t ptr); + + +int iam_leaf_at_end(const struct iam_leaf *l); +void iam_leaf_next(struct iam_leaf *folio); +int iam_leaf_can_add(const struct iam_leaf *l, + const struct iam_key *k, const struct iam_rec *r); + +struct iam_path *iam_leaf_path(const struct iam_leaf *leaf); +struct iam_container *iam_leaf_container(const struct iam_leaf *leaf); +struct iam_descr *iam_leaf_descr(const struct iam_leaf *leaf); +struct iam_leaf_operations *iam_leaf_ops(const struct iam_leaf *leaf); + + +int iam_node_read(struct iam_container *c, iam_ptr_t ptr, + handle_t *h, struct buffer_head **bh); + +/* + * Container format. + */ +struct iam_format { + /* + * Method called to recognize container format. Should return true iff + * container @c conforms to this format. This method may do IO to read + * container pages. + * + * If container is recognized, this method sets operation vectors + * ->id_ops and ->id_leaf_ops in container description (c->ic_descr), + * and fills other description fields. + */ + int (*if_guess)(struct iam_container *c); + /* + * Linkage into global list of container formats. + */ + struct list_head if_linkage; +}; + +void iam_format_register(struct iam_format *fmt); +int iam_root_limit(int rootgap, int blocksize, int size); + +void iam_lfix_format_init(void); +void iam_lvar_format_init(void); +void iam_htree_format_init(void); + +struct iam_private_info; + +void ldiskfs_iam_release(struct file *filp, struct inode *inode); + +int iam_uapi_ioctl(struct inode * inode, struct file * filp, unsigned int cmd, + unsigned long arg); + +/* dir.c +#if LDISKFS_INVARIANT_ON +extern int ldiskfs_check_dir_entry(const char *, struct inode *, + struct ldiskfs_dir_entry_2 *, + struct buffer_head *, unsigned long); +#else +static inline int ldiskfs_check_dir_entry(const char * function, + struct inode * dir, + struct ldiskfs_dir_entry_2 * de, + struct buffer_head * bh, + unsigned long offset) +{ + return 1; +} +#endif +*/ + +/* __KERNEL__ */ +#endif + +/* + * User level API. 
Copy exists in lustre/lustre/tests/iam_ut.c + */ + +struct iam_uapi_info { + __u16 iui_keysize; + __u16 iui_recsize; + __u16 iui_ptrsize; + __u16 iui_height; + char iui_fmt_name[DX_FMT_NAME_LEN]; +}; + +struct iam_uapi_op { + void *iul_key; + void *iul_rec; +}; + +struct iam_uapi_it { + struct iam_uapi_op iui_op; + __u16 iui_state; +}; + +enum iam_ioctl_cmd { + IAM_IOC_INIT = _IOW('i', 1, struct iam_uapi_info), + IAM_IOC_GETINFO = _IOR('i', 2, struct iam_uapi_info), + IAM_IOC_INSERT = _IOR('i', 3, struct iam_uapi_op), + IAM_IOC_LOOKUP = _IOWR('i', 4, struct iam_uapi_op), + IAM_IOC_DELETE = _IOR('i', 5, struct iam_uapi_op), + IAM_IOC_IT_START = _IOR('i', 6, struct iam_uapi_it), + IAM_IOC_IT_NEXT = _IOW('i', 7, struct iam_uapi_it), + IAM_IOC_IT_STOP = _IOR('i', 8, struct iam_uapi_it), + + IAM_IOC_POLYMORPH = _IOR('i', 9, unsigned long) +}; + +/* __LINUX_LUSTRE_IAM_H__ */ +#endif diff --git a/lustre/osd/osd_iam_lfix.c b/lustre/osd/osd_iam_lfix.c new file mode 100644 index 0000000..22a60e2 --- /dev/null +++ b/lustre/osd/osd_iam_lfix.c @@ -0,0 +1,852 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see [sun.com URL with a + * copy of GPLv2]. + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * iam_lfix.c + * implementation of iam format for fixed size records. + * + * Author: Wang Di + * Author: Nikita Danilov + */ + +#include +#include +/* ldiskfs_error() */ +#include +#include "osd_internal.h" + +/* + * Leaf operations. 
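 *
 * Layout note (editorial sketch, assuming 4096-byte blocks): an lfix leaf
 * block is a 4-byte struct iam_leaf_head (magic and entry count) followed
 * by fixed-size entries of id_key_size + id_rec_size bytes each.  With
 * 8-byte keys and 8-byte records, as used by the debugging structs at the
 * end of this file, a leaf holds (4096 - 4) / 16 = 255 entries, matching
 * LFIX_LEAF_RECNO.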
+ */ + +enum { + IAM_LEAF_HEADER_MAGIC = 0x1976 /* This is duplicated in + * lustre/utils/create_iam.c */ +}; + +/* This is duplicated in lustre/utils/create_iam.c */ +struct iam_leaf_head { + __le16 ill_magic; + __le16 ill_count; +}; + +static inline int iam_lfix_entry_size(const struct iam_leaf *l) +{ + return iam_leaf_descr(l)->id_key_size + iam_leaf_descr(l)->id_rec_size; +} + +static inline struct iam_lentry * +iam_lfix_shift(const struct iam_leaf *l, struct iam_lentry *entry, int shift) +{ + return (void *)entry + shift * iam_lfix_entry_size(l); +} + +static inline struct iam_key *iam_leaf_key_at(struct iam_lentry *entry) +{ + return (struct iam_key *)entry; +} + +static inline int lfix_keycmp(const struct iam_container *c, + const struct iam_key *k1, + const struct iam_key *k2) +{ + return memcmp(k1, k2, c->ic_descr->id_key_size); +} + +static struct iam_leaf_head *iam_get_head(const struct iam_leaf *l) +{ + return (struct iam_leaf_head *)l->il_bh->b_data; +} + +static struct iam_lentry *iam_entries(const struct buffer_head *bh) +{ + return (void *)bh->b_data + sizeof(struct iam_leaf_head); +} + +static struct iam_lentry *iam_get_lentries(const struct iam_leaf *l) +{ + return iam_entries(l->il_bh); +} + +static int leaf_count_limit(const struct iam_leaf *leaf) +{ + int free_space; + + free_space = iam_leaf_container(leaf)->ic_object->i_sb->s_blocksize; + free_space -= sizeof(struct iam_leaf_head); + return free_space / iam_lfix_entry_size(leaf); +} + +static int lentry_count_get(const struct iam_leaf *leaf) +{ + return le16_to_cpu(iam_get_head(leaf)->ill_count); +} + +static void lentry_count_set(struct iam_leaf *leaf, unsigned count) +{ + assert_corr(0 <= count && count <= leaf_count_limit(leaf)); + iam_get_head(leaf)->ill_count = cpu_to_le16(count); +} + +static struct iam_lentry *iam_lfix_get_end(const struct iam_leaf *l); + +#if LDISKFS_CORRECTNESS_ON || LDISKFS_INVARIANT_ON +static int iam_leaf_at_rec(const struct iam_leaf *folio) +{ + return + iam_get_lentries(folio) <= folio->il_at && + folio->il_at < iam_lfix_get_end(folio); +} +#endif + +static struct iam_ikey *iam_lfix_ikey(const struct iam_leaf *l, + struct iam_ikey *key) +{ + void *ie = l->il_at; + assert_corr(iam_leaf_at_rec(l)); + return (struct iam_ikey*)ie; +} + +static struct iam_key *iam_lfix_key(const struct iam_leaf *l) +{ + void *ie = l->il_at; + assert_corr(iam_leaf_at_rec(l)); + return (struct iam_key*)ie; +} + +static int iam_lfix_key_size(const struct iam_leaf *l) +{ + return iam_leaf_descr(l)->id_key_size; +} + +static void iam_lfix_start(struct iam_leaf *l) +{ + l->il_at = iam_get_lentries(l); +} + +static inline ptrdiff_t iam_lfix_diff(const struct iam_leaf *l, + const struct iam_lentry *e1, + const struct iam_lentry *e2) +{ + ptrdiff_t diff; + int esize; + + esize = iam_lfix_entry_size(l); + diff = (void *)e1 - (void *)e2; + assert_corr(diff / esize * esize == diff); + return diff / esize; +} + +static int iam_lfix_init(struct iam_leaf *l) +{ + int result; + struct iam_leaf_head *ill; + int count; + + assert_corr(l->il_bh != NULL); + + ill = iam_get_head(l); + count = le16_to_cpu(ill->ill_count); + if (ill->ill_magic == le16_to_cpu(IAM_LEAF_HEADER_MAGIC) && + 0 <= count && count <= leaf_count_limit(l)) { + l->il_at = l->il_entries = iam_get_lentries(l); + result = 0; + } else { + struct inode *obj; + + obj = iam_leaf_container(l)->ic_object; + CERROR("Wrong magic in node %llu (#%lu): %#x != %#x or " + "wrong count: %i (%i)", + (unsigned long long)l->il_bh->b_blocknr, obj->i_ino, + ill->ill_magic, 
le16_to_cpu(IAM_LEAF_HEADER_MAGIC), + count, leaf_count_limit(l)); + result = -EIO; + } + return result; +} + +static void iam_lfix_fini(struct iam_leaf *l) +{ + l->il_entries = l->il_at = NULL; +} + +static struct iam_lentry *iam_lfix_get_end(const struct iam_leaf *l) +{ + int count = lentry_count_get(l); + struct iam_lentry *ile = iam_lfix_shift(l, l->il_entries, count); + + return ile; +} + +struct iam_rec *iam_lfix_rec(const struct iam_leaf *l) +{ + void *e = l->il_at; + assert_corr(iam_leaf_at_rec(l)); + return e + iam_leaf_descr(l)->id_key_size; +} + +static void iam_lfix_next(struct iam_leaf *l) +{ + assert_corr(iam_leaf_at_rec(l)); + l->il_at = iam_lfix_shift(l, l->il_at, 1); +} + +/* + * Bug chasing. + */ +int lfix_dump = 0; +EXPORT_SYMBOL(lfix_dump); + +static char hdigit(char ch) +{ + static char d[] = "0123456789abcdef"; + return d[ch & 0xf]; +} + +static char *hex(char ch, char *area) +{ + area[0] = hdigit(ch >> 4); + area[1] = hdigit(ch); + area[2] = 0; + return area; +} + +static void l_print(struct iam_leaf *leaf, struct iam_lentry *entry) +{ + int i; + char *area; + char h[3]; + + area = (char *)entry; + printk(KERN_EMERG "["); + for (i = iam_lfix_key_size(leaf); i > 0; --i, ++area) + printk("%s", hex(*area, h)); + printk("]-("); + for (i = iam_leaf_descr(leaf)->id_rec_size; i > 0; --i, ++area) + printk("%s", hex(*area, h)); + printk(")\n"); +} + +static void lfix_print(struct iam_leaf *leaf) +{ + struct iam_lentry *entry; + int count; + int i; + + entry = leaf->il_entries; + count = lentry_count_get(leaf); + printk(KERN_EMERG "lfix: %p %p %d\n", leaf, leaf->il_at, count); + for (i = 0; i < count; ++i, entry = iam_lfix_shift(leaf, entry, 1)) + l_print(leaf, entry); +} + +static int iam_lfix_lookup(struct iam_leaf *l, const struct iam_key *k) +{ + struct iam_lentry *p, *q, *m, *t; + struct iam_container *c; + int count; + int result; + + count = lentry_count_get(l); + if (count == 0) + return IAM_LOOKUP_EMPTY; + + result = IAM_LOOKUP_OK; + c = iam_leaf_container(l); + + p = l->il_entries; + q = iam_lfix_shift(l, p, count - 1); + if (lfix_keycmp(c, k, iam_leaf_key_at(p)) < 0) { + /* + * @k is less than the least key in the leaf + */ + l->il_at = p; + result = IAM_LOOKUP_BEFORE; + } else if (lfix_keycmp(c, iam_leaf_key_at(q), k) <= 0) { + l->il_at = q; + } else { + /* + * EWD1293 + */ + while (iam_lfix_shift(l, p, 1) != q) { + m = iam_lfix_shift(l, p, iam_lfix_diff(l, q, p) / 2); + assert_corr(p < m && m < q); + if (lfix_keycmp(c, iam_leaf_key_at(m), k) <= 0) + p = m; + else + q = m; + } + assert_corr(lfix_keycmp(c, iam_leaf_key_at(p), k) <= 0 && + lfix_keycmp(c, k, iam_leaf_key_at(q)) < 0); + /* + * skip over records with duplicate keys. 
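 * For example, with keys 5 7 7 7 9 and @k == 7, the bisection above
 * leaves @p on the last 7; this loop walks @p back to the leftmost 7,
 * so lookup always lands on the first of the duplicates.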
+ */ + while (p > l->il_entries) { + t = iam_lfix_shift(l, p, -1); + if (lfix_keycmp(c, iam_leaf_key_at(t), k) == 0) + p = t; + else + break; + } + l->il_at = p; + } + assert_corr(iam_leaf_at_rec(l)); + + if (lfix_keycmp(c, iam_leaf_key_at(l->il_at), k) == 0) + result = IAM_LOOKUP_EXACT; + + if (lfix_dump) + lfix_print(l); + + return result; +} + +static int iam_lfix_ilookup(struct iam_leaf *l, const struct iam_ikey *ik) +{ + assert(0); + return IAM_LOOKUP_OK; +} + +static void iam_lfix_key_set(struct iam_leaf *l, const struct iam_key *k) +{ + assert_corr(iam_leaf_at_rec(l)); + memcpy(iam_leaf_key_at(l->il_at), k, iam_leaf_descr(l)->id_key_size); +} + +static int iam_lfix_key_cmp(const struct iam_leaf *l, const struct iam_key *k) +{ + return lfix_keycmp(iam_leaf_container(l), iam_leaf_key_at(l->il_at), k); +} + +static int iam_lfix_key_eq(const struct iam_leaf *l, const struct iam_key *k) +{ + return !lfix_keycmp(iam_leaf_container(l), + iam_leaf_key_at(l->il_at), k); +} + +static void iam_lfix_rec_set(struct iam_leaf *l, const struct iam_rec *r) +{ + assert_corr(iam_leaf_at_rec(l)); + memcpy(iam_lfix_rec(l), r, iam_leaf_descr(l)->id_rec_size); +} + +static void iam_lfix_rec_get(const struct iam_leaf *l, struct iam_rec *r) +{ + assert_corr(iam_leaf_at_rec(l)); + memcpy(r, iam_lfix_rec(l), iam_leaf_descr(l)->id_rec_size); +} + +static void iam_lfix_rec_add(struct iam_leaf *leaf, + const struct iam_key *k, const struct iam_rec *r) +{ + struct iam_lentry *end; + struct iam_lentry *cur; + struct iam_lentry *start; + ptrdiff_t diff; + int count; + + assert_corr(iam_leaf_can_add(leaf, k, r)); + + count = lentry_count_get(leaf); + /* + * This branch handles two exceptional cases: + * + * - leaf positioned beyond last record, and + * + * - empty leaf. + */ + if (!iam_leaf_at_end(leaf)) { + end = iam_lfix_get_end(leaf); + cur = leaf->il_at; + if (lfix_keycmp(iam_leaf_container(leaf), + k, iam_leaf_key_at(cur)) >= 0) + iam_lfix_next(leaf); + else + /* + * Another exceptional case: insertion with the key + * less than least key in the leaf. 
+ */ + assert_corr(cur == leaf->il_entries); + + start = leaf->il_at; + diff = (void *)end - (void *)start; + assert_corr(diff >= 0); + memmove(iam_lfix_shift(leaf, start, 1), start, diff); + } + lentry_count_set(leaf, count + 1); + iam_lfix_key_set(leaf, k); + iam_lfix_rec_set(leaf, r); + assert_corr(iam_leaf_at_rec(leaf)); +} + +static void iam_lfix_rec_del(struct iam_leaf *leaf, int shift) +{ + struct iam_lentry *next, *end; + int count; + ptrdiff_t diff; + + assert_corr(iam_leaf_at_rec(leaf)); + + count = lentry_count_get(leaf); + end = iam_lfix_get_end(leaf); + next = iam_lfix_shift(leaf, leaf->il_at, 1); + diff = (void *)end - (void *)next; + memmove(leaf->il_at, next, diff); + + lentry_count_set(leaf, count - 1); +} + +static int iam_lfix_can_add(const struct iam_leaf *l, + const struct iam_key *k, const struct iam_rec *r) +{ + return lentry_count_get(l) < leaf_count_limit(l); +} + +static int iam_lfix_at_end(const struct iam_leaf *folio) +{ + return folio->il_at == iam_lfix_get_end(folio); +} + +static void iam_lfix_init_new(struct iam_container *c, struct buffer_head *bh) +{ + struct iam_leaf_head *hdr; + + hdr = (struct iam_leaf_head*)bh->b_data; + hdr->ill_magic = cpu_to_le16(IAM_LEAF_HEADER_MAGIC); + hdr->ill_count = cpu_to_le16(0); +} + +static void iam_lfix_split(struct iam_leaf *l, struct buffer_head **bh, + iam_ptr_t new_blknr) +{ + struct iam_path *path; + struct iam_leaf_head *hdr; + const struct iam_ikey *pivot; + struct buffer_head *new_leaf; + + unsigned count; + unsigned split; + + void *start; + void *finis; + + new_leaf = *bh; + path = iam_leaf_path(l); + + hdr = (void *)new_leaf->b_data; + + count = lentry_count_get(l); + split = count / 2; + + start = iam_lfix_shift(l, iam_get_lentries(l), split); + finis = iam_lfix_shift(l, iam_get_lentries(l), count); + + pivot = (const struct iam_ikey *)iam_leaf_key_at(start); + + memmove(iam_entries(new_leaf), start, finis - start); + hdr->ill_count = count - split; + lentry_count_set(l, split); + if ((void *)l->il_at >= start) { + /* + * insertion point moves into new leaf. + */ + int shift; + int result; + + shift = iam_lfix_diff(l, l->il_at, start); + *bh = l->il_bh; + l->il_bh = new_leaf; + l->il_curidx = new_blknr; + result = iam_lfix_init(l); + /* + * init cannot fail, as node was just initialized. + */ + assert_corr(result == 0); + l->il_at = iam_lfix_shift(l, iam_get_lentries(l), shift); + } + /* + * Insert pointer to the new node (together with the least key in + * the node) into index node. + */ + iam_insert_key_lock(path, path->ip_frame, pivot, new_blknr); +} + +static struct iam_leaf_operations iam_lfix_leaf_ops = { + .init = iam_lfix_init, + .init_new = iam_lfix_init_new, + .fini = iam_lfix_fini, + .start = iam_lfix_start, + .next = iam_lfix_next, + .key = iam_lfix_key, + .ikey = iam_lfix_ikey, + .rec = iam_lfix_rec, + .key_set = iam_lfix_key_set, + .key_cmp = iam_lfix_key_cmp, + .key_eq = iam_lfix_key_eq, + .key_size = iam_lfix_key_size, + .rec_set = iam_lfix_rec_set, + .rec_get = iam_lfix_rec_get, + .lookup = iam_lfix_lookup, + .ilookup = iam_lfix_ilookup, + .at_end = iam_lfix_at_end, + .rec_add = iam_lfix_rec_add, + .rec_del = iam_lfix_rec_del, + .can_add = iam_lfix_can_add, + .split = iam_lfix_split +}; + +/* + * Index operations. 
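 *
 * Capacity note (editorial sketch, assuming 4096-byte blocks with 8-byte
 * keys and 4-byte pointers as in the debugging structs below): the root
 * block starts with the 16-byte struct iam_lfix_root, so it holds
 * (4096 - 16) / 12 = 340 entry slots (the first doubling as the
 * dx_countlimit header), while non-root index nodes have no gap and hold
 * 4096 / 12 = 341; this matches LFIX_ROOT_RECNO, LFIX_INDEX_RECNO and
 * iam_root_limit().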
+ */ + +enum { + /* This is duplicated in lustre/utils/create_iam.c */ + /* + * Then shalt thou see the dew-BEDABBLED wretch + * Turn, and return, indenting with the way; + * Each envious brier his weary legs doth scratch, + * Each shadow makes him stop, each murmur stay: + * For misery is trodden on by many, + * And being low never relieved by any. + */ + IAM_LFIX_ROOT_MAGIC = 0xbedabb1edULL // d01efull +}; + +/* This is duplicated in lustre/utils/create_iam.c */ +struct iam_lfix_root { + __le64 ilr_magic; + __le16 ilr_keysize; + __le16 ilr_recsize; + __le16 ilr_ptrsize; + u8 ilr_indirect_levels; + u8 ilr_padding; +}; + +static __u32 iam_lfix_root_ptr(struct iam_container *c) +{ + return 0; +} + +static int iam_lfix_node_init(struct iam_container *c, struct buffer_head *bh, + int root) +{ + return 0; +} + +static struct iam_entry *iam_lfix_root_inc(struct iam_container *c, + struct iam_path *path, + struct iam_frame *frame) +{ + struct iam_lfix_root *root; + struct iam_entry *entries; + + entries = frame->entries; + + dx_set_count(entries, 2); + assert_corr(dx_get_limit(entries) == dx_root_limit(path)); + + root = (void *)frame->bh->b_data; + assert_corr(le64_to_cpu(root->ilr_magic) == IAM_LFIX_ROOT_MAGIC); + root->ilr_indirect_levels ++; + frame->at = entries = iam_entry_shift(path, entries, 1); + memset(iam_ikey_at(path, entries), 0, + iam_path_descr(path)->id_ikey_size); + return entries; +} + +static int iam_lfix_node_check(struct iam_path *path, struct iam_frame *frame) +{ + unsigned count; + unsigned limit; + unsigned limit_correct; + struct iam_entry *entries; + + entries = dx_node_get_entries(path, frame); + + if (frame == path->ip_frames) { + struct iam_lfix_root *root; + + root = (void *)frame->bh->b_data; + if (le64_to_cpu(root->ilr_magic) != IAM_LFIX_ROOT_MAGIC) { + return -EIO; + } + limit_correct = dx_root_limit(path); + } else + limit_correct = dx_node_limit(path); + count = dx_get_count(entries); + limit = dx_get_limit(entries); + if (count > limit) { + return -EIO; + } + if (limit != limit_correct) { + return -EIO; + } + return 0; +} + +static int iam_lfix_node_load(struct iam_path *path, struct iam_frame *frame) +{ + struct iam_entry *entries; + void *data; + entries = dx_node_get_entries(path, frame); + + data = frame->bh->b_data; + + if (frame == path->ip_frames) { + struct iam_lfix_root *root; + + root = data; + path->ip_indirect = root->ilr_indirect_levels; + if (path->ip_ikey_target == NULL) + path->ip_ikey_target = + (struct iam_ikey *)path->ip_key_target; + } + frame->entries = frame->at = entries; + return 0; +} + +static int iam_lfix_ikeycmp(const struct iam_container *c, + const struct iam_ikey *k1, + const struct iam_ikey *k2) +{ + return memcmp(k1, k2, c->ic_descr->id_ikey_size); +} + +static struct iam_path_descr *iam_lfix_ipd_alloc(const struct iam_container *c, + void *area) +{ + return iam_ipd_alloc(area, c->ic_descr->id_ikey_size); +} + +static struct iam_operations iam_lfix_ops = { + .id_root_ptr = iam_lfix_root_ptr, + .id_node_read = iam_node_read, + .id_node_init = iam_lfix_node_init, + .id_node_check = iam_lfix_node_check, + .id_node_load = iam_lfix_node_load, + .id_ikeycmp = iam_lfix_ikeycmp, + .id_root_inc = iam_lfix_root_inc, + .id_ipd_alloc = iam_lfix_ipd_alloc, + .id_ipd_free = iam_ipd_free, + .id_name = "lfix" +}; + +static int iam_lfix_guess(struct iam_container *c) +{ + int result; + struct buffer_head *bh; + const struct iam_lfix_root *root; + + assert_corr(c->ic_object != NULL); + + result = iam_node_read(c, iam_lfix_root_ptr(c), NULL, 
&bh); + if (result == 0) { + root = (void *)bh->b_data; + if (le64_to_cpu(root->ilr_magic) == IAM_LFIX_ROOT_MAGIC) { + struct iam_descr *descr; + + descr = c->ic_descr; + descr->id_key_size = le16_to_cpu(root->ilr_keysize); + descr->id_ikey_size = le16_to_cpu(root->ilr_keysize); + descr->id_rec_size = le16_to_cpu(root->ilr_recsize); + descr->id_ptr_size = le16_to_cpu(root->ilr_ptrsize); + descr->id_root_gap = sizeof(struct iam_lfix_root); + descr->id_node_gap = 0; + descr->id_ops = &iam_lfix_ops; + descr->id_leaf_ops = &iam_lfix_leaf_ops; + } else + result = -EBADF; + brelse(bh); + } + return result; +} + +static struct iam_format iam_lfix_format = { + .if_guess = iam_lfix_guess +}; + +void iam_lfix_format_init(void) +{ + iam_format_register(&iam_lfix_format); +} + +/* + * Debugging aid. + */ + +#define KEYSIZE (8) +#define RECSIZE (8) +#define PTRSIZE (4) + +#define LFIX_ROOT_RECNO \ + ((4096 - sizeof(struct iam_lfix_root)) / (KEYSIZE + PTRSIZE)) + +#define LFIX_INDEX_RECNO (4096 / (KEYSIZE + PTRSIZE)) + +#define LFIX_LEAF_RECNO \ + ((4096 - sizeof(struct iam_leaf_head)) / (KEYSIZE + RECSIZE)) + +struct lfix_root { + struct iam_lfix_root lr_root; + struct { + char key[KEYSIZE]; + char ptr[PTRSIZE]; + } lr_entry[LFIX_ROOT_RECNO]; +}; + +struct lfix_index { + struct dx_countlimit li_cl; + char li_padding[KEYSIZE + PTRSIZE - sizeof(struct dx_countlimit)]; + struct { + char key[KEYSIZE]; + char ptr[PTRSIZE]; + } li_entry[LFIX_INDEX_RECNO - 1]; +}; + +struct lfix_leaf { + struct iam_leaf_head ll_head; + struct { + char key[KEYSIZE]; + char rec[RECSIZE]; + } ll_entry[LFIX_LEAF_RECNO]; +}; + +#define STORE_UNALIGNED(val, dst) \ +({ \ + typeof(val) __val = (val); \ + CLASSERT(sizeof(val) == sizeof(*(dst))); \ + memcpy(dst, &__val, sizeof(*(dst))); \ +}) + +#include +#include +#include + +static void lfix_root(void *buf, + int blocksize, int keysize, int ptrsize, int recsize) +{ + struct iam_lfix_root *root; + struct dx_countlimit *limit; + void *entry; + + root = buf; + *root = (typeof(*root)) { + .ilr_magic = cpu_to_le64(IAM_LFIX_ROOT_MAGIC), + .ilr_keysize = cpu_to_le16(keysize), + .ilr_recsize = cpu_to_le16(recsize), + .ilr_ptrsize = cpu_to_le16(ptrsize), + .ilr_indirect_levels = 0 + }; + + limit = (void *)(root + 1); + *limit = (typeof(*limit)){ + /* + * limit itself + one pointer to the leaf. + */ + .count = cpu_to_le16(2), + .limit = iam_root_limit(sizeof(struct iam_lfix_root), + blocksize, keysize + ptrsize) + }; + + entry = root + 1; + /* + * Skip over @limit. + */ + entry += keysize + ptrsize; + + /* + * Entry format is followed by . In the minimal tree + * consisting of a root and single node, is a minimal possible + * key. + * + * XXX: this key is hard-coded to be a sequence of 0's. + */ + + entry += keysize; + /* now @entry points to */ + if (ptrsize == 4) + STORE_UNALIGNED(cpu_to_le32(1), (u_int32_t *)entry); + else + STORE_UNALIGNED(cpu_to_le64(1), (u_int64_t *)entry); +} + +static void lfix_leaf(void *buf, + int blocksize, int keysize, int ptrsize, int recsize) +{ + struct iam_leaf_head *head; + + /* form leaf */ + head = buf; + *head = (struct iam_leaf_head) { + .ill_magic = cpu_to_le16(IAM_LEAF_HEADER_MAGIC), + /* + * Leaf contains an entry with the smallest possible key + * (created by zeroing). 
+ */ + .ill_count = cpu_to_le16(1), + }; +} + +int iam_lfix_create(struct inode *obj, + int keysize, int ptrsize, int recsize, handle_t *handle) +{ + struct buffer_head *root_node; + struct buffer_head *leaf_node; + struct super_block *sb; + + u32 blknr; + int result; + unsigned long bsize; + + assert_corr(obj->i_size == 0); + + sb = obj->i_sb; + bsize = sb->s_blocksize; + root_node = ldiskfs_append(handle, obj, &blknr, &result); + leaf_node = ldiskfs_append(handle, obj, &blknr, &result); + if (root_node != NULL && leaf_node != NULL) { + lfix_root(root_node->b_data, bsize, keysize, ptrsize, recsize); + lfix_leaf(leaf_node->b_data, bsize, keysize, ptrsize, recsize); + ldiskfs_mark_inode_dirty(handle, obj); + result = ldiskfs_journal_dirty_metadata(handle, root_node); + if (result == 0) + result = ldiskfs_journal_dirty_metadata(handle, leaf_node); + if (result != 0) + ldiskfs_std_error(sb, result); + } + brelse(leaf_node); + brelse(root_node); + return result; +} +EXPORT_SYMBOL(iam_lfix_create); diff --git a/lustre/osd/osd_iam_lvar.c b/lustre/osd/osd_iam_lvar.c new file mode 100644 index 0000000..49a77d7 --- /dev/null +++ b/lustre/osd/osd_iam_lvar.c @@ -0,0 +1,1072 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see [sun.com URL with a + * copy of GPLv2]. + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * iam_lvar.c + * + * implementation of iam format for fixed size records, variable sized keys. + * + * Author: Nikita Danilov + */ + +#include +#include +/* ldiskfs_error() */ +#include +#include "osd_internal.h" + +/* + * Leaf operations. + */ + +enum { + IAM_LVAR_LEAF_MAGIC = 0x1973 /* This is duplicated in + * lustre/utils/create_iam.c */ +}; + +/* This is duplicated in lustre/utils/create_iam.c */ +struct lvar_leaf_header { + __le16 vlh_magic; /* magic number IAM_LVAR_LEAF_MAGIC */ + __le16 vlh_used; /* used bytes, including header */ +}; + +/* + * Format of leaf entry: + * + * __le16 keysize + * u8 key[keysize] + * u8 record[rec_size] + * + * Entries are ordered in key order. 
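 *
 * Size sketch (illustrative): getsize() below pads every entry to a
 * LVAR_PAD (4-byte) boundary, so e.g. a 5-byte name with an 8-byte record
 * occupies offsetof(struct lvar_leaf_entry, vle_key) + 5 + 8 =
 * 6 + 5 + 8 = 19 bytes, rounded up to 20 on disk.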
+ */ + +/* This is duplicated in lustre/utils/create_iam.c */ +typedef __u32 lvar_hash_t; + +/* This is duplicated in lustre/utils/create_iam.c */ +struct lvar_leaf_entry { + __le32 vle_hash; + __le16 vle_keysize; + u8 vle_key[0]; +}; + +#define PDIFF(ptr0, ptr1) (((char *)(ptr0)) - ((char *)(ptr1))) + + +static inline int blocksize(const struct iam_leaf *leaf) +{ + return iam_leaf_container(leaf)->ic_object->i_sb->s_blocksize; +} + +static inline const char *kchar(const struct iam_key *key) +{ + return (void *)key; +} + +static inline struct iam_lentry *lvar_lentry(const struct lvar_leaf_entry *ent) +{ + return (struct iam_lentry *)ent; +} + +static inline struct lvar_leaf_entry *lentry_lvar(const struct iam_lentry *lent) +{ + return (struct lvar_leaf_entry *)lent; +} + + +static inline int e_keysize(const struct lvar_leaf_entry *ent) +{ + return le16_to_cpu(ent->vle_keysize); +} + +/* This is duplicated in lustre/utils/create_iam.c */ +enum { + LVAR_PAD = 4, + LVAR_ROUND = LVAR_PAD - 1 +}; + +static inline int getsize(const struct iam_leaf *leaf, int namelen, int recsize) +{ + CLASSERT(!(LVAR_PAD & (LVAR_PAD - 1))); + + return (offsetof(struct lvar_leaf_entry, vle_key) + + namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND; +} + +static inline int rec_size(const struct iam_rec *rec) +{ + return *(const char *)rec; +} + +static inline struct iam_rec *e_rec(const struct lvar_leaf_entry *ent) +{ + return ((void *)ent) + + offsetof(struct lvar_leaf_entry, vle_key) + e_keysize(ent); +} + +static inline int e_size(const struct iam_leaf *leaf, + const struct lvar_leaf_entry *ent) +{ + return getsize(leaf, e_keysize(ent), rec_size(e_rec(ent))); +} + +static inline char *e_char(const struct lvar_leaf_entry *ent) +{ + return (char *)&ent->vle_key; +} + +static inline struct iam_key *e_key(const struct lvar_leaf_entry *ent) +{ + return (struct iam_key *)e_char(ent); +} + +static inline lvar_hash_t e_hash(const struct lvar_leaf_entry *ent) +{ + return le32_to_cpu(ent->vle_hash); +} + +static void e_print(const struct lvar_leaf_entry *ent) +{ + printk(" %p %8.8x \"%*.*s\"\n", ent, e_hash(ent), + e_keysize(ent), e_keysize(ent), e_char(ent)); +} +#if 0 +static int e_check(const struct iam_leaf *leaf, + const struct lvar_leaf_entry *ent) +{ + const void *point = ent; + const void *start = leaf->il_bh->b_data; + return + start + sizeof(struct lvar_leaf_header) <= point && + point + e_size(leaf, ent) < start + blocksize(leaf); +} +#endif + +static inline struct lvar_leaf_entry *e_next(const struct iam_leaf *leaf, + const struct lvar_leaf_entry *ent) +{ + return ((void *)ent) + e_size(leaf, ent); +} + +#define LVAR_HASH_SANDWICH (0) +#define LVAR_HASH_TEA (1) +#define LVAR_HASH_R5 (0) +#define LVAR_HASH_PREFIX (0) + +static __u32 hash_build0(const char *name, int namelen) +{ + __u32 result; + + if (namelen == 0) + return 0; + if (strncmp(name, ".", 1) == 0 && namelen == 1) + return 1; + if (strncmp(name, "..", 2) == 0 && namelen == 2) + return 2; + + if (LVAR_HASH_PREFIX) { + result = 0; + strncpy((void *)&result, + name, min(namelen, (int)sizeof result)); + } else { + struct ldiskfs_dx_hash_info hinfo; + + if (LVAR_HASH_TEA) + hinfo.hash_version = LDISKFS_DX_HASH_TEA; + else + hinfo.hash_version = LDISKFS_DX_HASH_R5; + hinfo.seed = 0; + ldiskfsfs_dirhash(name, namelen, &hinfo); + result = hinfo.hash; + if (LVAR_HASH_SANDWICH) { + __u32 result2; + + hinfo.hash_version = LDISKFS_DX_HASH_TEA; + hinfo.seed = 0; + ldiskfsfs_dirhash(name, namelen, &hinfo); + result2 = hinfo.hash; + result = (0xfc000000 & result2) | 
(0x03ffffff & result); + } + } + return result; +} + +enum { + HASH_GRAY_AREA = 1024, + HASH_MAX_SIZE = 0x7fffffffUL +}; + +static __u32 hash_build(const char *name, int namelen) +{ + __u32 hash; + + hash = (hash_build0(name, namelen) << 1) & HASH_MAX_SIZE; + if (hash > HASH_MAX_SIZE - HASH_GRAY_AREA) + hash &= HASH_GRAY_AREA - 1; + return hash; +} + +static inline lvar_hash_t get_hash(const struct iam_container *bag, + const char *name, int namelen) +{ + return hash_build(name, namelen); +} + +static inline int e_eq(const struct lvar_leaf_entry *ent, + const char *name, int namelen) +{ + return namelen == e_keysize(ent) && !memcmp(e_char(ent), name, namelen); +} + +static inline int e_cmp(const struct iam_leaf *leaf, + const struct lvar_leaf_entry *ent, lvar_hash_t hash) +{ + lvar_hash_t ehash; + + ehash = e_hash(ent); + return ehash == hash ? 0 : (ehash < hash ? -1 : +1); +} + +static struct lvar_leaf_header *n_head(const struct iam_leaf *l) +{ + return (struct lvar_leaf_header *)l->il_bh->b_data; +} + +static int h_used(const struct lvar_leaf_header *hdr) +{ + return le16_to_cpu(hdr->vlh_used); +} + +static void h_used_adj(const struct iam_leaf *leaf, + struct lvar_leaf_header *hdr, int adj) +{ + int used; + + used = h_used(hdr) + adj; + assert_corr(sizeof *hdr <= used && used <= blocksize(leaf)); + hdr->vlh_used = cpu_to_le16(used); +} + +static struct lvar_leaf_entry *n_start(const struct iam_leaf *leaf) +{ + return (void *)leaf->il_bh->b_data + sizeof(struct lvar_leaf_header); +} + +static struct lvar_leaf_entry *n_end(const struct iam_leaf *l) +{ + return (void *)l->il_bh->b_data + h_used(n_head(l)); +} + +static struct lvar_leaf_entry *n_cur(const struct iam_leaf *l) +{ + return lentry_lvar(l->il_at); +} + +void n_print(const struct iam_leaf *l) +{ + struct lvar_leaf_entry *scan; + + printk(KERN_EMERG "used: %d\n", h_used(n_head(l))); + for (scan = n_start(l); scan < n_end(l); scan = e_next(l, scan)) + e_print(scan); +} + +#if LDISKFS_CORRECTNESS_ON +static int n_at_rec(const struct iam_leaf *folio) +{ + return + n_start(folio) <= lentry_lvar(folio->il_at) && + lentry_lvar(folio->il_at) < n_end(folio); +} + +#if LDISKFS_INVARIANT_ON +static int n_invariant(const struct iam_leaf *leaf) +{ + struct iam_path *path; + struct lvar_leaf_entry *scan; + struct lvar_leaf_entry *end; + lvar_hash_t hash; + lvar_hash_t nexthash; + lvar_hash_t starthash; + + end = n_end(leaf); + hash = 0; + path = leaf->il_path; + + if (h_used(n_head(leaf)) > blocksize(leaf)) + return 0; + + /* + * Delimiting key in the parent index node. Clear least bit to account + * for hash collision marker. + */ + starthash = *(lvar_hash_t *)iam_ikey_at(path, path->ip_frame->at) & ~1; + for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) { + nexthash = e_hash(scan); + if (nexthash != get_hash(iam_leaf_container(leaf), + e_char(scan), e_keysize(scan))) { + BREAKPOINT(); + return 0; + } + if (0 && nexthash < starthash) { + /* + * Unfortunately this useful invariant cannot be + * reliably checked as parent node is nor necessarily + * locked. 
+ */ + n_print(leaf); + printk("%#x < %#x\n", nexthash, starthash); + dump_stack(); + return 0; + } + if (nexthash < hash) { + BREAKPOINT(); + return 0; + } + hash = nexthash; + } + if (scan != end) { + BREAKPOINT(); + return 0; + } + return 1; +} +/* LDISKFS_INVARIANT_ON */ +#endif + +/* LDISKFS_CORRECTNESS_ON */ +#endif + +static struct iam_ikey *lvar_ikey(const struct iam_leaf *l, + struct iam_ikey *key) +{ + lvar_hash_t *hash; + + assert_corr(n_at_rec(l)); + + hash = (void *)key; + *hash = e_hash(n_cur(l)); + return key; +} + +static struct iam_key *lvar_key(const struct iam_leaf *l) +{ + return e_key(n_cur(l)); +} + +static int lvar_key_size(const struct iam_leaf *l) +{ + return e_keysize(n_cur(l)); +} + +static void lvar_start(struct iam_leaf *l) +{ + l->il_at = lvar_lentry(n_start(l)); +} + +static int lvar_init(struct iam_leaf *l) +{ + int result; + int used; + struct lvar_leaf_header *head; + + assert_corr(l->il_bh != NULL); + + head = n_head(l); + used = h_used(head); + if (head->vlh_magic == le16_to_cpu(IAM_LVAR_LEAF_MAGIC) && + used <= blocksize(l)) { + l->il_at = l->il_entries = lvar_lentry(n_start(l)); + result = 0; + } else { + struct inode *obj; + + obj = iam_leaf_container(l)->ic_object; + CERROR("Wrong magic in node %llu (#%lu): %#x != %#x or " + "wrong used: %i", + (unsigned long long)l->il_bh->b_blocknr, obj->i_ino, + head->vlh_magic, le16_to_cpu(IAM_LVAR_LEAF_MAGIC), + used); + result = -EIO; + } + return result; +} + +static void lvar_fini(struct iam_leaf *l) +{ + l->il_entries = l->il_at = NULL; +} + +struct iam_rec *lvar_rec(const struct iam_leaf *l) +{ + assert_corr(n_at_rec(l)); + return e_rec(n_cur(l)); +} + +static void lvar_next(struct iam_leaf *l) +{ + assert_corr(n_at_rec(l)); + assert_corr(iam_leaf_is_locked(l)); + l->il_at = lvar_lentry(e_next(l, n_cur(l))); +} + +static int lvar_lookup(struct iam_leaf *leaf, const struct iam_key *k) +{ + struct lvar_leaf_entry *found; + struct lvar_leaf_entry *scan; + struct lvar_leaf_entry *end; + int result; + const char *name; + int namelen; + int found_equal; + lvar_hash_t hash; + int last; + + assert_inv(n_invariant(leaf)); + end = n_end(leaf); + + name = kchar(k); + namelen = strlen(name); + hash = get_hash(iam_leaf_container(leaf), name, namelen); + found = NULL; + found_equal = 0; + last = 1; + + for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) { + lvar_hash_t scan_hash; + + scan_hash = e_hash(scan); + if (scan_hash < hash) + found = scan; + else if (scan_hash == hash) { + if (e_eq(scan, name, namelen)) { + /* + * perfect match + */ + leaf->il_at = lvar_lentry(scan); + return IAM_LOOKUP_EXACT; + } else if (!found_equal) { + found = scan; + found_equal = 1; + } + } else { + last = 0; + break; + } + } + if (found == NULL) { + /* + * @k is less than all hashes in the leaf. + */ + lvar_start(leaf); + result = IAM_LOOKUP_BEFORE; + } else { + leaf->il_at = lvar_lentry(found); + result = IAM_LOOKUP_OK; + assert_corr(n_at_rec(leaf)); + } + if (last) + result |= IAM_LOOKUP_LAST; + assert_inv(n_invariant(leaf)); + + return result; +} + +static int lvar_ilookup(struct iam_leaf *leaf, const struct iam_ikey *ik) +{ + struct lvar_leaf_entry *scan; + struct lvar_leaf_entry *end; + lvar_hash_t hash; + + assert_inv(n_invariant(leaf)); + end = n_end(leaf); + hash = *(const lvar_hash_t *)ik; + + lvar_start(leaf); + for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) { + lvar_hash_t scan_hash; + + scan_hash = e_hash(scan); + if (scan_hash > hash) + return scan == n_start(leaf) ? 
+ IAM_LOOKUP_BEFORE : IAM_LOOKUP_OK; + leaf->il_at = lvar_lentry(scan); + if (scan_hash == hash) + return IAM_LOOKUP_EXACT; + } + assert_inv(n_invariant(leaf)); + /* + * @ik is greater than any key in the node. Return last record in the + * node. + */ + return IAM_LOOKUP_OK; +} + +static void __lvar_key_set(struct iam_leaf *l, const struct iam_key *k) +{ + memcpy(e_key(n_cur(l)), k, e_keysize(n_cur(l))); +} + +static void lvar_key_set(struct iam_leaf *l, const struct iam_key *k) +{ + assert_corr(n_at_rec(l)); + assert_corr(strlen(kchar(k)) == e_keysize(n_cur(l))); + assert_corr(iam_leaf_is_locked(l)); + __lvar_key_set(l, k); + assert_inv(n_invariant(l)); +} + +static int lvar_key_cmp(const struct iam_leaf *l, const struct iam_key *k) +{ + lvar_hash_t hash; + const char *name; + + name = kchar(k); + + hash = get_hash(iam_leaf_container(l), name, strlen(name)); + return e_cmp(l, n_cur(l), hash); +} + +static int lvar_key_eq(const struct iam_leaf *l, const struct iam_key *k) +{ + const char *name; + + name = kchar(k); + return e_eq(n_cur(l), name, strlen(name)); +} + +static void __lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r) +{ + memcpy(e_rec(n_cur(l)), r, rec_size(r)); +} + +static void lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r) +{ + assert_corr(n_at_rec(l)); + assert_corr(iam_leaf_is_locked(l)); + __lvar_rec_set(l, r); + assert_inv(n_invariant(l)); +} + +static void lvar_rec_get(const struct iam_leaf *l, struct iam_rec *r) +{ + struct iam_rec *rec; + + rec = e_rec(n_cur(l)); + assert_corr(n_at_rec(l)); + assert_corr(iam_leaf_is_locked(l)); + memcpy(r, rec, rec_size(rec)); + assert_inv(n_invariant(l)); +} + +static int lvar_can_add(const struct iam_leaf *l, + const struct iam_key *k, const struct iam_rec *r) +{ + assert_corr(iam_leaf_is_locked(l)); + return + h_used(n_head(l)) + + getsize(l, strlen(kchar(k)), rec_size(r)) <= blocksize(l); +} + +static int lvar_at_end(const struct iam_leaf *folio) +{ + assert_corr(iam_leaf_is_locked(folio)); + return n_cur(folio) == n_end(folio); +} + +static void lvar_rec_add(struct iam_leaf *leaf, + const struct iam_key *k, const struct iam_rec *r) +{ + const char *key; + int ksize; + int shift; + void *end; + void *start; + ptrdiff_t diff; + + assert_corr(lvar_can_add(leaf, k, r)); + assert_inv(n_invariant(leaf)); + assert_corr(iam_leaf_is_locked(leaf)); + + key = kchar(k); + ksize = strlen(key); + shift = getsize(leaf, ksize, rec_size(r)); + + if (!lvar_at_end(leaf)) { + assert_corr(n_cur(leaf) < n_end(leaf)); + end = n_end(leaf); + if (lvar_key_cmp(leaf, k) <= 0) + lvar_next(leaf); + else + /* + * Another exceptional case: insertion with the key + * less than least key in the leaf. 
+ */ + assert_corr(leaf->il_at == leaf->il_entries); + + start = leaf->il_at; + diff = PDIFF(end, start); + assert_corr(diff >= 0); + memmove(start + shift, start, diff); + } + h_used_adj(leaf, n_head(leaf), shift); + n_cur(leaf)->vle_keysize = cpu_to_le16(ksize); + n_cur(leaf)->vle_hash = cpu_to_le32(get_hash(iam_leaf_container(leaf), + key, ksize)); + __lvar_key_set(leaf, k); + __lvar_rec_set(leaf, r); + assert_corr(n_at_rec(leaf)); + assert_inv(n_invariant(leaf)); +} + +static void lvar_rec_del(struct iam_leaf *leaf, int shift) +{ + void *next; + void *end; + int nob; + + assert_corr(n_at_rec(leaf)); + assert_inv(n_invariant(leaf)); + assert_corr(iam_leaf_is_locked(leaf)); + + end = n_end(leaf); + next = e_next(leaf, n_cur(leaf)); + nob = e_size(leaf, n_cur(leaf)); + memmove(leaf->il_at, next, end - next); + h_used_adj(leaf, n_head(leaf), -nob); + assert_inv(n_invariant(leaf)); +} + +static void lvar_init_new(struct iam_container *c, struct buffer_head *bh) +{ + struct lvar_leaf_header *hdr; + + hdr = (struct lvar_leaf_header *)bh->b_data; + hdr->vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC); + hdr->vlh_used = sizeof *hdr; +} + +static struct lvar_leaf_entry *find_pivot(const struct iam_leaf *leaf, + struct lvar_leaf_entry **prev) +{ + void *scan; + void *start; + int threshold; + + *prev = NULL; + threshold = blocksize(leaf) / 2; + for (scan = start = n_start(leaf); scan - start <= threshold; + *prev = scan, scan = e_next(leaf, scan)) { + ; + } + return scan; +} + +static void lvar_split(struct iam_leaf *leaf, struct buffer_head **bh, + iam_ptr_t new_blknr) +{ + struct lvar_leaf_entry *first_to_move; + struct lvar_leaf_entry *last_to_stay; + struct iam_path *path; + struct lvar_leaf_header *hdr; + struct buffer_head *new_leaf; + + ptrdiff_t tomove; + lvar_hash_t hash; + + assert_inv(n_invariant(leaf)); + assert_corr(iam_leaf_is_locked(leaf)); + + new_leaf = *bh; + path = iam_leaf_path(leaf); + + hdr = (void *)new_leaf->b_data; + + first_to_move = find_pivot(leaf, &last_to_stay); + assert_corr(last_to_stay != NULL); + assert_corr(e_next(leaf, last_to_stay) == first_to_move); + + hash = e_hash(first_to_move); + if (hash == e_hash(last_to_stay)) + /* + * Duplicate hash. + */ + hash |= 1; + + tomove = PDIFF(n_end(leaf), first_to_move); + memmove(hdr + 1, first_to_move, tomove); + + h_used_adj(leaf, hdr, tomove); + h_used_adj(leaf, n_head(leaf), -tomove); + + assert_corr(n_end(leaf) == first_to_move); + + if (n_cur(leaf) >= first_to_move) { + /* + * insertion point moves into new leaf. + */ + ptrdiff_t shift; + int result; + + shift = PDIFF(leaf->il_at, first_to_move); + *bh = leaf->il_bh; + leaf->il_bh = new_leaf; + leaf->il_curidx = new_blknr; + + assert_corr(iam_leaf_is_locked(leaf)); + result = lvar_init(leaf); + /* + * init cannot fail, as node was just initialized. + */ + assert_corr(result == 0); + leaf->il_at = ((void *)leaf->il_at) + shift; + } + /* + * Insert pointer to the new node (together with the least key in + * the node) into index node. 
+ */ + iam_insert_key_lock(path, path->ip_frame, (struct iam_ikey *)&hash, + new_blknr); + assert_corr(n_cur(leaf) < n_end(leaf)); + assert_inv(n_invariant(leaf)); +} + +static struct iam_leaf_operations lvar_leaf_ops = { + .init = lvar_init, + .init_new = lvar_init_new, + .fini = lvar_fini, + .start = lvar_start, + .next = lvar_next, + .key = lvar_key, + .ikey = lvar_ikey, + .rec = lvar_rec, + .key_set = lvar_key_set, + .key_cmp = lvar_key_cmp, + .key_eq = lvar_key_eq, + .key_size = lvar_key_size, + .rec_set = lvar_rec_set, + .rec_get = lvar_rec_get, + .lookup = lvar_lookup, + .ilookup = lvar_ilookup, + .at_end = lvar_at_end, + .rec_add = lvar_rec_add, + .rec_del = lvar_rec_del, + .can_add = lvar_can_add, + .split = lvar_split +}; + +/* + * Index operations. + */ + +enum { + /* This is duplicated in lustre/utils/create_iam.c */ + /* egrep -i '^o?x?[olabcdef]*$' /usr/share/dict/words */ + IAM_LVAR_ROOT_MAGIC = 0xb01dface +}; + +/* This is duplicated in lustre/utils/create_iam.c */ +struct lvar_root { + __le32 vr_magic; + __le16 vr_recsize; + __le16 vr_ptrsize; + u8 vr_indirect_levels; + u8 vr_padding0; + __le16 vr_padding1; +}; + +static __u32 lvar_root_ptr(struct iam_container *c) +{ + return 0; +} + +static int lvar_node_init(struct iam_container *c, struct buffer_head *bh, + int root) +{ + return 0; +} + +static struct iam_entry *lvar_root_inc(struct iam_container *c, + struct iam_path *path, + struct iam_frame *frame) +{ + struct lvar_root *root; + struct iam_entry *entries; + + assert_corr(iam_frame_is_locked(path, frame)); + entries = frame->entries; + + dx_set_count(entries, 2); + assert_corr(dx_get_limit(entries) == dx_root_limit(path)); + + root = (void *)frame->bh->b_data; + assert_corr(le64_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC); + root->vr_indirect_levels ++; + frame->at = entries = iam_entry_shift(path, entries, 1); + memset(iam_ikey_at(path, entries), 0, + iam_path_descr(path)->id_ikey_size); + return entries; +} + +static int lvar_node_check(struct iam_path *path, struct iam_frame *frame) +{ + unsigned count; + unsigned limit; + unsigned limit_correct; + struct iam_entry *entries; + + entries = dx_node_get_entries(path, frame); + + if (frame == path->ip_frames) { + struct lvar_root *root; + + root = (void *)frame->bh->b_data; + if (le64_to_cpu(root->vr_magic) != IAM_LVAR_ROOT_MAGIC) + return -EIO; + limit_correct = dx_root_limit(path); + } else + limit_correct = dx_node_limit(path); + count = dx_get_count(entries); + limit = dx_get_limit(entries); + if (count > limit) + return -EIO; + if (limit != limit_correct) + return -EIO; + return 0; +} + +static int lvar_node_load(struct iam_path *path, struct iam_frame *frame) +{ + struct iam_entry *entries; + void *data; + entries = dx_node_get_entries(path, frame); + + data = frame->bh->b_data; + + if (frame == path->ip_frames) { + struct lvar_root *root; + const char *name; + + root = data; + name = kchar(path->ip_key_target); + path->ip_indirect = root->vr_indirect_levels; + if (path->ip_ikey_target == NULL) { + path->ip_ikey_target = iam_path_ikey(path, 4); + *(lvar_hash_t *)path->ip_ikey_target = + get_hash(path->ip_container, name, + strlen(name)); + } + } + frame->entries = frame->at = entries; + return 0; +} + +static int lvar_ikeycmp(const struct iam_container *c, + const struct iam_ikey *k1, const struct iam_ikey *k2) +{ + lvar_hash_t p1 = le32_to_cpu(*(lvar_hash_t *)k1); + lvar_hash_t p2 = le32_to_cpu(*(lvar_hash_t *)k2); + + return p1 > p2 ? +1 : (p1 < p2 ? 
-1 : 0); +} + +static struct iam_path_descr *lvar_ipd_alloc(const struct iam_container *c, + void *area) +{ + return iam_ipd_alloc(area, c->ic_descr->id_ikey_size); +} + +static void lvar_root(void *buf, + int blocksize, int keysize, int ptrsize, int recsize) +{ + struct lvar_root *root; + struct dx_countlimit *limit; + void *entry; + int isize; + + isize = sizeof(lvar_hash_t) + ptrsize; + root = buf; + *root = (typeof(*root)) { + .vr_magic = cpu_to_le32(IAM_LVAR_ROOT_MAGIC), + .vr_recsize = cpu_to_le16(recsize), + .vr_ptrsize = cpu_to_le16(ptrsize), + .vr_indirect_levels = 0 + }; + + limit = (void *)(root + 1); + *limit = (typeof(*limit)){ + /* + * limit itself + one pointer to the leaf. + */ + .count = cpu_to_le16(2), + .limit = iam_root_limit(sizeof(struct lvar_root), blocksize, + sizeof (lvar_hash_t) + ptrsize) + }; + + entry = root + 1; + /* + * Skip over @limit. + */ + entry += isize; + + /* + * Entry format is followed by . In the minimal tree + * consisting of a root and single node, is a minimal possible + * key. + */ + *(lvar_hash_t *)entry = 0; + entry += sizeof(lvar_hash_t); + /* now @entry points to */ + if (ptrsize == 4) + *(u_int32_t *)entry = cpu_to_le32(1); + else + *(u_int64_t *)entry = cpu_to_le64(1); +} + +static int lvar_esize(int namelen, int recsize) +{ + return (offsetof(struct lvar_leaf_entry, vle_key) + + namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND; +} + +static void lvar_leaf(void *buf, + int blocksize, int keysize, int ptrsize, int recsize) +{ + struct lvar_leaf_header *head; + struct lvar_leaf_entry *entry; + + /* form leaf */ + head = buf; + *head = (typeof(*head)) { + .vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC), + .vlh_used = cpu_to_le16(sizeof *head + lvar_esize(0, recsize)) + }; + entry = (void *)(head + 1); + *entry = (typeof(*entry)) { + .vle_hash = 0, + .vle_keysize = 0 + }; + memset(e_rec(entry), 0, recsize); + *(char *)e_rec(entry) = recsize; +} + +#include +#include +#include + +int iam_lvar_create(struct inode *obj, + int keysize, int ptrsize, int recsize, handle_t *handle) +{ + struct buffer_head *root_node; + struct buffer_head *leaf_node; + struct super_block *sb; + + u32 blknr; + int result; + unsigned long bsize; + + assert_corr(obj->i_size == 0); + + sb = obj->i_sb; + bsize = sb->s_blocksize; + root_node = ldiskfs_append(handle, obj, &blknr, &result); + leaf_node = ldiskfs_append(handle, obj, &blknr, &result); + if (root_node != NULL && leaf_node != NULL) { + lvar_root(root_node->b_data, bsize, keysize, ptrsize, recsize); + lvar_leaf(leaf_node->b_data, bsize, keysize, ptrsize, recsize); + ldiskfs_mark_inode_dirty(handle, obj); + result = ldiskfs_journal_dirty_metadata(handle, root_node); + if (result == 0) + result = ldiskfs_journal_dirty_metadata(handle, leaf_node); + if (result != 0) + ldiskfs_std_error(sb, result); + } + brelse(leaf_node); + brelse(root_node); + return result; +} +EXPORT_SYMBOL(iam_lvar_create); + +static struct iam_operations lvar_ops = { + .id_root_ptr = lvar_root_ptr, + .id_node_read = iam_node_read, + .id_node_init = lvar_node_init, + .id_node_check = lvar_node_check, + .id_node_load = lvar_node_load, + .id_ikeycmp = lvar_ikeycmp, + .id_root_inc = lvar_root_inc, + .id_ipd_alloc = lvar_ipd_alloc, + .id_ipd_free = iam_ipd_free, + .id_name = "lvar" +}; + +static int lvar_guess(struct iam_container *c) +{ + int result; + struct buffer_head *bh; + const struct lvar_root *root; + + assert_corr(c->ic_object != NULL); + + result = iam_node_read(c, lvar_root_ptr(c), NULL, &bh); + if (result == 0) { + root = (void 
*)bh->b_data;
+                if (le64_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC) {
+                        struct iam_descr *descr;
+
+                        descr = c->ic_descr;
+                        descr->id_key_size = LDISKFS_NAME_LEN;
+                        descr->id_ikey_size = sizeof (lvar_hash_t);
+                        descr->id_rec_size = le16_to_cpu(root->vr_recsize);
+                        descr->id_ptr_size = le16_to_cpu(root->vr_ptrsize);
+                        descr->id_root_gap = sizeof *root;
+                        descr->id_node_gap = 0;
+                        descr->id_ops = &lvar_ops;
+                        descr->id_leaf_ops = &lvar_leaf_ops;
+                } else
+                        result = -EBADF;
+                brelse(bh);
+        }
+        return result;
+}
+
+static struct iam_format lvar_format = {
+        .if_guess = lvar_guess
+};
+
+void iam_lvar_format_init(void)
+{
+        iam_format_register(&lvar_format);
+}
+
diff --git a/lustre/osd/osd_internal.h b/lustre/osd/osd_internal.h
index 40f9ee4..14910aa 100644
--- a/lustre/osd/osd_internal.h
+++ b/lustre/osd/osd_internal.h
@@ -53,7 +53,6 @@
 #include
 /* struct dentry */
 #include
-#include
 /* struct dirent64 */
 #include
@@ -65,6 +64,7 @@
 #include
 #include "osd_oi.h"
+#include "osd_iam.h"
 struct inode;
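
For reference, the lvar leaf layout written by lvar_init_new() and lvar_leaf() above can be
pictured from user space (the same structures are duplicated in lustre/utils/create_iam.c).
The sketch below is only an illustration under simplifying assumptions: host-endian fields
instead of the __le16/__le32 wrappers, a hard-coded 4096-byte block, and a hypothetical
8-byte record size; it is not part of the patch.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define LVAR_PAD            4
#define LVAR_ROUND          (LVAR_PAD - 1)
#define IAM_LVAR_LEAF_MAGIC 0x1973

struct lvar_leaf_header {               /* mirrors osd_iam_lvar.c */
        uint16_t vlh_magic;             /* IAM_LVAR_LEAF_MAGIC */
        uint16_t vlh_used;              /* bytes used, including header */
};

struct lvar_leaf_entry {                /* hash, key length, key, record */
        uint32_t vle_hash;
        uint16_t vle_keysize;
        uint8_t  vle_key[];             /* key bytes followed by the record */
};

/* entry size rounded up to LVAR_PAD, as getsize() does in the patch */
static int entry_size(int namelen, int recsize)
{
        return (int)((offsetof(struct lvar_leaf_entry, vle_key) +
                      namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND);
}

int main(void)
{
        char block[4096];                        /* assumed block size */
        struct lvar_leaf_header *head = (void *)block;
        struct lvar_leaf_entry  *ent  = (void *)(head + 1);
        int recsize = 8;                         /* hypothetical record size */

        memset(block, 0, sizeof block);

        /* empty leaf, as written by lvar_init_new(): header only */
        head->vlh_magic = IAM_LVAR_LEAF_MAGIC;
        head->vlh_used  = sizeof *head;

        /* one entry with the smallest (empty) key, as in lvar_leaf() */
        ent->vle_hash    = 0;
        ent->vle_keysize = 0;
        ent->vle_key[0]  = (uint8_t)recsize;     /* rec_size() reads byte 0 */
        head->vlh_used  += (uint16_t)entry_size(0, recsize);

        printf("header %zu, entry %d, used %u of %zu bytes\n",
               sizeof *head, entry_size(0, recsize),
               (unsigned)head->vlh_used, sizeof block);
        return 0;
}

With these assumed sizes the 6-byte entry header plus an empty key and an 8-byte record
rounds up to 16 bytes, which is why getsize() adds LVAR_ROUND before masking with ~LVAR_ROUND.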