From dac045a34e49a1d5a536181c801954976ecd9dac Mon Sep 17 00:00:00 2001 From: tappro Date: Thu, 14 Jun 2007 10:07:23 +0000 Subject: [PATCH] - add cmd3 patches for ldiskfs --- .../patches/ext3-hash-selection.patch | 125 + .../patches/ext3-htree-comments.patch | 1643 +++++ .../patches/ext3-htree-path-ops.patch | 1125 ++++ .../kernel_patches/patches/ext3-htree-path.patch | 406 ++ .../patches/ext3-htree-r5-hash.patch | 88 + ldiskfs/kernel_patches/patches/ext3-iam-ops.patch | 1178 ++++ .../kernel_patches/patches/ext3-iam-separate.patch | 6758 ++++++++++++++++++++ ldiskfs/kernel_patches/patches/ext3-iam-uapi.patch | 1408 ++++ .../patches/ext3-orphans-delay.patch | 42 + .../patches/ext3-pdirops-2.6.9.patch | 1247 ++++ .../kernel_patches/patches/ext3-tall-htree.patch | 431 ++ .../patches/ext3-wantedi-2.6-rhel4.patch | 23 +- .../kernel_patches/series/ldiskfs-2.6-rhel4.series | 17 +- 13 files changed, 14469 insertions(+), 22 deletions(-) create mode 100644 ldiskfs/kernel_patches/patches/ext3-hash-selection.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-htree-comments.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-htree-path-ops.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-htree-path.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-htree-r5-hash.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-iam-ops.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-iam-separate.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-iam-uapi.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-orphans-delay.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-pdirops-2.6.9.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-tall-htree.patch diff --git a/ldiskfs/kernel_patches/patches/ext3-hash-selection.patch b/ldiskfs/kernel_patches/patches/ext3-hash-selection.patch new file mode 100644 index 0000000..40eb9fe --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-hash-selection.patch @@ -0,0 +1,125 @@ +Index: linux-2.6.9/fs/ext3/hash.c +=================================================================== +--- linux-2.6.9.orig/fs/ext3/hash.c 2006-04-23 22:39:01.000000000 +0800 ++++ linux-2.6.9/fs/ext3/hash.c 2006-04-23 22:39:16.000000000 +0800 +@@ -127,6 +127,11 @@ + return a; + } + ++static __u32 dx_same_hash(const signed char *msg, int len) ++{ ++ return 0xcafebabeUL; ++} ++ + static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) + { + __u32 pad, val; +@@ -220,6 +225,9 @@ + case DX_HASH_R5: + hash = dx_r5_hash(name, len); + break; ++ case DX_HASH_SAME: ++ hash = dx_same_hash(name, len); ++ break; + default: + hinfo->hash = 0; + return -1; +Index: linux-2.6.9/fs/ext3/super.c +=================================================================== +--- linux-2.6.9.orig/fs/ext3/super.c 2006-08-17 09:26:01.000000000 +0300 ++++ linux-2.6.9/fs/ext3/super.c 2006-08-17 09:31:22.000000000 +0300 +@@ -599,6 +599,7 @@ enum { + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_extents, Opt_noextents, Opt_extdebug, + Opt_mballoc, Opt_nomballoc, Opt_stripe, ++ Opt_hashfunc, + }; + + static match_table_t tokens = { +@@ -655,6 +656,7 @@ static match_table_t tokens = { + {Opt_stripe, "stripe=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, ++ {Opt_hashfunc,"hash=%s"}, + }; + + static unsigned long get_sb_block(void **data) +@@ -679,6 +681,7 @@ static unsigned long get_sb_block(void * + return sb_block; + } + ++int user_selected_hash_function = -1; + static int parse_options (char * options, struct super_block 
*sb, + unsigned long * inum, unsigned long *n_blocks_count, int is_remount) + { +@@ -980,6 +983,22 @@ clear_qf_name: + return 0; + sbi->s_stripe = option; + break; ++ case Opt_hashfunc: ++ if (strncmp (args[0].from,"legacy",6) == 0){ ++ user_selected_hash_function = 0; ++ } else if (strncmp (args[0].from,"half_md4",8) == 0){ ++ user_selected_hash_function = 1; ++ } else if (strncmp (args[0].from,"tea",3) == 0){ ++ user_selected_hash_function = 2; ++ } else if (strncmp (args[0].from,"r5",2) == 0){ ++ user_selected_hash_function = 3; ++ } else if (strncmp (args[0].from,"same",4) == 0){ ++ user_selected_hash_function = 4; ++ } else { ++ printk ("Hashfunc name wrong\n"); ++ return 0; ++ } ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +Index: linux-2.6.9/fs/ext3/namei.c +=================================================================== +--- linux-2.6.9.orig/fs/ext3/namei.c 2006-04-23 22:39:02.000000000 +0800 ++++ linux-2.6.9/fs/ext3/namei.c 2006-04-23 22:39:16.000000000 +0800 +@@ -365,10 +365,7 @@ + struct htree_cookie *hc = cookie; + + root = data; +- if (root->info.hash_version != DX_HASH_TEA && +- root->info.hash_version != DX_HASH_HALF_MD4 && +- root->info.hash_version != DX_HASH_R5 && +- root->info.hash_version != DX_HASH_LEGACY) { ++ if (root->info.hash_version > DX_HASH_MAX) { + ext3_warning(sb, __FUNCTION__, + "Unrecognised inode hash code %d", + root->info.hash_version); +@@ -1467,6 +1464,7 @@ + * This converts a one block unindexed directory to a 3 block indexed + * directory, and adds the dentry to the indexed directory. + */ ++extern int user_selected_hash_function; + static int make_indexed_dir(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct buffer_head *bh) + { +@@ -1522,7 +1520,9 @@ + memset (&root->info, 0, sizeof(root->info)); + root->info.info_length = sizeof(root->info); + root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; +- root->info.hash_version = DX_HASH_R5; ++ if (user_selected_hash_function >= 0 && ++ user_selected_hash_function <= DX_HASH_MAX) ++ root->info.hash_version = user_selected_hash_function; + entries = (void *)root->entries; + dx_set_block (&path, entries, 1); + dx_set_count (entries, 1); +Index: linux-2.6.9/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.9.orig/include/linux/ext3_fs.h 2006-04-23 22:39:01.000000000 +0800 ++++ linux-2.6.9/include/linux/ext3_fs.h 2006-04-23 22:39:16.000000000 +0800 +@@ -665,6 +665,8 @@ + #define DX_HASH_HALF_MD4 1 + #define DX_HASH_TEA 2 + #define DX_HASH_R5 3 ++#define DX_HASH_SAME 4 ++#define DX_HASH_MAX 4 + + /* hash info structure used by the directory hash */ + struct dx_hash_info diff --git a/ldiskfs/kernel_patches/patches/ext3-htree-comments.patch b/ldiskfs/kernel_patches/patches/ext3-htree-comments.patch new file mode 100644 index 0000000..159add6 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-htree-comments.patch @@ -0,0 +1,1643 @@ +Index: linux-2.6.9/fs/ext3/namei.c +=================================================================== +--- linux-2.6.9.orig/fs/ext3/namei.c 2006-04-17 18:32:12.000000000 +0800 ++++ linux-2.6.9/fs/ext3/namei.c 2006-04-23 21:40:41.000000000 +0800 +@@ -24,6 +24,78 @@ + * Theodore Ts'o, 2002 + */ + ++/* ++ * iam: big theory statement. ++ * ++ * iam (Index Access Module) is a module providing abstraction of persistent ++ * transactional container on top of generalized ext3 htree. 
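++ *
++ * For orientation only, a minimal, hedged usage sketch (the interfaces are
++ * declared further down in this file; descr, dir, key and rec stand for
++ * caller-supplied values, and error handling is omitted):
++ *
++ *     struct iam_container c;
++ *
++ *     result = iam_container_init(&c, descr, dir);
++ *     if (result == 0) {
++ *             result = iam_lookup(&c, key, rec);
++ *             iam_container_fini(&c);
++ *     }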
++ * ++ * iam supports: ++ * ++ * - key, pointer, and record size specifiable per container. ++ * ++ * - trees taller than 2 index levels. ++ * ++ * - read/write to existing ext3 htree directories as iam containers. ++ * ++ * iam container is a tree, consisting of leaf nodes containing keys and ++ * records stored in this container, and index nodes, containing keys and ++ * pointers to leaf or index nodes. ++ * ++ * iam does not work with keys directly, instead it calls user-supplied key ++ * comparison function (->dpo_keycmp()). ++ * ++ * Pointers are (currently) interpreted as logical offsets (measured in ++ * blocksful) within underlying flat file on top of which iam tree lives. ++ * ++ * On-disk format: ++ * ++ * iam mostly tries to reuse existing htree formats. ++ * ++ * Format of index node: ++ * ++ * +-----+-------+-------+-------+------+-------+------------+ ++ * | | count | | | | | | ++ * | gap | / | entry | entry | .... | entry | free space | ++ * | | limit | | | | | | ++ * +-----+-------+-------+-------+------+-------+------------+ ++ * ++ * gap this part of node is never accessed by iam code. It ++ * exists for binary compatibility with ext3 htree (that, ++ * in turn, stores fake struct ext2_dirent for ext2 ++ * compatibility), and to keep some unspecified per-node ++ * data. Gap can be different for root and non-root index ++ * nodes. Gap size can be specified for each container ++ * (gap of 0 is allowed). ++ * ++ * count/limit current number of entries in this node, and the maximal ++ * number of entries that can fit into node. count/limit ++ * has the same size as entry, and is itself counted in ++ * count. ++ * ++ * entry index entry: consists of a key immediately followed by ++ * a pointer to a child node. Size of a key and size of a ++ * pointer depends on container. Entry has neither ++ * alignment nor padding. ++ * ++ * free space portion of node new entries are added to ++ * ++ * Entries in index node are sorted by their key value. ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ */ ++ + #include + #include + #include +@@ -98,14 +170,6 @@ + __le16 count; + }; + +-struct dx_entry; /* incomplete type */ +-struct dx_key; /* incomplete type */ +- +-struct dx_entry_compat { +- __le32 hash; +- __le32 block; +-}; +- + /* + * dx_root_info is laid out so that if it should somehow get overlaid by a + * dirent the two low bits of the hash version will be zero. Therefore, the +@@ -135,111 +199,513 @@ + struct {} entries[0]; + }; + +- +-struct dx_frame +-{ +- struct buffer_head *bh; +- struct dx_entry *entries; +- struct dx_entry *at; +-}; +- + struct dx_map_entry + { + u32 hash; + u32 offs; + }; + +-struct dx_path; +-struct dx_param { +- size_t dpo_key_size; +- size_t dpo_ptr_size; +- size_t dpo_node_gap; +- size_t dpo_root_gap; +- +- u32 (*dpo_root_ptr)(struct dx_path *path); +- int (*dpo_node_check)(struct dx_path *path, +- struct dx_frame *frame, void *cookie); +- int (*dpo_node_init)(struct dx_path *path, +- struct buffer_head *bh, int root); +- int (*dpo_keycmp)(struct dx_path *path, +- struct dx_key *k1, struct dx_key *k2); ++/* ++ * Entry within index tree node. Consists of a key immediately followed ++ * (without padding) by a pointer to the child node. ++ * ++ * Both key and pointer are of variable size, hence incomplete type. ++ */ ++struct iam_entry; ++ ++struct iam_entry_compat { ++ __le32 hash; ++ __le32 block; ++}; ++ ++/* ++ * Incomplete type used to refer to keys in iam container. 
++ * ++ * As key size can be different from container to container, iam has to use ++ * incomplete type. Clients cast pointer to iam_key to real key type and back. ++ */ ++struct iam_key; ++ ++/* Incomplete type use to refer to the records stored in iam containers. */ ++struct iam_rec; ++ ++typedef __u64 iam_ptr_t; ++ ++/* ++ * Index node traversed during tree lookup. ++ */ ++struct iam_frame { ++ struct buffer_head *bh; /* buffer holding node data */ ++ struct iam_entry *entries; /* array of entries */ ++ struct iam_entry *at; /* target entry, found by binary search */ ++}; ++ ++/* leaf node reached by tree lookup */ ++struct iam_leaf { ++ struct buffer_head *bh; ++ struct iam_leaf_entry *entries; ++ struct iam_leaf_entry *at; ++}; ++ ++struct iam_path; ++struct iam_container; ++ ++/* ++ * Parameters, describing a flavor of iam container. ++ */ ++struct iam_descr { ++ /* ++ * Size of a key in this container, in bytes. ++ */ ++ size_t id_key_size; ++ /* ++ * Size of a pointer to the next level (stored in index nodes), in ++ * bytes. ++ */ ++ size_t id_ptr_size; ++ /* ++ * Size of a record (stored in leaf nodes), in bytes. ++ */ ++ size_t id_rec_size; ++ /* ++ * Size of unused (by iam) space at the beginning of every non-root ++ * node, in bytes. Used for compatibility with ext3. ++ */ ++ size_t id_node_gap; ++ /* ++ * Size of unused (by iam) space at the beginning of root node, in ++ * bytes. Used for compatibility with ext3. ++ */ ++ size_t id_root_gap; ++ ++ /* ++ * Returns pointer (in the same sense as pointer in index entry) to ++ * the root node. ++ */ ++ __u32 (*id_root_ptr)(struct iam_container *c); ++ ++ /* ++ * Check validity and consistency of index node. This is called when ++ * iam just loaded new node into frame. ++ */ ++ int (*id_node_check)(struct iam_path *path, struct iam_frame *frame); ++ /* ++ * Initialize new node (stored in @bh) that is going to be added into ++ * tree. ++ */ ++ int (*id_node_init)(struct iam_container *c, ++ struct buffer_head *bh, int root); ++ int (*id_node_read)(struct iam_container *c, iam_ptr_t ptr, ++ handle_t *h, struct buffer_head **bh); ++ /* ++ * Key comparison function. Returns -1, 0, +1. ++ */ ++ int (*id_keycmp)(struct iam_container *c, ++ struct iam_key *k1, struct iam_key *k2); ++ /* ++ * Create new container. ++ * ++ * Newly created container has a root node and a single leaf. Leaf ++ * contains single record with the smallest possible key. ++ */ ++ int (*id_create)(struct iam_container *c); ++ struct { ++ /* ++ * leaf operations. ++ */ ++ /* ++ * returns true iff leaf is positioned at the last entry. ++ */ ++ int (*at_end)(struct iam_container *c, struct iam_leaf *l); ++ /* position leaf at the first entry */ ++ void (*start)(struct iam_container *c, struct iam_leaf *l); ++ /* more leaf to the next entry. */ ++ void (*next)(struct iam_container *c, struct iam_leaf *l); ++ /* return key of current leaf record in @k */ ++ void (*key)(struct iam_container *c, struct iam_leaf *l, ++ struct iam_key *k); ++ /* return pointer to entry body */ ++ struct iam_rec *(*rec)(struct iam_container *c, ++ struct iam_leaf *l); ++ } id_leaf; ++}; ++ ++struct iam_container { ++ /* ++ * Underlying flat file. IO against this object is issued to ++ * read/write nodes. ++ */ ++ struct inode *ic_object; ++ /* ++ * container flavor. ++ */ ++ struct iam_descr *ic_descr; ++ /* ++ * pointer to flavor-specific per-container data. ++ */ ++ void *ic_descr_data; + }; + + /* + * Structure to keep track of a path drilled through htree. 
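+ * (Filled in by lookup: iam_path_init() attaches the path to a container,
+ * dx_lookup() below loads each traversed index node into ->ip_frames[],
+ * and iam_path_fini() releases the buffers again.)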
+ */ +-struct dx_path { +- struct inode *dp_object; +- struct dx_param *dp_param; +- int dp_indirect; +- struct dx_frame dp_frames[DX_MAX_TREE_HEIGHT]; +- struct dx_frame *dp_frame; +- struct dx_key *dp_key_target; +- struct dx_key *dp_key_scratch[DX_SCRATCH_KEYS]; +-}; +- +-struct dx_path_compat { +- struct dx_path dpc_path; +- __u32 dpc_scrach[DX_SCRATCH_KEYS]; +-}; +- +-static u32 htree_root_ptr(struct dx_path *p); +-static int htree_node_check(struct dx_path *path, +- struct dx_frame *frame, void *cookie); +-static int htree_node_init(struct dx_path *path, ++struct iam_path { ++ /* ++ * Parent container. ++ */ ++ struct iam_container *ip_container; ++ /* ++ * Number of index levels minus one. ++ */ ++ int ip_indirect; ++ /* ++ * Nodes that top-to-bottom traversal passed through. ++ */ ++ struct iam_frame ip_frames[DX_MAX_TREE_HEIGHT]; ++ /* ++ * Last filled frame in ->ip_frames. Refers to the 'twig' node (one ++ * immediately above leaf). ++ */ ++ struct iam_frame *ip_frame; ++ /* ++ * Leaf node: a child of ->ip_frame. ++ */ ++ struct iam_leaf *ip_leaf; ++ /* ++ * Key searched for. ++ */ ++ struct iam_key *ip_key_target; ++ /* ++ * Scratch-pad area for temporary keys. ++ */ ++ struct iam_key *ip_key_scratch[DX_SCRATCH_KEYS]; ++ /* ++ * pointer to flavor-specific per-container data. ++ */ ++ void *ip_descr_data; ++}; ++ ++/* ++ * Helper structure for legacy htrees. ++ */ ++struct iam_path_compat { ++ struct iam_path ipc_path; ++ struct iam_container ipc_container; ++ __u32 ipc_scrach[DX_SCRATCH_KEYS]; ++}; ++ ++static u32 htree_root_ptr(struct iam_container *c); ++static int htree_node_check(struct iam_path *path, struct iam_frame *frame); ++static int htree_node_init(struct iam_container *c, + struct buffer_head *bh, int root); +-static int htree_keycmp(struct dx_path *path, +- struct dx_key *k1, struct dx_key *k2); ++static int htree_keycmp(struct iam_container *c, ++ struct iam_key *k1, struct iam_key *k2); ++static int htree_node_read(struct iam_container *c, iam_ptr_t ptr, ++ handle_t *h, struct buffer_head **bh); ++ ++/* ++ * Parameters describing iam compatibility mode in which existing ext3 htrees ++ * can be manipulated. ++ */ ++static struct iam_descr htree_compat_param = { ++ .id_key_size = sizeof ((struct dx_map_entry *)NULL)->hash, ++ .id_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs, ++ .id_node_gap = offsetof(struct dx_node, entries), ++ .id_root_gap = offsetof(struct dx_root, entries), ++ ++ .id_root_ptr = htree_root_ptr, ++ .id_node_check = htree_node_check, ++ .id_node_init = htree_node_init, ++ .id_node_read = htree_node_read, ++ .id_keycmp = htree_keycmp ++}; ++ ++ ++struct iam_key; ++struct iam_rec; ++struct iam_descr; ++struct iam_container; ++struct iam_path; + +-static struct dx_param htree_compat_param = { +- .dpo_key_size = sizeof ((struct dx_map_entry *)NULL)->hash, +- .dpo_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs, +- .dpo_node_gap = offsetof(struct dx_node, entries), +- .dpo_root_gap = offsetof(struct dx_root, entries), +- +- .dpo_root_ptr = htree_root_ptr, +- .dpo_node_check = htree_node_check, +- .dpo_node_init = htree_node_init, +- .dpo_keycmp = htree_keycmp ++/* ++ * Initialize container @c, acquires additional reference on @inode. ++ */ ++int iam_container_init(struct iam_container *c, ++ struct iam_descr *descr, struct inode *inode); ++/* ++ * Finalize container @c, release all resources. ++ */ ++void iam_container_fini(struct iam_container *c); ++ ++/* ++ * Search container @c for record with key @k. 
If record is found, its data ++ * are moved into @r. ++ * ++ * ++ * ++ * Return values: +ve: found, 0: not-found, -ve: error ++ */ ++int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r); ++/* ++ * Insert new record @r with key @k into container @c (within context of ++ * transaction @h. ++ * ++ * Return values: 0: success, -ve: error, including -EEXIST when record with ++ * given key is already present. ++ * ++ * postcondition: ergo(result == 0 || result == -EEXIST, ++ * iam_lookup(c, k, r2) > 0 && ++ * !memcmp(r, r2, c->ic_descr->id_rec_size)); ++ */ ++int iam_insert(handle_t *h, struct iam_container *c, ++ struct iam_key *k, struct iam_rec *r); ++/* ++ * Replace existing record with key @k, or insert new one. New record data are ++ * in @r. ++ * ++ * Return values: 0: success, -ve: error. ++ * ++ * postcondition: ergo(result == 0, iam_lookup(c, k, r2) > 0 && ++ * !memcmp(r, r2, c->ic_descr->id_rec_size)); ++ */ ++int iam_update(handle_t *h, struct iam_container *c, ++ struct iam_key *k, struct iam_rec *r); ++/* ++ * Delete existing record with key @k. ++ * ++ * Return values: 0: success, -ENOENT: not-found, -ve: other error. ++ * ++ * postcondition: ergo(result == 0 || result == -ENOENT, ++ * !iam_lookup(c, k, *)); ++ */ ++int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k); ++ ++/* ++ * iam cursor (iterator) api. ++ */ ++ ++/* ++ * Flags controlling iterator functionality. ++ */ ++enum iam_it_flags { ++ /* ++ * this iterator will move (iam_it_{prev,next}() will be called on it) ++ */ ++ IAM_IT_MOVE = (1 << 0), ++ /* ++ * tree can be updated through this iterator. ++ */ ++ IAM_IT_WRITE = (1 << 1) + }; + ++/* ++ * States of iterator state machine. ++ */ ++enum iam_it_state { ++ /* initial state */ ++ IAM_IT_DETACHED, ++ /* iterator is above particular record in the container */ ++ IAM_IT_ATTACHED ++}; ++ ++/* ++ * Iterator. ++ * ++ * Immediately after call to iam_it_init() iterator is in "detached" ++ * (IAM_IT_DETACHED) state: it is associated with given parent container, but ++ * doesn't point to any particular record in this container. ++ * ++ * After successful call to iam_it_get() and until corresponding call to ++ * iam_it_put() iterator is in "attached" state (IAM_IT_ATTACHED). ++ * ++ * Attached iterator can move through records in a container (provided ++ * IAM_IT_MOVE permission) in a key order, can get record and key values as it ++ * passes over them, and can modify container (provided IAM_IT_WRITE ++ * permission). ++ * ++ * Concurrency: iterators are supposed to be local to thread. Interfaces below ++ * do no internal serialization. ++ * ++ */ ++struct iam_iterator { ++ /* ++ * iterator flags, taken from enum iam_it_flags. ++ */ ++ __u32 ii_flags; ++ enum iam_it_state ii_state; ++ /* ++ * path to the record. Valid in IAM_IT_ATTACHED state. ++ */ ++ struct iam_path ii_path; ++}; ++ ++static inline struct iam_key *keycpy(struct iam_container *c, ++ struct iam_key *k1, struct iam_key *k2) ++{ ++ return memcpy(k1, k2, c->ic_descr->id_key_size); ++} ++ ++static inline int keycmp(struct iam_container *c, ++ struct iam_key *k1, struct iam_key *k2) ++{ ++ return c->ic_descr->id_keycmp(c, k1, k2); ++} ++ ++static struct iam_container *iam_it_container(struct iam_iterator *it) ++{ ++ return it->ii_path.ip_container; ++} ++ ++static inline int it_keycmp(struct iam_iterator *it, ++ struct iam_key *k1, struct iam_key *k2) ++{ ++ return keycmp(iam_it_container(it), k1, k2); ++} ++ ++/* ++ * Initialize iterator to IAM_IT_DETACHED state. 
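++ *
++ * A hedged sketch of a typical read-only scan, using the calls declared
++ * below (@c and @k are caller-supplied, use() is a stand-in for consuming
++ * the record, and error handling is omitted):
++ *
++ *     struct iam_iterator it;
++ *
++ *     iam_it_init(&it, c, IAM_IT_MOVE);
++ *     if (iam_it_get(&it, k) == 0) {
++ *             do {
++ *                     use(iam_it_rec_get(&it));
++ *             } while (iam_it_next(&it) == 0);
++ *             iam_it_put(&it);
++ *     }
++ *     iam_it_fini(&it);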
++ * ++ * postcondition: it_state(it) == IAM_IT_DETACHED ++ */ ++int iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags); ++/* ++ * Finalize iterator and release all resources. ++ * ++ * precondition: it_state(it) == IAM_IT_DETACHED ++ */ ++void iam_it_fini(struct iam_iterator *it); ++ ++/* ++ * Attach iterator. After successful completion, @it points to record with the ++ * largest key not larger than @k. Semantics of ->id_create() method guarantee ++ * that such record will always be found. ++ * ++ * Return value: 0: positioned on existing record, ++ * -ve: error. ++ * ++ * precondition: it_state(it) == IAM_IT_DETACHED ++ * postcondition: ergo(result == 0, ++ * (it_state(it) == IAM_IT_ATTACHED && ++ * it_keycmp(it, iam_it_key_get(it, *), k) < 0)) ++ */ ++int iam_it_get(struct iam_iterator *it, struct iam_key *k); ++ ++/* ++ * Duplicates iterator. ++ * ++ * postcondition: it_state(dst) == it_state(src) && ++ * iam_it_container(dst) == iam_it_container(src) && ++ * dst->ii_flags = src->ii_flags && ++ * ergo(it_state(it) == IAM_IT_ATTACHED, ++ * iam_it_rec_get(dst) == iam_it_rec_get(src) && ++ * iam_it_key_get(dst, *1) == iam_it_key_get(src, *2)) ++ */ ++void iam_it_dup(struct iam_iterator *dst, struct iam_iterator *src); ++ ++/* ++ * Detach iterator. Does nothing it detached state. ++ * ++ * postcondition: it_state(it) == IAM_IT_DETACHED ++ */ ++void iam_it_put(struct iam_iterator *it); ++ ++/* ++ * Move iterator one record right. ++ * ++ * Return value: 0: success, ++ * +1: end of container reached ++ * -ve: error ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_MOVE ++ * postcondition: ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED) ++ */ ++int iam_it_next(struct iam_iterator *it); ++ ++/* ++ * Return pointer to the record under iterator. ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED ++ * postcondition: it_state(it) == IAM_IT_ATTACHED ++ */ ++const struct iam_rec *iam_it_rec_get(struct iam_iterator *it); ++ ++/* ++ * Replace contents of record under iterator. ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE ++ * postcondition: it_state(it) == IAM_IT_ATTACHED && ++ * ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...)) ++ */ ++int iam_it_rec_set(handle_t *h, struct iam_iterator *it, struct iam_rec *r); ++ ++/* ++ * Place key under iterator in @k, return @k ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED ++ * postcondition: it_state(it) == IAM_IT_ATTACHED ++ */ ++const struct iam_key *iam_it_key_get(struct iam_iterator *it, ++ struct iam_key *k); ++ ++/* ++ * Insert new record with key @k and contents from @r, shifting records to the ++ * right. ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED && ++ * it->ii_flags&IAM_IT_WRITE && ++ * it_keycmp(it, iam_it_key_get(it, *), k) < 0 ++ * postcondition: it_state(it) == IAM_IT_ATTACHED && ++ * ergo(result == 0, ++ * it_keycmp(it, iam_it_key_get(it, *), k) == 0 && ++ * !memcmp(iam_it_rec_get(it), r, ...)) ++ */ ++int iam_it_rec_insert(handle_t *h, struct iam_iterator *it, ++ struct iam_key *k, struct iam_rec *r); ++/* ++ * Delete record under iterator. 
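++ * (E.g., a writer holding journal handle @h would position an IAM_IT_WRITE
++ * iterator with iam_it_get() and then call iam_it_rec_delete(h, &it) on the
++ * record found.)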
++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE ++ * postcondition: it_state(it) == IAM_IT_ATTACHED ++ */ ++int iam_it_rec_delete(handle_t *h, struct iam_iterator *it); + + #ifdef CONFIG_EXT3_INDEX +-static inline unsigned dx_get_block(struct dx_path *p, struct dx_entry *entry); +-static void dx_set_block(struct dx_path *p, +- struct dx_entry *entry, unsigned value); +-static inline struct dx_key *dx_get_key(struct dx_path *p, +- struct dx_entry *entry, +- struct dx_key *key); +-static void dx_set_key(struct dx_path *p, struct dx_entry *entry, +- struct dx_key *key); +-static unsigned dx_get_count(struct dx_entry *entries); +-static unsigned dx_get_limit(struct dx_entry *entries); +-static void dx_set_count(struct dx_entry *entries, unsigned value); +-static void dx_set_limit(struct dx_entry *entries, unsigned value); +-static unsigned dx_root_limit(struct dx_path *p); +-static unsigned dx_node_limit(struct dx_path *p); ++static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry); ++static void dx_set_block(struct iam_path *p, ++ struct iam_entry *entry, unsigned value); ++static inline struct iam_key *dx_get_key(struct iam_path *p, ++ struct iam_entry *entry, ++ struct iam_key *key); ++static void dx_set_key(struct iam_path *p, struct iam_entry *entry, ++ struct iam_key *key); ++static unsigned dx_get_count(struct iam_entry *entries); ++static unsigned dx_get_limit(struct iam_entry *entries); ++static void dx_set_count(struct iam_entry *entries, unsigned value); ++static void dx_set_limit(struct iam_entry *entries, unsigned value); ++static unsigned dx_root_limit(struct iam_path *p); ++static unsigned dx_node_limit(struct iam_path *p); + static int dx_probe(struct dentry *dentry, + struct inode *dir, + struct dx_hash_info *hinfo, +- struct dx_path *path); ++ struct iam_path *path); + static int dx_make_map (struct ext3_dir_entry_2 *de, int size, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); + static void dx_sort_map(struct dx_map_entry *map, unsigned count); + static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, + struct dx_map_entry *offsets, int count); + static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); +-static void dx_insert_block (struct dx_path *path, +- struct dx_frame *frame, u32 hash, u32 block); ++static void dx_insert_block (struct iam_path *path, ++ struct iam_frame *frame, u32 hash, u32 block); + static int ext3_htree_next_block(struct inode *dir, __u32 hash, +- struct dx_path *path, __u32 *start_hash); ++ struct iam_path *path, __u32 *start_hash); + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, + struct ext3_dir_entry_2 **res_dir, int *err); + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); + +-static inline void dx_path_init(struct dx_path *path, struct inode *inode); +-static inline void dx_path_fini(struct dx_path *path); ++static inline void iam_path_init(struct iam_path *path, ++ struct iam_container *c); ++static inline void iam_path_fini(struct iam_path *path); + + + /* +@@ -247,153 +713,154 @@ + * Mask them off for now. 
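+ * (In the htree compatibility format a pointer is a 32-bit block number of
+ * which only the low 24 bits are significant, hence the 0x00ffffff mask in
+ * dx_get_block() below.)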
+ */ + +-static inline void *entry_off(struct dx_entry *entry, ptrdiff_t off) ++static inline void *entry_off(struct iam_entry *entry, ptrdiff_t off) + { + return (void *)((char *)entry + off); + } + +-static inline size_t dx_entry_size(struct dx_path *p) ++static inline struct iam_descr *path_descr(struct iam_path *p) + { +- return p->dp_param->dpo_key_size + p->dp_param->dpo_ptr_size; ++ return p->ip_container->ic_descr; + } + +-static inline struct dx_entry *dx_entry_shift(struct dx_path *p, +- struct dx_entry *entry, int shift) ++static inline struct inode *path_obj(struct iam_path *p) ++{ ++ return p->ip_container->ic_object; ++} ++ ++static inline size_t iam_entry_size(struct iam_path *p) ++{ ++ return path_descr(p)->id_key_size + path_descr(p)->id_ptr_size; ++} ++ ++static inline struct iam_entry *iam_entry_shift(struct iam_path *p, ++ struct iam_entry *entry, int shift) + { + void *e = entry; +- return e + shift * dx_entry_size(p); ++ return e + shift * iam_entry_size(p); + } + +-static inline ptrdiff_t dx_entry_diff(struct dx_path *p, +- struct dx_entry *e1, struct dx_entry *e2) ++static inline ptrdiff_t iam_entry_diff(struct iam_path *p, ++ struct iam_entry *e1, struct iam_entry *e2) + { + ptrdiff_t diff; + + diff = (void *)e1 - (void *)e2; +- assert(diff / dx_entry_size(p) * dx_entry_size(p) == diff); +- return diff / dx_entry_size(p); ++ assert(diff / iam_entry_size(p) * iam_entry_size(p) == diff); ++ return diff / iam_entry_size(p); + } + +-static inline unsigned dx_get_block(struct dx_path *p, struct dx_entry *entry) ++static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry) + { +- return le32_to_cpu(*(u32 *)entry_off(entry, p->dp_param->dpo_key_size)) ++ return le32_to_cpu(*(u32 *)entry_off(entry, path_descr(p)->id_key_size)) + & 0x00ffffff; + } + +-static inline void dx_set_block(struct dx_path *p, +- struct dx_entry *entry, unsigned value) ++static inline void dx_set_block(struct iam_path *p, ++ struct iam_entry *entry, unsigned value) + { +- *(u32*)entry_off(entry, p->dp_param->dpo_key_size) = cpu_to_le32(value); ++ *(u32*)entry_off(entry, ++ path_descr(p)->id_key_size) = cpu_to_le32(value); + } + +-static inline struct dx_key *dx_get_key(struct dx_path *p, +- struct dx_entry *entry, +- struct dx_key *key) ++static inline struct iam_key *dx_get_key(struct iam_path *p, ++ struct iam_entry *entry, ++ struct iam_key *key) + { +- memcpy(key, entry, p->dp_param->dpo_key_size); ++ memcpy(key, entry, path_descr(p)->id_key_size); + return key; + } + +-static inline struct dx_key *dx_key_at(struct dx_path *p, +- struct dx_entry *entry) ++static inline struct iam_key *iam_key_at(struct iam_path *p, ++ struct iam_entry *entry) + { +- return (struct dx_key *)entry; ++ return (struct iam_key *)entry; + } + +-static inline void dx_set_key(struct dx_path *p, +- struct dx_entry *entry, struct dx_key *key) ++static inline void dx_set_key(struct iam_path *p, ++ struct iam_entry *entry, struct iam_key *key) + { +- memcpy(entry, key, p->dp_param->dpo_key_size); ++ memcpy(entry, key, path_descr(p)->id_key_size); + } + +-static inline unsigned dx_get_count (struct dx_entry *entries) ++static inline unsigned dx_get_count (struct iam_entry *entries) + { + return le16_to_cpu(((struct dx_countlimit *) entries)->count); + } + +-static inline unsigned dx_get_limit (struct dx_entry *entries) ++static inline unsigned dx_get_limit (struct iam_entry *entries) + { + return le16_to_cpu(((struct dx_countlimit *) entries)->limit); + } + +-static inline void dx_set_count (struct 
dx_entry *entries, unsigned value) ++static inline void dx_set_count (struct iam_entry *entries, unsigned value) + { + ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); + } + +-static inline void dx_set_limit (struct dx_entry *entries, unsigned value) ++static inline void dx_set_limit (struct iam_entry *entries, unsigned value) + { + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); + } + +-static inline unsigned dx_root_limit(struct dx_path *p) ++static inline unsigned dx_root_limit(struct iam_path *p) + { +- struct dx_param *param = p->dp_param; +- unsigned entry_space = p->dp_object->i_sb->s_blocksize - +- param->dpo_root_gap; +- return entry_space / (param->dpo_key_size + param->dpo_ptr_size); ++ struct iam_descr *param = path_descr(p); ++ unsigned entry_space = path_obj(p)->i_sb->s_blocksize - ++ param->id_root_gap; ++ return entry_space / (param->id_key_size + param->id_ptr_size); + } + +-static inline unsigned dx_node_limit(struct dx_path *p) ++static inline unsigned dx_node_limit(struct iam_path *p) + { +- struct dx_param *param = p->dp_param; +- unsigned entry_space = p->dp_object->i_sb->s_blocksize - +- param->dpo_node_gap; +- return entry_space / (param->dpo_key_size + param->dpo_ptr_size); ++ struct iam_descr *param = path_descr(p); ++ unsigned entry_space = path_obj(p)->i_sb->s_blocksize - ++ param->id_node_gap; ++ return entry_space / (param->id_key_size + param->id_ptr_size); + } + +-static inline int dx_index_is_compat(struct dx_path *path) ++static inline int dx_index_is_compat(struct iam_path *path) + { +- return path->dp_param == &htree_compat_param; ++ return path_descr(path) == &htree_compat_param; + } + +-static struct dx_entry *dx_get_entries(struct dx_path *path, void *data, ++static struct iam_entry *dx_get_entries(struct iam_path *path, void *data, + int root) + { + return data + + (root ? 
+- path->dp_param->dpo_root_gap : path->dp_param->dpo_node_gap); ++ path_descr(path)->id_root_gap : path_descr(path)->id_node_gap); + } + +-static struct dx_entry *dx_node_get_entries(struct dx_path *path, +- struct dx_frame *frame) ++static struct iam_entry *dx_node_get_entries(struct iam_path *path, ++ struct iam_frame *frame) + { + return dx_get_entries(path, +- frame->bh->b_data, frame == path->dp_frames); +-} +- +-static inline struct dx_key *keycpy(struct dx_path *p, +- struct dx_key *k1, struct dx_key *k2) +-{ +- return memcpy(k1, k2, p->dp_param->dpo_key_size); +-} +- +-static inline int keycmp(struct dx_path *p, +- struct dx_key *k1, struct dx_key *k2) +-{ +- return p->dp_param->dpo_keycmp(p, k1, k2); ++ frame->bh->b_data, frame == path->ip_frames); + } + +-static int dx_node_check(struct dx_path *p, struct dx_frame *f) ++static int dx_node_check(struct iam_path *p, struct iam_frame *f) + { +- struct dx_entry *e; ++ struct iam_entry *e; ++ struct iam_container *c; + unsigned count; + unsigned i; + ++ c = p->ip_container; + e = dx_node_get_entries(p, f); + count = dx_get_count(e); +- e = dx_entry_shift(p, e, 1); +- for (i = 0; i < count - 1; ++i, e = dx_entry_shift(p, e, 1)) { +- keycpy(p, p->dp_key_scratch[0], p->dp_key_scratch[1]); +- dx_get_key(p, e, p->dp_key_scratch[1]); ++ e = iam_entry_shift(p, e, 1); ++ for (i = 0; i < count - 1; ++i, e = iam_entry_shift(p, e, 1)) { ++ keycpy(c, p->ip_key_scratch[0], p->ip_key_scratch[1]); ++ dx_get_key(p, e, p->ip_key_scratch[1]); + if (i > 0 && +- keycmp(p, p->dp_key_scratch[0], p->dp_key_scratch[1]) > 0) ++ keycmp(c, p->ip_key_scratch[0], p->ip_key_scratch[1]) > 0) + return 0; + } + return 1; + } + +-static u32 htree_root_ptr(struct dx_path *path) ++static u32 htree_root_ptr(struct iam_container *c) + { + return 0; + } +@@ -403,20 +870,19 @@ + struct dentry *dentry; + }; + +-static int htree_node_check(struct dx_path *path, struct dx_frame *frame, +- void *cookie) ++static int htree_node_check(struct iam_path *path, struct iam_frame *frame) + { + void *data; +- struct dx_entry *entries; ++ struct iam_entry *entries; + struct super_block *sb; + + data = frame->bh->b_data; + entries = dx_node_get_entries(path, frame); +- sb = path->dp_object->i_sb; +- if (frame == path->dp_frames) { ++ sb = path_obj(path)->i_sb; ++ if (frame == path->ip_frames) { + /* root node */ + struct dx_root *root; +- struct htree_cookie *hc = cookie; ++ struct htree_cookie *hc = path->ip_descr_data; + + root = data; + if (root->info.hash_version > DX_HASH_MAX) { +@@ -433,8 +899,8 @@ + return ERR_BAD_DX_DIR; + } + +- path->dp_indirect = root->info.indirect_levels; +- if (path->dp_indirect > DX_MAX_TREE_HEIGHT - 1) { ++ path->ip_indirect = root->info.indirect_levels; ++ if (path->ip_indirect > DX_MAX_TREE_HEIGHT - 1) { + ext3_warning(sb, __FUNCTION__, + "Unimplemented inode hash depth: %#06x", + root->info.indirect_levels); +@@ -450,17 +916,17 @@ + if (hc->dentry) + ext3fs_dirhash(hc->dentry->d_name.name, + hc->dentry->d_name.len, hc->hinfo); +- path->dp_key_target = (struct dx_key *)&hc->hinfo->hash; ++ path->ip_key_target = (struct iam_key *)&hc->hinfo->hash; + } else { + /* non-root index */ +- assert(entries == data + path->dp_param->dpo_node_gap); ++ assert(entries == data + path_descr(path)->id_node_gap); + assert(dx_get_limit(entries) == dx_node_limit(path)); + } + frame->entries = frame->at = entries; + return 0; + } + +-static int htree_node_init(struct dx_path *path, ++static int htree_node_init(struct iam_container *c, + struct buffer_head *bh, int root) + 
{ + struct dx_node *node; +@@ -468,13 +934,24 @@ + assert(!root); + + node = (void *)bh->b_data; +- node->fake.rec_len = cpu_to_le16(path->dp_object->i_sb->s_blocksize); ++ node->fake.rec_len = cpu_to_le16(c->ic_object->i_sb->s_blocksize); + node->fake.inode = 0; + return 0; + } + +-static int htree_keycmp(struct dx_path *path, +- struct dx_key *k1, struct dx_key *k2) ++static int htree_node_read(struct iam_container *c, iam_ptr_t ptr, ++ handle_t *handle, struct buffer_head **bh) ++{ ++ int result = 0; ++ ++ *bh = ext3_bread(handle, c->ic_object, (int)ptr, 0, &result); ++ if (*bh == NULL) ++ result = -EIO; ++ return result; ++} ++ ++static int htree_keycmp(struct iam_container *c, ++ struct iam_key *k1, struct iam_key *k2) + { + __u32 p1 = le32_to_cpu(*(__u32 *)k1); + __u32 p2 = le32_to_cpu(*(__u32 *)k2); +@@ -486,7 +963,7 @@ + * Debug + */ + #ifdef DX_DEBUG +-static void dx_show_index (char * label, struct dx_entry *entries) ++static void dx_show_index (char * label, struct iam_entry *entries) + { + int i, n = dx_get_count (entries); + printk("%s index ", label); +@@ -535,7 +1012,7 @@ + } + + struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, +- struct dx_entry *entries, int levels) ++ struct iam_entry *entries, int levels) + { + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count = dx_get_count (entries), names = 0, space = 0, i; +@@ -565,32 +1042,33 @@ + } + #endif /* DX_DEBUG */ + +-static int dx_lookup(struct dx_path *path, void *cookie) ++static int dx_lookup(struct iam_path *path) + { + u32 ptr; +- int err; ++ int err = 0; + int i; + +- struct dx_param *param; +- struct dx_frame *frame; +- +- param = path->dp_param; ++ struct iam_descr *param; ++ struct iam_frame *frame; ++ struct iam_container *c; + +- for (frame = path->dp_frames, i = 0, +- ptr = param->dpo_root_ptr(path); i <= path->dp_indirect; ++ param = path_descr(path); ++ c = path->ip_container; ++ ++ for (frame = path->ip_frames, i = 0, ++ ptr = param->id_root_ptr(path->ip_container); ++ i <= path->ip_indirect; + ptr = dx_get_block(path, frame->at), ++frame, ++i) { +- struct dx_entry *entries; +- struct dx_entry *p; +- struct dx_entry *q; +- struct dx_entry *m; ++ struct iam_entry *entries; ++ struct iam_entry *p; ++ struct iam_entry *q; ++ struct iam_entry *m; + unsigned count; + +- frame->bh = ext3_bread(NULL, path->dp_object, ptr, 0, &err); +- if (frame->bh == NULL) { +- err = -EIO; ++ err = param->id_node_read(c, (iam_ptr_t)ptr, NULL, &frame->bh); ++ if (err != 0) + break; +- } +- err = param->dpo_node_check(path, frame, cookie); ++ err = param->id_node_check(path, frame); + if (err != 0) + break; + +@@ -599,37 +1077,37 @@ + entries = frame->entries; + count = dx_get_count(entries); + assert(count && count <= dx_get_limit(entries)); +- p = dx_entry_shift(path, entries, 1); +- q = dx_entry_shift(path, entries, count - 1); ++ p = iam_entry_shift(path, entries, 1); ++ q = iam_entry_shift(path, entries, count - 1); + while (p <= q) { +- m = dx_entry_shift(path, +- p, dx_entry_diff(path, q, p) / 2); ++ m = iam_entry_shift(path, ++ p, iam_entry_diff(path, q, p) / 2); + dxtrace(printk(".")); +- if (keycmp(path, dx_key_at(path, m), +- path->dp_key_target) > 0) +- q = dx_entry_shift(path, m, -1); ++ if (keycmp(c, iam_key_at(path, m), ++ path->ip_key_target) > 0) ++ q = iam_entry_shift(path, m, -1); + else +- p = dx_entry_shift(path, m, +1); ++ p = iam_entry_shift(path, m, +1); + } + +- frame->at = dx_entry_shift(path, p, -1); ++ frame->at = iam_entry_shift(path, p, -1); + if (1) { // linear 
search cross check + unsigned n = count - 1; +- struct dx_entry *at; ++ struct iam_entry *at; + + at = entries; + while (n--) { + dxtrace(printk(",")); +- at = dx_entry_shift(path, at, +1); +- if (keycmp(path, dx_key_at(path, at), +- path->dp_key_target) > 0) { +- if (at != dx_entry_shift(path, frame->at, 1)) { ++ at = iam_entry_shift(path, at, +1); ++ if (keycmp(c, iam_key_at(path, at), ++ path->ip_key_target) > 0) { ++ if (at != iam_entry_shift(path, frame->at, 1)) { + BREAKPOINT; + printk(KERN_EMERG "%i\n", +- keycmp(path, dx_key_at(path, at), +- path->dp_key_target)); ++ keycmp(c, iam_key_at(path, at), ++ path->ip_key_target)); + } +- at = dx_entry_shift(path, at, -1); ++ at = iam_entry_shift(path, at, -1); + break; + } + } +@@ -637,8 +1115,8 @@ + } + } + if (err != 0) +- dx_path_fini(path); +- path->dp_frame = --frame; ++ iam_path_fini(path); ++ path->ip_frame = --frame; + return err; + } + +@@ -652,7 +1130,7 @@ + * back to userspace. + */ + static int dx_probe(struct dentry *dentry, struct inode *dir, +- struct dx_hash_info *hinfo, struct dx_path *path) ++ struct dx_hash_info *hinfo, struct iam_path *path) + { + int err; + struct htree_cookie hc = { +@@ -661,39 +1139,78 @@ + }; + + assert(dx_index_is_compat(path)); +- err = dx_lookup(path, &hc); +- assert(err != 0 || path->dp_frames[path->dp_indirect].bh != NULL); ++ path->ip_descr_data = &hc; ++ err = dx_lookup(path); ++ assert(err != 0 || path->ip_frames[path->ip_indirect].bh != NULL); + return err; + } + +-static inline void dx_path_init(struct dx_path *path, struct inode *inode) ++/* ++ * Initialize container @c, acquires additional reference on @inode. ++ */ ++int iam_container_init(struct iam_container *c, ++ struct iam_descr *descr, struct inode *inode) ++{ ++ memset(c, 0, sizeof *c); ++ c->ic_descr = descr; ++ c->ic_object = igrab(inode); ++ if (c->ic_object != NULL) ++ return 0; ++ else ++ return -ENOENT; ++} ++ ++/* ++ * Finalize container @c, release all resources. ++ */ ++void iam_container_fini(struct iam_container *c) ++{ ++ if (c->ic_object != NULL) { ++ iput(c->ic_object); ++ c->ic_object = NULL; ++ } ++} ++ ++static inline void iam_path_init(struct iam_path *path, struct iam_container *c) + { + memset(path, 0, sizeof *path); +- path->dp_object = inode; +- path->dp_frame = path->dp_frames; ++ path->ip_container = c; ++ path->ip_frame = path->ip_frames; + } + +-static inline void dx_path_fini(struct dx_path *path) ++static inline void iam_path_fini(struct iam_path *path) + { + int i; + +- for (i = 0; i < ARRAY_SIZE(path->dp_frames); i++) { +- if (path->dp_frames[i].bh != NULL) { +- brelse(path->dp_frames[i].bh); +- path->dp_frames[i].bh = NULL; ++ for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) { ++ if (path->ip_frames[i].bh != NULL) { ++ brelse(path->ip_frames[i].bh); ++ path->ip_frames[i].bh = NULL; + } + } + } + +-static void dx_path_compat_init(struct dx_path_compat *path, +- struct inode *inode) ++static void iam_path_compat_init(struct iam_path_compat *path, ++ struct inode *inode) + { + int i; +- dx_path_init(&path->dpc_path, inode); +- path->dpc_path.dp_param = &htree_compat_param; +- for (i = 0; i < ARRAY_SIZE(path->dpc_path.dp_key_scratch); ++i) +- path->dpc_path.dp_key_scratch[i] = +- (struct dx_key *)&path->dpc_scrach[i]; ++ ++ iam_container_init(&path->ipc_container, &htree_compat_param, inode); ++ /* ++ * XXX hack allowing finalization of iam_path_compat with ++ * iam_path_fini(). 
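++ * iam_container_init() grabbed its own reference on @inode above; it is
++ * dropped again right below, so the container only borrows the caller's
++ * reference and legacy htree callers can tear the path down with
++ * iam_path_fini() alone, never calling iam_container_fini().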
++ */ ++ iput(inode); ++ iam_path_init(&path->ipc_path, &path->ipc_container); ++ for (i = 0; i < ARRAY_SIZE(path->ipc_path.ip_key_scratch); ++i) ++ path->ipc_path.ip_key_scratch[i] = ++ (struct iam_key *)&path->ipc_scrach[i]; ++} ++ ++static void iam_path_compat_fini(struct iam_path_compat *path) ++{ ++ iam_path_fini(&path->ipc_path); ++ iam_container_fini(&path->ipc_container); + } + + /* +@@ -714,16 +1231,16 @@ + * hash of the next page. + */ + static int ext3_htree_next_block(struct inode *dir, __u32 hash, +- struct dx_path *path, __u32 *start_hash) ++ struct iam_path *path, __u32 *start_hash) + { +- struct dx_frame *p; ++ struct iam_frame *p; + struct buffer_head *bh; + int err, num_frames = 0; + __u32 bhash; + + assert(dx_index_is_compat(path)); + +- p = path->dp_frame; ++ p = path->ip_frame; + /* + * Find the next leaf page by incrementing the frame pointer. + * If we run out of entries in the interior node, loop around and +@@ -732,11 +1249,11 @@ + * nodes need to be read. + */ + while (1) { +- p->at = dx_entry_shift(path, p->at, +1); +- if (p->at < dx_entry_shift(path, p->entries, ++ p->at = iam_entry_shift(path, p->at, +1); ++ if (p->at < iam_entry_shift(path, p->entries, + dx_get_count(p->entries))) + break; +- if (p == path->dp_frames) ++ if (p == path->ip_frames) + return 0; + num_frames++; + --p; +@@ -749,7 +1266,7 @@ + * desired contiuation hash. If it doesn't, return since + * there's no point to read in the successive index pages. + */ +- dx_get_key(path, p->at, (struct dx_key *)&bhash); ++ dx_get_key(path, p->at, (struct iam_key *)&bhash); + if (start_hash) + *start_hash = bhash; + if ((hash & 1) == 0) { +@@ -761,8 +1278,10 @@ + * block so no check is necessary + */ + while (num_frames--) { +- if (!(bh = ext3_bread(NULL, dir, +- dx_get_block(path, p->at), 0, &err))) ++ err = path_descr(path)->id_node_read(path->ip_container, ++ (iam_ptr_t)dx_get_block(path, p->at), ++ NULL, &bh); ++ if (err != 0) + return err; /* Failure */ + ++p; + brelse (p->bh); +@@ -837,8 +1356,8 @@ + { + struct dx_hash_info hinfo; + struct ext3_dir_entry_2 *de; +- struct dx_path_compat cpath; +- struct dx_path *path = &cpath.dpc_path; ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; + struct inode *dir; + int block, err; + int count = 0; +@@ -848,7 +1367,7 @@ + dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, + start_minor_hash)); + dir = dir_file->f_dentry->d_inode; +- dx_path_compat_init(&cpath, dir); ++ iam_path_compat_init(&cpath, dir); + if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { + hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; +@@ -865,7 +1384,7 @@ + + /* Add '.' and '..' 
from the htree header */ + if (!start_hash && !start_minor_hash) { +- de = (struct ext3_dir_entry_2 *) path->dp_frames[0].bh->b_data; ++ de = (struct ext3_dir_entry_2 *) path->ip_frames[0].bh->b_data; + if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0) + goto errout; + de = ext3_next_entry(de); +@@ -875,7 +1394,7 @@ + } + + while (1) { +- block = dx_get_block(path, path->dp_frame->at); ++ block = dx_get_block(path, path->ip_frame->at); + ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, + start_hash, start_minor_hash); + if (ret < 0) { +@@ -900,12 +1419,12 @@ + (count && ((hashval & 1) == 0))) + break; + } +- dx_path_fini(path); ++ iam_path_fini(path); + dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", + count, *next_hash)); + return count; + errout: +- dx_path_fini(path); ++ iam_path_fini(path); + return (err); + } + +@@ -964,18 +1483,18 @@ + } while(more); + } + +-static void dx_insert_block(struct dx_path *path, +- struct dx_frame *frame, u32 hash, u32 block) ++static void dx_insert_block(struct iam_path *path, ++ struct iam_frame *frame, u32 hash, u32 block) + { +- struct dx_entry *entries = frame->entries; +- struct dx_entry *old = frame->at, *new = dx_entry_shift(path, old, +1); ++ struct iam_entry *entries = frame->entries; ++ struct iam_entry *old = frame->at, *new = iam_entry_shift(path, old, +1); + int count = dx_get_count(entries); + + assert(count < dx_get_limit(entries)); +- assert(old < dx_entry_shift(path, entries, count)); +- memmove(dx_entry_shift(path, new, 1), new, +- (char *)dx_entry_shift(path, entries, count) - (char *)new); +- dx_set_key(path, new, (struct dx_key *)&hash); ++ assert(old < iam_entry_shift(path, entries, count)); ++ memmove(iam_entry_shift(path, new, 1), new, ++ (char *)iam_entry_shift(path, entries, count) - (char *)new); ++ dx_set_key(path, new, (struct iam_key *)&hash); + dx_set_block(path, new, block); + dx_set_count(entries, count + 1); + } +@@ -1177,9 +1696,9 @@ + struct super_block * sb; + struct dx_hash_info hinfo; + u32 hash; +- struct dx_path_compat cpath; +- struct dx_path *path = &cpath.dpc_path; +- struct dx_entry_compat dummy_dot = { ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; ++ struct iam_entry_compat dummy_dot = { + .block = 0 + }; + struct ext3_dir_entry_2 *de, *top; +@@ -1190,8 +1709,8 @@ + const u8 *name = dentry->d_name.name; + struct inode *dir = dentry->d_parent->d_inode; + +- dx_path_compat_init(&cpath, dir); +- ++ iam_path_compat_init(&cpath, dir); ++ + sb = dir->i_sb; + /* NFS may look up ".." - look at dx_root directory block */ + if (namelen > 2 || name[0] != '.'||(name[1] != '.' 
&& name[1] != '\0')){ +@@ -1199,13 +1718,15 @@ + if (*err != 0) + return NULL; + } else { +- path->dp_frame->bh = NULL; /* for dx_path_fini() */ +- path->dp_frame->at = (void *)&dummy_dot;/* hack for zero entry*/ ++ path->ip_frame->bh = NULL; /* for iam_path_fini() */ ++ path->ip_frame->at = (void *)&dummy_dot;/* hack for zero entry*/ + } + hash = hinfo.hash; + do { +- block = dx_get_block(path, path->dp_frame->at); +- if (!(bh = ext3_bread (NULL,dir, block, 0, err))) ++ block = dx_get_block(path, path->ip_frame->at); ++ *err = path_descr(path)->id_node_read(path->ip_container, (iam_ptr_t)block, ++ NULL, &bh); ++ if (*err != 0) + goto errout; + de = (struct ext3_dir_entry_2 *) bh->b_data; + top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize - +@@ -1220,7 +1741,7 @@ + goto errout; + } + *res_dir = de; +- dx_path_fini(path); ++ iam_path_fini(path); + return bh; + } + brelse (bh); +@@ -1238,7 +1759,7 @@ + *err = -ENOENT; + errout: + dxtrace(printk("%s not found\n", name)); +- dx_path_fini(path); ++ iam_path_fini(path); + return NULL; + } + #endif +@@ -1363,11 +1884,11 @@ + + /* Allocate new node, and split leaf node @bh into it, inserting new pointer + * into parent node identified by @frame */ +-static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct dx_path *path, +- struct buffer_head **bh,struct dx_frame *frame, ++static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct iam_path *path, ++ struct buffer_head **bh,struct iam_frame *frame, + struct dx_hash_info *hinfo, int *error) + { +- struct inode *dir = path->dp_object; ++ struct inode *dir = path_obj(path); + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; + struct buffer_head *bh2; +@@ -1553,9 +2074,9 @@ + int namelen = dentry->d_name.len; + struct buffer_head *bh2; + struct dx_root *root; +- struct dx_path_compat cpath; +- struct dx_path *path = &cpath.dpc_path; +- struct dx_entry *entries; ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; ++ struct iam_entry *entries; + struct ext3_dir_entry_2 *de, *de2; + char *data1, *top; + unsigned len; +@@ -1565,7 +2086,7 @@ + u32 block; + struct fake_dirent *fde; + +- dx_path_compat_init(&cpath, dir); ++ iam_path_compat_init(&cpath, dir); + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk("Creating index\n")); + retval = ext3_journal_get_write_access(handle, bh); +@@ -1612,12 +2133,12 @@ + hinfo.hash_version = root->info.hash_version; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; + ext3fs_dirhash(name, namelen, &hinfo); +- path->dp_frame->entries = entries; +- path->dp_frame->at = entries; +- path->dp_frame->bh = bh; ++ path->ip_frame->entries = entries; ++ path->ip_frame->at = entries; ++ path->ip_frame->bh = bh; + bh = bh2; +- de = do_split(handle, path, &bh, path->dp_frame, &hinfo, &retval); +- dx_path_fini(path); ++ de = do_split(handle, path, &bh, path->ip_frame, &hinfo, &retval); ++ iam_path_fini(path); + if (!de) + return retval; + +@@ -1698,12 +2219,12 @@ + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) + { +- struct dx_path_compat cpath; +- struct dx_path *path = &cpath.dpc_path; +- struct dx_param *param; +- struct dx_frame *frame, *safe; +- struct dx_entry *entries; /* old block contents */ +- struct dx_entry *entries2; /* new block contents */ ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; ++ struct iam_descr *param; ++ struct iam_frame *frame, *safe; ++ struct iam_entry *entries; /* old block contents */ ++ struct 
iam_entry *entries2; /* new block contents */ + struct dx_hash_info hinfo; + struct buffer_head * bh; + struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0}; +@@ -1716,20 +2237,22 @@ + int i; + size_t isize; + +- dx_path_compat_init(&cpath, dir); +- param = path->dp_param; ++ iam_path_compat_init(&cpath, dir); ++ param = path_descr(path); + + err = dx_probe(dentry, NULL, &hinfo, path); + if (err != 0) + return err; +- frame = path->dp_frame; ++ frame = path->ip_frame; + entries = frame->entries; + + /* XXX nikita: global serialization! */ + isize = dir->i_size; + +- if (!(bh = ext3_bread(handle, dir, +- dx_get_block(path, frame->at), 0, &err))) ++ err = param->id_node_read(path->ip_container, ++ (iam_ptr_t)dx_get_block(path, ++ frame->at), handle, &bh); ++ if (err != 0) + goto cleanup; + + BUFFER_TRACE(bh, "get_write_access"); +@@ -1761,7 +2284,7 @@ + dx_get_count(entries), dx_get_limit(entries))); + + /* What levels need split? */ +- for (nr_splet = 0; frame >= path->dp_frames && ++ for (nr_splet = 0; frame >= path->ip_frames && + dx_get_count(frame->entries) == dx_get_limit(frame->entries); + --frame, ++nr_splet) { + if (nr_splet == DX_MAX_TREE_HEIGHT) { +@@ -1778,7 +2301,7 @@ + for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) { + bh_new[i] = ext3_append (handle, dir, &newblock[i], &err); + if (!bh_new[i] || +- param->dpo_node_init(path, bh_new[i], 0) != 0) ++ param->id_node_init(path->ip_container, bh_new[i], 0) != 0) + goto cleanup; + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, frame->bh); +@@ -1786,7 +2309,7 @@ + goto journal_error; + } + /* Add "safe" node to transaction too */ +- if (safe + 1 != path->dp_frames) { ++ if (safe + 1 != path->ip_frames) { + err = ext3_journal_get_write_access(handle, safe->bh); + if (err) + goto journal_error; +@@ -1800,12 +2323,12 @@ + + entries = frame->entries; + count = dx_get_count(entries); +- idx = dx_entry_diff(path, frame->at, entries); ++ idx = iam_entry_diff(path, frame->at, entries); + + bh2 = bh_new[i]; + entries2 = dx_get_entries(path, bh2->b_data, 0); + +- if (frame == path->dp_frames) { ++ if (frame == path->ip_frames) { + /* splitting root node. 
Tricky point: + * + * In the "normal" B-tree we'd split root *and* add +@@ -1818,14 +2341,14 @@ + */ + struct dx_root *root; + u8 indirects; +- struct dx_frame *frames; ++ struct iam_frame *frames; + +- frames = path->dp_frames; ++ frames = path->ip_frames; + root = (struct dx_root *) frames->bh->b_data; + indirects = root->info.indirect_levels; + dxtrace(printk("Creating new root %d\n", indirects)); + memcpy((char *) entries2, (char *) entries, +- count * dx_entry_size(path)); ++ count * iam_entry_size(path)); + dx_set_limit(entries2, dx_node_limit(path)); + + /* Set up root */ +@@ -1835,9 +2358,9 @@ + + /* Shift frames in the path */ + memmove(frames + 2, frames + 1, +- (sizeof path->dp_frames) - 2 * sizeof frames[0]); ++ (sizeof path->ip_frames) - 2 * sizeof frames[0]); + /* Add new access path frame */ +- frames[1].at = dx_entry_shift(path, entries2, idx); ++ frames[1].at = iam_entry_shift(path, entries2, idx); + frames[1].entries = entries = entries2; + frames[1].bh = bh2; + assert(dx_node_check(path, frame)); +@@ -1853,22 +2376,22 @@ + unsigned hash2; + + dx_get_key(path, +- dx_entry_shift(path, entries, count1), +- (struct dx_key *)&hash2); ++ iam_entry_shift(path, entries, count1), ++ (struct iam_key *)&hash2); + + dxtrace(printk("Split index %i/%i\n", count1, count2)); + + memcpy ((char *) entries2, +- (char *) dx_entry_shift(path, entries, count1), +- count2 * dx_entry_size(path)); ++ (char *) iam_entry_shift(path, entries, count1), ++ count2 * iam_entry_size(path)); + dx_set_count (entries, count1); + dx_set_count (entries2, count2); + dx_set_limit (entries2, dx_node_limit(path)); + + /* Which index block gets the new entry? */ + if (idx >= count1) { +- frame->at = dx_entry_shift(path, entries2, +- idx - count1); ++ frame->at = iam_entry_shift(path, entries2, ++ idx - count1); + frame->entries = entries = entries2; + swap(frame->bh, bh2); + bh_new[i] = bh2; +@@ -1903,7 +2426,7 @@ + } + if (err) + inode->i_size = isize; +- dx_path_fini(path); ++ iam_path_fini(path); + return err; + } + #endif diff --git a/ldiskfs/kernel_patches/patches/ext3-htree-path-ops.patch b/ldiskfs/kernel_patches/patches/ext3-htree-path-ops.patch new file mode 100644 index 0000000..ec66561 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-htree-path-ops.patch @@ -0,0 +1,1125 @@ +Index: iam-src/fs/ext3/namei.c +=================================================================== +--- iam-src.orig/fs/ext3/namei.c 2006-02-15 18:31:48.000000000 +0300 ++++ iam-src/fs/ext3/namei.c 2006-02-15 21:25:34.000000000 +0300 +@@ -51,7 +51,10 @@ + /* + * Maximal number of non-leaf levels in htree. In the stock ext3 this is 2. + */ +-#define DX_MAX_TREE_HEIGHT (5) ++enum { ++ DX_MAX_TREE_HEIGHT = 5, ++ DX_SCRATCH_KEYS = 2 ++}; + + static struct buffer_head *ext3_append(handle_t *handle, + struct inode *inode, +@@ -83,22 +86,22 @@ static struct buffer_head *ext3_append(h + #define dxtrace(command) + #endif + +-struct fake_dirent +-{ ++struct fake_dirent { + __le32 inode; + __le16 rec_len; + u8 name_len; + u8 file_type; + }; + +-struct dx_countlimit +-{ ++struct dx_countlimit { + __le16 limit; + __le16 count; + }; + +-struct dx_entry +-{ ++struct dx_entry; /* incomplete type */ ++struct dx_key; /* incomplete type */ ++ ++struct dx_entry_compat { + __le32 hash; + __le32 block; + }; +@@ -109,8 +112,7 @@ struct dx_entry + * hash version mod 4 should never be 0. Sincerely, the paranoia department. 
+ */ + +-struct dx_root +-{ ++struct dx_root { + struct fake_dirent dot; + char dot_name[4]; + struct fake_dirent dotdot; +@@ -124,13 +126,13 @@ struct dx_root + u8 unused_flags; + } + info; +- struct dx_entry entries[0]; ++ struct {} entries[0]; + }; + + struct dx_node + { + struct fake_dirent fake; +- struct dx_entry entries[0]; ++ struct {} entries[0]; + }; + + +@@ -147,38 +149,88 @@ struct dx_map_entry + u32 offs; + }; + ++struct dx_path; ++struct dx_param { ++ size_t dpo_key_size; ++ size_t dpo_ptr_size; ++ size_t dpo_node_gap; ++ size_t dpo_root_gap; ++ ++ u32 (*dpo_root_ptr)(struct dx_path *path); ++ int (*dpo_node_check)(struct dx_path *path, ++ struct dx_frame *frame, void *cookie); ++ int (*dpo_node_init)(struct dx_path *path, ++ struct buffer_head *bh, int root); ++ int (*dpo_keycmp)(struct dx_path *path, ++ struct dx_key *k1, struct dx_key *k2); ++}; ++ + /* + * Structure to keep track of a path drilled through htree. + */ + struct dx_path { +- struct inode *dp_object; +- struct dx_frame dp_frames[DX_MAX_TREE_HEIGHT]; +- struct dx_frame *dp_frame; ++ struct inode *dp_object; ++ struct dx_param *dp_param; ++ int dp_indirect; ++ struct dx_frame dp_frames[DX_MAX_TREE_HEIGHT]; ++ struct dx_frame *dp_frame; ++ struct dx_key *dp_key_target; ++ struct dx_key *dp_key_scratch[DX_SCRATCH_KEYS]; ++}; ++ ++struct dx_path_compat { ++ struct dx_path dpc_path; ++ __u32 dpc_scrach[DX_SCRATCH_KEYS]; + }; + ++static u32 htree_root_ptr(struct dx_path *p); ++static int htree_node_check(struct dx_path *path, ++ struct dx_frame *frame, void *cookie); ++static int htree_node_init(struct dx_path *path, ++ struct buffer_head *bh, int root); ++static int htree_keycmp(struct dx_path *path, ++ struct dx_key *k1, struct dx_key *k2); ++ ++static struct dx_param htree_compat_param = { ++ .dpo_key_size = sizeof ((struct dx_map_entry *)NULL)->hash, ++ .dpo_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs, ++ .dpo_node_gap = offsetof(struct dx_node, entries), ++ .dpo_root_gap = offsetof(struct dx_root, entries), ++ ++ .dpo_root_ptr = htree_root_ptr, ++ .dpo_node_check = htree_node_check, ++ .dpo_node_init = htree_node_init, ++ .dpo_keycmp = htree_keycmp ++}; ++ ++ + #ifdef CONFIG_EXT3_INDEX +-static inline unsigned dx_get_block (struct dx_entry *entry); +-static void dx_set_block (struct dx_entry *entry, unsigned value); +-static inline unsigned dx_get_hash (struct dx_entry *entry); +-static void dx_set_hash (struct dx_entry *entry, unsigned value); +-static unsigned dx_get_count (struct dx_entry *entries); +-static unsigned dx_get_limit (struct dx_entry *entries); +-static void dx_set_count (struct dx_entry *entries, unsigned value); +-static void dx_set_limit (struct dx_entry *entries, unsigned value); +-static unsigned dx_root_limit (struct inode *dir, unsigned infosize); +-static unsigned dx_node_limit (struct inode *dir); +-static struct dx_frame *dx_probe(struct dentry *dentry, +- struct inode *dir, +- struct dx_hash_info *hinfo, +- struct dx_path *path, +- int *err); ++static inline unsigned dx_get_block(struct dx_path *p, struct dx_entry *entry); ++static void dx_set_block(struct dx_path *p, ++ struct dx_entry *entry, unsigned value); ++static inline struct dx_key *dx_get_key(struct dx_path *p, ++ struct dx_entry *entry, ++ struct dx_key *key); ++static void dx_set_key(struct dx_path *p, struct dx_entry *entry, ++ struct dx_key *key); ++static unsigned dx_get_count(struct dx_entry *entries); ++static unsigned dx_get_limit(struct dx_entry *entries); ++static void dx_set_count(struct dx_entry 
*entries, unsigned value); ++static void dx_set_limit(struct dx_entry *entries, unsigned value); ++static unsigned dx_root_limit(struct dx_path *p); ++static unsigned dx_node_limit(struct dx_path *p); ++static int dx_probe(struct dentry *dentry, ++ struct inode *dir, ++ struct dx_hash_info *hinfo, ++ struct dx_path *path); + static int dx_make_map (struct ext3_dir_entry_2 *de, int size, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); + static void dx_sort_map(struct dx_map_entry *map, unsigned count); + static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, + struct dx_map_entry *offsets, int count); + static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); +-static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); ++static void dx_insert_block (struct dx_path *path, ++ struct dx_frame *frame, u32 hash, u32 block); + static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_path *path, __u32 *start_hash); + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, +@@ -186,29 +238,72 @@ static struct buffer_head * ext3_dx_find + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); + ++static inline void dx_path_init(struct dx_path *path, struct inode *inode); ++static inline void dx_path_fini(struct dx_path *path); ++ ++ + /* + * Future: use high four bits of block for coalesce-on-delete flags + * Mask them off for now. + */ + +-static inline unsigned dx_get_block (struct dx_entry *entry) ++static inline void *entry_off(struct dx_entry *entry, ptrdiff_t off) + { +- return le32_to_cpu(entry->block) & 0x00ffffff; ++ return (void *)((char *)entry + off); + } + +-static inline void dx_set_block (struct dx_entry *entry, unsigned value) ++static inline size_t dx_entry_size(struct dx_path *p) + { +- entry->block = cpu_to_le32(value); ++ return p->dp_param->dpo_key_size + p->dp_param->dpo_ptr_size; + } + +-static inline unsigned dx_get_hash (struct dx_entry *entry) ++static inline struct dx_entry *dx_entry_shift(struct dx_path *p, ++ struct dx_entry *entry, int shift) + { +- return le32_to_cpu(entry->hash); ++ void *e = entry; ++ return e + shift * dx_entry_size(p); ++} ++ ++static inline ptrdiff_t dx_entry_diff(struct dx_path *p, ++ struct dx_entry *e1, struct dx_entry *e2) ++{ ++ ptrdiff_t diff; ++ ++ diff = (void *)e1 - (void *)e2; ++ assert(diff / dx_entry_size(p) * dx_entry_size(p) == diff); ++ return diff / dx_entry_size(p); + } + +-static inline void dx_set_hash (struct dx_entry *entry, unsigned value) ++static inline unsigned dx_get_block(struct dx_path *p, struct dx_entry *entry) + { +- entry->hash = cpu_to_le32(value); ++ return le32_to_cpu(*(u32 *)entry_off(entry, p->dp_param->dpo_key_size)) ++ & 0x00ffffff; ++} ++ ++static inline void dx_set_block(struct dx_path *p, ++ struct dx_entry *entry, unsigned value) ++{ ++ *(u32*)entry_off(entry, p->dp_param->dpo_key_size) = cpu_to_le32(value); ++} ++ ++static inline struct dx_key *dx_get_key(struct dx_path *p, ++ struct dx_entry *entry, ++ struct dx_key *key) ++{ ++ memcpy(key, entry, p->dp_param->dpo_key_size); ++ return key; ++} ++ ++static inline struct dx_key *dx_key_at(struct dx_path *p, ++ struct dx_entry *entry) ++{ ++ return (struct dx_key *)entry; ++} ++ ++static inline void dx_set_key(struct dx_path *p, ++ struct dx_entry *entry, struct dx_key *key) ++{ ++ memcpy(entry, key, p->dp_param->dpo_key_size); + } + + static inline unsigned dx_get_count (struct dx_entry *entries) +@@ -231,17 +326,163 @@ static 
inline void dx_set_limit (struct + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); + } + +-static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) ++static inline unsigned dx_root_limit(struct dx_path *p) ++{ ++ struct dx_param *param = p->dp_param; ++ unsigned entry_space = p->dp_object->i_sb->s_blocksize - ++ param->dpo_root_gap; ++ return entry_space / (param->dpo_key_size + param->dpo_ptr_size); ++} ++ ++static inline unsigned dx_node_limit(struct dx_path *p) ++{ ++ struct dx_param *param = p->dp_param; ++ unsigned entry_space = p->dp_object->i_sb->s_blocksize - ++ param->dpo_node_gap; ++ return entry_space / (param->dpo_key_size + param->dpo_ptr_size); ++} ++ ++static inline int dx_index_is_compat(struct dx_path *path) ++{ ++ return path->dp_param == &htree_compat_param; ++} ++ ++static struct dx_entry *dx_get_entries(struct dx_path *path, void *data, ++ int root) ++{ ++ return data + ++ (root ? ++ path->dp_param->dpo_root_gap : path->dp_param->dpo_node_gap); ++} ++ ++static struct dx_entry *dx_node_get_entries(struct dx_path *path, ++ struct dx_frame *frame) ++{ ++ return dx_get_entries(path, ++ frame->bh->b_data, frame == path->dp_frames); ++} ++ ++static inline struct dx_key *keycpy(struct dx_path *p, ++ struct dx_key *k1, struct dx_key *k2) ++{ ++ return memcpy(k1, k2, p->dp_param->dpo_key_size); ++} ++ ++static inline int keycmp(struct dx_path *p, ++ struct dx_key *k1, struct dx_key *k2) ++{ ++ return p->dp_param->dpo_keycmp(p, k1, k2); ++} ++ ++static int dx_node_check(struct dx_path *p, struct dx_frame *f) ++{ ++ struct dx_entry *e; ++ unsigned count; ++ unsigned i; ++ ++ e = dx_node_get_entries(p, f); ++ count = dx_get_count(e); ++ e = dx_entry_shift(p, e, 1); ++ for (i = 0; i < count - 1; ++i, e = dx_entry_shift(p, e, 1)) { ++ keycpy(p, p->dp_key_scratch[0], p->dp_key_scratch[1]); ++ dx_get_key(p, e, p->dp_key_scratch[1]); ++ if (i > 0 && ++ keycmp(p, p->dp_key_scratch[0], p->dp_key_scratch[1]) > 0) ++ return 0; ++ } ++ return 1; ++} ++ ++static u32 htree_root_ptr(struct dx_path *path) ++{ ++ return 0; ++} ++ ++struct htree_cookie { ++ struct dx_hash_info *hinfo; ++ struct dentry *dentry; ++}; ++ ++static int htree_node_check(struct dx_path *path, struct dx_frame *frame, ++ void *cookie) ++{ ++ void *data; ++ struct dx_entry *entries; ++ struct super_block *sb; ++ ++ data = frame->bh->b_data; ++ entries = dx_node_get_entries(path, frame); ++ sb = path->dp_object->i_sb; ++ if (frame == path->dp_frames) { ++ /* root node */ ++ struct dx_root *root; ++ struct htree_cookie *hc = cookie; ++ ++ root = data; ++ if (root->info.hash_version != DX_HASH_TEA && ++ root->info.hash_version != DX_HASH_HALF_MD4 && ++ root->info.hash_version != DX_HASH_R5 && ++ root->info.hash_version != DX_HASH_LEGACY) { ++ ext3_warning(sb, __FUNCTION__, ++ "Unrecognised inode hash code %d", ++ root->info.hash_version); ++ return ERR_BAD_DX_DIR; ++ } ++ ++ if (root->info.unused_flags & 1) { ++ ext3_warning(sb, __FUNCTION__, ++ "Unimplemented inode hash flags: %#06x", ++ root->info.unused_flags); ++ return ERR_BAD_DX_DIR; ++ } ++ ++ path->dp_indirect = root->info.indirect_levels; ++ if (path->dp_indirect > DX_MAX_TREE_HEIGHT - 1) { ++ ext3_warning(sb, __FUNCTION__, ++ "Unimplemented inode hash depth: %#06x", ++ root->info.indirect_levels); ++ return ERR_BAD_DX_DIR; ++ } ++ ++ assert((char *)entries == (((char *)&root->info) + ++ root->info.info_length)); ++ assert(dx_get_limit(entries) == dx_root_limit(path)); ++ ++ hc->hinfo->hash_version = root->info.hash_version; ++ 
hc->hinfo->seed = EXT3_SB(sb)->s_hash_seed; ++ if (hc->dentry) ++ ext3fs_dirhash(hc->dentry->d_name.name, ++ hc->dentry->d_name.len, hc->hinfo); ++ path->dp_key_target = (struct dx_key *)&hc->hinfo->hash; ++ } else { ++ /* non-root index */ ++ assert(entries == data + path->dp_param->dpo_node_gap); ++ assert(dx_get_limit(entries) == dx_node_limit(path)); ++ } ++ frame->entries = frame->at = entries; ++ return 0; ++} ++ ++static int htree_node_init(struct dx_path *path, ++ struct buffer_head *bh, int root) + { +- unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - +- EXT3_DIR_REC_LEN(2) - infosize; +- return 0? 20: entry_space / sizeof(struct dx_entry); ++ struct dx_node *node; ++ ++ assert(!root); ++ ++ node = (void *)bh->b_data; ++ node->fake.rec_len = cpu_to_le16(path->dp_object->i_sb->s_blocksize); ++ node->fake.inode = 0; ++ return 0; + } + +-static inline unsigned dx_node_limit (struct inode *dir) ++static int htree_keycmp(struct dx_path *path, ++ struct dx_key *k1, struct dx_key *k2) + { +- unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); +- return 0? 22: entry_space / sizeof(struct dx_entry); ++ __u32 p1 = le32_to_cpu(*(__u32 *)k1); ++ __u32 p2 = le32_to_cpu(*(__u32 *)k2); ++ ++ return p1 > p2 ? +1 : (p1 < p2 ? -1 : 0); + } + + /* +@@ -327,123 +568,105 @@ struct stats dx_show_entries(struct dx_h + } + #endif /* DX_DEBUG */ + +-/* +- * Probe for a directory leaf block to search. +- * +- * dx_probe can return ERR_BAD_DX_DIR, which means there was a format +- * error in the directory index, and the caller should fall back to +- * searching the directory normally. The callers of dx_probe **MUST** +- * check for this error code, and make sure it never gets reflected +- * back to userspace. +- */ +-static struct dx_frame * +-dx_probe(struct dentry *dentry, struct inode *dir, +- struct dx_hash_info *hinfo, struct dx_path *path, int *err) +-{ +- unsigned count, indirect; +- struct dx_entry *at, *entries, *p, *q, *m; +- struct dx_root *root; +- struct buffer_head *bh; +- struct dx_frame *frame = path->dp_frames; +- u32 hash; ++static int dx_lookup(struct dx_path *path, void *cookie) ++{ ++ u32 ptr; ++ int err; ++ int i; + +- frame->bh = NULL; +- if (dentry) +- dir = dentry->d_parent->d_inode; +- if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) +- goto fail; +- root = (struct dx_root *) bh->b_data; +- if (root->info.hash_version != DX_HASH_TEA && +- root->info.hash_version != DX_HASH_HALF_MD4 && +- root->info.hash_version != DX_HASH_R5 && +- root->info.hash_version != DX_HASH_LEGACY) { +- ext3_warning(dir->i_sb, __FUNCTION__, +- "Unrecognised inode hash code %d", root->info.hash_version); +- brelse(bh); +- *err = ERR_BAD_DX_DIR; +- goto fail; +- } +- hinfo->hash_version = root->info.hash_version; +- hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; +- if (dentry) +- ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); +- hash = hinfo->hash; +- +- if (root->info.unused_flags & 1) { +- ext3_warning(dir->i_sb, __FUNCTION__, +- "Unimplemented inode hash flags: %#06x", +- root->info.unused_flags); +- brelse(bh); +- *err = ERR_BAD_DX_DIR; +- goto fail; +- } ++ struct dx_param *param; ++ struct dx_frame *frame; + +- if ((indirect = root->info.indirect_levels) > DX_MAX_TREE_HEIGHT - 1) { +- ext3_warning(dir->i_sb, __FUNCTION__, +- "Unimplemented inode hash depth: %#06x", +- root->info.indirect_levels); +- brelse(bh); +- *err = ERR_BAD_DX_DIR; +- goto fail; +- } ++ param = path->dp_param; + +- entries = (struct dx_entry *) (((char *)&root->info) + +- 
root->info.info_length); +- assert(dx_get_limit(entries) == dx_root_limit(dir, +- root->info.info_length)); +- dxtrace (printk("Look up %x", hash)); +- while (1) +- { ++ for (frame = path->dp_frames, i = 0, ++ ptr = param->dpo_root_ptr(path); i <= path->dp_indirect; ++ ptr = dx_get_block(path, frame->at), ++frame, ++i) { ++ struct dx_entry *entries; ++ struct dx_entry *p; ++ struct dx_entry *q; ++ struct dx_entry *m; ++ unsigned count; ++ ++ frame->bh = ext3_bread(NULL, path->dp_object, ptr, 0, &err); ++ if (frame->bh == NULL) { ++ err = -EIO; ++ break; ++ } ++ err = param->dpo_node_check(path, frame, cookie); ++ if (err != 0) ++ break; ++ ++ assert(dx_node_check(path, frame)); ++ ++ entries = frame->entries; + count = dx_get_count(entries); +- assert (count && count <= dx_get_limit(entries)); +- p = entries + 1; +- q = entries + count - 1; +- while (p <= q) +- { +- m = p + (q - p)/2; ++ assert(count && count <= dx_get_limit(entries)); ++ p = dx_entry_shift(path, entries, 1); ++ q = dx_entry_shift(path, entries, count - 1); ++ while (p <= q) { ++ m = dx_entry_shift(path, ++ p, dx_entry_diff(path, q, p) / 2); + dxtrace(printk(".")); +- if (dx_get_hash(m) > hash) +- q = m - 1; ++ if (keycmp(path, dx_key_at(path, m), ++ path->dp_key_target) > 0) ++ q = dx_entry_shift(path, m, -1); + else +- p = m + 1; ++ p = dx_entry_shift(path, m, +1); + } + +- if (0) // linear search cross check +- { ++ frame->at = dx_entry_shift(path, p, -1); ++ if (1) { // linear search cross check + unsigned n = count - 1; ++ struct dx_entry *at; ++ + at = entries; +- while (n--) +- { ++ while (n--) { + dxtrace(printk(",")); +- if (dx_get_hash(++at) > hash) +- { +- at--; ++ at = dx_entry_shift(path, at, +1); ++ if (keycmp(path, dx_key_at(path, at), ++ path->dp_key_target) > 0) { ++ if (at != dx_entry_shift(path, frame->at, 1)) { ++ BREAKPOINT; ++ printk(KERN_EMERG "%i\n", ++ keycmp(path, dx_key_at(path, at), ++ path->dp_key_target)); ++ } ++ at = dx_entry_shift(path, at, -1); + break; + } + } +- assert (at == p - 1); ++ assert(at == frame->at); + } +- +- at = p - 1; +- dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); +- frame->bh = bh; +- frame->entries = entries; +- frame->at = at; +- if (!indirect--) +- return path->dp_frame = frame; +- if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) +- goto fail2; +- at = entries = ((struct dx_node *) bh->b_data)->entries; +- assert (dx_get_limit(entries) == dx_node_limit (dir)); +- frame++; +- } +-fail2: +- while (frame >= path->dp_frames) { +- brelse(frame->bh); +- frame--; + } +-fail: +- return NULL; ++ if (err != 0) ++ dx_path_fini(path); ++ path->dp_frame = --frame; ++ return err; ++} ++ ++/* ++ * Probe for a directory leaf block to search. ++ * ++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format ++ * error in the directory index, and the caller should fall back to ++ * searching the directory normally. The callers of dx_probe **MUST** ++ * check for this error code, and make sure it never gets reflected ++ * back to userspace. 
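Because dx_lookup() above no longer knows the entry layout at compile time (key and pointer sizes come from dx_param), its binary search does all pointer arithmetic in bytes via dx_entry_shift()/dx_entry_diff(). A self-contained sketch of the same pattern, under the assumption of an entry array whose element size is only known at run time; the names are illustrative:

    #include <stddef.h>

    /* Binary search over an array of variable-size entries, mirroring the
     * byte arithmetic of dx_entry_shift()/dx_entry_diff().  esize (key
     * size plus pointer size) is a run-time value; cmp() compares the
     * target key against the key stored at an entry.  Entry 0 carries no
     * key, exactly as in the htree index nodes, so the scan starts at
     * entry 1 and returns the last entry whose key is <= the target. */
    static void *var_bsearch(void *entries, unsigned count, size_t esize,
                             const void *key,
                             int (*cmp)(const void *key, const void *entry))
    {
            char *p = (char *)entries + esize;               /* entry 1 */
            char *q = (char *)entries + (count - 1) * esize; /* last entry */

            while (p <= q) {
                    char *m = p + (q - p) / esize / 2 * esize;

                    if (cmp(key, m) < 0)
                            q = m - esize;
                    else
                            p = m + esize;
            }
            return p - esize;
    }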
++ */ ++static int dx_probe(struct dentry *dentry, struct inode *dir, ++ struct dx_hash_info *hinfo, struct dx_path *path) ++{ ++ int err; ++ struct htree_cookie hc = { ++ .dentry = dentry, ++ .hinfo = hinfo ++ }; ++ ++ assert(dx_index_is_compat(path)); ++ err = dx_lookup(path, &hc); ++ assert(err != 0 || path->dp_frames[path->dp_indirect].bh != NULL); ++ return err; + } + + static inline void dx_path_init(struct dx_path *path, struct inode *inode) +@@ -458,11 +681,24 @@ static inline void dx_path_fini(struct d + int i; + + for (i = 0; i < ARRAY_SIZE(path->dp_frames); i++) { +- if (path->dp_frames[i].bh != NULL) ++ if (path->dp_frames[i].bh != NULL) { + brelse(path->dp_frames[i].bh); ++ path->dp_frames[i].bh = NULL; ++ } + } + } + ++static void dx_path_compat_init(struct dx_path_compat *path, ++ struct inode *inode) ++{ ++ int i; ++ dx_path_init(&path->dpc_path, inode); ++ path->dpc_path.dp_param = &htree_compat_param; ++ for (i = 0; i < ARRAY_SIZE(path->dpc_path.dp_key_scratch); ++i) ++ path->dpc_path.dp_key_scratch[i] = ++ (struct dx_key *)&path->dpc_scrach[i]; ++} ++ + /* + * This function increments the frame pointer to search the next leaf + * block, and reads in the necessary intervening nodes if the search +@@ -488,6 +724,8 @@ static int ext3_htree_next_block(struct + int err, num_frames = 0; + __u32 bhash; + ++ assert(dx_index_is_compat(path)); ++ + p = path->dp_frame; + /* + * Find the next leaf page by incrementing the frame pointer. +@@ -497,7 +735,9 @@ static int ext3_htree_next_block(struct + * nodes need to be read. + */ + while (1) { +- if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ p->at = dx_entry_shift(path, p->at, +1); ++ if (p->at < dx_entry_shift(path, p->entries, ++ dx_get_count(p->entries))) + break; + if (p == path->dp_frames) + return 0; +@@ -512,7 +752,7 @@ static int ext3_htree_next_block(struct + * desired contiuation hash. If it doesn't, return since + * there's no point to read in the successive index pages. 
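ext3_htree_next_block() above advances through leaves like an odometer: bump the deepest frame's position, and when it runs past dx_get_count(), carry into the parent frame. A toy model of just that control flow, with pos/count standing in for frame->at and the node entry counts; a real caller would then re-read the child nodes reported as stale:

    struct toy_frame { int pos, count; };

    /* Advance to the next leaf: returns how many child frames below the
     * carry point must be re-read (0 = just moved within the twig), or
     * -1 once every frame, including the root, is exhausted. */
    static int toy_next_leaf(struct toy_frame *frames, int depth)
    {
            struct toy_frame *p = frames + depth - 1;   /* deepest frame */
            int carry = 0;

            while (1) {
                    if (++p->pos < p->count)
                            return carry;   /* sibling found at this level */
                    if (p == frames)
                            return -1;      /* ran off the root: all done */
                    ++carry;
                    --p;                    /* carry into the parent frame */
            }
    }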
+ */ +- bhash = dx_get_hash(p->at); ++ dx_get_key(path, p->at, (struct dx_key *)&bhash); + if (start_hash) + *start_hash = bhash; + if ((hash & 1) == 0) { +@@ -524,12 +764,14 @@ static int ext3_htree_next_block(struct + * block so no check is necessary + */ + while (num_frames--) { +- if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), 0, &err))) ++ if (!(bh = ext3_bread(NULL, dir, ++ dx_get_block(path, p->at), 0, &err))) + return err; /* Failure */ + ++p; + brelse (p->bh); + p->bh = bh; +- p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; ++ p->at = p->entries = dx_node_get_entries(path, p); ++ assert(dx_node_check(path, p)); + } + return 1; + } +@@ -598,7 +840,8 @@ int ext3_htree_fill_tree(struct file *di + { + struct dx_hash_info hinfo; + struct ext3_dir_entry_2 *de; +- struct dx_path path; ++ struct dx_path_compat cpath; ++ struct dx_path *path = &cpath.dpc_path; + struct inode *dir; + int block, err; + int count = 0; +@@ -608,7 +851,7 @@ int ext3_htree_fill_tree(struct file *di + dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, + start_minor_hash)); + dir = dir_file->f_dentry->d_inode; +- dx_path_init(&path, dir); ++ dx_path_compat_init(&cpath, dir); + if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { + hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; +@@ -619,12 +862,13 @@ int ext3_htree_fill_tree(struct file *di + } + hinfo.hash = start_hash; + hinfo.minor_hash = 0; +- if (!dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, &path, &err)) ++ err = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, path); ++ if (err != 0) + return err; + + /* Add '.' and '..' from the htree header */ + if (!start_hash && !start_minor_hash) { +- de = (struct ext3_dir_entry_2 *) path.dp_frames[0].bh->b_data; ++ de = (struct ext3_dir_entry_2 *) path->dp_frames[0].bh->b_data; + if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0) + goto errout; + de = ext3_next_entry(de); +@@ -634,7 +878,7 @@ int ext3_htree_fill_tree(struct file *di + } + + while (1) { +- block = dx_get_block(path.dp_frame->at); ++ block = dx_get_block(path, path->dp_frame->at); + ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, + start_hash, start_minor_hash); + if (ret < 0) { +@@ -643,7 +887,8 @@ int ext3_htree_fill_tree(struct file *di + } + count += ret; + hashval = ~0; +- ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, &path, &hashval); ++ ret = ext3_htree_next_block(dir, ++ HASH_NB_ALWAYS, path, &hashval); + *next_hash = hashval; + if (ret < 0) { + err = ret; +@@ -658,12 +903,12 @@ int ext3_htree_fill_tree(struct file *di + (count && ((hashval & 1) == 0))) + break; + } +- dx_path_fini(&path); ++ dx_path_fini(path); + dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", + count, *next_hash)); + return count; + errout: +- dx_path_fini(&path); ++ dx_path_fini(path); + return (err); + } + +@@ -722,17 +967,19 @@ static void dx_sort_map (struct dx_map_e + } while(more); + } + +-static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) ++static void dx_insert_block(struct dx_path *path, ++ struct dx_frame *frame, u32 hash, u32 block) + { + struct dx_entry *entries = frame->entries; +- struct dx_entry *old = frame->at, *new = old + 1; ++ struct dx_entry *old = frame->at, *new = dx_entry_shift(path, old, +1); + int count = dx_get_count(entries); + + assert(count < dx_get_limit(entries)); +- assert(old < entries + count); +- memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); +- 
dx_set_hash(new, hash); +- dx_set_block(new, block); ++ assert(old < dx_entry_shift(path, entries, count)); ++ memmove(dx_entry_shift(path, new, 1), new, ++ (char *)dx_entry_shift(path, entries, count) - (char *)new); ++ dx_set_key(path, new, (struct dx_key *)&hash); ++ dx_set_block(path, new, block); + dx_set_count(entries, count + 1); + } + #endif +@@ -933,8 +1180,11 @@ static struct buffer_head * ext3_dx_find + struct super_block * sb; + struct dx_hash_info hinfo; + u32 hash; +- struct dx_path path; +- struct dx_entry dummy_dot; ++ struct dx_path_compat cpath; ++ struct dx_path *path = &cpath.dpc_path; ++ struct dx_entry_compat dummy_dot = { ++ .block = 0 ++ }; + struct ext3_dir_entry_2 *de, *top; + struct buffer_head *bh; + unsigned long block; +@@ -943,20 +1193,21 @@ static struct buffer_head * ext3_dx_find + const u8 *name = dentry->d_name.name; + struct inode *dir = dentry->d_parent->d_inode; + +- dx_path_init(&path, dir); ++ dx_path_compat_init(&cpath, dir); ++ + sb = dir->i_sb; + /* NFS may look up ".." - look at dx_root directory block */ + if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ +- if (!(dx_probe(dentry, NULL, &hinfo, &path, err))) ++ *err = dx_probe(dentry, NULL, &hinfo, path); ++ if (*err != 0) + return NULL; + } else { +- path.dp_frame->bh = NULL; /* for dx_path_fini() */ +- path.dp_frame->at = &dummy_dot; /* hack for zero entry*/ +- dx_set_block(path.dp_frame->at, 0); /* dx_root block is 0 */ ++ path->dp_frame->bh = NULL; /* for dx_path_fini() */ ++ path->dp_frame->at = (void *)&dummy_dot;/* hack for zero entry*/ + } + hash = hinfo.hash; + do { +- block = dx_get_block(path.dp_frame->at); ++ block = dx_get_block(path, path->dp_frame->at); + if (!(bh = ext3_bread (NULL,dir, block, 0, err))) + goto errout; + de = (struct ext3_dir_entry_2 *) bh->b_data; +@@ -972,12 +1223,12 @@ static struct buffer_head * ext3_dx_find + goto errout; + } + *res_dir = de; +- dx_path_fini(&path); ++ dx_path_fini(path); + return bh; + } + brelse (bh); + /* Check to see if we should continue to search */ +- retval = ext3_htree_next_block(dir, hash, &path, NULL); ++ retval = ext3_htree_next_block(dir, hash, path, NULL); + if (retval < 0) { + ext3_warning(sb, __FUNCTION__, + "error reading index page in directory #%lu", +@@ -990,7 +1241,7 @@ static struct buffer_head * ext3_dx_find + *err = -ENOENT; + errout: + dxtrace(printk("%s not found\n", name)); +- dx_path_fini(&path); ++ dx_path_fini(path); + return NULL; + } + #endif +@@ -1115,10 +1366,11 @@ static struct ext3_dir_entry_2* dx_pack_ + + /* Allocate new node, and split leaf node @bh into it, inserting new pointer + * into parent node identified by @frame */ +-static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, ++static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct dx_path *path, + struct buffer_head **bh,struct dx_frame *frame, + struct dx_hash_info *hinfo, int *error) + { ++ struct inode *dir = path->dp_object; + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; + struct buffer_head *bh2; +@@ -1180,7 +1432,7 @@ static struct ext3_dir_entry_2 *do_split + swap(*bh, bh2); + de = de2; + } +- dx_insert_block (frame, hash2 + continued, newblock); ++ dx_insert_block(path, frame, hash2 + continued, newblock); + err = ext3_journal_dirty_metadata (handle, bh2); + if (err) + goto journal_error; +@@ -1303,7 +1555,8 @@ static int make_indexed_dir(handle_t *ha + int namelen = dentry->d_name.len; + struct buffer_head *bh2; + struct dx_root *root; +- struct dx_path 
path; ++ struct dx_path_compat cpath; ++ struct dx_path *path = &cpath.dpc_path; + struct dx_entry *entries; + struct ext3_dir_entry_2 *de, *de2; + char *data1, *top; +@@ -1314,7 +1567,7 @@ static int make_indexed_dir(handle_t *ha + u32 block; + struct fake_dirent *fde; + +- dx_path_init(&path, dir); ++ dx_path_compat_init(&cpath, dir); + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk("Creating index\n")); + retval = ext3_journal_get_write_access(handle, bh); +@@ -1350,21 +1603,21 @@ static int make_indexed_dir(handle_t *ha + root->info.info_length = sizeof(root->info); + root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; + root->info.hash_version = DX_HASH_R5; +- entries = root->entries; +- dx_set_block (entries, 1); ++ entries = (void *)root->entries; ++ dx_set_block (path, entries, 1); + dx_set_count (entries, 1); +- dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); ++ dx_set_limit (entries, dx_root_limit(path)); + + /* Initialize as for dx_probe */ + hinfo.hash_version = root->info.hash_version; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; + ext3fs_dirhash(name, namelen, &hinfo); +- path.dp_frame->entries = entries; +- path.dp_frame->at = entries; +- path.dp_frame->bh = bh; ++ path->dp_frame->entries = entries; ++ path->dp_frame->at = entries; ++ path->dp_frame->bh = bh; + bh = bh2; +- de = do_split(handle,dir, &bh, path.dp_frame, &hinfo, &retval); +- dx_path_fini(&path); ++ de = do_split(handle, path, &bh, path->dp_frame, &hinfo, &retval); ++ dx_path_fini(path); + if (!de) + return retval; + +@@ -1445,9 +1698,10 @@ static int ext3_add_entry (handle_t *han + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) + { +- struct dx_path path; ++ struct dx_path_compat cpath; ++ struct dx_path *path = &cpath.dpc_path; ++ struct dx_param *param; + struct dx_frame *frame, *safe; +- struct dx_node *node2; + struct dx_entry *entries; /* old block contents */ + struct dx_entry *entries2; /* new block contents */ + struct dx_hash_info hinfo; +@@ -1462,16 +1716,20 @@ static int ext3_dx_add_entry(handle_t *h + int i; + size_t isize; + +- dx_path_init(&path, dir); +- if (!dx_probe(dentry, NULL, &hinfo, &path, &err)) ++ dx_path_compat_init(&cpath, dir); ++ param = path->dp_param; ++ ++ err = dx_probe(dentry, NULL, &hinfo, path); ++ if (err != 0) + return err; +- frame = path.dp_frame; ++ frame = path->dp_frame; + entries = frame->entries; + + /* XXX nikita: global serialization! */ + isize = dir->i_size; + +- if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) ++ if (!(bh = ext3_bread(handle, dir, ++ dx_get_block(path, frame->at), 0, &err))) + goto cleanup; + + BUFFER_TRACE(bh, "get_write_access"); +@@ -1503,7 +1761,7 @@ static int ext3_dx_add_entry(handle_t *h + dx_get_count(entries), dx_get_limit(entries))); + + /* What levels need split? */ +- for (nr_splet = 0; frame >= path.dp_frames && ++ for (nr_splet = 0; frame >= path->dp_frames && + dx_get_count(frame->entries) == dx_get_limit(frame->entries); + --frame, ++nr_splet) { + if (nr_splet == DX_MAX_TREE_HEIGHT) { +@@ -1519,19 +1777,16 @@ static int ext3_dx_add_entry(handle_t *h + * transaction... 
*/ + for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) { + bh_new[i] = ext3_append (handle, dir, &newblock[i], &err); +- if (!bh_new[i]) ++ if (!bh_new[i] || ++ param->dpo_node_init(path, bh_new[i], 0) != 0) + goto cleanup; +- node2 = (struct dx_node *)(bh_new[i]->b_data); +- entries2 = node2->entries; +- node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); +- node2->fake.inode = 0; + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + } + /* Add "safe" node to transaction too */ +- if (safe + 1 != path.dp_frames) { ++ if (safe + 1 != path->dp_frames) { + err = ext3_journal_get_write_access(handle, safe->bh); + if (err) + goto journal_error; +@@ -1545,13 +1800,12 @@ static int ext3_dx_add_entry(handle_t *h + + entries = frame->entries; + count = dx_get_count(entries); +- idx = frame->at - entries; ++ idx = dx_entry_diff(path, frame->at, entries); + + bh2 = bh_new[i]; +- node2 = (struct dx_node *)(bh2->b_data); +- entries2 = node2->entries; ++ entries2 = dx_get_entries(path, bh2->b_data, 0); + +- if (frame == path.dp_frames) { ++ if (frame == path->dp_frames) { + /* splitting root node. Tricky point: + * + * In the "normal" B-tree we'd split root *and* add +@@ -1566,27 +1820,29 @@ static int ext3_dx_add_entry(handle_t *h + u8 indirects; + struct dx_frame *frames; + +- frames = path.dp_frames; ++ frames = path->dp_frames; + root = (struct dx_root *) frames->bh->b_data; + indirects = root->info.indirect_levels; + dxtrace(printk("Creating new root %d\n", indirects)); + memcpy((char *) entries2, (char *) entries, +- count * sizeof(struct dx_entry)); +- dx_set_limit(entries2, dx_node_limit(dir)); ++ count * dx_entry_size(path)); ++ dx_set_limit(entries2, dx_node_limit(path)); + + /* Set up root */ + dx_set_count(entries, 1); +- dx_set_block(entries + 0, newblock[i]); ++ dx_set_block(path, entries, newblock[i]); + root->info.indirect_levels = indirects + 1; + + /* Shift frames in the path */ + memmove(frames + 2, frames + 1, +- (sizeof path.dp_frames) - 2 * sizeof frames[0]); ++ (sizeof path->dp_frames) - 2 * sizeof frames[0]); + /* Add new access path frame */ +- frames[1].at = entries2 + idx; ++ frames[1].at = dx_entry_shift(path, entries2, idx); + frames[1].entries = entries = entries2; + frames[1].bh = bh2; ++ assert(dx_node_check(path, frame)); + ++ frame; ++ assert(dx_node_check(path, frame)); + bh_new[i] = NULL; /* buffer head is "consumed" */ + err = ext3_journal_get_write_access(handle, bh2); + if (err) +@@ -1594,23 +1850,32 @@ static int ext3_dx_add_entry(handle_t *h + } else { + /* splitting non-root index node. */ + unsigned count1 = count/2, count2 = count - count1; +- unsigned hash2 = dx_get_hash(entries + count1); ++ unsigned hash2; ++ ++ dx_get_key(path, ++ dx_entry_shift(path, entries, count1), ++ (struct dx_key *)&hash2); ++ + dxtrace(printk("Split index %i/%i\n", count1, count2)); + +- memcpy ((char *) entries2, (char *) (entries + count1), +- count2 * sizeof(struct dx_entry)); ++ memcpy ((char *) entries2, ++ (char *) dx_entry_shift(path, entries, count1), ++ count2 * dx_entry_size(path)); + dx_set_count (entries, count1); + dx_set_count (entries2, count2); +- dx_set_limit (entries2, dx_node_limit(dir)); ++ dx_set_limit (entries2, dx_node_limit(path)); + + /* Which index block gets the new entry? 
*/ + if (idx >= count1) { +- frame->at = entries2 + idx - count1; ++ frame->at = dx_entry_shift(path, entries2, ++ idx - count1); + frame->entries = entries = entries2; + swap(frame->bh, bh2); + bh_new[i] = bh2; + } +- dx_insert_block (frame - 1, hash2, newblock[i]); ++ dx_insert_block(path, frame - 1, hash2, newblock[i]); ++ assert(dx_node_check(path, frame)); ++ assert(dx_node_check(path, frame - 1)); + dxtrace(dx_show_index ("node", frame->entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); +@@ -1619,9 +1884,10 @@ static int ext3_dx_add_entry(handle_t *h + goto journal_error; + } + } +- de = do_split(handle, dir, &bh, --frame, &hinfo, &err); ++ de = do_split(handle, path, &bh, --frame, &hinfo, &err); + if (!de) + goto cleanup; ++ assert(dx_node_check(path, frame)); + err = add_dirent_to_buf(handle, dentry, inode, de, bh); + goto cleanup2; + +@@ -1637,7 +1903,7 @@ cleanup2: + } + if (err) + inode->i_size = isize; +- dx_path_fini(&path); ++ dx_path_fini(path); + return err; + } + #endif diff --git a/ldiskfs/kernel_patches/patches/ext3-htree-path.patch b/ldiskfs/kernel_patches/patches/ext3-htree-path.patch new file mode 100644 index 0000000..893d1d1 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-htree-path.patch @@ -0,0 +1,406 @@ +Index: iam-src/fs/ext3/namei.c +=================================================================== +--- iam-src.orig/fs/ext3/namei.c 2006-02-09 20:44:02.000000000 +0300 ++++ iam-src/fs/ext3/namei.c 2006-02-10 18:23:32.000000000 +0300 +@@ -147,6 +147,15 @@ struct dx_map_entry + u32 offs; + }; + ++/* ++ * Structure to keep track of a path drilled through htree. ++ */ ++struct dx_path { ++ struct inode *dp_object; ++ struct dx_frame dp_frames[DX_MAX_TREE_HEIGHT]; ++ struct dx_frame *dp_frame; ++}; ++ + #ifdef CONFIG_EXT3_INDEX + static inline unsigned dx_get_block (struct dx_entry *entry); + static void dx_set_block (struct dx_entry *entry, unsigned value); +@@ -161,9 +170,8 @@ static unsigned dx_node_limit (struct in + static struct dx_frame *dx_probe(struct dentry *dentry, + struct inode *dir, + struct dx_hash_info *hinfo, +- struct dx_frame *frame, ++ struct dx_path *path, + int *err); +-static void dx_release (struct dx_frame *frames); + static int dx_make_map (struct ext3_dir_entry_2 *de, int size, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); + static void dx_sort_map(struct dx_map_entry *map, unsigned count); +@@ -172,9 +180,7 @@ static struct ext3_dir_entry_2 *dx_move_ + static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); + static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); + static int ext3_htree_next_block(struct inode *dir, __u32 hash, +- struct dx_frame *frame, +- struct dx_frame *frames, +- __u32 *start_hash); ++ struct dx_path *path, __u32 *start_hash); + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, + struct ext3_dir_entry_2 **res_dir, int *err); + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, +@@ -332,13 +338,13 @@ struct stats dx_show_entries(struct dx_h + */ + static struct dx_frame * + dx_probe(struct dentry *dentry, struct inode *dir, +- struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) ++ struct dx_hash_info *hinfo, struct dx_path *path, int *err) + { + unsigned count, indirect; + struct dx_entry *at, *entries, *p, *q, *m; + struct dx_root *root; + struct buffer_head *bh; +- struct dx_frame *frame = frame_in; ++ struct dx_frame *frame = path->dp_frames; + u32 hash; + + frame->bh = 
NULL; +@@ -352,8 +358,7 @@ dx_probe(struct dentry *dentry, struct i + root->info.hash_version != DX_HASH_R5 && + root->info.hash_version != DX_HASH_LEGACY) { + ext3_warning(dir->i_sb, __FUNCTION__, +- "Unrecognised inode hash code %d", +- root->info.hash_version); ++ "Unrecognised inode hash code %d", root->info.hash_version); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; +@@ -424,7 +429,8 @@ dx_probe(struct dentry *dentry, struct i + frame->bh = bh; + frame->entries = entries; + frame->at = at; +- if (!indirect--) return frame; ++ if (!indirect--) ++ return path->dp_frame = frame; + if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) + goto fail2; + at = entries = ((struct dx_node *) bh->b_data)->entries; +@@ -432,7 +438,7 @@ dx_probe(struct dentry *dentry, struct i + frame++; + } + fail2: +- while (frame >= frame_in) { ++ while (frame >= path->dp_frames) { + brelse(frame->bh); + frame--; + } +@@ -440,16 +446,20 @@ fail: + return NULL; + } + +-static void dx_release (struct dx_frame *frames) ++static inline void dx_path_init(struct dx_path *path, struct inode *inode) + { +- int height; ++ memset(path, 0, sizeof *path); ++ path->dp_object = inode; ++ path->dp_frame = path->dp_frames; ++} + +- if (frames[0].bh == NULL) +- return; +- height = ((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels; +- for (; height >= 0; height--) { +- assert(frames[height].bh != NULL); +- brelse(frames[height].bh); ++static inline void dx_path_fini(struct dx_path *path) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(path->dp_frames); i++) { ++ if (path->dp_frames[i].bh != NULL) ++ brelse(path->dp_frames[i].bh); + } + } + +@@ -471,16 +481,14 @@ static void dx_release (struct dx_frame + * hash of the next page. + */ + static int ext3_htree_next_block(struct inode *dir, __u32 hash, +- struct dx_frame *frame, +- struct dx_frame *frames, +- __u32 *start_hash) ++ struct dx_path *path, __u32 *start_hash) + { + struct dx_frame *p; + struct buffer_head *bh; + int err, num_frames = 0; + __u32 bhash; + +- p = frame; ++ p = path->dp_frame; + /* + * Find the next leaf page by incrementing the frame pointer. 
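dx_path_init()/dx_path_fini() above replace dx_release(): because init zeroes every frame, fini can walk the whole frame array and release only what a (possibly partial) lookup actually pinned, instead of trusting indirect_levels in the root block. The same bracket pattern in a userspace sketch, with free() standing in for brelse(); names are illustrative:

    #include <stdlib.h>
    #include <string.h>

    enum { TOY_MAX_HEIGHT = 5 };

    struct toy_frame { void *buf; };
    struct toy_path  { struct toy_frame frames[TOY_MAX_HEIGHT]; };

    static void toy_path_init(struct toy_path *path)
    {
            memset(path, 0, sizeof *path);     /* every frame starts unpinned */
    }

    static void toy_path_fini(struct toy_path *path)
    {
            int i;

            for (i = 0; i < TOY_MAX_HEIGHT; i++) {
                    free(path->frames[i].buf); /* free(NULL) is a no-op, so a
                                                * partially filled path is fine */
                    path->frames[i].buf = NULL;
            }
    }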
+ * If we run out of entries in the interior node, loop around and +@@ -491,10 +499,10 @@ static int ext3_htree_next_block(struct + while (1) { + if (++(p->at) < p->entries + dx_get_count(p->entries)) + break; +- if (p == frames) ++ if (p == path->dp_frames) + return 0; + num_frames++; +- p--; ++ --p; + } + + /* +@@ -516,10 +524,9 @@ static int ext3_htree_next_block(struct + * block so no check is necessary + */ + while (num_frames--) { +- if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), +- 0, &err))) ++ if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), 0, &err))) + return err; /* Failure */ +- p++; ++ ++p; + brelse (p->bh); + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; +@@ -591,7 +598,7 @@ int ext3_htree_fill_tree(struct file *di + { + struct dx_hash_info hinfo; + struct ext3_dir_entry_2 *de; +- struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame; ++ struct dx_path path; + struct inode *dir; + int block, err; + int count = 0; +@@ -601,6 +608,7 @@ int ext3_htree_fill_tree(struct file *di + dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, + start_minor_hash)); + dir = dir_file->f_dentry->d_inode; ++ dx_path_init(&path, dir); + if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { + hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; +@@ -611,13 +619,12 @@ int ext3_htree_fill_tree(struct file *di + } + hinfo.hash = start_hash; + hinfo.minor_hash = 0; +- frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err); +- if (!frame) ++ if (!dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, &path, &err)) + return err; + + /* Add '.' and '..' from the htree header */ + if (!start_hash && !start_minor_hash) { +- de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data; ++ de = (struct ext3_dir_entry_2 *) path.dp_frames[0].bh->b_data; + if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0) + goto errout; + de = ext3_next_entry(de); +@@ -627,7 +634,7 @@ int ext3_htree_fill_tree(struct file *di + } + + while (1) { +- block = dx_get_block(frame->at); ++ block = dx_get_block(path.dp_frame->at); + ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, + start_hash, start_minor_hash); + if (ret < 0) { +@@ -636,8 +643,7 @@ int ext3_htree_fill_tree(struct file *di + } + count += ret; + hashval = ~0; +- ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, +- frame, frames, &hashval); ++ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, &path, &hashval); + *next_hash = hashval; + if (ret < 0) { + err = ret; +@@ -652,12 +658,12 @@ int ext3_htree_fill_tree(struct file *di + (count && ((hashval & 1) == 0))) + break; + } +- dx_release(frames); ++ dx_path_fini(&path); + dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", + count, *next_hash)); + return count; + errout: +- dx_release(frames); ++ dx_path_fini(&path); + return (err); + } + +@@ -927,7 +933,8 @@ static struct buffer_head * ext3_dx_find + struct super_block * sb; + struct dx_hash_info hinfo; + u32 hash; +- struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame; ++ struct dx_path path; ++ struct dx_entry dummy_dot; + struct ext3_dir_entry_2 *de, *top; + struct buffer_head *bh; + unsigned long block; +@@ -936,20 +943,20 @@ static struct buffer_head * ext3_dx_find + const u8 *name = dentry->d_name.name; + struct inode *dir = dentry->d_parent->d_inode; + ++ dx_path_init(&path, dir); + sb = dir->i_sb; + /* NFS may look up ".." - look at dx_root directory block */ + if (namelen > 2 || name[0] != '.'||(name[1] != '.' 
&& name[1] != '\0')){ +- if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) ++ if (!(dx_probe(dentry, NULL, &hinfo, &path, err))) + return NULL; + } else { +- frame = frames; +- frame->bh = NULL; /* for dx_release() */ +- frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ +- dx_set_block(frame->at, 0); /* dx_root block is 0 */ ++ path.dp_frame->bh = NULL; /* for dx_path_fini() */ ++ path.dp_frame->at = &dummy_dot; /* hack for zero entry*/ ++ dx_set_block(path.dp_frame->at, 0); /* dx_root block is 0 */ + } + hash = hinfo.hash; + do { +- block = dx_get_block(frame->at); ++ block = dx_get_block(path.dp_frame->at); + if (!(bh = ext3_bread (NULL,dir, block, 0, err))) + goto errout; + de = (struct ext3_dir_entry_2 *) bh->b_data; +@@ -965,13 +972,12 @@ static struct buffer_head * ext3_dx_find + goto errout; + } + *res_dir = de; +- dx_release (frames); ++ dx_path_fini(&path); + return bh; + } + brelse (bh); + /* Check to see if we should continue to search */ +- retval = ext3_htree_next_block(dir, hash, frame, +- frames, NULL); ++ retval = ext3_htree_next_block(dir, hash, &path, NULL); + if (retval < 0) { + ext3_warning(sb, __FUNCTION__, + "error reading index page in directory #%lu", +@@ -984,7 +990,7 @@ static struct buffer_head * ext3_dx_find + *err = -ENOENT; + errout: + dxtrace(printk("%s not found\n", name)); +- dx_release (frames); ++ dx_path_fini(&path); + return NULL; + } + #endif +@@ -1297,7 +1303,7 @@ static int make_indexed_dir(handle_t *ha + int namelen = dentry->d_name.len; + struct buffer_head *bh2; + struct dx_root *root; +- struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame; ++ struct dx_path path; + struct dx_entry *entries; + struct ext3_dir_entry_2 *de, *de2; + char *data1, *top; +@@ -1308,6 +1314,7 @@ static int make_indexed_dir(handle_t *ha + u32 block; + struct fake_dirent *fde; + ++ dx_path_init(&path, dir); + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk("Creating index\n")); + retval = ext3_journal_get_write_access(handle, bh); +@@ -1352,14 +1359,13 @@ static int make_indexed_dir(handle_t *ha + hinfo.hash_version = root->info.hash_version; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; + ext3fs_dirhash(name, namelen, &hinfo); +- frame = frames; +- frame->entries = entries; +- frame->at = entries; +- frame->bh = bh; ++ path.dp_frame->entries = entries; ++ path.dp_frame->at = entries; ++ path.dp_frame->bh = bh; + bh = bh2; +- de = do_split(handle,dir, &bh, frame, &hinfo, &retval); +- dx_release (frames); +- if (!(de)) ++ de = do_split(handle,dir, &bh, path.dp_frame, &hinfo, &retval); ++ dx_path_fini(&path); ++ if (!de) + return retval; + + return add_dirent_to_buf(handle, dentry, inode, de, bh); +@@ -1439,7 +1445,8 @@ static int ext3_add_entry (handle_t *han + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) + { +- struct dx_frame frames[DX_MAX_TREE_HEIGHT] = {{0,},}, *frame, *safe; ++ struct dx_path path; ++ struct dx_frame *frame, *safe; + struct dx_node *node2; + struct dx_entry *entries; /* old block contents */ + struct dx_entry *entries2; /* new block contents */ +@@ -1455,9 +1462,10 @@ static int ext3_dx_add_entry(handle_t *h + int i; + size_t isize; + +- frame = dx_probe(dentry, NULL, &hinfo, frames, &err); +- if (!frame) ++ dx_path_init(&path, dir); ++ if (!dx_probe(dentry, NULL, &hinfo, &path, &err)) + return err; ++ frame = path.dp_frame; + entries = frame->entries; + + /* XXX nikita: global serialization! 
*/ +@@ -1495,7 +1503,7 @@ static int ext3_dx_add_entry(handle_t *h + dx_get_count(entries), dx_get_limit(entries))); + + /* What levels need split? */ +- for (nr_splet = 0; frame >= frames && ++ for (nr_splet = 0; frame >= path.dp_frames && + dx_get_count(frame->entries) == dx_get_limit(frame->entries); + --frame, ++nr_splet) { + if (nr_splet == DX_MAX_TREE_HEIGHT) { +@@ -1523,7 +1531,7 @@ static int ext3_dx_add_entry(handle_t *h + goto journal_error; + } + /* Add "safe" node to transaction too */ +- if (safe + 1 != frames) { ++ if (safe + 1 != path.dp_frames) { + err = ext3_journal_get_write_access(handle, safe->bh); + if (err) + goto journal_error; +@@ -1543,7 +1551,7 @@ static int ext3_dx_add_entry(handle_t *h + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; + +- if (frame == frames) { ++ if (frame == path.dp_frames) { + /* splitting root node. Tricky point: + * + * In the "normal" B-tree we'd split root *and* add +@@ -1556,7 +1564,9 @@ static int ext3_dx_add_entry(handle_t *h + */ + struct dx_root *root; + u8 indirects; ++ struct dx_frame *frames; + ++ frames = path.dp_frames; + root = (struct dx_root *) frames->bh->b_data; + indirects = root->info.indirect_levels; + dxtrace(printk("Creating new root %d\n", indirects)); +@@ -1571,7 +1581,7 @@ static int ext3_dx_add_entry(handle_t *h + + /* Shift frames in the path */ + memmove(frames + 2, frames + 1, +- (sizeof frames) - 2 * sizeof frames[0]); ++ (sizeof path.dp_frames) - 2 * sizeof frames[0]); + /* Add new access path frame */ + frames[1].at = entries2 + idx; + frames[1].entries = entries = entries2; +@@ -1627,7 +1637,7 @@ cleanup2: + } + if (err) + inode->i_size = isize; +- dx_release(frames); ++ dx_path_fini(&path); + return err; + } + #endif diff --git a/ldiskfs/kernel_patches/patches/ext3-htree-r5-hash.patch b/ldiskfs/kernel_patches/patches/ext3-htree-r5-hash.patch new file mode 100644 index 0000000..48897e7 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-htree-r5-hash.patch @@ -0,0 +1,88 @@ +Index: iam-src/fs/ext3/hash.c +=================================================================== +--- iam-src.orig/fs/ext3/hash.c 2006-02-11 01:08:59.000000000 +0300 ++++ iam-src/fs/ext3/hash.c 2006-02-11 20:46:22.000000000 +0300 +@@ -4,7 +4,7 @@ + * Copyright (C) 2002 by Theodore Ts'o + * + * This file is released under the GPL v2. +- * ++ * + * This file may be redistributed under the terms of the GNU Public + * License. + */ +@@ -115,6 +115,18 @@ static __u32 dx_hack_hash (const char *n + return (hash0 << 1); + } + ++static __u32 dx_r5_hash(const signed char *msg, int len) ++{ ++ __u32 a = 0; ++ while (len--) { ++ a += *msg << 4; ++ a += *msg >> 4; ++ a *= 11; ++ msg++; ++ } ++ return a; ++} ++ + static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) + { + __u32 pad, val; +@@ -146,11 +158,11 @@ static void str2hashbuf(const char *msg, + * Returns the hash of a filename. If len is 0 and name is NULL, then + * this function can be used to test whether or not a hash version is + * supported. +- * ++ * + * The seed is an 4 longword (32 bits) "secret" which can be used to + * uniquify a hash. If the seed is all zero's, then some default seed + * may be used. +- * ++ * + * A particular hash version specifies whether or not the seed is + * represented, and whether or not the returned hash is 32 bits or 64 + * bits. 32 bit hashes will return 0 for the minor hash. 
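dx_r5_hash() introduced above is the reiserfs-style r5 function. Unlike tea or half_md4 it is deliberately non-uniform: names that share a prefix and differ only in a trailing counter map to closely spaced hash values, so sequential creates land in nearby leaves. A standalone copy of the same arithmetic, renamed so it can be compiled in userspace for experiments:

    #include <stdio.h>
    #include <string.h>

    /* Same arithmetic as the dx_r5_hash() hunk above. */
    static unsigned int r5_hash(const signed char *msg, int len)
    {
            unsigned int a = 0;

            while (len--) {
                    a += *msg << 4;
                    a += *msg >> 4;
                    a *= 11;
                    msg++;
            }
            return a;
    }

    int main(void)
    {
            const char *names[] = { "obj001", "obj002", "obj003" };
            unsigned int i;

            /* sequential names produce closely spaced hashes */
            for (i = 0; i < 3; i++)
                    printf("%s -> %#010x\n", names[i],
                           r5_hash((const signed char *)names[i],
                                   (int)strlen(names[i])));
            return 0;
    }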
+@@ -205,6 +217,9 @@ int ext3fs_dirhash(const char *name, int
+ hash = buf[0];
+ minor_hash = buf[1];
+ break;
++ case DX_HASH_R5:
++ hash = dx_r5_hash(name, len);
++ break;
+ default:
+ hinfo->hash = 0;
+ return -1;
+Index: iam-src/fs/ext3/namei.c
+===================================================================
+--- iam-src.orig/fs/ext3/namei.c 2006-02-11 01:09:12.000000000 +0300
++++ iam-src/fs/ext3/namei.c 2006-02-11 20:45:58.000000000 +0300
+@@ -370,6 +370,7 @@ dx_probe(struct dentry *dentry, struct i
+ root = (struct dx_root *) bh->b_data;
+ if (root->info.hash_version != DX_HASH_TEA &&
+ root->info.hash_version != DX_HASH_HALF_MD4 &&
++ root->info.hash_version != DX_HASH_R5 &&
+ root->info.hash_version != DX_HASH_LEGACY) {
+ ext3_warning(dir->i_sb, __FUNCTION__,
+ "Unrecognised inode hash code %d", root->info.hash_version);
+@@ -1363,6 +1364,7 @@ static int make_indexed_dir(handle_t *ha
+ memset (&root->info, 0, sizeof(root->info));
+ root->info.info_length = sizeof(root->info);
+ root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
++ root->info.hash_version = DX_HASH_R5;
+ entries = root->entries;
+ dx_set_block (entries, 1);
+ dx_set_count (entries, 1);
+Index: iam-src/include/linux/ext3_fs.h
+===================================================================
+--- iam-src.orig/include/linux/ext3_fs.h 2006-02-11 01:08:59.000000000 +0300
++++ iam-src/include/linux/ext3_fs.h 2006-02-11 20:45:58.000000000 +0300
+@@ -665,6 +665,7 @@ struct ext3_dir_entry_2 {
+ #define DX_HASH_LEGACY 0
+ #define DX_HASH_HALF_MD4 1
+ #define DX_HASH_TEA 2
++#define DX_HASH_R5 3
+
+ /* hash info structure used by the directory hash */
+ struct dx_hash_info
diff --git a/ldiskfs/kernel_patches/patches/ext3-iam-ops.patch b/ldiskfs/kernel_patches/patches/ext3-iam-ops.patch
new file mode 100644
index 0000000..e59f0c8
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/ext3-iam-ops.patch
@@ -0,0 +1,1178 @@
+Index: iam/fs/ext3/namei.c
+===================================================================
+--- iam.orig/fs/ext3/namei.c
++++ iam/fs/ext3/namei.c
+@@ -82,13 +82,16 @@
+ *
+ * Entries in index node are sorted by their key value.
+ *
++ * Format of leaf node:
+ *
+- *
+- *
+- *
+- *
+- *
+- *
++ * +-----+-------+-------+-------+------+-------+------------+
++ * | | count | | | | | |
++ * | gap | / | leaf | leaf | .... | leaf | free space |
++ * | | limit | | | | | |
++ * +-----+-------+-------+-------+------+-------+------------+
++
++ * leaf For leaf entry: consists of a rec immediately followed by
++ * a key. The size of a key and the size of a rec depend on the container.
+ *
+ *
+ *
+@@ -96,6 +99,7 @@
+ *
+ */
+
++#include
+ #include
+ #include
+ #include
+@@ -111,7 +115,7 @@
+ #include "xattr.h"
+ #include "iopen.h"
+ #include "acl.h"
+-
++#include
+ /*
+ * define how far ahead to read directories while searching them.
+ */
+@@ -120,13 +124,6 @@
+ #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+ #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
+
+-/*
+- * Maximal number of non-leaf levels in htree. In the stock ext3 this is 2.
+- */
+-enum {
+- DX_MAX_TREE_HEIGHT = 5,
+- DX_SCRATCH_KEYS = 2
+-};
+
+ static struct buffer_head *ext3_append(handle_t *handle,
+ struct inode *inode,
+@@ -205,194 +202,6 @@ struct dx_map_entry
+ u32 offs;
+ };
+
+-/*
+- * Entry within index tree node. Consists of a key immediately followed
+- * (without padding) by a pointer to the child node.
+- *
+- * Both key and pointer are of variable size, hence incomplete type.
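To make the leaf format described above concrete: every leaf entry is a rec immediately followed by a key, and both sizes are run-time properties of the container. A sketch of the addressing helpers that layout implies; toy_descr and its field names are illustrative, while the patch itself derives the sizes from struct iam_descr:

    #include <stddef.h>

    struct toy_descr { size_t key_size, rec_size; };

    static inline size_t leaf_entry_size(const struct toy_descr *d)
    {
            return d->rec_size + d->key_size;   /* rec first, key right after */
    }

    static inline void *leaf_rec(void *entry)
    {
            return entry;                       /* the rec starts the entry */
    }

    static inline void *leaf_key(const struct toy_descr *d, void *entry)
    {
            return (char *)entry + d->rec_size; /* the key follows the rec */
    }

    static inline void *leaf_next(const struct toy_descr *d, void *entry)
    {
            return (char *)entry + leaf_entry_size(d);
    }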
+- */ +-struct iam_entry; +- +-struct iam_entry_compat { +- __le32 hash; +- __le32 block; +-}; +- +-/* +- * Incomplete type used to refer to keys in iam container. +- * +- * As key size can be different from container to container, iam has to use +- * incomplete type. Clients cast pointer to iam_key to real key type and back. +- */ +-struct iam_key; +- +-/* Incomplete type use to refer to the records stored in iam containers. */ +-struct iam_rec; +- +-typedef __u64 iam_ptr_t; +- +-/* +- * Index node traversed during tree lookup. +- */ +-struct iam_frame { +- struct buffer_head *bh; /* buffer holding node data */ +- struct iam_entry *entries; /* array of entries */ +- struct iam_entry *at; /* target entry, found by binary search */ +-}; +- +-/* leaf node reached by tree lookup */ +-struct iam_leaf { +- struct buffer_head *bh; +- struct iam_leaf_entry *entries; +- struct iam_leaf_entry *at; +-}; +- +-struct iam_path; +-struct iam_container; +- +-/* +- * Parameters, describing a flavor of iam container. +- */ +-struct iam_descr { +- /* +- * Size of a key in this container, in bytes. +- */ +- size_t id_key_size; +- /* +- * Size of a pointer to the next level (stored in index nodes), in +- * bytes. +- */ +- size_t id_ptr_size; +- /* +- * Size of a record (stored in leaf nodes), in bytes. +- */ +- size_t id_rec_size; +- /* +- * Size of unused (by iam) space at the beginning of every non-root +- * node, in bytes. Used for compatibility with ext3. +- */ +- size_t id_node_gap; +- /* +- * Size of unused (by iam) space at the beginning of root node, in +- * bytes. Used for compatibility with ext3. +- */ +- size_t id_root_gap; +- +- /* +- * Returns pointer (in the same sense as pointer in index entry) to +- * the root node. +- */ +- __u32 (*id_root_ptr)(struct iam_container *c); +- +- /* +- * Check validity and consistency of index node. This is called when +- * iam just loaded new node into frame. +- */ +- int (*id_node_check)(struct iam_path *path, struct iam_frame *frame); +- /* +- * Initialize new node (stored in @bh) that is going to be added into +- * tree. +- */ +- int (*id_node_init)(struct iam_container *c, +- struct buffer_head *bh, int root); +- int (*id_node_read)(struct iam_container *c, iam_ptr_t ptr, +- handle_t *h, struct buffer_head **bh); +- /* +- * Key comparison function. Returns -1, 0, +1. +- */ +- int (*id_keycmp)(struct iam_container *c, +- struct iam_key *k1, struct iam_key *k2); +- /* +- * Create new container. +- * +- * Newly created container has a root node and a single leaf. Leaf +- * contains single record with the smallest possible key. +- */ +- int (*id_create)(struct iam_container *c); +- struct { +- /* +- * leaf operations. +- */ +- /* +- * returns true iff leaf is positioned at the last entry. +- */ +- int (*at_end)(struct iam_container *c, struct iam_leaf *l); +- /* position leaf at the first entry */ +- void (*start)(struct iam_container *c, struct iam_leaf *l); +- /* more leaf to the next entry. */ +- void (*next)(struct iam_container *c, struct iam_leaf *l); +- /* return key of current leaf record in @k */ +- void (*key)(struct iam_container *c, struct iam_leaf *l, +- struct iam_key *k); +- /* return pointer to entry body */ +- struct iam_rec *(*rec)(struct iam_container *c, +- struct iam_leaf *l); +- } id_leaf; +-}; +- +-struct iam_container { +- /* +- * Underlying flat file. IO against this object is issued to +- * read/write nodes. +- */ +- struct inode *ic_object; +- /* +- * container flavor. 
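The iam_descr operation vector being moved out of namei.c above is what lets generic code iterate a leaf without knowing the record format: the ->id_leaf callbacks (start/at_end/next/key/rec) fully hide the layout. A minimal model of code written purely against such an ops table; all names are illustrative, and memcmp() stands in for the container's id_keycmp():

    #include <stddef.h>
    #include <string.h>

    struct toy_leaf;                        /* opaque to the generic code */

    struct toy_leaf_ops {
            void  (*start)(struct toy_leaf *l);
            int   (*at_end)(struct toy_leaf *l);
            void  (*next)(struct toy_leaf *l);
            void  (*key)(struct toy_leaf *l, void *k);  /* copy out current key */
            void *(*rec)(struct toy_leaf *l);           /* current record body */
    };

    /* Scan a leaf for an exact key match using only the ops table;
     * returns the matching rec or NULL.  key_size must fit the scratch
     * buffer, a simplification this sketch makes. */
    static void *toy_leaf_find(const struct toy_leaf_ops *ops,
                               struct toy_leaf *l,
                               const void *want, size_t key_size)
    {
            char scratch[32];

            for (ops->start(l); !ops->at_end(l); ops->next(l)) {
                    ops->key(l, scratch);
                    if (memcmp(want, scratch, key_size) == 0)
                            return ops->rec(l);
            }
            return NULL;
    }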
+- */ +- struct iam_descr *ic_descr; +- /* +- * pointer to flavor-specific per-container data. +- */ +- void *ic_descr_data; +-}; +- +-/* +- * Structure to keep track of a path drilled through htree. +- */ +-struct iam_path { +- /* +- * Parent container. +- */ +- struct iam_container *ip_container; +- /* +- * Number of index levels minus one. +- */ +- int ip_indirect; +- /* +- * Nodes that top-to-bottom traversal passed through. +- */ +- struct iam_frame ip_frames[DX_MAX_TREE_HEIGHT]; +- /* +- * Last filled frame in ->ip_frames. Refers to the 'twig' node (one +- * immediately above leaf). +- */ +- struct iam_frame *ip_frame; +- /* +- * Leaf node: a child of ->ip_frame. +- */ +- struct iam_leaf *ip_leaf; +- /* +- * Key searched for. +- */ +- struct iam_key *ip_key_target; +- /* +- * Scratch-pad area for temporary keys. +- */ +- struct iam_key *ip_key_scratch[DX_SCRATCH_KEYS]; +- /* +- * pointer to flavor-specific per-container data. +- */ +- void *ip_descr_data; +-}; +- +-/* +- * Helper structure for legacy htrees. +- */ +-struct iam_path_compat { +- struct iam_path ipc_path; +- struct iam_container ipc_container; +- __u32 ipc_scrach[DX_SCRATCH_KEYS]; +-}; + + static u32 htree_root_ptr(struct iam_container *c); + static int htree_node_check(struct iam_path *path, struct iam_frame *frame); +@@ -427,58 +236,7 @@ struct iam_descr; + struct iam_container; + struct iam_path; + +-/* +- * Initialize container @c, acquires additional reference on @inode. +- */ +-int iam_container_init(struct iam_container *c, +- struct iam_descr *descr, struct inode *inode); +-/* +- * Finalize container @c, release all resources. +- */ +-void iam_container_fini(struct iam_container *c); + +-/* +- * Search container @c for record with key @k. If record is found, its data +- * are moved into @r. +- * +- * +- * +- * Return values: +ve: found, 0: not-found, -ve: error +- */ +-int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r); +-/* +- * Insert new record @r with key @k into container @c (within context of +- * transaction @h. +- * +- * Return values: 0: success, -ve: error, including -EEXIST when record with +- * given key is already present. +- * +- * postcondition: ergo(result == 0 || result == -EEXIST, +- * iam_lookup(c, k, r2) > 0 && +- * !memcmp(r, r2, c->ic_descr->id_rec_size)); +- */ +-int iam_insert(handle_t *h, struct iam_container *c, +- struct iam_key *k, struct iam_rec *r); +-/* +- * Replace existing record with key @k, or insert new one. New record data are +- * in @r. +- * +- * Return values: 0: success, -ve: error. +- * +- * postcondition: ergo(result == 0, iam_lookup(c, k, r2) > 0 && +- * !memcmp(r, r2, c->ic_descr->id_rec_size)); +- */ +-int iam_update(handle_t *h, struct iam_container *c, +- struct iam_key *k, struct iam_rec *r); +-/* +- * Delete existing record with key @k. +- * +- * Return values: 0: success, -ENOENT: not-found, -ve: other error. +- * +- * postcondition: ergo(result == 0 || result == -ENOENT, +- * !iam_lookup(c, k, *)); +- */ +-int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k); + + /* + * iam cursor (iterator) api. +@@ -508,6 +266,11 @@ enum iam_it_state { + IAM_IT_ATTACHED + }; + ++struct htree_cookie { ++ struct dx_hash_info *hinfo; ++ struct dentry *dentry; ++}; ++ + /* + * Iterator. 
+ * +@@ -704,7 +467,7 @@ static int ext3_dx_add_entry(handle_t *h + struct inode *inode); + + static inline void iam_path_init(struct iam_path *path, +- struct iam_container *c); ++ struct iam_container *c, struct htree_cookie *hc); + static inline void iam_path_fini(struct iam_path *path); + + +@@ -865,11 +628,6 @@ static u32 htree_root_ptr(struct iam_con + return 0; + } + +-struct htree_cookie { +- struct dx_hash_info *hinfo; +- struct dentry *dentry; +-}; +- + static int htree_node_check(struct iam_path *path, struct iam_frame *frame) + { + void *data; +@@ -1171,11 +929,13 @@ void iam_container_fini(struct iam_conta + } + } + +-static inline void iam_path_init(struct iam_path *path, struct iam_container *c) ++static inline void iam_path_init(struct iam_path *path, struct iam_container *c, ++ struct htree_cookie *hc) + { + memset(path, 0, sizeof *path); + path->ip_container = c; + path->ip_frame = path->ip_frames; ++ path->ip_descr_data = hc; + } + + static inline void iam_path_fini(struct iam_path *path) +@@ -1201,7 +961,7 @@ static void iam_path_compat_init(struct + * iam_path_fini(). + */ + iput(inode); +- iam_path_init(&path->ipc_path, &path->ipc_container); ++ iam_path_init(&path->ipc_path, &path->ipc_container, NULL); + for (i = 0; i < ARRAY_SIZE(path->ipc_path.ip_key_scratch); ++i) + path->ipc_path.ip_key_scratch[i] = + (struct iam_key *)&path->ipc_scrach[i]; +@@ -1213,6 +973,425 @@ static void iam_path_compat_fini(struct + iam_container_fini(&path->ipc_container); + } + ++static int iam_leaf_init(struct iam_path *path, struct iam_leaf *leaf) ++{ ++ int block, err; ++ struct buffer_head *bh; ++ ++ block = dx_get_block(path, path->ip_frame->at); ++ err = path_descr(path)->id_node_read(path->ip_container, block, ++ NULL, &bh); ++ if (err) ++ return err; ++ ++ leaf->bh = bh; ++ leaf->entries = (struct iam_leaf_entry *)bh->b_data; ++ return 0; ++} ++ ++static void iam_leaf_fini(struct iam_leaf *leaf) ++{ ++ if (leaf->bh) ++ brelse(leaf->bh); ++} ++ ++/* ++ * Search container @c for record with key @k. If record is found, its data ++ * are moved into @r. 
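/*
 * Editorial note: a caller-side sketch of the lookup interface
 * documented above. The fixed-size key/record types and my_descr are
 * hypothetical; iam_container_init(), iam_lookup() and
 * iam_container_fini() are the entry points added by this patch. Per
 * the comment above, a positive return means "found", 0 means
 * "not found", and a negative value is an error.
 */
extern struct iam_descr my_descr; /* hypothetical format descriptor */

struct my_key { __u32 mk_hash; };
struct my_rec { __u32 mr_block; };

static int my_find(struct inode *index, __u32 hash, struct my_rec *out)
{
        struct iam_container c;
        struct my_key k = { .mk_hash = hash };
        int rc;

        rc = iam_container_init(&c, &my_descr, index);
        if (rc == 0) {
                rc = iam_lookup(&c, (struct iam_key *)&k,
                                (struct iam_rec *)out);
                iam_container_fini(&c);
        }
        return rc;
}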
++ * ++ * ++ * ++ * Return values: +ve: found, 0: not-found, -ve: error ++ */ ++ ++int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r) ++{ ++ struct dx_hash_info hinfo; ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; ++ struct htree_cookie hc = { ++ .hinfo = &hinfo ++ }; ++ int err, i; ++ ++ iam_path_init(path, c, &hc); ++ for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i) ++ path->ip_key_scratch[i] = ++ (struct iam_key *)&cpath.ipc_scrach[i]; ++ err = dx_lookup(path); ++ do { ++ struct iam_leaf leaf; ++ err = iam_leaf_init(path, &leaf); ++ if (err) ++ goto errout; ++ ++ for (path_descr(path)->id_leaf.start(c, &leaf); ++ !path_descr(path)->id_leaf.at_end(c, &leaf); ++ path_descr(path)->id_leaf.next(c, &leaf)) { ++ struct iam_key *key; ++ ++ key = kmalloc(path_descr(path)->id_key_size, GFP_KERNEL); ++ path_descr(path)->id_leaf.key(c, &leaf, key); ++ if (keycmp(c, k, key) == 0) { ++ memcpy(r, path_descr(path)->id_leaf.rec(c, &leaf), ++ path_descr(path)->id_rec_size); ++ iam_path_fini(path); ++ iam_leaf_fini(&leaf); ++ return 0; ++ } ++ } ++ ++ iam_leaf_fini(&leaf); ++ /* Check to see if we should continue to search */ ++ err = ext3_htree_next_block(c->ic_object, hinfo.hash, path, NULL); ++ if (err < 0) ++ goto errout; ++ } while (err == 1); ++errout: ++ iam_path_fini(path); ++ return(err); ++} ++EXPORT_SYMBOL(iam_lookup); ++ ++static inline size_t iam_leaf_entry_size(struct iam_path *p) ++{ ++ return path_descr(p)->id_rec_size + path_descr(p)->id_key_size; ++} ++ ++static inline ptrdiff_t iam_leaf_entry_diff(struct iam_path *p, ++ struct iam_leaf_entry *e1, struct iam_leaf_entry *e2) ++{ ++ ptrdiff_t diff; ++ ++ diff = (void *)e1 - (void *)e2; ++ assert(diff / iam_leaf_entry_size(p) * iam_leaf_entry_size(p) == diff); ++ return diff / iam_leaf_entry_size(p); ++} ++ ++static inline struct iam_leaf_entry* ++iam_leaf_entry_shift(struct iam_path *p, struct iam_leaf_entry *entry, int shift) ++{ ++ void *e = entry; ++ return e + shift * iam_leaf_entry_size(p); ++} ++ ++static inline struct iam_key * ++dx_leaf_get_key(struct iam_path *p, struct iam_leaf_entry *e, struct iam_key *key) ++{ ++ memcpy(key, e, path_descr(p)->id_key_size); ++ return key; ++} ++ ++static inline struct iam_key * ++iam_leaf_key_at(struct iam_path *p, struct iam_leaf_entry *entry) ++{ ++ void *e = entry; ++ return e + path_descr(p)->id_rec_size; ++} ++static inline struct iam_leaf_entry * ++iam_leaf_entry_at(struct iam_path *p, struct iam_leaf_entry *entry) ++{ ++ return entry; ++} ++ ++static int iam_leaf_lookup(struct iam_path *path, struct iam_leaf *leaf, ++ struct iam_key *k) ++{ ++ struct iam_leaf_entry *p, *q, *m; ++ struct iam_leaf_entry *entries = leaf->entries; ++ int count = dx_get_count((struct iam_entry *)entries); ++ ++ p = iam_leaf_entry_shift(path, entries, 1); ++ q = iam_leaf_entry_shift(path, entries, count - 1); ++ while (p <= q) { ++ m = iam_leaf_entry_shift(path, ++ p, iam_leaf_entry_diff(path, q, p) / 2); ++ dxtrace(printk(".")); ++ if (keycmp(path->ip_container, iam_leaf_key_at(path, m), ++ path->ip_key_target) > 0) ++ q = iam_leaf_entry_shift(path, m, -1); ++ else ++ p = iam_leaf_entry_shift(path, m, +1); ++ } ++ leaf->at = q; ++ return 0; ++} ++ ++/*XXX what kind of lock should this entry be locked: WangDi */ ++static int iam_leaf_insert(handle_t *handle, struct iam_path *path, ++ struct iam_key *k, struct iam_rec *r) ++{ ++ struct iam_leaf leaf; ++ struct iam_leaf_entry *p, *q; ++ int err, count; ++ ++ err = iam_leaf_init(path, &leaf); ++ if 
(err) ++ goto errout; ++ path_descr(path)->id_leaf.start(path->ip_container, &leaf); ++ count = dx_get_count((struct iam_entry *)leaf.entries); ++ if (dx_get_count((struct iam_entry *)leaf.entries) >= ++ dx_get_limit((struct iam_entry *)leaf.entries)){ ++ err = -ENOSPC; ++ goto errout; ++ } ++ ++ err = iam_leaf_lookup(path, &leaf, k); ++ if (err) ++ goto errout; ++ ++ /*insert the k/r to leaf entries*/ ++ p = iam_leaf_entry_shift(path, leaf.at, 1); ++ q = iam_leaf_entry_shift(path, leaf.entries, count - 1); ++ while (q < p) { ++ memcpy(iam_leaf_entry_shift(path, q, 1), q, iam_leaf_entry_size(path)); ++ q = iam_leaf_entry_shift(path, q, -1); ++ } ++ memcpy(iam_leaf_entry_at(path, p), r, path_descr(path)->id_rec_size); ++ memcpy(iam_leaf_key_at(path, p), k, path_descr(path)->id_key_size); ++ ++ dx_set_count((struct iam_entry*)leaf.entries, count + 1); ++ err = ext3_journal_dirty_metadata(handle, leaf.bh); ++ if (err) ++ ext3_std_error(path->ip_container->ic_object->i_sb, err); ++errout: ++ iam_leaf_fini(&leaf); ++ return err; ++} ++ ++static int split_leaf_node(handle_t *handle, struct iam_path *path) ++{ ++ struct inode *dir = path_obj(path); ++ unsigned continued = 0; ++ struct buffer_head *bh2; ++ u32 newblock, hash_split; ++ char *data2; ++ struct iam_leaf leaf; ++ unsigned split; ++ int err; ++ ++ bh2 = ext3_append (handle, dir, &newblock, &err); ++ if (!(bh2)) { ++ err = -ENOSPC; ++ goto errout; ++ } ++ err = iam_leaf_init(path, &leaf); ++ if (err) ++ goto errout; ++ ++ BUFFER_TRACE(leaf.bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, leaf.bh); ++ if (err) { ++ journal_error: ++ iam_leaf_fini(&leaf); ++ brelse(bh2); ++ ext3_std_error(dir->i_sb, err); ++ err = -EIO; ++ goto errout; ++ } ++ data2 = bh2->b_data; ++ split = dx_get_count((struct iam_entry*)leaf.entries)/2; ++ hash_split = *(__u32*)iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split)); ++ if (keycmp(path->ip_container, iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split)), ++ iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split -1))) == 0) ++ continued = 1; ++ ++ memcpy(iam_leaf_entry_shift(path, (struct iam_leaf_entry *)data2, 1), ++ iam_leaf_entry_shift(path, leaf.entries, split), ++ split * iam_leaf_entry_size(path)); ++ ++ /* Which block gets the new entry? */ ++ dx_insert_block(path, path->ip_frame, hash_split + continued, newblock); ++ err = ext3_journal_dirty_metadata (handle, bh2); ++ if (err) ++ goto journal_error; ++ err = ext3_journal_dirty_metadata (handle, leaf.bh); ++ if (err) ++ goto journal_error; ++ brelse (bh2); ++ iam_leaf_fini(&leaf); ++errout: ++ return err; ++} ++ ++static int split_index_node(handle_t *handle, struct iam_path *path); ++/* ++ * Insert new record @r with key @k into container @c (within context of ++ * transaction @h. ++ * ++ * Return values: 0: success, -ve: error, including -EEXIST when record with ++ * given key is already present. 
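/*
 * Editorial note: a self-contained model of the split point chosen by
 * split_leaf_node() above. keys[] stands in for the sorted hashes of a
 * full leaf (count >= 2 assumed); the returned value is what gets
 * inserted into the parent index node. The "+ continued" mirrors
 * "hash_split + continued" in the patch: when the median key collides
 * with its left neighbour, bumping the separator by one keeps the
 * whole collision chain reachable through the left block.
 */
static __u32 my_split_hash(const __u32 *keys, int count)
{
        int split = count / 2;
        int continued = (keys[split] == keys[split - 1]);

        return keys[split] + continued;
}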
++ * ++ * postcondition: ergo(result == 0 || result == -EEXIST, ++ * iam_lookup(c, k, r2) > 0 && ++ * !memcmp(r, r2, c->ic_descr->id_rec_size)); ++ */ ++int iam_insert(handle_t *handle, struct iam_container *c, struct iam_key *k, ++ struct iam_rec *r) ++{ ++ struct dx_hash_info hinfo; ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; ++ struct htree_cookie hc = { ++ .hinfo = &hinfo ++ }; ++ int err, i; ++ ++ iam_path_init(path, c, &hc); ++ for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i) ++ path->ip_key_scratch[i] = ++ (struct iam_key *)&cpath.ipc_scrach[i]; ++ err = dx_lookup(path); ++ if (err) ++ goto errout; ++ ++ err = iam_leaf_insert(handle, path, k, r); ++ ++ if (err != -ENOSPC) ++ goto errout; ++ ++ err = split_index_node(handle, path); ++ if (err) ++ goto errout; ++ ++ err = split_leaf_node(handle, path); ++ if (err) ++ goto errout; ++ ++ err = iam_leaf_insert(handle, path, k, r); ++errout: ++ iam_path_fini(path); ++ return(err); ++} ++ ++EXPORT_SYMBOL(iam_insert); ++static int iam_leaf_delete(handle_t *handle, struct iam_path *path, ++ struct iam_key *k) ++{ ++ struct iam_leaf leaf; ++ struct iam_leaf_entry *p, *q; ++ int err, count; ++ ++ err = iam_leaf_init(path, &leaf); ++ if (err) ++ goto errout; ++ ++ err = iam_leaf_lookup(path, &leaf, k); ++ if (err) ++ goto errout; ++ ++ count = dx_get_count((struct iam_entry*)leaf.entries); ++ /*delete the k to leaf entries*/ ++ p = iam_leaf_entry_shift(path, leaf.at, 1); ++ q = iam_leaf_entry_shift(path, leaf.entries, count - 1); ++ while (p < q) { ++ memcpy(p, iam_leaf_entry_shift(path, p, 1), iam_leaf_entry_size(path)); ++ p = iam_leaf_entry_shift(path, p, 1); ++ } ++ dx_set_count((struct iam_entry*)leaf.entries, count - 1); ++ ++ err = ext3_journal_dirty_metadata(handle, leaf.bh); ++ if (err) ++ ext3_std_error(path_obj(path)->i_sb, err); ++errout: ++ iam_leaf_fini(&leaf); ++ return err; ++} ++ ++/* ++ * Delete existing record with key @k. ++ * ++ * Return values: 0: success, -ENOENT: not-found, -ve: other error. ++ * ++ * postcondition: ergo(result == 0 || result == -ENOENT, ++ * !iam_lookup(c, k, *)); ++ */ ++int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k) ++{ ++ struct dx_hash_info hinfo; ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; ++ struct htree_cookie hc = { ++ .hinfo = &hinfo ++ }; ++ int err, i; ++ ++ iam_path_init(path, c, &hc); ++ for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i) ++ path->ip_key_scratch[i] = ++ (struct iam_key *)&cpath.ipc_scrach[i]; ++ err = dx_lookup(path); ++ if (err) ++ goto errout; ++ ++ err = iam_leaf_delete(h, path, k); ++errout: ++ iam_path_fini(path); ++ return err; ++} ++ ++EXPORT_SYMBOL(iam_delete); ++ ++static int iam_leaf_update(handle_t *handle, struct iam_path *path, ++ struct iam_key *k, struct iam_rec *r) ++{ ++ struct iam_leaf leaf; ++ int err; ++ ++ err = iam_leaf_init(path, &leaf); ++ if (err) ++ goto errout; ++ ++ err = iam_leaf_lookup(path, &leaf, k); ++ if (err) ++ goto errout; ++ ++ memcpy(iam_leaf_entry_at(path, leaf.at), r, path_descr(path)->id_rec_size); ++ memcpy(iam_leaf_key_at(path, leaf.at), k, path_descr(path)->id_key_size); ++ ++ err = ext3_journal_dirty_metadata(handle, leaf.bh); ++ if (err) ++ ext3_std_error(path_obj(path)->i_sb, err); ++errout: ++ iam_leaf_fini(&leaf); ++ return err; ++} ++/* ++ * Replace existing record with key @k, or insert new one. New record data are ++ * in @r. ++ * ++ * Return values: 0: success, -ve: error. 
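/*
 * Editorial note: iam_lookup(), iam_insert(), iam_delete() and
 * iam_update() above all open with the same boilerplate (compat path,
 * scratch keys, dx_lookup()) and close with iam_path_fini(). A
 * hypothetical helper that factors the pattern out; iam_with_path()
 * does not exist in this patch, it only makes the shared shape
 * explicit.
 */
static int iam_with_path(struct iam_container *c,
                         int (*op)(struct iam_path *path, void *arg),
                         void *arg)
{
        struct dx_hash_info hinfo;
        struct iam_path_compat cpath;
        struct iam_path *path = &cpath.ipc_path;
        struct htree_cookie hc = {
                .hinfo = &hinfo
        };
        int err, i;

        iam_path_init(path, c, &hc);
        for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i)
                path->ip_key_scratch[i] =
                        (struct iam_key *)&cpath.ipc_scrach[i];
        err = dx_lookup(path);
        if (err == 0)
                err = op(path, arg);
        iam_path_fini(path);
        return err;
}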
++ * ++ * postcondition: ergo(result == 0, iam_lookup(c, k, r2) > 0 && ++ * !memcmp(r, r2, c->ic_descr->id_rec_size)); ++ */ ++int iam_update(handle_t *h, struct iam_container *c, ++ struct iam_key *k, struct iam_rec *r) ++{ ++ struct dx_hash_info hinfo; ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; ++ struct htree_cookie hc = { ++ .hinfo = &hinfo ++ }; ++ int err, i; ++ ++ iam_path_init(path, c, &hc); ++ for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i) ++ path->ip_key_scratch[i] = ++ (struct iam_key *)&cpath.ipc_scrach[i]; ++ err = dx_lookup(path); ++ if (err) ++ goto errout; ++ ++ err = iam_leaf_update(h, path, k, r); ++errout: ++ iam_path_fini(path); ++ return err; ++} ++ ++EXPORT_SYMBOL(iam_update); ++ + /* + * This function increments the frame pointer to search the next leaf + * block, and reads in the necessary intervening nodes if the search +@@ -2245,59 +2424,21 @@ static int ext3_add_entry (handle_t *han + } + + #ifdef CONFIG_EXT3_INDEX +-/* +- * Returns 0 for success, or a negative error value +- */ +-static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) ++static int split_index_node(handle_t *handle, struct iam_path *path) + { +- struct iam_path_compat cpath; +- struct iam_path *path = &cpath.ipc_path; +- struct iam_descr *param; +- struct iam_frame *frame, *safe; ++ + struct iam_entry *entries; /* old block contents */ + struct iam_entry *entries2; /* new block contents */ +- struct dx_hash_info hinfo; +- struct buffer_head * bh; ++ struct iam_frame *frame, *safe; + struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0}; +- struct inode *dir = dentry->d_parent->d_inode; +- struct super_block * sb = dir->i_sb; +- struct ext3_dir_entry_2 *de; + u32 newblock[DX_MAX_TREE_HEIGHT] = {0}; +- int err; ++ struct inode *dir = path_obj(path); + int nr_splet; +- int i; +- size_t isize; +- +- iam_path_compat_init(&cpath, dir); +- param = path_descr(path); ++ int i, err; + +- err = dx_probe(dentry, NULL, &hinfo, path); +- if (err != 0) +- return err; + frame = path->ip_frame; + entries = frame->entries; + +- /* XXX nikita: global serialization! */ +- isize = dir->i_size; +- +- err = param->id_node_read(path->ip_container, +- (iam_ptr_t)dx_get_block(path, +- frame->at), handle, &bh); +- if (err != 0) +- goto cleanup; +- +- BUFFER_TRACE(bh, "get_write_access"); +- err = ext3_journal_get_write_access(handle, bh); +- if (err) +- goto journal_error; +- +- err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); +- if (err != -ENOSPC) { +- bh = NULL; +- goto cleanup; +- } +- + /* + * Tall-tree handling: we might have to split multiple index blocks + * all the way up to tree root. 
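/*
 * Editorial note: the shape of the split planning that follows, pulled
 * out as a sketch. Walking from the twig frame towards the root, every
 * node already at its limit must be split; the first ancestor with
 * spare room ("safe") absorbs the one extra entry that the splits
 * below it produce. Schematic only -- the real loop in
 * split_index_node() additionally guards against exceeding
 * DX_MAX_TREE_HEIGHT, and a full root grows the tree by one level.
 */
static struct iam_frame *my_find_safe(struct iam_path *path, int *nr_splet)
{
        struct iam_frame *frame;

        for (frame = path->ip_frame, *nr_splet = 0;
             frame >= path->ip_frames &&
             dx_get_count(frame->entries) == dx_get_limit(frame->entries);
             --frame, ++*nr_splet)
                ; /* count full ancestors, twig upwards */
        return frame; /* deepest ancestor with spare room */
}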
Tricky point here is error handling: +@@ -2320,7 +2461,7 @@ static int ext3_dx_add_entry(handle_t *h + dx_get_count(frame->entries) == dx_get_limit(frame->entries); + --frame, ++nr_splet) { + if (nr_splet == DX_MAX_TREE_HEIGHT) { +- ext3_warning(sb, __FUNCTION__, ++ ext3_warning(dir->i_sb, __FUNCTION__, + "Directory index full!\n"); + err = -ENOSPC; + goto cleanup; +@@ -2333,7 +2474,7 @@ static int ext3_dx_add_entry(handle_t *h + for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) { + bh_new[i] = ext3_append (handle, dir, &newblock[i], &err); + if (!bh_new[i] || +- param->id_node_init(path->ip_container, bh_new[i], 0) != 0) ++ path_descr(path)->id_node_init(path->ip_container, bh_new[i], 0) != 0) + goto cleanup; + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, frame->bh); +@@ -2439,9 +2580,71 @@ static int ext3_dx_add_entry(handle_t *h + goto journal_error; + } + } ++ goto cleanup; ++journal_error: ++ ext3_std_error(dir->i_sb, err); ++ ++cleanup: ++ for (i = 0; i < ARRAY_SIZE(bh_new); ++i) { ++ if (bh_new[i] != NULL) ++ brelse(bh_new[i]); ++ } ++ return err; ++} ++ ++/* ++ * Returns 0 for success, or a negative error value ++ */ ++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; ++ struct iam_descr *param; ++ struct iam_frame *frame; ++ struct dx_hash_info hinfo; ++ struct buffer_head * bh = NULL; ++ struct inode *dir = dentry->d_parent->d_inode; ++ struct ext3_dir_entry_2 *de; ++ int err; ++ size_t isize; ++ ++ iam_path_compat_init(&cpath, dir); ++ param = path_descr(path); ++ ++ err = dx_probe(dentry, NULL, &hinfo, path); ++ if (err != 0) ++ return err; ++ frame = path->ip_frame; ++ ++ /* XXX nikita: global serialization! */ ++ isize = dir->i_size; ++ ++ err = param->id_node_read(path->ip_container, (iam_ptr_t)dx_get_block(path, frame->at), ++ handle, &bh); ++ if (err != 0) ++ goto cleanup; ++ ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto journal_error; ++ ++ err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); ++ if (err != -ENOSPC) { ++ bh = NULL; ++ goto cleanup; ++ } ++ ++ err = split_index_node(handle, path); ++ if (err) ++ goto cleanup; ++ ++ /*copy split inode too*/ + de = do_split(handle, path, &bh, --frame, &hinfo, &err); + if (!de) + goto cleanup; ++ + assert(dx_node_check(path, frame)); + err = add_dirent_to_buf(handle, dentry, inode, de, bh); + goto cleanup2; +@@ -2452,10 +2655,6 @@ cleanup: + if (bh) + brelse(bh); + cleanup2: +- for (i = 0; i < ARRAY_SIZE(bh_new); ++i) { +- if (bh_new[i] != NULL) +- brelse(bh_new[i]); +- } + if (err) + inode->i_size = isize; + iam_path_fini(path); +Index: iam/include/linux/lustre_iam.h +=================================================================== +--- iam.orig/include/linux/lustre_iam.h ++++ iam/include/linux/lustre_iam.h +@@ -0,0 +1,212 @@ ++/* ++ * Maximal number of non-leaf levels in htree. In the stock ext3 this is 2. ++ */ ++enum { ++ DX_MAX_TREE_HEIGHT = 5, ++ DX_SCRATCH_KEYS = 2 ++}; ++ ++/* ++ * Entry within index tree node. Consists of a key immediately followed ++ * (without padding) by a pointer to the child node. ++ * ++ * Both key and pointer are of variable size, hence incomplete type. ++ */ ++struct iam_entry; ++ ++struct iam_entry_compat { ++ __le32 hash; ++ __le32 block; ++}; ++ ++/* ++ * Incomplete type used to refer to keys in iam container. 
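/*
 * Editorial note: what "clients cast pointer to iam_key to real key
 * type and back" amounts to in practice. struct my_key is a
 * hypothetical fixed-size key; only the casts and the -1/0/+1
 * contract of ->id_keycmp() are the point here.
 */
struct my_key {
        __u32 mk_hash;
};

static int my_keycmp(struct iam_container *c,
                     struct iam_key *k1, struct iam_key *k2)
{
        __u32 a = ((struct my_key *)k1)->mk_hash;
        __u32 b = ((struct my_key *)k2)->mk_hash;

        return a < b ? -1 : a > b ? +1 : 0;
}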
++ * ++ * As key size can be different from container to container, iam has to use ++ * incomplete type. Clients cast pointer to iam_key to real key type and back. ++ */ ++struct iam_key; ++ ++/* Incomplete type use to refer to the records stored in iam containers. */ ++struct iam_rec; ++ ++typedef __u64 iam_ptr_t; ++ ++/* ++ * Index node traversed during tree lookup. ++ */ ++struct iam_frame { ++ struct buffer_head *bh; /* buffer holding node data */ ++ struct iam_entry *entries; /* array of entries */ ++ struct iam_entry *at; /* target entry, found by binary search */ ++}; ++ ++/* leaf node reached by tree lookup */ ++#define iam_leaf_entry iam_rec ++struct iam_leaf { ++ struct buffer_head *bh; ++ struct iam_leaf_entry *entries; ++ struct iam_leaf_entry *at; ++}; ++ ++struct iam_path; ++struct iam_container; ++ ++/* ++ * Parameters, describing a flavor of iam container. ++ */ ++struct iam_descr { ++ /* ++ * Size of a key in this container, in bytes. ++ */ ++ size_t id_key_size; ++ /* ++ * Size of a pointer to the next level (stored in index nodes), in ++ * bytes. ++ */ ++ size_t id_ptr_size; ++ /* ++ * Size of a record (stored in leaf nodes), in bytes. ++ */ ++ size_t id_rec_size; ++ /* ++ * Size of unused (by iam) space at the beginning of every non-root ++ * node, in bytes. Used for compatibility with ext3. ++ */ ++ size_t id_node_gap; ++ /* ++ * Size of unused (by iam) space at the beginning of root node, in ++ * bytes. Used for compatibility with ext3. ++ */ ++ size_t id_root_gap; ++ ++ /* ++ * Returns pointer (in the same sense as pointer in index entry) to ++ * the root node. ++ */ ++ __u32 (*id_root_ptr)(struct iam_container *c); ++ ++ /* ++ * Check validity and consistency of index node. This is called when ++ * iam just loaded new node into frame. ++ */ ++ int (*id_node_check)(struct iam_path *path, struct iam_frame *frame); ++ /* ++ * Initialize new node (stored in @bh) that is going to be added into ++ * tree. ++ */ ++ int (*id_node_init)(struct iam_container *c, ++ struct buffer_head *bh, int root); ++ int (*id_node_read)(struct iam_container *c, iam_ptr_t ptr, ++ handle_t *h, struct buffer_head **bh); ++ /* ++ * Key comparison function. Returns -1, 0, +1. ++ */ ++ int (*id_keycmp)(struct iam_container *c, ++ struct iam_key *k1, struct iam_key *k2); ++ /* ++ * Create new container. ++ * ++ * Newly created container has a root node and a single leaf. Leaf ++ * contains single record with the smallest possible key. ++ */ ++ int (*id_create)(struct iam_container *c); ++ struct { ++ /* ++ * leaf operations. ++ */ ++ /* ++ * returns true iff leaf is positioned at the last entry. ++ */ ++ int (*at_end)(struct iam_container *c, struct iam_leaf *l); ++ /* position leaf at the first entry */ ++ void (*start)(struct iam_container *c, struct iam_leaf *l); ++ /* more leaf to the next entry. */ ++ void (*next)(struct iam_container *c, struct iam_leaf *l); ++ /* return key of current leaf record in @k */ ++ void (*key)(struct iam_container *c, struct iam_leaf *l, ++ struct iam_key *k); ++ /* return pointer to entry body */ ++ struct iam_rec *(*rec)(struct iam_container *c, ++ struct iam_leaf *l); ++ } id_leaf; ++}; ++ ++struct iam_container { ++ /* ++ * Underlying flat file. IO against this object is issued to ++ * read/write nodes. ++ */ ++ struct inode *ic_object; ++ /* ++ * container flavor. ++ */ ++ struct iam_descr *ic_descr; ++ /* ++ * pointer to flavor-specific per-container data. 
++ */ ++ void *ic_descr_data; ++}; ++ ++/* ++ * Structure to keep track of a path drilled through htree. ++ */ ++struct iam_path { ++ /* ++ * Parent container. ++ */ ++ struct iam_container *ip_container; ++ /* ++ * Number of index levels minus one. ++ */ ++ int ip_indirect; ++ /* ++ * Nodes that top-to-bottom traversal passed through. ++ */ ++ struct iam_frame ip_frames[DX_MAX_TREE_HEIGHT]; ++ /* ++ * Last filled frame in ->ip_frames. Refers to the 'twig' node (one ++ * immediately above leaf). ++ */ ++ struct iam_frame *ip_frame; ++ /* ++ * Leaf node: a child of ->ip_frame. ++ */ ++ struct iam_leaf *ip_leaf; ++ /* ++ * Key searched for. ++ */ ++ struct iam_key *ip_key_target; ++ /* ++ * Scratch-pad area for temporary keys. ++ */ ++ struct iam_key *ip_key_scratch[DX_SCRATCH_KEYS]; ++ /* ++ * pointer to flavor-specific per-container data. ++ */ ++ void *ip_descr_data; ++}; ++ ++/* ++ * Helper structure for legacy htrees. ++ */ ++struct iam_path_compat { ++ struct iam_path ipc_path; ++ struct iam_container ipc_container; ++ __u32 ipc_scrach[DX_SCRATCH_KEYS]; ++}; ++ ++int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r); ++int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k); ++int iam_update(handle_t *h, struct iam_container *c, struct iam_key *k, struct iam_rec *r); ++int iam_insert(handle_t *handle, struct iam_container *c, struct iam_key *k, struct iam_rec *r); ++/* ++ * Initialize container @c, acquires additional reference on @inode. ++ */ ++int iam_container_init(struct iam_container *c, ++ struct iam_descr *descr, struct inode *inode); ++/* ++ * Finalize container @c, release all resources. ++ */ ++void iam_container_fini(struct iam_container *c); ++ diff --git a/ldiskfs/kernel_patches/patches/ext3-iam-separate.patch b/ldiskfs/kernel_patches/patches/ext3-iam-separate.patch new file mode 100644 index 0000000..717ecce --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-iam-separate.patch @@ -0,0 +1,6758 @@ +Index: iam/include/linux/ext3_fs.h +=================================================================== +--- iam.orig/include/linux/ext3_fs.h 2007-05-23 11:18:17.000000000 +0800 ++++ iam/include/linux/ext3_fs.h 2007-05-23 11:18:20.000000000 +0800 +@@ -758,9 +758,7 @@ + extern void rsv_window_add(struct super_block *sb, struct reserve_window_node *rsv); + + /* dir.c */ +-extern int ext3_check_dir_entry(const char *, struct inode *, +- struct ext3_dir_entry_2 *, +- struct buffer_head *, unsigned long); ++ + extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext3_dir_entry_2 *dirent); +Index: iam/include/linux/lustre_iam.h +=================================================================== +--- iam.orig/include/linux/lustre_iam.h 2007-05-23 11:18:18.000000000 +0800 ++++ iam/include/linux/lustre_iam.h 2007-05-23 11:18:20.000000000 +0800 +@@ -1,9 +1,68 @@ ++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- ++ * vim:expandtab:shiftwidth=8:tabstop=8: ++ * ++ * lustre_iam.c ++ * Top-level entry points into osd module ++ * ++ * Copyright (c) 2006 Cluster File Systems, Inc. ++ * Author: Wang Di ++ * Author: Nikita Danilov ++ * ++ * This file is part of the Lustre file system, http://www.lustre.org ++ * Lustre is a trademark of Cluster File Systems, Inc. ++ * ++ * You may have signed or agreed to another license before downloading ++ * this software. If so, you are bound by the terms and conditions ++ * of that agreement, and the following does not apply to you. 
See the ++ * LICENSE file included with this distribution for more information. ++ * ++ * If you did not agree to a different license, then this copy of Lustre ++ * is open source software; you can redistribute it and/or modify it ++ * under the terms of version 2 of the GNU General Public License as ++ * published by the Free Software Foundation. ++ * ++ * In either case, Lustre is distributed in the hope that it will be ++ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty ++ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * license text for more details. ++ */ ++ ++#ifndef __LINUX_LUSTRE_IAM_H__ ++#define __LINUX_LUSTRE_IAM_H__ ++ ++/* handle_t, journal_start(), journal_stop() */ ++#include ++ + /* +- * Maximal number of non-leaf levels in htree. In the stock ext3 this is 2. ++ * linux/include/linux/lustre_iam.h + */ ++ + enum { ++ /* ++ * Maximal number of non-leaf levels in htree. In the stock ext3 this ++ * is 2. ++ */ + DX_MAX_TREE_HEIGHT = 5, +- DX_SCRATCH_KEYS = 2 ++ /* ++ * Scratch keys used by generic code for temporaries. ++ * ++ * Allocation: ++ * ++ * [0] reserved for assertions and as a staging area for ++ * record keys immediately used for key comparisons. ++ * ++ * [1] reserved for record key, stored during iteration over ++ * node records (see dx_node_check()). ++ * ++ * [2] reserved for leaf node operations. ++ * ++ * [3] reserved for index operations. ++ */ ++ DX_SCRATCH_KEYS = 4, ++ /* ++ * Maximal format name length. ++ */ ++ DX_FMT_NAME_LEN = 16 + }; + + /* +@@ -30,6 +89,11 @@ + /* Incomplete type use to refer to the records stored in iam containers. */ + struct iam_rec; + ++struct iam_cookie { ++ struct iam_key *ic_key; ++ struct iam_rec *ic_rec; ++}; ++ + typedef __u64 iam_ptr_t; + + /* +@@ -41,45 +105,25 @@ + struct iam_entry *at; /* target entry, found by binary search */ + }; + +-/* leaf node reached by tree lookup */ +-#define iam_leaf_entry iam_rec +-struct iam_leaf { +- struct buffer_head *bh; +- struct iam_leaf_entry *entries; +- struct iam_leaf_entry *at; +-}; ++/* ++ * Opaque entry in the leaf node. ++ */ ++struct iam_lentry; + + struct iam_path; + struct iam_container; + +-/* +- * Parameters, describing a flavor of iam container. +- */ +-struct iam_descr { +- /* +- * Size of a key in this container, in bytes. +- */ +- size_t id_key_size; +- /* +- * Size of a pointer to the next level (stored in index nodes), in +- * bytes. +- */ +- size_t id_ptr_size; +- /* +- * Size of a record (stored in leaf nodes), in bytes. +- */ +- size_t id_rec_size; +- /* +- * Size of unused (by iam) space at the beginning of every non-root +- * node, in bytes. Used for compatibility with ext3. +- */ +- size_t id_node_gap; +- /* +- * Size of unused (by iam) space at the beginning of root node, in +- * bytes. Used for compatibility with ext3. +- */ +- size_t id_root_gap; + ++/* leaf node reached by tree lookup */ ++struct iam_leaf { ++ struct iam_path *il_path; ++ struct buffer_head *il_bh; ++ struct iam_lentry *il_entries; ++ struct iam_lentry *il_at; ++ void *il_descr_data; ++}; ++ ++struct iam_operations { + /* + * Returns pointer (in the same sense as pointer in index entry) to + * the root node. +@@ -102,8 +146,8 @@ + /* + * Key comparison function. Returns -1, 0, +1. + */ +- int (*id_keycmp)(struct iam_container *c, +- struct iam_key *k1, struct iam_key *k2); ++ int (*id_keycmp)(const struct iam_container *c, ++ const struct iam_key *k1, const struct iam_key *k2); + /* + * Create new container. 
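/*
 * Editorial note: how a format implementation is expected to populate
 * struct iam_operations. Every my_*() callback named here is
 * hypothetical; the field names, signatures and the
 * DX_FMT_NAME_LEN-bounded id_name come from this header.
 */
static __u32 my_root_ptr(struct iam_container *c);
static int my_node_check(struct iam_path *path, struct iam_frame *frame);
static int my_node_init(struct iam_container *c,
                        struct buffer_head *bh, int root);
static int my_node_read(struct iam_container *c, iam_ptr_t ptr,
                        handle_t *h, struct buffer_head **bh);
static int my_keycmp(const struct iam_container *c,
                     const struct iam_key *k1, const struct iam_key *k2);
static int my_create(struct iam_container *c);

static struct iam_operations my_ops = {
        .id_root_ptr   = my_root_ptr,
        .id_node_check = my_node_check,
        .id_node_init  = my_node_init,
        .id_node_read  = my_node_read,
        .id_keycmp     = my_keycmp,
        .id_create     = my_create,
        .id_name       = "my_fmt",
};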
+ * +@@ -111,25 +155,113 @@ + * contains single record with the smallest possible key. + */ + int (*id_create)(struct iam_container *c); +- struct { ++ /* ++ * Format name. ++ */ ++ char id_name[DX_FMT_NAME_LEN]; ++}; ++ ++struct iam_leaf_operations { + /* + * leaf operations. + */ ++ ++ /* ++ * initialize just loaded leaf node. ++ */ ++ int (*init)(struct iam_leaf *p); ++ /* ++ * Format new node. ++ */ ++ void (*init_new)(struct iam_container *c, struct buffer_head *bh); ++ /* ++ * Release resources. ++ */ ++ void (*fini)(struct iam_leaf *l); + /* + * returns true iff leaf is positioned at the last entry. + */ +- int (*at_end)(struct iam_container *c, struct iam_leaf *l); ++ int (*at_end)(const struct iam_leaf *l); + /* position leaf at the first entry */ +- void (*start)(struct iam_container *c, struct iam_leaf *l); ++ void (*start)(struct iam_leaf *l); + /* more leaf to the next entry. */ +- void (*next)(struct iam_container *c, struct iam_leaf *l); +- /* return key of current leaf record in @k */ +- void (*key)(struct iam_container *c, struct iam_leaf *l, +- struct iam_key *k); +- /* return pointer to entry body */ +- struct iam_rec *(*rec)(struct iam_container *c, +- struct iam_leaf *l); +- } id_leaf; ++ void (*next)(struct iam_leaf *l); ++ /* return key of current leaf record. This method may return ++ * either pointer to the key stored in node, or copy key into ++ * @k buffer supplied by caller and return pointer to this ++ * buffer. The latter approach is used when keys in nodes are ++ * not stored in plain form (e.g., htree doesn't store keys at ++ * all). ++ * ++ * Caller should assume that returned pointer is only valid ++ * while leaf node is pinned and locked.*/ ++ struct iam_key *(*key)(const struct iam_leaf *l, struct iam_key *k); ++ /* return pointer to entry body. Pointer is valid while ++ corresponding leaf node is locked and pinned. */ ++ struct iam_rec *(*rec)(const struct iam_leaf *l); ++ ++ void (*key_set)(struct iam_leaf *l, const struct iam_key *k); ++ void (*rec_set)(struct iam_leaf *l, const struct iam_rec *r); ++ ++ /* ++ * Search leaf @l for a record with key @k or for a place ++ * where such record is to be inserted. ++ * ++ * Scratch keys from @path can be used. ++ */ ++ int (*lookup)(struct iam_leaf *l, const struct iam_key *k); ++ ++ int (*can_add)(const struct iam_leaf *l, ++ const struct iam_key *k, const struct iam_rec *r); ++ /* ++ * add rec for a leaf ++ */ ++ void (*rec_add)(struct iam_leaf *l, ++ const struct iam_key *k, const struct iam_rec *r); ++ /* ++ * remove rec for a leaf ++ */ ++ void (*rec_del)(struct iam_leaf *l); ++ /* ++ * split leaf node, moving some entries into @bh (the latter currently ++ * is assumed to be empty). ++ */ ++ void (*split)(struct iam_leaf *l, struct buffer_head *bh); ++}; ++ ++struct iam_path *iam_leaf_path(const struct iam_leaf *leaf); ++struct iam_container *iam_leaf_container(const struct iam_leaf *leaf); ++ ++/* ++ * Parameters, describing a flavor of iam container. ++ */ ++struct iam_descr { ++ /* ++ * Size of a key in this container, in bytes. ++ */ ++ size_t id_key_size; ++ /* ++ * Size of a pointer to the next level (stored in index nodes), in ++ * bytes. ++ */ ++ size_t id_ptr_size; ++ /* ++ * Size of a record (stored in leaf nodes), in bytes. ++ */ ++ size_t id_rec_size; ++ /* ++ * Size of unused (by iam) space at the beginning of every non-root ++ * node, in bytes. Used for compatibility with ext3. 
++ */ ++ size_t id_node_gap; ++ /* ++ * Size of unused (by iam) space at the beginning of root node, in ++ * bytes. Used for compatibility with ext3. ++ */ ++ size_t id_root_gap; ++ ++ struct iam_operations *id_ops; ++ struct iam_leaf_operations *id_leaf_ops; + }; + + struct iam_container { +@@ -142,10 +274,17 @@ + * container flavor. + */ + struct iam_descr *ic_descr; ++}; ++ ++/* ++ * description-specific part of iam_path. This is usually embedded into larger ++ * structure. ++ */ ++struct iam_path_descr { + /* +- * pointer to flavor-specific per-container data. ++ * Scratch-pad area for temporary keys. + */ +- void *ic_descr_data; ++ struct iam_key *ipd_key_scratch[DX_SCRATCH_KEYS]; + }; + + /* +@@ -172,36 +311,240 @@ + /* + * Leaf node: a child of ->ip_frame. + */ +- struct iam_leaf *ip_leaf; ++ struct iam_leaf ip_leaf; + /* + * Key searched for. + */ +- struct iam_key *ip_key_target; +- /* +- * Scratch-pad area for temporary keys. +- */ +- struct iam_key *ip_key_scratch[DX_SCRATCH_KEYS]; ++ const struct iam_key *ip_key_target; + /* +- * pointer to flavor-specific per-container data. ++ * Description-specific data. + */ +- void *ip_descr_data; ++ struct iam_path_descr *ip_data; + }; + ++struct dx_hash_info; ++ + /* + * Helper structure for legacy htrees. + */ + struct iam_path_compat { + struct iam_path ipc_path; + struct iam_container ipc_container; +- __u32 ipc_scrach[DX_SCRATCH_KEYS]; ++ __u32 ipc_scratch[DX_SCRATCH_KEYS]; ++ struct dx_hash_info *ipc_hinfo; ++ struct dentry *ipc_dentry; ++ struct iam_path_descr ipc_descr; ++}; ++ ++/* ++ * iam cursor (iterator) api. ++ */ ++ ++/* ++ * States of iterator state machine. ++ */ ++enum iam_it_state { ++ /* initial state */ ++ IAM_IT_DETACHED, ++ /* iterator is above particular record in the container */ ++ IAM_IT_ATTACHED + }; + +-int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r); +-int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k); +-int iam_update(handle_t *h, struct iam_container *c, struct iam_key *k, struct iam_rec *r); +-int iam_insert(handle_t *handle, struct iam_container *c, struct iam_key *k, struct iam_rec *r); + /* +- * Initialize container @c, acquires additional reference on @inode. ++ * Flags controlling iterator functionality. ++ */ ++enum iam_it_flags { ++ /* ++ * this iterator will move (iam_it_{prev,next}() will be called on it) ++ */ ++ IAM_IT_MOVE = (1 << 0), ++ /* ++ * tree can be updated through this iterator. ++ */ ++ IAM_IT_WRITE = (1 << 1) ++}; ++ ++/* ++ * Iterator. ++ * ++ * Immediately after call to iam_it_init() iterator is in "detached" ++ * (IAM_IT_DETACHED) state: it is associated with given parent container, but ++ * doesn't point to any particular record in this container. ++ * ++ * After successful call to iam_it_get() and until corresponding call to ++ * iam_it_put() iterator is in "attached" state (IAM_IT_ATTACHED). ++ * ++ * Attached iterator can move through records in a container (provided ++ * IAM_IT_MOVE permission) in a key order, can get record and key values as it ++ * passes over them, and can modify container (provided IAM_IT_WRITE ++ * permission). ++ * ++ * Concurrency: iterators are supposed to be local to thread. Interfaces below ++ * do no internal serialization. ++ * ++ */ ++struct iam_iterator { ++ /* ++ * iterator flags, taken from enum iam_it_flags. ++ */ ++ __u32 ii_flags; ++ enum iam_it_state ii_state; ++ /* ++ * path to the record. Valid in IAM_IT_ATTACHED state. 
++ */ ++ struct iam_path ii_path; ++}; ++ ++void iam_path_init(struct iam_path *path, struct iam_container *c, ++ struct iam_path_descr *pd); ++void iam_path_fini(struct iam_path *path); ++ ++void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode); ++void iam_path_compat_fini(struct iam_path_compat *path); ++ ++struct iam_path_descr *iam_ipd_alloc(void *area, int keysize); ++void iam_ipd_free(struct iam_path_descr *ipd); ++ ++/* ++ * Initialize iterator to IAM_IT_DETACHED state. ++ * ++ * postcondition: it_state(it) == IAM_IT_DETACHED ++ */ ++int iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags, ++ struct iam_path_descr *pd); ++/* ++ * Finalize iterator and release all resources. ++ * ++ * precondition: it_state(it) == IAM_IT_DETACHED ++ */ ++void iam_it_fini(struct iam_iterator *it); ++ ++/* ++ * Attach iterator. After successful completion, @it points to record with the ++ * largest key not larger than @k. Semantics of ->id_create() method guarantee ++ * that such record will always be found. ++ * ++ * Return value: 0: positioned on existing record, ++ * -ve: error. ++ * ++ * precondition: it_state(it) == IAM_IT_DETACHED ++ * postcondition: ergo(result == 0, ++ * (it_state(it) == IAM_IT_ATTACHED && ++ * it_keycmp(it, iam_it_key_get(it, *), k) < 0)) ++ */ ++int iam_it_get(struct iam_iterator *it, const struct iam_key *k); ++ ++/* ++ * Duplicates iterator. ++ * ++ * postcondition: it_state(dst) == it_state(src) && ++ * iam_it_container(dst) == iam_it_container(src) && ++ * dst->ii_flags = src->ii_flags && ++ * ergo(it_state(it) == IAM_IT_ATTACHED, ++ * iam_it_rec_get(dst) == iam_it_rec_get(src) && ++ * iam_it_key_get(dst, *1) == iam_it_key_get(src, *2)) ++ */ ++void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src); ++ ++/* ++ * Detach iterator. Does nothing it detached state. ++ * ++ * postcondition: it_state(it) == IAM_IT_DETACHED ++ */ ++void iam_it_put(struct iam_iterator *it); ++ ++/* ++ * Move iterator one record right. ++ * ++ * Return value: 0: success, ++ * +1: end of container reached ++ * -ve: error ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_MOVE ++ * postcondition: ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED) ++ */ ++int iam_it_next(struct iam_iterator *it); ++ ++/* ++ * Return pointer to the record under iterator. ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED ++ * postcondition: it_state(it) == IAM_IT_ATTACHED ++ */ ++struct iam_rec *iam_it_rec_get(const struct iam_iterator *it); ++ ++/* ++ * Replace contents of record under iterator. ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE ++ * postcondition: it_state(it) == IAM_IT_ATTACHED && ++ * ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...)) ++ */ ++int iam_it_rec_set(handle_t *h, struct iam_iterator *it, struct iam_rec *r); ++ ++/* ++ * Place key under iterator in @k, return @k ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED ++ * postcondition: it_state(it) == IAM_IT_ATTACHED ++ */ ++struct iam_key *iam_it_key_get(const struct iam_iterator *it, ++ struct iam_key *k); ++ ++/* ++ * Insert new record with key @k and contents from @r, shifting records to the ++ * right. 
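/*
 * Editorial note: the intended life cycle of the iterator declared
 * above, as a sketch. Container @c and path descriptor @ipd are
 * assumed to be set up by the caller; record handling and error paths
 * are abbreviated. Per the comments above, iam_it_next() returns +1 at
 * end of container, so running off the end is not an error, and
 * iam_it_put() is harmless on a detached iterator.
 */
static int my_scan(struct iam_container *c, struct iam_path_descr *ipd,
                   const struct iam_key *start)
{
        struct iam_iterator it;
        int rc;

        iam_it_init(&it, c, IAM_IT_MOVE, ipd);
        rc = iam_it_get(&it, start);
        while (rc == 0) {
                struct iam_rec *rec;

                rec = iam_it_rec_get(&it); /* valid while attached */
                (void)rec;                 /* consume the record here */
                rc = iam_it_next(&it);
        }
        iam_it_put(&it);
        iam_it_fini(&it);
        return rc < 0 ? rc : 0; /* +1 (end reached) counts as success */
}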
++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED && ++ * it->ii_flags&IAM_IT_WRITE && ++ * it_keycmp(it, iam_it_key_get(it, *), k) < 0 ++ * postcondition: it_state(it) == IAM_IT_ATTACHED && ++ * ergo(result == 0, ++ * it_keycmp(it, iam_it_key_get(it, *), k) == 0 && ++ * !memcmp(iam_it_rec_get(it), r, ...)) ++ */ ++int iam_it_rec_insert(handle_t *h, struct iam_iterator *it, ++ const struct iam_key *k, const struct iam_rec *r); ++/* ++ * Delete record under iterator. ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE ++ * postcondition: it_state(it) == IAM_IT_ATTACHED ++ */ ++int iam_it_rec_delete(handle_t *h, struct iam_iterator *it); ++ ++typedef __u64 iam_pos_t; ++ ++/* ++ * Convert iterator to cookie. ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED && ++ * path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t) ++ * postcondition: it_state(it) == IAM_IT_ATTACHED ++ */ ++iam_pos_t iam_it_store(const struct iam_iterator *it); ++ ++/* ++ * Restore iterator from cookie. ++ * ++ * precondition: it_state(it) == IAM_IT_DETACHED && it->ii_flags&IAM_IT_MOVE && ++ * path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t) ++ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED && ++ * iam_it_store(it) == pos) ++ */ ++int iam_it_load(struct iam_iterator *it, iam_pos_t pos); ++ ++int iam_lookup(struct iam_container *c, const struct iam_key *k, ++ struct iam_rec *r, struct iam_path_descr *pd); ++int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k, ++ struct iam_path_descr *pd); ++int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k, ++ struct iam_rec *r, struct iam_path_descr *pd); ++int iam_insert(handle_t *handle, struct iam_container *c, ++ const struct iam_key *k, ++ struct iam_rec *r, struct iam_path_descr *pd); ++/* ++ * Initialize container @c. + */ + int iam_container_init(struct iam_container *c, + struct iam_descr *descr, struct inode *inode); +@@ -210,3 +553,170 @@ + */ + void iam_container_fini(struct iam_container *c); + ++/* ++ * Determine container format. 
++ */ ++int iam_container_setup(struct iam_container *c); ++ ++#ifndef assert ++#define assert(test) J_ASSERT(test) ++#endif ++ ++static inline struct iam_descr *iam_container_descr(struct iam_container *c) ++{ ++ return c->ic_descr; ++} ++ ++static inline struct iam_descr *iam_path_descr(const struct iam_path *p) ++{ ++ return p->ip_container->ic_descr; ++} ++ ++static inline struct inode *iam_path_obj(struct iam_path *p) ++{ ++ return p->ip_container->ic_object; ++} ++ ++static inline void iam_keycpy(const struct iam_container *c, ++ struct iam_key *k1, const struct iam_key *k2) ++{ ++ memcpy(k1, k2, c->ic_descr->id_key_size); ++} ++ ++static inline int iam_keycmp(const struct iam_container *c, ++ const struct iam_key *k1, const struct iam_key *k2) ++{ ++ return c->ic_descr->id_ops->id_keycmp(c, k1, k2); ++} ++ ++static inline void iam_reccpy(const struct iam_path *p, struct iam_rec *rec_dst, ++ const struct iam_rec *rec_src) ++{ ++ memcpy(rec_dst, rec_src, iam_path_descr(p)->id_rec_size); ++} ++ ++static inline void *iam_entry_off(struct iam_entry *entry, size_t off) ++{ ++ return (void *)((char *)entry + off); ++} ++ ++/*XXX These stuff put here, just because they are used by iam.c and namei.c*/ ++static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry) ++{ ++ return le32_to_cpu(*(u32*)iam_entry_off(entry, ++ iam_path_descr(p)->id_key_size)) ++ & 0x00ffffff; ++} ++ ++static inline void dx_set_block(struct iam_path *p, ++ struct iam_entry *entry, unsigned value) ++{ ++ *(u32*)iam_entry_off(entry, ++ iam_path_descr(p)->id_key_size) = ++ cpu_to_le32(value); ++} ++ ++static inline void dx_set_key(struct iam_path *p, struct iam_entry *entry, ++ const struct iam_key *key) ++{ ++ iam_keycpy(p->ip_container, iam_entry_off(entry, 0), key); ++} ++ ++struct dx_countlimit { ++ __le16 limit; ++ __le16 count; ++}; ++ ++static inline unsigned dx_get_count(struct iam_entry *entries) ++{ ++ return le16_to_cpu(((struct dx_countlimit *) entries)->count); ++} ++ ++static inline unsigned dx_get_limit(struct iam_entry *entries) ++{ ++ return le16_to_cpu(((struct dx_countlimit *) entries)->limit); ++} ++ ++static inline void dx_set_count(struct iam_entry *entries, unsigned value) ++{ ++ ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); ++} ++ ++static inline unsigned dx_node_limit(struct iam_path *p) ++{ ++ struct iam_descr *param = iam_path_descr(p); ++ unsigned entry_space = iam_path_obj(p)->i_sb->s_blocksize - ++ param->id_node_gap; ++ return entry_space / (param->id_key_size + param->id_ptr_size); ++} ++ ++static inline struct iam_entry *dx_get_entries(struct iam_path *path, ++ void *data, int root) ++{ ++ struct iam_descr *param = iam_path_descr(path); ++ return data + (root ? 
param->id_root_gap : param->id_node_gap); ++} ++ ++ ++static inline struct iam_entry *dx_node_get_entries(struct iam_path *path, ++ struct iam_frame *frame) ++{ ++ return dx_get_entries(path, ++ frame->bh->b_data, frame == path->ip_frames); ++} ++ ++static inline struct iam_key *iam_path_key(const struct iam_path *path, int nr) ++{ ++ assert(0 <= nr && nr < ARRAY_SIZE(path->ip_data->ipd_key_scratch)); ++ return path->ip_data->ipd_key_scratch[nr]; ++} ++ ++int dx_lookup(struct iam_path *path); ++void dx_insert_block(struct iam_path *path, struct iam_frame *frame, ++ u32 hash, u32 block); ++ ++int ext3_htree_next_block(struct inode *dir, __u32 hash, ++ struct iam_path *path, __u32 *start_hash); ++ ++struct buffer_head *ext3_append(handle_t *handle, struct inode *inode, ++ u32 *block, int *err); ++int split_index_node(handle_t *handle, struct iam_path *path); ++ ++/* ++ * external ++ */ ++void iam_container_write_lock(struct iam_container *c); ++void iam_container_write_unlock(struct iam_container *c); ++ ++void iam_container_read_lock(struct iam_container *c); ++void iam_container_read_unlock(struct iam_container *c); ++ ++int iam_index_next(struct iam_container *c, struct iam_path *p); ++int iam_read_leaf(struct iam_path *p); ++ ++int iam_node_read(struct iam_container *c, iam_ptr_t ptr, ++ handle_t *handle, struct buffer_head **bh); ++ ++void iam_insert_key(struct iam_path *path, struct iam_frame *frame, ++ const struct iam_key *key, iam_ptr_t ptr); ++ ++int iam_leaf_at_end(const struct iam_leaf *l); ++void iam_leaf_next(struct iam_leaf *folio); ++ ++struct iam_path *iam_leaf_path(const struct iam_leaf *leaf); ++struct iam_container *iam_leaf_container(const struct iam_leaf *leaf); ++struct iam_descr *iam_leaf_descr(const struct iam_leaf *leaf); ++struct iam_leaf_operations *iam_leaf_ops(const struct iam_leaf *leaf); ++ ++ ++struct iam_format { ++ int (*if_guess)(struct iam_container *c); ++ struct list_head if_linkage; ++}; ++ ++void iam_format_register(struct iam_format *fmt); ++ ++void iam_lfix_format_init(void); ++ ++/* __LINUX_LUSTRE_IAM_H__ */ ++#endif +Index: iam/fs/ext3/iam.c +=================================================================== +--- iam.orig/fs/ext3/iam.c 2007-05-23 09:56:30.476305206 +0800 ++++ iam/fs/ext3/iam.c 2007-05-23 11:18:20.000000000 +0800 +@@ -0,0 +1,1436 @@ ++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- ++ * vim:expandtab:shiftwidth=8:tabstop=8: ++ * ++ * iam.c ++ * Top-level entry points into iam module ++ * ++ * Copyright (c) 2006 Cluster File Systems, Inc. ++ * Author: Wang Di ++ * Author: Nikita Danilov ++ * ++ * This file is part of the Lustre file system, http://www.lustre.org ++ * Lustre is a trademark of Cluster File Systems, Inc. ++ * ++ * You may have signed or agreed to another license before downloading ++ * this software. If so, you are bound by the terms and conditions ++ * of that agreement, and the following does not apply to you. See the ++ * LICENSE file included with this distribution for more information. ++ * ++ * If you did not agree to a different license, then this copy of Lustre ++ * is open source software; you can redistribute it and/or modify it ++ * under the terms of version 2 of the GNU General Public License as ++ * published by the Free Software Foundation. ++ * ++ * In either case, Lustre is distributed in the hope that it will be ++ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty ++ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * license text for more details. 
++ */ ++ ++/* ++ * iam: big theory statement. ++ * ++ * iam (Index Access Module) is a module providing abstraction of persistent ++ * transactional container on top of generalized ext3 htree. ++ * ++ * iam supports: ++ * ++ * - key, pointer, and record size specifiable per container. ++ * ++ * - trees taller than 2 index levels. ++ * ++ * - read/write to existing ext3 htree directories as iam containers. ++ * ++ * iam container is a tree, consisting of leaf nodes containing keys and ++ * records stored in this container, and index nodes, containing keys and ++ * pointers to leaf or index nodes. ++ * ++ * iam does not work with keys directly, instead it calls user-supplied key ++ * comparison function (->dpo_keycmp()). ++ * ++ * Pointers are (currently) interpreted as logical offsets (measured in ++ * blocksful) within underlying flat file on top of which iam tree lives. ++ * ++ * On-disk format: ++ * ++ * iam mostly tries to reuse existing htree formats. ++ * ++ * Format of index node: ++ * ++ * +-----+-------+-------+-------+------+-------+------------+ ++ * | | count | | | | | | ++ * | gap | / | entry | entry | .... | entry | free space | ++ * | | limit | | | | | | ++ * +-----+-------+-------+-------+------+-------+------------+ ++ * ++ * gap this part of node is never accessed by iam code. It ++ * exists for binary compatibility with ext3 htree (that, ++ * in turn, stores fake struct ext2_dirent for ext2 ++ * compatibility), and to keep some unspecified per-node ++ * data. Gap can be different for root and non-root index ++ * nodes. Gap size can be specified for each container ++ * (gap of 0 is allowed). ++ * ++ * count/limit current number of entries in this node, and the maximal ++ * number of entries that can fit into node. count/limit ++ * has the same size as entry, and is itself counted in ++ * count. ++ * ++ * entry index entry: consists of a key immediately followed by ++ * a pointer to a child node. Size of a key and size of a ++ * pointer depends on container. Entry has neither ++ * alignment nor padding. ++ * ++ * free space portion of node new entries are added to ++ * ++ * Entries in index node are sorted by their key value. ++ * ++ * Format of a leaf node is not specified. Generic iam code accesses leaf ++ * nodes through ->id_leaf methods in struct iam_descr. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include "xattr.h" ++#include "iopen.h" ++#include "acl.h" ++ ++/* ++ * List of all registered formats. ++ * ++ * No locking. Callers synchronize. ++ */ ++static LIST_HEAD(iam_formats); ++ ++void iam_format_register(struct iam_format *fmt) ++{ ++ list_add(&fmt->if_linkage, &iam_formats); ++} ++EXPORT_SYMBOL(iam_format_register); ++ ++/* ++ * Determine format of given container. This is done by scanning list of ++ * registered formats and calling ->if_guess() method of each in turn. ++ */ ++static int iam_format_guess(struct iam_container *c) ++{ ++ int result; ++ struct iam_format *fmt; ++ ++ /* ++ * XXX temporary initialization hook. ++ */ ++ { ++ static int initialized = 0; ++ ++ if (!initialized) { ++ /* ++ * Keep that order: htree should be registered first, ++ * so that iam_htree_guess() runs last. 
++ */ ++ iam_htree_format_init(); ++ iam_lvar_format_init(); ++ iam_lfix_format_init(); ++ initialized = 1; ++ } ++ } ++ ++ result = -ENOENT; ++ list_for_each_entry(fmt, &iam_formats, if_linkage) { ++ result = fmt->if_guess(c); ++ if (result == 0) ++ break; ++ } ++ return result; ++} ++ ++/* ++ * Initialize container @c. ++ */ ++int iam_container_init(struct iam_container *c, ++ struct iam_descr *descr, struct inode *inode) ++{ ++ memset(c, 0, sizeof *c); ++ c->ic_descr = descr; ++ c->ic_object = inode; ++ init_rwsem(&c->ic_sem); ++ return 0; ++} ++EXPORT_SYMBOL(iam_container_init); ++ ++/* ++ * Determine container format. ++ */ ++int iam_container_setup(struct iam_container *c) ++{ ++ return iam_format_guess(c); ++} ++EXPORT_SYMBOL(iam_container_setup); ++ ++/* ++ * Finalize container @c, release all resources. ++ */ ++void iam_container_fini(struct iam_container *c) ++{ ++} ++EXPORT_SYMBOL(iam_container_fini); ++ ++void iam_path_init(struct iam_path *path, struct iam_container *c, ++ struct iam_path_descr *pd) ++{ ++ memset(path, 0, sizeof *path); ++ path->ip_container = c; ++ path->ip_frame = path->ip_frames; ++ path->ip_data = pd; ++ path->ip_leaf.il_path = path; ++} ++ ++static void iam_leaf_fini(struct iam_leaf *leaf); ++ ++void iam_path_release(struct iam_path *path) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) { ++ if (path->ip_frames[i].bh != NULL) { ++ brelse(path->ip_frames[i].bh); ++ path->ip_frames[i].bh = NULL; ++ } ++ } ++} ++ ++void iam_path_fini(struct iam_path *path) ++{ ++ iam_leaf_fini(&path->ip_leaf); ++ iam_path_release(path); ++} ++ ++void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode) ++{ ++ int i; ++ ++ path->ipc_hinfo = &path->ipc_hinfo_area; ++ for (i = 0; i < ARRAY_SIZE(path->ipc_scratch); ++i) ++ path->ipc_descr.ipd_key_scratch[i] = ++ (struct iam_ikey *)&path->ipc_scratch[i]; ++ ++ iam_container_init(&path->ipc_container, ++ &iam_htree_compat_param, inode); ++ iam_path_init(&path->ipc_path, &path->ipc_container, &path->ipc_descr); ++} ++ ++void iam_path_compat_fini(struct iam_path_compat *path) ++{ ++ iam_path_fini(&path->ipc_path); ++ iam_container_fini(&path->ipc_container); ++} ++ ++/* ++ * Helper function initializing iam_path_descr and its key scratch area. ++ */ ++struct iam_path_descr *iam_ipd_alloc(void *area, int keysize) ++{ ++ struct iam_path_descr *ipd; ++ void *karea; ++ int i; ++ ++ ipd = area; ++ karea = ipd + 1; ++ for (i = 0; i < ARRAY_SIZE(ipd->ipd_key_scratch); ++i, karea += keysize) ++ ipd->ipd_key_scratch[i] = karea; ++ return ipd; ++} ++EXPORT_SYMBOL(iam_ipd_alloc); ++ ++void iam_ipd_free(struct iam_path_descr *ipd) ++{ ++} ++EXPORT_SYMBOL(iam_ipd_free); ++ ++int iam_node_read(struct iam_container *c, iam_ptr_t ptr, ++ handle_t *h, struct buffer_head **bh) ++{ ++ int result = 0; ++ ++ *bh = ext3_bread(h, c->ic_object, (int)ptr, 0, &result); ++ if (*bh == NULL) ++ result = -EIO; ++ return result; ++} ++ ++/* ++ * Return pointer to current leaf record. Pointer is valid while corresponding ++ * leaf node is locked and pinned. ++ */ ++static struct iam_rec *iam_leaf_rec(const struct iam_leaf *leaf) ++{ ++ return iam_leaf_ops(leaf)->rec(leaf); ++} ++ ++/* ++ * Return pointer to the current leaf key. This function returns pointer to ++ * the key stored in node. ++ * ++ * Caller should assume that returned pointer is only valid while leaf node is ++ * pinned and locked. 
++ */ ++static struct iam_key *iam_leaf_key(const struct iam_leaf *leaf) ++{ ++ return iam_leaf_ops(leaf)->key(leaf); ++} ++ ++static int iam_leaf_key_size(const struct iam_leaf *leaf) ++{ ++ return iam_leaf_ops(leaf)->key_size(leaf); ++} ++ ++static struct iam_ikey *iam_leaf_ikey(const struct iam_leaf *leaf, ++ struct iam_ikey *key) ++{ ++ return iam_leaf_ops(leaf)->ikey(leaf, key); ++} ++ ++static int iam_leaf_keycmp(const struct iam_leaf *leaf, ++ const struct iam_key *key) ++{ ++ return iam_leaf_ops(leaf)->key_cmp(leaf, key); ++} ++ ++static int iam_leaf_keyeq(const struct iam_leaf *leaf, ++ const struct iam_key *key) ++{ ++ return iam_leaf_ops(leaf)->key_eq(leaf, key); ++} ++ ++#if EXT3_INVARIANT_ON ++static int iam_leaf_check(struct iam_leaf *leaf); ++extern int dx_node_check(struct iam_path *p, struct iam_frame *f); ++ ++static int iam_path_check(struct iam_path *p) ++{ ++ int i; ++ int result; ++ struct iam_frame *f; ++ struct iam_descr *param; ++ ++ result = 1; ++ param = iam_path_descr(p); ++ for (i = 0; result && i < ARRAY_SIZE(p->ip_frames); ++i) { ++ f = &p->ip_frames[i]; ++ if (f->bh != NULL) { ++ result = dx_node_check(p, f); ++ if (result) ++ result = !param->id_ops->id_node_check(p, f); ++ } ++ } ++ if (result && p->ip_leaf.il_bh != NULL) ++ result = iam_leaf_check(&p->ip_leaf); ++ if (result == 0) { ++ ext3_std_error(iam_path_obj(p)->i_sb, result); ++ } ++ return result; ++} ++#endif ++ ++static int iam_leaf_load(struct iam_path *path) ++{ ++ iam_ptr_t block; ++ int err; ++ struct iam_container *c; ++ struct buffer_head *bh; ++ struct iam_leaf *leaf; ++ struct iam_descr *descr; ++ ++ c = path->ip_container; ++ leaf = &path->ip_leaf; ++ descr = iam_path_descr(path); ++ block = path->ip_frame->leaf; ++ if (block == 0) { ++ /* XXX bug 11027 */ ++ printk(KERN_EMERG "wrong leaf: %lu %d [%p %p %p]\n", ++ (long unsigned)path->ip_frame->leaf, ++ dx_get_count(dx_node_get_entries(path, path->ip_frame)), ++ path->ip_frames[0].bh, path->ip_frames[1].bh, ++ path->ip_frames[2].bh); ++ } ++ err = descr->id_ops->id_node_read(c, block, NULL, &bh); ++ if (err == 0) { ++ leaf->il_bh = bh; ++ leaf->il_curidx = block; ++ err = iam_leaf_ops(leaf)->init(leaf); ++ assert_inv(ergo(err == 0, iam_leaf_check(leaf))); ++ } ++ return err; ++} ++ ++static void iam_leaf_unlock(struct iam_leaf *leaf) ++{ ++ if (leaf->il_lock != NULL) { ++ dx_unlock_htree(iam_leaf_container(leaf)->ic_object, ++ leaf->il_lock); ++ do_corr(schedule()); ++ leaf->il_lock = NULL; ++ } ++} ++ ++static void iam_leaf_fini(struct iam_leaf *leaf) ++{ ++ if (leaf->il_path != NULL) { ++ iam_leaf_unlock(leaf); ++ assert_inv(ergo(leaf->il_bh != NULL, iam_leaf_check(leaf))); ++ iam_leaf_ops(leaf)->fini(leaf); ++ if (leaf->il_bh) { ++ brelse(leaf->il_bh); ++ leaf->il_bh = NULL; ++ leaf->il_curidx = 0; ++ } ++ } ++} ++ ++static void iam_leaf_start(struct iam_leaf *folio) ++{ ++ iam_leaf_ops(folio)->start(folio); ++} ++ ++void iam_leaf_next(struct iam_leaf *folio) ++{ ++ iam_leaf_ops(folio)->next(folio); ++} ++ ++static void iam_leaf_rec_add(struct iam_leaf *leaf, const struct iam_key *key, ++ const struct iam_rec *rec) ++{ ++ iam_leaf_ops(leaf)->rec_add(leaf, key, rec); ++} ++ ++static void iam_rec_del(struct iam_leaf *leaf, int shift) ++{ ++ iam_leaf_ops(leaf)->rec_del(leaf, shift); ++} ++ ++int iam_leaf_at_end(const struct iam_leaf *leaf) ++{ ++ return iam_leaf_ops(leaf)->at_end(leaf); ++} ++ ++void iam_leaf_split(struct iam_leaf *l, struct buffer_head **bh, iam_ptr_t nr) ++{ ++ iam_leaf_ops(l)->split(l, bh, nr); ++} ++ ++int 
iam_leaf_can_add(const struct iam_leaf *l,
++ const struct iam_key *k, const struct iam_rec *r)
++{
++ return iam_leaf_ops(l)->can_add(l, k, r);
++}
++
++#if EXT3_INVARIANT_ON
++static int iam_leaf_check(struct iam_leaf *leaf)
++{
++ return 1;
++#if 0
++ struct iam_lentry *orig;
++ struct iam_path *path;
++ struct iam_container *bag;
++ struct iam_ikey *k0;
++ struct iam_ikey *k1;
++ int result;
++ int first;
++
++ orig = leaf->il_at;
++ path = iam_leaf_path(leaf);
++ bag = iam_leaf_container(leaf);
++
++ result = iam_leaf_ops(leaf)->init(leaf);
++ if (result != 0)
++ return result;
++
++ first = 1;
++ iam_leaf_start(leaf);
++ k0 = iam_path_ikey(path, 0);
++ k1 = iam_path_ikey(path, 1);
++ while (!iam_leaf_at_end(leaf)) {
++ iam_ikeycpy(bag, k0, k1);
++ iam_ikeycpy(bag, k1, iam_leaf_ikey(leaf, k1));
++ if (!first && iam_ikeycmp(bag, k0, k1) > 0) {
++ return 0;
++ }
++ first = 0;
++ iam_leaf_next(leaf);
++ }
++ leaf->il_at = orig;
++ return 1;
++#endif
++}
++#endif
++
++static int iam_txn_dirty(handle_t *handle,
++ struct iam_path *path, struct buffer_head *bh)
++{
++ int result;
++
++ result = ext3_journal_dirty_metadata(handle, bh);
++ if (result != 0)
++ ext3_std_error(iam_path_obj(path)->i_sb, result);
++ return result;
++}
++
++static int iam_txn_add(handle_t *handle,
++ struct iam_path *path, struct buffer_head *bh)
++{
++ int result;
++
++ result = ext3_journal_get_write_access(handle, bh);
++ if (result != 0)
++ ext3_std_error(iam_path_obj(path)->i_sb, result);
++ return result;
++}
++
++/***********************************************************************/
++/* iterator interface */
++/***********************************************************************/
++
++static enum iam_it_state it_state(const struct iam_iterator *it)
++{
++ return it->ii_state;
++}
++
++/*
++ * Helper function returning the container this iterator is attached to.
++ */
++static struct iam_container *iam_it_container(const struct iam_iterator *it)
++{
++ return it->ii_path.ip_container;
++}
++
++static inline int it_keycmp(const struct iam_iterator *it,
++ const struct iam_key *k)
++{
++ return iam_leaf_keycmp(&it->ii_path.ip_leaf, k);
++}
++
++static inline int it_keyeq(const struct iam_iterator *it,
++ const struct iam_key *k)
++{
++ return iam_leaf_keyeq(&it->ii_path.ip_leaf, k);
++}
++
++static int it_ikeycmp(const struct iam_iterator *it, const struct iam_ikey *ik)
++{
++ return iam_ikeycmp(it->ii_path.ip_container,
++ iam_leaf_ikey(&it->ii_path.ip_leaf,
++ iam_path_ikey(&it->ii_path, 0)), ik);
++}
++
++static inline int it_at_rec(const struct iam_iterator *it)
++{
++ return !iam_leaf_at_end(&it->ii_path.ip_leaf);
++}
++
++static inline int it_before(const struct iam_iterator *it)
++{
++ return it_state(it) == IAM_IT_SKEWED && it_at_rec(it);
++}
++
++/*
++ * Helper wrapper around iam_it_get(): returns 0 (success) only when a record
++ * with exactly the same key as requested is found.
++ */
++static int iam_it_get_exact(struct iam_iterator *it, const struct iam_key *k)
++{
++ int result;
++
++ result = iam_it_get(it, k);
++ if (result > 0)
++ result = 0;
++ else if (result == 0)
++ /*
++ * Return -ENOENT if the cursor is located above a record with
++ * a key different from the one specified, or in an empty leaf.
++ *
++ * XXX returning -ENOENT only works if iam_it_get() never
++ * returns -ENOENT as a legitimate error.
++ */
++ result = -ENOENT;
++ return result;
++}
++
++void iam_container_write_lock(struct iam_container *ic)
++{
++ down_write(&ic->ic_sem);
++}
++
++void iam_container_write_unlock(struct iam_container *ic)
++{
++ up_write(&ic->ic_sem);
++}
++
++void iam_container_read_lock(struct iam_container *ic)
++{
++ down_read(&ic->ic_sem);
++}
++
++void iam_container_read_unlock(struct iam_container *ic)
++{
++ up_read(&ic->ic_sem);
++}
++
++/*
++ * Initialize iterator to IAM_IT_DETACHED state.
++ *
++ * postcondition: it_state(it) == IAM_IT_DETACHED
++ */
++int iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags,
++ struct iam_path_descr *pd)
++{
++ memset(it, 0, sizeof *it);
++ it->ii_flags = flags;
++ it->ii_state = IAM_IT_DETACHED;
++ iam_path_init(&it->ii_path, c, pd);
++ return 0;
++}
++EXPORT_SYMBOL(iam_it_init);
++
++/*
++ * Finalize iterator and release all resources.
++ *
++ * precondition: it_state(it) == IAM_IT_DETACHED
++ */
++void iam_it_fini(struct iam_iterator *it)
++{
++ assert_corr(it_state(it) == IAM_IT_DETACHED);
++ iam_path_fini(&it->ii_path);
++}
++EXPORT_SYMBOL(iam_it_fini);
++
++/*
++ * Perform a tree top-to-bottom traversal starting from the root, and load
++ * the leaf node.
++ */
++static int iam_path_lookup(struct iam_path *path, int index)
++{
++ struct iam_container *c;
++ struct iam_descr *descr;
++ struct iam_leaf *leaf;
++ int result;
++
++ c = path->ip_container;
++ leaf = &path->ip_leaf;
++ descr = iam_path_descr(path);
++ result = dx_lookup_lock(path, &leaf->il_lock, DLT_WRITE);
++ assert_inv(iam_path_check(path));
++ do_corr(schedule());
++ if (result == 0) {
++ result = iam_leaf_load(path);
++ assert_inv(ergo(result == 0, iam_leaf_check(leaf)));
++ if (result == 0) {
++ do_corr(schedule());
++ if (index)
++ result = iam_leaf_ops(leaf)->
++ ilookup(leaf, path->ip_ikey_target);
++ else
++ result = iam_leaf_ops(leaf)->
++ lookup(leaf, path->ip_key_target);
++ do_corr(schedule());
++ }
++ if (result < 0)
++ iam_leaf_unlock(leaf);
++ }
++ return result;
++}
++
++/*
++ * Common part of iam_it_{i,}get().
++ */
++static int __iam_it_get(struct iam_iterator *it, int index)
++{
++ int result;
++ assert_corr(it_state(it) == IAM_IT_DETACHED);
++
++ result = iam_path_lookup(&it->ii_path, index);
++ if (result >= 0) {
++ int collision;
++
++ collision = result & IAM_LOOKUP_LAST;
++ switch (result & ~IAM_LOOKUP_LAST) {
++ case IAM_LOOKUP_EXACT:
++ result = +1;
++ it->ii_state = IAM_IT_ATTACHED;
++ break;
++ case IAM_LOOKUP_OK:
++ result = 0;
++ it->ii_state = IAM_IT_ATTACHED;
++ break;
++ case IAM_LOOKUP_BEFORE:
++ case IAM_LOOKUP_EMPTY:
++ result = 0;
++ it->ii_state = IAM_IT_SKEWED;
++ break;
++ default:
++ assert(0);
++ }
++ result |= collision;
++ }
++ /*
++ * See iam_it_get_exact() for explanation.
++ */
++ assert_corr(result != -ENOENT);
++ return result;
++}
++
++/*
++ * The correct hash was found, but not the same key; iterate through the hash
++ * collision chain, looking for the correct record.
++ */
++static int iam_it_collision(struct iam_iterator *it)
++{
++ int result;
++
++ assert(ergo(it_at_rec(it), !it_keyeq(it, it->ii_path.ip_key_target)));
++
++ while ((result = iam_it_next(it)) == 0) {
++ do_corr(schedule());
++ if (it_ikeycmp(it, it->ii_path.ip_ikey_target) != 0)
++ return -ENOENT;
++ if (it_keyeq(it, it->ii_path.ip_key_target))
++ return 0;
++ }
++ return result;
++}
++
++/*
++ * Attach iterator. After successful completion, @it points to record with
++ * least key not larger than @k.
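++ *
++ * A minimal caller-side sketch (names assumed; error handling elided):
++ *
++ *     result = iam_it_get(it, k);
++ *     if (result >= 0) {
++ *             if (it_state(it) == IAM_IT_ATTACHED)
++ *                     ... iam_it_key_get(it)/iam_it_rec_get(it) ...
++ *             iam_it_put(it);
++ *     }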
++ *
++ * Return value: 0: positioned on existing record,
++ * +ve: exact position found,
++ * -ve: error.
++ *
++ * precondition: it_state(it) == IAM_IT_DETACHED
++ * postcondition: ergo(result == 0 && it_state(it) == IAM_IT_ATTACHED,
++ * it_keycmp(it, k) <= 0)
++ */
++int iam_it_get(struct iam_iterator *it, const struct iam_key *k)
++{
++ int result;
++ assert_corr(it_state(it) == IAM_IT_DETACHED);
++
++ it->ii_path.ip_ikey_target = NULL;
++ it->ii_path.ip_key_target = k;
++
++ result = __iam_it_get(it, 0);
++
++ if (result == IAM_LOOKUP_LAST) {
++ result = iam_it_collision(it);
++ if (result != 0) {
++ iam_it_put(it);
++ iam_it_fini(it);
++ result = __iam_it_get(it, 0);
++ } else
++ result = +1;
++ }
++ if (result > 0)
++ result &= ~IAM_LOOKUP_LAST;
++
++ assert_corr(ergo(result > 0, it_keycmp(it, k) == 0));
++ assert_corr(ergo(result == 0 && it_state(it) == IAM_IT_ATTACHED,
++ it_keycmp(it, k) <= 0));
++ return result;
++}
++EXPORT_SYMBOL(iam_it_get);
++
++/*
++ * Attach iterator by index key.
++ */
++static int iam_it_iget(struct iam_iterator *it, const struct iam_ikey *k)
++{
++ assert_corr(it_state(it) == IAM_IT_DETACHED);
++
++ it->ii_path.ip_ikey_target = k;
++ return __iam_it_get(it, 1) & ~IAM_LOOKUP_LAST;
++}
++
++/*
++ * Attach iterator, and assure it points to the record (not skewed).
++ *
++ * Return value: 0: positioned on existing record,
++ * +ve: exact position found,
++ * -ve: error.
++ *
++ * precondition: it_state(it) == IAM_IT_DETACHED &&
++ * !(it->ii_flags&IAM_IT_WRITE)
++ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED)
++ */
++int iam_it_get_at(struct iam_iterator *it, const struct iam_key *k)
++{
++ int result;
++ assert_corr(it_state(it) == IAM_IT_DETACHED &&
++ !(it->ii_flags&IAM_IT_WRITE));
++ result = iam_it_get(it, k);
++ if (result == 0) {
++ if (it_state(it) != IAM_IT_ATTACHED) {
++ assert_corr(it_state(it) == IAM_IT_SKEWED);
++ result = iam_it_next(it);
++ }
++ }
++ assert_corr(ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED));
++ return result;
++}
++EXPORT_SYMBOL(iam_it_get_at);
++
++/*
++ * Duplicate iterator.
++ *
++ * postcondition: it_state(dst) == it_state(src) &&
++ * iam_it_container(dst) == iam_it_container(src) &&
++ * dst->ii_flags == src->ii_flags &&
++ * ergo(it_state(src) == IAM_IT_ATTACHED,
++ * iam_it_rec_get(dst) == iam_it_rec_get(src) &&
++ * iam_it_key_get(dst) == iam_it_key_get(src))
++ */
++void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src)
++{
++ dst->ii_flags = src->ii_flags;
++ dst->ii_state = src->ii_state;
++ /* XXX not yet. iam_path_dup(&dst->ii_path, &src->ii_path); */
++ /*
++ * XXX: duplicate lock.
++ */
++ assert_corr(it_state(dst) == it_state(src));
++ assert_corr(iam_it_container(dst) == iam_it_container(src));
++ assert_corr(dst->ii_flags == src->ii_flags);
++ assert_corr(ergo(it_state(src) == IAM_IT_ATTACHED,
++ iam_it_rec_get(dst) == iam_it_rec_get(src) &&
++ iam_it_key_get(dst) == iam_it_key_get(src)));
++
++}
++
++/*
++ * Detach iterator. Does nothing in detached state.
++ *
++ * postcondition: it_state(it) == IAM_IT_DETACHED
++ */
++void iam_it_put(struct iam_iterator *it)
++{
++ if (it->ii_state != IAM_IT_DETACHED) {
++ it->ii_state = IAM_IT_DETACHED;
++ iam_leaf_fini(&it->ii_path.ip_leaf);
++ }
++}
++EXPORT_SYMBOL(iam_it_put);
++
++static struct iam_ikey *iam_it_ikey_get(const struct iam_iterator *it,
++ struct iam_ikey *ikey);
++/*
++ * Move iterator one record right.
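++ *
++ * Typical use is a forward scan (a sketch; assumes @it was attached with
++ * iam_it_get_at() and was initialized with the IAM_IT_MOVE flag):
++ *
++ *     while (result == 0) {
++ *             ... consume iam_it_rec_get(it) ...
++ *             result = iam_it_next(it);
++ *     }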
++ * ++ * Return value: 0: success, ++ * +1: end of container reached ++ * -ve: error ++ * ++ * precondition: (it_state(it) == IAM_IT_ATTACHED || ++ * it_state(it) == IAM_IT_SKEWED) && it->ii_flags&IAM_IT_MOVE ++ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED) && ++ * ergo(result > 0, it_state(it) == IAM_IT_DETACHED) ++ */ ++int iam_it_next(struct iam_iterator *it) ++{ ++ int result; ++ struct iam_path *path; ++ struct iam_leaf *leaf; ++ struct inode *obj; ++ do_corr(struct iam_ikey *ik_orig); ++ ++ /* assert_corr(it->ii_flags&IAM_IT_MOVE); */ ++ assert_corr(it_state(it) == IAM_IT_ATTACHED || ++ it_state(it) == IAM_IT_SKEWED); ++ ++ path = &it->ii_path; ++ leaf = &path->ip_leaf; ++ obj = iam_path_obj(path); ++ ++ assert_corr(iam_leaf_is_locked(leaf)); ++ ++ result = 0; ++ do_corr(ik_orig = it_at_rec(it) ? ++ iam_it_ikey_get(it, iam_path_ikey(path, 2)) : NULL); ++ if (it_before(it)) { ++ assert_corr(!iam_leaf_at_end(leaf)); ++ it->ii_state = IAM_IT_ATTACHED; ++ } else { ++ if (!iam_leaf_at_end(leaf)) ++ /* advance within leaf node */ ++ iam_leaf_next(leaf); ++ /* ++ * multiple iterations may be necessary due to empty leaves. ++ */ ++ while (result == 0 && iam_leaf_at_end(leaf)) { ++ do_corr(schedule()); ++ /* advance index portion of the path */ ++ result = iam_index_next(iam_it_container(it), path); ++ assert_corr(iam_leaf_is_locked(leaf)); ++ if (result == 1) { ++ struct dynlock_handle *lh; ++ lh = dx_lock_htree(obj, path->ip_frame->leaf, ++ DLT_WRITE); ++ if (lh != NULL) { ++ iam_leaf_fini(leaf); ++ leaf->il_lock = lh; ++ result = iam_leaf_load(path); ++ if (result == 0) ++ iam_leaf_start(leaf); ++ } else ++ result = -ENOMEM; ++ } else if (result == 0) ++ /* end of container reached */ ++ result = +1; ++ if (result != 0) ++ iam_it_put(it); ++ } ++ if (result == 0) ++ it->ii_state = IAM_IT_ATTACHED; ++ } ++ assert_corr(ergo(result == 0, it_state(it) == IAM_IT_ATTACHED)); ++ assert_corr(ergo(result > 0, it_state(it) == IAM_IT_DETACHED)); ++ assert_corr(ergo(result == 0 && ik_orig != NULL, ++ it_ikeycmp(it, ik_orig) >= 0)); ++ return result; ++} ++EXPORT_SYMBOL(iam_it_next); ++ ++/* ++ * Return pointer to the record under iterator. ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED && it_at_rec(it) ++ * postcondition: it_state(it) == IAM_IT_ATTACHED ++ */ ++struct iam_rec *iam_it_rec_get(const struct iam_iterator *it) ++{ ++ assert_corr(it_state(it) == IAM_IT_ATTACHED); ++ assert_corr(it_at_rec(it)); ++ return iam_leaf_rec(&it->ii_path.ip_leaf); ++} ++EXPORT_SYMBOL(iam_it_rec_get); ++ ++static void iam_it_reccpy(struct iam_iterator *it, const struct iam_rec *r) ++{ ++ struct iam_leaf *folio; ++ ++ folio = &it->ii_path.ip_leaf; ++ iam_leaf_ops(folio)->rec_set(folio, r); ++} ++ ++/* ++ * Replace contents of record under iterator. 
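++ *
++ * For example (a sketch; @h is a running journal handle with sufficient
++ * credits, which is the caller's responsibility):
++ *
++ *     if (it_state(it) == IAM_IT_ATTACHED)
++ *             result = iam_it_rec_set(h, it, r);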
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED &&
++ * it->ii_flags&IAM_IT_WRITE
++ * postcondition: it_state(it) == IAM_IT_ATTACHED &&
++ * ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...))
++ */
++int iam_it_rec_set(handle_t *h,
++ struct iam_iterator *it, const struct iam_rec *r)
++{
++ int result;
++ struct iam_path *path;
++ struct buffer_head *bh;
++
++ assert_corr(it_state(it) == IAM_IT_ATTACHED &&
++ it->ii_flags&IAM_IT_WRITE);
++ assert_corr(it_at_rec(it));
++
++ path = &it->ii_path;
++ bh = path->ip_leaf.il_bh;
++ result = iam_txn_add(h, path, bh);
++ if (result == 0) {
++ iam_it_reccpy(it, r);
++ result = iam_txn_dirty(h, path, bh);
++ }
++ return result;
++}
++EXPORT_SYMBOL(iam_it_rec_set);
++
++/*
++ * Return pointer to the index key under iterator.
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED ||
++ * it_state(it) == IAM_IT_SKEWED
++ */
++static struct iam_ikey *iam_it_ikey_get(const struct iam_iterator *it,
++ struct iam_ikey *ikey)
++{
++ assert_corr(it_state(it) == IAM_IT_ATTACHED ||
++ it_state(it) == IAM_IT_SKEWED);
++ assert_corr(it_at_rec(it));
++ return iam_leaf_ikey(&it->ii_path.ip_leaf, ikey);
++}
++
++/*
++ * Return pointer to the key under iterator.
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED ||
++ * it_state(it) == IAM_IT_SKEWED
++ */
++struct iam_key *iam_it_key_get(const struct iam_iterator *it)
++{
++ assert_corr(it_state(it) == IAM_IT_ATTACHED ||
++ it_state(it) == IAM_IT_SKEWED);
++ assert_corr(it_at_rec(it));
++ return iam_leaf_key(&it->ii_path.ip_leaf);
++}
++EXPORT_SYMBOL(iam_it_key_get);
++
++/*
++ * Return size of the key under iterator (in bytes).
++ *
++ * precondition: it_state(it) == IAM_IT_ATTACHED ||
++ * it_state(it) == IAM_IT_SKEWED
++ */
++int iam_it_key_size(const struct iam_iterator *it)
++{
++ assert_corr(it_state(it) == IAM_IT_ATTACHED ||
++ it_state(it) == IAM_IT_SKEWED);
++ assert_corr(it_at_rec(it));
++ return iam_leaf_key_size(&it->ii_path.ip_leaf);
++}
++EXPORT_SYMBOL(iam_it_key_size);
++
++/*
++ * Insertion of a new record. Interaction with jbd in the non-trivial case
++ * (when a split happens) is as follows:
++ *
++ * - the new leaf node is added to the transaction by ext3_append();
++ *
++ * - the old leaf node is added to the transaction by iam_add_rec();
++ *
++ * - the leaf where the insertion point ends up is marked dirty by
++ * iam_add_rec();
++ *
++ * - the leaf without the insertion point is marked dirty (as @new_leaf) by
++ * iam_new_leaf();
++ *
++ * - split index nodes are added to the transaction and marked dirty by
++ * split_index_node();
++ *
++ * - the "safe" index node, which is not split, but into which the new pointer
++ * is inserted, is added to the transaction and marked dirty by
++ * split_index_node();
++ *
++ * - the index node where the pointer to the new leaf is inserted is added to
++ * the transaction by split_index_node() and marked dirty by iam_add_rec();
++ *
++ * - the inode is marked dirty by iam_add_rec().
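++ *
++ * The list above bounds the number of buffers a single insertion can
++ * dirty; a caller-side reservation might look like the following (the
++ * credit estimate is illustrative only, not defined by this patch):
++ *
++ *     handle = ext3_journal_start(obj, DX_MAX_TREE_HEIGHT + 3);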
++ * ++ */ ++ ++static int iam_new_leaf(handle_t *handle, struct iam_leaf *leaf) ++{ ++ int err; ++ iam_ptr_t blknr; ++ struct buffer_head *new_leaf; ++ struct buffer_head *old_leaf; ++ struct iam_container *c; ++ struct inode *obj; ++ struct iam_path *path; ++ ++ assert_inv(iam_leaf_check(leaf)); ++ ++ c = iam_leaf_container(leaf); ++ path = leaf->il_path; ++ ++ obj = c->ic_object; ++ new_leaf = ext3_append(handle, obj, (__u32 *)&blknr, &err); ++ do_corr(schedule()); ++ if (new_leaf != NULL) { ++ struct dynlock_handle *lh; ++ ++ lh = dx_lock_htree(obj, blknr, DLT_WRITE); ++ do_corr(schedule()); ++ if (lh != NULL) { ++ iam_leaf_ops(leaf)->init_new(c, new_leaf); ++ do_corr(schedule()); ++ old_leaf = leaf->il_bh; ++ iam_leaf_split(leaf, &new_leaf, blknr); ++ if (old_leaf != leaf->il_bh) { ++ /* ++ * Switched to the new leaf. ++ */ ++ iam_leaf_unlock(leaf); ++ leaf->il_lock = lh; ++ path->ip_frame->leaf = blknr; ++ } else ++ dx_unlock_htree(obj, lh); ++ do_corr(schedule()); ++ err = iam_txn_dirty(handle, path, new_leaf); ++ brelse(new_leaf); ++ if (err == 0) ++ err = ext3_mark_inode_dirty(handle, obj); ++ do_corr(schedule()); ++ } else ++ err = -ENOMEM; ++ } ++ assert_inv(iam_leaf_check(leaf)); ++ assert_inv(iam_leaf_check(&iam_leaf_path(leaf)->ip_leaf)); ++ assert_inv(iam_path_check(iam_leaf_path(leaf))); ++ return err; ++} ++ ++static int iam_add_rec(handle_t *handle, struct iam_iterator *it, ++ struct iam_path *path, ++ const struct iam_key *k, const struct iam_rec *r) ++{ ++ int err; ++ struct iam_leaf *leaf; ++ ++ leaf = &path->ip_leaf; ++ assert_inv(iam_leaf_check(leaf)); ++ assert_inv(iam_path_check(path)); ++ err = iam_txn_add(handle, path, leaf->il_bh); ++ if (err == 0) { ++ do_corr(schedule()); ++ if (!iam_leaf_can_add(leaf, k, r)) { ++ struct dynlock_handle *lh = NULL; ++ ++ do { ++ assert_corr(lh == NULL); ++ do_corr(schedule()); ++ err = split_index_node(handle, path, &lh); ++ if (err == -EAGAIN) { ++ assert_corr(lh == NULL); ++ ++ iam_path_fini(path); ++ it->ii_state = IAM_IT_DETACHED; ++ ++ do_corr(schedule()); ++ err = iam_it_get_exact(it, k); ++ if (err == -ENOENT) ++ err = +1; /* repeat split */ ++ else if (err == 0) ++ err = -EEXIST; ++ } ++ } while (err > 0); ++ assert_inv(iam_path_check(path)); ++ if (err == 0) { ++ assert_corr(lh != NULL); ++ do_corr(schedule()); ++ err = iam_new_leaf(handle, leaf); ++ if (err == 0) ++ err = iam_txn_dirty(handle, path, ++ path->ip_frame->bh); ++ } ++ dx_unlock_htree(iam_path_obj(path), lh); ++ do_corr(schedule()); ++ } ++ if (err == 0) { ++ iam_leaf_rec_add(leaf, k, r); ++ err = iam_txn_dirty(handle, path, leaf->il_bh); ++ } ++ } ++ assert_inv(iam_leaf_check(leaf)); ++ assert_inv(iam_leaf_check(&path->ip_leaf)); ++ assert_inv(iam_path_check(path)); ++ return err; ++} ++ ++/* ++ * Insert new record with key @k and contents from @r, shifting records to the ++ * right. On success, iterator is positioned on the newly inserted record. 
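++ *
++ * A sketch of the intended call pattern (see iam_insert() below for the
++ * complete wrapper; error handling elided):
++ *
++ *     result = iam_it_get(it, k);
++ *     if (result == 0)
++ *             result = iam_it_rec_insert(h, it, k, r);
++ *     iam_it_put(it);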
++ * ++ * precondition: it->ii_flags&IAM_IT_WRITE && ++ * (it_state(it) == IAM_IT_ATTACHED || ++ * it_state(it) == IAM_IT_SKEWED) && ++ * ergo(it_state(it) == IAM_IT_ATTACHED, ++ * it_keycmp(it, k) <= 0) && ++ * ergo(it_before(it), it_keycmp(it, k) > 0)); ++ * postcondition: ergo(result == 0, ++ * it_state(it) == IAM_IT_ATTACHED && ++ * it_keycmp(it, k) == 0 && ++ * !memcmp(iam_it_rec_get(it), r, ...)) ++ */ ++int iam_it_rec_insert(handle_t *h, struct iam_iterator *it, ++ const struct iam_key *k, const struct iam_rec *r) ++{ ++ int result; ++ struct iam_path *path; ++ ++ path = &it->ii_path; ++ ++ assert_corr(it->ii_flags&IAM_IT_WRITE); ++ assert_corr(it_state(it) == IAM_IT_ATTACHED || ++ it_state(it) == IAM_IT_SKEWED); ++ assert_corr(ergo(it_state(it) == IAM_IT_ATTACHED, ++ it_keycmp(it, k) <= 0)); ++ assert_corr(ergo(it_before(it), it_keycmp(it, k) > 0)); ++ result = iam_add_rec(h, it, path, k, r); ++ if (result == 0) ++ it->ii_state = IAM_IT_ATTACHED; ++ assert_corr(ergo(result == 0, ++ it_state(it) == IAM_IT_ATTACHED && ++ it_keycmp(it, k) == 0)); ++ return result; ++} ++EXPORT_SYMBOL(iam_it_rec_insert); ++ ++/* ++ * Delete record under iterator. ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED && ++ * it->ii_flags&IAM_IT_WRITE && ++ * it_at_rec(it) ++ * postcondition: it_state(it) == IAM_IT_ATTACHED || ++ * it_state(it) == IAM_IT_DETACHED ++ */ ++int iam_it_rec_delete(handle_t *h, struct iam_iterator *it) ++{ ++ int result; ++ struct iam_leaf *leaf; ++ struct iam_path *path; ++ ++ assert_corr(it_state(it) == IAM_IT_ATTACHED && ++ it->ii_flags&IAM_IT_WRITE); ++ assert_corr(it_at_rec(it)); ++ ++ path = &it->ii_path; ++ leaf = &path->ip_leaf; ++ ++ assert_inv(iam_leaf_check(leaf)); ++ assert_inv(iam_path_check(path)); ++ ++ result = iam_txn_add(h, path, leaf->il_bh); ++ /* ++ * no compaction for now. ++ */ ++ if (result == 0) { ++ iam_rec_del(leaf, it->ii_flags&IAM_IT_MOVE); ++ result = iam_txn_dirty(h, path, leaf->il_bh); ++ if (result == 0 && iam_leaf_at_end(leaf) && ++ it->ii_flags&IAM_IT_MOVE) { ++ result = iam_it_next(it); ++ if (result > 0) ++ result = 0; ++ } ++ } ++ assert_inv(iam_leaf_check(leaf)); ++ assert_inv(iam_path_check(path)); ++ assert_corr(it_state(it) == IAM_IT_ATTACHED || ++ it_state(it) == IAM_IT_DETACHED); ++ return result; ++} ++EXPORT_SYMBOL(iam_it_rec_delete); ++ ++/* ++ * Convert iterator to cookie. ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED && ++ * iam_path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t) ++ * postcondition: it_state(it) == IAM_IT_ATTACHED ++ */ ++iam_pos_t iam_it_store(const struct iam_iterator *it) ++{ ++ iam_pos_t result; ++ ++ assert_corr(it_state(it) == IAM_IT_ATTACHED); ++ assert_corr(it_at_rec(it)); ++ assert_corr(iam_it_container(it)->ic_descr->id_ikey_size <= ++ sizeof result); ++ ++ result = 0; ++ return *(iam_pos_t *)iam_it_ikey_get(it, (void *)&result); ++} ++EXPORT_SYMBOL(iam_it_store); ++ ++/* ++ * Restore iterator from cookie. 
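++ *
++ * Together with iam_it_store(), this allows a scan to be suspended and
++ * resumed later, readdir-style (a sketch; @pos is kept by the caller):
++ *
++ *     pos = iam_it_store(it);
++ *     iam_it_put(it);
++ *     ...
++ *     result = iam_it_load(it, pos);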
++ * ++ * precondition: it_state(it) == IAM_IT_DETACHED && it->ii_flags&IAM_IT_MOVE && ++ * iam_path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t) ++ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED && ++ * iam_it_store(it) == pos) ++ */ ++int iam_it_load(struct iam_iterator *it, iam_pos_t pos) ++{ ++ assert_corr(it_state(it) == IAM_IT_DETACHED && ++ it->ii_flags&IAM_IT_MOVE); ++ assert_corr(iam_it_container(it)->ic_descr->id_ikey_size <= sizeof pos); ++ return iam_it_iget(it, (struct iam_ikey *)&pos); ++} ++EXPORT_SYMBOL(iam_it_load); ++ ++/***********************************************************************/ ++/* invariants */ ++/***********************************************************************/ ++ ++static inline int ptr_inside(void *base, size_t size, void *ptr) ++{ ++ return (base <= ptr) && (ptr < base + size); ++} ++ ++int iam_frame_invariant(struct iam_frame *f) ++{ ++ return ++ (f->bh != NULL && ++ f->bh->b_data != NULL && ++ ptr_inside(f->bh->b_data, f->bh->b_size, f->entries) && ++ ptr_inside(f->bh->b_data, f->bh->b_size, f->at) && ++ f->entries <= f->at); ++} ++int iam_leaf_invariant(struct iam_leaf *l) ++{ ++ return ++ l->il_bh != NULL && ++ l->il_bh->b_data != NULL && ++ ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_entries) && ++ ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_at) && ++ l->il_entries <= l->il_at; ++} ++ ++int iam_path_invariant(struct iam_path *p) ++{ ++ int i; ++ ++ if (p->ip_container == NULL || ++ p->ip_indirect < 0 || p->ip_indirect > DX_MAX_TREE_HEIGHT - 1 || ++ p->ip_frame != p->ip_frames + p->ip_indirect || ++ !iam_leaf_invariant(&p->ip_leaf)) ++ return 0; ++ for (i = 0; i < ARRAY_SIZE(p->ip_frames); ++i) { ++ if (i <= p->ip_indirect) { ++ if (!iam_frame_invariant(&p->ip_frames[i])) ++ return 0; ++ } ++ } ++ return 1; ++} ++ ++int iam_it_invariant(struct iam_iterator *it) ++{ ++ return ++ (it->ii_state == IAM_IT_DETACHED || ++ it->ii_state == IAM_IT_ATTACHED || ++ it->ii_state == IAM_IT_SKEWED) && ++ !(it->ii_flags & ~(IAM_IT_MOVE | IAM_IT_WRITE)) && ++ ergo(it->ii_state == IAM_IT_ATTACHED || ++ it->ii_state == IAM_IT_SKEWED, ++ iam_path_invariant(&it->ii_path) && ++ equi(it_at_rec(it), it->ii_state == IAM_IT_SKEWED)); ++} ++ ++/* ++ * Search container @c for record with key @k. If record is found, its data ++ * are moved into @r. ++ * ++ * Return values: 0: found, -ENOENT: not-found, -ve: error ++ */ ++int iam_lookup(struct iam_container *c, const struct iam_key *k, ++ struct iam_rec *r, struct iam_path_descr *pd) ++{ ++ struct iam_iterator it; ++ int result; ++ ++ iam_it_init(&it, c, 0, pd); ++ ++ result = iam_it_get_exact(&it, k); ++ if (result == 0) ++ /* ++ * record with required key found, copy it into user buffer ++ */ ++ iam_reccpy(&it.ii_path.ip_leaf, r); ++ iam_it_put(&it); ++ iam_it_fini(&it); ++ return result; ++} ++EXPORT_SYMBOL(iam_lookup); ++ ++/* ++ * Insert new record @r with key @k into container @c (within context of ++ * transaction @h). ++ * ++ * Return values: 0: success, -ve: error, including -EEXIST when record with ++ * given key is already present. 
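++ *
++ * For example (a sketch; @ipd comes from iam_ipd_alloc(), @h is a running
++ * handle; names assumed):
++ *
++ *     result = iam_insert(h, c, k, r, ipd);
++ *     if (result == -EEXIST)
++ *             ... a record with this key already exists ...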
++ * ++ * postcondition: ergo(result == 0 || result == -EEXIST, ++ * iam_lookup(c, k, r2) > 0; ++ */ ++int iam_insert(handle_t *h, struct iam_container *c, const struct iam_key *k, ++ const struct iam_rec *r, struct iam_path_descr *pd) ++{ ++ struct iam_iterator it; ++ int result; ++ ++ iam_it_init(&it, c, IAM_IT_WRITE, pd); ++ ++ result = iam_it_get_exact(&it, k); ++ if (result == -ENOENT) ++ result = iam_it_rec_insert(h, &it, k, r); ++ else if (result == 0) ++ result = -EEXIST; ++ iam_it_put(&it); ++ iam_it_fini(&it); ++ return result; ++} ++EXPORT_SYMBOL(iam_insert); ++ ++/* ++ * Update record with the key @k in container @c (within context of ++ * transaction @h), new record is given by @r. ++ * ++ * Return values: 0: success, -ve: error, including -ENOENT if no record with ++ * the given key found. ++ */ ++int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k, ++ const struct iam_rec *r, struct iam_path_descr *pd) ++{ ++ struct iam_iterator it; ++ int result; ++ ++ iam_it_init(&it, c, IAM_IT_WRITE, pd); ++ ++ result = iam_it_get_exact(&it, k); ++ if (result == 0) ++ iam_it_rec_set(h, &it, r); ++ iam_it_put(&it); ++ iam_it_fini(&it); ++ return result; ++} ++EXPORT_SYMBOL(iam_update); ++ ++/* ++ * Delete existing record with key @k. ++ * ++ * Return values: 0: success, -ENOENT: not-found, -ve: other error. ++ * ++ * postcondition: ergo(result == 0 || result == -ENOENT, ++ * !iam_lookup(c, k, *)); ++ */ ++int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k, ++ struct iam_path_descr *pd) ++{ ++ struct iam_iterator it; ++ int result; ++ ++ iam_it_init(&it, c, IAM_IT_WRITE, pd); ++ ++ result = iam_it_get_exact(&it, k); ++ if (result == 0) ++ iam_it_rec_delete(h, &it); ++ iam_it_put(&it); ++ iam_it_fini(&it); ++ return result; ++} ++EXPORT_SYMBOL(iam_delete); ++ +Index: iam/fs/ext3/namei.c +=================================================================== +--- iam.orig/fs/ext3/namei.c 2007-05-23 11:18:18.000000000 +0800 ++++ iam/fs/ext3/namei.c 2007-05-23 11:18:20.000000000 +0800 +@@ -24,81 +24,6 @@ + * Theodore Ts'o, 2002 + */ + +-/* +- * iam: big theory statement. +- * +- * iam (Index Access Module) is a module providing abstraction of persistent +- * transactional container on top of generalized ext3 htree. +- * +- * iam supports: +- * +- * - key, pointer, and record size specifiable per container. +- * +- * - trees taller than 2 index levels. +- * +- * - read/write to existing ext3 htree directories as iam containers. +- * +- * iam container is a tree, consisting of leaf nodes containing keys and +- * records stored in this container, and index nodes, containing keys and +- * pointers to leaf or index nodes. +- * +- * iam does not work with keys directly, instead it calls user-supplied key +- * comparison function (->dpo_keycmp()). +- * +- * Pointers are (currently) interpreted as logical offsets (measured in +- * blocksful) within underlying flat file on top of which iam tree lives. +- * +- * On-disk format: +- * +- * iam mostly tries to reuse existing htree formats. +- * +- * Format of index node: +- * +- * +-----+-------+-------+-------+------+-------+------------+ +- * | | count | | | | | | +- * | gap | / | entry | entry | .... | entry | free space | +- * | | limit | | | | | | +- * +-----+-------+-------+-------+------+-------+------------+ +- * +- * gap this part of node is never accessed by iam code. 
It +- * exists for binary compatibility with ext3 htree (that, +- * in turn, stores fake struct ext2_dirent for ext2 +- * compatibility), and to keep some unspecified per-node +- * data. Gap can be different for root and non-root index +- * nodes. Gap size can be specified for each container +- * (gap of 0 is allowed). +- * +- * count/limit current number of entries in this node, and the maximal +- * number of entries that can fit into node. count/limit +- * has the same size as entry, and is itself counted in +- * count. +- * +- * entry index entry: consists of a key immediately followed by +- * a pointer to a child node. Size of a key and size of a +- * pointer depends on container. Entry has neither +- * alignment nor padding. +- * +- * free space portion of node new entries are added to +- * +- * Entries in index node are sorted by their key value. +- * +- * Format of leaf node: +- * +- * +-----+-------+-------+-------+------+-------+------------+ +- * | | count | | | | | | +- * | gap | / | leaf | leaf | .... | leaf | free space | +- * | | limit | | | | | | +- * +-----+-------+-------+-------+------+-------+------------+ +- +- * leaf For leaf entry: consists of a rec immediately followd by +- * a key. size of a key and size of a rec depends on container. +- * +- * +- * +- * +- * +- */ +- + #include + #include + #include +@@ -112,10 +37,10 @@ + #include + #include + #include ++#include + #include "xattr.h" + #include "iopen.h" + #include "acl.h" +-#include + /* + * define how far ahead to read directories while searching them. + */ +@@ -125,7 +50,7 @@ + #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + + +-static struct buffer_head *ext3_append(handle_t *handle, ++struct buffer_head *ext3_append(handle_t *handle, + struct inode *inode, + u32 *block, int *err) + { +@@ -136,14 +61,15 @@ + if ((bh = ext3_bread(handle, inode, *block, 1, err))) { + inode->i_size += inode->i_sb->s_blocksize; + EXT3_I(inode)->i_disksize = inode->i_size; +- ext3_journal_get_write_access(handle,bh); ++ *err = ext3_journal_get_write_access(handle, bh); ++ if (*err != 0) { ++ brelse(bh); ++ bh = NULL; ++ } + } + return bh; + } + +-#ifndef assert +-#define assert(test) J_ASSERT(test) +-#endif + + #ifndef swap + #define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) +@@ -155,293 +81,10 @@ + #define dxtrace(command) + #endif + +-struct fake_dirent { +- __le32 inode; +- __le16 rec_len; +- u8 name_len; +- u8 file_type; +-}; +- +-struct dx_countlimit { +- __le16 limit; +- __le16 count; +-}; +- +-/* +- * dx_root_info is laid out so that if it should somehow get overlaid by a +- * dirent the two low bits of the hash version will be zero. Therefore, the +- * hash version mod 4 should never be 0. Sincerely, the paranoia department. 
+- */ +- +-struct dx_root { +- struct fake_dirent dot; +- char dot_name[4]; +- struct fake_dirent dotdot; +- char dotdot_name[4]; +- struct dx_root_info +- { +- __le32 reserved_zero; +- u8 hash_version; +- u8 info_length; /* 8 */ +- u8 indirect_levels; +- u8 unused_flags; +- } +- info; +- struct {} entries[0]; +-}; +- +-struct dx_node +-{ +- struct fake_dirent fake; +- struct {} entries[0]; +-}; +- +-struct dx_map_entry +-{ +- u32 hash; +- u32 offs; +-}; +- +- +-static u32 htree_root_ptr(struct iam_container *c); +-static int htree_node_check(struct iam_path *path, struct iam_frame *frame); +-static int htree_node_init(struct iam_container *c, +- struct buffer_head *bh, int root); +-static int htree_keycmp(struct iam_container *c, +- struct iam_key *k1, struct iam_key *k2); +-static int htree_node_read(struct iam_container *c, iam_ptr_t ptr, +- handle_t *h, struct buffer_head **bh); +- +-/* +- * Parameters describing iam compatibility mode in which existing ext3 htrees +- * can be manipulated. +- */ +-static struct iam_descr htree_compat_param = { +- .id_key_size = sizeof ((struct dx_map_entry *)NULL)->hash, +- .id_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs, +- .id_node_gap = offsetof(struct dx_node, entries), +- .id_root_gap = offsetof(struct dx_root, entries), +- +- .id_root_ptr = htree_root_ptr, +- .id_node_check = htree_node_check, +- .id_node_init = htree_node_init, +- .id_node_read = htree_node_read, +- .id_keycmp = htree_keycmp +-}; +- +- +-struct iam_key; +-struct iam_rec; +-struct iam_descr; +-struct iam_container; +-struct iam_path; +- +- +- +-/* +- * iam cursor (iterator) api. +- */ +- +-/* +- * Flags controlling iterator functionality. +- */ +-enum iam_it_flags { +- /* +- * this iterator will move (iam_it_{prev,next}() will be called on it) +- */ +- IAM_IT_MOVE = (1 << 0), +- /* +- * tree can be updated through this iterator. +- */ +- IAM_IT_WRITE = (1 << 1) +-}; +- +-/* +- * States of iterator state machine. +- */ +-enum iam_it_state { +- /* initial state */ +- IAM_IT_DETACHED, +- /* iterator is above particular record in the container */ +- IAM_IT_ATTACHED +-}; +- +-struct htree_cookie { +- struct dx_hash_info *hinfo; +- struct dentry *dentry; +-}; +- +-/* +- * Iterator. +- * +- * Immediately after call to iam_it_init() iterator is in "detached" +- * (IAM_IT_DETACHED) state: it is associated with given parent container, but +- * doesn't point to any particular record in this container. +- * +- * After successful call to iam_it_get() and until corresponding call to +- * iam_it_put() iterator is in "attached" state (IAM_IT_ATTACHED). +- * +- * Attached iterator can move through records in a container (provided +- * IAM_IT_MOVE permission) in a key order, can get record and key values as it +- * passes over them, and can modify container (provided IAM_IT_WRITE +- * permission). +- * +- * Concurrency: iterators are supposed to be local to thread. Interfaces below +- * do no internal serialization. +- * +- */ +-struct iam_iterator { +- /* +- * iterator flags, taken from enum iam_it_flags. +- */ +- __u32 ii_flags; +- enum iam_it_state ii_state; +- /* +- * path to the record. Valid in IAM_IT_ATTACHED state. 
+- */ +- struct iam_path ii_path; +-}; +- +-static inline struct iam_key *keycpy(struct iam_container *c, +- struct iam_key *k1, struct iam_key *k2) +-{ +- return memcpy(k1, k2, c->ic_descr->id_key_size); +-} +- +-static inline int keycmp(struct iam_container *c, +- struct iam_key *k1, struct iam_key *k2) +-{ +- return c->ic_descr->id_keycmp(c, k1, k2); +-} +- +-static struct iam_container *iam_it_container(struct iam_iterator *it) +-{ +- return it->ii_path.ip_container; +-} +- +-static inline int it_keycmp(struct iam_iterator *it, +- struct iam_key *k1, struct iam_key *k2) +-{ +- return keycmp(iam_it_container(it), k1, k2); +-} +- +-/* +- * Initialize iterator to IAM_IT_DETACHED state. +- * +- * postcondition: it_state(it) == IAM_IT_DETACHED +- */ +-int iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags); +-/* +- * Finalize iterator and release all resources. +- * +- * precondition: it_state(it) == IAM_IT_DETACHED +- */ +-void iam_it_fini(struct iam_iterator *it); +- +-/* +- * Attach iterator. After successful completion, @it points to record with the +- * largest key not larger than @k. Semantics of ->id_create() method guarantee +- * that such record will always be found. +- * +- * Return value: 0: positioned on existing record, +- * -ve: error. +- * +- * precondition: it_state(it) == IAM_IT_DETACHED +- * postcondition: ergo(result == 0, +- * (it_state(it) == IAM_IT_ATTACHED && +- * it_keycmp(it, iam_it_key_get(it, *), k) < 0)) +- */ +-int iam_it_get(struct iam_iterator *it, struct iam_key *k); +- +-/* +- * Duplicates iterator. +- * +- * postcondition: it_state(dst) == it_state(src) && +- * iam_it_container(dst) == iam_it_container(src) && +- * dst->ii_flags = src->ii_flags && +- * ergo(it_state(it) == IAM_IT_ATTACHED, +- * iam_it_rec_get(dst) == iam_it_rec_get(src) && +- * iam_it_key_get(dst, *1) == iam_it_key_get(src, *2)) +- */ +-void iam_it_dup(struct iam_iterator *dst, struct iam_iterator *src); +- +-/* +- * Detach iterator. Does nothing it detached state. +- * +- * postcondition: it_state(it) == IAM_IT_DETACHED +- */ +-void iam_it_put(struct iam_iterator *it); +- +-/* +- * Move iterator one record right. +- * +- * Return value: 0: success, +- * +1: end of container reached +- * -ve: error +- * +- * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_MOVE +- * postcondition: ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED) +- */ +-int iam_it_next(struct iam_iterator *it); +- +-/* +- * Return pointer to the record under iterator. +- * +- * precondition: it_state(it) == IAM_IT_ATTACHED +- * postcondition: it_state(it) == IAM_IT_ATTACHED +- */ +-const struct iam_rec *iam_it_rec_get(struct iam_iterator *it); +- +-/* +- * Replace contents of record under iterator. +- * +- * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE +- * postcondition: it_state(it) == IAM_IT_ATTACHED && +- * ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...)) +- */ +-int iam_it_rec_set(handle_t *h, struct iam_iterator *it, struct iam_rec *r); +- +-/* +- * Place key under iterator in @k, return @k +- * +- * precondition: it_state(it) == IAM_IT_ATTACHED +- * postcondition: it_state(it) == IAM_IT_ATTACHED +- */ +-const struct iam_key *iam_it_key_get(struct iam_iterator *it, +- struct iam_key *k); +- +-/* +- * Insert new record with key @k and contents from @r, shifting records to the +- * right. 
+- * +- * precondition: it_state(it) == IAM_IT_ATTACHED && +- * it->ii_flags&IAM_IT_WRITE && +- * it_keycmp(it, iam_it_key_get(it, *), k) < 0 +- * postcondition: it_state(it) == IAM_IT_ATTACHED && +- * ergo(result == 0, +- * it_keycmp(it, iam_it_key_get(it, *), k) == 0 && +- * !memcmp(iam_it_rec_get(it), r, ...)) +- */ +-int iam_it_rec_insert(handle_t *h, struct iam_iterator *it, +- struct iam_key *k, struct iam_rec *r); +-/* +- * Delete record under iterator. +- * +- * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE +- * postcondition: it_state(it) == IAM_IT_ATTACHED +- */ +-int iam_it_rec_delete(handle_t *h, struct iam_iterator *it); +- + #ifdef CONFIG_EXT3_INDEX + static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry); + static void dx_set_block(struct iam_path *p, + struct iam_entry *entry, unsigned value); +-static inline struct iam_key *dx_get_key(struct iam_path *p, +- struct iam_entry *entry, +- struct iam_key *key); +-static void dx_set_key(struct iam_path *p, struct iam_entry *entry, +- struct iam_key *key); +-static unsigned dx_get_count(struct iam_entry *entries); + static unsigned dx_get_limit(struct iam_entry *entries); + static void dx_set_count(struct iam_entry *entries, unsigned value); + static void dx_set_limit(struct iam_entry *entries, unsigned value); +@@ -457,264 +100,62 @@ + static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, + struct dx_map_entry *offsets, int count); + static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); +-static void dx_insert_block (struct iam_path *path, +- struct iam_frame *frame, u32 hash, u32 block); +-static int ext3_htree_next_block(struct inode *dir, __u32 hash, +- struct iam_path *path, __u32 *start_hash); + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, + struct ext3_dir_entry_2 **res_dir, int *err); + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); + +-static inline void iam_path_init(struct iam_path *path, +- struct iam_container *c, struct htree_cookie *hc); +-static inline void iam_path_fini(struct iam_path *path); +- +- +-/* +- * Future: use high four bits of block for coalesce-on-delete flags +- * Mask them off for now. 
+- */ +- +-static inline void *entry_off(struct iam_entry *entry, ptrdiff_t off) +-{ +- return (void *)((char *)entry + off); +-} +- +-static inline struct iam_descr *path_descr(struct iam_path *p) +-{ +- return p->ip_container->ic_descr; +-} +- +-static inline struct inode *path_obj(struct iam_path *p) +-{ +- return p->ip_container->ic_object; +-} +- +-static inline size_t iam_entry_size(struct iam_path *p) +-{ +- return path_descr(p)->id_key_size + path_descr(p)->id_ptr_size; +-} +- +-static inline struct iam_entry *iam_entry_shift(struct iam_path *p, +- struct iam_entry *entry, int shift) +-{ +- void *e = entry; +- return e + shift * iam_entry_size(p); +-} +- +-static inline ptrdiff_t iam_entry_diff(struct iam_path *p, +- struct iam_entry *e1, struct iam_entry *e2) +-{ +- ptrdiff_t diff; +- +- diff = (void *)e1 - (void *)e2; +- assert(diff / iam_entry_size(p) * iam_entry_size(p) == diff); +- return diff / iam_entry_size(p); +-} +- +-static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry) +-{ +- return le32_to_cpu(*(u32 *)entry_off(entry, path_descr(p)->id_key_size)) +- & 0x00ffffff; +-} +- +-static inline void dx_set_block(struct iam_path *p, +- struct iam_entry *entry, unsigned value) +-{ +- *(u32*)entry_off(entry, +- path_descr(p)->id_key_size) = cpu_to_le32(value); +-} +- +-static inline struct iam_key *dx_get_key(struct iam_path *p, +- struct iam_entry *entry, +- struct iam_key *key) +-{ +- memcpy(key, entry, path_descr(p)->id_key_size); +- return key; +-} +- +-static inline struct iam_key *iam_key_at(struct iam_path *p, +- struct iam_entry *entry) +-{ +- return (struct iam_key *)entry; +-} +- +-static inline void dx_set_key(struct iam_path *p, +- struct iam_entry *entry, struct iam_key *key) +-{ +- memcpy(entry, key, path_descr(p)->id_key_size); +-} +- +-static inline unsigned dx_get_count (struct iam_entry *entries) +-{ +- return le16_to_cpu(((struct dx_countlimit *) entries)->count); +-} +- +-static inline unsigned dx_get_limit (struct iam_entry *entries) +-{ +- return le16_to_cpu(((struct dx_countlimit *) entries)->limit); +-} +- +-static inline void dx_set_count (struct iam_entry *entries, unsigned value) +-{ +- ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); +-} +- +-static inline void dx_set_limit (struct iam_entry *entries, unsigned value) ++static inline void dx_set_limit(struct iam_entry *entries, unsigned value) + { + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); + } + +-static inline unsigned dx_root_limit(struct iam_path *p) ++int dx_index_is_compat(struct iam_path *path) + { +- struct iam_descr *param = path_descr(p); +- unsigned entry_space = path_obj(p)->i_sb->s_blocksize - +- param->id_root_gap; +- return entry_space / (param->id_key_size + param->id_ptr_size); ++ return iam_path_descr(path) == &iam_htree_compat_param; + } + +-static inline unsigned dx_node_limit(struct iam_path *p) +-{ +- struct iam_descr *param = path_descr(p); +- unsigned entry_space = path_obj(p)->i_sb->s_blocksize - +- param->id_node_gap; +- return entry_space / (param->id_key_size + param->id_ptr_size); +-} +- +-static inline int dx_index_is_compat(struct iam_path *path) +-{ +- return path_descr(path) == &htree_compat_param; +-} +- +-static struct iam_entry *dx_get_entries(struct iam_path *path, void *data, +- int root) +-{ +- return data + +- (root ? 
+- path_descr(path)->id_root_gap : path_descr(path)->id_node_gap); +-} +- +-static struct iam_entry *dx_node_get_entries(struct iam_path *path, +- struct iam_frame *frame) +-{ +- return dx_get_entries(path, +- frame->bh->b_data, frame == path->ip_frames); +-} + +-static int dx_node_check(struct iam_path *p, struct iam_frame *f) ++int dx_node_check(struct iam_path *p, struct iam_frame *f) + { + struct iam_entry *e; + struct iam_container *c; + unsigned count; + unsigned i; ++ iam_ptr_t blk; ++ iam_ptr_t root; ++ struct inode *inode; + + c = p->ip_container; + e = dx_node_get_entries(p, f); + count = dx_get_count(e); + e = iam_entry_shift(p, e, 1); ++ root = iam_path_descr(p)->id_ops->id_root_ptr(c); ++ ++ inode = iam_path_obj(p); + for (i = 0; i < count - 1; ++i, e = iam_entry_shift(p, e, 1)) { +- keycpy(c, p->ip_key_scratch[0], p->ip_key_scratch[1]); +- dx_get_key(p, e, p->ip_key_scratch[1]); ++ iam_ikeycpy(c, iam_path_ikey(p, 0), iam_path_ikey(p, 1)); ++ iam_get_ikey(p, e, iam_path_ikey(p, 1)); + if (i > 0 && +- keycmp(c, p->ip_key_scratch[0], p->ip_key_scratch[1]) > 0) ++ iam_ikeycmp(c, iam_path_ikey(p, 0), ++ iam_path_ikey(p, 1)) > 0) { ++ BREAKPOINT(); + return 0; + } +- return 1; +-} +- +-static u32 htree_root_ptr(struct iam_container *c) +-{ ++ blk = dx_get_block(p, e); ++ if (inode->i_size < (blk + 1) * inode->i_sb->s_blocksize) { ++ BREAKPOINT(); + return 0; +-} +- +-static int htree_node_check(struct iam_path *path, struct iam_frame *frame) +-{ +- void *data; +- struct iam_entry *entries; +- struct super_block *sb; +- +- data = frame->bh->b_data; +- entries = dx_node_get_entries(path, frame); +- sb = path_obj(path)->i_sb; +- if (frame == path->ip_frames) { +- /* root node */ +- struct dx_root *root; +- struct htree_cookie *hc = path->ip_descr_data; +- +- root = data; +- if (root->info.hash_version > DX_HASH_MAX) { +- ext3_warning(sb, __FUNCTION__, +- "Unrecognised inode hash code %d", +- root->info.hash_version); +- return ERR_BAD_DX_DIR; + } +- +- if (root->info.unused_flags & 1) { +- ext3_warning(sb, __FUNCTION__, +- "Unimplemented inode hash flags: %#06x", +- root->info.unused_flags); +- return ERR_BAD_DX_DIR; +- } +- +- path->ip_indirect = root->info.indirect_levels; +- if (path->ip_indirect > DX_MAX_TREE_HEIGHT - 1) { +- ext3_warning(sb, __FUNCTION__, +- "Unimplemented inode hash depth: %#06x", +- root->info.indirect_levels); +- return ERR_BAD_DX_DIR; ++ /* ++ * By definition of a tree, no node points to the root. 
++ */ ++ if (blk == root) { ++ BREAKPOINT(); ++ return 0; + } +- +- assert((char *)entries == (((char *)&root->info) + +- root->info.info_length)); +- assert(dx_get_limit(entries) == dx_root_limit(path)); +- +- hc->hinfo->hash_version = root->info.hash_version; +- hc->hinfo->seed = EXT3_SB(sb)->s_hash_seed; +- if (hc->dentry) +- ext3fs_dirhash(hc->dentry->d_name.name, +- hc->dentry->d_name.len, hc->hinfo); +- path->ip_key_target = (struct iam_key *)&hc->hinfo->hash; +- } else { +- /* non-root index */ +- assert(entries == data + path_descr(path)->id_node_gap); +- assert(dx_get_limit(entries) == dx_node_limit(path)); + } +- frame->entries = frame->at = entries; +- return 0; +-} +- +-static int htree_node_init(struct iam_container *c, +- struct buffer_head *bh, int root) +-{ +- struct dx_node *node; +- +- assert(!root); +- +- node = (void *)bh->b_data; +- node->fake.rec_len = cpu_to_le16(c->ic_object->i_sb->s_blocksize); +- node->fake.inode = 0; +- return 0; +-} +- +-static int htree_node_read(struct iam_container *c, iam_ptr_t ptr, +- handle_t *handle, struct buffer_head **bh) +-{ +- int result = 0; +- +- *bh = ext3_bread(handle, c->ic_object, (int)ptr, 0, &result); +- if (*bh == NULL) +- result = -EIO; +- return result; +-} +- +-static int htree_keycmp(struct iam_container *c, +- struct iam_key *k1, struct iam_key *k2) +-{ +- __u32 p1 = le32_to_cpu(*(__u32 *)k1); +- __u32 p2 = le32_to_cpu(*(__u32 *)k2); +- +- return p1 > p2 ? +1 : (p1 < p2 ? -1 : 0); ++ return 1; + } + + /* +@@ -797,601 +238,124 @@ + printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", + names, space/bcount,(space/bcount)*100/blocksize); + return (struct stats) { names, space, bcount}; +-} +-#endif /* DX_DEBUG */ +- +-static int dx_lookup(struct iam_path *path) +-{ +- u32 ptr; +- int err = 0; +- int i; +- +- struct iam_descr *param; +- struct iam_frame *frame; +- struct iam_container *c; +- +- param = path_descr(path); +- c = path->ip_container; +- +- for (frame = path->ip_frames, i = 0, +- ptr = param->id_root_ptr(path->ip_container); +- i <= path->ip_indirect; +- ptr = dx_get_block(path, frame->at), ++frame, ++i) { +- struct iam_entry *entries; +- struct iam_entry *p; +- struct iam_entry *q; +- struct iam_entry *m; +- unsigned count; +- +- err = param->id_node_read(c, (iam_ptr_t)ptr, NULL, &frame->bh); +- if (err != 0) +- break; +- err = param->id_node_check(path, frame); +- if (err != 0) +- break; +- +- assert(dx_node_check(path, frame)); +- +- entries = frame->entries; +- count = dx_get_count(entries); +- assert(count && count <= dx_get_limit(entries)); +- p = iam_entry_shift(path, entries, 1); +- q = iam_entry_shift(path, entries, count - 1); +- while (p <= q) { +- m = iam_entry_shift(path, +- p, iam_entry_diff(path, q, p) / 2); +- dxtrace(printk(".")); +- if (keycmp(c, iam_key_at(path, m), +- path->ip_key_target) > 0) +- q = iam_entry_shift(path, m, -1); +- else +- p = iam_entry_shift(path, m, +1); +- } +- +- frame->at = iam_entry_shift(path, p, -1); +- if (1) { // linear search cross check +- unsigned n = count - 1; +- struct iam_entry *at; +- +- at = entries; +- while (n--) { +- dxtrace(printk(",")); +- at = iam_entry_shift(path, at, +1); +- if (keycmp(c, iam_key_at(path, at), +- path->ip_key_target) > 0) { +- if (at != iam_entry_shift(path, frame->at, 1)) { +- BREAKPOINT; +- printk(KERN_EMERG "%i\n", +- keycmp(c, iam_key_at(path, at), +- path->ip_key_target)); +- } +- at = iam_entry_shift(path, at, -1); +- break; +- } +- } +- assert(at == frame->at); +- } +- } +- if (err != 0) +- iam_path_fini(path); +- 
path->ip_frame = --frame; +- return err; +-} +- +-/* +- * Probe for a directory leaf block to search. +- * +- * dx_probe can return ERR_BAD_DX_DIR, which means there was a format +- * error in the directory index, and the caller should fall back to +- * searching the directory normally. The callers of dx_probe **MUST** +- * check for this error code, and make sure it never gets reflected +- * back to userspace. +- */ +-static int dx_probe(struct dentry *dentry, struct inode *dir, +- struct dx_hash_info *hinfo, struct iam_path *path) +-{ +- int err; +- struct htree_cookie hc = { +- .dentry = dentry, +- .hinfo = hinfo +- }; +- +- assert(dx_index_is_compat(path)); +- path->ip_descr_data = &hc; +- err = dx_lookup(path); +- assert(err != 0 || path->ip_frames[path->ip_indirect].bh != NULL); +- return err; +-} +- +-/* +- * Initialize container @c, acquires additional reference on @inode. +- */ +-int iam_container_init(struct iam_container *c, +- struct iam_descr *descr, struct inode *inode) +-{ +- memset(c, 0, sizeof *c); +- c->ic_descr = descr; +- c->ic_object = igrab(inode); +- if (c->ic_object != NULL) +- return 0; +- else +- return -ENOENT; +-} +- +-/* +- * Finalize container @c, release all resources. +- */ +-void iam_container_fini(struct iam_container *c) +-{ +- if (c->ic_object != NULL) { +- iput(c->ic_object); +- c->ic_object = NULL; +- } +-} +- +-static inline void iam_path_init(struct iam_path *path, struct iam_container *c, +- struct htree_cookie *hc) +-{ +- memset(path, 0, sizeof *path); +- path->ip_container = c; +- path->ip_frame = path->ip_frames; +- path->ip_descr_data = hc; +-} +- +-static inline void iam_path_fini(struct iam_path *path) +-{ +- int i; +- +- for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) { +- if (path->ip_frames[i].bh != NULL) { +- brelse(path->ip_frames[i].bh); +- path->ip_frames[i].bh = NULL; +- } +- } +-} +- +-static void iam_path_compat_init(struct iam_path_compat *path, +- struct inode *inode) +-{ +- int i; +- +- iam_container_init(&path->ipc_container, &htree_compat_param, inode); +- /* +- * XXX hack allowing finalization of iam_path_compat with +- * iam_path_fini(). +- */ +- iput(inode); +- iam_path_init(&path->ipc_path, &path->ipc_container, NULL); +- for (i = 0; i < ARRAY_SIZE(path->ipc_path.ip_key_scratch); ++i) +- path->ipc_path.ip_key_scratch[i] = +- (struct iam_key *)&path->ipc_scrach[i]; +-} +- +-static void iam_path_compat_fini(struct iam_path_compat *path) +-{ +- iam_path_fini(&path->ipc_path); +- iam_container_fini(&path->ipc_container); +-} +- +-static int iam_leaf_init(struct iam_path *path, struct iam_leaf *leaf) +-{ +- int block, err; +- struct buffer_head *bh; +- +- block = dx_get_block(path, path->ip_frame->at); +- err = path_descr(path)->id_node_read(path->ip_container, block, +- NULL, &bh); +- if (err) +- return err; +- +- leaf->bh = bh; +- leaf->entries = (struct iam_leaf_entry *)bh->b_data; +- return 0; +-} +- +-static void iam_leaf_fini(struct iam_leaf *leaf) +-{ +- if (leaf->bh) +- brelse(leaf->bh); +-} +- +-/* +- * Search container @c for record with key @k. If record is found, its data +- * are moved into @r. 
+- * +- * +- * +- * Return values: +ve: found, 0: not-found, -ve: error +- */ +- +-int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r) +-{ +- struct dx_hash_info hinfo; +- struct iam_path_compat cpath; +- struct iam_path *path = &cpath.ipc_path; +- struct htree_cookie hc = { +- .hinfo = &hinfo +- }; +- int err, i; +- +- iam_path_init(path, c, &hc); +- for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i) +- path->ip_key_scratch[i] = +- (struct iam_key *)&cpath.ipc_scrach[i]; +- err = dx_lookup(path); +- do { +- struct iam_leaf leaf; +- err = iam_leaf_init(path, &leaf); +- if (err) +- goto errout; +- +- for (path_descr(path)->id_leaf.start(c, &leaf); +- !path_descr(path)->id_leaf.at_end(c, &leaf); +- path_descr(path)->id_leaf.next(c, &leaf)) { +- struct iam_key *key; +- +- key = kmalloc(path_descr(path)->id_key_size, GFP_KERNEL); +- path_descr(path)->id_leaf.key(c, &leaf, key); +- if (keycmp(c, k, key) == 0) { +- memcpy(r, path_descr(path)->id_leaf.rec(c, &leaf), +- path_descr(path)->id_rec_size); +- iam_path_fini(path); +- iam_leaf_fini(&leaf); +- return 0; +- } +- } +- +- iam_leaf_fini(&leaf); +- /* Check to see if we should continue to search */ +- err = ext3_htree_next_block(c->ic_object, hinfo.hash, path, NULL); +- if (err < 0) +- goto errout; +- } while (err == 1); +-errout: +- iam_path_fini(path); +- return(err); +-} +-EXPORT_SYMBOL(iam_lookup); +- +-static inline size_t iam_leaf_entry_size(struct iam_path *p) +-{ +- return path_descr(p)->id_rec_size + path_descr(p)->id_key_size; +-} +- +-static inline ptrdiff_t iam_leaf_entry_diff(struct iam_path *p, +- struct iam_leaf_entry *e1, struct iam_leaf_entry *e2) +-{ +- ptrdiff_t diff; +- +- diff = (void *)e1 - (void *)e2; +- assert(diff / iam_leaf_entry_size(p) * iam_leaf_entry_size(p) == diff); +- return diff / iam_leaf_entry_size(p); +-} +- +-static inline struct iam_leaf_entry* +-iam_leaf_entry_shift(struct iam_path *p, struct iam_leaf_entry *entry, int shift) +-{ +- void *e = entry; +- return e + shift * iam_leaf_entry_size(p); +-} +- +-static inline struct iam_key * +-dx_leaf_get_key(struct iam_path *p, struct iam_leaf_entry *e, struct iam_key *key) +-{ +- memcpy(key, e, path_descr(p)->id_key_size); +- return key; +-} +- +-static inline struct iam_key * +-iam_leaf_key_at(struct iam_path *p, struct iam_leaf_entry *entry) +-{ +- void *e = entry; +- return e + path_descr(p)->id_rec_size; +-} +-static inline struct iam_leaf_entry * +-iam_leaf_entry_at(struct iam_path *p, struct iam_leaf_entry *entry) +-{ +- return entry; +-} +- +-static int iam_leaf_lookup(struct iam_path *path, struct iam_leaf *leaf, +- struct iam_key *k) +-{ +- struct iam_leaf_entry *p, *q, *m; +- struct iam_leaf_entry *entries = leaf->entries; +- int count = dx_get_count((struct iam_entry *)entries); +- +- p = iam_leaf_entry_shift(path, entries, 1); +- q = iam_leaf_entry_shift(path, entries, count - 1); +- while (p <= q) { +- m = iam_leaf_entry_shift(path, +- p, iam_leaf_entry_diff(path, q, p) / 2); +- dxtrace(printk(".")); +- if (keycmp(path->ip_container, iam_leaf_key_at(path, m), +- path->ip_key_target) > 0) +- q = iam_leaf_entry_shift(path, m, -1); +- else +- p = iam_leaf_entry_shift(path, m, +1); +- } +- leaf->at = q; +- return 0; +-} +- +-/*XXX what kind of lock should this entry be locked: WangDi */ +-static int iam_leaf_insert(handle_t *handle, struct iam_path *path, +- struct iam_key *k, struct iam_rec *r) +-{ +- struct iam_leaf leaf; +- struct iam_leaf_entry *p, *q; +- int err, count; +- +- err = iam_leaf_init(path, &leaf); +- if 
(err) +- goto errout; +- path_descr(path)->id_leaf.start(path->ip_container, &leaf); +- count = dx_get_count((struct iam_entry *)leaf.entries); +- if (dx_get_count((struct iam_entry *)leaf.entries) >= +- dx_get_limit((struct iam_entry *)leaf.entries)){ +- err = -ENOSPC; +- goto errout; +- } +- +- err = iam_leaf_lookup(path, &leaf, k); +- if (err) +- goto errout; +- +- /*insert the k/r to leaf entries*/ +- p = iam_leaf_entry_shift(path, leaf.at, 1); +- q = iam_leaf_entry_shift(path, leaf.entries, count - 1); +- while (q < p) { +- memcpy(iam_leaf_entry_shift(path, q, 1), q, iam_leaf_entry_size(path)); +- q = iam_leaf_entry_shift(path, q, -1); +- } +- memcpy(iam_leaf_entry_at(path, p), r, path_descr(path)->id_rec_size); +- memcpy(iam_leaf_key_at(path, p), k, path_descr(path)->id_key_size); +- +- dx_set_count((struct iam_entry*)leaf.entries, count + 1); +- err = ext3_journal_dirty_metadata(handle, leaf.bh); +- if (err) +- ext3_std_error(path->ip_container->ic_object->i_sb, err); +-errout: +- iam_leaf_fini(&leaf); +- return err; +-} +- +-static int split_leaf_node(handle_t *handle, struct iam_path *path) +-{ +- struct inode *dir = path_obj(path); +- unsigned continued = 0; +- struct buffer_head *bh2; +- u32 newblock, hash_split; +- char *data2; +- struct iam_leaf leaf; +- unsigned split; +- int err; +- +- bh2 = ext3_append (handle, dir, &newblock, &err); +- if (!(bh2)) { +- err = -ENOSPC; +- goto errout; +- } +- err = iam_leaf_init(path, &leaf); +- if (err) +- goto errout; +- +- BUFFER_TRACE(leaf.bh, "get_write_access"); +- err = ext3_journal_get_write_access(handle, leaf.bh); +- if (err) { +- journal_error: +- iam_leaf_fini(&leaf); +- brelse(bh2); +- ext3_std_error(dir->i_sb, err); +- err = -EIO; +- goto errout; +- } +- data2 = bh2->b_data; +- split = dx_get_count((struct iam_entry*)leaf.entries)/2; +- hash_split = *(__u32*)iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split)); +- if (keycmp(path->ip_container, iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split)), +- iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split -1))) == 0) +- continued = 1; +- +- memcpy(iam_leaf_entry_shift(path, (struct iam_leaf_entry *)data2, 1), +- iam_leaf_entry_shift(path, leaf.entries, split), +- split * iam_leaf_entry_size(path)); +- +- /* Which block gets the new entry? */ +- dx_insert_block(path, path->ip_frame, hash_split + continued, newblock); +- err = ext3_journal_dirty_metadata (handle, bh2); +- if (err) +- goto journal_error; +- err = ext3_journal_dirty_metadata (handle, leaf.bh); +- if (err) +- goto journal_error; +- brelse (bh2); +- iam_leaf_fini(&leaf); +-errout: +- return err; +-} +- +-static int split_index_node(handle_t *handle, struct iam_path *path); +-/* +- * Insert new record @r with key @k into container @c (within context of +- * transaction @h. +- * +- * Return values: 0: success, -ve: error, including -EEXIST when record with +- * given key is already present. 
+- * +- * postcondition: ergo(result == 0 || result == -EEXIST, +- * iam_lookup(c, k, r2) > 0 && +- * !memcmp(r, r2, c->ic_descr->id_rec_size)); +- */ +-int iam_insert(handle_t *handle, struct iam_container *c, struct iam_key *k, +- struct iam_rec *r) ++} ++#endif /* DX_DEBUG */ ++ ++int dx_lookup(struct iam_path *path) + { +- struct dx_hash_info hinfo; +- struct iam_path_compat cpath; +- struct iam_path *path = &cpath.ipc_path; +- struct htree_cookie hc = { +- .hinfo = &hinfo +- }; +- int err, i; ++ u32 ptr; ++ int err = 0; ++ int i; ++ int delta; + +- iam_path_init(path, c, &hc); +- for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i) +- path->ip_key_scratch[i] = +- (struct iam_key *)&cpath.ipc_scrach[i]; +- err = dx_lookup(path); +- if (err) +- goto errout; ++ struct iam_descr *param; ++ struct iam_frame *frame; ++ struct iam_container *c; + +- err = iam_leaf_insert(handle, path, k, r); ++ param = iam_path_descr(path); ++ c = path->ip_container; + +- if (err != -ENOSPC) +- goto errout; ++ delta = dx_index_is_compat(path) ? 1 : 2; + +- err = split_index_node(handle, path); +- if (err) +- goto errout; ++ for (frame = path->ip_frames, i = 0, ++ ptr = param->id_ops->id_root_ptr(c); ++ i <= path->ip_indirect; ++ ptr = dx_get_block(path, frame->at), ++frame, ++i) { ++ struct iam_entry *entries; ++ struct iam_entry *p; ++ struct iam_entry *q; ++ struct iam_entry *m; ++ unsigned count; + +- err = split_leaf_node(handle, path); +- if (err) +- goto errout; +- +- err = iam_leaf_insert(handle, path, k, r); +-errout: +- iam_path_fini(path); +- return(err); +-} ++ err = param->id_ops->id_node_read(c, (iam_ptr_t)ptr, NULL, ++ &frame->bh); ++ if (err != 0) ++ break; + +-EXPORT_SYMBOL(iam_insert); +-static int iam_leaf_delete(handle_t *handle, struct iam_path *path, +- struct iam_key *k) +-{ +- struct iam_leaf leaf; +- struct iam_leaf_entry *p, *q; +- int err, count; ++ if (EXT3_INVARIANT_ON) { ++ err = param->id_ops->id_node_check(path, frame); ++ if (err != 0) ++ break; ++ } + +- err = iam_leaf_init(path, &leaf); +- if (err) +- goto errout; ++ err = param->id_ops->id_node_load(path, frame); ++ if (err != 0) ++ break; ++ ++ assert_inv(dx_node_check(path, frame)); + +- err = iam_leaf_lookup(path, &leaf, k); +- if (err) +- goto errout; ++ entries = frame->entries; ++ count = dx_get_count(entries); ++ assert_corr(count && count <= dx_get_limit(entries)); ++ p = iam_entry_shift(path, entries, delta); ++ q = iam_entry_shift(path, entries, count - 1); ++ while (p <= q) { ++ m = iam_entry_shift(path, ++ p, iam_entry_diff(path, q, p) / 2); ++ dxtrace(printk(".")); ++ if (iam_ikeycmp(c, iam_ikey_at(path, m), ++ path->ip_ikey_target) > 0) ++ q = iam_entry_shift(path, m, -1); ++ else ++ p = iam_entry_shift(path, m, +1); ++ } + +- count = dx_get_count((struct iam_entry*)leaf.entries); +- /*delete the k to leaf entries*/ +- p = iam_leaf_entry_shift(path, leaf.at, 1); +- q = iam_leaf_entry_shift(path, leaf.entries, count - 1); +- while (p < q) { +- memcpy(p, iam_leaf_entry_shift(path, p, 1), iam_leaf_entry_size(path)); +- p = iam_leaf_entry_shift(path, p, 1); +- } +- dx_set_count((struct iam_entry*)leaf.entries, count - 1); ++ frame->at = iam_entry_shift(path, p, -1); ++ if (EXT3_INVARIANT_ON) { // linear search cross check ++ unsigned n = count - 1; ++ struct iam_entry *at; + +- err = ext3_journal_dirty_metadata(handle, leaf.bh); +- if (err) +- ext3_std_error(path_obj(path)->i_sb, err); +-errout: +- iam_leaf_fini(&leaf); ++ at = entries; ++ while (n--) { ++ dxtrace(printk(",")); ++ at = iam_entry_shift(path, at, 
+1); ++ if (iam_ikeycmp(c, iam_ikey_at(path, at), ++ path->ip_ikey_target) > 0) { ++ if (at != iam_entry_shift(path, frame->at, 1)) { ++ BREAKPOINT(); ++ printk(KERN_EMERG "%i\n", ++ iam_ikeycmp(c, iam_ikey_at(path, at), ++ path->ip_ikey_target)); ++ } ++ at = iam_entry_shift(path, at, -1); ++ break; ++ } ++ } ++ assert_corr(at == frame->at); ++ } ++ } ++ if (err != 0) ++ iam_path_fini(path); ++ path->ip_frame = --frame; + return err; + } + + /* +- * Delete existing record with key @k. +- * +- * Return values: 0: success, -ENOENT: not-found, -ve: other error. ++ * Probe for a directory leaf block to search. + * +- * postcondition: ergo(result == 0 || result == -ENOENT, +- * !iam_lookup(c, k, *)); ++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format ++ * error in the directory index, and the caller should fall back to ++ * searching the directory normally. The callers of dx_probe **MUST** ++ * check for this error code, and make sure it never gets reflected ++ * back to userspace. + */ +-int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k) +-{ +- struct dx_hash_info hinfo; +- struct iam_path_compat cpath; +- struct iam_path *path = &cpath.ipc_path; +- struct htree_cookie hc = { +- .hinfo = &hinfo +- }; +- int err, i; +- +- iam_path_init(path, c, &hc); +- for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i) +- path->ip_key_scratch[i] = +- (struct iam_key *)&cpath.ipc_scrach[i]; +- err = dx_lookup(path); +- if (err) +- goto errout; +- +- err = iam_leaf_delete(h, path, k); +-errout: +- iam_path_fini(path); +- return err; +-} +- +-EXPORT_SYMBOL(iam_delete); +- +-static int iam_leaf_update(handle_t *handle, struct iam_path *path, +- struct iam_key *k, struct iam_rec *r) ++static int dx_probe(struct dentry *dentry, struct inode *dir, ++ struct dx_hash_info *hinfo, struct iam_path *path) + { +- struct iam_leaf leaf; + int err; +- +- err = iam_leaf_init(path, &leaf); +- if (err) +- goto errout; ++ struct iam_path_compat *ipc; + +- err = iam_leaf_lookup(path, &leaf, k); +- if (err) +- goto errout; +- +- memcpy(iam_leaf_entry_at(path, leaf.at), r, path_descr(path)->id_rec_size); +- memcpy(iam_leaf_key_at(path, leaf.at), k, path_descr(path)->id_key_size); ++ assert_corr(path->ip_data != NULL); ++ ipc = container_of(path->ip_data, struct iam_path_compat, ipc_descr); ++ ipc->ipc_dentry = dentry; ++ ipc->ipc_hinfo = hinfo; + +- err = ext3_journal_dirty_metadata(handle, leaf.bh); +- if (err) +- ext3_std_error(path_obj(path)->i_sb, err); +-errout: +- iam_leaf_fini(&leaf); +- return err; +-} +-/* +- * Replace existing record with key @k, or insert new one. New record data are +- * in @r. +- * +- * Return values: 0: success, -ve: error. 
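The search loop in dx_lookup() above is a biased binary search: it settles frame->at on the last entry whose key does not exceed the target, skipping the node's leading slots (which is what `delta` accounts for), and the EXT3_INVARIANT_ON block cross-checks the result against a linear scan. A distilled sketch of the same invariant over a plain array; illustrative, not patch code:

/* Illustrative only: index of the last key[] element <= @target.
 * Slot 0 plays the minus-infinity sentinel, as in an index node,
 * so the search proper starts at slot 1. */
static unsigned last_le(const __u32 *key, unsigned count, __u32 target)
{
        unsigned p = 1, q = count - 1, m;

        while (p <= q) {
                m = p + (q - p) / 2;
                if (key[m] > target)
                        q = m - 1;
                else
                        p = m + 1;
        }
        return p - 1;
}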
+- * +- * postcondition: ergo(result == 0, iam_lookup(c, k, r2) > 0 && +- * !memcmp(r, r2, c->ic_descr->id_rec_size)); +- */ +-int iam_update(handle_t *h, struct iam_container *c, +- struct iam_key *k, struct iam_rec *r) +-{ +- struct dx_hash_info hinfo; +- struct iam_path_compat cpath; +- struct iam_path *path = &cpath.ipc_path; +- struct htree_cookie hc = { +- .hinfo = &hinfo +- }; +- int err, i; +- +- iam_path_init(path, c, &hc); +- for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i) +- path->ip_key_scratch[i] = +- (struct iam_key *)&cpath.ipc_scrach[i]; ++ assert_corr(dx_index_is_compat(path)); + err = dx_lookup(path); +- if (err) +- goto errout; +- +- err = iam_leaf_update(h, path, k, r); +-errout: +- iam_path_fini(path); ++ assert_corr(err != 0 || path->ip_frames[path->ip_indirect].bh != NULL); + return err; + } + +-EXPORT_SYMBOL(iam_update); +- + /* + * This function increments the frame pointer to search the next leaf + * block, and reads in the necessary intervening nodes if the search +@@ -1409,16 +373,15 @@ + * If start_hash is non-null, it will be filled in with the starting + * hash of the next page. + */ +-static int ext3_htree_next_block(struct inode *dir, __u32 hash, +- struct iam_path *path, __u32 *start_hash) ++static int ext3_htree_advance(struct inode *dir, __u32 hash, ++ struct iam_path *path, __u32 *start_hash, ++ int compat) + { + struct iam_frame *p; + struct buffer_head *bh; + int err, num_frames = 0; + __u32 bhash; + +- assert(dx_index_is_compat(path)); +- + p = path->ip_frame; + /* + * Find the next leaf page by incrementing the frame pointer. +@@ -1438,6 +401,10 @@ + --p; + } + ++ if (compat) { ++ /* ++ * Htree hash magic. ++ */ + /* + * If the hash is 1, then continue only if the next page has a + * continuation hash of any value. This is used for readdir +@@ -1445,19 +412,21 @@ + * desired contiuation hash. If it doesn't, return since + * there's no point to read in the successive index pages. 
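The continuation-hash rules spelled out here compress to one predicate. Delimiting hashes are stored with the low bit reserved as a collision marker, and callers that must always advance (readdir-style scans) pass a hash with the low bit set. A hedged restatement of the decision the loop makes; illustrative, not patch code:

/* Illustrative only: should the scan read the next leaf block?
 * @hash  - hash being searched for; low bit set forces traversal.
 * @bhash - delimiting hash of the next block from the index; its low
 *          bit set means "continuation of a collision chain". */
static int want_next_block(__u32 hash, __u32 bhash)
{
        if (hash & 1)                   /* unconditional scan mode */
                return 1;
        return (bhash & ~1) == hash;    /* same hash, possibly continued */
}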
+ */ +- dx_get_key(path, p->at, (struct iam_key *)&bhash); ++ iam_get_ikey(path, p->at, (struct iam_ikey *)&bhash); + if (start_hash) + *start_hash = bhash; + if ((hash & 1) == 0) { + if ((bhash & ~1) != hash) + return 0; + } ++ } + /* + * If the hash is HASH_NB_ALWAYS, we always go to the next + * block so no check is necessary + */ + while (num_frames--) { +- err = path_descr(path)->id_node_read(path->ip_container, ++ err = iam_path_descr(path)->id_ops-> ++ id_node_read(path->ip_container, + (iam_ptr_t)dx_get_block(path, p->at), + NULL, &bh); + if (err != 0) +@@ -1465,12 +434,23 @@ + ++p; + brelse (p->bh); + p->bh = bh; +- p->at = p->entries = dx_node_get_entries(path, p); +- assert(dx_node_check(path, p)); ++ p->entries = dx_node_get_entries(path, p); ++ p->at = iam_entry_shift(path, p->entries, !compat); ++ assert_inv(dx_node_check(path, p)); + } + return 1; + } + ++int iam_index_next(struct iam_container *c, struct iam_path *path) ++{ ++ return ext3_htree_advance(c->ic_object, 0, path, NULL, 0); ++} ++ ++int ext3_htree_next_block(struct inode *dir, __u32 hash, ++ struct iam_path *path, __u32 *start_hash) ++{ ++ return ext3_htree_advance(dir, hash, path, start_hash, 1); ++} + + /* + * p is at least 6 bytes before the end of page +@@ -1662,21 +642,30 @@ + } while(more); + } + +-static void dx_insert_block(struct iam_path *path, +- struct iam_frame *frame, u32 hash, u32 block) ++void iam_insert_key(struct iam_path *path, struct iam_frame *frame, ++ const struct iam_ikey *key, iam_ptr_t ptr) + { + struct iam_entry *entries = frame->entries; +- struct iam_entry *old = frame->at, *new = iam_entry_shift(path, old, +1); ++ struct iam_entry *new = iam_entry_shift(path, frame->at, +1); + int count = dx_get_count(entries); + +- assert(count < dx_get_limit(entries)); +- assert(old < iam_entry_shift(path, entries, count)); ++ assert_corr(count < dx_get_limit(entries)); ++ assert_corr(frame->at < iam_entry_shift(path, entries, count)); ++ + memmove(iam_entry_shift(path, new, 1), new, + (char *)iam_entry_shift(path, entries, count) - (char *)new); +- dx_set_key(path, new, (struct iam_key *)&hash); +- dx_set_block(path, new, block); ++ dx_set_ikey(path, new, key); ++ dx_set_block(path, new, ptr); + dx_set_count(entries, count + 1); + } ++ ++void dx_insert_block(struct iam_path *path, struct iam_frame *frame, ++ u32 hash, u32 block) ++{ ++ assert_corr(dx_index_is_compat(path)); ++ iam_insert_key(path, frame, (struct iam_ikey *)&hash, block); ++} ++ + #endif + + +@@ -1903,7 +892,8 @@ + hash = hinfo.hash; + do { + block = dx_get_block(path, path->ip_frame->at); +- *err = path_descr(path)->id_node_read(path->ip_container, (iam_ptr_t)block, ++ *err = iam_path_descr(path)->id_ops->id_node_read(path->ip_container, ++ (iam_ptr_t)block, + NULL, &bh); + if (*err != 0) + goto errout; +@@ -2093,22 +1083,69 @@ + return prev; + } + ++struct ext3_dir_entry_2 *move_entries(struct inode *dir, ++ struct dx_hash_info *hinfo, ++ struct buffer_head **bh1, ++ struct buffer_head **bh2, ++ __u32 *delim_hash) ++{ ++ char *data1; ++ char *data2; ++ unsigned blocksize = dir->i_sb->s_blocksize; ++ unsigned count; ++ unsigned continued; ++ unsigned split; ++ u32 hash2; ++ ++ struct dx_map_entry *map; ++ struct ext3_dir_entry_2 *de1; ++ struct ext3_dir_entry_2 *de2; ++ ++ data1 = (*bh1)->b_data; ++ data2 = (*bh2)->b_data; ++ ++ /* create map in the end of data2 block */ ++ map = (struct dx_map_entry *) (data2 + blocksize); ++ count = dx_make_map((struct ext3_dir_entry_2 *) data1, ++ blocksize, hinfo, map); ++ map -= count; ++ 
split = count/2; // need to adjust to actual middle ++ dx_sort_map(map, count); ++ hash2 = map[split].hash; ++ continued = hash2 == map[split - 1].hash; ++ dxtrace(printk("Split block %i at %x, %i/%i\n", ++ dx_get_block(frame->at), hash2, split, count - split)); ++ ++ /* Fancy dance to stay within two buffers */ ++ de2 = dx_move_dirents(data1, data2, map + split, count - split); ++ de1 = dx_pack_dirents(data1, blocksize); ++ de1->rec_len = cpu_to_le16(data1 + blocksize - (char *) de1); ++ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); ++ dxtrace(dx_show_leaf(hinfo, ++ (struct ext3_dir_entry_2 *) data1, blocksize, 1)); ++ dxtrace(dx_show_leaf(hinfo, ++ (struct ext3_dir_entry_2 *) data2, blocksize, 1)); ++ ++ /* Which block gets the new entry? */ ++ if (hinfo->hash >= hash2) { ++ swap(*bh1, *bh2); ++ de1 = de2; ++ } ++ *delim_hash = hash2 + continued; ++ return de1; ++} ++ + /* Allocate new node, and split leaf node @bh into it, inserting new pointer + * into parent node identified by @frame */ + static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct iam_path *path, + struct buffer_head **bh,struct iam_frame *frame, + struct dx_hash_info *hinfo, int *error) + { +- struct inode *dir = path_obj(path); +- unsigned blocksize = dir->i_sb->s_blocksize; +- unsigned count, continued; ++ struct inode *dir = iam_path_obj(path); + struct buffer_head *bh2; + u32 newblock; + u32 hash2; +- struct dx_map_entry *map; +- char *data1 = (*bh)->b_data, *data2; +- unsigned split; +- struct ext3_dir_entry_2 *de = NULL, *de2; ++ struct ext3_dir_entry_2 *de = NULL; + int err; + + bh2 = ext3_append (handle, dir, &newblock, error); +@@ -2133,35 +1170,9 @@ + if (err) + goto journal_error; + +- data2 = bh2->b_data; +- +- /* create map in the end of data2 block */ +- map = (struct dx_map_entry *) (data2 + blocksize); +- count = dx_make_map ((struct ext3_dir_entry_2 *) data1, +- blocksize, hinfo, map); +- map -= count; +- split = count/2; // need to adjust to actual middle +- dx_sort_map (map, count); +- hash2 = map[split].hash; +- continued = hash2 == map[split - 1].hash; +- dxtrace(printk("Split block %i at %x, %i/%i\n", +- dx_get_block(frame->at), hash2, split, count-split)); +- +- /* Fancy dance to stay within two buffers */ +- de2 = dx_move_dirents(data1, data2, map + split, count - split); +- de = dx_pack_dirents(data1,blocksize); +- de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); +- de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); +- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); +- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); ++ de = move_entries(dir, hinfo, bh, &bh2, &hash2); + +- /* Which block gets the new entry? 
*/ +- if (hinfo->hash >= hash2) +- { +- swap(*bh, bh2); +- de = de2; +- } +- dx_insert_block(path, frame, hash2 + continued, newblock); ++ dx_insert_block(path, frame, hash2, newblock); + err = ext3_journal_dirty_metadata (handle, bh2); + if (err) + goto journal_error; +@@ -2175,6 +1186,63 @@ + } + #endif + ++struct ext3_dir_entry_2 *find_insertion_point(struct inode *dir, ++ struct buffer_head *bh, ++ const char *name, int namelen) ++{ ++ struct ext3_dir_entry_2 *de; ++ char *top; ++ unsigned long offset; ++ int nlen; ++ int rlen; ++ int reclen; ++ ++ reclen = EXT3_DIR_REC_LEN(namelen); ++ de = (struct ext3_dir_entry_2 *)bh->b_data; ++ top = bh->b_data + dir->i_sb->s_blocksize - reclen; ++ offset = 0; ++ while ((char *) de <= top) { ++ if (!ext3_check_dir_entry("ext3_add_entry", ++ dir, de, bh, offset)) ++ return ERR_PTR(-EIO); ++ if (ext3_match(namelen, name, de)) ++ return ERR_PTR(-EEXIST); ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ if ((de->inode? rlen - nlen: rlen) >= reclen) ++ return de; ++ de = (struct ext3_dir_entry_2 *)((char *)de + rlen); ++ offset += rlen; ++ } ++ return ERR_PTR(-ENOSPC); ++} ++ ++struct ext3_dir_entry_2 *split_entry(struct inode *dir, ++ struct ext3_dir_entry_2 *de, ++ unsigned long ino, mode_t mode, ++ const char *name, int namelen) ++{ ++ int nlen; ++ int rlen; ++ ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ if (de->inode) { ++ struct ext3_dir_entry_2 *de1; ++ ++ de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); ++ de1->rec_len = cpu_to_le16(rlen - nlen); ++ de->rec_len = cpu_to_le16(nlen); ++ de = de1; ++ } ++ de->file_type = EXT3_FT_UNKNOWN; ++ de->inode = cpu_to_le32(ino); ++ if (ino != 0) ++ ext3_set_de_type(dir->i_sb, de, mode); ++ de->name_len = namelen; ++ memcpy(de->name, name, namelen); ++ return de; ++} + + /* + * Add a new entry into a directory (leaf) block. If de is non-NULL, +@@ -2194,34 +1262,16 @@ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; +- unsigned long offset = 0; +- unsigned short reclen; +- int nlen, rlen, err; +- char *top; ++ int err; + +- reclen = EXT3_DIR_REC_LEN(namelen); + if (!de) { +- de = (struct ext3_dir_entry_2 *)bh->b_data; +- top = bh->b_data + dir->i_sb->s_blocksize - reclen; +- while ((char *) de <= top) { +- if (!ext3_check_dir_entry("ext3_add_entry", dir, de, +- bh, offset)) { +- brelse (bh); +- return -EIO; +- } +- if (ext3_match (namelen, name, de)) { +- brelse (bh); +- return -EEXIST; +- } +- nlen = EXT3_DIR_REC_LEN(de->name_len); +- rlen = le16_to_cpu(de->rec_len); +- if ((de->inode? 
rlen - nlen: rlen) >= reclen) +- break; +- de = (struct ext3_dir_entry_2 *)((char *)de + rlen); +- offset += rlen; ++ de = find_insertion_point(dir, bh, name, namelen); ++ if (IS_ERR(de)) { ++ err = PTR_ERR(de); ++ if (err != -ENOSPC) ++ brelse(bh); ++ return err; + } +- if ((char *) de > top) +- return -ENOSPC; + } + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); +@@ -2232,22 +1282,9 @@ + } + + /* By now the buffer is marked for journaling */ +- nlen = EXT3_DIR_REC_LEN(de->name_len); +- rlen = le16_to_cpu(de->rec_len); +- if (de->inode) { +- struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); +- de1->rec_len = cpu_to_le16(rlen - nlen); +- de->rec_len = cpu_to_le16(nlen); +- de = de1; +- } +- de->file_type = EXT3_FT_UNKNOWN; +- if (inode) { +- de->inode = cpu_to_le32(inode->i_ino); +- ext3_set_de_type(dir->i_sb, de, inode->i_mode); +- } else +- de->inode = 0; +- de->name_len = namelen; +- memcpy (de->name, name, namelen); ++ ++ split_entry(dir, de, inode ? inode->i_ino : 0, ++ inode ? inode->i_mode : 0, name, namelen); + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend +@@ -2423,8 +1460,40 @@ + return add_dirent_to_buf(handle, dentry, inode, de, bh); + } + ++static int shift_entries(struct iam_path *path, ++ struct iam_frame *frame, unsigned count, ++ struct iam_entry *entries, struct iam_entry *entries2, ++ u32 newblock) ++{ ++ unsigned count1; ++ unsigned count2; ++ int delta; ++ ++ struct iam_frame *parent = frame - 1; ++ struct iam_ikey *pivot = iam_path_ikey(path, 3); ++ ++ delta = dx_index_is_compat(path) ? 0 : +1; ++ ++ count1 = count/2 + delta; ++ count2 = count - count1; ++ iam_get_ikey(path, iam_entry_shift(path, entries, count1), pivot); ++ ++ dxtrace(printk("Split index %i/%i\n", count1, count2)); ++ ++ memcpy((char *) iam_entry_shift(path, entries2, delta), ++ (char *) iam_entry_shift(path, entries, count1), ++ count2 * iam_entry_size(path)); ++ ++ dx_set_count(entries, count1); ++ dx_set_count(entries2, count2 + delta); ++ dx_set_limit(entries2, dx_node_limit(path)); ++ ++ iam_insert_key(path, parent, pivot, newblock); ++ return count1; ++} ++ + #ifdef CONFIG_EXT3_INDEX +-static int split_index_node(handle_t *handle, struct iam_path *path) ++int split_index_node(handle_t *handle, struct iam_path *path) + { + + struct iam_entry *entries; /* old block contents */ +@@ -2432,10 +1501,17 @@ + struct iam_frame *frame, *safe; + struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0}; + u32 newblock[DX_MAX_TREE_HEIGHT] = {0}; +- struct inode *dir = path_obj(path); ++ struct inode *dir = iam_path_obj(path); ++ struct iam_descr *descr; + int nr_splet; + int i, err; + ++ descr = iam_path_descr(path); ++ /* ++ * Algorithm below depends on this. 
++ */ ++ assert_corr(dx_root_limit(path) < dx_node_limit(path)); ++ + frame = path->ip_frame; + entries = frame->entries; + +@@ -2474,7 +1550,8 @@ + for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) { + bh_new[i] = ext3_append (handle, dir, &newblock[i], &err); + if (!bh_new[i] || +- path_descr(path)->id_node_init(path->ip_container, bh_new[i], 0) != 0) ++ descr->id_ops->id_node_init(path->ip_container, ++ bh_new[i], 0) != 0) + goto cleanup; + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, frame->bh); +@@ -2493,6 +1570,7 @@ + unsigned count; + int idx; + struct buffer_head *bh2; ++ struct buffer_head *bh; + + entries = frame->entries; + count = dx_get_count(entries); +@@ -2501,6 +1579,7 @@ + bh2 = bh_new[i]; + entries2 = dx_get_entries(path, bh2->b_data, 0); + ++ bh = frame->bh; + if (frame == path->ip_frames) { + /* splitting root node. Tricky point: + * +@@ -2512,22 +1591,20 @@ + * capacity of the root node is smaller than that of + * non-root one. + */ +- struct dx_root *root; +- u8 indirects; + struct iam_frame *frames; ++ struct iam_entry *next; ++ ++ assert_corr(i == 0); + + frames = path->ip_frames; +- root = (struct dx_root *) frames->bh->b_data; +- indirects = root->info.indirect_levels; +- dxtrace(printk("Creating new root %d\n", indirects)); + memcpy((char *) entries2, (char *) entries, + count * iam_entry_size(path)); + dx_set_limit(entries2, dx_node_limit(path)); + + /* Set up root */ +- dx_set_count(entries, 1); +- dx_set_block(path, entries, newblock[i]); +- root->info.indirect_levels = indirects + 1; ++ next = descr->id_ops->id_root_inc(path->ip_container, ++ path, frame); ++ dx_set_block(path, next, newblock[0]); + + /* Shift frames in the path */ + memmove(frames + 2, frames + 1, +@@ -2536,49 +1613,61 @@ + frames[1].at = iam_entry_shift(path, entries2, idx); + frames[1].entries = entries = entries2; + frames[1].bh = bh2; +- assert(dx_node_check(path, frame)); ++ assert_inv(dx_node_check(path, frame)); ++ ++ path->ip_frame; + ++ frame; +- assert(dx_node_check(path, frame)); +- bh_new[i] = NULL; /* buffer head is "consumed" */ ++ assert_inv(dx_node_check(path, frame)); ++ bh_new[0] = NULL; /* buffer head is "consumed" */ + err = ext3_journal_get_write_access(handle, bh2); + if (err) + goto journal_error; + } else { + /* splitting non-root index node. */ +- unsigned count1 = count/2, count2 = count - count1; +- unsigned hash2; +- +- dx_get_key(path, +- iam_entry_shift(path, entries, count1), +- (struct iam_key *)&hash2); +- +- dxtrace(printk("Split index %i/%i\n", count1, count2)); +- +- memcpy ((char *) entries2, +- (char *) iam_entry_shift(path, entries, count1), +- count2 * iam_entry_size(path)); +- dx_set_count (entries, count1); +- dx_set_count (entries2, count2); +- dx_set_limit (entries2, dx_node_limit(path)); ++ struct iam_frame *parent = frame - 1; + ++ count = shift_entries(path, frame, count, ++ entries, entries2, newblock[i]); + /* Which index block gets the new entry? */ +- if (idx >= count1) { ++ if (idx >= count) { ++ int d = dx_index_is_compat(path) ? 
0 : +1; ++ + frame->at = iam_entry_shift(path, entries2, +- idx - count1); ++ idx - count + d); + frame->entries = entries = entries2; + swap(frame->bh, bh2); + bh_new[i] = bh2; ++ parent->at = iam_entry_shift(path, ++ parent->at, +1); + } +- dx_insert_block(path, frame - 1, hash2, newblock[i]); +- assert(dx_node_check(path, frame)); +- assert(dx_node_check(path, frame - 1)); ++ assert_inv(dx_node_check(path, frame)); ++ assert_inv(dx_node_check(path, parent)); + dxtrace(dx_show_index ("node", frame->entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) + goto journal_error; ++ err = ext3_journal_dirty_metadata(handle, parent->bh); ++ if (err) ++ goto journal_error; ++ } ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ goto journal_error; ++ /* ++ * This function was called to make insertion of new leaf ++ * possible. Check that it fulfilled its obligations. ++ */ ++ assert_corr(dx_get_count(path->ip_frame->entries) < ++ dx_get_limit(path->ip_frame->entries)); + } ++ if (nr_splet > 0) { ++ /* ++ * Log ->i_size modification. ++ */ ++ err = ext3_mark_inode_dirty(handle, dir); ++ if (err) ++ goto journal_error; + } + goto cleanup; + journal_error: +@@ -2610,7 +1699,7 @@ + size_t isize; + + iam_path_compat_init(&cpath, dir); +- param = path_descr(path); ++ param = iam_path_descr(path); + + err = dx_probe(dentry, NULL, &hinfo, path); + if (err != 0) +@@ -2620,7 +1709,8 @@ + /* XXX nikita: global serialization! */ + isize = dir->i_size; + +- err = param->id_node_read(path->ip_container, (iam_ptr_t)dx_get_block(path, frame->at), ++ err = param->id_ops->id_node_read(path->ip_container, ++ (iam_ptr_t)dx_get_block(path, frame->at), + handle, &bh); + if (err != 0) + goto cleanup; +@@ -2641,11 +1731,11 @@ + goto cleanup; + + /*copy split inode too*/ +- de = do_split(handle, path, &bh, --frame, &hinfo, &err); ++ de = do_split(handle, path, &bh, path->ip_frame, &hinfo, &err); + if (!de) + goto cleanup; + +- assert(dx_node_check(path, frame)); ++ assert_inv(dx_node_check(path, frame)); + err = add_dirent_to_buf(handle, dentry, inode, de, bh); + goto cleanup2; + +@@ -2752,6 +1842,26 @@ + return ext3_new_inode(handle, dir, mode, inum); + } + ++struct inode *ext3_create_inode(handle_t *handle, struct inode * dir, int mode) ++{ ++ struct inode *inode; ++ ++ inode = ext3_new_inode(handle, dir, mode, 0); ++ if (!IS_ERR(inode)) { ++ if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode)) { ++#ifdef CONFIG_LDISKFS_FS_XATTR ++ inode->i_op = &ext3_special_inode_operations; ++#endif ++ } else { ++ inode->i_op = &ext3_file_inode_operations; ++ inode->i_fop = &ext3_file_operations; ++ ext3_set_aops(inode); ++ } ++ } ++ return inode; ++} ++EXPORT_SYMBOL(ext3_create_inode); ++ + /* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it +Index: iam/fs/ext3/Makefile +=================================================================== +--- iam.orig/fs/ext3/Makefile 2007-05-23 11:18:11.000000000 +0800 ++++ iam/fs/ext3/Makefile 2007-05-23 11:18:20.000000000 +0800 +@@ -6,7 +6,7 @@ + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o \ +- extents.o mballoc.o ++ extents.o mballoc.o iam.o iam_lfix.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: iam/fs/ext3/iam_lvar.c 
+=================================================================== +--- iam.orig/fs/ext3/iam_lvar.c 2007-05-23 09:56:30.476305206 +0800 ++++ iam/fs/ext3/iam_lvar.c 2007-05-23 11:19:15.000000000 +0800 +@@ -0,0 +1,1080 @@ ++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- ++ * vim:expandtab:shiftwidth=8:tabstop=8: ++ * ++ * iam_lvar.c ++ * implementation of iam format for fixed size records, variable sized keys. ++ * ++ * Copyright (c) 2006 Cluster File Systems, Inc. ++ * Author: Nikita Danilov ++ * ++ * This file is part of the Lustre file system, http://www.lustre.org ++ * Lustre is a trademark of Cluster File Systems, Inc. ++ * ++ * You may have signed or agreed to another license before downloading ++ * this software. If so, you are bound by the terms and conditions ++ * of that agreement, and the following does not apply to you. See the ++ * LICENSE file included with this distribution for more information. ++ * ++ * If you did not agree to a different license, then this copy of Lustre ++ * is open source software; you can redistribute it and/or modify it ++ * under the terms of version 2 of the GNU General Public License as ++ * published by the Free Software Foundation. ++ * ++ * In either case, Lustre is distributed in the hope that it will be ++ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty ++ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * license text for more details. ++ */ ++ ++#include ++#include ++/* ext3_error() */ ++#include ++ ++#include ++ ++#include ++#include ++ ++/* ++ * Leaf operations. ++ */ ++ ++enum { ++ IAM_LVAR_LEAF_MAGIC = 0x1973 /* This is duplicated in ++ * lustre/utils/create_iam.c */ ++}; ++ ++/* This is duplicated in lustre/utils/create_iam.c */ ++struct lvar_leaf_header { ++ __le16 vlh_magic; /* magic number IAM_LVAR_LEAF_MAGIC */ ++ __le16 vlh_used; /* used bytes, including header */ ++}; ++ ++/* ++ * Format of leaf entry: ++ * ++ * __le16 keysize ++ * u8 key[keysize] ++ * u8 record[rec_size] ++ * ++ * Entries are ordered in key order. 
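As the struct below shows, each stored entry is additionally prefixed by its 32-bit hash, and iteration needs no entry count: the header records the used byte total and every entry carries its own size. A sketch of the walk this layout implies; illustrative only, and where the patch's rec_size() reads the record size from the record's first byte, here it is simply passed in:

/* Illustrative only: count the entries in one lvar leaf block.
 * @used is le16_to_cpu(hdr->vlh_used); @rs is the record size. */
static int lvar_count(const char *block, int used, int rs)
{
        const char *scan = block + sizeof(struct lvar_leaf_header);
        const char *end  = block + used;
        int n = 0;

        while (scan < end) {
                int ks = le16_to_cpu(*(const __le16 *)(scan + 4));

                /* 6 = offsetof(vle_key); round up to LVAR_PAD (4) */
                scan += (6 + ks + rs + 3) & ~3;
                ++n;
        }
        return n;
}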
++ */ ++ ++/* This is duplicated in lustre/utils/create_iam.c */ ++typedef __u32 lvar_hash_t; ++ ++/* This is duplicated in lustre/utils/create_iam.c */ ++struct lvar_leaf_entry { ++ __le32 vle_hash; ++ __le16 vle_keysize; ++ u8 vle_key[0]; ++}; ++ ++#define PDIFF(ptr0, ptr1) (((char *)(ptr0)) - ((char *)(ptr1))) ++ ++ ++static inline int blocksize(const struct iam_leaf *leaf) ++{ ++ return iam_leaf_container(leaf)->ic_object->i_sb->s_blocksize; ++} ++ ++static inline const char *kchar(const struct iam_key *key) ++{ ++ return (void *)key; ++} ++ ++static inline struct iam_lentry *lvar_lentry(const struct lvar_leaf_entry *ent) ++{ ++ return (struct iam_lentry *)ent; ++} ++ ++static inline struct lvar_leaf_entry *lentry_lvar(const struct iam_lentry *lent) ++{ ++ return (struct lvar_leaf_entry *)lent; ++} ++ ++ ++static inline int e_keysize(const struct lvar_leaf_entry *ent) ++{ ++ return le16_to_cpu(ent->vle_keysize); ++} ++ ++/* This is duplicated in lustre/utils/create_iam.c */ ++enum { ++ LVAR_PAD = 4, ++ LVAR_ROUND = LVAR_PAD - 1 ++}; ++ ++static inline int getsize(const struct iam_leaf *leaf, int namelen, int recsize) ++{ ++ CLASSERT(!(LVAR_PAD & (LVAR_PAD - 1))); ++ ++ return (offsetof(struct lvar_leaf_entry, vle_key) + ++ namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND; ++} ++ ++static inline int rec_size(const struct iam_rec *rec) ++{ ++ return *(const char *)rec; ++} ++ ++static inline struct iam_rec *e_rec(const struct lvar_leaf_entry *ent) ++{ ++ return ((void *)ent) + ++ offsetof(struct lvar_leaf_entry, vle_key) + e_keysize(ent); ++} ++ ++static inline int e_size(const struct iam_leaf *leaf, ++ const struct lvar_leaf_entry *ent) ++{ ++ return getsize(leaf, e_keysize(ent), rec_size(e_rec(ent))); ++} ++ ++static inline char *e_char(const struct lvar_leaf_entry *ent) ++{ ++ return (char *)&ent->vle_key; ++} ++ ++static inline struct iam_key *e_key(const struct lvar_leaf_entry *ent) ++{ ++ return (struct iam_key *)e_char(ent); ++} ++ ++static inline lvar_hash_t e_hash(const struct lvar_leaf_entry *ent) ++{ ++ return le32_to_cpu(ent->vle_hash); ++} ++ ++static void e_print(const struct lvar_leaf_entry *ent) ++{ ++ printk(" %p %8.8x \"%*.*s\"\n", ent, e_hash(ent), ++ e_keysize(ent), e_keysize(ent), e_char(ent)); ++} ++#if 0 ++static int e_check(const struct iam_leaf *leaf, ++ const struct lvar_leaf_entry *ent) ++{ ++ const void *point = ent; ++ const void *start = leaf->il_bh->b_data; ++ return ++ start + sizeof(struct lvar_leaf_header) <= point && ++ point + e_size(leaf, ent) < start + blocksize(leaf); ++} ++#endif ++ ++static inline struct lvar_leaf_entry *e_next(const struct iam_leaf *leaf, ++ const struct lvar_leaf_entry *ent) ++{ ++ return ((void *)ent) + e_size(leaf, ent); ++} ++ ++#define LVAR_HASH_SANDWICH (0) ++#define LVAR_HASH_TEA (1) ++#define LVAR_HASH_R5 (0) ++#define LVAR_HASH_PREFIX (0) ++ ++static __u32 hash_build0(const char *name, int namelen) ++{ ++ __u32 result; ++ ++ if (namelen == 0) ++ return 0; ++ if (strncmp(name, ".", 1) == 0 && namelen == 1) ++ return 1; ++ if (strncmp(name, "..", 2) == 0 && namelen == 2) ++ return 2; ++ ++ if (LVAR_HASH_PREFIX) { ++ result = 0; ++ strncpy((void *)&result, ++ name, min(namelen, (int)sizeof result)); ++ } else { ++ struct dx_hash_info hinfo; ++ ++ if (LVAR_HASH_TEA) ++ hinfo.hash_version = DX_HASH_TEA; ++ else ++ hinfo.hash_version = DX_HASH_R5; ++ hinfo.seed = 0; ++ ext3fs_dirhash(name, namelen, &hinfo); ++ result = hinfo.hash; ++ if (LVAR_HASH_SANDWICH) { ++ __u32 result2; ++ ++ hinfo.hash_version = DX_HASH_TEA; ++ hinfo.seed 
= 0; ++ ext3fs_dirhash(name, namelen, &hinfo); ++ result2 = hinfo.hash; ++ result = (0xfc000000 & result2) | (0x03ffffff & result); ++ } ++ } ++ return result; ++} ++ ++enum { ++ HASH_GRAY_AREA = 1024, ++ MAX_HASH_SIZE = 0x7fffffffUL ++}; ++ ++static __u32 hash_build(const char *name, int namelen) ++{ ++ __u32 hash; ++ ++ hash = (hash_build0(name, namelen) << 1) & MAX_HASH_SIZE; ++ if (hash > MAX_HASH_SIZE - HASH_GRAY_AREA) ++ hash &= HASH_GRAY_AREA - 1; ++ return hash; ++} ++ ++static inline lvar_hash_t get_hash(const struct iam_container *bag, ++ const char *name, int namelen) ++{ ++ return hash_build(name, namelen); ++} ++ ++static inline int e_eq(const struct lvar_leaf_entry *ent, ++ const char *name, int namelen) ++{ ++ return namelen == e_keysize(ent) && !memcmp(e_char(ent), name, namelen); ++} ++ ++static inline int e_cmp(const struct iam_leaf *leaf, ++ const struct lvar_leaf_entry *ent, lvar_hash_t hash) ++{ ++ lvar_hash_t ehash; ++ ++ ehash = e_hash(ent); ++ return ehash == hash ? 0 : (ehash < hash ? -1 : +1); ++} ++ ++static struct lvar_leaf_header *n_head(const struct iam_leaf *l) ++{ ++ return (struct lvar_leaf_header *)l->il_bh->b_data; ++} ++ ++static int h_used(const struct lvar_leaf_header *hdr) ++{ ++ return le16_to_cpu(hdr->vlh_used); ++} ++ ++static void h_used_adj(const struct iam_leaf *leaf, ++ struct lvar_leaf_header *hdr, int adj) ++{ ++ int used; ++ ++ used = h_used(hdr) + adj; ++ assert_corr(sizeof *hdr <= used && used <= blocksize(leaf)); ++ hdr->vlh_used = cpu_to_le16(used); ++} ++ ++static struct lvar_leaf_entry *n_start(const struct iam_leaf *leaf) ++{ ++ return (void *)leaf->il_bh->b_data + sizeof(struct lvar_leaf_header); ++} ++ ++static struct lvar_leaf_entry *n_end(const struct iam_leaf *l) ++{ ++ return (void *)l->il_bh->b_data + h_used(n_head(l)); ++} ++ ++static struct lvar_leaf_entry *n_cur(const struct iam_leaf *l) ++{ ++ return lentry_lvar(l->il_at); ++} ++ ++void n_print(const struct iam_leaf *l) ++{ ++ struct lvar_leaf_entry *scan; ++ ++ printk(KERN_EMERG "used: %d\n", h_used(n_head(l))); ++ for (scan = n_start(l); scan < n_end(l); scan = e_next(l, scan)) ++ e_print(scan); ++} ++ ++#if EXT3_CORRECTNESS_ON ++static int n_at_rec(const struct iam_leaf *folio) ++{ ++ return ++ n_start(folio) <= lentry_lvar(folio->il_at) && ++ lentry_lvar(folio->il_at) < n_end(folio); ++} ++ ++#if EXT3_INVARIANT_ON ++static int n_invariant(const struct iam_leaf *leaf) ++{ ++ struct iam_path *path; ++ struct lvar_leaf_entry *scan; ++ struct lvar_leaf_entry *end; ++ lvar_hash_t hash; ++ lvar_hash_t nexthash; ++ lvar_hash_t starthash; ++ ++ end = n_end(leaf); ++ hash = 0; ++ path = leaf->il_path; ++ ++ if (h_used(n_head(leaf)) > blocksize(leaf)) ++ return 0; ++ ++ /* ++ * Delimiting key in the parent index node. Clear least bit to account ++ * for hash collision marker. ++ */ ++ starthash = *(lvar_hash_t *)iam_ikey_at(path, path->ip_frame->at) & ~1; ++ for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) { ++ nexthash = e_hash(scan); ++ if (nexthash != get_hash(iam_leaf_container(leaf), ++ e_char(scan), e_keysize(scan))) { ++ BREAKPOINT(); ++ return 0; ++ } ++ if (0 && nexthash < starthash) { ++ /* ++ * Unfortunately this useful invariant cannot be ++ * reliably checked as parent node is nor necessarily ++ * locked. 
++ */ ++ n_print(leaf); ++ printk("%#x < %#x\n", nexthash, starthash); ++ dump_stack(); ++ return 0; ++ } ++ if (nexthash < hash) { ++ BREAKPOINT(); ++ return 0; ++ } ++ hash = nexthash; ++ } ++ if (scan != end) { ++ BREAKPOINT(); ++ return 0; ++ } ++ return 1; ++} ++/* EXT3_INVARIANT_ON */ ++#endif ++ ++/* EXT3_CORRECTNESS_ON */ ++#endif ++ ++static struct iam_ikey *lvar_ikey(const struct iam_leaf *l, ++ struct iam_ikey *key) ++{ ++ lvar_hash_t *hash; ++ ++ assert_corr(n_at_rec(l)); ++ ++ hash = (void *)key; ++ *hash = e_hash(n_cur(l)); ++ return key; ++} ++ ++static struct iam_key *lvar_key(const struct iam_leaf *l) ++{ ++ return e_key(n_cur(l)); ++} ++ ++static int lvar_key_size(const struct iam_leaf *l) ++{ ++ return e_keysize(n_cur(l)); ++} ++ ++static void lvar_start(struct iam_leaf *l) ++{ ++ l->il_at = lvar_lentry(n_start(l)); ++} ++ ++static int lvar_init(struct iam_leaf *l) ++{ ++ int result; ++ int used; ++ struct lvar_leaf_header *head; ++ ++ assert_corr(l->il_bh != NULL); ++ ++ head = n_head(l); ++ used = h_used(head); ++ if (head->vlh_magic == le16_to_cpu(IAM_LVAR_LEAF_MAGIC) && ++ used <= blocksize(l)) { ++ l->il_at = l->il_entries = lvar_lentry(n_start(l)); ++ result = 0; ++ } else { ++ struct inode *obj; ++ ++ obj = iam_leaf_container(l)->ic_object; ++ ext3_error(obj->i_sb, __FUNCTION__, ++ "Wrong magic in node %llu (#%lu): %#x != %#x or " ++ "wrong used: %i", ++ (unsigned long long)l->il_bh->b_blocknr, obj->i_ino, ++ head->vlh_magic, le16_to_cpu(IAM_LVAR_LEAF_MAGIC), ++ used); ++ result = -EIO; ++ } ++ return result; ++} ++ ++static void lvar_fini(struct iam_leaf *l) ++{ ++ l->il_entries = l->il_at = NULL; ++} ++ ++struct iam_rec *lvar_rec(const struct iam_leaf *l) ++{ ++ assert_corr(n_at_rec(l)); ++ return e_rec(n_cur(l)); ++} ++ ++static void lvar_next(struct iam_leaf *l) ++{ ++ assert_corr(n_at_rec(l)); ++ assert_corr(iam_leaf_is_locked(l)); ++ l->il_at = lvar_lentry(e_next(l, n_cur(l))); ++} ++ ++static int lvar_lookup(struct iam_leaf *leaf, const struct iam_key *k) ++{ ++ struct lvar_leaf_entry *found; ++ struct lvar_leaf_entry *scan; ++ struct lvar_leaf_entry *end; ++ int result; ++ const char *name; ++ int namelen; ++ int found_equal; ++ lvar_hash_t hash; ++ int last; ++ ++ assert_inv(n_invariant(leaf)); ++ end = n_end(leaf); ++ ++ name = kchar(k); ++ namelen = strlen(name); ++ hash = get_hash(iam_leaf_container(leaf), name, namelen); ++ found = NULL; ++ found_equal = 0; ++ last = 1; ++ ++ for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) { ++ lvar_hash_t scan_hash; ++ ++ scan_hash = e_hash(scan); ++ if (scan_hash < hash) ++ found = scan; ++ else if (scan_hash == hash) { ++ if (e_eq(scan, name, namelen)) { ++ /* ++ * perfect match ++ */ ++ leaf->il_at = lvar_lentry(scan); ++ return IAM_LOOKUP_EXACT; ++ } else if (!found_equal) { ++ found = scan; ++ found_equal = 1; ++ } ++ } else { ++ last = 0; ++ break; ++ } ++ } ++ if (found == NULL) { ++ /* ++ * @k is less than all hashes in the leaf. 
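Since distinct names may share an lvar_hash_t, the scan above must distinguish a hash match from a true match. The decision it makes at each entry, restated in isolation; illustrative only:

/* Illustrative only: one step of lvar_lookup()'s scan. */
enum scan_step { SCAN_LOW, SCAN_COLLISION, SCAN_PAST, SCAN_EXACT };

static enum scan_step classify(__u32 scan_hash, __u32 hash, int names_equal)
{
        if (scan_hash < hash)
                return SCAN_LOW;        /* remember as best-so-far */
        if (scan_hash > hash)
                return SCAN_PAST;       /* entries sorted by hash: stop */
        return names_equal ? SCAN_EXACT /* perfect match */
                           : SCAN_COLLISION;
}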
++ */ ++ lvar_start(leaf); ++ result = IAM_LOOKUP_BEFORE; ++ } else { ++ leaf->il_at = lvar_lentry(found); ++ result = IAM_LOOKUP_OK; ++ assert_corr(n_at_rec(leaf)); ++ } ++ if (last) ++ result |= IAM_LOOKUP_LAST; ++ assert_inv(n_invariant(leaf)); ++ ++ return result; ++} ++ ++static int lvar_ilookup(struct iam_leaf *leaf, const struct iam_ikey *ik) ++{ ++ struct lvar_leaf_entry *scan; ++ struct lvar_leaf_entry *end; ++ lvar_hash_t hash; ++ ++ assert_inv(n_invariant(leaf)); ++ end = n_end(leaf); ++ hash = *(const lvar_hash_t *)ik; ++ ++ lvar_start(leaf); ++ for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) { ++ lvar_hash_t scan_hash; ++ ++ scan_hash = e_hash(scan); ++ if (scan_hash > hash) ++ return scan == n_start(leaf) ? ++ IAM_LOOKUP_BEFORE : IAM_LOOKUP_OK; ++ leaf->il_at = lvar_lentry(scan); ++ if (scan_hash == hash) ++ return IAM_LOOKUP_EXACT; ++ } ++ assert_inv(n_invariant(leaf)); ++ /* ++ * @ik is greater than any key in the node. Return last record in the ++ * node. ++ */ ++ return IAM_LOOKUP_OK; ++} ++ ++static void __lvar_key_set(struct iam_leaf *l, const struct iam_key *k) ++{ ++ memcpy(e_key(n_cur(l)), k, e_keysize(n_cur(l))); ++} ++ ++static void lvar_key_set(struct iam_leaf *l, const struct iam_key *k) ++{ ++ assert_corr(n_at_rec(l)); ++ assert_corr(strlen(kchar(k)) == e_keysize(n_cur(l))); ++ assert_corr(iam_leaf_is_locked(l)); ++ __lvar_key_set(l, k); ++ assert_inv(n_invariant(l)); ++} ++ ++static int lvar_key_cmp(const struct iam_leaf *l, const struct iam_key *k) ++{ ++ lvar_hash_t hash; ++ const char *name; ++ ++ name = kchar(k); ++ ++ hash = get_hash(iam_leaf_container(l), name, strlen(name)); ++ return e_cmp(l, n_cur(l), hash); ++} ++ ++static int lvar_key_eq(const struct iam_leaf *l, const struct iam_key *k) ++{ ++ const char *name; ++ ++ name = kchar(k); ++ return e_eq(n_cur(l), name, strlen(name)); ++} ++ ++static void __lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r) ++{ ++ memcpy(e_rec(n_cur(l)), r, rec_size(r)); ++} ++ ++static void lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r) ++{ ++ assert_corr(n_at_rec(l)); ++ assert_corr(iam_leaf_is_locked(l)); ++ __lvar_rec_set(l, r); ++ assert_inv(n_invariant(l)); ++} ++ ++static void lvar_rec_get(const struct iam_leaf *l, struct iam_rec *r) ++{ ++ struct iam_rec *rec; ++ ++ rec = e_rec(n_cur(l)); ++ assert_corr(n_at_rec(l)); ++ assert_corr(iam_leaf_is_locked(l)); ++ memcpy(r, rec, rec_size(rec)); ++ assert_inv(n_invariant(l)); ++} ++ ++static int lvar_can_add(const struct iam_leaf *l, ++ const struct iam_key *k, const struct iam_rec *r) ++{ ++ assert_corr(iam_leaf_is_locked(l)); ++ return ++ h_used(n_head(l)) + ++ getsize(l, strlen(kchar(k)), rec_size(r)) <= blocksize(l); ++} ++ ++static int lvar_at_end(const struct iam_leaf *folio) ++{ ++ assert_corr(iam_leaf_is_locked(folio)); ++ return n_cur(folio) == n_end(folio); ++} ++ ++static void lvar_rec_add(struct iam_leaf *leaf, ++ const struct iam_key *k, const struct iam_rec *r) ++{ ++ const char *key; ++ int ksize; ++ int shift; ++ void *end; ++ void *start; ++ ptrdiff_t diff; ++ ++ assert_corr(lvar_can_add(leaf, k, r)); ++ assert_inv(n_invariant(leaf)); ++ assert_corr(iam_leaf_is_locked(leaf)); ++ ++ key = kchar(k); ++ ksize = strlen(key); ++ shift = getsize(leaf, ksize, rec_size(r)); ++ ++ if (!lvar_at_end(leaf)) { ++ assert_corr(n_cur(leaf) < n_end(leaf)); ++ end = n_end(leaf); ++ if (lvar_key_cmp(leaf, k) <= 0) ++ lvar_next(leaf); ++ else ++ /* ++ * Another exceptional case: insertion with the key ++ * less than least key in the leaf. 
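Both the normal and the exceptional insertion paths then reduce to opening a byte gap at the cursor. The mechanics, stripped of the leaf bookkeeping; illustrative only:

/* Illustrative only: make room for @shift new bytes at @at, where the
 * block's live data currently ends at @end (end - at may be zero). */
static void open_gap(char *at, char *end, int shift)
{
        memmove(at + shift, at, end - at);
        /* the caller then writes hash, keysize, key and record into the
         * gap and grows vlh_used by @shift, cf. h_used_adj() */
}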
++ */ ++ assert_corr(leaf->il_at == leaf->il_entries); ++ ++ start = leaf->il_at; ++ diff = PDIFF(end, start); ++ assert_corr(diff >= 0); ++ memmove(start + shift, start, diff); ++ } ++ h_used_adj(leaf, n_head(leaf), shift); ++ n_cur(leaf)->vle_keysize = cpu_to_le16(ksize); ++ n_cur(leaf)->vle_hash = cpu_to_le32(get_hash(iam_leaf_container(leaf), ++ key, ksize)); ++ __lvar_key_set(leaf, k); ++ __lvar_rec_set(leaf, r); ++ assert_corr(n_at_rec(leaf)); ++ assert_inv(n_invariant(leaf)); ++} ++ ++static void lvar_rec_del(struct iam_leaf *leaf, int shift) ++{ ++ void *next; ++ void *end; ++ int nob; ++ ++ assert_corr(n_at_rec(leaf)); ++ assert_inv(n_invariant(leaf)); ++ assert_corr(iam_leaf_is_locked(leaf)); ++ ++ end = n_end(leaf); ++ next = e_next(leaf, n_cur(leaf)); ++ nob = e_size(leaf, n_cur(leaf)); ++ memmove(leaf->il_at, next, end - next); ++ h_used_adj(leaf, n_head(leaf), -nob); ++ assert_inv(n_invariant(leaf)); ++} ++ ++static void lvar_init_new(struct iam_container *c, struct buffer_head *bh) ++{ ++ struct lvar_leaf_header *hdr; ++ ++ hdr = (struct lvar_leaf_header *)bh->b_data; ++ hdr->vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC); ++ hdr->vlh_used = sizeof *hdr; ++} ++ ++static struct lvar_leaf_entry *find_pivot(const struct iam_leaf *leaf, ++ struct lvar_leaf_entry **prev) ++{ ++ void *scan; ++ void *start; ++ int threshold; ++ ++ *prev = NULL; ++ threshold = blocksize(leaf) / 2; ++ for (scan = start = n_start(leaf); scan - start <= threshold; ++ *prev = scan, scan = e_next(leaf, scan)) { ++ ; ++ } ++ return scan; ++} ++ ++static void lvar_split(struct iam_leaf *leaf, struct buffer_head **bh, ++ iam_ptr_t new_blknr) ++{ ++ struct lvar_leaf_entry *first_to_move; ++ struct lvar_leaf_entry *last_to_stay; ++ struct iam_path *path; ++ struct lvar_leaf_header *hdr; ++ struct buffer_head *new_leaf; ++ ++ ptrdiff_t tomove; ++ lvar_hash_t hash; ++ ++ assert_inv(n_invariant(leaf)); ++ assert_corr(iam_leaf_is_locked(leaf)); ++ ++ new_leaf = *bh; ++ path = iam_leaf_path(leaf); ++ ++ hdr = (void *)new_leaf->b_data; ++ ++ first_to_move = find_pivot(leaf, &last_to_stay); ++ assert_corr(last_to_stay != NULL); ++ assert_corr(e_next(leaf, last_to_stay) == first_to_move); ++ ++ hash = e_hash(first_to_move); ++ if (hash == e_hash(last_to_stay)) ++ /* ++ * Duplicate hash. ++ */ ++ hash |= 1; ++ ++ tomove = PDIFF(n_end(leaf), first_to_move); ++ memmove(hdr + 1, first_to_move, tomove); ++ ++ h_used_adj(leaf, hdr, tomove); ++ h_used_adj(leaf, n_head(leaf), -tomove); ++ ++ assert_corr(n_end(leaf) == first_to_move); ++ ++ if (n_cur(leaf) >= first_to_move) { ++ /* ++ * insertion point moves into new leaf. ++ */ ++ ptrdiff_t shift; ++ int result; ++ ++ shift = PDIFF(leaf->il_at, first_to_move); ++ *bh = leaf->il_bh; ++ leaf->il_bh = new_leaf; ++ leaf->il_curidx = new_blknr; ++ ++ assert_corr(iam_leaf_is_locked(leaf)); ++ result = lvar_init(leaf); ++ /* ++ * init cannot fail, as node was just initialized. ++ */ ++ assert_corr(result == 0); ++ leaf->il_at = ((void *)leaf->il_at) + shift; ++ } ++ /* ++ * Insert pointer to the new node (together with the least key in ++ * the node) into index node. 
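One subtlety of the split deserves emphasis: the delimiting key handed to the parent is the first moved entry's hash, with the low bit set when that hash also ends the half left behind, so that a colliding run spanning both blocks is still reachable. Restated in isolation; illustrative only:

/* Illustrative only: delimiting key for the parent index node. */
static __u32 split_delim(__u32 last_kept_hash, __u32 first_moved_hash)
{
        __u32 delim = first_moved_hash;

        if (first_moved_hash == last_kept_hash)
                delim |= 1;     /* collision chain crosses the split */
        return delim;
}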
++ */ ++ iam_insert_key_lock(path, path->ip_frame, (struct iam_ikey *)&hash, ++ new_blknr); ++ assert_corr(n_cur(leaf) < n_end(leaf)); ++ assert_inv(n_invariant(leaf)); ++} ++ ++static struct iam_leaf_operations lvar_leaf_ops = { ++ .init = lvar_init, ++ .init_new = lvar_init_new, ++ .fini = lvar_fini, ++ .start = lvar_start, ++ .next = lvar_next, ++ .key = lvar_key, ++ .ikey = lvar_ikey, ++ .rec = lvar_rec, ++ .key_set = lvar_key_set, ++ .key_cmp = lvar_key_cmp, ++ .key_eq = lvar_key_eq, ++ .key_size = lvar_key_size, ++ .rec_set = lvar_rec_set, ++ .rec_get = lvar_rec_get, ++ .lookup = lvar_lookup, ++ .ilookup = lvar_ilookup, ++ .at_end = lvar_at_end, ++ .rec_add = lvar_rec_add, ++ .rec_del = lvar_rec_del, ++ .can_add = lvar_can_add, ++ .split = lvar_split ++}; ++ ++/* ++ * Index operations. ++ */ ++ ++enum { ++ /* This is duplicated in lustre/utils/create_iam.c */ ++ /* egrep -i '^o?x?[olabcdef]*$' /usr/share/dict/words */ ++ IAM_LVAR_ROOT_MAGIC = 0xb01dface ++}; ++ ++/* This is duplicated in lustre/utils/create_iam.c */ ++struct lvar_root { ++ __le32 vr_magic; ++ __le16 vr_recsize; ++ __le16 vr_ptrsize; ++ u8 vr_indirect_levels; ++ u8 vr_padding0; ++ __le16 vr_padding1; ++}; ++ ++static __u32 lvar_root_ptr(struct iam_container *c) ++{ ++ return 0; ++} ++ ++static int lvar_node_init(struct iam_container *c, struct buffer_head *bh, ++ int root) ++{ ++ return 0; ++} ++ ++static struct iam_entry *lvar_root_inc(struct iam_container *c, ++ struct iam_path *path, ++ struct iam_frame *frame) ++{ ++ struct lvar_root *root; ++ struct iam_entry *entries; ++ ++ assert_corr(iam_frame_is_locked(path, frame)); ++ entries = frame->entries; ++ ++ dx_set_count(entries, 2); ++ assert_corr(dx_get_limit(entries) == dx_root_limit(path)); ++ ++ root = (void *)frame->bh->b_data; ++ assert_corr(le64_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC); ++ root->vr_indirect_levels ++; ++ frame->at = entries = iam_entry_shift(path, entries, 1); ++ memset(iam_ikey_at(path, entries), 0, ++ iam_path_descr(path)->id_ikey_size); ++ return entries; ++} ++ ++static int lvar_node_check(struct iam_path *path, struct iam_frame *frame) ++{ ++ unsigned count; ++ unsigned limit; ++ unsigned limit_correct; ++ struct iam_entry *entries; ++ ++ entries = dx_node_get_entries(path, frame); ++ ++ if (frame == path->ip_frames) { ++ struct lvar_root *root; ++ ++ root = (void *)frame->bh->b_data; ++ if (le64_to_cpu(root->vr_magic) != IAM_LVAR_ROOT_MAGIC) ++ return -EIO; ++ limit_correct = dx_root_limit(path); ++ } else ++ limit_correct = dx_node_limit(path); ++ count = dx_get_count(entries); ++ limit = dx_get_limit(entries); ++ if (count > limit) ++ return -EIO; ++ if (limit != limit_correct) ++ return -EIO; ++ return 0; ++} ++ ++static int lvar_node_load(struct iam_path *path, struct iam_frame *frame) ++{ ++ struct iam_entry *entries; ++ void *data; ++ entries = dx_node_get_entries(path, frame); ++ ++ data = frame->bh->b_data; ++ ++ if (frame == path->ip_frames) { ++ struct lvar_root *root; ++ const char *name; ++ ++ root = data; ++ name = kchar(path->ip_key_target); ++ path->ip_indirect = root->vr_indirect_levels; ++ if (path->ip_ikey_target == NULL) { ++ path->ip_ikey_target = iam_path_ikey(path, 4); ++ *(lvar_hash_t *)path->ip_ikey_target = ++ get_hash(path->ip_container, name, ++ strlen(name)); ++ } ++ } ++ frame->entries = frame->at = entries; ++ return 0; ++} ++ ++static int lvar_ikeycmp(const struct iam_container *c, ++ const struct iam_ikey *k1, const struct iam_ikey *k2) ++{ ++ lvar_hash_t p1 = le32_to_cpu(*(lvar_hash_t *)k1); ++ 
lvar_hash_t p2 = le32_to_cpu(*(lvar_hash_t *)k2); ++ ++ return p1 > p2 ? +1 : (p1 < p2 ? -1 : 0); ++} ++ ++static struct iam_path_descr *lvar_ipd_alloc(const struct iam_container *c, ++ void *area) ++{ ++ return iam_ipd_alloc(area, c->ic_descr->id_ikey_size); ++} ++ ++static int root_limit(int rootgap, int blocksize, int size) ++{ ++ int limit; ++ int nlimit; ++ ++ limit = (blocksize - rootgap) / size; ++ nlimit = blocksize / size; ++ if (limit == nlimit) ++ limit--; ++ return limit; ++} ++ ++static int lvar_root_limit(int blocksize, int size) ++{ ++ return root_limit(sizeof(struct lvar_root), blocksize, size); ++} ++ ++static void lvar_root(void *buf, ++ int blocksize, int keysize, int ptrsize, int recsize) ++{ ++ struct lvar_root *root; ++ struct dx_countlimit *limit; ++ void *entry; ++ int isize; ++ ++ isize = sizeof(lvar_hash_t) + ptrsize; ++ root = buf; ++ *root = (typeof(*root)) { ++ .vr_magic = cpu_to_le32(IAM_LVAR_ROOT_MAGIC), ++ .vr_recsize = cpu_to_le16(recsize), ++ .vr_ptrsize = cpu_to_le16(ptrsize), ++ .vr_indirect_levels = 0 ++ }; ++ ++ limit = (void *)(root + 1); ++ *limit = (typeof(*limit)){ ++ /* ++ * limit itself + one pointer to the leaf. ++ */ ++ .count = cpu_to_le16(2), ++ .limit = lvar_root_limit(blocksize, ++ sizeof (lvar_hash_t) + ptrsize) ++ }; ++ ++ entry = root + 1; ++ /* ++ * Skip over @limit. ++ */ ++ entry += isize; ++ ++ /* ++ * Entry format is followed by . In the minimal tree ++ * consisting of a root and single node, is a minimal possible ++ * key. ++ */ ++ *(lvar_hash_t *)entry = 0; ++ entry += sizeof(lvar_hash_t); ++ /* now @entry points to */ ++ if (ptrsize == 4) ++ *(u_int32_t *)entry = cpu_to_le32(1); ++ else ++ *(u_int64_t *)entry = cpu_to_le64(1); ++} ++ ++static int lvar_esize(int namelen, int recsize) ++{ ++ return (offsetof(struct lvar_leaf_entry, vle_key) + ++ namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND; ++} ++ ++static void lvar_leaf(void *buf, ++ int blocksize, int keysize, int ptrsize, int recsize) ++{ ++ struct lvar_leaf_header *head; ++ struct lvar_leaf_entry *entry; ++ ++ /* form leaf */ ++ head = buf; ++ *head = (typeof(*head)) { ++ .vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC), ++ .vlh_used = cpu_to_le16(sizeof *head + lvar_esize(0, recsize)) ++ }; ++ entry = (void *)(head + 1); ++ *entry = (typeof(*entry)) { ++ .vle_hash = 0, ++ .vle_keysize = 0 ++ }; ++ memset(e_rec(entry), 0, recsize); ++ *(char *)e_rec(entry) = recsize; ++} ++ ++#include ++#include ++#include ++ ++int iam_lvar_create(struct inode *obj, ++ int keysize, int ptrsize, int recsize, handle_t *handle) ++{ ++ struct buffer_head *root_node; ++ struct buffer_head *leaf_node; ++ struct super_block *sb; ++ ++ u32 blknr; ++ int result; ++ unsigned long bsize; ++ ++ assert_corr(obj->i_size == 0); ++ ++ sb = obj->i_sb; ++ bsize = sb->s_blocksize; ++ root_node = ext3_append(handle, obj, &blknr, &result); ++ leaf_node = ext3_append(handle, obj, &blknr, &result); ++ if (root_node != NULL && leaf_node != NULL) { ++ lvar_root(root_node->b_data, bsize, keysize, ptrsize, recsize); ++ lvar_leaf(leaf_node->b_data, bsize, keysize, ptrsize, recsize); ++ ext3_mark_inode_dirty(handle, obj); ++ result = ext3_journal_dirty_metadata(handle, root_node); ++ if (result == 0) ++ result = ext3_journal_dirty_metadata(handle, leaf_node); ++ if (result != 0) ++ ext3_std_error(sb, result); ++ } ++ brelse(leaf_node); ++ brelse(root_node); ++ return result; ++} ++EXPORT_SYMBOL(iam_lvar_create); ++ ++static struct iam_operations lvar_ops = { ++ .id_root_ptr = lvar_root_ptr, ++ .id_node_read = iam_node_read, 
++ .id_node_init = lvar_node_init, ++ .id_node_check = lvar_node_check, ++ .id_node_load = lvar_node_load, ++ .id_ikeycmp = lvar_ikeycmp, ++ .id_root_inc = lvar_root_inc, ++ .id_ipd_alloc = lvar_ipd_alloc, ++ .id_ipd_free = iam_ipd_free, ++ .id_name = "lvar" ++}; ++ ++static int lvar_guess(struct iam_container *c) ++{ ++ int result; ++ struct buffer_head *bh; ++ const struct lvar_root *root; ++ ++ assert_corr(c->ic_object != NULL); ++ ++ result = iam_node_read(c, lvar_root_ptr(c), NULL, &bh); ++ if (result == 0) { ++ root = (void *)bh->b_data; ++ if (le64_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC) { ++ struct iam_descr *descr; ++ ++ descr = c->ic_descr; ++ descr->id_key_size = EXT3_NAME_LEN; ++ descr->id_ikey_size = sizeof (lvar_hash_t); ++ descr->id_rec_size = le16_to_cpu(root->vr_recsize); ++ descr->id_ptr_size = le16_to_cpu(root->vr_ptrsize); ++ descr->id_root_gap = sizeof *root; ++ descr->id_node_gap = 0; ++ descr->id_ops = &lvar_ops; ++ descr->id_leaf_ops = &lvar_leaf_ops; ++ } else ++ result = -EBADF; ++ brelse(bh); ++ } ++ return result; ++} ++ ++static struct iam_format lvar_format = { ++ .if_guess = lvar_guess ++}; ++ ++void iam_lvar_format_init(void) ++{ ++ iam_format_register(&lvar_format); ++} ++ +Index: iam/fs/ext3/iam_lfix.c +=================================================================== +--- iam.orig/fs/ext3/iam_lfix.c 2007-05-23 09:56:30.476305206 +0800 ++++ iam/fs/ext3/iam_lfix.c 2007-05-23 11:18:20.000000000 +0800 +@@ -0,0 +1,735 @@ ++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- ++ * vim:expandtab:shiftwidth=8:tabstop=8: ++ * ++ * iam_lfix.c ++ * implementation of iam format for fixed size records. ++ * ++ * Copyright (c) 2006 Cluster File Systems, Inc. ++ * Author: Wang Di ++ * Author: Nikita Danilov ++ * ++ * This file is part of the Lustre file system, http://www.lustre.org ++ * Lustre is a trademark of Cluster File Systems, Inc. ++ * ++ * You may have signed or agreed to another license before downloading ++ * this software. If so, you are bound by the terms and conditions ++ * of that agreement, and the following does not apply to you. See the ++ * LICENSE file included with this distribution for more information. ++ * ++ * If you did not agree to a different license, then this copy of Lustre ++ * is open source software; you can redistribute it and/or modify it ++ * under the terms of version 2 of the GNU General Public License as ++ * published by the Free Software Foundation. ++ * ++ * In either case, Lustre is distributed in the hope that it will be ++ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty ++ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * license text for more details. ++ */ ++ ++#include ++#include ++/* ext3_error() */ ++#include ++ ++#include ++ ++#include ++#include ++ ++/* ++ * Leaf operations. 
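Unlike lvar, lfix entries are pure offset arithmetic: after the small header, entry i lives at a fixed stride, its key first and its record immediately after, which is what the helpers below encapsulate. A sketch of the addressing; illustrative only:

/* Illustrative only: entry @i of an lfix leaf whose entries carry
 * @ks key bytes immediately followed by @rs record bytes. */
static char *lfix_entry_at(char *block, int i, int ks, int rs)
{
        return block + sizeof(struct iam_leaf_head) + i * (ks + rs);
        /* the record of that entry starts @ks bytes further on */
}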
++ */ ++ ++enum { ++ IAM_LEAF_HEADER_MAGIC = 0x1976 /* This is duplicated in ++ * lustre/utils/create_iam.c */ ++}; ++ ++/* This is duplicated in lustre/utils/create_iam.c */ ++struct iam_leaf_head { ++ __le16 ill_magic; ++ __le16 ill_count; ++}; ++ ++static inline int iam_lfix_entry_size(const struct iam_leaf *l) ++{ ++ return iam_leaf_descr(l)->id_key_size + iam_leaf_descr(l)->id_rec_size; ++} ++ ++static inline struct iam_lentry * ++iam_lfix_shift(const struct iam_leaf *l, struct iam_lentry *entry, int shift) ++{ ++ return (void *)entry + shift * iam_lfix_entry_size(l); ++} ++ ++static inline struct iam_key *iam_leaf_key_at(struct iam_lentry *entry) ++{ ++ return (struct iam_key *)entry; ++} ++ ++static inline int lfix_keycmp(const struct iam_container *c, ++ const struct iam_key *k1, ++ const struct iam_key *k2) ++{ ++ return memcmp(k1, k2, c->ic_descr->id_key_size); ++} ++ ++static struct iam_leaf_head *iam_get_head(const struct iam_leaf *l) ++{ ++ return (struct iam_leaf_head *)l->il_bh->b_data; ++} ++ ++static struct iam_lentry *iam_entries(const struct buffer_head *bh) ++{ ++ return (void *)bh->b_data + sizeof(struct iam_leaf_head); ++} ++ ++static struct iam_lentry *iam_get_lentries(const struct iam_leaf *l) ++{ ++ return iam_entries(l->il_bh); ++} ++ ++static int leaf_count_limit(const struct iam_leaf *leaf) ++{ ++ int free_space; ++ ++ free_space = iam_leaf_container(leaf)->ic_object->i_sb->s_blocksize; ++ free_space -= sizeof(struct iam_leaf_head); ++ return free_space / iam_lfix_entry_size(leaf); ++} ++ ++static int lentry_count_get(const struct iam_leaf *leaf) ++{ ++ return le16_to_cpu(iam_get_head(leaf)->ill_count); ++} ++ ++static void lentry_count_set(struct iam_leaf *leaf, unsigned count) ++{ ++ assert_corr(0 <= count && count <= leaf_count_limit(leaf)); ++ iam_get_head(leaf)->ill_count = cpu_to_le16(count); ++} ++ ++static struct iam_lentry *iam_lfix_get_end(const struct iam_leaf *l); ++ ++#if EXT3_CORRECTNESS_ON || EXT3_INVARIANT_ON ++static int iam_leaf_at_rec(const struct iam_leaf *folio) ++{ ++ return ++ iam_get_lentries(folio) <= folio->il_at && ++ folio->il_at < iam_lfix_get_end(folio); ++} ++#endif ++ ++static struct iam_ikey *iam_lfix_ikey(const struct iam_leaf *l, ++ struct iam_ikey *key) ++{ ++ void *ie = l->il_at; ++ assert_corr(iam_leaf_at_rec(l)); ++ return (struct iam_ikey*)ie; ++} ++ ++static struct iam_key *iam_lfix_key(const struct iam_leaf *l) ++{ ++ void *ie = l->il_at; ++ assert_corr(iam_leaf_at_rec(l)); ++ return (struct iam_key*)ie; ++} ++ ++static int iam_lfix_key_size(const struct iam_leaf *l) ++{ ++ return iam_leaf_descr(l)->id_key_size; ++} ++ ++static void iam_lfix_start(struct iam_leaf *l) ++{ ++ l->il_at = iam_get_lentries(l); ++} ++ ++static inline ptrdiff_t iam_lfix_diff(const struct iam_leaf *l, ++ const struct iam_lentry *e1, ++ const struct iam_lentry *e2) ++{ ++ ptrdiff_t diff; ++ int esize; ++ ++ esize = iam_lfix_entry_size(l); ++ diff = (void *)e1 - (void *)e2; ++ assert_corr(diff / esize * esize == diff); ++ return diff / esize; ++} ++ ++static int iam_lfix_init(struct iam_leaf *l) ++{ ++ int result; ++ struct iam_leaf_head *ill; ++ int count; ++ ++ assert_corr(l->il_bh != NULL); ++ ++ ill = iam_get_head(l); ++ count = le16_to_cpu(ill->ill_count); ++ if (ill->ill_magic == le16_to_cpu(IAM_LEAF_HEADER_MAGIC) && ++ 0 <= count && count <= leaf_count_limit(l)) { ++ l->il_at = l->il_entries = iam_get_lentries(l); ++ result = 0; ++ } else { ++ struct inode *obj; ++ ++ obj = iam_leaf_container(l)->ic_object; ++ ext3_error(obj->i_sb, 
__FUNCTION__, ++ "Wrong magic in node %llu (#%lu): %#x != %#x or " ++ "wrong count: %i (%i)", ++ (unsigned long long)l->il_bh->b_blocknr, obj->i_ino, ++ ill->ill_magic, le16_to_cpu(IAM_LEAF_HEADER_MAGIC), ++ count, leaf_count_limit(l)); ++ result = -EIO; ++ } ++ return result; ++} ++ ++static void iam_lfix_fini(struct iam_leaf *l) ++{ ++ l->il_entries = l->il_at = NULL; ++} ++ ++static struct iam_lentry *iam_lfix_get_end(const struct iam_leaf *l) ++{ ++ int count = lentry_count_get(l); ++ struct iam_lentry *ile = iam_lfix_shift(l, l->il_entries, count); ++ ++ return ile; ++} ++ ++struct iam_rec *iam_lfix_rec(const struct iam_leaf *l) ++{ ++ void *e = l->il_at; ++ assert_corr(iam_leaf_at_rec(l)); ++ return e + iam_leaf_descr(l)->id_key_size; ++} ++ ++static void iam_lfix_next(struct iam_leaf *l) ++{ ++ assert_corr(iam_leaf_at_rec(l)); ++ l->il_at = iam_lfix_shift(l, l->il_at, 1); ++} ++ ++/* ++ * Bug chasing. ++ */ ++int lfix_dump = 0; ++EXPORT_SYMBOL(lfix_dump); ++ ++static char hdigit(char ch) ++{ ++ static char d[] = "0123456789abcdef"; ++ return d[ch & 0xf]; ++} ++ ++static char *hex(char ch, char *area) ++{ ++ area[0] = hdigit(ch >> 4); ++ area[1] = hdigit(ch); ++ area[2] = 0; ++ return area; ++} ++ ++static void l_print(struct iam_leaf *leaf, struct iam_lentry *entry) ++{ ++ int i; ++ char *area; ++ char h[3]; ++ ++ area = (char *)entry; ++ printk(KERN_EMERG "["); ++ for (i = iam_lfix_key_size(leaf); i > 0; --i, ++area) ++ printk("%s", hex(*area, h)); ++ printk("]-("); ++ for (i = iam_leaf_descr(leaf)->id_rec_size; i > 0; --i, ++area) ++ printk("%s", hex(*area, h)); ++ printk(")\n"); ++} ++ ++static void lfix_print(struct iam_leaf *leaf) ++{ ++ struct iam_lentry *entry; ++ int count; ++ int i; ++ ++ entry = leaf->il_entries; ++ count = lentry_count_get(leaf); ++ printk(KERN_EMERG "lfix: %p %p %d\n", leaf, leaf->il_at, count); ++ for (i = 0; i < count; ++i, entry = iam_lfix_shift(leaf, entry, 1)) ++ l_print(leaf, entry); ++} ++ ++static int iam_lfix_lookup(struct iam_leaf *l, const struct iam_key *k) ++{ ++ struct iam_lentry *p, *q, *m, *t; ++ struct iam_container *c; ++ int count; ++ int result; ++ ++ count = lentry_count_get(l); ++ if (count == 0) ++ return IAM_LOOKUP_EMPTY; ++ ++ result = IAM_LOOKUP_OK; ++ c = iam_leaf_container(l); ++ ++ p = l->il_entries; ++ q = iam_lfix_shift(l, p, count - 1); ++ if (lfix_keycmp(c, k, iam_leaf_key_at(p)) < 0) { ++ /* ++ * @k is less than the least key in the leaf ++ */ ++ l->il_at = p; ++ result = IAM_LOOKUP_BEFORE; ++ } else if (lfix_keycmp(c, iam_leaf_key_at(q), k) <= 0) { ++ l->il_at = q; ++ } else { ++ /* ++ * EWD1293 ++ */ ++ while (iam_lfix_shift(l, p, 1) != q) { ++ m = iam_lfix_shift(l, p, iam_lfix_diff(l, q, p) / 2); ++ assert_corr(p < m && m < q); ++ if (lfix_keycmp(c, iam_leaf_key_at(m), k) <= 0) ++ p = m; ++ else ++ q = m; ++ } ++ assert_corr(lfix_keycmp(c, iam_leaf_key_at(p), k) <= 0 && ++ lfix_keycmp(c, k, iam_leaf_key_at(q)) < 0); ++ /* ++ * skip over records with duplicate keys. 
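++                 * The bisection above may stop anywhere within a run of
++                 * equal keys; stepping left makes the result deterministic,
++                 * e.g. for keys [1,3,3,3,7] and @k == 3 the cursor always
++                 * ends up on the first 3.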
++ */ ++ while (p > l->il_entries) { ++ t = iam_lfix_shift(l, p, -1); ++ if (lfix_keycmp(c, iam_leaf_key_at(t), k) == 0) ++ p = t; ++ else ++ break; ++ } ++ l->il_at = p; ++ } ++ assert_corr(iam_leaf_at_rec(l)); ++ ++ if (lfix_keycmp(c, iam_leaf_key_at(l->il_at), k) == 0) ++ result = IAM_LOOKUP_EXACT; ++ ++ if (lfix_dump) ++ lfix_print(l); ++ ++ return result; ++} ++ ++static int iam_lfix_ilookup(struct iam_leaf *l, const struct iam_ikey *ik) ++{ ++ assert(0); ++ return IAM_LOOKUP_OK; ++} ++ ++static void iam_lfix_key_set(struct iam_leaf *l, const struct iam_key *k) ++{ ++ assert_corr(iam_leaf_at_rec(l)); ++ memcpy(iam_leaf_key_at(l->il_at), k, iam_leaf_descr(l)->id_key_size); ++} ++ ++static int iam_lfix_key_cmp(const struct iam_leaf *l, const struct iam_key *k) ++{ ++ return lfix_keycmp(iam_leaf_container(l), iam_leaf_key_at(l->il_at), k); ++} ++ ++static int iam_lfix_key_eq(const struct iam_leaf *l, const struct iam_key *k) ++{ ++ return !lfix_keycmp(iam_leaf_container(l), ++ iam_leaf_key_at(l->il_at), k); ++} ++ ++static void iam_lfix_rec_set(struct iam_leaf *l, const struct iam_rec *r) ++{ ++ assert_corr(iam_leaf_at_rec(l)); ++ memcpy(iam_lfix_rec(l), r, iam_leaf_descr(l)->id_rec_size); ++} ++ ++static void iam_lfix_rec_get(const struct iam_leaf *l, struct iam_rec *r) ++{ ++ assert_corr(iam_leaf_at_rec(l)); ++ memcpy(r, iam_lfix_rec(l), iam_leaf_descr(l)->id_rec_size); ++} ++ ++static void iam_lfix_rec_add(struct iam_leaf *leaf, ++ const struct iam_key *k, const struct iam_rec *r) ++{ ++ struct iam_lentry *end; ++ struct iam_lentry *cur; ++ struct iam_lentry *start; ++ ptrdiff_t diff; ++ int count; ++ ++ assert_corr(iam_leaf_can_add(leaf, k, r)); ++ ++ count = lentry_count_get(leaf); ++ /* ++ * This branch handles two exceptional cases: ++ * ++ * - leaf positioned beyond last record, and ++ * ++ * - empty leaf. ++ */ ++ if (!iam_leaf_at_end(leaf)) { ++ end = iam_lfix_get_end(leaf); ++ cur = leaf->il_at; ++ if (lfix_keycmp(iam_leaf_container(leaf), ++ k, iam_leaf_key_at(cur)) >= 0) ++ iam_lfix_next(leaf); ++ else ++ /* ++ * Another exceptional case: insertion with the key ++ * less than least key in the leaf. 
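++                         * In that case the cursor stays at slot 0 and the
++                         * memmove() below shifts the whole entry array to
++                         * the right.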
++ */ ++ assert_corr(cur == leaf->il_entries); ++ ++ start = leaf->il_at; ++ diff = (void *)end - (void *)start; ++ assert_corr(diff >= 0); ++ memmove(iam_lfix_shift(leaf, start, 1), start, diff); ++ } ++ lentry_count_set(leaf, count + 1); ++ iam_lfix_key_set(leaf, k); ++ iam_lfix_rec_set(leaf, r); ++ assert_corr(iam_leaf_at_rec(leaf)); ++} ++ ++static void iam_lfix_rec_del(struct iam_leaf *leaf, int shift) ++{ ++ struct iam_lentry *next, *end; ++ int count; ++ ptrdiff_t diff; ++ ++ assert_corr(iam_leaf_at_rec(leaf)); ++ ++ count = lentry_count_get(leaf); ++ end = iam_lfix_get_end(leaf); ++ next = iam_lfix_shift(leaf, leaf->il_at, 1); ++ diff = (void *)end - (void *)next; ++ memmove(leaf->il_at, next, diff); ++ ++ lentry_count_set(leaf, count - 1); ++} ++ ++static int iam_lfix_can_add(const struct iam_leaf *l, ++ const struct iam_key *k, const struct iam_rec *r) ++{ ++ return lentry_count_get(l) < leaf_count_limit(l); ++} ++ ++static int iam_lfix_at_end(const struct iam_leaf *folio) ++{ ++ return folio->il_at == iam_lfix_get_end(folio); ++} ++ ++static void iam_lfix_init_new(struct iam_container *c, struct buffer_head *bh) ++{ ++ struct iam_leaf_head *hdr; ++ ++ hdr = (struct iam_leaf_head*)bh->b_data; ++ hdr->ill_magic = cpu_to_le16(IAM_LEAF_HEADER_MAGIC); ++ hdr->ill_count = cpu_to_le16(0); ++} ++ ++static void iam_lfix_split(struct iam_leaf *l, struct buffer_head **bh, ++ iam_ptr_t new_blknr) ++{ ++ struct iam_path *path; ++ struct iam_leaf_head *hdr; ++ const struct iam_ikey *pivot; ++ struct buffer_head *new_leaf; ++ ++ unsigned count; ++ unsigned split; ++ ++ void *start; ++ void *finis; ++ ++ new_leaf = *bh; ++ path = iam_leaf_path(l); ++ ++ hdr = (void *)new_leaf->b_data; ++ ++ count = lentry_count_get(l); ++ split = count / 2; ++ ++ start = iam_lfix_shift(l, iam_get_lentries(l), split); ++ finis = iam_lfix_shift(l, iam_get_lentries(l), count); ++ ++ pivot = (const struct iam_ikey *)iam_leaf_key_at(start); ++ ++ memmove(iam_entries(new_leaf), start, finis - start); ++ hdr->ill_count = count - split; ++ lentry_count_set(l, split); ++ if ((void *)l->il_at >= start) { ++ /* ++ * insertion point moves into new leaf. ++ */ ++ int shift; ++ int result; ++ ++ shift = iam_lfix_diff(l, l->il_at, start); ++ *bh = l->il_bh; ++ l->il_bh = new_leaf; ++ l->il_curidx = new_blknr; ++ result = iam_lfix_init(l); ++ /* ++ * init cannot fail, as node was just initialized. ++ */ ++ assert_corr(result == 0); ++ l->il_at = iam_lfix_shift(l, iam_get_lentries(l), shift); ++ } ++ /* ++ * Insert pointer to the new node (together with the least key in ++ * the node) into index node. ++ */ ++ iam_insert_key_lock(path, path->ip_frame, pivot, new_blknr); ++} ++ ++static struct iam_leaf_operations iam_lfix_leaf_ops = { ++ .init = iam_lfix_init, ++ .init_new = iam_lfix_init_new, ++ .fini = iam_lfix_fini, ++ .start = iam_lfix_start, ++ .next = iam_lfix_next, ++ .key = iam_lfix_key, ++ .ikey = iam_lfix_ikey, ++ .rec = iam_lfix_rec, ++ .key_set = iam_lfix_key_set, ++ .key_cmp = iam_lfix_key_cmp, ++ .key_eq = iam_lfix_key_eq, ++ .key_size = iam_lfix_key_size, ++ .rec_set = iam_lfix_rec_set, ++ .rec_get = iam_lfix_rec_get, ++ .lookup = iam_lfix_lookup, ++ .ilookup = iam_lfix_ilookup, ++ .at_end = iam_lfix_at_end, ++ .rec_add = iam_lfix_rec_add, ++ .rec_del = iam_lfix_rec_del, ++ .can_add = iam_lfix_can_add, ++ .split = iam_lfix_split ++}; ++ ++/* ++ * Index operations. 
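++ *
++ * Index nodes have the usual htree shape: a dx_countlimit followed by
++ * an array of (ikey, block pointer) pairs. The root block additionally
++ * starts with struct iam_lfix_root, whose magic and geometry fields are
++ * what iam_lfix_guess() below checks to recognize the format on disk.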
++ */ ++ ++enum { ++ /* This is duplicated in lustre/utils/create_iam.c */ ++ /* ++ * Then shalt thou see the dew-BEDABBLED wretch ++ * Turn, and return, indenting with the way; ++ * Each envious brier his weary legs doth scratch, ++ * Each shadow makes him stop, each murmur stay: ++ * For misery is trodden on by many, ++ * And being low never relieved by any. ++ */ ++ IAM_LFIX_ROOT_MAGIC = 0xbedabb1edULL // d01efull ++}; ++ ++/* This is duplicated in lustre/utils/create_iam.c */ ++struct iam_lfix_root { ++ __le64 ilr_magic; ++ __le16 ilr_keysize; ++ __le16 ilr_recsize; ++ __le16 ilr_ptrsize; ++ u8 ilr_indirect_levels; ++ u8 ilr_padding; ++}; ++ ++static __u32 iam_lfix_root_ptr(struct iam_container *c) ++{ ++ return 0; ++} ++ ++static int iam_lfix_node_init(struct iam_container *c, struct buffer_head *bh, ++ int root) ++{ ++ return 0; ++} ++ ++static struct iam_entry *iam_lfix_root_inc(struct iam_container *c, ++ struct iam_path *path, ++ struct iam_frame *frame) ++{ ++ struct iam_lfix_root *root; ++ struct iam_entry *entries; ++ ++ entries = frame->entries; ++ ++ dx_set_count(entries, 2); ++ assert_corr(dx_get_limit(entries) == dx_root_limit(path)); ++ ++ root = (void *)frame->bh->b_data; ++ assert_corr(le64_to_cpu(root->ilr_magic) == IAM_LFIX_ROOT_MAGIC); ++ root->ilr_indirect_levels ++; ++ frame->at = entries = iam_entry_shift(path, entries, 1); ++ memset(iam_ikey_at(path, entries), 0, ++ iam_path_descr(path)->id_ikey_size); ++ return entries; ++} ++ ++static int iam_lfix_node_check(struct iam_path *path, struct iam_frame *frame) ++{ ++ unsigned count; ++ unsigned limit; ++ unsigned limit_correct; ++ struct iam_entry *entries; ++ ++ entries = dx_node_get_entries(path, frame); ++ ++ if (frame == path->ip_frames) { ++ struct iam_lfix_root *root; ++ ++ root = (void *)frame->bh->b_data; ++ if (le64_to_cpu(root->ilr_magic) != IAM_LFIX_ROOT_MAGIC) { ++ return -EIO; ++ } ++ limit_correct = dx_root_limit(path); ++ } else ++ limit_correct = dx_node_limit(path); ++ count = dx_get_count(entries); ++ limit = dx_get_limit(entries); ++ if (count > limit) { ++ return -EIO; ++ } ++ if (limit != limit_correct) { ++ return -EIO; ++ } ++ return 0; ++} ++ ++static int iam_lfix_node_load(struct iam_path *path, struct iam_frame *frame) ++{ ++ struct iam_entry *entries; ++ void *data; ++ entries = dx_node_get_entries(path, frame); ++ ++ data = frame->bh->b_data; ++ ++ if (frame == path->ip_frames) { ++ struct iam_lfix_root *root; ++ ++ root = data; ++ path->ip_indirect = root->ilr_indirect_levels; ++ if (path->ip_ikey_target == NULL) ++ path->ip_ikey_target = ++ (struct iam_ikey *)path->ip_key_target; ++ } ++ frame->entries = frame->at = entries; ++ return 0; ++} ++ ++static int iam_lfix_ikeycmp(const struct iam_container *c, ++ const struct iam_ikey *k1, ++ const struct iam_ikey *k2) ++{ ++ return memcmp(k1, k2, c->ic_descr->id_ikey_size); ++} ++ ++static struct iam_path_descr *iam_lfix_ipd_alloc(const struct iam_container *c, ++ void *area) ++{ ++ return iam_ipd_alloc(area, c->ic_descr->id_ikey_size); ++} ++ ++static struct iam_operations iam_lfix_ops = { ++ .id_root_ptr = iam_lfix_root_ptr, ++ .id_node_read = iam_node_read, ++ .id_node_init = iam_lfix_node_init, ++ .id_node_check = iam_lfix_node_check, ++ .id_node_load = iam_lfix_node_load, ++ .id_ikeycmp = iam_lfix_ikeycmp, ++ .id_root_inc = iam_lfix_root_inc, ++ .id_ipd_alloc = iam_lfix_ipd_alloc, ++ .id_ipd_free = iam_ipd_free, ++ .id_name = "lfix" ++}; ++ ++static int iam_lfix_guess(struct iam_container *c) ++{ ++ int result; ++ struct buffer_head *bh; 
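++        /* root points into bh->b_data and is valid only until brelse() */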
++ const struct iam_lfix_root *root; ++ ++ assert_corr(c->ic_object != NULL); ++ ++ result = iam_node_read(c, iam_lfix_root_ptr(c), NULL, &bh); ++ if (result == 0) { ++ root = (void *)bh->b_data; ++ if (le64_to_cpu(root->ilr_magic) == IAM_LFIX_ROOT_MAGIC) { ++ struct iam_descr *descr; ++ ++ descr = c->ic_descr; ++ descr->id_key_size = le16_to_cpu(root->ilr_keysize); ++ descr->id_ikey_size = le16_to_cpu(root->ilr_keysize); ++ descr->id_rec_size = le16_to_cpu(root->ilr_recsize); ++ descr->id_ptr_size = le16_to_cpu(root->ilr_ptrsize); ++ descr->id_root_gap = sizeof(struct iam_lfix_root); ++ descr->id_node_gap = 0; ++ descr->id_ops = &iam_lfix_ops; ++ descr->id_leaf_ops = &iam_lfix_leaf_ops; ++ } else ++ result = -EBADF; ++ brelse(bh); ++ } ++ return result; ++} ++ ++static struct iam_format iam_lfix_format = { ++ .if_guess = iam_lfix_guess ++}; ++ ++void iam_lfix_format_init(void) ++{ ++ iam_format_register(&iam_lfix_format); ++} ++ ++/* ++ * Debugging aid. ++ */ ++ ++#define KEYSIZE (8) ++#define RECSIZE (8) ++#define PTRSIZE (4) ++ ++#define LFIX_ROOT_RECNO \ ++ ((4096 - sizeof(struct iam_lfix_root)) / (KEYSIZE + PTRSIZE)) ++ ++#define LFIX_INDEX_RECNO (4096 / (KEYSIZE + PTRSIZE)) ++ ++#define LFIX_LEAF_RECNO \ ++ ((4096 - sizeof(struct iam_leaf_head)) / (KEYSIZE + RECSIZE)) ++ ++struct lfix_root { ++ struct iam_lfix_root lr_root; ++ struct { ++ char key[KEYSIZE]; ++ char ptr[PTRSIZE]; ++ } lr_entry[LFIX_ROOT_RECNO]; ++}; ++ ++struct lfix_index { ++ struct dx_countlimit li_cl; ++ char li_padding[KEYSIZE + PTRSIZE - sizeof(struct dx_countlimit)]; ++ struct { ++ char key[KEYSIZE]; ++ char ptr[PTRSIZE]; ++ } li_entry[LFIX_INDEX_RECNO - 1]; ++}; ++ ++struct lfix_leaf { ++ struct iam_leaf_head ll_head; ++ struct { ++ char key[KEYSIZE]; ++ char rec[RECSIZE]; ++ } ll_entry[LFIX_LEAF_RECNO]; ++}; +Index: iam/fs/ext3/iam_htree.c +=================================================================== +--- iam.orig/fs/ext3/iam_htree.c 2007-05-23 09:56:30.476305206 +0800 ++++ iam/fs/ext3/iam_htree.c 2007-05-23 11:18:20.000000000 +0800 +@@ -0,0 +1,687 @@ ++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- ++ * vim:expandtab:shiftwidth=8:tabstop=8: ++ * ++ * iam_htree.c ++ * implementation of iam format for ext3/htree. ++ * ++ * Copyright (c) 2006 Cluster File Systems, Inc. ++ * Author: Nikita Danilov ++ * ++ * This file is part of the Lustre file system, http://www.lustre.org ++ * Lustre is a trademark of Cluster File Systems, Inc. ++ * ++ * You may have signed or agreed to another license before downloading ++ * this software. If so, you are bound by the terms and conditions ++ * of that agreement, and the following does not apply to you. See the ++ * LICENSE file included with this distribution for more information. ++ * ++ * If you did not agree to a different license, then this copy of Lustre ++ * is open source software; you can redistribute it and/or modify it ++ * under the terms of version 2 of the GNU General Public License as ++ * published by the Free Software Foundation. ++ * ++ * In either case, Lustre is distributed in the hope that it will be ++ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty ++ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * license text for more details. 
++ */ ++ ++#include ++#include ++/* ext3_error(), EXT3_DIR_ROUND() */ ++#include ++ ++#include ++ ++#include ++#include ++ ++static inline struct ext3_dir_entry_2 *dent(struct iam_lentry *ent) ++{ ++ return (struct ext3_dir_entry_2 *)ent; ++} ++ ++static inline struct iam_path_compat *getipc(const struct iam_leaf *folio) ++{ ++ struct iam_path *path; ++ ++ path = iam_leaf_path(folio); ++ assert_corr(dx_index_is_compat(path)); ++ assert_corr(path->ip_data != NULL); ++ return container_of(path->ip_data, struct iam_path_compat, ipc_descr); ++} ++ ++static inline struct ext3_dir_entry_2 *getent(const struct iam_leaf *folio) ++{ ++ return dent(folio->il_at); ++} ++ ++static __u32 hashname(const struct iam_leaf *folio, ++ const char *name, int namelen) ++{ ++ int result; ++ struct dx_hash_info *hinfo; ++ ++ hinfo = getipc(folio)->ipc_hinfo; ++ assert_corr(hinfo != NULL); ++ result = ext3fs_dirhash(name, namelen, hinfo); ++ assert_corr(result == 0); ++ return hinfo->hash; ++} ++ ++static __u32 gethash(const struct iam_leaf *folio, ++ const struct ext3_dir_entry_2 *ent) ++{ ++ return hashname(folio, ent->name, ent->name_len); ++} ++ ++static inline size_t recsize(size_t namelen) ++{ ++ return EXT3_DIR_REC_LEN(namelen); ++} ++ ++static struct ext3_dir_entry_2 *getlast(const struct iam_leaf *folio, int namelen) ++{ ++ return ++ (void *)folio->il_bh->b_data + ++ iam_leaf_container(folio)->ic_object->i_sb->s_blocksize - ++ recsize(namelen); ++} ++ ++static struct ext3_dir_entry_2 *gettop(const struct iam_leaf *folio) ++{ ++ return getlast(folio, 0); ++} ++ ++static inline int ent_is_live(const struct ext3_dir_entry_2 *ent) ++{ ++ return ent->inode != 0; ++} ++ ++static struct ext3_dir_entry_2 *entnext(const struct ext3_dir_entry_2 *ent) ++{ ++ return (void *)ent + le16_to_cpu(ent->rec_len); ++} ++ ++static struct ext3_dir_entry_2 *skipdead(struct ext3_dir_entry_2 *ent) ++{ ++ if (!ent_is_live(ent)) ++ ent = entnext(ent); ++ /* ++ * There can be no more than one dead entry in a row. ++ */ ++ return ent; ++} ++ ++static struct ext3_dir_entry_2 *getstart(const struct iam_leaf *folio) ++{ ++ return (void *)folio->il_bh->b_data; ++} ++ ++static int getfreespace(const struct ext3_dir_entry_2 *ent) ++{ ++ int free; ++ ++ free = le16_to_cpu(ent->rec_len); ++ if (ent_is_live(ent)) ++ free -= recsize(ent->name_len); ++ assert_corr(free >= 0); ++ return free; ++} ++ ++static int entcmp(const struct iam_leaf *folio, ++ const struct ext3_dir_entry_2 *e0, const struct ext3_dir_entry_2 *e1) ++{ ++ __u32 hash0; ++ __u32 hash1; ++ ++ assert_corr(ent_is_live(e0)); ++ assert_corr(ent_is_live(e1)); ++ ++ hash0 = gethash(folio, e0); ++ hash1 = gethash(folio, e1); ++ if (hash0 < hash1) ++ return -1; ++ else if (hash0 > hash1) ++ return +1; ++ else if (e0 < e1) ++ return -1; ++ else if (e0 > e1) ++ return +1; ++ else ++ return 0; ++} ++ ++#if EXT3_CORRECTNESS_ON || EXT3_INVARIANT_ON ++static int iam_leaf_at_rec(const struct iam_leaf *folio) ++{ ++ struct ext3_dir_entry_2 *ent; ++ ++ ent = getent(folio); ++ return getstart(folio) <= ent && ++ ent < gettop(folio) && ent_is_live(ent); ++} ++#endif ++ ++/* ++ * Leaf operations. 
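++ *
++ * In the htree compatibility format a "leaf" is an ordinary ext3
++ * directory block: entries are struct ext3_dir_entry_2, the key is the
++ * file name, the record is the inode number, and the index key is the
++ * name hash. Entries within a block are not ordered by hash, so the
++ * ->next() and ->lookup() methods below scan the block linearly.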
++ */ ++ ++static struct iam_ikey *iam_htree_ikey(const struct iam_leaf *l, ++ struct iam_ikey *key) ++{ ++ __u32 *hash; ++ assert_corr(iam_leaf_at_rec(l)); ++ ++ hash = (void *)key; ++ *hash = gethash(l, getent(l)); ++ return key; ++} ++ ++static struct iam_key *iam_htree_key(const struct iam_leaf *l) ++{ ++ assert_corr(iam_leaf_at_rec(l)); ++ ++ return (struct iam_key *)&getent(l)->name; ++} ++ ++static int iam_htree_key_size(const struct iam_leaf *l) ++{ ++ assert_corr(iam_leaf_at_rec(l)); ++ ++ return getent(l)->name_len; ++} ++ ++static void iam_htree_start(struct iam_leaf *l) ++{ ++ l->il_at = (void *)skipdead(getstart(l)); ++} ++ ++static int iam_htree_init(struct iam_leaf *l) ++{ ++ assert_corr(l->il_bh != NULL); ++ ++ l->il_at = l->il_entries = (void *)getstart(l); ++ return 0; ++} ++ ++static void iam_htree_fini(struct iam_leaf *l) ++{ ++ l->il_entries = l->il_at = NULL; ++} ++ ++struct iam_rec *iam_htree_rec(const struct iam_leaf *l) ++{ ++ assert_corr(iam_leaf_at_rec(l)); ++ return (void *)&getent(l)->inode; ++} ++ ++static void iam_htree_next(struct iam_leaf *l) ++{ ++ struct ext3_dir_entry_2 *scan; ++ struct ext3_dir_entry_2 *found; ++ ++ assert_corr(iam_leaf_at_rec(l)); ++ found = NULL; ++ for (scan = getstart(l); scan < gettop(l); scan = entnext(scan)) { ++ if (scan != getent(l) && ent_is_live(scan) && ++ entcmp(l, getent(l), scan) < 0 && ++ (found == NULL || entcmp(l, scan, found) < 0)) ++ found = scan; ++ } ++ assert_corr(ergo(found != NULL, ++ gethash(l, getent(l)) <= gethash(l, found))); ++ l->il_at = (void *)(found ? : gettop(l)); ++} ++ ++static int iam_htree_at_end(const struct iam_leaf *folio) ++{ ++ return getent(folio) >= gettop(folio); ++} ++ ++ ++static inline int match(int len, const char *const name, ++ struct ext3_dir_entry_2 *de) ++{ ++ if (len != de->name_len) ++ return 0; ++ if (!de->inode) ++ return 0; ++ return !memcmp(name, de->name, len); ++} ++ ++static int iam_htree_lookup(struct iam_leaf *l, const struct iam_key *k) ++{ ++ struct iam_container *c; ++ struct ext3_dir_entry_2 *scan; ++ struct ext3_dir_entry_2 *found; ++ __u32 hash; ++ int result; ++ int namelen; ++ int last = 1; ++ const char *name; ++ ++ c = iam_leaf_container(l); ++ name = (const char *)k; ++ namelen = strlen(name); ++ hash = hashname(l, name, namelen); ++ found = NULL; ++ result = IAM_LOOKUP_OK; ++ for (scan = getstart(l); scan < getlast(l, namelen); ++ scan = entnext(scan)) { ++ if (match(namelen, name, scan)) { ++ found = scan; ++ result = IAM_LOOKUP_EXACT; ++ break; ++ } else if (ent_is_live(scan)) { ++ if (gethash(l, scan) <= hash) ++ found = scan; ++ else ++ last = 0; ++ } ++ } ++ if (found == NULL) { ++ /* ++ * @k is less than all hashes in the leaf. ++ */ ++ iam_htree_start(l); ++ result = IAM_LOOKUP_BEFORE; ++ } else { ++ l->il_at = (void *)found; ++ assert_corr(iam_leaf_at_rec(l)); ++ } ++ if (last) ++ result |= IAM_LOOKUP_LAST; ++ return result; ++} ++ ++static int iam_htree_ilookup(struct iam_leaf *l, const struct iam_ikey *ik) ++{ ++ assert(0); ++ return IAM_LOOKUP_OK; ++} ++ ++static void iam_htree_key_set(struct iam_leaf *l, const struct iam_key *k) ++{ ++ assert_corr(iam_leaf_at_rec(l)); ++ assert(0); ++} ++ ++static int iam_htree_key_cmp(const struct iam_leaf *l, const struct iam_key *k) ++{ ++ const char *name; ++ __u32 h0; ++ __u32 h1; ++ ++ name = (const char *)k; ++ ++ assert_corr(ent_is_live(getent(l))); ++ ++ h0 = gethash(l, getent(l)); ++ h1 = hashname(l, name, strlen(name)); ++ ++ return h0 < h1 ? -1 : (h0 == h1 ? 
0 : +1); ++} ++ ++static int iam_htree_key_eq(const struct iam_leaf *l, const struct iam_key *k) ++{ ++ const char *name; ++ ++ name = (const char *)k; ++ return match(strlen(name), name, getent(l)); ++} ++ ++static void iam_htree_rec_set(struct iam_leaf *l, const struct iam_rec *r) ++{ ++ __u32 *ino; ++ ++ ino = (void *)r; ++ getent(l)->inode = cpu_to_le32(*ino); ++} ++ ++static void iam_htree_rec_get(const struct iam_leaf *l, struct iam_rec *r) ++{ ++ __u32 *ino; ++ ++ ino = (void *)r; ++ *ino = le32_to_cpu(getent(l)->inode); ++} ++ ++static void iam_htree_rec_add(struct iam_leaf *leaf, const struct iam_key *k, ++ const struct iam_rec *r) ++{ ++ struct ext3_dir_entry_2 *scan; ++ struct inode *dir; ++ const char *name; ++ ++ __u32 *ino; ++ int namelen; ++ ++ assert_corr(iam_leaf_can_add(leaf, k, r)); ++ ++ dir = iam_leaf_container(leaf)->ic_object; ++ ino = (void *)r; ++ name = (const char *)k; ++ namelen = strlen(name); ++ ++ scan = find_insertion_point(dir, leaf->il_bh, name, namelen); ++ assert_corr(!IS_ERR(scan)); ++ scan = split_entry(dir, scan, *ino, EXT3_FT_UNKNOWN, name, namelen); ++ leaf->il_at = (void *)scan; ++} ++ ++static void iam_htree_rec_del(struct iam_leaf *leaf, int shift) ++{ ++ struct ext3_dir_entry_2 *orig; ++ struct ext3_dir_entry_2 *scan; ++ struct ext3_dir_entry_2 *prev; ++ ++ assert_corr(iam_leaf_at_rec(leaf)); ++ ++ orig = getent(leaf); ++ ++ if (shift) ++ iam_htree_next(leaf); ++ ++ for (prev = NULL, scan = getstart(leaf); scan < orig; ++ prev = scan, scan = entnext(scan)) ++ ; ++ ++ assert_corr(scan == orig); ++ if (prev != NULL) { ++ prev->rec_len = cpu_to_le16(le16_to_cpu(prev->rec_len) + ++ le16_to_cpu(scan->rec_len)); ++ } else { ++ assert_corr(scan == getstart(leaf)); ++ scan->inode = 0; ++ } ++ iam_leaf_container(leaf)->ic_object->i_version ++; ++} ++ ++static int iam_htree_can_add(const struct iam_leaf *leaf, ++ const struct iam_key *k, const struct iam_rec *r) ++{ ++ struct ext3_dir_entry_2 *scan; ++ int size; ++ ++ size = recsize(strlen((const char *)k)); ++ for (scan = getstart(leaf); ++ scan < gettop(leaf); scan = entnext(scan)) { ++ if (getfreespace(scan) >= size) ++ return 1; ++ } ++ return 0; ++} ++ ++static void iam_htree_init_new(struct iam_container *c, struct buffer_head *bh) ++{ ++ /* ++ * Do nothing, all work is done by iam_htree_split(). ++ */ ++} ++ ++static void iam_htree_split(struct iam_leaf *l, struct buffer_head **bh, ++ iam_ptr_t new_blknr) ++{ ++ __u32 delim_hash; ++ __u32 old_hash; ++ struct buffer_head *newbh = *bh; ++ struct iam_path *path; ++ ++ old_hash = gethash(l, getent(l)); ++ move_entries(iam_leaf_container(l)->ic_object, ++ getipc(l)->ipc_hinfo, &l->il_bh, bh, &delim_hash); ++ /* ++ * Insert pointer to the new node (together with the least key in ++ * the node) into index node. ++ */ ++ path = iam_leaf_path(l); ++ if (l->il_bh == newbh) { ++ /* ++ * insertion point moves into new leaf. 
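++                 * Re-position the cursor: the entry it was on may have
++                 * moved into the new block together with half of the
++                 * entries.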
++ */ ++ assert_corr(delim_hash >= old_hash); ++ l->il_curidx = new_blknr; ++ iam_htree_lookup(l, (void *)&old_hash); ++ } ++ iam_insert_key_lock(path, ++ path->ip_frame, (void *)&delim_hash, new_blknr); ++} ++ ++static struct iam_leaf_operations iam_htree_leaf_ops = { ++ .init = iam_htree_init, ++ .init_new = iam_htree_init_new, ++ .fini = iam_htree_fini, ++ .start = iam_htree_start, ++ .next = iam_htree_next, ++ .key = iam_htree_key, ++ .ikey = iam_htree_ikey, ++ .rec = iam_htree_rec, ++ .key_set = iam_htree_key_set, ++ .key_cmp = iam_htree_key_cmp, ++ .key_eq = iam_htree_key_eq, ++ .key_size = iam_htree_key_size, ++ .rec_set = iam_htree_rec_set, ++ .rec_get = iam_htree_rec_get, ++ .lookup = iam_htree_lookup, ++ .ilookup = iam_htree_ilookup, ++ .at_end = iam_htree_at_end, ++ .rec_add = iam_htree_rec_add, ++ .rec_del = iam_htree_rec_del, ++ .can_add = iam_htree_can_add, ++ .split = iam_htree_split ++}; ++ ++/* ++ * Index operations. ++ */ ++ ++static __u32 iam_htree_root_ptr(struct iam_container *c) ++{ ++ return 0; ++} ++ ++static int iam_htree_node_check(struct iam_path *path, struct iam_frame *frame) ++{ ++ /* XXX no checks yet */ ++ return 0; ++} ++ ++static int is_htree(struct super_block *sb, ++ const struct dx_root *root, int silent) ++{ ++ if (root->info.hash_version > DX_HASH_MAX) { ++ if (!silent) ++ ext3_warning(sb, __FUNCTION__, ++ "Unrecognised inode hash code %d", ++ root->info.hash_version); ++ return -EIO; ++ } ++ ++ if (root->info.unused_flags & 1) { ++ if (!silent) ++ ext3_warning(sb, __FUNCTION__, ++ "Unimplemented inode hash flags: %#06x", ++ root->info.unused_flags); ++ return -EIO; ++ } ++ ++ if (root->info.indirect_levels > DX_MAX_TREE_HEIGHT - 1) { ++ if (!silent) ++ ext3_warning(sb, __FUNCTION__, ++ "Unimplemented inode hash depth: %#06x", ++ root->info.indirect_levels); ++ return -EIO; ++ } ++ return 0; ++} ++ ++static int iam_htree_node_load(struct iam_path *path, struct iam_frame *frame) ++{ ++ void *data; ++ struct iam_entry *entries; ++ struct super_block *sb; ++ ++ data = frame->bh->b_data; ++ entries = dx_node_get_entries(path, frame); ++ sb = iam_path_obj(path)->i_sb; ++ if (frame == path->ip_frames) { ++ /* root node */ ++ struct dx_root *root; ++ struct iam_path_compat *ipc; ++ int check; ++ const char *name; ++ int namelen; ++ ++ root = data; ++ assert_corr(path->ip_data != NULL); ++ ipc = container_of(path->ip_data, struct iam_path_compat, ++ ipc_descr); ++ ++ check = is_htree(sb, root, 0); ++ if (check != 0) ++ return check; ++ path->ip_indirect = root->info.indirect_levels; ++ ++ assert_corr((char *)entries == (((char *)&root->info) + ++ root->info.info_length)); ++ assert_corr(dx_get_limit(entries) == dx_root_limit(path)); ++ ++ ipc->ipc_hinfo->hash_version = root->info.hash_version; ++ ipc->ipc_hinfo->seed = EXT3_SB(sb)->s_hash_seed; ++ name = NULL; ++ if (ipc->ipc_qstr) { ++ name = ipc->ipc_qstr->name; ++ namelen = ipc->ipc_qstr->len; ++ } else if (ipc->ipc_hinfo == &ipc->ipc_hinfo_area){ ++ name = (const char *)path->ip_key_target; ++ namelen = strlen(name); ++ } ++ if (name != NULL) ++ ext3fs_dirhash(name, namelen, ipc->ipc_hinfo); ++ if (path->ip_ikey_target == NULL) { ++ path->ip_ikey_target = iam_path_ikey(path, 4); ++ *(__u32 *)path->ip_ikey_target = ipc->ipc_hinfo->hash; ++ } ++ } else { ++ /* non-root index */ ++ assert_corr(entries == ++ data + iam_path_descr(path)->id_node_gap); ++ assert_corr(dx_get_limit(entries) == dx_node_limit(path)); ++ } ++ frame->entries = frame->at = entries; ++ return 0; ++} ++ ++static int 
iam_htree_node_init(struct iam_container *c, ++ struct buffer_head *bh, int root) ++{ ++ struct dx_node *node; ++ ++ assert_corr(!root); ++ ++ node = (void *)bh->b_data; ++ node->fake.rec_len = cpu_to_le16(c->ic_object->i_sb->s_blocksize); ++ node->fake.inode = 0; ++ return 0; ++} ++ ++static struct iam_entry *iam_htree_root_inc(struct iam_container *c, ++ struct iam_path *path, ++ struct iam_frame *frame) ++{ ++ struct dx_root *root; ++ struct iam_entry *entries; ++ ++ entries = frame->entries; ++ ++ dx_set_count(entries, 1); ++ root = (struct dx_root *) frame->bh->b_data; ++ root->info.indirect_levels++; ++ ++ return entries; ++} ++ ++static int iam_htree_ikeycmp(const struct iam_container *c, ++ const struct iam_ikey *k1, ++ const struct iam_ikey *k2) ++{ ++ __u32 p1 = le32_to_cpu(*(__u32 *)k1); ++ __u32 p2 = le32_to_cpu(*(__u32 *)k2); ++ ++ return p1 > p2 ? +1 : (p1 < p2 ? -1 : 0); ++} ++ ++static struct iam_path_descr *iam_htree_ipd_alloc(const struct iam_container *c, ++ void *area) ++{ ++ struct iam_path_compat *ipc; ++ ++ ipc = area; ++ memset(ipc, 0, sizeof *ipc); ++ iam_path_compat_init(ipc, c->ic_object); ++ return &ipc->ipc_descr; ++} ++ ++static void iam_htree_ipd_free(struct iam_path_descr *ipd) ++{ ++} ++ ++static struct iam_operations iam_htree_ops = { ++ .id_root_ptr = iam_htree_root_ptr, ++ .id_node_read = iam_node_read, ++ .id_node_init = iam_htree_node_init, ++ .id_node_check = iam_htree_node_check, ++ .id_node_load = iam_htree_node_load, ++ .id_ikeycmp = iam_htree_ikeycmp, ++ .id_root_inc = iam_htree_root_inc, ++ .id_ipd_alloc = iam_htree_ipd_alloc, ++ .id_ipd_free = iam_htree_ipd_free, ++ .id_name = "htree" ++}; ++ ++/* ++ * Parameters describing iam compatibility mode in which existing ext3 htrees ++ * can be manipulated. ++ */ ++struct iam_descr iam_htree_compat_param = { ++ .id_key_size = EXT3_NAME_LEN, ++ .id_rec_size = sizeof ((struct ext3_dir_entry_2 *)NULL)->inode, ++ .id_ikey_size = sizeof ((struct dx_map_entry *)NULL)->hash, ++ .id_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs, ++ .id_node_gap = offsetof(struct dx_node, entries), ++ .id_root_gap = offsetof(struct dx_root, entries), ++ .id_ops = &iam_htree_ops, ++ .id_leaf_ops = &iam_htree_leaf_ops ++}; ++EXPORT_SYMBOL(iam_htree_compat_param); ++ ++static int iam_htree_guess(struct iam_container *c) ++{ ++ int result; ++ struct buffer_head *bh; ++ const struct dx_root *root; ++ ++ assert_corr(c->ic_object != NULL); ++ ++ result = iam_node_read(c, iam_htree_root_ptr(c), NULL, &bh); ++ if (result == 0) { ++ root = (void *)bh->b_data; ++ result = is_htree(c->ic_object->i_sb, root, 1); ++ if (result == 0) ++ c->ic_descr = &iam_htree_compat_param; ++ else ++ result = -EBADF; ++ brelse(bh); ++ } ++ return result; ++} ++ ++static struct iam_format iam_htree_format = { ++ .if_guess = iam_htree_guess ++}; ++ ++void iam_htree_format_init(void) ++{ ++ iam_format_register(&iam_htree_format); ++} diff --git a/ldiskfs/kernel_patches/patches/ext3-iam-uapi.patch b/ldiskfs/kernel_patches/patches/ext3-iam-uapi.patch new file mode 100644 index 0000000..fd03c92 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-iam-uapi.patch @@ -0,0 +1,1408 @@ +Index: iam/fs/ext3/Makefile +=================================================================== +--- iam.orig/fs/ext3/Makefile ++++ iam/fs/ext3/Makefile +@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o \ +- extents.o mballoc.o iam.o iam_lfix.o 
++ extents.o mballoc.o iam.o iam_lfix.o iam_lvar.o iam_htree.o iam_uapi.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: iam/fs/ext3/dir.c +=================================================================== +--- iam.orig/fs/ext3/dir.c ++++ iam/fs/ext3/dir.c +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + + static unsigned char ext3_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +@@ -61,6 +62,7 @@ static unsigned char get_dtype(struct su + } + + ++#if EXT3_INVARIANT_ON + int ext3_check_dir_entry (const char * function, struct inode * dir, + struct ext3_dir_entry_2 * de, + struct buffer_head * bh, +@@ -90,6 +92,7 @@ int ext3_check_dir_entry (const char * f + rlen, de->name_len); + return error_msg == NULL ? 1 : 0; + } ++#endif + + static int ext3_readdir(struct file * filp, + void * dirent, filldir_t filldir) +@@ -305,12 +308,14 @@ static void free_rb_tree_fname(struct rb + root->rb_node = NULL; + } + ++extern struct iam_private_info *ext3_iam_alloc_info(int flags); ++extern void ext3_iam_release_info(struct iam_private_info *info); + + struct dir_private_info *create_dir_info(loff_t pos) + { + struct dir_private_info *p; + +- p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); ++ p = (void *)ext3_iam_alloc_info(GFP_KERNEL); + if (!p) + return NULL; + p->root.rb_node = NULL; +@@ -326,6 +331,7 @@ struct dir_private_info *create_dir_info + void ext3_htree_free_dir_info(struct dir_private_info *p) + { + free_rb_tree_fname(&p->root); ++ ext3_iam_release_info((void *)p); + kfree(p); + } + +Index: iam/fs/ext3/file.c +=================================================================== +--- iam.orig/fs/ext3/file.c ++++ iam/fs/ext3/file.c +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + #include "xattr.h" + #include "acl.h" + +@@ -31,14 +32,18 @@ + * from ext3_file_open: open gets called at every open, but release + * gets called only when /all/ the files are closed. + */ +-static int ext3_release_file (struct inode * inode, struct file * filp) ++static int ext3_release_file(struct inode * inode, struct file * filp) + { + /* if we are the last writer on the inode, drop the block reservation */ + if ((filp->f_mode & FMODE_WRITE) && + (atomic_read(&inode->i_writecount) == 1)) + ext3_discard_reservation(inode); +- if (is_dx(inode) && filp->private_data) ++ if (is_dx(inode) && filp->private_data) { ++ if (S_ISDIR(inode->i_mode)) + ext3_htree_free_dir_info(filp->private_data); ++ else ++ ext3_iam_release(filp, inode); ++ } + + return 0; + } +Index: iam/fs/ext3/iam-uapi.c +=================================================================== +--- iam.orig/fs/ext3/iam-uapi.c ++++ iam/fs/ext3/iam-uapi.c +@@ -0,0 +1,368 @@ ++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- ++ * vim:expandtab:shiftwidth=8:tabstop=8: ++ * ++ * iam_uapi.c ++ * User-level interface to iam (ioctl based) ++ * ++ * Copyright (c) 2006 Cluster File Systems, Inc. ++ * Author: Nikita Danilov ++ * ++ * This file is part of the Lustre file system, http://www.lustre.org ++ * Lustre is a trademark of Cluster File Systems, Inc. ++ * ++ * You may have signed or agreed to another license before downloading ++ * this software. If so, you are bound by the terms and conditions ++ * of that agreement, and the following does not apply to you. See the ++ * LICENSE file included with this distribution for more information. 
++ * ++ * If you did not agree to a different license, then this copy of Lustre ++ * is open source software; you can redistribute it and/or modify it ++ * under the terms of version 2 of the GNU General Public License as ++ * published by the Free Software Foundation. ++ * ++ * In either case, Lustre is distributed in the hope that it will be ++ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty ++ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * license text for more details. ++ */ ++ ++#include ++#include ++/* ext3_error() */ ++#include ++#include ++ ++#include ++ ++#include ++#include ++ ++struct iam_private_info { ++ struct dir_private_info ipi_dir; /* has to be first */ ++ struct iam_container ipi_bag; ++ struct iam_descr ipi_descr; ++ struct iam_iterator ipi_it; ++ struct iam_path_descr *ipi_ipd; ++ char ipi_ipd_area[DX_IPD_MAX_SIZE]; ++}; ++ ++enum { ++ IAM_INSERT_CREDITS = 20 ++}; ++ ++static struct iam_private_info *get_ipi(struct file *filp) ++{ ++ return filp->private_data; ++} ++ ++static int iam_uapi_it(int cmd, struct inode *inode, ++ struct file *filp, struct iam_uapi_it *itop) ++{ ++ struct iam_private_info *ipi; ++ struct iam_iterator *it; ++ enum iam_it_state st; ++ int result = 0; ++ ++ ipi = get_ipi(filp); ++ it = &ipi->ipi_it; ++ st = it->ii_state; ++ switch (cmd) { ++ case IAM_IOC_IT_START: ++ result = iam_it_init(it, &ipi->ipi_bag, ++ IAM_IT_MOVE, ipi->ipi_ipd); ++ if (result == 0) ++ result = iam_it_get(it, itop->iui_op.iul_key); ++ break; ++ case IAM_IOC_IT_NEXT: ++ if (st == IAM_IT_ATTACHED || st == IAM_IT_SKEWED) ++ result = iam_it_next(it); ++ else ++ result = -EBUSY; ++ break; ++ case IAM_IOC_IT_STOP: ++ iam_it_put(it); ++ iam_it_fini(it); ++ result = 0; ++ break; ++ } ++ st = it->ii_state; ++ if (st == IAM_IT_ATTACHED || st == IAM_IT_SKEWED) ++ memcpy(itop->iui_op.iul_key, iam_it_key_get(it), ++ iam_it_key_size(it)); ++ if (st == IAM_IT_ATTACHED) ++ iam_reccpy(&it->ii_path.ip_leaf, itop->iui_op.iul_rec); ++ itop->iui_state = st; ++ return result; ++} ++ ++static int iam_uapi_op(int cmd, struct inode *inode, ++ struct file *filp, struct iam_uapi_op *op) ++{ ++ int result; ++ struct iam_private_info *ipi; ++ ++ ipi = get_ipi(filp); ++ if (cmd == IAM_IOC_INSERT || cmd == IAM_IOC_DELETE) { ++ handle_t *h; ++ ++ h = ext3_journal_start(inode, IAM_INSERT_CREDITS); ++ if (!IS_ERR(h)) { ++ if (cmd == IAM_IOC_INSERT) ++ result = iam_insert(h, &ipi->ipi_bag, ++ op->iul_key, ++ op->iul_rec, ipi->ipi_ipd); ++ else ++ result = iam_delete(h, &ipi->ipi_bag, ++ op->iul_key, ipi->ipi_ipd); ++ ext3_journal_stop(h); ++ } else { ++ result = PTR_ERR(h); ++ ext3_std_error(inode->i_sb, result); ++ } ++ } else ++ result = iam_lookup(&ipi->ipi_bag, op->iul_key, ++ op->iul_rec, ipi->ipi_ipd); ++ return result; ++} ++ ++struct iam_private_info *ext3_iam_alloc_info(int flags) ++{ ++ struct iam_private_info *info; ++ ++ info = kmalloc(sizeof *info, flags); ++ if (info != NULL) ++ memset(info, 0, sizeof *info); ++ return info; ++} ++ ++void ext3_iam_release_info(struct iam_private_info *info) ++{ ++ iam_it_put(&info->ipi_it); ++ iam_it_fini(&info->ipi_it); ++ if (info->ipi_ipd != NULL) ++ info->ipi_bag.ic_descr->id_ops->id_ipd_free(info->ipi_ipd); ++ iam_container_fini(&info->ipi_bag); ++} ++ ++void ext3_iam_release(struct file *filp, struct inode *inode) ++{ ++ struct iam_private_info *info; ++ ++ info = filp->private_data; ++ ext3_iam_release_info(info); ++ ++ kfree(info); ++ EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; ++} ++ ++static int 
iam_uapi_init(struct inode *inode, ++ struct file *filp, struct iam_uapi_info *ua) ++{ ++ int result; ++ struct iam_private_info *info; ++ ++ info = ext3_iam_alloc_info(GFP_KERNEL); ++ if (info != NULL) { ++ struct iam_container *bag; ++ struct iam_descr *des; ++ ++ bag = &info->ipi_bag; ++ des = &info->ipi_descr; ++ result = iam_container_init(bag, des, inode); ++ if (result == 0) { ++ result = iam_container_setup(bag); ++ if (result == 0) { ++ /* ++ * Container setup might change ->ic_descr ++ */ ++ des = bag->ic_descr; ++ info->ipi_ipd = des->id_ops-> ++ id_ipd_alloc(bag, info->ipi_ipd_area); ++ if (info->ipi_ipd != NULL) { ++ filp->private_data = info; ++ EXT3_I(inode)->i_flags |= EXT3_INDEX_FL; ++ } else ++ result = -ENOMEM; ++ } ++ } ++ } else ++ result = -ENOMEM; ++ return result; ++} ++ ++ ++static int getua(struct iam_uapi_info *ua, unsigned long arg) ++{ ++ if (copy_from_user(ua, (struct iam_uapi_info __user *)arg, sizeof *ua)) ++ return -EFAULT; ++ else ++ return 0; ++} ++ ++static int putua(struct iam_uapi_info *ua, unsigned long arg) ++{ ++ if (copy_to_user((struct iam_uapi_info __user *)arg, ua, sizeof *ua)) ++ return -EFAULT; ++ else ++ return 0; ++} ++ ++enum outop_t { ++ KEY = 1 << 0, ++ REC = 1 << 1, ++ STATE = 1 << 2 ++}; ++ ++static int outop(struct iam_uapi_op *op, struct iam_uapi_op *uop, ++ struct iam_descr *des, enum outop_t opt) ++{ ++ int result; ++ ++ if (((opt & REC) && copy_to_user((void __user *)uop->iul_rec, ++ op->iul_rec, des->id_rec_size)) || ++ ((opt & KEY) && copy_to_user((void __user *)uop->iul_key, ++ op->iul_key, des->id_key_size))) ++ result = -EFAULT; ++ else ++ result = 0; ++ return result; ++} ++ ++static void putop(struct iam_uapi_op *op) ++{ ++ kfree(op->iul_key); ++ kfree(op->iul_rec); ++} ++ ++static int getop(struct iam_uapi_op *op, struct iam_uapi_op *uop, ++ struct iam_descr *des, unsigned long arg) ++{ ++ int result; ++ int ks; ++ int rs; ++ ++ ks = des->id_key_size; ++ rs = des->id_rec_size; ++ op->iul_key = kmalloc(ks, GFP_KERNEL); ++ op->iul_rec = kmalloc(rs, GFP_KERNEL); ++ if (!copy_from_user(uop, ++ (struct iam_uapi_op __user *)arg, sizeof *uop) && ++ op->iul_key != NULL && op->iul_rec != NULL && ++ !copy_from_user(op->iul_key, (void __user *)uop->iul_key, ks) && ++ !copy_from_user(op->iul_rec, (void __user *)uop->iul_rec, rs)) ++ result = 0; ++ else { ++ result = -EFAULT; ++ putop(op); ++ } ++ return result; ++} ++ ++static int outit(struct iam_uapi_it *it, struct iam_uapi_it *uit, ++ struct iam_descr *des, enum outop_t opt, unsigned long arg) ++{ ++ int result; ++ ++ result = outop(&it->iui_op, &uit->iui_op, des, opt); ++ if (result == 0 && (opt&STATE)) ++ result = put_user(it->iui_state, (int __user *) arg); ++ return result; ++} ++ ++static void putit(struct iam_uapi_it *it) ++{ ++ putop(&it->iui_op); ++} ++ ++static int getit(struct iam_uapi_it *it, struct iam_uapi_it *uit, ++ struct iam_descr *des, unsigned long arg) ++{ ++ return getop(&it->iui_op, &uit->iui_op, des, ++ (unsigned long)&((struct iam_uapi_it *)arg)->iui_op); ++} ++ ++int iam_uapi_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, ++ unsigned long arg) ++{ ++ int result; ++ struct iam_uapi_info ua; ++ struct iam_uapi_op uop; ++ struct iam_uapi_op op; ++ struct iam_uapi_it uit; ++ struct iam_uapi_it it; ++ enum outop_t opt; ++ ++ if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) { ++ result = -EACCES; ++ } else if (cmd == IAM_IOC_POLYMORPH) { ++ /* ++ * If polymorphing into directory, increase hard-link count. 
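++                 * (directories conventionally carry an extra link for
++                 * their "." entry, hence the symmetric i_nlink update
++                 * below)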
++ */ ++ if (S_ISDIR((umode_t)arg) && !S_ISDIR(inode->i_mode)) ++ inode->i_nlink++; ++ else if (!S_ISDIR((umode_t)arg) && S_ISDIR(inode->i_mode)) ++ inode->i_nlink--; ++ inode->i_mode = (umode_t)arg; ++ mark_inode_dirty(inode); ++ result = 0; ++ } else if (cmd == IAM_IOC_INIT) { ++ if (filp->private_data == NULL) { ++ result = getua(&ua, arg); ++ if (result == 0) ++ result = iam_uapi_init(inode, filp, &ua); ++ } else ++ result = -EBUSY; ++ } else if (is_dx(inode) && filp->private_data != NULL) { ++ struct iam_descr *des; ++ ++ switch (cmd) { ++ case IAM_IOC_IT_START: ++ case IAM_IOC_IT_NEXT: ++ opt = KEY|REC|STATE; ++ break; ++ case IAM_IOC_LOOKUP: ++ opt = REC; ++ break; ++ default: ++ opt = 0; ++ break; ++ } ++ ++ des = get_ipi(filp)->ipi_bag.ic_descr; ++ if (cmd == IAM_IOC_GETINFO) { ++ ua.iui_keysize = des->id_key_size; ++ ua.iui_recsize = des->id_rec_size; ++ ua.iui_ptrsize = des->id_ptr_size; ++ ua.iui_height = 0; /* not yet */ ++ memcpy(ua.iui_fmt_name, des->id_ops->id_name, ++ ARRAY_SIZE(ua.iui_fmt_name)); ++ result = putua(&ua, arg); ++ } else if (cmd == IAM_IOC_INSERT || cmd == IAM_IOC_LOOKUP || ++ cmd == IAM_IOC_DELETE) { ++ result = getop(&op, &uop, des, arg); ++ if (result == 0) { ++ int res2; ++ result = iam_uapi_op(cmd, inode, filp, &op); ++ ++ res2 = outop(&op, &uop, des, opt); ++ result = result ? : res2; ++ putop(&op); ++ } ++ } else if (cmd == IAM_IOC_IT_START || cmd == IAM_IOC_IT_NEXT || ++ cmd == IAM_IOC_IT_STOP) { ++ result = getit(&it, &uit, des, arg); ++ if (result == 0) { ++ int res2; ++ ++ result = iam_uapi_it(cmd, inode, filp, &it); ++ ++ res2 = outit(&it, &uit, des, opt, arg); ++ result = result ? : res2; ++ putit(&it); ++ } ++ } else ++ result = -EINVAL; ++ } else ++ result = -ENOENT; ++ return result; ++} +Index: iam/fs/ext3/ioctl.c +=================================================================== +--- iam.orig/fs/ext3/ioctl.c ++++ iam/fs/ext3/ioctl.c +@@ -250,6 +250,6 @@ flags_err: + + + default: +- return -ENOTTY; ++ return iam_uapi_ioctl(inode, filp, cmd, arg); + } + } +Index: iam/include/linux/lustre_iam.h +=================================================================== +--- iam.orig/include/linux/lustre_iam.h ++++ iam/include/linux/lustre_iam.h +@@ -30,9 +30,6 @@ + #ifndef __LINUX_LUSTRE_IAM_H__ + #define __LINUX_LUSTRE_IAM_H__ + +-/* handle_t, journal_start(), journal_stop() */ +-#include +- + /* + * linux/include/linux/lustre_iam.h + */ +@@ -57,14 +54,95 @@ enum { + * [2] reserved for leaf node operations. + * + * [3] reserved for index operations. ++ * ++ * [4] reserved for path->ip_ikey_target ++ * + */ +- DX_SCRATCH_KEYS = 4, ++ DX_SCRATCH_KEYS = 5, + /* + * Maximal format name length. + */ + DX_FMT_NAME_LEN = 16 + }; + ++#ifdef __KERNEL__ ++/* handle_t, journal_start(), journal_stop() */ ++#include ++ ++/* ++ * Debugging. ++ * ++ * Various debugging levels. ++ */ ++ ++#if 0 ++/* ++ * Following macros are defined in config.h and are tunable through ++ * appropriate configure switches (indicated below). ++ */ ++ ++/* ++ * Compile basic assertions in. You want this most of the time. ++ * ++ * --{enable,disable}-ldiskfs-assert (on by default). ++ */ ++#define EXT3_ASSERT (1) ++ ++/* ++ * Compile heavier correctness checks in. You want this during development ++ * cycle. ++ * ++ * --{enable,disable}-ldiskfs-correctness (off by default). ++ */ ++#define EXT3_CORRECTNESS (1) ++ ++/* ++ * Compile heavy invariant checking in. You want this early during development ++ * or when chasing a bug. 
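++ * These checks are considerably more expensive than EXT3_CORRECTNESS
++ * and are not meant for production builds.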
++ * ++ * --{enable,disable}-ldiskfs-invariant (off by default). ++ */ ++#define EXT3_INVARIANT (1) ++#endif ++ ++#if defined(EXT3_ASSERT) ++#define EXT3_ASSERT_ON (1) ++#else ++#define EXT3_ASSERT_ON (0) ++#endif ++ ++#if defined(EXT3_CORRECTNESS) ++#define EXT3_CORRECTNESS_ON (1) ++#else ++#define EXT3_CORRECTNESS_ON (0) ++#endif ++ ++#if defined(EXT3_INVARIANT) ++#define EXT3_INVARIANT_ON (1) ++#else ++#define EXT3_INVARIANT_ON (0) ++#endif ++ ++#ifndef assert ++#if EXT3_ASSERT_ON ++#define assert(test) J_ASSERT(test) ++#else ++#define assert(test) ((void)(test)) ++#endif ++#endif ++ ++#if EXT3_CORRECTNESS_ON ++#define assert_corr(test) J_ASSERT(test) ++#else ++#define assert_corr(test) do {;} while (0) ++#endif ++ ++#if EXT3_INVARIANT_ON ++#define assert_inv(test) J_ASSERT(test) ++#else ++#define assert_inv(test) do {;} while (0) ++#endif ++ + /* + * Entry within index tree node. Consists of a key immediately followed + * (without padding) by a pointer to the child node. +@@ -86,14 +164,21 @@ struct iam_entry_compat { + */ + struct iam_key; + +-/* Incomplete type use to refer to the records stored in iam containers. */ ++/* ++ * Incomplete type use to refer to the records stored in iam containers. ++ */ + struct iam_rec; + +-struct iam_cookie { +- struct iam_key *ic_key; +- struct iam_rec *ic_rec; +-}; ++/* ++ * Key in index node. Possibly compressed. Fixed size. ++ */ ++struct iam_ikey; + ++/* ++ * Scalar type into which certain iam_key's can be uniquely mapped. Used to ++ * support interfaces like readdir(), where iteration over index has to be ++ * re-startable. ++ */ + typedef __u64 iam_ptr_t; + + /* +@@ -123,6 +208,31 @@ struct iam_leaf { + void *il_descr_data; + }; + ++/* ++ * Return values of ->lookup() operation from struct iam_leaf_operations. ++ */ ++enum iam_lookup_t { ++ /* ++ * lookup found a record with the key requested ++ */ ++ IAM_LOOKUP_EXACT, ++ /* ++ * lookup positioned leaf on some record ++ */ ++ IAM_LOOKUP_OK, ++ /* ++ * leaf was empty ++ */ ++ IAM_LOOKUP_EMPTY, ++ /* ++ * lookup positioned leaf before first record ++ */ ++ IAM_LOOKUP_BEFORE ++}; ++ ++/* ++ * Format-specific container operations. These are called by generic iam code. ++ */ + struct iam_operations { + /* + * Returns pointer (in the same sense as pointer in index entry) to +@@ -131,11 +241,15 @@ struct iam_operations { + __u32 (*id_root_ptr)(struct iam_container *c); + + /* +- * Check validity and consistency of index node. This is called when +- * iam just loaded new node into frame. ++ * Check validity and consistency of index node. + */ + int (*id_node_check)(struct iam_path *path, struct iam_frame *frame); + /* ++ * Copy some data from node header into frame. This is called when ++ * new node is loaded into frame. ++ */ ++ int (*id_node_load)(struct iam_path *path, struct iam_frame *frame); ++ /* + * Initialize new node (stored in @bh) that is going to be added into + * tree. + */ +@@ -144,23 +258,33 @@ struct iam_operations { + int (*id_node_read)(struct iam_container *c, iam_ptr_t ptr, + handle_t *h, struct buffer_head **bh); + /* +- * Key comparison function. Returns -1, 0, +1. ++ * Key comparison functions. Returns -1, 0, +1. + */ +- int (*id_keycmp)(const struct iam_container *c, +- const struct iam_key *k1, const struct iam_key *k2); ++ int (*id_ikeycmp)(const struct iam_container *c, ++ const struct iam_ikey *k1, ++ const struct iam_ikey *k2); + /* +- * Create new container. +- * +- * Newly created container has a root node and a single leaf. 
Leaf +- * contains single record with the smallest possible key. ++ * Modify root node when tree height increases. + */ +- int (*id_create)(struct iam_container *c); ++ struct iam_entry *(*id_root_inc)(struct iam_container *c, ++ struct iam_path *path, ++ struct iam_frame *frame); ++ ++ struct iam_path_descr *(*id_ipd_alloc)(const struct iam_container *c); ++ void (*id_ipd_free)(const struct iam_container *c, ++ struct iam_path_descr *ipd); + /* + * Format name. + */ + char id_name[DX_FMT_NAME_LEN]; + }; + ++/* ++ * Another format-specific operation vector, consisting of methods to access ++ * leaf nodes. This is separated from struct iam_operations, because it is ++ * assumed that there will be many formats with different format of leaf ++ * nodes, yes the same struct iam_operations. ++ */ + struct iam_leaf_operations { + /* + * leaf operations. +@@ -186,7 +310,8 @@ struct iam_leaf_operations { + void (*start)(struct iam_leaf *l); + /* more leaf to the next entry. */ + void (*next)(struct iam_leaf *l); +- /* return key of current leaf record. This method may return ++ /* ++ * return key of current leaf record. This method may return + * either pointer to the key stored in node, or copy key into + * @k buffer supplied by caller and return pointer to this + * buffer. The latter approach is used when keys in nodes are +@@ -194,8 +319,10 @@ struct iam_leaf_operations { + * all). + * + * Caller should assume that returned pointer is only valid +- * while leaf node is pinned and locked.*/ +- struct iam_key *(*key)(const struct iam_leaf *l, struct iam_key *k); ++ * while leaf node is pinned and locked. ++ */ ++ struct iam_ikey *(*ikey)(const struct iam_leaf *l, struct iam_ikey *k); ++ struct iam_key *(*key)(const struct iam_leaf *l); + /* return pointer to entry body. Pointer is valid while + corresponding leaf node is locked and pinned. */ + struct iam_rec *(*rec)(const struct iam_leaf *l); +@@ -203,6 +330,9 @@ struct iam_leaf_operations { + void (*key_set)(struct iam_leaf *l, const struct iam_key *k); + void (*rec_set)(struct iam_leaf *l, const struct iam_rec *r); + ++ int (*key_cmp)(const struct iam_leaf *l, const struct iam_key *k); ++ ++ int (*key_size)(const struct iam_leaf *l); + /* + * Search leaf @l for a record with key @k or for a place + * where such record is to be inserted. +@@ -210,6 +340,7 @@ struct iam_leaf_operations { + * Scratch keys from @path can be used. + */ + int (*lookup)(struct iam_leaf *l, const struct iam_key *k); ++ int (*ilookup)(struct iam_leaf *l, const struct iam_ikey *ik); + + int (*can_add)(const struct iam_leaf *l, + const struct iam_key *k, const struct iam_rec *r); +@@ -221,17 +352,15 @@ struct iam_leaf_operations { + /* + * remove rec for a leaf + */ +- void (*rec_del)(struct iam_leaf *l); ++ void (*rec_del)(struct iam_leaf *l, int shift); + /* + * split leaf node, moving some entries into @bh (the latter currently + * is assumed to be empty). + */ +- void (*split)(struct iam_leaf *l, struct buffer_head *bh); ++ void (*split)(struct iam_leaf *l, struct buffer_head **bh, ++ iam_ptr_t newblknr); + }; + +-struct iam_path *iam_leaf_path(const struct iam_leaf *leaf); +-struct iam_container *iam_leaf_container(const struct iam_leaf *leaf); +- + /* + * Parameters, describing a flavor of iam container. + */ +@@ -241,6 +370,10 @@ struct iam_descr { + */ + size_t id_key_size; + /* ++ * Size of a key in index nodes, in bytes. ++ */ ++ size_t id_ikey_size; ++ /* + * Size of a pointer to the next level (stored in index nodes), in + * bytes. 
+ */ +@@ -264,6 +397,9 @@ struct iam_descr { + struct iam_leaf_operations *id_leaf_ops; + }; + ++/* ++ * An instance of iam container. ++ */ + struct iam_container { + /* + * Underlying flat file. IO against this object is issued to +@@ -274,6 +410,10 @@ struct iam_container { + * container flavor. + */ + struct iam_descr *ic_descr; ++ /* ++ * read-write lock protecting index consistency. ++ */ ++ struct rw_semaphore ic_sem; + }; + + /* +@@ -284,7 +424,7 @@ struct iam_path_descr { + /* + * Scratch-pad area for temporary keys. + */ +- struct iam_key *ipd_key_scratch[DX_SCRATCH_KEYS]; ++ struct iam_ikey *ipd_key_scratch[DX_SCRATCH_KEYS]; + }; + + /* +@@ -316,6 +456,7 @@ struct iam_path { + * Key searched for. + */ + const struct iam_key *ip_key_target; ++ const struct iam_ikey *ip_ikey_target; + /* + * Description-specific data. + */ +@@ -334,6 +475,7 @@ struct iam_path_compat { + struct dx_hash_info *ipc_hinfo; + struct dentry *ipc_dentry; + struct iam_path_descr ipc_descr; ++ struct dx_hash_info ipc_hinfo_area; + }; + + /* +@@ -347,7 +489,9 @@ enum iam_it_state { + /* initial state */ + IAM_IT_DETACHED, + /* iterator is above particular record in the container */ +- IAM_IT_ATTACHED ++ IAM_IT_ATTACHED, ++ /* iterator is positioned before record */ ++ IAM_IT_SKEWED + }; + + /* +@@ -355,7 +499,7 @@ enum iam_it_state { + */ + enum iam_it_flags { + /* +- * this iterator will move (iam_it_{prev,next}() will be called on it) ++ * this iterator will move (iam_it_next() will be called on it) + */ + IAM_IT_MOVE = (1 << 0), + /* +@@ -372,15 +516,26 @@ enum iam_it_flags { + * doesn't point to any particular record in this container. + * + * After successful call to iam_it_get() and until corresponding call to +- * iam_it_put() iterator is in "attached" state (IAM_IT_ATTACHED). ++ * iam_it_put() iterator is in one of "active" states: IAM_IT_ATTACHED or ++ * IAM_IT_SKEWED. + * +- * Attached iterator can move through records in a container (provided ++ * Active iterator can move through records in a container (provided + * IAM_IT_MOVE permission) in a key order, can get record and key values as it + * passes over them, and can modify container (provided IAM_IT_WRITE + * permission). + * ++ * Iteration may reach the end of container, at which point iterator switches ++ * into IAM_IT_DETACHED state. ++ * + * Concurrency: iterators are supposed to be local to thread. Interfaces below +- * do no internal serialization. ++ * do no internal serialization of access to the iterator fields. ++ * ++ * When in non-detached state, iterator keeps some container nodes pinned in ++ * memory and locked (that locking may be implemented at the container ++ * granularity though). In particular, clients may assume that pointers to ++ * records and keys obtained through iterator interface as valid until ++ * iterator is detached (except that they may be invalidated by sub-sequent ++ * operations done through the same iterator). + * + */ + struct iam_iterator { +@@ -390,7 +545,8 @@ struct iam_iterator { + __u32 ii_flags; + enum iam_it_state ii_state; + /* +- * path to the record. Valid in IAM_IT_ATTACHED state. ++ * path to the record. Valid in IAM_IT_ATTACHED, and IAM_IT_SKEWED ++ * states. + */ + struct iam_path ii_path; + }; +@@ -405,133 +561,26 @@ void iam_path_compat_fini(struct iam_pat + struct iam_path_descr *iam_ipd_alloc(void *area, int keysize); + void iam_ipd_free(struct iam_path_descr *ipd); + +-/* +- * Initialize iterator to IAM_IT_DETACHED state. 
+- * +- * postcondition: it_state(it) == IAM_IT_DETACHED +- */ + int iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags, + struct iam_path_descr *pd); +-/* +- * Finalize iterator and release all resources. +- * +- * precondition: it_state(it) == IAM_IT_DETACHED +- */ + void iam_it_fini(struct iam_iterator *it); +- +-/* +- * Attach iterator. After successful completion, @it points to record with the +- * largest key not larger than @k. Semantics of ->id_create() method guarantee +- * that such record will always be found. +- * +- * Return value: 0: positioned on existing record, +- * -ve: error. +- * +- * precondition: it_state(it) == IAM_IT_DETACHED +- * postcondition: ergo(result == 0, +- * (it_state(it) == IAM_IT_ATTACHED && +- * it_keycmp(it, iam_it_key_get(it, *), k) < 0)) +- */ + int iam_it_get(struct iam_iterator *it, const struct iam_key *k); +- +-/* +- * Duplicates iterator. +- * +- * postcondition: it_state(dst) == it_state(src) && +- * iam_it_container(dst) == iam_it_container(src) && +- * dst->ii_flags = src->ii_flags && +- * ergo(it_state(it) == IAM_IT_ATTACHED, +- * iam_it_rec_get(dst) == iam_it_rec_get(src) && +- * iam_it_key_get(dst, *1) == iam_it_key_get(src, *2)) +- */ ++int iam_it_get_at(struct iam_iterator *it, const struct iam_key *k); + void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src); +- +-/* +- * Detach iterator. Does nothing it detached state. +- * +- * postcondition: it_state(it) == IAM_IT_DETACHED +- */ + void iam_it_put(struct iam_iterator *it); +- +-/* +- * Move iterator one record right. +- * +- * Return value: 0: success, +- * +1: end of container reached +- * -ve: error +- * +- * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_MOVE +- * postcondition: ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED) +- */ + int iam_it_next(struct iam_iterator *it); +- +-/* +- * Return pointer to the record under iterator. +- * +- * precondition: it_state(it) == IAM_IT_ATTACHED +- * postcondition: it_state(it) == IAM_IT_ATTACHED +- */ + struct iam_rec *iam_it_rec_get(const struct iam_iterator *it); +- +-/* +- * Replace contents of record under iterator. +- * +- * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE +- * postcondition: it_state(it) == IAM_IT_ATTACHED && +- * ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...)) +- */ +-int iam_it_rec_set(handle_t *h, struct iam_iterator *it, struct iam_rec *r); +- +-/* +- * Place key under iterator in @k, return @k +- * +- * precondition: it_state(it) == IAM_IT_ATTACHED +- * postcondition: it_state(it) == IAM_IT_ATTACHED +- */ +-struct iam_key *iam_it_key_get(const struct iam_iterator *it, +- struct iam_key *k); +- +-/* +- * Insert new record with key @k and contents from @r, shifting records to the +- * right. +- * +- * precondition: it_state(it) == IAM_IT_ATTACHED && +- * it->ii_flags&IAM_IT_WRITE && +- * it_keycmp(it, iam_it_key_get(it, *), k) < 0 +- * postcondition: it_state(it) == IAM_IT_ATTACHED && +- * ergo(result == 0, +- * it_keycmp(it, iam_it_key_get(it, *), k) == 0 && +- * !memcmp(iam_it_rec_get(it), r, ...)) +- */ ++int iam_it_rec_set(handle_t *h, ++ struct iam_iterator *it, const struct iam_rec *r); ++struct iam_key *iam_it_key_get(const struct iam_iterator *it); ++int iam_it_key_size(const struct iam_iterator *it); + int iam_it_rec_insert(handle_t *h, struct iam_iterator *it, + const struct iam_key *k, const struct iam_rec *r); +-/* +- * Delete record under iterator. 
+- * +- * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE +- * postcondition: it_state(it) == IAM_IT_ATTACHED +- */ + int iam_it_rec_delete(handle_t *h, struct iam_iterator *it); + + typedef __u64 iam_pos_t; + +-/* +- * Convert iterator to cookie. +- * +- * precondition: it_state(it) == IAM_IT_ATTACHED && +- * path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t) +- * postcondition: it_state(it) == IAM_IT_ATTACHED +- */ + iam_pos_t iam_it_store(const struct iam_iterator *it); +- +-/* +- * Restore iterator from cookie. +- * +- * precondition: it_state(it) == IAM_IT_DETACHED && it->ii_flags&IAM_IT_MOVE && +- * path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t) +- * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED && +- * iam_it_store(it) == pos) +- */ + int iam_it_load(struct iam_iterator *it, iam_pos_t pos); + + int iam_lookup(struct iam_container *c, const struct iam_key *k, +@@ -539,10 +588,10 @@ int iam_lookup(struct iam_container *c, + int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k, + struct iam_path_descr *pd); + int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k, +- struct iam_rec *r, struct iam_path_descr *pd); ++ const struct iam_rec *r, struct iam_path_descr *pd); + int iam_insert(handle_t *handle, struct iam_container *c, + const struct iam_key *k, +- struct iam_rec *r, struct iam_path_descr *pd); ++ const struct iam_rec *r, struct iam_path_descr *pd); + /* + * Initialize container @c. + */ +@@ -558,10 +607,6 @@ void iam_container_fini(struct iam_conta + */ + int iam_container_setup(struct iam_container *c); + +-#ifndef assert +-#define assert(test) J_ASSERT(test) +-#endif +- + static inline struct iam_descr *iam_container_descr(struct iam_container *c) + { + return c->ic_descr; +@@ -577,16 +622,65 @@ static inline struct inode *iam_path_obj + return p->ip_container->ic_object; + } + +-static inline void iam_keycpy(const struct iam_container *c, +- struct iam_key *k1, const struct iam_key *k2) ++static inline void iam_ikeycpy(const struct iam_container *c, ++ struct iam_ikey *k1, const struct iam_ikey *k2) ++{ ++ memcpy(k1, k2, c->ic_descr->id_ikey_size); ++} ++ ++static inline size_t iam_entry_size(struct iam_path *p) ++{ ++ return iam_path_descr(p)->id_ikey_size + iam_path_descr(p)->id_ptr_size; ++} ++ ++static inline struct iam_entry *iam_entry_shift(struct iam_path *p, ++ struct iam_entry *entry, ++ int shift) ++{ ++ void *e = entry; ++ return e + shift * iam_entry_size(p); ++} ++ ++static inline struct iam_ikey *iam_get_ikey(struct iam_path *p, ++ struct iam_entry *entry, ++ struct iam_ikey *key) ++{ ++ return memcpy(key, entry, iam_path_descr(p)->id_ikey_size); ++} ++ ++static inline struct iam_ikey *iam_ikey_at(struct iam_path *p, ++ struct iam_entry *entry) ++{ ++ return (struct iam_ikey *)entry; ++} ++ ++static inline ptrdiff_t iam_entry_diff(struct iam_path *p, ++ struct iam_entry *e1, ++ struct iam_entry *e2) ++{ ++ ptrdiff_t diff; ++ ++ diff = (void *)e1 - (void *)e2; ++ assert_corr(diff / iam_entry_size(p) * iam_entry_size(p) == diff); ++ return diff / iam_entry_size(p); ++} ++ ++/* ++ * Helper for the frequent case, where key was already placed into @k1 by ++ * callback. 
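
Taken together, the prototypes above form the complete cursor API. A minimal usage sketch (not part of the patch; "c", "ipd" and "start_key" stand for an already-initialized container, path descriptor and key, and iam_scan_from is a hypothetical caller):

	static int iam_scan_from(struct iam_container *c,
				 struct iam_path_descr *ipd,
				 const struct iam_key *start_key)
	{
		struct iam_iterator it;
		int result;

		iam_it_init(&it, c, IAM_IT_MOVE, ipd);
		result = iam_it_get(&it, start_key);
		while (result == 0) {
			struct iam_rec *rec;

			rec = iam_it_rec_get(&it); /* valid while attached */
			(void)rec; /* ... consume rec, iam_it_key_get(&it) ... */
			result = iam_it_next(&it); /* 0: moved, +1: end, -ve: error */
		}
		iam_it_put(&it);
		iam_it_fini(&it);
		return result > 0 ? 0 : result; /* +1 (end of container) is not an error */
	}
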
++ */ ++static inline void iam_ikeycpy0(const struct iam_container *c, ++ struct iam_ikey *k1, const struct iam_ikey *k2) + { +- memcpy(k1, k2, c->ic_descr->id_key_size); ++ if (k1 != k2) ++ iam_ikeycpy(c, k1, k2); + } + +-static inline int iam_keycmp(const struct iam_container *c, +- const struct iam_key *k1, const struct iam_key *k2) ++static inline int iam_ikeycmp(const struct iam_container *c, ++ const struct iam_ikey *k1, ++ const struct iam_ikey *k2) + { +- return c->ic_descr->id_ops->id_keycmp(c, k1, k2); ++ return c->ic_descr->id_ops->id_ikeycmp(c, k1, k2); + } + + static inline void iam_reccpy(const struct iam_path *p, struct iam_rec *rec_dst, +@@ -600,11 +694,38 @@ static inline void *iam_entry_off(struct + return (void *)((char *)entry + off); + } + ++/* ++ * Leaf helpers. ++ */ ++ ++static inline struct iam_path *iam_leaf_path(const struct iam_leaf *leaf) ++{ ++ return leaf->il_path; ++} ++ ++static inline struct iam_container * ++iam_leaf_container(const struct iam_leaf *leaf) ++{ ++ return iam_leaf_path(leaf)->ip_container; ++} ++ ++static inline struct iam_descr *iam_leaf_descr(const struct iam_leaf *leaf) ++{ ++ return iam_leaf_container(leaf)->ic_descr; ++} ++ ++static inline struct iam_leaf_operations * ++iam_leaf_ops(const struct iam_leaf *leaf) ++{ ++ return iam_leaf_descr(leaf)->id_leaf_ops; ++} ++ ++ + /*XXX These stuff put here, just because they are used by iam.c and namei.c*/ + static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry) + { + return le32_to_cpu(*(u32*)iam_entry_off(entry, +- iam_path_descr(p)->id_key_size)) ++ iam_path_descr(p)->id_ikey_size)) + & 0x00ffffff; + } + +@@ -612,21 +733,64 @@ static inline void dx_set_block(struct i + struct iam_entry *entry, unsigned value) + { + *(u32*)iam_entry_off(entry, +- iam_path_descr(p)->id_key_size) = ++ iam_path_descr(p)->id_ikey_size) = + cpu_to_le32(value); + } + +-static inline void dx_set_key(struct iam_path *p, struct iam_entry *entry, +- const struct iam_key *key) ++static inline void dx_set_ikey(struct iam_path *p, struct iam_entry *entry, ++ const struct iam_ikey *key) + { +- iam_keycpy(p->ip_container, iam_entry_off(entry, 0), key); ++ iam_ikeycpy(p->ip_container, iam_entry_off(entry, 0), key); + } + ++struct dx_map_entry ++{ ++ u32 hash; ++ u32 offs; ++}; ++ ++struct fake_dirent { ++ __le32 inode; ++ __le16 rec_len; ++ u8 name_len; ++ u8 file_type; ++}; ++ + struct dx_countlimit { + __le16 limit; + __le16 count; + }; + ++/* ++ * dx_root_info is laid out so that if it should somehow get overlaid by a ++ * dirent the two low bits of the hash version will be zero. Therefore, the ++ * hash version mod 4 should never be 0. Sincerely, the paranoia department. 
++ */ ++ ++struct dx_root { ++ struct fake_dirent dot; ++ char dot_name[4]; ++ struct fake_dirent dotdot; ++ char dotdot_name[4]; ++ struct dx_root_info ++ { ++ __le32 reserved_zero; ++ u8 hash_version; ++ u8 info_length; /* 8 */ ++ u8 indirect_levels; ++ u8 unused_flags; ++ } ++ info; ++ struct {} entries[0]; ++}; ++ ++struct dx_node ++{ ++ struct fake_dirent fake; ++ struct {} entries[0]; ++}; ++ ++ + static inline unsigned dx_get_count(struct iam_entry *entries) + { + return le16_to_cpu(((struct dx_countlimit *) entries)->count); +@@ -647,9 +811,21 @@ static inline unsigned dx_node_limit(str + struct iam_descr *param = iam_path_descr(p); + unsigned entry_space = iam_path_obj(p)->i_sb->s_blocksize - + param->id_node_gap; +- return entry_space / (param->id_key_size + param->id_ptr_size); ++ return entry_space / (param->id_ikey_size + param->id_ptr_size); ++} ++ ++static inline unsigned dx_root_limit(struct iam_path *p) ++{ ++ struct iam_descr *param = iam_path_descr(p); ++ unsigned limit = iam_path_obj(p)->i_sb->s_blocksize - ++ param->id_root_gap; ++ limit /= (param->id_ikey_size + param->id_ptr_size); ++ if (limit == dx_node_limit(p)) ++ limit--; ++ return limit; + } + ++ + static inline struct iam_entry *dx_get_entries(struct iam_path *path, + void *data, int root) + { +@@ -665,7 +841,8 @@ static inline struct iam_entry *dx_node_ + frame->bh->b_data, frame == path->ip_frames); + } + +-static inline struct iam_key *iam_path_key(const struct iam_path *path, int nr) ++static inline struct iam_ikey *iam_path_ikey(const struct iam_path *path, ++ int nr) + { + assert(0 <= nr && nr < ARRAY_SIZE(path->ip_data->ipd_key_scratch)); + return path->ip_data->ipd_key_scratch[nr]; +@@ -674,6 +851,7 @@ static inline struct iam_key *iam_path_k + int dx_lookup(struct iam_path *path); + void dx_insert_block(struct iam_path *path, struct iam_frame *frame, + u32 hash, u32 block); ++int dx_index_is_compat(struct iam_path *path); + + int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct iam_path *path, __u32 *start_hash); +@@ -681,6 +859,20 @@ int ext3_htree_next_block(struct inode * + struct buffer_head *ext3_append(handle_t *handle, struct inode *inode, + u32 *block, int *err); + int split_index_node(handle_t *handle, struct iam_path *path); ++struct ext3_dir_entry_2 *split_entry(struct inode *dir, ++ struct ext3_dir_entry_2 *de, ++ unsigned long ino, mode_t mode, ++ const char *name, int namelen); ++struct ext3_dir_entry_2 *find_insertion_point(struct inode *dir, ++ struct buffer_head *bh, ++ const char *name, int namelen); ++struct ext3_dir_entry_2 *move_entries(struct inode *dir, ++ struct dx_hash_info *hinfo, ++ struct buffer_head **bh1, ++ struct buffer_head **bh2, ++ __u32 *delim_hash); ++ ++extern struct iam_descr iam_htree_compat_param; + + /* + * external +@@ -698,10 +890,12 @@ int iam_node_read(struct iam_container * + handle_t *handle, struct buffer_head **bh); + + void iam_insert_key(struct iam_path *path, struct iam_frame *frame, +- const struct iam_key *key, iam_ptr_t ptr); ++ const struct iam_ikey *key, iam_ptr_t ptr); + + int iam_leaf_at_end(const struct iam_leaf *l); + void iam_leaf_next(struct iam_leaf *folio); ++int iam_leaf_can_add(const struct iam_leaf *l, ++ const struct iam_key *k, const struct iam_rec *r); + + struct iam_path *iam_leaf_path(const struct iam_leaf *leaf); + struct iam_container *iam_leaf_container(const struct iam_leaf *leaf); +@@ -709,14 +903,95 @@ struct iam_descr *iam_leaf_descr(const s + struct iam_leaf_operations *iam_leaf_ops(const struct iam_leaf 
*leaf); + + ++int iam_node_read(struct iam_container *c, iam_ptr_t ptr, ++ handle_t *h, struct buffer_head **bh); ++ ++/* ++ * Container format. ++ */ + struct iam_format { ++ /* ++ * Method called to recognize container format. Should return true iff ++ * container @c conforms to this format. This method may do IO to read ++ * container pages. ++ * ++ * If container is recognized, this method sets operation vectors ++ * ->id_ops and ->id_leaf_ops in container description (c->ic_descr), ++ * and fills other description fields. ++ */ + int (*if_guess)(struct iam_container *c); ++ /* ++ * Linkage into global list of container formats. ++ */ + struct list_head if_linkage; + }; + + void iam_format_register(struct iam_format *fmt); + + void iam_lfix_format_init(void); ++void iam_lvar_format_init(void); ++void iam_htree_format_init(void); ++ ++struct iam_private_info; ++ ++void ext3_iam_release(struct file *filp, struct inode *inode); ++ ++int iam_uapi_ioctl(struct inode * inode, struct file * filp, unsigned int cmd, ++ unsigned long arg); ++ ++/* dir.c */ ++#if EXT3_INVARIANT_ON ++extern int ext3_check_dir_entry(const char *, struct inode *, ++ struct ext3_dir_entry_2 *, ++ struct buffer_head *, unsigned long); ++#else ++static inline int ext3_check_dir_entry(const char * function, ++ struct inode * dir, ++ struct ext3_dir_entry_2 * de, ++ struct buffer_head * bh, ++ unsigned long offset) ++{ ++ return 1; ++} ++#endif ++ ++/* __KERNEL__ */ ++#endif ++ ++/* ++ * User level API. Copy exists in lustre/lustre/tests/iam_ut.c ++ */ ++ ++struct iam_uapi_info { ++ __u16 iui_keysize; ++ __u16 iui_recsize; ++ __u16 iui_ptrsize; ++ __u16 iui_height; ++ char iui_fmt_name[DX_FMT_NAME_LEN]; ++}; ++ ++struct iam_uapi_op { ++ void *iul_key; ++ void *iul_rec; ++}; ++ ++struct iam_uapi_it { ++ struct iam_uapi_op iui_op; ++ __u16 iui_state; ++}; ++ ++enum iam_ioctl_cmd { ++ IAM_IOC_INIT = _IOW('i', 1, struct iam_uapi_info), ++ IAM_IOC_GETINFO = _IOR('i', 2, struct iam_uapi_info), ++ IAM_IOC_INSERT = _IOR('i', 3, struct iam_uapi_op), ++ IAM_IOC_LOOKUP = _IOWR('i', 4, struct iam_uapi_op), ++ IAM_IOC_DELETE = _IOR('i', 5, struct iam_uapi_op), ++ IAM_IOC_IT_START = _IOR('i', 6, struct iam_uapi_it), ++ IAM_IOC_IT_NEXT = _IOW('i', 7, struct iam_uapi_it), ++ IAM_IOC_IT_STOP = _IOR('i', 8, struct iam_uapi_it), ++ ++ IAM_IOC_POLYMORPH = _IOR('i', 9, unsigned long) ++}; + + /* __LINUX_LUSTRE_IAM_H__ */ + #endif diff --git a/ldiskfs/kernel_patches/patches/ext3-orphans-delay.patch b/ldiskfs/kernel_patches/patches/ext3-orphans-delay.patch new file mode 100644 index 0000000..d03d74c --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-orphans-delay.patch @@ -0,0 +1,42 @@ +Index: iam/fs/ext3/super.c +=================================================================== +--- iam.orig/fs/ext3/super.c ++++ iam/fs/ext3/super.c +@@ -147,6 +147,8 @@ static void ext3_handle_error(struct sup + EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; + es->s_state |= cpu_to_le16(EXT3_ERROR_FS); + ++ dump_stack(); ++ + if (sb->s_flags & MS_RDONLY) + return; + +@@ -1168,7 +1170,7 @@ static int ext3_check_descriptors (struc + * e2fsck was run on this filesystem, and it must have already done the orphan + * inode cleanup for us, so we can safely abort without any further action. 
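
The enum iam_ioctl_cmd in the lustre_iam.h hunk above is the whole user-level entry point (the header notes a copy of these definitions in lustre/lustre/tests/iam_ut.c). A hypothetical user-space call sequence — error handling is minimal, and the exact IAM_IOC_INIT payload and key/record layouts are container-specific, defined in ext3-iam-uapi.patch rather than here:

	#include <sys/ioctl.h>
	#include <fcntl.h>
	#include <unistd.h>

	static int iam_insert_lookup(const char *path, void *key_buf, void *rec_buf)
	{
		struct iam_uapi_info info;	/* assumed filled per container format */
		struct iam_uapi_op op = {
			.iul_key = key_buf,
			.iul_rec = rec_buf,
		};
		int fd, rc;

		fd = open(path, O_RDWR);	/* an iam-formatted file */
		if (fd < 0)
			return -1;
		rc = ioctl(fd, IAM_IOC_INIT, &info);		/* attach container */
		if (rc == 0)
			rc = ioctl(fd, IAM_IOC_INSERT, &op);	/* insert (key, rec) */
		if (rc == 0)
			rc = ioctl(fd, IAM_IOC_LOOKUP, &op);	/* read rec back */
		close(fd);
		return rc;
	}
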
+ */ +-static void ext3_orphan_cleanup (struct super_block * sb, ++void ext3_orphan_cleanup (struct super_block * sb, + struct ext3_super_block * es) + { + unsigned int s_flags = sb->s_flags; +@@ -1256,7 +1258,9 @@ static void ext3_orphan_cleanup (struct + } + #endif + sb->s_flags = s_flags; /* Restore MS_RDONLY status */ ++ EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; + } ++EXPORT_SYMBOL(ext3_orphan_cleanup); + + #define log2(n) ffz(~(n)) + +@@ -1682,8 +1686,7 @@ static int ext3_fill_super (struct super + * superblock lock. + */ + EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; +- ext3_orphan_cleanup(sb, es); +- EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; ++ + if (needs_recovery) + printk (KERN_INFO "EXT3-fs: recovery complete.\n"); + ext3_mark_recovery_complete(sb, es); diff --git a/ldiskfs/kernel_patches/patches/ext3-pdirops-2.6.9.patch b/ldiskfs/kernel_patches/patches/ext3-pdirops-2.6.9.patch new file mode 100644 index 0000000..565ba60 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-pdirops-2.6.9.patch @@ -0,0 +1,1247 @@ +Index: iam/fs/ext3/namei.c +=================================================================== +--- iam.orig/fs/ext3/namei.c ++++ iam/fs/ext3/namei.c +@@ -55,18 +55,20 @@ struct buffer_head *ext3_append(handle_t + u32 *block, int *err) + { + struct buffer_head *bh; ++ struct ext3_inode_info *ei = EXT3_I(inode); + ++ /* with parallel dir operations all appends ++ * have to be serialized -bzzz */ ++ down(&ei->i_append_sem); + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + +- if ((bh = ext3_bread(handle, inode, *block, 1, err))) { ++ bh = ext3_bread(handle, inode, *block, 1, err); ++ if (bh != NULL) { + inode->i_size += inode->i_sb->s_blocksize; +- EXT3_I(inode)->i_disksize = inode->i_size; +- *err = ext3_journal_get_write_access(handle, bh); +- if (*err != 0) { +- brelse(bh); +- bh = NULL; +- } ++ ei->i_disksize = inode->i_size; + } ++ up(&ei->i_append_sem); ++ + return bh; + } + +@@ -90,7 +92,7 @@ static void dx_set_count(struct iam_entr + static void dx_set_limit(struct iam_entry *entries, unsigned value); + static unsigned dx_root_limit(struct iam_path *p); + static unsigned dx_node_limit(struct iam_path *p); +-static int dx_probe(struct dentry *dentry, ++static int dx_probe(struct qstr *name, + struct inode *dir, + struct dx_hash_info *hinfo, + struct iam_path *path); +@@ -104,7 +106,6 @@ static struct buffer_head * ext3_dx_find + struct ext3_dir_entry_2 **res_dir, int *err); + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); +- + static inline void dx_set_limit(struct iam_entry *entries, unsigned value) + { + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); +@@ -138,23 +139,20 @@ int dx_node_check(struct iam_path *p, st + iam_get_ikey(p, e, iam_path_ikey(p, 1)); + if (i > 0 && + iam_ikeycmp(c, iam_path_ikey(p, 0), +- iam_path_ikey(p, 1)) > 0) { +- BREAKPOINT(); ++ iam_path_ikey(p, 1)) > 0) + return 0; +- } + blk = dx_get_block(p, e); +- if (inode->i_size < (blk + 1) * inode->i_sb->s_blocksize) { +- BREAKPOINT(); ++ /* ++ * Disable this check as it is racy. ++ */ ++ if (0 && inode->i_size < (blk + 1) * inode->i_sb->s_blocksize) + return 0; +- } + /* + * By definition of a tree, no node points to the root. + */ +- if (blk == root) { +- BREAKPOINT(); ++ if (blk == root) + return 0; + } +- } + return 1; + } + +@@ -241,12 +239,241 @@ struct stats dx_show_entries(struct dx_h + } + #endif /* DX_DEBUG */ + +-int dx_lookup(struct iam_path *path) ++/* ++ * Per-node tree locking. 
++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ */ ++ ++/* FIXME: this should be reworked using bb_spin_lock ++ * introduced in -mm tree ++ */ ++#define BH_DXLock 25 ++ ++#define DX_DEBUG (1) ++ ++#if DX_DEBUG ++static struct dx_lock_stats { ++ unsigned dls_bh_lock; ++ unsigned dls_bh_busy; ++ unsigned dls_bh_again; ++ unsigned dls_bh_full_again; ++} dx_lock_stats = { 0, }; ++#define DX_DEVAL(x) x ++#else ++#define DX_DEVAL(x) ++#endif ++ ++static inline void dx_lock_bh(struct buffer_head volatile *bh) ++{ ++ DX_DEVAL(dx_lock_stats.dls_bh_lock++); ++#ifdef CONFIG_SMP ++ while (test_and_set_bit(BH_DXLock, &bh->b_state)) { ++ DX_DEVAL(dx_lock_stats.dls_bh_busy++); ++ while (test_bit(BH_DXLock, &bh->b_state)) ++ cpu_relax(); ++ } ++#endif ++} ++ ++static inline void dx_unlock_bh(struct buffer_head *bh) ++{ ++#ifdef CONFIG_SMP ++ smp_mb__before_clear_bit(); ++ clear_bit(BH_DXLock, &bh->b_state); ++#endif ++} ++ ++/* ++ * this locking primitives are used to protect parts ++ * of dir's htree. protection unit is block: leaf or index ++ */ ++struct dynlock_handle *dx_lock_htree(struct inode *dir, unsigned long value, ++ enum dynlock_type lt) ++{ ++ return dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, lt, GFP_NOFS); ++} ++ ++void dx_unlock_htree(struct inode *dir, struct dynlock_handle *lh) ++{ ++ if (lh != NULL) ++ dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lh); ++} ++ ++static void dx_unlock_array(struct inode *dir, struct dynlock_handle **lh) ++{ ++ int i; ++ ++ for (i = 0; i < DX_MAX_TREE_HEIGHT; ++i, ++lh) { ++ if (*lh != NULL) { ++ dx_unlock_htree(dir, *lh); ++ *lh = NULL; ++ } ++ } ++} ++ ++/* ++ * dx_find_position ++ * ++ * search position of specified hash in index ++ * ++ */ ++ ++struct iam_entry *dx_find_position(struct iam_path *path, ++ struct iam_frame *frame) ++{ ++ int count; ++ struct iam_entry *p; ++ struct iam_entry *q; ++ struct iam_entry *m; ++ ++ count = dx_get_count(frame->entries); ++ assert_corr(count && count <= dx_get_limit(frame->entries)); ++ p = iam_entry_shift(path, frame->entries, ++ dx_index_is_compat(path) ? 1 : 2); ++ q = iam_entry_shift(path, frame->entries, count - 1); ++ while (p <= q) { ++ m = iam_entry_shift(path, p, iam_entry_diff(path, q, p) / 2); ++ if (iam_ikeycmp(path->ip_container, iam_ikey_at(path, m), ++ path->ip_ikey_target) > 0) ++ q = iam_entry_shift(path, m, -1); ++ else ++ p = iam_entry_shift(path, m, +1); ++ } ++ return iam_entry_shift(path, p, -1); ++} ++ ++static iam_ptr_t dx_find_ptr(struct iam_path *path, struct iam_frame *frame) ++{ ++ return dx_get_block(path, dx_find_position(path, frame)); ++} ++ ++/* ++ * Fast check for frame consistency. ++ */ ++static int dx_check_fast(struct iam_path *path, struct iam_frame *frame) ++{ ++ struct iam_container *bag; ++ struct iam_entry *next; ++ struct iam_entry *last; ++ struct iam_entry *entries; ++ struct iam_entry *at; ++ ++ bag = path->ip_container; ++ at = frame->at; ++ entries = frame->entries; ++ last = iam_entry_shift(path, entries, dx_get_count(entries) - 1); ++ ++ if (unlikely(at > last)) ++ return -EAGAIN; ++ ++ if (unlikely(dx_get_block(path, at) != frame->leaf)) ++ return -EAGAIN; ++ ++ if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, at), ++ path->ip_ikey_target) > 0)) ++ return -EAGAIN; ++ ++ next = iam_entry_shift(path, at, +1); ++ if (next <= last) { ++ if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, next), ++ path->ip_ikey_target) <= 0)) ++ return -EAGAIN; ++ } ++ return 0; ++} ++ ++/* ++ * returns 0 if path was unchanged, -EAGAIN otherwise. 
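
dx_lock_bh()/dx_unlock_bh() above are a hand-rolled bit spinlock: one bit of b_state serves as the lock word, test_and_set_bit() is the atomic acquire, and the inner test_bit() loop spins on plain reads so a waiting CPU does not keep bouncing the cache line with atomic operations. The same idiom on a bare word, for illustration only (bit_lock/bit_unlock are illustrative names):

	static inline void bit_lock(unsigned long *word, int nr)
	{
		while (test_and_set_bit(nr, word)) {
			while (test_bit(nr, word))
				cpu_relax();	/* wait with plain loads only */
		}
	}

	static inline void bit_unlock(unsigned long *word, int nr)
	{
		smp_mb__before_clear_bit();	/* order the critical section */
		clear_bit(nr, word);
	}
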
++ */ ++static int dx_check_path(struct iam_path *path, struct iam_frame *frame) ++{ ++ int equal; ++ ++ dx_lock_bh(frame->bh); ++ equal = dx_check_fast(path, frame) == 0 || ++ frame->leaf == dx_find_ptr(path, frame); ++ DX_DEVAL(dx_lock_stats.dls_bh_again += !equal); ++ dx_unlock_bh(frame->bh); ++ ++ return equal ? 0 : -EAGAIN; ++} ++ ++/* ++ * returns 0 if path was unchanged, -EAGAIN otherwise. ++ */ ++static int dx_check_full_path(struct iam_path *path, int search) ++{ ++ struct iam_frame *bottom; ++ struct iam_frame *scan; ++ int i; ++ int result; ++ ++ do_corr(schedule()); ++ ++ for (bottom = path->ip_frames, i = 0; ++ i < DX_MAX_TREE_HEIGHT && bottom->bh != NULL; ++bottom, ++i) { ++ ; /* find last filled in frame */ ++ } ++ ++ /* ++ * Lock frames, bottom to top. ++ */ ++ for (scan = bottom - 1; scan >= path->ip_frames; --scan) ++ dx_lock_bh(scan->bh); ++ /* ++ * Check them top to bottom. ++ */ ++ result = 0; ++ for (scan = path->ip_frames; scan < bottom; ++scan) { ++ struct iam_entry *pos; ++ ++ if (search) { ++ if (dx_check_fast(path, scan) == 0) ++ continue; ++ ++ pos = dx_find_position(path, scan); ++ if (scan->leaf != dx_get_block(path, pos)) { ++ result = -EAGAIN; ++ break; ++ } ++ scan->at = pos; ++ } else { ++ pos = iam_entry_shift(path, scan->entries, ++ dx_get_count(scan->entries) - 1); ++ if (scan->at > pos || ++ scan->leaf != dx_get_block(path, scan->at)) { ++ result = -EAGAIN; ++ break; ++ } ++ } ++ } ++ ++ /* ++ * Unlock top to bottom. ++ */ ++ for (scan = path->ip_frames; scan < bottom; ++scan) ++ dx_unlock_bh(scan->bh); ++ DX_DEVAL(dx_lock_stats.dls_bh_full_again += !!result); ++ do_corr(schedule()); ++ ++ return result; ++} ++ ++static int dx_lookup_try(struct iam_path *path) + { + u32 ptr; + int err = 0; + int i; +- int delta; + + struct iam_descr *param; + struct iam_frame *frame; +@@ -255,20 +482,19 @@ int dx_lookup(struct iam_path *path) + param = iam_path_descr(path); + c = path->ip_container; + +- delta = dx_index_is_compat(path) ? 1 : 2; +- +- for (frame = path->ip_frames, i = 0, + ptr = param->id_ops->id_root_ptr(c); +- i <= path->ip_indirect; +- ptr = dx_get_block(path, frame->at), ++frame, ++i) { +- struct iam_entry *entries; +- struct iam_entry *p; +- struct iam_entry *q; +- struct iam_entry *m; +- unsigned count; +- ++ for (frame = path->ip_frames, i = 0; i <= path->ip_indirect; ++ ++frame, ++i) { + err = param->id_ops->id_node_read(c, (iam_ptr_t)ptr, NULL, + &frame->bh); ++ do_corr(schedule()); ++ ++ dx_lock_bh(frame->bh); ++ /* ++ * node must be initialized under bh lock because concurrent ++ * creation procedure may change it and dx_lookup_try() will ++ * see obsolete tree height. 
-bzzz ++ */ + if (err != 0) + break; + +@@ -283,53 +509,82 @@ int dx_lookup(struct iam_path *path) + break; + + assert_inv(dx_node_check(path, frame)); +- +- entries = frame->entries; +- count = dx_get_count(entries); +- assert_corr(count && count <= dx_get_limit(entries)); +- p = iam_entry_shift(path, entries, delta); +- q = iam_entry_shift(path, entries, count - 1); +- while (p <= q) { +- m = iam_entry_shift(path, +- p, iam_entry_diff(path, q, p) / 2); +- dxtrace(printk(".")); +- if (iam_ikeycmp(c, iam_ikey_at(path, m), +- path->ip_ikey_target) > 0) +- q = iam_entry_shift(path, m, -1); +- else +- p = iam_entry_shift(path, m, +1); ++ /* ++ * splitting may change root index block and move hash we're ++ * looking for into another index block so, we have to check ++ * this situation and repeat from begining if path got changed ++ * -bzzz ++ */ ++ if (i > 0) { ++ err = dx_check_path(path, frame - 1); ++ if (err != 0) ++ break; + } + +- frame->at = iam_entry_shift(path, p, -1); +- if (EXT3_INVARIANT_ON) { // linear search cross check +- unsigned n = count - 1; +- struct iam_entry *at; ++ frame->at = dx_find_position(path, frame); ++ frame->curidx = ptr; ++ frame->leaf = ptr = dx_get_block(path, frame->at); + +- at = entries; +- while (n--) { +- dxtrace(printk(",")); +- at = iam_entry_shift(path, at, +1); +- if (iam_ikeycmp(c, iam_ikey_at(path, at), +- path->ip_ikey_target) > 0) { +- if (at != iam_entry_shift(path, frame->at, 1)) { +- BREAKPOINT(); +- printk(KERN_EMERG "%i\n", +- iam_ikeycmp(c, iam_ikey_at(path, at), +- path->ip_ikey_target)); +- } +- at = iam_entry_shift(path, at, -1); +- break; +- } +- } +- assert_corr(at == frame->at); +- } ++ dx_unlock_bh(frame->bh); ++ do_corr(schedule()); + } + if (err != 0) +- iam_path_fini(path); ++ dx_unlock_bh(frame->bh); + path->ip_frame = --frame; + return err; + } + ++static int dx_lookup(struct iam_path *path) ++{ ++ int err; ++ int i; ++ ++ for (i = 0; i < DX_MAX_TREE_HEIGHT; ++ i) ++ assert(path->ip_frames[i].bh == NULL); ++ ++ do { ++ err = dx_lookup_try(path); ++ do_corr(schedule()); ++ if (err != 0) ++ iam_path_fini(path); ++ } while (err == -EAGAIN); ++ ++ return err; ++} ++ ++/* ++ * Performs path lookup and returns with found leaf (if any) locked by htree ++ * lock. ++ */ ++int dx_lookup_lock(struct iam_path *path, ++ struct dynlock_handle **dl, enum dynlock_type lt) ++{ ++ int result; ++ struct inode *dir; ++ ++ dir = iam_path_obj(path); ++ while ((result = dx_lookup(path)) == 0) { ++ do_corr(schedule()); ++ *dl = dx_lock_htree(dir, path->ip_frame->leaf, lt); ++ if (*dl == NULL) { ++ iam_path_fini(path); ++ result = -ENOMEM; ++ break; ++ } ++ do_corr(schedule()); ++ /* ++ * while locking leaf we just found may get split so we need ++ * to check this -bzzz ++ */ ++ if (dx_check_full_path(path, 1) == 0) ++ break; ++ dx_unlock_htree(dir, *dl); ++ *dl = NULL; ++ iam_path_fini(path); ++ } ++ return result; ++} ++ + /* + * Probe for a directory leaf block to search. + * +@@ -339,7 +594,7 @@ int dx_lookup(struct iam_path *path) + * check for this error code, and make sure it never gets reflected + * back to userspace. 
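
dx_lookup() and dx_lookup_lock() above follow the usual optimistic retry scheme: descend the tree holding only short per-block bh locks, validate the result with dx_check_path()/dx_check_full_path(), and restart from the root whenever a concurrent split invalidated the snapshot — a failed validation costs only a retry. The generic shape of the pattern, stripped of the htree specifics (a sketch; try_fn/fini_fn are illustrative):

	/* try_fn descends optimistically and returns -EAGAIN when a concurrent
	 * modification was detected; fini_fn undoes any pinning before retry. */
	static int optimistic_retry(int (*try_fn)(void *),
				    void (*fini_fn)(void *), void *arg)
	{
		int err;

		do {
			err = try_fn(arg);
			if (err != 0)
				fini_fn(arg);
		} while (err == -EAGAIN);
		return err;
	}
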
+ */ +-static int dx_probe(struct dentry *dentry, struct inode *dir, ++static int dx_probe(struct qstr *name, struct inode *dir, + struct dx_hash_info *hinfo, struct iam_path *path) + { + int err; +@@ -347,7 +602,7 @@ static int dx_probe(struct dentry *dentr + + assert_corr(path->ip_data != NULL); + ipc = container_of(path->ip_data, struct iam_path_compat, ipc_descr); +- ipc->ipc_dentry = dentry; ++ ipc->ipc_qstr = name; + ipc->ipc_hinfo = hinfo; + + assert_corr(dx_index_is_compat(path)); +@@ -356,6 +611,7 @@ static int dx_probe(struct dentry *dentr + return err; + } + ++ + /* + * This function increments the frame pointer to search the next leaf + * block, and reads in the necessary intervening nodes if the search +@@ -391,10 +647,16 @@ static int ext3_htree_advance(struct ino + * nodes need to be read. + */ + while (1) { ++ do_corr(schedule()); ++ dx_lock_bh(p->bh); + p->at = iam_entry_shift(path, p->at, +1); + if (p->at < iam_entry_shift(path, p->entries, +- dx_get_count(p->entries))) ++ dx_get_count(p->entries))) { ++ p->leaf = dx_get_block(path, p->at); ++ dx_unlock_bh(p->bh); + break; ++ } ++ dx_unlock_bh(p->bh); + if (p == path->ip_frames) + return 0; + num_frames++; +@@ -425,25 +687,125 @@ static int ext3_htree_advance(struct ino + * block so no check is necessary + */ + while (num_frames--) { ++ iam_ptr_t idx; ++ ++ do_corr(schedule()); ++ dx_lock_bh(p->bh); ++ idx = p->leaf = dx_get_block(path, p->at); ++ dx_unlock_bh(p->bh); + err = iam_path_descr(path)->id_ops-> +- id_node_read(path->ip_container, +- (iam_ptr_t)dx_get_block(path, p->at), +- NULL, &bh); ++ id_node_read(path->ip_container, idx, NULL, &bh); + if (err != 0) + return err; /* Failure */ + ++p; +- brelse (p->bh); ++ brelse(p->bh); ++ assert_corr(p->bh != bh); + p->bh = bh; + p->entries = dx_node_get_entries(path, p); + p->at = iam_entry_shift(path, p->entries, !compat); ++ assert_corr(p->curidx != idx); ++ p->curidx = idx; ++ dx_lock_bh(p->bh); ++ assert_corr(p->leaf != dx_get_block(path, p->at)); ++ p->leaf = dx_get_block(path, p->at); ++ dx_unlock_bh(p->bh); + assert_inv(dx_node_check(path, p)); + } + return 1; + } + ++int iam_index_lock(struct iam_path *path, struct dynlock_handle **lh) ++{ ++ struct iam_frame *f; ++ ++ for (f = path->ip_frame; f >= path->ip_frames; --f, ++lh) { ++ do_corr(schedule()); ++ *lh = dx_lock_htree(iam_path_obj(path), f->curidx, DLT_READ); ++ if (*lh == NULL) ++ return -ENOMEM; ++ } ++ return 0; ++} ++ ++static int iam_index_advance(struct iam_path *path) ++{ ++ return ext3_htree_advance(iam_path_obj(path), 0, path, NULL, 0); ++} ++ ++/* ++ * Advance index part of @path to point to the next leaf. Returns 1 on ++ * success, 0, when end of container was reached. Leaf node is locked. ++ */ + int iam_index_next(struct iam_container *c, struct iam_path *path) + { +- return ext3_htree_advance(c->ic_object, 0, path, NULL, 0); ++ iam_ptr_t cursor; ++ struct dynlock_handle *lh[DX_MAX_TREE_HEIGHT] = { 0, }; ++ int result; ++ struct inode *object; ++ ++ /* ++ * Locking for iam_index_next()... is to be described. 
++ */ ++ ++ object = c->ic_object; ++ cursor = path->ip_frame->leaf; ++ ++ while (1) { ++ result = iam_index_lock(path, lh); ++ do_corr(schedule()); ++ if (result < 0) ++ break; ++ ++ result = dx_check_full_path(path, 0); ++ if (result == 0 && cursor == path->ip_frame->leaf) { ++ result = iam_index_advance(path); ++ ++ assert_corr(result == 0 || ++ cursor != path->ip_frame->leaf); ++ break; ++ } ++ do { ++ dx_unlock_array(object, lh); ++ ++ iam_path_release(path); ++ do_corr(schedule()); ++ ++ result = dx_lookup(path); ++ if (result < 0) ++ break; ++ ++ while (path->ip_frame->leaf != cursor) { ++ do_corr(schedule()); ++ ++ result = iam_index_lock(path, lh); ++ do_corr(schedule()); ++ if (result < 0) ++ break; ++ ++ result = dx_check_full_path(path, 0); ++ if (result != 0) ++ break; ++ ++ result = iam_index_advance(path); ++ if (result == 0) { ++ ext3_error(object->i_sb, __FUNCTION__, ++ "cannot find cursor: %u\n", ++ cursor); ++ result = -EIO; ++ } ++ if (result < 0) ++ break; ++ result = dx_check_full_path(path, 0); ++ if (result != 0) ++ break; ++ dx_unlock_array(object, lh); ++ } ++ } while (result == -EAGAIN); ++ if (result < 0) ++ break; ++ } ++ dx_unlock_array(object, lh); ++ return result; + } + + int ext3_htree_next_block(struct inode *dir, __u32 hash, +@@ -649,14 +1011,29 @@ void iam_insert_key(struct iam_path *pat + struct iam_entry *new = iam_entry_shift(path, frame->at, +1); + int count = dx_get_count(entries); + ++ /* ++ * Unfortunately we cannot assert this, as this function is sometimes ++ * called by VFS under i_sem and without pdirops lock. ++ */ ++ assert_corr(1 || iam_frame_is_locked(path, frame)); + assert_corr(count < dx_get_limit(entries)); + assert_corr(frame->at < iam_entry_shift(path, entries, count)); ++ assert_inv(dx_node_check(path, frame)); + + memmove(iam_entry_shift(path, new, 1), new, + (char *)iam_entry_shift(path, entries, count) - (char *)new); + dx_set_ikey(path, new, key); + dx_set_block(path, new, ptr); + dx_set_count(entries, count + 1); ++ assert_inv(dx_node_check(path, frame)); ++} ++ ++void iam_insert_key_lock(struct iam_path *path, struct iam_frame *frame, ++ const struct iam_ikey *key, iam_ptr_t ptr) ++{ ++ dx_lock_bh(frame->bh); ++ iam_insert_key(path, frame, key, ptr); ++ dx_unlock_bh(frame->bh); + } + + void dx_insert_block(struct iam_path *path, struct iam_frame *frame, +@@ -882,7 +1259,7 @@ static struct buffer_head * ext3_dx_find + sb = dir->i_sb; + /* NFS may look up ".." - look at dx_root directory block */ + if (namelen > 2 || name[0] != '.'||(name[1] != '.' 
&& name[1] != '\0')){ +- *err = dx_probe(dentry, NULL, &hinfo, path); ++ *err = dx_probe(&dentry->d_name, NULL, &hinfo, path); + if (*err != 0) + return NULL; + } else { +@@ -1114,7 +1491,7 @@ struct ext3_dir_entry_2 *move_entries(st + hash2 = map[split].hash; + continued = hash2 == map[split - 1].hash; + dxtrace(printk("Split block %i at %x, %i/%i\n", +- dx_get_block(frame->at), hash2, split, count - split)); ++ frame->leaf, hash2, split, count - split)); + + /* Fancy dance to stay within two buffers */ + de2 = dx_move_dirents(data1, data2, map + split, count - split); +@@ -1484,16 +1861,38 @@ static int shift_entries(struct iam_path + (char *) iam_entry_shift(path, entries, count1), + count2 * iam_entry_size(path)); + +- dx_set_count(entries, count1); + dx_set_count(entries2, count2 + delta); + dx_set_limit(entries2, dx_node_limit(path)); + +- iam_insert_key(path, parent, pivot, newblock); ++ /* ++ * NOTE: very subtle piece of code competing dx_probe() may find 2nd ++ * level index in root index, then we insert new index here and set ++ * new count in that 2nd level index. so, dx_probe() may see 2nd level ++ * index w/o hash it looks for. the solution is to check root index ++ * after we locked just founded 2nd level index -bzzz ++ */ ++ iam_insert_key_lock(path, parent, pivot, newblock); ++ ++ /* ++ * now old and new 2nd level index blocks contain all pointers, so ++ * dx_probe() may find it in the both. it's OK -bzzz ++ */ ++ dx_lock_bh(frame->bh); ++ dx_set_count(entries, count1); ++ dx_unlock_bh(frame->bh); ++ ++ /* ++ * now old 2nd level index block points to first half of leafs. it's ++ * importand that dx_probe() must check root index block for changes ++ * under dx_lock_bh(frame->bh) -bzzz ++ */ ++ + return count1; + } + + #ifdef CONFIG_EXT3_INDEX +-int split_index_node(handle_t *handle, struct iam_path *path) ++int split_index_node(handle_t *handle, struct iam_path *path, ++ struct dynlock_handle **lh) + { + + struct iam_entry *entries; /* old block contents */ +@@ -1501,6 +1900,8 @@ int split_index_node(handle_t *handle, s + struct iam_frame *frame, *safe; + struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0}; + u32 newblock[DX_MAX_TREE_HEIGHT] = {0}; ++ struct dynlock_handle *lock[DX_MAX_TREE_HEIGHT] = {NULL,}; ++ struct dynlock_handle *new_lock[DX_MAX_TREE_HEIGHT] = {NULL,}; + struct inode *dir = iam_path_obj(path); + struct iam_descr *descr; + int nr_splet; +@@ -1523,12 +1924,14 @@ int split_index_node(handle_t *handle, s + * - first allocate all necessary blocks + * + * - insert pointers into them atomically. +- * +- * XXX nikita: this algorithm is *not* scalable, as it assumes that at +- * least nodes in the path are locked. + */ + +- /* Block full, should compress but for now just split */ ++ /* ++ * Locking: leaf is already locked. 
htree-locks are acquired on all ++ * index nodes that require split bottom-to-top, on the "safe" node, ++ * and on all new nodes ++ */ ++ + dxtrace(printk("using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); + +@@ -1536,6 +1939,7 @@ int split_index_node(handle_t *handle, s + for (nr_splet = 0; frame >= path->ip_frames && + dx_get_count(frame->entries) == dx_get_limit(frame->entries); + --frame, ++nr_splet) { ++ do_corr(schedule()); + if (nr_splet == DX_MAX_TREE_HEIGHT) { + ext3_warning(dir->i_sb, __FUNCTION__, + "Directory index full!\n"); +@@ -1545,14 +1949,53 @@ int split_index_node(handle_t *handle, s + } + + safe = frame; +- /* Go back down, allocating blocks, and adding blocks into ++ ++ /* ++ * Lock all nodes, bottom to top. ++ */ ++ for (frame = path->ip_frame, i = nr_splet; i >= 0; --i, --frame) { ++ do_corr(schedule()); ++ lock[i] = dx_lock_htree(dir, frame->curidx, DLT_WRITE); ++ if (lock[i] == NULL) { ++ err = -ENOMEM; ++ goto cleanup; ++ } ++ } ++ ++ /* ++ * Check for concurrent index modification. ++ */ ++ err = dx_check_full_path(path, 1); ++ if (err) ++ goto cleanup; ++ /* ++ * And check that the same number of nodes is to be split. ++ */ ++ for (i = 0, frame = path->ip_frame; frame >= path->ip_frames && ++ dx_get_count(frame->entries) == dx_get_limit(frame->entries); ++ --frame, ++i) { ++ ; ++ } ++ if (i != nr_splet) { ++ err = -EAGAIN; ++ goto cleanup; ++ } ++ ++ /* Go back down, allocating blocks, locking them, and adding into + * transaction... */ + for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) { + bh_new[i] = ext3_append (handle, dir, &newblock[i], &err); ++ do_corr(schedule()); + if (!bh_new[i] || + descr->id_ops->id_node_init(path->ip_container, + bh_new[i], 0) != 0) + goto cleanup; ++ new_lock[i] = dx_lock_htree(dir, newblock[i], DLT_WRITE); ++ if (new_lock[i] == NULL) { ++ err = -ENOMEM; ++ goto cleanup; ++ } ++ do_corr(schedule()); + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, frame->bh); + if (err) +@@ -1560,6 +2003,7 @@ int split_index_node(handle_t *handle, s + } + /* Add "safe" node to transaction too */ + if (safe + 1 != path->ip_frames) { ++ do_corr(schedule()); + err = ext3_journal_get_write_access(handle, safe->bh); + if (err) + goto journal_error; +@@ -1596,16 +2040,21 @@ int split_index_node(handle_t *handle, s + + assert_corr(i == 0); + ++ do_corr(schedule()); ++ + frames = path->ip_frames; + memcpy((char *) entries2, (char *) entries, + count * iam_entry_size(path)); + dx_set_limit(entries2, dx_node_limit(path)); + + /* Set up root */ ++ dx_lock_bh(frame->bh); + next = descr->id_ops->id_root_inc(path->ip_container, + path, frame); + dx_set_block(path, next, newblock[0]); ++ dx_unlock_bh(frame->bh); + ++ do_corr(schedule()); + /* Shift frames in the path */ + memmove(frames + 2, frames + 1, + (sizeof path->ip_frames) - 2 * sizeof frames[0]); +@@ -1621,10 +2070,12 @@ int split_index_node(handle_t *handle, s + err = ext3_journal_get_write_access(handle, bh2); + if (err) + goto journal_error; ++ do_corr(schedule()); + } else { + /* splitting non-root index node. */ + struct iam_frame *parent = frame - 1; + ++ do_corr(schedule()); + count = shift_entries(path, frame, count, + entries, entries2, newblock[i]); + /* Which index block gets the new entry? 
*/ +@@ -1634,7 +2085,11 @@ int split_index_node(handle_t *handle, s + frame->at = iam_entry_shift(path, entries2, + idx - count + d); + frame->entries = entries = entries2; ++ frame->curidx = newblock[i]; + swap(frame->bh, bh2); ++ assert_corr(lock[i + 1] != NULL); ++ assert_corr(new_lock[i] != NULL); ++ swap(lock[i + 1], new_lock[i]); + bh_new[i] = bh2; + parent->at = iam_entry_shift(path, + parent->at, +1); +@@ -1647,20 +2102,25 @@ int split_index_node(handle_t *handle, s + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) + goto journal_error; ++ do_corr(schedule()); + err = ext3_journal_dirty_metadata(handle, parent->bh); + if (err) + goto journal_error; + } ++ do_corr(schedule()); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + goto journal_error; ++ } + /* + * This function was called to make insertion of new leaf + * possible. Check that it fulfilled its obligations. + */ + assert_corr(dx_get_count(path->ip_frame->entries) < + dx_get_limit(path->ip_frame->entries)); +- } ++ assert_corr(lock[nr_splet] != NULL); ++ *lh = lock[nr_splet]; ++ lock[nr_splet] = NULL; + if (nr_splet > 0) { + /* + * Log ->i_size modification. +@@ -1674,6 +2134,12 @@ journal_error: + ext3_std_error(dir->i_sb, err); + + cleanup: ++ dx_unlock_array(dir, lock); ++ dx_unlock_array(dir, new_lock); ++ ++ assert_corr(err || iam_frame_is_locked(path, path->ip_frame)); ++ ++ do_corr(schedule()); + for (i = 0; i < ARRAY_SIZE(bh_new); ++i) { + if (bh_new[i] != NULL) + brelse(bh_new[i]); +@@ -1695,18 +2161,18 @@ static int ext3_dx_add_entry(handle_t *h + struct buffer_head * bh = NULL; + struct inode *dir = dentry->d_parent->d_inode; + struct ext3_dir_entry_2 *de; ++ struct dynlock_handle *dummy = NULL; + int err; + size_t isize; + + iam_path_compat_init(&cpath, dir); + param = iam_path_descr(path); + +- err = dx_probe(dentry, NULL, &hinfo, path); ++ err = dx_probe(&dentry->d_name, NULL, &hinfo, path); + if (err != 0) + return err; + frame = path->ip_frame; + +- /* XXX nikita: global serialization! */ + isize = dir->i_size; + + err = param->id_ops->id_node_read(path->ip_container, +@@ -1726,7 +2192,7 @@ static int ext3_dx_add_entry(handle_t *h + goto cleanup; + } + +- err = split_index_node(handle, path); ++ err = split_index_node(handle, path, &dummy); + if (err) + goto cleanup; + +@@ -1742,6 +2208,7 @@ static int ext3_dx_add_entry(handle_t *h + journal_error: + ext3_std_error(dir->i_sb, err); + cleanup: ++ dx_unlock_htree(dir, dummy); + if (bh) + brelse(bh); + cleanup2: +Index: iam/fs/ext3/super.c +=================================================================== +--- iam.orig/fs/ext3/super.c ++++ iam/fs/ext3/super.c +@@ -465,4 +465,8 @@ static struct inode *ext3_alloc_inode(st + ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + ei->vfs_inode.i_version = 1; ++ ++ dynlock_init(&ei->i_htree_lock); ++ sema_init(&ei->i_rename_sem, 1); ++ sema_init(&ei->i_append_sem, 1); + + memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); +Index: iam/include/linux/ext3_fs_i.h +=================================================================== +--- iam.orig/include/linux/ext3_fs_i.h ++++ iam/include/linux/ext3_fs_i.h +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + struct reserve_window { + __u32 _rsv_start; /* First byte reserved */ +@@ -127,6 +128,12 @@ struct ext3_inode_info { + * by other means, so we have truncate_sem. 
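
The dynlock initialized in ext3_alloc_inode() above gives each inode a namespace of small locks keyed by an arbitrary value — here a block number — so independent leaves and index nodes of one directory can be modified in parallel. In isolation the pattern looks like this (a sketch using the calls exactly as dx_lock_htree()/dx_unlock_htree() use them; "dir" and "blkno" are assumed):

	struct dynlock_handle *lh;

	lh = dynlock_lock(&EXT3_I(dir)->i_htree_lock, blkno, DLT_WRITE, GFP_NOFS);
	if (lh == NULL)
		return -ENOMEM;
	/* ... block blkno of this directory is now held exclusively ... */
	dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lh);
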
+ */ + struct semaphore truncate_sem; ++ ++ /* following fields for parallel directory operations -bzzz */ ++ struct dynlock i_htree_lock; ++ struct semaphore i_append_sem; ++ struct semaphore i_rename_sem; ++ + struct inode vfs_inode; + + __u32 i_cached_extent[4]; +Index: iam/include/linux/lustre_iam.h +=================================================================== +--- iam.orig/include/linux/lustre_iam.h ++++ iam/include/linux/lustre_iam.h +@@ -39,6 +39,9 @@ enum { + * Maximal number of non-leaf levels in htree. In the stock ext3 this + * is 2. + */ ++ /* ++ * XXX reduced back to 2 to make per-node locking work. ++ */ + DX_MAX_TREE_HEIGHT = 5, + /* + * Scratch keys used by generic code for temporaries. +@@ -62,7 +65,7 @@ enum { + /* + * Maximal format name length. + */ +- DX_FMT_NAME_LEN = 16 ++ DX_FMT_NAME_LEN = 16, + }; + + #ifdef __KERNEL__ +@@ -133,8 +136,10 @@ enum { + + #if EXT3_CORRECTNESS_ON + #define assert_corr(test) J_ASSERT(test) ++#define do_corr(exp) exp + #else + #define assert_corr(test) do {;} while (0) ++#define do_corr(exp) do {;} while (0) + #endif + + #if EXT3_INVARIANT_ON +@@ -179,7 +184,7 @@ struct iam_ikey; + * support interfaces like readdir(), where iteration over index has to be + * re-startable. + */ +-typedef __u64 iam_ptr_t; ++typedef __u32 iam_ptr_t; + + /* + * Index node traversed during tree lookup. +@@ -188,6 +193,11 @@ struct iam_frame { + struct buffer_head *bh; /* buffer holding node data */ + struct iam_entry *entries; /* array of entries */ + struct iam_entry *at; /* target entry, found by binary search */ ++ iam_ptr_t leaf; /* (logical) offset of child node found by ++ * binary search. */ ++ iam_ptr_t curidx; /* (logical) offset of this node. Used to ++ * per-node locking to detect concurrent ++ * splits. */ + }; + + /* +@@ -205,6 +215,11 @@ struct iam_leaf { + struct buffer_head *il_bh; + struct iam_lentry *il_entries; + struct iam_lentry *il_at; ++ /* ++ * Lock on a leaf node. ++ */ ++ struct dynlock_handle *il_lock; ++ iam_ptr_t il_curidx; /* logical offset of leaf node. */ + void *il_descr_data; + }; + +@@ -215,19 +230,23 @@ enum iam_lookup_t { + /* + * lookup found a record with the key requested + */ +- IAM_LOOKUP_EXACT, ++ IAM_LOOKUP_EXACT = 0, + /* + * lookup positioned leaf on some record + */ +- IAM_LOOKUP_OK, ++ IAM_LOOKUP_OK = 1, + /* + * leaf was empty + */ +- IAM_LOOKUP_EMPTY, ++ IAM_LOOKUP_EMPTY = 2, + /* + * lookup positioned leaf before first record + */ +- IAM_LOOKUP_BEFORE ++ IAM_LOOKUP_BEFORE = 3, ++ /* ++ * Found hash may have a continuation in the next leaf. ++ */ ++ IAM_LOOKUP_LAST = 0x100 + }; + + /* +@@ -270,9 +289,9 @@ struct iam_operations { + struct iam_path *path, + struct iam_frame *frame); + +- struct iam_path_descr *(*id_ipd_alloc)(const struct iam_container *c); +- void (*id_ipd_free)(const struct iam_container *c, +- struct iam_path_descr *ipd); ++ struct iam_path_descr *(*id_ipd_alloc)(const struct iam_container *c, ++ void *area); ++ void (*id_ipd_free)(struct iam_path_descr *ipd); + /* + * Format name. 
+ */ +@@ -329,8 +348,10 @@ struct iam_leaf_operations { + + void (*key_set)(struct iam_leaf *l, const struct iam_key *k); + void (*rec_set)(struct iam_leaf *l, const struct iam_rec *r); ++ void (*rec_get)(const struct iam_leaf *l, struct iam_rec *r); + + int (*key_cmp)(const struct iam_leaf *l, const struct iam_key *k); ++ int (*key_eq)(const struct iam_leaf *l, const struct iam_key *k); + + int (*key_size)(const struct iam_leaf *l); + /* +@@ -473,11 +494,23 @@ struct iam_path_compat { + struct iam_container ipc_container; + __u32 ipc_scratch[DX_SCRATCH_KEYS]; + struct dx_hash_info *ipc_hinfo; +- struct dentry *ipc_dentry; ++ struct qstr *ipc_qstr; + struct iam_path_descr ipc_descr; + struct dx_hash_info ipc_hinfo_area; + }; + ++#define const_max(p, q) ((p > q) ? p : q) ++ ++enum { ++ DX_MAX_IKEY_SIZE = 32, /* be generous */ ++ /* ++ * Hack to avoid dynamic allocation and freeing of ipd. ++ */ ++ DX_IPD_MAX_SIZE = const_max(sizeof(struct iam_path_compat), ++ DX_MAX_IKEY_SIZE * DX_SCRATCH_KEYS + ++ sizeof(struct iam_path_descr)) ++}; ++ + /* + * iam cursor (iterator) api. + */ +@@ -554,6 +587,7 @@ struct iam_iterator { + void iam_path_init(struct iam_path *path, struct iam_container *c, + struct iam_path_descr *pd); + void iam_path_fini(struct iam_path *path); ++void iam_path_release(struct iam_path *path); + + void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode); + void iam_path_compat_fini(struct iam_path_compat *path); +@@ -683,12 +717,6 @@ static inline int iam_ikeycmp(const stru + return c->ic_descr->id_ops->id_ikeycmp(c, k1, k2); + } + +-static inline void iam_reccpy(const struct iam_path *p, struct iam_rec *rec_dst, +- const struct iam_rec *rec_src) +-{ +- memcpy(rec_dst, rec_src, iam_path_descr(p)->id_rec_size); +-} +- + static inline void *iam_entry_off(struct iam_entry *entry, size_t off) + { + return (void *)((char *)entry + off); +@@ -720,6 +748,11 @@ iam_leaf_ops(const struct iam_leaf *leaf + return iam_leaf_descr(leaf)->id_leaf_ops; + } + ++static inline void iam_reccpy(const struct iam_leaf *leaf, ++ struct iam_rec *rec_dst) ++{ ++ iam_leaf_ops(leaf)->rec_get(leaf, rec_dst); ++} + + /*XXX These stuff put here, just because they are used by iam.c and namei.c*/ + static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry) +@@ -848,7 +881,36 @@ static inline struct iam_ikey *iam_path_ + return path->ip_data->ipd_key_scratch[nr]; + } + +-int dx_lookup(struct iam_path *path); ++static inline struct dynlock *path_dynlock(struct iam_path *path) ++{ ++ return &EXT3_I(iam_path_obj(path))->i_htree_lock; ++} ++ ++static inline int iam_leaf_is_locked(const struct iam_leaf *leaf) ++{ ++ int result; ++ ++ result = dynlock_is_locked(path_dynlock(leaf->il_path), ++ leaf->il_curidx); ++ if (!result) ++ dump_stack(); ++ return result; ++} ++ ++static inline int iam_frame_is_locked(struct iam_path *path, ++ const struct iam_frame *frame) ++{ ++ int result; ++ ++ result = dynlock_is_locked(path_dynlock(path), frame->curidx); ++ if (!result) ++ dump_stack(); ++ return result; ++} ++ ++int dx_lookup_lock(struct iam_path *path, ++ struct dynlock_handle **dl, enum dynlock_type lt); ++ + void dx_insert_block(struct iam_path *path, struct iam_frame *frame, + u32 hash, u32 block); + int dx_index_is_compat(struct iam_path *path); +@@ -858,7 +920,8 @@ int ext3_htree_next_block(struct inode * + + struct buffer_head *ext3_append(handle_t *handle, struct inode *inode, + u32 *block, int *err); +-int split_index_node(handle_t *handle, struct iam_path *path); ++int 
split_index_node(handle_t *handle, struct iam_path *path, ++ struct dynlock_handle **lh); + struct ext3_dir_entry_2 *split_entry(struct inode *dir, + struct ext3_dir_entry_2 *de, + unsigned long ino, mode_t mode, +@@ -874,6 +937,10 @@ struct ext3_dir_entry_2 *move_entries(st + + extern struct iam_descr iam_htree_compat_param; + ++struct dynlock_handle *dx_lock_htree(struct inode *dir, unsigned long value, ++ enum dynlock_type lt); ++void dx_unlock_htree(struct inode *dir, struct dynlock_handle *lh); ++ + /* + * external + */ +@@ -889,7 +956,7 @@ int iam_read_leaf(struct iam_path *p); + int iam_node_read(struct iam_container *c, iam_ptr_t ptr, + handle_t *handle, struct buffer_head **bh); + +-void iam_insert_key(struct iam_path *path, struct iam_frame *frame, ++void iam_insert_key_lock(struct iam_path *path, struct iam_frame *frame, + const struct iam_ikey *key, iam_ptr_t ptr); + + int iam_leaf_at_end(const struct iam_leaf *l); diff --git a/ldiskfs/kernel_patches/patches/ext3-tall-htree.patch b/ldiskfs/kernel_patches/patches/ext3-tall-htree.patch new file mode 100644 index 0000000..5021759 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-tall-htree.patch @@ -0,0 +1,431 @@ +Index: linux-2.6.9/fs/ext3/namei.c +=================================================================== +--- linux-2.6.9.orig/fs/ext3/namei.c 2006-04-23 22:35:38.000000000 +0800 ++++ linux-2.6.9/fs/ext3/namei.c 2006-04-23 22:35:47.000000000 +0800 +@@ -48,6 +48,11 @@ + #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + ++/* ++ * Maximal number of non-leaf levels in htree. In the stock ext3 this is 2. ++ */ ++#define DX_MAX_TREE_HEIGHT (5) ++ + static struct buffer_head *ext3_append(handle_t *handle, + struct inode *inode, + u32 *block, int *err) +@@ -75,7 +80,7 @@ + #ifdef DX_DEBUG + #define dxtrace(command) command + #else +-#define dxtrace(command) ++#define dxtrace(command) + #endif + + struct fake_dirent +@@ -168,7 +173,7 @@ + static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); + static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, +- struct dx_frame *frames, ++ struct dx_frame *frames, + __u32 *start_hash); + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, + struct ext3_dir_entry_2 **res_dir, int *err); +@@ -249,7 +254,7 @@ + } + + struct stats +-{ ++{ + unsigned names; + unsigned space; + unsigned bcount; +@@ -367,7 +372,7 @@ + goto fail; + } + +- if ((indirect = root->info.indirect_levels) > 1) { ++ if ((indirect = root->info.indirect_levels) > DX_MAX_TREE_HEIGHT - 1) { + ext3_warning(dir->i_sb, __FUNCTION__, + "Unimplemented inode hash depth: %#06x", + root->info.indirect_levels); +@@ -436,12 +441,15 @@ + + static void dx_release (struct dx_frame *frames) + { ++ int height; ++ + if (frames[0].bh == NULL) + return; +- +- if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) +- brelse(frames[1].bh); +- brelse(frames[0].bh); ++ height = ((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels; ++ for (; height >= 0; height--) { ++ assert(frames[height].bh != NULL); ++ brelse(frames[height].bh); ++ } + } + + /* +@@ -463,7 +471,7 @@ + */ + static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, +- struct dx_frame *frames, ++ struct dx_frame *frames, + __u32 *start_hash) + { + struct dx_frame *p; +@@ -582,7 +590,7 @@ + { + struct dx_hash_info hinfo; + struct ext3_dir_entry_2 *de; +- struct dx_frame frames[2], 
*frame; ++ struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame; + struct inode *dir; + int block, err; + int count = 0; +@@ -627,7 +635,7 @@ + } + count += ret; + hashval = ~0; +- ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, ++ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, + frame, frames, &hashval); + *next_hash = hashval; + if (ret < 0) { +@@ -644,7 +652,7 @@ + break; + } + dx_release(frames); +- dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", ++ dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", + count, *next_hash)); + return count; + errout: +@@ -918,7 +926,7 @@ + struct super_block * sb; + struct dx_hash_info hinfo; + u32 hash; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame; + struct ext3_dir_entry_2 *de, *top; + struct buffer_head *bh; + unsigned long block; +@@ -1037,7 +1045,7 @@ + parent = ERR_PTR(-ENOMEM); + } + return parent; +-} ++} + + #define S_SHIFT 12 + static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = { +@@ -1098,6 +1106,8 @@ + return prev; + } + ++/* Allocate new node, and split leaf node @bh into it, inserting new pointer ++ * into parent node identified by @frame */ + static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + struct buffer_head **bh,struct dx_frame *frame, + struct dx_hash_info *hinfo, int *error) +@@ -1185,7 +1195,7 @@ + * add_dirent_to_buf will attempt search the directory block for + * space. It will return -ENOSPC if no space is available, and -EIO + * and -EEXIST if directory entry already exists. +- * ++ * + * NOTE! bh is NOT released in the case where ENOSPC is returned. In + * all other cases bh is released. + */ +@@ -1286,7 +1296,7 @@ + int namelen = dentry->d_name.len; + struct buffer_head *bh2; + struct dx_root *root; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame; + struct dx_entry *entries; + struct ext3_dir_entry_2 *de, *de2; + char *data1, *top; +@@ -1427,20 +1437,29 @@ + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) + { +- struct dx_frame frames[2], *frame; +- struct dx_entry *entries, *at; ++ struct dx_frame frames[DX_MAX_TREE_HEIGHT] = {{0,},}, *frame, *safe; ++ struct dx_node *node2; ++ struct dx_entry *entries; /* old block contents */ ++ struct dx_entry *entries2; /* new block contents */ + struct dx_hash_info hinfo; + struct buffer_head * bh; ++ struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0}; + struct inode *dir = dentry->d_parent->d_inode; + struct super_block * sb = dir->i_sb; + struct ext3_dir_entry_2 *de; ++ u32 newblock[DX_MAX_TREE_HEIGHT] = {0}; + int err; ++ int nr_splet; ++ int i; ++ size_t isize; + + frame = dx_probe(dentry, NULL, &hinfo, frames, &err); + if (!frame) + return err; + entries = frame->entries; +- at = frame->at; ++ ++ /* XXX nikita: global serialization! */ ++ isize = dir->i_size; + + if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + goto cleanup; +@@ -1456,29 +1475,43 @@ + goto cleanup; + } + ++ /* ++ * Tall-tree handling: we might have to split multiple index blocks ++ * all the way up to tree root. Tricky point here is error handling: ++ * to avoid complicated undo/rollback we ++ * ++ * - first allocate all necessary blocks ++ * ++ * - insert pointers into them atomically. ++ * ++ * XXX nikita: this algorithm is *not* scalable, as it assumes that at ++ * least nodes in the path are locked. 
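
For scale, using the structures shown earlier (8-byte dx_entry, 4096-byte blocks): a non-root index block holds (4096 - 8) / 8 = 511 entries, and the root (4096 - 32) / 8 = 508, the root giving up 12 + 12 bytes to the "." and ".." stub dirents plus 8 bytes of dx_root_info. Raising DX_MAX_TREE_HEIGHT from the stock 2 to 5 thus lifts the limit from 508 x 511 (about 2.6 x 10^5) leaf blocks to 508 x 511^4 (about 3.5 x 10^13) — though in practice the 24-bit block pointers masked in dx_get_block() cap a directory at 2^24 blocks well before that.
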
++ */ ++ + /* Block full, should compress but for now just split */ + dxtrace(printk("using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); +- /* Need to split index? */ +- if (dx_get_count(entries) == dx_get_limit(entries)) { +- u32 newblock; +- unsigned icount = dx_get_count(entries); +- int levels = frame - frames; +- struct dx_entry *entries2; +- struct dx_node *node2; +- struct buffer_head *bh2; + +- if (levels && (dx_get_count(frames->entries) == +- dx_get_limit(frames->entries))) { ++ /* What levels need split? */ ++ for (nr_splet = 0; frame >= frames && ++ dx_get_count(frame->entries) == dx_get_limit(frame->entries); ++ --frame, ++nr_splet) { ++ if (nr_splet == DX_MAX_TREE_HEIGHT) { + ext3_warning(sb, __FUNCTION__, + "Directory index full!\n"); + err = -ENOSPC; + goto cleanup; + } +- bh2 = ext3_append (handle, dir, &newblock, &err); +- if (!(bh2)) ++ } ++ ++ safe = frame; ++ /* Go back down, allocating blocks, and adding blocks into ++ * transaction... */ ++ for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) { ++ bh_new[i] = ext3_append (handle, dir, &newblock[i], &err); ++ if (!bh_new[i]) + goto cleanup; +- node2 = (struct dx_node *)(bh2->b_data); ++ node2 = (struct dx_node *)(bh_new[i]->b_data); + entries2 = node2->entries; + node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); + node2->fake.inode = 0; +@@ -1486,72 +1519,112 @@ + err = ext3_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; +- if (levels) { +- unsigned icount1 = icount/2, icount2 = icount - icount1; +- unsigned hash2 = dx_get_hash(entries + icount1); +- dxtrace(printk("Split index %i/%i\n", icount1, icount2)); +- +- BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ +- err = ext3_journal_get_write_access(handle, +- frames[0].bh); ++ } ++ /* Add "safe" node to transaction too */ ++ if (safe + 1 != frames) { ++ err = ext3_journal_get_write_access(handle, safe->bh); ++ if (err) ++ goto journal_error; ++ } ++ ++ /* Go through nodes once more, inserting pointers */ ++ for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) { ++ unsigned count; ++ int idx; ++ struct buffer_head *bh2; ++ ++ entries = frame->entries; ++ count = dx_get_count(entries); ++ idx = frame->at - entries; ++ ++ bh2 = bh_new[i]; ++ node2 = (struct dx_node *)(bh2->b_data); ++ entries2 = node2->entries; ++ ++ if (frame == frames) { ++ /* splitting root node. Tricky point: ++ * ++ * In the "normal" B-tree we'd split root *and* add ++ * new root to the tree with pointers to the old root ++ * and its sibling (thus introducing two new nodes). ++ * ++ * In htree it's enough to add one node, because ++ * capacity of the root node is smaller than that of ++ * non-root one. 
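
Concretely, with the 508-entry root and 511-entry node limits computed above: copying the root's entries into one freshly allocated node can never overflow it, since the root count is at most 508 < 511. That is why the htree root split introduces a single new node where the classic B-tree split described in the comment would need two.
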
++ */ ++ struct dx_root *root; ++ u8 indirects; ++ ++ root = (struct dx_root *) frames->bh->b_data; ++ indirects = root->info.indirect_levels; ++ dxtrace(printk("Creating new root %d\n", indirects)); ++ memcpy((char *) entries2, (char *) entries, ++ count * sizeof(struct dx_entry)); ++ dx_set_limit(entries2, dx_node_limit(dir)); ++ ++ /* Set up root */ ++ dx_set_count(entries, 1); ++ dx_set_block(entries + 0, newblock[i]); ++ root->info.indirect_levels = indirects + 1; ++ ++ /* Shift frames in the path */ ++ memmove(frames + 2, frames + 1, ++ (sizeof frames) - 2 * sizeof frames[0]); ++ /* Add new access path frame */ ++ frames[1].at = entries2 + idx; ++ frames[1].entries = entries = entries2; ++ frames[1].bh = bh2; ++ ++ frame; ++ bh_new[i] = NULL; /* buffer head is "consumed" */ ++ err = ext3_journal_get_write_access(handle, bh2); + if (err) + goto journal_error; +- +- memcpy ((char *) entries2, (char *) (entries + icount1), +- icount2 * sizeof(struct dx_entry)); +- dx_set_count (entries, icount1); +- dx_set_count (entries2, icount2); ++ } else { ++ /* splitting non-root index node. */ ++ unsigned count1 = count/2, count2 = count - count1; ++ unsigned hash2 = dx_get_hash(entries + count1); ++ dxtrace(printk("Split index %i/%i\n", count1, count2)); ++ ++ memcpy ((char *) entries2, (char *) (entries + count1), ++ count2 * sizeof(struct dx_entry)); ++ dx_set_count (entries, count1); ++ dx_set_count (entries2, count2); + dx_set_limit (entries2, dx_node_limit(dir)); + + /* Which index block gets the new entry? */ +- if (at - entries >= icount1) { +- frame->at = at = at - entries - icount1 + entries2; ++ if (idx >= count1) { ++ frame->at = entries2 + idx - count1; + frame->entries = entries = entries2; + swap(frame->bh, bh2); ++ bh_new[i] = bh2; + } +- dx_insert_block (frames + 0, hash2, newblock); +- dxtrace(dx_show_index ("node", frames[1].entries)); ++ dx_insert_block (frame - 1, hash2, newblock[i]); ++ dxtrace(dx_show_index ("node", frame->entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) + goto journal_error; +- brelse (bh2); +- } else { +- dxtrace(printk("Creating second level index...\n")); +- memcpy((char *) entries2, (char *) entries, +- icount * sizeof(struct dx_entry)); +- dx_set_limit(entries2, dx_node_limit(dir)); +- +- /* Set up root */ +- dx_set_count(entries, 1); +- dx_set_block(entries + 0, newblock); +- ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; +- +- /* Add new access path frame */ +- frame = frames + 1; +- frame->at = at = at - entries + entries2; +- frame->entries = entries = entries2; +- frame->bh = bh2; +- err = ext3_journal_get_write_access(handle, +- frame->bh); +- if (err) +- goto journal_error; + } +- ext3_journal_dirty_metadata(handle, frames[0].bh); + } +- de = do_split(handle, dir, &bh, frame, &hinfo, &err); ++ de = do_split(handle, dir, &bh, --frame, &hinfo, &err); + if (!de) + goto cleanup; + err = add_dirent_to_buf(handle, dentry, inode, de, bh); +- bh = NULL; +- goto cleanup; ++ goto cleanup2; + + journal_error: + ext3_std_error(dir->i_sb, err); + cleanup: + if (bh) + brelse(bh); ++cleanup2: ++ for (i = 0; i < ARRAY_SIZE(bh_new); ++i) { ++ if (bh_new[i] != NULL) ++ brelse(bh_new[i]); ++ } ++ if (err) ++ inode->i_size = isize; + dx_release(frames); + return err; + } +@@ -1561,7 +1634,7 @@ + * ext3_delete_entry deletes a directory entry by merging it with the + * previous entry + */ +-static int ext3_delete_entry (handle_t *handle, 
++static int ext3_delete_entry (handle_t *handle, + struct inode * dir, + struct ext3_dir_entry_2 * de_del, + struct buffer_head * bh) +@@ -1821,7 +1894,7 @@ + de1 = (struct ext3_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + if (le32_to_cpu(de->inode) != inode->i_ino || +- !le32_to_cpu(de1->inode) || ++ !le32_to_cpu(de1->inode) || + strcmp (".", de->name) || + strcmp ("..", de1->name)) { + ext3_warning (inode->i_sb, "empty_dir", +@@ -1891,7 +1964,7 @@ + * being truncated, or files being unlinked. */ + + /* @@@ FIXME: Observation from aviro: +- * I think I can trigger J_ASSERT in ext3_orphan_add(). We block ++ * I think I can trigger J_ASSERT in ext3_orphan_add(). We block + * here (on lock_super()), so race with ext3_link() which might bump + * ->i_nlink. For, say it, character device. Not a regular file, + * not a directory, not a symlink and ->i_nlink > 0. +@@ -2415,4 +2488,4 @@ + .removexattr = generic_removexattr, + #endif + .permission = ext3_permission, +-}; ++}; diff --git a/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-rhel4.patch index f71e470005..adba428 100644 --- a/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-rhel4.patch +++ b/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-rhel4.patch @@ -91,17 +91,9 @@ diff -urp RH_2_6_9_42_0_3.orig/fs/ext3/ioctl.c RH_2_6_9_42_0_3/fs/ext3/ioctl.c diff -urp RH_2_6_9_42_0_3.orig/fs/ext3/namei.c RH_2_6_9_42_0_3/fs/ext3/namei.c --- RH_2_6_9_42_0_3.orig/fs/ext3/namei.c 2006-10-23 13:32:59.000000000 +0300 +++ RH_2_6_9_42_0_3/fs/ext3/namei.c 2007-02-22 18:58:13.000000000 +0200 -@@ -97,6 +97,7 @@ struct dx_entry - __le32 block; - }; - -+ - /* - * dx_root_info is laid out so that if it should somehow get overlaid by a - * dirent the two low bits of the hash version will be zero. Therefore, the -@@ -141,6 +142,14 @@ struct dx_map_entry - u32 offs; - }; +@@ -1624,6 +1633,28 @@ static int ext3_add_nondir(handle_t *han + return err; + } +#define LVFS_DENTRY_PARAM_MAGIC 20070216UL +struct lvfs_dentry_params @@ -110,14 +102,7 @@ diff -urp RH_2_6_9_42_0_3.orig/fs/ext3/namei.c RH_2_6_9_42_0_3/fs/ext3/namei.c + void *p_ptr; + u32 magic; +}; -+ - #ifdef CONFIG_EXT3_INDEX - static inline unsigned dx_get_block (struct dx_entry *entry); - static void dx_set_block (struct dx_entry *entry, unsigned value); -@@ -1624,6 +1633,20 @@ static int ext3_add_nondir(handle_t *han - return err; - } - ++ +static struct inode * ext3_new_inode_wantedi(handle_t *handle, struct inode *dir, + int mode, struct dentry *dentry) +{ diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series index 7c2246f..a4cc364 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series @@ -14,8 +14,19 @@ ext3-mballoc3-core.patch ext3-mballoc3-rhel4.patch ext3-nlinks-2.6.9.patch ext3-ialloc-2.6.patch -ext3-lookup-dotdot-2.6.9.patch -ext3-sector_t-overflow-2.6.9-rhel4.patch -ext3-check-jbd-errors-2.6.9.patch +ext3-tall-htree.patch +ext3-htree-path.patch +ext3-htree-r5-hash.patch +ext3-htree-path-ops.patch +ext3-hash-selection.patch +ext3-htree-comments.patch +ext3-lookup-dotdot-2.6.9.patch +ext3-sector_t-overflow-2.6.9-rhel4.patch +ext3-check-jbd-errors-2.6.9.patch ext3-uninit-2.6.9.patch ext3-nanosecond-2.6-rhel4.patch +ext3-iam-ops.patch +ext3-iam-separate.patch +ext3-iam-uapi.patch +ext3-orphans-delay.patch +ext3-pdirops-2.6.9.patch -- 1.8.3.1
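Illustrative sketch, not part of any patch above: the tall-htree code in
ext3_dx_add_entry splits every full index block on the path in two passes,
allocate first and link second, so that a mid-way allocation failure leaves
the tree untouched. The same strategy reduced to a self-contained toy, where
toy_node, toy_split_path and TOY_MAX_HEIGHT are made-up names standing in for
the dx_* machinery:

#include <stdlib.h>
#include <string.h>

#define TOY_MAX_HEIGHT	5	/* analogous to DX_MAX_TREE_HEIGHT */

struct toy_node {
	int count;		/* entries in use, like dx_get_count() */
	int limit;		/* capacity (<= 16 here), like dx_get_limit() */
	int entry[16];		/* toy payload */
};

/* Split every full node on the path, leaf to root.  Phase 1 only
 * allocates; phase 2 only moves entries and cannot fail, so no
 * undo/rollback is ever needed. */
int toy_split_path(struct toy_node **path, int depth)
{
	struct toy_node *fresh[TOY_MAX_HEIGHT] = { NULL };
	int nr_split = 0;
	int i;

	/* How many levels, counting up from the leaf, are full? */
	while (nr_split < depth &&
	       path[depth - 1 - nr_split]->count ==
	       path[depth - 1 - nr_split]->limit)
		++nr_split;

	/* Phase 1: allocate everything; the tree is not modified yet. */
	for (i = 0; i < nr_split; ++i) {
		fresh[i] = calloc(1, sizeof *fresh[i]);
		if (fresh[i] == NULL) {
			while (i-- > 0)
				free(fresh[i]);
			return -1;	/* tree left exactly as found */
		}
		fresh[i]->limit = path[depth - 1 - i]->limit;
	}

	/* Phase 2: pointer surgery only; no step below can fail. */
	for (i = 0; i < nr_split; ++i) {
		struct toy_node *full = path[depth - 1 - i];
		struct toy_node *twin = fresh[i];
		int half = full->count / 2;

		memcpy(twin->entry, full->entry + half,
		       (full->count - half) * sizeof full->entry[0]);
		twin->count = full->count - half;
		full->count = half;
		/* a real tree would now insert a pointer to twin into the
		 * parent, which has room by construction */
	}
	return 0;
}

Phase 2 being infallible by construction is exactly why the patch needs no
undo path beyond releasing the not-yet-consumed blocks in its cleanup2 label.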
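The "splitting root node" comment in the same patch rests on a capacity
argument: the htree root block also carries dx_root_info and the "." and ".."
entries, so it always holds fewer dx_entry slots than an interior block. A
full root therefore always fits into one freshly allocated interior node, and
growing the tree by a level costs a single block instead of the two a
textbook B-tree root split would need. The arithmetic, using made-up stand-in
sizes rather than the real ext3 layout constants:

#include <assert.h>

enum {
	TOY_BLOCK_SIZE	  = 4096,
	TOY_ENTRY_SIZE	  = 8,	/* a hash/block pair, like struct dx_entry */
	TOY_ROOT_OVERHEAD = 40,	/* ".", ".." dirents + root info (stand-in) */
	TOY_NODE_OVERHEAD = 8	/* fake dirent only (stand-in) */
};

static int toy_root_limit(void)
{
	return (TOY_BLOCK_SIZE - TOY_ROOT_OVERHEAD) / TOY_ENTRY_SIZE;
}

static int toy_node_limit(void)
{
	return (TOY_BLOCK_SIZE - TOY_NODE_OVERHEAD) / TOY_ENTRY_SIZE;
}

int main(void)
{
	/* root_limit < node_limit, so copying a full root into one new
	 * interior node always succeeds: point the root's single entry
	 * at that node and bump indirect_levels. */
	assert(toy_root_limit() < toy_node_limit());
	return 0;
}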
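One more pattern worth spelling out is the bh_new[] handling: a slot is set
to NULL the moment the tree takes the buffer over (the "consumed" comment in
the patch), so the shared cleanup2 path can blindly brelse() whatever is
left. A sketch of the same ownership convention with hypothetical toy_*
helpers in place of ext3_append() and brelse():

#include <stdlib.h>

#define TOY_NBUF 4

struct toy_buf { char data[64]; };

/* stand-ins for ext3_append() and brelse() */
static struct toy_buf *toy_get(void)
{
	return calloc(1, sizeof(struct toy_buf));
}

static void toy_release(struct toy_buf *b)
{
	free(b);
}

/* Stands in for linking a buffer into the tree: on success the tree
 * (here simply free()) takes ownership; on failure the caller keeps it. */
static int toy_link(struct toy_buf *b, int simulate_failure)
{
	if (simulate_failure)
		return -1;
	toy_release(b);		/* "the tree" now owns the buffer */
	return 0;
}

static int toy_multi_insert(int fail_at)
{
	struct toy_buf *owned[TOY_NBUF] = { NULL };
	int i, err = 0;

	for (i = 0; i < TOY_NBUF; ++i) {
		owned[i] = toy_get();
		if (owned[i] == NULL) {
			err = -1;
			goto cleanup;
		}
	}
	for (i = 0; i < TOY_NBUF; ++i) {
		err = toy_link(owned[i], i == fail_at);
		if (err)
			goto cleanup;
		owned[i] = NULL;	/* buffer is "consumed" */
	}
cleanup:
	/* release only what the tree never took over */
	for (i = 0; i < TOY_NBUF; ++i)
		if (owned[i] != NULL)
			toy_release(owned[i]);
	return err;
}

int main(void)
{
	return toy_multi_insert(-1);	/* -1: no simulated failure */
}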