From df82caa8e667d3e1cae8657308c4d1c262b27a0b Mon Sep 17 00:00:00 2001 From: wangdi Date: Sun, 23 Apr 2006 15:33:32 +0000 Subject: [PATCH] Branch: b_new_cmd 1) merge b_iam patch to this branch 2) update prototype, add ext3-iam-ops.patch --- .../kernel_patches/series/ldiskfs-2.6-rhel4.series | 7 + .../patches/ext3-hash-selection.patch | 126 ++ .../patches/ext3-htree-comments.patch | 1643 ++++++++++++++++++++ .../patches/ext3-htree-path-ops.patch | 481 ++++-- .../kernel_patches/patches/ext3-htree-path.patch | 406 +++++ .../patches/ext3-htree-r5-hash.patch | 88 ++ lustre/kernel_patches/patches/ext3-iam-ops.patch | 646 ++++++++ .../kernel_patches/patches/ext3-tall-htree.patch | 431 +++++ .../kernel_patches/series/ldiskfs-2.6-rhel4.series | 7 + 9 files changed, 3710 insertions(+), 125 deletions(-) create mode 100644 lustre/kernel_patches/patches/ext3-hash-selection.patch create mode 100644 lustre/kernel_patches/patches/ext3-htree-comments.patch create mode 100644 lustre/kernel_patches/patches/ext3-htree-path.patch create mode 100644 lustre/kernel_patches/patches/ext3-htree-r5-hash.patch create mode 100644 lustre/kernel_patches/patches/ext3-iam-ops.patch create mode 100644 lustre/kernel_patches/patches/ext3-tall-htree.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series index bab81b9..b90ed7a 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series @@ -10,3 +10,10 @@ ext3-extents-2.6.9-rhel4.patch ext3-mballoc2-2.6.9-rhel4.patch ext3-nlinks-2.6.9.patch ext3-ialloc-2.6.patch +ext3-tall-htree.patch +ext3-htree-path.patch +ext3-htree-r5-hash.patch +ext3-htree-path-ops.patch +ext3-hash-selection.patch +ext3-htree-comments.patch +ext3-iam-ops.patch diff --git a/lustre/kernel_patches/patches/ext3-hash-selection.patch b/lustre/kernel_patches/patches/ext3-hash-selection.patch new file mode 100644 index 0000000..12343cc --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-hash-selection.patch @@ -0,0 +1,126 @@ +Index: linux-2.6.9/fs/ext3/hash.c +=================================================================== +--- linux-2.6.9.orig/fs/ext3/hash.c 2006-04-23 22:39:01.000000000 +0800 ++++ linux-2.6.9/fs/ext3/hash.c 2006-04-23 22:39:16.000000000 +0800 +@@ -127,6 +127,11 @@ + return a; + } + ++static __u32 dx_same_hash(const signed char *msg, int len) ++{ ++ return 0xcafebabeUL; ++} ++ + static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) + { + __u32 pad, val; +@@ -220,6 +225,9 @@ + case DX_HASH_R5: + hash = dx_r5_hash(name, len); + break; ++ case DX_HASH_SAME: ++ hash = dx_same_hash(name, len); ++ break; + default: + hinfo->hash = 0; + return -1; +Index: linux-2.6.9/fs/ext3/super.c +=================================================================== +--- linux-2.6.9.orig/fs/ext3/super.c 2006-04-23 22:38:55.000000000 +0800 ++++ linux-2.6.9/fs/ext3/super.c 2006-04-23 22:39:56.000000000 +0800 +@@ -598,6 +598,7 @@ + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_extents, Opt_extdebug, Opt_mballoc, ++ Opt_hashfunc, + }; + + static match_table_t tokens = { +@@ -651,6 +652,7 @@ + {Opt_extdebug, "extdebug"}, + {Opt_mballoc, "mballoc"}, + {Opt_barrier, "barrier=%u"}, ++ {Opt_hashfunc,"hash=%s"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, + }; +@@ -675,6 +677,7 @@ + return sb_block; + } + ++int user_selected_hash_function = -1; + static int parse_options (char * options, struct super_block *sb, + 
unsigned long * inum, unsigned long *n_blocks_count, int is_remount) + { +@@ -963,6 +966,23 @@ + case Opt_mballoc: + set_opt (sbi->s_mount_opt, MBALLOC); + break; ++ case Opt_hashfunc: ++ if (strncmp (args[0].from,"legacy",6) == 0){ ++ user_selected_hash_function = 0; ++ } else if (strncmp (args[0].from,"half_md4",8) == 0){ ++ user_selected_hash_function = 1; ++ } else if (strncmp (args[0].from,"tea",3) == 0){ ++ user_selected_hash_function = 2; ++ } else if (strncmp (args[0].from,"r5",2) == 0){ ++ user_selected_hash_function = 3; ++ } else if (strncmp (args[0].from,"same",4) == 0){ ++ user_selected_hash_function = 4; ++ } else { ++ printk ("Hashfunc name wrong\n"); ++ return 0; ++ } ++ break; ++ + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +Index: linux-2.6.9/fs/ext3/namei.c +=================================================================== +--- linux-2.6.9.orig/fs/ext3/namei.c 2006-04-23 22:39:02.000000000 +0800 ++++ linux-2.6.9/fs/ext3/namei.c 2006-04-23 22:39:16.000000000 +0800 +@@ -365,10 +365,7 @@ + struct htree_cookie *hc = cookie; + + root = data; +- if (root->info.hash_version != DX_HASH_TEA && +- root->info.hash_version != DX_HASH_HALF_MD4 && +- root->info.hash_version != DX_HASH_R5 && +- root->info.hash_version != DX_HASH_LEGACY) { ++ if (root->info.hash_version > DX_HASH_MAX) { + ext3_warning(sb, __FUNCTION__, + "Unrecognised inode hash code %d", + root->info.hash_version); +@@ -1467,6 +1464,7 @@ + * This converts a one block unindexed directory to a 3 block indexed + * directory, and adds the dentry to the indexed directory. + */ ++extern int user_selected_hash_function; + static int make_indexed_dir(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct buffer_head *bh) + { +@@ -1522,7 +1520,9 @@ + memset (&root->info, 0, sizeof(root->info)); + root->info.info_length = sizeof(root->info); + root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; +- root->info.hash_version = DX_HASH_R5; ++ if (user_selected_hash_function >= 0 && ++ user_selected_hash_function <= DX_HASH_MAX) ++ root->info.hash_version = user_selected_hash_function; + entries = (void *)root->entries; + dx_set_block (&path, entries, 1); + dx_set_count (entries, 1); +Index: linux-2.6.9/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.9.orig/include/linux/ext3_fs.h 2006-04-23 22:39:01.000000000 +0800 ++++ linux-2.6.9/include/linux/ext3_fs.h 2006-04-23 22:39:16.000000000 +0800 +@@ -665,6 +665,8 @@ + #define DX_HASH_HALF_MD4 1 + #define DX_HASH_TEA 2 + #define DX_HASH_R5 3 ++#define DX_HASH_SAME 4 ++#define DX_HASH_MAX 4 + + /* hash info structure used by the directory hash */ + struct dx_hash_info diff --git a/lustre/kernel_patches/patches/ext3-htree-comments.patch b/lustre/kernel_patches/patches/ext3-htree-comments.patch new file mode 100644 index 0000000..159add6 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-htree-comments.patch @@ -0,0 +1,1643 @@ +Index: linux-2.6.9/fs/ext3/namei.c +=================================================================== +--- linux-2.6.9.orig/fs/ext3/namei.c 2006-04-17 18:32:12.000000000 +0800 ++++ linux-2.6.9/fs/ext3/namei.c 2006-04-23 21:40:41.000000000 +0800 +@@ -24,6 +24,78 @@ + * Theodore Ts'o, 2002 + */ + ++/* ++ * iam: big theory statement. ++ * ++ * iam (Index Access Module) is a module providing abstraction of persistent ++ * transactional container on top of generalized ext3 htree. 
++ * ++ * iam supports: ++ * ++ * - key, pointer, and record size specifiable per container. ++ * ++ * - trees taller than 2 index levels. ++ * ++ * - read/write to existing ext3 htree directories as iam containers. ++ * ++ * iam container is a tree, consisting of leaf nodes containing keys and ++ * records stored in this container, and index nodes, containing keys and ++ * pointers to leaf or index nodes. ++ * ++ * iam does not work with keys directly, instead it calls user-supplied key ++ * comparison function (->dpo_keycmp()). ++ * ++ * Pointers are (currently) interpreted as logical offsets (measured in ++ * blocksful) within underlying flat file on top of which iam tree lives. ++ * ++ * On-disk format: ++ * ++ * iam mostly tries to reuse existing htree formats. ++ * ++ * Format of index node: ++ * ++ * +-----+-------+-------+-------+------+-------+------------+ ++ * | | count | | | | | | ++ * | gap | / | entry | entry | .... | entry | free space | ++ * | | limit | | | | | | ++ * +-----+-------+-------+-------+------+-------+------------+ ++ * ++ * gap this part of node is never accessed by iam code. It ++ * exists for binary compatibility with ext3 htree (that, ++ * in turn, stores fake struct ext2_dirent for ext2 ++ * compatibility), and to keep some unspecified per-node ++ * data. Gap can be different for root and non-root index ++ * nodes. Gap size can be specified for each container ++ * (gap of 0 is allowed). ++ * ++ * count/limit current number of entries in this node, and the maximal ++ * number of entries that can fit into node. count/limit ++ * has the same size as entry, and is itself counted in ++ * count. ++ * ++ * entry index entry: consists of a key immediately followed by ++ * a pointer to a child node. Size of a key and size of a ++ * pointer depends on container. Entry has neither ++ * alignment nor padding. ++ * ++ * free space portion of node new entries are added to ++ * ++ * Entries in index node are sorted by their key value. ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ */ ++ + #include + #include + #include +@@ -98,14 +170,6 @@ + __le16 count; + }; + +-struct dx_entry; /* incomplete type */ +-struct dx_key; /* incomplete type */ +- +-struct dx_entry_compat { +- __le32 hash; +- __le32 block; +-}; +- + /* + * dx_root_info is laid out so that if it should somehow get overlaid by a + * dirent the two low bits of the hash version will be zero. Therefore, the +@@ -135,111 +199,513 @@ + struct {} entries[0]; + }; + +- +-struct dx_frame +-{ +- struct buffer_head *bh; +- struct dx_entry *entries; +- struct dx_entry *at; +-}; +- + struct dx_map_entry + { + u32 hash; + u32 offs; + }; + +-struct dx_path; +-struct dx_param { +- size_t dpo_key_size; +- size_t dpo_ptr_size; +- size_t dpo_node_gap; +- size_t dpo_root_gap; +- +- u32 (*dpo_root_ptr)(struct dx_path *path); +- int (*dpo_node_check)(struct dx_path *path, +- struct dx_frame *frame, void *cookie); +- int (*dpo_node_init)(struct dx_path *path, +- struct buffer_head *bh, int root); +- int (*dpo_keycmp)(struct dx_path *path, +- struct dx_key *k1, struct dx_key *k2); ++/* ++ * Entry within index tree node. Consists of a key immediately followed ++ * (without padding) by a pointer to the child node. ++ * ++ * Both key and pointer are of variable size, hence incomplete type. ++ */ ++struct iam_entry; ++ ++struct iam_entry_compat { ++ __le32 hash; ++ __le32 block; ++}; ++ ++/* ++ * Incomplete type used to refer to keys in iam container. 
++ * ++ * As key size can be different from container to container, iam has to use ++ * incomplete type. Clients cast pointer to iam_key to real key type and back. ++ */ ++struct iam_key; ++ ++/* Incomplete type use to refer to the records stored in iam containers. */ ++struct iam_rec; ++ ++typedef __u64 iam_ptr_t; ++ ++/* ++ * Index node traversed during tree lookup. ++ */ ++struct iam_frame { ++ struct buffer_head *bh; /* buffer holding node data */ ++ struct iam_entry *entries; /* array of entries */ ++ struct iam_entry *at; /* target entry, found by binary search */ ++}; ++ ++/* leaf node reached by tree lookup */ ++struct iam_leaf { ++ struct buffer_head *bh; ++ struct iam_leaf_entry *entries; ++ struct iam_leaf_entry *at; ++}; ++ ++struct iam_path; ++struct iam_container; ++ ++/* ++ * Parameters, describing a flavor of iam container. ++ */ ++struct iam_descr { ++ /* ++ * Size of a key in this container, in bytes. ++ */ ++ size_t id_key_size; ++ /* ++ * Size of a pointer to the next level (stored in index nodes), in ++ * bytes. ++ */ ++ size_t id_ptr_size; ++ /* ++ * Size of a record (stored in leaf nodes), in bytes. ++ */ ++ size_t id_rec_size; ++ /* ++ * Size of unused (by iam) space at the beginning of every non-root ++ * node, in bytes. Used for compatibility with ext3. ++ */ ++ size_t id_node_gap; ++ /* ++ * Size of unused (by iam) space at the beginning of root node, in ++ * bytes. Used for compatibility with ext3. ++ */ ++ size_t id_root_gap; ++ ++ /* ++ * Returns pointer (in the same sense as pointer in index entry) to ++ * the root node. ++ */ ++ __u32 (*id_root_ptr)(struct iam_container *c); ++ ++ /* ++ * Check validity and consistency of index node. This is called when ++ * iam just loaded new node into frame. ++ */ ++ int (*id_node_check)(struct iam_path *path, struct iam_frame *frame); ++ /* ++ * Initialize new node (stored in @bh) that is going to be added into ++ * tree. ++ */ ++ int (*id_node_init)(struct iam_container *c, ++ struct buffer_head *bh, int root); ++ int (*id_node_read)(struct iam_container *c, iam_ptr_t ptr, ++ handle_t *h, struct buffer_head **bh); ++ /* ++ * Key comparison function. Returns -1, 0, +1. ++ */ ++ int (*id_keycmp)(struct iam_container *c, ++ struct iam_key *k1, struct iam_key *k2); ++ /* ++ * Create new container. ++ * ++ * Newly created container has a root node and a single leaf. Leaf ++ * contains single record with the smallest possible key. ++ */ ++ int (*id_create)(struct iam_container *c); ++ struct { ++ /* ++ * leaf operations. ++ */ ++ /* ++ * returns true iff leaf is positioned at the last entry. ++ */ ++ int (*at_end)(struct iam_container *c, struct iam_leaf *l); ++ /* position leaf at the first entry */ ++ void (*start)(struct iam_container *c, struct iam_leaf *l); ++ /* more leaf to the next entry. */ ++ void (*next)(struct iam_container *c, struct iam_leaf *l); ++ /* return key of current leaf record in @k */ ++ void (*key)(struct iam_container *c, struct iam_leaf *l, ++ struct iam_key *k); ++ /* return pointer to entry body */ ++ struct iam_rec *(*rec)(struct iam_container *c, ++ struct iam_leaf *l); ++ } id_leaf; ++}; ++ ++struct iam_container { ++ /* ++ * Underlying flat file. IO against this object is issued to ++ * read/write nodes. ++ */ ++ struct inode *ic_object; ++ /* ++ * container flavor. ++ */ ++ struct iam_descr *ic_descr; ++ /* ++ * pointer to flavor-specific per-container data. ++ */ ++ void *ic_descr_data; + }; + + /* + * Structure to keep track of a path drilled through htree. 
+ */ +-struct dx_path { +- struct inode *dp_object; +- struct dx_param *dp_param; +- int dp_indirect; +- struct dx_frame dp_frames[DX_MAX_TREE_HEIGHT]; +- struct dx_frame *dp_frame; +- struct dx_key *dp_key_target; +- struct dx_key *dp_key_scratch[DX_SCRATCH_KEYS]; +-}; +- +-struct dx_path_compat { +- struct dx_path dpc_path; +- __u32 dpc_scrach[DX_SCRATCH_KEYS]; +-}; +- +-static u32 htree_root_ptr(struct dx_path *p); +-static int htree_node_check(struct dx_path *path, +- struct dx_frame *frame, void *cookie); +-static int htree_node_init(struct dx_path *path, ++struct iam_path { ++ /* ++ * Parent container. ++ */ ++ struct iam_container *ip_container; ++ /* ++ * Number of index levels minus one. ++ */ ++ int ip_indirect; ++ /* ++ * Nodes that top-to-bottom traversal passed through. ++ */ ++ struct iam_frame ip_frames[DX_MAX_TREE_HEIGHT]; ++ /* ++ * Last filled frame in ->ip_frames. Refers to the 'twig' node (one ++ * immediately above leaf). ++ */ ++ struct iam_frame *ip_frame; ++ /* ++ * Leaf node: a child of ->ip_frame. ++ */ ++ struct iam_leaf *ip_leaf; ++ /* ++ * Key searched for. ++ */ ++ struct iam_key *ip_key_target; ++ /* ++ * Scratch-pad area for temporary keys. ++ */ ++ struct iam_key *ip_key_scratch[DX_SCRATCH_KEYS]; ++ /* ++ * pointer to flavor-specific per-container data. ++ */ ++ void *ip_descr_data; ++}; ++ ++/* ++ * Helper structure for legacy htrees. ++ */ ++struct iam_path_compat { ++ struct iam_path ipc_path; ++ struct iam_container ipc_container; ++ __u32 ipc_scrach[DX_SCRATCH_KEYS]; ++}; ++ ++static u32 htree_root_ptr(struct iam_container *c); ++static int htree_node_check(struct iam_path *path, struct iam_frame *frame); ++static int htree_node_init(struct iam_container *c, + struct buffer_head *bh, int root); +-static int htree_keycmp(struct dx_path *path, +- struct dx_key *k1, struct dx_key *k2); ++static int htree_keycmp(struct iam_container *c, ++ struct iam_key *k1, struct iam_key *k2); ++static int htree_node_read(struct iam_container *c, iam_ptr_t ptr, ++ handle_t *h, struct buffer_head **bh); ++ ++/* ++ * Parameters describing iam compatibility mode in which existing ext3 htrees ++ * can be manipulated. ++ */ ++static struct iam_descr htree_compat_param = { ++ .id_key_size = sizeof ((struct dx_map_entry *)NULL)->hash, ++ .id_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs, ++ .id_node_gap = offsetof(struct dx_node, entries), ++ .id_root_gap = offsetof(struct dx_root, entries), ++ ++ .id_root_ptr = htree_root_ptr, ++ .id_node_check = htree_node_check, ++ .id_node_init = htree_node_init, ++ .id_node_read = htree_node_read, ++ .id_keycmp = htree_keycmp ++}; ++ ++ ++struct iam_key; ++struct iam_rec; ++struct iam_descr; ++struct iam_container; ++struct iam_path; + +-static struct dx_param htree_compat_param = { +- .dpo_key_size = sizeof ((struct dx_map_entry *)NULL)->hash, +- .dpo_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs, +- .dpo_node_gap = offsetof(struct dx_node, entries), +- .dpo_root_gap = offsetof(struct dx_root, entries), +- +- .dpo_root_ptr = htree_root_ptr, +- .dpo_node_check = htree_node_check, +- .dpo_node_init = htree_node_init, +- .dpo_keycmp = htree_keycmp ++/* ++ * Initialize container @c, acquires additional reference on @inode. ++ */ ++int iam_container_init(struct iam_container *c, ++ struct iam_descr *descr, struct inode *inode); ++/* ++ * Finalize container @c, release all resources. ++ */ ++void iam_container_fini(struct iam_container *c); ++ ++/* ++ * Search container @c for record with key @k. 
If record is found, its data ++ * are moved into @r. ++ * ++ * ++ * ++ * Return values: +ve: found, 0: not-found, -ve: error ++ */ ++int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r); ++/* ++ * Insert new record @r with key @k into container @c (within context of ++ * transaction @h. ++ * ++ * Return values: 0: success, -ve: error, including -EEXIST when record with ++ * given key is already present. ++ * ++ * postcondition: ergo(result == 0 || result == -EEXIST, ++ * iam_lookup(c, k, r2) > 0 && ++ * !memcmp(r, r2, c->ic_descr->id_rec_size)); ++ */ ++int iam_insert(handle_t *h, struct iam_container *c, ++ struct iam_key *k, struct iam_rec *r); ++/* ++ * Replace existing record with key @k, or insert new one. New record data are ++ * in @r. ++ * ++ * Return values: 0: success, -ve: error. ++ * ++ * postcondition: ergo(result == 0, iam_lookup(c, k, r2) > 0 && ++ * !memcmp(r, r2, c->ic_descr->id_rec_size)); ++ */ ++int iam_update(handle_t *h, struct iam_container *c, ++ struct iam_key *k, struct iam_rec *r); ++/* ++ * Delete existing record with key @k. ++ * ++ * Return values: 0: success, -ENOENT: not-found, -ve: other error. ++ * ++ * postcondition: ergo(result == 0 || result == -ENOENT, ++ * !iam_lookup(c, k, *)); ++ */ ++int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k); ++ ++/* ++ * iam cursor (iterator) api. ++ */ ++ ++/* ++ * Flags controlling iterator functionality. ++ */ ++enum iam_it_flags { ++ /* ++ * this iterator will move (iam_it_{prev,next}() will be called on it) ++ */ ++ IAM_IT_MOVE = (1 << 0), ++ /* ++ * tree can be updated through this iterator. ++ */ ++ IAM_IT_WRITE = (1 << 1) + }; + ++/* ++ * States of iterator state machine. ++ */ ++enum iam_it_state { ++ /* initial state */ ++ IAM_IT_DETACHED, ++ /* iterator is above particular record in the container */ ++ IAM_IT_ATTACHED ++}; ++ ++/* ++ * Iterator. ++ * ++ * Immediately after call to iam_it_init() iterator is in "detached" ++ * (IAM_IT_DETACHED) state: it is associated with given parent container, but ++ * doesn't point to any particular record in this container. ++ * ++ * After successful call to iam_it_get() and until corresponding call to ++ * iam_it_put() iterator is in "attached" state (IAM_IT_ATTACHED). ++ * ++ * Attached iterator can move through records in a container (provided ++ * IAM_IT_MOVE permission) in a key order, can get record and key values as it ++ * passes over them, and can modify container (provided IAM_IT_WRITE ++ * permission). ++ * ++ * Concurrency: iterators are supposed to be local to thread. Interfaces below ++ * do no internal serialization. ++ * ++ */ ++struct iam_iterator { ++ /* ++ * iterator flags, taken from enum iam_it_flags. ++ */ ++ __u32 ii_flags; ++ enum iam_it_state ii_state; ++ /* ++ * path to the record. Valid in IAM_IT_ATTACHED state. ++ */ ++ struct iam_path ii_path; ++}; ++ ++static inline struct iam_key *keycpy(struct iam_container *c, ++ struct iam_key *k1, struct iam_key *k2) ++{ ++ return memcpy(k1, k2, c->ic_descr->id_key_size); ++} ++ ++static inline int keycmp(struct iam_container *c, ++ struct iam_key *k1, struct iam_key *k2) ++{ ++ return c->ic_descr->id_keycmp(c, k1, k2); ++} ++ ++static struct iam_container *iam_it_container(struct iam_iterator *it) ++{ ++ return it->ii_path.ip_container; ++} ++ ++static inline int it_keycmp(struct iam_iterator *it, ++ struct iam_key *k1, struct iam_key *k2) ++{ ++ return keycmp(iam_it_container(it), k1, k2); ++} ++ ++/* ++ * Initialize iterator to IAM_IT_DETACHED state. 
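++ *
++ * The @flags argument is a bitmask of values from enum iam_it_flags.
++ *
++ * Illustrative sketch of a full scan over container @c starting at key @k
++ * (error handling omitted; process_rec() stands in for caller's code):
++ *
++ *	struct iam_iterator it;
++ *	int result;
++ *
++ *	iam_it_init(&it, c, IAM_IT_MOVE);
++ *	for (result = iam_it_get(&it, k);
++ *	     result == 0; result = iam_it_next(&it))
++ *		process_rec(iam_it_rec_get(&it));
++ *	iam_it_put(&it);
++ *	iam_it_fini(&it);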
++ * ++ * postcondition: it_state(it) == IAM_IT_DETACHED ++ */ ++int iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags); ++/* ++ * Finalize iterator and release all resources. ++ * ++ * precondition: it_state(it) == IAM_IT_DETACHED ++ */ ++void iam_it_fini(struct iam_iterator *it); ++ ++/* ++ * Attach iterator. After successful completion, @it points to record with the ++ * largest key not larger than @k. Semantics of ->id_create() method guarantee ++ * that such record will always be found. ++ * ++ * Return value: 0: positioned on existing record, ++ * -ve: error. ++ * ++ * precondition: it_state(it) == IAM_IT_DETACHED ++ * postcondition: ergo(result == 0, ++ * (it_state(it) == IAM_IT_ATTACHED && ++ * it_keycmp(it, iam_it_key_get(it, *), k) < 0)) ++ */ ++int iam_it_get(struct iam_iterator *it, struct iam_key *k); ++ ++/* ++ * Duplicates iterator. ++ * ++ * postcondition: it_state(dst) == it_state(src) && ++ * iam_it_container(dst) == iam_it_container(src) && ++ * dst->ii_flags = src->ii_flags && ++ * ergo(it_state(it) == IAM_IT_ATTACHED, ++ * iam_it_rec_get(dst) == iam_it_rec_get(src) && ++ * iam_it_key_get(dst, *1) == iam_it_key_get(src, *2)) ++ */ ++void iam_it_dup(struct iam_iterator *dst, struct iam_iterator *src); ++ ++/* ++ * Detach iterator. Does nothing it detached state. ++ * ++ * postcondition: it_state(it) == IAM_IT_DETACHED ++ */ ++void iam_it_put(struct iam_iterator *it); ++ ++/* ++ * Move iterator one record right. ++ * ++ * Return value: 0: success, ++ * +1: end of container reached ++ * -ve: error ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_MOVE ++ * postcondition: ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED) ++ */ ++int iam_it_next(struct iam_iterator *it); ++ ++/* ++ * Return pointer to the record under iterator. ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED ++ * postcondition: it_state(it) == IAM_IT_ATTACHED ++ */ ++const struct iam_rec *iam_it_rec_get(struct iam_iterator *it); ++ ++/* ++ * Replace contents of record under iterator. ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE ++ * postcondition: it_state(it) == IAM_IT_ATTACHED && ++ * ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...)) ++ */ ++int iam_it_rec_set(handle_t *h, struct iam_iterator *it, struct iam_rec *r); ++ ++/* ++ * Place key under iterator in @k, return @k ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED ++ * postcondition: it_state(it) == IAM_IT_ATTACHED ++ */ ++const struct iam_key *iam_it_key_get(struct iam_iterator *it, ++ struct iam_key *k); ++ ++/* ++ * Insert new record with key @k and contents from @r, shifting records to the ++ * right. ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED && ++ * it->ii_flags&IAM_IT_WRITE && ++ * it_keycmp(it, iam_it_key_get(it, *), k) < 0 ++ * postcondition: it_state(it) == IAM_IT_ATTACHED && ++ * ergo(result == 0, ++ * it_keycmp(it, iam_it_key_get(it, *), k) == 0 && ++ * !memcmp(iam_it_rec_get(it), r, ...)) ++ */ ++int iam_it_rec_insert(handle_t *h, struct iam_iterator *it, ++ struct iam_key *k, struct iam_rec *r); ++/* ++ * Delete record under iterator. 
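++ *
++ * Illustrative sketch of deleting the record with key @k, if one exists
++ * (@kbuf points to caller-supplied scratch space of at least
++ * c->ic_descr->id_key_size bytes; error handling omitted). Because
++ * iam_it_get() positions at the largest key not larger than @k, an exact
++ * match must be checked before deleting:
++ *
++ *	struct iam_iterator it;
++ *
++ *	iam_it_init(&it, c, IAM_IT_WRITE);
++ *	if (iam_it_get(&it, k) == 0 &&
++ *	    it_keycmp(&it, iam_it_key_get(&it, kbuf), k) == 0)
++ *		iam_it_rec_delete(h, &it);
++ *	iam_it_put(&it);
++ *	iam_it_fini(&it);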
++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE ++ * postcondition: it_state(it) == IAM_IT_ATTACHED ++ */ ++int iam_it_rec_delete(handle_t *h, struct iam_iterator *it); + + #ifdef CONFIG_EXT3_INDEX +-static inline unsigned dx_get_block(struct dx_path *p, struct dx_entry *entry); +-static void dx_set_block(struct dx_path *p, +- struct dx_entry *entry, unsigned value); +-static inline struct dx_key *dx_get_key(struct dx_path *p, +- struct dx_entry *entry, +- struct dx_key *key); +-static void dx_set_key(struct dx_path *p, struct dx_entry *entry, +- struct dx_key *key); +-static unsigned dx_get_count(struct dx_entry *entries); +-static unsigned dx_get_limit(struct dx_entry *entries); +-static void dx_set_count(struct dx_entry *entries, unsigned value); +-static void dx_set_limit(struct dx_entry *entries, unsigned value); +-static unsigned dx_root_limit(struct dx_path *p); +-static unsigned dx_node_limit(struct dx_path *p); ++static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry); ++static void dx_set_block(struct iam_path *p, ++ struct iam_entry *entry, unsigned value); ++static inline struct iam_key *dx_get_key(struct iam_path *p, ++ struct iam_entry *entry, ++ struct iam_key *key); ++static void dx_set_key(struct iam_path *p, struct iam_entry *entry, ++ struct iam_key *key); ++static unsigned dx_get_count(struct iam_entry *entries); ++static unsigned dx_get_limit(struct iam_entry *entries); ++static void dx_set_count(struct iam_entry *entries, unsigned value); ++static void dx_set_limit(struct iam_entry *entries, unsigned value); ++static unsigned dx_root_limit(struct iam_path *p); ++static unsigned dx_node_limit(struct iam_path *p); + static int dx_probe(struct dentry *dentry, + struct inode *dir, + struct dx_hash_info *hinfo, +- struct dx_path *path); ++ struct iam_path *path); + static int dx_make_map (struct ext3_dir_entry_2 *de, int size, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); + static void dx_sort_map(struct dx_map_entry *map, unsigned count); + static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, + struct dx_map_entry *offsets, int count); + static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); +-static void dx_insert_block (struct dx_path *path, +- struct dx_frame *frame, u32 hash, u32 block); ++static void dx_insert_block (struct iam_path *path, ++ struct iam_frame *frame, u32 hash, u32 block); + static int ext3_htree_next_block(struct inode *dir, __u32 hash, +- struct dx_path *path, __u32 *start_hash); ++ struct iam_path *path, __u32 *start_hash); + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, + struct ext3_dir_entry_2 **res_dir, int *err); + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); + +-static inline void dx_path_init(struct dx_path *path, struct inode *inode); +-static inline void dx_path_fini(struct dx_path *path); ++static inline void iam_path_init(struct iam_path *path, ++ struct iam_container *c); ++static inline void iam_path_fini(struct iam_path *path); + + + /* +@@ -247,153 +713,154 @@ + * Mask them off for now. 
+ */ + +-static inline void *entry_off(struct dx_entry *entry, ptrdiff_t off) ++static inline void *entry_off(struct iam_entry *entry, ptrdiff_t off) + { + return (void *)((char *)entry + off); + } + +-static inline size_t dx_entry_size(struct dx_path *p) ++static inline struct iam_descr *path_descr(struct iam_path *p) + { +- return p->dp_param->dpo_key_size + p->dp_param->dpo_ptr_size; ++ return p->ip_container->ic_descr; + } + +-static inline struct dx_entry *dx_entry_shift(struct dx_path *p, +- struct dx_entry *entry, int shift) ++static inline struct inode *path_obj(struct iam_path *p) ++{ ++ return p->ip_container->ic_object; ++} ++ ++static inline size_t iam_entry_size(struct iam_path *p) ++{ ++ return path_descr(p)->id_key_size + path_descr(p)->id_ptr_size; ++} ++ ++static inline struct iam_entry *iam_entry_shift(struct iam_path *p, ++ struct iam_entry *entry, int shift) + { + void *e = entry; +- return e + shift * dx_entry_size(p); ++ return e + shift * iam_entry_size(p); + } + +-static inline ptrdiff_t dx_entry_diff(struct dx_path *p, +- struct dx_entry *e1, struct dx_entry *e2) ++static inline ptrdiff_t iam_entry_diff(struct iam_path *p, ++ struct iam_entry *e1, struct iam_entry *e2) + { + ptrdiff_t diff; + + diff = (void *)e1 - (void *)e2; +- assert(diff / dx_entry_size(p) * dx_entry_size(p) == diff); +- return diff / dx_entry_size(p); ++ assert(diff / iam_entry_size(p) * iam_entry_size(p) == diff); ++ return diff / iam_entry_size(p); + } + +-static inline unsigned dx_get_block(struct dx_path *p, struct dx_entry *entry) ++static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry) + { +- return le32_to_cpu(*(u32 *)entry_off(entry, p->dp_param->dpo_key_size)) ++ return le32_to_cpu(*(u32 *)entry_off(entry, path_descr(p)->id_key_size)) + & 0x00ffffff; + } + +-static inline void dx_set_block(struct dx_path *p, +- struct dx_entry *entry, unsigned value) ++static inline void dx_set_block(struct iam_path *p, ++ struct iam_entry *entry, unsigned value) + { +- *(u32*)entry_off(entry, p->dp_param->dpo_key_size) = cpu_to_le32(value); ++ *(u32*)entry_off(entry, ++ path_descr(p)->id_key_size) = cpu_to_le32(value); + } + +-static inline struct dx_key *dx_get_key(struct dx_path *p, +- struct dx_entry *entry, +- struct dx_key *key) ++static inline struct iam_key *dx_get_key(struct iam_path *p, ++ struct iam_entry *entry, ++ struct iam_key *key) + { +- memcpy(key, entry, p->dp_param->dpo_key_size); ++ memcpy(key, entry, path_descr(p)->id_key_size); + return key; + } + +-static inline struct dx_key *dx_key_at(struct dx_path *p, +- struct dx_entry *entry) ++static inline struct iam_key *iam_key_at(struct iam_path *p, ++ struct iam_entry *entry) + { +- return (struct dx_key *)entry; ++ return (struct iam_key *)entry; + } + +-static inline void dx_set_key(struct dx_path *p, +- struct dx_entry *entry, struct dx_key *key) ++static inline void dx_set_key(struct iam_path *p, ++ struct iam_entry *entry, struct iam_key *key) + { +- memcpy(entry, key, p->dp_param->dpo_key_size); ++ memcpy(entry, key, path_descr(p)->id_key_size); + } + +-static inline unsigned dx_get_count (struct dx_entry *entries) ++static inline unsigned dx_get_count (struct iam_entry *entries) + { + return le16_to_cpu(((struct dx_countlimit *) entries)->count); + } + +-static inline unsigned dx_get_limit (struct dx_entry *entries) ++static inline unsigned dx_get_limit (struct iam_entry *entries) + { + return le16_to_cpu(((struct dx_countlimit *) entries)->limit); + } + +-static inline void dx_set_count (struct 
dx_entry *entries, unsigned value) ++static inline void dx_set_count (struct iam_entry *entries, unsigned value) + { + ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); + } + +-static inline void dx_set_limit (struct dx_entry *entries, unsigned value) ++static inline void dx_set_limit (struct iam_entry *entries, unsigned value) + { + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); + } + +-static inline unsigned dx_root_limit(struct dx_path *p) ++static inline unsigned dx_root_limit(struct iam_path *p) + { +- struct dx_param *param = p->dp_param; +- unsigned entry_space = p->dp_object->i_sb->s_blocksize - +- param->dpo_root_gap; +- return entry_space / (param->dpo_key_size + param->dpo_ptr_size); ++ struct iam_descr *param = path_descr(p); ++ unsigned entry_space = path_obj(p)->i_sb->s_blocksize - ++ param->id_root_gap; ++ return entry_space / (param->id_key_size + param->id_ptr_size); + } + +-static inline unsigned dx_node_limit(struct dx_path *p) ++static inline unsigned dx_node_limit(struct iam_path *p) + { +- struct dx_param *param = p->dp_param; +- unsigned entry_space = p->dp_object->i_sb->s_blocksize - +- param->dpo_node_gap; +- return entry_space / (param->dpo_key_size + param->dpo_ptr_size); ++ struct iam_descr *param = path_descr(p); ++ unsigned entry_space = path_obj(p)->i_sb->s_blocksize - ++ param->id_node_gap; ++ return entry_space / (param->id_key_size + param->id_ptr_size); + } + +-static inline int dx_index_is_compat(struct dx_path *path) ++static inline int dx_index_is_compat(struct iam_path *path) + { +- return path->dp_param == &htree_compat_param; ++ return path_descr(path) == &htree_compat_param; + } + +-static struct dx_entry *dx_get_entries(struct dx_path *path, void *data, ++static struct iam_entry *dx_get_entries(struct iam_path *path, void *data, + int root) + { + return data + + (root ? 
+- path->dp_param->dpo_root_gap : path->dp_param->dpo_node_gap); ++ path_descr(path)->id_root_gap : path_descr(path)->id_node_gap); + } + +-static struct dx_entry *dx_node_get_entries(struct dx_path *path, +- struct dx_frame *frame) ++static struct iam_entry *dx_node_get_entries(struct iam_path *path, ++ struct iam_frame *frame) + { + return dx_get_entries(path, +- frame->bh->b_data, frame == path->dp_frames); +-} +- +-static inline struct dx_key *keycpy(struct dx_path *p, +- struct dx_key *k1, struct dx_key *k2) +-{ +- return memcpy(k1, k2, p->dp_param->dpo_key_size); +-} +- +-static inline int keycmp(struct dx_path *p, +- struct dx_key *k1, struct dx_key *k2) +-{ +- return p->dp_param->dpo_keycmp(p, k1, k2); ++ frame->bh->b_data, frame == path->ip_frames); + } + +-static int dx_node_check(struct dx_path *p, struct dx_frame *f) ++static int dx_node_check(struct iam_path *p, struct iam_frame *f) + { +- struct dx_entry *e; ++ struct iam_entry *e; ++ struct iam_container *c; + unsigned count; + unsigned i; + ++ c = p->ip_container; + e = dx_node_get_entries(p, f); + count = dx_get_count(e); +- e = dx_entry_shift(p, e, 1); +- for (i = 0; i < count - 1; ++i, e = dx_entry_shift(p, e, 1)) { +- keycpy(p, p->dp_key_scratch[0], p->dp_key_scratch[1]); +- dx_get_key(p, e, p->dp_key_scratch[1]); ++ e = iam_entry_shift(p, e, 1); ++ for (i = 0; i < count - 1; ++i, e = iam_entry_shift(p, e, 1)) { ++ keycpy(c, p->ip_key_scratch[0], p->ip_key_scratch[1]); ++ dx_get_key(p, e, p->ip_key_scratch[1]); + if (i > 0 && +- keycmp(p, p->dp_key_scratch[0], p->dp_key_scratch[1]) > 0) ++ keycmp(c, p->ip_key_scratch[0], p->ip_key_scratch[1]) > 0) + return 0; + } + return 1; + } + +-static u32 htree_root_ptr(struct dx_path *path) ++static u32 htree_root_ptr(struct iam_container *c) + { + return 0; + } +@@ -403,20 +870,19 @@ + struct dentry *dentry; + }; + +-static int htree_node_check(struct dx_path *path, struct dx_frame *frame, +- void *cookie) ++static int htree_node_check(struct iam_path *path, struct iam_frame *frame) + { + void *data; +- struct dx_entry *entries; ++ struct iam_entry *entries; + struct super_block *sb; + + data = frame->bh->b_data; + entries = dx_node_get_entries(path, frame); +- sb = path->dp_object->i_sb; +- if (frame == path->dp_frames) { ++ sb = path_obj(path)->i_sb; ++ if (frame == path->ip_frames) { + /* root node */ + struct dx_root *root; +- struct htree_cookie *hc = cookie; ++ struct htree_cookie *hc = path->ip_descr_data; + + root = data; + if (root->info.hash_version > DX_HASH_MAX) { +@@ -433,8 +899,8 @@ + return ERR_BAD_DX_DIR; + } + +- path->dp_indirect = root->info.indirect_levels; +- if (path->dp_indirect > DX_MAX_TREE_HEIGHT - 1) { ++ path->ip_indirect = root->info.indirect_levels; ++ if (path->ip_indirect > DX_MAX_TREE_HEIGHT - 1) { + ext3_warning(sb, __FUNCTION__, + "Unimplemented inode hash depth: %#06x", + root->info.indirect_levels); +@@ -450,17 +916,17 @@ + if (hc->dentry) + ext3fs_dirhash(hc->dentry->d_name.name, + hc->dentry->d_name.len, hc->hinfo); +- path->dp_key_target = (struct dx_key *)&hc->hinfo->hash; ++ path->ip_key_target = (struct iam_key *)&hc->hinfo->hash; + } else { + /* non-root index */ +- assert(entries == data + path->dp_param->dpo_node_gap); ++ assert(entries == data + path_descr(path)->id_node_gap); + assert(dx_get_limit(entries) == dx_node_limit(path)); + } + frame->entries = frame->at = entries; + return 0; + } + +-static int htree_node_init(struct dx_path *path, ++static int htree_node_init(struct iam_container *c, + struct buffer_head *bh, int root) + 
{ + struct dx_node *node; +@@ -468,13 +934,24 @@ + assert(!root); + + node = (void *)bh->b_data; +- node->fake.rec_len = cpu_to_le16(path->dp_object->i_sb->s_blocksize); ++ node->fake.rec_len = cpu_to_le16(c->ic_object->i_sb->s_blocksize); + node->fake.inode = 0; + return 0; + } + +-static int htree_keycmp(struct dx_path *path, +- struct dx_key *k1, struct dx_key *k2) ++static int htree_node_read(struct iam_container *c, iam_ptr_t ptr, ++ handle_t *handle, struct buffer_head **bh) ++{ ++ int result = 0; ++ ++ *bh = ext3_bread(handle, c->ic_object, (int)ptr, 0, &result); ++ if (*bh == NULL) ++ result = -EIO; ++ return result; ++} ++ ++static int htree_keycmp(struct iam_container *c, ++ struct iam_key *k1, struct iam_key *k2) + { + __u32 p1 = le32_to_cpu(*(__u32 *)k1); + __u32 p2 = le32_to_cpu(*(__u32 *)k2); +@@ -486,7 +963,7 @@ + * Debug + */ + #ifdef DX_DEBUG +-static void dx_show_index (char * label, struct dx_entry *entries) ++static void dx_show_index (char * label, struct iam_entry *entries) + { + int i, n = dx_get_count (entries); + printk("%s index ", label); +@@ -535,7 +1012,7 @@ + } + + struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, +- struct dx_entry *entries, int levels) ++ struct iam_entry *entries, int levels) + { + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count = dx_get_count (entries), names = 0, space = 0, i; +@@ -565,32 +1042,33 @@ + } + #endif /* DX_DEBUG */ + +-static int dx_lookup(struct dx_path *path, void *cookie) ++static int dx_lookup(struct iam_path *path) + { + u32 ptr; +- int err; ++ int err = 0; + int i; + +- struct dx_param *param; +- struct dx_frame *frame; +- +- param = path->dp_param; ++ struct iam_descr *param; ++ struct iam_frame *frame; ++ struct iam_container *c; + +- for (frame = path->dp_frames, i = 0, +- ptr = param->dpo_root_ptr(path); i <= path->dp_indirect; ++ param = path_descr(path); ++ c = path->ip_container; ++ ++ for (frame = path->ip_frames, i = 0, ++ ptr = param->id_root_ptr(path->ip_container); ++ i <= path->ip_indirect; + ptr = dx_get_block(path, frame->at), ++frame, ++i) { +- struct dx_entry *entries; +- struct dx_entry *p; +- struct dx_entry *q; +- struct dx_entry *m; ++ struct iam_entry *entries; ++ struct iam_entry *p; ++ struct iam_entry *q; ++ struct iam_entry *m; + unsigned count; + +- frame->bh = ext3_bread(NULL, path->dp_object, ptr, 0, &err); +- if (frame->bh == NULL) { +- err = -EIO; ++ err = param->id_node_read(c, (iam_ptr_t)ptr, NULL, &frame->bh); ++ if (err != 0) + break; +- } +- err = param->dpo_node_check(path, frame, cookie); ++ err = param->id_node_check(path, frame); + if (err != 0) + break; + +@@ -599,37 +1077,37 @@ + entries = frame->entries; + count = dx_get_count(entries); + assert(count && count <= dx_get_limit(entries)); +- p = dx_entry_shift(path, entries, 1); +- q = dx_entry_shift(path, entries, count - 1); ++ p = iam_entry_shift(path, entries, 1); ++ q = iam_entry_shift(path, entries, count - 1); + while (p <= q) { +- m = dx_entry_shift(path, +- p, dx_entry_diff(path, q, p) / 2); ++ m = iam_entry_shift(path, ++ p, iam_entry_diff(path, q, p) / 2); + dxtrace(printk(".")); +- if (keycmp(path, dx_key_at(path, m), +- path->dp_key_target) > 0) +- q = dx_entry_shift(path, m, -1); ++ if (keycmp(c, iam_key_at(path, m), ++ path->ip_key_target) > 0) ++ q = iam_entry_shift(path, m, -1); + else +- p = dx_entry_shift(path, m, +1); ++ p = iam_entry_shift(path, m, +1); + } + +- frame->at = dx_entry_shift(path, p, -1); ++ frame->at = iam_entry_shift(path, p, -1); + if (1) { // linear 
search cross check + unsigned n = count - 1; +- struct dx_entry *at; ++ struct iam_entry *at; + + at = entries; + while (n--) { + dxtrace(printk(",")); +- at = dx_entry_shift(path, at, +1); +- if (keycmp(path, dx_key_at(path, at), +- path->dp_key_target) > 0) { +- if (at != dx_entry_shift(path, frame->at, 1)) { ++ at = iam_entry_shift(path, at, +1); ++ if (keycmp(c, iam_key_at(path, at), ++ path->ip_key_target) > 0) { ++ if (at != iam_entry_shift(path, frame->at, 1)) { + BREAKPOINT; + printk(KERN_EMERG "%i\n", +- keycmp(path, dx_key_at(path, at), +- path->dp_key_target)); ++ keycmp(c, iam_key_at(path, at), ++ path->ip_key_target)); + } +- at = dx_entry_shift(path, at, -1); ++ at = iam_entry_shift(path, at, -1); + break; + } + } +@@ -637,8 +1115,8 @@ + } + } + if (err != 0) +- dx_path_fini(path); +- path->dp_frame = --frame; ++ iam_path_fini(path); ++ path->ip_frame = --frame; + return err; + } + +@@ -652,7 +1130,7 @@ + * back to userspace. + */ + static int dx_probe(struct dentry *dentry, struct inode *dir, +- struct dx_hash_info *hinfo, struct dx_path *path) ++ struct dx_hash_info *hinfo, struct iam_path *path) + { + int err; + struct htree_cookie hc = { +@@ -661,39 +1139,78 @@ + }; + + assert(dx_index_is_compat(path)); +- err = dx_lookup(path, &hc); +- assert(err != 0 || path->dp_frames[path->dp_indirect].bh != NULL); ++ path->ip_descr_data = &hc; ++ err = dx_lookup(path); ++ assert(err != 0 || path->ip_frames[path->ip_indirect].bh != NULL); + return err; + } + +-static inline void dx_path_init(struct dx_path *path, struct inode *inode) ++/* ++ * Initialize container @c, acquires additional reference on @inode. ++ */ ++int iam_container_init(struct iam_container *c, ++ struct iam_descr *descr, struct inode *inode) ++{ ++ memset(c, 0, sizeof *c); ++ c->ic_descr = descr; ++ c->ic_object = igrab(inode); ++ if (c->ic_object != NULL) ++ return 0; ++ else ++ return -ENOENT; ++} ++ ++/* ++ * Finalize container @c, release all resources. ++ */ ++void iam_container_fini(struct iam_container *c) ++{ ++ if (c->ic_object != NULL) { ++ iput(c->ic_object); ++ c->ic_object = NULL; ++ } ++} ++ ++static inline void iam_path_init(struct iam_path *path, struct iam_container *c) + { + memset(path, 0, sizeof *path); +- path->dp_object = inode; +- path->dp_frame = path->dp_frames; ++ path->ip_container = c; ++ path->ip_frame = path->ip_frames; + } + +-static inline void dx_path_fini(struct dx_path *path) ++static inline void iam_path_fini(struct iam_path *path) + { + int i; + +- for (i = 0; i < ARRAY_SIZE(path->dp_frames); i++) { +- if (path->dp_frames[i].bh != NULL) { +- brelse(path->dp_frames[i].bh); +- path->dp_frames[i].bh = NULL; ++ for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) { ++ if (path->ip_frames[i].bh != NULL) { ++ brelse(path->ip_frames[i].bh); ++ path->ip_frames[i].bh = NULL; + } + } + } + +-static void dx_path_compat_init(struct dx_path_compat *path, +- struct inode *inode) ++static void iam_path_compat_init(struct iam_path_compat *path, ++ struct inode *inode) + { + int i; +- dx_path_init(&path->dpc_path, inode); +- path->dpc_path.dp_param = &htree_compat_param; +- for (i = 0; i < ARRAY_SIZE(path->dpc_path.dp_key_scratch); ++i) +- path->dpc_path.dp_key_scratch[i] = +- (struct dx_key *)&path->dpc_scrach[i]; ++ ++ iam_container_init(&path->ipc_container, &htree_compat_param, inode); ++ /* ++ * XXX hack allowing finalization of iam_path_compat with ++ * iam_path_fini(). 
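++ *
++ * The inode reference acquired by iam_container_init() through igrab()
++ * is dropped by the iput() immediately below, so that compat callers
++ * which only run iam_path_fini() (and never iam_container_fini()) do
++ * not leak a reference on the directory inode.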
++ */ ++ iput(inode); ++ iam_path_init(&path->ipc_path, &path->ipc_container); ++ for (i = 0; i < ARRAY_SIZE(path->ipc_path.ip_key_scratch); ++i) ++ path->ipc_path.ip_key_scratch[i] = ++ (struct iam_key *)&path->ipc_scrach[i]; ++} ++ ++static void iam_path_compat_fini(struct iam_path_compat *path) ++{ ++ iam_path_fini(&path->ipc_path); ++ iam_container_fini(&path->ipc_container); + } + + /* +@@ -714,16 +1231,16 @@ + * hash of the next page. + */ + static int ext3_htree_next_block(struct inode *dir, __u32 hash, +- struct dx_path *path, __u32 *start_hash) ++ struct iam_path *path, __u32 *start_hash) + { +- struct dx_frame *p; ++ struct iam_frame *p; + struct buffer_head *bh; + int err, num_frames = 0; + __u32 bhash; + + assert(dx_index_is_compat(path)); + +- p = path->dp_frame; ++ p = path->ip_frame; + /* + * Find the next leaf page by incrementing the frame pointer. + * If we run out of entries in the interior node, loop around and +@@ -732,11 +1249,11 @@ + * nodes need to be read. + */ + while (1) { +- p->at = dx_entry_shift(path, p->at, +1); +- if (p->at < dx_entry_shift(path, p->entries, ++ p->at = iam_entry_shift(path, p->at, +1); ++ if (p->at < iam_entry_shift(path, p->entries, + dx_get_count(p->entries))) + break; +- if (p == path->dp_frames) ++ if (p == path->ip_frames) + return 0; + num_frames++; + --p; +@@ -749,7 +1266,7 @@ + * desired contiuation hash. If it doesn't, return since + * there's no point to read in the successive index pages. + */ +- dx_get_key(path, p->at, (struct dx_key *)&bhash); ++ dx_get_key(path, p->at, (struct iam_key *)&bhash); + if (start_hash) + *start_hash = bhash; + if ((hash & 1) == 0) { +@@ -761,8 +1278,10 @@ + * block so no check is necessary + */ + while (num_frames--) { +- if (!(bh = ext3_bread(NULL, dir, +- dx_get_block(path, p->at), 0, &err))) ++ err = path_descr(path)->id_node_read(path->ip_container, ++ (iam_ptr_t)dx_get_block(path, p->at), ++ NULL, &bh); ++ if (err != 0) + return err; /* Failure */ + ++p; + brelse (p->bh); +@@ -837,8 +1356,8 @@ + { + struct dx_hash_info hinfo; + struct ext3_dir_entry_2 *de; +- struct dx_path_compat cpath; +- struct dx_path *path = &cpath.dpc_path; ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; + struct inode *dir; + int block, err; + int count = 0; +@@ -848,7 +1367,7 @@ + dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, + start_minor_hash)); + dir = dir_file->f_dentry->d_inode; +- dx_path_compat_init(&cpath, dir); ++ iam_path_compat_init(&cpath, dir); + if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { + hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; +@@ -865,7 +1384,7 @@ + + /* Add '.' and '..' 
from the htree header */ + if (!start_hash && !start_minor_hash) { +- de = (struct ext3_dir_entry_2 *) path->dp_frames[0].bh->b_data; ++ de = (struct ext3_dir_entry_2 *) path->ip_frames[0].bh->b_data; + if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0) + goto errout; + de = ext3_next_entry(de); +@@ -875,7 +1394,7 @@ + } + + while (1) { +- block = dx_get_block(path, path->dp_frame->at); ++ block = dx_get_block(path, path->ip_frame->at); + ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, + start_hash, start_minor_hash); + if (ret < 0) { +@@ -900,12 +1419,12 @@ + (count && ((hashval & 1) == 0))) + break; + } +- dx_path_fini(path); ++ iam_path_fini(path); + dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", + count, *next_hash)); + return count; + errout: +- dx_path_fini(path); ++ iam_path_fini(path); + return (err); + } + +@@ -964,18 +1483,18 @@ + } while(more); + } + +-static void dx_insert_block(struct dx_path *path, +- struct dx_frame *frame, u32 hash, u32 block) ++static void dx_insert_block(struct iam_path *path, ++ struct iam_frame *frame, u32 hash, u32 block) + { +- struct dx_entry *entries = frame->entries; +- struct dx_entry *old = frame->at, *new = dx_entry_shift(path, old, +1); ++ struct iam_entry *entries = frame->entries; ++ struct iam_entry *old = frame->at, *new = iam_entry_shift(path, old, +1); + int count = dx_get_count(entries); + + assert(count < dx_get_limit(entries)); +- assert(old < dx_entry_shift(path, entries, count)); +- memmove(dx_entry_shift(path, new, 1), new, +- (char *)dx_entry_shift(path, entries, count) - (char *)new); +- dx_set_key(path, new, (struct dx_key *)&hash); ++ assert(old < iam_entry_shift(path, entries, count)); ++ memmove(iam_entry_shift(path, new, 1), new, ++ (char *)iam_entry_shift(path, entries, count) - (char *)new); ++ dx_set_key(path, new, (struct iam_key *)&hash); + dx_set_block(path, new, block); + dx_set_count(entries, count + 1); + } +@@ -1177,9 +1696,9 @@ + struct super_block * sb; + struct dx_hash_info hinfo; + u32 hash; +- struct dx_path_compat cpath; +- struct dx_path *path = &cpath.dpc_path; +- struct dx_entry_compat dummy_dot = { ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; ++ struct iam_entry_compat dummy_dot = { + .block = 0 + }; + struct ext3_dir_entry_2 *de, *top; +@@ -1190,8 +1709,8 @@ + const u8 *name = dentry->d_name.name; + struct inode *dir = dentry->d_parent->d_inode; + +- dx_path_compat_init(&cpath, dir); +- ++ iam_path_compat_init(&cpath, dir); ++ + sb = dir->i_sb; + /* NFS may look up ".." - look at dx_root directory block */ + if (namelen > 2 || name[0] != '.'||(name[1] != '.' 
&& name[1] != '\0')){ +@@ -1199,13 +1718,15 @@ + if (*err != 0) + return NULL; + } else { +- path->dp_frame->bh = NULL; /* for dx_path_fini() */ +- path->dp_frame->at = (void *)&dummy_dot;/* hack for zero entry*/ ++ path->ip_frame->bh = NULL; /* for iam_path_fini() */ ++ path->ip_frame->at = (void *)&dummy_dot;/* hack for zero entry*/ + } + hash = hinfo.hash; + do { +- block = dx_get_block(path, path->dp_frame->at); +- if (!(bh = ext3_bread (NULL,dir, block, 0, err))) ++ block = dx_get_block(path, path->ip_frame->at); ++ *err = path_descr(path)->id_node_read(path->ip_container, (iam_ptr_t)block, ++ NULL, &bh); ++ if (*err != 0) + goto errout; + de = (struct ext3_dir_entry_2 *) bh->b_data; + top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize - +@@ -1220,7 +1741,7 @@ + goto errout; + } + *res_dir = de; +- dx_path_fini(path); ++ iam_path_fini(path); + return bh; + } + brelse (bh); +@@ -1238,7 +1759,7 @@ + *err = -ENOENT; + errout: + dxtrace(printk("%s not found\n", name)); +- dx_path_fini(path); ++ iam_path_fini(path); + return NULL; + } + #endif +@@ -1363,11 +1884,11 @@ + + /* Allocate new node, and split leaf node @bh into it, inserting new pointer + * into parent node identified by @frame */ +-static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct dx_path *path, +- struct buffer_head **bh,struct dx_frame *frame, ++static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct iam_path *path, ++ struct buffer_head **bh,struct iam_frame *frame, + struct dx_hash_info *hinfo, int *error) + { +- struct inode *dir = path->dp_object; ++ struct inode *dir = path_obj(path); + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; + struct buffer_head *bh2; +@@ -1553,9 +2074,9 @@ + int namelen = dentry->d_name.len; + struct buffer_head *bh2; + struct dx_root *root; +- struct dx_path_compat cpath; +- struct dx_path *path = &cpath.dpc_path; +- struct dx_entry *entries; ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; ++ struct iam_entry *entries; + struct ext3_dir_entry_2 *de, *de2; + char *data1, *top; + unsigned len; +@@ -1565,7 +2086,7 @@ + u32 block; + struct fake_dirent *fde; + +- dx_path_compat_init(&cpath, dir); ++ iam_path_compat_init(&cpath, dir); + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk("Creating index\n")); + retval = ext3_journal_get_write_access(handle, bh); +@@ -1612,12 +2133,12 @@ + hinfo.hash_version = root->info.hash_version; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; + ext3fs_dirhash(name, namelen, &hinfo); +- path->dp_frame->entries = entries; +- path->dp_frame->at = entries; +- path->dp_frame->bh = bh; ++ path->ip_frame->entries = entries; ++ path->ip_frame->at = entries; ++ path->ip_frame->bh = bh; + bh = bh2; +- de = do_split(handle, path, &bh, path->dp_frame, &hinfo, &retval); +- dx_path_fini(path); ++ de = do_split(handle, path, &bh, path->ip_frame, &hinfo, &retval); ++ iam_path_fini(path); + if (!de) + return retval; + +@@ -1698,12 +2219,12 @@ + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) + { +- struct dx_path_compat cpath; +- struct dx_path *path = &cpath.dpc_path; +- struct dx_param *param; +- struct dx_frame *frame, *safe; +- struct dx_entry *entries; /* old block contents */ +- struct dx_entry *entries2; /* new block contents */ ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; ++ struct iam_descr *param; ++ struct iam_frame *frame, *safe; ++ struct iam_entry *entries; /* old block contents */ ++ struct 
iam_entry *entries2; /* new block contents */ + struct dx_hash_info hinfo; + struct buffer_head * bh; + struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0}; +@@ -1716,20 +2237,22 @@ + int i; + size_t isize; + +- dx_path_compat_init(&cpath, dir); +- param = path->dp_param; ++ iam_path_compat_init(&cpath, dir); ++ param = path_descr(path); + + err = dx_probe(dentry, NULL, &hinfo, path); + if (err != 0) + return err; +- frame = path->dp_frame; ++ frame = path->ip_frame; + entries = frame->entries; + + /* XXX nikita: global serialization! */ + isize = dir->i_size; + +- if (!(bh = ext3_bread(handle, dir, +- dx_get_block(path, frame->at), 0, &err))) ++ err = param->id_node_read(path->ip_container, ++ (iam_ptr_t)dx_get_block(path, ++ frame->at), handle, &bh); ++ if (err != 0) + goto cleanup; + + BUFFER_TRACE(bh, "get_write_access"); +@@ -1761,7 +2284,7 @@ + dx_get_count(entries), dx_get_limit(entries))); + + /* What levels need split? */ +- for (nr_splet = 0; frame >= path->dp_frames && ++ for (nr_splet = 0; frame >= path->ip_frames && + dx_get_count(frame->entries) == dx_get_limit(frame->entries); + --frame, ++nr_splet) { + if (nr_splet == DX_MAX_TREE_HEIGHT) { +@@ -1778,7 +2301,7 @@ + for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) { + bh_new[i] = ext3_append (handle, dir, &newblock[i], &err); + if (!bh_new[i] || +- param->dpo_node_init(path, bh_new[i], 0) != 0) ++ param->id_node_init(path->ip_container, bh_new[i], 0) != 0) + goto cleanup; + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, frame->bh); +@@ -1786,7 +2309,7 @@ + goto journal_error; + } + /* Add "safe" node to transaction too */ +- if (safe + 1 != path->dp_frames) { ++ if (safe + 1 != path->ip_frames) { + err = ext3_journal_get_write_access(handle, safe->bh); + if (err) + goto journal_error; +@@ -1800,12 +2323,12 @@ + + entries = frame->entries; + count = dx_get_count(entries); +- idx = dx_entry_diff(path, frame->at, entries); ++ idx = iam_entry_diff(path, frame->at, entries); + + bh2 = bh_new[i]; + entries2 = dx_get_entries(path, bh2->b_data, 0); + +- if (frame == path->dp_frames) { ++ if (frame == path->ip_frames) { + /* splitting root node. 
Tricky point: + * + * In the "normal" B-tree we'd split root *and* add +@@ -1818,14 +2341,14 @@ + */ + struct dx_root *root; + u8 indirects; +- struct dx_frame *frames; ++ struct iam_frame *frames; + +- frames = path->dp_frames; ++ frames = path->ip_frames; + root = (struct dx_root *) frames->bh->b_data; + indirects = root->info.indirect_levels; + dxtrace(printk("Creating new root %d\n", indirects)); + memcpy((char *) entries2, (char *) entries, +- count * dx_entry_size(path)); ++ count * iam_entry_size(path)); + dx_set_limit(entries2, dx_node_limit(path)); + + /* Set up root */ +@@ -1835,9 +2358,9 @@ + + /* Shift frames in the path */ + memmove(frames + 2, frames + 1, +- (sizeof path->dp_frames) - 2 * sizeof frames[0]); ++ (sizeof path->ip_frames) - 2 * sizeof frames[0]); + /* Add new access path frame */ +- frames[1].at = dx_entry_shift(path, entries2, idx); ++ frames[1].at = iam_entry_shift(path, entries2, idx); + frames[1].entries = entries = entries2; + frames[1].bh = bh2; + assert(dx_node_check(path, frame)); +@@ -1853,22 +2376,22 @@ + unsigned hash2; + + dx_get_key(path, +- dx_entry_shift(path, entries, count1), +- (struct dx_key *)&hash2); ++ iam_entry_shift(path, entries, count1), ++ (struct iam_key *)&hash2); + + dxtrace(printk("Split index %i/%i\n", count1, count2)); + + memcpy ((char *) entries2, +- (char *) dx_entry_shift(path, entries, count1), +- count2 * dx_entry_size(path)); ++ (char *) iam_entry_shift(path, entries, count1), ++ count2 * iam_entry_size(path)); + dx_set_count (entries, count1); + dx_set_count (entries2, count2); + dx_set_limit (entries2, dx_node_limit(path)); + + /* Which index block gets the new entry? */ + if (idx >= count1) { +- frame->at = dx_entry_shift(path, entries2, +- idx - count1); ++ frame->at = iam_entry_shift(path, entries2, ++ idx - count1); + frame->entries = entries = entries2; + swap(frame->bh, bh2); + bh_new[i] = bh2; +@@ -1903,7 +2426,7 @@ + } + if (err) + inode->i_size = isize; +- dx_path_fini(path); ++ iam_path_fini(path); + return err; + } + #endif diff --git a/lustre/kernel_patches/patches/ext3-htree-path-ops.patch b/lustre/kernel_patches/patches/ext3-htree-path-ops.patch index 9a2edbd..ec66561 100644 --- a/lustre/kernel_patches/patches/ext3-htree-path-ops.patch +++ b/lustre/kernel_patches/patches/ext3-htree-path-ops.patch @@ -1,8 +1,20 @@ Index: iam-src/fs/ext3/namei.c =================================================================== ---- iam-src.orig/fs/ext3/namei.c 2006-02-12 16:43:57.000000000 +0300 -+++ iam-src/fs/ext3/namei.c 2006-02-12 23:22:12.000000000 +0300 -@@ -83,22 +83,21 @@ static struct buffer_head *ext3_append(h +--- iam-src.orig/fs/ext3/namei.c 2006-02-15 18:31:48.000000000 +0300 ++++ iam-src/fs/ext3/namei.c 2006-02-15 21:25:34.000000000 +0300 +@@ -51,7 +51,10 @@ + /* + * Maximal number of non-leaf levels in htree. In the stock ext3 this is 2. + */ +-#define DX_MAX_TREE_HEIGHT (5) ++enum { ++ DX_MAX_TREE_HEIGHT = 5, ++ DX_SCRATCH_KEYS = 2 ++}; + + static struct buffer_head *ext3_append(handle_t *handle, + struct inode *inode, +@@ -83,22 +86,22 @@ static struct buffer_head *ext3_append(h #define dxtrace(command) #endif @@ -25,12 +37,13 @@ Index: iam-src/fs/ext3/namei.c -struct dx_entry -{ +struct dx_entry; /* incomplete type */ ++struct dx_key; /* incomplete type */ + +struct dx_entry_compat { __le32 hash; __le32 block; }; -@@ -109,8 +108,7 @@ struct dx_entry +@@ -109,8 +112,7 @@ struct dx_entry * hash version mod 4 should never be 0. Sincerely, the paranoia department. 
*/ @@ -40,7 +53,7 @@ Index: iam-src/fs/ext3/namei.c struct fake_dirent dot; char dot_name[4]; struct fake_dirent dotdot; -@@ -124,13 +122,13 @@ struct dx_root +@@ -124,13 +126,13 @@ struct dx_root u8 unused_flags; } info; @@ -56,7 +69,7 @@ Index: iam-src/fs/ext3/namei.c }; -@@ -147,38 +145,76 @@ struct dx_map_entry +@@ -147,38 +149,88 @@ struct dx_map_entry u32 offs; }; @@ -72,6 +85,8 @@ Index: iam-src/fs/ext3/namei.c + struct dx_frame *frame, void *cookie); + int (*dpo_node_init)(struct dx_path *path, + struct buffer_head *bh, int root); ++ int (*dpo_keycmp)(struct dx_path *path, ++ struct dx_key *k1, struct dx_key *k2); +}; + /* @@ -86,8 +101,13 @@ Index: iam-src/fs/ext3/namei.c + int dp_indirect; + struct dx_frame dp_frames[DX_MAX_TREE_HEIGHT]; + struct dx_frame *dp_frame; -+ void *dp_key_target; -+ void *dp_key; ++ struct dx_key *dp_key_target; ++ struct dx_key *dp_key_scratch[DX_SCRATCH_KEYS]; ++}; ++ ++struct dx_path_compat { ++ struct dx_path dpc_path; ++ __u32 dpc_scrach[DX_SCRATCH_KEYS]; }; +static u32 htree_root_ptr(struct dx_path *p); @@ -95,6 +115,8 @@ Index: iam-src/fs/ext3/namei.c + struct dx_frame *frame, void *cookie); +static int htree_node_init(struct dx_path *path, + struct buffer_head *bh, int root); ++static int htree_keycmp(struct dx_path *path, ++ struct dx_key *k1, struct dx_key *k2); + +static struct dx_param htree_compat_param = { + .dpo_key_size = sizeof ((struct dx_map_entry *)NULL)->hash, @@ -104,7 +126,8 @@ Index: iam-src/fs/ext3/namei.c + + .dpo_root_ptr = htree_root_ptr, + .dpo_node_check = htree_node_check, -+ .dpo_node_init = htree_node_init ++ .dpo_node_init = htree_node_init, ++ .dpo_keycmp = htree_keycmp +}; + + @@ -127,9 +150,11 @@ Index: iam-src/fs/ext3/namei.c +static inline unsigned dx_get_block(struct dx_path *p, struct dx_entry *entry); +static void dx_set_block(struct dx_path *p, + struct dx_entry *entry, unsigned value); -+static inline void *dx_get_key(struct dx_path *p, -+ struct dx_entry *entry, void *key); -+static void dx_set_key(struct dx_path *p, struct dx_entry *entry, void *key); ++static inline struct dx_key *dx_get_key(struct dx_path *p, ++ struct dx_entry *entry, ++ struct dx_key *key); ++static void dx_set_key(struct dx_path *p, struct dx_entry *entry, ++ struct dx_key *key); +static unsigned dx_get_count(struct dx_entry *entries); +static unsigned dx_get_limit(struct dx_entry *entries); +static void dx_set_count(struct dx_entry *entries, unsigned value); @@ -152,7 +177,7 @@ Index: iam-src/fs/ext3/namei.c static int ext3_htree_next_block(struct inode *dir, __u32 hash, struct dx_path *path, __u32 *start_hash); static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, -@@ -186,29 +222,65 @@ static struct buffer_head * ext3_dx_find +@@ -186,29 +238,72 @@ static struct buffer_head * ext3_dx_find static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); @@ -167,75 +192,79 @@ Index: iam-src/fs/ext3/namei.c -static inline unsigned dx_get_block (struct dx_entry *entry) +static inline void *entry_off(struct dx_entry *entry, ptrdiff_t off) -+{ + { +- return le32_to_cpu(entry->block) & 0x00ffffff; + return (void *)((char *)entry + off); -+} -+ + } + +-static inline void dx_set_block (struct dx_entry *entry, unsigned value) +static inline size_t dx_entry_size(struct dx_path *p) { -- return le32_to_cpu(entry->block) & 0x00ffffff; +- entry->block = cpu_to_le32(value); + return p->dp_param->dpo_key_size + p->dp_param->dpo_ptr_size; } --static inline void dx_set_block (struct dx_entry *entry, unsigned 
value) +-static inline unsigned dx_get_hash (struct dx_entry *entry) +static inline struct dx_entry *dx_entry_shift(struct dx_path *p, + struct dx_entry *entry, int shift) { -- entry->block = cpu_to_le32(value); +- return le32_to_cpu(entry->hash); + void *e = entry; + return e + shift * dx_entry_size(p); - } - --static inline unsigned dx_get_hash (struct dx_entry *entry) ++} ++ +static inline ptrdiff_t dx_entry_diff(struct dx_path *p, + struct dx_entry *e1, struct dx_entry *e2) - { -- return le32_to_cpu(entry->hash); ++{ + ptrdiff_t diff; + + diff = (void *)e1 - (void *)e2; + assert(diff / dx_entry_size(p) * dx_entry_size(p) == diff); + return diff / dx_entry_size(p); -+} -+ -+static inline unsigned dx_get_block(struct dx_path *p, struct dx_entry *entry) -+{ -+ return le32_to_cpu(*(u32 *)entry_off(entry, p->dp_param->dpo_key_size)) -+ & 0x00ffffff; } -static inline void dx_set_hash (struct dx_entry *entry, unsigned value) -+static inline void dx_set_block(struct dx_path *p, -+ struct dx_entry *entry, unsigned value) ++static inline unsigned dx_get_block(struct dx_path *p, struct dx_entry *entry) { - entry->hash = cpu_to_le32(value); ++ return le32_to_cpu(*(u32 *)entry_off(entry, p->dp_param->dpo_key_size)) ++ & 0x00ffffff; ++} ++ ++static inline void dx_set_block(struct dx_path *p, ++ struct dx_entry *entry, unsigned value) ++{ + *(u32*)entry_off(entry, p->dp_param->dpo_key_size) = cpu_to_le32(value); +} + -+static inline void *dx_get_key(struct dx_path *p, -+ struct dx_entry *entry, void *key) ++static inline struct dx_key *dx_get_key(struct dx_path *p, ++ struct dx_entry *entry, ++ struct dx_key *key) +{ + memcpy(key, entry, p->dp_param->dpo_key_size); + return key; +} + ++static inline struct dx_key *dx_key_at(struct dx_path *p, ++ struct dx_entry *entry) ++{ ++ return (struct dx_key *)entry; ++} ++ +static inline void dx_set_key(struct dx_path *p, -+ struct dx_entry *entry, void *key) ++ struct dx_entry *entry, struct dx_key *key) +{ + memcpy(entry, key, p->dp_param->dpo_key_size); } static inline unsigned dx_get_count (struct dx_entry *entries) -@@ -231,17 +303,123 @@ static inline void dx_set_limit (struct +@@ -231,17 +326,163 @@ static inline void dx_set_limit (struct ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); } -static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) +static inline unsigned dx_root_limit(struct dx_path *p) - { -- unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - -- EXT3_DIR_REC_LEN(2) - infosize; -- return 0? 
20: entry_space / sizeof(struct dx_entry); ++{ + struct dx_param *param = p->dp_param; + unsigned entry_space = p->dp_object->i_sb->s_blocksize - + param->dpo_root_gap; @@ -270,6 +299,37 @@ Index: iam-src/fs/ext3/namei.c + frame->bh->b_data, frame == path->dp_frames); +} + ++static inline struct dx_key *keycpy(struct dx_path *p, ++ struct dx_key *k1, struct dx_key *k2) ++{ ++ return memcpy(k1, k2, p->dp_param->dpo_key_size); ++} ++ ++static inline int keycmp(struct dx_path *p, ++ struct dx_key *k1, struct dx_key *k2) ++{ ++ return p->dp_param->dpo_keycmp(p, k1, k2); ++} ++ ++static int dx_node_check(struct dx_path *p, struct dx_frame *f) ++{ ++ struct dx_entry *e; ++ unsigned count; ++ unsigned i; ++ ++ e = dx_node_get_entries(p, f); ++ count = dx_get_count(e); ++ e = dx_entry_shift(p, e, 1); ++ for (i = 0; i < count - 1; ++i, e = dx_entry_shift(p, e, 1)) { ++ keycpy(p, p->dp_key_scratch[0], p->dp_key_scratch[1]); ++ dx_get_key(p, e, p->dp_key_scratch[1]); ++ if (i > 0 && ++ keycmp(p, p->dp_key_scratch[0], p->dp_key_scratch[1]) > 0) ++ return 0; ++ } ++ return 1; ++} ++ +static u32 htree_root_ptr(struct dx_path *path) +{ + return 0; @@ -330,7 +390,7 @@ Index: iam-src/fs/ext3/namei.c + if (hc->dentry) + ext3fs_dirhash(hc->dentry->d_name.name, + hc->dentry->d_name.len, hc->hinfo); -+ path->dp_key_target = &hc->hinfo->hash; ++ path->dp_key_target = (struct dx_key *)&hc->hinfo->hash; + } else { + /* non-root index */ + assert(entries == data + path->dp_param->dpo_node_gap); @@ -338,14 +398,14 @@ Index: iam-src/fs/ext3/namei.c + } + frame->entries = frame->at = entries; + return 0; - } - --static inline unsigned dx_node_limit (struct inode *dir) ++} ++ +static int htree_node_init(struct dx_path *path, + struct buffer_head *bh, int root) { -- unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); -- return 0? 22: entry_space / sizeof(struct dx_entry); +- unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - +- EXT3_DIR_REC_LEN(2) - infosize; +- return 0? 20: entry_space / sizeof(struct dx_entry); + struct dx_node *node; + + assert(!root); @@ -356,8 +416,20 @@ Index: iam-src/fs/ext3/namei.c + return 0; } +-static inline unsigned dx_node_limit (struct inode *dir) ++static int htree_keycmp(struct dx_path *path, ++ struct dx_key *k1, struct dx_key *k2) + { +- unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); +- return 0? 22: entry_space / sizeof(struct dx_entry); ++ __u32 p1 = le32_to_cpu(*(__u32 *)k1); ++ __u32 p2 = le32_to_cpu(*(__u32 *)k2); ++ ++ return p1 > p2 ? +1 : (p1 < p2 ? 
-1 : 0); + } + /* -@@ -327,123 +505,101 @@ struct stats dx_show_entries(struct dx_h +@@ -327,123 +568,105 @@ struct stats dx_show_entries(struct dx_h } #endif /* DX_DEBUG */ @@ -454,6 +526,8 @@ Index: iam-src/fs/ext3/namei.c + if (err != 0) + break; + ++ assert(dx_node_check(path, frame)); ++ + entries = frame->entries; count = dx_get_count(entries); - assert (count && count <= dx_get_limit(entries)); @@ -471,9 +545,8 @@ Index: iam-src/fs/ext3/namei.c dxtrace(printk(".")); - if (dx_get_hash(m) > hash) - q = m - 1; -+ if (memcmp(dx_get_key(path, m, path->dp_key), -+ path->dp_key_target, -+ param->dpo_key_size) > 0) ++ if (keycmp(path, dx_key_at(path, m), ++ path->dp_key_target) > 0) + q = dx_entry_shift(path, m, -1); else - p = m + 1; @@ -496,9 +569,14 @@ Index: iam-src/fs/ext3/namei.c - { - at--; + at = dx_entry_shift(path, at, +1); -+ if (memcmp(dx_get_key(path, at, path->dp_key), -+ path->dp_key_target, -+ param->dpo_key_size) > 0) { ++ if (keycmp(path, dx_key_at(path, at), ++ path->dp_key_target) > 0) { ++ if (at != dx_entry_shift(path, frame->at, 1)) { ++ BREAKPOINT; ++ printk(KERN_EMERG "%i\n", ++ keycmp(path, dx_key_at(path, at), ++ path->dp_key_target)); ++ } + at = dx_entry_shift(path, at, -1); break; } @@ -546,24 +624,22 @@ Index: iam-src/fs/ext3/namei.c + struct dx_hash_info *hinfo, struct dx_path *path) +{ + int err; -+ __u32 hash_storage; + struct htree_cookie hc = { + .dentry = dentry, + .hinfo = hinfo + }; + + assert(dx_index_is_compat(path)); -+ path->dp_key = &hash_storage; + err = dx_lookup(path, &hc); + assert(err != 0 || path->dp_frames[path->dp_indirect].bh != NULL); + return err; } static inline void dx_path_init(struct dx_path *path, struct inode *inode) -@@ -458,8 +614,10 @@ static inline void dx_path_fini(struct d +@@ -458,11 +681,24 @@ static inline void dx_path_fini(struct d int i; - for (i = 0; i < ARRAY_SIZE(path->dp_frames); i--) { + for (i = 0; i < ARRAY_SIZE(path->dp_frames); i++) { - if (path->dp_frames[i].bh != NULL) + if (path->dp_frames[i].bh != NULL) { brelse(path->dp_frames[i].bh); @@ -572,7 +648,21 @@ Index: iam-src/fs/ext3/namei.c } } -@@ -488,6 +646,8 @@ static int ext3_htree_next_block(struct ++static void dx_path_compat_init(struct dx_path_compat *path, ++ struct inode *inode) ++{ ++ int i; ++ dx_path_init(&path->dpc_path, inode); ++ path->dpc_path.dp_param = &htree_compat_param; ++ for (i = 0; i < ARRAY_SIZE(path->dpc_path.dp_key_scratch); ++i) ++ path->dpc_path.dp_key_scratch[i] = ++ (struct dx_key *)&path->dpc_scrach[i]; ++} ++ + /* + * This function increments the frame pointer to search the next leaf + * block, and reads in the necessary intervening nodes if the search +@@ -488,6 +724,8 @@ static int ext3_htree_next_block(struct int err, num_frames = 0; __u32 bhash; @@ -581,7 +671,7 @@ Index: iam-src/fs/ext3/namei.c p = path->dp_frame; /* * Find the next leaf page by incrementing the frame pointer. -@@ -497,7 +657,9 @@ static int ext3_htree_next_block(struct +@@ -497,7 +735,9 @@ static int ext3_htree_next_block(struct * nodes need to be read. */ while (1) { @@ -592,16 +682,16 @@ Index: iam-src/fs/ext3/namei.c break; if (p == path->dp_frames) return 0; -@@ -512,7 +674,7 @@ static int ext3_htree_next_block(struct +@@ -512,7 +752,7 @@ static int ext3_htree_next_block(struct * desired contiuation hash. If it doesn't, return since * there's no point to read in the successive index pages. 
*/ - bhash = dx_get_hash(p->at); -+ dx_get_key(path, p->at, &bhash); ++ dx_get_key(path, p->at, (struct dx_key *)&bhash); if (start_hash) *start_hash = bhash; if ((hash & 1) == 0) { -@@ -524,12 +686,13 @@ static int ext3_htree_next_block(struct +@@ -524,12 +764,14 @@ static int ext3_htree_next_block(struct * block so no check is necessary */ while (num_frames--) { @@ -614,37 +704,80 @@ Index: iam-src/fs/ext3/namei.c p->bh = bh; - p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; + p->at = p->entries = dx_node_get_entries(path, p); ++ assert(dx_node_check(path, p)); } return 1; } -@@ -609,6 +772,7 @@ int ext3_htree_fill_tree(struct file *di +@@ -598,7 +840,8 @@ int ext3_htree_fill_tree(struct file *di + { + struct dx_hash_info hinfo; + struct ext3_dir_entry_2 *de; +- struct dx_path path; ++ struct dx_path_compat cpath; ++ struct dx_path *path = &cpath.dpc_path; + struct inode *dir; + int block, err; + int count = 0; +@@ -608,7 +851,7 @@ int ext3_htree_fill_tree(struct file *di + dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, start_minor_hash)); dir = dir_file->f_dentry->d_inode; - dx_path_init(&path, dir); -+ path.dp_param = &htree_compat_param; +- dx_path_init(&path, dir); ++ dx_path_compat_init(&cpath, dir); if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; -@@ -619,7 +783,8 @@ int ext3_htree_fill_tree(struct file *di +@@ -619,12 +862,13 @@ int ext3_htree_fill_tree(struct file *di } hinfo.hash = start_hash; hinfo.minor_hash = 0; - if (!dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, &path, &err)) -+ err = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, &path); ++ err = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, path); + if (err != 0) return err; /* Add '.' and '..' 
from the htree header */ -@@ -634,7 +799,7 @@ int ext3_htree_fill_tree(struct file *di + if (!start_hash && !start_minor_hash) { +- de = (struct ext3_dir_entry_2 *) path.dp_frames[0].bh->b_data; ++ de = (struct ext3_dir_entry_2 *) path->dp_frames[0].bh->b_data; + if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0) + goto errout; + de = ext3_next_entry(de); +@@ -634,7 +878,7 @@ int ext3_htree_fill_tree(struct file *di } while (1) { - block = dx_get_block(path.dp_frame->at); -+ block = dx_get_block(&path, path.dp_frame->at); ++ block = dx_get_block(path, path->dp_frame->at); ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, start_hash, start_minor_hash); if (ret < 0) { -@@ -722,17 +887,19 @@ static void dx_sort_map (struct dx_map_e +@@ -643,7 +887,8 @@ int ext3_htree_fill_tree(struct file *di + } + count += ret; + hashval = ~0; +- ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, &path, &hashval); ++ ret = ext3_htree_next_block(dir, ++ HASH_NB_ALWAYS, path, &hashval); + *next_hash = hashval; + if (ret < 0) { + err = ret; +@@ -658,12 +903,12 @@ int ext3_htree_fill_tree(struct file *di + (count && ((hashval & 1) == 0))) + break; + } +- dx_path_fini(&path); ++ dx_path_fini(path); + dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", + count, *next_hash)); + return count; + errout: +- dx_path_fini(&path); ++ dx_path_fini(path); + return (err); + } + +@@ -722,17 +967,19 @@ static void dx_sort_map (struct dx_map_e } while(more); } @@ -665,50 +798,78 @@ Index: iam-src/fs/ext3/namei.c + assert(old < dx_entry_shift(path, entries, count)); + memmove(dx_entry_shift(path, new, 1), new, + (char *)dx_entry_shift(path, entries, count) - (char *)new); -+ dx_set_key(path, new, &hash); ++ dx_set_key(path, new, (struct dx_key *)&hash); + dx_set_block(path, new, block); dx_set_count(entries, count + 1); } #endif -@@ -934,7 +1101,9 @@ static struct buffer_head * ext3_dx_find +@@ -933,8 +1180,11 @@ static struct buffer_head * ext3_dx_find + struct super_block * sb; struct dx_hash_info hinfo; u32 hash; - struct dx_path path; +- struct dx_path path; - struct dx_entry dummy_dot; ++ struct dx_path_compat cpath; ++ struct dx_path *path = &cpath.dpc_path; + struct dx_entry_compat dummy_dot = { + .block = 0 + }; struct ext3_dir_entry_2 *de, *top; struct buffer_head *bh; unsigned long block; -@@ -944,19 +1113,21 @@ static struct buffer_head * ext3_dx_find +@@ -943,20 +1193,21 @@ static struct buffer_head * ext3_dx_find + const u8 *name = dentry->d_name.name; struct inode *dir = dentry->d_parent->d_inode; - dx_path_init(&path, dir); -+ path.dp_param = &htree_compat_param; +- dx_path_init(&path, dir); ++ dx_path_compat_init(&cpath, dir); + sb = dir->i_sb; /* NFS may look up ".." - look at dx_root directory block */ if (namelen > 2 || name[0] != '.'||(name[1] != '.' 
&& name[1] != '\0')){ - if (!(dx_probe(dentry, NULL, &hinfo, &path, err))) -+ *err = dx_probe(dentry, NULL, &hinfo, &path); ++ *err = dx_probe(dentry, NULL, &hinfo, path); + if (*err != 0) return NULL; } else { - path.dp_frame->bh = NULL; /* for dx_path_fini() */ - path.dp_frame->at = &dummy_dot; /* hack for zero entry*/ - dx_set_block(path.dp_frame->at, 0); /* dx_root block is 0 */ -+ path.dp_frame->bh = NULL; /* for dx_path_fini() */ -+ path.dp_frame->at = (void *)&dummy_dot; /* hack for zero entry*/ ++ path->dp_frame->bh = NULL; /* for dx_path_fini() */ ++ path->dp_frame->at = (void *)&dummy_dot;/* hack for zero entry*/ } hash = hinfo.hash; do { - block = dx_get_block(path.dp_frame->at); -+ block = dx_get_block(&path, path.dp_frame->at); ++ block = dx_get_block(path, path->dp_frame->at); if (!(bh = ext3_bread (NULL,dir, block, 0, err))) goto errout; de = (struct ext3_dir_entry_2 *) bh->b_data; -@@ -1115,10 +1286,11 @@ static struct ext3_dir_entry_2* dx_pack_ +@@ -972,12 +1223,12 @@ static struct buffer_head * ext3_dx_find + goto errout; + } + *res_dir = de; +- dx_path_fini(&path); ++ dx_path_fini(path); + return bh; + } + brelse (bh); + /* Check to see if we should continue to search */ +- retval = ext3_htree_next_block(dir, hash, &path, NULL); ++ retval = ext3_htree_next_block(dir, hash, path, NULL); + if (retval < 0) { + ext3_warning(sb, __FUNCTION__, + "error reading index page in directory #%lu", +@@ -990,7 +1241,7 @@ static struct buffer_head * ext3_dx_find + *err = -ENOENT; + errout: + dxtrace(printk("%s not found\n", name)); +- dx_path_fini(&path); ++ dx_path_fini(path); + return NULL; + } + #endif +@@ -1115,10 +1366,11 @@ static struct ext3_dir_entry_2* dx_pack_ /* Allocate new node, and split leaf node @bh into it, inserting new pointer * into parent node identified by @frame */ @@ -721,7 +882,7 @@ Index: iam-src/fs/ext3/namei.c unsigned blocksize = dir->i_sb->s_blocksize; unsigned count, continued; struct buffer_head *bh2; -@@ -1180,7 +1352,7 @@ static struct ext3_dir_entry_2 *do_split +@@ -1180,7 +1432,7 @@ static struct ext3_dir_entry_2 *do_split swap(*bh, bh2); de = de2; } @@ -730,76 +891,109 @@ Index: iam-src/fs/ext3/namei.c err = ext3_journal_dirty_metadata (handle, bh2); if (err) goto journal_error; -@@ -1315,6 +1487,7 @@ static int make_indexed_dir(handle_t *ha +@@ -1303,7 +1555,8 @@ static int make_indexed_dir(handle_t *ha + int namelen = dentry->d_name.len; + struct buffer_head *bh2; + struct dx_root *root; +- struct dx_path path; ++ struct dx_path_compat cpath; ++ struct dx_path *path = &cpath.dpc_path; + struct dx_entry *entries; + struct ext3_dir_entry_2 *de, *de2; + char *data1, *top; +@@ -1314,7 +1567,7 @@ static int make_indexed_dir(handle_t *ha + u32 block; struct fake_dirent *fde; - dx_path_init(&path, dir); -+ path.dp_param = &htree_compat_param; +- dx_path_init(&path, dir); ++ dx_path_compat_init(&cpath, dir); blocksize = dir->i_sb->s_blocksize; dxtrace(printk("Creating index\n")); retval = ext3_journal_get_write_access(handle, bh); -@@ -1350,10 +1523,10 @@ static int make_indexed_dir(handle_t *ha +@@ -1350,21 +1603,21 @@ static int make_indexed_dir(handle_t *ha root->info.info_length = sizeof(root->info); root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; root->info.hash_version = DX_HASH_R5; - entries = root->entries; - dx_set_block (entries, 1); + entries = (void *)root->entries; -+ dx_set_block (&path, entries, 1); ++ dx_set_block (path, entries, 1); dx_set_count (entries, 1); - dx_set_limit (entries, dx_root_limit(dir, 
sizeof(root->info))); -+ dx_set_limit (entries, dx_root_limit(&path)); ++ dx_set_limit (entries, dx_root_limit(path)); /* Initialize as for dx_probe */ hinfo.hash_version = root->info.hash_version; -@@ -1363,7 +1536,7 @@ static int make_indexed_dir(handle_t *ha - path.dp_frame->at = entries; - path.dp_frame->bh = bh; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; + ext3fs_dirhash(name, namelen, &hinfo); +- path.dp_frame->entries = entries; +- path.dp_frame->at = entries; +- path.dp_frame->bh = bh; ++ path->dp_frame->entries = entries; ++ path->dp_frame->at = entries; ++ path->dp_frame->bh = bh; bh = bh2; - de = do_split(handle,dir, &bh, path.dp_frame, &hinfo, &retval); -+ de = do_split(handle, &path, &bh, path.dp_frame, &hinfo, &retval); - dx_path_fini(&path); +- dx_path_fini(&path); ++ de = do_split(handle, path, &bh, path->dp_frame, &hinfo, &retval); ++ dx_path_fini(path); if (!de) return retval; -@@ -1446,8 +1619,8 @@ static int ext3_dx_add_entry(handle_t *h + +@@ -1445,9 +1698,10 @@ static int ext3_add_entry (handle_t *han + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode) { - struct dx_path path; +- struct dx_path path; ++ struct dx_path_compat cpath; ++ struct dx_path *path = &cpath.dpc_path; + struct dx_param *param; struct dx_frame *frame, *safe; - struct dx_node *node2; struct dx_entry *entries; /* old block contents */ struct dx_entry *entries2; /* new block contents */ struct dx_hash_info hinfo; -@@ -1463,7 +1636,10 @@ static int ext3_dx_add_entry(handle_t *h +@@ -1462,16 +1716,20 @@ static int ext3_dx_add_entry(handle_t *h + int i; size_t isize; - dx_path_init(&path, dir); +- dx_path_init(&path, dir); - if (!dx_probe(dentry, NULL, &hinfo, &path, &err)) -+ param = path.dp_param = &htree_compat_param; ++ dx_path_compat_init(&cpath, dir); ++ param = path->dp_param; + -+ err = dx_probe(dentry, NULL, &hinfo, &path); ++ err = dx_probe(dentry, NULL, &hinfo, path); + if (err != 0) return err; - frame = path.dp_frame; +- frame = path.dp_frame; ++ frame = path->dp_frame; entries = frame->entries; -@@ -1471,7 +1647,8 @@ static int ext3_dx_add_entry(handle_t *h + /* XXX nikita: global serialization! */ isize = dir->i_size; - if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + if (!(bh = ext3_bread(handle, dir, -+ dx_get_block(&path, frame->at), 0, &err))) ++ dx_get_block(path, frame->at), 0, &err))) goto cleanup; BUFFER_TRACE(bh, "get_write_access"); -@@ -1519,12 +1696,9 @@ static int ext3_dx_add_entry(handle_t *h +@@ -1503,7 +1761,7 @@ static int ext3_dx_add_entry(handle_t *h + dx_get_count(entries), dx_get_limit(entries))); + + /* What levels need split? */ +- for (nr_splet = 0; frame >= path.dp_frames && ++ for (nr_splet = 0; frame >= path->dp_frames && + dx_get_count(frame->entries) == dx_get_limit(frame->entries); + --frame, ++nr_splet) { + if (nr_splet == DX_MAX_TREE_HEIGHT) { +@@ -1519,19 +1777,16 @@ static int ext3_dx_add_entry(handle_t *h * transaction... 
*/ for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) { bh_new[i] = ext3_append (handle, dir, &newblock[i], &err); - if (!bh_new[i]) + if (!bh_new[i] || -+ param->dpo_node_init(&path, bh_new[i], 0) != 0) ++ param->dpo_node_init(path, bh_new[i], 0) != 0) goto cleanup; - node2 = (struct dx_node *)(bh_new[i]->b_data); - entries2 = node2->entries; @@ -808,87 +1002,124 @@ Index: iam-src/fs/ext3/namei.c BUFFER_TRACE(frame->bh, "get_write_access"); err = ext3_journal_get_write_access(handle, frame->bh); if (err) -@@ -1545,11 +1719,10 @@ static int ext3_dx_add_entry(handle_t *h + goto journal_error; + } + /* Add "safe" node to transaction too */ +- if (safe + 1 != path.dp_frames) { ++ if (safe + 1 != path->dp_frames) { + err = ext3_journal_get_write_access(handle, safe->bh); + if (err) + goto journal_error; +@@ -1545,13 +1800,12 @@ static int ext3_dx_add_entry(handle_t *h entries = frame->entries; count = dx_get_count(entries); - idx = frame->at - entries; -+ idx = dx_entry_diff(&path, frame->at, entries); ++ idx = dx_entry_diff(path, frame->at, entries); bh2 = bh_new[i]; - node2 = (struct dx_node *)(bh2->b_data); - entries2 = node2->entries; -+ entries2 = dx_get_entries(&path, bh2->b_data, 0); ++ entries2 = dx_get_entries(path, bh2->b_data, 0); - if (frame == path.dp_frames) { +- if (frame == path.dp_frames) { ++ if (frame == path->dp_frames) { /* splitting root node. Tricky point: -@@ -1571,19 +1744,19 @@ static int ext3_dx_add_entry(handle_t *h + * + * In the "normal" B-tree we'd split root *and* add +@@ -1566,27 +1820,29 @@ static int ext3_dx_add_entry(handle_t *h + u8 indirects; + struct dx_frame *frames; + +- frames = path.dp_frames; ++ frames = path->dp_frames; + root = (struct dx_root *) frames->bh->b_data; indirects = root->info.indirect_levels; dxtrace(printk("Creating new root %d\n", indirects)); memcpy((char *) entries2, (char *) entries, - count * sizeof(struct dx_entry)); - dx_set_limit(entries2, dx_node_limit(dir)); -+ count * dx_entry_size(&path)); -+ dx_set_limit(entries2, dx_node_limit(&path)); ++ count * dx_entry_size(path)); ++ dx_set_limit(entries2, dx_node_limit(path)); /* Set up root */ dx_set_count(entries, 1); - dx_set_block(entries + 0, newblock[i]); -+ dx_set_block(&path, entries, newblock[i]); ++ dx_set_block(path, entries, newblock[i]); root->info.indirect_levels = indirects + 1; /* Shift frames in the path */ memmove(frames + 2, frames + 1, - (sizeof path.dp_frames) - 2 * sizeof frames[0]); +- (sizeof path.dp_frames) - 2 * sizeof frames[0]); ++ (sizeof path->dp_frames) - 2 * sizeof frames[0]); /* Add new access path frame */ - frames[1].at = entries2 + idx; -+ frames[1].at = dx_entry_shift(&path, entries2, idx); ++ frames[1].at = dx_entry_shift(path, entries2, idx); frames[1].entries = entries = entries2; frames[1].bh = bh2; ++ assert(dx_node_check(path, frame)); ++ frame; -@@ -1594,23 +1767,30 @@ static int ext3_dx_add_entry(handle_t *h ++ assert(dx_node_check(path, frame)); + bh_new[i] = NULL; /* buffer head is "consumed" */ + err = ext3_journal_get_write_access(handle, bh2); + if (err) +@@ -1594,23 +1850,32 @@ static int ext3_dx_add_entry(handle_t *h } else { /* splitting non-root index node. 
*/ unsigned count1 = count/2, count2 = count - count1; - unsigned hash2 = dx_get_hash(entries + count1); + unsigned hash2; + -+ dx_get_key(&path, -+ dx_entry_shift(&path, entries, count1), -+ &hash2); ++ dx_get_key(path, ++ dx_entry_shift(path, entries, count1), ++ (struct dx_key *)&hash2); + dxtrace(printk("Split index %i/%i\n", count1, count2)); - memcpy ((char *) entries2, (char *) (entries + count1), - count2 * sizeof(struct dx_entry)); + memcpy ((char *) entries2, -+ (char *) dx_entry_shift(&path, entries, count1), -+ count2 * dx_entry_size(&path)); ++ (char *) dx_entry_shift(path, entries, count1), ++ count2 * dx_entry_size(path)); dx_set_count (entries, count1); dx_set_count (entries2, count2); - dx_set_limit (entries2, dx_node_limit(dir)); -+ dx_set_limit (entries2, dx_node_limit(&path)); ++ dx_set_limit (entries2, dx_node_limit(path)); /* Which index block gets the new entry? */ if (idx >= count1) { - frame->at = entries2 + idx - count1; -+ frame->at = dx_entry_shift(&path, entries2, ++ frame->at = dx_entry_shift(path, entries2, + idx - count1); frame->entries = entries = entries2; swap(frame->bh, bh2); bh_new[i] = bh2; } - dx_insert_block (frame - 1, hash2, newblock[i]); -+ dx_insert_block(&path, frame - 1, hash2, newblock[i]); ++ dx_insert_block(path, frame - 1, hash2, newblock[i]); ++ assert(dx_node_check(path, frame)); ++ assert(dx_node_check(path, frame - 1)); dxtrace(dx_show_index ("node", frame->entries)); dxtrace(dx_show_index ("node", ((struct dx_node *) bh2->b_data)->entries)); -@@ -1619,7 +1799,7 @@ static int ext3_dx_add_entry(handle_t *h +@@ -1619,9 +1884,10 @@ static int ext3_dx_add_entry(handle_t *h goto journal_error; } } - de = do_split(handle, dir, &bh, --frame, &hinfo, &err); -+ de = do_split(handle, &path, &bh, --frame, &hinfo, &err); ++ de = do_split(handle, path, &bh, --frame, &hinfo, &err); if (!de) goto cleanup; ++ assert(dx_node_check(path, frame)); err = add_dirent_to_buf(handle, dentry, inode, de, bh); + goto cleanup2; + +@@ -1637,7 +1903,7 @@ cleanup2: + } + if (err) + inode->i_size = isize; +- dx_path_fini(&path); ++ dx_path_fini(path); + return err; + } + #endif diff --git a/lustre/kernel_patches/patches/ext3-htree-path.patch b/lustre/kernel_patches/patches/ext3-htree-path.patch new file mode 100644 index 0000000..893d1d1 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-htree-path.patch @@ -0,0 +1,406 @@ +Index: iam-src/fs/ext3/namei.c +=================================================================== +--- iam-src.orig/fs/ext3/namei.c 2006-02-09 20:44:02.000000000 +0300 ++++ iam-src/fs/ext3/namei.c 2006-02-10 18:23:32.000000000 +0300 +@@ -147,6 +147,15 @@ struct dx_map_entry + u32 offs; + }; + ++/* ++ * Structure to keep track of a path drilled through htree. 
++ */ ++struct dx_path { ++ struct inode *dp_object; ++ struct dx_frame dp_frames[DX_MAX_TREE_HEIGHT]; ++ struct dx_frame *dp_frame; ++}; ++ + #ifdef CONFIG_EXT3_INDEX + static inline unsigned dx_get_block (struct dx_entry *entry); + static void dx_set_block (struct dx_entry *entry, unsigned value); +@@ -161,9 +170,8 @@ static unsigned dx_node_limit (struct in + static struct dx_frame *dx_probe(struct dentry *dentry, + struct inode *dir, + struct dx_hash_info *hinfo, +- struct dx_frame *frame, ++ struct dx_path *path, + int *err); +-static void dx_release (struct dx_frame *frames); + static int dx_make_map (struct ext3_dir_entry_2 *de, int size, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); + static void dx_sort_map(struct dx_map_entry *map, unsigned count); +@@ -172,9 +180,7 @@ static struct ext3_dir_entry_2 *dx_move_ + static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); + static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); + static int ext3_htree_next_block(struct inode *dir, __u32 hash, +- struct dx_frame *frame, +- struct dx_frame *frames, +- __u32 *start_hash); ++ struct dx_path *path, __u32 *start_hash); + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, + struct ext3_dir_entry_2 **res_dir, int *err); + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, +@@ -332,13 +338,13 @@ struct stats dx_show_entries(struct dx_h + */ + static struct dx_frame * + dx_probe(struct dentry *dentry, struct inode *dir, +- struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) ++ struct dx_hash_info *hinfo, struct dx_path *path, int *err) + { + unsigned count, indirect; + struct dx_entry *at, *entries, *p, *q, *m; + struct dx_root *root; + struct buffer_head *bh; +- struct dx_frame *frame = frame_in; ++ struct dx_frame *frame = path->dp_frames; + u32 hash; + + frame->bh = NULL; +@@ -352,8 +358,7 @@ dx_probe(struct dentry *dentry, struct i + root->info.hash_version != DX_HASH_R5 && + root->info.hash_version != DX_HASH_LEGACY) { + ext3_warning(dir->i_sb, __FUNCTION__, +- "Unrecognised inode hash code %d", +- root->info.hash_version); ++ "Unrecognised inode hash code %d", root->info.hash_version); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; +@@ -424,7 +429,8 @@ dx_probe(struct dentry *dentry, struct i + frame->bh = bh; + frame->entries = entries; + frame->at = at; +- if (!indirect--) return frame; ++ if (!indirect--) ++ return path->dp_frame = frame; + if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) + goto fail2; + at = entries = ((struct dx_node *) bh->b_data)->entries; +@@ -432,7 +438,7 @@ dx_probe(struct dentry *dentry, struct i + frame++; + } + fail2: +- while (frame >= frame_in) { ++ while (frame >= path->dp_frames) { + brelse(frame->bh); + frame--; + } +@@ -440,16 +446,20 @@ fail: + return NULL; + } + +-static void dx_release (struct dx_frame *frames) ++static inline void dx_path_init(struct dx_path *path, struct inode *inode) + { +- int height; ++ memset(path, 0, sizeof *path); ++ path->dp_object = inode; ++ path->dp_frame = path->dp_frames; ++} + +- if (frames[0].bh == NULL) +- return; +- height = ((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels; +- for (; height >= 0; height--) { +- assert(frames[height].bh != NULL); +- brelse(frames[height].bh); ++static inline void dx_path_fini(struct dx_path *path) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(path->dp_frames); i++) { ++ if (path->dp_frames[i].bh != NULL) ++ brelse(path->dp_frames[i].bh); + } + } + +@@ 
-471,16 +481,14 @@ static void dx_release (struct dx_frame + * hash of the next page. + */ + static int ext3_htree_next_block(struct inode *dir, __u32 hash, +- struct dx_frame *frame, +- struct dx_frame *frames, +- __u32 *start_hash) ++ struct dx_path *path, __u32 *start_hash) + { + struct dx_frame *p; + struct buffer_head *bh; + int err, num_frames = 0; + __u32 bhash; + +- p = frame; ++ p = path->dp_frame; + /* + * Find the next leaf page by incrementing the frame pointer. + * If we run out of entries in the interior node, loop around and +@@ -491,10 +499,10 @@ static int ext3_htree_next_block(struct + while (1) { + if (++(p->at) < p->entries + dx_get_count(p->entries)) + break; +- if (p == frames) ++ if (p == path->dp_frames) + return 0; + num_frames++; +- p--; ++ --p; + } + + /* +@@ -516,10 +524,9 @@ static int ext3_htree_next_block(struct + * block so no check is necessary + */ + while (num_frames--) { +- if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), +- 0, &err))) ++ if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), 0, &err))) + return err; /* Failure */ +- p++; ++ ++p; + brelse (p->bh); + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; +@@ -591,7 +598,7 @@ int ext3_htree_fill_tree(struct file *di + { + struct dx_hash_info hinfo; + struct ext3_dir_entry_2 *de; +- struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame; ++ struct dx_path path; + struct inode *dir; + int block, err; + int count = 0; +@@ -601,6 +608,7 @@ int ext3_htree_fill_tree(struct file *di + dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, + start_minor_hash)); + dir = dir_file->f_dentry->d_inode; ++ dx_path_init(&path, dir); + if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { + hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; +@@ -611,13 +619,12 @@ int ext3_htree_fill_tree(struct file *di + } + hinfo.hash = start_hash; + hinfo.minor_hash = 0; +- frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err); +- if (!frame) ++ if (!dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, &path, &err)) + return err; + + /* Add '.' and '..' 
from the htree header */ + if (!start_hash && !start_minor_hash) { +- de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data; ++ de = (struct ext3_dir_entry_2 *) path.dp_frames[0].bh->b_data; + if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0) + goto errout; + de = ext3_next_entry(de); +@@ -627,7 +634,7 @@ int ext3_htree_fill_tree(struct file *di + } + + while (1) { +- block = dx_get_block(frame->at); ++ block = dx_get_block(path.dp_frame->at); + ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, + start_hash, start_minor_hash); + if (ret < 0) { +@@ -636,8 +643,7 @@ int ext3_htree_fill_tree(struct file *di + } + count += ret; + hashval = ~0; +- ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, +- frame, frames, &hashval); ++ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, &path, &hashval); + *next_hash = hashval; + if (ret < 0) { + err = ret; +@@ -652,12 +658,12 @@ int ext3_htree_fill_tree(struct file *di + (count && ((hashval & 1) == 0))) + break; + } +- dx_release(frames); ++ dx_path_fini(&path); + dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", + count, *next_hash)); + return count; + errout: +- dx_release(frames); ++ dx_path_fini(&path); + return (err); + } + +@@ -927,7 +933,8 @@ static struct buffer_head * ext3_dx_find + struct super_block * sb; + struct dx_hash_info hinfo; + u32 hash; +- struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame; ++ struct dx_path path; ++ struct dx_entry dummy_dot; + struct ext3_dir_entry_2 *de, *top; + struct buffer_head *bh; + unsigned long block; +@@ -936,20 +943,20 @@ static struct buffer_head * ext3_dx_find + const u8 *name = dentry->d_name.name; + struct inode *dir = dentry->d_parent->d_inode; + ++ dx_path_init(&path, dir); + sb = dir->i_sb; + /* NFS may look up ".." - look at dx_root directory block */ + if (namelen > 2 || name[0] != '.'||(name[1] != '.' 
&& name[1] != '\0')){ +- if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) ++ if (!(dx_probe(dentry, NULL, &hinfo, &path, err))) + return NULL; + } else { +- frame = frames; +- frame->bh = NULL; /* for dx_release() */ +- frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ +- dx_set_block(frame->at, 0); /* dx_root block is 0 */ ++ path.dp_frame->bh = NULL; /* for dx_path_fini() */ ++ path.dp_frame->at = &dummy_dot; /* hack for zero entry*/ ++ dx_set_block(path.dp_frame->at, 0); /* dx_root block is 0 */ + } + hash = hinfo.hash; + do { +- block = dx_get_block(frame->at); ++ block = dx_get_block(path.dp_frame->at); + if (!(bh = ext3_bread (NULL,dir, block, 0, err))) + goto errout; + de = (struct ext3_dir_entry_2 *) bh->b_data; +@@ -965,13 +972,12 @@ static struct buffer_head * ext3_dx_find + goto errout; + } + *res_dir = de; +- dx_release (frames); ++ dx_path_fini(&path); + return bh; + } + brelse (bh); + /* Check to see if we should continue to search */ +- retval = ext3_htree_next_block(dir, hash, frame, +- frames, NULL); ++ retval = ext3_htree_next_block(dir, hash, &path, NULL); + if (retval < 0) { + ext3_warning(sb, __FUNCTION__, + "error reading index page in directory #%lu", +@@ -984,7 +990,7 @@ static struct buffer_head * ext3_dx_find + *err = -ENOENT; + errout: + dxtrace(printk("%s not found\n", name)); +- dx_release (frames); ++ dx_path_fini(&path); + return NULL; + } + #endif +@@ -1297,7 +1303,7 @@ static int make_indexed_dir(handle_t *ha + int namelen = dentry->d_name.len; + struct buffer_head *bh2; + struct dx_root *root; +- struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame; ++ struct dx_path path; + struct dx_entry *entries; + struct ext3_dir_entry_2 *de, *de2; + char *data1, *top; +@@ -1308,6 +1314,7 @@ static int make_indexed_dir(handle_t *ha + u32 block; + struct fake_dirent *fde; + ++ dx_path_init(&path, dir); + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk("Creating index\n")); + retval = ext3_journal_get_write_access(handle, bh); +@@ -1352,14 +1359,13 @@ static int make_indexed_dir(handle_t *ha + hinfo.hash_version = root->info.hash_version; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; + ext3fs_dirhash(name, namelen, &hinfo); +- frame = frames; +- frame->entries = entries; +- frame->at = entries; +- frame->bh = bh; ++ path.dp_frame->entries = entries; ++ path.dp_frame->at = entries; ++ path.dp_frame->bh = bh; + bh = bh2; +- de = do_split(handle,dir, &bh, frame, &hinfo, &retval); +- dx_release (frames); +- if (!(de)) ++ de = do_split(handle,dir, &bh, path.dp_frame, &hinfo, &retval); ++ dx_path_fini(&path); ++ if (!de) + return retval; + + return add_dirent_to_buf(handle, dentry, inode, de, bh); +@@ -1439,7 +1445,8 @@ static int ext3_add_entry (handle_t *han + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) + { +- struct dx_frame frames[DX_MAX_TREE_HEIGHT] = {{0,},}, *frame, *safe; ++ struct dx_path path; ++ struct dx_frame *frame, *safe; + struct dx_node *node2; + struct dx_entry *entries; /* old block contents */ + struct dx_entry *entries2; /* new block contents */ +@@ -1455,9 +1462,10 @@ static int ext3_dx_add_entry(handle_t *h + int i; + size_t isize; + +- frame = dx_probe(dentry, NULL, &hinfo, frames, &err); +- if (!frame) ++ dx_path_init(&path, dir); ++ if (!dx_probe(dentry, NULL, &hinfo, &path, &err)) + return err; ++ frame = path.dp_frame; + entries = frame->entries; + + /* XXX nikita: global serialization! 
*/ +@@ -1495,7 +1503,7 @@ static int ext3_dx_add_entry(handle_t *h + dx_get_count(entries), dx_get_limit(entries))); + + /* What levels need split? */ +- for (nr_splet = 0; frame >= frames && ++ for (nr_splet = 0; frame >= path.dp_frames && + dx_get_count(frame->entries) == dx_get_limit(frame->entries); + --frame, ++nr_splet) { + if (nr_splet == DX_MAX_TREE_HEIGHT) { +@@ -1523,7 +1531,7 @@ static int ext3_dx_add_entry(handle_t *h + goto journal_error; + } + /* Add "safe" node to transaction too */ +- if (safe + 1 != frames) { ++ if (safe + 1 != path.dp_frames) { + err = ext3_journal_get_write_access(handle, safe->bh); + if (err) + goto journal_error; +@@ -1543,7 +1551,7 @@ static int ext3_dx_add_entry(handle_t *h + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; + +- if (frame == frames) { ++ if (frame == path.dp_frames) { + /* splitting root node. Tricky point: + * + * In the "normal" B-tree we'd split root *and* add +@@ -1556,7 +1564,9 @@ static int ext3_dx_add_entry(handle_t *h + */ + struct dx_root *root; + u8 indirects; ++ struct dx_frame *frames; + ++ frames = path.dp_frames; + root = (struct dx_root *) frames->bh->b_data; + indirects = root->info.indirect_levels; + dxtrace(printk("Creating new root %d\n", indirects)); +@@ -1571,7 +1581,7 @@ static int ext3_dx_add_entry(handle_t *h + + /* Shift frames in the path */ + memmove(frames + 2, frames + 1, +- (sizeof frames) - 2 * sizeof frames[0]); ++ (sizeof path.dp_frames) - 2 * sizeof frames[0]); + /* Add new access path frame */ + frames[1].at = entries2 + idx; + frames[1].entries = entries = entries2; +@@ -1627,7 +1637,7 @@ cleanup2: + } + if (err) + inode->i_size = isize; +- dx_release(frames); ++ dx_path_fini(&path); + return err; + } + #endif diff --git a/lustre/kernel_patches/patches/ext3-htree-r5-hash.patch b/lustre/kernel_patches/patches/ext3-htree-r5-hash.patch new file mode 100644 index 0000000..48897e7 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-htree-r5-hash.patch @@ -0,0 +1,88 @@ +Index: iam-src/fs/ext3/hash.c +=================================================================== +--- iam-src.orig/fs/ext3/hash.c 2006-02-11 01:08:59.000000000 +0300 ++++ iam-src/fs/ext3/hash.c 2006-02-11 20:46:22.000000000 +0300 +@@ -4,7 +4,7 @@ + * Copyright (C) 2002 by Theodore Ts'o + * + * This file is released under the GPL v2. +- * ++ * + * This file may be redistributed under the terms of the GNU Public + * License. + */ +@@ -115,6 +115,18 @@ static __u32 dx_hack_hash (const char *n + return (hash0 << 1); + } + ++static __u32 dx_r5_hash(const signed char *msg, int len) ++{ ++ __u32 a = 0; ++ while (len--) { ++ a += *msg << 4; ++ a += *msg >> 4; ++ a *= 11; ++ msg++; ++ } ++ return a; ++} ++ + static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) + { + __u32 pad, val; +@@ -146,11 +158,11 @@ static void str2hashbuf(const char *msg, + * Returns the hash of a filename. If len is 0 and name is NULL, then + * this function can be used to test whether or not a hash version is + * supported. +- * ++ * + * The seed is an 4 longword (32 bits) "secret" which can be used to + * uniquify a hash. If the seed is all zero's, then some default seed + * may be used. +- * ++ * + * A particular hash version specifies whether or not the seed is + * represented, and whether or not the returned hash is 32 bits or 64 + * bits. 32 bit hashes will return 0 for the minor hash. 
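/*
 * [Editorial note, not part of the patch above] The dx_r5_hash() routine that
 * ext3-htree-r5-hash.patch introduces is self-contained, so its behaviour is
 * easy to check outside the kernel.  The sketch below mirrors that algorithm
 * line for line, but it is a userspace approximation: the name r5_hash, the
 * uint32_t type standing in for the kernel __u32, and the main()/printf()
 * driver are all illustrative additions, not part of the patch.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same arithmetic as dx_r5_hash(): fold each byte into the accumulator,
 * shifted both left and right by four bits, then multiply by 11. */
static uint32_t r5_hash(const signed char *msg, int len)
{
	uint32_t a = 0;

	while (len--) {
		a += *msg << 4;
		a += *msg >> 4;
		a *= 11;
		msg++;
	}
	return a;
}

int main(void)
{
	const char *name = "lost+found";

	/* Hash a directory entry name the way the htree index would key it. */
	printf("r5(\"%s\") = %#x\n", name,
	       (unsigned)r5_hash((const signed char *)name, (int)strlen(name)));
	return 0;
}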
+@@ -205,6 +217,9 @@ int ext3fs_dirhash(const char *name, int + hash = buf[0]; + minor_hash = buf[1]; + break; ++ case DX_HASH_R5: ++ hash = dx_r5_hash(name, len); ++ break; + default: + hinfo->hash = 0; + return -1; +Index: iam-src/fs/ext3/namei.c +=================================================================== +--- iam-src.orig/fs/ext3/namei.c 2006-02-11 01:09:12.000000000 +0300 ++++ iam-src/fs/ext3/namei.c 2006-02-11 20:45:58.000000000 +0300 +@@ -370,6 +370,7 @@ dx_probe(struct dentry *dentry, struct i + root = (struct dx_root *) bh->b_data; + if (root->info.hash_version != DX_HASH_TEA && + root->info.hash_version != DX_HASH_HALF_MD4 && ++ root->info.hash_version != DX_HASH_R5 && + root->info.hash_version != DX_HASH_LEGACY) { + ext3_warning(dir->i_sb, __FUNCTION__, + "Unrecognised inode hash code %d", root->info.hash_version); +@@ -1363,6 +1364,7 @@ static int make_indexed_dir(handle_t *ha + memset (&root->info, 0, sizeof(root->info)); + root->info.info_length = sizeof(root->info); + root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; ++ root->info.hash_version = DX_HASH_R5; + entries = root->entries; + dx_set_block (entries, 1); + dx_set_count (entries, 1); +Index: iam-src/include/linux/ext3_fs.h +=================================================================== +--- iam-src.orig/include/linux/ext3_fs.h 2006-02-11 01:08:59.000000000 +0300 ++++ iam-src/include/linux/ext3_fs.h 2006-02-11 20:45:58.000000000 +0300 +@@ -665,6 +665,7 @@ struct ext3_dir_entry_2 { + #define DX_HASH_LEGACY 0 + #define DX_HASH_HALF_MD4 1 + #define DX_HASH_TEA 2 ++#define DX_HASH_R5 3 + + /* hash info structure used by the directory hash */ + struct dx_hash_info diff --git a/lustre/kernel_patches/patches/ext3-iam-ops.patch b/lustre/kernel_patches/patches/ext3-iam-ops.patch new file mode 100644 index 0000000..f0181d9 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-iam-ops.patch @@ -0,0 +1,646 @@ +Index: linux-2.6.9/fs/ext3/namei.c +=================================================================== +--- linux-2.6.9.orig/fs/ext3/namei.c 2006-04-23 22:05:38.000000000 +0800 ++++ linux-2.6.9/fs/ext3/namei.c 2006-04-23 22:22:58.000000000 +0800 +@@ -82,13 +82,16 @@ + * + * Entries in index node are sorted by their key value. + * ++ * Format of leaf node: + * +- * +- * +- * +- * +- * +- * ++ * +-----+-------+-------+-------+------+-------+------------+ ++ * | | count | | | | | | ++ * | gap | / | leaf | leaf | .... | leaf | free space | ++ * | | limit | | | | | | ++ * +-----+-------+-------+-------+------+-------+------------+ ++ ++ * leaf For leaf entry: consists of a rec immediately followd by ++ * a key. size of a key and size of a rec depends on container. + * + * + * +@@ -241,6 +244,7 @@ + }; + + /* leaf node reached by tree lookup */ ++#define iam_leaf_entry iam_rec + struct iam_leaf { + struct buffer_head *bh; + struct iam_leaf_entry *entries; +@@ -508,6 +512,11 @@ + IAM_IT_ATTACHED + }; + ++struct htree_cookie { ++ struct dx_hash_info *hinfo; ++ struct dentry *dentry; ++}; ++ + /* + * Iterator. 
+ * +@@ -704,7 +713,7 @@ + struct inode *inode); + + static inline void iam_path_init(struct iam_path *path, +- struct iam_container *c); ++ struct iam_container *c, struct htree_cookie *hc); + static inline void iam_path_fini(struct iam_path *path); + + +@@ -865,11 +874,6 @@ + return 0; + } + +-struct htree_cookie { +- struct dx_hash_info *hinfo; +- struct dentry *dentry; +-}; +- + static int htree_node_check(struct iam_path *path, struct iam_frame *frame) + { + void *data; +@@ -1171,11 +1175,13 @@ + } + } + +-static inline void iam_path_init(struct iam_path *path, struct iam_container *c) ++static inline void iam_path_init(struct iam_path *path, struct iam_container *c, ++ struct htree_cookie *hc) + { + memset(path, 0, sizeof *path); + path->ip_container = c; + path->ip_frame = path->ip_frames; ++ path->ip_descr_data = hc; + } + + static inline void iam_path_fini(struct iam_path *path) +@@ -1201,7 +1207,7 @@ + * iam_path_fini(). + */ + iput(inode); +- iam_path_init(&path->ipc_path, &path->ipc_container); ++ iam_path_init(&path->ipc_path, &path->ipc_container, NULL); + for (i = 0; i < ARRAY_SIZE(path->ipc_path.ip_key_scratch); ++i) + path->ipc_path.ip_key_scratch[i] = + (struct iam_key *)&path->ipc_scrach[i]; +@@ -1213,6 +1219,382 @@ + iam_container_fini(&path->ipc_container); + } + ++static int iam_leaf_init(struct iam_path *path, struct iam_leaf *leaf) ++{ ++ int block, err; ++ struct buffer_head *bh; ++ ++ block = dx_get_block(path, path->ip_frame->at); ++ err = path_descr(path)->id_node_read(path->ip_container, block, ++ NULL, &bh); ++ if (err) ++ return err; ++ ++ leaf->bh = bh; ++ leaf->entries = (struct iam_leaf_entry *)bh->b_data; ++ return 0; ++} ++ ++static void iam_leaf_fini(struct iam_leaf *leaf) ++{ ++ if (leaf->bh) ++ brelse(leaf->bh); ++} ++ ++int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r) ++{ ++ struct dx_hash_info hinfo; ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; ++ struct htree_cookie hc = { ++ .hinfo = &hinfo ++ }; ++ int err, i; ++ ++ iam_path_init(path, c, &hc); ++ for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i) ++ path->ip_key_scratch[i] = ++ (struct iam_key *)&cpath.ipc_scrach[i]; ++ err = dx_lookup(path); ++ do { ++ struct iam_leaf leaf; ++ err = iam_leaf_init(path, &leaf); ++ if (err) ++ goto errout; ++ ++ for (path_descr(path)->id_leaf.start(c, &leaf); ++ !path_descr(path)->id_leaf.at_end(c, &leaf); ++ path_descr(path)->id_leaf.next(c, &leaf)) { ++ struct iam_key *key; ++ ++ key = kmalloc(path_descr(path)->id_key_size, GFP_KERNEL); ++ path_descr(path)->id_leaf.key(c, &leaf, key); ++ if (keycmp(c, k, key) == 0) { ++ memcpy(r, path_descr(path)->id_leaf.rec(c, &leaf), ++ path_descr(path)->id_rec_size); ++ iam_path_fini(path); ++ iam_leaf_fini(&leaf); ++ return 0; ++ } ++ } ++ ++ iam_leaf_fini(&leaf); ++ /* Check to see if we should continue to search */ ++ err = ext3_htree_next_block(c->ic_object, hinfo.hash, path, NULL); ++ if (err < 0) ++ goto errout; ++ } while (err == 1); ++errout: ++ iam_path_fini(path); ++ return(err); ++} ++ ++static inline size_t iam_leaf_entry_size(struct iam_path *p) ++{ ++ return path_descr(p)->id_rec_size + path_descr(p)->id_key_size; ++} ++ ++static inline ptrdiff_t iam_leaf_entry_diff(struct iam_path *p, ++ struct iam_leaf_entry *e1, struct iam_leaf_entry *e2) ++{ ++ ptrdiff_t diff; ++ ++ diff = (void *)e1 - (void *)e2; ++ assert(diff / iam_leaf_entry_size(p) * iam_leaf_entry_size(p) == diff); ++ return diff / iam_leaf_entry_size(p); ++} ++ ++static inline struct 
iam_leaf_entry* ++iam_leaf_entry_shift(struct iam_path *p, struct iam_leaf_entry *entry, int shift) ++{ ++ void *e = entry; ++ return e + shift * iam_leaf_entry_size(p); ++} ++ ++static inline struct iam_key * ++dx_leaf_get_key(struct iam_path *p, struct iam_leaf_entry *e, struct iam_key *key) ++{ ++ memcpy(key, e, path_descr(p)->id_key_size); ++ return key; ++} ++ ++static inline struct iam_key * ++iam_leaf_key_at(struct iam_path *p, struct iam_leaf_entry *entry) ++{ ++ void *e = entry; ++ return e + path_descr(p)->id_rec_size; ++} ++static inline struct iam_leaf_entry * ++iam_leaf_entry_at(struct iam_path *p, struct iam_leaf_entry *entry) ++{ ++ return entry; ++} ++ ++static int iam_leaf_lookup(struct iam_path *path, struct iam_leaf *leaf, ++ struct iam_key *k) ++{ ++ struct iam_leaf_entry *p, *q, *m; ++ struct iam_leaf_entry *entries = leaf->entries; ++ int count = dx_get_count((struct iam_entry *)entries); ++ ++ p = iam_leaf_entry_shift(path, entries, 1); ++ q = iam_leaf_entry_shift(path, entries, count - 1); ++ while (p <= q) { ++ m = iam_leaf_entry_shift(path, ++ p, iam_leaf_entry_diff(path, q, p) / 2); ++ dxtrace(printk(".")); ++ if (keycmp(path->ip_container, iam_leaf_key_at(path, m), ++ path->ip_key_target) > 0) ++ q = iam_leaf_entry_shift(path, m, -1); ++ else ++ p = iam_leaf_entry_shift(path, m, +1); ++ } ++ leaf->at = q; ++ return 0; ++} ++ ++/*XXX what kind of lock should this entry be locked: WangDi */ ++static int iam_leaf_insert(handle_t *handle, struct iam_path *path, ++ struct iam_key *k, struct iam_rec *r) ++{ ++ struct iam_leaf leaf; ++ struct iam_leaf_entry *p, *q; ++ int err, count; ++ ++ err = iam_leaf_init(path, &leaf); ++ if (err) ++ goto errout; ++ path_descr(path)->id_leaf.start(path->ip_container, &leaf); ++ count = dx_get_count((struct iam_entry *)leaf.entries); ++ if (dx_get_count((struct iam_entry *)leaf.entries) >= ++ dx_get_limit((struct iam_entry *)leaf.entries)){ ++ err = -ENOSPC; ++ goto errout; ++ } ++ ++ err = iam_leaf_lookup(path, &leaf, k); ++ if (err) ++ goto errout; ++ ++ /*insert the k/r to leaf entries*/ ++ p = iam_leaf_entry_shift(path, leaf.at, 1); ++ q = iam_leaf_entry_shift(path, leaf.entries, count - 1); ++ while (q < p) { ++ memcpy(iam_leaf_entry_shift(path, q, 1), q, iam_leaf_entry_size(path)); ++ q = iam_leaf_entry_shift(path, q, -1); ++ } ++ memcpy(iam_leaf_entry_at(path, p), r, path_descr(path)->id_rec_size); ++ memcpy(iam_leaf_key_at(path, p), k, path_descr(path)->id_key_size); ++ ++ dx_set_count((struct iam_entry*)leaf.entries, count + 1); ++ err = ext3_journal_dirty_metadata(handle, leaf.bh); ++ if (err) ++ ext3_std_error(path->ip_container->ic_object->i_sb, err); ++errout: ++ iam_leaf_fini(&leaf); ++ return err; ++} ++ ++static int split_leaf_node(handle_t *handle, struct iam_path *path) ++{ ++ struct inode *dir = path_obj(path); ++ unsigned continued = 0; ++ struct buffer_head *bh2; ++ u32 newblock, hash_split; ++ char *data2; ++ struct iam_leaf leaf; ++ unsigned split; ++ int err; ++ ++ bh2 = ext3_append (handle, dir, &newblock, &err); ++ if (!(bh2)) { ++ err = -ENOSPC; ++ goto errout; ++ } ++ err = iam_leaf_init(path, &leaf); ++ if (err) ++ goto errout; ++ ++ BUFFER_TRACE(leaf.bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, leaf.bh); ++ if (err) { ++ journal_error: ++ iam_leaf_fini(&leaf); ++ brelse(bh2); ++ ext3_std_error(dir->i_sb, err); ++ err = -EIO; ++ goto errout; ++ } ++ data2 = bh2->b_data; ++ split = dx_get_count((struct iam_entry*)leaf.entries)/2; ++ hash_split = *(__u32*)iam_leaf_key_at(path, 
iam_leaf_entry_shift(path, leaf.entries, split)); ++ if (keycmp(path->ip_container, iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split)), ++ iam_leaf_key_at(path, iam_leaf_entry_shift(path, leaf.entries, split -1))) == 0) ++ continued = 1; ++ ++ memcpy(iam_leaf_entry_shift(path, (struct iam_leaf_entry *)data2, 1), ++ iam_leaf_entry_shift(path, leaf.entries, split), ++ split * iam_leaf_entry_size(path)); ++ ++ /* Which block gets the new entry? */ ++ dx_insert_block(path, path->ip_frame, hash_split + continued, newblock); ++ err = ext3_journal_dirty_metadata (handle, bh2); ++ if (err) ++ goto journal_error; ++ err = ext3_journal_dirty_metadata (handle, leaf.bh); ++ if (err) ++ goto journal_error; ++ brelse (bh2); ++ iam_leaf_fini(&leaf); ++errout: ++ return err; ++} ++ ++static int split_index_node(handle_t *handle, struct iam_path *path); ++int iam_insert(handle_t *handle, struct iam_container *c, struct iam_key *k, ++ struct iam_rec *r) ++{ ++ struct dx_hash_info hinfo; ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; ++ struct htree_cookie hc = { ++ .hinfo = &hinfo ++ }; ++ int err, i; ++ ++ iam_path_init(path, c, &hc); ++ for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i) ++ path->ip_key_scratch[i] = ++ (struct iam_key *)&cpath.ipc_scrach[i]; ++ err = dx_lookup(path); ++ if (err) ++ goto errout; ++ ++ err = iam_leaf_insert(handle, path, k, r); ++ ++ if (err != -ENOSPC) ++ goto errout; ++ ++ err = split_index_node(handle, path); ++ if (err) ++ goto errout; ++ ++ err = split_leaf_node(handle, path); ++ if (err) ++ goto errout; ++ ++ err = iam_leaf_insert(handle, path, k, r); ++errout: ++ iam_path_fini(path); ++ return(err); ++} ++ ++static int iam_leaf_delete(handle_t *handle, struct iam_path *path, ++ struct iam_key *k) ++{ ++ struct iam_leaf leaf; ++ struct iam_leaf_entry *p, *q; ++ int err, count; ++ ++ err = iam_leaf_init(path, &leaf); ++ if (err) ++ goto errout; ++ ++ err = iam_leaf_lookup(path, &leaf, k); ++ if (err) ++ goto errout; ++ ++ count = dx_get_count((struct iam_entry*)leaf.entries); ++ /*delete the k to leaf entries*/ ++ p = iam_leaf_entry_shift(path, leaf.at, 1); ++ q = iam_leaf_entry_shift(path, leaf.entries, count - 1); ++ while (p < q) { ++ memcpy(p, iam_leaf_entry_shift(path, p, 1), iam_leaf_entry_size(path)); ++ p = iam_leaf_entry_shift(path, p, 1); ++ } ++ dx_set_count((struct iam_entry*)leaf.entries, count - 1); ++ ++ err = ext3_journal_dirty_metadata(handle, leaf.bh); ++ if (err) ++ ext3_std_error(path_obj(path)->i_sb, err); ++errout: ++ iam_leaf_fini(&leaf); ++ return err; ++} ++ ++int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k) ++{ ++ struct dx_hash_info hinfo; ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; ++ struct htree_cookie hc = { ++ .hinfo = &hinfo ++ }; ++ int err, i; ++ ++ iam_path_init(path, c, &hc); ++ for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i) ++ path->ip_key_scratch[i] = ++ (struct iam_key *)&cpath.ipc_scrach[i]; ++ err = dx_lookup(path); ++ if (err) ++ goto errout; ++ ++ err = iam_leaf_delete(h, path, k); ++errout: ++ iam_path_fini(path); ++ return err; ++} ++ ++static int iam_leaf_update(handle_t *handle, struct iam_path *path, ++ struct iam_key *k, struct iam_rec *r) ++{ ++ struct iam_leaf leaf; ++ int err; ++ ++ err = iam_leaf_init(path, &leaf); ++ if (err) ++ goto errout; ++ ++ err = iam_leaf_lookup(path, &leaf, k); ++ if (err) ++ goto errout; ++ ++ memcpy(iam_leaf_entry_at(path, leaf.at), r, path_descr(path)->id_rec_size); ++ 
memcpy(iam_leaf_key_at(path, leaf.at), k, path_descr(path)->id_key_size); ++ ++ err = ext3_journal_dirty_metadata(handle, leaf.bh); ++ if (err) ++ ext3_std_error(path_obj(path)->i_sb, err); ++errout: ++ iam_leaf_fini(&leaf); ++ return err; ++} ++ ++int iam_update(handle_t *h, struct iam_container *c, ++ struct iam_key *k, struct iam_rec *r) ++{ ++ struct dx_hash_info hinfo; ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; ++ struct htree_cookie hc = { ++ .hinfo = &hinfo ++ }; ++ int err, i; ++ ++ iam_path_init(path, c, &hc); ++ for (i = 0; i < ARRAY_SIZE(path->ip_key_scratch); ++i) ++ path->ip_key_scratch[i] = ++ (struct iam_key *)&cpath.ipc_scrach[i]; ++ err = dx_lookup(path); ++ if (err) ++ goto errout; ++ ++ err = iam_leaf_update(h, path, k, r); ++errout: ++ iam_path_fini(path); ++ return err; ++} + /* + * This function increments the frame pointer to search the next leaf + * block, and reads in the necessary intervening nodes if the search +@@ -2213,59 +2595,21 @@ + } + + #ifdef CONFIG_EXT3_INDEX +-/* +- * Returns 0 for success, or a negative error value +- */ +-static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) +-{ +- struct iam_path_compat cpath; +- struct iam_path *path = &cpath.ipc_path; +- struct iam_descr *param; +- struct iam_frame *frame, *safe; ++static int split_index_node(handle_t *handle, struct iam_path *path) ++{ ++ + struct iam_entry *entries; /* old block contents */ + struct iam_entry *entries2; /* new block contents */ +- struct dx_hash_info hinfo; +- struct buffer_head * bh; ++ struct iam_frame *frame, *safe; + struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0}; +- struct inode *dir = dentry->d_parent->d_inode; +- struct super_block * sb = dir->i_sb; +- struct ext3_dir_entry_2 *de; + u32 newblock[DX_MAX_TREE_HEIGHT] = {0}; +- int err; ++ struct inode *dir = path_obj(path); + int nr_splet; +- int i; +- size_t isize; ++ int i, err; + +- iam_path_compat_init(&cpath, dir); +- param = path_descr(path); +- +- err = dx_probe(dentry, NULL, &hinfo, path); +- if (err != 0) +- return err; + frame = path->ip_frame; + entries = frame->entries; + +- /* XXX nikita: global serialization! */ +- isize = dir->i_size; +- +- err = param->id_node_read(path->ip_container, +- (iam_ptr_t)dx_get_block(path, +- frame->at), handle, &bh); +- if (err != 0) +- goto cleanup; +- +- BUFFER_TRACE(bh, "get_write_access"); +- err = ext3_journal_get_write_access(handle, bh); +- if (err) +- goto journal_error; +- +- err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); +- if (err != -ENOSPC) { +- bh = NULL; +- goto cleanup; +- } +- + /* + * Tall-tree handling: we might have to split multiple index blocks + * all the way up to tree root. 
Tricky point here is error handling: +@@ -2288,7 +2632,7 @@ + dx_get_count(frame->entries) == dx_get_limit(frame->entries); + --frame, ++nr_splet) { + if (nr_splet == DX_MAX_TREE_HEIGHT) { +- ext3_warning(sb, __FUNCTION__, ++ ext3_warning(dir->i_sb, __FUNCTION__, + "Directory index full!\n"); + err = -ENOSPC; + goto cleanup; +@@ -2301,7 +2645,7 @@ + for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) { + bh_new[i] = ext3_append (handle, dir, &newblock[i], &err); + if (!bh_new[i] || +- param->id_node_init(path->ip_container, bh_new[i], 0) != 0) ++ path_descr(path)->id_node_init(path->ip_container, bh_new[i], 0) != 0) + goto cleanup; + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, frame->bh); +@@ -2407,23 +2751,81 @@ + goto journal_error; + } + } ++ goto cleanup; ++journal_error: ++ ext3_std_error(dir->i_sb, err); ++ ++cleanup: ++ for (i = 0; i < ARRAY_SIZE(bh_new); ++i) { ++ if (bh_new[i] != NULL) ++ brelse(bh_new[i]); ++ } ++ return err; ++} ++ ++/* ++ * Returns 0 for success, or a negative error value ++ */ ++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; ++ struct iam_descr *param; ++ struct iam_frame *frame; ++ struct dx_hash_info hinfo; ++ struct buffer_head * bh = NULL; ++ struct inode *dir = dentry->d_parent->d_inode; ++ struct ext3_dir_entry_2 *de; ++ int err; ++ size_t isize; ++ ++ iam_path_compat_init(&cpath, dir); ++ param = path_descr(path); ++ ++ err = dx_probe(dentry, NULL, &hinfo, path); ++ if (err != 0) ++ return err; ++ frame = path->ip_frame; ++ ++ /* XXX nikita: global serialization! */ ++ isize = dir->i_size; ++ ++ err = param->id_node_read(path->ip_container, (iam_ptr_t)dx_get_block(path, frame->at), ++ handle, &bh); ++ if (err != 0) ++ goto cleanup; ++ ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto journal_error; ++ ++ err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); ++ if (err != -ENOSPC) { ++ bh = NULL; ++ goto cleanup; ++ } ++ ++ err = split_index_node(handle, path); ++ if (err) ++ goto cleanup; ++ ++ /*copy split inode too*/ + de = do_split(handle, path, &bh, --frame, &hinfo, &err); + if (!de) + goto cleanup; ++ + assert(dx_node_check(path, frame)); + err = add_dirent_to_buf(handle, dentry, inode, de, bh); + goto cleanup2; + + journal_error: +- ext3_std_error(dir->i_sb, err); ++ ext3_std_error(dir->i_sb, err); + cleanup: + if (bh) + brelse(bh); + cleanup2: +- for (i = 0; i < ARRAY_SIZE(bh_new); ++i) { +- if (bh_new[i] != NULL) +- brelse(bh_new[i]); +- } + if (err) + inode->i_size = isize; + iam_path_fini(path); diff --git a/lustre/kernel_patches/patches/ext3-tall-htree.patch b/lustre/kernel_patches/patches/ext3-tall-htree.patch new file mode 100644 index 0000000..5021759 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-tall-htree.patch @@ -0,0 +1,431 @@ +Index: linux-2.6.9/fs/ext3/namei.c +=================================================================== +--- linux-2.6.9.orig/fs/ext3/namei.c 2006-04-23 22:35:38.000000000 +0800 ++++ linux-2.6.9/fs/ext3/namei.c 2006-04-23 22:35:47.000000000 +0800 +@@ -48,6 +48,11 @@ + #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + ++/* ++ * Maximal number of non-leaf levels in htree. In the stock ext3 this is 2. 
++ */ ++#define DX_MAX_TREE_HEIGHT (5) ++ + static struct buffer_head *ext3_append(handle_t *handle, + struct inode *inode, + u32 *block, int *err) +@@ -75,7 +80,7 @@ + #ifdef DX_DEBUG + #define dxtrace(command) command + #else +-#define dxtrace(command) ++#define dxtrace(command) + #endif + + struct fake_dirent +@@ -168,7 +173,7 @@ + static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); + static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, +- struct dx_frame *frames, ++ struct dx_frame *frames, + __u32 *start_hash); + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, + struct ext3_dir_entry_2 **res_dir, int *err); +@@ -249,7 +254,7 @@ + } + + struct stats +-{ ++{ + unsigned names; + unsigned space; + unsigned bcount; +@@ -367,7 +372,7 @@ + goto fail; + } + +- if ((indirect = root->info.indirect_levels) > 1) { ++ if ((indirect = root->info.indirect_levels) > DX_MAX_TREE_HEIGHT - 1) { + ext3_warning(dir->i_sb, __FUNCTION__, + "Unimplemented inode hash depth: %#06x", + root->info.indirect_levels); +@@ -436,12 +441,15 @@ + + static void dx_release (struct dx_frame *frames) + { ++ int height; ++ + if (frames[0].bh == NULL) + return; +- +- if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) +- brelse(frames[1].bh); +- brelse(frames[0].bh); ++ height = ((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels; ++ for (; height >= 0; height--) { ++ assert(frames[height].bh != NULL); ++ brelse(frames[height].bh); ++ } + } + + /* +@@ -463,7 +471,7 @@ + */ + static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, +- struct dx_frame *frames, ++ struct dx_frame *frames, + __u32 *start_hash) + { + struct dx_frame *p; +@@ -582,7 +590,7 @@ + { + struct dx_hash_info hinfo; + struct ext3_dir_entry_2 *de; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame; + struct inode *dir; + int block, err; + int count = 0; +@@ -627,7 +635,7 @@ + } + count += ret; + hashval = ~0; +- ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, ++ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, + frame, frames, &hashval); + *next_hash = hashval; + if (ret < 0) { +@@ -644,7 +652,7 @@ + break; + } + dx_release(frames); +- dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", ++ dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", + count, *next_hash)); + return count; + errout: +@@ -918,7 +926,7 @@ + struct super_block * sb; + struct dx_hash_info hinfo; + u32 hash; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame; + struct ext3_dir_entry_2 *de, *top; + struct buffer_head *bh; + unsigned long block; +@@ -1037,7 +1045,7 @@ + parent = ERR_PTR(-ENOMEM); + } + return parent; +-} ++} + + #define S_SHIFT 12 + static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = { +@@ -1098,6 +1106,8 @@ + return prev; + } + ++/* Allocate new node, and split leaf node @bh into it, inserting new pointer ++ * into parent node identified by @frame */ + static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + struct buffer_head **bh,struct dx_frame *frame, + struct dx_hash_info *hinfo, int *error) +@@ -1185,7 +1195,7 @@ + * add_dirent_to_buf will attempt search the directory block for + * space. It will return -ENOSPC if no space is available, and -EIO + * and -EEXIST if directory entry already exists. +- * ++ * + * NOTE! bh is NOT released in the case where ENOSPC is returned. 
In + * all other cases bh is released. + */ +@@ -1286,7 +1296,7 @@ + int namelen = dentry->d_name.len; + struct buffer_head *bh2; + struct dx_root *root; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame; + struct dx_entry *entries; + struct ext3_dir_entry_2 *de, *de2; + char *data1, *top; +@@ -1427,20 +1437,29 @@ + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) + { +- struct dx_frame frames[2], *frame; +- struct dx_entry *entries, *at; ++ struct dx_frame frames[DX_MAX_TREE_HEIGHT] = {{0,},}, *frame, *safe; ++ struct dx_node *node2; ++ struct dx_entry *entries; /* old block contents */ ++ struct dx_entry *entries2; /* new block contents */ + struct dx_hash_info hinfo; + struct buffer_head * bh; ++ struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0}; + struct inode *dir = dentry->d_parent->d_inode; + struct super_block * sb = dir->i_sb; + struct ext3_dir_entry_2 *de; ++ u32 newblock[DX_MAX_TREE_HEIGHT] = {0}; + int err; ++ int nr_splet; ++ int i; ++ size_t isize; + + frame = dx_probe(dentry, NULL, &hinfo, frames, &err); + if (!frame) + return err; + entries = frame->entries; +- at = frame->at; ++ ++ /* XXX nikita: global serialization! */ ++ isize = dir->i_size; + + if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + goto cleanup; +@@ -1456,29 +1475,43 @@ + goto cleanup; + } + ++ /* ++ * Tall-tree handling: we might have to split multiple index blocks ++ * all the way up to tree root. Tricky point here is error handling: ++ * to avoid complicated undo/rollback we ++ * ++ * - first allocate all necessary blocks ++ * ++ * - insert pointers into them atomically. ++ * ++ * XXX nikita: this algorithm is *not* scalable, as it assumes that at ++ * least nodes in the path are locked. ++ */ ++ + /* Block full, should compress but for now just split */ + dxtrace(printk("using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); +- /* Need to split index? */ +- if (dx_get_count(entries) == dx_get_limit(entries)) { +- u32 newblock; +- unsigned icount = dx_get_count(entries); +- int levels = frame - frames; +- struct dx_entry *entries2; +- struct dx_node *node2; +- struct buffer_head *bh2; + +- if (levels && (dx_get_count(frames->entries) == +- dx_get_limit(frames->entries))) { ++ /* What levels need split? */ ++ for (nr_splet = 0; frame >= frames && ++ dx_get_count(frame->entries) == dx_get_limit(frame->entries); ++ --frame, ++nr_splet) { ++ if (nr_splet == DX_MAX_TREE_HEIGHT) { + ext3_warning(sb, __FUNCTION__, + "Directory index full!\n"); + err = -ENOSPC; + goto cleanup; + } +- bh2 = ext3_append (handle, dir, &newblock, &err); +- if (!(bh2)) ++ } ++ ++ safe = frame; ++ /* Go back down, allocating blocks, and adding blocks into ++ * transaction... 
*/ ++ for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) { ++ bh_new[i] = ext3_append (handle, dir, &newblock[i], &err); ++ if (!bh_new[i]) + goto cleanup; +- node2 = (struct dx_node *)(bh2->b_data); ++ node2 = (struct dx_node *)(bh_new[i]->b_data); + entries2 = node2->entries; + node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); + node2->fake.inode = 0; +@@ -1486,72 +1519,112 @@ + err = ext3_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; +- if (levels) { +- unsigned icount1 = icount/2, icount2 = icount - icount1; +- unsigned hash2 = dx_get_hash(entries + icount1); +- dxtrace(printk("Split index %i/%i\n", icount1, icount2)); +- +- BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ +- err = ext3_journal_get_write_access(handle, +- frames[0].bh); ++ } ++ /* Add "safe" node to transaction too */ ++ if (safe + 1 != frames) { ++ err = ext3_journal_get_write_access(handle, safe->bh); ++ if (err) ++ goto journal_error; ++ } ++ ++ /* Go through nodes once more, inserting pointers */ ++ for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) { ++ unsigned count; ++ int idx; ++ struct buffer_head *bh2; ++ ++ entries = frame->entries; ++ count = dx_get_count(entries); ++ idx = frame->at - entries; ++ ++ bh2 = bh_new[i]; ++ node2 = (struct dx_node *)(bh2->b_data); ++ entries2 = node2->entries; ++ ++ if (frame == frames) { ++ /* splitting root node. Tricky point: ++ * ++ * In the "normal" B-tree we'd split root *and* add ++ * new root to the tree with pointers to the old root ++ * and its sibling (thus introducing two new nodes). ++ * ++ * In htree it's enough to add one node, because ++ * capacity of the root node is smaller than that of ++ * non-root one. ++ */ ++ struct dx_root *root; ++ u8 indirects; ++ ++ root = (struct dx_root *) frames->bh->b_data; ++ indirects = root->info.indirect_levels; ++ dxtrace(printk("Creating new root %d\n", indirects)); ++ memcpy((char *) entries2, (char *) entries, ++ count * sizeof(struct dx_entry)); ++ dx_set_limit(entries2, dx_node_limit(dir)); ++ ++ /* Set up root */ ++ dx_set_count(entries, 1); ++ dx_set_block(entries + 0, newblock[i]); ++ root->info.indirect_levels = indirects + 1; ++ ++ /* Shift frames in the path */ ++ memmove(frames + 2, frames + 1, ++ (sizeof frames) - 2 * sizeof frames[0]); ++ /* Add new access path frame */ ++ frames[1].at = entries2 + idx; ++ frames[1].entries = entries = entries2; ++ frames[1].bh = bh2; ++ ++ frame; ++ bh_new[i] = NULL; /* buffer head is "consumed" */ ++ err = ext3_journal_get_write_access(handle, bh2); + if (err) + goto journal_error; +- +- memcpy ((char *) entries2, (char *) (entries + icount1), +- icount2 * sizeof(struct dx_entry)); +- dx_set_count (entries, icount1); +- dx_set_count (entries2, icount2); ++ } else { ++ /* splitting non-root index node. */ ++ unsigned count1 = count/2, count2 = count - count1; ++ unsigned hash2 = dx_get_hash(entries + count1); ++ dxtrace(printk("Split index %i/%i\n", count1, count2)); ++ ++ memcpy ((char *) entries2, (char *) (entries + count1), ++ count2 * sizeof(struct dx_entry)); ++ dx_set_count (entries, count1); ++ dx_set_count (entries2, count2); + dx_set_limit (entries2, dx_node_limit(dir)); + + /* Which index block gets the new entry? 
*/ +- if (at - entries >= icount1) { +- frame->at = at = at - entries - icount1 + entries2; ++ if (idx >= count1) { ++ frame->at = entries2 + idx - count1; + frame->entries = entries = entries2; + swap(frame->bh, bh2); ++ bh_new[i] = bh2; + } +- dx_insert_block (frames + 0, hash2, newblock); +- dxtrace(dx_show_index ("node", frames[1].entries)); ++ dx_insert_block (frame - 1, hash2, newblock[i]); ++ dxtrace(dx_show_index ("node", frame->entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) + goto journal_error; +- brelse (bh2); +- } else { +- dxtrace(printk("Creating second level index...\n")); +- memcpy((char *) entries2, (char *) entries, +- icount * sizeof(struct dx_entry)); +- dx_set_limit(entries2, dx_node_limit(dir)); +- +- /* Set up root */ +- dx_set_count(entries, 1); +- dx_set_block(entries + 0, newblock); +- ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; +- +- /* Add new access path frame */ +- frame = frames + 1; +- frame->at = at = at - entries + entries2; +- frame->entries = entries = entries2; +- frame->bh = bh2; +- err = ext3_journal_get_write_access(handle, +- frame->bh); +- if (err) +- goto journal_error; + } +- ext3_journal_dirty_metadata(handle, frames[0].bh); + } +- de = do_split(handle, dir, &bh, frame, &hinfo, &err); ++ de = do_split(handle, dir, &bh, --frame, &hinfo, &err); + if (!de) + goto cleanup; + err = add_dirent_to_buf(handle, dentry, inode, de, bh); +- bh = NULL; +- goto cleanup; ++ goto cleanup2; + + journal_error: + ext3_std_error(dir->i_sb, err); + cleanup: + if (bh) + brelse(bh); ++cleanup2: ++ for (i = 0; i < ARRAY_SIZE(bh_new); ++i) { ++ if (bh_new[i] != NULL) ++ brelse(bh_new[i]); ++ } ++ if (err) ++ inode->i_size = isize; + dx_release(frames); + return err; + } +@@ -1561,7 +1634,7 @@ + * ext3_delete_entry deletes a directory entry by merging it with the + * previous entry + */ +-static int ext3_delete_entry (handle_t *handle, ++static int ext3_delete_entry (handle_t *handle, + struct inode * dir, + struct ext3_dir_entry_2 * de_del, + struct buffer_head * bh) +@@ -1821,7 +1894,7 @@ + de1 = (struct ext3_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + if (le32_to_cpu(de->inode) != inode->i_ino || +- !le32_to_cpu(de1->inode) || ++ !le32_to_cpu(de1->inode) || + strcmp (".", de->name) || + strcmp ("..", de1->name)) { + ext3_warning (inode->i_sb, "empty_dir", +@@ -1891,7 +1964,7 @@ + * being truncated, or files being unlinked. */ + + /* @@@ FIXME: Observation from aviro: +- * I think I can trigger J_ASSERT in ext3_orphan_add(). We block ++ * I think I can trigger J_ASSERT in ext3_orphan_add(). We block + * here (on lock_super()), so race with ext3_link() which might bump + * ->i_nlink. For, say it, character device. Not a regular file, + * not a directory, not a symlink and ->i_nlink > 0. 
+@@ -2415,4 +2488,4 @@ + .removexattr = generic_removexattr, + #endif + .permission = ext3_permission, +-}; ++}; diff --git a/lustre/kernel_patches/series/ldiskfs-2.6-rhel4.series b/lustre/kernel_patches/series/ldiskfs-2.6-rhel4.series index bab81b9..b90ed7a 100644 --- a/lustre/kernel_patches/series/ldiskfs-2.6-rhel4.series +++ b/lustre/kernel_patches/series/ldiskfs-2.6-rhel4.series @@ -10,3 +10,10 @@ ext3-extents-2.6.9-rhel4.patch ext3-mballoc2-2.6.9-rhel4.patch ext3-nlinks-2.6.9.patch ext3-ialloc-2.6.patch +ext3-tall-htree.patch +ext3-htree-path.patch +ext3-htree-r5-hash.patch +ext3-htree-path-ops.patch +ext3-hash-selection.patch +ext3-htree-comments.patch +ext3-iam-ops.patch -- 1.8.3.1