From: yangsheng
Date: Mon, 4 Feb 2008 07:33:48 +0000 (+0000)
Subject: Branch HEAD
X-Git-Tag: v1_9_50~1^3~39
X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=55808528bce05d84208c87e714db351674f89de6;p=fs%2Flustre-release.git

Branch HEAD
b=14482
i=alex
i=adilger

Move iam patches to RHEL5 kernel.
---

diff --git a/ldiskfs/kernel_patches/patches/ext3-iam-2.6.18-rhel5.patch b/ldiskfs/kernel_patches/patches/ext3-iam-2.6.18-rhel5.patch
new file mode 100644
index 0000000..27f8fe1
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/ext3-iam-2.6.18-rhel5.patch
@@ -0,0 +1,2272 @@
+Index: linux-stage/include/linux/ext3_fs.h
+===================================================================
+--- linux-stage.orig/include/linux/ext3_fs.h  2007-11-26 23:09:05.000000000 +0300
++++ linux-stage/include/linux/ext3_fs.h       2007-11-26 23:09:06.000000000 +0300
+@@ -812,6 +812,9 @@
+ #define DX_HASH_LEGACY 0
+ #define DX_HASH_HALF_MD4 1
+ #define DX_HASH_TEA 2
++#define DX_HASH_R5 6
++#define DX_HASH_SAME 7
++#define DX_HASH_MAX 7
+ 
+ #ifdef __KERNEL__
+ 
+@@ -942,9 +945,6 @@
+ extern void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv);
+ 
+ /* dir.c */
+-extern int ext3_check_dir_entry(const char *, struct inode *,
+-                                struct ext3_dir_entry_2 *,
+-                                struct buffer_head *, unsigned long);
+ extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
+                                     __u32 minor_hash,
+                                     struct ext3_dir_entry_2 *dirent);
+Index: linux-stage/include/linux/ext3_fs_i.h
+===================================================================
+--- linux-stage.orig/include/linux/ext3_fs_i.h        2007-11-26 23:09:04.000000000 +0300
++++ linux-stage/include/linux/ext3_fs_i.h     2007-11-26 23:16:00.000000000 +0300
+@@ -20,6 +20,7 @@
+ #include 
+ #include 
+ #include 
++#include <linux/dynlocks.h>
+ 
+ #define HAVE_DISK_INODE_VERSION
+ 
+@@ -157,6 +157,11 @@
+         struct mutex truncate_mutex;
+         struct inode vfs_inode;
+ 
++        /* following fields for parallel directory operations -bzzz */
++        struct dynlock i_htree_lock;
++        struct semaphore i_append_sem;
++        struct semaphore i_rename_sem;
++
+         struct ext3_ext_cache i_cached_extent;
+ 
+         /* mballoc */
+Index: linux-stage/fs/ext3/super.c
+===================================================================
+--- linux-stage.orig/fs/ext3/super.c  2007-11-26 23:09:05.000000000 +0300
++++ linux-stage/fs/ext3/super.c       2007-11-26 23:09:06.000000000 +0300
+@@ -464,6 +464,10 @@
+         ei->i_block_alloc_info = NULL;
+         ei->vfs_inode.i_version = 1;
+ 
++        dynlock_init(&ei->i_htree_lock);
++        sema_init(&ei->i_rename_sem, 1);
++        sema_init(&ei->i_append_sem, 1);
++
+         memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent));
+         INIT_LIST_HEAD(&ei->i_prealloc_list);
+         spin_lock_init(&ei->i_prealloc_lock);
+@@ -695,7 +699,7 @@
+         Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+         Opt_grpquota,
+         Opt_extents, Opt_noextents, Opt_extdebug,
+-        Opt_mballoc, Opt_nomballoc, Opt_stripe,
++        Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_hashfunc,
+ };
+ 
+ static match_table_t tokens = {
+@@ -756,6 +760,7 @@
+         {Opt_stripe, "stripe=%u"},
+         {Opt_err, NULL},
+         {Opt_resize, "resize"},
++        {Opt_hashfunc, "hash=%s"},
+ };
+ 
+ static ext3_fsblk_t get_sb_block(void **data)
+@@ -779,6 +784,7 @@
+         return sb_block;
+ }
+ 
++int user_selected_hash_function = -1;
+ static int parse_options (char *options, struct super_block *sb,
+                           unsigned int *inum, unsigned long *journal_devnum,
+                           ext3_fsblk_t *n_blocks_count, int is_remount)
+@@ -1120,6 +1126,22 @@
+                         return 0;
+                 sbi->s_stripe = option;
+                 break;
++        case Opt_hashfunc:
++                if (strncmp(args[0].from, "legacy", 6) == 0) {
++                        user_selected_hash_function = DX_HASH_LEGACY;
++                } else if (strncmp(args[0].from, "half_md4", 8) == 0) {
++                        user_selected_hash_function = DX_HASH_HALF_MD4;
++                } else if (strncmp(args[0].from, "tea", 3) == 0) {
++                        user_selected_hash_function = DX_HASH_TEA;
++                } else if (strncmp(args[0].from, "r5", 2) == 0) {
++                        user_selected_hash_function = DX_HASH_R5;
++                } else if (strncmp(args[0].from, "same", 4) == 0) {
++                        user_selected_hash_function = DX_HASH_SAME;
++                } else {
++                        printk(KERN_ERR "EXT3-fs: unrecognized hash function name\n");
++                        return 0;
++                }
++                break;
+         default:
+                 printk (KERN_ERR
+                         "EXT3-fs: Unrecognized mount option \"%s\" "
+Index: linux-stage/fs/ext3/namei.c
+===================================================================
+--- linux-stage.orig/fs/ext3/namei.c  2007-11-26 23:09:04.000000000 +0300
++++ linux-stage/fs/ext3/namei.c       2007-11-26 23:09:06.000000000 +0300
+@@ -24,6 +24,7 @@
+  * Theodore Ts'o, 2002
+  */
+ 
++#include <linux/module.h>
+ #include 
+ #include 
+ #include 
+@@ -36,6 +37,7 @@
+ #include 
+ #include 
+ #include 
++#include <linux/lustre_iam.h>
+ 
+ #include "namei.h"
+ #include "xattr.h"
+@@ -50,25 +52,29 @@
+ #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+ #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
+ 
+-static struct buffer_head *ext3_append(handle_t *handle,
++
++struct buffer_head *ext3_append(handle_t *handle,
+                                         struct inode *inode,
+                                         u32 *block, int *err)
+ {
+         struct buffer_head *bh;
++        struct ext3_inode_info *ei = EXT3_I(inode);
+ 
++        /* with parallel dir operations all appends
++         * have to be serialized -bzzz */
++        down(&ei->i_append_sem);
+         *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+ 
+-        if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
++        bh = ext3_bread(handle, inode, *block, 1, err);
++        if (bh != NULL) {
+                 inode->i_size += inode->i_sb->s_blocksize;
+-                EXT3_I(inode)->i_disksize = inode->i_size;
+-                ext3_journal_get_write_access(handle,bh);
++                ei->i_disksize = inode->i_size;
+         }
++        up(&ei->i_append_sem);
++
+         return bh;
+ }
+ 
+-#ifndef assert
+-#define assert(test) J_ASSERT(test)
+-#endif
+ 
+ #ifndef swap
+ #define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
+@@ -77,167 +83,84 @@
+ #define dxtrace(command)
+ #endif
+ 
+-struct fake_dirent
+-{
+-        __le32 inode;
+-        __le16 rec_len;
+-        u8 name_len;
+-        u8 file_type;
+-};
+-
+-struct dx_countlimit
+-{
+-        __le16 limit;
+-        __le16 count;
+-};
+-
+-struct dx_entry
+-{
+-        __le32 hash;
+-        __le32 block;
+-};
+-
+-/*
+- * dx_root_info is laid out so that if it should somehow get overlaid by a
+- * dirent the two low bits of the hash version will be zero.  Therefore, the
+- * hash version mod 4 should never be 0.  Sincerely, the paranoia department.
+- */ +- +-struct dx_root +-{ +- struct fake_dirent dot; +- char dot_name[4]; +- struct fake_dirent dotdot; +- char dotdot_name[4]; +- struct dx_root_info +- { +- __le32 reserved_zero; +- u8 hash_version; +- u8 info_length; /* 8 */ +- u8 indirect_levels; +- u8 unused_flags; +- } +- info; +- struct dx_entry entries[0]; +-}; +- +-struct dx_node +-{ +- struct fake_dirent fake; +- struct dx_entry entries[0]; +-}; +- +- +-struct dx_frame +-{ +- struct buffer_head *bh; +- struct dx_entry *entries; +- struct dx_entry *at; +-}; +- +-struct dx_map_entry +-{ +- u32 hash; +- u16 offs; +- u16 size; +-}; +- + #ifdef CONFIG_EXT3_INDEX +-static inline unsigned dx_get_block (struct dx_entry *entry); +-static void dx_set_block (struct dx_entry *entry, unsigned value); +-static inline unsigned dx_get_hash (struct dx_entry *entry); +-static void dx_set_hash (struct dx_entry *entry, unsigned value); +-static unsigned dx_get_count (struct dx_entry *entries); +-static unsigned dx_get_limit (struct dx_entry *entries); +-static void dx_set_count (struct dx_entry *entries, unsigned value); +-static void dx_set_limit (struct dx_entry *entries, unsigned value); +-static unsigned dx_root_limit (struct inode *dir, unsigned infosize); +-static unsigned dx_node_limit (struct inode *dir); +-static struct dx_frame *dx_probe(struct dentry *dentry, +- struct inode *dir, +- struct dx_hash_info *hinfo, +- struct dx_frame *frame, +- int *err); +-static void dx_release (struct dx_frame *frames); ++static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry); ++static void dx_set_block(struct iam_path *p, ++ struct iam_entry *entry, unsigned value); ++static unsigned dx_get_limit(struct iam_entry *entries); ++static void dx_set_count(struct iam_entry *entries, unsigned value); ++static void dx_set_limit(struct iam_entry *entries, unsigned value); ++static unsigned dx_root_limit(struct iam_path *p); ++static unsigned dx_node_limit(struct iam_path *p); ++static int dx_probe(struct qstr *name, ++ struct inode *dir, ++ struct dx_hash_info *hinfo, ++ struct iam_path *path); + static int dx_make_map (struct ext3_dir_entry_2 *de, int size, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); + static void dx_sort_map(struct dx_map_entry *map, unsigned count); + static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, + struct dx_map_entry *offsets, int count); + static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); +-static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); +-static int ext3_htree_next_block(struct inode *dir, __u32 hash, +- struct dx_frame *frame, +- struct dx_frame *frames, +- __u32 *start_hash); + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, + struct ext3_dir_entry_2 **res_dir, int *err); + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); +- +-/* +- * Future: use high four bits of block for coalesce-on-delete flags +- * Mask them off for now. 
+- */ +- +-static inline unsigned dx_get_block (struct dx_entry *entry) ++static inline void dx_set_limit(struct iam_entry *entries, unsigned value) + { +- return le32_to_cpu(entry->block) & 0x00ffffff; +-} +- +-static inline void dx_set_block (struct dx_entry *entry, unsigned value) +-{ +- entry->block = cpu_to_le32(value); +-} +- +-static inline unsigned dx_get_hash (struct dx_entry *entry) +-{ +- return le32_to_cpu(entry->hash); +-} +- +-static inline void dx_set_hash (struct dx_entry *entry, unsigned value) +-{ +- entry->hash = cpu_to_le32(value); +-} +- +-static inline unsigned dx_get_count (struct dx_entry *entries) +-{ +- return le16_to_cpu(((struct dx_countlimit *) entries)->count); +-} +- +-static inline unsigned dx_get_limit (struct dx_entry *entries) +-{ +- return le16_to_cpu(((struct dx_countlimit *) entries)->limit); ++ ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); + } + +-static inline void dx_set_count (struct dx_entry *entries, unsigned value) ++int dx_index_is_compat(struct iam_path *path) + { +- ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); ++ return iam_path_descr(path) == &iam_htree_compat_param; + } + +-static inline void dx_set_limit (struct dx_entry *entries, unsigned value) +-{ +- ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); +-} + +-static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) ++int dx_node_check(struct iam_path *p, struct iam_frame *f) + { +- unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - +- EXT3_DIR_REC_LEN(2) - infosize; +- return 0? 20: entry_space / sizeof(struct dx_entry); +-} ++ struct iam_entry *e; ++ struct iam_container *c; ++ unsigned count; ++ unsigned i; ++ iam_ptr_t blk; ++ iam_ptr_t root; ++ struct inode *inode; + +-static inline unsigned dx_node_limit (struct inode *dir) +-{ +- unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); +- return 0? 22: entry_space / sizeof(struct dx_entry); ++ c = p->ip_container; ++ e = dx_node_get_entries(p, f); ++ count = dx_get_count(e); ++ e = iam_entry_shift(p, e, 1); ++ root = iam_path_descr(p)->id_ops->id_root_ptr(c); ++ ++ inode = iam_path_obj(p); ++ for (i = 0; i < count - 1; ++i, e = iam_entry_shift(p, e, 1)) { ++ iam_ikeycpy(c, iam_path_ikey(p, 0), iam_path_ikey(p, 1)); ++ iam_get_ikey(p, e, iam_path_ikey(p, 1)); ++ if (i > 0 && ++ iam_ikeycmp(c, iam_path_ikey(p, 0), ++ iam_path_ikey(p, 1)) > 0) ++ return 0; ++ blk = dx_get_block(p, e); ++ /* ++ * Disable this check as it is racy. ++ */ ++ if (0 && inode->i_size < (blk + 1) * inode->i_sb->s_blocksize) ++ return 0; ++ /* ++ * By definition of a tree, no node points to the root. ++ */ ++ if (blk == root) ++ return 0; ++ } ++ return 1; + } + + /* + * Debug + */ + #ifdef DX_DEBUG +-static void dx_show_index (char * label, struct dx_entry *entries) ++static void dx_show_index (char * label, struct iam_entry *entries) + { + int i, n = dx_get_count (entries); + printk("%s index ", label); +@@ -288,7 +212,7 @@ + } + + struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, +- struct dx_entry *entries, int levels) ++ struct iam_entry *entries, int levels) + { + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count = dx_get_count (entries), names = 0, space = 0, i; +@@ -319,134 +243,368 @@ + #endif /* DX_DEBUG */ + + /* +- * Probe for a directory leaf block to search. ++ * Per-node tree locking. 
+- * +- * dx_probe can return ERR_BAD_DX_DIR, which means there was a format +- * error in the directory index, and the caller should fall back to +- * searching the directory normally. The callers of dx_probe **MUST** +- * check for this error code, and make sure it never gets reflected +- * back to userspace. + */ +-static struct dx_frame * +-dx_probe(struct dentry *dentry, struct inode *dir, +- struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) +-{ +- unsigned count, indirect; +- struct dx_entry *at, *entries, *p, *q, *m; +- struct dx_root *root; +- struct buffer_head *bh; +- struct dx_frame *frame = frame_in; +- u32 hash; + +- frame->bh = NULL; +- if (dentry) +- dir = dentry->d_parent->d_inode; +- if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) +- goto fail; +- root = (struct dx_root *) bh->b_data; +- if (root->info.hash_version != DX_HASH_TEA && +- root->info.hash_version != DX_HASH_HALF_MD4 && +- root->info.hash_version != DX_HASH_LEGACY) { +- ext3_warning(dir->i_sb, __FUNCTION__, +- "Unrecognised inode hash code %d", +- root->info.hash_version); +- brelse(bh); +- *err = ERR_BAD_DX_DIR; +- goto fail; ++/* FIXME: this should be reworked using bb_spin_lock ++ * introduced in -mm tree ++ */ ++#define BH_DXLock 25 ++ ++#define DX_DEBUG (1) ++ ++#if DX_DEBUG ++static struct dx_lock_stats { ++ unsigned dls_bh_lock; ++ unsigned dls_bh_busy; ++ unsigned dls_bh_again; ++ unsigned dls_bh_full_again; ++} dx_lock_stats = { 0, }; ++#define DX_DEVAL(x) x ++#else ++#define DX_DEVAL(x) ++#endif ++ ++static inline void dx_lock_bh(struct buffer_head volatile *bh) ++{ ++ DX_DEVAL(dx_lock_stats.dls_bh_lock++); ++#ifdef CONFIG_SMP ++ while (test_and_set_bit(BH_DXLock, &bh->b_state)) { ++ DX_DEVAL(dx_lock_stats.dls_bh_busy++); ++ while (test_bit(BH_DXLock, &bh->b_state)) ++ cpu_relax(); ++ } ++#endif ++} ++ ++static inline void dx_unlock_bh(struct buffer_head *bh) ++{ ++#ifdef CONFIG_SMP ++ smp_mb__before_clear_bit(); ++ clear_bit(BH_DXLock, &bh->b_state); ++#endif ++} ++ ++/* ++ * this locking primitives are used to protect parts ++ * of dir's htree. 
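dx_lock_bh()/dx_unlock_bh() above implement a bit spinlock on bh->b_state: test-and-set to acquire, and on contention spin on plain reads until the bit clears before retrying, which keeps the wait loop cache-local. A user-space sketch of the same pattern, assuming GCC __atomic builtins (bit_spin_lock/bit_spin_unlock are illustrative names):

    /* User-space sketch of dx_lock_bh()/dx_unlock_bh(); the kernel code
     * uses test_and_set_bit()/clear_bit() on bh->b_state instead. */
    static inline void bit_spin_lock(unsigned long *word, int bit)
    {
            unsigned long mask = 1UL << bit;

            /* test-and-set; if the bit was held, spin on plain loads
             * (cpu_relax() in the kernel) until it clears, then retry */
            while (__atomic_fetch_or(word, mask, __ATOMIC_ACQUIRE) & mask)
                    while (__atomic_load_n(word, __ATOMIC_RELAXED) & mask)
                            ;
    }

    static inline void bit_spin_unlock(unsigned long *word, int bit)
    {
            /* release pairs with the acquire above, mirroring
             * smp_mb__before_clear_bit() + clear_bit() */
            __atomic_fetch_and(word, ~(1UL << bit), __ATOMIC_RELEASE);
    }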
protection unit is block: leaf or index ++ */ ++struct dynlock_handle *dx_lock_htree(struct inode *dir, unsigned long value, ++ enum dynlock_type lt) ++{ ++ return dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, lt, GFP_NOFS); ++} ++ ++void dx_unlock_htree(struct inode *dir, struct dynlock_handle *lh) ++{ ++ if (lh != NULL) ++ dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lh); ++} ++ ++static void dx_unlock_array(struct inode *dir, struct dynlock_handle **lh) ++{ ++ int i; ++ ++ for (i = 0; i < DX_MAX_TREE_HEIGHT; ++i, ++lh) { ++ if (*lh != NULL) { ++ dx_unlock_htree(dir, *lh); ++ *lh = NULL; ++ } + } +- hinfo->hash_version = root->info.hash_version; +- hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; +- if (dentry) +- ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); +- hash = hinfo->hash; +- +- if (root->info.unused_flags & 1) { +- ext3_warning(dir->i_sb, __FUNCTION__, +- "Unimplemented inode hash flags: %#06x", +- root->info.unused_flags); +- brelse(bh); +- *err = ERR_BAD_DX_DIR; +- goto fail; ++} ++ ++/* ++ * dx_find_position ++ * ++ * search position of specified hash in index ++ * ++ */ ++ ++struct iam_entry *dx_find_position(struct iam_path *path, ++ struct iam_frame *frame) ++{ ++ int count; ++ struct iam_entry *p; ++ struct iam_entry *q; ++ struct iam_entry *m; ++ ++ count = dx_get_count(frame->entries); ++ assert_corr(count && count <= dx_get_limit(frame->entries)); ++ p = iam_entry_shift(path, frame->entries, ++ dx_index_is_compat(path) ? 1 : 2); ++ q = iam_entry_shift(path, frame->entries, count - 1); ++ while (p <= q) { ++ m = iam_entry_shift(path, p, iam_entry_diff(path, q, p) / 2); ++ if (iam_ikeycmp(path->ip_container, iam_ikey_at(path, m), ++ path->ip_ikey_target) > 0) ++ q = iam_entry_shift(path, m, -1); ++ else ++ p = iam_entry_shift(path, m, +1); + } ++ return iam_entry_shift(path, p, -1); ++} + +- if ((indirect = root->info.indirect_levels) > 1) { +- ext3_warning(dir->i_sb, __FUNCTION__, +- "Unimplemented inode hash depth: %#06x", +- root->info.indirect_levels); +- brelse(bh); +- *err = ERR_BAD_DX_DIR; +- goto fail; ++static iam_ptr_t dx_find_ptr(struct iam_path *path, struct iam_frame *frame) ++{ ++ return dx_get_block(path, dx_find_position(path, frame)); ++} ++ ++/* ++ * Fast check for frame consistency. 
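dx_find_position() above is a binary search for the last index entry whose key is less than or equal to the target; the search starts at slot 1 (slot 2 for non-compat containers) because the leading slot carries the count/limit pair rather than a key. The same bisection over a plain sorted array, a sketch only (it assumes lo >= 1, so slot lo-1 acts as the catch-all):

    #include <stdint.h>

    struct entry { uint32_t key; uint32_t blk; };

    /* Last index i in [lo, hi] with e[i].key <= target; the caller
     * guarantees lo >= 1 and that slot lo-1 matches any target. */
    static unsigned find_position(const struct entry *e, unsigned lo,
                                  unsigned hi, uint32_t target)
    {
            while (lo <= hi) {
                    unsigned m = lo + (hi - lo) / 2;

                    if (e[m].key > target)
                            hi = m - 1;     /* m >= lo >= 1, no wrap */
                    else
                            lo = m + 1;
            }
            return lo - 1;
    }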
++ */ ++static int dx_check_fast(struct iam_path *path, struct iam_frame *frame) ++{ ++ struct iam_container *bag; ++ struct iam_entry *next; ++ struct iam_entry *last; ++ struct iam_entry *entries; ++ struct iam_entry *at; ++ ++ bag = path->ip_container; ++ at = frame->at; ++ entries = frame->entries; ++ last = iam_entry_shift(path, entries, dx_get_count(entries) - 1); ++ ++ if (unlikely(at > last)) ++ return -EAGAIN; ++ ++ if (unlikely(dx_get_block(path, at) != frame->leaf)) ++ return -EAGAIN; ++ ++ if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, at), ++ path->ip_ikey_target) > 0)) ++ return -EAGAIN; ++ ++ next = iam_entry_shift(path, at, +1); ++ if (next <= last) { ++ if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, next), ++ path->ip_ikey_target) <= 0)) ++ return -EAGAIN; + } ++ return 0; ++} + +- entries = (struct dx_entry *) (((char *)&root->info) + +- root->info.info_length); +- assert(dx_get_limit(entries) == dx_root_limit(dir, +- root->info.info_length)); +- dxtrace (printk("Look up %x", hash)); +- while (1) +- { +- count = dx_get_count(entries); +- assert (count && count <= dx_get_limit(entries)); +- p = entries + 1; +- q = entries + count - 1; +- while (p <= q) +- { +- m = p + (q - p)/2; +- dxtrace(printk(".")); +- if (dx_get_hash(m) > hash) +- q = m - 1; +- else +- p = m + 1; +- } ++/* ++ * returns 0 if path was unchanged, -EAGAIN otherwise. ++ */ ++static int dx_check_path(struct iam_path *path, struct iam_frame *frame) ++{ ++ int equal; + +- if (0) // linear search cross check +- { +- unsigned n = count - 1; +- at = entries; +- while (n--) +- { +- dxtrace(printk(",")); +- if (dx_get_hash(++at) > hash) +- { +- at--; +- break; +- } ++ dx_lock_bh(frame->bh); ++ equal = dx_check_fast(path, frame) == 0 || ++ frame->leaf == dx_find_ptr(path, frame); ++ DX_DEVAL(dx_lock_stats.dls_bh_again += !equal); ++ dx_unlock_bh(frame->bh); ++ ++ return equal ? 0 : -EAGAIN; ++} ++ ++/* ++ * returns 0 if path was unchanged, -EAGAIN otherwise. ++ */ ++static int dx_check_full_path(struct iam_path *path, int search) ++{ ++ struct iam_frame *bottom; ++ struct iam_frame *scan; ++ int i; ++ int result; ++ ++ do_corr(schedule()); ++ ++ for (bottom = path->ip_frames, i = 0; ++ i < DX_MAX_TREE_HEIGHT && bottom->bh != NULL; ++bottom, ++i) { ++ ; /* find last filled in frame */ ++ } ++ ++ /* ++ * Lock frames, bottom to top. ++ */ ++ for (scan = bottom - 1; scan >= path->ip_frames; --scan) ++ dx_lock_bh(scan->bh); ++ /* ++ * Check them top to bottom. ++ */ ++ result = 0; ++ for (scan = path->ip_frames; scan < bottom; ++scan) { ++ struct iam_entry *pos; ++ ++ if (search) { ++ if (dx_check_fast(path, scan) == 0) ++ continue; ++ ++ pos = dx_find_position(path, scan); ++ if (scan->leaf != dx_get_block(path, pos)) { ++ result = -EAGAIN; ++ break; ++ } ++ scan->at = pos; ++ } else { ++ pos = iam_entry_shift(path, scan->entries, ++ dx_get_count(scan->entries) - 1); ++ if (scan->at > pos || ++ scan->leaf != dx_get_block(path, scan->at)) { ++ result = -EAGAIN; ++ break; + } +- assert (at == p - 1); + } +- +- at = p - 1; +- dxtrace(printk(" %x->%u\n", at == entries? 
0: dx_get_hash(at), dx_get_block(at))); +- frame->bh = bh; +- frame->entries = entries; +- frame->at = at; +- if (!indirect--) return frame; +- if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) +- goto fail2; +- at = entries = ((struct dx_node *) bh->b_data)->entries; +- assert (dx_get_limit(entries) == dx_node_limit (dir)); +- frame++; +- } +-fail2: +- while (frame >= frame_in) { +- brelse(frame->bh); +- frame--; + } +-fail: +- return NULL; ++ ++ /* ++ * Unlock top to bottom. ++ */ ++ for (scan = path->ip_frames; scan < bottom; ++scan) ++ dx_unlock_bh(scan->bh); ++ DX_DEVAL(dx_lock_stats.dls_bh_full_again += !!result); ++ do_corr(schedule()); ++ ++ return result; + } + +-static void dx_release (struct dx_frame *frames) ++static int dx_lookup_try(struct iam_path *path) ++{ ++ u32 ptr; ++ int err = 0; ++ int i; ++ ++ struct iam_descr *param; ++ struct iam_frame *frame; ++ struct iam_container *c; ++ ++ param = iam_path_descr(path); ++ c = path->ip_container; ++ ++ ptr = param->id_ops->id_root_ptr(c); ++ for (frame = path->ip_frames, i = 0; i <= path->ip_indirect; ++ ++frame, ++i) { ++ err = param->id_ops->id_node_read(c, (iam_ptr_t)ptr, NULL, ++ &frame->bh); ++ do_corr(schedule()); ++ ++ dx_lock_bh(frame->bh); ++ /* ++ * node must be initialized under bh lock because concurrent ++ * creation procedure may change it and dx_lookup_try() will ++ * see obsolete tree height. -bzzz ++ */ ++ if (err != 0) ++ break; ++ ++ if (EXT3_INVARIANT_ON) { ++ err = param->id_ops->id_node_check(path, frame); ++ if (err != 0) ++ break; ++ } ++ ++ err = param->id_ops->id_node_load(path, frame); ++ if (err != 0) ++ break; ++ ++ assert_inv(dx_node_check(path, frame)); ++ /* ++ * splitting may change root index block and move hash we're ++ * looking for into another index block so, we have to check ++ * this situation and repeat from begining if path got changed ++ * -bzzz ++ */ ++ if (i > 0) { ++ err = dx_check_path(path, frame - 1); ++ if (err != 0) ++ break; ++ } ++ ++ frame->at = dx_find_position(path, frame); ++ frame->curidx = ptr; ++ frame->leaf = ptr = dx_get_block(path, frame->at); ++ ++ dx_unlock_bh(frame->bh); ++ do_corr(schedule()); ++ } ++ if (err != 0) ++ dx_unlock_bh(frame->bh); ++ path->ip_frame = --frame; ++ return err; ++} ++ ++static int dx_lookup(struct iam_path *path) ++{ ++ int err; ++ int i; ++ ++ for (i = 0; i < DX_MAX_TREE_HEIGHT; ++ i) ++ assert(path->ip_frames[i].bh == NULL); ++ ++ do { ++ err = dx_lookup_try(path); ++ do_corr(schedule()); ++ if (err != 0) ++ iam_path_fini(path); ++ } while (err == -EAGAIN); ++ ++ return err; ++} ++ ++/* ++ * Performs path lookup and returns with found leaf (if any) locked by htree ++ * lock. 
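dx_lookup() above wraps the optimistic descent dx_lookup_try(): levels are read without holding locks across the whole walk, parents are re-validated under their per-block locks, and a detected race returns -EAGAIN, which restarts the walk from the root. The retry skeleton reduced to its essentials; type and function names here are illustrative, not part of the patch:

    #include <errno.h>

    struct tree;
    struct path;

    int  lookup_try(struct tree *t, struct path *p);  /* one optimistic walk */
    void path_release(struct path *p);                /* drop buffers/refs   */

    /* Sketch of dx_lookup()'s optimistic-descent retry loop. */
    int lookup(struct tree *t, struct path *p)
    {
            int err;

            do {
                    err = lookup_try(t, p);
                    if (err != 0)
                            path_release(p);  /* retry starts from scratch */
            } while (err == -EAGAIN);         /* raced with a split: redo  */

            return err;
    }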
++ */ ++int dx_lookup_lock(struct iam_path *path, ++ struct dynlock_handle **dl, enum dynlock_type lt) + { +- if (frames[0].bh == NULL) +- return; ++ int result; ++ struct inode *dir; + +- if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) +- brelse(frames[1].bh); +- brelse(frames[0].bh); ++ dir = iam_path_obj(path); ++ while ((result = dx_lookup(path)) == 0) { ++ do_corr(schedule()); ++ *dl = dx_lock_htree(dir, path->ip_frame->leaf, lt); ++ if (*dl == NULL) { ++ iam_path_fini(path); ++ result = -ENOMEM; ++ break; ++ } ++ do_corr(schedule()); ++ /* ++ * while locking leaf we just found may get split so we need ++ * to check this -bzzz ++ */ ++ if (dx_check_full_path(path, 1) == 0) ++ break; ++ dx_unlock_htree(dir, *dl); ++ *dl = NULL; ++ iam_path_fini(path); ++ } ++ return result; + } + + /* ++ * Probe for a directory leaf block to search. ++ * ++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format ++ * error in the directory index, and the caller should fall back to ++ * searching the directory normally. The callers of dx_probe **MUST** ++ * check for this error code, and make sure it never gets reflected ++ * back to userspace. ++ */ ++static int dx_probe(struct qstr *name, struct inode *dir, ++ struct dx_hash_info *hinfo, struct iam_path *path) ++{ ++ int err; ++ struct iam_path_compat *ipc; ++ ++ assert_corr(path->ip_data != NULL); ++ ipc = container_of(path->ip_data, struct iam_path_compat, ipc_descr); ++ ipc->ipc_qstr = name; ++ ipc->ipc_hinfo = hinfo; ++ ++ assert_corr(dx_index_is_compat(path)); ++ err = dx_lookup(path); ++ assert_corr(err != 0 || path->ip_frames[path->ip_indirect].bh != NULL); ++ return err; ++} ++ ++ ++/* + * This function increments the frame pointer to search the next leaf + * block, and reads in the necessary intervening nodes if the search + * should be necessary. Whether or not the search is necessary is +@@ -463,17 +632,16 @@ + * If start_hash is non-null, it will be filled in with the starting + * hash of the next page. + */ +-static int ext3_htree_next_block(struct inode *dir, __u32 hash, +- struct dx_frame *frame, +- struct dx_frame *frames, +- __u32 *start_hash) ++static int ext3_htree_advance(struct inode *dir, __u32 hash, ++ struct iam_path *path, __u32 *start_hash, ++ int compat) + { +- struct dx_frame *p; ++ struct iam_frame *p; + struct buffer_head *bh; + int err, num_frames = 0; + __u32 bhash; + +- p = frame; ++ p = path->ip_frame; + /* + * Find the next leaf page by incrementing the frame pointer. + * If we run out of entries in the interior node, loop around and +@@ -482,14 +650,26 @@ + * nodes need to be read. + */ + while (1) { +- if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ do_corr(schedule()); ++ dx_lock_bh(p->bh); ++ p->at = iam_entry_shift(path, p->at, +1); ++ if (p->at < iam_entry_shift(path, p->entries, ++ dx_get_count(p->entries))) { ++ p->leaf = dx_get_block(path, p->at); ++ dx_unlock_bh(p->bh); + break; +- if (p == frames) ++ } ++ dx_unlock_bh(p->bh); ++ if (p == path->ip_frames) + return 0; + num_frames++; +- p--; ++ --p; + } + ++ if (compat) { ++ /* ++ * Htree hash magic. ++ */ + /* + * If the hash is 1, then continue only if the next page has a + * continuation hash of any value. This is used for readdir +@@ -497,30 +677,146 @@ + * desired contiuation hash. If it doesn't, return since + * there's no point to read in the successive index pages. 
+ */ +- bhash = dx_get_hash(p->at); ++ iam_get_ikey(path, p->at, (struct iam_ikey *)&bhash); + if (start_hash) + *start_hash = bhash; + if ((hash & 1) == 0) { + if ((bhash & ~1) != hash) + return 0; + } ++ } + /* + * If the hash is HASH_NB_ALWAYS, we always go to the next + * block so no check is necessary + */ + while (num_frames--) { +- if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), +- 0, &err))) ++ iam_ptr_t idx; ++ ++ do_corr(schedule()); ++ dx_lock_bh(p->bh); ++ idx = p->leaf = dx_get_block(path, p->at); ++ dx_unlock_bh(p->bh); ++ err = iam_path_descr(path)->id_ops-> ++ id_node_read(path->ip_container, idx, NULL, &bh); ++ if (err != 0) + return err; /* Failure */ +- p++; +- brelse (p->bh); ++ ++p; ++ brelse(p->bh); ++ assert_corr(p->bh != bh); + p->bh = bh; +- p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; ++ p->entries = dx_node_get_entries(path, p); ++ p->at = iam_entry_shift(path, p->entries, !compat); ++ assert_corr(p->curidx != idx); ++ p->curidx = idx; ++ dx_lock_bh(p->bh); ++ assert_corr(p->leaf != dx_get_block(path, p->at)); ++ p->leaf = dx_get_block(path, p->at); ++ dx_unlock_bh(p->bh); ++ assert_inv(dx_node_check(path, p)); + } + return 1; + } + +- ++int iam_index_lock(struct iam_path *path, struct dynlock_handle **lh) ++{ ++ struct iam_frame *f; ++ ++ for (f = path->ip_frame; f >= path->ip_frames; --f, ++lh) { ++ do_corr(schedule()); ++ *lh = dx_lock_htree(iam_path_obj(path), f->curidx, DLT_READ); ++ if (*lh == NULL) ++ return -ENOMEM; ++ } ++ return 0; ++} ++ ++static int iam_index_advance(struct iam_path *path) ++{ ++ return ext3_htree_advance(iam_path_obj(path), 0, path, NULL, 0); ++} ++ ++/* ++ * Advance index part of @path to point to the next leaf. Returns 1 on ++ * success, 0, when end of container was reached. Leaf node is locked. ++ */ ++int iam_index_next(struct iam_container *c, struct iam_path *path) ++{ ++ iam_ptr_t cursor; ++ struct dynlock_handle *lh[DX_MAX_TREE_HEIGHT] = { 0, }; ++ int result; ++ struct inode *object; ++ ++ /* ++ * Locking for iam_index_next()... is to be described. 
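ext3_htree_advance() above steps the deepest frame to its next entry and, when a node is exhausted, pops toward the root, tracking how many interior nodes must be re-read on the way back down. A minimal sketch of that frame walk, with illustrative types:

    struct frame { int at; int count; };

    /* Step to the next leaf: advance the deepest frame; on overflow pop
     * up a level. Returns how many lower frames must be re-read from the
     * new positions, or -1 when the whole tree has been walked. */
    static int advance(struct frame *frames, int depth)
    {
            int level = depth;

            while (level >= 0) {
                    if (++frames[level].at < frames[level].count)
                            return depth - level;
                    level--;        /* this node is exhausted: pop */
            }
            return -1;
    }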
++ */ ++ ++ object = c->ic_object; ++ cursor = path->ip_frame->leaf; ++ ++ while (1) { ++ result = iam_index_lock(path, lh); ++ do_corr(schedule()); ++ if (result < 0) ++ break; ++ ++ result = dx_check_full_path(path, 0); ++ if (result == 0 && cursor == path->ip_frame->leaf) { ++ result = iam_index_advance(path); ++ ++ assert_corr(result == 0 || ++ cursor != path->ip_frame->leaf); ++ break; ++ } ++ do { ++ dx_unlock_array(object, lh); ++ ++ iam_path_release(path); ++ do_corr(schedule()); ++ ++ result = dx_lookup(path); ++ if (result < 0) ++ break; ++ ++ while (path->ip_frame->leaf != cursor) { ++ do_corr(schedule()); ++ ++ result = iam_index_lock(path, lh); ++ do_corr(schedule()); ++ if (result < 0) ++ break; ++ ++ result = dx_check_full_path(path, 0); ++ if (result != 0) ++ break; ++ ++ result = iam_index_advance(path); ++ if (result == 0) { ++ ext3_error(object->i_sb, __FUNCTION__, ++ "cannot find cursor: %u\n", ++ cursor); ++ result = -EIO; ++ } ++ if (result < 0) ++ break; ++ result = dx_check_full_path(path, 0); ++ if (result != 0) ++ break; ++ dx_unlock_array(object, lh); ++ } ++ } while (result == -EAGAIN); ++ if (result < 0) ++ break; ++ } ++ dx_unlock_array(object, lh); ++ return result; ++} ++ ++int ext3_htree_next_block(struct inode *dir, __u32 hash, ++ struct iam_path *path, __u32 *start_hash) ++{ ++ return ext3_htree_advance(dir, hash, path, start_hash, 1); ++} ++ + /* + * p is at least 6 bytes before the end of page + */ +@@ -593,7 +889,8 @@ + { + struct dx_hash_info hinfo; + struct ext3_dir_entry_2 *de; +- struct dx_frame frames[2], *frame; ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; + struct inode *dir; + int block, err; + int count = 0; +@@ -603,6 +900,7 @@ + dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, + start_minor_hash)); + dir = dir_file->f_dentry->d_inode; ++ iam_path_compat_init(&cpath, dir); + if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { + hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; +@@ -613,19 +911,19 @@ + } + hinfo.hash = start_hash; + hinfo.minor_hash = 0; +- frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err); +- if (!frame) ++ err = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, path); ++ if (err != 0) + return err; + + /* Add '.' and '..' 
from the htree header */ + if (!start_hash && !start_minor_hash) { +- de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data; ++ de = (struct ext3_dir_entry_2 *) path->ip_frames[0].bh->b_data; + if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0) + goto errout; + count++; + } + if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { +- de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data; ++ de = (struct ext3_dir_entry_2 *) path->ip_frames[0].bh->b_data; + de = ext3_next_entry(de); + if ((err = ext3_htree_store_dirent(dir_file, 2, 0, de)) != 0) + goto errout; +@@ -633,7 +931,7 @@ + } + + while (1) { +- block = dx_get_block(frame->at); ++ block = dx_get_block(path, path->ip_frame->at); + ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, + start_hash, start_minor_hash); + if (ret < 0) { +@@ -642,8 +940,8 @@ + } + count += ret; + hashval = ~0; +- ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, +- frame, frames, &hashval); ++ ret = ext3_htree_next_block(dir, ++ HASH_NB_ALWAYS, path, &hashval); + *next_hash = hashval; + if (ret < 0) { + err = ret; +@@ -658,12 +956,12 @@ + (count && ((hashval & 1) == 0))) + break; + } +- dx_release(frames); +- dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", ++ iam_path_fini(path); ++ dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", + count, *next_hash)); + return count; + errout: +- dx_release(frames); ++ iam_path_fini(path); + return (err); + } + +@@ -695,7 +1011,6 @@ + map_tail--; + map_tail->hash = h.hash; + map_tail->offs = (u16) ((char *) de - base); +- map_tail->size = le16_to_cpu(de->rec_len); + count++; + cond_resched(); + } +@@ -723,19 +1021,45 @@ + } while(more); + } + +-static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) ++void iam_insert_key(struct iam_path *path, struct iam_frame *frame, ++ const struct iam_ikey *key, iam_ptr_t ptr) + { +- struct dx_entry *entries = frame->entries; +- struct dx_entry *old = frame->at, *new = old + 1; ++ struct iam_entry *entries = frame->entries; ++ struct iam_entry *new = iam_entry_shift(path, frame->at, +1); + int count = dx_get_count(entries); + +- assert(count < dx_get_limit(entries)); +- assert(old < entries + count); +- memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); +- dx_set_hash(new, hash); +- dx_set_block(new, block); ++ /* ++ * Unfortunately we cannot assert this, as this function is sometimes ++ * called by VFS under i_sem and without pdirops lock. 
++ */ ++ assert_corr(1 || iam_frame_is_locked(path, frame)); ++ assert_corr(count < dx_get_limit(entries)); ++ assert_corr(frame->at < iam_entry_shift(path, entries, count)); ++ assert_inv(dx_node_check(path, frame)); ++ ++ memmove(iam_entry_shift(path, new, 1), new, ++ (char *)iam_entry_shift(path, entries, count) - (char *)new); ++ dx_set_ikey(path, new, key); ++ dx_set_block(path, new, ptr); + dx_set_count(entries, count + 1); ++ assert_inv(dx_node_check(path, frame)); ++} ++ ++void iam_insert_key_lock(struct iam_path *path, struct iam_frame *frame, ++ const struct iam_ikey *key, iam_ptr_t ptr) ++{ ++ dx_lock_bh(frame->bh); ++ iam_insert_key(path, frame, key, ptr); ++ dx_unlock_bh(frame->bh); ++} ++ ++void dx_insert_block(struct iam_path *path, struct iam_frame *frame, ++ u32 hash, u32 block) ++{ ++ assert_corr(dx_index_is_compat(path)); ++ iam_insert_key(path, frame, (struct iam_ikey *)&hash, block); + } ++ + #endif + + +@@ -934,7 +1258,11 @@ + struct super_block * sb; + struct dx_hash_info hinfo; + u32 hash; +- struct dx_frame frames[2], *frame; ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; ++ struct iam_entry_compat dummy_dot = { ++ .block = 0 ++ }; + struct ext3_dir_entry_2 *de, *top; + struct buffer_head *bh; + unsigned long block; +@@ -943,21 +1271,25 @@ + const u8 *name = dentry->d_name.name; + struct inode *dir = dentry->d_parent->d_inode; + ++ iam_path_compat_init(&cpath, dir); ++ + sb = dir->i_sb; + /* NFS may look up ".." - look at dx_root directory block */ + if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ +- if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) ++ *err = dx_probe(&dentry->d_name, NULL, &hinfo, path); ++ if (*err != 0) + return NULL; + } else { +- frame = frames; +- frame->bh = NULL; /* for dx_release() */ +- frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ +- dx_set_block(frame->at, 0); /* dx_root block is 0 */ ++ path->ip_frame->bh = NULL; /* for iam_path_fini() */ ++ path->ip_frame->at = (void *)&dummy_dot;/* hack for zero entry*/ + } + hash = hinfo.hash; + do { +- block = dx_get_block(frame->at); +- if (!(bh = ext3_bread (NULL,dir, block, 0, err))) ++ block = dx_get_block(path, path->ip_frame->at); ++ *err = iam_path_descr(path)->id_ops->id_node_read(path->ip_container, ++ (iam_ptr_t)block, ++ NULL, &bh); ++ if (*err != 0) + goto errout; + de = (struct ext3_dir_entry_2 *) bh->b_data; + top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize - +@@ -972,13 +1304,12 @@ + goto errout; + } + *res_dir = de; +- dx_release (frames); ++ iam_path_fini(path); + return bh; + } + brelse (bh); + /* Check to see if we should continue to search */ +- retval = ext3_htree_next_block(dir, hash, frame, +- frames, NULL); ++ retval = ext3_htree_next_block(dir, hash, path, NULL); + if (retval < 0) { + ext3_warning(sb, __FUNCTION__, + "error reading index page in directory #%lu", +@@ -991,7 +1322,7 @@ + *err = -ENOENT; + errout: + dxtrace(printk("%s not found\n", name)); +- dx_release (frames); ++ iam_path_fini(path); + return NULL; + } + #endif +@@ -1124,19 +1455,69 @@ + * Allocate a new block, and move entries so that they are approx. equally full. + * Returns pointer to de in block into which the new entry will be inserted. 
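iam_insert_key() above is a plain sorted-array insertion: memmove() opens a one-slot gap after frame->at, then the new (key, block) pair is written and the count bumped. The same operation stand-alone, with a fixed-width entry type for illustration:

    #include <stdint.h>
    #include <string.h>

    struct entry { uint32_t key; uint32_t blk; };

    /* Insert (key, blk) right after slot @at in a sorted array holding
     * @count entries; the caller has checked count < limit. */
    static void entry_insert(struct entry *e, unsigned count, unsigned at,
                             uint32_t key, uint32_t blk)
    {
            struct entry *slot = e + at + 1;

            /* open a one-slot gap by shifting the tail right */
            memmove(slot + 1, slot, (count - at - 1) * sizeof(*e));
            slot->key = key;
            slot->blk = blk;
    }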
+ */ +-static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, +- struct buffer_head **bh,struct dx_frame *frame, +- struct dx_hash_info *hinfo, int *error) ++struct ext3_dir_entry_2 *move_entries(struct inode *dir, ++ struct dx_hash_info *hinfo, ++ struct buffer_head **bh1, ++ struct buffer_head **bh2, ++ __u32 *delim_hash) + { ++ char *data1; ++ char *data2; + unsigned blocksize = dir->i_sb->s_blocksize; +- unsigned count, continued; ++ unsigned count; ++ unsigned continued; ++ unsigned split; ++ u32 hash2; ++ ++ struct dx_map_entry *map; ++ struct ext3_dir_entry_2 *de1; ++ struct ext3_dir_entry_2 *de2; ++ ++ data1 = (*bh1)->b_data; ++ data2 = (*bh2)->b_data; ++ ++ /* create map in the end of data2 block */ ++ map = (struct dx_map_entry *) (data2 + blocksize); ++ count = dx_make_map((struct ext3_dir_entry_2 *) data1, ++ blocksize, hinfo, map); ++ map -= count; ++ split = count/2; // need to adjust to actual middle ++ dx_sort_map(map, count); ++ hash2 = map[split].hash; ++ continued = hash2 == map[split - 1].hash; ++ dxtrace(printk("Split block %i at %x, %i/%i\n", ++ frame->leaf, hash2, split, count - split)); ++ ++ /* Fancy dance to stay within two buffers */ ++ de2 = dx_move_dirents(data1, data2, map + split, count - split); ++ de1 = dx_pack_dirents(data1, blocksize); ++ de1->rec_len = cpu_to_le16(data1 + blocksize - (char *) de1); ++ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); ++ dxtrace(dx_show_leaf(hinfo, ++ (struct ext3_dir_entry_2 *) data1, blocksize, 1)); ++ dxtrace(dx_show_leaf(hinfo, ++ (struct ext3_dir_entry_2 *) data2, blocksize, 1)); ++ ++ /* Which block gets the new entry? */ ++ if (hinfo->hash >= hash2) { ++ swap(*bh1, *bh2); ++ de1 = de2; ++ } ++ *delim_hash = hash2 + continued; ++ return de1; ++} ++ ++/* Allocate new node, and split leaf node @bh into it, inserting new pointer ++ * into parent node identified by @frame */ ++static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct iam_path *path, ++ struct buffer_head **bh,struct iam_frame *frame, ++ struct dx_hash_info *hinfo, int *error) ++{ ++ struct inode *dir = iam_path_obj(path); + struct buffer_head *bh2; + u32 newblock; + u32 hash2; +- struct dx_map_entry *map; +- char *data1 = (*bh)->b_data, *data2; +- unsigned split, move, size, i; +- struct ext3_dir_entry_2 *de = NULL, *de2; ++ struct ext3_dir_entry_2 *de = NULL; + int err; + + bh2 = ext3_append (handle, dir, &newblock, error); +@@ -1161,46 +1542,9 @@ + if (err) + goto journal_error; + +- data2 = bh2->b_data; +- +- /* create map in the end of data2 block */ +- map = (struct dx_map_entry *) (data2 + blocksize); +- count = dx_make_map ((struct ext3_dir_entry_2 *) data1, +- blocksize, hinfo, map); +- map -= count; +- dx_sort_map (map, count); +- /* Split the existing block in the middle, size-wise */ +- size = 0; +- move = 0; +- for (i = count-1; i >= 0; i--) { +- /* is more than half of this entry in 2nd half of the block? 
*/ +- if (size + map[i].size/2 > blocksize/2) +- break; +- size += map[i].size; +- move++; +- } +- /* map index at which we will split */ +- split = count - move; +- hash2 = map[split].hash; +- continued = hash2 == map[split - 1].hash; +- dxtrace(printk("Split block %i at %x, %i/%i\n", +- dx_get_block(frame->at), hash2, split, count-split)); +- +- /* Fancy dance to stay within two buffers */ +- de2 = dx_move_dirents(data1, data2, map + split, count - split); +- de = dx_pack_dirents(data1,blocksize); +- de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); +- de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); +- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); +- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); ++ de = move_entries(dir, hinfo, bh, &bh2, &hash2); + +- /* Which block gets the new entry? */ +- if (hinfo->hash >= hash2) +- { +- swap(*bh, bh2); +- de = de2; +- } +- dx_insert_block (frame, hash2 + continued, newblock); ++ dx_insert_block(path, frame, hash2, newblock); + err = ext3_journal_dirty_metadata (handle, bh2); + if (err) + goto journal_error; +@@ -1203,6 +1558,63 @@ + } + #endif + ++struct ext3_dir_entry_2 *find_insertion_point(struct inode *dir, ++ struct buffer_head *bh, ++ const char *name, int namelen) ++{ ++ struct ext3_dir_entry_2 *de; ++ char *top; ++ unsigned long offset; ++ int nlen; ++ int rlen; ++ int reclen; ++ ++ reclen = EXT3_DIR_REC_LEN(namelen); ++ de = (struct ext3_dir_entry_2 *)bh->b_data; ++ top = bh->b_data + dir->i_sb->s_blocksize - reclen; ++ offset = 0; ++ while ((char *) de <= top) { ++ if (!ext3_check_dir_entry("ext3_add_entry", ++ dir, de, bh, offset)) ++ return ERR_PTR(-EIO); ++ if (ext3_match(namelen, name, de)) ++ return ERR_PTR(-EEXIST); ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ if ((de->inode? rlen - nlen: rlen) >= reclen) ++ return de; ++ de = (struct ext3_dir_entry_2 *)((char *)de + rlen); ++ offset += rlen; ++ } ++ return ERR_PTR(-ENOSPC); ++} ++ ++struct ext3_dir_entry_2 *split_entry(struct inode *dir, ++ struct ext3_dir_entry_2 *de, ++ unsigned long ino, mode_t mode, ++ const char *name, int namelen) ++{ ++ int nlen; ++ int rlen; ++ ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ if (de->inode) { ++ struct ext3_dir_entry_2 *de1; ++ ++ de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); ++ de1->rec_len = cpu_to_le16(rlen - nlen); ++ de->rec_len = cpu_to_le16(nlen); ++ de = de1; ++ } ++ de->file_type = EXT3_FT_UNKNOWN; ++ de->inode = cpu_to_le32(ino); ++ if (ino != 0) ++ ext3_set_de_type(dir->i_sb, de, mode); ++ de->name_len = namelen; ++ memcpy(de->name, name, namelen); ++ return de; ++} + + /* + * Add a new entry into a directory (leaf) block. If de is non-NULL, +@@ -1222,34 +1634,16 @@ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; +- unsigned long offset = 0; +- unsigned short reclen; +- int nlen, rlen, err; +- char *top; ++ int err; + +- reclen = EXT3_DIR_REC_LEN(namelen); + if (!de) { +- de = (struct ext3_dir_entry_2 *)bh->b_data; +- top = bh->b_data + dir->i_sb->s_blocksize - reclen; +- while ((char *) de <= top) { +- if (!ext3_check_dir_entry("ext3_add_entry", dir, de, +- bh, offset)) { +- brelse (bh); +- return -EIO; +- } +- if (ext3_match (namelen, name, de)) { +- brelse (bh); +- return -EEXIST; +- } +- nlen = EXT3_DIR_REC_LEN(de->name_len); +- rlen = le16_to_cpu(de->rec_len); +- if ((de->inode? 
rlen - nlen: rlen) >= reclen) +- break; +- de = (struct ext3_dir_entry_2 *)((char *)de + rlen); +- offset += rlen; ++ de = find_insertion_point(dir, bh, name, namelen); ++ if (IS_ERR(de)) { ++ err = PTR_ERR(de); ++ if (err != -ENOSPC) ++ brelse(bh); ++ return err; + } +- if ((char *) de > top) +- return -ENOSPC; + } + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); +@@ -1260,22 +1654,9 @@ + } + + /* By now the buffer is marked for journaling */ +- nlen = EXT3_DIR_REC_LEN(de->name_len); +- rlen = le16_to_cpu(de->rec_len); +- if (de->inode) { +- struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); +- de1->rec_len = cpu_to_le16(rlen - nlen); +- de->rec_len = cpu_to_le16(nlen); +- de = de1; +- } +- de->file_type = EXT3_FT_UNKNOWN; +- if (inode) { +- de->inode = cpu_to_le32(inode->i_ino); +- ext3_set_de_type(dir->i_sb, de, inode->i_mode); +- } else +- de->inode = 0; +- de->name_len = namelen; +- memcpy (de->name, name, namelen); ++ ++ split_entry(dir, de, inode ? inode->i_ino : 0, ++ inode ? inode->i_mode : 0, name, namelen); + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend +@@ -1304,6 +1685,7 @@ + * This converts a one block unindexed directory to a 3 block indexed + * directory, and adds the dentry to the indexed directory. + */ ++extern int user_selected_hash_function; + static int make_indexed_dir(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct buffer_head *bh) + { +@@ -1312,8 +1694,9 @@ + int namelen = dentry->d_name.len; + struct buffer_head *bh2; + struct dx_root *root; +- struct dx_frame frames[2], *frame; +- struct dx_entry *entries; ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; ++ struct iam_entry *entries; + struct ext3_dir_entry_2 *de, *de2; + char *data1, *top; + unsigned len; +@@ -1323,6 +1706,7 @@ + u32 block; + struct fake_dirent *fde; + ++ iam_path_compat_init(&cpath, dir); + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk("Creating index\n")); + retval = ext3_journal_get_write_access(handle, bh); +@@ -1357,23 +1741,25 @@ + memset (&root->info, 0, sizeof(root->info)); + root->info.info_length = sizeof(root->info); + root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; +- entries = root->entries; +- dx_set_block (entries, 1); ++ if (user_selected_hash_function >= 0 && ++ user_selected_hash_function <= DX_HASH_MAX) ++ root->info.hash_version = user_selected_hash_function; ++ entries = (void *)root->entries; ++ dx_set_block (path, entries, 1); + dx_set_count (entries, 1); +- dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); ++ dx_set_limit (entries, dx_root_limit(path)); + + /* Initialize as for dx_probe */ + hinfo.hash_version = root->info.hash_version; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; + ext3fs_dirhash(name, namelen, &hinfo); +- frame = frames; +- frame->entries = entries; +- frame->at = entries; +- frame->bh = bh; ++ path->ip_frame->entries = entries; ++ path->ip_frame->at = entries; ++ path->ip_frame->bh = bh; + bh = bh2; +- de = do_split(handle,dir, &bh, frame, &hinfo, &retval); +- dx_release (frames); +- if (!(de)) ++ de = do_split(handle, path, &bh, path->ip_frame, &hinfo, &retval); ++ iam_path_fini(path); ++ if (!de) + return retval; + + return add_dirent_to_buf(handle, dentry, inode, de, bh); +@@ -1444,139 +1830,384 @@ + return add_dirent_to_buf(handle, dentry, inode, de, bh); + } + ++static int shift_entries(struct iam_path *path, ++ struct 
iam_frame *frame, unsigned count, ++ struct iam_entry *entries, struct iam_entry *entries2, ++ u32 newblock) ++{ ++ unsigned count1; ++ unsigned count2; ++ int delta; ++ ++ struct iam_frame *parent = frame - 1; ++ struct iam_ikey *pivot = iam_path_ikey(path, 3); ++ ++ delta = dx_index_is_compat(path) ? 0 : +1; ++ ++ count1 = count/2 + delta; ++ count2 = count - count1; ++ iam_get_ikey(path, iam_entry_shift(path, entries, count1), pivot); ++ ++ dxtrace(printk("Split index %i/%i\n", count1, count2)); ++ ++ memcpy((char *) iam_entry_shift(path, entries2, delta), ++ (char *) iam_entry_shift(path, entries, count1), ++ count2 * iam_entry_size(path)); ++ ++ dx_set_count(entries2, count2 + delta); ++ dx_set_limit(entries2, dx_node_limit(path)); ++ ++ /* ++ * NOTE: very subtle piece of code competing dx_probe() may find 2nd ++ * level index in root index, then we insert new index here and set ++ * new count in that 2nd level index. so, dx_probe() may see 2nd level ++ * index w/o hash it looks for. the solution is to check root index ++ * after we locked just founded 2nd level index -bzzz ++ */ ++ iam_insert_key_lock(path, parent, pivot, newblock); ++ ++ /* ++ * now old and new 2nd level index blocks contain all pointers, so ++ * dx_probe() may find it in the both. it's OK -bzzz ++ */ ++ dx_lock_bh(frame->bh); ++ dx_set_count(entries, count1); ++ dx_unlock_bh(frame->bh); ++ ++ /* ++ * now old 2nd level index block points to first half of leafs. it's ++ * importand that dx_probe() must check root index block for changes ++ * under dx_lock_bh(frame->bh) -bzzz ++ */ ++ ++ return count1; ++} ++ + #ifdef CONFIG_EXT3_INDEX +-/* +- * Returns 0 for success, or a negative error value +- */ +-static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) ++int split_index_node(handle_t *handle, struct iam_path *path, ++ struct dynlock_handle **lh) + { +- struct dx_frame frames[2], *frame; +- struct dx_entry *entries, *at; +- struct dx_hash_info hinfo; +- struct buffer_head * bh; +- struct inode *dir = dentry->d_parent->d_inode; +- struct super_block * sb = dir->i_sb; +- struct ext3_dir_entry_2 *de; +- int err; + +- frame = dx_probe(dentry, NULL, &hinfo, frames, &err); +- if (!frame) +- return err; +- entries = frame->entries; +- at = frame->at; ++ struct iam_entry *entries; /* old block contents */ ++ struct iam_entry *entries2; /* new block contents */ ++ struct iam_frame *frame, *safe; ++ struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0}; ++ u32 newblock[DX_MAX_TREE_HEIGHT] = {0}; ++ struct dynlock_handle *lock[DX_MAX_TREE_HEIGHT] = {NULL,}; ++ struct dynlock_handle *new_lock[DX_MAX_TREE_HEIGHT] = {NULL,}; ++ struct inode *dir = iam_path_obj(path); ++ struct iam_descr *descr; ++ int nr_splet; ++ int i, err; + +- if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) +- goto cleanup; ++ descr = iam_path_descr(path); ++ /* ++ * Algorithm below depends on this. ++ */ ++ assert_corr(dx_root_limit(path) < dx_node_limit(path)); + +- BUFFER_TRACE(bh, "get_write_access"); +- err = ext3_journal_get_write_access(handle, bh); +- if (err) +- goto journal_error; ++ frame = path->ip_frame; ++ entries = frame->entries; + +- err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); +- if (err != -ENOSPC) { +- bh = NULL; +- goto cleanup; +- } ++ /* ++ * Tall-tree handling: we might have to split multiple index blocks ++ * all the way up to tree root. 
Tricky point here is error handling: ++ * to avoid complicated undo/rollback we ++ * ++ * - first allocate all necessary blocks ++ * ++ * - insert pointers into them atomically. ++ */ ++ ++ /* ++ * Locking: leaf is already locked. htree-locks are acquired on all ++ * index nodes that require split bottom-to-top, on the "safe" node, ++ * and on all new nodes ++ */ + +- /* Block full, should compress but for now just split */ + dxtrace(printk("using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); +- /* Need to split index? */ +- if (dx_get_count(entries) == dx_get_limit(entries)) { +- u32 newblock; +- unsigned icount = dx_get_count(entries); +- int levels = frame - frames; +- struct dx_entry *entries2; +- struct dx_node *node2; +- struct buffer_head *bh2; + +- if (levels && (dx_get_count(frames->entries) == +- dx_get_limit(frames->entries))) { +- ext3_warning(sb, __FUNCTION__, +- "Directory index full!"); ++ /* What levels need split? */ ++ for (nr_splet = 0; frame >= path->ip_frames && ++ dx_get_count(frame->entries) == dx_get_limit(frame->entries); ++ --frame, ++nr_splet) { ++ do_corr(schedule()); ++ if (nr_splet == DX_MAX_TREE_HEIGHT) { ++ ext3_warning(dir->i_sb, __FUNCTION__, ++ "Directory index full!\n"); + err = -ENOSPC; + goto cleanup; + } +- bh2 = ext3_append (handle, dir, &newblock, &err); +- if (!(bh2)) ++ } ++ ++ safe = frame; ++ ++ /* ++ * Lock all nodes, bottom to top. ++ */ ++ for (frame = path->ip_frame, i = nr_splet; i >= 0; --i, --frame) { ++ do_corr(schedule()); ++ lock[i] = dx_lock_htree(dir, frame->curidx, DLT_WRITE); ++ if (lock[i] == NULL) { ++ err = -ENOMEM; ++ goto cleanup; ++ } ++ } ++ ++ /* ++ * Check for concurrent index modification. ++ */ ++ err = dx_check_full_path(path, 1); ++ if (err) ++ goto cleanup; ++ /* ++ * And check that the same number of nodes is to be split. ++ */ ++ for (i = 0, frame = path->ip_frame; frame >= path->ip_frames && ++ dx_get_count(frame->entries) == dx_get_limit(frame->entries); ++ --frame, ++i) { ++ ; ++ } ++ if (i != nr_splet) { ++ err = -EAGAIN; ++ goto cleanup; ++ } ++ ++ /* Go back down, allocating blocks, locking them, and adding into ++ * transaction... 
*/ ++ for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) { ++ bh_new[i] = ext3_append (handle, dir, &newblock[i], &err); ++ do_corr(schedule()); ++ if (!bh_new[i] || ++ descr->id_ops->id_node_init(path->ip_container, ++ bh_new[i], 0) != 0) ++ goto cleanup; ++ new_lock[i] = dx_lock_htree(dir, newblock[i], DLT_WRITE); ++ if (new_lock[i] == NULL) { ++ err = -ENOMEM; + goto cleanup; +- node2 = (struct dx_node *)(bh2->b_data); +- entries2 = node2->entries; +- node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); +- node2->fake.inode = 0; ++ } ++ do_corr(schedule()); + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; +- if (levels) { +- unsigned icount1 = icount/2, icount2 = icount - icount1; +- unsigned hash2 = dx_get_hash(entries + icount1); +- dxtrace(printk("Split index %i/%i\n", icount1, icount2)); +- +- BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ +- err = ext3_journal_get_write_access(handle, +- frames[0].bh); ++ } ++ /* Add "safe" node to transaction too */ ++ if (safe + 1 != path->ip_frames) { ++ do_corr(schedule()); ++ err = ext3_journal_get_write_access(handle, safe->bh); ++ if (err) ++ goto journal_error; ++ } ++ ++ /* Go through nodes once more, inserting pointers */ ++ for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) { ++ unsigned count; ++ int idx; ++ struct buffer_head *bh2; ++ struct buffer_head *bh; ++ ++ entries = frame->entries; ++ count = dx_get_count(entries); ++ idx = iam_entry_diff(path, frame->at, entries); ++ ++ bh2 = bh_new[i]; ++ entries2 = dx_get_entries(path, bh2->b_data, 0); ++ ++ bh = frame->bh; ++ if (frame == path->ip_frames) { ++ /* splitting root node. Tricky point: ++ * ++ * In the "normal" B-tree we'd split root *and* add ++ * new root to the tree with pointers to the old root ++ * and its sibling (thus introducing two new nodes). ++ * ++ * In htree it's enough to add one node, because ++ * capacity of the root node is smaller than that of ++ * non-root one. ++ */ ++ struct iam_frame *frames; ++ struct iam_entry *next; ++ ++ assert_corr(i == 0); ++ ++ do_corr(schedule()); ++ ++ frames = path->ip_frames; ++ memcpy((char *) entries2, (char *) entries, ++ count * iam_entry_size(path)); ++ dx_set_limit(entries2, dx_node_limit(path)); ++ ++ /* Set up root */ ++ dx_lock_bh(frame->bh); ++ next = descr->id_ops->id_root_inc(path->ip_container, ++ path, frame); ++ dx_set_block(path, next, newblock[0]); ++ dx_unlock_bh(frame->bh); ++ ++ do_corr(schedule()); ++ /* Shift frames in the path */ ++ memmove(frames + 2, frames + 1, ++ (sizeof path->ip_frames) - 2 * sizeof frames[0]); ++ /* Add new access path frame */ ++ frames[1].at = iam_entry_shift(path, entries2, idx); ++ frames[1].entries = entries = entries2; ++ frames[1].bh = bh2; ++ assert_inv(dx_node_check(path, frame)); ++ ++ path->ip_frame; ++ ++ frame; ++ assert_inv(dx_node_check(path, frame)); ++ bh_new[0] = NULL; /* buffer head is "consumed" */ ++ err = ext3_journal_get_write_access(handle, bh2); + if (err) + goto journal_error; ++ do_corr(schedule()); ++ } else { ++ /* splitting non-root index node. 
*/ ++ struct iam_frame *parent = frame - 1; + +- memcpy ((char *) entries2, (char *) (entries + icount1), +- icount2 * sizeof(struct dx_entry)); +- dx_set_count (entries, icount1); +- dx_set_count (entries2, icount2); +- dx_set_limit (entries2, dx_node_limit(dir)); +- ++ do_corr(schedule()); ++ count = shift_entries(path, frame, count, ++ entries, entries2, newblock[i]); + /* Which index block gets the new entry? */ +- if (at - entries >= icount1) { +- frame->at = at = at - entries - icount1 + entries2; ++ if (idx >= count) { ++ int d = dx_index_is_compat(path) ? 0 : +1; ++ ++ frame->at = iam_entry_shift(path, entries2, ++ idx - count + d); + frame->entries = entries = entries2; ++ frame->curidx = newblock[i]; + swap(frame->bh, bh2); ++ assert_corr(lock[i + 1] != NULL); ++ assert_corr(new_lock[i] != NULL); ++ swap(lock[i + 1], new_lock[i]); ++ bh_new[i] = bh2; ++ parent->at = iam_entry_shift(path, ++ parent->at, +1); + } +- dx_insert_block (frames + 0, hash2, newblock); +- dxtrace(dx_show_index ("node", frames[1].entries)); ++ assert_inv(dx_node_check(path, frame)); ++ assert_inv(dx_node_check(path, parent)); ++ dxtrace(dx_show_index ("node", frame->entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) + goto journal_error; +- brelse (bh2); +- } else { +- dxtrace(printk("Creating second level index...\n")); +- memcpy((char *) entries2, (char *) entries, +- icount * sizeof(struct dx_entry)); +- dx_set_limit(entries2, dx_node_limit(dir)); +- +- /* Set up root */ +- dx_set_count(entries, 1); +- dx_set_block(entries + 0, newblock); +- ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; +- +- /* Add new access path frame */ +- frame = frames + 1; +- frame->at = at = at - entries + entries2; +- frame->entries = entries = entries2; +- frame->bh = bh2; +- err = ext3_journal_get_write_access(handle, +- frame->bh); ++ do_corr(schedule()); ++ err = ext3_journal_dirty_metadata(handle, parent->bh); + if (err) + goto journal_error; + } +- ext3_journal_dirty_metadata(handle, frames[0].bh); ++ do_corr(schedule()); ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ goto journal_error; ++ } ++ /* ++ * This function was called to make insertion of new leaf ++ * possible. Check that it fulfilled its obligations. ++ */ ++ assert_corr(dx_get_count(path->ip_frame->entries) < ++ dx_get_limit(path->ip_frame->entries)); ++ assert_corr(lock[nr_splet] != NULL); ++ *lh = lock[nr_splet]; ++ lock[nr_splet] = NULL; ++ if (nr_splet > 0) { ++ /* ++ * Log ->i_size modification. 
++		err = ext3_mark_inode_dirty(handle, dir);
++		if (err)
++			goto journal_error;
++	}
++	goto cleanup;
++journal_error:
++	ext3_std_error(dir->i_sb, err);
++
++cleanup:
++	dx_unlock_array(dir, lock);
++	dx_unlock_array(dir, new_lock);
++
++	assert_corr(err || iam_frame_is_locked(path, path->ip_frame));
++
++	do_corr(schedule());
++	for (i = 0; i < ARRAY_SIZE(bh_new); ++i) {
++		if (bh_new[i] != NULL)
++			brelse(bh_new[i]);
++	}
++	return err;
++}
++
++/*
++ * Returns 0 for success, or a negative error value
++ */
++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
++			     struct inode *inode)
++{
++	struct iam_path_compat cpath;
++	struct iam_path *path = &cpath.ipc_path;
++	struct iam_descr *param;
++	struct iam_frame *frame;
++	struct dx_hash_info hinfo;
++	struct buffer_head * bh = NULL;
++	struct inode *dir = dentry->d_parent->d_inode;
++	struct ext3_dir_entry_2 *de;
++	struct dynlock_handle *dummy = NULL;
++	int err;
++	size_t isize;
++
++	iam_path_compat_init(&cpath, dir);
++	param = iam_path_descr(path);
++
++	err = dx_probe(&dentry->d_name, NULL, &hinfo, path);
++	if (err != 0)
++		return err;
++	frame = path->ip_frame;
++
++	isize = dir->i_size;
++
++	err = param->id_ops->id_node_read(path->ip_container,
++			(iam_ptr_t)dx_get_block(path, frame->at),
++			handle, &bh);
++	if (err != 0)
++		goto cleanup;
++
++	BUFFER_TRACE(bh, "get_write_access");
++	err = ext3_journal_get_write_access(handle, bh);
++	if (err)
++		goto journal_error;
++
++	err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
++	if (err != -ENOSPC) {
++		bh = NULL;
++		goto cleanup;
+	}
+-	de = do_split(handle, dir, &bh, frame, &hinfo, &err);
++
++	err = split_index_node(handle, path, &dummy);
++	if (err)
++		goto cleanup;
++
++	/*copy split inode too*/
++	de = do_split(handle, path, &bh, path->ip_frame, &hinfo, &err);
+	if (!de)
+		goto cleanup;
++
++	assert_inv(dx_node_check(path, frame));
+	err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+-	bh = NULL;
+-	goto cleanup;
++	goto cleanup2;
+
+ journal_error:
+	ext3_std_error(dir->i_sb, err);
+ cleanup:
+	if (bh)
+		brelse(bh);
+-	dx_release(frames);
++cleanup2:
++	dx_unlock_htree(dir, dummy);
++	if (err)
++		inode->i_size = isize;
++	iam_path_fini(path);
+	return err;
+ }
+ #endif
+@@ -1678,6 +2309,26 @@
+	return ext3_new_inode(handle, dir, mode, inum);
+ }
+
++struct inode *ext3_create_inode(handle_t *handle, struct inode * dir, int mode)
++{
++	struct inode *inode;
++
++	inode = ext3_new_inode(handle, dir, mode, 0);
++	if (!IS_ERR(inode)) {
++		if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode)) {
++#ifdef CONFIG_LDISKFS_FS_XATTR
++			inode->i_op = &ext3_special_inode_operations;
++#endif
++		} else {
++			inode->i_op = &ext3_file_inode_operations;
++			inode->i_fop = &ext3_file_operations;
++			ext3_set_aops(inode);
++		}
++	}
++	return inode;
++}
++EXPORT_SYMBOL(ext3_create_inode);
++
+ /*
+  * By the time this is called, we already have created
+  * the directory cache entry for the new file, but it
+Index: linux-stage/fs/ext3/ioctl.c
+===================================================================
+--- linux-stage.orig/fs/ext3/ioctl.c	2007-11-26 23:09:03.000000000 +0300
++++ linux-stage/fs/ext3/ioctl.c	2007-11-26 23:09:06.000000000 +0300
+@@ -16,6 +16,7 @@
+ #include 
+ #include 
+
++#include 
+
+ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
+		unsigned long arg)
+@@ -275,6 +276,6 @@
+
+
+	default:
+-		return -ENOTTY;
++		return iam_uapi_ioctl(inode, filp, cmd, arg);
+	}
+ }
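The root-split branch in split_index_node() above is the one place where this code deliberately departs from a textbook B-tree: a textbook split of a full root creates two new nodes (a sibling plus a new root), while here a single new block suffices, because the root block also carries header data and so holds fewer entries than an ordinary index block — everything in it fits into one fresh node. The stand-alone C sketch below models only that trick; the toy_* names, both capacity constants, and the fake block numbering are invented for illustration and are not taken from the patch.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TOY_ROOT_LIMIT 4	/* root shares its block with a header...   */
#define TOY_NODE_LIMIT 8	/* ...so it holds fewer entries than a node */

struct toy_entry {
	unsigned int hash;
	unsigned int block;
};

struct toy_node {
	unsigned int count;
	struct toy_entry entries[TOY_NODE_LIMIT];
};

struct toy_root {
	unsigned int levels;	/* index depth; grows by one on root split */
	unsigned int count;
	struct toy_entry entries[TOY_ROOT_LIMIT];
};

/*
 * Split a full root the htree way: move ALL of its entries into one newly
 * allocated node and leave a single pointer behind, instead of allocating
 * two nodes as a classic B-tree root split would.
 */
static struct toy_node *toy_root_split(struct toy_root *root,
				       unsigned int newblock)
{
	struct toy_node *node = calloc(1, sizeof(*node));

	if (node == NULL)
		return NULL;
	memcpy(node->entries, root->entries,
	       root->count * sizeof(struct toy_entry));
	node->count = root->count;
	root->count = 1;
	root->entries[0].hash = 0;	/* leftmost pointer covers all hashes */
	root->entries[0].block = newblock;
	root->levels++;
	return node;
}

int main(void)
{
	struct toy_root root = { .levels = 1, .count = TOY_ROOT_LIMIT };
	struct toy_node *node;
	unsigned int i;

	for (i = 0; i < TOY_ROOT_LIMIT; i++) {
		root.entries[i].hash = i * 0x1000;
		root.entries[i].block = i + 2;
	}
	node = toy_root_split(&root, 1);
	if (node == NULL)
		return 1;
	printf("levels=%u root.count=%u node.count=%u\n",
	       root.levels, root.count, node->count);
	free(node);
	return 0;
}

The payoff in the real code is that the root case reduces to "copy out and re-point" — one memcpy, id_root_inc(), one dx_set_block() — under a single dx_lock_bh() on the root buffer; that is also why the loop can assert i == 0 there, since the root, if it splits at all, is always the first frame handled.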
+Index: linux-stage/fs/ext3/file.c
+===================================================================
+--- linux-stage.orig/fs/ext3/file.c	2007-11-26 23:08:59.000000000 +0300
++++ linux-stage/fs/ext3/file.c	2007-11-26 23:09:06.000000000 +0300
+@@ -23,6 +23,7 @@
+ #include 
+ #include 
+ #include 
++#include 
+ #include "xattr.h"
+ #include "acl.h"
+
+@@ -41,8 +42,12 @@
+		ext3_discard_reservation(inode);
+		mutex_unlock(&EXT3_I(inode)->truncate_mutex);
+	}
+-	if (is_dx(inode) && filp->private_data)
+-		ext3_htree_free_dir_info(filp->private_data);
++	if (is_dx(inode) && filp->private_data) {
++		if (S_ISDIR(inode->i_mode))
++			ext3_htree_free_dir_info(filp->private_data);
++		else
++			ext3_iam_release(filp, inode);
++	}
+
+	return 0;
+ }
+Index: linux-stage/fs/ext3/hash.c
+===================================================================
+--- linux-stage.orig/fs/ext3/hash.c	2007-11-26 23:08:59.000000000 +0300
++++ linux-stage/fs/ext3/hash.c	2007-11-26 23:09:06.000000000 +0300
+@@ -49,6 +49,23 @@
+	return (hash0 << 1);
+ }
+
++static __u32 dx_r5_hash(const signed char *msg, int len)
++{
++	__u32 a = 0;
++	while (len--) {
++		a += *msg << 4;
++		a += *msg >> 4;
++		a *= 11;
++		msg++;
++	}
++	return a;
++}
++
++static __u32 dx_same_hash(const signed char *msg, int len)
++{
++	return 0xcafebabeUL;
++}
++
+ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+ {
+	__u32 pad, val;
+@@ -139,6 +156,12 @@
+		hash = buf[0];
+		minor_hash = buf[1];
+		break;
++	case DX_HASH_R5:
++		hash = dx_r5_hash(name, len);
++		break;
++	case DX_HASH_SAME:
++		hash = dx_same_hash(name, len);
++		break;
+	default:
+		hinfo->hash = 0;
+		return -1;
+Index: linux-stage/fs/ext3/Makefile
+===================================================================
+--- linux-stage.orig/fs/ext3/Makefile	2007-11-26 23:09:05.000000000 +0300
++++ linux-stage/fs/ext3/Makefile	2007-11-26 23:09:06.000000000 +0300
+@@ -6,7 +6,7 @@
+
+ ext3-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+	   ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
+-	   mballoc.o dynlocks.o
++	   mballoc.o dynlocks.o iam.o iam_lfix.o iam_lvar.o iam_htree.o iam_uapi.o
+
+ ext3-$(CONFIG_EXT3_FS_XATTR)	 += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
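The two hash functions added to fs/ext3/hash.c above are small enough to exercise in user space: dx_r5_hash is the rolling "r5" hash familiar from reiserfs (add a nibble-swapped byte, multiply by 11), and dx_same_hash maps every name to one constant so that all directory entries collide — useful only for deliberately stressing the leaf-split paths. In the sketch below the two hash bodies are copied from the hunk above, while the main() harness and sample names are invented for illustration.

#include <stdio.h>
#include <string.h>

typedef unsigned int u32;

static u32 dx_r5_hash(const signed char *msg, int len)
{
	u32 a = 0;

	while (len--) {
		a += *msg << 4;
		a += *msg >> 4;
		a *= 11;
		msg++;
	}
	return a;
}

static u32 dx_same_hash(const signed char *msg, int len)
{
	(void)msg;
	(void)len;		/* every name hashes identically */
	return 0xcafebabeUL;
}

int main(void)
{
	const char *names[] = { "foo", "bar", "a-much-longer-name" };
	unsigned int i;

	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
		int len = (int)strlen(names[i]);
		const signed char *s = (const signed char *)names[i];

		printf("%-20s r5=%08x same=%08x\n",
		       names[i], dx_r5_hash(s, len), dx_same_hash(s, len));
	}
	return 0;
}

Note that keeping the signed char type matters for names containing bytes above 0x7f: the additions then operate on negative values, so a user-space copy that switched to unsigned char would compute different hashes than the kernel.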
+Index: linux-stage/fs/ext3/dir.c
+===================================================================
+--- linux-stage.orig/fs/ext3/dir.c	2007-11-26 23:09:04.000000000 +0300
++++ linux-stage/fs/ext3/dir.c	2007-11-26 23:09:06.000000000 +0300
+@@ -28,6 +28,7 @@
+ #include 
+ #include 
+ #include 
++#include 
+
+ static unsigned char ext3_filetype_table[] = {
+	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+@@ -61,6 +62,7 @@
+ }
+
+
++#if EXT3_INVARIANT_ON
+ int ext3_check_dir_entry (const char * function, struct inode * dir,
+			  struct ext3_dir_entry_2 * de,
+			  struct buffer_head * bh,
+@@ -90,6 +92,7 @@
+			 rlen, de->name_len);
+	return error_msg == NULL ? 1 : 0;
+ }
++#endif
+
+ static int ext3_readdir(struct file * filp,
+			 void * dirent, filldir_t filldir)
+@@ -304,12 +307,14 @@
+	root->rb_node = NULL;
+ }
+
++extern struct iam_private_info *ext3_iam_alloc_info(int flags);
++extern void ext3_iam_release_info(struct iam_private_info *info);
+
+ static struct dir_private_info *create_dir_info(loff_t pos)
+ {
+	struct dir_private_info *p;
+
+-	p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
++	p = (void *)ext3_iam_alloc_info(GFP_KERNEL);
+	if (!p)
+		return NULL;
+	p->root.rb_node = NULL;
+@@ -325,6 +330,7 @@
+ void ext3_htree_free_dir_info(struct dir_private_info *p)
+ {
+	free_rb_tree_fname(&p->root);
++	ext3_iam_release_info((void *)p);
+	kfree(p);
+ }
+
diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5.series
index 1d9f2f5..f686f4c 100644
--- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5.series
+++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5.series
@@ -16,3 +16,8 @@ ext3-inode-version-2.6.18-vanilla.patch
 ext3-mmp-2.6.18-vanilla.patch
 ext3-unlink-race.patch
 ext3-statfs-2.6-rhel5.patch
+ext3-dynlocks-common.patch
+ext3-dynlocks-2.6.18-vanilla.patch
+ext3-iam-common.patch
+ext3-iam-2.6.18-rhel5.patch
+ext3-orphans-delay.patch
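One detail of the dir.c hunk worth spelling out: create_dir_info() now obtains its struct dir_private_info from ext3_iam_alloc_info() and releases it through ext3_iam_release_info(), with bare (void *) casts in both directions. That only works if the readdir state is the leading part of the larger iam allocation, so a single filp->private_data pointer can be interpreted either way. The sketch below shows the shape of that trick; the layout is an assumption here (the real structures live in the iam patches earlier in the series), and every toy_* name is invented for illustration.

#include <stdio.h>
#include <stdlib.h>

struct toy_dir_info {			/* the "small" readdir state */
	long curr_pos;
};

struct toy_iam_info {			/* the "large" allocation */
	struct toy_dir_info dir;	/* assumed to come first, making
					 * the casts below legal */
	int iam_state;
};

static struct toy_iam_info *toy_iam_alloc_info(void)
{
	return calloc(1, sizeof(struct toy_iam_info));
}

static struct toy_dir_info *toy_create_dir_info(void)
{
	/* same shape as: p = (void *)ext3_iam_alloc_info(GFP_KERNEL); */
	return (struct toy_dir_info *)toy_iam_alloc_info();
}

int main(void)
{
	struct toy_dir_info *p = toy_create_dir_info();

	if (p == NULL)
		return 1;
	p->curr_pos = 42;
	printf("curr_pos=%ld\n", p->curr_pos);
	/* release through the larger type, as ext3_iam_release_info() does */
	free((struct toy_iam_info *)p);
	return 0;
}

The patched ext3_htree_free_dir_info() in the hunk above tears down both halves unconditionally — the rb-tree via free_rb_tree_fname() and the iam side via ext3_iam_release_info() — before the single kfree(), which is exactly what the shared-allocation layout makes safe.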