From 4ca3f861340c6f9daa803c7ae87dbf4514817e49 Mon Sep 17 00:00:00 2001 From: shadow Date: Thu, 15 Nov 2007 16:38:11 +0000 Subject: [PATCH] add support for sles10 kernels for iam code. b=12502 i=nikita i=johann --- .../patches/ext3-hash-selection-sles10.patch | 126 + .../kernel_patches/patches/ext3-iam-common.patch | 5395 ++++++++++++++++++++ .../kernel_patches/patches/ext3-iam-rhel4.patch | 2664 ++++++++++ .../kernel_patches/patches/ext3-iam-sles10.patch | 2657 ++++++++++ .../patches/ext3-tall-htree-sles10.patch | 432 ++ .../kernel_patches/series/ldiskfs-2.6-rhel4.series | 6 +- .../series/ldiskfs-2.6-sles10.series | 10 + 7 files changed, 11286 insertions(+), 4 deletions(-) create mode 100644 ldiskfs/kernel_patches/patches/ext3-hash-selection-sles10.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-iam-common.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-iam-rhel4.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-iam-sles10.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-tall-htree-sles10.patch diff --git a/ldiskfs/kernel_patches/patches/ext3-hash-selection-sles10.patch b/ldiskfs/kernel_patches/patches/ext3-hash-selection-sles10.patch new file mode 100644 index 0000000..8f724a2 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-hash-selection-sles10.patch @@ -0,0 +1,126 @@ +Index: linux-stage/fs/ext3/hash.c +=================================================================== +--- linux-stage.orig/fs/ext3/hash.c 2007-08-30 14:53:05.000000000 +0300 ++++ linux-stage/fs/ext3/hash.c 2007-08-30 14:58:29.000000000 +0300 +@@ -61,6 +61,11 @@ + return a; + } + ++static __u32 dx_same_hash(const signed char *msg, int len) ++{ ++ return 0xcafebabeUL; ++} ++ + static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) + { + __u32 pad, val; +@@ -154,6 +159,9 @@ + case DX_HASH_R5: + hash = dx_r5_hash(name, len); + break; ++ case DX_HASH_SAME: ++ hash = dx_same_hash(name, len); ++ break; + default: + hinfo->hash = 0; + return -1; +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2007-08-30 14:53:04.000000000 +0300 ++++ linux-stage/fs/ext3/super.c 2007-08-30 15:00:54.000000000 +0300 +@@ -691,7 +691,7 @@ + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_extents, Opt_noextents, Opt_extdebug, + Opt_mballoc, Opt_nomballoc, Opt_stripe, +- Opt_grpquota ++ Opt_grpquota, Opt_hashfunc + }; + + static match_table_t tokens = { +@@ -755,6 +755,7 @@ + {Opt_stripe, "stripe=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, ++ {Opt_hashfunc,"hash=%s"}, + }; + + static unsigned long get_sb_block(void **data) +@@ -777,6 +778,7 @@ + return sb_block; + } + ++int user_selected_hash_function = -1; + static int parse_options (char *options, struct super_block *sb, + unsigned long *inum, unsigned long *journal_devnum, + unsigned long *n_blocks_count, int is_remount) +@@ -1124,6 +1126,22 @@ + return 0; + sbi->s_stripe = option; + break; ++ case Opt_hashfunc: ++ if (strncmp (args[0].from,"legacy",6) == 0){ ++ user_selected_hash_function = 0; ++ } else if (strncmp (args[0].from,"half_md4",8) == 0){ ++ user_selected_hash_function = 1; ++ } else if (strncmp (args[0].from,"tea",3) == 0){ ++ user_selected_hash_function = 2; ++ } else if (strncmp (args[0].from,"r5",2) == 0){ ++ user_selected_hash_function = 3; ++ } else if (strncmp (args[0].from,"same",4) == 0){ ++ user_selected_hash_function = 4; ++ } else { ++ printk ("Hashfunc name wrong\n"); ++ return 0; ++ } ++ break; + 
default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +Index: linux-stage/fs/ext3/namei.c +=================================================================== +--- linux-stage.orig/fs/ext3/namei.c 2007-08-30 14:53:05.000000000 +0300 ++++ linux-stage/fs/ext3/namei.c 2007-08-30 14:58:29.000000000 +0300 +@@ -421,10 +421,7 @@ + struct htree_cookie *hc = cookie; + + root = data; +- if (root->info.hash_version != DX_HASH_TEA && +- root->info.hash_version != DX_HASH_HALF_MD4 && +- root->info.hash_version != DX_HASH_R5 && +- root->info.hash_version != DX_HASH_LEGACY) { ++ if (root->info.hash_version > DX_HASH_MAX) { + ext3_warning(sb, __FUNCTION__, + "Unrecognised inode hash code %d", + root->info.hash_version); +@@ -1573,6 +1570,7 @@ + * This converts a one block unindexed directory to a 3 block indexed + * directory, and adds the dentry to the indexed directory. + */ ++extern int user_selected_hash_function; + static int make_indexed_dir(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct buffer_head *bh) + { +@@ -1628,7 +1626,9 @@ + memset (&root->info, 0, sizeof(root->info)); + root->info.info_length = sizeof(root->info); + root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; +- root->info.hash_version = DX_HASH_R5; ++ if (user_selected_hash_function >= 0 && ++ user_selected_hash_function <= DX_HASH_MAX) ++ root->info.hash_version = user_selected_hash_function; + entries = (void *)root->entries; + dx_set_block (path, entries, 1); + dx_set_count (entries, 1); +Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2007-08-30 14:53:05.000000000 +0300 ++++ linux-stage/include/linux/ext3_fs.h 2007-08-30 14:58:29.000000000 +0300 +@@ -809,6 +809,8 @@ + #define DX_HASH_HALF_MD4 1 + #define DX_HASH_TEA 2 + #define DX_HASH_R5 3 ++#define DX_HASH_SAME 4 ++#define DX_HASH_MAX 4 + + /* hash info structure used by the directory hash */ + struct dx_hash_info diff --git a/ldiskfs/kernel_patches/patches/ext3-iam-common.patch b/ldiskfs/kernel_patches/patches/ext3-iam-common.patch new file mode 100644 index 0000000..f90409d --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-iam-common.patch @@ -0,0 +1,5395 @@ +Index: linux-stage/fs/ext3/iam_lvar.c +=================================================================== +--- linux-stage.orig/fs/ext3/iam_lvar.c 2006-06-16 16:07:58.000000000 +0300 ++++ linux-stage/fs/ext3/iam_lvar.c 2007-10-21 17:32:18.000000000 +0300 +@@ -0,0 +1,1077 @@ ++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- ++ * vim:expandtab:shiftwidth=8:tabstop=8: ++ * ++ * iam_lvar.c ++ * implementation of iam format for fixed size records, variable sized keys. ++ * ++ * Copyright (c) 2006 Cluster File Systems, Inc. ++ * Author: Nikita Danilov ++ * ++ * This file is part of the Lustre file system, http://www.lustre.org ++ * Lustre is a trademark of Cluster File Systems, Inc. ++ * ++ * You may have signed or agreed to another license before downloading ++ * this software. If so, you are bound by the terms and conditions ++ * of that agreement, and the following does not apply to you. See the ++ * LICENSE file included with this distribution for more information. ++ * ++ * If you did not agree to a different license, then this copy of Lustre ++ * is open source software; you can redistribute it and/or modify it ++ * under the terms of version 2 of the GNU General Public License as ++ * published by the Free Software Foundation. 
++ * ++ * In either case, Lustre is distributed in the hope that it will be ++ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty ++ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * license text for more details. ++ */ ++ ++#include ++#include ++/* ext3_error() */ ++#include ++ ++#include ++ ++/* ++ * Leaf operations. ++ */ ++ ++enum { ++ IAM_LVAR_LEAF_MAGIC = 0x1973 /* This is duplicated in ++ * lustre/utils/create_iam.c */ ++}; ++ ++/* This is duplicated in lustre/utils/create_iam.c */ ++struct lvar_leaf_header { ++ __le16 vlh_magic; /* magic number IAM_LVAR_LEAF_MAGIC */ ++ __le16 vlh_used; /* used bytes, including header */ ++}; ++ ++/* ++ * Format of leaf entry: ++ * ++ * __le16 keysize ++ * u8 key[keysize] ++ * u8 record[rec_size] ++ * ++ * Entries are ordered in key order. ++ */ ++ ++/* This is duplicated in lustre/utils/create_iam.c */ ++typedef __u32 lvar_hash_t; ++ ++/* This is duplicated in lustre/utils/create_iam.c */ ++struct lvar_leaf_entry { ++ __le32 vle_hash; ++ __le16 vle_keysize; ++ u8 vle_key[0]; ++}; ++ ++#define PDIFF(ptr0, ptr1) (((char *)(ptr0)) - ((char *)(ptr1))) ++ ++ ++static inline int blocksize(const struct iam_leaf *leaf) ++{ ++ return iam_leaf_container(leaf)->ic_object->i_sb->s_blocksize; ++} ++ ++static inline const char *kchar(const struct iam_key *key) ++{ ++ return (void *)key; ++} ++ ++static inline struct iam_lentry *lvar_lentry(const struct lvar_leaf_entry *ent) ++{ ++ return (struct iam_lentry *)ent; ++} ++ ++static inline struct lvar_leaf_entry *lentry_lvar(const struct iam_lentry *lent) ++{ ++ return (struct lvar_leaf_entry *)lent; ++} ++ ++ ++static inline int e_keysize(const struct lvar_leaf_entry *ent) ++{ ++ return le16_to_cpu(ent->vle_keysize); ++} ++ ++/* This is duplicated in lustre/utils/create_iam.c */ ++enum { ++ LVAR_PAD = 4, ++ LVAR_ROUND = LVAR_PAD - 1 ++}; ++ ++static inline int getsize(const struct iam_leaf *leaf, int namelen, int recsize) ++{ ++ CLASSERT(!(LVAR_PAD & (LVAR_PAD - 1))); ++ ++ return (offsetof(struct lvar_leaf_entry, vle_key) + ++ namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND; ++} ++ ++static inline int rec_size(const struct iam_rec *rec) ++{ ++ return *(const char *)rec; ++} ++ ++static inline struct iam_rec *e_rec(const struct lvar_leaf_entry *ent) ++{ ++ return ((void *)ent) + ++ offsetof(struct lvar_leaf_entry, vle_key) + e_keysize(ent); ++} ++ ++static inline int e_size(const struct iam_leaf *leaf, ++ const struct lvar_leaf_entry *ent) ++{ ++ return getsize(leaf, e_keysize(ent), rec_size(e_rec(ent))); ++} ++ ++static inline char *e_char(const struct lvar_leaf_entry *ent) ++{ ++ return (char *)&ent->vle_key; ++} ++ ++static inline struct iam_key *e_key(const struct lvar_leaf_entry *ent) ++{ ++ return (struct iam_key *)e_char(ent); ++} ++ ++static inline lvar_hash_t e_hash(const struct lvar_leaf_entry *ent) ++{ ++ return le32_to_cpu(ent->vle_hash); ++} ++ ++static void e_print(const struct lvar_leaf_entry *ent) ++{ ++ printk(" %p %8.8x \"%*.*s\"\n", ent, e_hash(ent), ++ e_keysize(ent), e_keysize(ent), e_char(ent)); ++} ++#if 0 ++static int e_check(const struct iam_leaf *leaf, ++ const struct lvar_leaf_entry *ent) ++{ ++ const void *point = ent; ++ const void *start = leaf->il_bh->b_data; ++ return ++ start + sizeof(struct lvar_leaf_header) <= point && ++ point + e_size(leaf, ent) < start + blocksize(leaf); ++} ++#endif ++ ++static inline struct lvar_leaf_entry *e_next(const struct iam_leaf *leaf, ++ const struct lvar_leaf_entry *ent) ++{ ++ return ((void *)ent) + 
e_size(leaf, ent); ++} ++ ++#define LVAR_HASH_SANDWICH (0) ++#define LVAR_HASH_TEA (1) ++#define LVAR_HASH_R5 (0) ++#define LVAR_HASH_PREFIX (0) ++ ++static __u32 hash_build0(const char *name, int namelen) ++{ ++ __u32 result; ++ ++ if (namelen == 0) ++ return 0; ++ if (strncmp(name, ".", 1) == 0 && namelen == 1) ++ return 1; ++ if (strncmp(name, "..", 2) == 0 && namelen == 2) ++ return 2; ++ ++ if (LVAR_HASH_PREFIX) { ++ result = 0; ++ strncpy((void *)&result, ++ name, min(namelen, (int)sizeof result)); ++ } else { ++ struct dx_hash_info hinfo; ++ ++ if (LVAR_HASH_TEA) ++ hinfo.hash_version = DX_HASH_TEA; ++ else ++ hinfo.hash_version = DX_HASH_R5; ++ hinfo.seed = 0; ++ ext3fs_dirhash(name, namelen, &hinfo); ++ result = hinfo.hash; ++ if (LVAR_HASH_SANDWICH) { ++ __u32 result2; ++ ++ hinfo.hash_version = DX_HASH_TEA; ++ hinfo.seed = 0; ++ ext3fs_dirhash(name, namelen, &hinfo); ++ result2 = hinfo.hash; ++ result = (0xfc000000 & result2) | (0x03ffffff & result); ++ } ++ } ++ return result; ++} ++ ++enum { ++ HASH_GRAY_AREA = 1024, ++ MAX_HASH_SIZE = 0x7fffffffUL ++}; ++ ++static __u32 hash_build(const char *name, int namelen) ++{ ++ __u32 hash; ++ ++ hash = (hash_build0(name, namelen) << 1) & MAX_HASH_SIZE; ++ if (hash > MAX_HASH_SIZE - HASH_GRAY_AREA) ++ hash &= HASH_GRAY_AREA - 1; ++ return hash; ++} ++ ++static inline lvar_hash_t get_hash(const struct iam_container *bag, ++ const char *name, int namelen) ++{ ++ return hash_build(name, namelen); ++} ++ ++static inline int e_eq(const struct lvar_leaf_entry *ent, ++ const char *name, int namelen) ++{ ++ return namelen == e_keysize(ent) && !memcmp(e_char(ent), name, namelen); ++} ++ ++static inline int e_cmp(const struct iam_leaf *leaf, ++ const struct lvar_leaf_entry *ent, lvar_hash_t hash) ++{ ++ lvar_hash_t ehash; ++ ++ ehash = e_hash(ent); ++ return ehash == hash ? 0 : (ehash < hash ? 
-1 : +1); ++} ++ ++static struct lvar_leaf_header *n_head(const struct iam_leaf *l) ++{ ++ return (struct lvar_leaf_header *)l->il_bh->b_data; ++} ++ ++static int h_used(const struct lvar_leaf_header *hdr) ++{ ++ return le16_to_cpu(hdr->vlh_used); ++} ++ ++static void h_used_adj(const struct iam_leaf *leaf, ++ struct lvar_leaf_header *hdr, int adj) ++{ ++ int used; ++ ++ used = h_used(hdr) + adj; ++ assert_corr(sizeof *hdr <= used && used <= blocksize(leaf)); ++ hdr->vlh_used = cpu_to_le16(used); ++} ++ ++static struct lvar_leaf_entry *n_start(const struct iam_leaf *leaf) ++{ ++ return (void *)leaf->il_bh->b_data + sizeof(struct lvar_leaf_header); ++} ++ ++static struct lvar_leaf_entry *n_end(const struct iam_leaf *l) ++{ ++ return (void *)l->il_bh->b_data + h_used(n_head(l)); ++} ++ ++static struct lvar_leaf_entry *n_cur(const struct iam_leaf *l) ++{ ++ return lentry_lvar(l->il_at); ++} ++ ++void n_print(const struct iam_leaf *l) ++{ ++ struct lvar_leaf_entry *scan; ++ ++ printk(KERN_EMERG "used: %d\n", h_used(n_head(l))); ++ for (scan = n_start(l); scan < n_end(l); scan = e_next(l, scan)) ++ e_print(scan); ++} ++ ++#if EXT3_CORRECTNESS_ON ++static int n_at_rec(const struct iam_leaf *folio) ++{ ++ return ++ n_start(folio) <= lentry_lvar(folio->il_at) && ++ lentry_lvar(folio->il_at) < n_end(folio); ++} ++ ++#if EXT3_INVARIANT_ON ++static int n_invariant(const struct iam_leaf *leaf) ++{ ++ struct iam_path *path; ++ struct lvar_leaf_entry *scan; ++ struct lvar_leaf_entry *end; ++ lvar_hash_t hash; ++ lvar_hash_t nexthash; ++ lvar_hash_t starthash; ++ ++ end = n_end(leaf); ++ hash = 0; ++ path = leaf->il_path; ++ ++ if (h_used(n_head(leaf)) > blocksize(leaf)) ++ return 0; ++ ++ /* ++ * Delimiting key in the parent index node. Clear least bit to account ++ * for hash collision marker. ++ */ ++ starthash = *(lvar_hash_t *)iam_ikey_at(path, path->ip_frame->at) & ~1; ++ for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) { ++ nexthash = e_hash(scan); ++ if (nexthash != get_hash(iam_leaf_container(leaf), ++ e_char(scan), e_keysize(scan))) { ++ BREAKPOINT(); ++ return 0; ++ } ++ if (0 && nexthash < starthash) { ++ /* ++ * Unfortunately this useful invariant cannot be ++ * reliably checked as parent node is nor necessarily ++ * locked. 
++ */ ++ n_print(leaf); ++ printk("%#x < %#x\n", nexthash, starthash); ++ dump_stack(); ++ return 0; ++ } ++ if (nexthash < hash) { ++ BREAKPOINT(); ++ return 0; ++ } ++ hash = nexthash; ++ } ++ if (scan != end) { ++ BREAKPOINT(); ++ return 0; ++ } ++ return 1; ++} ++/* EXT3_INVARIANT_ON */ ++#endif ++ ++/* EXT3_CORRECTNESS_ON */ ++#endif ++ ++static struct iam_ikey *lvar_ikey(const struct iam_leaf *l, ++ struct iam_ikey *key) ++{ ++ lvar_hash_t *hash; ++ ++ assert_corr(n_at_rec(l)); ++ ++ hash = (void *)key; ++ *hash = e_hash(n_cur(l)); ++ return key; ++} ++ ++static struct iam_key *lvar_key(const struct iam_leaf *l) ++{ ++ return e_key(n_cur(l)); ++} ++ ++static int lvar_key_size(const struct iam_leaf *l) ++{ ++ return e_keysize(n_cur(l)); ++} ++ ++static void lvar_start(struct iam_leaf *l) ++{ ++ l->il_at = lvar_lentry(n_start(l)); ++} ++ ++static int lvar_init(struct iam_leaf *l) ++{ ++ int result; ++ int used; ++ struct lvar_leaf_header *head; ++ ++ assert_corr(l->il_bh != NULL); ++ ++ head = n_head(l); ++ used = h_used(head); ++ if (head->vlh_magic == le16_to_cpu(IAM_LVAR_LEAF_MAGIC) && ++ used <= blocksize(l)) { ++ l->il_at = l->il_entries = lvar_lentry(n_start(l)); ++ result = 0; ++ } else { ++ struct inode *obj; ++ ++ obj = iam_leaf_container(l)->ic_object; ++ ext3_error(obj->i_sb, __FUNCTION__, ++ "Wrong magic in node %llu (#%lu): %#x != %#x or " ++ "wrong used: %i", ++ (unsigned long long)l->il_bh->b_blocknr, obj->i_ino, ++ head->vlh_magic, le16_to_cpu(IAM_LVAR_LEAF_MAGIC), ++ used); ++ result = -EIO; ++ } ++ return result; ++} ++ ++static void lvar_fini(struct iam_leaf *l) ++{ ++ l->il_entries = l->il_at = NULL; ++} ++ ++struct iam_rec *lvar_rec(const struct iam_leaf *l) ++{ ++ assert_corr(n_at_rec(l)); ++ return e_rec(n_cur(l)); ++} ++ ++static void lvar_next(struct iam_leaf *l) ++{ ++ assert_corr(n_at_rec(l)); ++ assert_corr(iam_leaf_is_locked(l)); ++ l->il_at = lvar_lentry(e_next(l, n_cur(l))); ++} ++ ++static int lvar_lookup(struct iam_leaf *leaf, const struct iam_key *k) ++{ ++ struct lvar_leaf_entry *found; ++ struct lvar_leaf_entry *scan; ++ struct lvar_leaf_entry *end; ++ int result; ++ const char *name; ++ int namelen; ++ int found_equal; ++ lvar_hash_t hash; ++ int last; ++ ++ assert_inv(n_invariant(leaf)); ++ end = n_end(leaf); ++ ++ name = kchar(k); ++ namelen = strlen(name); ++ hash = get_hash(iam_leaf_container(leaf), name, namelen); ++ found = NULL; ++ found_equal = 0; ++ last = 1; ++ ++ for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) { ++ lvar_hash_t scan_hash; ++ ++ scan_hash = e_hash(scan); ++ if (scan_hash < hash) ++ found = scan; ++ else if (scan_hash == hash) { ++ if (e_eq(scan, name, namelen)) { ++ /* ++ * perfect match ++ */ ++ leaf->il_at = lvar_lentry(scan); ++ return IAM_LOOKUP_EXACT; ++ } else if (!found_equal) { ++ found = scan; ++ found_equal = 1; ++ } ++ } else { ++ last = 0; ++ break; ++ } ++ } ++ if (found == NULL) { ++ /* ++ * @k is less than all hashes in the leaf. 
++ */ ++ lvar_start(leaf); ++ result = IAM_LOOKUP_BEFORE; ++ } else { ++ leaf->il_at = lvar_lentry(found); ++ result = IAM_LOOKUP_OK; ++ assert_corr(n_at_rec(leaf)); ++ } ++ if (last) ++ result |= IAM_LOOKUP_LAST; ++ assert_inv(n_invariant(leaf)); ++ ++ return result; ++} ++ ++static int lvar_ilookup(struct iam_leaf *leaf, const struct iam_ikey *ik) ++{ ++ struct lvar_leaf_entry *scan; ++ struct lvar_leaf_entry *end; ++ lvar_hash_t hash; ++ ++ assert_inv(n_invariant(leaf)); ++ end = n_end(leaf); ++ hash = *(const lvar_hash_t *)ik; ++ ++ lvar_start(leaf); ++ for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) { ++ lvar_hash_t scan_hash; ++ ++ scan_hash = e_hash(scan); ++ if (scan_hash > hash) ++ return scan == n_start(leaf) ? ++ IAM_LOOKUP_BEFORE : IAM_LOOKUP_OK; ++ leaf->il_at = lvar_lentry(scan); ++ if (scan_hash == hash) ++ return IAM_LOOKUP_EXACT; ++ } ++ assert_inv(n_invariant(leaf)); ++ /* ++ * @ik is greater than any key in the node. Return last record in the ++ * node. ++ */ ++ return IAM_LOOKUP_OK; ++} ++ ++static void __lvar_key_set(struct iam_leaf *l, const struct iam_key *k) ++{ ++ memcpy(e_key(n_cur(l)), k, e_keysize(n_cur(l))); ++} ++ ++static void lvar_key_set(struct iam_leaf *l, const struct iam_key *k) ++{ ++ assert_corr(n_at_rec(l)); ++ assert_corr(strlen(kchar(k)) == e_keysize(n_cur(l))); ++ assert_corr(iam_leaf_is_locked(l)); ++ __lvar_key_set(l, k); ++ assert_inv(n_invariant(l)); ++} ++ ++static int lvar_key_cmp(const struct iam_leaf *l, const struct iam_key *k) ++{ ++ lvar_hash_t hash; ++ const char *name; ++ ++ name = kchar(k); ++ ++ hash = get_hash(iam_leaf_container(l), name, strlen(name)); ++ return e_cmp(l, n_cur(l), hash); ++} ++ ++static int lvar_key_eq(const struct iam_leaf *l, const struct iam_key *k) ++{ ++ const char *name; ++ ++ name = kchar(k); ++ return e_eq(n_cur(l), name, strlen(name)); ++} ++ ++static void __lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r) ++{ ++ memcpy(e_rec(n_cur(l)), r, rec_size(r)); ++} ++ ++static void lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r) ++{ ++ assert_corr(n_at_rec(l)); ++ assert_corr(iam_leaf_is_locked(l)); ++ __lvar_rec_set(l, r); ++ assert_inv(n_invariant(l)); ++} ++ ++static void lvar_rec_get(const struct iam_leaf *l, struct iam_rec *r) ++{ ++ struct iam_rec *rec; ++ ++ rec = e_rec(n_cur(l)); ++ assert_corr(n_at_rec(l)); ++ assert_corr(iam_leaf_is_locked(l)); ++ memcpy(r, rec, rec_size(rec)); ++ assert_inv(n_invariant(l)); ++} ++ ++static int lvar_can_add(const struct iam_leaf *l, ++ const struct iam_key *k, const struct iam_rec *r) ++{ ++ assert_corr(iam_leaf_is_locked(l)); ++ return ++ h_used(n_head(l)) + ++ getsize(l, strlen(kchar(k)), rec_size(r)) <= blocksize(l); ++} ++ ++static int lvar_at_end(const struct iam_leaf *folio) ++{ ++ assert_corr(iam_leaf_is_locked(folio)); ++ return n_cur(folio) == n_end(folio); ++} ++ ++static void lvar_rec_add(struct iam_leaf *leaf, ++ const struct iam_key *k, const struct iam_rec *r) ++{ ++ const char *key; ++ int ksize; ++ int shift; ++ void *end; ++ void *start; ++ ptrdiff_t diff; ++ ++ assert_corr(lvar_can_add(leaf, k, r)); ++ assert_inv(n_invariant(leaf)); ++ assert_corr(iam_leaf_is_locked(leaf)); ++ ++ key = kchar(k); ++ ksize = strlen(key); ++ shift = getsize(leaf, ksize, rec_size(r)); ++ ++ if (!lvar_at_end(leaf)) { ++ assert_corr(n_cur(leaf) < n_end(leaf)); ++ end = n_end(leaf); ++ if (lvar_key_cmp(leaf, k) <= 0) ++ lvar_next(leaf); ++ else ++ /* ++ * Another exceptional case: insertion with the key ++ * less than least key in the leaf. 
++ */ ++ assert_corr(leaf->il_at == leaf->il_entries); ++ ++ start = leaf->il_at; ++ diff = PDIFF(end, start); ++ assert_corr(diff >= 0); ++ memmove(start + shift, start, diff); ++ } ++ h_used_adj(leaf, n_head(leaf), shift); ++ n_cur(leaf)->vle_keysize = cpu_to_le16(ksize); ++ n_cur(leaf)->vle_hash = cpu_to_le32(get_hash(iam_leaf_container(leaf), ++ key, ksize)); ++ __lvar_key_set(leaf, k); ++ __lvar_rec_set(leaf, r); ++ assert_corr(n_at_rec(leaf)); ++ assert_inv(n_invariant(leaf)); ++} ++ ++static void lvar_rec_del(struct iam_leaf *leaf, int shift) ++{ ++ void *next; ++ void *end; ++ int nob; ++ ++ assert_corr(n_at_rec(leaf)); ++ assert_inv(n_invariant(leaf)); ++ assert_corr(iam_leaf_is_locked(leaf)); ++ ++ end = n_end(leaf); ++ next = e_next(leaf, n_cur(leaf)); ++ nob = e_size(leaf, n_cur(leaf)); ++ memmove(leaf->il_at, next, end - next); ++ h_used_adj(leaf, n_head(leaf), -nob); ++ assert_inv(n_invariant(leaf)); ++} ++ ++static void lvar_init_new(struct iam_container *c, struct buffer_head *bh) ++{ ++ struct lvar_leaf_header *hdr; ++ ++ hdr = (struct lvar_leaf_header *)bh->b_data; ++ hdr->vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC); ++ hdr->vlh_used = sizeof *hdr; ++} ++ ++static struct lvar_leaf_entry *find_pivot(const struct iam_leaf *leaf, ++ struct lvar_leaf_entry **prev) ++{ ++ void *scan; ++ void *start; ++ int threshold; ++ ++ *prev = NULL; ++ threshold = blocksize(leaf) / 2; ++ for (scan = start = n_start(leaf); scan - start <= threshold; ++ *prev = scan, scan = e_next(leaf, scan)) { ++ ; ++ } ++ return scan; ++} ++ ++static void lvar_split(struct iam_leaf *leaf, struct buffer_head **bh, ++ iam_ptr_t new_blknr) ++{ ++ struct lvar_leaf_entry *first_to_move; ++ struct lvar_leaf_entry *last_to_stay; ++ struct iam_path *path; ++ struct lvar_leaf_header *hdr; ++ struct buffer_head *new_leaf; ++ ++ ptrdiff_t tomove; ++ lvar_hash_t hash; ++ ++ assert_inv(n_invariant(leaf)); ++ assert_corr(iam_leaf_is_locked(leaf)); ++ ++ new_leaf = *bh; ++ path = iam_leaf_path(leaf); ++ ++ hdr = (void *)new_leaf->b_data; ++ ++ first_to_move = find_pivot(leaf, &last_to_stay); ++ assert_corr(last_to_stay != NULL); ++ assert_corr(e_next(leaf, last_to_stay) == first_to_move); ++ ++ hash = e_hash(first_to_move); ++ if (hash == e_hash(last_to_stay)) ++ /* ++ * Duplicate hash. ++ */ ++ hash |= 1; ++ ++ tomove = PDIFF(n_end(leaf), first_to_move); ++ memmove(hdr + 1, first_to_move, tomove); ++ ++ h_used_adj(leaf, hdr, tomove); ++ h_used_adj(leaf, n_head(leaf), -tomove); ++ ++ assert_corr(n_end(leaf) == first_to_move); ++ ++ if (n_cur(leaf) >= first_to_move) { ++ /* ++ * insertion point moves into new leaf. ++ */ ++ ptrdiff_t shift; ++ int result; ++ ++ shift = PDIFF(leaf->il_at, first_to_move); ++ *bh = leaf->il_bh; ++ leaf->il_bh = new_leaf; ++ leaf->il_curidx = new_blknr; ++ ++ assert_corr(iam_leaf_is_locked(leaf)); ++ result = lvar_init(leaf); ++ /* ++ * init cannot fail, as node was just initialized. ++ */ ++ assert_corr(result == 0); ++ leaf->il_at = ((void *)leaf->il_at) + shift; ++ } ++ /* ++ * Insert pointer to the new node (together with the least key in ++ * the node) into index node. 
++ */ ++ iam_insert_key_lock(path, path->ip_frame, (struct iam_ikey *)&hash, ++ new_blknr); ++ assert_corr(n_cur(leaf) < n_end(leaf)); ++ assert_inv(n_invariant(leaf)); ++} ++ ++static struct iam_leaf_operations lvar_leaf_ops = { ++ .init = lvar_init, ++ .init_new = lvar_init_new, ++ .fini = lvar_fini, ++ .start = lvar_start, ++ .next = lvar_next, ++ .key = lvar_key, ++ .ikey = lvar_ikey, ++ .rec = lvar_rec, ++ .key_set = lvar_key_set, ++ .key_cmp = lvar_key_cmp, ++ .key_eq = lvar_key_eq, ++ .key_size = lvar_key_size, ++ .rec_set = lvar_rec_set, ++ .rec_get = lvar_rec_get, ++ .lookup = lvar_lookup, ++ .ilookup = lvar_ilookup, ++ .at_end = lvar_at_end, ++ .rec_add = lvar_rec_add, ++ .rec_del = lvar_rec_del, ++ .can_add = lvar_can_add, ++ .split = lvar_split ++}; ++ ++/* ++ * Index operations. ++ */ ++ ++enum { ++ /* This is duplicated in lustre/utils/create_iam.c */ ++ /* egrep -i '^o?x?[olabcdef]*$' /usr/share/dict/words */ ++ IAM_LVAR_ROOT_MAGIC = 0xb01dface ++}; ++ ++/* This is duplicated in lustre/utils/create_iam.c */ ++struct lvar_root { ++ __le32 vr_magic; ++ __le16 vr_recsize; ++ __le16 vr_ptrsize; ++ u8 vr_indirect_levels; ++ u8 vr_padding0; ++ __le16 vr_padding1; ++}; ++ ++static __u32 lvar_root_ptr(struct iam_container *c) ++{ ++ return 0; ++} ++ ++static int lvar_node_init(struct iam_container *c, struct buffer_head *bh, ++ int root) ++{ ++ return 0; ++} ++ ++static struct iam_entry *lvar_root_inc(struct iam_container *c, ++ struct iam_path *path, ++ struct iam_frame *frame) ++{ ++ struct lvar_root *root; ++ struct iam_entry *entries; ++ ++ assert_corr(iam_frame_is_locked(path, frame)); ++ entries = frame->entries; ++ ++ dx_set_count(entries, 2); ++ assert_corr(dx_get_limit(entries) == dx_root_limit(path)); ++ ++ root = (void *)frame->bh->b_data; ++ assert_corr(le64_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC); ++ root->vr_indirect_levels ++; ++ frame->at = entries = iam_entry_shift(path, entries, 1); ++ memset(iam_ikey_at(path, entries), 0, ++ iam_path_descr(path)->id_ikey_size); ++ return entries; ++} ++ ++static int lvar_node_check(struct iam_path *path, struct iam_frame *frame) ++{ ++ unsigned count; ++ unsigned limit; ++ unsigned limit_correct; ++ struct iam_entry *entries; ++ ++ entries = dx_node_get_entries(path, frame); ++ ++ if (frame == path->ip_frames) { ++ struct lvar_root *root; ++ ++ root = (void *)frame->bh->b_data; ++ if (le64_to_cpu(root->vr_magic) != IAM_LVAR_ROOT_MAGIC) ++ return -EIO; ++ limit_correct = dx_root_limit(path); ++ } else ++ limit_correct = dx_node_limit(path); ++ count = dx_get_count(entries); ++ limit = dx_get_limit(entries); ++ if (count > limit) ++ return -EIO; ++ if (limit != limit_correct) ++ return -EIO; ++ return 0; ++} ++ ++static int lvar_node_load(struct iam_path *path, struct iam_frame *frame) ++{ ++ struct iam_entry *entries; ++ void *data; ++ entries = dx_node_get_entries(path, frame); ++ ++ data = frame->bh->b_data; ++ ++ if (frame == path->ip_frames) { ++ struct lvar_root *root; ++ const char *name; ++ ++ root = data; ++ name = kchar(path->ip_key_target); ++ path->ip_indirect = root->vr_indirect_levels; ++ if (path->ip_ikey_target == NULL) { ++ path->ip_ikey_target = iam_path_ikey(path, 4); ++ *(lvar_hash_t *)path->ip_ikey_target = ++ get_hash(path->ip_container, name, ++ strlen(name)); ++ } ++ } ++ frame->entries = frame->at = entries; ++ return 0; ++} ++ ++static int lvar_ikeycmp(const struct iam_container *c, ++ const struct iam_ikey *k1, const struct iam_ikey *k2) ++{ ++ lvar_hash_t p1 = le32_to_cpu(*(lvar_hash_t *)k1); ++ 
lvar_hash_t p2 = le32_to_cpu(*(lvar_hash_t *)k2); ++ ++ return p1 > p2 ? +1 : (p1 < p2 ? -1 : 0); ++} ++ ++static struct iam_path_descr *lvar_ipd_alloc(const struct iam_container *c, ++ void *area) ++{ ++ return iam_ipd_alloc(area, c->ic_descr->id_ikey_size); ++} ++ ++static int root_limit(int rootgap, int blocksize, int size) ++{ ++ int limit; ++ int nlimit; ++ ++ limit = (blocksize - rootgap) / size; ++ nlimit = blocksize / size; ++ if (limit == nlimit) ++ limit--; ++ return limit; ++} ++ ++static int lvar_root_limit(int blocksize, int size) ++{ ++ return root_limit(sizeof(struct lvar_root), blocksize, size); ++} ++ ++static void lvar_root(void *buf, ++ int blocksize, int keysize, int ptrsize, int recsize) ++{ ++ struct lvar_root *root; ++ struct dx_countlimit *limit; ++ void *entry; ++ int isize; ++ ++ isize = sizeof(lvar_hash_t) + ptrsize; ++ root = buf; ++ *root = (typeof(*root)) { ++ .vr_magic = cpu_to_le32(IAM_LVAR_ROOT_MAGIC), ++ .vr_recsize = cpu_to_le16(recsize), ++ .vr_ptrsize = cpu_to_le16(ptrsize), ++ .vr_indirect_levels = 0 ++ }; ++ ++ limit = (void *)(root + 1); ++ *limit = (typeof(*limit)){ ++ /* ++ * limit itself + one pointer to the leaf. ++ */ ++ .count = cpu_to_le16(2), ++ .limit = lvar_root_limit(blocksize, ++ sizeof (lvar_hash_t) + ptrsize) ++ }; ++ ++ entry = root + 1; ++ /* ++ * Skip over @limit. ++ */ ++ entry += isize; ++ ++ /* ++ * Entry format is followed by . In the minimal tree ++ * consisting of a root and single node, is a minimal possible ++ * key. ++ */ ++ *(lvar_hash_t *)entry = 0; ++ entry += sizeof(lvar_hash_t); ++ /* now @entry points to */ ++ if (ptrsize == 4) ++ *(u_int32_t *)entry = cpu_to_le32(1); ++ else ++ *(u_int64_t *)entry = cpu_to_le64(1); ++} ++ ++static int lvar_esize(int namelen, int recsize) ++{ ++ return (offsetof(struct lvar_leaf_entry, vle_key) + ++ namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND; ++} ++ ++static void lvar_leaf(void *buf, ++ int blocksize, int keysize, int ptrsize, int recsize) ++{ ++ struct lvar_leaf_header *head; ++ struct lvar_leaf_entry *entry; ++ ++ /* form leaf */ ++ head = buf; ++ *head = (typeof(*head)) { ++ .vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC), ++ .vlh_used = cpu_to_le16(sizeof *head + lvar_esize(0, recsize)) ++ }; ++ entry = (void *)(head + 1); ++ *entry = (typeof(*entry)) { ++ .vle_hash = 0, ++ .vle_keysize = 0 ++ }; ++ memset(e_rec(entry), 0, recsize); ++ *(char *)e_rec(entry) = recsize; ++} ++ ++#include ++#include ++#include ++ ++int iam_lvar_create(struct inode *obj, ++ int keysize, int ptrsize, int recsize, handle_t *handle) ++{ ++ struct buffer_head *root_node; ++ struct buffer_head *leaf_node; ++ struct super_block *sb; ++ ++ u32 blknr; ++ int result; ++ unsigned long bsize; ++ ++ assert_corr(obj->i_size == 0); ++ ++ sb = obj->i_sb; ++ bsize = sb->s_blocksize; ++ root_node = ext3_append(handle, obj, &blknr, &result); ++ leaf_node = ext3_append(handle, obj, &blknr, &result); ++ if (root_node != NULL && leaf_node != NULL) { ++ lvar_root(root_node->b_data, bsize, keysize, ptrsize, recsize); ++ lvar_leaf(leaf_node->b_data, bsize, keysize, ptrsize, recsize); ++ ext3_mark_inode_dirty(handle, obj); ++ result = ext3_journal_dirty_metadata(handle, root_node); ++ if (result == 0) ++ result = ext3_journal_dirty_metadata(handle, leaf_node); ++ if (result != 0) ++ ext3_std_error(sb, result); ++ } ++ brelse(leaf_node); ++ brelse(root_node); ++ return result; ++} ++EXPORT_SYMBOL(iam_lvar_create); ++ ++static struct iam_operations lvar_ops = { ++ .id_root_ptr = lvar_root_ptr, ++ .id_node_read = iam_node_read, 
++ .id_node_init = lvar_node_init, ++ .id_node_check = lvar_node_check, ++ .id_node_load = lvar_node_load, ++ .id_ikeycmp = lvar_ikeycmp, ++ .id_root_inc = lvar_root_inc, ++ .id_ipd_alloc = lvar_ipd_alloc, ++ .id_ipd_free = iam_ipd_free, ++ .id_name = "lvar" ++}; ++ ++static int lvar_guess(struct iam_container *c) ++{ ++ int result; ++ struct buffer_head *bh; ++ const struct lvar_root *root; ++ ++ assert_corr(c->ic_object != NULL); ++ ++ result = iam_node_read(c, lvar_root_ptr(c), NULL, &bh); ++ if (result == 0) { ++ root = (void *)bh->b_data; ++ if (le64_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC) { ++ struct iam_descr *descr; ++ ++ descr = c->ic_descr; ++ descr->id_key_size = EXT3_NAME_LEN; ++ descr->id_ikey_size = sizeof (lvar_hash_t); ++ descr->id_rec_size = le16_to_cpu(root->vr_recsize); ++ descr->id_ptr_size = le16_to_cpu(root->vr_ptrsize); ++ descr->id_root_gap = sizeof *root; ++ descr->id_node_gap = 0; ++ descr->id_ops = &lvar_ops; ++ descr->id_leaf_ops = &lvar_leaf_ops; ++ } else ++ result = -EBADF; ++ brelse(bh); ++ } ++ return result; ++} ++ ++static struct iam_format lvar_format = { ++ .if_guess = lvar_guess ++}; ++ ++void iam_lvar_format_init(void) ++{ ++ iam_format_register(&lvar_format); ++} ++ +Index: linux-stage/fs/ext3/iam_lfix.c +=================================================================== +--- linux-stage.orig/fs/ext3/iam_lfix.c 2006-06-16 16:07:58.000000000 +0300 ++++ linux-stage/fs/ext3/iam_lfix.c 2007-10-21 17:32:18.000000000 +0300 +@@ -0,0 +1,732 @@ ++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- ++ * vim:expandtab:shiftwidth=8:tabstop=8: ++ * ++ * iam_lfix.c ++ * implementation of iam format for fixed size records. ++ * ++ * Copyright (c) 2006 Cluster File Systems, Inc. ++ * Author: Wang Di ++ * Author: Nikita Danilov ++ * ++ * This file is part of the Lustre file system, http://www.lustre.org ++ * Lustre is a trademark of Cluster File Systems, Inc. ++ * ++ * You may have signed or agreed to another license before downloading ++ * this software. If so, you are bound by the terms and conditions ++ * of that agreement, and the following does not apply to you. See the ++ * LICENSE file included with this distribution for more information. ++ * ++ * If you did not agree to a different license, then this copy of Lustre ++ * is open source software; you can redistribute it and/or modify it ++ * under the terms of version 2 of the GNU General Public License as ++ * published by the Free Software Foundation. ++ * ++ * In either case, Lustre is distributed in the hope that it will be ++ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty ++ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * license text for more details. ++ */ ++ ++#include ++#include ++/* ext3_error() */ ++#include ++ ++#include ++ ++/* ++ * Leaf operations. 
++ */ ++ ++enum { ++ IAM_LEAF_HEADER_MAGIC = 0x1976 /* This is duplicated in ++ * lustre/utils/create_iam.c */ ++}; ++ ++/* This is duplicated in lustre/utils/create_iam.c */ ++struct iam_leaf_head { ++ __le16 ill_magic; ++ __le16 ill_count; ++}; ++ ++static inline int iam_lfix_entry_size(const struct iam_leaf *l) ++{ ++ return iam_leaf_descr(l)->id_key_size + iam_leaf_descr(l)->id_rec_size; ++} ++ ++static inline struct iam_lentry * ++iam_lfix_shift(const struct iam_leaf *l, struct iam_lentry *entry, int shift) ++{ ++ return (void *)entry + shift * iam_lfix_entry_size(l); ++} ++ ++static inline struct iam_key *iam_leaf_key_at(struct iam_lentry *entry) ++{ ++ return (struct iam_key *)entry; ++} ++ ++static inline int lfix_keycmp(const struct iam_container *c, ++ const struct iam_key *k1, ++ const struct iam_key *k2) ++{ ++ return memcmp(k1, k2, c->ic_descr->id_key_size); ++} ++ ++static struct iam_leaf_head *iam_get_head(const struct iam_leaf *l) ++{ ++ return (struct iam_leaf_head *)l->il_bh->b_data; ++} ++ ++static struct iam_lentry *iam_entries(const struct buffer_head *bh) ++{ ++ return (void *)bh->b_data + sizeof(struct iam_leaf_head); ++} ++ ++static struct iam_lentry *iam_get_lentries(const struct iam_leaf *l) ++{ ++ return iam_entries(l->il_bh); ++} ++ ++static int leaf_count_limit(const struct iam_leaf *leaf) ++{ ++ int free_space; ++ ++ free_space = iam_leaf_container(leaf)->ic_object->i_sb->s_blocksize; ++ free_space -= sizeof(struct iam_leaf_head); ++ return free_space / iam_lfix_entry_size(leaf); ++} ++ ++static int lentry_count_get(const struct iam_leaf *leaf) ++{ ++ return le16_to_cpu(iam_get_head(leaf)->ill_count); ++} ++ ++static void lentry_count_set(struct iam_leaf *leaf, unsigned count) ++{ ++ assert_corr(0 <= count && count <= leaf_count_limit(leaf)); ++ iam_get_head(leaf)->ill_count = cpu_to_le16(count); ++} ++ ++static struct iam_lentry *iam_lfix_get_end(const struct iam_leaf *l); ++ ++#if EXT3_CORRECTNESS_ON || EXT3_INVARIANT_ON ++static int iam_leaf_at_rec(const struct iam_leaf *folio) ++{ ++ return ++ iam_get_lentries(folio) <= folio->il_at && ++ folio->il_at < iam_lfix_get_end(folio); ++} ++#endif ++ ++static struct iam_ikey *iam_lfix_ikey(const struct iam_leaf *l, ++ struct iam_ikey *key) ++{ ++ void *ie = l->il_at; ++ assert_corr(iam_leaf_at_rec(l)); ++ return (struct iam_ikey*)ie; ++} ++ ++static struct iam_key *iam_lfix_key(const struct iam_leaf *l) ++{ ++ void *ie = l->il_at; ++ assert_corr(iam_leaf_at_rec(l)); ++ return (struct iam_key*)ie; ++} ++ ++static int iam_lfix_key_size(const struct iam_leaf *l) ++{ ++ return iam_leaf_descr(l)->id_key_size; ++} ++ ++static void iam_lfix_start(struct iam_leaf *l) ++{ ++ l->il_at = iam_get_lentries(l); ++} ++ ++static inline ptrdiff_t iam_lfix_diff(const struct iam_leaf *l, ++ const struct iam_lentry *e1, ++ const struct iam_lentry *e2) ++{ ++ ptrdiff_t diff; ++ int esize; ++ ++ esize = iam_lfix_entry_size(l); ++ diff = (void *)e1 - (void *)e2; ++ assert_corr(diff / esize * esize == diff); ++ return diff / esize; ++} ++ ++static int iam_lfix_init(struct iam_leaf *l) ++{ ++ int result; ++ struct iam_leaf_head *ill; ++ int count; ++ ++ assert_corr(l->il_bh != NULL); ++ ++ ill = iam_get_head(l); ++ count = le16_to_cpu(ill->ill_count); ++ if (ill->ill_magic == le16_to_cpu(IAM_LEAF_HEADER_MAGIC) && ++ 0 <= count && count <= leaf_count_limit(l)) { ++ l->il_at = l->il_entries = iam_get_lentries(l); ++ result = 0; ++ } else { ++ struct inode *obj; ++ ++ obj = iam_leaf_container(l)->ic_object; ++ ext3_error(obj->i_sb, 
__FUNCTION__, ++ "Wrong magic in node %llu (#%lu): %#x != %#x or " ++ "wrong count: %i (%i)", ++ (unsigned long long)l->il_bh->b_blocknr, obj->i_ino, ++ ill->ill_magic, le16_to_cpu(IAM_LEAF_HEADER_MAGIC), ++ count, leaf_count_limit(l)); ++ result = -EIO; ++ } ++ return result; ++} ++ ++static void iam_lfix_fini(struct iam_leaf *l) ++{ ++ l->il_entries = l->il_at = NULL; ++} ++ ++static struct iam_lentry *iam_lfix_get_end(const struct iam_leaf *l) ++{ ++ int count = lentry_count_get(l); ++ struct iam_lentry *ile = iam_lfix_shift(l, l->il_entries, count); ++ ++ return ile; ++} ++ ++struct iam_rec *iam_lfix_rec(const struct iam_leaf *l) ++{ ++ void *e = l->il_at; ++ assert_corr(iam_leaf_at_rec(l)); ++ return e + iam_leaf_descr(l)->id_key_size; ++} ++ ++static void iam_lfix_next(struct iam_leaf *l) ++{ ++ assert_corr(iam_leaf_at_rec(l)); ++ l->il_at = iam_lfix_shift(l, l->il_at, 1); ++} ++ ++/* ++ * Bug chasing. ++ */ ++int lfix_dump = 0; ++EXPORT_SYMBOL(lfix_dump); ++ ++static char hdigit(char ch) ++{ ++ static char d[] = "0123456789abcdef"; ++ return d[ch & 0xf]; ++} ++ ++static char *hex(char ch, char *area) ++{ ++ area[0] = hdigit(ch >> 4); ++ area[1] = hdigit(ch); ++ area[2] = 0; ++ return area; ++} ++ ++static void l_print(struct iam_leaf *leaf, struct iam_lentry *entry) ++{ ++ int i; ++ char *area; ++ char h[3]; ++ ++ area = (char *)entry; ++ printk(KERN_EMERG "["); ++ for (i = iam_lfix_key_size(leaf); i > 0; --i, ++area) ++ printk("%s", hex(*area, h)); ++ printk("]-("); ++ for (i = iam_leaf_descr(leaf)->id_rec_size; i > 0; --i, ++area) ++ printk("%s", hex(*area, h)); ++ printk(")\n"); ++} ++ ++static void lfix_print(struct iam_leaf *leaf) ++{ ++ struct iam_lentry *entry; ++ int count; ++ int i; ++ ++ entry = leaf->il_entries; ++ count = lentry_count_get(leaf); ++ printk(KERN_EMERG "lfix: %p %p %d\n", leaf, leaf->il_at, count); ++ for (i = 0; i < count; ++i, entry = iam_lfix_shift(leaf, entry, 1)) ++ l_print(leaf, entry); ++} ++ ++static int iam_lfix_lookup(struct iam_leaf *l, const struct iam_key *k) ++{ ++ struct iam_lentry *p, *q, *m, *t; ++ struct iam_container *c; ++ int count; ++ int result; ++ ++ count = lentry_count_get(l); ++ if (count == 0) ++ return IAM_LOOKUP_EMPTY; ++ ++ result = IAM_LOOKUP_OK; ++ c = iam_leaf_container(l); ++ ++ p = l->il_entries; ++ q = iam_lfix_shift(l, p, count - 1); ++ if (lfix_keycmp(c, k, iam_leaf_key_at(p)) < 0) { ++ /* ++ * @k is less than the least key in the leaf ++ */ ++ l->il_at = p; ++ result = IAM_LOOKUP_BEFORE; ++ } else if (lfix_keycmp(c, iam_leaf_key_at(q), k) <= 0) { ++ l->il_at = q; ++ } else { ++ /* ++ * EWD1293 ++ */ ++ while (iam_lfix_shift(l, p, 1) != q) { ++ m = iam_lfix_shift(l, p, iam_lfix_diff(l, q, p) / 2); ++ assert_corr(p < m && m < q); ++ if (lfix_keycmp(c, iam_leaf_key_at(m), k) <= 0) ++ p = m; ++ else ++ q = m; ++ } ++ assert_corr(lfix_keycmp(c, iam_leaf_key_at(p), k) <= 0 && ++ lfix_keycmp(c, k, iam_leaf_key_at(q)) < 0); ++ /* ++ * skip over records with duplicate keys. 
++ */ ++ while (p > l->il_entries) { ++ t = iam_lfix_shift(l, p, -1); ++ if (lfix_keycmp(c, iam_leaf_key_at(t), k) == 0) ++ p = t; ++ else ++ break; ++ } ++ l->il_at = p; ++ } ++ assert_corr(iam_leaf_at_rec(l)); ++ ++ if (lfix_keycmp(c, iam_leaf_key_at(l->il_at), k) == 0) ++ result = IAM_LOOKUP_EXACT; ++ ++ if (lfix_dump) ++ lfix_print(l); ++ ++ return result; ++} ++ ++static int iam_lfix_ilookup(struct iam_leaf *l, const struct iam_ikey *ik) ++{ ++ assert(0); ++ return IAM_LOOKUP_OK; ++} ++ ++static void iam_lfix_key_set(struct iam_leaf *l, const struct iam_key *k) ++{ ++ assert_corr(iam_leaf_at_rec(l)); ++ memcpy(iam_leaf_key_at(l->il_at), k, iam_leaf_descr(l)->id_key_size); ++} ++ ++static int iam_lfix_key_cmp(const struct iam_leaf *l, const struct iam_key *k) ++{ ++ return lfix_keycmp(iam_leaf_container(l), iam_leaf_key_at(l->il_at), k); ++} ++ ++static int iam_lfix_key_eq(const struct iam_leaf *l, const struct iam_key *k) ++{ ++ return !lfix_keycmp(iam_leaf_container(l), ++ iam_leaf_key_at(l->il_at), k); ++} ++ ++static void iam_lfix_rec_set(struct iam_leaf *l, const struct iam_rec *r) ++{ ++ assert_corr(iam_leaf_at_rec(l)); ++ memcpy(iam_lfix_rec(l), r, iam_leaf_descr(l)->id_rec_size); ++} ++ ++static void iam_lfix_rec_get(const struct iam_leaf *l, struct iam_rec *r) ++{ ++ assert_corr(iam_leaf_at_rec(l)); ++ memcpy(r, iam_lfix_rec(l), iam_leaf_descr(l)->id_rec_size); ++} ++ ++static void iam_lfix_rec_add(struct iam_leaf *leaf, ++ const struct iam_key *k, const struct iam_rec *r) ++{ ++ struct iam_lentry *end; ++ struct iam_lentry *cur; ++ struct iam_lentry *start; ++ ptrdiff_t diff; ++ int count; ++ ++ assert_corr(iam_leaf_can_add(leaf, k, r)); ++ ++ count = lentry_count_get(leaf); ++ /* ++ * This branch handles two exceptional cases: ++ * ++ * - leaf positioned beyond last record, and ++ * ++ * - empty leaf. ++ */ ++ if (!iam_leaf_at_end(leaf)) { ++ end = iam_lfix_get_end(leaf); ++ cur = leaf->il_at; ++ if (lfix_keycmp(iam_leaf_container(leaf), ++ k, iam_leaf_key_at(cur)) >= 0) ++ iam_lfix_next(leaf); ++ else ++ /* ++ * Another exceptional case: insertion with the key ++ * less than least key in the leaf. 
++ */ ++ assert_corr(cur == leaf->il_entries); ++ ++ start = leaf->il_at; ++ diff = (void *)end - (void *)start; ++ assert_corr(diff >= 0); ++ memmove(iam_lfix_shift(leaf, start, 1), start, diff); ++ } ++ lentry_count_set(leaf, count + 1); ++ iam_lfix_key_set(leaf, k); ++ iam_lfix_rec_set(leaf, r); ++ assert_corr(iam_leaf_at_rec(leaf)); ++} ++ ++static void iam_lfix_rec_del(struct iam_leaf *leaf, int shift) ++{ ++ struct iam_lentry *next, *end; ++ int count; ++ ptrdiff_t diff; ++ ++ assert_corr(iam_leaf_at_rec(leaf)); ++ ++ count = lentry_count_get(leaf); ++ end = iam_lfix_get_end(leaf); ++ next = iam_lfix_shift(leaf, leaf->il_at, 1); ++ diff = (void *)end - (void *)next; ++ memmove(leaf->il_at, next, diff); ++ ++ lentry_count_set(leaf, count - 1); ++} ++ ++static int iam_lfix_can_add(const struct iam_leaf *l, ++ const struct iam_key *k, const struct iam_rec *r) ++{ ++ return lentry_count_get(l) < leaf_count_limit(l); ++} ++ ++static int iam_lfix_at_end(const struct iam_leaf *folio) ++{ ++ return folio->il_at == iam_lfix_get_end(folio); ++} ++ ++static void iam_lfix_init_new(struct iam_container *c, struct buffer_head *bh) ++{ ++ struct iam_leaf_head *hdr; ++ ++ hdr = (struct iam_leaf_head*)bh->b_data; ++ hdr->ill_magic = cpu_to_le16(IAM_LEAF_HEADER_MAGIC); ++ hdr->ill_count = cpu_to_le16(0); ++} ++ ++static void iam_lfix_split(struct iam_leaf *l, struct buffer_head **bh, ++ iam_ptr_t new_blknr) ++{ ++ struct iam_path *path; ++ struct iam_leaf_head *hdr; ++ const struct iam_ikey *pivot; ++ struct buffer_head *new_leaf; ++ ++ unsigned count; ++ unsigned split; ++ ++ void *start; ++ void *finis; ++ ++ new_leaf = *bh; ++ path = iam_leaf_path(l); ++ ++ hdr = (void *)new_leaf->b_data; ++ ++ count = lentry_count_get(l); ++ split = count / 2; ++ ++ start = iam_lfix_shift(l, iam_get_lentries(l), split); ++ finis = iam_lfix_shift(l, iam_get_lentries(l), count); ++ ++ pivot = (const struct iam_ikey *)iam_leaf_key_at(start); ++ ++ memmove(iam_entries(new_leaf), start, finis - start); ++ hdr->ill_count = count - split; ++ lentry_count_set(l, split); ++ if ((void *)l->il_at >= start) { ++ /* ++ * insertion point moves into new leaf. ++ */ ++ int shift; ++ int result; ++ ++ shift = iam_lfix_diff(l, l->il_at, start); ++ *bh = l->il_bh; ++ l->il_bh = new_leaf; ++ l->il_curidx = new_blknr; ++ result = iam_lfix_init(l); ++ /* ++ * init cannot fail, as node was just initialized. ++ */ ++ assert_corr(result == 0); ++ l->il_at = iam_lfix_shift(l, iam_get_lentries(l), shift); ++ } ++ /* ++ * Insert pointer to the new node (together with the least key in ++ * the node) into index node. ++ */ ++ iam_insert_key_lock(path, path->ip_frame, pivot, new_blknr); ++} ++ ++static struct iam_leaf_operations iam_lfix_leaf_ops = { ++ .init = iam_lfix_init, ++ .init_new = iam_lfix_init_new, ++ .fini = iam_lfix_fini, ++ .start = iam_lfix_start, ++ .next = iam_lfix_next, ++ .key = iam_lfix_key, ++ .ikey = iam_lfix_ikey, ++ .rec = iam_lfix_rec, ++ .key_set = iam_lfix_key_set, ++ .key_cmp = iam_lfix_key_cmp, ++ .key_eq = iam_lfix_key_eq, ++ .key_size = iam_lfix_key_size, ++ .rec_set = iam_lfix_rec_set, ++ .rec_get = iam_lfix_rec_get, ++ .lookup = iam_lfix_lookup, ++ .ilookup = iam_lfix_ilookup, ++ .at_end = iam_lfix_at_end, ++ .rec_add = iam_lfix_rec_add, ++ .rec_del = iam_lfix_rec_del, ++ .can_add = iam_lfix_can_add, ++ .split = iam_lfix_split ++}; ++ ++/* ++ * Index operations. 
++ */ ++ ++enum { ++ /* This is duplicated in lustre/utils/create_iam.c */ ++ /* ++ * Then shalt thou see the dew-BEDABBLED wretch ++ * Turn, and return, indenting with the way; ++ * Each envious brier his weary legs doth scratch, ++ * Each shadow makes him stop, each murmur stay: ++ * For misery is trodden on by many, ++ * And being low never relieved by any. ++ */ ++ IAM_LFIX_ROOT_MAGIC = 0xbedabb1edULL // d01efull ++}; ++ ++/* This is duplicated in lustre/utils/create_iam.c */ ++struct iam_lfix_root { ++ __le64 ilr_magic; ++ __le16 ilr_keysize; ++ __le16 ilr_recsize; ++ __le16 ilr_ptrsize; ++ u8 ilr_indirect_levels; ++ u8 ilr_padding; ++}; ++ ++static __u32 iam_lfix_root_ptr(struct iam_container *c) ++{ ++ return 0; ++} ++ ++static int iam_lfix_node_init(struct iam_container *c, struct buffer_head *bh, ++ int root) ++{ ++ return 0; ++} ++ ++static struct iam_entry *iam_lfix_root_inc(struct iam_container *c, ++ struct iam_path *path, ++ struct iam_frame *frame) ++{ ++ struct iam_lfix_root *root; ++ struct iam_entry *entries; ++ ++ entries = frame->entries; ++ ++ dx_set_count(entries, 2); ++ assert_corr(dx_get_limit(entries) == dx_root_limit(path)); ++ ++ root = (void *)frame->bh->b_data; ++ assert_corr(le64_to_cpu(root->ilr_magic) == IAM_LFIX_ROOT_MAGIC); ++ root->ilr_indirect_levels ++; ++ frame->at = entries = iam_entry_shift(path, entries, 1); ++ memset(iam_ikey_at(path, entries), 0, ++ iam_path_descr(path)->id_ikey_size); ++ return entries; ++} ++ ++static int iam_lfix_node_check(struct iam_path *path, struct iam_frame *frame) ++{ ++ unsigned count; ++ unsigned limit; ++ unsigned limit_correct; ++ struct iam_entry *entries; ++ ++ entries = dx_node_get_entries(path, frame); ++ ++ if (frame == path->ip_frames) { ++ struct iam_lfix_root *root; ++ ++ root = (void *)frame->bh->b_data; ++ if (le64_to_cpu(root->ilr_magic) != IAM_LFIX_ROOT_MAGIC) { ++ return -EIO; ++ } ++ limit_correct = dx_root_limit(path); ++ } else ++ limit_correct = dx_node_limit(path); ++ count = dx_get_count(entries); ++ limit = dx_get_limit(entries); ++ if (count > limit) { ++ return -EIO; ++ } ++ if (limit != limit_correct) { ++ return -EIO; ++ } ++ return 0; ++} ++ ++static int iam_lfix_node_load(struct iam_path *path, struct iam_frame *frame) ++{ ++ struct iam_entry *entries; ++ void *data; ++ entries = dx_node_get_entries(path, frame); ++ ++ data = frame->bh->b_data; ++ ++ if (frame == path->ip_frames) { ++ struct iam_lfix_root *root; ++ ++ root = data; ++ path->ip_indirect = root->ilr_indirect_levels; ++ if (path->ip_ikey_target == NULL) ++ path->ip_ikey_target = ++ (struct iam_ikey *)path->ip_key_target; ++ } ++ frame->entries = frame->at = entries; ++ return 0; ++} ++ ++static int iam_lfix_ikeycmp(const struct iam_container *c, ++ const struct iam_ikey *k1, ++ const struct iam_ikey *k2) ++{ ++ return memcmp(k1, k2, c->ic_descr->id_ikey_size); ++} ++ ++static struct iam_path_descr *iam_lfix_ipd_alloc(const struct iam_container *c, ++ void *area) ++{ ++ return iam_ipd_alloc(area, c->ic_descr->id_ikey_size); ++} ++ ++static struct iam_operations iam_lfix_ops = { ++ .id_root_ptr = iam_lfix_root_ptr, ++ .id_node_read = iam_node_read, ++ .id_node_init = iam_lfix_node_init, ++ .id_node_check = iam_lfix_node_check, ++ .id_node_load = iam_lfix_node_load, ++ .id_ikeycmp = iam_lfix_ikeycmp, ++ .id_root_inc = iam_lfix_root_inc, ++ .id_ipd_alloc = iam_lfix_ipd_alloc, ++ .id_ipd_free = iam_ipd_free, ++ .id_name = "lfix" ++}; ++ ++static int iam_lfix_guess(struct iam_container *c) ++{ ++ int result; ++ struct buffer_head *bh; 
++ const struct iam_lfix_root *root; ++ ++ assert_corr(c->ic_object != NULL); ++ ++ result = iam_node_read(c, iam_lfix_root_ptr(c), NULL, &bh); ++ if (result == 0) { ++ root = (void *)bh->b_data; ++ if (le64_to_cpu(root->ilr_magic) == IAM_LFIX_ROOT_MAGIC) { ++ struct iam_descr *descr; ++ ++ descr = c->ic_descr; ++ descr->id_key_size = le16_to_cpu(root->ilr_keysize); ++ descr->id_ikey_size = le16_to_cpu(root->ilr_keysize); ++ descr->id_rec_size = le16_to_cpu(root->ilr_recsize); ++ descr->id_ptr_size = le16_to_cpu(root->ilr_ptrsize); ++ descr->id_root_gap = sizeof(struct iam_lfix_root); ++ descr->id_node_gap = 0; ++ descr->id_ops = &iam_lfix_ops; ++ descr->id_leaf_ops = &iam_lfix_leaf_ops; ++ } else ++ result = -EBADF; ++ brelse(bh); ++ } ++ return result; ++} ++ ++static struct iam_format iam_lfix_format = { ++ .if_guess = iam_lfix_guess ++}; ++ ++void iam_lfix_format_init(void) ++{ ++ iam_format_register(&iam_lfix_format); ++} ++ ++/* ++ * Debugging aid. ++ */ ++ ++#define KEYSIZE (8) ++#define RECSIZE (8) ++#define PTRSIZE (4) ++ ++#define LFIX_ROOT_RECNO \ ++ ((4096 - sizeof(struct iam_lfix_root)) / (KEYSIZE + PTRSIZE)) ++ ++#define LFIX_INDEX_RECNO (4096 / (KEYSIZE + PTRSIZE)) ++ ++#define LFIX_LEAF_RECNO \ ++ ((4096 - sizeof(struct iam_leaf_head)) / (KEYSIZE + RECSIZE)) ++ ++struct lfix_root { ++ struct iam_lfix_root lr_root; ++ struct { ++ char key[KEYSIZE]; ++ char ptr[PTRSIZE]; ++ } lr_entry[LFIX_ROOT_RECNO]; ++}; ++ ++struct lfix_index { ++ struct dx_countlimit li_cl; ++ char li_padding[KEYSIZE + PTRSIZE - sizeof(struct dx_countlimit)]; ++ struct { ++ char key[KEYSIZE]; ++ char ptr[PTRSIZE]; ++ } li_entry[LFIX_INDEX_RECNO - 1]; ++}; ++ ++struct lfix_leaf { ++ struct iam_leaf_head ll_head; ++ struct { ++ char key[KEYSIZE]; ++ char rec[RECSIZE]; ++ } ll_entry[LFIX_LEAF_RECNO]; ++}; +Index: linux-stage/fs/ext3/iam_htree.c +=================================================================== +--- linux-stage.orig/fs/ext3/iam_htree.c 2006-06-16 16:07:58.000000000 +0300 ++++ linux-stage/fs/ext3/iam_htree.c 2007-10-21 17:32:18.000000000 +0300 +@@ -0,0 +1,685 @@ ++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- ++ * vim:expandtab:shiftwidth=8:tabstop=8: ++ * ++ * iam_htree.c ++ * implementation of iam format for ext3/htree. ++ * ++ * Copyright (c) 2006 Cluster File Systems, Inc. ++ * Author: Nikita Danilov ++ * ++ * This file is part of the Lustre file system, http://www.lustre.org ++ * Lustre is a trademark of Cluster File Systems, Inc. ++ * ++ * You may have signed or agreed to another license before downloading ++ * this software. If so, you are bound by the terms and conditions ++ * of that agreement, and the following does not apply to you. See the ++ * LICENSE file included with this distribution for more information. ++ * ++ * If you did not agree to a different license, then this copy of Lustre ++ * is open source software; you can redistribute it and/or modify it ++ * under the terms of version 2 of the GNU General Public License as ++ * published by the Free Software Foundation. ++ * ++ * In either case, Lustre is distributed in the hope that it will be ++ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty ++ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * license text for more details. 
++ */ ++ ++#include ++#include ++/* ext3_error(), EXT3_DIR_ROUND() */ ++#include ++ ++#include ++ ++ ++static inline struct ext3_dir_entry_2 *dent(struct iam_lentry *ent) ++{ ++ return (struct ext3_dir_entry_2 *)ent; ++} ++ ++static inline struct iam_path_compat *getipc(const struct iam_leaf *folio) ++{ ++ struct iam_path *path; ++ ++ path = iam_leaf_path(folio); ++ assert_corr(dx_index_is_compat(path)); ++ assert_corr(path->ip_data != NULL); ++ return container_of(path->ip_data, struct iam_path_compat, ipc_descr); ++} ++ ++static inline struct ext3_dir_entry_2 *getent(const struct iam_leaf *folio) ++{ ++ return dent(folio->il_at); ++} ++ ++static __u32 hashname(const struct iam_leaf *folio, ++ const char *name, int namelen) ++{ ++ int result; ++ struct dx_hash_info *hinfo; ++ ++ hinfo = getipc(folio)->ipc_hinfo; ++ assert_corr(hinfo != NULL); ++ result = ext3fs_dirhash(name, namelen, hinfo); ++ assert_corr(result == 0); ++ return hinfo->hash; ++} ++ ++static __u32 gethash(const struct iam_leaf *folio, ++ const struct ext3_dir_entry_2 *ent) ++{ ++ return hashname(folio, ent->name, ent->name_len); ++} ++ ++static inline size_t recsize(size_t namelen) ++{ ++ return EXT3_DIR_REC_LEN(namelen); ++} ++ ++static struct ext3_dir_entry_2 *getlast(const struct iam_leaf *folio, int namelen) ++{ ++ return ++ (void *)folio->il_bh->b_data + ++ iam_leaf_container(folio)->ic_object->i_sb->s_blocksize - ++ recsize(namelen); ++} ++ ++static struct ext3_dir_entry_2 *gettop(const struct iam_leaf *folio) ++{ ++ return getlast(folio, 0); ++} ++ ++static inline int ent_is_live(const struct ext3_dir_entry_2 *ent) ++{ ++ return ent->inode != 0; ++} ++ ++static struct ext3_dir_entry_2 *entnext(const struct ext3_dir_entry_2 *ent) ++{ ++ return (void *)ent + le16_to_cpu(ent->rec_len); ++} ++ ++static struct ext3_dir_entry_2 *skipdead(struct ext3_dir_entry_2 *ent) ++{ ++ if (!ent_is_live(ent)) ++ ent = entnext(ent); ++ /* ++ * There can be no more than one dead entry in a row. ++ */ ++ return ent; ++} ++ ++static struct ext3_dir_entry_2 *getstart(const struct iam_leaf *folio) ++{ ++ return (void *)folio->il_bh->b_data; ++} ++ ++static int getfreespace(const struct ext3_dir_entry_2 *ent) ++{ ++ int free; ++ ++ free = le16_to_cpu(ent->rec_len); ++ if (ent_is_live(ent)) ++ free -= recsize(ent->name_len); ++ assert_corr(free >= 0); ++ return free; ++} ++ ++static int entcmp(const struct iam_leaf *folio, ++ const struct ext3_dir_entry_2 *e0, const struct ext3_dir_entry_2 *e1) ++{ ++ __u32 hash0; ++ __u32 hash1; ++ ++ assert_corr(ent_is_live(e0)); ++ assert_corr(ent_is_live(e1)); ++ ++ hash0 = gethash(folio, e0); ++ hash1 = gethash(folio, e1); ++ if (hash0 < hash1) ++ return -1; ++ else if (hash0 > hash1) ++ return +1; ++ else if (e0 < e1) ++ return -1; ++ else if (e0 > e1) ++ return +1; ++ else ++ return 0; ++} ++ ++#if EXT3_CORRECTNESS_ON || EXT3_INVARIANT_ON ++static int iam_leaf_at_rec(const struct iam_leaf *folio) ++{ ++ struct ext3_dir_entry_2 *ent; ++ ++ ent = getent(folio); ++ return getstart(folio) <= ent && ++ ent < gettop(folio) && ent_is_live(ent); ++} ++#endif ++ ++/* ++ * Leaf operations. 
++ */ ++ ++static struct iam_ikey *iam_htree_ikey(const struct iam_leaf *l, ++ struct iam_ikey *key) ++{ ++ __u32 *hash; ++ assert_corr(iam_leaf_at_rec(l)); ++ ++ hash = (void *)key; ++ *hash = gethash(l, getent(l)); ++ return key; ++} ++ ++static struct iam_key *iam_htree_key(const struct iam_leaf *l) ++{ ++ assert_corr(iam_leaf_at_rec(l)); ++ ++ return (struct iam_key *)&getent(l)->name; ++} ++ ++static int iam_htree_key_size(const struct iam_leaf *l) ++{ ++ assert_corr(iam_leaf_at_rec(l)); ++ ++ return getent(l)->name_len; ++} ++ ++static void iam_htree_start(struct iam_leaf *l) ++{ ++ l->il_at = (void *)skipdead(getstart(l)); ++} ++ ++static int iam_htree_init(struct iam_leaf *l) ++{ ++ assert_corr(l->il_bh != NULL); ++ ++ l->il_at = l->il_entries = (void *)getstart(l); ++ return 0; ++} ++ ++static void iam_htree_fini(struct iam_leaf *l) ++{ ++ l->il_entries = l->il_at = NULL; ++} ++ ++struct iam_rec *iam_htree_rec(const struct iam_leaf *l) ++{ ++ assert_corr(iam_leaf_at_rec(l)); ++ return (void *)&getent(l)->inode; ++} ++ ++static void iam_htree_next(struct iam_leaf *l) ++{ ++ struct ext3_dir_entry_2 *scan; ++ struct ext3_dir_entry_2 *found; ++ ++ assert_corr(iam_leaf_at_rec(l)); ++ found = NULL; ++ for (scan = getstart(l); scan < gettop(l); scan = entnext(scan)) { ++ if (scan != getent(l) && ent_is_live(scan) && ++ entcmp(l, getent(l), scan) < 0 && ++ (found == NULL || entcmp(l, scan, found) < 0)) ++ found = scan; ++ } ++ assert_corr(ergo(found != NULL, ++ gethash(l, getent(l)) <= gethash(l, found))); ++ l->il_at = (void *)(found ? : gettop(l)); ++} ++ ++static int iam_htree_at_end(const struct iam_leaf *folio) ++{ ++ return getent(folio) >= gettop(folio); ++} ++ ++ ++static inline int match(int len, const char *const name, ++ struct ext3_dir_entry_2 *de) ++{ ++ if (len != de->name_len) ++ return 0; ++ if (!de->inode) ++ return 0; ++ return !memcmp(name, de->name, len); ++} ++ ++static int iam_htree_lookup(struct iam_leaf *l, const struct iam_key *k) ++{ ++ struct iam_container *c; ++ struct ext3_dir_entry_2 *scan; ++ struct ext3_dir_entry_2 *found; ++ __u32 hash; ++ int result; ++ int namelen; ++ int last = 1; ++ const char *name; ++ ++ c = iam_leaf_container(l); ++ name = (const char *)k; ++ namelen = strlen(name); ++ hash = hashname(l, name, namelen); ++ found = NULL; ++ result = IAM_LOOKUP_OK; ++ for (scan = getstart(l); scan < getlast(l, namelen); ++ scan = entnext(scan)) { ++ if (match(namelen, name, scan)) { ++ found = scan; ++ result = IAM_LOOKUP_EXACT; ++ break; ++ } else if (ent_is_live(scan)) { ++ if (gethash(l, scan) <= hash) ++ found = scan; ++ else ++ last = 0; ++ } ++ } ++ if (found == NULL) { ++ /* ++ * @k is less than all hashes in the leaf. ++ */ ++ iam_htree_start(l); ++ result = IAM_LOOKUP_BEFORE; ++ } else { ++ l->il_at = (void *)found; ++ assert_corr(iam_leaf_at_rec(l)); ++ } ++ if (last) ++ result |= IAM_LOOKUP_LAST; ++ return result; ++} ++ ++static int iam_htree_ilookup(struct iam_leaf *l, const struct iam_ikey *ik) ++{ ++ assert(0); ++ return IAM_LOOKUP_OK; ++} ++ ++static void iam_htree_key_set(struct iam_leaf *l, const struct iam_key *k) ++{ ++ assert_corr(iam_leaf_at_rec(l)); ++ assert(0); ++} ++ ++static int iam_htree_key_cmp(const struct iam_leaf *l, const struct iam_key *k) ++{ ++ const char *name; ++ __u32 h0; ++ __u32 h1; ++ ++ name = (const char *)k; ++ ++ assert_corr(ent_is_live(getent(l))); ++ ++ h0 = gethash(l, getent(l)); ++ h1 = hashname(l, name, strlen(name)); ++ ++ return h0 < h1 ? -1 : (h0 == h1 ? 
0 : +1); ++} ++ ++static int iam_htree_key_eq(const struct iam_leaf *l, const struct iam_key *k) ++{ ++ const char *name; ++ ++ name = (const char *)k; ++ return match(strlen(name), name, getent(l)); ++} ++ ++static void iam_htree_rec_set(struct iam_leaf *l, const struct iam_rec *r) ++{ ++ __u32 *ino; ++ ++ ino = (void *)r; ++ getent(l)->inode = cpu_to_le32(*ino); ++} ++ ++static void iam_htree_rec_get(const struct iam_leaf *l, struct iam_rec *r) ++{ ++ __u32 *ino; ++ ++ ino = (void *)r; ++ *ino = le32_to_cpu(getent(l)->inode); ++} ++ ++static void iam_htree_rec_add(struct iam_leaf *leaf, const struct iam_key *k, ++ const struct iam_rec *r) ++{ ++ struct ext3_dir_entry_2 *scan; ++ struct inode *dir; ++ const char *name; ++ ++ __u32 *ino; ++ int namelen; ++ ++ assert_corr(iam_leaf_can_add(leaf, k, r)); ++ ++ dir = iam_leaf_container(leaf)->ic_object; ++ ino = (void *)r; ++ name = (const char *)k; ++ namelen = strlen(name); ++ ++ scan = find_insertion_point(dir, leaf->il_bh, name, namelen); ++ assert_corr(!IS_ERR(scan)); ++ scan = split_entry(dir, scan, *ino, EXT3_FT_UNKNOWN, name, namelen); ++ leaf->il_at = (void *)scan; ++} ++ ++static void iam_htree_rec_del(struct iam_leaf *leaf, int shift) ++{ ++ struct ext3_dir_entry_2 *orig; ++ struct ext3_dir_entry_2 *scan; ++ struct ext3_dir_entry_2 *prev; ++ ++ assert_corr(iam_leaf_at_rec(leaf)); ++ ++ orig = getent(leaf); ++ ++ if (shift) ++ iam_htree_next(leaf); ++ ++ for (prev = NULL, scan = getstart(leaf); scan < orig; ++ prev = scan, scan = entnext(scan)) ++ ; ++ ++ assert_corr(scan == orig); ++ if (prev != NULL) { ++ prev->rec_len = cpu_to_le16(le16_to_cpu(prev->rec_len) + ++ le16_to_cpu(scan->rec_len)); ++ } else { ++ assert_corr(scan == getstart(leaf)); ++ scan->inode = 0; ++ } ++ iam_leaf_container(leaf)->ic_object->i_version ++; ++} ++ ++static int iam_htree_can_add(const struct iam_leaf *leaf, ++ const struct iam_key *k, const struct iam_rec *r) ++{ ++ struct ext3_dir_entry_2 *scan; ++ int size; ++ ++ size = recsize(strlen((const char *)k)); ++ for (scan = getstart(leaf); ++ scan < gettop(leaf); scan = entnext(scan)) { ++ if (getfreespace(scan) >= size) ++ return 1; ++ } ++ return 0; ++} ++ ++static void iam_htree_init_new(struct iam_container *c, struct buffer_head *bh) ++{ ++ /* ++ * Do nothing, all work is done by iam_htree_split(). ++ */ ++} ++ ++static void iam_htree_split(struct iam_leaf *l, struct buffer_head **bh, ++ iam_ptr_t new_blknr) ++{ ++ __u32 delim_hash; ++ __u32 old_hash; ++ struct buffer_head *newbh = *bh; ++ struct iam_path *path; ++ ++ old_hash = gethash(l, getent(l)); ++ move_entries(iam_leaf_container(l)->ic_object, ++ getipc(l)->ipc_hinfo, &l->il_bh, bh, &delim_hash); ++ /* ++ * Insert pointer to the new node (together with the least key in ++ * the node) into index node. ++ */ ++ path = iam_leaf_path(l); ++ if (l->il_bh == newbh) { ++ /* ++ * insertion point moves into new leaf. 
++ */ ++ assert_corr(delim_hash >= old_hash); ++ l->il_curidx = new_blknr; ++ iam_htree_lookup(l, (void *)&old_hash); ++ } ++ iam_insert_key_lock(path, ++ path->ip_frame, (void *)&delim_hash, new_blknr); ++} ++ ++static struct iam_leaf_operations iam_htree_leaf_ops = { ++ .init = iam_htree_init, ++ .init_new = iam_htree_init_new, ++ .fini = iam_htree_fini, ++ .start = iam_htree_start, ++ .next = iam_htree_next, ++ .key = iam_htree_key, ++ .ikey = iam_htree_ikey, ++ .rec = iam_htree_rec, ++ .key_set = iam_htree_key_set, ++ .key_cmp = iam_htree_key_cmp, ++ .key_eq = iam_htree_key_eq, ++ .key_size = iam_htree_key_size, ++ .rec_set = iam_htree_rec_set, ++ .rec_get = iam_htree_rec_get, ++ .lookup = iam_htree_lookup, ++ .ilookup = iam_htree_ilookup, ++ .at_end = iam_htree_at_end, ++ .rec_add = iam_htree_rec_add, ++ .rec_del = iam_htree_rec_del, ++ .can_add = iam_htree_can_add, ++ .split = iam_htree_split ++}; ++ ++/* ++ * Index operations. ++ */ ++ ++static __u32 iam_htree_root_ptr(struct iam_container *c) ++{ ++ return 0; ++} ++ ++static int iam_htree_node_check(struct iam_path *path, struct iam_frame *frame) ++{ ++ /* XXX no checks yet */ ++ return 0; ++} ++ ++static int is_htree(struct super_block *sb, ++ const struct dx_root *root, int silent) ++{ ++ if (root->info.hash_version > DX_HASH_MAX) { ++ if (!silent) ++ ext3_warning(sb, __FUNCTION__, ++ "Unrecognised inode hash code %d", ++ root->info.hash_version); ++ return -EIO; ++ } ++ ++ if (root->info.unused_flags & 1) { ++ if (!silent) ++ ext3_warning(sb, __FUNCTION__, ++ "Unimplemented inode hash flags: %#06x", ++ root->info.unused_flags); ++ return -EIO; ++ } ++ ++ if (root->info.indirect_levels > DX_MAX_TREE_HEIGHT - 1) { ++ if (!silent) ++ ext3_warning(sb, __FUNCTION__, ++ "Unimplemented inode hash depth: %#06x", ++ root->info.indirect_levels); ++ return -EIO; ++ } ++ return 0; ++} ++ ++static int iam_htree_node_load(struct iam_path *path, struct iam_frame *frame) ++{ ++ void *data; ++ struct iam_entry *entries; ++ struct super_block *sb; ++ ++ data = frame->bh->b_data; ++ entries = dx_node_get_entries(path, frame); ++ sb = iam_path_obj(path)->i_sb; ++ if (frame == path->ip_frames) { ++ /* root node */ ++ struct dx_root *root; ++ struct iam_path_compat *ipc; ++ int check; ++ const char *name; ++ int namelen; ++ ++ root = data; ++ assert_corr(path->ip_data != NULL); ++ ipc = container_of(path->ip_data, struct iam_path_compat, ++ ipc_descr); ++ ++ check = is_htree(sb, root, 0); ++ if (check != 0) ++ return check; ++ path->ip_indirect = root->info.indirect_levels; ++ ++ assert_corr((char *)entries == (((char *)&root->info) + ++ root->info.info_length)); ++ assert_corr(dx_get_limit(entries) == dx_root_limit(path)); ++ ++ ipc->ipc_hinfo->hash_version = root->info.hash_version; ++ ipc->ipc_hinfo->seed = EXT3_SB(sb)->s_hash_seed; ++ name = NULL; ++ if (ipc->ipc_qstr) { ++ name = ipc->ipc_qstr->name; ++ namelen = ipc->ipc_qstr->len; ++ } else if (ipc->ipc_hinfo == &ipc->ipc_hinfo_area){ ++ name = (const char *)path->ip_key_target; ++ namelen = strlen(name); ++ } ++ if (name != NULL) ++ ext3fs_dirhash(name, namelen, ipc->ipc_hinfo); ++ if (path->ip_ikey_target == NULL) { ++ path->ip_ikey_target = iam_path_ikey(path, 4); ++ *(__u32 *)path->ip_ikey_target = ipc->ipc_hinfo->hash; ++ } ++ } else { ++ /* non-root index */ ++ assert_corr(entries == ++ data + iam_path_descr(path)->id_node_gap); ++ assert_corr(dx_get_limit(entries) == dx_node_limit(path)); ++ } ++ frame->entries = frame->at = entries; ++ return 0; ++} ++ ++static int 
iam_htree_node_init(struct iam_container *c, ++ struct buffer_head *bh, int root) ++{ ++ struct dx_node *node; ++ ++ assert_corr(!root); ++ ++ node = (void *)bh->b_data; ++ node->fake.rec_len = cpu_to_le16(c->ic_object->i_sb->s_blocksize); ++ node->fake.inode = 0; ++ return 0; ++} ++ ++static struct iam_entry *iam_htree_root_inc(struct iam_container *c, ++ struct iam_path *path, ++ struct iam_frame *frame) ++{ ++ struct dx_root *root; ++ struct iam_entry *entries; ++ ++ entries = frame->entries; ++ ++ dx_set_count(entries, 1); ++ root = (struct dx_root *) frame->bh->b_data; ++ root->info.indirect_levels++; ++ ++ return entries; ++} ++ ++static int iam_htree_ikeycmp(const struct iam_container *c, ++ const struct iam_ikey *k1, ++ const struct iam_ikey *k2) ++{ ++ __u32 p1 = le32_to_cpu(*(__u32 *)k1); ++ __u32 p2 = le32_to_cpu(*(__u32 *)k2); ++ ++ return p1 > p2 ? +1 : (p1 < p2 ? -1 : 0); ++} ++ ++static struct iam_path_descr *iam_htree_ipd_alloc(const struct iam_container *c, ++ void *area) ++{ ++ struct iam_path_compat *ipc; ++ ++ ipc = area; ++ memset(ipc, 0, sizeof *ipc); ++ iam_path_compat_init(ipc, c->ic_object); ++ return &ipc->ipc_descr; ++} ++ ++static void iam_htree_ipd_free(struct iam_path_descr *ipd) ++{ ++} ++ ++static struct iam_operations iam_htree_ops = { ++ .id_root_ptr = iam_htree_root_ptr, ++ .id_node_read = iam_node_read, ++ .id_node_init = iam_htree_node_init, ++ .id_node_check = iam_htree_node_check, ++ .id_node_load = iam_htree_node_load, ++ .id_ikeycmp = iam_htree_ikeycmp, ++ .id_root_inc = iam_htree_root_inc, ++ .id_ipd_alloc = iam_htree_ipd_alloc, ++ .id_ipd_free = iam_htree_ipd_free, ++ .id_name = "htree" ++}; ++ ++/* ++ * Parameters describing iam compatibility mode in which existing ext3 htrees ++ * can be manipulated. ++ */ ++struct iam_descr iam_htree_compat_param = { ++ .id_key_size = EXT3_NAME_LEN, ++ .id_rec_size = sizeof ((struct ext3_dir_entry_2 *)NULL)->inode, ++ .id_ikey_size = sizeof ((struct dx_map_entry *)NULL)->hash, ++ .id_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs, ++ .id_node_gap = offsetof(struct dx_node, entries), ++ .id_root_gap = offsetof(struct dx_root, entries), ++ .id_ops = &iam_htree_ops, ++ .id_leaf_ops = &iam_htree_leaf_ops ++}; ++EXPORT_SYMBOL(iam_htree_compat_param); ++ ++static int iam_htree_guess(struct iam_container *c) ++{ ++ int result; ++ struct buffer_head *bh; ++ const struct dx_root *root; ++ ++ assert_corr(c->ic_object != NULL); ++ ++ result = iam_node_read(c, iam_htree_root_ptr(c), NULL, &bh); ++ if (result == 0) { ++ root = (void *)bh->b_data; ++ result = is_htree(c->ic_object->i_sb, root, 1); ++ if (result == 0) ++ c->ic_descr = &iam_htree_compat_param; ++ else ++ result = -EBADF; ++ brelse(bh); ++ } ++ return result; ++} ++ ++static struct iam_format iam_htree_format = { ++ .if_guess = iam_htree_guess ++}; ++ ++void iam_htree_format_init(void) ++{ ++ iam_format_register(&iam_htree_format); ++} +Index: linux-stage/fs/ext3/iam.c +=================================================================== +--- linux-stage.orig/fs/ext3/iam.c 2006-06-16 16:07:58.000000000 +0300 ++++ linux-stage/fs/ext3/iam.c 2007-10-21 17:32:18.000000000 +0300 +@@ -0,0 +1,1433 @@ ++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- ++ * vim:expandtab:shiftwidth=8:tabstop=8: ++ * ++ * iam.c ++ * Top-level entry points into iam module ++ * ++ * Copyright (c) 2006 Cluster File Systems, Inc. 
++ * Author: Wang Di ++ * Author: Nikita Danilov ++ * ++ * This file is part of the Lustre file system, http://www.lustre.org ++ * Lustre is a trademark of Cluster File Systems, Inc. ++ * ++ * You may have signed or agreed to another license before downloading ++ * this software. If so, you are bound by the terms and conditions ++ * of that agreement, and the following does not apply to you. See the ++ * LICENSE file included with this distribution for more information. ++ * ++ * If you did not agree to a different license, then this copy of Lustre ++ * is open source software; you can redistribute it and/or modify it ++ * under the terms of version 2 of the GNU General Public License as ++ * published by the Free Software Foundation. ++ * ++ * In either case, Lustre is distributed in the hope that it will be ++ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty ++ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * license text for more details. ++ */ ++ ++/* ++ * iam: big theory statement. ++ * ++ * iam (Index Access Module) is a module providing abstraction of persistent ++ * transactional container on top of generalized ext3 htree. ++ * ++ * iam supports: ++ * ++ * - key, pointer, and record size specifiable per container. ++ * ++ * - trees taller than 2 index levels. ++ * ++ * - read/write to existing ext3 htree directories as iam containers. ++ * ++ * iam container is a tree, consisting of leaf nodes containing keys and ++ * records stored in this container, and index nodes, containing keys and ++ * pointers to leaf or index nodes. ++ * ++ * iam does not work with keys directly, instead it calls user-supplied key ++ * comparison function (->dpo_keycmp()). ++ * ++ * Pointers are (currently) interpreted as logical offsets (measured in ++ * blocksful) within underlying flat file on top of which iam tree lives. ++ * ++ * On-disk format: ++ * ++ * iam mostly tries to reuse existing htree formats. ++ * ++ * Format of index node: ++ * ++ * +-----+-------+-------+-------+------+-------+------------+ ++ * | | count | | | | | | ++ * | gap | / | entry | entry | .... | entry | free space | ++ * | | limit | | | | | | ++ * +-----+-------+-------+-------+------+-------+------------+ ++ * ++ * gap this part of node is never accessed by iam code. It ++ * exists for binary compatibility with ext3 htree (that, ++ * in turn, stores fake struct ext2_dirent for ext2 ++ * compatibility), and to keep some unspecified per-node ++ * data. Gap can be different for root and non-root index ++ * nodes. Gap size can be specified for each container ++ * (gap of 0 is allowed). ++ * ++ * count/limit current number of entries in this node, and the maximal ++ * number of entries that can fit into node. count/limit ++ * has the same size as entry, and is itself counted in ++ * count. ++ * ++ * entry index entry: consists of a key immediately followed by ++ * a pointer to a child node. Size of a key and size of a ++ * pointer depends on container. Entry has neither ++ * alignment nor padding. ++ * ++ * free space portion of node new entries are added to ++ * ++ * Entries in index node are sorted by their key value. ++ * ++ * Format of a leaf node is not specified. Generic iam code accesses leaf ++ * nodes through ->id_leaf methods in struct iam_descr. 
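++ *
++ * For illustration only (this is not part of the generic format contract):
++ * in the htree-compatible format set up by iam_htree_compat_param, an index
++ * entry effectively degenerates to
++ *
++ *     { __le32 hash; __le32 block; }
++ *
++ * (cf. struct iam_entry_compat), i.e. a 4-byte hash key immediately followed
++ * by a 4-byte pointer to the child node; generic iam code, however, never
++ * assumes this layout and always goes through the container descriptor.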
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "xattr.h" ++#include "iopen.h" ++#include "acl.h" ++ ++/* ++ * List of all registered formats. ++ * ++ * No locking. Callers synchronize. ++ */ ++static LIST_HEAD(iam_formats); ++ ++void iam_format_register(struct iam_format *fmt) ++{ ++ list_add(&fmt->if_linkage, &iam_formats); ++} ++EXPORT_SYMBOL(iam_format_register); ++ ++/* ++ * Determine format of given container. This is done by scanning list of ++ * registered formats and calling ->if_guess() method of each in turn. ++ */ ++static int iam_format_guess(struct iam_container *c) ++{ ++ int result; ++ struct iam_format *fmt; ++ ++ /* ++ * XXX temporary initialization hook. ++ */ ++ { ++ static int initialized = 0; ++ ++ if (!initialized) { ++ /* ++ * Keep that order: htree should be registered first, ++ * so that iam_htree_guess() runs last. ++ */ ++ iam_htree_format_init(); ++ iam_lvar_format_init(); ++ iam_lfix_format_init(); ++ initialized = 1; ++ } ++ } ++ ++ result = -ENOENT; ++ list_for_each_entry(fmt, &iam_formats, if_linkage) { ++ result = fmt->if_guess(c); ++ if (result == 0) ++ break; ++ } ++ return result; ++} ++ ++/* ++ * Initialize container @c. ++ */ ++int iam_container_init(struct iam_container *c, ++ struct iam_descr *descr, struct inode *inode) ++{ ++ memset(c, 0, sizeof *c); ++ c->ic_descr = descr; ++ c->ic_object = inode; ++ init_rwsem(&c->ic_sem); ++ return 0; ++} ++EXPORT_SYMBOL(iam_container_init); ++ ++/* ++ * Determine container format. ++ */ ++int iam_container_setup(struct iam_container *c) ++{ ++ return iam_format_guess(c); ++} ++EXPORT_SYMBOL(iam_container_setup); ++ ++/* ++ * Finalize container @c, release all resources. ++ */ ++void iam_container_fini(struct iam_container *c) ++{ ++} ++EXPORT_SYMBOL(iam_container_fini); ++ ++void iam_path_init(struct iam_path *path, struct iam_container *c, ++ struct iam_path_descr *pd) ++{ ++ memset(path, 0, sizeof *path); ++ path->ip_container = c; ++ path->ip_frame = path->ip_frames; ++ path->ip_data = pd; ++ path->ip_leaf.il_path = path; ++} ++ ++static void iam_leaf_fini(struct iam_leaf *leaf); ++ ++void iam_path_release(struct iam_path *path) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) { ++ if (path->ip_frames[i].bh != NULL) { ++ brelse(path->ip_frames[i].bh); ++ path->ip_frames[i].bh = NULL; ++ } ++ } ++} ++ ++void iam_path_fini(struct iam_path *path) ++{ ++ iam_leaf_fini(&path->ip_leaf); ++ iam_path_release(path); ++} ++ ++void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode) ++{ ++ int i; ++ ++ path->ipc_hinfo = &path->ipc_hinfo_area; ++ for (i = 0; i < ARRAY_SIZE(path->ipc_scratch); ++i) ++ path->ipc_descr.ipd_key_scratch[i] = ++ (struct iam_ikey *)&path->ipc_scratch[i]; ++ ++ iam_container_init(&path->ipc_container, ++ &iam_htree_compat_param, inode); ++ iam_path_init(&path->ipc_path, &path->ipc_container, &path->ipc_descr); ++} ++ ++void iam_path_compat_fini(struct iam_path_compat *path) ++{ ++ iam_path_fini(&path->ipc_path); ++ iam_container_fini(&path->ipc_container); ++} ++ ++/* ++ * Helper function initializing iam_path_descr and its key scratch area. 
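++ *
++ * A sizing note (assumption, not enforced here): @area is expected to hold
++ * the descriptor itself followed by one key of @keysize bytes per scratch
++ * slot, i.e. roughly
++ *
++ *     sizeof(struct iam_path_descr) + DX_SCRATCH_KEYS * keysize
++ *
++ * bytes; callers are responsible for reserving a buffer of at least that
++ * size.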
++ */ ++struct iam_path_descr *iam_ipd_alloc(void *area, int keysize) ++{ ++ struct iam_path_descr *ipd; ++ void *karea; ++ int i; ++ ++ ipd = area; ++ karea = ipd + 1; ++ for (i = 0; i < ARRAY_SIZE(ipd->ipd_key_scratch); ++i, karea += keysize) ++ ipd->ipd_key_scratch[i] = karea; ++ return ipd; ++} ++EXPORT_SYMBOL(iam_ipd_alloc); ++ ++void iam_ipd_free(struct iam_path_descr *ipd) ++{ ++} ++EXPORT_SYMBOL(iam_ipd_free); ++ ++int iam_node_read(struct iam_container *c, iam_ptr_t ptr, ++ handle_t *h, struct buffer_head **bh) ++{ ++ int result = 0; ++ ++ *bh = ext3_bread(h, c->ic_object, (int)ptr, 0, &result); ++ if (*bh == NULL) ++ result = -EIO; ++ return result; ++} ++ ++/* ++ * Return pointer to current leaf record. Pointer is valid while corresponding ++ * leaf node is locked and pinned. ++ */ ++static struct iam_rec *iam_leaf_rec(const struct iam_leaf *leaf) ++{ ++ return iam_leaf_ops(leaf)->rec(leaf); ++} ++ ++/* ++ * Return pointer to the current leaf key. This function returns pointer to ++ * the key stored in node. ++ * ++ * Caller should assume that returned pointer is only valid while leaf node is ++ * pinned and locked. ++ */ ++static struct iam_key *iam_leaf_key(const struct iam_leaf *leaf) ++{ ++ return iam_leaf_ops(leaf)->key(leaf); ++} ++ ++static int iam_leaf_key_size(const struct iam_leaf *leaf) ++{ ++ return iam_leaf_ops(leaf)->key_size(leaf); ++} ++ ++static struct iam_ikey *iam_leaf_ikey(const struct iam_leaf *leaf, ++ struct iam_ikey *key) ++{ ++ return iam_leaf_ops(leaf)->ikey(leaf, key); ++} ++ ++static int iam_leaf_keycmp(const struct iam_leaf *leaf, ++ const struct iam_key *key) ++{ ++ return iam_leaf_ops(leaf)->key_cmp(leaf, key); ++} ++ ++static int iam_leaf_keyeq(const struct iam_leaf *leaf, ++ const struct iam_key *key) ++{ ++ return iam_leaf_ops(leaf)->key_eq(leaf, key); ++} ++ ++#if EXT3_INVARIANT_ON ++static int iam_leaf_check(struct iam_leaf *leaf); ++extern int dx_node_check(struct iam_path *p, struct iam_frame *f); ++ ++static int iam_path_check(struct iam_path *p) ++{ ++ int i; ++ int result; ++ struct iam_frame *f; ++ struct iam_descr *param; ++ ++ result = 1; ++ param = iam_path_descr(p); ++ for (i = 0; result && i < ARRAY_SIZE(p->ip_frames); ++i) { ++ f = &p->ip_frames[i]; ++ if (f->bh != NULL) { ++ result = dx_node_check(p, f); ++ if (result) ++ result = !param->id_ops->id_node_check(p, f); ++ } ++ } ++ if (result && p->ip_leaf.il_bh != NULL) ++ result = iam_leaf_check(&p->ip_leaf); ++ if (result == 0) { ++ ext3_std_error(iam_path_obj(p)->i_sb, result); ++ } ++ return result; ++} ++#endif ++ ++static int iam_leaf_load(struct iam_path *path) ++{ ++ iam_ptr_t block; ++ int err; ++ struct iam_container *c; ++ struct buffer_head *bh; ++ struct iam_leaf *leaf; ++ struct iam_descr *descr; ++ ++ c = path->ip_container; ++ leaf = &path->ip_leaf; ++ descr = iam_path_descr(path); ++ block = path->ip_frame->leaf; ++ if (block == 0) { ++ /* XXX bug 11027 */ ++ printk(KERN_EMERG "wrong leaf: %lu %d [%p %p %p]\n", ++ (long unsigned)path->ip_frame->leaf, ++ dx_get_count(dx_node_get_entries(path, path->ip_frame)), ++ path->ip_frames[0].bh, path->ip_frames[1].bh, ++ path->ip_frames[2].bh); ++ } ++ err = descr->id_ops->id_node_read(c, block, NULL, &bh); ++ if (err == 0) { ++ leaf->il_bh = bh; ++ leaf->il_curidx = block; ++ err = iam_leaf_ops(leaf)->init(leaf); ++ assert_inv(ergo(err == 0, iam_leaf_check(leaf))); ++ } ++ return err; ++} ++ ++static void iam_leaf_unlock(struct iam_leaf *leaf) ++{ ++ if (leaf->il_lock != NULL) { ++ 
dx_unlock_htree(iam_leaf_container(leaf)->ic_object, ++ leaf->il_lock); ++ do_corr(schedule()); ++ leaf->il_lock = NULL; ++ } ++} ++ ++static void iam_leaf_fini(struct iam_leaf *leaf) ++{ ++ if (leaf->il_path != NULL) { ++ iam_leaf_unlock(leaf); ++ assert_inv(ergo(leaf->il_bh != NULL, iam_leaf_check(leaf))); ++ iam_leaf_ops(leaf)->fini(leaf); ++ if (leaf->il_bh) { ++ brelse(leaf->il_bh); ++ leaf->il_bh = NULL; ++ leaf->il_curidx = 0; ++ } ++ } ++} ++ ++static void iam_leaf_start(struct iam_leaf *folio) ++{ ++ iam_leaf_ops(folio)->start(folio); ++} ++ ++void iam_leaf_next(struct iam_leaf *folio) ++{ ++ iam_leaf_ops(folio)->next(folio); ++} ++ ++static void iam_leaf_rec_add(struct iam_leaf *leaf, const struct iam_key *key, ++ const struct iam_rec *rec) ++{ ++ iam_leaf_ops(leaf)->rec_add(leaf, key, rec); ++} ++ ++static void iam_rec_del(struct iam_leaf *leaf, int shift) ++{ ++ iam_leaf_ops(leaf)->rec_del(leaf, shift); ++} ++ ++int iam_leaf_at_end(const struct iam_leaf *leaf) ++{ ++ return iam_leaf_ops(leaf)->at_end(leaf); ++} ++ ++void iam_leaf_split(struct iam_leaf *l, struct buffer_head **bh, iam_ptr_t nr) ++{ ++ iam_leaf_ops(l)->split(l, bh, nr); ++} ++ ++int iam_leaf_can_add(const struct iam_leaf *l, ++ const struct iam_key *k, const struct iam_rec *r) ++{ ++ return iam_leaf_ops(l)->can_add(l, k, r); ++} ++ ++#if EXT3_INVARIANT_ON ++static int iam_leaf_check(struct iam_leaf *leaf) ++{ ++ return 1; ++#if 0 ++ struct iam_lentry *orig; ++ struct iam_path *path; ++ struct iam_container *bag; ++ struct iam_ikey *k0; ++ struct iam_ikey *k1; ++ int result; ++ int first; ++ ++ orig = leaf->il_at; ++ path = iam_leaf_path(leaf); ++ bag = iam_leaf_container(leaf); ++ ++ result = iam_leaf_ops(leaf)->init(leaf); ++ if (result != 0) ++ return result; ++ ++ first = 1; ++ iam_leaf_start(leaf); ++ k0 = iam_path_ikey(path, 0); ++ k1 = iam_path_ikey(path, 1); ++ while (!iam_leaf_at_end(leaf)) { ++ iam_ikeycpy(bag, k0, k1); ++ iam_ikeycpy(bag, k1, iam_leaf_ikey(leaf, k1)); ++ if (!first && iam_ikeycmp(bag, k0, k1) > 0) { ++ return 0; ++ } ++ first = 0; ++ iam_leaf_next(leaf); ++ } ++ leaf->il_at = orig; ++ return 1; ++#endif ++} ++#endif ++ ++static int iam_txn_dirty(handle_t *handle, ++ struct iam_path *path, struct buffer_head *bh) ++{ ++ int result; ++ ++ result = ext3_journal_dirty_metadata(handle, bh); ++ if (result != 0) ++ ext3_std_error(iam_path_obj(path)->i_sb, result); ++ return result; ++} ++ ++static int iam_txn_add(handle_t *handle, ++ struct iam_path *path, struct buffer_head *bh) ++{ ++ int result; ++ ++ result = ext3_journal_get_write_access(handle, bh); ++ if (result != 0) ++ ext3_std_error(iam_path_obj(path)->i_sb, result); ++ return result; ++} ++ ++/***********************************************************************/ ++/* iterator interface */ ++/***********************************************************************/ ++ ++static enum iam_it_state it_state(const struct iam_iterator *it) ++{ ++ return it->ii_state; ++} ++ ++/* ++ * Helper function returning scratch key. 
++ */ ++static struct iam_container *iam_it_container(const struct iam_iterator *it) ++{ ++ return it->ii_path.ip_container; ++} ++ ++static inline int it_keycmp(const struct iam_iterator *it, ++ const struct iam_key *k) ++{ ++ return iam_leaf_keycmp(&it->ii_path.ip_leaf, k); ++} ++ ++static inline int it_keyeq(const struct iam_iterator *it, ++ const struct iam_key *k) ++{ ++ return iam_leaf_keyeq(&it->ii_path.ip_leaf, k); ++} ++ ++static int it_ikeycmp(const struct iam_iterator *it, const struct iam_ikey *ik) ++{ ++ return iam_ikeycmp(it->ii_path.ip_container, ++ iam_leaf_ikey(&it->ii_path.ip_leaf, ++ iam_path_ikey(&it->ii_path, 0)), ik); ++} ++ ++static inline int it_at_rec(const struct iam_iterator *it) ++{ ++ return !iam_leaf_at_end(&it->ii_path.ip_leaf); ++} ++ ++static inline int it_before(const struct iam_iterator *it) ++{ ++ return it_state(it) == IAM_IT_SKEWED && it_at_rec(it); ++} ++ ++/* ++ * Helper wrapper around iam_it_get(): returns 0 (success) only when record ++ * with exactly the same key as asked is found. ++ */ ++static int iam_it_get_exact(struct iam_iterator *it, const struct iam_key *k) ++{ ++ int result; ++ ++ result = iam_it_get(it, k); ++ if (result > 0) ++ result = 0; ++ else if (result == 0) ++ /* ++ * Return -ENOENT if cursor is located above record with a key ++ * different from one specified, or in the empty leaf. ++ * ++ * XXX returning -ENOENT only works if iam_it_get() never ++ * returns -ENOENT as a legitimate error. ++ */ ++ result = -ENOENT; ++ return result; ++} ++ ++void iam_container_write_lock(struct iam_container *ic) ++{ ++ down_write(&ic->ic_sem); ++} ++ ++void iam_container_write_unlock(struct iam_container *ic) ++{ ++ up_write(&ic->ic_sem); ++} ++ ++void iam_container_read_lock(struct iam_container *ic) ++{ ++ down_read(&ic->ic_sem); ++} ++ ++void iam_container_read_unlock(struct iam_container *ic) ++{ ++ up_read(&ic->ic_sem); ++} ++ ++/* ++ * Initialize iterator to IAM_IT_DETACHED state. ++ * ++ * postcondition: it_state(it) == IAM_IT_DETACHED ++ */ ++int iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags, ++ struct iam_path_descr *pd) ++{ ++ memset(it, 0, sizeof *it); ++ it->ii_flags = flags; ++ it->ii_state = IAM_IT_DETACHED; ++ iam_path_init(&it->ii_path, c, pd); ++ return 0; ++} ++EXPORT_SYMBOL(iam_it_init); ++ ++/* ++ * Finalize iterator and release all resources. ++ * ++ * precondition: it_state(it) == IAM_IT_DETACHED ++ */ ++void iam_it_fini(struct iam_iterator *it) ++{ ++ assert_corr(it_state(it) == IAM_IT_DETACHED); ++ iam_path_fini(&it->ii_path); ++} ++EXPORT_SYMBOL(iam_it_fini); ++ ++/* ++ * Performs tree top-to-bottom traversal starting from root, and loads leaf ++ * node. ++ */ ++static int iam_path_lookup(struct iam_path *path, int index) ++{ ++ struct iam_container *c; ++ struct iam_descr *descr; ++ struct iam_leaf *leaf; ++ int result; ++ ++ c = path->ip_container; ++ leaf = &path->ip_leaf; ++ descr = iam_path_descr(path); ++ result = dx_lookup_lock(path, &leaf->il_lock, DLT_WRITE); ++ assert_inv(iam_path_check(path)); ++ do_corr(schedule()); ++ if (result == 0) { ++ result = iam_leaf_load(path); ++ assert_inv(ergo(result == 0, iam_leaf_check(leaf))); ++ if (result == 0) { ++ do_corr(schedule()); ++ if (index) ++ result = iam_leaf_ops(leaf)-> ++ ilookup(leaf, path->ip_ikey_target); ++ else ++ result = iam_leaf_ops(leaf)-> ++ lookup(leaf, path->ip_key_target); ++ do_corr(schedule()); ++ } ++ if (result < 0) ++ iam_leaf_unlock(leaf); ++ } ++ return result; ++} ++ ++/* ++ * Common part of iam_it_{i,}get(). 
++ */ ++static int __iam_it_get(struct iam_iterator *it, int index) ++{ ++ int result; ++ assert_corr(it_state(it) == IAM_IT_DETACHED); ++ ++ result = iam_path_lookup(&it->ii_path, index); ++ if (result >= 0) { ++ int collision; ++ ++ collision = result & IAM_LOOKUP_LAST; ++ switch (result & ~IAM_LOOKUP_LAST) { ++ case IAM_LOOKUP_EXACT: ++ result = +1; ++ it->ii_state = IAM_IT_ATTACHED; ++ break; ++ case IAM_LOOKUP_OK: ++ result = 0; ++ it->ii_state = IAM_IT_ATTACHED; ++ break; ++ case IAM_LOOKUP_BEFORE: ++ case IAM_LOOKUP_EMPTY: ++ result = 0; ++ it->ii_state = IAM_IT_SKEWED; ++ break; ++ default: ++ assert(0); ++ } ++ result |= collision; ++ } ++ /* ++ * See iam_it_get_exact() for explanation. ++ */ ++ assert_corr(result != -ENOENT); ++ return result; ++} ++ ++/* ++ * Correct hash, but not the same key was found, iterate through hash ++ * collision chain, looking for correct record. ++ */ ++static int iam_it_collision(struct iam_iterator *it) ++{ ++ int result; ++ ++ assert(ergo(it_at_rec(it), !it_keyeq(it, it->ii_path.ip_key_target))); ++ ++ while ((result = iam_it_next(it)) == 0) { ++ do_corr(schedule()); ++ if (it_ikeycmp(it, it->ii_path.ip_ikey_target) != 0) ++ return -ENOENT; ++ if (it_keyeq(it, it->ii_path.ip_key_target)) ++ return 0; ++ } ++ return result; ++} ++ ++/* ++ * Attach iterator. After successful completion, @it points to record with ++ * least key not larger than @k. ++ * ++ * Return value: 0: positioned on existing record, ++ * +ve: exact position found, ++ * -ve: error. ++ * ++ * precondition: it_state(it) == IAM_IT_DETACHED ++ * postcondition: ergo(result == 0 && it_state(it) == IAM_IT_ATTACHED, ++ * it_keycmp(it, k) <= 0) ++ */ ++int iam_it_get(struct iam_iterator *it, const struct iam_key *k) ++{ ++ int result; ++ assert_corr(it_state(it) == IAM_IT_DETACHED); ++ ++ it->ii_path.ip_ikey_target = NULL; ++ it->ii_path.ip_key_target = k; ++ ++ result = __iam_it_get(it, 0); ++ ++ if (result == IAM_LOOKUP_LAST) { ++ result = iam_it_collision(it); ++ if (result != 0) { ++ iam_it_put(it); ++ iam_it_fini(it); ++ result = __iam_it_get(it, 0); ++ } else ++ result = +1; ++ } ++ if (result > 0) ++ result &= ~IAM_LOOKUP_LAST; ++ ++ assert_corr(ergo(result > 0, it_keycmp(it, k) == 0)); ++ assert_corr(ergo(result == 0 && it_state(it) == IAM_IT_ATTACHED, ++ it_keycmp(it, k) <= 0)); ++ return result; ++} ++EXPORT_SYMBOL(iam_it_get); ++ ++/* ++ * Attach iterator by index key. ++ */ ++static int iam_it_iget(struct iam_iterator *it, const struct iam_ikey *k) ++{ ++ assert_corr(it_state(it) == IAM_IT_DETACHED); ++ ++ it->ii_path.ip_ikey_target = k; ++ return __iam_it_get(it, 1) & ~IAM_LOOKUP_LAST; ++} ++ ++/* ++ * Attach iterator, and assure it points to the record (not skewed). ++ * ++ * Return value: 0: positioned on existing record, ++ * +ve: exact position found, ++ * -ve: error. ++ * ++ * precondition: it_state(it) == IAM_IT_DETACHED && ++ * !(it->ii_flags&IAM_IT_WRITE) ++ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED) ++ */ ++int iam_it_get_at(struct iam_iterator *it, const struct iam_key *k) ++{ ++ int result; ++ assert_corr(it_state(it) == IAM_IT_DETACHED && ++ !(it->ii_flags&IAM_IT_WRITE)); ++ result = iam_it_get(it, k); ++ if (result == 0) { ++ if (it_state(it) != IAM_IT_ATTACHED) { ++ assert_corr(it_state(it) == IAM_IT_SKEWED); ++ result = iam_it_next(it); ++ } ++ } ++ assert_corr(ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED)); ++ return result; ++} ++EXPORT_SYMBOL(iam_it_get_at); ++ ++/* ++ * Duplicates iterator. 
++ *
++ * postcondition: it_state(dst) == it_state(src) &&
++ *                iam_it_container(dst) == iam_it_container(src) &&
++ *                dst->ii_flags == src->ii_flags &&
++ *                ergo(it_state(src) == IAM_IT_ATTACHED,
++ *                     iam_it_rec_get(dst) == iam_it_rec_get(src) &&
++ *                     iam_it_key_get(dst) == iam_it_key_get(src))
++ */
++void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src)
++{
++        dst->ii_flags = src->ii_flags;
++        dst->ii_state = src->ii_state;
++        /* XXX not yet. iam_path_dup(&dst->ii_path, &src->ii_path); */
++        /*
++         * XXX: duplicate lock.
++         */
++        assert_corr(it_state(dst) == it_state(src));
++        assert_corr(iam_it_container(dst) == iam_it_container(src));
++        assert_corr(dst->ii_flags == src->ii_flags);
++        assert_corr(ergo(it_state(src) == IAM_IT_ATTACHED,
++                         iam_it_rec_get(dst) == iam_it_rec_get(src) &&
++                         iam_it_key_get(dst) == iam_it_key_get(src)));
++
++}
++
++/*
++ * Detach iterator. Does nothing if the iterator is already in detached state.
++ *
++ * postcondition: it_state(it) == IAM_IT_DETACHED
++ */
++void iam_it_put(struct iam_iterator *it)
++{
++        if (it->ii_state != IAM_IT_DETACHED) {
++                it->ii_state = IAM_IT_DETACHED;
++                iam_leaf_fini(&it->ii_path.ip_leaf);
++        }
++}
++EXPORT_SYMBOL(iam_it_put);
++
++static struct iam_ikey *iam_it_ikey_get(const struct iam_iterator *it,
++                                        struct iam_ikey *ikey);
++/*
++ * Move iterator one record right.
++ *
++ * Return value: 0: success,
++ *              +1: end of container reached
++ *             -ve: error
++ *
++ * precondition: (it_state(it) == IAM_IT_ATTACHED ||
++ *                it_state(it) == IAM_IT_SKEWED) && it->ii_flags&IAM_IT_MOVE
++ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED) &&
++ *                ergo(result > 0, it_state(it) == IAM_IT_DETACHED)
++ */
++int iam_it_next(struct iam_iterator *it)
++{
++        int result;
++        struct iam_path *path;
++        struct iam_leaf *leaf;
++        struct inode *obj;
++        do_corr(struct iam_ikey *ik_orig);
++
++        /* assert_corr(it->ii_flags&IAM_IT_MOVE); */
++        assert_corr(it_state(it) == IAM_IT_ATTACHED ||
++                    it_state(it) == IAM_IT_SKEWED);
++
++        path = &it->ii_path;
++        leaf = &path->ip_leaf;
++        obj = iam_path_obj(path);
++
++        assert_corr(iam_leaf_is_locked(leaf));
++
++        result = 0;
++        do_corr(ik_orig = it_at_rec(it) ?
++                iam_it_ikey_get(it, iam_path_ikey(path, 2)) : NULL);
++        if (it_before(it)) {
++                assert_corr(!iam_leaf_at_end(leaf));
++                it->ii_state = IAM_IT_ATTACHED;
++        } else {
++                if (!iam_leaf_at_end(leaf))
++                        /* advance within leaf node */
++                        iam_leaf_next(leaf);
++                /*
++                 * multiple iterations may be necessary due to empty leaves.
++                 */
++                while (result == 0 && iam_leaf_at_end(leaf)) {
++                        do_corr(schedule());
++                        /* advance index portion of the path */
++                        result = iam_index_next(iam_it_container(it), path);
++                        assert_corr(iam_leaf_is_locked(leaf));
++                        if (result == 1) {
++                                struct dynlock_handle *lh;
++                                lh = dx_lock_htree(obj, path->ip_frame->leaf,
++                                                   DLT_WRITE);
++                                if (lh != NULL) {
++                                        iam_leaf_fini(leaf);
++                                        leaf->il_lock = lh;
++                                        result = iam_leaf_load(path);
++                                        if (result == 0)
++                                                iam_leaf_start(leaf);
++                                } else
++                                        result = -ENOMEM;
++                        } else if (result == 0)
++                                /* end of container reached */
++                                result = +1;
++                        if (result != 0)
++                                iam_it_put(it);
++                }
++                if (result == 0)
++                        it->ii_state = IAM_IT_ATTACHED;
++        }
++        assert_corr(ergo(result == 0, it_state(it) == IAM_IT_ATTACHED));
++        assert_corr(ergo(result > 0, it_state(it) == IAM_IT_DETACHED));
++        assert_corr(ergo(result == 0 && ik_orig != NULL,
++                         it_ikeycmp(it, ik_orig) >= 0));
++        return result;
++}
++EXPORT_SYMBOL(iam_it_next);
++
++/*
++ * Return pointer to the record under iterator.
++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED && it_at_rec(it) ++ * postcondition: it_state(it) == IAM_IT_ATTACHED ++ */ ++struct iam_rec *iam_it_rec_get(const struct iam_iterator *it) ++{ ++ assert_corr(it_state(it) == IAM_IT_ATTACHED); ++ assert_corr(it_at_rec(it)); ++ return iam_leaf_rec(&it->ii_path.ip_leaf); ++} ++EXPORT_SYMBOL(iam_it_rec_get); ++ ++static void iam_it_reccpy(struct iam_iterator *it, const struct iam_rec *r) ++{ ++ struct iam_leaf *folio; ++ ++ folio = &it->ii_path.ip_leaf; ++ iam_leaf_ops(folio)->rec_set(folio, r); ++} ++ ++/* ++ * Replace contents of record under iterator. ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED && ++ * it->ii_flags&IAM_IT_WRITE ++ * postcondition: it_state(it) == IAM_IT_ATTACHED && ++ * ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...)) ++ */ ++int iam_it_rec_set(handle_t *h, ++ struct iam_iterator *it, const struct iam_rec *r) ++{ ++ int result; ++ struct iam_path *path; ++ struct buffer_head *bh; ++ ++ assert_corr(it_state(it) == IAM_IT_ATTACHED && ++ it->ii_flags&IAM_IT_WRITE); ++ assert_corr(it_at_rec(it)); ++ ++ path = &it->ii_path; ++ bh = path->ip_leaf.il_bh; ++ result = iam_txn_add(h, path, bh); ++ if (result == 0) { ++ iam_it_reccpy(it, r); ++ result = iam_txn_dirty(h, path, bh); ++ } ++ return result; ++} ++EXPORT_SYMBOL(iam_it_rec_set); ++ ++/* ++ * Return pointer to the index key under iterator. ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED || ++ * it_state(it) == IAM_IT_SKEWED ++ */ ++static struct iam_ikey *iam_it_ikey_get(const struct iam_iterator *it, ++ struct iam_ikey *ikey) ++{ ++ assert_corr(it_state(it) == IAM_IT_ATTACHED || ++ it_state(it) == IAM_IT_SKEWED); ++ assert_corr(it_at_rec(it)); ++ return iam_leaf_ikey(&it->ii_path.ip_leaf, ikey); ++} ++ ++/* ++ * Return pointer to the key under iterator. ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED || ++ * it_state(it) == IAM_IT_SKEWED ++ */ ++struct iam_key *iam_it_key_get(const struct iam_iterator *it) ++{ ++ assert_corr(it_state(it) == IAM_IT_ATTACHED || ++ it_state(it) == IAM_IT_SKEWED); ++ assert_corr(it_at_rec(it)); ++ return iam_leaf_key(&it->ii_path.ip_leaf); ++} ++EXPORT_SYMBOL(iam_it_key_get); ++ ++/* ++ * Return size of key under iterator (in bytes) ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED || ++ * it_state(it) == IAM_IT_SKEWED ++ */ ++int iam_it_key_size(const struct iam_iterator *it) ++{ ++ assert_corr(it_state(it) == IAM_IT_ATTACHED || ++ it_state(it) == IAM_IT_SKEWED); ++ assert_corr(it_at_rec(it)); ++ return iam_leaf_key_size(&it->ii_path.ip_leaf); ++} ++EXPORT_SYMBOL(iam_it_key_size); ++ ++/* ++ * Insertion of new record. Interaction with jbd during non-trivial case (when ++ * split happens) is as following: ++ * ++ * - new leaf node is involved into transaction by ext3_append(); ++ * ++ * - old leaf node is involved into transaction by iam_add_rec(); ++ * ++ * - leaf where insertion point ends in, is marked dirty by iam_add_rec(); ++ * ++ * - leaf without insertion point is marked dirty (as @new_leaf) by ++ * iam_new_leaf(); ++ * ++ * - split index nodes are involved into transaction and marked dirty by ++ * split_index_node(). ++ * ++ * - "safe" index node, which is no split, but where new pointer is inserted ++ * is involved into transaction and marked dirty by split_index_node(). ++ * ++ * - index node where pointer to new leaf is inserted is involved into ++ * transaction by split_index_node() and marked dirty by iam_add_rec(). ++ * ++ * - inode is marked dirty by iam_add_rec(). 
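++ *
++ * A minimal caller-side sketch (error handling elided; the credit count is
++ * only illustrative, cf. IAM_INSERT_CREDITS in iam-uapi.c), showing how the
++ * pieces above are driven from a single transaction:
++ *
++ *     handle = ext3_journal_start(c->ic_object, IAM_INSERT_CREDITS);
++ *     err = iam_insert(handle, c, k, r, ipd);
++ *     ext3_journal_stop(handle);
++ *
++ * The credits passed to ext3_journal_start() are assumed to cover the worst
++ * case listed above: a split of every index level plus the old leaf, the new
++ * leaf, and the inode.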
++ * ++ */ ++ ++static int iam_new_leaf(handle_t *handle, struct iam_leaf *leaf) ++{ ++ int err; ++ iam_ptr_t blknr; ++ struct buffer_head *new_leaf; ++ struct buffer_head *old_leaf; ++ struct iam_container *c; ++ struct inode *obj; ++ struct iam_path *path; ++ ++ assert_inv(iam_leaf_check(leaf)); ++ ++ c = iam_leaf_container(leaf); ++ path = leaf->il_path; ++ ++ obj = c->ic_object; ++ new_leaf = ext3_append(handle, obj, (__u32 *)&blknr, &err); ++ do_corr(schedule()); ++ if (new_leaf != NULL) { ++ struct dynlock_handle *lh; ++ ++ lh = dx_lock_htree(obj, blknr, DLT_WRITE); ++ do_corr(schedule()); ++ if (lh != NULL) { ++ iam_leaf_ops(leaf)->init_new(c, new_leaf); ++ do_corr(schedule()); ++ old_leaf = leaf->il_bh; ++ iam_leaf_split(leaf, &new_leaf, blknr); ++ if (old_leaf != leaf->il_bh) { ++ /* ++ * Switched to the new leaf. ++ */ ++ iam_leaf_unlock(leaf); ++ leaf->il_lock = lh; ++ path->ip_frame->leaf = blknr; ++ } else ++ dx_unlock_htree(obj, lh); ++ do_corr(schedule()); ++ err = iam_txn_dirty(handle, path, new_leaf); ++ brelse(new_leaf); ++ if (err == 0) ++ err = ext3_mark_inode_dirty(handle, obj); ++ do_corr(schedule()); ++ } else ++ err = -ENOMEM; ++ } ++ assert_inv(iam_leaf_check(leaf)); ++ assert_inv(iam_leaf_check(&iam_leaf_path(leaf)->ip_leaf)); ++ assert_inv(iam_path_check(iam_leaf_path(leaf))); ++ return err; ++} ++ ++static int iam_add_rec(handle_t *handle, struct iam_iterator *it, ++ struct iam_path *path, ++ const struct iam_key *k, const struct iam_rec *r) ++{ ++ int err; ++ struct iam_leaf *leaf; ++ ++ leaf = &path->ip_leaf; ++ assert_inv(iam_leaf_check(leaf)); ++ assert_inv(iam_path_check(path)); ++ err = iam_txn_add(handle, path, leaf->il_bh); ++ if (err == 0) { ++ do_corr(schedule()); ++ if (!iam_leaf_can_add(leaf, k, r)) { ++ struct dynlock_handle *lh = NULL; ++ ++ do { ++ assert_corr(lh == NULL); ++ do_corr(schedule()); ++ err = split_index_node(handle, path, &lh); ++ if (err == -EAGAIN) { ++ assert_corr(lh == NULL); ++ ++ iam_path_fini(path); ++ it->ii_state = IAM_IT_DETACHED; ++ ++ do_corr(schedule()); ++ err = iam_it_get_exact(it, k); ++ if (err == -ENOENT) ++ err = +1; /* repeat split */ ++ else if (err == 0) ++ err = -EEXIST; ++ } ++ } while (err > 0); ++ assert_inv(iam_path_check(path)); ++ if (err == 0) { ++ assert_corr(lh != NULL); ++ do_corr(schedule()); ++ err = iam_new_leaf(handle, leaf); ++ if (err == 0) ++ err = iam_txn_dirty(handle, path, ++ path->ip_frame->bh); ++ } ++ dx_unlock_htree(iam_path_obj(path), lh); ++ do_corr(schedule()); ++ } ++ if (err == 0) { ++ iam_leaf_rec_add(leaf, k, r); ++ err = iam_txn_dirty(handle, path, leaf->il_bh); ++ } ++ } ++ assert_inv(iam_leaf_check(leaf)); ++ assert_inv(iam_leaf_check(&path->ip_leaf)); ++ assert_inv(iam_path_check(path)); ++ return err; ++} ++ ++/* ++ * Insert new record with key @k and contents from @r, shifting records to the ++ * right. On success, iterator is positioned on the newly inserted record. 
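++ *
++ * Callers that only need a one-shot insertion do not have to drive the
++ * iterator themselves; the iam_insert() wrapper below combines
++ * iam_it_get_exact() with this function.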
++ * ++ * precondition: it->ii_flags&IAM_IT_WRITE && ++ * (it_state(it) == IAM_IT_ATTACHED || ++ * it_state(it) == IAM_IT_SKEWED) && ++ * ergo(it_state(it) == IAM_IT_ATTACHED, ++ * it_keycmp(it, k) <= 0) && ++ * ergo(it_before(it), it_keycmp(it, k) > 0)); ++ * postcondition: ergo(result == 0, ++ * it_state(it) == IAM_IT_ATTACHED && ++ * it_keycmp(it, k) == 0 && ++ * !memcmp(iam_it_rec_get(it), r, ...)) ++ */ ++int iam_it_rec_insert(handle_t *h, struct iam_iterator *it, ++ const struct iam_key *k, const struct iam_rec *r) ++{ ++ int result; ++ struct iam_path *path; ++ ++ path = &it->ii_path; ++ ++ assert_corr(it->ii_flags&IAM_IT_WRITE); ++ assert_corr(it_state(it) == IAM_IT_ATTACHED || ++ it_state(it) == IAM_IT_SKEWED); ++ assert_corr(ergo(it_state(it) == IAM_IT_ATTACHED, ++ it_keycmp(it, k) <= 0)); ++ assert_corr(ergo(it_before(it), it_keycmp(it, k) > 0)); ++ result = iam_add_rec(h, it, path, k, r); ++ if (result == 0) ++ it->ii_state = IAM_IT_ATTACHED; ++ assert_corr(ergo(result == 0, ++ it_state(it) == IAM_IT_ATTACHED && ++ it_keycmp(it, k) == 0)); ++ return result; ++} ++EXPORT_SYMBOL(iam_it_rec_insert); ++ ++/* ++ * Delete record under iterator. ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED && ++ * it->ii_flags&IAM_IT_WRITE && ++ * it_at_rec(it) ++ * postcondition: it_state(it) == IAM_IT_ATTACHED || ++ * it_state(it) == IAM_IT_DETACHED ++ */ ++int iam_it_rec_delete(handle_t *h, struct iam_iterator *it) ++{ ++ int result; ++ struct iam_leaf *leaf; ++ struct iam_path *path; ++ ++ assert_corr(it_state(it) == IAM_IT_ATTACHED && ++ it->ii_flags&IAM_IT_WRITE); ++ assert_corr(it_at_rec(it)); ++ ++ path = &it->ii_path; ++ leaf = &path->ip_leaf; ++ ++ assert_inv(iam_leaf_check(leaf)); ++ assert_inv(iam_path_check(path)); ++ ++ result = iam_txn_add(h, path, leaf->il_bh); ++ /* ++ * no compaction for now. ++ */ ++ if (result == 0) { ++ iam_rec_del(leaf, it->ii_flags&IAM_IT_MOVE); ++ result = iam_txn_dirty(h, path, leaf->il_bh); ++ if (result == 0 && iam_leaf_at_end(leaf) && ++ it->ii_flags&IAM_IT_MOVE) { ++ result = iam_it_next(it); ++ if (result > 0) ++ result = 0; ++ } ++ } ++ assert_inv(iam_leaf_check(leaf)); ++ assert_inv(iam_path_check(path)); ++ assert_corr(it_state(it) == IAM_IT_ATTACHED || ++ it_state(it) == IAM_IT_DETACHED); ++ return result; ++} ++EXPORT_SYMBOL(iam_it_rec_delete); ++ ++/* ++ * Convert iterator to cookie. ++ * ++ * precondition: it_state(it) == IAM_IT_ATTACHED && ++ * iam_path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t) ++ * postcondition: it_state(it) == IAM_IT_ATTACHED ++ */ ++iam_pos_t iam_it_store(const struct iam_iterator *it) ++{ ++ iam_pos_t result; ++ ++ assert_corr(it_state(it) == IAM_IT_ATTACHED); ++ assert_corr(it_at_rec(it)); ++ assert_corr(iam_it_container(it)->ic_descr->id_ikey_size <= ++ sizeof result); ++ ++ result = 0; ++ return *(iam_pos_t *)iam_it_ikey_get(it, (void *)&result); ++} ++EXPORT_SYMBOL(iam_it_store); ++ ++/* ++ * Restore iterator from cookie. 
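++ *
++ * The cookie is simply the record's index key widened to iam_pos_t (see
++ * iam_it_store() above), so an iam_it_store()/iam_it_load() pair gives a
++ * restartable cursor, e.g. for readdir()-style iteration over a container.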
++ * ++ * precondition: it_state(it) == IAM_IT_DETACHED && it->ii_flags&IAM_IT_MOVE && ++ * iam_path_descr(it->ii_path)->id_key_size <= sizeof(iam_pos_t) ++ * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED && ++ * iam_it_store(it) == pos) ++ */ ++int iam_it_load(struct iam_iterator *it, iam_pos_t pos) ++{ ++ assert_corr(it_state(it) == IAM_IT_DETACHED && ++ it->ii_flags&IAM_IT_MOVE); ++ assert_corr(iam_it_container(it)->ic_descr->id_ikey_size <= sizeof pos); ++ return iam_it_iget(it, (struct iam_ikey *)&pos); ++} ++EXPORT_SYMBOL(iam_it_load); ++ ++/***********************************************************************/ ++/* invariants */ ++/***********************************************************************/ ++ ++static inline int ptr_inside(void *base, size_t size, void *ptr) ++{ ++ return (base <= ptr) && (ptr < base + size); ++} ++ ++int iam_frame_invariant(struct iam_frame *f) ++{ ++ return ++ (f->bh != NULL && ++ f->bh->b_data != NULL && ++ ptr_inside(f->bh->b_data, f->bh->b_size, f->entries) && ++ ptr_inside(f->bh->b_data, f->bh->b_size, f->at) && ++ f->entries <= f->at); ++} ++int iam_leaf_invariant(struct iam_leaf *l) ++{ ++ return ++ l->il_bh != NULL && ++ l->il_bh->b_data != NULL && ++ ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_entries) && ++ ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_at) && ++ l->il_entries <= l->il_at; ++} ++ ++int iam_path_invariant(struct iam_path *p) ++{ ++ int i; ++ ++ if (p->ip_container == NULL || ++ p->ip_indirect < 0 || p->ip_indirect > DX_MAX_TREE_HEIGHT - 1 || ++ p->ip_frame != p->ip_frames + p->ip_indirect || ++ !iam_leaf_invariant(&p->ip_leaf)) ++ return 0; ++ for (i = 0; i < ARRAY_SIZE(p->ip_frames); ++i) { ++ if (i <= p->ip_indirect) { ++ if (!iam_frame_invariant(&p->ip_frames[i])) ++ return 0; ++ } ++ } ++ return 1; ++} ++ ++int iam_it_invariant(struct iam_iterator *it) ++{ ++ return ++ (it->ii_state == IAM_IT_DETACHED || ++ it->ii_state == IAM_IT_ATTACHED || ++ it->ii_state == IAM_IT_SKEWED) && ++ !(it->ii_flags & ~(IAM_IT_MOVE | IAM_IT_WRITE)) && ++ ergo(it->ii_state == IAM_IT_ATTACHED || ++ it->ii_state == IAM_IT_SKEWED, ++ iam_path_invariant(&it->ii_path) && ++ equi(it_at_rec(it), it->ii_state == IAM_IT_SKEWED)); ++} ++ ++/* ++ * Search container @c for record with key @k. If record is found, its data ++ * are moved into @r. ++ * ++ * Return values: 0: found, -ENOENT: not-found, -ve: error ++ */ ++int iam_lookup(struct iam_container *c, const struct iam_key *k, ++ struct iam_rec *r, struct iam_path_descr *pd) ++{ ++ struct iam_iterator it; ++ int result; ++ ++ iam_it_init(&it, c, 0, pd); ++ ++ result = iam_it_get_exact(&it, k); ++ if (result == 0) ++ /* ++ * record with required key found, copy it into user buffer ++ */ ++ iam_reccpy(&it.ii_path.ip_leaf, r); ++ iam_it_put(&it); ++ iam_it_fini(&it); ++ return result; ++} ++EXPORT_SYMBOL(iam_lookup); ++ ++/* ++ * Insert new record @r with key @k into container @c (within context of ++ * transaction @h). ++ * ++ * Return values: 0: success, -ve: error, including -EEXIST when record with ++ * given key is already present. 
++ * ++ * postcondition: ergo(result == 0 || result == -EEXIST, ++ * iam_lookup(c, k, r2) > 0; ++ */ ++int iam_insert(handle_t *h, struct iam_container *c, const struct iam_key *k, ++ const struct iam_rec *r, struct iam_path_descr *pd) ++{ ++ struct iam_iterator it; ++ int result; ++ ++ iam_it_init(&it, c, IAM_IT_WRITE, pd); ++ ++ result = iam_it_get_exact(&it, k); ++ if (result == -ENOENT) ++ result = iam_it_rec_insert(h, &it, k, r); ++ else if (result == 0) ++ result = -EEXIST; ++ iam_it_put(&it); ++ iam_it_fini(&it); ++ return result; ++} ++EXPORT_SYMBOL(iam_insert); ++ ++/* ++ * Update record with the key @k in container @c (within context of ++ * transaction @h), new record is given by @r. ++ * ++ * Return values: 0: success, -ve: error, including -ENOENT if no record with ++ * the given key found. ++ */ ++int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k, ++ const struct iam_rec *r, struct iam_path_descr *pd) ++{ ++ struct iam_iterator it; ++ int result; ++ ++ iam_it_init(&it, c, IAM_IT_WRITE, pd); ++ ++ result = iam_it_get_exact(&it, k); ++ if (result == 0) ++ iam_it_rec_set(h, &it, r); ++ iam_it_put(&it); ++ iam_it_fini(&it); ++ return result; ++} ++EXPORT_SYMBOL(iam_update); ++ ++/* ++ * Delete existing record with key @k. ++ * ++ * Return values: 0: success, -ENOENT: not-found, -ve: other error. ++ * ++ * postcondition: ergo(result == 0 || result == -ENOENT, ++ * !iam_lookup(c, k, *)); ++ */ ++int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k, ++ struct iam_path_descr *pd) ++{ ++ struct iam_iterator it; ++ int result; ++ ++ iam_it_init(&it, c, IAM_IT_WRITE, pd); ++ ++ result = iam_it_get_exact(&it, k); ++ if (result == 0) ++ iam_it_rec_delete(h, &it); ++ iam_it_put(&it); ++ iam_it_fini(&it); ++ return result; ++} ++EXPORT_SYMBOL(iam_delete); ++ +Index: linux-stage/fs/ext3/iam-uapi.c +=================================================================== +--- linux-stage.orig/fs/ext3/iam-uapi.c 2006-06-16 16:07:58.000000000 +0300 ++++ linux-stage/fs/ext3/iam-uapi.c 2007-10-21 17:32:28.000000000 +0300 +@@ -0,0 +1,367 @@ ++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- ++ * vim:expandtab:shiftwidth=8:tabstop=8: ++ * ++ * iam_uapi.c ++ * User-level interface to iam (ioctl based) ++ * ++ * Copyright (c) 2006 Cluster File Systems, Inc. ++ * Author: Nikita Danilov ++ * ++ * This file is part of the Lustre file system, http://www.lustre.org ++ * Lustre is a trademark of Cluster File Systems, Inc. ++ * ++ * You may have signed or agreed to another license before downloading ++ * this software. If so, you are bound by the terms and conditions ++ * of that agreement, and the following does not apply to you. See the ++ * LICENSE file included with this distribution for more information. ++ * ++ * If you did not agree to a different license, then this copy of Lustre ++ * is open source software; you can redistribute it and/or modify it ++ * under the terms of version 2 of the GNU General Public License as ++ * published by the Free Software Foundation. ++ * ++ * In either case, Lustre is distributed in the hope that it will be ++ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty ++ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * license text for more details. 
++ */ ++ ++#include ++#include ++/* ext3_error() */ ++#include ++#include ++ ++#include ++#include ++ ++ ++struct iam_private_info { ++ struct dir_private_info ipi_dir; /* has to be first */ ++ struct iam_container ipi_bag; ++ struct iam_descr ipi_descr; ++ struct iam_iterator ipi_it; ++ struct iam_path_descr *ipi_ipd; ++ char ipi_ipd_area[DX_IPD_MAX_SIZE]; ++}; ++ ++enum { ++ IAM_INSERT_CREDITS = 20 ++}; ++ ++static struct iam_private_info *get_ipi(struct file *filp) ++{ ++ return filp->private_data; ++} ++ ++static int iam_uapi_it(int cmd, struct inode *inode, ++ struct file *filp, struct iam_uapi_it *itop) ++{ ++ struct iam_private_info *ipi; ++ struct iam_iterator *it; ++ enum iam_it_state st; ++ int result = 0; ++ ++ ipi = get_ipi(filp); ++ it = &ipi->ipi_it; ++ st = it->ii_state; ++ switch (cmd) { ++ case IAM_IOC_IT_START: ++ result = iam_it_init(it, &ipi->ipi_bag, ++ IAM_IT_MOVE, ipi->ipi_ipd); ++ if (result == 0) ++ result = iam_it_get(it, itop->iui_op.iul_key); ++ break; ++ case IAM_IOC_IT_NEXT: ++ if (st == IAM_IT_ATTACHED || st == IAM_IT_SKEWED) ++ result = iam_it_next(it); ++ else ++ result = -EBUSY; ++ break; ++ case IAM_IOC_IT_STOP: ++ iam_it_put(it); ++ iam_it_fini(it); ++ result = 0; ++ break; ++ } ++ st = it->ii_state; ++ if (st == IAM_IT_ATTACHED || st == IAM_IT_SKEWED) ++ memcpy(itop->iui_op.iul_key, iam_it_key_get(it), ++ iam_it_key_size(it)); ++ if (st == IAM_IT_ATTACHED) ++ iam_reccpy(&it->ii_path.ip_leaf, itop->iui_op.iul_rec); ++ itop->iui_state = st; ++ return result; ++} ++ ++static int iam_uapi_op(int cmd, struct inode *inode, ++ struct file *filp, struct iam_uapi_op *op) ++{ ++ int result; ++ struct iam_private_info *ipi; ++ ++ ipi = get_ipi(filp); ++ if (cmd == IAM_IOC_INSERT || cmd == IAM_IOC_DELETE) { ++ handle_t *h; ++ ++ h = ext3_journal_start(inode, IAM_INSERT_CREDITS); ++ if (!IS_ERR(h)) { ++ if (cmd == IAM_IOC_INSERT) ++ result = iam_insert(h, &ipi->ipi_bag, ++ op->iul_key, ++ op->iul_rec, ipi->ipi_ipd); ++ else ++ result = iam_delete(h, &ipi->ipi_bag, ++ op->iul_key, ipi->ipi_ipd); ++ ext3_journal_stop(h); ++ } else { ++ result = PTR_ERR(h); ++ ext3_std_error(inode->i_sb, result); ++ } ++ } else ++ result = iam_lookup(&ipi->ipi_bag, op->iul_key, ++ op->iul_rec, ipi->ipi_ipd); ++ return result; ++} ++ ++struct iam_private_info *ext3_iam_alloc_info(int flags) ++{ ++ struct iam_private_info *info; ++ ++ info = kmalloc(sizeof *info, flags); ++ if (info != NULL) ++ memset(info, 0, sizeof *info); ++ return info; ++} ++ ++void ext3_iam_release_info(struct iam_private_info *info) ++{ ++ iam_it_put(&info->ipi_it); ++ iam_it_fini(&info->ipi_it); ++ if (info->ipi_ipd != NULL) ++ info->ipi_bag.ic_descr->id_ops->id_ipd_free(info->ipi_ipd); ++ iam_container_fini(&info->ipi_bag); ++} ++ ++void ext3_iam_release(struct file *filp, struct inode *inode) ++{ ++ struct iam_private_info *info; ++ ++ info = filp->private_data; ++ ext3_iam_release_info(info); ++ ++ kfree(info); ++ EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; ++} ++ ++static int iam_uapi_init(struct inode *inode, ++ struct file *filp, struct iam_uapi_info *ua) ++{ ++ int result; ++ struct iam_private_info *info; ++ ++ info = ext3_iam_alloc_info(GFP_KERNEL); ++ if (info != NULL) { ++ struct iam_container *bag; ++ struct iam_descr *des; ++ ++ bag = &info->ipi_bag; ++ des = &info->ipi_descr; ++ result = iam_container_init(bag, des, inode); ++ if (result == 0) { ++ result = iam_container_setup(bag); ++ if (result == 0) { ++ /* ++ * Container setup might change ->ic_descr ++ */ ++ des = bag->ic_descr; ++ info->ipi_ipd 
= des->id_ops-> ++ id_ipd_alloc(bag, info->ipi_ipd_area); ++ if (info->ipi_ipd != NULL) { ++ filp->private_data = info; ++ EXT3_I(inode)->i_flags |= EXT3_INDEX_FL; ++ } else ++ result = -ENOMEM; ++ } ++ } ++ } else ++ result = -ENOMEM; ++ return result; ++} ++ ++ ++static int getua(struct iam_uapi_info *ua, unsigned long arg) ++{ ++ if (copy_from_user(ua, (struct iam_uapi_info __user *)arg, sizeof *ua)) ++ return -EFAULT; ++ else ++ return 0; ++} ++ ++static int putua(struct iam_uapi_info *ua, unsigned long arg) ++{ ++ if (copy_to_user((struct iam_uapi_info __user *)arg, ua, sizeof *ua)) ++ return -EFAULT; ++ else ++ return 0; ++} ++ ++enum outop_t { ++ KEY = 1 << 0, ++ REC = 1 << 1, ++ STATE = 1 << 2 ++}; ++ ++static int outop(struct iam_uapi_op *op, struct iam_uapi_op *uop, ++ struct iam_descr *des, enum outop_t opt) ++{ ++ int result; ++ ++ if (((opt & REC) && copy_to_user((void __user *)uop->iul_rec, ++ op->iul_rec, des->id_rec_size)) || ++ ((opt & KEY) && copy_to_user((void __user *)uop->iul_key, ++ op->iul_key, des->id_key_size))) ++ result = -EFAULT; ++ else ++ result = 0; ++ return result; ++} ++ ++static void putop(struct iam_uapi_op *op) ++{ ++ kfree(op->iul_key); ++ kfree(op->iul_rec); ++} ++ ++static int getop(struct iam_uapi_op *op, struct iam_uapi_op *uop, ++ struct iam_descr *des, unsigned long arg) ++{ ++ int result; ++ int ks; ++ int rs; ++ ++ ks = des->id_key_size; ++ rs = des->id_rec_size; ++ op->iul_key = kmalloc(ks, GFP_KERNEL); ++ op->iul_rec = kmalloc(rs, GFP_KERNEL); ++ if (!copy_from_user(uop, ++ (struct iam_uapi_op __user *)arg, sizeof *uop) && ++ op->iul_key != NULL && op->iul_rec != NULL && ++ !copy_from_user(op->iul_key, (void __user *)uop->iul_key, ks) && ++ !copy_from_user(op->iul_rec, (void __user *)uop->iul_rec, rs)) ++ result = 0; ++ else { ++ result = -EFAULT; ++ putop(op); ++ } ++ return result; ++} ++ ++static int outit(struct iam_uapi_it *it, struct iam_uapi_it *uit, ++ struct iam_descr *des, enum outop_t opt, unsigned long arg) ++{ ++ int result; ++ ++ result = outop(&it->iui_op, &uit->iui_op, des, opt); ++ if (result == 0 && (opt&STATE)) ++ result = put_user(it->iui_state, (int __user *) arg); ++ return result; ++} ++ ++static void putit(struct iam_uapi_it *it) ++{ ++ putop(&it->iui_op); ++} ++ ++static int getit(struct iam_uapi_it *it, struct iam_uapi_it *uit, ++ struct iam_descr *des, unsigned long arg) ++{ ++ return getop(&it->iui_op, &uit->iui_op, des, ++ (unsigned long)&((struct iam_uapi_it *)arg)->iui_op); ++} ++ ++int iam_uapi_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, ++ unsigned long arg) ++{ ++ int result; ++ struct iam_uapi_info ua; ++ struct iam_uapi_op uop; ++ struct iam_uapi_op op; ++ struct iam_uapi_it uit; ++ struct iam_uapi_it it; ++ enum outop_t opt; ++ ++ if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) { ++ result = -EACCES; ++ } else if (cmd == IAM_IOC_POLYMORPH) { ++ /* ++ * If polymorphing into directory, increase hard-link count. 
++ */ ++ if (S_ISDIR((umode_t)arg) && !S_ISDIR(inode->i_mode)) ++ inode->i_nlink++; ++ else if (!S_ISDIR((umode_t)arg) && S_ISDIR(inode->i_mode)) ++ inode->i_nlink--; ++ inode->i_mode = (umode_t)arg; ++ mark_inode_dirty(inode); ++ result = 0; ++ } else if (cmd == IAM_IOC_INIT) { ++ if (filp->private_data == NULL) { ++ result = getua(&ua, arg); ++ if (result == 0) ++ result = iam_uapi_init(inode, filp, &ua); ++ } else ++ result = -EBUSY; ++ } else if (is_dx(inode) && filp->private_data != NULL) { ++ struct iam_descr *des; ++ ++ switch (cmd) { ++ case IAM_IOC_IT_START: ++ case IAM_IOC_IT_NEXT: ++ opt = KEY|REC|STATE; ++ break; ++ case IAM_IOC_LOOKUP: ++ opt = REC; ++ break; ++ default: ++ opt = 0; ++ break; ++ } ++ ++ des = get_ipi(filp)->ipi_bag.ic_descr; ++ if (cmd == IAM_IOC_GETINFO) { ++ ua.iui_keysize = des->id_key_size; ++ ua.iui_recsize = des->id_rec_size; ++ ua.iui_ptrsize = des->id_ptr_size; ++ ua.iui_height = 0; /* not yet */ ++ memcpy(ua.iui_fmt_name, des->id_ops->id_name, ++ ARRAY_SIZE(ua.iui_fmt_name)); ++ result = putua(&ua, arg); ++ } else if (cmd == IAM_IOC_INSERT || cmd == IAM_IOC_LOOKUP || ++ cmd == IAM_IOC_DELETE) { ++ result = getop(&op, &uop, des, arg); ++ if (result == 0) { ++ int res2; ++ result = iam_uapi_op(cmd, inode, filp, &op); ++ ++ res2 = outop(&op, &uop, des, opt); ++ result = result ? : res2; ++ putop(&op); ++ } ++ } else if (cmd == IAM_IOC_IT_START || cmd == IAM_IOC_IT_NEXT || ++ cmd == IAM_IOC_IT_STOP) { ++ result = getit(&it, &uit, des, arg); ++ if (result == 0) { ++ int res2; ++ ++ result = iam_uapi_it(cmd, inode, filp, &it); ++ ++ res2 = outit(&it, &uit, des, opt, arg); ++ result = result ? : res2; ++ putit(&it); ++ } ++ } else ++ result = -EINVAL; ++ } else ++ result = -ENOENT; ++ return result; ++} +Index: linux-stage/include/linux/lustre_iam.h +=================================================================== +--- linux-stage.orig/include/linux/lustre_iam.h 2006-06-16 16:07:58.000000000 +0300 ++++ linux-stage/include/linux/lustre_iam.h 2007-10-21 17:42:58.000000000 +0300 +@@ -0,0 +1,1071 @@ ++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- ++ * vim:expandtab:shiftwidth=8:tabstop=8: ++ * ++ * lustre_iam.c ++ * Top-level entry points into osd module ++ * ++ * Copyright (c) 2006 Cluster File Systems, Inc. ++ * Author: Wang Di ++ * Author: Nikita Danilov ++ * ++ * This file is part of the Lustre file system, http://www.lustre.org ++ * Lustre is a trademark of Cluster File Systems, Inc. ++ * ++ * You may have signed or agreed to another license before downloading ++ * this software. If so, you are bound by the terms and conditions ++ * of that agreement, and the following does not apply to you. See the ++ * LICENSE file included with this distribution for more information. ++ * ++ * If you did not agree to a different license, then this copy of Lustre ++ * is open source software; you can redistribute it and/or modify it ++ * under the terms of version 2 of the GNU General Public License as ++ * published by the Free Software Foundation. ++ * ++ * In either case, Lustre is distributed in the hope that it will be ++ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty ++ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * license text for more details. 
++ */ ++ ++#ifndef __LINUX_LUSTRE_IAM_H__ ++#define __LINUX_LUSTRE_IAM_H__ ++ ++#include ++ ++/* ++ * linux/include/linux/lustre_iam.h ++ */ ++#define CLASSERT(cond) ({ switch(42) { case (cond): case 0: break; } }) ++/* implication */ ++#define ergo(a, b) (!(a) || (b)) ++/* logical equivalence */ ++#define equi(a, b) (!!(a) == !!(b)) ++ ++enum { ++ /* ++ * Maximal number of non-leaf levels in htree. In the stock ext3 this ++ * is 2. ++ */ ++ /* ++ * XXX reduced back to 2 to make per-node locking work. ++ */ ++ DX_MAX_TREE_HEIGHT = 5, ++ /* ++ * Scratch keys used by generic code for temporaries. ++ * ++ * Allocation: ++ * ++ * [0] reserved for assertions and as a staging area for ++ * record keys immediately used for key comparisons. ++ * ++ * [1] reserved for record key, stored during iteration over ++ * node records (see dx_node_check()). ++ * ++ * [2] reserved for leaf node operations. ++ * ++ * [3] reserved for index operations. ++ * ++ * [4] reserved for path->ip_ikey_target ++ * ++ */ ++ DX_SCRATCH_KEYS = 5, ++ /* ++ * Maximal format name length. ++ */ ++ DX_FMT_NAME_LEN = 16, ++}; ++ ++#ifdef __KERNEL__ ++/* handle_t, journal_start(), journal_stop() */ ++#include ++ ++/* ++ * Debugging. ++ * ++ * Various debugging levels. ++ */ ++ ++#if 0 ++/* ++ * Following macros are defined in config.h and are tunable through ++ * appropriate configure switches (indicated below). ++ */ ++ ++/* ++ * Compile basic assertions in. You want this most of the time. ++ * ++ * --{enable,disable}-ldiskfs-assert (on by default). ++ */ ++#define EXT3_ASSERT (1) ++ ++/* ++ * Compile heavier correctness checks in. You want this during development ++ * cycle. ++ * ++ * --{enable,disable}-ldiskfs-correctness (off by default). ++ */ ++#define EXT3_CORRECTNESS (1) ++ ++/* ++ * Compile heavy invariant checking in. You want this early during development ++ * or when chasing a bug. ++ * ++ * --{enable,disable}-ldiskfs-invariant (off by default). ++ */ ++#define EXT3_INVARIANT (1) ++#endif ++ ++#if defined(EXT3_ASSERT) ++#define EXT3_ASSERT_ON (1) ++#else ++#define EXT3_ASSERT_ON (0) ++#endif ++ ++#if defined(EXT3_CORRECTNESS) ++#define EXT3_CORRECTNESS_ON (1) ++#else ++#define EXT3_CORRECTNESS_ON (0) ++#endif ++ ++#if defined(EXT3_INVARIANT) ++#define EXT3_INVARIANT_ON (1) ++#else ++#define EXT3_INVARIANT_ON (0) ++#endif ++ ++#ifndef assert ++#if EXT3_ASSERT_ON ++#define assert(test) J_ASSERT(test) ++#else ++#define assert(test) ((void)(test)) ++#endif ++#endif ++ ++#if EXT3_CORRECTNESS_ON ++#define assert_corr(test) J_ASSERT(test) ++#define do_corr(exp) exp ++#else ++#define assert_corr(test) do {;} while (0) ++#define do_corr(exp) do {;} while (0) ++#endif ++ ++#if EXT3_INVARIANT_ON ++#define assert_inv(test) J_ASSERT(test) ++#else ++#define assert_inv(test) do {;} while (0) ++#endif ++ ++/* ++ * Entry within index tree node. Consists of a key immediately followed ++ * (without padding) by a pointer to the child node. ++ * ++ * Both key and pointer are of variable size, hence incomplete type. ++ */ ++struct iam_entry; ++ ++struct iam_entry_compat { ++ __le32 hash; ++ __le32 block; ++}; ++ ++/* ++ * Incomplete type used to refer to keys in iam container. ++ * ++ * As key size can be different from container to container, iam has to use ++ * incomplete type. Clients cast pointer to iam_key to real key type and back. ++ */ ++struct iam_key; ++ ++/* ++ * Incomplete type use to refer to the records stored in iam containers. ++ */ ++struct iam_rec; ++ ++/* ++ * Key in index node. Possibly compressed. Fixed size. 
++ */ ++struct iam_ikey; ++ ++/* ++ * Scalar type into which certain iam_key's can be uniquely mapped. Used to ++ * support interfaces like readdir(), where iteration over index has to be ++ * re-startable. ++ */ ++typedef __u32 iam_ptr_t; ++ ++/* ++ * Index node traversed during tree lookup. ++ */ ++struct iam_frame { ++ struct buffer_head *bh; /* buffer holding node data */ ++ struct iam_entry *entries; /* array of entries */ ++ struct iam_entry *at; /* target entry, found by binary search */ ++ iam_ptr_t leaf; /* (logical) offset of child node found by ++ * binary search. */ ++ iam_ptr_t curidx; /* (logical) offset of this node. Used to ++ * per-node locking to detect concurrent ++ * splits. */ ++}; ++ ++/* ++ * Opaque entry in the leaf node. ++ */ ++struct iam_lentry; ++ ++struct iam_path; ++struct iam_container; ++ ++ ++/* leaf node reached by tree lookup */ ++struct iam_leaf { ++ struct iam_path *il_path; ++ struct buffer_head *il_bh; ++ struct iam_lentry *il_entries; ++ struct iam_lentry *il_at; ++ /* ++ * Lock on a leaf node. ++ */ ++ struct dynlock_handle *il_lock; ++ iam_ptr_t il_curidx; /* logical offset of leaf node. */ ++ void *il_descr_data; ++}; ++ ++/* ++ * Return values of ->lookup() operation from struct iam_leaf_operations. ++ */ ++enum iam_lookup_t { ++ /* ++ * lookup found a record with the key requested ++ */ ++ IAM_LOOKUP_EXACT = 0, ++ /* ++ * lookup positioned leaf on some record ++ */ ++ IAM_LOOKUP_OK = 1, ++ /* ++ * leaf was empty ++ */ ++ IAM_LOOKUP_EMPTY = 2, ++ /* ++ * lookup positioned leaf before first record ++ */ ++ IAM_LOOKUP_BEFORE = 3, ++ /* ++ * Found hash may have a continuation in the next leaf. ++ */ ++ IAM_LOOKUP_LAST = 0x100 ++}; ++ ++/* ++ * Format-specific container operations. These are called by generic iam code. ++ */ ++struct iam_operations { ++ /* ++ * Returns pointer (in the same sense as pointer in index entry) to ++ * the root node. ++ */ ++ __u32 (*id_root_ptr)(struct iam_container *c); ++ ++ /* ++ * Check validity and consistency of index node. ++ */ ++ int (*id_node_check)(struct iam_path *path, struct iam_frame *frame); ++ /* ++ * Copy some data from node header into frame. This is called when ++ * new node is loaded into frame. ++ */ ++ int (*id_node_load)(struct iam_path *path, struct iam_frame *frame); ++ /* ++ * Initialize new node (stored in @bh) that is going to be added into ++ * tree. ++ */ ++ int (*id_node_init)(struct iam_container *c, ++ struct buffer_head *bh, int root); ++ int (*id_node_read)(struct iam_container *c, iam_ptr_t ptr, ++ handle_t *h, struct buffer_head **bh); ++ /* ++ * Key comparison functions. Returns -1, 0, +1. ++ */ ++ int (*id_ikeycmp)(const struct iam_container *c, ++ const struct iam_ikey *k1, ++ const struct iam_ikey *k2); ++ /* ++ * Modify root node when tree height increases. ++ */ ++ struct iam_entry *(*id_root_inc)(struct iam_container *c, ++ struct iam_path *path, ++ struct iam_frame *frame); ++ ++ struct iam_path_descr *(*id_ipd_alloc)(const struct iam_container *c, ++ void *area); ++ void (*id_ipd_free)(struct iam_path_descr *ipd); ++ /* ++ * Format name. ++ */ ++ char id_name[DX_FMT_NAME_LEN]; ++}; ++ ++/* ++ * Another format-specific operation vector, consisting of methods to access ++ * leaf nodes. This is separated from struct iam_operations, because it is ++ * assumed that there will be many formats with different format of leaf ++ * nodes, yes the same struct iam_operations. ++ */ ++struct iam_leaf_operations { ++ /* ++ * leaf operations. 
++ */ ++ ++ /* ++ * initialize just loaded leaf node. ++ */ ++ int (*init)(struct iam_leaf *p); ++ /* ++ * Format new node. ++ */ ++ void (*init_new)(struct iam_container *c, struct buffer_head *bh); ++ /* ++ * Release resources. ++ */ ++ void (*fini)(struct iam_leaf *l); ++ /* ++ * returns true iff leaf is positioned at the last entry. ++ */ ++ int (*at_end)(const struct iam_leaf *l); ++ /* position leaf at the first entry */ ++ void (*start)(struct iam_leaf *l); ++ /* more leaf to the next entry. */ ++ void (*next)(struct iam_leaf *l); ++ /* ++ * return key of current leaf record. This method may return ++ * either pointer to the key stored in node, or copy key into ++ * @k buffer supplied by caller and return pointer to this ++ * buffer. The latter approach is used when keys in nodes are ++ * not stored in plain form (e.g., htree doesn't store keys at ++ * all). ++ * ++ * Caller should assume that returned pointer is only valid ++ * while leaf node is pinned and locked. ++ */ ++ struct iam_ikey *(*ikey)(const struct iam_leaf *l, struct iam_ikey *k); ++ struct iam_key *(*key)(const struct iam_leaf *l); ++ /* return pointer to entry body. Pointer is valid while ++ corresponding leaf node is locked and pinned. */ ++ struct iam_rec *(*rec)(const struct iam_leaf *l); ++ ++ void (*key_set)(struct iam_leaf *l, const struct iam_key *k); ++ void (*rec_set)(struct iam_leaf *l, const struct iam_rec *r); ++ void (*rec_get)(const struct iam_leaf *l, struct iam_rec *r); ++ ++ int (*key_cmp)(const struct iam_leaf *l, const struct iam_key *k); ++ int (*key_eq)(const struct iam_leaf *l, const struct iam_key *k); ++ ++ int (*key_size)(const struct iam_leaf *l); ++ /* ++ * Search leaf @l for a record with key @k or for a place ++ * where such record is to be inserted. ++ * ++ * Scratch keys from @path can be used. ++ */ ++ int (*lookup)(struct iam_leaf *l, const struct iam_key *k); ++ int (*ilookup)(struct iam_leaf *l, const struct iam_ikey *ik); ++ ++ int (*can_add)(const struct iam_leaf *l, ++ const struct iam_key *k, const struct iam_rec *r); ++ /* ++ * add rec for a leaf ++ */ ++ void (*rec_add)(struct iam_leaf *l, ++ const struct iam_key *k, const struct iam_rec *r); ++ /* ++ * remove rec for a leaf ++ */ ++ void (*rec_del)(struct iam_leaf *l, int shift); ++ /* ++ * split leaf node, moving some entries into @bh (the latter currently ++ * is assumed to be empty). ++ */ ++ void (*split)(struct iam_leaf *l, struct buffer_head **bh, ++ iam_ptr_t newblknr); ++}; ++ ++/* ++ * Parameters, describing a flavor of iam container. ++ */ ++struct iam_descr { ++ /* ++ * Size of a key in this container, in bytes. ++ */ ++ size_t id_key_size; ++ /* ++ * Size of a key in index nodes, in bytes. ++ */ ++ size_t id_ikey_size; ++ /* ++ * Size of a pointer to the next level (stored in index nodes), in ++ * bytes. ++ */ ++ size_t id_ptr_size; ++ /* ++ * Size of a record (stored in leaf nodes), in bytes. ++ */ ++ size_t id_rec_size; ++ /* ++ * Size of unused (by iam) space at the beginning of every non-root ++ * node, in bytes. Used for compatibility with ext3. ++ */ ++ size_t id_node_gap; ++ /* ++ * Size of unused (by iam) space at the beginning of root node, in ++ * bytes. Used for compatibility with ext3. ++ */ ++ size_t id_root_gap; ++ ++ struct iam_operations *id_ops; ++ struct iam_leaf_operations *id_leaf_ops; ++}; ++ ++/* ++ * An instance of iam container. ++ */ ++struct iam_container { ++ /* ++ * Underlying flat file. IO against this object is issued to ++ * read/write nodes. 
++ */ ++ struct inode *ic_object; ++ /* ++ * container flavor. ++ */ ++ struct iam_descr *ic_descr; ++ /* ++ * read-write lock protecting index consistency. ++ */ ++ struct rw_semaphore ic_sem; ++}; ++ ++/* ++ * description-specific part of iam_path. This is usually embedded into larger ++ * structure. ++ */ ++struct iam_path_descr { ++ /* ++ * Scratch-pad area for temporary keys. ++ */ ++ struct iam_ikey *ipd_key_scratch[DX_SCRATCH_KEYS]; ++}; ++ ++/* ++ * Structure to keep track of a path drilled through htree. ++ */ ++struct iam_path { ++ /* ++ * Parent container. ++ */ ++ struct iam_container *ip_container; ++ /* ++ * Number of index levels minus one. ++ */ ++ int ip_indirect; ++ /* ++ * Nodes that top-to-bottom traversal passed through. ++ */ ++ struct iam_frame ip_frames[DX_MAX_TREE_HEIGHT]; ++ /* ++ * Last filled frame in ->ip_frames. Refers to the 'twig' node (one ++ * immediately above leaf). ++ */ ++ struct iam_frame *ip_frame; ++ /* ++ * Leaf node: a child of ->ip_frame. ++ */ ++ struct iam_leaf ip_leaf; ++ /* ++ * Key searched for. ++ */ ++ const struct iam_key *ip_key_target; ++ const struct iam_ikey *ip_ikey_target; ++ /* ++ * Description-specific data. ++ */ ++ struct iam_path_descr *ip_data; ++}; ++ ++struct dx_hash_info; ++ ++/* ++ * Helper structure for legacy htrees. ++ */ ++struct iam_path_compat { ++ struct iam_path ipc_path; ++ struct iam_container ipc_container; ++ __u32 ipc_scratch[DX_SCRATCH_KEYS]; ++ struct dx_hash_info *ipc_hinfo; ++ struct qstr *ipc_qstr; ++ struct iam_path_descr ipc_descr; ++ struct dx_hash_info ipc_hinfo_area; ++}; ++ ++#define const_max(p, q) ((p > q) ? p : q) ++ ++enum { ++ DX_MAX_IKEY_SIZE = 32, /* be generous */ ++ /* ++ * Hack to avoid dynamic allocation and freeing of ipd. ++ */ ++ DX_IPD_MAX_SIZE = const_max(sizeof(struct iam_path_compat), ++ DX_MAX_IKEY_SIZE * DX_SCRATCH_KEYS + ++ sizeof(struct iam_path_descr)) ++}; ++ ++/* ++ * iam cursor (iterator) api. ++ */ ++ ++/* ++ * States of iterator state machine. ++ */ ++enum iam_it_state { ++ /* initial state */ ++ IAM_IT_DETACHED, ++ /* iterator is above particular record in the container */ ++ IAM_IT_ATTACHED, ++ /* iterator is positioned before record */ ++ IAM_IT_SKEWED ++}; ++ ++/* ++ * Flags controlling iterator functionality. ++ */ ++enum iam_it_flags { ++ /* ++ * this iterator will move (iam_it_next() will be called on it) ++ */ ++ IAM_IT_MOVE = (1 << 0), ++ /* ++ * tree can be updated through this iterator. ++ */ ++ IAM_IT_WRITE = (1 << 1) ++}; ++ ++/* ++ * Iterator. ++ * ++ * Immediately after call to iam_it_init() iterator is in "detached" ++ * (IAM_IT_DETACHED) state: it is associated with given parent container, but ++ * doesn't point to any particular record in this container. ++ * ++ * After successful call to iam_it_get() and until corresponding call to ++ * iam_it_put() iterator is in one of "active" states: IAM_IT_ATTACHED or ++ * IAM_IT_SKEWED. ++ * ++ * Active iterator can move through records in a container (provided ++ * IAM_IT_MOVE permission) in a key order, can get record and key values as it ++ * passes over them, and can modify container (provided IAM_IT_WRITE ++ * permission). ++ * ++ * Iteration may reach the end of container, at which point iterator switches ++ * into IAM_IT_DETACHED state. ++ * ++ * Concurrency: iterators are supposed to be local to thread. Interfaces below ++ * do no internal serialization of access to the iterator fields. 
++ * ++ * When in non-detached state, iterator keeps some container nodes pinned in ++ * memory and locked (that locking may be implemented at the container ++ * granularity though). In particular, clients may assume that pointers to ++ * records and keys obtained through iterator interface as valid until ++ * iterator is detached (except that they may be invalidated by sub-sequent ++ * operations done through the same iterator). ++ * ++ */ ++struct iam_iterator { ++ /* ++ * iterator flags, taken from enum iam_it_flags. ++ */ ++ __u32 ii_flags; ++ enum iam_it_state ii_state; ++ /* ++ * path to the record. Valid in IAM_IT_ATTACHED, and IAM_IT_SKEWED ++ * states. ++ */ ++ struct iam_path ii_path; ++}; ++ ++void iam_path_init(struct iam_path *path, struct iam_container *c, ++ struct iam_path_descr *pd); ++void iam_path_fini(struct iam_path *path); ++void iam_path_release(struct iam_path *path); ++ ++void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode); ++void iam_path_compat_fini(struct iam_path_compat *path); ++ ++struct iam_path_descr *iam_ipd_alloc(void *area, int keysize); ++void iam_ipd_free(struct iam_path_descr *ipd); ++ ++int iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags, ++ struct iam_path_descr *pd); ++void iam_it_fini(struct iam_iterator *it); ++int iam_it_get(struct iam_iterator *it, const struct iam_key *k); ++int iam_it_get_at(struct iam_iterator *it, const struct iam_key *k); ++void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src); ++void iam_it_put(struct iam_iterator *it); ++int iam_it_next(struct iam_iterator *it); ++struct iam_rec *iam_it_rec_get(const struct iam_iterator *it); ++int iam_it_rec_set(handle_t *h, ++ struct iam_iterator *it, const struct iam_rec *r); ++struct iam_key *iam_it_key_get(const struct iam_iterator *it); ++int iam_it_key_size(const struct iam_iterator *it); ++int iam_it_rec_insert(handle_t *h, struct iam_iterator *it, ++ const struct iam_key *k, const struct iam_rec *r); ++int iam_it_rec_delete(handle_t *h, struct iam_iterator *it); ++ ++typedef __u64 iam_pos_t; ++ ++iam_pos_t iam_it_store(const struct iam_iterator *it); ++int iam_it_load(struct iam_iterator *it, iam_pos_t pos); ++ ++int iam_lookup(struct iam_container *c, const struct iam_key *k, ++ struct iam_rec *r, struct iam_path_descr *pd); ++int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k, ++ struct iam_path_descr *pd); ++int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k, ++ const struct iam_rec *r, struct iam_path_descr *pd); ++int iam_insert(handle_t *handle, struct iam_container *c, ++ const struct iam_key *k, ++ const struct iam_rec *r, struct iam_path_descr *pd); ++/* ++ * Initialize container @c. ++ */ ++int iam_container_init(struct iam_container *c, ++ struct iam_descr *descr, struct inode *inode); ++/* ++ * Finalize container @c, release all resources. ++ */ ++void iam_container_fini(struct iam_container *c); ++ ++/* ++ * Determine container format. 
++ */ ++int iam_container_setup(struct iam_container *c); ++ ++static inline struct iam_descr *iam_container_descr(struct iam_container *c) ++{ ++ return c->ic_descr; ++} ++ ++static inline struct iam_descr *iam_path_descr(const struct iam_path *p) ++{ ++ return p->ip_container->ic_descr; ++} ++ ++static inline struct inode *iam_path_obj(struct iam_path *p) ++{ ++ return p->ip_container->ic_object; ++} ++ ++static inline void iam_ikeycpy(const struct iam_container *c, ++ struct iam_ikey *k1, const struct iam_ikey *k2) ++{ ++ memcpy(k1, k2, c->ic_descr->id_ikey_size); ++} ++ ++static inline size_t iam_entry_size(struct iam_path *p) ++{ ++ return iam_path_descr(p)->id_ikey_size + iam_path_descr(p)->id_ptr_size; ++} ++ ++static inline struct iam_entry *iam_entry_shift(struct iam_path *p, ++ struct iam_entry *entry, ++ int shift) ++{ ++ void *e = entry; ++ return e + shift * iam_entry_size(p); ++} ++ ++static inline struct iam_ikey *iam_get_ikey(struct iam_path *p, ++ struct iam_entry *entry, ++ struct iam_ikey *key) ++{ ++ return memcpy(key, entry, iam_path_descr(p)->id_ikey_size); ++} ++ ++static inline struct iam_ikey *iam_ikey_at(struct iam_path *p, ++ struct iam_entry *entry) ++{ ++ return (struct iam_ikey *)entry; ++} ++ ++static inline ptrdiff_t iam_entry_diff(struct iam_path *p, ++ struct iam_entry *e1, ++ struct iam_entry *e2) ++{ ++ ptrdiff_t diff; ++ ++ diff = (void *)e1 - (void *)e2; ++ assert_corr(diff / iam_entry_size(p) * iam_entry_size(p) == diff); ++ return diff / iam_entry_size(p); ++} ++ ++/* ++ * Helper for the frequent case, where key was already placed into @k1 by ++ * callback. ++ */ ++static inline void iam_ikeycpy0(const struct iam_container *c, ++ struct iam_ikey *k1, const struct iam_ikey *k2) ++{ ++ if (k1 != k2) ++ iam_ikeycpy(c, k1, k2); ++} ++ ++static inline int iam_ikeycmp(const struct iam_container *c, ++ const struct iam_ikey *k1, ++ const struct iam_ikey *k2) ++{ ++ return c->ic_descr->id_ops->id_ikeycmp(c, k1, k2); ++} ++ ++static inline void *iam_entry_off(struct iam_entry *entry, size_t off) ++{ ++ return (void *)((char *)entry + off); ++} ++ ++/* ++ * Leaf helpers. 
++ */ ++ ++static inline struct iam_path *iam_leaf_path(const struct iam_leaf *leaf) ++{ ++ return leaf->il_path; ++} ++ ++static inline struct iam_container * ++iam_leaf_container(const struct iam_leaf *leaf) ++{ ++ return iam_leaf_path(leaf)->ip_container; ++} ++ ++static inline struct iam_descr *iam_leaf_descr(const struct iam_leaf *leaf) ++{ ++ return iam_leaf_container(leaf)->ic_descr; ++} ++ ++static inline struct iam_leaf_operations * ++iam_leaf_ops(const struct iam_leaf *leaf) ++{ ++ return iam_leaf_descr(leaf)->id_leaf_ops; ++} ++ ++static inline void iam_reccpy(const struct iam_leaf *leaf, ++ struct iam_rec *rec_dst) ++{ ++ iam_leaf_ops(leaf)->rec_get(leaf, rec_dst); ++} ++ ++/*XXX These stuff put here, just because they are used by iam.c and namei.c*/ ++static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry) ++{ ++ return le32_to_cpu(*(u32*)iam_entry_off(entry, ++ iam_path_descr(p)->id_ikey_size)) ++ & 0x00ffffff; ++} ++ ++static inline void dx_set_block(struct iam_path *p, ++ struct iam_entry *entry, unsigned value) ++{ ++ *(u32*)iam_entry_off(entry, ++ iam_path_descr(p)->id_ikey_size) = ++ cpu_to_le32(value); ++} ++ ++static inline void dx_set_ikey(struct iam_path *p, struct iam_entry *entry, ++ const struct iam_ikey *key) ++{ ++ iam_ikeycpy(p->ip_container, iam_entry_off(entry, 0), key); ++} ++ ++struct dx_map_entry ++{ ++ u32 hash; ++ u32 offs; ++}; ++ ++struct fake_dirent { ++ __le32 inode; ++ __le16 rec_len; ++ u8 name_len; ++ u8 file_type; ++}; ++ ++struct dx_countlimit { ++ __le16 limit; ++ __le16 count; ++}; ++ ++/* ++ * dx_root_info is laid out so that if it should somehow get overlaid by a ++ * dirent the two low bits of the hash version will be zero. Therefore, the ++ * hash version mod 4 should never be 0. Sincerely, the paranoia department. ++ */ ++ ++struct dx_root { ++ struct fake_dirent dot; ++ char dot_name[4]; ++ struct fake_dirent dotdot; ++ char dotdot_name[4]; ++ struct dx_root_info ++ { ++ __le32 reserved_zero; ++ u8 hash_version; ++ u8 info_length; /* 8 */ ++ u8 indirect_levels; ++ u8 unused_flags; ++ } ++ info; ++ struct {} entries[0]; ++}; ++ ++struct dx_node ++{ ++ struct fake_dirent fake; ++ struct {} entries[0]; ++}; ++ ++ ++static inline unsigned dx_get_count(struct iam_entry *entries) ++{ ++ return le16_to_cpu(((struct dx_countlimit *) entries)->count); ++} ++ ++static inline unsigned dx_get_limit(struct iam_entry *entries) ++{ ++ return le16_to_cpu(((struct dx_countlimit *) entries)->limit); ++} ++ ++static inline void dx_set_count(struct iam_entry *entries, unsigned value) ++{ ++ ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); ++} ++ ++static inline unsigned dx_node_limit(struct iam_path *p) ++{ ++ struct iam_descr *param = iam_path_descr(p); ++ unsigned entry_space = iam_path_obj(p)->i_sb->s_blocksize - ++ param->id_node_gap; ++ return entry_space / (param->id_ikey_size + param->id_ptr_size); ++} ++ ++static inline unsigned dx_root_limit(struct iam_path *p) ++{ ++ struct iam_descr *param = iam_path_descr(p); ++ unsigned limit = iam_path_obj(p)->i_sb->s_blocksize - ++ param->id_root_gap; ++ limit /= (param->id_ikey_size + param->id_ptr_size); ++ if (limit == dx_node_limit(p)) ++ limit--; ++ return limit; ++} ++ ++ ++static inline struct iam_entry *dx_get_entries(struct iam_path *path, ++ void *data, int root) ++{ ++ struct iam_descr *param = iam_path_descr(path); ++ return data + (root ? 
param->id_root_gap : param->id_node_gap); ++} ++ ++ ++static inline struct iam_entry *dx_node_get_entries(struct iam_path *path, ++ struct iam_frame *frame) ++{ ++ return dx_get_entries(path, ++ frame->bh->b_data, frame == path->ip_frames); ++} ++ ++static inline struct iam_ikey *iam_path_ikey(const struct iam_path *path, ++ int nr) ++{ ++ assert(0 <= nr && nr < ARRAY_SIZE(path->ip_data->ipd_key_scratch)); ++ return path->ip_data->ipd_key_scratch[nr]; ++} ++ ++static inline struct dynlock *path_dynlock(struct iam_path *path) ++{ ++ return &EXT3_I(iam_path_obj(path))->i_htree_lock; ++} ++ ++static inline int iam_leaf_is_locked(const struct iam_leaf *leaf) ++{ ++ int result; ++ ++ result = dynlock_is_locked(path_dynlock(leaf->il_path), ++ leaf->il_curidx); ++ if (!result) ++ dump_stack(); ++ return result; ++} ++ ++static inline int iam_frame_is_locked(struct iam_path *path, ++ const struct iam_frame *frame) ++{ ++ int result; ++ ++ result = dynlock_is_locked(path_dynlock(path), frame->curidx); ++ if (!result) ++ dump_stack(); ++ return result; ++} ++ ++int dx_lookup_lock(struct iam_path *path, ++ struct dynlock_handle **dl, enum dynlock_type lt); ++ ++void dx_insert_block(struct iam_path *path, struct iam_frame *frame, ++ u32 hash, u32 block); ++int dx_index_is_compat(struct iam_path *path); ++ ++int ext3_htree_next_block(struct inode *dir, __u32 hash, ++ struct iam_path *path, __u32 *start_hash); ++ ++struct buffer_head *ext3_append(handle_t *handle, struct inode *inode, ++ u32 *block, int *err); ++int split_index_node(handle_t *handle, struct iam_path *path, ++ struct dynlock_handle **lh); ++struct ext3_dir_entry_2 *split_entry(struct inode *dir, ++ struct ext3_dir_entry_2 *de, ++ unsigned long ino, mode_t mode, ++ const char *name, int namelen); ++struct ext3_dir_entry_2 *find_insertion_point(struct inode *dir, ++ struct buffer_head *bh, ++ const char *name, int namelen); ++struct ext3_dir_entry_2 *move_entries(struct inode *dir, ++ struct dx_hash_info *hinfo, ++ struct buffer_head **bh1, ++ struct buffer_head **bh2, ++ __u32 *delim_hash); ++ ++extern struct iam_descr iam_htree_compat_param; ++ ++struct dynlock_handle *dx_lock_htree(struct inode *dir, unsigned long value, ++ enum dynlock_type lt); ++void dx_unlock_htree(struct inode *dir, struct dynlock_handle *lh); ++ ++/* ++ * external ++ */ ++void iam_container_write_lock(struct iam_container *c); ++void iam_container_write_unlock(struct iam_container *c); ++ ++void iam_container_read_lock(struct iam_container *c); ++void iam_container_read_unlock(struct iam_container *c); ++ ++int iam_index_next(struct iam_container *c, struct iam_path *p); ++int iam_read_leaf(struct iam_path *p); ++ ++int iam_node_read(struct iam_container *c, iam_ptr_t ptr, ++ handle_t *handle, struct buffer_head **bh); ++ ++void iam_insert_key_lock(struct iam_path *path, struct iam_frame *frame, ++ const struct iam_ikey *key, iam_ptr_t ptr); ++ ++int iam_leaf_at_end(const struct iam_leaf *l); ++void iam_leaf_next(struct iam_leaf *folio); ++int iam_leaf_can_add(const struct iam_leaf *l, ++ const struct iam_key *k, const struct iam_rec *r); ++ ++struct iam_path *iam_leaf_path(const struct iam_leaf *leaf); ++struct iam_container *iam_leaf_container(const struct iam_leaf *leaf); ++struct iam_descr *iam_leaf_descr(const struct iam_leaf *leaf); ++struct iam_leaf_operations *iam_leaf_ops(const struct iam_leaf *leaf); ++ ++ ++int iam_node_read(struct iam_container *c, iam_ptr_t ptr, ++ handle_t *h, struct buffer_head **bh); ++ ++/* ++ * Container format. 
++ */ ++struct iam_format { ++ /* ++ * Method called to recognize container format. Should return true iff ++ * container @c conforms to this format. This method may do IO to read ++ * container pages. ++ * ++ * If container is recognized, this method sets operation vectors ++ * ->id_ops and ->id_leaf_ops in container description (c->ic_descr), ++ * and fills other description fields. ++ */ ++ int (*if_guess)(struct iam_container *c); ++ /* ++ * Linkage into global list of container formats. ++ */ ++ struct list_head if_linkage; ++}; ++ ++void iam_format_register(struct iam_format *fmt); ++ ++void iam_lfix_format_init(void); ++void iam_lvar_format_init(void); ++void iam_htree_format_init(void); ++ ++struct iam_private_info; ++ ++void ext3_iam_release(struct file *filp, struct inode *inode); ++ ++int iam_uapi_ioctl(struct inode * inode, struct file * filp, unsigned int cmd, ++ unsigned long arg); ++ ++/* dir.c */ ++#if EXT3_INVARIANT_ON ++extern int ext3_check_dir_entry(const char *, struct inode *, ++ struct ext3_dir_entry_2 *, ++ struct buffer_head *, unsigned long); ++#else ++static inline int ext3_check_dir_entry(const char * function, ++ struct inode * dir, ++ struct ext3_dir_entry_2 * de, ++ struct buffer_head * bh, ++ unsigned long offset) ++{ ++ return 1; ++} ++#endif ++ ++/* __KERNEL__ */ ++#endif ++ ++/* ++ * User level API. Copy exists in lustre/lustre/tests/iam_ut.c ++ */ ++ ++struct iam_uapi_info { ++ __u16 iui_keysize; ++ __u16 iui_recsize; ++ __u16 iui_ptrsize; ++ __u16 iui_height; ++ char iui_fmt_name[DX_FMT_NAME_LEN]; ++}; ++ ++struct iam_uapi_op { ++ void *iul_key; ++ void *iul_rec; ++}; ++ ++struct iam_uapi_it { ++ struct iam_uapi_op iui_op; ++ __u16 iui_state; ++}; ++ ++enum iam_ioctl_cmd { ++ IAM_IOC_INIT = _IOW('i', 1, struct iam_uapi_info), ++ IAM_IOC_GETINFO = _IOR('i', 2, struct iam_uapi_info), ++ IAM_IOC_INSERT = _IOR('i', 3, struct iam_uapi_op), ++ IAM_IOC_LOOKUP = _IOWR('i', 4, struct iam_uapi_op), ++ IAM_IOC_DELETE = _IOR('i', 5, struct iam_uapi_op), ++ IAM_IOC_IT_START = _IOR('i', 6, struct iam_uapi_it), ++ IAM_IOC_IT_NEXT = _IOW('i', 7, struct iam_uapi_it), ++ IAM_IOC_IT_STOP = _IOR('i', 8, struct iam_uapi_it), ++ ++ IAM_IOC_POLYMORPH = _IOR('i', 9, unsigned long) ++}; ++ ++/* __LINUX_LUSTRE_IAM_H__ */ ++#endif diff --git a/ldiskfs/kernel_patches/patches/ext3-iam-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-iam-rhel4.patch new file mode 100644 index 0000000..b19ec707 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-iam-rhel4.patch @@ -0,0 +1,2664 @@ +Index: linux-stage/fs/ext3/namei.c +=================================================================== +--- linux-stage.orig/fs/ext3/namei.c 2007-10-20 17:14:38.000000000 +0300 ++++ linux-stage/fs/ext3/namei.c 2007-10-20 17:48:29.000000000 +0300 +@@ -24,78 +24,7 @@ + * Theodore Ts'o, 2002 + */ + +-/* +- * iam: big theory statement. +- * +- * iam (Index Access Module) is a module providing abstraction of persistent +- * transactional container on top of generalized ext3 htree. +- * +- * iam supports: +- * +- * - key, pointer, and record size specifiable per container. +- * +- * - trees taller than 2 index levels. +- * +- * - read/write to existing ext3 htree directories as iam containers. +- * +- * iam container is a tree, consisting of leaf nodes containing keys and +- * records stored in this container, and index nodes, containing keys and +- * pointers to leaf or index nodes. +- * +- * iam does not work with keys directly, instead it calls user-supplied key +- * comparison function (->dpo_keycmp()). 
+- * +- * Pointers are (currently) interpreted as logical offsets (measured in +- * blocksful) within underlying flat file on top of which iam tree lives. +- * +- * On-disk format: +- * +- * iam mostly tries to reuse existing htree formats. +- * +- * Format of index node: +- * +- * +-----+-------+-------+-------+------+-------+------------+ +- * | | count | | | | | | +- * | gap | / | entry | entry | .... | entry | free space | +- * | | limit | | | | | | +- * +-----+-------+-------+-------+------+-------+------------+ +- * +- * gap this part of node is never accessed by iam code. It +- * exists for binary compatibility with ext3 htree (that, +- * in turn, stores fake struct ext2_dirent for ext2 +- * compatibility), and to keep some unspecified per-node +- * data. Gap can be different for root and non-root index +- * nodes. Gap size can be specified for each container +- * (gap of 0 is allowed). +- * +- * count/limit current number of entries in this node, and the maximal +- * number of entries that can fit into node. count/limit +- * has the same size as entry, and is itself counted in +- * count. +- * +- * entry index entry: consists of a key immediately followed by +- * a pointer to a child node. Size of a key and size of a +- * pointer depends on container. Entry has neither +- * alignment nor padding. +- * +- * free space portion of node new entries are added to +- * +- * Entries in index node are sorted by their key value. +- * +- * +- * +- * +- * +- * +- * +- * +- * +- * +- * +- * +- * +- */ +- ++#include + #include + #include + #include +@@ -108,10 +37,10 @@ + #include + #include + #include ++#include + #include "xattr.h" + #include "iopen.h" + #include "acl.h" +- + /* + * define how far ahead to read directories while searching them. + */ +@@ -120,33 +49,29 @@ + #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + +-/* +- * Maximal number of non-leaf levels in htree. In the stock ext3 this is 2. +- */ +-enum { +- DX_MAX_TREE_HEIGHT = 5, +- DX_SCRATCH_KEYS = 2 +-}; + +-static struct buffer_head *ext3_append(handle_t *handle, ++struct buffer_head *ext3_append(handle_t *handle, + struct inode *inode, + u32 *block, int *err) + { + struct buffer_head *bh; ++ struct ext3_inode_info *ei = EXT3_I(inode); + ++ /* with parallel dir operations all appends ++ * have to be serialized -bzzz */ ++ down(&ei->i_append_sem); + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + +- if ((bh = ext3_bread(handle, inode, *block, 1, err))) { ++ bh = ext3_bread(handle, inode, *block, 1, err); ++ if (bh != NULL) { + inode->i_size += inode->i_sb->s_blocksize; +- EXT3_I(inode)->i_disksize = inode->i_size; +- ext3_journal_get_write_access(handle,bh); ++ ei->i_disksize = inode->i_size; + } ++ up(&ei->i_append_sem); ++ + return bh; + } + +-#ifndef assert +-#define assert(test) J_ASSERT(test) +-#endif + + #ifndef swap + #define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) +@@ -158,533 +83,16 @@ + #define dxtrace(command) + #endif + +-struct fake_dirent { +- __le32 inode; +- __le16 rec_len; +- u8 name_len; +- u8 file_type; +-}; +- +-struct dx_countlimit { +- __le16 limit; +- __le16 count; +-}; +- +-/* +- * dx_root_info is laid out so that if it should somehow get overlaid by a +- * dirent the two low bits of the hash version will be zero. Therefore, the +- * hash version mod 4 should never be 0. Sincerely, the paranoia department. 
+- */ +- +-struct dx_root { +- struct fake_dirent dot; +- char dot_name[4]; +- struct fake_dirent dotdot; +- char dotdot_name[4]; +- struct dx_root_info +- { +- __le32 reserved_zero; +- u8 hash_version; +- u8 info_length; /* 8 */ +- u8 indirect_levels; +- u8 unused_flags; +- } +- info; +- struct {} entries[0]; +-}; +- +-struct dx_node +-{ +- struct fake_dirent fake; +- struct {} entries[0]; +-}; +- +-struct dx_map_entry +-{ +- u32 hash; +- u32 offs; +-}; +- +-/* +- * Entry within index tree node. Consists of a key immediately followed +- * (without padding) by a pointer to the child node. +- * +- * Both key and pointer are of variable size, hence incomplete type. +- */ +-struct iam_entry; +- +-struct iam_entry_compat { +- __le32 hash; +- __le32 block; +-}; +- +-/* +- * Incomplete type used to refer to keys in iam container. +- * +- * As key size can be different from container to container, iam has to use +- * incomplete type. Clients cast pointer to iam_key to real key type and back. +- */ +-struct iam_key; +- +-/* Incomplete type use to refer to the records stored in iam containers. */ +-struct iam_rec; +- +-typedef __u64 iam_ptr_t; +- +-/* +- * Index node traversed during tree lookup. +- */ +-struct iam_frame { +- struct buffer_head *bh; /* buffer holding node data */ +- struct iam_entry *entries; /* array of entries */ +- struct iam_entry *at; /* target entry, found by binary search */ +-}; +- +-/* leaf node reached by tree lookup */ +-struct iam_leaf { +- struct buffer_head *bh; +- struct iam_leaf_entry *entries; +- struct iam_leaf_entry *at; +-}; +- +-struct iam_path; +-struct iam_container; +- +-/* +- * Parameters, describing a flavor of iam container. +- */ +-struct iam_descr { +- /* +- * Size of a key in this container, in bytes. +- */ +- size_t id_key_size; +- /* +- * Size of a pointer to the next level (stored in index nodes), in +- * bytes. +- */ +- size_t id_ptr_size; +- /* +- * Size of a record (stored in leaf nodes), in bytes. +- */ +- size_t id_rec_size; +- /* +- * Size of unused (by iam) space at the beginning of every non-root +- * node, in bytes. Used for compatibility with ext3. +- */ +- size_t id_node_gap; +- /* +- * Size of unused (by iam) space at the beginning of root node, in +- * bytes. Used for compatibility with ext3. +- */ +- size_t id_root_gap; +- +- /* +- * Returns pointer (in the same sense as pointer in index entry) to +- * the root node. +- */ +- __u32 (*id_root_ptr)(struct iam_container *c); +- +- /* +- * Check validity and consistency of index node. This is called when +- * iam just loaded new node into frame. +- */ +- int (*id_node_check)(struct iam_path *path, struct iam_frame *frame); +- /* +- * Initialize new node (stored in @bh) that is going to be added into +- * tree. +- */ +- int (*id_node_init)(struct iam_container *c, +- struct buffer_head *bh, int root); +- int (*id_node_read)(struct iam_container *c, iam_ptr_t ptr, +- handle_t *h, struct buffer_head **bh); +- /* +- * Key comparison function. Returns -1, 0, +1. +- */ +- int (*id_keycmp)(struct iam_container *c, +- struct iam_key *k1, struct iam_key *k2); +- /* +- * Create new container. +- * +- * Newly created container has a root node and a single leaf. Leaf +- * contains single record with the smallest possible key. +- */ +- int (*id_create)(struct iam_container *c); +- struct { +- /* +- * leaf operations. +- */ +- /* +- * returns true iff leaf is positioned at the last entry. 
+- */ +- int (*at_end)(struct iam_container *c, struct iam_leaf *l); +- /* position leaf at the first entry */ +- void (*start)(struct iam_container *c, struct iam_leaf *l); +- /* more leaf to the next entry. */ +- void (*next)(struct iam_container *c, struct iam_leaf *l); +- /* return key of current leaf record in @k */ +- void (*key)(struct iam_container *c, struct iam_leaf *l, +- struct iam_key *k); +- /* return pointer to entry body */ +- struct iam_rec *(*rec)(struct iam_container *c, +- struct iam_leaf *l); +- } id_leaf; +-}; +- +-struct iam_container { +- /* +- * Underlying flat file. IO against this object is issued to +- * read/write nodes. +- */ +- struct inode *ic_object; +- /* +- * container flavor. +- */ +- struct iam_descr *ic_descr; +- /* +- * pointer to flavor-specific per-container data. +- */ +- void *ic_descr_data; +-}; +- +-/* +- * Structure to keep track of a path drilled through htree. +- */ +-struct iam_path { +- /* +- * Parent container. +- */ +- struct iam_container *ip_container; +- /* +- * Number of index levels minus one. +- */ +- int ip_indirect; +- /* +- * Nodes that top-to-bottom traversal passed through. +- */ +- struct iam_frame ip_frames[DX_MAX_TREE_HEIGHT]; +- /* +- * Last filled frame in ->ip_frames. Refers to the 'twig' node (one +- * immediately above leaf). +- */ +- struct iam_frame *ip_frame; +- /* +- * Leaf node: a child of ->ip_frame. +- */ +- struct iam_leaf *ip_leaf; +- /* +- * Key searched for. +- */ +- struct iam_key *ip_key_target; +- /* +- * Scratch-pad area for temporary keys. +- */ +- struct iam_key *ip_key_scratch[DX_SCRATCH_KEYS]; +- /* +- * pointer to flavor-specific per-container data. +- */ +- void *ip_descr_data; +-}; +- +-/* +- * Helper structure for legacy htrees. +- */ +-struct iam_path_compat { +- struct iam_path ipc_path; +- struct iam_container ipc_container; +- __u32 ipc_scrach[DX_SCRATCH_KEYS]; +-}; +- +-static u32 htree_root_ptr(struct iam_container *c); +-static int htree_node_check(struct iam_path *path, struct iam_frame *frame); +-static int htree_node_init(struct iam_container *c, +- struct buffer_head *bh, int root); +-static int htree_keycmp(struct iam_container *c, +- struct iam_key *k1, struct iam_key *k2); +-static int htree_node_read(struct iam_container *c, iam_ptr_t ptr, +- handle_t *h, struct buffer_head **bh); +- +-/* +- * Parameters describing iam compatibility mode in which existing ext3 htrees +- * can be manipulated. +- */ +-static struct iam_descr htree_compat_param = { +- .id_key_size = sizeof ((struct dx_map_entry *)NULL)->hash, +- .id_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs, +- .id_node_gap = offsetof(struct dx_node, entries), +- .id_root_gap = offsetof(struct dx_root, entries), +- +- .id_root_ptr = htree_root_ptr, +- .id_node_check = htree_node_check, +- .id_node_init = htree_node_init, +- .id_node_read = htree_node_read, +- .id_keycmp = htree_keycmp +-}; +- +- +-struct iam_key; +-struct iam_rec; +-struct iam_descr; +-struct iam_container; +-struct iam_path; +- +-/* +- * Initialize container @c, acquires additional reference on @inode. +- */ +-int iam_container_init(struct iam_container *c, +- struct iam_descr *descr, struct inode *inode); +-/* +- * Finalize container @c, release all resources. +- */ +-void iam_container_fini(struct iam_container *c); +- +-/* +- * Search container @c for record with key @k. If record is found, its data +- * are moved into @r. 
+- * +- * +- * +- * Return values: +ve: found, 0: not-found, -ve: error +- */ +-int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r); +-/* +- * Insert new record @r with key @k into container @c (within context of +- * transaction @h. +- * +- * Return values: 0: success, -ve: error, including -EEXIST when record with +- * given key is already present. +- * +- * postcondition: ergo(result == 0 || result == -EEXIST, +- * iam_lookup(c, k, r2) > 0 && +- * !memcmp(r, r2, c->ic_descr->id_rec_size)); +- */ +-int iam_insert(handle_t *h, struct iam_container *c, +- struct iam_key *k, struct iam_rec *r); +-/* +- * Replace existing record with key @k, or insert new one. New record data are +- * in @r. +- * +- * Return values: 0: success, -ve: error. +- * +- * postcondition: ergo(result == 0, iam_lookup(c, k, r2) > 0 && +- * !memcmp(r, r2, c->ic_descr->id_rec_size)); +- */ +-int iam_update(handle_t *h, struct iam_container *c, +- struct iam_key *k, struct iam_rec *r); +-/* +- * Delete existing record with key @k. +- * +- * Return values: 0: success, -ENOENT: not-found, -ve: other error. +- * +- * postcondition: ergo(result == 0 || result == -ENOENT, +- * !iam_lookup(c, k, *)); +- */ +-int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k); +- +-/* +- * iam cursor (iterator) api. +- */ +- +-/* +- * Flags controlling iterator functionality. +- */ +-enum iam_it_flags { +- /* +- * this iterator will move (iam_it_{prev,next}() will be called on it) +- */ +- IAM_IT_MOVE = (1 << 0), +- /* +- * tree can be updated through this iterator. +- */ +- IAM_IT_WRITE = (1 << 1) +-}; +- +-/* +- * States of iterator state machine. +- */ +-enum iam_it_state { +- /* initial state */ +- IAM_IT_DETACHED, +- /* iterator is above particular record in the container */ +- IAM_IT_ATTACHED +-}; +- +-/* +- * Iterator. +- * +- * Immediately after call to iam_it_init() iterator is in "detached" +- * (IAM_IT_DETACHED) state: it is associated with given parent container, but +- * doesn't point to any particular record in this container. +- * +- * After successful call to iam_it_get() and until corresponding call to +- * iam_it_put() iterator is in "attached" state (IAM_IT_ATTACHED). +- * +- * Attached iterator can move through records in a container (provided +- * IAM_IT_MOVE permission) in a key order, can get record and key values as it +- * passes over them, and can modify container (provided IAM_IT_WRITE +- * permission). +- * +- * Concurrency: iterators are supposed to be local to thread. Interfaces below +- * do no internal serialization. +- * +- */ +-struct iam_iterator { +- /* +- * iterator flags, taken from enum iam_it_flags. +- */ +- __u32 ii_flags; +- enum iam_it_state ii_state; +- /* +- * path to the record. Valid in IAM_IT_ATTACHED state. +- */ +- struct iam_path ii_path; +-}; +- +-static inline struct iam_key *keycpy(struct iam_container *c, +- struct iam_key *k1, struct iam_key *k2) +-{ +- return memcpy(k1, k2, c->ic_descr->id_key_size); +-} +- +-static inline int keycmp(struct iam_container *c, +- struct iam_key *k1, struct iam_key *k2) +-{ +- return c->ic_descr->id_keycmp(c, k1, k2); +-} +- +-static struct iam_container *iam_it_container(struct iam_iterator *it) +-{ +- return it->ii_path.ip_container; +-} +- +-static inline int it_keycmp(struct iam_iterator *it, +- struct iam_key *k1, struct iam_key *k2) +-{ +- return keycmp(iam_it_container(it), k1, k2); +-} +- +-/* +- * Initialize iterator to IAM_IT_DETACHED state. 
+- * +- * postcondition: it_state(it) == IAM_IT_DETACHED +- */ +-int iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags); +-/* +- * Finalize iterator and release all resources. +- * +- * precondition: it_state(it) == IAM_IT_DETACHED +- */ +-void iam_it_fini(struct iam_iterator *it); +- +-/* +- * Attach iterator. After successful completion, @it points to record with the +- * largest key not larger than @k. Semantics of ->id_create() method guarantee +- * that such record will always be found. +- * +- * Return value: 0: positioned on existing record, +- * -ve: error. +- * +- * precondition: it_state(it) == IAM_IT_DETACHED +- * postcondition: ergo(result == 0, +- * (it_state(it) == IAM_IT_ATTACHED && +- * it_keycmp(it, iam_it_key_get(it, *), k) < 0)) +- */ +-int iam_it_get(struct iam_iterator *it, struct iam_key *k); +- +-/* +- * Duplicates iterator. +- * +- * postcondition: it_state(dst) == it_state(src) && +- * iam_it_container(dst) == iam_it_container(src) && +- * dst->ii_flags = src->ii_flags && +- * ergo(it_state(it) == IAM_IT_ATTACHED, +- * iam_it_rec_get(dst) == iam_it_rec_get(src) && +- * iam_it_key_get(dst, *1) == iam_it_key_get(src, *2)) +- */ +-void iam_it_dup(struct iam_iterator *dst, struct iam_iterator *src); +- +-/* +- * Detach iterator. Does nothing it detached state. +- * +- * postcondition: it_state(it) == IAM_IT_DETACHED +- */ +-void iam_it_put(struct iam_iterator *it); +- +-/* +- * Move iterator one record right. +- * +- * Return value: 0: success, +- * +1: end of container reached +- * -ve: error +- * +- * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_MOVE +- * postcondition: ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED) +- */ +-int iam_it_next(struct iam_iterator *it); +- +-/* +- * Return pointer to the record under iterator. +- * +- * precondition: it_state(it) == IAM_IT_ATTACHED +- * postcondition: it_state(it) == IAM_IT_ATTACHED +- */ +-const struct iam_rec *iam_it_rec_get(struct iam_iterator *it); +- +-/* +- * Replace contents of record under iterator. +- * +- * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE +- * postcondition: it_state(it) == IAM_IT_ATTACHED && +- * ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...)) +- */ +-int iam_it_rec_set(handle_t *h, struct iam_iterator *it, struct iam_rec *r); +- +-/* +- * Place key under iterator in @k, return @k +- * +- * precondition: it_state(it) == IAM_IT_ATTACHED +- * postcondition: it_state(it) == IAM_IT_ATTACHED +- */ +-const struct iam_key *iam_it_key_get(struct iam_iterator *it, +- struct iam_key *k); +- +-/* +- * Insert new record with key @k and contents from @r, shifting records to the +- * right. +- * +- * precondition: it_state(it) == IAM_IT_ATTACHED && +- * it->ii_flags&IAM_IT_WRITE && +- * it_keycmp(it, iam_it_key_get(it, *), k) < 0 +- * postcondition: it_state(it) == IAM_IT_ATTACHED && +- * ergo(result == 0, +- * it_keycmp(it, iam_it_key_get(it, *), k) == 0 && +- * !memcmp(iam_it_rec_get(it), r, ...)) +- */ +-int iam_it_rec_insert(handle_t *h, struct iam_iterator *it, +- struct iam_key *k, struct iam_rec *r); +-/* +- * Delete record under iterator. 
+- * +- * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE +- * postcondition: it_state(it) == IAM_IT_ATTACHED +- */ +-int iam_it_rec_delete(handle_t *h, struct iam_iterator *it); +- + #ifdef CONFIG_EXT3_INDEX + static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry); + static void dx_set_block(struct iam_path *p, + struct iam_entry *entry, unsigned value); +-static inline struct iam_key *dx_get_key(struct iam_path *p, +- struct iam_entry *entry, +- struct iam_key *key); +-static void dx_set_key(struct iam_path *p, struct iam_entry *entry, +- struct iam_key *key); +-static unsigned dx_get_count(struct iam_entry *entries); + static unsigned dx_get_limit(struct iam_entry *entries); + static void dx_set_count(struct iam_entry *entries, unsigned value); + static void dx_set_limit(struct iam_entry *entries, unsigned value); + static unsigned dx_root_limit(struct iam_path *p); + static unsigned dx_node_limit(struct iam_path *p); +-static int dx_probe(struct dentry *dentry, ++static int dx_probe(struct qstr *name, + struct inode *dir, + struct dx_hash_info *hinfo, + struct iam_path *path); +@@ -694,269 +102,58 @@ + static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, + struct dx_map_entry *offsets, int count); + static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); +-static void dx_insert_block (struct iam_path *path, +- struct iam_frame *frame, u32 hash, u32 block); +-static int ext3_htree_next_block(struct inode *dir, __u32 hash, +- struct iam_path *path, __u32 *start_hash); + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, + struct ext3_dir_entry_2 **res_dir, int *err); + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); +- +-static inline void iam_path_init(struct iam_path *path, +- struct iam_container *c); +-static inline void iam_path_fini(struct iam_path *path); +- +- +-/* +- * Future: use high four bits of block for coalesce-on-delete flags +- * Mask them off for now. 
+- */ +- +-static inline void *entry_off(struct iam_entry *entry, ptrdiff_t off) +-{ +- return (void *)((char *)entry + off); +-} +- +-static inline struct iam_descr *path_descr(struct iam_path *p) +-{ +- return p->ip_container->ic_descr; +-} +- +-static inline struct inode *path_obj(struct iam_path *p) +-{ +- return p->ip_container->ic_object; +-} +- +-static inline size_t iam_entry_size(struct iam_path *p) +-{ +- return path_descr(p)->id_key_size + path_descr(p)->id_ptr_size; +-} +- +-static inline struct iam_entry *iam_entry_shift(struct iam_path *p, +- struct iam_entry *entry, int shift) +-{ +- void *e = entry; +- return e + shift * iam_entry_size(p); +-} +- +-static inline ptrdiff_t iam_entry_diff(struct iam_path *p, +- struct iam_entry *e1, struct iam_entry *e2) +-{ +- ptrdiff_t diff; +- +- diff = (void *)e1 - (void *)e2; +- assert(diff / iam_entry_size(p) * iam_entry_size(p) == diff); +- return diff / iam_entry_size(p); +-} +- +-static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry) +-{ +- return le32_to_cpu(*(u32 *)entry_off(entry, path_descr(p)->id_key_size)) +- & 0x00ffffff; +-} +- +-static inline void dx_set_block(struct iam_path *p, +- struct iam_entry *entry, unsigned value) +-{ +- *(u32*)entry_off(entry, +- path_descr(p)->id_key_size) = cpu_to_le32(value); +-} +- +-static inline struct iam_key *dx_get_key(struct iam_path *p, +- struct iam_entry *entry, +- struct iam_key *key) +-{ +- memcpy(key, entry, path_descr(p)->id_key_size); +- return key; +-} +- +-static inline struct iam_key *iam_key_at(struct iam_path *p, +- struct iam_entry *entry) +-{ +- return (struct iam_key *)entry; +-} +- +-static inline void dx_set_key(struct iam_path *p, +- struct iam_entry *entry, struct iam_key *key) +-{ +- memcpy(entry, key, path_descr(p)->id_key_size); +-} +- +-static inline unsigned dx_get_count (struct iam_entry *entries) +-{ +- return le16_to_cpu(((struct dx_countlimit *) entries)->count); +-} +- +-static inline unsigned dx_get_limit (struct iam_entry *entries) +-{ +- return le16_to_cpu(((struct dx_countlimit *) entries)->limit); +-} +- +-static inline void dx_set_count (struct iam_entry *entries, unsigned value) +-{ +- ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); +-} +- +-static inline void dx_set_limit (struct iam_entry *entries, unsigned value) ++static inline void dx_set_limit(struct iam_entry *entries, unsigned value) + { + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); + } + +-static inline unsigned dx_root_limit(struct iam_path *p) +-{ +- struct iam_descr *param = path_descr(p); +- unsigned entry_space = path_obj(p)->i_sb->s_blocksize - +- param->id_root_gap; +- return entry_space / (param->id_key_size + param->id_ptr_size); +-} +- +-static inline unsigned dx_node_limit(struct iam_path *p) ++int dx_index_is_compat(struct iam_path *path) + { +- struct iam_descr *param = path_descr(p); +- unsigned entry_space = path_obj(p)->i_sb->s_blocksize - +- param->id_node_gap; +- return entry_space / (param->id_key_size + param->id_ptr_size); ++ return iam_path_descr(path) == &iam_htree_compat_param; + } + +-static inline int dx_index_is_compat(struct iam_path *path) +-{ +- return path_descr(path) == &htree_compat_param; +-} +- +-static struct iam_entry *dx_get_entries(struct iam_path *path, void *data, +- int root) +-{ +- return data + +- (root ? 
+- path_descr(path)->id_root_gap : path_descr(path)->id_node_gap); +-} + +-static struct iam_entry *dx_node_get_entries(struct iam_path *path, +- struct iam_frame *frame) +-{ +- return dx_get_entries(path, +- frame->bh->b_data, frame == path->ip_frames); +-} +- +-static int dx_node_check(struct iam_path *p, struct iam_frame *f) ++int dx_node_check(struct iam_path *p, struct iam_frame *f) + { + struct iam_entry *e; + struct iam_container *c; + unsigned count; +- unsigned i; +- +- c = p->ip_container; +- e = dx_node_get_entries(p, f); +- count = dx_get_count(e); +- e = iam_entry_shift(p, e, 1); +- for (i = 0; i < count - 1; ++i, e = iam_entry_shift(p, e, 1)) { +- keycpy(c, p->ip_key_scratch[0], p->ip_key_scratch[1]); +- dx_get_key(p, e, p->ip_key_scratch[1]); +- if (i > 0 && +- keycmp(c, p->ip_key_scratch[0], p->ip_key_scratch[1]) > 0) +- return 0; +- } +- return 1; +-} +- +-static u32 htree_root_ptr(struct iam_container *c) +-{ +- return 0; +-} +- +-struct htree_cookie { +- struct dx_hash_info *hinfo; +- struct dentry *dentry; +-}; +- +-static int htree_node_check(struct iam_path *path, struct iam_frame *frame) +-{ +- void *data; +- struct iam_entry *entries; +- struct super_block *sb; +- +- data = frame->bh->b_data; +- entries = dx_node_get_entries(path, frame); +- sb = path_obj(path)->i_sb; +- if (frame == path->ip_frames) { +- /* root node */ +- struct dx_root *root; +- struct htree_cookie *hc = path->ip_descr_data; +- +- root = data; +- if (root->info.hash_version > DX_HASH_MAX) { +- ext3_warning(sb, __FUNCTION__, +- "Unrecognised inode hash code %d", +- root->info.hash_version); +- return ERR_BAD_DX_DIR; +- } +- +- if (root->info.unused_flags & 1) { +- ext3_warning(sb, __FUNCTION__, +- "Unimplemented inode hash flags: %#06x", +- root->info.unused_flags); +- return ERR_BAD_DX_DIR; +- } +- +- path->ip_indirect = root->info.indirect_levels; +- if (path->ip_indirect > DX_MAX_TREE_HEIGHT - 1) { +- ext3_warning(sb, __FUNCTION__, +- "Unimplemented inode hash depth: %#06x", +- root->info.indirect_levels); +- return ERR_BAD_DX_DIR; +- } +- +- assert((char *)entries == (((char *)&root->info) + +- root->info.info_length)); +- assert(dx_get_limit(entries) == dx_root_limit(path)); +- +- hc->hinfo->hash_version = root->info.hash_version; +- hc->hinfo->seed = EXT3_SB(sb)->s_hash_seed; +- if (hc->dentry) +- ext3fs_dirhash(hc->dentry->d_name.name, +- hc->dentry->d_name.len, hc->hinfo); +- path->ip_key_target = (struct iam_key *)&hc->hinfo->hash; +- } else { +- /* non-root index */ +- assert(entries == data + path_descr(path)->id_node_gap); +- assert(dx_get_limit(entries) == dx_node_limit(path)); +- } +- frame->entries = frame->at = entries; +- return 0; +-} +- +-static int htree_node_init(struct iam_container *c, +- struct buffer_head *bh, int root) +-{ +- struct dx_node *node; +- +- assert(!root); +- +- node = (void *)bh->b_data; +- node->fake.rec_len = cpu_to_le16(c->ic_object->i_sb->s_blocksize); +- node->fake.inode = 0; +- return 0; +-} +- +-static int htree_node_read(struct iam_container *c, iam_ptr_t ptr, +- handle_t *handle, struct buffer_head **bh) +-{ +- int result = 0; +- +- *bh = ext3_bread(handle, c->ic_object, (int)ptr, 0, &result); +- if (*bh == NULL) +- result = -EIO; +- return result; +-} ++ unsigned i; ++ iam_ptr_t blk; ++ iam_ptr_t root; ++ struct inode *inode; + +-static int htree_keycmp(struct iam_container *c, +- struct iam_key *k1, struct iam_key *k2) +-{ +- __u32 p1 = le32_to_cpu(*(__u32 *)k1); +- __u32 p2 = le32_to_cpu(*(__u32 *)k2); ++ c = p->ip_container; ++ e = 
dx_node_get_entries(p, f); ++ count = dx_get_count(e); ++ e = iam_entry_shift(p, e, 1); ++ root = iam_path_descr(p)->id_ops->id_root_ptr(c); + +- return p1 > p2 ? +1 : (p1 < p2 ? -1 : 0); ++ inode = iam_path_obj(p); ++ for (i = 0; i < count - 1; ++i, e = iam_entry_shift(p, e, 1)) { ++ iam_ikeycpy(c, iam_path_ikey(p, 0), iam_path_ikey(p, 1)); ++ iam_get_ikey(p, e, iam_path_ikey(p, 1)); ++ if (i > 0 && ++ iam_ikeycmp(c, iam_path_ikey(p, 0), ++ iam_path_ikey(p, 1)) > 0) ++ return 0; ++ blk = dx_get_block(p, e); ++ /* ++ * Disable this check as it is racy. ++ */ ++ if (0 && inode->i_size < (blk + 1) * inode->i_sb->s_blocksize) ++ return 0; ++ /* ++ * By definition of a tree, no node points to the root. ++ */ ++ if (blk == root) ++ return 0; ++ } ++ return 1; + } + + /* +@@ -1042,177 +239,379 @@ + } + #endif /* DX_DEBUG */ + +-static int dx_lookup(struct iam_path *path) +-{ +- u32 ptr; +- int err = 0; +- int i; ++/* ++ * Per-node tree locking. ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ */ + +- struct iam_descr *param; +- struct iam_frame *frame; +- struct iam_container *c; ++/* FIXME: this should be reworked using bb_spin_lock ++ * introduced in -mm tree ++ */ ++#define BH_DXLock 25 + +- param = path_descr(path); +- c = path->ip_container; +- +- for (frame = path->ip_frames, i = 0, +- ptr = param->id_root_ptr(path->ip_container); +- i <= path->ip_indirect; +- ptr = dx_get_block(path, frame->at), ++frame, ++i) { +- struct iam_entry *entries; +- struct iam_entry *p; +- struct iam_entry *q; +- struct iam_entry *m; +- unsigned count; ++#define DX_DEBUG (1) + +- err = param->id_node_read(c, (iam_ptr_t)ptr, NULL, &frame->bh); +- if (err != 0) +- break; +- err = param->id_node_check(path, frame); +- if (err != 0) +- break; ++#if DX_DEBUG ++static struct dx_lock_stats { ++ unsigned dls_bh_lock; ++ unsigned dls_bh_busy; ++ unsigned dls_bh_again; ++ unsigned dls_bh_full_again; ++} dx_lock_stats = { 0, }; ++#define DX_DEVAL(x) x ++#else ++#define DX_DEVAL(x) ++#endif + +- assert(dx_node_check(path, frame)); ++static inline void dx_lock_bh(struct buffer_head volatile *bh) ++{ ++ DX_DEVAL(dx_lock_stats.dls_bh_lock++); ++#ifdef CONFIG_SMP ++ while (test_and_set_bit(BH_DXLock, &bh->b_state)) { ++ DX_DEVAL(dx_lock_stats.dls_bh_busy++); ++ while (test_bit(BH_DXLock, &bh->b_state)) ++ cpu_relax(); ++ } ++#endif ++} + +- entries = frame->entries; +- count = dx_get_count(entries); +- assert(count && count <= dx_get_limit(entries)); +- p = iam_entry_shift(path, entries, 1); +- q = iam_entry_shift(path, entries, count - 1); +- while (p <= q) { +- m = iam_entry_shift(path, +- p, iam_entry_diff(path, q, p) / 2); +- dxtrace(printk(".")); +- if (keycmp(c, iam_key_at(path, m), +- path->ip_key_target) > 0) +- q = iam_entry_shift(path, m, -1); +- else +- p = iam_entry_shift(path, m, +1); +- } ++static inline void dx_unlock_bh(struct buffer_head *bh) ++{ ++#ifdef CONFIG_SMP ++ smp_mb__before_clear_bit(); ++ clear_bit(BH_DXLock, &bh->b_state); ++#endif ++} + +- frame->at = iam_entry_shift(path, p, -1); +- if (1) { // linear search cross check +- unsigned n = count - 1; +- struct iam_entry *at; +- +- at = entries; +- while (n--) { +- dxtrace(printk(",")); +- at = iam_entry_shift(path, at, +1); +- if (keycmp(c, iam_key_at(path, at), +- path->ip_key_target) > 0) { +- if (at != iam_entry_shift(path, frame->at, 1)) { +- BREAKPOINT; +- printk(KERN_EMERG "%i\n", +- keycmp(c, iam_key_at(path, at), +- path->ip_key_target)); +- } +- at = iam_entry_shift(path, at, -1); +- break; +- } +- } +- assert(at == frame->at); ++/* 
++ * this locking primitives are used to protect parts ++ * of dir's htree. protection unit is block: leaf or index ++ */ ++struct dynlock_handle *dx_lock_htree(struct inode *dir, unsigned long value, ++ enum dynlock_type lt) ++{ ++ return dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, lt, GFP_NOFS); ++} ++ ++void dx_unlock_htree(struct inode *dir, struct dynlock_handle *lh) ++{ ++ if (lh != NULL) ++ dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lh); ++} ++ ++static void dx_unlock_array(struct inode *dir, struct dynlock_handle **lh) ++{ ++ int i; ++ ++ for (i = 0; i < DX_MAX_TREE_HEIGHT; ++i, ++lh) { ++ if (*lh != NULL) { ++ dx_unlock_htree(dir, *lh); ++ *lh = NULL; + } + } +- if (err != 0) +- iam_path_fini(path); +- path->ip_frame = --frame; +- return err; + } + + /* +- * Probe for a directory leaf block to search. ++ * dx_find_position ++ * ++ * search position of specified hash in index + * +- * dx_probe can return ERR_BAD_DX_DIR, which means there was a format +- * error in the directory index, and the caller should fall back to +- * searching the directory normally. The callers of dx_probe **MUST** +- * check for this error code, and make sure it never gets reflected +- * back to userspace. + */ +-static int dx_probe(struct dentry *dentry, struct inode *dir, +- struct dx_hash_info *hinfo, struct iam_path *path) ++ ++struct iam_entry *dx_find_position(struct iam_path *path, ++ struct iam_frame *frame) + { +- int err; +- struct htree_cookie hc = { +- .dentry = dentry, +- .hinfo = hinfo +- }; ++ int count; ++ struct iam_entry *p; ++ struct iam_entry *q; ++ struct iam_entry *m; + +- assert(dx_index_is_compat(path)); +- path->ip_descr_data = &hc; +- err = dx_lookup(path); +- assert(err != 0 || path->ip_frames[path->ip_indirect].bh != NULL); +- return err; ++ count = dx_get_count(frame->entries); ++ assert_corr(count && count <= dx_get_limit(frame->entries)); ++ p = iam_entry_shift(path, frame->entries, ++ dx_index_is_compat(path) ? 1 : 2); ++ q = iam_entry_shift(path, frame->entries, count - 1); ++ while (p <= q) { ++ m = iam_entry_shift(path, p, iam_entry_diff(path, q, p) / 2); ++ if (iam_ikeycmp(path->ip_container, iam_ikey_at(path, m), ++ path->ip_ikey_target) > 0) ++ q = iam_entry_shift(path, m, -1); ++ else ++ p = iam_entry_shift(path, m, +1); ++ } ++ return iam_entry_shift(path, p, -1); ++} ++ ++static iam_ptr_t dx_find_ptr(struct iam_path *path, struct iam_frame *frame) ++{ ++ return dx_get_block(path, dx_find_position(path, frame)); + } + + /* +- * Initialize container @c, acquires additional reference on @inode. ++ * Fast check for frame consistency. 
+ */ +-int iam_container_init(struct iam_container *c, +- struct iam_descr *descr, struct inode *inode) ++static int dx_check_fast(struct iam_path *path, struct iam_frame *frame) + { +- memset(c, 0, sizeof *c); +- c->ic_descr = descr; +- c->ic_object = igrab(inode); +- if (c->ic_object != NULL) +- return 0; +- else +- return -ENOENT; ++ struct iam_container *bag; ++ struct iam_entry *next; ++ struct iam_entry *last; ++ struct iam_entry *entries; ++ struct iam_entry *at; ++ ++ bag = path->ip_container; ++ at = frame->at; ++ entries = frame->entries; ++ last = iam_entry_shift(path, entries, dx_get_count(entries) - 1); ++ ++ if (unlikely(at > last)) ++ return -EAGAIN; ++ ++ if (unlikely(dx_get_block(path, at) != frame->leaf)) ++ return -EAGAIN; ++ ++ if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, at), ++ path->ip_ikey_target) > 0)) ++ return -EAGAIN; ++ ++ next = iam_entry_shift(path, at, +1); ++ if (next <= last) { ++ if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, next), ++ path->ip_ikey_target) <= 0)) ++ return -EAGAIN; ++ } ++ return 0; + } + + /* +- * Finalize container @c, release all resources. ++ * returns 0 if path was unchanged, -EAGAIN otherwise. + */ +-void iam_container_fini(struct iam_container *c) ++static int dx_check_path(struct iam_path *path, struct iam_frame *frame) + { +- if (c->ic_object != NULL) { +- iput(c->ic_object); +- c->ic_object = NULL; +- } ++ int equal; ++ ++ dx_lock_bh(frame->bh); ++ equal = dx_check_fast(path, frame) == 0 || ++ frame->leaf == dx_find_ptr(path, frame); ++ DX_DEVAL(dx_lock_stats.dls_bh_again += !equal); ++ dx_unlock_bh(frame->bh); ++ ++ return equal ? 0 : -EAGAIN; + } + +-static inline void iam_path_init(struct iam_path *path, struct iam_container *c) ++/* ++ * returns 0 if path was unchanged, -EAGAIN otherwise. ++ */ ++static int dx_check_full_path(struct iam_path *path, int search) + { +- memset(path, 0, sizeof *path); +- path->ip_container = c; +- path->ip_frame = path->ip_frames; ++ struct iam_frame *bottom; ++ struct iam_frame *scan; ++ int i; ++ int result; ++ ++ do_corr(schedule()); ++ ++ for (bottom = path->ip_frames, i = 0; ++ i < DX_MAX_TREE_HEIGHT && bottom->bh != NULL; ++bottom, ++i) { ++ ; /* find last filled in frame */ ++ } ++ ++ /* ++ * Lock frames, bottom to top. ++ */ ++ for (scan = bottom - 1; scan >= path->ip_frames; --scan) ++ dx_lock_bh(scan->bh); ++ /* ++ * Check them top to bottom. ++ */ ++ result = 0; ++ for (scan = path->ip_frames; scan < bottom; ++scan) { ++ struct iam_entry *pos; ++ ++ if (search) { ++ if (dx_check_fast(path, scan) == 0) ++ continue; ++ ++ pos = dx_find_position(path, scan); ++ if (scan->leaf != dx_get_block(path, pos)) { ++ result = -EAGAIN; ++ break; ++ } ++ scan->at = pos; ++ } else { ++ pos = iam_entry_shift(path, scan->entries, ++ dx_get_count(scan->entries) - 1); ++ if (scan->at > pos || ++ scan->leaf != dx_get_block(path, scan->at)) { ++ result = -EAGAIN; ++ break; ++ } ++ } ++ } ++ ++ /* ++ * Unlock top to bottom. 
++ */ ++ for (scan = path->ip_frames; scan < bottom; ++scan) ++ dx_unlock_bh(scan->bh); ++ DX_DEVAL(dx_lock_stats.dls_bh_full_again += !!result); ++ do_corr(schedule()); ++ ++ return result; + } + +-static inline void iam_path_fini(struct iam_path *path) ++static int dx_lookup_try(struct iam_path *path) + { ++ u32 ptr; ++ int err = 0; + int i; + +- for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) { +- if (path->ip_frames[i].bh != NULL) { +- brelse(path->ip_frames[i].bh); +- path->ip_frames[i].bh = NULL; ++ struct iam_descr *param; ++ struct iam_frame *frame; ++ struct iam_container *c; ++ ++ param = iam_path_descr(path); ++ c = path->ip_container; ++ ++ ptr = param->id_ops->id_root_ptr(c); ++ for (frame = path->ip_frames, i = 0; i <= path->ip_indirect; ++ ++frame, ++i) { ++ err = param->id_ops->id_node_read(c, (iam_ptr_t)ptr, NULL, ++ &frame->bh); ++ do_corr(schedule()); ++ ++ dx_lock_bh(frame->bh); ++ /* ++ * node must be initialized under bh lock because concurrent ++ * creation procedure may change it and dx_lookup_try() will ++ * see obsolete tree height. -bzzz ++ */ ++ if (err != 0) ++ break; ++ ++ if (EXT3_INVARIANT_ON) { ++ err = param->id_ops->id_node_check(path, frame); ++ if (err != 0) ++ break; ++ } ++ ++ err = param->id_ops->id_node_load(path, frame); ++ if (err != 0) ++ break; ++ ++ assert_inv(dx_node_check(path, frame)); ++ /* ++ * splitting may change root index block and move hash we're ++ * looking for into another index block so, we have to check ++ * this situation and repeat from begining if path got changed ++ * -bzzz ++ */ ++ if (i > 0) { ++ err = dx_check_path(path, frame - 1); ++ if (err != 0) ++ break; + } ++ ++ frame->at = dx_find_position(path, frame); ++ frame->curidx = ptr; ++ frame->leaf = ptr = dx_get_block(path, frame->at); ++ ++ dx_unlock_bh(frame->bh); ++ do_corr(schedule()); + } ++ if (err != 0) ++ dx_unlock_bh(frame->bh); ++ path->ip_frame = --frame; ++ return err; + } + +-static void iam_path_compat_init(struct iam_path_compat *path, +- struct inode *inode) ++static int dx_lookup(struct iam_path *path) + { ++ int err; + int i; + +- iam_container_init(&path->ipc_container, &htree_compat_param, inode); +- /* +- * XXX hack allowing finalization of iam_path_compat with +- * iam_path_fini(). +- */ +- iput(inode); +- iam_path_init(&path->ipc_path, &path->ipc_container); +- for (i = 0; i < ARRAY_SIZE(path->ipc_path.ip_key_scratch); ++i) +- path->ipc_path.ip_key_scratch[i] = +- (struct iam_key *)&path->ipc_scrach[i]; ++ for (i = 0; i < DX_MAX_TREE_HEIGHT; ++ i) ++ assert(path->ip_frames[i].bh == NULL); ++ ++ do { ++ err = dx_lookup_try(path); ++ do_corr(schedule()); ++ if (err != 0) ++ iam_path_fini(path); ++ } while (err == -EAGAIN); ++ ++ return err; ++} ++ ++/* ++ * Performs path lookup and returns with found leaf (if any) locked by htree ++ * lock. 
++ */ ++int dx_lookup_lock(struct iam_path *path, ++ struct dynlock_handle **dl, enum dynlock_type lt) ++{ ++ int result; ++ struct inode *dir; ++ ++ dir = iam_path_obj(path); ++ while ((result = dx_lookup(path)) == 0) { ++ do_corr(schedule()); ++ *dl = dx_lock_htree(dir, path->ip_frame->leaf, lt); ++ if (*dl == NULL) { ++ iam_path_fini(path); ++ result = -ENOMEM; ++ break; ++ } ++ do_corr(schedule()); ++ /* ++ * while locking leaf we just found may get split so we need ++ * to check this -bzzz ++ */ ++ if (dx_check_full_path(path, 1) == 0) ++ break; ++ dx_unlock_htree(dir, *dl); ++ *dl = NULL; ++ iam_path_fini(path); ++ } ++ return result; + } + +-static void iam_path_compat_fini(struct iam_path_compat *path) ++/* ++ * Probe for a directory leaf block to search. ++ * ++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format ++ * error in the directory index, and the caller should fall back to ++ * searching the directory normally. The callers of dx_probe **MUST** ++ * check for this error code, and make sure it never gets reflected ++ * back to userspace. ++ */ ++static int dx_probe(struct qstr *name, struct inode *dir, ++ struct dx_hash_info *hinfo, struct iam_path *path) + { +- iam_path_fini(&path->ipc_path); +- iam_container_fini(&path->ipc_container); ++ int err; ++ struct iam_path_compat *ipc; ++ ++ assert_corr(path->ip_data != NULL); ++ ipc = container_of(path->ip_data, struct iam_path_compat, ipc_descr); ++ ipc->ipc_qstr = name; ++ ipc->ipc_hinfo = hinfo; ++ ++ assert_corr(dx_index_is_compat(path)); ++ err = dx_lookup(path); ++ assert_corr(err != 0 || path->ip_frames[path->ip_indirect].bh != NULL); ++ return err; + } + ++ + /* + * This function increments the frame pointer to search the next leaf + * block, and reads in the necessary intervening nodes if the search +@@ -1230,16 +629,15 @@ + * If start_hash is non-null, it will be filled in with the starting + * hash of the next page. + */ +-static int ext3_htree_next_block(struct inode *dir, __u32 hash, +- struct iam_path *path, __u32 *start_hash) ++static int ext3_htree_advance(struct inode *dir, __u32 hash, ++ struct iam_path *path, __u32 *start_hash, ++ int compat) + { + struct iam_frame *p; + struct buffer_head *bh; + int err, num_frames = 0; + __u32 bhash; + +- assert(dx_index_is_compat(path)); +- + p = path->ip_frame; + /* + * Find the next leaf page by incrementing the frame pointer. +@@ -1249,16 +647,26 @@ + * nodes need to be read. + */ + while (1) { ++ do_corr(schedule()); ++ dx_lock_bh(p->bh); + p->at = iam_entry_shift(path, p->at, +1); + if (p->at < iam_entry_shift(path, p->entries, +- dx_get_count(p->entries))) ++ dx_get_count(p->entries))) { ++ p->leaf = dx_get_block(path, p->at); ++ dx_unlock_bh(p->bh); + break; ++ } ++ dx_unlock_bh(p->bh); + if (p == path->ip_frames) + return 0; + num_frames++; + --p; + } + ++ if (compat) { ++ /* ++ * Htree hash magic. ++ */ + /* + * If the hash is 1, then continue only if the next page has a + * continuation hash of any value. This is used for readdir +@@ -1266,33 +674,146 @@ + * desired contiuation hash. If it doesn't, return since + * there's no point to read in the successive index pages. 
+ */ +- dx_get_key(path, p->at, (struct iam_key *)&bhash); ++ iam_get_ikey(path, p->at, (struct iam_ikey *)&bhash); + if (start_hash) + *start_hash = bhash; + if ((hash & 1) == 0) { + if ((bhash & ~1) != hash) + return 0; + } ++ } + /* + * If the hash is HASH_NB_ALWAYS, we always go to the next + * block so no check is necessary + */ + while (num_frames--) { +- err = path_descr(path)->id_node_read(path->ip_container, +- (iam_ptr_t)dx_get_block(path, p->at), +- NULL, &bh); ++ iam_ptr_t idx; ++ ++ do_corr(schedule()); ++ dx_lock_bh(p->bh); ++ idx = p->leaf = dx_get_block(path, p->at); ++ dx_unlock_bh(p->bh); ++ err = iam_path_descr(path)->id_ops-> ++ id_node_read(path->ip_container, idx, NULL, &bh); + if (err != 0) + return err; /* Failure */ + ++p; +- brelse (p->bh); ++ brelse(p->bh); ++ assert_corr(p->bh != bh); + p->bh = bh; +- p->at = p->entries = dx_node_get_entries(path, p); +- assert(dx_node_check(path, p)); ++ p->entries = dx_node_get_entries(path, p); ++ p->at = iam_entry_shift(path, p->entries, !compat); ++ assert_corr(p->curidx != idx); ++ p->curidx = idx; ++ dx_lock_bh(p->bh); ++ assert_corr(p->leaf != dx_get_block(path, p->at)); ++ p->leaf = dx_get_block(path, p->at); ++ dx_unlock_bh(p->bh); ++ assert_inv(dx_node_check(path, p)); + } + return 1; + } + +- ++int iam_index_lock(struct iam_path *path, struct dynlock_handle **lh) ++{ ++ struct iam_frame *f; ++ ++ for (f = path->ip_frame; f >= path->ip_frames; --f, ++lh) { ++ do_corr(schedule()); ++ *lh = dx_lock_htree(iam_path_obj(path), f->curidx, DLT_READ); ++ if (*lh == NULL) ++ return -ENOMEM; ++ } ++ return 0; ++} ++ ++static int iam_index_advance(struct iam_path *path) ++{ ++ return ext3_htree_advance(iam_path_obj(path), 0, path, NULL, 0); ++} ++ ++/* ++ * Advance index part of @path to point to the next leaf. Returns 1 on ++ * success, 0, when end of container was reached. Leaf node is locked. ++ */ ++int iam_index_next(struct iam_container *c, struct iam_path *path) ++{ ++ iam_ptr_t cursor; ++ struct dynlock_handle *lh[DX_MAX_TREE_HEIGHT] = { 0, }; ++ int result; ++ struct inode *object; ++ ++ /* ++ * Locking for iam_index_next()... is to be described. 
++ */ ++ ++ object = c->ic_object; ++ cursor = path->ip_frame->leaf; ++ ++ while (1) { ++ result = iam_index_lock(path, lh); ++ do_corr(schedule()); ++ if (result < 0) ++ break; ++ ++ result = dx_check_full_path(path, 0); ++ if (result == 0 && cursor == path->ip_frame->leaf) { ++ result = iam_index_advance(path); ++ ++ assert_corr(result == 0 || ++ cursor != path->ip_frame->leaf); ++ break; ++ } ++ do { ++ dx_unlock_array(object, lh); ++ ++ iam_path_release(path); ++ do_corr(schedule()); ++ ++ result = dx_lookup(path); ++ if (result < 0) ++ break; ++ ++ while (path->ip_frame->leaf != cursor) { ++ do_corr(schedule()); ++ ++ result = iam_index_lock(path, lh); ++ do_corr(schedule()); ++ if (result < 0) ++ break; ++ ++ result = dx_check_full_path(path, 0); ++ if (result != 0) ++ break; ++ ++ result = iam_index_advance(path); ++ if (result == 0) { ++ ext3_error(object->i_sb, __FUNCTION__, ++ "cannot find cursor: %u\n", ++ cursor); ++ result = -EIO; ++ } ++ if (result < 0) ++ break; ++ result = dx_check_full_path(path, 0); ++ if (result != 0) ++ break; ++ dx_unlock_array(object, lh); ++ } ++ } while (result == -EAGAIN); ++ if (result < 0) ++ break; ++ } ++ dx_unlock_array(object, lh); ++ return result; ++} ++ ++int ext3_htree_next_block(struct inode *dir, __u32 hash, ++ struct iam_path *path, __u32 *start_hash) ++{ ++ return ext3_htree_advance(dir, hash, path, start_hash, 1); ++} ++ + /* + * p is at least 6 bytes before the end of page + */ +@@ -1496,21 +1017,45 @@ + } while(more); + } + +-static void dx_insert_block(struct iam_path *path, +- struct iam_frame *frame, u32 hash, u32 block) ++void iam_insert_key(struct iam_path *path, struct iam_frame *frame, ++ const struct iam_ikey *key, iam_ptr_t ptr) + { + struct iam_entry *entries = frame->entries; +- struct iam_entry *old = frame->at, *new = iam_entry_shift(path, old, +1); ++ struct iam_entry *new = iam_entry_shift(path, frame->at, +1); + int count = dx_get_count(entries); + +- assert(count < dx_get_limit(entries)); +- assert(old < iam_entry_shift(path, entries, count)); ++ /* ++ * Unfortunately we cannot assert this, as this function is sometimes ++ * called by VFS under i_sem and without pdirops lock. ++ */ ++ assert_corr(1 || iam_frame_is_locked(path, frame)); ++ assert_corr(count < dx_get_limit(entries)); ++ assert_corr(frame->at < iam_entry_shift(path, entries, count)); ++ assert_inv(dx_node_check(path, frame)); ++ + memmove(iam_entry_shift(path, new, 1), new, + (char *)iam_entry_shift(path, entries, count) - (char *)new); +- dx_set_key(path, new, (struct iam_key *)&hash); +- dx_set_block(path, new, block); ++ dx_set_ikey(path, new, key); ++ dx_set_block(path, new, ptr); + dx_set_count(entries, count + 1); ++ assert_inv(dx_node_check(path, frame)); ++} ++ ++void iam_insert_key_lock(struct iam_path *path, struct iam_frame *frame, ++ const struct iam_ikey *key, iam_ptr_t ptr) ++{ ++ dx_lock_bh(frame->bh); ++ iam_insert_key(path, frame, key, ptr); ++ dx_unlock_bh(frame->bh); ++} ++ ++void dx_insert_block(struct iam_path *path, struct iam_frame *frame, ++ u32 hash, u32 block) ++{ ++ assert_corr(dx_index_is_compat(path)); ++ iam_insert_key(path, frame, (struct iam_ikey *)&hash, block); + } ++ + #endif + + +@@ -1727,7 +1272,7 @@ + sb = dir->i_sb; + /* NFS may look up ".." - look at dx_root directory block */ + if (namelen > 2 || name[0] != '.'||(name[1] != '.' 
&& name[1] != '\0')){ +- *err = dx_probe(dentry, NULL, &hinfo, path); ++ *err = dx_probe(&dentry->d_name, NULL, &hinfo, path); + if (*err != 0) + return NULL; + } else { +@@ -1737,7 +1282,8 @@ + hash = hinfo.hash; + do { + block = dx_get_block(path, path->ip_frame->at); +- *err = path_descr(path)->id_node_read(path->ip_container, (iam_ptr_t)block, ++ *err = iam_path_descr(path)->id_ops->id_node_read(path->ip_container, ++ (iam_ptr_t)block, + NULL, &bh); + if (*err != 0) + goto errout; +@@ -1927,22 +1473,69 @@ + return prev; + } + ++struct ext3_dir_entry_2 *move_entries(struct inode *dir, ++ struct dx_hash_info *hinfo, ++ struct buffer_head **bh1, ++ struct buffer_head **bh2, ++ __u32 *delim_hash) ++{ ++ char *data1; ++ char *data2; ++ unsigned blocksize = dir->i_sb->s_blocksize; ++ unsigned count; ++ unsigned continued; ++ unsigned split; ++ u32 hash2; ++ ++ struct dx_map_entry *map; ++ struct ext3_dir_entry_2 *de1; ++ struct ext3_dir_entry_2 *de2; ++ ++ data1 = (*bh1)->b_data; ++ data2 = (*bh2)->b_data; ++ ++ /* create map in the end of data2 block */ ++ map = (struct dx_map_entry *) (data2 + blocksize); ++ count = dx_make_map((struct ext3_dir_entry_2 *) data1, ++ blocksize, hinfo, map); ++ map -= count; ++ split = count/2; // need to adjust to actual middle ++ dx_sort_map(map, count); ++ hash2 = map[split].hash; ++ continued = hash2 == map[split - 1].hash; ++ dxtrace(printk("Split block %i at %x, %i/%i\n", ++ frame->leaf, hash2, split, count - split)); ++ ++ /* Fancy dance to stay within two buffers */ ++ de2 = dx_move_dirents(data1, data2, map + split, count - split); ++ de1 = dx_pack_dirents(data1, blocksize); ++ de1->rec_len = cpu_to_le16(data1 + blocksize - (char *) de1); ++ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); ++ dxtrace(dx_show_leaf(hinfo, ++ (struct ext3_dir_entry_2 *) data1, blocksize, 1)); ++ dxtrace(dx_show_leaf(hinfo, ++ (struct ext3_dir_entry_2 *) data2, blocksize, 1)); ++ ++ /* Which block gets the new entry? 
*/ ++ if (hinfo->hash >= hash2) { ++ swap(*bh1, *bh2); ++ de1 = de2; ++ } ++ *delim_hash = hash2 + continued; ++ return de1; ++} ++ + /* Allocate new node, and split leaf node @bh into it, inserting new pointer + * into parent node identified by @frame */ + static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct iam_path *path, + struct buffer_head **bh,struct iam_frame *frame, + struct dx_hash_info *hinfo, int *error) + { +- struct inode *dir = path_obj(path); +- unsigned blocksize = dir->i_sb->s_blocksize; +- unsigned count, continued; ++ struct inode *dir = iam_path_obj(path); + struct buffer_head *bh2; + u32 newblock; + u32 hash2; +- struct dx_map_entry *map; +- char *data1 = (*bh)->b_data, *data2; +- unsigned split; +- struct ext3_dir_entry_2 *de = NULL, *de2; ++ struct ext3_dir_entry_2 *de = NULL; + int err; + + bh2 = ext3_append (handle, dir, &newblock, error); +@@ -1967,35 +1560,9 @@ + if (err) + goto journal_error; + +- data2 = bh2->b_data; +- +- /* create map in the end of data2 block */ +- map = (struct dx_map_entry *) (data2 + blocksize); +- count = dx_make_map ((struct ext3_dir_entry_2 *) data1, +- blocksize, hinfo, map); +- map -= count; +- split = count/2; // need to adjust to actual middle +- dx_sort_map (map, count); +- hash2 = map[split].hash; +- continued = hash2 == map[split - 1].hash; +- dxtrace(printk("Split block %i at %x, %i/%i\n", +- dx_get_block(frame->at), hash2, split, count-split)); +- +- /* Fancy dance to stay within two buffers */ +- de2 = dx_move_dirents(data1, data2, map + split, count - split); +- de = dx_pack_dirents(data1,blocksize); +- de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); +- de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); +- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); +- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); ++ de = move_entries(dir, hinfo, bh, &bh2, &hash2); + +- /* Which block gets the new entry? */ +- if (hinfo->hash >= hash2) +- { +- swap(*bh, bh2); +- de = de2; +- } +- dx_insert_block(path, frame, hash2 + continued, newblock); ++ dx_insert_block(path, frame, hash2, newblock); + err = ext3_journal_dirty_metadata (handle, bh2); + if (err) + goto journal_error; +@@ -2009,6 +1576,63 @@ + } + #endif + ++struct ext3_dir_entry_2 *find_insertion_point(struct inode *dir, ++ struct buffer_head *bh, ++ const char *name, int namelen) ++{ ++ struct ext3_dir_entry_2 *de; ++ char *top; ++ unsigned long offset; ++ int nlen; ++ int rlen; ++ int reclen; ++ ++ reclen = EXT3_DIR_REC_LEN(namelen); ++ de = (struct ext3_dir_entry_2 *)bh->b_data; ++ top = bh->b_data + dir->i_sb->s_blocksize - reclen; ++ offset = 0; ++ while ((char *) de <= top) { ++ if (!ext3_check_dir_entry("ext3_add_entry", ++ dir, de, bh, offset)) ++ return ERR_PTR(-EIO); ++ if (ext3_match(namelen, name, de)) ++ return ERR_PTR(-EEXIST); ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ if ((de->inode? 
rlen - nlen: rlen) >= reclen) ++ return de; ++ de = (struct ext3_dir_entry_2 *)((char *)de + rlen); ++ offset += rlen; ++ } ++ return ERR_PTR(-ENOSPC); ++} ++ ++struct ext3_dir_entry_2 *split_entry(struct inode *dir, ++ struct ext3_dir_entry_2 *de, ++ unsigned long ino, mode_t mode, ++ const char *name, int namelen) ++{ ++ int nlen; ++ int rlen; ++ ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ if (de->inode) { ++ struct ext3_dir_entry_2 *de1; ++ ++ de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); ++ de1->rec_len = cpu_to_le16(rlen - nlen); ++ de->rec_len = cpu_to_le16(nlen); ++ de = de1; ++ } ++ de->file_type = EXT3_FT_UNKNOWN; ++ de->inode = cpu_to_le32(ino); ++ if (ino != 0) ++ ext3_set_de_type(dir->i_sb, de, mode); ++ de->name_len = namelen; ++ memcpy(de->name, name, namelen); ++ return de; ++} + + /* + * Add a new entry into a directory (leaf) block. If de is non-NULL, +@@ -2028,34 +1652,16 @@ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; +- unsigned long offset = 0; +- unsigned short reclen; +- int nlen, rlen, err; +- char *top; ++ int err; + +- reclen = EXT3_DIR_REC_LEN(namelen); + if (!de) { +- de = (struct ext3_dir_entry_2 *)bh->b_data; +- top = bh->b_data + dir->i_sb->s_blocksize - reclen; +- while ((char *) de <= top) { +- if (!ext3_check_dir_entry("ext3_add_entry", dir, de, +- bh, offset)) { +- brelse (bh); +- return -EIO; +- } +- if (ext3_match (namelen, name, de)) { +- brelse (bh); +- return -EEXIST; +- } +- nlen = EXT3_DIR_REC_LEN(de->name_len); +- rlen = le16_to_cpu(de->rec_len); +- if ((de->inode? rlen - nlen: rlen) >= reclen) +- break; +- de = (struct ext3_dir_entry_2 *)((char *)de + rlen); +- offset += rlen; ++ de = find_insertion_point(dir, bh, name, namelen); ++ if (IS_ERR(de)) { ++ err = PTR_ERR(de); ++ if (err != -ENOSPC) ++ brelse(bh); ++ return err; + } +- if ((char *) de > top) +- return -ENOSPC; + } + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); +@@ -2066,22 +1672,9 @@ + } + + /* By now the buffer is marked for journaling */ +- nlen = EXT3_DIR_REC_LEN(de->name_len); +- rlen = le16_to_cpu(de->rec_len); +- if (de->inode) { +- struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); +- de1->rec_len = cpu_to_le16(rlen - nlen); +- de->rec_len = cpu_to_le16(nlen); +- de = de1; +- } +- de->file_type = EXT3_FT_UNKNOWN; +- if (inode) { +- de->inode = cpu_to_le32(inode->i_ino); +- ext3_set_de_type(dir->i_sb, de, inode->i_mode); +- } else +- de->inode = 0; +- de->name_len = namelen; +- memcpy (de->name, name, namelen); ++ ++ split_entry(dir, de, inode ? inode->i_ino : 0, ++ inode ? inode->i_mode : 0, name, namelen); + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend +@@ -2257,60 +1850,85 @@ + return add_dirent_to_buf(handle, dentry, inode, de, bh); + } + ++static int shift_entries(struct iam_path *path, ++ struct iam_frame *frame, unsigned count, ++ struct iam_entry *entries, struct iam_entry *entries2, ++ u32 newblock) ++{ ++ unsigned count1; ++ unsigned count2; ++ int delta; ++ ++ struct iam_frame *parent = frame - 1; ++ struct iam_ikey *pivot = iam_path_ikey(path, 3); ++ ++ delta = dx_index_is_compat(path) ? 
0 : +1; ++ ++ count1 = count/2 + delta; ++ count2 = count - count1; ++ iam_get_ikey(path, iam_entry_shift(path, entries, count1), pivot); ++ ++ dxtrace(printk("Split index %i/%i\n", count1, count2)); ++ ++ memcpy((char *) iam_entry_shift(path, entries2, delta), ++ (char *) iam_entry_shift(path, entries, count1), ++ count2 * iam_entry_size(path)); ++ ++ dx_set_count(entries2, count2 + delta); ++ dx_set_limit(entries2, dx_node_limit(path)); ++ ++ /* ++ * NOTE: very subtle piece of code competing dx_probe() may find 2nd ++ * level index in root index, then we insert new index here and set ++ * new count in that 2nd level index. so, dx_probe() may see 2nd level ++ * index w/o hash it looks for. the solution is to check root index ++ * after we locked just founded 2nd level index -bzzz ++ */ ++ iam_insert_key_lock(path, parent, pivot, newblock); ++ ++ /* ++ * now old and new 2nd level index blocks contain all pointers, so ++ * dx_probe() may find it in the both. it's OK -bzzz ++ */ ++ dx_lock_bh(frame->bh); ++ dx_set_count(entries, count1); ++ dx_unlock_bh(frame->bh); ++ ++ /* ++ * now old 2nd level index block points to first half of leafs. it's ++ * importand that dx_probe() must check root index block for changes ++ * under dx_lock_bh(frame->bh) -bzzz ++ */ ++ ++ return count1; ++} ++ + #ifdef CONFIG_EXT3_INDEX +-/* +- * Returns 0 for success, or a negative error value +- */ +-static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) ++int split_index_node(handle_t *handle, struct iam_path *path, ++ struct dynlock_handle **lh) + { +- struct iam_path_compat cpath; +- struct iam_path *path = &cpath.ipc_path; +- struct iam_descr *param; +- struct iam_frame *frame, *safe; ++ + struct iam_entry *entries; /* old block contents */ + struct iam_entry *entries2; /* new block contents */ +- struct dx_hash_info hinfo; +- struct buffer_head * bh; ++ struct iam_frame *frame, *safe; + struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0}; +- struct inode *dir = dentry->d_parent->d_inode; +- struct super_block * sb = dir->i_sb; +- struct ext3_dir_entry_2 *de; + u32 newblock[DX_MAX_TREE_HEIGHT] = {0}; +- int err; ++ struct dynlock_handle *lock[DX_MAX_TREE_HEIGHT] = {NULL,}; ++ struct dynlock_handle *new_lock[DX_MAX_TREE_HEIGHT] = {NULL,}; ++ struct inode *dir = iam_path_obj(path); ++ struct iam_descr *descr; + int nr_splet; +- int i; +- size_t isize; ++ int i, err; + +- iam_path_compat_init(&cpath, dir); +- param = path_descr(path); ++ descr = iam_path_descr(path); ++ /* ++ * Algorithm below depends on this. ++ */ ++ assert_corr(dx_root_limit(path) < dx_node_limit(path)); + +- err = dx_probe(dentry, NULL, &hinfo, path); +- if (err != 0) +- return err; + frame = path->ip_frame; + entries = frame->entries; + +- /* XXX nikita: global serialization! */ +- isize = dir->i_size; +- +- err = param->id_node_read(path->ip_container, +- (iam_ptr_t)dx_get_block(path, +- frame->at), handle, &bh); +- if (err != 0) +- goto cleanup; +- +- BUFFER_TRACE(bh, "get_write_access"); +- err = ext3_journal_get_write_access(handle, bh); +- if (err) +- goto journal_error; +- +- err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); +- if (err != -ENOSPC) { +- bh = NULL; +- goto cleanup; +- } +- + /* + * Tall-tree handling: we might have to split multiple index blocks + * all the way up to tree root. Tricky point here is error handling: +@@ -2319,12 +1937,14 @@ + * - first allocate all necessary blocks + * + * - insert pointers into them atomically. 
+- * +- * XXX nikita: this algorithm is *not* scalable, as it assumes that at +- * least nodes in the path are locked. + */ + +- /* Block full, should compress but for now just split */ ++ /* ++ * Locking: leaf is already locked. htree-locks are acquired on all ++ * index nodes that require split bottom-to-top, on the "safe" node, ++ * and on all new nodes ++ */ ++ + dxtrace(printk("using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); + +@@ -2332,8 +1952,9 @@ + for (nr_splet = 0; frame >= path->ip_frames && + dx_get_count(frame->entries) == dx_get_limit(frame->entries); + --frame, ++nr_splet) { ++ do_corr(schedule()); + if (nr_splet == DX_MAX_TREE_HEIGHT) { +- ext3_warning(sb, __FUNCTION__, ++ ext3_warning(dir->i_sb, __FUNCTION__, + "Directory index full!\n"); + err = -ENOSPC; + goto cleanup; +@@ -2341,13 +1962,53 @@ + } + + safe = frame; +- /* Go back down, allocating blocks, and adding blocks into ++ ++ /* ++ * Lock all nodes, bottom to top. ++ */ ++ for (frame = path->ip_frame, i = nr_splet; i >= 0; --i, --frame) { ++ do_corr(schedule()); ++ lock[i] = dx_lock_htree(dir, frame->curidx, DLT_WRITE); ++ if (lock[i] == NULL) { ++ err = -ENOMEM; ++ goto cleanup; ++ } ++ } ++ ++ /* ++ * Check for concurrent index modification. ++ */ ++ err = dx_check_full_path(path, 1); ++ if (err) ++ goto cleanup; ++ /* ++ * And check that the same number of nodes is to be split. ++ */ ++ for (i = 0, frame = path->ip_frame; frame >= path->ip_frames && ++ dx_get_count(frame->entries) == dx_get_limit(frame->entries); ++ --frame, ++i) { ++ ; ++ } ++ if (i != nr_splet) { ++ err = -EAGAIN; ++ goto cleanup; ++ } ++ ++ /* Go back down, allocating blocks, locking them, and adding into + * transaction... */ + for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) { + bh_new[i] = ext3_append (handle, dir, &newblock[i], &err); ++ do_corr(schedule()); + if (!bh_new[i] || +- param->id_node_init(path->ip_container, bh_new[i], 0) != 0) ++ descr->id_ops->id_node_init(path->ip_container, ++ bh_new[i], 0) != 0) ++ goto cleanup; ++ new_lock[i] = dx_lock_htree(dir, newblock[i], DLT_WRITE); ++ if (new_lock[i] == NULL) { ++ err = -ENOMEM; + goto cleanup; ++ } ++ do_corr(schedule()); + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, frame->bh); + if (err) +@@ -2355,6 +2016,7 @@ + } + /* Add "safe" node to transaction too */ + if (safe + 1 != path->ip_frames) { ++ do_corr(schedule()); + err = ext3_journal_get_write_access(handle, safe->bh); + if (err) + goto journal_error; +@@ -2365,6 +2027,7 @@ + unsigned count; + int idx; + struct buffer_head *bh2; ++ struct buffer_head *bh; + + entries = frame->entries; + count = dx_get_count(entries); +@@ -2373,6 +2036,7 @@ + bh2 = bh_new[i]; + entries2 = dx_get_entries(path, bh2->b_data, 0); + ++ bh = frame->bh; + if (frame == path->ip_frames) { + /* splitting root node. Tricky point: + * +@@ -2384,23 +2048,26 @@ + * capacity of the root node is smaller than that of + * non-root one. 
+ */ +- struct dx_root *root; +- u8 indirects; + struct iam_frame *frames; ++ struct iam_entry *next; ++ ++ assert_corr(i == 0); ++ ++ do_corr(schedule()); + + frames = path->ip_frames; +- root = (struct dx_root *) frames->bh->b_data; +- indirects = root->info.indirect_levels; +- dxtrace(printk("Creating new root %d\n", indirects)); + memcpy((char *) entries2, (char *) entries, + count * iam_entry_size(path)); + dx_set_limit(entries2, dx_node_limit(path)); + + /* Set up root */ +- dx_set_count(entries, 1); +- dx_set_block(path, entries, newblock[i]); +- root->info.indirect_levels = indirects + 1; ++ dx_lock_bh(frame->bh); ++ next = descr->id_ops->id_root_inc(path->ip_container, ++ path, frame); ++ dx_set_block(path, next, newblock[0]); ++ dx_unlock_bh(frame->bh); + ++ do_corr(schedule()); + /* Shift frames in the path */ + memmove(frames + 2, frames + 1, + (sizeof path->ip_frames) - 2 * sizeof frames[0]); +@@ -2408,54 +2075,146 @@ + frames[1].at = iam_entry_shift(path, entries2, idx); + frames[1].entries = entries = entries2; + frames[1].bh = bh2; +- assert(dx_node_check(path, frame)); ++ assert_inv(dx_node_check(path, frame)); ++ ++ path->ip_frame; + ++ frame; +- assert(dx_node_check(path, frame)); +- bh_new[i] = NULL; /* buffer head is "consumed" */ ++ assert_inv(dx_node_check(path, frame)); ++ bh_new[0] = NULL; /* buffer head is "consumed" */ + err = ext3_journal_get_write_access(handle, bh2); + if (err) + goto journal_error; ++ do_corr(schedule()); + } else { + /* splitting non-root index node. */ +- unsigned count1 = count/2, count2 = count - count1; +- unsigned hash2; +- +- dx_get_key(path, +- iam_entry_shift(path, entries, count1), +- (struct iam_key *)&hash2); +- +- dxtrace(printk("Split index %i/%i\n", count1, count2)); +- +- memcpy ((char *) entries2, +- (char *) iam_entry_shift(path, entries, count1), +- count2 * iam_entry_size(path)); +- dx_set_count (entries, count1); +- dx_set_count (entries2, count2); +- dx_set_limit (entries2, dx_node_limit(path)); ++ struct iam_frame *parent = frame - 1; + ++ do_corr(schedule()); ++ count = shift_entries(path, frame, count, ++ entries, entries2, newblock[i]); + /* Which index block gets the new entry? */ +- if (idx >= count1) { ++ if (idx >= count) { ++ int d = dx_index_is_compat(path) ? 0 : +1; ++ + frame->at = iam_entry_shift(path, entries2, +- idx - count1); ++ idx - count + d); + frame->entries = entries = entries2; ++ frame->curidx = newblock[i]; + swap(frame->bh, bh2); ++ assert_corr(lock[i + 1] != NULL); ++ assert_corr(new_lock[i] != NULL); ++ swap(lock[i + 1], new_lock[i]); + bh_new[i] = bh2; ++ parent->at = iam_entry_shift(path, ++ parent->at, +1); + } +- dx_insert_block(path, frame - 1, hash2, newblock[i]); +- assert(dx_node_check(path, frame)); +- assert(dx_node_check(path, frame - 1)); ++ assert_inv(dx_node_check(path, frame)); ++ assert_inv(dx_node_check(path, parent)); + dxtrace(dx_show_index ("node", frame->entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) + goto journal_error; ++ do_corr(schedule()); ++ err = ext3_journal_dirty_metadata(handle, parent->bh); ++ if (err) ++ goto journal_error; + } ++ do_corr(schedule()); ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ goto journal_error; ++ } ++ /* ++ * This function was called to make insertion of new leaf ++ * possible. Check that it fulfilled its obligations. 
++ */ ++ assert_corr(dx_get_count(path->ip_frame->entries) < ++ dx_get_limit(path->ip_frame->entries)); ++ assert_corr(lock[nr_splet] != NULL); ++ *lh = lock[nr_splet]; ++ lock[nr_splet] = NULL; ++ if (nr_splet > 0) { ++ /* ++ * Log ->i_size modification. ++ */ ++ err = ext3_mark_inode_dirty(handle, dir); ++ if (err) ++ goto journal_error; ++ } ++ goto cleanup; ++journal_error: ++ ext3_std_error(dir->i_sb, err); ++ ++cleanup: ++ dx_unlock_array(dir, lock); ++ dx_unlock_array(dir, new_lock); ++ ++ assert_corr(err || iam_frame_is_locked(path, path->ip_frame)); ++ ++ do_corr(schedule()); ++ for (i = 0; i < ARRAY_SIZE(bh_new); ++i) { ++ if (bh_new[i] != NULL) ++ brelse(bh_new[i]); ++ } ++ return err; ++} ++ ++/* ++ * Returns 0 for success, or a negative error value ++ */ ++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; ++ struct iam_descr *param; ++ struct iam_frame *frame; ++ struct dx_hash_info hinfo; ++ struct buffer_head * bh = NULL; ++ struct inode *dir = dentry->d_parent->d_inode; ++ struct ext3_dir_entry_2 *de; ++ struct dynlock_handle *dummy = NULL; ++ int err; ++ size_t isize; ++ ++ iam_path_compat_init(&cpath, dir); ++ param = iam_path_descr(path); ++ ++ err = dx_probe(&dentry->d_name, NULL, &hinfo, path); ++ if (err != 0) ++ return err; ++ frame = path->ip_frame; ++ ++ isize = dir->i_size; ++ ++ err = param->id_ops->id_node_read(path->ip_container, ++ (iam_ptr_t)dx_get_block(path, frame->at), ++ handle, &bh); ++ if (err != 0) ++ goto cleanup; ++ ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto journal_error; ++ ++ err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); ++ if (err != -ENOSPC) { ++ bh = NULL; ++ goto cleanup; + } +- de = do_split(handle, path, &bh, --frame, &hinfo, &err); ++ ++ err = split_index_node(handle, path, &dummy); ++ if (err) ++ goto cleanup; ++ ++ /*copy split inode too*/ ++ de = do_split(handle, path, &bh, path->ip_frame, &hinfo, &err); + if (!de) + goto cleanup; +- assert(dx_node_check(path, frame)); ++ ++ assert_inv(dx_node_check(path, frame)); + err = add_dirent_to_buf(handle, dentry, inode, de, bh); + goto cleanup2; + +@@ -2465,10 +2224,7 @@ + if (bh) + brelse(bh); + cleanup2: +- for (i = 0; i < ARRAY_SIZE(bh_new); ++i) { +- if (bh_new[i] != NULL) +- brelse(bh_new[i]); +- } ++ dx_unlock_htree(dir, dummy); + if (err) + inode->i_size = isize; + iam_path_fini(path); +@@ -2575,6 +2331,26 @@ + return ext3_new_inode(handle, dir, mode, inum); + } + ++struct inode *ext3_create_inode(handle_t *handle, struct inode * dir, int mode) ++{ ++ struct inode *inode; ++ ++ inode = ext3_new_inode(handle, dir, mode, 0); ++ if (!IS_ERR(inode)) { ++ if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode)) { ++#ifdef CONFIG_LDISKFS_FS_XATTR ++ inode->i_op = &ext3_special_inode_operations; ++#endif ++ } else { ++ inode->i_op = &ext3_file_inode_operations; ++ inode->i_fop = &ext3_file_operations; ++ ext3_set_aops(inode); ++ } ++ } ++ return inode; ++} ++EXPORT_SYMBOL(ext3_create_inode); ++ + /* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it +Index: linux-stage/fs/ext3/Makefile +=================================================================== +--- linux-stage.orig/fs/ext3/Makefile 2007-10-20 17:14:36.000000000 +0300 ++++ linux-stage/fs/ext3/Makefile 2007-10-20 17:14:39.000000000 +0300 +@@ -6,7 +6,7 @@ + + ext3-y := balloc.o 
bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o \ +- extents.o mballoc.o ++ extents.o mballoc.o iam.o iam_lfix.o iam_lvar.o iam_htree.o iam_uapi.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-stage/fs/ext3/dir.c +=================================================================== +--- linux-stage.orig/fs/ext3/dir.c 2007-10-20 17:14:33.000000000 +0300 ++++ linux-stage/fs/ext3/dir.c 2007-10-20 17:14:39.000000000 +0300 +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + + static unsigned char ext3_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +@@ -61,6 +62,7 @@ + } + + ++#if EXT3_INVARIANT_ON + int ext3_check_dir_entry (const char * function, struct inode * dir, + struct ext3_dir_entry_2 * de, + struct buffer_head * bh, +@@ -90,6 +92,7 @@ + rlen, de->name_len); + return error_msg == NULL ? 1 : 0; + } ++#endif + + static int ext3_readdir(struct file * filp, + void * dirent, filldir_t filldir) +@@ -308,12 +311,14 @@ + root->rb_node = NULL; + } + ++extern struct iam_private_info *ext3_iam_alloc_info(int flags); ++extern void ext3_iam_release_info(struct iam_private_info *info); + + struct dir_private_info *create_dir_info(loff_t pos) + { + struct dir_private_info *p; + +- p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); ++ p = (void *)ext3_iam_alloc_info(GFP_KERNEL); + if (!p) + return NULL; + p->root.rb_node = NULL; +@@ -329,6 +334,7 @@ + void ext3_htree_free_dir_info(struct dir_private_info *p) + { + free_rb_tree_fname(&p->root); ++ ext3_iam_release_info((void *)p); + kfree(p); + } + +Index: linux-stage/fs/ext3/ioctl.c +=================================================================== +--- linux-stage.orig/fs/ext3/ioctl.c 2007-10-20 17:14:38.000000000 +0300 ++++ linux-stage/fs/ext3/ioctl.c 2007-10-20 17:14:39.000000000 +0300 +@@ -14,6 +14,7 @@ + #include + #include + ++#include + + int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, + unsigned long arg) +@@ -250,6 +251,6 @@ + + + default: +- return -ENOTTY; ++ return iam_uapi_ioctl(inode, filp, cmd, arg); + } + } +Index: linux-stage/fs/ext3/file.c +=================================================================== +--- linux-stage.orig/fs/ext3/file.c 2007-10-20 17:14:33.000000000 +0300 ++++ linux-stage/fs/ext3/file.c 2007-10-20 17:14:39.000000000 +0300 +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + #include "xattr.h" + #include "acl.h" + +@@ -31,14 +32,18 @@ + * from ext3_file_open: open gets called at every open, but release + * gets called only when /all/ the files are closed. 
+ */ +-static int ext3_release_file (struct inode * inode, struct file * filp) ++static int ext3_release_file(struct inode * inode, struct file * filp) + { + /* if we are the last writer on the inode, drop the block reservation */ + if ((filp->f_mode & FMODE_WRITE) && + (atomic_read(&inode->i_writecount) == 1)) + ext3_discard_reservation(inode); +- if (is_dx(inode) && filp->private_data) ++ if (is_dx(inode) && filp->private_data) { ++ if (S_ISDIR(inode->i_mode)) + ext3_htree_free_dir_info(filp->private_data); ++ else ++ ext3_iam_release(filp, inode); ++ } + + return 0; + } +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2007-10-20 17:14:39.000000000 +0300 ++++ linux-stage/fs/ext3/super.c 2007-10-20 17:14:39.000000000 +0300 +@@ -464,6 +464,10 @@ + ei->i_default_acl = EXT3_ACL_NOT_CACHED; + #endif + ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; ++ ++ dynlock_init(&ei->i_htree_lock); ++ sema_init(&ei->i_rename_sem, 1); ++ sema_init(&ei->i_append_sem, 1); + ei->vfs_inode.i_version = 1; + + memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); +Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2007-10-20 17:14:38.000000000 +0300 ++++ linux-stage/include/linux/ext3_fs.h 2007-10-20 17:14:39.000000000 +0300 +@@ -864,9 +864,7 @@ + extern void rsv_window_add(struct super_block *sb, struct reserve_window_node *rsv); + + /* dir.c */ +-extern int ext3_check_dir_entry(const char *, struct inode *, +- struct ext3_dir_entry_2 *, +- struct buffer_head *, unsigned long); ++ + extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext3_dir_entry_2 *dirent); +Index: linux-stage/include/linux/ext3_fs_i.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_i.h 2007-10-20 17:14:38.000000000 +0300 ++++ linux-stage/include/linux/ext3_fs_i.h 2007-10-20 17:14:39.000000000 +0300 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + struct reserve_window { + __u32 _rsv_start; /* First byte reserved */ +@@ -128,6 +129,12 @@ + * by other means, so we have truncate_sem. + */ + struct semaphore truncate_sem; ++ ++ /* following fields for parallel directory operations -bzzz */ ++ struct dynlock i_htree_lock; ++ struct semaphore i_append_sem; ++ struct semaphore i_rename_sem; ++ + struct inode vfs_inode; + + __u32 i_cached_extent[4]; diff --git a/ldiskfs/kernel_patches/patches/ext3-iam-sles10.patch b/ldiskfs/kernel_patches/patches/ext3-iam-sles10.patch new file mode 100644 index 0000000..3ecb4f4 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-iam-sles10.patch @@ -0,0 +1,2657 @@ +Index: linux-stage/fs/ext3/namei.c +=================================================================== +--- linux-stage.orig/fs/ext3/namei.c 2007-10-24 10:02:52.000000000 +0300 ++++ linux-stage/fs/ext3/namei.c 2007-10-24 11:04:54.000000000 +0300 +@@ -24,78 +24,7 @@ + * Theodore Ts'o, 2002 + */ + +-/* +- * iam: big theory statement. +- * +- * iam (Index Access Module) is a module providing abstraction of persistent +- * transactional container on top of generalized ext3 htree. +- * +- * iam supports: +- * +- * - key, pointer, and record size specifiable per container. +- * +- * - trees taller than 2 index levels. +- * +- * - read/write to existing ext3 htree directories as iam containers. 
+- * +- * iam container is a tree, consisting of leaf nodes containing keys and +- * records stored in this container, and index nodes, containing keys and +- * pointers to leaf or index nodes. +- * +- * iam does not work with keys directly, instead it calls user-supplied key +- * comparison function (->dpo_keycmp()). +- * +- * Pointers are (currently) interpreted as logical offsets (measured in +- * blocksful) within underlying flat file on top of which iam tree lives. +- * +- * On-disk format: +- * +- * iam mostly tries to reuse existing htree formats. +- * +- * Format of index node: +- * +- * +-----+-------+-------+-------+------+-------+------------+ +- * | | count | | | | | | +- * | gap | / | entry | entry | .... | entry | free space | +- * | | limit | | | | | | +- * +-----+-------+-------+-------+------+-------+------------+ +- * +- * gap this part of node is never accessed by iam code. It +- * exists for binary compatibility with ext3 htree (that, +- * in turn, stores fake struct ext2_dirent for ext2 +- * compatibility), and to keep some unspecified per-node +- * data. Gap can be different for root and non-root index +- * nodes. Gap size can be specified for each container +- * (gap of 0 is allowed). +- * +- * count/limit current number of entries in this node, and the maximal +- * number of entries that can fit into node. count/limit +- * has the same size as entry, and is itself counted in +- * count. +- * +- * entry index entry: consists of a key immediately followed by +- * a pointer to a child node. Size of a key and size of a +- * pointer depends on container. Entry has neither +- * alignment nor padding. +- * +- * free space portion of node new entries are added to +- * +- * Entries in index node are sorted by their key value. +- * +- * +- * +- * +- * +- * +- * +- * +- * +- * +- * +- * +- * +- */ +- ++#include + #include + #include + #include +@@ -108,6 +37,7 @@ + #include + #include + #include ++#include + + #include "namei.h" + #include "xattr.h" +@@ -122,33 +52,29 @@ + #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + +-/* +- * Maximal number of non-leaf levels in htree. In the stock ext3 this is 2. 
+- */ +-enum { +- DX_MAX_TREE_HEIGHT = 5, +- DX_SCRATCH_KEYS = 2 +-}; + +-static struct buffer_head *ext3_append(handle_t *handle, ++struct buffer_head *ext3_append(handle_t *handle, + struct inode *inode, + u32 *block, int *err) + { + struct buffer_head *bh; ++ struct ext3_inode_info *ei = EXT3_I(inode); + ++ /* with parallel dir operations all appends ++ * have to be serialized -bzzz */ ++ down(&ei->i_append_sem); + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + +- if ((bh = ext3_bread(handle, inode, *block, 1, err))) { ++ bh = ext3_bread(handle, inode, *block, 1, err); ++ if (bh != NULL) { + inode->i_size += inode->i_sb->s_blocksize; +- EXT3_I(inode)->i_disksize = inode->i_size; +- ext3_journal_get_write_access(handle,bh); ++ ei->i_disksize = inode->i_size; + } ++ up(&ei->i_append_sem); ++ + return bh; + } + +-#ifndef assert +-#define assert(test) J_ASSERT(test) +-#endif + + #ifndef swap + #define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) +@@ -160,533 +86,16 @@ + #define dxtrace(command) + #endif + +-struct fake_dirent { +- __le32 inode; +- __le16 rec_len; +- u8 name_len; +- u8 file_type; +-}; +- +-struct dx_countlimit { +- __le16 limit; +- __le16 count; +-}; +- +-/* +- * dx_root_info is laid out so that if it should somehow get overlaid by a +- * dirent the two low bits of the hash version will be zero. Therefore, the +- * hash version mod 4 should never be 0. Sincerely, the paranoia department. +- */ +- +-struct dx_root { +- struct fake_dirent dot; +- char dot_name[4]; +- struct fake_dirent dotdot; +- char dotdot_name[4]; +- struct dx_root_info +- { +- __le32 reserved_zero; +- u8 hash_version; +- u8 info_length; /* 8 */ +- u8 indirect_levels; +- u8 unused_flags; +- } +- info; +- struct {} entries[0]; +-}; +- +-struct dx_node +-{ +- struct fake_dirent fake; +- struct {} entries[0]; +-}; +- +-struct dx_map_entry +-{ +- u32 hash; +- u32 offs; +-}; +- +-/* +- * Entry within index tree node. Consists of a key immediately followed +- * (without padding) by a pointer to the child node. +- * +- * Both key and pointer are of variable size, hence incomplete type. +- */ +-struct iam_entry; +- +-struct iam_entry_compat { +- __le32 hash; +- __le32 block; +-}; +- +-/* +- * Incomplete type used to refer to keys in iam container. +- * +- * As key size can be different from container to container, iam has to use +- * incomplete type. Clients cast pointer to iam_key to real key type and back. +- */ +-struct iam_key; +- +-/* Incomplete type use to refer to the records stored in iam containers. */ +-struct iam_rec; +- +-typedef __u64 iam_ptr_t; +- +-/* +- * Index node traversed during tree lookup. +- */ +-struct iam_frame { +- struct buffer_head *bh; /* buffer holding node data */ +- struct iam_entry *entries; /* array of entries */ +- struct iam_entry *at; /* target entry, found by binary search */ +-}; +- +-/* leaf node reached by tree lookup */ +-struct iam_leaf { +- struct buffer_head *bh; +- struct iam_leaf_entry *entries; +- struct iam_leaf_entry *at; +-}; +- +-struct iam_path; +-struct iam_container; +- +-/* +- * Parameters, describing a flavor of iam container. +- */ +-struct iam_descr { +- /* +- * Size of a key in this container, in bytes. +- */ +- size_t id_key_size; +- /* +- * Size of a pointer to the next level (stored in index nodes), in +- * bytes. +- */ +- size_t id_ptr_size; +- /* +- * Size of a record (stored in leaf nodes), in bytes. +- */ +- size_t id_rec_size; +- /* +- * Size of unused (by iam) space at the beginning of every non-root +- * node, in bytes. 
Used for compatibility with ext3. +- */ +- size_t id_node_gap; +- /* +- * Size of unused (by iam) space at the beginning of root node, in +- * bytes. Used for compatibility with ext3. +- */ +- size_t id_root_gap; +- +- /* +- * Returns pointer (in the same sense as pointer in index entry) to +- * the root node. +- */ +- __u32 (*id_root_ptr)(struct iam_container *c); +- +- /* +- * Check validity and consistency of index node. This is called when +- * iam just loaded new node into frame. +- */ +- int (*id_node_check)(struct iam_path *path, struct iam_frame *frame); +- /* +- * Initialize new node (stored in @bh) that is going to be added into +- * tree. +- */ +- int (*id_node_init)(struct iam_container *c, +- struct buffer_head *bh, int root); +- int (*id_node_read)(struct iam_container *c, iam_ptr_t ptr, +- handle_t *h, struct buffer_head **bh); +- /* +- * Key comparison function. Returns -1, 0, +1. +- */ +- int (*id_keycmp)(struct iam_container *c, +- struct iam_key *k1, struct iam_key *k2); +- /* +- * Create new container. +- * +- * Newly created container has a root node and a single leaf. Leaf +- * contains single record with the smallest possible key. +- */ +- int (*id_create)(struct iam_container *c); +- struct { +- /* +- * leaf operations. +- */ +- /* +- * returns true iff leaf is positioned at the last entry. +- */ +- int (*at_end)(struct iam_container *c, struct iam_leaf *l); +- /* position leaf at the first entry */ +- void (*start)(struct iam_container *c, struct iam_leaf *l); +- /* more leaf to the next entry. */ +- void (*next)(struct iam_container *c, struct iam_leaf *l); +- /* return key of current leaf record in @k */ +- void (*key)(struct iam_container *c, struct iam_leaf *l, +- struct iam_key *k); +- /* return pointer to entry body */ +- struct iam_rec *(*rec)(struct iam_container *c, +- struct iam_leaf *l); +- } id_leaf; +-}; +- +-struct iam_container { +- /* +- * Underlying flat file. IO against this object is issued to +- * read/write nodes. +- */ +- struct inode *ic_object; +- /* +- * container flavor. +- */ +- struct iam_descr *ic_descr; +- /* +- * pointer to flavor-specific per-container data. +- */ +- void *ic_descr_data; +-}; +- +-/* +- * Structure to keep track of a path drilled through htree. +- */ +-struct iam_path { +- /* +- * Parent container. +- */ +- struct iam_container *ip_container; +- /* +- * Number of index levels minus one. +- */ +- int ip_indirect; +- /* +- * Nodes that top-to-bottom traversal passed through. +- */ +- struct iam_frame ip_frames[DX_MAX_TREE_HEIGHT]; +- /* +- * Last filled frame in ->ip_frames. Refers to the 'twig' node (one +- * immediately above leaf). +- */ +- struct iam_frame *ip_frame; +- /* +- * Leaf node: a child of ->ip_frame. +- */ +- struct iam_leaf *ip_leaf; +- /* +- * Key searched for. +- */ +- struct iam_key *ip_key_target; +- /* +- * Scratch-pad area for temporary keys. +- */ +- struct iam_key *ip_key_scratch[DX_SCRATCH_KEYS]; +- /* +- * pointer to flavor-specific per-container data. +- */ +- void *ip_descr_data; +-}; +- +-/* +- * Helper structure for legacy htrees. 
+- */ +-struct iam_path_compat { +- struct iam_path ipc_path; +- struct iam_container ipc_container; +- __u32 ipc_scrach[DX_SCRATCH_KEYS]; +-}; +- +-static u32 htree_root_ptr(struct iam_container *c); +-static int htree_node_check(struct iam_path *path, struct iam_frame *frame); +-static int htree_node_init(struct iam_container *c, +- struct buffer_head *bh, int root); +-static int htree_keycmp(struct iam_container *c, +- struct iam_key *k1, struct iam_key *k2); +-static int htree_node_read(struct iam_container *c, iam_ptr_t ptr, +- handle_t *h, struct buffer_head **bh); +- +-/* +- * Parameters describing iam compatibility mode in which existing ext3 htrees +- * can be manipulated. +- */ +-static struct iam_descr htree_compat_param = { +- .id_key_size = sizeof ((struct dx_map_entry *)NULL)->hash, +- .id_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs, +- .id_node_gap = offsetof(struct dx_node, entries), +- .id_root_gap = offsetof(struct dx_root, entries), +- +- .id_root_ptr = htree_root_ptr, +- .id_node_check = htree_node_check, +- .id_node_init = htree_node_init, +- .id_node_read = htree_node_read, +- .id_keycmp = htree_keycmp +-}; +- +- +-struct iam_key; +-struct iam_rec; +-struct iam_descr; +-struct iam_container; +-struct iam_path; +- +-/* +- * Initialize container @c, acquires additional reference on @inode. +- */ +-int iam_container_init(struct iam_container *c, +- struct iam_descr *descr, struct inode *inode); +-/* +- * Finalize container @c, release all resources. +- */ +-void iam_container_fini(struct iam_container *c); +- +-/* +- * Search container @c for record with key @k. If record is found, its data +- * are moved into @r. +- * +- * +- * +- * Return values: +ve: found, 0: not-found, -ve: error +- */ +-int iam_lookup(struct iam_container *c, struct iam_key *k, struct iam_rec *r); +-/* +- * Insert new record @r with key @k into container @c (within context of +- * transaction @h. +- * +- * Return values: 0: success, -ve: error, including -EEXIST when record with +- * given key is already present. +- * +- * postcondition: ergo(result == 0 || result == -EEXIST, +- * iam_lookup(c, k, r2) > 0 && +- * !memcmp(r, r2, c->ic_descr->id_rec_size)); +- */ +-int iam_insert(handle_t *h, struct iam_container *c, +- struct iam_key *k, struct iam_rec *r); +-/* +- * Replace existing record with key @k, or insert new one. New record data are +- * in @r. +- * +- * Return values: 0: success, -ve: error. +- * +- * postcondition: ergo(result == 0, iam_lookup(c, k, r2) > 0 && +- * !memcmp(r, r2, c->ic_descr->id_rec_size)); +- */ +-int iam_update(handle_t *h, struct iam_container *c, +- struct iam_key *k, struct iam_rec *r); +-/* +- * Delete existing record with key @k. +- * +- * Return values: 0: success, -ENOENT: not-found, -ve: other error. +- * +- * postcondition: ergo(result == 0 || result == -ENOENT, +- * !iam_lookup(c, k, *)); +- */ +-int iam_delete(handle_t *h, struct iam_container *c, struct iam_key *k); +- +-/* +- * iam cursor (iterator) api. +- */ +- +-/* +- * Flags controlling iterator functionality. +- */ +-enum iam_it_flags { +- /* +- * this iterator will move (iam_it_{prev,next}() will be called on it) +- */ +- IAM_IT_MOVE = (1 << 0), +- /* +- * tree can be updated through this iterator. +- */ +- IAM_IT_WRITE = (1 << 1) +-}; +- +-/* +- * States of iterator state machine. +- */ +-enum iam_it_state { +- /* initial state */ +- IAM_IT_DETACHED, +- /* iterator is above particular record in the container */ +- IAM_IT_ATTACHED +-}; +- +-/* +- * Iterator. 
+- * +- * Immediately after call to iam_it_init() iterator is in "detached" +- * (IAM_IT_DETACHED) state: it is associated with given parent container, but +- * doesn't point to any particular record in this container. +- * +- * After successful call to iam_it_get() and until corresponding call to +- * iam_it_put() iterator is in "attached" state (IAM_IT_ATTACHED). +- * +- * Attached iterator can move through records in a container (provided +- * IAM_IT_MOVE permission) in a key order, can get record and key values as it +- * passes over them, and can modify container (provided IAM_IT_WRITE +- * permission). +- * +- * Concurrency: iterators are supposed to be local to thread. Interfaces below +- * do no internal serialization. +- * +- */ +-struct iam_iterator { +- /* +- * iterator flags, taken from enum iam_it_flags. +- */ +- __u32 ii_flags; +- enum iam_it_state ii_state; +- /* +- * path to the record. Valid in IAM_IT_ATTACHED state. +- */ +- struct iam_path ii_path; +-}; +- +-static inline struct iam_key *keycpy(struct iam_container *c, +- struct iam_key *k1, struct iam_key *k2) +-{ +- return memcpy(k1, k2, c->ic_descr->id_key_size); +-} +- +-static inline int keycmp(struct iam_container *c, +- struct iam_key *k1, struct iam_key *k2) +-{ +- return c->ic_descr->id_keycmp(c, k1, k2); +-} +- +-static struct iam_container *iam_it_container(struct iam_iterator *it) +-{ +- return it->ii_path.ip_container; +-} +- +-static inline int it_keycmp(struct iam_iterator *it, +- struct iam_key *k1, struct iam_key *k2) +-{ +- return keycmp(iam_it_container(it), k1, k2); +-} +- +-/* +- * Initialize iterator to IAM_IT_DETACHED state. +- * +- * postcondition: it_state(it) == IAM_IT_DETACHED +- */ +-int iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags); +-/* +- * Finalize iterator and release all resources. +- * +- * precondition: it_state(it) == IAM_IT_DETACHED +- */ +-void iam_it_fini(struct iam_iterator *it); +- +-/* +- * Attach iterator. After successful completion, @it points to record with the +- * largest key not larger than @k. Semantics of ->id_create() method guarantee +- * that such record will always be found. +- * +- * Return value: 0: positioned on existing record, +- * -ve: error. +- * +- * precondition: it_state(it) == IAM_IT_DETACHED +- * postcondition: ergo(result == 0, +- * (it_state(it) == IAM_IT_ATTACHED && +- * it_keycmp(it, iam_it_key_get(it, *), k) < 0)) +- */ +-int iam_it_get(struct iam_iterator *it, struct iam_key *k); +- +-/* +- * Duplicates iterator. +- * +- * postcondition: it_state(dst) == it_state(src) && +- * iam_it_container(dst) == iam_it_container(src) && +- * dst->ii_flags = src->ii_flags && +- * ergo(it_state(it) == IAM_IT_ATTACHED, +- * iam_it_rec_get(dst) == iam_it_rec_get(src) && +- * iam_it_key_get(dst, *1) == iam_it_key_get(src, *2)) +- */ +-void iam_it_dup(struct iam_iterator *dst, struct iam_iterator *src); +- +-/* +- * Detach iterator. Does nothing it detached state. +- * +- * postcondition: it_state(it) == IAM_IT_DETACHED +- */ +-void iam_it_put(struct iam_iterator *it); +- +-/* +- * Move iterator one record right. +- * +- * Return value: 0: success, +- * +1: end of container reached +- * -ve: error +- * +- * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_MOVE +- * postcondition: ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED) +- */ +-int iam_it_next(struct iam_iterator *it); +- +-/* +- * Return pointer to the record under iterator. 
+- * +- * precondition: it_state(it) == IAM_IT_ATTACHED +- * postcondition: it_state(it) == IAM_IT_ATTACHED +- */ +-const struct iam_rec *iam_it_rec_get(struct iam_iterator *it); +- +-/* +- * Replace contents of record under iterator. +- * +- * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE +- * postcondition: it_state(it) == IAM_IT_ATTACHED && +- * ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...)) +- */ +-int iam_it_rec_set(handle_t *h, struct iam_iterator *it, struct iam_rec *r); +- +-/* +- * Place key under iterator in @k, return @k +- * +- * precondition: it_state(it) == IAM_IT_ATTACHED +- * postcondition: it_state(it) == IAM_IT_ATTACHED +- */ +-const struct iam_key *iam_it_key_get(struct iam_iterator *it, +- struct iam_key *k); +- +-/* +- * Insert new record with key @k and contents from @r, shifting records to the +- * right. +- * +- * precondition: it_state(it) == IAM_IT_ATTACHED && +- * it->ii_flags&IAM_IT_WRITE && +- * it_keycmp(it, iam_it_key_get(it, *), k) < 0 +- * postcondition: it_state(it) == IAM_IT_ATTACHED && +- * ergo(result == 0, +- * it_keycmp(it, iam_it_key_get(it, *), k) == 0 && +- * !memcmp(iam_it_rec_get(it), r, ...)) +- */ +-int iam_it_rec_insert(handle_t *h, struct iam_iterator *it, +- struct iam_key *k, struct iam_rec *r); +-/* +- * Delete record under iterator. +- * +- * precondition: it_state(it) == IAM_IT_ATTACHED && it->ii_flags&IAM_IT_WRITE +- * postcondition: it_state(it) == IAM_IT_ATTACHED +- */ +-int iam_it_rec_delete(handle_t *h, struct iam_iterator *it); +- + #ifdef CONFIG_EXT3_INDEX + static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry); + static void dx_set_block(struct iam_path *p, + struct iam_entry *entry, unsigned value); +-static inline struct iam_key *dx_get_key(struct iam_path *p, +- struct iam_entry *entry, +- struct iam_key *key); +-static void dx_set_key(struct iam_path *p, struct iam_entry *entry, +- struct iam_key *key); +-static unsigned dx_get_count(struct iam_entry *entries); + static unsigned dx_get_limit(struct iam_entry *entries); + static void dx_set_count(struct iam_entry *entries, unsigned value); + static void dx_set_limit(struct iam_entry *entries, unsigned value); + static unsigned dx_root_limit(struct iam_path *p); + static unsigned dx_node_limit(struct iam_path *p); +-static int dx_probe(struct dentry *dentry, ++static int dx_probe(struct qstr *name, + struct inode *dir, + struct dx_hash_info *hinfo, + struct iam_path *path); +@@ -696,269 +105,58 @@ + static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, + struct dx_map_entry *offsets, int count); + static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); +-static void dx_insert_block (struct iam_path *path, +- struct iam_frame *frame, u32 hash, u32 block); +-static int ext3_htree_next_block(struct inode *dir, __u32 hash, +- struct iam_path *path, __u32 *start_hash); + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, + struct ext3_dir_entry_2 **res_dir, int *err); + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); +- +-static inline void iam_path_init(struct iam_path *path, +- struct iam_container *c); +-static inline void iam_path_fini(struct iam_path *path); +- +- +-/* +- * Future: use high four bits of block for coalesce-on-delete flags +- * Mask them off for now. 
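The cursor interface whose documentation is removed from namei.c above is not dropped by this series (the iam.o, iam_lfix.o and related objects added to the Makefile later in this patch carry it); read together, the comments describe a simple attach/scan/detach life cycle. The fragment below is a minimal usage sketch derived only from the documented pre- and postconditions; iam_scan_from() and its callback are made-up names for illustration, and the container and key layout are assumed to be supplied by the caller.

/*
 * Sketch only: visit every record starting from the one iam_it_get()
 * attaches to (the record with the largest key not larger than start_key).
 */
static int iam_scan_from(struct iam_container *c, struct iam_key *start_key,
                         void (*cb)(const struct iam_rec *rec))
{
        struct iam_iterator it;
        int result;

        iam_it_init(&it, c, IAM_IT_MOVE);       /* detached state */
        result = iam_it_get(&it, start_key);    /* attach near start_key */
        while (result == 0) {
                cb(iam_it_rec_get(&it));        /* record under the cursor */
                result = iam_it_next(&it);      /* 0: more, +1: end, -ve: error */
        }
        iam_it_put(&it);                        /* back to detached */
        iam_it_fini(&it);
        return result < 0 ? result : 0;
}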
+- */ +- +-static inline void *entry_off(struct iam_entry *entry, ptrdiff_t off) +-{ +- return (void *)((char *)entry + off); +-} +- +-static inline struct iam_descr *path_descr(struct iam_path *p) +-{ +- return p->ip_container->ic_descr; +-} +- +-static inline struct inode *path_obj(struct iam_path *p) +-{ +- return p->ip_container->ic_object; +-} +- +-static inline size_t iam_entry_size(struct iam_path *p) +-{ +- return path_descr(p)->id_key_size + path_descr(p)->id_ptr_size; +-} +- +-static inline struct iam_entry *iam_entry_shift(struct iam_path *p, +- struct iam_entry *entry, int shift) +-{ +- void *e = entry; +- return e + shift * iam_entry_size(p); +-} +- +-static inline ptrdiff_t iam_entry_diff(struct iam_path *p, +- struct iam_entry *e1, struct iam_entry *e2) +-{ +- ptrdiff_t diff; +- +- diff = (void *)e1 - (void *)e2; +- assert(diff / iam_entry_size(p) * iam_entry_size(p) == diff); +- return diff / iam_entry_size(p); +-} +- +-static inline unsigned dx_get_block(struct iam_path *p, struct iam_entry *entry) +-{ +- return le32_to_cpu(*(u32 *)entry_off(entry, path_descr(p)->id_key_size)) +- & 0x00ffffff; +-} +- +-static inline void dx_set_block(struct iam_path *p, +- struct iam_entry *entry, unsigned value) +-{ +- *(u32*)entry_off(entry, +- path_descr(p)->id_key_size) = cpu_to_le32(value); +-} +- +-static inline struct iam_key *dx_get_key(struct iam_path *p, +- struct iam_entry *entry, +- struct iam_key *key) +-{ +- memcpy(key, entry, path_descr(p)->id_key_size); +- return key; +-} +- +-static inline struct iam_key *iam_key_at(struct iam_path *p, +- struct iam_entry *entry) +-{ +- return (struct iam_key *)entry; +-} +- +-static inline void dx_set_key(struct iam_path *p, +- struct iam_entry *entry, struct iam_key *key) +-{ +- memcpy(entry, key, path_descr(p)->id_key_size); +-} +- +-static inline unsigned dx_get_count (struct iam_entry *entries) +-{ +- return le16_to_cpu(((struct dx_countlimit *) entries)->count); +-} +- +-static inline unsigned dx_get_limit (struct iam_entry *entries) +-{ +- return le16_to_cpu(((struct dx_countlimit *) entries)->limit); +-} +- +-static inline void dx_set_count (struct iam_entry *entries, unsigned value) +-{ +- ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); +-} +- +-static inline void dx_set_limit (struct iam_entry *entries, unsigned value) ++static inline void dx_set_limit(struct iam_entry *entries, unsigned value) + { + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); + } + +-static inline unsigned dx_root_limit(struct iam_path *p) +-{ +- struct iam_descr *param = path_descr(p); +- unsigned entry_space = path_obj(p)->i_sb->s_blocksize - +- param->id_root_gap; +- return entry_space / (param->id_key_size + param->id_ptr_size); +-} +- +-static inline unsigned dx_node_limit(struct iam_path *p) +-{ +- struct iam_descr *param = path_descr(p); +- unsigned entry_space = path_obj(p)->i_sb->s_blocksize - +- param->id_node_gap; +- return entry_space / (param->id_key_size + param->id_ptr_size); +-} +- +-static inline int dx_index_is_compat(struct iam_path *path) ++int dx_index_is_compat(struct iam_path *path) + { +- return path_descr(path) == &htree_compat_param; ++ return iam_path_descr(path) == &iam_htree_compat_param; + } + +-static struct iam_entry *dx_get_entries(struct iam_path *path, void *data, +- int root) +-{ +- return data + +- (root ? 
+- path_descr(path)->id_root_gap : path_descr(path)->id_node_gap); +-} + +-static struct iam_entry *dx_node_get_entries(struct iam_path *path, +- struct iam_frame *frame) +-{ +- return dx_get_entries(path, +- frame->bh->b_data, frame == path->ip_frames); +-} +- +-static int dx_node_check(struct iam_path *p, struct iam_frame *f) ++int dx_node_check(struct iam_path *p, struct iam_frame *f) + { + struct iam_entry *e; + struct iam_container *c; +- unsigned count; +- unsigned i; +- +- c = p->ip_container; +- e = dx_node_get_entries(p, f); +- count = dx_get_count(e); +- e = iam_entry_shift(p, e, 1); +- for (i = 0; i < count - 1; ++i, e = iam_entry_shift(p, e, 1)) { +- keycpy(c, p->ip_key_scratch[0], p->ip_key_scratch[1]); +- dx_get_key(p, e, p->ip_key_scratch[1]); +- if (i > 0 && +- keycmp(c, p->ip_key_scratch[0], p->ip_key_scratch[1]) > 0) +- return 0; +- } +- return 1; +-} +- +-static u32 htree_root_ptr(struct iam_container *c) +-{ +- return 0; +-} +- +-struct htree_cookie { +- struct dx_hash_info *hinfo; +- struct dentry *dentry; +-}; +- +-static int htree_node_check(struct iam_path *path, struct iam_frame *frame) +-{ +- void *data; +- struct iam_entry *entries; +- struct super_block *sb; +- +- data = frame->bh->b_data; +- entries = dx_node_get_entries(path, frame); +- sb = path_obj(path)->i_sb; +- if (frame == path->ip_frames) { +- /* root node */ +- struct dx_root *root; +- struct htree_cookie *hc = path->ip_descr_data; +- +- root = data; +- if (root->info.hash_version > DX_HASH_MAX) { +- ext3_warning(sb, __FUNCTION__, +- "Unrecognised inode hash code %d", +- root->info.hash_version); +- return ERR_BAD_DX_DIR; +- } +- +- if (root->info.unused_flags & 1) { +- ext3_warning(sb, __FUNCTION__, +- "Unimplemented inode hash flags: %#06x", +- root->info.unused_flags); +- return ERR_BAD_DX_DIR; +- } +- +- path->ip_indirect = root->info.indirect_levels; +- if (path->ip_indirect > DX_MAX_TREE_HEIGHT - 1) { +- ext3_warning(sb, __FUNCTION__, +- "Unimplemented inode hash depth: %#06x", +- root->info.indirect_levels); +- return ERR_BAD_DX_DIR; +- } +- +- assert((char *)entries == (((char *)&root->info) + +- root->info.info_length)); +- assert(dx_get_limit(entries) == dx_root_limit(path)); +- +- hc->hinfo->hash_version = root->info.hash_version; +- hc->hinfo->seed = EXT3_SB(sb)->s_hash_seed; +- if (hc->dentry) +- ext3fs_dirhash(hc->dentry->d_name.name, +- hc->dentry->d_name.len, hc->hinfo); +- path->ip_key_target = (struct iam_key *)&hc->hinfo->hash; +- } else { +- /* non-root index */ +- assert(entries == data + path_descr(path)->id_node_gap); +- assert(dx_get_limit(entries) == dx_node_limit(path)); +- } +- frame->entries = frame->at = entries; +- return 0; +-} +- +-static int htree_node_init(struct iam_container *c, +- struct buffer_head *bh, int root) +-{ +- struct dx_node *node; +- +- assert(!root); +- +- node = (void *)bh->b_data; +- node->fake.rec_len = cpu_to_le16(c->ic_object->i_sb->s_blocksize); +- node->fake.inode = 0; +- return 0; +-} +- +-static int htree_node_read(struct iam_container *c, iam_ptr_t ptr, +- handle_t *handle, struct buffer_head **bh) +-{ +- int result = 0; +- +- *bh = ext3_bread(handle, c->ic_object, (int)ptr, 0, &result); +- if (*bh == NULL) +- result = -EIO; +- return result; +-} ++ unsigned count; ++ unsigned i; ++ iam_ptr_t blk; ++ iam_ptr_t root; ++ struct inode *inode; + +-static int htree_keycmp(struct iam_container *c, +- struct iam_key *k1, struct iam_key *k2) +-{ +- __u32 p1 = le32_to_cpu(*(__u32 *)k1); +- __u32 p2 = le32_to_cpu(*(__u32 *)k2); ++ c = p->ip_container; 
++ e = dx_node_get_entries(p, f); ++ count = dx_get_count(e); ++ e = iam_entry_shift(p, e, 1); ++ root = iam_path_descr(p)->id_ops->id_root_ptr(c); + +- return p1 > p2 ? +1 : (p1 < p2 ? -1 : 0); ++ inode = iam_path_obj(p); ++ for (i = 0; i < count - 1; ++i, e = iam_entry_shift(p, e, 1)) { ++ iam_ikeycpy(c, iam_path_ikey(p, 0), iam_path_ikey(p, 1)); ++ iam_get_ikey(p, e, iam_path_ikey(p, 1)); ++ if (i > 0 && ++ iam_ikeycmp(c, iam_path_ikey(p, 0), ++ iam_path_ikey(p, 1)) > 0) ++ return 0; ++ blk = dx_get_block(p, e); ++ /* ++ * Disable this check as it is racy. ++ */ ++ if (0 && inode->i_size < (blk + 1) * inode->i_sb->s_blocksize) ++ return 0; ++ /* ++ * By definition of a tree, no node points to the root. ++ */ ++ if (blk == root) ++ return 0; ++ } ++ return 1; + } + + /* +@@ -1044,177 +242,379 @@ + } + #endif /* DX_DEBUG */ + +-static int dx_lookup(struct iam_path *path) +-{ +- u32 ptr; +- int err = 0; +- int i; ++/* ++ * Per-node tree locking. ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ * ++ */ + +- struct iam_descr *param; +- struct iam_frame *frame; +- struct iam_container *c; ++/* FIXME: this should be reworked using bb_spin_lock ++ * introduced in -mm tree ++ */ ++#define BH_DXLock 25 + +- param = path_descr(path); +- c = path->ip_container; +- +- for (frame = path->ip_frames, i = 0, +- ptr = param->id_root_ptr(path->ip_container); +- i <= path->ip_indirect; +- ptr = dx_get_block(path, frame->at), ++frame, ++i) { +- struct iam_entry *entries; +- struct iam_entry *p; +- struct iam_entry *q; +- struct iam_entry *m; +- unsigned count; ++#define DX_DEBUG (1) + +- err = param->id_node_read(c, (iam_ptr_t)ptr, NULL, &frame->bh); +- if (err != 0) +- break; +- err = param->id_node_check(path, frame); +- if (err != 0) +- break; ++#if DX_DEBUG ++static struct dx_lock_stats { ++ unsigned dls_bh_lock; ++ unsigned dls_bh_busy; ++ unsigned dls_bh_again; ++ unsigned dls_bh_full_again; ++} dx_lock_stats = { 0, }; ++#define DX_DEVAL(x) x ++#else ++#define DX_DEVAL(x) ++#endif + +- assert(dx_node_check(path, frame)); ++static inline void dx_lock_bh(struct buffer_head volatile *bh) ++{ ++ DX_DEVAL(dx_lock_stats.dls_bh_lock++); ++#ifdef CONFIG_SMP ++ while (test_and_set_bit(BH_DXLock, &bh->b_state)) { ++ DX_DEVAL(dx_lock_stats.dls_bh_busy++); ++ while (test_bit(BH_DXLock, &bh->b_state)) ++ cpu_relax(); ++ } ++#endif ++} + +- entries = frame->entries; +- count = dx_get_count(entries); +- assert(count && count <= dx_get_limit(entries)); +- p = iam_entry_shift(path, entries, 1); +- q = iam_entry_shift(path, entries, count - 1); +- while (p <= q) { +- m = iam_entry_shift(path, +- p, iam_entry_diff(path, q, p) / 2); +- dxtrace(printk(".")); +- if (keycmp(c, iam_key_at(path, m), +- path->ip_key_target) > 0) +- q = iam_entry_shift(path, m, -1); +- else +- p = iam_entry_shift(path, m, +1); +- } ++static inline void dx_unlock_bh(struct buffer_head *bh) ++{ ++#ifdef CONFIG_SMP ++ smp_mb__before_clear_bit(); ++ clear_bit(BH_DXLock, &bh->b_state); ++#endif ++} + +- frame->at = iam_entry_shift(path, p, -1); +- if (1) { // linear search cross check +- unsigned n = count - 1; +- struct iam_entry *at; +- +- at = entries; +- while (n--) { +- dxtrace(printk(",")); +- at = iam_entry_shift(path, at, +1); +- if (keycmp(c, iam_key_at(path, at), +- path->ip_key_target) > 0) { +- if (at != iam_entry_shift(path, frame->at, 1)) { +- BREAKPOINT; +- printk(KERN_EMERG "%i\n", +- keycmp(c, iam_key_at(path, at), +- path->ip_key_target)); +- } +- at = iam_entry_shift(path, at, -1); +- break; +- } +- } +- assert(at == 
frame->at); ++/* ++ * this locking primitives are used to protect parts ++ * of dir's htree. protection unit is block: leaf or index ++ */ ++struct dynlock_handle *dx_lock_htree(struct inode *dir, unsigned long value, ++ enum dynlock_type lt) ++{ ++ return dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, lt, GFP_NOFS); ++} ++ ++void dx_unlock_htree(struct inode *dir, struct dynlock_handle *lh) ++{ ++ if (lh != NULL) ++ dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lh); ++} ++ ++static void dx_unlock_array(struct inode *dir, struct dynlock_handle **lh) ++{ ++ int i; ++ ++ for (i = 0; i < DX_MAX_TREE_HEIGHT; ++i, ++lh) { ++ if (*lh != NULL) { ++ dx_unlock_htree(dir, *lh); ++ *lh = NULL; + } + } +- if (err != 0) +- iam_path_fini(path); +- path->ip_frame = --frame; +- return err; + } + + /* +- * Probe for a directory leaf block to search. ++ * dx_find_position ++ * ++ * search position of specified hash in index + * +- * dx_probe can return ERR_BAD_DX_DIR, which means there was a format +- * error in the directory index, and the caller should fall back to +- * searching the directory normally. The callers of dx_probe **MUST** +- * check for this error code, and make sure it never gets reflected +- * back to userspace. + */ +-static int dx_probe(struct dentry *dentry, struct inode *dir, +- struct dx_hash_info *hinfo, struct iam_path *path) ++ ++struct iam_entry *dx_find_position(struct iam_path *path, ++ struct iam_frame *frame) + { +- int err; +- struct htree_cookie hc = { +- .dentry = dentry, +- .hinfo = hinfo +- }; ++ int count; ++ struct iam_entry *p; ++ struct iam_entry *q; ++ struct iam_entry *m; + +- assert(dx_index_is_compat(path)); +- path->ip_descr_data = &hc; +- err = dx_lookup(path); +- assert(err != 0 || path->ip_frames[path->ip_indirect].bh != NULL); +- return err; ++ count = dx_get_count(frame->entries); ++ assert_corr(count && count <= dx_get_limit(frame->entries)); ++ p = iam_entry_shift(path, frame->entries, ++ dx_index_is_compat(path) ? 1 : 2); ++ q = iam_entry_shift(path, frame->entries, count - 1); ++ while (p <= q) { ++ m = iam_entry_shift(path, p, iam_entry_diff(path, q, p) / 2); ++ if (iam_ikeycmp(path->ip_container, iam_ikey_at(path, m), ++ path->ip_ikey_target) > 0) ++ q = iam_entry_shift(path, m, -1); ++ else ++ p = iam_entry_shift(path, m, +1); ++ } ++ return iam_entry_shift(path, p, -1); ++} ++ ++static iam_ptr_t dx_find_ptr(struct iam_path *path, struct iam_frame *frame) ++{ ++ return dx_get_block(path, dx_find_position(path, frame)); + } + + /* +- * Initialize container @c, acquires additional reference on @inode. ++ * Fast check for frame consistency. 
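Two layers of locking are introduced around the index here: dx_lock_bh() is a bit spinlock (bit BH_DXLock in b_state, compiled away on !SMP) that protects the entries of one index block during short non-sleeping sections, while dx_lock_htree() takes a per-block dynlock keyed by block number in the directory's i_htree_lock and serializes structural changes such as splits. A minimal sketch of how the two nest, assuming a frame obtained from an earlier lookup; touch_index_block() is a made-up name, not a function from this patch.

/* Sketch: read-modify-write one index block under both locks. */
static int touch_index_block(struct inode *dir, struct iam_path *path,
                             struct iam_frame *frame)
{
        struct dynlock_handle *lh;

        /* serialize against a concurrent split of this block */
        lh = dx_lock_htree(dir, frame->curidx, DLT_WRITE);
        if (lh == NULL)
                return -ENOMEM;

        dx_lock_bh(frame->bh);          /* protect the entries[] array itself */
        /* ... inspect or update frame->entries here ... */
        dx_unlock_bh(frame->bh);

        dx_unlock_htree(dir, lh);
        return 0;
}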
+ */ +-int iam_container_init(struct iam_container *c, +- struct iam_descr *descr, struct inode *inode) ++static int dx_check_fast(struct iam_path *path, struct iam_frame *frame) + { +- memset(c, 0, sizeof *c); +- c->ic_descr = descr; +- c->ic_object = igrab(inode); +- if (c->ic_object != NULL) +- return 0; +- else +- return -ENOENT; ++ struct iam_container *bag; ++ struct iam_entry *next; ++ struct iam_entry *last; ++ struct iam_entry *entries; ++ struct iam_entry *at; ++ ++ bag = path->ip_container; ++ at = frame->at; ++ entries = frame->entries; ++ last = iam_entry_shift(path, entries, dx_get_count(entries) - 1); ++ ++ if (unlikely(at > last)) ++ return -EAGAIN; ++ ++ if (unlikely(dx_get_block(path, at) != frame->leaf)) ++ return -EAGAIN; ++ ++ if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, at), ++ path->ip_ikey_target) > 0)) ++ return -EAGAIN; ++ ++ next = iam_entry_shift(path, at, +1); ++ if (next <= last) { ++ if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, next), ++ path->ip_ikey_target) <= 0)) ++ return -EAGAIN; ++ } ++ return 0; + } + + /* +- * Finalize container @c, release all resources. ++ * returns 0 if path was unchanged, -EAGAIN otherwise. + */ +-void iam_container_fini(struct iam_container *c) ++static int dx_check_path(struct iam_path *path, struct iam_frame *frame) + { +- if (c->ic_object != NULL) { +- iput(c->ic_object); +- c->ic_object = NULL; +- } ++ int equal; ++ ++ dx_lock_bh(frame->bh); ++ equal = dx_check_fast(path, frame) == 0 || ++ frame->leaf == dx_find_ptr(path, frame); ++ DX_DEVAL(dx_lock_stats.dls_bh_again += !equal); ++ dx_unlock_bh(frame->bh); ++ ++ return equal ? 0 : -EAGAIN; + } + +-static inline void iam_path_init(struct iam_path *path, struct iam_container *c) ++/* ++ * returns 0 if path was unchanged, -EAGAIN otherwise. ++ */ ++static int dx_check_full_path(struct iam_path *path, int search) + { +- memset(path, 0, sizeof *path); +- path->ip_container = c; +- path->ip_frame = path->ip_frames; ++ struct iam_frame *bottom; ++ struct iam_frame *scan; ++ int i; ++ int result; ++ ++ do_corr(schedule()); ++ ++ for (bottom = path->ip_frames, i = 0; ++ i < DX_MAX_TREE_HEIGHT && bottom->bh != NULL; ++bottom, ++i) { ++ ; /* find last filled in frame */ ++ } ++ ++ /* ++ * Lock frames, bottom to top. ++ */ ++ for (scan = bottom - 1; scan >= path->ip_frames; --scan) ++ dx_lock_bh(scan->bh); ++ /* ++ * Check them top to bottom. ++ */ ++ result = 0; ++ for (scan = path->ip_frames; scan < bottom; ++scan) { ++ struct iam_entry *pos; ++ ++ if (search) { ++ if (dx_check_fast(path, scan) == 0) ++ continue; ++ ++ pos = dx_find_position(path, scan); ++ if (scan->leaf != dx_get_block(path, pos)) { ++ result = -EAGAIN; ++ break; ++ } ++ scan->at = pos; ++ } else { ++ pos = iam_entry_shift(path, scan->entries, ++ dx_get_count(scan->entries) - 1); ++ if (scan->at > pos || ++ scan->leaf != dx_get_block(path, scan->at)) { ++ result = -EAGAIN; ++ break; ++ } ++ } ++ } ++ ++ /* ++ * Unlock top to bottom. 
++ */ ++ for (scan = path->ip_frames; scan < bottom; ++scan) ++ dx_unlock_bh(scan->bh); ++ DX_DEVAL(dx_lock_stats.dls_bh_full_again += !!result); ++ do_corr(schedule()); ++ ++ return result; + } + +-static inline void iam_path_fini(struct iam_path *path) ++static int dx_lookup_try(struct iam_path *path) + { ++ u32 ptr; ++ int err = 0; + int i; + +- for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) { +- if (path->ip_frames[i].bh != NULL) { +- brelse(path->ip_frames[i].bh); +- path->ip_frames[i].bh = NULL; ++ struct iam_descr *param; ++ struct iam_frame *frame; ++ struct iam_container *c; ++ ++ param = iam_path_descr(path); ++ c = path->ip_container; ++ ++ ptr = param->id_ops->id_root_ptr(c); ++ for (frame = path->ip_frames, i = 0; i <= path->ip_indirect; ++ ++frame, ++i) { ++ err = param->id_ops->id_node_read(c, (iam_ptr_t)ptr, NULL, ++ &frame->bh); ++ do_corr(schedule()); ++ ++ dx_lock_bh(frame->bh); ++ /* ++ * node must be initialized under bh lock because concurrent ++ * creation procedure may change it and dx_lookup_try() will ++ * see obsolete tree height. -bzzz ++ */ ++ if (err != 0) ++ break; ++ ++ if (EXT3_INVARIANT_ON) { ++ err = param->id_ops->id_node_check(path, frame); ++ if (err != 0) ++ break; ++ } ++ ++ err = param->id_ops->id_node_load(path, frame); ++ if (err != 0) ++ break; ++ ++ assert_inv(dx_node_check(path, frame)); ++ /* ++ * splitting may change root index block and move hash we're ++ * looking for into another index block so, we have to check ++ * this situation and repeat from begining if path got changed ++ * -bzzz ++ */ ++ if (i > 0) { ++ err = dx_check_path(path, frame - 1); ++ if (err != 0) ++ break; + } ++ ++ frame->at = dx_find_position(path, frame); ++ frame->curidx = ptr; ++ frame->leaf = ptr = dx_get_block(path, frame->at); ++ ++ dx_unlock_bh(frame->bh); ++ do_corr(schedule()); + } ++ if (err != 0) ++ dx_unlock_bh(frame->bh); ++ path->ip_frame = --frame; ++ return err; + } + +-static void iam_path_compat_init(struct iam_path_compat *path, +- struct inode *inode) ++static int dx_lookup(struct iam_path *path) + { ++ int err; + int i; + +- iam_container_init(&path->ipc_container, &htree_compat_param, inode); +- /* +- * XXX hack allowing finalization of iam_path_compat with +- * iam_path_fini(). +- */ +- iput(inode); +- iam_path_init(&path->ipc_path, &path->ipc_container); +- for (i = 0; i < ARRAY_SIZE(path->ipc_path.ip_key_scratch); ++i) +- path->ipc_path.ip_key_scratch[i] = +- (struct iam_key *)&path->ipc_scrach[i]; ++ for (i = 0; i < DX_MAX_TREE_HEIGHT; ++ i) ++ assert(path->ip_frames[i].bh == NULL); ++ ++ do { ++ err = dx_lookup_try(path); ++ do_corr(schedule()); ++ if (err != 0) ++ iam_path_fini(path); ++ } while (err == -EAGAIN); ++ ++ return err; ++} ++ ++/* ++ * Performs path lookup and returns with found leaf (if any) locked by htree ++ * lock. 
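dx_lookup() simply retries dx_lookup_try() until the walk completes without tripping over a concurrent split, and dx_lookup_lock(), defined next, extends the same idea for callers that need the leaf pinned: look the path up, lock the leaf block, re-check the whole path, and start over if it moved underneath. A condensed sketch of that pattern follows; find_leaf_locked() is an illustrative name that mirrors dx_lookup_lock() itself, with the lock type fixed to DLT_WRITE as an example.

/* Sketch: resolve a path and return with the leaf block locked. */
static int find_leaf_locked(struct iam_path *path, struct dynlock_handle **dl)
{
        struct inode *dir = iam_path_obj(path);
        int rc;

        while ((rc = dx_lookup(path)) == 0) {
                *dl = dx_lock_htree(dir, path->ip_frame->leaf, DLT_WRITE);
                if (*dl == NULL) {
                        iam_path_fini(path);
                        return -ENOMEM;
                }
                /* the leaf may have been split while we blocked on the lock */
                if (dx_check_full_path(path, 1) == 0)
                        break;                  /* path still valid, done */
                dx_unlock_htree(dir, *dl);
                *dl = NULL;
                iam_path_fini(path);            /* drop everything and retry */
        }
        return rc;
}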
++ */ ++int dx_lookup_lock(struct iam_path *path, ++ struct dynlock_handle **dl, enum dynlock_type lt) ++{ ++ int result; ++ struct inode *dir; ++ ++ dir = iam_path_obj(path); ++ while ((result = dx_lookup(path)) == 0) { ++ do_corr(schedule()); ++ *dl = dx_lock_htree(dir, path->ip_frame->leaf, lt); ++ if (*dl == NULL) { ++ iam_path_fini(path); ++ result = -ENOMEM; ++ break; ++ } ++ do_corr(schedule()); ++ /* ++ * while locking leaf we just found may get split so we need ++ * to check this -bzzz ++ */ ++ if (dx_check_full_path(path, 1) == 0) ++ break; ++ dx_unlock_htree(dir, *dl); ++ *dl = NULL; ++ iam_path_fini(path); ++ } ++ return result; + } + +-static void iam_path_compat_fini(struct iam_path_compat *path) ++/* ++ * Probe for a directory leaf block to search. ++ * ++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format ++ * error in the directory index, and the caller should fall back to ++ * searching the directory normally. The callers of dx_probe **MUST** ++ * check for this error code, and make sure it never gets reflected ++ * back to userspace. ++ */ ++static int dx_probe(struct qstr *name, struct inode *dir, ++ struct dx_hash_info *hinfo, struct iam_path *path) + { +- iam_path_fini(&path->ipc_path); +- iam_container_fini(&path->ipc_container); ++ int err; ++ struct iam_path_compat *ipc; ++ ++ assert_corr(path->ip_data != NULL); ++ ipc = container_of(path->ip_data, struct iam_path_compat, ipc_descr); ++ ipc->ipc_qstr = name; ++ ipc->ipc_hinfo = hinfo; ++ ++ assert_corr(dx_index_is_compat(path)); ++ err = dx_lookup(path); ++ assert_corr(err != 0 || path->ip_frames[path->ip_indirect].bh != NULL); ++ return err; + } + ++ + /* + * This function increments the frame pointer to search the next leaf + * block, and reads in the necessary intervening nodes if the search +@@ -1232,16 +632,15 @@ + * If start_hash is non-null, it will be filled in with the starting + * hash of the next page. + */ +-static int ext3_htree_next_block(struct inode *dir, __u32 hash, +- struct iam_path *path, __u32 *start_hash) ++static int ext3_htree_advance(struct inode *dir, __u32 hash, ++ struct iam_path *path, __u32 *start_hash, ++ int compat) + { + struct iam_frame *p; + struct buffer_head *bh; + int err, num_frames = 0; + __u32 bhash; + +- assert(dx_index_is_compat(path)); +- + p = path->ip_frame; + /* + * Find the next leaf page by incrementing the frame pointer. +@@ -1251,16 +650,26 @@ + * nodes need to be read. + */ + while (1) { ++ do_corr(schedule()); ++ dx_lock_bh(p->bh); + p->at = iam_entry_shift(path, p->at, +1); + if (p->at < iam_entry_shift(path, p->entries, +- dx_get_count(p->entries))) ++ dx_get_count(p->entries))) { ++ p->leaf = dx_get_block(path, p->at); ++ dx_unlock_bh(p->bh); + break; ++ } ++ dx_unlock_bh(p->bh); + if (p == path->ip_frames) + return 0; + num_frames++; + --p; + } + ++ if (compat) { ++ /* ++ * Htree hash magic. ++ */ + /* + * If the hash is 1, then continue only if the next page has a + * continuation hash of any value. This is used for readdir +@@ -1268,33 +677,146 @@ + * desired contiuation hash. If it doesn't, return since + * there's no point to read in the successive index pages. 
+ */ +- dx_get_key(path, p->at, (struct iam_key *)&bhash); ++ iam_get_ikey(path, p->at, (struct iam_ikey *)&bhash); + if (start_hash) + *start_hash = bhash; + if ((hash & 1) == 0) { + if ((bhash & ~1) != hash) + return 0; + } ++ } + /* + * If the hash is HASH_NB_ALWAYS, we always go to the next + * block so no check is necessary + */ + while (num_frames--) { +- err = path_descr(path)->id_node_read(path->ip_container, +- (iam_ptr_t)dx_get_block(path, p->at), +- NULL, &bh); ++ iam_ptr_t idx; ++ ++ do_corr(schedule()); ++ dx_lock_bh(p->bh); ++ idx = p->leaf = dx_get_block(path, p->at); ++ dx_unlock_bh(p->bh); ++ err = iam_path_descr(path)->id_ops-> ++ id_node_read(path->ip_container, idx, NULL, &bh); + if (err != 0) + return err; /* Failure */ + ++p; +- brelse (p->bh); ++ brelse(p->bh); ++ assert_corr(p->bh != bh); + p->bh = bh; +- p->at = p->entries = dx_node_get_entries(path, p); +- assert(dx_node_check(path, p)); ++ p->entries = dx_node_get_entries(path, p); ++ p->at = iam_entry_shift(path, p->entries, !compat); ++ assert_corr(p->curidx != idx); ++ p->curidx = idx; ++ dx_lock_bh(p->bh); ++ assert_corr(p->leaf != dx_get_block(path, p->at)); ++ p->leaf = dx_get_block(path, p->at); ++ dx_unlock_bh(p->bh); ++ assert_inv(dx_node_check(path, p)); + } + return 1; + } + +- ++int iam_index_lock(struct iam_path *path, struct dynlock_handle **lh) ++{ ++ struct iam_frame *f; ++ ++ for (f = path->ip_frame; f >= path->ip_frames; --f, ++lh) { ++ do_corr(schedule()); ++ *lh = dx_lock_htree(iam_path_obj(path), f->curidx, DLT_READ); ++ if (*lh == NULL) ++ return -ENOMEM; ++ } ++ return 0; ++} ++ ++static int iam_index_advance(struct iam_path *path) ++{ ++ return ext3_htree_advance(iam_path_obj(path), 0, path, NULL, 0); ++} ++ ++/* ++ * Advance index part of @path to point to the next leaf. Returns 1 on ++ * success, 0, when end of container was reached. Leaf node is locked. ++ */ ++int iam_index_next(struct iam_container *c, struct iam_path *path) ++{ ++ iam_ptr_t cursor; ++ struct dynlock_handle *lh[DX_MAX_TREE_HEIGHT] = { 0, }; ++ int result; ++ struct inode *object; ++ ++ /* ++ * Locking for iam_index_next()... is to be described. 
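The hash check kept above relies on the htree readdir convention that the low bit of an index key acts as a continuation marker: delimiting hashes are normally even, and an odd delimiter indicates that the same major hash spills over into the next leaf (move_entries() further down produces exactly that by returning hash2 + continued). A small stand-alone illustration of the predicate; may_continue() and the hash values are made up for the example.

#include <stdio.h>

/* Illustration only: may entries for 'wanted' still appear further right? */
static int may_continue(unsigned int wanted, unsigned int next_delim)
{
        /* mirrors: if ((hash & 1) == 0 && (bhash & ~1) != hash) stop */
        if ((wanted & 1) == 0 && (next_delim & ~1u) != wanted)
                return 0;
        return 1;
}

int main(void)
{
        printf("%d\n", may_continue(0x5000, 0x5001)); /* 1: collision continues */
        printf("%d\n", may_continue(0x5000, 0x6000)); /* 0: safe to stop */
        printf("%d\n", may_continue(0x0001, 0x6000)); /* 1: "always" hash, keep going */
        return 0;
}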
++ */ ++ ++ object = c->ic_object; ++ cursor = path->ip_frame->leaf; ++ ++ while (1) { ++ result = iam_index_lock(path, lh); ++ do_corr(schedule()); ++ if (result < 0) ++ break; ++ ++ result = dx_check_full_path(path, 0); ++ if (result == 0 && cursor == path->ip_frame->leaf) { ++ result = iam_index_advance(path); ++ ++ assert_corr(result == 0 || ++ cursor != path->ip_frame->leaf); ++ break; ++ } ++ do { ++ dx_unlock_array(object, lh); ++ ++ iam_path_release(path); ++ do_corr(schedule()); ++ ++ result = dx_lookup(path); ++ if (result < 0) ++ break; ++ ++ while (path->ip_frame->leaf != cursor) { ++ do_corr(schedule()); ++ ++ result = iam_index_lock(path, lh); ++ do_corr(schedule()); ++ if (result < 0) ++ break; ++ ++ result = dx_check_full_path(path, 0); ++ if (result != 0) ++ break; ++ ++ result = iam_index_advance(path); ++ if (result == 0) { ++ ext3_error(object->i_sb, __FUNCTION__, ++ "cannot find cursor: %u\n", ++ cursor); ++ result = -EIO; ++ } ++ if (result < 0) ++ break; ++ result = dx_check_full_path(path, 0); ++ if (result != 0) ++ break; ++ dx_unlock_array(object, lh); ++ } ++ } while (result == -EAGAIN); ++ if (result < 0) ++ break; ++ } ++ dx_unlock_array(object, lh); ++ return result; ++} ++ ++int ext3_htree_next_block(struct inode *dir, __u32 hash, ++ struct iam_path *path, __u32 *start_hash) ++{ ++ return ext3_htree_advance(dir, hash, path, start_hash, 1); ++} ++ + /* + * p is at least 6 bytes before the end of page + */ +@@ -1499,21 +1021,45 @@ + } while(more); + } + +-static void dx_insert_block(struct iam_path *path, +- struct iam_frame *frame, u32 hash, u32 block) ++void iam_insert_key(struct iam_path *path, struct iam_frame *frame, ++ const struct iam_ikey *key, iam_ptr_t ptr) + { + struct iam_entry *entries = frame->entries; +- struct iam_entry *old = frame->at, *new = iam_entry_shift(path, old, +1); ++ struct iam_entry *new = iam_entry_shift(path, frame->at, +1); + int count = dx_get_count(entries); + +- assert(count < dx_get_limit(entries)); +- assert(old < iam_entry_shift(path, entries, count)); ++ /* ++ * Unfortunately we cannot assert this, as this function is sometimes ++ * called by VFS under i_sem and without pdirops lock. ++ */ ++ assert_corr(1 || iam_frame_is_locked(path, frame)); ++ assert_corr(count < dx_get_limit(entries)); ++ assert_corr(frame->at < iam_entry_shift(path, entries, count)); ++ assert_inv(dx_node_check(path, frame)); ++ + memmove(iam_entry_shift(path, new, 1), new, + (char *)iam_entry_shift(path, entries, count) - (char *)new); +- dx_set_key(path, new, (struct iam_key *)&hash); +- dx_set_block(path, new, block); ++ dx_set_ikey(path, new, key); ++ dx_set_block(path, new, ptr); + dx_set_count(entries, count + 1); ++ assert_inv(dx_node_check(path, frame)); ++} ++ ++void iam_insert_key_lock(struct iam_path *path, struct iam_frame *frame, ++ const struct iam_ikey *key, iam_ptr_t ptr) ++{ ++ dx_lock_bh(frame->bh); ++ iam_insert_key(path, frame, key, ptr); ++ dx_unlock_bh(frame->bh); ++} ++ ++void dx_insert_block(struct iam_path *path, struct iam_frame *frame, ++ u32 hash, u32 block) ++{ ++ assert_corr(dx_index_is_compat(path)); ++ iam_insert_key(path, frame, (struct iam_ikey *)&hash, block); + } ++ + #endif + + +@@ -1730,7 +1276,7 @@ + sb = dir->i_sb; + /* NFS may look up ".." - look at dx_root directory block */ + if (namelen > 2 || name[0] != '.'||(name[1] != '.' 
&& name[1] != '\0')){ +- *err = dx_probe(dentry, NULL, &hinfo, path); ++ *err = dx_probe(&dentry->d_name, NULL, &hinfo, path); + if (*err != 0) + return NULL; + } else { +@@ -1740,7 +1286,8 @@ + hash = hinfo.hash; + do { + block = dx_get_block(path, path->ip_frame->at); +- *err = path_descr(path)->id_node_read(path->ip_container, (iam_ptr_t)block, ++ *err = iam_path_descr(path)->id_ops->id_node_read(path->ip_container, ++ (iam_ptr_t)block, + NULL, &bh); + if (*err != 0) + goto errout; +@@ -1908,22 +1455,69 @@ + return prev; + } + ++struct ext3_dir_entry_2 *move_entries(struct inode *dir, ++ struct dx_hash_info *hinfo, ++ struct buffer_head **bh1, ++ struct buffer_head **bh2, ++ __u32 *delim_hash) ++{ ++ char *data1; ++ char *data2; ++ unsigned blocksize = dir->i_sb->s_blocksize; ++ unsigned count; ++ unsigned continued; ++ unsigned split; ++ u32 hash2; ++ ++ struct dx_map_entry *map; ++ struct ext3_dir_entry_2 *de1; ++ struct ext3_dir_entry_2 *de2; ++ ++ data1 = (*bh1)->b_data; ++ data2 = (*bh2)->b_data; ++ ++ /* create map in the end of data2 block */ ++ map = (struct dx_map_entry *) (data2 + blocksize); ++ count = dx_make_map((struct ext3_dir_entry_2 *) data1, ++ blocksize, hinfo, map); ++ map -= count; ++ split = count/2; // need to adjust to actual middle ++ dx_sort_map(map, count); ++ hash2 = map[split].hash; ++ continued = hash2 == map[split - 1].hash; ++ dxtrace(printk("Split block %i at %x, %i/%i\n", ++ frame->leaf, hash2, split, count - split)); ++ ++ /* Fancy dance to stay within two buffers */ ++ de2 = dx_move_dirents(data1, data2, map + split, count - split); ++ de1 = dx_pack_dirents(data1, blocksize); ++ de1->rec_len = cpu_to_le16(data1 + blocksize - (char *) de1); ++ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); ++ dxtrace(dx_show_leaf(hinfo, ++ (struct ext3_dir_entry_2 *) data1, blocksize, 1)); ++ dxtrace(dx_show_leaf(hinfo, ++ (struct ext3_dir_entry_2 *) data2, blocksize, 1)); ++ ++ /* Which block gets the new entry? 
*/ ++ if (hinfo->hash >= hash2) { ++ swap(*bh1, *bh2); ++ de1 = de2; ++ } ++ *delim_hash = hash2 + continued; ++ return de1; ++} ++ + /* Allocate new node, and split leaf node @bh into it, inserting new pointer + * into parent node identified by @frame */ + static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct iam_path *path, + struct buffer_head **bh,struct iam_frame *frame, + struct dx_hash_info *hinfo, int *error) + { +- struct inode *dir = path_obj(path); +- unsigned blocksize = dir->i_sb->s_blocksize; +- unsigned count, continued; ++ struct inode *dir = iam_path_obj(path); + struct buffer_head *bh2; + u32 newblock; + u32 hash2; +- struct dx_map_entry *map; +- char *data1 = (*bh)->b_data, *data2; +- unsigned split; +- struct ext3_dir_entry_2 *de = NULL, *de2; ++ struct ext3_dir_entry_2 *de = NULL; + int err; + + bh2 = ext3_append (handle, dir, &newblock, error); +@@ -1948,35 +1542,9 @@ + if (err) + goto journal_error; + +- data2 = bh2->b_data; +- +- /* create map in the end of data2 block */ +- map = (struct dx_map_entry *) (data2 + blocksize); +- count = dx_make_map ((struct ext3_dir_entry_2 *) data1, +- blocksize, hinfo, map); +- map -= count; +- split = count/2; // need to adjust to actual middle +- dx_sort_map (map, count); +- hash2 = map[split].hash; +- continued = hash2 == map[split - 1].hash; +- dxtrace(printk("Split block %i at %x, %i/%i\n", +- dx_get_block(frame->at), hash2, split, count-split)); +- +- /* Fancy dance to stay within two buffers */ +- de2 = dx_move_dirents(data1, data2, map + split, count - split); +- de = dx_pack_dirents(data1,blocksize); +- de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); +- de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); +- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); +- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); ++ de = move_entries(dir, hinfo, bh, &bh2, &hash2); + +- /* Which block gets the new entry? */ +- if (hinfo->hash >= hash2) +- { +- swap(*bh, bh2); +- de = de2; +- } +- dx_insert_block(path, frame, hash2 + continued, newblock); ++ dx_insert_block(path, frame, hash2, newblock); + err = ext3_journal_dirty_metadata (handle, bh2); + if (err) + goto journal_error; +@@ -1990,6 +1558,63 @@ + } + #endif + ++struct ext3_dir_entry_2 *find_insertion_point(struct inode *dir, ++ struct buffer_head *bh, ++ const char *name, int namelen) ++{ ++ struct ext3_dir_entry_2 *de; ++ char *top; ++ unsigned long offset; ++ int nlen; ++ int rlen; ++ int reclen; ++ ++ reclen = EXT3_DIR_REC_LEN(namelen); ++ de = (struct ext3_dir_entry_2 *)bh->b_data; ++ top = bh->b_data + dir->i_sb->s_blocksize - reclen; ++ offset = 0; ++ while ((char *) de <= top) { ++ if (!ext3_check_dir_entry("ext3_add_entry", ++ dir, de, bh, offset)) ++ return ERR_PTR(-EIO); ++ if (ext3_match(namelen, name, de)) ++ return ERR_PTR(-EEXIST); ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ if ((de->inode? 
rlen - nlen: rlen) >= reclen) ++ return de; ++ de = (struct ext3_dir_entry_2 *)((char *)de + rlen); ++ offset += rlen; ++ } ++ return ERR_PTR(-ENOSPC); ++} ++ ++struct ext3_dir_entry_2 *split_entry(struct inode *dir, ++ struct ext3_dir_entry_2 *de, ++ unsigned long ino, mode_t mode, ++ const char *name, int namelen) ++{ ++ int nlen; ++ int rlen; ++ ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ if (de->inode) { ++ struct ext3_dir_entry_2 *de1; ++ ++ de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); ++ de1->rec_len = cpu_to_le16(rlen - nlen); ++ de->rec_len = cpu_to_le16(nlen); ++ de = de1; ++ } ++ de->file_type = EXT3_FT_UNKNOWN; ++ de->inode = cpu_to_le32(ino); ++ if (ino != 0) ++ ext3_set_de_type(dir->i_sb, de, mode); ++ de->name_len = namelen; ++ memcpy(de->name, name, namelen); ++ return de; ++} + + /* + * Add a new entry into a directory (leaf) block. If de is non-NULL, +@@ -2009,34 +1634,16 @@ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; +- unsigned long offset = 0; +- unsigned short reclen; +- int nlen, rlen, err; +- char *top; ++ int err; + +- reclen = EXT3_DIR_REC_LEN(namelen); + if (!de) { +- de = (struct ext3_dir_entry_2 *)bh->b_data; +- top = bh->b_data + dir->i_sb->s_blocksize - reclen; +- while ((char *) de <= top) { +- if (!ext3_check_dir_entry("ext3_add_entry", dir, de, +- bh, offset)) { +- brelse (bh); +- return -EIO; +- } +- if (ext3_match (namelen, name, de)) { +- brelse (bh); +- return -EEXIST; +- } +- nlen = EXT3_DIR_REC_LEN(de->name_len); +- rlen = le16_to_cpu(de->rec_len); +- if ((de->inode? rlen - nlen: rlen) >= reclen) +- break; +- de = (struct ext3_dir_entry_2 *)((char *)de + rlen); +- offset += rlen; ++ de = find_insertion_point(dir, bh, name, namelen); ++ if (IS_ERR(de)) { ++ err = PTR_ERR(de); ++ if (err != -ENOSPC) ++ brelse(bh); ++ return err; + } +- if ((char *) de > top) +- return -ENOSPC; + } + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); +@@ -2047,22 +1654,9 @@ + } + + /* By now the buffer is marked for journaling */ +- nlen = EXT3_DIR_REC_LEN(de->name_len); +- rlen = le16_to_cpu(de->rec_len); +- if (de->inode) { +- struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); +- de1->rec_len = cpu_to_le16(rlen - nlen); +- de->rec_len = cpu_to_le16(nlen); +- de = de1; +- } +- de->file_type = EXT3_FT_UNKNOWN; +- if (inode) { +- de->inode = cpu_to_le32(inode->i_ino); +- ext3_set_de_type(dir->i_sb, de, inode->i_mode); +- } else +- de->inode = 0; +- de->name_len = namelen; +- memcpy (de->name, name, namelen); ++ ++ split_entry(dir, de, inode ? inode->i_ino : 0, ++ inode ? inode->i_mode : 0, name, namelen); + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend +@@ -2238,60 +1832,85 @@ + return add_dirent_to_buf(handle, dentry, inode, de, bh); + } + ++static int shift_entries(struct iam_path *path, ++ struct iam_frame *frame, unsigned count, ++ struct iam_entry *entries, struct iam_entry *entries2, ++ u32 newblock) ++{ ++ unsigned count1; ++ unsigned count2; ++ int delta; ++ ++ struct iam_frame *parent = frame - 1; ++ struct iam_ikey *pivot = iam_path_ikey(path, 3); ++ ++ delta = dx_index_is_compat(path) ? 
0 : +1; ++ ++ count1 = count/2 + delta; ++ count2 = count - count1; ++ iam_get_ikey(path, iam_entry_shift(path, entries, count1), pivot); ++ ++ dxtrace(printk("Split index %i/%i\n", count1, count2)); ++ ++ memcpy((char *) iam_entry_shift(path, entries2, delta), ++ (char *) iam_entry_shift(path, entries, count1), ++ count2 * iam_entry_size(path)); ++ ++ dx_set_count(entries2, count2 + delta); ++ dx_set_limit(entries2, dx_node_limit(path)); ++ ++ /* ++ * NOTE: very subtle piece of code competing dx_probe() may find 2nd ++ * level index in root index, then we insert new index here and set ++ * new count in that 2nd level index. so, dx_probe() may see 2nd level ++ * index w/o hash it looks for. the solution is to check root index ++ * after we locked just founded 2nd level index -bzzz ++ */ ++ iam_insert_key_lock(path, parent, pivot, newblock); ++ ++ /* ++ * now old and new 2nd level index blocks contain all pointers, so ++ * dx_probe() may find it in the both. it's OK -bzzz ++ */ ++ dx_lock_bh(frame->bh); ++ dx_set_count(entries, count1); ++ dx_unlock_bh(frame->bh); ++ ++ /* ++ * now old 2nd level index block points to first half of leafs. it's ++ * importand that dx_probe() must check root index block for changes ++ * under dx_lock_bh(frame->bh) -bzzz ++ */ ++ ++ return count1; ++} ++ + #ifdef CONFIG_EXT3_INDEX +-/* +- * Returns 0 for success, or a negative error value +- */ +-static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) ++int split_index_node(handle_t *handle, struct iam_path *path, ++ struct dynlock_handle **lh) + { +- struct iam_path_compat cpath; +- struct iam_path *path = &cpath.ipc_path; +- struct iam_descr *param; +- struct iam_frame *frame, *safe; ++ + struct iam_entry *entries; /* old block contents */ + struct iam_entry *entries2; /* new block contents */ +- struct dx_hash_info hinfo; +- struct buffer_head * bh; ++ struct iam_frame *frame, *safe; + struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0}; +- struct inode *dir = dentry->d_parent->d_inode; +- struct super_block * sb = dir->i_sb; +- struct ext3_dir_entry_2 *de; + u32 newblock[DX_MAX_TREE_HEIGHT] = {0}; +- int err; ++ struct dynlock_handle *lock[DX_MAX_TREE_HEIGHT] = {NULL,}; ++ struct dynlock_handle *new_lock[DX_MAX_TREE_HEIGHT] = {NULL,}; ++ struct inode *dir = iam_path_obj(path); ++ struct iam_descr *descr; + int nr_splet; +- int i; +- size_t isize; ++ int i, err; + +- iam_path_compat_init(&cpath, dir); +- param = path_descr(path); ++ descr = iam_path_descr(path); ++ /* ++ * Algorithm below depends on this. ++ */ ++ assert_corr(dx_root_limit(path) < dx_node_limit(path)); + +- err = dx_probe(dentry, NULL, &hinfo, path); +- if (err != 0) +- return err; + frame = path->ip_frame; + entries = frame->entries; + +- /* XXX nikita: global serialization! */ +- isize = dir->i_size; +- +- err = param->id_node_read(path->ip_container, +- (iam_ptr_t)dx_get_block(path, +- frame->at), handle, &bh); +- if (err != 0) +- goto cleanup; +- +- BUFFER_TRACE(bh, "get_write_access"); +- err = ext3_journal_get_write_access(handle, bh); +- if (err) +- goto journal_error; +- +- err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); +- if (err != -ENOSPC) { +- bh = NULL; +- goto cleanup; +- } +- + /* + * Tall-tree handling: we might have to split multiple index blocks + * all the way up to tree root. Tricky point here is error handling: +@@ -2300,12 +1919,14 @@ + * - first allocate all necessary blocks + * + * - insert pointers into them atomically. 
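shift_entries() above implements the non-root half of an index split: in the non-compat (iam) format one leading slot is reserved in the fresh node (delta = +1), the key of entry count1 becomes the pivot published in the parent via iam_insert_key_lock(), and only afterwards is the old node's count reduced under its buffer lock, so a racing dx_probe() always finds every pointer in at least one of the two nodes. The arithmetic itself is small; a stand-alone illustration with an arbitrary entry count:

#include <stdio.h>

int main(void)
{
        unsigned count = 101;   /* entries in the full index node (example) */
        int compat = 0;         /* 0: iam format, 1: legacy ext3 htree */
        int delta = compat ? 0 : +1;
        unsigned count1 = count / 2 + delta;    /* remain in the old node */
        unsigned count2 = count - count1;       /* copied into the new node */

        printf("old node keeps %u entries, new node holds %u plus %d reserved\n",
               count1, count2, delta);
        /* the pivot key inserted into the parent is the key of old entry[count1] */
        return 0;
}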
+- * +- * XXX nikita: this algorithm is *not* scalable, as it assumes that at +- * least nodes in the path are locked. + */ + +- /* Block full, should compress but for now just split */ ++ /* ++ * Locking: leaf is already locked. htree-locks are acquired on all ++ * index nodes that require split bottom-to-top, on the "safe" node, ++ * and on all new nodes ++ */ ++ + dxtrace(printk("using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); + +@@ -2313,8 +1934,9 @@ + for (nr_splet = 0; frame >= path->ip_frames && + dx_get_count(frame->entries) == dx_get_limit(frame->entries); + --frame, ++nr_splet) { ++ do_corr(schedule()); + if (nr_splet == DX_MAX_TREE_HEIGHT) { +- ext3_warning(sb, __FUNCTION__, ++ ext3_warning(dir->i_sb, __FUNCTION__, + "Directory index full!\n"); + err = -ENOSPC; + goto cleanup; +@@ -2322,13 +1944,53 @@ + } + + safe = frame; +- /* Go back down, allocating blocks, and adding blocks into ++ ++ /* ++ * Lock all nodes, bottom to top. ++ */ ++ for (frame = path->ip_frame, i = nr_splet; i >= 0; --i, --frame) { ++ do_corr(schedule()); ++ lock[i] = dx_lock_htree(dir, frame->curidx, DLT_WRITE); ++ if (lock[i] == NULL) { ++ err = -ENOMEM; ++ goto cleanup; ++ } ++ } ++ ++ /* ++ * Check for concurrent index modification. ++ */ ++ err = dx_check_full_path(path, 1); ++ if (err) ++ goto cleanup; ++ /* ++ * And check that the same number of nodes is to be split. ++ */ ++ for (i = 0, frame = path->ip_frame; frame >= path->ip_frames && ++ dx_get_count(frame->entries) == dx_get_limit(frame->entries); ++ --frame, ++i) { ++ ; ++ } ++ if (i != nr_splet) { ++ err = -EAGAIN; ++ goto cleanup; ++ } ++ ++ /* Go back down, allocating blocks, locking them, and adding into + * transaction... */ + for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) { + bh_new[i] = ext3_append (handle, dir, &newblock[i], &err); ++ do_corr(schedule()); + if (!bh_new[i] || +- param->id_node_init(path->ip_container, bh_new[i], 0) != 0) ++ descr->id_ops->id_node_init(path->ip_container, ++ bh_new[i], 0) != 0) ++ goto cleanup; ++ new_lock[i] = dx_lock_htree(dir, newblock[i], DLT_WRITE); ++ if (new_lock[i] == NULL) { ++ err = -ENOMEM; + goto cleanup; ++ } ++ do_corr(schedule()); + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, frame->bh); + if (err) +@@ -2336,6 +1998,7 @@ + } + /* Add "safe" node to transaction too */ + if (safe + 1 != path->ip_frames) { ++ do_corr(schedule()); + err = ext3_journal_get_write_access(handle, safe->bh); + if (err) + goto journal_error; +@@ -2346,6 +2009,7 @@ + unsigned count; + int idx; + struct buffer_head *bh2; ++ struct buffer_head *bh; + + entries = frame->entries; + count = dx_get_count(entries); +@@ -2354,6 +2018,7 @@ + bh2 = bh_new[i]; + entries2 = dx_get_entries(path, bh2->b_data, 0); + ++ bh = frame->bh; + if (frame == path->ip_frames) { + /* splitting root node. Tricky point: + * +@@ -2365,23 +2030,26 @@ + * capacity of the root node is smaller than that of + * non-root one. 
+ */ +- struct dx_root *root; +- u8 indirects; + struct iam_frame *frames; ++ struct iam_entry *next; ++ ++ assert_corr(i == 0); ++ ++ do_corr(schedule()); + + frames = path->ip_frames; +- root = (struct dx_root *) frames->bh->b_data; +- indirects = root->info.indirect_levels; +- dxtrace(printk("Creating new root %d\n", indirects)); + memcpy((char *) entries2, (char *) entries, + count * iam_entry_size(path)); + dx_set_limit(entries2, dx_node_limit(path)); + + /* Set up root */ +- dx_set_count(entries, 1); +- dx_set_block(path, entries, newblock[i]); +- root->info.indirect_levels = indirects + 1; ++ dx_lock_bh(frame->bh); ++ next = descr->id_ops->id_root_inc(path->ip_container, ++ path, frame); ++ dx_set_block(path, next, newblock[0]); ++ dx_unlock_bh(frame->bh); + ++ do_corr(schedule()); + /* Shift frames in the path */ + memmove(frames + 2, frames + 1, + (sizeof path->ip_frames) - 2 * sizeof frames[0]); +@@ -2389,54 +2057,146 @@ + frames[1].at = iam_entry_shift(path, entries2, idx); + frames[1].entries = entries = entries2; + frames[1].bh = bh2; +- assert(dx_node_check(path, frame)); ++ assert_inv(dx_node_check(path, frame)); ++ ++ path->ip_frame; + ++ frame; +- assert(dx_node_check(path, frame)); +- bh_new[i] = NULL; /* buffer head is "consumed" */ ++ assert_inv(dx_node_check(path, frame)); ++ bh_new[0] = NULL; /* buffer head is "consumed" */ + err = ext3_journal_get_write_access(handle, bh2); + if (err) + goto journal_error; ++ do_corr(schedule()); + } else { + /* splitting non-root index node. */ +- unsigned count1 = count/2, count2 = count - count1; +- unsigned hash2; +- +- dx_get_key(path, +- iam_entry_shift(path, entries, count1), +- (struct iam_key *)&hash2); +- +- dxtrace(printk("Split index %i/%i\n", count1, count2)); +- +- memcpy ((char *) entries2, +- (char *) iam_entry_shift(path, entries, count1), +- count2 * iam_entry_size(path)); +- dx_set_count (entries, count1); +- dx_set_count (entries2, count2); +- dx_set_limit (entries2, dx_node_limit(path)); ++ struct iam_frame *parent = frame - 1; + ++ do_corr(schedule()); ++ count = shift_entries(path, frame, count, ++ entries, entries2, newblock[i]); + /* Which index block gets the new entry? */ +- if (idx >= count1) { ++ if (idx >= count) { ++ int d = dx_index_is_compat(path) ? 0 : +1; ++ + frame->at = iam_entry_shift(path, entries2, +- idx - count1); ++ idx - count + d); + frame->entries = entries = entries2; ++ frame->curidx = newblock[i]; + swap(frame->bh, bh2); ++ assert_corr(lock[i + 1] != NULL); ++ assert_corr(new_lock[i] != NULL); ++ swap(lock[i + 1], new_lock[i]); + bh_new[i] = bh2; ++ parent->at = iam_entry_shift(path, ++ parent->at, +1); + } +- dx_insert_block(path, frame - 1, hash2, newblock[i]); +- assert(dx_node_check(path, frame)); +- assert(dx_node_check(path, frame - 1)); ++ assert_inv(dx_node_check(path, frame)); ++ assert_inv(dx_node_check(path, parent)); + dxtrace(dx_show_index ("node", frame->entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) + goto journal_error; ++ do_corr(schedule()); ++ err = ext3_journal_dirty_metadata(handle, parent->bh); ++ if (err) ++ goto journal_error; + } ++ do_corr(schedule()); ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ goto journal_error; ++ } ++ /* ++ * This function was called to make insertion of new leaf ++ * possible. Check that it fulfilled its obligations. 
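With the heavy lifting factored out, split_index_node() has a clear contract: on success the frame that will receive the new leaf pointer is left write-locked and its dynlock handle is handed back through *lh for the caller to drop. The rewritten ext3_dx_add_entry() further down uses it exactly that way; the trimmed sketch below shows the calling sequence with buffer management and most error handling omitted, and add_after_split() is an illustrative wrapper, not a function from the patch.

/* Sketch: add a directory entry after the target leaf turned out to be full. */
static int add_after_split(handle_t *handle, struct dentry *dentry,
                           struct inode *inode, struct iam_path *path,
                           struct buffer_head *bh, struct dx_hash_info *hinfo)
{
        struct dynlock_handle *lh = NULL;
        struct ext3_dir_entry_2 *de;
        int err;

        err = split_index_node(handle, path, &lh);      /* index has room now */
        if (err == 0) {
                de = do_split(handle, path, &bh, path->ip_frame, hinfo, &err);
                if (de != NULL)
                        err = add_dirent_to_buf(handle, dentry, inode, de, bh);
        }
        dx_unlock_htree(iam_path_obj(path), lh);        /* NULL-safe */
        return err;
}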
++ */ ++ assert_corr(dx_get_count(path->ip_frame->entries) < ++ dx_get_limit(path->ip_frame->entries)); ++ assert_corr(lock[nr_splet] != NULL); ++ *lh = lock[nr_splet]; ++ lock[nr_splet] = NULL; ++ if (nr_splet > 0) { ++ /* ++ * Log ->i_size modification. ++ */ ++ err = ext3_mark_inode_dirty(handle, dir); ++ if (err) ++ goto journal_error; ++ } ++ goto cleanup; ++journal_error: ++ ext3_std_error(dir->i_sb, err); ++ ++cleanup: ++ dx_unlock_array(dir, lock); ++ dx_unlock_array(dir, new_lock); ++ ++ assert_corr(err || iam_frame_is_locked(path, path->ip_frame)); ++ ++ do_corr(schedule()); ++ for (i = 0; i < ARRAY_SIZE(bh_new); ++i) { ++ if (bh_new[i] != NULL) ++ brelse(bh_new[i]); ++ } ++ return err; ++} ++ ++/* ++ * Returns 0 for success, or a negative error value ++ */ ++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ struct iam_path_compat cpath; ++ struct iam_path *path = &cpath.ipc_path; ++ struct iam_descr *param; ++ struct iam_frame *frame; ++ struct dx_hash_info hinfo; ++ struct buffer_head * bh = NULL; ++ struct inode *dir = dentry->d_parent->d_inode; ++ struct ext3_dir_entry_2 *de; ++ struct dynlock_handle *dummy = NULL; ++ int err; ++ size_t isize; ++ ++ iam_path_compat_init(&cpath, dir); ++ param = iam_path_descr(path); ++ ++ err = dx_probe(&dentry->d_name, NULL, &hinfo, path); ++ if (err != 0) ++ return err; ++ frame = path->ip_frame; ++ ++ isize = dir->i_size; ++ ++ err = param->id_ops->id_node_read(path->ip_container, ++ (iam_ptr_t)dx_get_block(path, frame->at), ++ handle, &bh); ++ if (err != 0) ++ goto cleanup; ++ ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto journal_error; ++ ++ err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); ++ if (err != -ENOSPC) { ++ bh = NULL; ++ goto cleanup; + } +- de = do_split(handle, path, &bh, --frame, &hinfo, &err); ++ ++ err = split_index_node(handle, path, &dummy); ++ if (err) ++ goto cleanup; ++ ++ /*copy split inode too*/ ++ de = do_split(handle, path, &bh, path->ip_frame, &hinfo, &err); + if (!de) + goto cleanup; +- assert(dx_node_check(path, frame)); ++ ++ assert_inv(dx_node_check(path, frame)); + err = add_dirent_to_buf(handle, dentry, inode, de, bh); + goto cleanup2; + +@@ -2446,10 +2206,7 @@ + if (bh) + brelse(bh); + cleanup2: +- for (i = 0; i < ARRAY_SIZE(bh_new); ++i) { +- if (bh_new[i] != NULL) +- brelse(bh_new[i]); +- } ++ dx_unlock_htree(dir, dummy); + if (err) + inode->i_size = isize; + iam_path_fini(path); +@@ -2554,6 +2311,26 @@ + return ext3_new_inode(handle, dir, mode, inum); + } + ++struct inode *ext3_create_inode(handle_t *handle, struct inode * dir, int mode) ++{ ++ struct inode *inode; ++ ++ inode = ext3_new_inode(handle, dir, mode, 0); ++ if (!IS_ERR(inode)) { ++ if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode)) { ++#ifdef CONFIG_LDISKFS_FS_XATTR ++ inode->i_op = &ext3_special_inode_operations; ++#endif ++ } else { ++ inode->i_op = &ext3_file_inode_operations; ++ inode->i_fop = &ext3_file_operations; ++ ext3_set_aops(inode); ++ } ++ } ++ return inode; ++} ++EXPORT_SYMBOL(ext3_create_inode); ++ + /* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it +Index: linux-stage/fs/ext3/Makefile +=================================================================== +--- linux-stage.orig/fs/ext3/Makefile 2007-10-24 10:02:51.000000000 +0300 ++++ linux-stage/fs/ext3/Makefile 2007-10-24 10:02:53.000000000 +0300 +@@ -6,7 +6,7 @@ + + ext3-y := balloc.o 
bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o \ +- extents.o mballoc.o ++ extents.o mballoc.o iam.o iam_lfix.o iam_lvar.o iam_htree.o iam_uapi.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-stage/fs/ext3/dir.c +=================================================================== +--- linux-stage.orig/fs/ext3/dir.c 2007-10-24 10:02:49.000000000 +0300 ++++ linux-stage/fs/ext3/dir.c 2007-10-24 10:02:53.000000000 +0300 +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + + static unsigned char ext3_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +@@ -61,6 +62,7 @@ + } + + ++#if EXT3_INVARIANT_ON + int ext3_check_dir_entry (const char * function, struct inode * dir, + struct ext3_dir_entry_2 * de, + struct buffer_head * bh, +@@ -90,6 +92,7 @@ + rlen, de->name_len); + return error_msg == NULL ? 1 : 0; + } ++#endif + + static int ext3_readdir(struct file * filp, + void * dirent, filldir_t filldir) +@@ -304,12 +307,14 @@ + root->rb_node = NULL; + } + ++extern struct iam_private_info *ext3_iam_alloc_info(int flags); ++extern void ext3_iam_release_info(struct iam_private_info *info); + + static struct dir_private_info *create_dir_info(loff_t pos) + { + struct dir_private_info *p; + +- p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); ++ p = (void *)ext3_iam_alloc_info(GFP_KERNEL); + if (!p) + return NULL; + p->root.rb_node = NULL; +@@ -325,6 +330,7 @@ + void ext3_htree_free_dir_info(struct dir_private_info *p) + { + free_rb_tree_fname(&p->root); ++ ext3_iam_release_info((void *)p); + kfree(p); + } + +Index: linux-stage/fs/ext3/ioctl.c +=================================================================== +--- linux-stage.orig/fs/ext3/ioctl.c 2007-10-24 10:02:52.000000000 +0300 ++++ linux-stage/fs/ext3/ioctl.c 2007-10-24 10:02:53.000000000 +0300 +@@ -15,6 +15,7 @@ + #include + #include + ++#include + + int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, + unsigned long arg) +@@ -268,6 +269,6 @@ + + + default: +- return -ENOTTY; ++ return iam_uapi_ioctl(inode, filp, cmd, arg); + } + } +Index: linux-stage/fs/ext3/file.c +=================================================================== +--- linux-stage.orig/fs/ext3/file.c 2007-10-24 10:02:49.000000000 +0300 ++++ linux-stage/fs/ext3/file.c 2007-10-24 10:02:53.000000000 +0300 +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + #include "xattr.h" + #include "acl.h" + +@@ -41,8 +42,12 @@ + ext3_discard_reservation(inode); + up(&EXT3_I(inode)->truncate_sem); + } +- if (is_dx(inode) && filp->private_data) +- ext3_htree_free_dir_info(filp->private_data); ++ if (is_dx(inode) && filp->private_data) { ++ if (S_ISDIR(inode->i_mode)) ++ ext3_htree_free_dir_info(filp->private_data); ++ else ++ ext3_iam_release(filp, inode); ++ } + + return 0; + } +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2007-10-24 10:02:53.000000000 +0300 ++++ linux-stage/fs/ext3/super.c 2007-10-24 10:02:53.000000000 +0300 +@@ -461,7 +461,11 @@ + #endif + ei->i_block_alloc_info = NULL; + ei->vfs_inode.i_version = 1; +- ++ ++ dynlock_init(&ei->i_htree_lock); ++ sema_init(&ei->i_rename_sem, 1); ++ sema_init(&ei->i_append_sem, 1); ++ + memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); + INIT_LIST_HEAD(&ei->i_prealloc_list); + 
spin_lock_init(&ei->i_prealloc_lock); +Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2007-10-24 10:02:52.000000000 +0300 ++++ linux-stage/include/linux/ext3_fs.h 2007-10-24 10:02:53.000000000 +0300 +@@ -902,9 +902,7 @@ + extern void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv); + + /* dir.c */ +-extern int ext3_check_dir_entry(const char *, struct inode *, +- struct ext3_dir_entry_2 *, +- struct buffer_head *, unsigned long); ++ + extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext3_dir_entry_2 *dirent); +Index: linux-stage/include/linux/ext3_fs_i.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_i.h 2007-10-24 10:02:52.000000000 +0300 ++++ linux-stage/include/linux/ext3_fs_i.h 2007-10-24 10:02:53.000000000 +0300 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + #define HAVE_DISK_INODE_VERSION + +@@ -135,6 +136,12 @@ + * by other means, so we have truncate_sem. + */ + struct semaphore truncate_sem; ++ ++ /* following fields for parallel directory operations -bzzz */ ++ struct dynlock i_htree_lock; ++ struct semaphore i_append_sem; ++ struct semaphore i_rename_sem; ++ + struct inode vfs_inode; + + __u32 i_cached_extent[4]; diff --git a/ldiskfs/kernel_patches/patches/ext3-tall-htree-sles10.patch b/ldiskfs/kernel_patches/patches/ext3-tall-htree-sles10.patch new file mode 100644 index 0000000..646243d --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-tall-htree-sles10.patch @@ -0,0 +1,432 @@ +Index: linux-stage/fs/ext3/namei.c +=================================================================== +--- linux-stage.orig/fs/ext3/namei.c 2007-08-30 14:39:15.000000000 +0300 ++++ linux-stage/fs/ext3/namei.c 2007-08-30 14:45:11.000000000 +0300 +@@ -50,6 +50,11 @@ + #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + ++/* ++ * Maximal number of non-leaf levels in htree. In the stock ext3 this is 2. 
++ */ ++#define DX_MAX_TREE_HEIGHT (5) ++ + static struct buffer_head *ext3_append(handle_t *handle, + struct inode *inode, + u32 *block, int *err) +@@ -77,7 +82,7 @@ + #ifdef DX_DEBUG + #define dxtrace(command) command + #else +-#define dxtrace(command) ++#define dxtrace(command) + #endif + + struct fake_dirent +@@ -170,7 +175,7 @@ + static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); + static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, +- struct dx_frame *frames, ++ struct dx_frame *frames, + __u32 *start_hash); + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, + struct ext3_dir_entry_2 **res_dir, int *err); +@@ -251,7 +256,7 @@ + } + + struct stats +-{ ++{ + unsigned names; + unsigned space; + unsigned bcount; +@@ -369,7 +374,7 @@ + goto fail; + } + +- if ((indirect = root->info.indirect_levels) > 1) { ++ if ((indirect = root->info.indirect_levels) > DX_MAX_TREE_HEIGHT - 1) { + ext3_warning(dir->i_sb, __FUNCTION__, + "Unimplemented inode hash depth: %#06x", + root->info.indirect_levels); +@@ -438,12 +443,15 @@ + + static void dx_release (struct dx_frame *frames) + { ++ int height; ++ + if (frames[0].bh == NULL) + return; +- +- if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) +- brelse(frames[1].bh); +- brelse(frames[0].bh); ++ height = ((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels; ++ for (; height >= 0; height--) { ++ assert(frames[height].bh != NULL); ++ brelse(frames[height].bh); ++ } + } + + /* +@@ -465,7 +473,7 @@ + */ + static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, +- struct dx_frame *frames, ++ struct dx_frame *frames, + __u32 *start_hash) + { + struct dx_frame *p; +@@ -593,7 +601,7 @@ + { + struct dx_hash_info hinfo; + struct ext3_dir_entry_2 *de; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame; + struct inode *dir; + int block, err; + int count = 0; +@@ -642,7 +650,7 @@ + } + count += ret; + hashval = ~0; +- ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, ++ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, + frame, frames, &hashval); + *next_hash = hashval; + if (ret < 0) { +@@ -659,7 +667,7 @@ + break; + } + dx_release(frames); +- dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", ++ dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", + count, *next_hash)); + return count; + errout: +@@ -934,7 +942,7 @@ + struct super_block * sb; + struct dx_hash_info hinfo; + u32 hash; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame; + struct ext3_dir_entry_2 *de, *top; + struct buffer_head *bh; + unsigned long block; +@@ -1063,7 +1071,7 @@ + parent = ERR_PTR(-ENOMEM); + } + return parent; +-} ++} + + #define S_SHIFT 12 + static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = { +@@ -1124,6 +1132,8 @@ + return prev; + } + ++/* Allocate new node, and split leaf node @bh into it, inserting new pointer ++ * into parent node identified by @frame */ + static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + struct buffer_head **bh,struct dx_frame *frame, + struct dx_hash_info *hinfo, int *error) +@@ -1211,7 +1221,7 @@ + * add_dirent_to_buf will attempt search the directory block for + * space. It will return -ENOSPC if no space is available, and -EIO + * and -EEXIST if directory entry already exists. +- * ++ * + * NOTE! bh is NOT released in the case where ENOSPC is returned. 
In + * all other cases bh is released. + */ +@@ -1312,7 +1322,7 @@ + int namelen = dentry->d_name.len; + struct buffer_head *bh2; + struct dx_root *root; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[DX_MAX_TREE_HEIGHT], *frame; + struct dx_entry *entries; + struct ext3_dir_entry_2 *de, *de2; + char *data1, *top; +@@ -1453,20 +1463,29 @@ + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) + { +- struct dx_frame frames[2], *frame; +- struct dx_entry *entries, *at; ++ struct dx_frame frames[DX_MAX_TREE_HEIGHT] = {{0,},}, *frame, *safe; ++ struct dx_node *node2; ++ struct dx_entry *entries; /* old block contents */ ++ struct dx_entry *entries2; /* new block contents */ + struct dx_hash_info hinfo; + struct buffer_head * bh; ++ struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0}; + struct inode *dir = dentry->d_parent->d_inode; + struct super_block * sb = dir->i_sb; + struct ext3_dir_entry_2 *de; ++ u32 newblock[DX_MAX_TREE_HEIGHT] = {0}; + int err; ++ int nr_splet; ++ int i; ++ size_t isize; + + frame = dx_probe(dentry, NULL, &hinfo, frames, &err); + if (!frame) + return err; + entries = frame->entries; +- at = frame->at; ++ ++ /* XXX nikita: global serialization! */ ++ isize = dir->i_size; + + if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + goto cleanup; +@@ -1482,29 +1499,43 @@ + goto cleanup; + } + ++ /* ++ * Tall-tree handling: we might have to split multiple index blocks ++ * all the way up to tree root. Tricky point here is error handling: ++ * to avoid complicated undo/rollback we ++ * ++ * - first allocate all necessary blocks ++ * ++ * - insert pointers into them atomically. ++ * ++ * XXX nikita: this algorithm is *not* scalable, as it assumes that at ++ * least nodes in the path are locked. ++ */ ++ + /* Block full, should compress but for now just split */ + dxtrace(printk("using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); +- /* Need to split index? */ +- if (dx_get_count(entries) == dx_get_limit(entries)) { +- u32 newblock; +- unsigned icount = dx_get_count(entries); +- int levels = frame - frames; +- struct dx_entry *entries2; +- struct dx_node *node2; +- struct buffer_head *bh2; + +- if (levels && (dx_get_count(frames->entries) == +- dx_get_limit(frames->entries))) { ++ /* What levels need split? */ ++ for (nr_splet = 0; frame >= frames && ++ dx_get_count(frame->entries) == dx_get_limit(frame->entries); ++ --frame, ++nr_splet) { ++ if (nr_splet == DX_MAX_TREE_HEIGHT) { + ext3_warning(sb, __FUNCTION__, +- "Directory index full!"); ++ "Directory index full!\n"); + err = -ENOSPC; + goto cleanup; + } +- bh2 = ext3_append (handle, dir, &newblock, &err); +- if (!(bh2)) ++ } ++ ++ safe = frame; ++ /* Go back down, allocating blocks, and adding blocks into ++ * transaction... 
*/ ++ for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) { ++ bh_new[i] = ext3_append (handle, dir, &newblock[i], &err); ++ if (!bh_new[i]) + goto cleanup; +- node2 = (struct dx_node *)(bh2->b_data); ++ node2 = (struct dx_node *)(bh_new[i]->b_data); + entries2 = node2->entries; + node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); + node2->fake.inode = 0; +@@ -1512,72 +1547,112 @@ + err = ext3_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; +- if (levels) { +- unsigned icount1 = icount/2, icount2 = icount - icount1; +- unsigned hash2 = dx_get_hash(entries + icount1); +- dxtrace(printk("Split index %i/%i\n", icount1, icount2)); +- +- BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ +- err = ext3_journal_get_write_access(handle, +- frames[0].bh); ++ } ++ /* Add "safe" node to transaction too */ ++ if (safe + 1 != frames) { ++ err = ext3_journal_get_write_access(handle, safe->bh); ++ if (err) ++ goto journal_error; ++ } ++ ++ /* Go through nodes once more, inserting pointers */ ++ for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) { ++ unsigned count; ++ int idx; ++ struct buffer_head *bh2; ++ ++ entries = frame->entries; ++ count = dx_get_count(entries); ++ idx = frame->at - entries; ++ ++ bh2 = bh_new[i]; ++ node2 = (struct dx_node *)(bh2->b_data); ++ entries2 = node2->entries; ++ ++ if (frame == frames) { ++ /* splitting root node. Tricky point: ++ * ++ * In the "normal" B-tree we'd split root *and* add ++ * new root to the tree with pointers to the old root ++ * and its sibling (thus introducing two new nodes). ++ * ++ * In htree it's enough to add one node, because ++ * capacity of the root node is smaller than that of ++ * non-root one. ++ */ ++ struct dx_root *root; ++ u8 indirects; ++ ++ root = (struct dx_root *) frames->bh->b_data; ++ indirects = root->info.indirect_levels; ++ dxtrace(printk("Creating new root %d\n", indirects)); ++ memcpy((char *) entries2, (char *) entries, ++ count * sizeof(struct dx_entry)); ++ dx_set_limit(entries2, dx_node_limit(dir)); ++ ++ /* Set up root */ ++ dx_set_count(entries, 1); ++ dx_set_block(entries + 0, newblock[i]); ++ root->info.indirect_levels = indirects + 1; ++ ++ /* Shift frames in the path */ ++ memmove(frames + 2, frames + 1, ++ (sizeof frames) - 2 * sizeof frames[0]); ++ /* Add new access path frame */ ++ frames[1].at = entries2 + idx; ++ frames[1].entries = entries = entries2; ++ frames[1].bh = bh2; ++ ++ frame; ++ bh_new[i] = NULL; /* buffer head is "consumed" */ ++ err = ext3_journal_get_write_access(handle, bh2); + if (err) + goto journal_error; +- +- memcpy ((char *) entries2, (char *) (entries + icount1), +- icount2 * sizeof(struct dx_entry)); +- dx_set_count (entries, icount1); +- dx_set_count (entries2, icount2); ++ } else { ++ /* splitting non-root index node. */ ++ unsigned count1 = count/2, count2 = count - count1; ++ unsigned hash2 = dx_get_hash(entries + count1); ++ dxtrace(printk("Split index %i/%i\n", count1, count2)); ++ ++ memcpy ((char *) entries2, (char *) (entries + count1), ++ count2 * sizeof(struct dx_entry)); ++ dx_set_count (entries, count1); ++ dx_set_count (entries2, count2); + dx_set_limit (entries2, dx_node_limit(dir)); + + /* Which index block gets the new entry? 
*/ +- if (at - entries >= icount1) { +- frame->at = at = at - entries - icount1 + entries2; ++ if (idx >= count1) { ++ frame->at = entries2 + idx - count1; + frame->entries = entries = entries2; + swap(frame->bh, bh2); ++ bh_new[i] = bh2; + } +- dx_insert_block (frames + 0, hash2, newblock); +- dxtrace(dx_show_index ("node", frames[1].entries)); ++ dx_insert_block (frame - 1, hash2, newblock[i]); ++ dxtrace(dx_show_index ("node", frame->entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) + goto journal_error; +- brelse (bh2); +- } else { +- dxtrace(printk("Creating second level index...\n")); +- memcpy((char *) entries2, (char *) entries, +- icount * sizeof(struct dx_entry)); +- dx_set_limit(entries2, dx_node_limit(dir)); +- +- /* Set up root */ +- dx_set_count(entries, 1); +- dx_set_block(entries + 0, newblock); +- ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; +- +- /* Add new access path frame */ +- frame = frames + 1; +- frame->at = at = at - entries + entries2; +- frame->entries = entries = entries2; +- frame->bh = bh2; +- err = ext3_journal_get_write_access(handle, +- frame->bh); +- if (err) +- goto journal_error; + } +- ext3_journal_dirty_metadata(handle, frames[0].bh); + } +- de = do_split(handle, dir, &bh, frame, &hinfo, &err); ++ de = do_split(handle, dir, &bh, --frame, &hinfo, &err); + if (!de) + goto cleanup; + err = add_dirent_to_buf(handle, dentry, inode, de, bh); +- bh = NULL; +- goto cleanup; ++ goto cleanup2; + + journal_error: + ext3_std_error(dir->i_sb, err); + cleanup: + if (bh) + brelse(bh); ++cleanup2: ++ for (i = 0; i < ARRAY_SIZE(bh_new); ++i) { ++ if (bh_new[i] != NULL) ++ brelse(bh_new[i]); ++ } ++ if (err) ++ inode->i_size = isize; + dx_release(frames); + return err; + } +@@ -1587,7 +1662,7 @@ + * ext3_delete_entry deletes a directory entry by merging it with the + * previous entry + */ +-static int ext3_delete_entry (handle_t *handle, ++static int ext3_delete_entry (handle_t *handle, + struct inode * dir, + struct ext3_dir_entry_2 * de_del, + struct buffer_head * bh) +@@ -1856,7 +1931,7 @@ + de1 = (struct ext3_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + if (le32_to_cpu(de->inode) != inode->i_ino || +- !le32_to_cpu(de1->inode) || ++ !le32_to_cpu(de1->inode) || + strcmp (".", de->name) || + strcmp ("..", de1->name)) { + ext3_warning (inode->i_sb, "empty_dir", +@@ -1926,7 +2001,7 @@ + * being truncated, or files being unlinked. */ + + /* @@@ FIXME: Observation from aviro: +- * I think I can trigger J_ASSERT in ext3_orphan_add(). We block ++ * I think I can trigger J_ASSERT in ext3_orphan_add(). We block + * here (on lock_super()), so race with ext3_link() which might bump + * ->i_nlink. For, say it, character device. Not a regular file, + * not a directory, not a symlink and ->i_nlink > 0. 
+@@ -2452,4 +2527,4 @@ + .removexattr = generic_removexattr, + #endif + .permission = ext3_permission, +-}; ++}; diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series index 3e81e1b..a146e53 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series @@ -25,8 +25,6 @@ ext3-sector_t-overflow-2.6.9-rhel4.patch ext3-check-jbd-errors-2.6.9.patch ext3-uninit-2.6.9.patch ext3-nanosecond-2.6-rhel4.patch -ext3-iam-ops.patch -ext3-iam-separate.patch -ext3-iam-uapi.patch ext3-orphans-delay.patch -ext3-pdirops-2.6.9.patch +ext3-iam-common.patch +ext3-iam-rhel4.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles10.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles10.series index 04768bd..1fc13bb 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles10.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles10.series @@ -18,3 +18,13 @@ ext3-disable-write-bar-by-default-2.6-sles10.patch ext3-uninit-2.6-sles10.patch ext3-nanosecond-2.6-sles10.patch ext3-inode-version-2.6-sles10.patch +ext3-journal-chksum-2.6.18-vanilla.patch +ext3-tall-htree-sles10.patch +ext3-htree-path.patch +ext3-htree-r5-hash.patch +ext3-htree-path-ops.patch +ext3-hash-selection-sles10.patch +ext3-htree-comments.patch +ext3-orphans-delay.patch +ext3-iam-common.patch +ext3-iam-sles10.patch -- 1.8.3.1
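
The comments added to ext3_dx_add_entry() in ext3-tall-htree-sles10.patch describe a two-phase split strategy: first walk up from the leaf counting how many consecutive index levels are full, allocate every new block for those levels, and only then insert the new pointers, so that an allocation failure part-way through needs no rollback. The following standalone C sketch is illustrative only and is not part of any patch above; every identifier in it (toy_node, toy_path, TOY_MAX_HEIGHT, toy_prepare_split, ...) is hypothetical and the on-disk htree details are deliberately omitted.

/*
 * Illustrative sketch only -- NOT part of the patches above.  It mimics the
 * two-phase strategy described in the patched ext3_dx_add_entry(): walk up
 * from the leaf counting full levels, allocate every new node first, and only
 * then insert the new pointers, so a mid-way allocation failure needs no undo.
 * All identifiers (toy_node, TOY_MAX_HEIGHT, toy_alloc, ...) are hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>

#define TOY_MAX_HEIGHT 5        /* plays the role of DX_MAX_TREE_HEIGHT */
#define TOY_FANOUT     4        /* tiny fan-out so splits are easy to hit */

struct toy_node {
        int count;                      /* entries currently in use  */
        int entry[TOY_FANOUT];          /* sorted keys (simplified)  */
};

/* Path from the root (index 0) down to the lowest index node. */
struct toy_path {
        struct toy_node *node[TOY_MAX_HEIGHT];
        int height;                     /* number of valid levels    */
};

static struct toy_node *toy_alloc(void)
{
        return calloc(1, sizeof(struct toy_node));
}

/*
 * Phase 1: starting at the bottom of the path, count how many consecutive
 * levels are full (the "nr_splet" of the patch) and allocate one new node per
 * full level.  If any allocation fails, free what was obtained and bail out:
 * nothing in the tree has been modified yet.
 */
static int toy_prepare_split(struct toy_path *path,
                             struct toy_node **fresh, int *nr_split)
{
        int level, n = 0;

        for (level = path->height - 1;
             level >= 0 && path->node[level]->count == TOY_FANOUT; --level)
                ++n;

        for (int i = 0; i < n; ++i) {
                fresh[i] = toy_alloc();
                if (fresh[i] == NULL) {
                        while (i-- > 0)
                                free(fresh[i]);
                        return -1;      /* caller still sees an unchanged tree */
                }
        }
        *nr_split = n;
        return 0;
}

int main(void)
{
        struct toy_path path = { .height = 2 };
        struct toy_node *fresh[TOY_MAX_HEIGHT] = { NULL };
        int nr_split = 0;

        path.node[0] = toy_alloc();     /* root: one free slot left */
        path.node[1] = toy_alloc();     /* lowest index node: full  */
        path.node[0]->count = TOY_FANOUT - 1;
        path.node[1]->count = TOY_FANOUT;

        if (toy_prepare_split(&path, fresh, &nr_split) == 0)
                printf("levels to split: %d (new nodes preallocated)\n",
                       nr_split);

        /* Phase 2 (not shown): move entries into fresh[] and insert the new
         * block pointers into the parents, mirroring the second loop of the
         * patched ext3_dx_add_entry(). */
        for (int i = 0; i < nr_split; ++i)
                free(fresh[i]);
        free(path.node[0]);
        free(path.node[1]);
        return 0;
}

The design point this sketch tries to capture is the one stated in the patch comment: all allocations happen before any pointer is published into the existing tree, so error handling reduces to releasing the freshly allocated, still-unreferenced blocks.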