From f467017cc5fe522c7c7a8a77b7d5cfcb2de116ad Mon Sep 17 00:00:00 2001 From: braam Date: Fri, 12 Apr 2002 16:28:04 +0000 Subject: [PATCH] - add a new directory for extN file system: ext3 with new features - placed large directory patch in this directory. --- lustre/extN/htree-ext3-2.4.18.diff | 1216 ++++++++++++++++++++++++++++++++++++ 1 file changed, 1216 insertions(+) create mode 100644 lustre/extN/htree-ext3-2.4.18.diff diff --git a/lustre/extN/htree-ext3-2.4.18.diff b/lustre/extN/htree-ext3-2.4.18.diff new file mode 100644 index 0000000..583f9d9 --- /dev/null +++ b/lustre/extN/htree-ext3-2.4.18.diff @@ -0,0 +1,1216 @@ +--- ./fs/ext3/dir.c 2002/03/05 06:18:59 2.1 ++++ ./fs/ext3/dir.c 2002/03/05 06:26:56 +@@ -26,7 +26,7 @@ + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK + }; + +-static int ext3_readdir(struct file *, void *, filldir_t); ++int ext3_readdir(struct file *, void *, filldir_t); + + struct file_operations ext3_dir_operations = { + read: generic_read_dir, +@@ -65,7 +65,7 @@ + return error_msg == NULL ? 1 : 0; + } + +-static int ext3_readdir(struct file * filp, ++int ext3_readdir(struct file * filp, + void * dirent, filldir_t filldir) + { + int error = 0; +--- ./fs/ext3/super.c 2002/03/05 06:18:59 2.1 ++++ ./fs/ext3/super.c 2002/03/05 06:26:56 +@@ -529,6 +529,12 @@ + "EXT3 Check option not supported\n"); + #endif + } ++ else if (!strcmp (this_char, "index")) ++#ifdef CONFIG_EXT3_INDEX ++ set_opt (*mount_options, INDEX); ++#else ++ printk("EXT3 Index option not supported\n"); ++#endif + else if (!strcmp (this_char, "debug")) + set_opt (*mount_options, DEBUG); + else if (!strcmp (this_char, "errors")) { +@@ -712,6 +718,10 @@ + EXT3_BLOCKS_PER_GROUP(sb), + EXT3_INODES_PER_GROUP(sb), + sbi->s_mount_opt); ++ ++ if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) ++ set_opt (EXT3_SB(sb)->s_mount_opt, INDEX); ++ + printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ", + bdevname(sb->s_dev)); + if (EXT3_SB(sb)->s_journal->j_inode == NULL) { +--- ./fs/ext3/namei.c 2002/03/05 06:18:59 2.1 ++++ ./fs/ext3/namei.c 2002/03/06 00:13:18 +@@ -16,6 +16,10 @@ + * David S. Miller (davem@caip.rutgers.edu), 1995 + * Directory entry file type support and forward compatibility hooks + * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 ++ * Hash Tree Directory indexing (c) ++ * Daniel Phillips, 2001 ++ * Hash Tree Directory indexing porting ++ * Christopher Li, 2002 + */ + + #include +@@ -38,6 +42,436 @@ + #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + ++void ext3_add_compat_feature (struct super_block *sb, unsigned feature) ++{ ++ if (!EXT3_HAS_COMPAT_FEATURE(sb, feature)) ++ { ++ lock_super(sb); ++ ext3_update_dynamic_rev(sb); ++ EXT3_SET_COMPAT_FEATURE(sb, feature); ++ ext3_write_super(sb); ++ unlock_super(sb); ++ } ++} ++ ++static struct buffer_head *ext3_append (handle_t *handle, ++ struct inode *inode, ++ u32 *block, int *err) ++{ ++ struct buffer_head *bh; ++ *block = inode->i_size >> inode->i_sb->s_blocksize_bits; ++ if((bh = ext3_bread (handle,inode, *block, 1, err))) { ++ inode->i_size += inode->i_sb->s_blocksize; ++ ext3_journal_get_write_access(handle,bh); ++ } ++ return bh; ++} ++ ++#ifndef assert ++#define assert(test) do if (!(test)) BUG(); while (0) ++#endif ++ ++#ifndef swap ++#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) ++#endif ++ ++typedef struct { u32 v; } le_u32; ++typedef struct { u16 v; } le_u16; ++ ++#define dxtrace_on(command) command ++#define dxtrace_off(command) ++#define dxtrace dxtrace_off ++ ++struct fake_dirent ++{ ++ /*le*/u32 inode; ++ /*le*/u16 rec_len; ++ u8 name_len; ++ u8 file_type; ++}; ++ ++struct dx_countlimit ++{ ++ le_u16 limit; ++ le_u16 count; ++}; ++ ++struct dx_entry ++{ ++ le_u32 hash; ++ le_u32 block; ++}; ++ ++/* ++ * dx_root_info is laid out so that if it should somehow get overlaid by a ++ * dirent the two low bits of the hash version will be zero. Therefore, the ++ * hash version mod 4 should never be 0. Sincerely, the paranoia department. ++ */ ++ ++struct dx_root ++{ ++ struct fake_dirent fake1; ++ char dot1[4]; ++ struct fake_dirent fake2; ++ char dot2[4]; ++ struct dx_root_info ++ { ++ le_u32 reserved_zero; ++ u8 hash_version; /* 0 now, 1 at release */ ++ u8 info_length; /* 8 */ ++ u8 indirect_levels; ++ u8 unused_flags; ++ } ++ info; ++ struct dx_entry entries[0]; ++}; ++ ++struct dx_node ++{ ++ struct fake_dirent fake; ++ struct dx_entry entries[0]; ++}; ++ ++ ++struct dx_frame ++{ ++ struct buffer_head *bh; ++ struct dx_entry *entries; ++ struct dx_entry *at; ++}; ++ ++struct dx_map_entry ++{ ++ u32 hash; ++ u32 offs; ++}; ++ ++typedef struct ext3_dir_entry_2 ext3_dirent; ++/* dx_static exists purely to suppress compiler warnings */ ++static inline unsigned dx_get_block (struct dx_entry *entry); ++static void dx_set_block (struct dx_entry *entry, unsigned value); ++static inline unsigned dx_get_hash (struct dx_entry *entry); ++static void dx_set_hash (struct dx_entry *entry, unsigned value); ++static unsigned dx_get_count (struct dx_entry *entries); ++static unsigned dx_get_limit (struct dx_entry *entries); ++static void dx_set_count (struct dx_entry *entries, unsigned value); ++static void dx_set_limit (struct dx_entry *entries, unsigned value); ++static unsigned dx_root_limit (struct inode *dir, unsigned infosize); ++static unsigned dx_node_limit (struct inode *dir); ++static unsigned dx_hack_hash (const u8 *name, int len); ++static struct dx_frame *dx_probe (struct inode *dir, u32 hash, struct dx_frame *frame); ++static void dx_release (struct dx_frame *frames); ++static int dx_make_map (ext3_dirent *de, int size, struct dx_map_entry map[]); ++static void dx_sort_map(struct dx_map_entry *map, unsigned count); ++static ext3_dirent *dx_copy_dirents (char *from, char *to, ++ struct dx_map_entry *map, int count); ++static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); ++ ++ ++#ifdef CONFIG_EXT3_INDEX ++/* ++ * Future: use high four bits of block for coalesce-on-delete flags ++ * Mask them off for now. ++ */ ++ ++static inline unsigned dx_get_block (struct dx_entry *entry) ++{ ++ return le32_to_cpu(entry->block.v) & 0x0fffffff; ++} ++ ++static inline void dx_set_block (struct dx_entry *entry, unsigned value) ++{ ++ entry->block.v = cpu_to_le32(value); ++} ++ ++static inline unsigned dx_get_hash (struct dx_entry *entry) ++{ ++ return le32_to_cpu(entry->hash.v); ++} ++ ++static inline void dx_set_hash (struct dx_entry *entry, unsigned value) ++{ ++ entry->hash.v = cpu_to_le32(value); ++} ++ ++static inline unsigned dx_get_count (struct dx_entry *entries) ++{ ++ return le16_to_cpu(((struct dx_countlimit *) entries)->count.v); ++} ++ ++static inline unsigned dx_get_limit (struct dx_entry *entries) ++{ ++ return le16_to_cpu(((struct dx_countlimit *) entries)->limit.v); ++} ++ ++static inline void dx_set_count (struct dx_entry *entries, unsigned value) ++{ ++ ((struct dx_countlimit *) entries)->count.v = cpu_to_le16(value); ++} ++ ++static inline void dx_set_limit (struct dx_entry *entries, unsigned value) ++{ ++ ((struct dx_countlimit *) entries)->limit.v = cpu_to_le16(value); ++} ++ ++static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) ++{ ++ unsigned entry_space = dir->i_sb->s_blocksize - 24 - infosize; ++ return 0? 20: entry_space / sizeof(struct dx_entry); ++} ++ ++static inline unsigned dx_node_limit (struct inode *dir) ++{ ++ unsigned entry_space = dir->i_sb->s_blocksize - sizeof(struct fake_dirent); ++ return 0? 22: entry_space / sizeof(struct dx_entry); ++} ++ ++/* Hash function - not bad, but still looking for an ideal default */ ++ ++static unsigned dx_hack_hash (const u8 *name, int len) ++{ ++ u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; ++ while (len--) ++ { ++ u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); ++ if (hash & 0x80000000) hash -= 0x7fffffff; ++ hash1 = hash0; ++ hash0 = hash; ++ } ++ return 80; /* FIXME: for test only */ ++ return hash0; ++} ++ ++#define dx_hash(s,n) (dx_hack_hash(s,n) << 1) ++ ++/* ++ * Debug ++ */ ++static void dx_show_index (char * label, struct dx_entry *entries) ++{ ++ int i, n = dx_get_count (entries); ++ printk("%s index ", label); ++ for (i = 0; i < n; i++) ++ { ++ printk("%x->%u ", i? dx_get_hash(entries + i): 0, dx_get_block(entries + i)); ++ } ++ printk("\n"); ++} ++ ++struct stats ++{ ++ unsigned names; ++ unsigned space; ++ unsigned bcount; ++}; ++ ++static struct stats dx_show_leaf (ext3_dirent *de, int size, int show_names) ++{ ++ unsigned names = 0, space = 0; ++ char *base = (char *) de; ++ printk("names: "); ++ while ((char *) de < base + size) ++ { ++ if (de->inode) ++ { ++ if (show_names) ++ { ++ int len = de->name_len; ++ char *name = de->name; ++ while (len--) printk("%c", *name++); ++ printk(":%x.%u ", dx_hash (de->name, de->name_len), ((char *) de - base)); ++ } ++ space += EXT3_DIR_REC_LEN(de->name_len); ++ names++; ++ } ++ de = (ext3_dirent *) ((char *) de + le16_to_cpu(de->rec_len)); ++ } ++ printk("(%i)\n", names); ++ return (struct stats) { names, space, 1 }; ++} ++ ++struct stats dx_show_entries (struct inode *dir, struct dx_entry *entries, int levels) ++{ ++ unsigned blocksize = dir->i_sb->s_blocksize; ++ unsigned count = dx_get_count (entries), names = 0, space = 0, i; ++ unsigned bcount = 0; ++ struct buffer_head *bh; ++ int err; ++ printk("%i indexed blocks...\n", count); ++ for (i = 0; i < count; i++, entries++) ++ { ++ u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0; ++ u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; ++ struct stats stats; ++ printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); ++ if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue; ++ stats = levels? ++ dx_show_entries (dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): ++ dx_show_leaf ((ext3_dirent *) bh->b_data, blocksize, 0); ++ names += stats.names; ++ space += stats.space; ++ bcount += stats.bcount; ++ brelse (bh); ++ } ++ if (bcount) ++ printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", ++ names, space/bcount,(space/bcount)*100/blocksize); ++ return (struct stats) { names, space, bcount}; ++} ++ ++static void dx_show_buckets (struct inode *dir) ++{ ++ struct buffer_head *bh; ++ struct dx_root *root; ++ int err; ++ if (!(bh = ext3_bread (NULL,dir, 0, 0,&err))) return; ++ root = (struct dx_root *) bh->b_data; ++ dx_show_entries (dir, root->entries, root->info.indirect_levels); ++ brelse (bh); ++} ++ ++ssize_t hack_show_dir (struct file * filp, void * dirent, filldir_t filldir) ++{ ++ if (is_dx (filp->f_dentry->d_inode) && !filp->f_pos) ++ dx_show_buckets (filp->f_dentry->d_inode); ++ return ext3_readdir(filp,dirent,filldir); ++} ++ ++/* ++ * Probe for a directory leaf block to search ++ */ ++ ++static struct dx_frame *dx_probe (struct inode *dir, u32 hash, struct dx_frame *frame) ++{ ++ unsigned count, indirect; ++ struct dx_entry *at, *entries, *p, *q, *m; ++ struct dx_root *root; ++ struct buffer_head *bh; ++ int err; ++ if (!(bh = ext3_bread (NULL,dir, 0, 0,&err))) ++ goto fail; ++ root = (struct dx_root *) bh->b_data; ++ if (root->info.hash_version > 0 || root->info.unused_flags & 1) ++ goto fail; ++ if ((indirect = root->info.indirect_levels) > 1) ++ goto fail; ++ entries = (struct dx_entry *) (((char *) &root->info) + root->info.info_length); ++ assert (dx_get_limit(entries) == dx_root_limit(dir, root->info.info_length)); ++ dxtrace (printk("Look up %x", hash)); ++ while (1) ++ { ++ count = dx_get_count(entries); ++ assert (count && count <= dx_get_limit(entries)); ++ p = entries + 1; ++ q = entries + count - 1; ++ while (p <= q) ++ { ++ m = p + (q - p)/2; ++ dxtrace(printk(".")); ++ if (dx_get_hash(m) > hash) ++ q = m - 1; ++ else ++ p = m + 1; ++ } ++ ++ if (0) // linear search cross check ++ { ++ unsigned n = count - 1; ++ at = entries; ++ while (n--) ++ { ++ dxtrace(printk(",")); ++ if (dx_get_hash(++at) > hash) ++ { ++ at--; ++ break; ++ } ++ } ++ assert (at == p - 1); ++ } ++ ++ at = p - 1; ++ dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); ++ frame->bh = bh; ++ frame->entries = entries; ++ frame->at = at; ++ if (!indirect--) return frame; ++ if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0,&err))) ++ goto fail2; ++ at = entries = ((struct dx_node *) bh->b_data)->entries; ++ assert (dx_get_limit(entries) == dx_node_limit (dir)); ++ frame++; ++ } ++fail2: ++ brelse(frame->bh); ++fail: ++ return NULL; ++} ++ ++static void dx_release (struct dx_frame *frames) ++{ ++ if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) ++ brelse (frames[1].bh); ++ brelse (frames[0].bh); ++} ++ ++/* ++ * Directory block splitting, compacting ++ */ ++ ++static int dx_make_map (ext3_dirent *de, int size, struct dx_map_entry map[]) ++{ ++ int count = 0; ++ char *base = (char *) de; ++ while ((char *) de < base + size) ++ { ++ map[count].hash = dx_hash (de->name, de->name_len); ++ map[count].offs = (u32) ((char *) de - base); ++ de = (ext3_dirent *) ((char *) de + le16_to_cpu(de->rec_len)); ++ count++; ++ } ++ return count; ++} ++ ++static void dx_sort_map (struct dx_map_entry *map, unsigned count) ++{ ++ struct dx_map_entry *p, *q, *top = map + count - 1; ++ int more; ++ /* Combsort until bubble sort doesn't suck */ ++ while (count > 2) ++ { ++ count = count*10/13; ++ if (count - 9 < 2) /* 9, 10 -> 11 */ ++ count = 11; ++ for (p = top, q = p - count; q >= map; p--, q--) ++ if (p->hash < q->hash) ++ swap(*p, *q); ++ } ++ /* Garden variety bubble sort */ ++ do { ++ more = 0; ++ q = top; ++ while (q-- > map) ++ { ++ if (q[1].hash >= q[0].hash) ++ continue; ++ swap(*(q+1), *q); ++ more = 1; ++ } ++ } while(more); ++} ++ ++static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block) ++{ ++ struct dx_entry *entries = frame->entries, *at = frame->at; ++ assert (dx_get_count (entries) < dx_get_limit (entries)); ++ memmove (at + 2, at+1, (char *) (entries + dx_get_count(entries)) - (char *) (at)); ++ dx_set_hash(at + 1, hash); ++ dx_set_block(at + 1, block); ++ dx_set_count(entries, dx_get_count(entries) + 1); ++} ++#endif ++ + /* + * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure. + * +@@ -95,6 +529,15 @@ + } + + /* ++ * p is at least 6 bytes before the end of page ++ */ ++static inline ext3_dirent *ext3_next_entry(ext3_dirent *p) ++{ ++ return (ext3_dirent *)((char*)p + le16_to_cpu(p->rec_len)); ++} ++ ++ ++/* + * ext3_find_entry() + * + * finds an entry in the specified directory with the wanted name. It +@@ -105,6 +548,8 @@ + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ ++ ++ + static struct buffer_head * ext3_find_entry (struct dentry *dentry, + struct ext3_dir_entry_2 ** res_dir) + { +@@ -119,10 +564,76 @@ + int num = 0; + int nblocks, i, err; + struct inode *dir = dentry->d_parent->d_inode; ++ int namelen; ++ const u8 *name; ++ unsigned blocksize; ++ ext3_dirent *de, *top; + + *res_dir = NULL; + sb = dir->i_sb; ++ blocksize = sb->s_blocksize; ++ namelen = dentry->d_name.len; ++ name = dentry->d_name.name; ++ if (namelen > EXT3_NAME_LEN) ++ return NULL; ++ if (ext3_dx && is_dx(dir)) { ++ u32 hash = dx_hash (name, namelen); ++ struct dx_frame frames[2], *frame; ++ if (!(frame = dx_probe (dir, hash, frames))) ++ return NULL; ++dxnext: ++ block = dx_get_block(frame->at); ++ if (!(bh = ext3_bread (NULL,dir, block, 0, &err))) ++ goto dxfail; ++ de = (struct ext3_dir_entry_2 *) bh->b_data; ++ top = (struct ext3_dir_entry_2 *) ((char *) de + blocksize ++ - EXT3_DIR_REC_LEN(0)); ++ for (; de < top; de = ext3_next_entry(de)) ++ if (ext3_match (namelen, name, de)) { ++ if (!ext3_check_dir_entry("ext3_find_entry", ++ dir, de, bh, ++ (block<b_data))) { ++ brelse (bh); ++ goto dxfail; ++ } ++ *res_dir = de; ++ goto dxfound; ++ } ++ brelse (bh); ++ /* Same hash continues in next block? Search on. */ ++ if (++(frame->at) == frame->entries + dx_get_count(frame->entries)) ++ { ++ struct buffer_head *bh2; ++ if (frame == frames) ++ goto dxfail; ++ if (++(frames->at) == frames->entries + dx_get_count(frames->entries)) ++ goto dxfail; ++ /* should omit read if not continued */ ++ if (!(bh2 = ext3_bread (NULL, dir, ++ dx_get_block(frames->at), ++ 0, &err))) ++ goto dxfail; ++ brelse (frame->bh); ++ frame->bh = bh2; ++ frame->at = frame->entries = ((struct dx_node *) bh2->b_data)->entries; ++ /* Subtle: the 0th entry has the count, find the hash in frame above */ ++ if ((dx_get_hash(frames->at) & -2) == hash) ++ goto dxnext; ++ goto dxfail; ++ } ++ if ((dx_get_hash(frame->at) & -2) == hash) ++ goto dxnext; ++dxfail: ++ dxtrace(printk("%s not found\n", name)); ++ dx_release (frames); ++ return NULL; ++dxfound: ++ dx_release (frames); ++ return bh; + ++ } ++ + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); + start = dir->u.ext3_i.i_dir_start_lookup; + if (start >= nblocks) +@@ -237,6 +748,88 @@ + de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; + } + ++static ext3_dirent * ++dx_copy_dirents (char *from, char *to, struct dx_map_entry *map, int count) ++{ ++ unsigned rec_len = 0; ++ ++ while (count--) { ++ ext3_dirent *de = (ext3_dirent *) (from + map->offs); ++ rec_len = EXT3_DIR_REC_LEN(de->name_len); ++ memcpy (to, de, rec_len); ++ ((ext3_dirent *) to)->rec_len = rec_len; ++ to += rec_len; ++ map++; ++ } ++ return (ext3_dirent *) (to - rec_len); ++} ++ ++#ifdef CONFIG_EXT3_INDEX ++static ext3_dirent *do_split(handle_t *handle, struct inode *dir, ++ struct buffer_head **bh,struct dx_frame *frame, ++ u32 hash, int *error) ++{ ++ unsigned blocksize = dir->i_sb->s_blocksize; ++ unsigned count, continued; ++ struct buffer_head *bh2; ++ u32 newblock; ++ unsigned MAX_DX_MAP = PAGE_CACHE_SIZE/EXT3_DIR_REC_LEN(1) + 1; ++ u32 hash2; ++ struct dx_map_entry map[MAX_DX_MAP]; ++ char *data1 = (*bh)->b_data, *data2, *data3; ++ unsigned split; ++ ext3_dirent *de, *de2; ++ ++ bh2 = ext3_append (handle, dir, &newblock, error); ++ if (!(bh2)) ++ { ++ brelse(*bh); ++ *bh = NULL; ++ return (ext3_dirent *)bh2; ++ } ++ ++ BUFFER_TRACE(*bh, "get_write_access"); ++ ext3_journal_get_write_access(handle, *bh); ++ BUFFER_TRACE(frame->bh, "get_write_access"); ++ ext3_journal_get_write_access(handle, frame->bh); ++ ++ data2 = bh2->b_data; ++ ++ count = dx_make_map ((ext3_dirent *) data1, blocksize, map); ++ split = count/2; // need to adjust to actual middle ++ dx_sort_map (map, count); ++ hash2 = map[split].hash; ++ continued = hash2 == map[split - 1].hash; ++ dxtrace(printk("Split block %i at %x, %i/%i\n", ++ dx_get_block(frame->at), hash2, split, count-split)); ++ ++ /* Fancy dance to stay within two buffers */ ++ de2 = dx_copy_dirents (data1, data2, map + split, count - split); ++ data3 = (char *) de2 + de2->rec_len; ++ de = dx_copy_dirents (data1, data3, map, split); ++ memcpy(data1, data3, (char *) de + de->rec_len - data3); ++ de = (ext3_dirent *) ((char *) de - data3 + data1); // relocate de ++ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); ++ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); ++ dxtrace_on(dx_show_leaf ((ext3_dirent *) data1, blocksize, 1)); ++ dxtrace_on(dx_show_leaf ((ext3_dirent *) data2, blocksize, 1)); ++ ++ /* Which block gets the new entry? */ ++ if (hash >= hash2) ++ { ++ swap(*bh, bh2); ++ de = de2; ++ } ++ dx_insert_block (frame, hash2 + continued, newblock); ++ ext3_journal_dirty_metadata (handle, bh2); ++ brelse (bh2); ++ ext3_journal_dirty_metadata (handle, frame->bh); ++ dxtrace_on(dx_show_index ("frame", frame->entries)); ++ return de; ++} ++#endif ++ ++ + /* + * ext3_add_entry() + * +@@ -251,6 +844,7 @@ + /* + * AKPM: the journalling code here looks wrong on the error paths + */ ++ + static int ext3_add_entry (handle_t *handle, struct dentry *dentry, + struct inode *inode) + { +@@ -258,117 +852,284 @@ + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned long offset; +- unsigned short rec_len; + struct buffer_head * bh; + struct ext3_dir_entry_2 * de, * de1; +- struct super_block * sb; ++ struct super_block * sb = dir->i_sb; + int retval; ++ unsigned short reclen = EXT3_DIR_REC_LEN(namelen); + +- sb = dir->i_sb; ++ unsigned blocksize = sb->s_blocksize; ++ unsigned nlen, rlen; ++ u32 block; ++ char *top; + + if (!namelen) + return -EINVAL; +- bh = ext3_bread (handle, dir, 0, 0, &retval); +- if (!bh) +- return retval; +- rec_len = EXT3_DIR_REC_LEN(namelen); +- offset = 0; +- de = (struct ext3_dir_entry_2 *) bh->b_data; +- while (1) { +- if ((char *)de >= sb->s_blocksize + bh->b_data) { +- brelse (bh); +- bh = NULL; +- bh = ext3_bread (handle, dir, +- offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval); +- if (!bh) +- return retval; +- if (dir->i_size <= offset) { +- if (dir->i_size == 0) { +- brelse(bh); +- return -ENOENT; ++ if (ext3_dx && is_dx(dir)) ++ { ++ struct dx_frame frames[2], *frame; ++ struct dx_entry *entries, *at; ++ u32 hash; ++ char *data1; ++ ++ hash = dx_hash (name, namelen); ++ frame = dx_probe (dir, hash, frames); // do something if null ++ entries = frame->entries; ++ at = frame->at; ++ ++ if (!(bh = ext3_bread (handle,dir, dx_get_block(frame->at), 0,&retval))) ++ goto dxfail1; ++ ++ BUFFER_TRACE(bh, "get_write_access"); ++ ext3_journal_get_write_access(handle, bh); ++ ++ data1 = bh->b_data; ++ de = (ext3_dirent *) data1; ++ top = data1 + (0? 200: blocksize); ++ while ((char *) de < top) ++ { ++ /* FIXME: check EEXIST and dir */ ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ if ((de->inode? rlen - nlen: rlen) >= reclen) ++ goto dx_add; ++ de = (ext3_dirent *) ((char *) de + rlen); ++ } ++ /* Block full, should compress but for now just split */ ++ dxtrace_on(printk("using %u of %u node entries\n", ++ dx_get_count(entries), dx_get_limit(entries))); ++ /* Need to split index? */ ++ if (dx_get_count(entries) == dx_get_limit(entries)) ++ { ++ u32 newblock; ++ unsigned icount = dx_get_count(entries); ++ char *idata2; ++ int levels = frame - frames; ++ struct dx_entry *entries2; ++ struct buffer_head *bh2; ++ if (levels && dx_get_count(frames->entries) == dx_get_limit(frames->entries)) ++ goto dxfull; ++ bh2 = ext3_append (handle, dir, &newblock, &retval); ++ if (!(bh2)) ++ goto dxfail2; ++ idata2 = bh2->b_data; ++ entries2 = ((struct dx_node *) idata2)->entries; ++ ((struct dx_node *) idata2)->fake.rec_len = cpu_to_le16(blocksize); ++ /* fake.inode already 0 */ ++ /* Seems that is not true. We still need to set inode = 0 -Chrisl*/ ++ ((struct dx_node *) idata2)->fake.inode = 0; ++ BUFFER_TRACE(frame->bh, "get_write_access"); ++ ext3_journal_get_write_access(handle, frame->bh); ++ if (levels) ++ { ++ unsigned icount1 = icount/2, icount2 = icount - icount1; ++ unsigned hash2 = dx_get_hash(entries + icount1); ++ dxtrace_on(printk("Split index %i/%i\n", icount1, icount2)); ++ ++ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ ++ ext3_journal_get_write_access(handle, frames[0].bh); ++ ++ memcpy ((char *) entries2, (char *) (entries + icount1), ++ icount2 * sizeof(struct dx_entry)); ++ dx_set_count (entries, icount1); ++ dx_set_count (entries2, icount2); ++ dx_set_limit (entries2, dx_node_limit(dir)); ++ ++ /* Which index block gets the new entry? */ ++ if (at - entries > icount1) { ++ frame->at = at = at - entries - icount1 + entries2; ++ frame->entries = entries = entries2; ++ swap(frame->bh, bh2); + } +- +- ext3_debug ("creating next block\n"); +- +- BUFFER_TRACE(bh, "get_write_access"); +- ext3_journal_get_write_access(handle, bh); +- de = (struct ext3_dir_entry_2 *) bh->b_data; +- de->inode = 0; +- de->rec_len = le16_to_cpu(sb->s_blocksize); +- dir->u.ext3_i.i_disksize = +- dir->i_size = offset + sb->s_blocksize; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; +- ext3_mark_inode_dirty(handle, dir); ++ dx_insert_block (frames + 0, hash2, newblock); ++ dxtrace_on(dx_show_index ("node", frames[1].entries)); ++ dxtrace_on(dx_show_index ("node", ++ ((struct dx_node *) bh2->b_data)->entries)); ++ ext3_journal_dirty_metadata(handle, bh2); ++ brelse (bh2); + } else { +- +- ext3_debug ("skipping to next block\n"); +- +- de = (struct ext3_dir_entry_2 *) bh->b_data; ++ dxtrace_on(printk("Creating second level index...\n")); ++ memcpy((char *) entries2, (char *) entries, ++ icount * sizeof(struct dx_entry)); ++ dx_set_limit(entries2, dx_node_limit(dir)); ++ ++ /* Set up root */ ++ dx_set_count(entries, 1); ++ dx_set_block(entries + 0, newblock); ++ ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; ++ ++ /* Add new access path frame */ ++ frame = frames + 1; ++ frame->at = at = at - entries + entries2; ++ frame->entries = entries = entries2; ++ frame->bh = bh2; ++ ext3_journal_get_write_access(handle, frame->bh); + } ++ ext3_journal_dirty_metadata(handle, frames[0].bh); + } +- if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh, +- offset)) { +- brelse (bh); +- return -ENOENT; +- } +- if (ext3_match (namelen, name, de)) { ++ de = do_split(handle, dir, &bh, frame, hash, &retval); ++ dx_release (frames); ++ if (!(de)) ++ goto fail; ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ goto add; ++ ++dx_add: ++ dx_release (frames); ++ goto add; ++ ++dxfull: ++ dxtrace_on(printk("Directory index full!\n")); ++ retval = -ENOSPC; ++dxfail2: ++ brelse(bh); ++dxfail1: ++ dx_release (frames); ++ goto fail1; ++ } ++ block = offset = 0; ++ while (offset < dir->i_size) { ++ bh = ext3_bread (handle, dir, block, 0, &retval); ++ if(!bh) ++ return retval; ++ de = (struct ext3_dir_entry_2 *) bh->b_data; ++ top = bh->b_data+blocksize-reclen; ++ while ((char *) de <= top) { ++ ++ if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, ++ bh,offset)) { ++ brelse (bh); ++ return -ENOENT; ++ } ++ if (ext3_match (namelen, name, de)) { + brelse (bh); + return -EEXIST; +- } +- if ((le32_to_cpu(de->inode) == 0 && +- le16_to_cpu(de->rec_len) >= rec_len) || +- (le16_to_cpu(de->rec_len) >= +- EXT3_DIR_REC_LEN(de->name_len) + rec_len)) { +- BUFFER_TRACE(bh, "get_write_access"); +- ext3_journal_get_write_access(handle, bh); +- /* By now the buffer is marked for journaling */ +- offset += le16_to_cpu(de->rec_len); +- if (le32_to_cpu(de->inode)) { +- de1 = (struct ext3_dir_entry_2 *) ((char *) de + +- EXT3_DIR_REC_LEN(de->name_len)); +- de1->rec_len = +- cpu_to_le16(le16_to_cpu(de->rec_len) - +- EXT3_DIR_REC_LEN(de->name_len)); +- de->rec_len = cpu_to_le16( +- EXT3_DIR_REC_LEN(de->name_len)); +- de = de1; + } +- de->file_type = EXT3_FT_UNKNOWN; +- if (inode) { +- de->inode = cpu_to_le32(inode->i_ino); +- ext3_set_de_type(dir->i_sb, de, inode->i_mode); +- } else +- de->inode = 0; +- de->name_len = namelen; +- memcpy (de->name, name, namelen); +- /* +- * XXX shouldn't update any times until successful +- * completion of syscall, but too many callers depend +- * on this. +- * +- * XXX similarly, too many callers depend on +- * ext3_new_inode() setting the times, but error +- * recovery deletes the inode, so the worst that can +- * happen is that the times are slightly out of date +- * and/or different from the directory change time. +- */ +- dir->i_mtime = dir->i_ctime = CURRENT_TIME; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; +- ext3_mark_inode_dirty(handle, dir); +- dir->i_version = ++event; +- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); +- ext3_journal_dirty_metadata(handle, bh); ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ if ((de->inode? rlen - nlen: rlen) >= reclen) ++ goto add; ++ de = (struct ext3_dir_entry_2 *) ((char *) de + rlen); ++ offset += rlen; ++ } ++ if (ext3_dx && dir->i_size==blocksize && test_opt(sb, INDEX)) ++ goto dx_make_index; ++ brelse(bh); ++ } ++ bh = ext3_append(handle, dir, &block, &retval); ++ if (!bh) ++ return retval; ++ de = (struct ext3_dir_entry_2 *) bh->b_data; ++ de->inode = 0; ++ de->rec_len = cpu_to_le16(rlen = blocksize); ++ nlen = 0; ++ goto add; ++ ++add: ++ BUFFER_TRACE(bh, "get_write_access"); ++ ext3_journal_get_write_access(handle, bh); ++ /* By now the buffer is marked for journaling */ ++ if (de->inode) { ++ de1 = (struct ext3_dir_entry_2 *) ((char *) de + nlen); ++ de1->rec_len = cpu_to_le16(rlen - nlen); ++ de->rec_len = cpu_to_le16(nlen); ++ de = de1; ++ } ++ de->file_type = EXT3_FT_UNKNOWN; ++ if (inode) { ++ de->inode = cpu_to_le32(inode->i_ino); ++ ext3_set_de_type(dir->i_sb, de, inode->i_mode); ++ } else ++ de->inode = 0; ++ de->name_len = namelen; ++ memcpy (de->name, name, namelen); ++ /* ++ * XXX shouldn't update any times until successful ++ * completion of syscall, but too many callers depend ++ * on this. ++ * ++ * XXX similarly, too many callers depend on ++ * ext3_new_inode() setting the times, but error ++ * recovery deletes the inode, so the worst that can ++ * happen is that the times are slightly out of date ++ * and/or different from the directory change time. ++ */ ++ dir->i_mtime = dir->i_ctime = CURRENT_TIME; ++ /* dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; */ ++ ext3_mark_inode_dirty(handle, dir); ++ dir->i_version = ++event; ++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); ++ ext3_journal_dirty_metadata(handle, bh); ++ brelse(bh); ++ return 0; ++ ++dx_make_index: ++ { ++ struct buffer_head *bh2; ++ struct dx_root *root; ++ struct dx_frame frames[2], *frame; ++ struct dx_entry *entries; ++ struct ext3_dir_entry_2 *de2; ++ char *data1; ++ unsigned len; ++ u32 hash; ++ ++ dxtrace_on(printk("Creating index\n")); ++ ext3_journal_get_write_access(handle, bh); ++ root = (struct dx_root *) bh->b_data; ++ ++ ext3_add_compat_feature (sb, EXT3_FEATURE_COMPAT_DIR_INDEX); ++ dir->u.ext3_i.i_flags |= EXT3_INDEX_FL; ++ bh2 = ext3_append (handle, dir, &block, &retval); ++ if (!(bh2)) ++ { + brelse(bh); +- return 0; ++ return retval; + } +- offset += le16_to_cpu(de->rec_len); +- de = (struct ext3_dir_entry_2 *) +- ((char *) de + le16_to_cpu(de->rec_len)); ++ data1 = bh2->b_data; ++ ++ /* The 0th block becomes the root, move the dirents out */ ++ de = (ext3_dirent *) &root->info; ++ len = ((char *) root) + blocksize - (char *) de; ++ memcpy (data1, de, len); ++ de = (ext3_dirent *) data1; ++ top = data1 + len; ++ while (((char *) de2=(char*)de+le16_to_cpu(de->rec_len)) < top) ++ de = de2; ++ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); ++ /* Initialize the root; the dot dirents already exist */ ++ de = (ext3_dirent *) (&root->fake2); ++ de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2)); ++ memset (&root->info, 0, sizeof(root->info)); ++ root->info.info_length = sizeof(root->info); ++ entries = root->entries; ++ dx_set_block (entries, 1); ++ dx_set_count (entries, 1); ++ dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); ++ ++ /* Initialize as for dx_probe */ ++ hash = dx_hash (name, namelen); ++ frame = frames; ++ frame->entries = entries; ++ frame->at = entries; ++ frame->bh = bh; ++ bh = bh2; ++ de = do_split(handle,dir, &bh, frame, hash, &retval); ++ dx_release (frames); ++ if (!(de)) ++ return retval; ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ goto add; + } +- brelse (bh); +- return -ENOSPC; ++fail1: ++ return retval; ++fail: ++ return -ENOENT; + } + ++ + /* + * ext3_delete_entry deletes a directory entry by merging it with the + * previous entry +@@ -451,7 +1212,8 @@ + struct inode * inode; + int err; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS+3); + if (IS_ERR(handle)) + return PTR_ERR(handle); + +@@ -478,7 +1240,8 @@ + struct inode *inode; + int err; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); + if (IS_ERR(handle)) + return PTR_ERR(handle); + +@@ -507,7 +1270,8 @@ + if (dir->i_nlink >= EXT3_LINK_MAX) + return -EMLINK; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); + if (IS_ERR(handle)) + return PTR_ERR(handle); + +@@ -832,7 +1596,7 @@ + ext3_mark_inode_dirty(handle, inode); + dir->i_nlink--; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ // dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_mark_inode_dirty(handle, dir); + + end_rmdir: +@@ -878,7 +1642,7 @@ + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ // dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_mark_inode_dirty(handle, dir); + inode->i_nlink--; + if (!inode->i_nlink) +@@ -904,7 +1668,8 @@ + if (l > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5); ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5); + if (IS_ERR(handle)) + return PTR_ERR(handle); + +@@ -959,7 +1724,8 @@ + if (inode->i_nlink >= EXT3_LINK_MAX) + return -EMLINK; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS); ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + +@@ -995,7 +1761,8 @@ + + old_bh = new_bh = dir_bh = NULL; + +- handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2); ++ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS+ 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + +@@ -1077,7 +1844,7 @@ + new_inode->i_ctime = CURRENT_TIME; + } + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; +- old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ // old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + if (dir_bh) { + BUFFER_TRACE(dir_bh, "get_write_access"); + ext3_journal_get_write_access(handle, dir_bh); +@@ -1089,7 +1856,7 @@ + new_inode->i_nlink--; + } else { + new_dir->i_nlink++; +- new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ // new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_mark_inode_dirty(handle, new_dir); + } + } +--- ./include/linux/ext3_fs.h 2002/03/05 06:18:59 2.1 ++++ ./include/linux/ext3_fs.h 2002/03/05 06:26:56 +@@ -339,6 +339,7 @@ + #define EXT3_MOUNT_WRITEBACK_DATA 0x0C00 /* No data ordering */ + #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ + #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ ++#define EXT3_MOUNT_INDEX 0x4000 /* Enable directory index */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -575,6 +576,26 @@ + #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) + #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ + ~EXT3_DIR_ROUND) ++/* ++ * Hash Tree Directory indexing ++ * (c) Daniel Phillips, 2001 ++ */ ++ ++#define CONFIG_EXT3_INDEX ++ ++#ifdef CONFIG_EXT3_INDEX ++ enum {ext3_dx = 1}; ++ #define dx_static static ++ #define is_dx(dir) ((dir)->u.ext3_i.i_flags & EXT3_INDEX_FL) ++#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) ++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) ++#else ++ enum {ext3_dx = 0}; ++ #define dx_static ++ #define is_dx(dir) 0 ++#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) ++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) ++#endif + + #ifdef __KERNEL__ + /* +--- ./include/linux/ext3_jbd.h 2002/03/05 06:18:59 2.1 ++++ ./include/linux/ext3_jbd.h 2002/03/05 06:33:54 +@@ -63,6 +63,8 @@ + + #define EXT3_RESERVE_TRANS_BLOCKS 12 + ++#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8 ++ + int + ext3_mark_iloc_dirty(handle_t *handle, + struct inode *inode, -- 1.8.3.1