Whamcloud - gitweb
- add a new directory for extN file system: ext3 with new features
authorbraam <braam>
Fri, 12 Apr 2002 16:28:04 +0000 (16:28 +0000)
committerbraam <braam>
Fri, 12 Apr 2002 16:28:04 +0000 (16:28 +0000)
 - placed large directory patch in this directory.

lustre/extN/htree-ext3-2.4.18.diff [new file with mode: 0644]

diff --git a/lustre/extN/htree-ext3-2.4.18.diff b/lustre/extN/htree-ext3-2.4.18.diff
new file mode 100644 (file)
index 0000000..583f9d9
--- /dev/null
@@ -0,0 +1,1216 @@
+--- ./fs/ext3/dir.c    2002/03/05 06:18:59     2.1
++++ ./fs/ext3/dir.c    2002/03/05 06:26:56
+@@ -26,7 +26,7 @@
+       DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+ };
+-static int ext3_readdir(struct file *, void *, filldir_t);
++int ext3_readdir(struct file *, void *, filldir_t);
+ struct file_operations ext3_dir_operations = {
+       read:           generic_read_dir,
+@@ -65,7 +65,7 @@
+       return error_msg == NULL ? 1 : 0;
+ }
+-static int ext3_readdir(struct file * filp,
++int ext3_readdir(struct file * filp,
+                        void * dirent, filldir_t filldir)
+ {
+       int error = 0;
+--- ./fs/ext3/super.c  2002/03/05 06:18:59     2.1
++++ ./fs/ext3/super.c  2002/03/05 06:26:56
+@@ -529,6 +529,12 @@
+                                      "EXT3 Check option not supported\n");
+ #endif
+               }
++              else if (!strcmp (this_char, "index"))
++#ifdef CONFIG_EXT3_INDEX
++                      set_opt (*mount_options, INDEX);
++#else
++                      printk("EXT3 Index option not supported\n");
++#endif
+               else if (!strcmp (this_char, "debug"))
+                       set_opt (*mount_options, DEBUG);
+               else if (!strcmp (this_char, "errors")) {
+@@ -712,6 +718,10 @@
+                       EXT3_BLOCKS_PER_GROUP(sb),
+                       EXT3_INODES_PER_GROUP(sb),
+                       sbi->s_mount_opt);
++
++      if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
++              set_opt (EXT3_SB(sb)->s_mount_opt, INDEX);
++
+       printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ",
+                               bdevname(sb->s_dev));
+       if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
+--- ./fs/ext3/namei.c  2002/03/05 06:18:59     2.1
++++ ./fs/ext3/namei.c  2002/03/06 00:13:18
+@@ -16,6 +16,10 @@
+  *        David S. Miller (davem@caip.rutgers.edu), 1995
+  *  Directory entry file type support and forward compatibility hooks
+  *    for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
++ *  Hash Tree Directory indexing (c)
++ *    Daniel Phillips, 2001
++ *  Hash Tree Directory indexing porting
++ *    Christopher Li, 2002
+  */
+ #include <linux/fs.h>
+@@ -38,6 +42,436 @@
+ #define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+ #define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
++void ext3_add_compat_feature (struct super_block *sb, unsigned feature)
++{
++      if (!EXT3_HAS_COMPAT_FEATURE(sb, feature))
++      {
++              lock_super(sb);
++              ext3_update_dynamic_rev(sb);
++              EXT3_SET_COMPAT_FEATURE(sb, feature);
++              ext3_write_super(sb);
++              unlock_super(sb);
++      }
++}
++
++static struct buffer_head *ext3_append (handle_t *handle,
++                                      struct inode *inode,
++                                      u32 *block, int *err)
++{
++      struct buffer_head *bh;
++      *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
++      if((bh = ext3_bread (handle,inode, *block, 1, err))) {
++              inode->i_size += inode->i_sb->s_blocksize;
++              ext3_journal_get_write_access(handle,bh);
++      }
++      return bh;
++}
++
++#ifndef assert
++#define assert(test) do if (!(test)) BUG(); while (0)
++#endif
++
++#ifndef swap
++#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
++#endif
++
++typedef struct { u32 v; } le_u32;
++typedef struct { u16 v; } le_u16;
++
++#define dxtrace_on(command) command
++#define dxtrace_off(command)
++#define dxtrace dxtrace_off
++
++struct fake_dirent
++{
++      /*le*/u32 inode;
++      /*le*/u16 rec_len;
++      u8 name_len;
++      u8 file_type;
++};
++
++struct dx_countlimit
++{
++      le_u16 limit;
++      le_u16 count;
++};
++
++struct dx_entry
++{
++      le_u32 hash;
++      le_u32 block;
++};
++
++/*
++ * dx_root_info is laid out so that if it should somehow get overlaid by a
++ * dirent the two low bits of the hash version will be zero.  Therefore, the
++ * hash version mod 4 should never be 0.  Sincerely, the paranoia department.
++ */
++
++struct dx_root
++{
++      struct fake_dirent fake1;
++      char dot1[4];
++      struct fake_dirent fake2;
++      char dot2[4];
++      struct dx_root_info
++      {
++              le_u32 reserved_zero;
++              u8 hash_version; /* 0 now, 1 at release */
++              u8 info_length; /* 8 */
++              u8 indirect_levels;
++              u8 unused_flags;
++      }
++      info;
++      struct dx_entry entries[0];
++};
++
++struct dx_node
++{
++      struct fake_dirent fake;
++      struct dx_entry entries[0];
++};
++
++
++struct dx_frame
++{
++      struct buffer_head *bh;
++      struct dx_entry *entries;
++      struct dx_entry *at;
++};
++
++struct dx_map_entry
++{
++      u32 hash;
++      u32 offs;
++};
++
++typedef struct ext3_dir_entry_2 ext3_dirent;
++/* dx_static exists purely to suppress compiler warnings */
++static inline unsigned dx_get_block (struct dx_entry *entry);
++static void dx_set_block (struct dx_entry *entry, unsigned value);
++static inline unsigned dx_get_hash (struct dx_entry *entry);
++static void dx_set_hash (struct dx_entry *entry, unsigned value);
++static unsigned dx_get_count (struct dx_entry *entries);
++static unsigned dx_get_limit (struct dx_entry *entries);
++static void dx_set_count (struct dx_entry *entries, unsigned value);
++static void dx_set_limit (struct dx_entry *entries, unsigned value);
++static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
++static unsigned dx_node_limit (struct inode *dir);
++static unsigned dx_hack_hash (const u8 *name, int len);
++static struct dx_frame *dx_probe (struct inode *dir, u32 hash, struct dx_frame *frame);
++static void dx_release (struct dx_frame *frames);
++static int dx_make_map (ext3_dirent *de, int size, struct dx_map_entry map[]);
++static void dx_sort_map(struct dx_map_entry *map, unsigned count);
++static ext3_dirent *dx_copy_dirents (char *from, char *to,
++     struct dx_map_entry *map, int count);
++static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
++
++
++#ifdef CONFIG_EXT3_INDEX
++/*
++ * Future: use high four bits of block for coalesce-on-delete flags
++ * Mask them off for now.
++ */
++
++static inline unsigned dx_get_block (struct dx_entry *entry)
++{
++      return le32_to_cpu(entry->block.v) & 0x0fffffff;
++}
++
++static inline void dx_set_block (struct dx_entry *entry, unsigned value)
++{
++      entry->block.v = cpu_to_le32(value);
++}
++
++static inline unsigned dx_get_hash (struct dx_entry *entry)
++{
++      return le32_to_cpu(entry->hash.v);
++}
++
++static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
++{
++      entry->hash.v = cpu_to_le32(value);
++}
++
++static inline unsigned dx_get_count (struct dx_entry *entries)
++{
++      return le16_to_cpu(((struct dx_countlimit *) entries)->count.v);
++}
++
++static inline unsigned dx_get_limit (struct dx_entry *entries)
++{
++      return le16_to_cpu(((struct dx_countlimit *) entries)->limit.v);
++}
++
++static inline void dx_set_count (struct dx_entry *entries, unsigned value)
++{
++      ((struct dx_countlimit *) entries)->count.v = cpu_to_le16(value);
++}
++
++static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
++{
++      ((struct dx_countlimit *) entries)->limit.v = cpu_to_le16(value);
++}
++
++static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
++{
++      unsigned entry_space = dir->i_sb->s_blocksize - 24 - infosize;
++      return 0? 20: entry_space / sizeof(struct dx_entry);
++}
++
++static inline unsigned dx_node_limit (struct inode *dir)
++{
++      unsigned entry_space = dir->i_sb->s_blocksize - sizeof(struct fake_dirent);
++      return 0? 22: entry_space / sizeof(struct dx_entry);
++}
++
++/* Hash function - not bad, but still looking for an ideal default */
++
++static unsigned dx_hack_hash (const u8 *name, int len)
++{
++      u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
++      while (len--)
++      {
++              u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
++              if (hash & 0x80000000) hash -= 0x7fffffff;
++              hash1 = hash0;
++              hash0 = hash;
++      }
++      return 80; /* FIXME: for test only */
++      return hash0;
++}
++
++#define dx_hash(s,n) (dx_hack_hash(s,n) << 1)
++
++/*
++ * Debug
++ */
++static void dx_show_index (char * label, struct dx_entry *entries)
++{
++      int i, n = dx_get_count (entries);
++      printk("%s index ", label);
++      for (i = 0; i < n; i++)
++      {
++              printk("%x->%u ", i? dx_get_hash(entries + i): 0, dx_get_block(entries + i));
++      }
++      printk("\n");
++}
++
++struct stats
++{ 
++      unsigned names;
++      unsigned space;
++      unsigned bcount;
++};
++
++static struct stats dx_show_leaf (ext3_dirent *de, int size, int show_names)
++{
++      unsigned names = 0, space = 0;
++      char *base = (char *) de;
++      printk("names: ");
++      while ((char *) de < base + size)
++      {
++              if (de->inode)
++              {
++                      if (show_names)
++                      {
++                              int len = de->name_len;
++                              char *name = de->name;
++                              while (len--) printk("%c", *name++);
++                              printk(":%x.%u ", dx_hash (de->name, de->name_len), ((char *) de - base));
++                      }
++                      space += EXT3_DIR_REC_LEN(de->name_len);
++                      names++;
++              }
++              de = (ext3_dirent *) ((char *) de + le16_to_cpu(de->rec_len));
++      }
++      printk("(%i)\n", names);
++      return (struct stats) { names, space, 1 };
++}
++
++struct stats dx_show_entries (struct inode *dir, struct dx_entry *entries, int levels)
++{
++      unsigned blocksize = dir->i_sb->s_blocksize;
++      unsigned count = dx_get_count (entries), names = 0, space = 0, i;
++      unsigned bcount = 0;
++      struct buffer_head *bh;
++      int err;
++      printk("%i indexed blocks...\n", count);
++      for (i = 0; i < count; i++, entries++)
++      {
++              u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
++              u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
++              struct stats stats;
++              printk("%s%3u:%03u hash %8x/%8x ",levels?"":"   ", i, block, hash, range);
++              if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue;
++              stats = levels?
++                 dx_show_entries (dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
++                 dx_show_leaf ((ext3_dirent *) bh->b_data, blocksize, 0);
++              names += stats.names;
++              space += stats.space;
++              bcount += stats.bcount;
++              brelse (bh);
++      }
++      if (bcount)
++              printk("%snames %u, fullness %u (%u%%)\n", levels?"":"   ",
++                      names, space/bcount,(space/bcount)*100/blocksize);
++      return (struct stats) { names, space, bcount};
++}
++
++static void dx_show_buckets (struct inode *dir)
++{
++      struct buffer_head *bh;
++      struct dx_root *root;
++      int err;
++      if (!(bh = ext3_bread (NULL,dir, 0, 0,&err))) return;
++      root = (struct dx_root *) bh->b_data;
++      dx_show_entries (dir, root->entries, root->info.indirect_levels);
++      brelse (bh);
++}
++
++ssize_t hack_show_dir (struct file * filp, void * dirent, filldir_t filldir)
++{
++      if (is_dx (filp->f_dentry->d_inode) && !filp->f_pos)
++              dx_show_buckets (filp->f_dentry->d_inode);
++      return ext3_readdir(filp,dirent,filldir);
++}
++
++/*
++ * Probe for a directory leaf block to search
++ */
++
++static struct dx_frame *dx_probe (struct inode *dir, u32 hash, struct dx_frame *frame)
++{
++      unsigned count, indirect;
++      struct dx_entry *at, *entries, *p, *q, *m;
++      struct dx_root *root;
++      struct buffer_head *bh;
++      int err;
++      if (!(bh = ext3_bread (NULL,dir, 0, 0,&err)))
++              goto fail;
++      root = (struct dx_root *) bh->b_data;
++      if (root->info.hash_version > 0 || root->info.unused_flags & 1)
++              goto fail;
++      if ((indirect = root->info.indirect_levels) > 1)
++              goto fail;
++      entries = (struct dx_entry *) (((char *) &root->info) + root->info.info_length);
++      assert (dx_get_limit(entries) == dx_root_limit(dir, root->info.info_length));
++      dxtrace (printk("Look up %x", hash));
++      while (1)
++      {
++              count = dx_get_count(entries);
++              assert (count && count <= dx_get_limit(entries));
++              p = entries + 1;
++              q = entries + count - 1;
++              while (p <= q)
++              {
++                      m = p + (q - p)/2;
++                      dxtrace(printk("."));
++                      if (dx_get_hash(m) > hash)
++                              q = m - 1;
++                      else
++                              p = m + 1;
++              }
++
++              if (0) // linear search cross check
++              {
++                      unsigned n = count - 1;
++                      at = entries;
++                      while (n--)
++                      {
++                              dxtrace(printk(","));
++                              if (dx_get_hash(++at) > hash)
++                              {
++                                      at--;
++                                      break;
++                              }
++                      }
++                      assert (at == p - 1);
++              }
++
++              at = p - 1;
++              dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
++              frame->bh = bh;
++              frame->entries = entries;
++              frame->at = at;
++              if (!indirect--) return frame;
++              if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0,&err)))
++                      goto fail2;
++              at = entries = ((struct dx_node *) bh->b_data)->entries;
++              assert (dx_get_limit(entries) == dx_node_limit (dir));
++              frame++;
++      }
++fail2:
++      brelse(frame->bh);
++fail:
++      return NULL;
++}
++
++static void dx_release (struct dx_frame *frames)
++{
++      if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
++              brelse (frames[1].bh);
++      brelse (frames[0].bh);
++}
++
++/*
++ * Directory block splitting, compacting
++ */
++
++static int dx_make_map (ext3_dirent *de, int size, struct dx_map_entry map[])
++{
++      int count = 0;
++      char *base = (char *) de;
++      while ((char *) de < base + size)
++      {
++              map[count].hash = dx_hash (de->name, de->name_len);
++              map[count].offs = (u32) ((char *) de - base);
++              de = (ext3_dirent *) ((char *) de + le16_to_cpu(de->rec_len));
++              count++;
++      }
++      return count;
++}
++
++static void dx_sort_map (struct dx_map_entry *map, unsigned count)
++{
++        struct dx_map_entry *p, *q, *top = map + count - 1;
++        int more;
++        /* Combsort until bubble sort doesn't suck */
++        while (count > 2)
++      {
++                count = count*10/13;
++                if (count - 9 < 2) /* 9, 10 -> 11 */
++                        count = 11;
++                for (p = top, q = p - count; q >= map; p--, q--)
++                        if (p->hash < q->hash)
++                                swap(*p, *q);
++        }
++        /* Garden variety bubble sort */
++        do {
++                more = 0;
++                q = top;
++                while (q-- > map)
++              {
++                        if (q[1].hash >= q[0].hash)
++                              continue;
++                        swap(*(q+1), *q);
++                        more = 1;
++              }
++      } while(more);
++}
++
++static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block)
++{
++      struct dx_entry *entries = frame->entries, *at = frame->at;
++      assert (dx_get_count (entries) < dx_get_limit (entries));
++      memmove (at + 2, at+1, (char *) (entries + dx_get_count(entries)) - (char *) (at));
++      dx_set_hash(at + 1, hash);
++      dx_set_block(at + 1, block);
++      dx_set_count(entries, dx_get_count(entries) + 1);
++}
++#endif
++
+ /*
+  * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
+  *
+@@ -95,6 +529,15 @@
+ }
+ /*
++ * p is at least 6 bytes before the end of page
++ */
++static inline ext3_dirent *ext3_next_entry(ext3_dirent *p)
++{
++      return (ext3_dirent *)((char*)p + le16_to_cpu(p->rec_len));
++}
++
++
++/*
+  *    ext3_find_entry()
+  *
+  * finds an entry in the specified directory with the wanted name. It
+@@ -105,6 +548,8 @@
+  * The returned buffer_head has ->b_count elevated.  The caller is expected
+  * to brelse() it when appropriate.
+  */
++
++      
+ static struct buffer_head * ext3_find_entry (struct dentry *dentry,
+                                       struct ext3_dir_entry_2 ** res_dir)
+ {
+@@ -119,10 +564,76 @@
+       int num = 0;
+       int nblocks, i, err;
+       struct inode *dir = dentry->d_parent->d_inode;
++      int namelen;
++      const u8 *name;
++      unsigned blocksize;
++      ext3_dirent *de, *top;
+       *res_dir = NULL;
+       sb = dir->i_sb;
++      blocksize = sb->s_blocksize;
++      namelen = dentry->d_name.len;
++      name = dentry->d_name.name;
++      if (namelen > EXT3_NAME_LEN)
++              return NULL;
++      if (ext3_dx && is_dx(dir)) {
++              u32 hash = dx_hash (name, namelen);
++              struct dx_frame frames[2], *frame;
++              if (!(frame = dx_probe (dir, hash, frames)))
++                      return NULL;
++dxnext:
++              block = dx_get_block(frame->at);
++              if (!(bh = ext3_bread (NULL,dir, block, 0, &err)))
++                      goto dxfail;
++              de = (struct ext3_dir_entry_2 *) bh->b_data;
++              top = (struct ext3_dir_entry_2 *) ((char *) de + blocksize
++                              - EXT3_DIR_REC_LEN(0));
++              for (; de < top; de = ext3_next_entry(de))
++                      if (ext3_match (namelen, name, de)) {
++                              if (!ext3_check_dir_entry("ext3_find_entry",
++                                        dir, de, bh,
++                                        (block<<EXT3_BLOCK_SIZE_BITS(sb))
++                                         +((char *)de - bh->b_data))) {
++                                      brelse (bh);
++                                      goto dxfail;
++                              }
++                              *res_dir = de;
++                              goto dxfound;
++                      }
++              brelse (bh);
++              /* Same hash continues in next block?  Search on. */
++              if (++(frame->at) == frame->entries + dx_get_count(frame->entries))
++              {
++                      struct buffer_head *bh2;
++                      if (frame == frames)
++                              goto dxfail;
++                      if (++(frames->at) == frames->entries + dx_get_count(frames->entries))
++                              goto dxfail;
++                      /* should omit read if not continued */
++                      if (!(bh2 = ext3_bread (NULL, dir,
++                                              dx_get_block(frames->at),
++                                              0, &err)))
++                              goto dxfail;
++                      brelse (frame->bh);
++                      frame->bh = bh2;
++                      frame->at = frame->entries = ((struct dx_node *) bh2->b_data)->entries;
++                      /* Subtle: the 0th entry has the count, find the hash in frame above */
++                      if ((dx_get_hash(frames->at) & -2) == hash)
++                              goto dxnext;
++                      goto dxfail;
++              }
++              if ((dx_get_hash(frame->at) & -2) == hash)
++                      goto dxnext;
++dxfail:
++              dxtrace(printk("%s not found\n", name));
++              dx_release (frames);
++              return NULL;
++dxfound:
++              dx_release (frames);
++              return bh;
++      }
++      
+       nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
+       start = dir->u.ext3_i.i_dir_start_lookup;
+       if (start >= nblocks)
+@@ -237,6 +748,88 @@
+               de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+ }
++static ext3_dirent *
++dx_copy_dirents (char *from, char *to, struct dx_map_entry *map, int count)
++{
++      unsigned rec_len = 0;
++
++      while (count--) {
++              ext3_dirent *de = (ext3_dirent *) (from + map->offs);
++              rec_len = EXT3_DIR_REC_LEN(de->name_len);
++              memcpy (to, de, rec_len);
++              ((ext3_dirent *) to)->rec_len = rec_len;
++              to += rec_len;
++              map++;
++      }
++      return (ext3_dirent *) (to - rec_len);
++}
++
++#ifdef CONFIG_EXT3_INDEX
++static ext3_dirent *do_split(handle_t *handle, struct inode *dir,
++                      struct buffer_head **bh,struct dx_frame *frame,
++                      u32 hash, int *error)
++{
++      unsigned blocksize = dir->i_sb->s_blocksize;
++      unsigned count, continued;
++      struct buffer_head *bh2;
++      u32 newblock;
++      unsigned MAX_DX_MAP = PAGE_CACHE_SIZE/EXT3_DIR_REC_LEN(1) + 1;
++      u32 hash2;
++      struct dx_map_entry map[MAX_DX_MAP];
++      char *data1 = (*bh)->b_data, *data2, *data3;
++      unsigned split;
++      ext3_dirent *de, *de2;
++
++      bh2 = ext3_append (handle, dir, &newblock, error);
++      if (!(bh2))
++      {
++              brelse(*bh);
++              *bh = NULL;
++              return (ext3_dirent *)bh2;
++      }
++
++      BUFFER_TRACE(*bh, "get_write_access");
++      ext3_journal_get_write_access(handle, *bh);
++      BUFFER_TRACE(frame->bh, "get_write_access");
++      ext3_journal_get_write_access(handle, frame->bh);
++
++      data2 = bh2->b_data;
++
++      count = dx_make_map ((ext3_dirent *) data1, blocksize, map);
++      split = count/2; // need to adjust to actual middle
++      dx_sort_map (map, count);
++      hash2 = map[split].hash;
++      continued = hash2 == map[split - 1].hash;
++      dxtrace(printk("Split block %i at %x, %i/%i\n",
++              dx_get_block(frame->at), hash2, split, count-split));
++
++      /* Fancy dance to stay within two buffers */
++      de2 = dx_copy_dirents (data1, data2, map + split, count - split);
++      data3 = (char *) de2 + de2->rec_len;
++      de = dx_copy_dirents (data1, data3, map, split);
++      memcpy(data1, data3, (char *) de + de->rec_len - data3);
++      de = (ext3_dirent *) ((char *) de - data3 + data1); // relocate de
++      de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
++      de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
++      dxtrace_on(dx_show_leaf ((ext3_dirent *) data1, blocksize, 1));
++      dxtrace_on(dx_show_leaf ((ext3_dirent *) data2, blocksize, 1));
++
++      /* Which block gets the new entry? */
++      if (hash >= hash2)
++      {
++              swap(*bh, bh2);
++              de = de2;
++      }
++      dx_insert_block (frame, hash2 + continued, newblock);
++      ext3_journal_dirty_metadata (handle, bh2);
++      brelse (bh2);
++      ext3_journal_dirty_metadata (handle, frame->bh);
++      dxtrace_on(dx_show_index ("frame", frame->entries));
++      return de;
++}
++#endif
++
++
+ /*
+  *    ext3_add_entry()
+  *
+@@ -251,6 +844,7 @@
+ /*
+  * AKPM: the journalling code here looks wrong on the error paths
+  */
++
+ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
+       struct inode *inode)
+ {
+@@ -258,117 +852,284 @@
+       const char *name = dentry->d_name.name;
+       int namelen = dentry->d_name.len;
+       unsigned long offset;
+-      unsigned short rec_len;
+       struct buffer_head * bh;
+       struct ext3_dir_entry_2 * de, * de1;
+-      struct super_block * sb;
++      struct super_block * sb = dir->i_sb;
+       int     retval;
++      unsigned short reclen = EXT3_DIR_REC_LEN(namelen);
+-      sb = dir->i_sb;
++      unsigned blocksize = sb->s_blocksize;
++      unsigned nlen, rlen;
++      u32 block;
++      char *top;
+       if (!namelen)
+               return -EINVAL;
+-      bh = ext3_bread (handle, dir, 0, 0, &retval);
+-      if (!bh)
+-              return retval;
+-      rec_len = EXT3_DIR_REC_LEN(namelen);
+-      offset = 0;
+-      de = (struct ext3_dir_entry_2 *) bh->b_data;
+-      while (1) {
+-              if ((char *)de >= sb->s_blocksize + bh->b_data) {
+-                      brelse (bh);
+-                      bh = NULL;
+-                      bh = ext3_bread (handle, dir,
+-                              offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval);
+-                      if (!bh)
+-                              return retval;
+-                      if (dir->i_size <= offset) {
+-                              if (dir->i_size == 0) {
+-                                      brelse(bh);
+-                                      return -ENOENT;
++      if (ext3_dx && is_dx(dir))
++      {
++              struct dx_frame frames[2], *frame;
++              struct dx_entry *entries, *at;
++              u32 hash;
++              char *data1;
++
++              hash = dx_hash (name, namelen);
++              frame = dx_probe (dir, hash, frames); // do something if null
++              entries = frame->entries;
++              at = frame->at;
++
++              if (!(bh = ext3_bread (handle,dir, dx_get_block(frame->at), 0,&retval)))
++                      goto dxfail1;
++
++              BUFFER_TRACE(bh, "get_write_access");
++              ext3_journal_get_write_access(handle, bh);
++
++              data1 = bh->b_data;
++              de = (ext3_dirent *) data1;
++              top = data1 + (0? 200: blocksize);
++              while ((char *) de < top)
++              {
++                      /* FIXME: check EEXIST and dir */
++                      nlen = EXT3_DIR_REC_LEN(de->name_len);
++                      rlen = le16_to_cpu(de->rec_len);
++                      if ((de->inode? rlen - nlen: rlen) >= reclen)
++                              goto dx_add;
++                      de = (ext3_dirent *) ((char *) de + rlen);
++              }
++              /* Block full, should compress but for now just split */
++              dxtrace_on(printk("using %u of %u node entries\n",
++                      dx_get_count(entries), dx_get_limit(entries)));
++              /* Need to split index? */
++              if (dx_get_count(entries) == dx_get_limit(entries))
++              {
++                      u32 newblock;
++                      unsigned icount = dx_get_count(entries);
++                      char *idata2;
++                      int levels = frame - frames;
++                      struct dx_entry *entries2;
++                      struct buffer_head *bh2;
++                      if (levels && dx_get_count(frames->entries) == dx_get_limit(frames->entries))
++                              goto dxfull;
++                      bh2 = ext3_append (handle, dir, &newblock, &retval);
++                      if (!(bh2))
++                              goto dxfail2;
++                      idata2 = bh2->b_data;
++                      entries2 = ((struct dx_node *) idata2)->entries;
++                      ((struct dx_node *) idata2)->fake.rec_len = cpu_to_le16(blocksize);
++                      /* fake.inode already 0 */
++                      /* Seems that is not true. We still need to set inode = 0 -Chrisl*/
++                      ((struct dx_node *) idata2)->fake.inode = 0;
++                      BUFFER_TRACE(frame->bh, "get_write_access");
++                      ext3_journal_get_write_access(handle, frame->bh);
++                      if (levels)
++                      {
++                              unsigned icount1 = icount/2, icount2 = icount - icount1;
++                              unsigned hash2 = dx_get_hash(entries + icount1);
++                              dxtrace_on(printk("Split index %i/%i\n", icount1, icount2));
++                              
++                              BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
++                              ext3_journal_get_write_access(handle, frames[0].bh);
++                              
++                              memcpy ((char *) entries2, (char *) (entries + icount1),
++                                      icount2 * sizeof(struct dx_entry));
++                              dx_set_count (entries, icount1);
++                              dx_set_count (entries2, icount2);
++                              dx_set_limit (entries2, dx_node_limit(dir));
++
++                              /* Which index block gets the new entry? */
++                              if (at - entries > icount1) {
++                                      frame->at = at = at - entries - icount1 + entries2;
++                                      frame->entries = entries = entries2;
++                                      swap(frame->bh, bh2);
+                               }
+-
+-                              ext3_debug ("creating next block\n");
+-
+-                              BUFFER_TRACE(bh, "get_write_access");
+-                              ext3_journal_get_write_access(handle, bh);
+-                              de = (struct ext3_dir_entry_2 *) bh->b_data;
+-                              de->inode = 0;
+-                              de->rec_len = le16_to_cpu(sb->s_blocksize);
+-                              dir->u.ext3_i.i_disksize =
+-                                      dir->i_size = offset + sb->s_blocksize;
+-                              dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+-                              ext3_mark_inode_dirty(handle, dir);
++                              dx_insert_block (frames + 0, hash2, newblock);
++                              dxtrace_on(dx_show_index ("node", frames[1].entries));
++                              dxtrace_on(dx_show_index ("node",
++                                      ((struct dx_node *) bh2->b_data)->entries));
++                              ext3_journal_dirty_metadata(handle, bh2);
++                              brelse (bh2);
+                       } else {
+-
+-                              ext3_debug ("skipping to next block\n");
+-
+-                              de = (struct ext3_dir_entry_2 *) bh->b_data;
++                              dxtrace_on(printk("Creating second level index...\n"));
++                              memcpy((char *) entries2, (char *) entries,
++                                      icount * sizeof(struct dx_entry));
++                              dx_set_limit(entries2, dx_node_limit(dir));
++
++                              /* Set up root */
++                              dx_set_count(entries, 1);
++                              dx_set_block(entries + 0, newblock);
++                              ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
++
++                              /* Add new access path frame */
++                              frame = frames + 1;
++                              frame->at = at = at - entries + entries2;
++                              frame->entries = entries = entries2;
++                              frame->bh = bh2;
++                              ext3_journal_get_write_access(handle, frame->bh);
+                       }
++                      ext3_journal_dirty_metadata(handle, frames[0].bh);
+               }
+-              if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh,
+-                                         offset)) {
+-                      brelse (bh);
+-                      return -ENOENT;
+-              }
+-              if (ext3_match (namelen, name, de)) {
++              de = do_split(handle, dir, &bh, frame, hash, &retval);
++              dx_release (frames);
++              if (!(de))
++                      goto fail;
++              nlen = EXT3_DIR_REC_LEN(de->name_len);
++              rlen = le16_to_cpu(de->rec_len);
++              goto add;
++
++dx_add:
++              dx_release (frames);
++              goto add;
++
++dxfull:
++              dxtrace_on(printk("Directory index full!\n"));
++              retval = -ENOSPC;
++dxfail2:
++              brelse(bh);
++dxfail1:
++              dx_release (frames);
++              goto fail1;
++      }
++      block = offset = 0;
++      while (offset < dir->i_size) {
++              bh = ext3_bread (handle, dir, block, 0, &retval);
++              if(!bh)
++                      return retval;  
++              de = (struct ext3_dir_entry_2 *) bh->b_data;
++              top = bh->b_data+blocksize-reclen;
++              while ((char *) de <= top) {
++                      
++                      if (!ext3_check_dir_entry ("ext3_add_entry", dir, de,
++                                                 bh,offset)) {
++                              brelse (bh);
++                              return -ENOENT;
++                      }
++                      if (ext3_match (namelen, name, de)) {
+                               brelse (bh);
+                               return -EEXIST;
+-              }
+-              if ((le32_to_cpu(de->inode) == 0 &&
+-                              le16_to_cpu(de->rec_len) >= rec_len) ||
+-                  (le16_to_cpu(de->rec_len) >=
+-                              EXT3_DIR_REC_LEN(de->name_len) + rec_len)) {
+-                      BUFFER_TRACE(bh, "get_write_access");
+-                      ext3_journal_get_write_access(handle, bh);
+-                      /* By now the buffer is marked for journaling */
+-                      offset += le16_to_cpu(de->rec_len);
+-                      if (le32_to_cpu(de->inode)) {
+-                              de1 = (struct ext3_dir_entry_2 *) ((char *) de +
+-                                      EXT3_DIR_REC_LEN(de->name_len));
+-                              de1->rec_len =
+-                                      cpu_to_le16(le16_to_cpu(de->rec_len) -
+-                                      EXT3_DIR_REC_LEN(de->name_len));
+-                              de->rec_len = cpu_to_le16(
+-                                              EXT3_DIR_REC_LEN(de->name_len));
+-                              de = de1;
+                       }
+-                      de->file_type = EXT3_FT_UNKNOWN;
+-                      if (inode) {
+-                              de->inode = cpu_to_le32(inode->i_ino);
+-                              ext3_set_de_type(dir->i_sb, de, inode->i_mode);
+-                      } else
+-                              de->inode = 0;
+-                      de->name_len = namelen;
+-                      memcpy (de->name, name, namelen);
+-                      /*
+-                       * XXX shouldn't update any times until successful
+-                       * completion of syscall, but too many callers depend
+-                       * on this.
+-                       *
+-                       * XXX similarly, too many callers depend on
+-                       * ext3_new_inode() setting the times, but error
+-                       * recovery deletes the inode, so the worst that can
+-                       * happen is that the times are slightly out of date
+-                       * and/or different from the directory change time.
+-                       */
+-                      dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+-                      dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+-                      ext3_mark_inode_dirty(handle, dir);
+-                      dir->i_version = ++event;
+-                      BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+-                      ext3_journal_dirty_metadata(handle, bh);
++                      nlen = EXT3_DIR_REC_LEN(de->name_len);
++                      rlen = le16_to_cpu(de->rec_len);
++                      if ((de->inode? rlen - nlen: rlen) >= reclen)
++                              goto add;
++                      de = (struct ext3_dir_entry_2 *) ((char *) de + rlen);
++                      offset += rlen;
++              }
++              if (ext3_dx && dir->i_size==blocksize && test_opt(sb, INDEX))
++                      goto dx_make_index;
++              brelse(bh);
++      }
++      bh = ext3_append(handle, dir, &block, &retval);
++      if (!bh)
++              return retval;
++      de = (struct ext3_dir_entry_2 *) bh->b_data;
++      de->inode = 0;
++      de->rec_len = cpu_to_le16(rlen = blocksize);
++      nlen = 0;
++      goto add;
++
++add:
++      BUFFER_TRACE(bh, "get_write_access");
++      ext3_journal_get_write_access(handle, bh);
++      /* By now the buffer is marked for journaling */
++      if (de->inode) {
++              de1 = (struct ext3_dir_entry_2 *) ((char *) de + nlen);
++              de1->rec_len = cpu_to_le16(rlen - nlen);
++              de->rec_len = cpu_to_le16(nlen);
++              de = de1;
++      }
++      de->file_type = EXT3_FT_UNKNOWN;
++      if (inode) {
++              de->inode = cpu_to_le32(inode->i_ino);
++              ext3_set_de_type(dir->i_sb, de, inode->i_mode);
++      } else
++              de->inode = 0;
++      de->name_len = namelen;
++      memcpy (de->name, name, namelen);
++      /*
++       * XXX shouldn't update any times until successful
++       * completion of syscall, but too many callers depend
++       * on this.
++       *
++       * XXX similarly, too many callers depend on
++       * ext3_new_inode() setting the times, but error
++       * recovery deletes the inode, so the worst that can
++       * happen is that the times are slightly out of date
++       * and/or different from the directory change time.
++       */
++      dir->i_mtime = dir->i_ctime = CURRENT_TIME;
++      /* dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; */
++      ext3_mark_inode_dirty(handle, dir);
++      dir->i_version = ++event;
++      BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
++      ext3_journal_dirty_metadata(handle, bh);
++      brelse(bh);
++      return 0;
++
++dx_make_index:
++      {
++              struct buffer_head *bh2;
++              struct dx_root *root;
++              struct dx_frame frames[2], *frame;
++              struct dx_entry *entries;
++              struct ext3_dir_entry_2 *de2;
++              char *data1;
++              unsigned len;
++              u32 hash;
++              
++              dxtrace_on(printk("Creating index\n"));
++              ext3_journal_get_write_access(handle, bh);
++              root = (struct dx_root *) bh->b_data;
++              
++              ext3_add_compat_feature (sb, EXT3_FEATURE_COMPAT_DIR_INDEX);
++              dir->u.ext3_i.i_flags |= EXT3_INDEX_FL;
++              bh2 = ext3_append (handle, dir, &block, &retval);
++              if (!(bh2))
++              {
+                       brelse(bh);
+-                      return 0;
++                      return retval;
+               }
+-              offset += le16_to_cpu(de->rec_len);
+-              de = (struct ext3_dir_entry_2 *)
+-                      ((char *) de + le16_to_cpu(de->rec_len));
++              data1 = bh2->b_data;
++
++              /* The 0th block becomes the root, move the dirents out */
++              de = (ext3_dirent *) &root->info;
++              len = ((char *) root) + blocksize - (char *) de;
++              memcpy (data1, de, len);
++              de = (ext3_dirent *) data1;
++              top = data1 + len;
++              while (((char *) de2=(char*)de+le16_to_cpu(de->rec_len)) < top)
++                      de = de2;
++              de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
++              /* Initialize the root; the dot dirents already exist */
++              de = (ext3_dirent *) (&root->fake2);
++              de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2));
++              memset (&root->info, 0, sizeof(root->info));
++              root->info.info_length = sizeof(root->info);
++              entries = root->entries;
++              dx_set_block (entries, 1);
++              dx_set_count (entries, 1);
++              dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
++
++              /* Initialize as for dx_probe */
++              hash = dx_hash (name, namelen);
++              frame = frames;
++              frame->entries = entries;
++              frame->at = entries;
++              frame->bh = bh;
++              bh = bh2;
++              de = do_split(handle,dir, &bh, frame, hash, &retval);
++              dx_release (frames);
++              if (!(de))
++                      return retval;
++              nlen = EXT3_DIR_REC_LEN(de->name_len);
++              rlen = le16_to_cpu(de->rec_len);
++              goto add;
+       }
+-      brelse (bh);
+-      return -ENOSPC;
++fail1:
++      return retval;
++fail:
++      return -ENOENT;
+ }
++
+ /*
+  * ext3_delete_entry deletes a directory entry by merging it with the
+  * previous entry
+@@ -451,7 +1212,8 @@
+       struct inode * inode;
+       int err;
+-      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
++      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++                                      EXT3_INDEX_EXTRA_TRANS_BLOCKS+3);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+@@ -478,7 +1240,8 @@
+       struct inode *inode;
+       int err;
+-      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
++      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++                                      EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+@@ -507,7 +1270,8 @@
+       if (dir->i_nlink >= EXT3_LINK_MAX)
+               return -EMLINK;
+-      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
++      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++                                      EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+@@ -832,7 +1596,7 @@
+       ext3_mark_inode_dirty(handle, inode);
+       dir->i_nlink--;
+       inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+-      dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++      //      dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+       ext3_mark_inode_dirty(handle, dir);
+ end_rmdir:
+@@ -878,7 +1642,7 @@
+       if (retval)
+               goto end_unlink;
+       dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+-      dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++      //      dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+       ext3_mark_inode_dirty(handle, dir);
+       inode->i_nlink--;
+       if (!inode->i_nlink)
+@@ -904,7 +1668,8 @@
+       if (l > dir->i_sb->s_blocksize)
+               return -ENAMETOOLONG;
+-      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5);
++      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++                                      EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+@@ -959,7 +1724,8 @@
+       if (inode->i_nlink >= EXT3_LINK_MAX)
+               return -EMLINK;
+-      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS);
++      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++                                      EXT3_INDEX_EXTRA_TRANS_BLOCKS);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+@@ -995,7 +1761,8 @@
+       old_bh = new_bh = dir_bh = NULL;
+-      handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2);
++      handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS +
++                                      EXT3_INDEX_EXTRA_TRANS_BLOCKS+ 2);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+@@ -1077,7 +1844,7 @@
+               new_inode->i_ctime = CURRENT_TIME;
+       }
+       old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+-      old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++      // old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+       if (dir_bh) {
+               BUFFER_TRACE(dir_bh, "get_write_access");
+               ext3_journal_get_write_access(handle, dir_bh);
+@@ -1089,7 +1856,7 @@
+                       new_inode->i_nlink--;
+               } else {
+                       new_dir->i_nlink++;
+-                      new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++      //              new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+                       ext3_mark_inode_dirty(handle, new_dir);
+               }
+       }
+--- ./include/linux/ext3_fs.h  2002/03/05 06:18:59     2.1
++++ ./include/linux/ext3_fs.h  2002/03/05 06:26:56
+@@ -339,6 +339,7 @@
+   #define EXT3_MOUNT_WRITEBACK_DATA   0x0C00  /* No data ordering */
+ #define EXT3_MOUNT_UPDATE_JOURNAL     0x1000  /* Update the journal format */
+ #define EXT3_MOUNT_NO_UID32           0x2000  /* Disable 32-bit UIDs */
++#define EXT3_MOUNT_INDEX              0x4000  /* Enable directory index */
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
+@@ -575,6 +576,26 @@
+ #define EXT3_DIR_ROUND                        (EXT3_DIR_PAD - 1)
+ #define EXT3_DIR_REC_LEN(name_len)    (((name_len) + 8 + EXT3_DIR_ROUND) & \
+                                        ~EXT3_DIR_ROUND)
++/*
++ * Hash Tree Directory indexing
++ * (c) Daniel Phillips, 2001
++ */
++
++#define CONFIG_EXT3_INDEX
++
++#ifdef CONFIG_EXT3_INDEX
++  enum {ext3_dx = 1};
++  #define dx_static static
++  #define is_dx(dir) ((dir)->u.ext3_i.i_flags & EXT3_INDEX_FL)
++#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
++#else
++  enum {ext3_dx = 0};
++  #define dx_static
++  #define is_dx(dir) 0
++#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
++#endif
+ #ifdef __KERNEL__
+ /*
+--- ./include/linux/ext3_jbd.h 2002/03/05 06:18:59     2.1
++++ ./include/linux/ext3_jbd.h 2002/03/05 06:33:54
+@@ -63,6 +63,8 @@
+ #define EXT3_RESERVE_TRANS_BLOCKS     12
++#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8
++
+ int
+ ext3_mark_iloc_dirty(handle_t *handle, 
+                    struct inode *inode,