From 9471dd7705002db5c9fcf78e43a25cbe0d6f0f52 Mon Sep 17 00:00:00 2001 From: alex Date: Mon, 1 Sep 2003 14:09:59 +0000 Subject: [PATCH] - parallel directory operations patches have been ported onto 2.4.18-chaos - new chaos-2.4.18-pdirops series have been created NOTE: not for production! I'm still testing this --- .../patches/dynamic-locks-2.4.18-chaos.patch | 212 +++ .../kernel_patches/patches/ext-2.4-patch-5.patch | 15 + .../patches/ext3-2.4.18-ino_sb_macro-2.patch | 1461 ++++++++++++++++ .../patches/ext3-compat-2.4.18-chaos.patch | 19 + .../patches/ext3-delete_thread-2.4.18-2.patch | 478 ++++++ .../patches/ext3-pdirops-2.4.18-chaos.patch | 1213 +++++++++++++ lustre/kernel_patches/patches/iopen-2.4.18-2.patch | 422 +++++ .../patches/linux-2.4.18ea-0.8.26-2.patch | 1775 ++++++++++++++++++++ .../patches/vfs-pdirops-2.4.18-chaos.patch | 265 +++ .../pc/dynamic-locks-2.4.18-chaos.pc | 3 + lustre/kernel_patches/pc/ext-2.4-patch-5.pc | 1 + .../pc/ext3-2.4.18-ino_sb_macro-2.pc | 20 + .../kernel_patches/pc/ext3-compat-2.4.18-chaos.pc | 1 + .../pc/ext3-delete_thread-2.4.18-2.pc | 6 + .../kernel_patches/pc/ext3-pdirops-2.4.18-chaos.pc | 6 + lustre/kernel_patches/pc/iopen-2.4.18-2.pc | 8 + .../kernel_patches/pc/linux-2.4.18ea-0.8.26-2.pc | 11 + .../kernel_patches/pc/vfs-pdirops-2.4.18-chaos.pc | 3 + lustre/kernel_patches/series/chaos-2.4.18-pdirops | 34 + 19 files changed, 5953 insertions(+) create mode 100644 lustre/kernel_patches/patches/dynamic-locks-2.4.18-chaos.patch create mode 100644 lustre/kernel_patches/patches/ext-2.4-patch-5.patch create mode 100644 lustre/kernel_patches/patches/ext3-2.4.18-ino_sb_macro-2.patch create mode 100644 lustre/kernel_patches/patches/ext3-compat-2.4.18-chaos.patch create mode 100644 lustre/kernel_patches/patches/ext3-delete_thread-2.4.18-2.patch create mode 100644 lustre/kernel_patches/patches/ext3-pdirops-2.4.18-chaos.patch create mode 100644 lustre/kernel_patches/patches/iopen-2.4.18-2.patch create mode 100644 
lustre/kernel_patches/patches/linux-2.4.18ea-0.8.26-2.patch create mode 100644 lustre/kernel_patches/patches/vfs-pdirops-2.4.18-chaos.patch create mode 100644 lustre/kernel_patches/pc/dynamic-locks-2.4.18-chaos.pc create mode 100644 lustre/kernel_patches/pc/ext-2.4-patch-5.pc create mode 100644 lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro-2.pc create mode 100644 lustre/kernel_patches/pc/ext3-compat-2.4.18-chaos.pc create mode 100644 lustre/kernel_patches/pc/ext3-delete_thread-2.4.18-2.pc create mode 100644 lustre/kernel_patches/pc/ext3-pdirops-2.4.18-chaos.pc create mode 100644 lustre/kernel_patches/pc/iopen-2.4.18-2.pc create mode 100644 lustre/kernel_patches/pc/linux-2.4.18ea-0.8.26-2.pc create mode 100644 lustre/kernel_patches/pc/vfs-pdirops-2.4.18-chaos.pc create mode 100644 lustre/kernel_patches/series/chaos-2.4.18-pdirops diff --git a/lustre/kernel_patches/patches/dynamic-locks-2.4.18-chaos.patch b/lustre/kernel_patches/patches/dynamic-locks-2.4.18-chaos.patch new file mode 100644 index 0000000..a1cef3e --- /dev/null +++ b/lustre/kernel_patches/patches/dynamic-locks-2.4.18-chaos.patch @@ -0,0 +1,212 @@ + include/linux/dynlocks.h | 33 ++++++++++ + lib/Makefile | 4 - + lib/dynlocks.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 187 insertions(+), 2 deletions(-) + +--- /dev/null 2003-01-30 13:24:37.000000000 +0300 ++++ linux-2.4.18-alexey/include/linux/dynlocks.h 2003-09-01 16:33:25.000000000 +0400 +@@ -0,0 +1,33 @@ ++#ifndef _LINUX_DYNLOCKS_H ++#define _LINUX_DYNLOCKS_H ++ ++#include <linux/list.h> ++#include <linux/wait.h> ++ ++struct dynlock_member { ++ struct list_head dl_list; /* linkage in the lockspace list */ ++ unsigned long dl_value; /* lock value */ ++ int dl_refcount; /* number of users */ ++ int dl_readers; /* shared holders */ ++ int dl_writers; /* exclusive holds; may nest for one pid */ ++ int dl_pid; /* holder of the lock */ ++ wait_queue_head_t dl_wait; /* sleepers waiting for this lock */ ++}; ++ ++/* ++ * lock's namespace: ++ * - list of locks ++ * - lock to protect this list ++ */ ++struct dynlock { ++ struct list_head dl_list; ++ spinlock_t dl_list_lock; ++}; ++ ++void 
dynlock_init(struct dynlock *dl); ++void *dynlock_lock(struct dynlock *dl, unsigned long value, int rw, int gfp); ++void dynlock_unlock(struct dynlock *dl, void *lock); ++ ++ ++#endif ++ +--- /dev/null 2003-01-30 13:24:37.000000000 +0300 ++++ linux-2.4.18-alexey/lib/dynlocks.c 2003-09-01 16:36:00.000000000 +0400 +@@ -0,0 +1,152 @@ ++/* ++ * Dynamic Locks ++ * ++ * struct dynlock is lockspace ++ * one may request lock (exclusive or shared) for some value ++ * in that lockspace ++ * ++ */ ++ ++#include <linux/dynlocks.h> ++#include <linux/module.h> ++#include <linux/slab.h> ++#include <linux/sched.h> ++ ++/* ++ * dynlock_init ++ * ++ * initialize lockspace: the list lock and an empty list of locks ++ * ++ */ ++void dynlock_init(struct dynlock *dl) ++{ ++ spin_lock_init(&dl->dl_list_lock); ++ INIT_LIST_HEAD(&dl->dl_list); ++} ++ ++/* ++ * dynlock_lock ++ * ++ * acquires lock (exclusive if rw != 0, shared otherwise) in specified lockspace ++ * each lock in lockspace is allocated separately, so user has ++ * to specify GFP flags. ++ * routine returns pointer to lock (NULL if allocation fails). this pointer ++ * is intended to be passed to dynlock_unlock ++ * ++ */ ++void *dynlock_lock(struct dynlock *dl, unsigned long value, int rw, int gfp) ++{ ++ struct dynlock_member *nhl = NULL; ++ struct dynlock_member *hl; ++ struct list_head *cur; ++ ++repeat: ++ /* find requested lock in lockspace */ ++ spin_lock(&dl->dl_list_lock); ++ list_for_each(cur, &dl->dl_list) { ++ hl = list_entry(cur, struct dynlock_member, dl_list); ++ if (hl->dl_value == value) { ++ /* lock is found */ ++ if (nhl) { ++ /* someone else just inserted the ++ * lock we didn't find and then allocated, ++ * so we drop our copy ++ */ ++ kfree(nhl); ++ nhl = NULL; ++ } ++ hl->dl_refcount++; ++ goto found; ++ } ++ } ++ /* lock not found */ ++ if (nhl) { ++ /* we have already allocated a lock. use it */ ++ hl = nhl; ++ nhl = NULL; ++ list_add(&hl->dl_list, &dl->dl_list); ++ goto found; ++ } ++ spin_unlock(&dl->dl_list_lock); ++ ++ /* lock not found and we haven't allocated lock yet. 
allocate it */ ++ nhl = kmalloc(sizeof(struct dynlock_member), gfp); ++ if (nhl == NULL) ++ return NULL; ++ nhl->dl_refcount = 1; ++ nhl->dl_value = value; ++ nhl->dl_readers = 0; ++ nhl->dl_writers = 0; ++ init_waitqueue_head(&nhl->dl_wait); ++ ++ /* while the lock was being allocated, someone else may have allocated it ++ * and put it onto the list. re-check for this situation ++ */ ++ goto repeat; ++ ++found: ++ if (rw) { ++ /* exclusive lock: user doesn't want to share the lock at all ++ * NOTE: one process may take the same lock several times ++ * this functionality is useful for rename operations */ ++ while ((hl->dl_writers && hl->dl_pid != current->pid) || ++ hl->dl_readers) { ++ spin_unlock(&dl->dl_list_lock); ++ wait_event(hl->dl_wait, ++ hl->dl_writers == 0 && hl->dl_readers == 0); ++ spin_lock(&dl->dl_list_lock); ++ } ++ hl->dl_writers++; ++ } else { ++ /* shared lock: user does not want to share the lock with a writer */ ++ while (hl->dl_writers) { ++ spin_unlock(&dl->dl_list_lock); ++ wait_event(hl->dl_wait, hl->dl_writers == 0); ++ spin_lock(&dl->dl_list_lock); ++ } ++ hl->dl_readers++; ++ } ++ hl->dl_pid = current->pid; ++ spin_unlock(&dl->dl_list_lock); ++ ++ return hl; ++} ++ ++ ++/* ++ * dynlock_unlock ++ * ++ * user has to specify lockspace (dl) and pointer to lock structure ++ * returned by dynlock_lock(). the structure is unlinked and freed when ++ * its last reference is dropped, so it must not be used afterwards ++ */ ++void dynlock_unlock(struct dynlock *dl, void *lock) ++{ ++ struct dynlock_member *hl = lock; ++ int wakeup = 0; ++ ++ spin_lock(&dl->dl_list_lock); ++ if (hl->dl_writers) { ++ hl->dl_writers--; ++ if (hl->dl_writers == 0) ++ wakeup = 1; ++ } else { ++ hl->dl_readers--; ++ if (hl->dl_readers == 0) ++ wakeup = 1; ++ } ++ if (wakeup) { ++ hl->dl_pid = 0; ++ wake_up(&hl->dl_wait); ++ } ++ if (--(hl->dl_refcount) == 0) { /* last reference is gone */ ++ list_del(&hl->dl_list); ++ kfree(hl); /* under dl_list_lock: hl is not touched after unlock */ ++ } ++ spin_unlock(&dl->dl_list_lock); ++} ++ ++EXPORT_SYMBOL(dynlock_init); ++EXPORT_SYMBOL(dynlock_lock); ++EXPORT_SYMBOL(dynlock_unlock); ++ +--- 
linux-2.4.18/lib/Makefile~dynamic-locks-2.4.18-chaos 2003-08-29 11:57:40.000000000 +0400 ++++ linux-2.4.18-alexey/lib/Makefile 2003-09-01 16:35:23.000000000 +0400 +@@ -8,9 +8,9 @@ + + L_TARGET := lib.a + +-export-objs := cmdline.o dec_and_lock.o rwsem-spinlock.o rwsem.o rbtree.o ++export-objs := cmdline.o dec_and_lock.o rwsem-spinlock.o rwsem.o rbtree.o dynlocks.o + +-obj-y := errno.o ctype.o string.o vsprintf.o brlock.o cmdline.o bust_spinlocks.o rbtree.o ++obj-y := errno.o ctype.o string.o vsprintf.o brlock.o cmdline.o bust_spinlocks.o rbtree.o dynlocks.o + + obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o + obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o + +_ diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-5.patch b/lustre/kernel_patches/patches/ext-2.4-patch-5.patch new file mode 100644 index 0000000..a65f6ed --- /dev/null +++ b/lustre/kernel_patches/patches/ext-2.4-patch-5.patch @@ -0,0 +1,15 @@ + include/linux/ext3_fs.h | 1 + + 1 files changed, 1 insertion(+) + +--- linux-2.4.18/include/linux/ext3_fs.h~ext-2.4-patch-5 2003-08-29 16:53:18.000000000 +0400 ++++ linux-2.4.18-alexey/include/linux/ext3_fs.h 2003-09-01 11:50:37.000000000 +0400 +@@ -344,6 +344,7 @@ struct ext3_inode { + #define EXT3_MOUNT_WRITEBACK_DATA 0x0C00 /* No data ordering */ + #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ + #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ ++#define EXT3_MOUNT_INDEX 0x4000 /* Enable directory index */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H + +_ diff --git a/lustre/kernel_patches/patches/ext3-2.4.18-ino_sb_macro-2.patch b/lustre/kernel_patches/patches/ext3-2.4.18-ino_sb_macro-2.patch new file mode 100644 index 0000000..8343e54 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-2.4.18-ino_sb_macro-2.patch @@ -0,0 +1,1461 @@ +--- ./fs/ext3/balloc.c.orig Fri Apr 12 10:27:49 2002 ++++ ./fs/ext3/balloc.c Tue May 7 15:35:59 2002 +@@ 
-46,18 +46,18 @@ struct ext3_group_desc * ext3_get_group_ + unsigned long desc; + struct ext3_group_desc * gdp; + +- if (block_group >= sb->u.ext3_sb.s_groups_count) { ++ if (block_group >= EXT3_SB(sb)->s_groups_count) { + ext3_error (sb, "ext3_get_group_desc", + "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", +- block_group, sb->u.ext3_sb.s_groups_count); ++ block_group, EXT3_SB(sb)->s_groups_count); + + return NULL; + } + + group_desc = block_group / EXT3_DESC_PER_BLOCK(sb); + desc = block_group % EXT3_DESC_PER_BLOCK(sb); +- if (!sb->u.ext3_sb.s_group_desc[group_desc]) { ++ if (!EXT3_SB(sb)->s_group_desc[group_desc]) { + ext3_error (sb, "ext3_get_group_desc", + "Group descriptor not loaded - " + "block_group = %d, group_desc = %lu, desc = %lu", +@@ -66,9 +66,9 @@ struct ext3_group_desc * ext3_get_group_ + } + + gdp = (struct ext3_group_desc *) +- sb->u.ext3_sb.s_group_desc[group_desc]->b_data; ++ EXT3_SB(sb)->s_group_desc[group_desc]->b_data; + if (bh) +- *bh = sb->u.ext3_sb.s_group_desc[group_desc]; ++ *bh = EXT3_SB(sb)->s_group_desc[group_desc]; + return gdp + desc; + } + +@@ -104,8 +104,8 @@ static int read_block_bitmap (struct sup + * this group. The IO will be retried next time. 
+ */ + error_out: +- sb->u.ext3_sb.s_block_bitmap_number[bitmap_nr] = block_group; +- sb->u.ext3_sb.s_block_bitmap[bitmap_nr] = bh; ++ EXT3_SB(sb)->s_block_bitmap_number[bitmap_nr] = block_group; ++ EXT3_SB(sb)->s_block_bitmap[bitmap_nr] = bh; + return retval; + } + +@@ -128,16 +128,17 @@ static int __load_block_bitmap (struct s + int i, j, retval = 0; + unsigned long block_bitmap_number; + struct buffer_head * block_bitmap; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); + +- if (block_group >= sb->u.ext3_sb.s_groups_count) ++ if (block_group >= sbi->s_groups_count) + ext3_panic (sb, "load_block_bitmap", + "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", +- block_group, sb->u.ext3_sb.s_groups_count); ++ block_group, EXT3_SB(sb)->s_groups_count); + +- if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED) { +- if (sb->u.ext3_sb.s_block_bitmap[block_group]) { +- if (sb->u.ext3_sb.s_block_bitmap_number[block_group] == ++ if (sbi->s_groups_count <= EXT3_MAX_GROUP_LOADED) { ++ if (sbi->s_block_bitmap[block_group]) { ++ if (sbi->s_block_bitmap_number[block_group] == + block_group) + return block_group; + ext3_error (sb, "__load_block_bitmap", +@@ -149,21 +150,20 @@ static int __load_block_bitmap (struct s + return block_group; + } + +- for (i = 0; i < sb->u.ext3_sb.s_loaded_block_bitmaps && +- sb->u.ext3_sb.s_block_bitmap_number[i] != block_group; i++) ++ for (i = 0; i < sbi->s_loaded_block_bitmaps && ++ sbi->s_block_bitmap_number[i] != block_group; i++) + ; +- if (i < sb->u.ext3_sb.s_loaded_block_bitmaps && +- sb->u.ext3_sb.s_block_bitmap_number[i] == block_group) { +- block_bitmap_number = sb->u.ext3_sb.s_block_bitmap_number[i]; +- block_bitmap = sb->u.ext3_sb.s_block_bitmap[i]; ++ if (i < sbi->s_loaded_block_bitmaps && ++ sbi->s_block_bitmap_number[i] == block_group) { ++ block_bitmap_number = sbi->s_block_bitmap_number[i]; ++ block_bitmap = sbi->s_block_bitmap[i]; + for (j = i; j > 0; j--) { +- sb->u.ext3_sb.s_block_bitmap_number[j] = +- 
sb->u.ext3_sb.s_block_bitmap_number[j - 1]; +- sb->u.ext3_sb.s_block_bitmap[j] = +- sb->u.ext3_sb.s_block_bitmap[j - 1]; ++ sbi->s_block_bitmap_number[j] = ++ sbi->s_block_bitmap_number[j - 1]; ++ sbi->s_block_bitmap[j] = sbi->s_block_bitmap[j - 1]; + } +- sb->u.ext3_sb.s_block_bitmap_number[0] = block_bitmap_number; +- sb->u.ext3_sb.s_block_bitmap[0] = block_bitmap; ++ sbi->s_block_bitmap_number[0] = block_bitmap_number; ++ sbi->s_block_bitmap[0] = block_bitmap; + + /* + * There's still one special case here --- if block_bitmap == 0 +@@ -173,17 +173,14 @@ static int __load_block_bitmap (struct s + if (!block_bitmap) + retval = read_block_bitmap (sb, block_group, 0); + } else { +- if (sb->u.ext3_sb.s_loaded_block_bitmaps<EXT3_MAX_GROUP_LOADED) +- sb->u.ext3_sb.s_loaded_block_bitmaps++; ++ if (sbi->s_loaded_block_bitmaps<EXT3_MAX_GROUP_LOADED) ++ sbi->s_loaded_block_bitmaps++; + else +- brelse (sb->u.ext3_sb.s_block_bitmap +- [EXT3_MAX_GROUP_LOADED - 1]); +- for (j = sb->u.ext3_sb.s_loaded_block_bitmaps - 1; +- j > 0; j--) { +- sb->u.ext3_sb.s_block_bitmap_number[j] = +- sb->u.ext3_sb.s_block_bitmap_number[j - 1]; +- sb->u.ext3_sb.s_block_bitmap[j] = +- sb->u.ext3_sb.s_block_bitmap[j - 1]; ++ brelse(sbi->s_block_bitmap[EXT3_MAX_GROUP_LOADED - 1]); ++ for (j = sbi->s_loaded_block_bitmaps - 1; j > 0; j--) { ++ sbi->s_block_bitmap_number[j] = ++ sbi->s_block_bitmap_number[j - 1]; ++ sbi->s_block_bitmap[j] = sbi->s_block_bitmap[j - 1]; + } + retval = read_block_bitmap (sb, block_group, 0); + } +@@ -206,24 +203,25 @@ static int __load_block_bitmap (struct s + static inline int load_block_bitmap (struct super_block * sb, + unsigned int block_group) + { ++ struct ext3_sb_info *sbi = EXT3_SB(sb); + int slot; +- ++ + /* + * Do the lookup for the slot. First of all, check if we're asking + * for the same slot as last time, and did we succeed that last time? 
+ */ +- if (sb->u.ext3_sb.s_loaded_block_bitmaps > 0 && +- sb->u.ext3_sb.s_block_bitmap_number[0] == block_group && +- sb->u.ext3_sb.s_block_bitmap[0]) { ++ if (sbi->s_loaded_block_bitmaps > 0 && ++ sbi->s_block_bitmap_number[0] == block_group && ++ sbi->s_block_bitmap[0]) { + return 0; + } + /* + * Or can we do a fast lookup based on a loaded group on a filesystem + * small enough to be mapped directly into the superblock? + */ +- else if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED && +- sb->u.ext3_sb.s_block_bitmap_number[block_group]==block_group +- && sb->u.ext3_sb.s_block_bitmap[block_group]) { ++ else if (sbi->s_groups_count <= EXT3_MAX_GROUP_LOADED && ++ sbi->s_block_bitmap_number[block_group] == block_group ++ && sbi->s_block_bitmap[block_group]) { + slot = block_group; + } + /* +@@ -243,7 +241,7 @@ static inline int load_block_bitmap (str + * If it's a valid slot, we may still have cached a previous IO error, + * in which case the bh in the superblock cache will be zero. 
+ */ +- if (!sb->u.ext3_sb.s_block_bitmap[slot]) ++ if (!sbi->s_block_bitmap[slot]) + return -EIO; + + /* +@@ -275,7 +273,7 @@ void ext3_free_blocks (handle_t *handle, + return; + } + lock_super (sb); +- es = sb->u.ext3_sb.s_es; ++ es = EXT3_SB(sb)->s_es; + if (block < le32_to_cpu(es->s_first_data_block) || + block + count < block || + (block + count) > le32_to_cpu(es->s_blocks_count)) { +@@ -304,7 +302,7 @@ do_more: + if (bitmap_nr < 0) + goto error_return; + +- bitmap_bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr]; ++ bitmap_bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr]; + gdp = ext3_get_group_desc (sb, block_group, &gd_bh); + if (!gdp) + goto error_return; +@@ -330,8 +328,8 @@ do_more: + if (err) + goto error_return; + +- BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); +- err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); + if (err) + goto error_return; + +@@ -341,7 +339,7 @@ + if (block == le32_to_cpu(gdp->bg_block_bitmap) || + block == le32_to_cpu(gdp->bg_inode_bitmap) || + in_range(block, le32_to_cpu(gdp->bg_inode_table), +- sb->u.ext2_sb.s_itb_per_group)) { ++ EXT3_SB(sb)->s_itb_per_group)) { + ext3_error(sb, __FUNCTION__, + "Freeing block in system zone - block = %lu", + block); +@@ -410,8 +407,8 @@ do_more: + if (!err) err = ret; + + /* And the superblock */ +- BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "dirtied superblock"); +- ret = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "dirtied superblock"); ++ ret = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + if (!err) err = ret; + + if (overflow && !err) { +@@ -564,12 +560,12 @@ int ext3_new_block (handle_t *handle, st + } + + lock_super (sb); +- es = sb->u.ext3_sb.s_es; ++ es = EXT3_SB(sb)->s_es; + if (le32_to_cpu(es->s_free_blocks_count) <= + le32_to_cpu(es->s_r_blocks_count) && +- ((sb->u.ext3_sb.s_resuid != 
current->fsuid) && +- (sb->u.ext3_sb.s_resgid == 0 || +- !in_group_p (sb->u.ext3_sb.s_resgid)) && ++ ((EXT3_SB(sb)->s_resuid != current->fsuid) && ++ (EXT3_SB(sb)->s_resgid == 0 || ++ !in_group_p (EXT3_SB(sb)->s_resgid)) && + !capable(CAP_SYS_RESOURCE))) + goto out; + +@@ -598,7 +595,7 @@ int ext3_new_block (handle_t *handle, st + if (bitmap_nr < 0) + goto io_error; + +- bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr]; ++ bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr]; + + ext3_debug ("goal is at %d:%d.\n", i, j); + +@@ -621,9 +618,9 @@ int ext3_new_block (handle_t *handle, st + * Now search the rest of the groups. We assume that + * i and gdp correctly point to the last group visited. + */ +- for (k = 0; k < sb->u.ext3_sb.s_groups_count; k++) { ++ for (k = 0; k < EXT3_SB(sb)->s_groups_count; k++) { + i++; +- if (i >= sb->u.ext3_sb.s_groups_count) ++ if (i >= EXT3_SB(sb)->s_groups_count) + i = 0; + gdp = ext3_get_group_desc (sb, i, &bh2); + if (!gdp) { +@@ -635,7 +632,7 @@ int ext3_new_block (handle_t *handle, st + if (bitmap_nr < 0) + goto io_error; + +- bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr]; ++ bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr]; + j = find_next_usable_block(-1, bh, + EXT3_BLOCKS_PER_GROUP(sb)); + if (j >= 0) +@@ -674,8 +671,8 @@ got_block: + fatal = ext3_journal_get_write_access(handle, bh2); + if (fatal) goto out; + +- BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); +- fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access"); ++ fatal = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); + if (fatal) goto out; + + tmp = j + i * EXT3_BLOCKS_PER_GROUP(sb) +@@ -796,7 +804,7 @@ got_block: + if (!fatal) fatal = err; + + BUFFER_TRACE(bh, "journal_dirty_metadata for superblock"); +- err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); ++ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + if (!fatal) fatal = err; + + sb->s_dirt = 1; +@@ -829,11 +837,11 @@ 
unsigned long ext3_count_free_blocks (st + int i; + + lock_super (sb); +- es = sb->u.ext3_sb.s_es; ++ es = EXT3_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; +- for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; +@@ -842,7 +850,7 @@ unsigned long ext3_count_free_blocks (st + if (bitmap_nr < 0) + continue; + +- x = ext3_count_free (sb->u.ext3_sb.s_block_bitmap[bitmap_nr], ++ x = ext3_count_free (EXT3_SB(sb)->s_block_bitmap[bitmap_nr], + sb->s_blocksize); + printk ("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(gdp->bg_free_blocks_count), x); +@@ -853,7 +861,7 @@ unsigned long ext3_count_free_blocks (st + unlock_super (sb); + return bitmap_count; + #else +- return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_blocks_count); ++ return le32_to_cpu(EXT3_SB(sb)->s_es->s_free_blocks_count); + #endif + } + +@@ -862,7 +870,7 @@ static inline int block_in_use (unsigned + unsigned char * map) + { + return ext3_test_bit ((block - +- le32_to_cpu(sb->u.ext3_sb.s_es->s_first_data_block)) % ++ le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb), map); + } + +@@ -930,11 +938,11 @@ void ext3_check_blocks_bitmap (struct su + struct ext3_group_desc * gdp; + int i; + +- es = sb->u.ext3_sb.s_es; ++ es = EXT3_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; +- for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; +@@ -968,7 +976,7 @@ void ext3_check_blocks_bitmap (struct su + "Inode bitmap for group %d is marked free", + i); + +- for (j = 0; j < sb->u.ext3_sb.s_itb_per_group; j++) ++ for (j = 0; j < EXT3_SB(sb)->s_itb_per_group; j++) + if (!block_in_use (le32_to_cpu(gdp->bg_inode_table) + j, + sb, bh->b_data)) + ext3_error (sb, "ext3_check_blocks_bitmap", +--- 
./fs/ext3/dir.c.orig Fri Apr 12 10:27:49 2002 ++++ ./fs/ext3/dir.c Tue May 7 14:54:13 2002 +@@ -52,7 +52,7 @@ int ext3_check_dir_entry (const char * f + else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) + error_msg = "directory entry across blocks"; + else if (le32_to_cpu(de->inode) > +- le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count)) ++ le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) + error_msg = "inode out of bounds"; + + if (error_msg != NULL) +--- ./fs/ext3/ialloc.c.orig Fri Apr 12 10:27:49 2002 ++++ ./fs/ext3/ialloc.c Tue May 7 15:39:26 2002 +@@ -73,8 +73,8 @@ static int read_inode_bitmap (struct sup + * this group. The IO will be retried next time. + */ + error_out: +- sb->u.ext3_sb.s_inode_bitmap_number[bitmap_nr] = block_group; +- sb->u.ext3_sb.s_inode_bitmap[bitmap_nr] = bh; ++ EXT3_SB(sb)->s_inode_bitmap_number[bitmap_nr] = block_group; ++ EXT3_SB(sb)->s_inode_bitmap[bitmap_nr] = bh; + return retval; + } + +@@ -225,7 +225,7 @@ void ext3_free_inode (handle_t *handle, + clear_inode (inode); + + lock_super (sb); +- es = sb->u.ext3_sb.s_es; ++ es = EXT3_SB(sb)->s_es; + if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext3_error (sb, "ext3_free_inode", + "reserved or nonexistent inode %lu", ino); +@@ -237,7 +237,7 @@ void ext3_free_inode (handle_t *handle, + if (bitmap_nr < 0) + goto error_return; + +- bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr]; ++ bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr]; + + BUFFER_TRACE(bh, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, bh); +@@ -255,8 +255,8 @@ void ext3_free_inode (handle_t *handle, + fatal = ext3_journal_get_write_access(handle, bh2); + if (fatal) goto error_return; + +- BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get write access"); +- fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get write access"); ++ fatal = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); + if (fatal) 
goto error_return; + + if (gdp) { +@@ -271,9 +271,9 @@ void ext3_free_inode (handle_t *handle, + if (!fatal) fatal = err; + es->s_free_inodes_count = + cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1); +- BUFFER_TRACE(sb->u.ext3_sb.s_sbh, ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, + "call ext3_journal_dirty_metadata"); +- err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); ++ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + if (!fatal) fatal = err; + } + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); +@@ -305,6 +305,8 @@ struct inode * ext3_new_inode (handle_t + int i, j, avefreei; + struct inode * inode; + int bitmap_nr; ++ struct ext3_inode_info *ei; ++ struct ext3_sb_info *sbi; + struct ext3_group_desc * gdp; + struct ext3_group_desc * tmp; + struct ext3_super_block * es; +@@ -318,7 +320,9 @@ struct inode * ext3_new_inode (handle_t + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); +- init_rwsem(&inode->u.ext3_i.truncate_sem); ++ sbi = EXT3_SB(sb); ++ ei = EXT3_I(inode); ++ init_rwsem(&ei->truncate_sem); + + lock_super (sb); + es = sb->u.ext3_sb.s_es; +@@ -328,9 +332,9 @@ struct inode * ext3_new_inode (handle_t + + if (S_ISDIR(mode)) { + avefreei = le32_to_cpu(es->s_free_inodes_count) / +- sb->u.ext3_sb.s_groups_count; ++ sbi->s_groups_count; + if (!gdp) { +- for (j = 0; j < sb->u.ext3_sb.s_groups_count; j++) { ++ for (j = 0; j < sbi->s_groups_count; j++) { + struct buffer_head *temp_buffer; + tmp = ext3_get_group_desc (sb, j, &temp_buffer); + if (tmp && +@@ -350,7 +354,7 @@ repeat: + /* + * Try to place the inode in its parent directory + */ +- i = dir->u.ext3_i.i_block_group; ++ i = EXT3_I(dir)->i_block_group; + tmp = ext3_get_group_desc (sb, i, &bh2); + if (tmp && le16_to_cpu(tmp->bg_free_inodes_count)) + gdp = tmp; +@@ -360,10 +364,10 @@ repeat: + * Use a quadratic hash to find a group with a + * free inode + */ +- for (j = 1; j < sb->u.ext3_sb.s_groups_count; j <<= 1) { ++ for (j = 1; j < sbi->s_groups_count; 
j <<= 1) { + i += j; +- if (i >= sb->u.ext3_sb.s_groups_count) +- i -= sb->u.ext3_sb.s_groups_count; ++ if (i >= sbi->s_groups_count) ++ i -= sbi->s_groups_count; + tmp = ext3_get_group_desc (sb, i, &bh2); + if (tmp && + le16_to_cpu(tmp->bg_free_inodes_count)) { +@@ -376,9 +380,9 @@ repeat: + /* + * That failed: try linear search for a free inode + */ +- i = dir->u.ext3_i.i_block_group + 1; +- for (j = 2; j < sb->u.ext3_sb.s_groups_count; j++) { +- if (++i >= sb->u.ext3_sb.s_groups_count) ++ i = EXT3_I(dir)->i_block_group + 1; ++ for (j = 2; j < sbi->s_groups_count; j++) { ++ if (++i >= sbi->s_groups_count) + i = 0; + tmp = ext3_get_group_desc (sb, i, &bh2); + if (tmp && +@@ -399,11 +403,11 @@ repeat: + if (bitmap_nr < 0) + goto fail; + +- bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr]; ++ bh = sbi->s_inode_bitmap[bitmap_nr]; + + if ((j = ext3_find_first_zero_bit ((unsigned long *) bh->b_data, +- EXT3_INODES_PER_GROUP(sb))) < +- EXT3_INODES_PER_GROUP(sb)) { ++ sbi->s_inodes_per_group)) < ++ sbi->s_inodes_per_group) { + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) goto fail; +@@ -457,13 +461,13 @@ repeat: + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) goto fail; + +- BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); +- err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); ++ BUFFER_TRACE(sbi->s_sbh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, sbi->s_sbh); + if (err) goto fail; + es->s_free_inodes_count = + cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1); +- BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "call ext3_journal_dirty_metadata"); +- err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); ++ BUFFER_TRACE(sbi->s_sbh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); + sb->s_dirt = 1; + if (err) goto fail; + +@@ -483,31 +487,31 @@ repeat: + inode->i_blksize = PAGE_SIZE; + inode->i_blocks = 0; + 
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; +- inode->u.ext3_i.i_flags = dir->u.ext3_i.i_flags & ~EXT3_INDEX_FL; ++ ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; + if (S_ISLNK(mode)) +- inode->u.ext3_i.i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); ++ ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); + #ifdef EXT3_FRAGMENTS +- inode->u.ext3_i.i_faddr = 0; +- inode->u.ext3_i.i_frag_no = 0; +- inode->u.ext3_i.i_frag_size = 0; ++ ei->i_faddr = 0; ++ ei->i_frag_no = 0; ++ ei->i_frag_size = 0; + #endif +- inode->u.ext3_i.i_file_acl = 0; +- inode->u.ext3_i.i_dir_acl = 0; +- inode->u.ext3_i.i_dtime = 0; +- INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); ++ ei->i_file_acl = 0; ++ ei->i_dir_acl = 0; ++ ei->i_dtime = 0; ++ INIT_LIST_HEAD(&ei->i_orphan); + #ifdef EXT3_PREALLOCATE +- inode->u.ext3_i.i_prealloc_count = 0; ++ ei->i_prealloc_count = 0; + #endif +- inode->u.ext3_i.i_block_group = i; ++ ei->i_block_group = i; + +- if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) ++ if (ei->i_flags & EXT3_SYNC_FL) + inode->i_flags |= S_SYNC; + if (IS_SYNC(inode)) + handle->h_sync = 1; + insert_inode_hash(inode); +- inode->i_generation = sb->u.ext3_sb.s_next_generation++; ++ inode->i_generation = sbi->s_next_generation++; + +- inode->u.ext3_i.i_state = EXT3_STATE_NEW; ++ ei->i_state = EXT3_STATE_NEW; + err = ext3_mark_inode_dirty(handle, inode); + if (err) goto fail; + +@@ -585,19 +589,19 @@ struct inode *ext3_orphan_get (struct su + + unsigned long ext3_count_free_inodes (struct super_block * sb) + { ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_super_block *es = sbi->s_es; + #ifdef EXT3FS_DEBUG +- struct ext3_super_block * es; + unsigned long desc_count, bitmap_count, x; + int bitmap_nr; + struct ext3_group_desc * gdp; + int i; + + lock_super (sb); +- es = sb->u.ext3_sb.s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; +- for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { ++ for (i = 0; i < sbi->s_groups_count; i++) { + gdp = 
ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; +@@ -606,8 +610,8 @@ unsigned long ext3_count_free_inodes (st + if (bitmap_nr < 0) + continue; + +- x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr], +- EXT3_INODES_PER_GROUP(sb) / 8); ++ x = ext3_count_free(sbi->s_inode_bitmap[bitmap_nr], ++ sbi->s_inodes_per_group / 8); + printk ("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(gdp->bg_free_inodes_count), x); + bitmap_count += x; +@@ -617,7 +621,7 @@ unsigned long ext3_count_free_inodes (st + unlock_super (sb); + return desc_count; + #else +- return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_inodes_count); ++ return le32_to_cpu(es->s_free_inodes_count); + #endif + } + +@@ -626,16 +630,18 @@ unsigned long ext3_count_free_inodes (st + void ext3_check_inodes_bitmap (struct super_block * sb) + { + struct ext3_super_block * es; ++ struct ext3_sb_info *sbi; + unsigned long desc_count, bitmap_count, x; + int bitmap_nr; + struct ext3_group_desc * gdp; + int i; + +- es = sb->u.ext3_sb.s_es; ++ sbi = EXT3_SB(sb); ++ es = sbi->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; +- for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { ++ for (i = 0; i < sbi->s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; +@@ -644,7 +650,7 @@ void ext3_check_inodes_bitmap (struct su + if (bitmap_nr < 0) + continue; + +- x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr], ++ x = ext3_count_free (sbi->s_inode_bitmap[bitmap_nr], + EXT3_INODES_PER_GROUP(sb) / 8); + if (le16_to_cpu(gdp->bg_free_inodes_count) != x) + ext3_error (sb, "ext3_check_inodes_bitmap", +--- ./fs/ext3/inode.c.orig Fri Apr 12 10:27:49 2002 ++++ ./fs/ext3/inode.c Tue May 7 15:41:23 2002 +@@ -196,7 +196,7 @@ void ext3_delete_inode (struct inode * i + * (Well, we could do this if we need to, but heck - it works) + */ + ext3_orphan_del(handle, inode); +- inode->u.ext3_i.i_dtime = CURRENT_TIME; ++ EXT3_I(inode)->i_dtime = CURRENT_TIME; + + /* + * 
One subtle ordering requirement: if anything has gone wrong +@@ -220,13 +220,14 @@ no_delete: + void ext3_discard_prealloc (struct inode * inode) + { + #ifdef EXT3_PREALLOCATE ++ struct ext3_inode_info *ei = EXT3_I(inode); + lock_kernel(); + /* Writer: ->i_prealloc* */ +- if (inode->u.ext3_i.i_prealloc_count) { +- unsigned short total = inode->u.ext3_i.i_prealloc_count; +- unsigned long block = inode->u.ext3_i.i_prealloc_block; +- inode->u.ext3_i.i_prealloc_count = 0; +- inode->u.ext3_i.i_prealloc_block = 0; ++ if (ei->i_prealloc_count) { ++ unsigned short total = ei->i_prealloc_count; ++ unsigned long block = ei->i_prealloc_block; ++ ei->i_prealloc_count = 0; ++ ei->i_prealloc_block = 0; + /* Writer: end */ + ext3_free_blocks (inode, block, total); + } +@@ -243,13 +244,15 @@ static int ext3_alloc_block (handle_t *h + unsigned long result; + + #ifdef EXT3_PREALLOCATE ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ + /* Writer: ->i_prealloc* */ +- if (inode->u.ext3_i.i_prealloc_count && +- (goal == inode->u.ext3_i.i_prealloc_block || +- goal + 1 == inode->u.ext3_i.i_prealloc_block)) ++ if (ei->i_prealloc_count && ++ (goal == ei->i_prealloc_block || ++ goal + 1 == ei->i_prealloc_block)) + { +- result = inode->u.ext3_i.i_prealloc_block++; +- inode->u.ext3_i.i_prealloc_count--; ++ result = ei->i_prealloc_block++; ++ ei->i_prealloc_count--; + /* Writer: end */ + ext3_debug ("preallocation hit (%lu/%lu).\n", + ++alloc_hits, ++alloc_attempts); +@@ -259,8 +262,8 @@ static int ext3_alloc_block (handle_t *h + alloc_hits, ++alloc_attempts); + if (S_ISREG(inode->i_mode)) + result = ext3_new_block (inode, goal, +- &inode->u.ext3_i.i_prealloc_count, +- &inode->u.ext3_i.i_prealloc_block, err); ++ &ei->i_prealloc_count, ++ &ei->i_prealloc_block, err); + else + result = ext3_new_block (inode, goal, 0, 0, err); + /* +@@ -394,7 +397,7 @@ static Indirect *ext3_get_branch(struct + + *err = 0; + /* i_data is not going away, no lock needed */ +- add_chain (chain, NULL, 
inode->u.ext3_i.i_data + *offsets); ++ add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { +@@ -437,7 +440,8 @@ no_block: + + static inline unsigned long ext3_find_near(struct inode *inode, Indirect *ind) + { +- u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext3_i.i_data; ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ u32 *start = ind->bh ? (u32*) ind->bh->b_data : ei->i_data; + u32 *p; + + /* Try to find previous block */ +@@ -453,9 +456,8 @@ static inline unsigned long ext3_find_ne + * It is going to be refered from inode itself? OK, just put it into + * the same cylinder group then. + */ +- return (inode->u.ext3_i.i_block_group * +- EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + +- le32_to_cpu(inode->i_sb->u.ext3_sb.s_es->s_first_data_block); ++ return (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); + } + + /** +@@ -474,14 +477,15 @@ + static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4], + Indirect *partial, unsigned long *goal) + { ++ struct ext3_inode_info *ei = EXT3_I(inode); + /* Writer: ->i_next_alloc* */ +- if (block == inode->u.ext3_i.i_next_alloc_block + 1) { +- inode->u.ext3_i.i_next_alloc_block++; +- inode->u.ext3_i.i_next_alloc_goal++; ++ if (block == ei->i_next_alloc_block + 1) { ++ ei->i_next_alloc_block++; ++ ei->i_next_alloc_goal++; + } + #ifdef SEARCH_FROM_ZERO +- inode->u.ext3_i.i_next_alloc_block = 0; +- inode->u.ext3_i.i_next_alloc_goal = 0; ++ ei->i_next_alloc_block = 0; ++ ei->i_next_alloc_goal = 0; + #endif + /* Writer: end */ + /* Reader: pointers, ->i_next_alloc* */ +@@ -490,8 +493,8 @@ static int ext3_find_goal(struct inode * + * try the heuristic for sequential allocation, + * failing that at least try to get decent locality. 
+ */ +- if (block == inode->u.ext3_i.i_next_alloc_block) +- *goal = inode->u.ext3_i.i_next_alloc_goal; ++ if (block == ei->i_next_alloc_block) ++ *goal = ei->i_next_alloc_goal; + if (!*goal) + *goal = ext3_find_near(inode, partial); + #ifdef SEARCH_FROM_ZERO +@@ -619,6 +621,7 @@ + { + int i; + int err = 0; ++ struct ext3_inode_info *ei = EXT3_I(inode); + + /* + * If we're splicing into a [td]indirect block (as opposed to the +@@ -641,11 +644,11 @@ static int ext3_splice_branch(handle_t * + /* That's it */ + + *where->p = where->key; +- inode->u.ext3_i.i_next_alloc_block = block; +- inode->u.ext3_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key); ++ ei->i_next_alloc_block = block; ++ ei->i_next_alloc_goal = le32_to_cpu(where[num-1].key); + #ifdef SEARCH_FROM_ZERO +- inode->u.ext3_i.i_next_alloc_block = 0; +- inode->u.ext3_i.i_next_alloc_goal = 0; ++ ei->i_next_alloc_block = 0; ++ ei->i_next_alloc_goal = 0; + #endif + /* Writer: end */ + +@@ -729,6 +732,7 @@ + unsigned long goal; + int left; + int depth = ext3_block_to_path(inode, iblock, offsets); ++ struct ext3_inode_info *ei = EXT3_I(inode); + loff_t new_size; + + J_ASSERT(handle != NULL || create == 0); +@@ -782,7 +785,7 @@ out: + /* + * Block out ext3_truncate while we alter the tree + */ +- down_read(&inode->u.ext3_i.truncate_sem); ++ down_read(&ei->truncate_sem); + err = ext3_alloc_branch(handle, inode, left, goal, + offsets+(partial-chain), partial); + +@@ -794,7 +797,7 @@ out: + if (!err) + err = ext3_splice_branch(handle, inode, iblock, chain, + partial, left); +- up_read(&inode->u.ext3_i.truncate_sem); ++ up_read(&ei->truncate_sem); + if (err == -EAGAIN) + goto changed; + if (err) +@@ -807,8 +810,8 @@ out: + * truncate is in progress. It is racy between multiple parallel + * instances of get_block, but we have the BKL. 
+ */ +- if (new_size > inode->u.ext3_i.i_disksize) +- inode->u.ext3_i.i_disksize = new_size; ++ if (new_size > ei->i_disksize) ++ ei->i_disksize = new_size; + + bh_result->b_state |= (1UL << BH_New); + goto got_it; +@@ -921,7 +924,7 @@ struct buffer_head *ext3_bread(handle_t + struct buffer_head *tmp_bh; + + for (i = 1; +- inode->u.ext3_i.i_prealloc_count && ++ EXT3_I(inode)->i_prealloc_count && + i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks; + i++) { + /* +@@ -1131,8 +1134,8 @@ static int ext3_commit_write(struct file + kunmap(page); + } + } +- if (inode->i_size > inode->u.ext3_i.i_disksize) { +- inode->u.ext3_i.i_disksize = inode->i_size; ++ if (inode->i_size > EXT3_I(inode)->i_disksize) { ++ EXT3_I(inode)->i_disksize = inode->i_size; + ret2 = ext3_mark_inode_dirty(handle, inode); + if (!ret) + ret = ret2; +@@ -1832,7 +1835,8 @@ static void ext3_free_branches(handle_t + void ext3_truncate(struct inode * inode) + { + handle_t *handle; +- u32 *i_data = inode->u.ext3_i.i_data; ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ u32 *i_data = EXT3_I(inode)->i_data; + int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); + int offsets[4]; + Indirect chain[4]; +@@ -1884,13 +1887,13 @@ void ext3_truncate(struct inode * inode) + * on-disk inode. We do this via i_disksize, which is the value which + * ext3 *really* writes onto the disk inode. + */ +- inode->u.ext3_i.i_disksize = inode->i_size; ++ ei->i_disksize = inode->i_size; + + /* + * From here we block out all ext3_get_block() callers who want to + * modify the block allocation tree. 
+ */ +- down_write(&inode->u.ext3_i.truncate_sem); ++ down_write(&ei->truncate_sem); + + if (n == 1) { /* direct blocks */ + ext3_free_data(handle, inode, NULL, i_data+offsets[0], +@@ -1954,7 +1957,7 @@ do_indirects: + case EXT3_TIND_BLOCK: + ; + } +- up_write(&inode->u.ext3_i.truncate_sem); ++ up_write(&ei->truncate_sem); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + ext3_mark_inode_dirty(handle, inode); + +@@ -1983,6 +1986,8 @@ out_stop: + + int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc) + { ++ struct super_block *sb = inode->i_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); + struct buffer_head *bh = 0; + unsigned long block; + unsigned long block_group; +@@ -1997,23 +2010,19 @@ int ext3_get_inode_loc (struct inode *in + inode->i_ino != EXT3_JOURNAL_INO && +- inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) || +- inode->i_ino > le32_to_cpu( +- inode->i_sb->u.ext3_sb.s_es->s_inodes_count)) { +- ext3_error (inode->i_sb, "ext3_get_inode_loc", +- "bad inode number: %lu", inode->i_ino); ++ inode->i_ino < EXT3_FIRST_INO(sb)) || ++ inode->i_ino > le32_to_cpu(sbi->s_es->s_inodes_count)) { ++ ext3_error (sb, __FUNCTION__, "bad inode #%lu", inode->i_ino); + goto bad_inode; + } +- block_group = (inode->i_ino - 1) / EXT3_INODES_PER_GROUP(inode->i_sb); +- if (block_group >= inode->i_sb->u.ext3_sb.s_groups_count) { +- ext3_error (inode->i_sb, "ext3_get_inode_loc", +- "group >= groups count"); ++ block_group = (inode->i_ino - 1) / sbi->s_inodes_per_group; ++ if (block_group >= sbi->s_groups_count) { ++ ext3_error(sb, __FUNCTION__, "group >= groups count"); + goto bad_inode; + } +- group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(inode->i_sb); +- desc = block_group & (EXT3_DESC_PER_BLOCK(inode->i_sb) - 1); +- bh = inode->i_sb->u.ext3_sb.s_group_desc[group_desc]; ++ group_desc = block_group >> sbi->s_desc_per_block_bits; ++ desc = block_group & (sbi->s_desc_per_block - 1); ++ bh = sbi->s_group_desc[group_desc]; + if (!bh) { +- ext3_error (inode->i_sb, 
"ext3_get_inode_loc", +- "Descriptor not loaded"); ++ ext3_error(sb, __FUNCTION__, "Descriptor not loaded"); + goto bad_inode; + } + +@@ -2021,17 +2022,17 @@ int ext3_get_inode_loc (struct inode *in + /* + * Figure out the offset within the block group inode table + */ +- offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb)) * +- EXT3_INODE_SIZE(inode->i_sb); ++ offset = ((inode->i_ino - 1) % sbi->s_inodes_per_group) * ++ sbi->s_inode_size; + block = le32_to_cpu(gdp[desc].bg_inode_table) + +- (offset >> EXT3_BLOCK_SIZE_BITS(inode->i_sb)); +- if (!(bh = sb_bread(inode->i_sb, block))) { +- ext3_error (inode->i_sb, "ext3_get_inode_loc", ++ (offset >> EXT3_BLOCK_SIZE_BITS(sb)); ++ if (!(bh = sb_bread(sb, block))) { ++ ext3_error (sb, __FUNCTION__, + "unable to read inode block - " + "inode=%lu, block=%lu", inode->i_ino, block); + goto bad_inode; + } +- offset &= (EXT3_BLOCK_SIZE(inode->i_sb) - 1); ++ offset &= (EXT3_BLOCK_SIZE(sb) - 1); + + iloc->bh = bh; + iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset); +@@ -2047,6 +2048,7 @@ void ext3_read_inode(struct inode * inod + { + struct ext3_iloc iloc; + struct ext3_inode *raw_inode; ++ struct ext3_inode_info *ei = EXT3_I(inode); + struct buffer_head *bh; + int block; + +@@ -2054,7 +2056,7 @@ void ext3_read_inode(struct inode * inod + goto bad_inode; + bh = iloc.bh; + raw_inode = iloc.raw_inode; +- init_rwsem(&inode->u.ext3_i.truncate_sem); ++ init_rwsem(&ei->truncate_sem); + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); +@@ -2067,7 +2069,7 @@ void ext3_read_inode(struct inode * inod + inode->i_atime = le32_to_cpu(raw_inode->i_atime); + inode->i_ctime = le32_to_cpu(raw_inode->i_ctime); + inode->i_mtime = le32_to_cpu(raw_inode->i_mtime); +- inode->u.ext3_i.i_dtime = le32_to_cpu(raw_inode->i_dtime); ++ ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); + /* We now have enough fields to 
check if the inode was active or not. + * This is needed because nfsd might try to access dead inodes + * the test is that same one that e2fsck uses +@@ -2075,7 +2077,7 @@ void ext3_read_inode(struct inode * inod + */ + if (inode->i_nlink == 0) { + if (inode->i_mode == 0 || +- !(inode->i_sb->u.ext3_sb.s_mount_state & EXT3_ORPHAN_FS)) { ++ !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) { + /* this inode is deleted */ + brelse (bh); + goto bad_inode; +@@ -2090,33 +2092,33 @@ void ext3_read_inode(struct inode * inod + * size */ + inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); + inode->i_version = ++event; +- inode->u.ext3_i.i_flags = le32_to_cpu(raw_inode->i_flags); ++ ei->i_flags = le32_to_cpu(raw_inode->i_flags); + #ifdef EXT3_FRAGMENTS +- inode->u.ext3_i.i_faddr = le32_to_cpu(raw_inode->i_faddr); +- inode->u.ext3_i.i_frag_no = raw_inode->i_frag; +- inode->u.ext3_i.i_frag_size = raw_inode->i_fsize; ++ ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); ++ ei->i_frag_no = raw_inode->i_frag; ++ ei->i_frag_size = raw_inode->i_fsize; + #endif +- inode->u.ext3_i.i_file_acl = le32_to_cpu(raw_inode->i_file_acl); ++ ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + if (!S_ISREG(inode->i_mode)) { +- inode->u.ext3_i.i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); ++ ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); + } else { + inode->i_size |= + ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; + } +- inode->u.ext3_i.i_disksize = inode->i_size; ++ ei->i_disksize = inode->i_size; + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + #ifdef EXT3_PREALLOCATE +- inode->u.ext3_i.i_prealloc_count = 0; ++ ei->i_prealloc_count = 0; + #endif +- inode->u.ext3_i.i_block_group = iloc.block_group; ++ ei->i_block_group = iloc.block_group; + + /* + * NOTE! The in-memory inode i_data array is in little-endian order + * even on big-endian machines: we do NOT byteswap the block numbers! 
+ */ + for (block = 0; block < EXT3_N_BLOCKS; block++) +- inode->u.ext3_i.i_data[block] = iloc.raw_inode->i_block[block]; +- INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); ++ ei->i_data[block] = iloc.raw_inode->i_block[block]; ++ INIT_LIST_HEAD(&ei->i_orphan); + + brelse (iloc.bh); + +@@ -2143,17 +2145,17 @@ void ext3_read_inode(struct inode * inod + /* inode->i_attr_flags = 0; unused */ +- if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) { ++ if (ei->i_flags & EXT3_SYNC_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */ + inode->i_flags |= S_SYNC; + } +- if (inode->u.ext3_i.i_flags & EXT3_APPEND_FL) { ++ if (ei->i_flags & EXT3_APPEND_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_APPEND; unused */ + inode->i_flags |= S_APPEND; + } +- if (inode->u.ext3_i.i_flags & EXT3_IMMUTABLE_FL) { ++ if (ei->i_flags & EXT3_IMMUTABLE_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE; unused */ + inode->i_flags |= S_IMMUTABLE; + } +- if (inode->u.ext3_i.i_flags & EXT3_NOATIME_FL) { ++ if (ei->i_flags & EXT3_NOATIME_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_NOATIME; unused */ + inode->i_flags |= S_NOATIME; + } +@@ -2175,6 +2177,7 @@ static int ext3_do_update_inode(handle_t + struct ext3_iloc *iloc) + { + struct ext3_inode *raw_inode = iloc->raw_inode; ++ struct ext3_inode_info *ei = EXT3_I(inode); + struct buffer_head *bh = iloc->bh; + int err = 0, rc, block; + +@@ -2192,7 +2195,7 @@ static int ext3_do_update_inode(handle_t + * Fix up interoperability with old kernels. 
Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ +- if(!inode->u.ext3_i.i_dtime) { ++ if(!ei->i_dtime) { + raw_inode->i_uid_high = + cpu_to_le16(high_16_bits(inode->i_uid)); + raw_inode->i_gid_high = +@@ -2210,34 +2213,33 @@ static int ext3_do_update_inode(handle_t + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); +- raw_inode->i_size = cpu_to_le32(inode->u.ext3_i.i_disksize); ++ raw_inode->i_size = cpu_to_le32(ei->i_disksize); + raw_inode->i_atime = cpu_to_le32(inode->i_atime); + raw_inode->i_ctime = cpu_to_le32(inode->i_ctime); + raw_inode->i_mtime = cpu_to_le32(inode->i_mtime); + raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); +- raw_inode->i_dtime = cpu_to_le32(inode->u.ext3_i.i_dtime); +- raw_inode->i_flags = cpu_to_le32(inode->u.ext3_i.i_flags); ++ raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); ++ raw_inode->i_flags = cpu_to_le32(ei->i_flags); + #ifdef EXT3_FRAGMENTS +- raw_inode->i_faddr = cpu_to_le32(inode->u.ext3_i.i_faddr); +- raw_inode->i_frag = inode->u.ext3_i.i_frag_no; +- raw_inode->i_fsize = inode->u.ext3_i.i_frag_size; ++ raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); ++ raw_inode->i_frag = ei->i_frag_no; ++ raw_inode->i_fsize = ei->i_frag_size; + #else + /* If we are not tracking these fields in the in-memory inode, + * then preserve them on disk, but still initialise them to zero + * for new inodes. 
*/ +- if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) { ++ if (ei->i_state & EXT3_STATE_NEW) { + raw_inode->i_faddr = 0; + raw_inode->i_frag = 0; + raw_inode->i_fsize = 0; + } + #endif +- raw_inode->i_file_acl = cpu_to_le32(inode->u.ext3_i.i_file_acl); ++ raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); + if (!S_ISREG(inode->i_mode)) { +- raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext3_i.i_dir_acl); ++ raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); + } else { +- raw_inode->i_size_high = +- cpu_to_le32(inode->u.ext3_i.i_disksize >> 32); +- if (inode->u.ext3_i.i_disksize > 0x7fffffffULL) { ++ raw_inode->i_size_high = cpu_to_le32(ei->i_disksize >> 32); ++ if (ei->i_disksize > MAX_NON_LFS) { + struct super_block *sb = inode->i_sb; + if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_LARGE_FILE) || +@@ -2247,7 +2249,7 @@ static int ext3_do_update_inode(handle_t + * created, add a flag to the superblock. + */ + err = ext3_journal_get_write_access(handle, +- sb->u.ext3_sb.s_sbh); ++ EXT3_SB(sb)->s_sbh); + if (err) + goto out_brelse; + ext3_update_dynamic_rev(sb); +@@ -2256,7 +2258,7 @@ static int ext3_do_update_inode(handle_t + sb->s_dirt = 1; + handle->h_sync = 1; + err = ext3_journal_dirty_metadata(handle, +- sb->u.ext3_sb.s_sbh); ++ EXT3_SB(sb)->s_sbh); + } + } + } +@@ -2265,13 +2267,13 @@ static int ext3_do_update_inode(handle_t + raw_inode->i_block[0] = + cpu_to_le32(kdev_t_to_nr(inode->i_rdev)); + else for (block = 0; block < EXT3_N_BLOCKS; block++) +- raw_inode->i_block[block] = inode->u.ext3_i.i_data[block]; ++ raw_inode->i_block[block] = ei->i_data[block]; + + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + rc = ext3_journal_dirty_metadata(handle, bh); + if (!err) + err = rc; +- EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW; ++ ei->i_state &= ~EXT3_STATE_NEW; + + out_brelse: + brelse (bh); +@@ -2379,7 +2381,7 @@ int ext3_setattr(struct dentry *dentry, + } + + error = ext3_orphan_add(handle, inode); +- inode->u.ext3_i.i_disksize = 
attr->ia_size; ++ EXT3_I(inode)->i_disksize = attr->ia_size; + rc = ext3_mark_inode_dirty(handle, inode); + if (!error) + error = rc; +@@ -2622,9 +2624,9 @@ int ext3_change_inode_journal_flag(struc + */ + + if (val) +- inode->u.ext3_i.i_flags |= EXT3_JOURNAL_DATA_FL; ++ EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL; + else +- inode->u.ext3_i.i_flags &= ~EXT3_JOURNAL_DATA_FL; ++ EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL; + + journal_unlock_updates(journal); + +--- ./fs/ext3/ioctl.c.orig Fri Apr 12 10:27:49 2002 ++++ ./fs/ext3/ioctl.c Tue May 7 15:20:52 2002 +@@ -18,13 +18,14 @@ + int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, + unsigned long arg) + { ++ struct ext3_inode_info *ei = EXT3_I(inode); + unsigned int flags; + + ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { + case EXT3_IOC_GETFLAGS: +- flags = inode->u.ext3_i.i_flags & EXT3_FL_USER_VISIBLE; ++ flags = ei->i_flags & EXT3_FL_USER_VISIBLE; + return put_user(flags, (int *) arg); + case EXT3_IOC_SETFLAGS: { + handle_t *handle = NULL; +@@ -42,7 +42,7 @@ int ext3_ioctl (struct inode * inode, st + if (get_user(flags, (int *) arg)) + return -EFAULT; + +- oldflags = inode->u.ext3_i.i_flags; ++ oldflags = ei->i_flags; + + /* The JOURNAL_DATA flag is modifiable only by root */ + jflag = flags & EXT3_JOURNAL_DATA_FL; +@@ -79,7 +79,7 @@ int ext3_ioctl (struct inode * inode, st + + flags = flags & EXT3_FL_USER_MODIFIABLE; + flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE; +- inode->u.ext3_i.i_flags = flags; ++ ei->i_flags = flags; + + if (flags & EXT3_SYNC_FL) + inode->i_flags |= S_SYNC; +@@ -155,12 +155,12 @@ flags_err: + int ret = 0; + + set_current_state(TASK_INTERRUPTIBLE); +- add_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait); +- if (timer_pending(&sb->u.ext3_sb.turn_ro_timer)) { ++ add_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); ++ if (timer_pending(&EXT3_SB(sb)->turn_ro_timer)) { + schedule(); + ret = 1; + } +- 
remove_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait); ++ remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); + return ret; + } + #endif +--- ./fs/ext3/namei.c.orig Fri Apr 12 10:27:49 2002 ++++ ./fs/ext3/namei.c Tue May 7 16:05:51 2002 +@@ -1430,8 +1430,8 @@ int ext3_orphan_add(handle_t *handle, st + J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); + +- BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); +- err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); + if (err) + goto out_unlock; + +@@ -1442,7 +1442,7 @@ int ext3_orphan_add(handle_t *handle, st + /* Insert this inode at the head of the on-disk orphan list... */ + NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan); + EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); +- err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); ++ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + rc = ext3_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; +@@ -1520,8 +1520,7 @@ int ext3_orphan_del(handle_t *handle, st + err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); + } else { + struct ext3_iloc iloc2; +- struct inode *i_prev = +- list_entry(prev, struct inode, u.ext3_i.i_orphan); ++ struct inode *i_prev = orphan_list_entry(prev); + + jbd_debug(4, "orphan inode %lu will point to %lu\n", + i_prev->i_ino, ino_next); +--- ./fs/ext3/super.c.orig Fri Apr 12 10:27:49 2002 ++++ ./fs/ext3/super.c Tue May 7 16:05:44 2002 +@@ -121,7 +121,7 @@ static int ext3_error_behaviour(struct s + /* If no overrides were specified on the mount, then fall back + * to the default behaviour set in the filesystem's superblock + * on disk. 
*/ +- switch (le16_to_cpu(sb->u.ext3_sb.s_es->s_errors)) { ++ switch (le16_to_cpu(EXT3_SB(sb)->s_es->s_errors)) { + case EXT3_ERRORS_PANIC: + return EXT3_ERRORS_PANIC; + case EXT3_ERRORS_RO: +@@ -269,9 +269,9 @@ void ext3_abort (struct super_block * sb + return; + + printk (KERN_CRIT "Remounting filesystem read-only\n"); +- sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS; ++ EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; + sb->s_flags |= MS_RDONLY; +- sb->u.ext3_sb.s_mount_opt |= EXT3_MOUNT_ABORT; ++ EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; + journal_abort(EXT3_SB(sb)->s_journal, -EIO); + } + +@@ -377,8 +377,6 @@ static int ext3_blkdev_remove(struct ext3 + return ret; + } + +-#define orphan_list_entry(l) list_entry((l), struct inode, u.ext3_i.i_orphan) +- + static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi) + { + struct list_head *l; +@@ -818,7 +818,7 @@ static void ext3_orphan_cleanup (struct + sb->s_flags &= ~MS_RDONLY; + } + +- if (sb->u.ext3_sb.s_mount_state & EXT3_ERROR_FS) { ++ if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) { + if (es->s_last_orphan) + jbd_debug(1, "Errors on filesystem, " + "clearing orphan list.\n"); +@@ -1463,12 +1463,14 @@ static void ext3_commit_super (struct su + struct ext3_super_block * es, + int sync) + { ++ struct buffer_head *sbh = EXT3_SB(sb)->s_sbh; ++ + es->s_wtime = cpu_to_le32(CURRENT_TIME); +- BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "marking dirty"); +- mark_buffer_dirty(sb->u.ext3_sb.s_sbh); ++ BUFFER_TRACE(sbh, "marking dirty"); ++ mark_buffer_dirty(sbh); + if (sync) { +- ll_rw_block(WRITE, 1, &sb->u.ext3_sb.s_sbh); +- wait_on_buffer(sb->u.ext3_sb.s_sbh); ++ ll_rw_block(WRITE, 1, &sbh); ++ wait_on_buffer(sbh); + } + } + +@@ -1519,7 +1521,7 @@ static void ext3_clear_journal_err(struc + ext3_warning(sb, __FUNCTION__, "Marking fs in need of " + "filesystem check."); + +- sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS; ++ EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; + es->s_state |= 
cpu_to_le16(EXT3_ERROR_FS); + ext3_commit_super (sb, es, 1); + +--- ./fs/ext3/symlink.c.orig Fri Apr 12 10:27:49 2002 ++++ ./fs/ext3/symlink.c Tue May 7 15:25:39 2002 +@@ -23,13 +23,13 @@ + + static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen) + { +- char *s = (char *)dentry->d_inode->u.ext3_i.i_data; +- return vfs_readlink(dentry, buffer, buflen, s); ++ struct ext3_inode_info *ei = EXT3_I(dentry->d_inode); ++ return vfs_readlink(dentry, buffer, buflen, (char *)ei->i_data); + } + + static int ext3_follow_link(struct dentry *dentry, struct nameidata *nd) + { +- char *s = (char *)dentry->d_inode->u.ext3_i.i_data; +- return vfs_follow_link(nd, s); ++ struct ext3_inode_info *ei = EXT3_I(dentry->d_inode); ++ return vfs_follow_link(nd, (char*)ei->i_data); + } + +--- ./include/linux/ext3_fs.h.orig Tue Apr 16 14:27:25 2002 ++++ ./include/linux/ext3_fs.h Tue May 7 16:47:36 2002 +@@ -84,22 +84,25 @@ + #define EXT3_MIN_BLOCK_SIZE 1024 + #define EXT3_MAX_BLOCK_SIZE 4096 + #define EXT3_MIN_BLOCK_LOG_SIZE 10 ++ + #ifdef __KERNEL__ +-# define EXT3_BLOCK_SIZE(s) ((s)->s_blocksize) +-#else +-# define EXT3_BLOCK_SIZE(s) (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size) +-#endif +-#define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32)) +-#ifdef __KERNEL__ +-# define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +-#else +-# define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) +-#endif +-#ifdef __KERNEL__ +-#define EXT3_ADDR_PER_BLOCK_BITS(s) ((s)->u.ext3_sb.s_addr_per_block_bits) +-#define EXT3_INODE_SIZE(s) ((s)->u.ext3_sb.s_inode_size) +-#define EXT3_FIRST_INO(s) ((s)->u.ext3_sb.s_first_ino) ++#define EXT3_SB(sb) (&((sb)->u.ext3_sb)) ++#define EXT3_I(inode) (&((inode)->u.ext3_i)) ++ ++#define EXT3_BLOCK_SIZE(s) ((s)->s_blocksize) ++#define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) ++#define EXT3_ADDR_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_addr_per_block_bits) ++#define EXT3_INODE_SIZE(s) (EXT3_SB(s)->s_inode_size) ++#define EXT3_FIRST_INO(s) 
(EXT3_SB(s)->s_first_ino) + #else ++ ++/* Assume that user mode programs are passing in an ext3fs superblock, not ++ * a kernel struct super_block. This will allow us to call the feature-test ++ * macros from user land. */ ++#define EXT3_SB(sb) (sb) ++ ++#define EXT3_BLOCK_SIZE(s) (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size) ++#define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) + #define EXT3_INODE_SIZE(s) (((s)->s_rev_level == EXT3_GOOD_OLD_REV) ? \ + EXT3_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) +@@ -108,6 +110,7 @@ + EXT3_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) + #endif ++#define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32)) + + /* + * Macro-instructions used to manage fragments +@@ -116,8 +120,8 @@ + #define EXT3_MAX_FRAG_SIZE 4096 + #define EXT3_MIN_FRAG_LOG_SIZE 10 + #ifdef __KERNEL__ +-# define EXT3_FRAG_SIZE(s) ((s)->u.ext3_sb.s_frag_size) +-# define EXT3_FRAGS_PER_BLOCK(s) ((s)->u.ext3_sb.s_frags_per_block) ++# define EXT3_FRAG_SIZE(s) (EXT3_SB(s)->s_frag_size) ++# define EXT3_FRAGS_PER_BLOCK(s) (EXT3_SB(s)->s_frags_per_block) + #else + # define EXT3_FRAG_SIZE(s) (EXT3_MIN_FRAG_SIZE << (s)->s_log_frag_size) + # define EXT3_FRAGS_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / EXT3_FRAG_SIZE(s)) +@@ -163,15 +167,13 @@ + /* + * Macro-instructions used to manage group descriptors + */ ++# define EXT3_BLOCKS_PER_GROUP(s) (EXT3_SB(s)->s_blocks_per_group) ++# define EXT3_INODES_PER_GROUP(s) (EXT3_SB(s)->s_inodes_per_group) + #ifdef __KERNEL__ +-# define EXT3_BLOCKS_PER_GROUP(s) ((s)->u.ext3_sb.s_blocks_per_group) +-# define EXT3_DESC_PER_BLOCK(s) ((s)->u.ext3_sb.s_desc_per_block) +-# define EXT3_INODES_PER_GROUP(s) ((s)->u.ext3_sb.s_inodes_per_group) +-# define EXT3_DESC_PER_BLOCK_BITS(s) ((s)->u.ext3_sb.s_desc_per_block_bits) ++# define EXT3_DESC_PER_BLOCK(s) (EXT3_SB(s)->s_desc_per_block) ++# define EXT3_DESC_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_desc_per_block_bits) + #else +-# define EXT3_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) + # 
define EXT3_DESC_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_group_desc)) +-# define EXT3_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) + #endif + + /* +@@ -344,7 +347,7 @@ + #ifndef _LINUX_EXT2_FS_H + #define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt + #define set_opt(o, opt) o |= EXT3_MOUNT_##opt +-#define test_opt(sb, opt) ((sb)->u.ext3_sb.s_mount_opt & \ ++#define test_opt(sb, opt) (EXT3_SB(sb)->s_mount_opt & \ + EXT3_MOUNT_##opt) + #else + #define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD +@@ -441,17 +443,11 @@ + /*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */ + }; + +-#ifdef __KERNEL__ +-#define EXT3_SB(sb) (&((sb)->u.ext3_sb)) +-#define EXT3_I(inode) (&((inode)->u.ext3_i)) +-#else +-/* Assume that user mode programs are passing in an ext3fs superblock, not +- * a kernel struct super_block. This will allow us to call the feature-test +- * macros from user land. */ +-#define EXT3_SB(sb) (sb) +-#endif +- +-#define NEXT_ORPHAN(inode) (inode)->u.ext3_i.i_dtime ++#define NEXT_ORPHAN(inode) EXT3_I(inode)->i_dtime ++static inline struct inode *orphan_list_entry(struct list_head *l) ++{ ++ return list_entry(l, struct inode, u.ext3_i.i_orphan); ++} + + /* + * Codes for operating systems +--- ./include/linux/ext3_jbd.h.orig Tue May 7 14:44:08 2002 ++++ ./include/linux/ext3_jbd.h Tue May 7 14:44:43 2002 +@@ -291,7 +291,7 @@ + return 1; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA) + return 1; +- if (inode->u.ext3_i.i_flags & EXT3_JOURNAL_DATA_FL) ++ if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL) + return 1; + return 0; + } diff --git a/lustre/kernel_patches/patches/ext3-compat-2.4.18-chaos.patch b/lustre/kernel_patches/patches/ext3-compat-2.4.18-chaos.patch new file mode 100644 index 0000000..7cd3384 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-compat-2.4.18-chaos.patch @@ -0,0 +1,19 @@ + fs/ext3/namei.c | 2 +- + 1 files changed, 1 insertion(+), 1 deletion(-) + +diff -puN 
fs/ext3/namei.c~ext3-compat-2.4.18-chaos fs/ext3/namei.c +--- linux-2.4.18/fs/ext3/namei.c~ext3-compat-2.4.18-chaos 2003-08-28 20:14:27.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/namei.c 2003-08-28 20:14:27.000000000 +0400 +@@ -830,9 +830,9 @@ static int ext3_rmdir (struct inode * di + * recovery. */ + inode->i_size = 0; + ext3_orphan_add(handle, inode); +- ext3_mark_inode_dirty(handle, inode); + dir->i_nlink--; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; ++ ext3_mark_inode_dirty(handle, inode); + dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_mark_inode_dirty(handle, dir); + + +_ diff --git a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.18-2.patch b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.18-2.patch new file mode 100644 index 0000000..a173981 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.18-2.patch @@ -0,0 +1,478 @@ + +Create a service thread to handle delete and truncate of inodes, to avoid +long latency while truncating very large files. + + + fs/ext3/inode.c | 116 ++++++++++++++++++++++ + fs/ext3/super.c | 231 +++++++++++++++++++++++++++++++++++++++++++++ + include/linux/ext3_fs.h | 5 + include/linux/ext3_fs_sb.h | 10 + + 4 files changed, 362 insertions(+) + +--- linux-2.4.18-18.8.0-l15/fs/ext3/super.c~ext3-delete_thread-2.4.18 Tue Jun 3 17:26:21 2003 ++++ linux-2.4.18-18.8.0-l15-adilger/fs/ext3/super.c Wed Jul 2 23:49:40 2003 +@@ -396,6 +396,220 @@ static void dump_orphan_list(struct supe + } + } + ++#ifdef EXT3_DELETE_THREAD ++/* ++ * Delete inodes in a loop until there are no more to be deleted. ++ * Normally, we run in the background doing the deletes and sleeping again, ++ * and clients just add new inodes to be deleted onto the end of the list. ++ * If someone is concerned about free space (e.g. block allocation or similar) ++ * then they can sleep on s_delete_waiter_queue and be woken up when space ++ * has been freed. 
++ */ ++int ext3_delete_thread(void *data) ++{ ++ struct super_block *sb = data; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct task_struct *tsk = current; ++ ++ /* Almost like daemonize, but not quite */ ++ exit_mm(current); ++ tsk->session = 1; ++ tsk->pgrp = 1; ++ tsk->tty = NULL; ++ exit_files(current); ++ reparent_to_init(); ++ ++ sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev)); ++ sigfillset(&tsk->blocked); ++ ++ /*tsk->flags |= PF_KERNTHREAD;*/ ++ ++ INIT_LIST_HEAD(&sbi->s_delete_list); ++ wake_up(&sbi->s_delete_waiter_queue); ++ ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev)); ++ ++ /* main loop */ ++ for (;;) { ++ wait_event_interruptible(sbi->s_delete_thread_queue, ++ !list_empty(&sbi->s_delete_list) || ++ !test_opt(sb, ASYNCDEL)); ++ ext3_debug("%s woken up: %lu inodes, %lu blocks\n", ++ tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks); ++ ++ spin_lock(&sbi->s_delete_lock); ++ if (list_empty(&sbi->s_delete_list)) { ++ clear_opt(sbi->s_mount_opt, ASYNCDEL); ++ memset(&sbi->s_delete_list, 0, ++ sizeof(sbi->s_delete_list)); ++ spin_unlock(&sbi->s_delete_lock); ++ ext3_debug("delete thread on %s exiting\n", ++ kdevname(sb->s_dev)); ++ wake_up(&sbi->s_delete_waiter_queue); ++ break; ++ } ++ ++ while (!list_empty(&sbi->s_delete_list)) { ++ struct inode *inode=list_entry(sbi->s_delete_list.next, ++ struct inode, i_dentry); ++ unsigned long blocks = inode->i_blocks >> ++ (inode->i_blkbits - 9); ++ ++ list_del_init(&inode->i_dentry); ++ spin_unlock(&sbi->s_delete_lock); ++ ext3_debug("%s delete ino %lu blk %lu\n", ++ tsk->comm, inode->i_ino, blocks); ++ ++ iput(inode); ++ ++ spin_lock(&sbi->s_delete_lock); ++ sbi->s_delete_blocks -= blocks; ++ sbi->s_delete_inodes--; ++ } ++ if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) { ++ ext3_warning(sb, __FUNCTION__, ++ "%lu blocks, %lu inodes on list?\n", ++ sbi->s_delete_blocks,sbi->s_delete_inodes); ++ sbi->s_delete_blocks = 0; ++ sbi->s_delete_inodes = 0; ++ } ++ 
spin_unlock(&sbi->s_delete_lock); ++ wake_up(&sbi->s_delete_waiter_queue); ++ } ++ ++ return 0; ++} ++ ++static void ext3_start_delete_thread(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int rc; ++ ++ spin_lock_init(&sbi->s_delete_lock); ++ init_waitqueue_head(&sbi->s_delete_thread_queue); ++ init_waitqueue_head(&sbi->s_delete_waiter_queue); ++ ++ if (!test_opt(sb, ASYNCDEL)) ++ return; ++ ++ rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES); ++ if (rc < 0) ++ printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n", ++ rc); ++ else ++ wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next); ++} ++ ++static void ext3_stop_delete_thread(struct ext3_sb_info *sbi) ++{ ++ if (sbi->s_delete_list.next == 0) /* thread never started */ ++ return; ++ ++ clear_opt(sbi->s_mount_opt, ASYNCDEL); ++ wake_up(&sbi->s_delete_thread_queue); ++ wait_event(sbi->s_delete_waiter_queue, list_empty(&sbi->s_delete_list)); ++} ++ ++/* Instead of playing games with the inode flags, destruction, etc we just ++ * create a new inode locally and put it on a list for the truncate thread. ++ * We need large parts of the inode struct in order to complete the ++ * truncate and unlink, so we may as well just have a real inode to do it. ++ * ++ * If we have any problem deferring the delete, just delete it right away. ++ * If we defer it, we also mark how many blocks it would free, so that we ++ * can keep the statfs data correct, and we know if we should sleep on the ++ * delete thread when we run out of space. 
++ */ ++static void ext3_delete_inode_thread(struct inode *old_inode) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); ++ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); ++ struct inode *new_inode; ++ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); ++ ++ if (is_bad_inode(old_inode)) { ++ clear_inode(old_inode); ++ return; ++ } ++ ++ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) ++ goto out_delete; ++ ++ /* We may want to delete the inode immediately and not defer it */ ++ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS) ++ goto out_delete; ++ ++ /* We can't use the delete thread as-is during real orphan recovery, ++ * as we add to the orphan list here, causing ext3_orphan_cleanup() ++ * to loop endlessly. It would be nice to do so, but needs work. ++ */ ++ if (oei->i_state & EXT3_STATE_DELETE || ++ sbi->s_mount_state & EXT3_ORPHAN_FS) { ++ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", ++ old_inode->i_ino, blocks); ++ goto out_delete; ++ } ++ ++ /* We can iget this inode again here, because our caller has unhashed ++ * old_inode, so new_inode will be in a different inode struct. ++ * ++ * We need to ensure that the i_orphan pointers in the other inodes ++ * point at the new inode copy instead of the old one so the orphan ++ * list doesn't get corrupted when the old orphan inode is freed. ++ */ ++ down(&sbi->s_orphan_lock); ++ ++ sbi->s_mount_state |= EXT3_ORPHAN_FS; ++ new_inode = iget(old_inode->i_sb, old_inode->i_ino); ++ sbi->s_mount_state &= ~EXT3_ORPHAN_FS; ++ if (is_bad_inode(new_inode)) { ++ printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino); ++ iput(new_inode); ++ new_inode = NULL; ++ } ++ if (!new_inode) { ++ up(&sbi->s_orphan_lock); ++ ext3_debug("delete inode %lu directly (bad read)\n", ++ old_inode->i_ino); ++ goto out_delete; ++ } ++ J_ASSERT(new_inode != old_inode); ++ ++ J_ASSERT(!list_empty(&oei->i_orphan)); ++ ++ nei = EXT3_I(new_inode); ++ /* Ugh. 
We need to insert new_inode into the same spot on the list ++ * as old_inode was, to ensure the in-memory orphan list is still ++ * in the same order as the on-disk orphan list (badness otherwise). ++ */ ++ nei->i_orphan = oei->i_orphan; ++ nei->i_orphan.next->prev = &nei->i_orphan; ++ nei->i_orphan.prev->next = &nei->i_orphan; ++ nei->i_state |= EXT3_STATE_DELETE; ++ up(&sbi->s_orphan_lock); ++ ++ clear_inode(old_inode); ++ ++ spin_lock(&sbi->s_delete_lock); ++ J_ASSERT(list_empty(&new_inode->i_dentry)); ++ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); ++ sbi->s_delete_blocks += blocks; ++ sbi->s_delete_inodes++; ++ spin_unlock(&sbi->s_delete_lock); ++ ++ ext3_debug("delete inode %lu (%lu blocks) by thread\n", ++ new_inode->i_ino, blocks); ++ ++ wake_up(&sbi->s_delete_thread_queue); ++ return; ++ ++out_delete: ++ ext3_delete_inode(old_inode); ++} ++#else ++#define ext3_start_delete_thread(sbi) do {} while(0) ++#define ext3_stop_delete_thread(sbi) do {} while(0) ++#endif /* EXT3_DELETE_THREAD */ ++ + void ext3_put_super (struct super_block * sb) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); +@@ -403,6 +617,7 @@ void ext3_put_super (struct super_block + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + ++ ext3_stop_delete_thread(sbi); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -451,7 +666,11 @@ static struct super_operations ext3_sops + write_inode: ext3_write_inode, /* BKL not held. Don't need */ + dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */ + put_inode: ext3_put_inode, /* BKL not held. Don't need */ ++#ifdef EXT3_DELETE_THREAD ++ delete_inode: ext3_delete_inode_thread,/* BKL not held. We take it */ ++#else + delete_inode: ext3_delete_inode, /* BKL not held. We take it */ ++#endif + put_super: ext3_put_super, /* BKL held */ + write_super: ext3_write_super, /* BKL held */ + write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. 
Take it */ +@@ -511,6 +730,14 @@ static int parse_options (char * options + this_char = strtok (NULL, ",")) { + if ((value = strchr (this_char, '=')) != NULL) + *value++ = 0; ++#ifdef EXT3_DELETE_THREAD ++ if (!strcmp(this_char, "asyncdel")) ++ set_opt(*mount_options, ASYNCDEL); ++ else if (!strcmp(this_char, "noasyncdel")) ++ clear_opt(*mount_options, ASYNCDEL); ++ else ++#endif ++ + if (!strcmp (this_char, "bsddf")) + clear_opt (*mount_options, MINIX_DF); + else if (!strcmp (this_char, "nouid32")) { +@@ -1206,6 +1433,7 @@ struct super_block * ext3_read_super (st + } + + ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); ++ ext3_start_delete_thread(sb); + /* + * akpm: core read_super() calls in here with the superblock locked. + * That deadlocks, because orphan cleanup needs to lock the superblock +@@ -1648,6 +1876,9 @@ int ext3_remount (struct super_block * s + if (!parse_options(data, &tmp, sbi, &tmp, 1)) + return -EINVAL; + ++ if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY)) ++ ext3_stop_delete_thread(sbi); ++ + if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) + ext3_abort(sb, __FUNCTION__, "Abort forced by user"); + +--- linux/fs/ext3/file.c.orig Fri Jan 17 10:57:31 2003 ++++ linux/fs/ext3/file.c Mon Jun 30 13:28:52 2003 +@@ -121,7 +121,11 @@ struct file_operations ext3_file_operati + }; + + struct inode_operations ext3_file_inode_operations = { ++#ifdef EXT3_DELETE_THREAD ++ truncate: ext3_truncate_thread, /* BKL held */ ++#else + truncate: ext3_truncate, /* BKL held */ ++#endif + setattr: ext3_setattr, /* BKL held */ + }; + +--- linux-2.4.18-18.8.0-l15/fs/ext3/inode.c~ext3-delete_thread-2.4.18 Wed Jul 2 23:13:58 2003 ++++ linux-2.4.18-18.8.0-l15-adilger/fs/ext3/inode.c Wed Jul 2 23:50:29 2003 +@@ -2004,6 +2004,118 @@ out_stop: + ext3_journal_stop(handle, inode); + } + ++#ifdef EXT3_DELETE_THREAD ++/* Move blocks from to-be-truncated inode over to a new inode, and delete ++ * that one from the delete thread instead. 
This avoids a lot of latency ++ * when truncating large files. ++ * ++ * If we have any problem deferring the truncate, just truncate it right away. ++ * If we defer it, we also mark how many blocks it would free, so that we ++ * can keep the statfs data correct, and we know if we should sleep on the ++ * delete thread when we run out of space. ++ */ ++void ext3_truncate_thread(struct inode *old_inode) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); ++ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); ++ struct inode *new_inode; ++ handle_t *handle; ++ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); ++ ++ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) ++ goto out_truncate; ++ ++ /* XXX This is a temporary limitation for code simplicity. ++ * We could truncate to arbitrary sizes at some later time. ++ */ ++ if (old_inode->i_size != 0) ++ goto out_truncate; ++ ++ /* We may want to truncate the inode immediately and not defer it */ ++ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS || ++ old_inode->i_size > oei->i_disksize) ++ goto out_truncate; ++ ++ /* We can't use the delete thread as-is during real orphan recovery, ++ * as we add to the orphan list here, causing ext3_orphan_cleanup() ++ * to loop endlessly. It would be nice to do so, but needs work. 
++ */ ++ if (oei->i_state & EXT3_STATE_DELETE || ++ sbi->s_mount_state & EXT3_ORPHAN_FS) { ++ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", ++ old_inode->i_ino, blocks); ++ goto out_truncate; ++ } ++ ++ ext3_discard_prealloc(old_inode); ++ ++ /* old_inode = 1 ++ * new_inode = sb + GDT + ibitmap ++ * orphan list = 1 inode/superblock for add, 2 inodes for del ++ * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS ++ */ ++ handle = ext3_journal_start(old_inode, 7); ++ if (IS_ERR(handle)) ++ goto out_truncate; ++ ++ new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode); ++ if (IS_ERR(new_inode)) { ++ ext3_debug("truncate inode %lu directly (no new inodes)\n", ++ old_inode->i_ino); ++ goto out_journal; ++ } ++ ++ nei = EXT3_I(new_inode); ++ ++ down_write(&oei->truncate_sem); ++ new_inode->i_size = old_inode->i_size; ++ new_inode->i_blocks = old_inode->i_blocks; ++ new_inode->i_uid = old_inode->i_uid; ++ new_inode->i_gid = old_inode->i_gid; ++ new_inode->i_nlink = 0; ++ ++ /* FIXME when we do arbitrary truncates */ ++ old_inode->i_blocks = oei->i_file_acl ? 
old_inode->i_blksize / 512 : 0; ++ old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME; ++ ++ memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data)); ++ memset(oei->i_data, 0, sizeof(oei->i_data)); ++ ++ nei->i_disksize = oei->i_disksize; ++ nei->i_state |= EXT3_STATE_DELETE; ++ up_write(&oei->truncate_sem); ++ ++ if (ext3_orphan_add(handle, new_inode) < 0) ++ goto out_journal; ++ ++ if (ext3_orphan_del(handle, old_inode) < 0) { ++ ext3_orphan_del(handle, new_inode); ++ iput(new_inode); ++ goto out_journal; ++ } ++ ++ ext3_journal_stop(handle, old_inode); ++ ++ spin_lock(&sbi->s_delete_lock); ++ J_ASSERT(list_empty(&new_inode->i_dentry)); ++ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); ++ sbi->s_delete_blocks += blocks; ++ sbi->s_delete_inodes++; ++ spin_unlock(&sbi->s_delete_lock); ++ ++ ext3_debug("delete inode %lu (%lu blocks) by thread\n", ++ new_inode->i_ino, blocks); ++ ++ wake_up(&sbi->s_delete_thread_queue); ++ return; ++ ++out_journal: ++ ext3_journal_stop(handle, old_inode); ++out_truncate: ++ ext3_truncate(old_inode); ++} ++#endif /* EXT3_DELETE_THREAD */ ++ + /* + * ext3_get_inode_loc returns with an extra refcount against the + * inode's underlying buffer_head on success. 
+--- linux-2.4.18-18.8.0-l15/include/linux/ext3_fs.h~ext3-delete_thread-2.4.18 Tue Jun 3 17:26:20 2003 ++++ linux-2.4.18-18.8.0-l15-adilger/include/linux/ext3_fs.h Wed Jul 2 23:19:09 2003 +@@ -190,6 +190,7 @@ struct ext3_group_desc + */ + #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ + #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ ++#define EXT3_STATE_DELETE 0x00000010 /* deferred delete inode */ + + /* + * ioctl commands +@@ -317,6 +318,7 @@ struct ext3_inode { + #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ + #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ + #define EXT3_MOUNT_INDEX 0x4000 /* Enable directory index */ ++#define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -651,6 +653,9 @@ extern void ext3_discard_prealloc (struc + extern void ext3_dirty_inode(struct inode *); + extern int ext3_change_inode_journal_flag(struct inode *, int); + extern void ext3_truncate (struct inode *); ++#ifdef EXT3_DELETE_THREAD ++extern void ext3_truncate_thread(struct inode *inode); ++#endif + + /* ioctl.c */ + extern int ext3_ioctl (struct inode *, struct file *, unsigned int, +--- linux-2.4.18-18.8.0-l15/include/linux/ext3_fs_sb.h~ext3-delete_thread-2.4.18 Tue Jun 3 17:26:21 2003 ++++ linux-2.4.18-18.8.0-l15-adilger/include/linux/ext3_fs_sb.h Wed Jul 2 23:19:09 2003 +@@ -29,6 +29,8 @@ + + #define EXT3_MAX_GROUP_LOADED 32 + ++#define EXT3_DELETE_THREAD ++ + /* + * third extended-fs super-block data in memory + */ +@@ -74,6 +76,14 @@ struct ext3_sb_info { + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ + #endif ++#ifdef EXT3_DELETE_THREAD ++ spinlock_t s_delete_lock; ++ struct list_head s_delete_list; ++ unsigned long s_delete_blocks; ++ unsigned long s_delete_inodes; 
++ wait_queue_head_t s_delete_thread_queue; ++ wait_queue_head_t s_delete_waiter_queue; ++#endif + }; + + #endif /* _LINUX_EXT3_FS_SB */ + +_ diff --git a/lustre/kernel_patches/patches/ext3-pdirops-2.4.18-chaos.patch b/lustre/kernel_patches/patches/ext3-pdirops-2.4.18-chaos.patch new file mode 100644 index 0000000..82097e7 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-pdirops-2.4.18-chaos.patch @@ -0,0 +1,1213 @@ + fs/ext3/ialloc.c | 2 + fs/ext3/inode.c | 2 + fs/ext3/namei.c | 580 +++++++++++++++++++++++++++++++++++++--------- + fs/ext3/super.c | 6 + include/linux/ext3_fs.h | 1 + include/linux/ext3_fs_i.h | 6 + 6 files changed, 489 insertions(+), 108 deletions(-) + +--- linux-2.4.18/fs/ext3/namei.c~ext3-pdirops-2.4.18-chaos 2003-09-01 14:58:06.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/namei.c 2003-09-01 17:52:47.000000000 +0400 +@@ -52,6 +52,9 @@ static struct buffer_head *ext3_append(h + { + struct buffer_head *bh; + ++ /* with parallel dir operations all appends ++ * have to be serialized -bzzz */ ++ down(&EXT3_I(inode)->i_append_sem); + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + + if ((bh = ext3_bread(handle, inode, *block, 1, err))) { +@@ -59,6 +62,8 @@ static struct buffer_head *ext3_append(h + EXT3_I(inode)->i_disksize = inode->i_size; + ext3_journal_get_write_access(handle,bh); + } ++ up(&EXT3_I(inode)->i_append_sem); ++ + return bh; + } + +@@ -135,6 +140,8 @@ struct dx_frame + struct buffer_head *bh; + struct dx_entry *entries; + struct dx_entry *at; ++ unsigned long leaf; ++ unsigned int curidx; + }; + + struct dx_map_entry +@@ -143,6 +150,30 @@ struct dx_map_entry + u32 offs; + }; + ++/* FIXME: this should be reworked using bb_spin_lock ++ * introduced in -mm tree ++ */ ++#define BH_DXLock 25 ++ ++static inline void dx_lock_bh(struct buffer_head volatile *bh) ++{ ++#ifdef CONFIG_SMP ++ while (test_and_set_bit(BH_DXLock, &bh->b_state)) { ++ while (test_bit(BH_DXLock, &bh->b_state)) ++ cpu_relax(); ++ } ++#endif ++} ++ 
++static inline void dx_unlock_bh(struct buffer_head *bh) ++{ ++#ifdef CONFIG_SMP ++ smp_mb__before_clear_bit(); ++ clear_bit(BH_DXLock, &bh->b_state); ++#endif ++} ++ ++ + #ifdef CONFIG_EXT3_INDEX + static inline unsigned dx_get_block (struct dx_entry *entry); + static void dx_set_block (struct dx_entry *entry, unsigned value); +@@ -154,7 +185,7 @@ static void dx_set_count (struct dx_entr + static void dx_set_limit (struct dx_entry *entries, unsigned value); + static unsigned dx_root_limit (struct inode *dir, unsigned infosize); + static unsigned dx_node_limit (struct inode *dir); +-static struct dx_frame *dx_probe(struct dentry *dentry, ++static struct dx_frame *dx_probe(struct qstr *name, + struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame, +@@ -166,15 +197,18 @@ static void dx_sort_map(struct dx_map_en + static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, + struct dx_map_entry *offsets, int count); + static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); +-static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); ++static void dx_insert_block (struct inode *, struct dx_frame *, u32, u32, u32); + static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, int *err, + __u32 *start_hash); + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, +- struct ext3_dir_entry_2 **res_dir, int *err); ++ struct ext3_dir_entry_2 **res_dir, int *err, ++ int rwlock, void **lock); + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); ++static inline void *ext3_lock_htree(struct inode *, unsigned long, int); ++static inline void ext3_unlock_htree(struct inode *, void *); + + /* + * Future: use high four bits of block for coalesce-on-delete flags +@@ -307,6 +341,94 @@ struct stats dx_show_entries(struct dx_h + #endif /* DX_DEBUG */ + + /* ++ * dx_find_position ++ * ++ * search position of 
specified hash in index ++ * ++ */ ++ ++struct dx_entry * dx_find_position(struct dx_entry * entries, u32 hash) ++{ ++ struct dx_entry *p, *q, *m; ++ int count; ++ ++ count = dx_get_count(entries); ++ p = entries + 1; ++ q = entries + count - 1; ++ while (p <= q) ++ { ++ m = p + (q - p)/2; ++ if (dx_get_hash(m) > hash) ++ q = m - 1; ++ else ++ p = m + 1; ++ } ++ return p - 1; ++} ++ ++/* ++ * returns 1 if path is unchanged ++ */ ++int dx_check_path(struct dx_frame *frame, u32 hash) ++{ ++ struct dx_entry *p; ++ int ret = 1; ++ ++ dx_lock_bh(frame->bh); ++ p = dx_find_position(frame->entries, hash); ++ if (frame->leaf != dx_get_block(p)) ++ ret = 0; ++ dx_unlock_bh(frame->bh); ++ ++ return ret; ++} ++ ++/* ++ * 0 - changed ++ * 1 - hasn't changed ++ */ ++static int ++dx_check_full_path(struct dx_frame *frames, struct dx_hash_info *hinfo) ++{ ++ struct dx_entry *p; ++ struct dx_frame *frame = frames; ++ u32 leaf; ++ ++ /* check first level */ ++ dx_lock_bh(frame->bh); ++ p = dx_find_position(frame->entries, hinfo->hash); ++ leaf = dx_get_block(p); ++ dx_unlock_bh(frame->bh); ++ ++ if (leaf != frame->leaf) ++ return 0; ++ ++ /* is there 2nd level? */ ++ frame++; ++ if (frame->bh == NULL) ++ return 1; ++ ++ /* check second level */ ++ dx_lock_bh(frame->bh); ++ ++ /* probably 1st level got changed, check it */ ++ if (!dx_check_path(frames, hinfo->hash)) { ++ /* path changed */ ++ dx_unlock_bh(frame->bh); ++ return 0; ++ } ++ ++ p = dx_find_position(frame->entries, hinfo->hash); ++ leaf = dx_get_block(p); ++ dx_unlock_bh(frame->bh); ++ ++ if (leaf != frame->leaf) ++ return 0; ++ ++ return 1; ++} ++ ++/* + * Probe for a directory leaf block to search. + * + * dx_probe can return ERR_BAD_DX_DIR, which means there was a format +@@ -316,19 +438,20 @@ struct stats dx_show_entries(struct dx_h + * back to userspace. 
+ */ + static struct dx_frame * +-dx_probe(struct dentry *dentry, struct inode *dir, ++dx_probe(struct qstr *name, struct inode *dir, + struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) + { +- unsigned count, indirect; +- struct dx_entry *at, *entries, *p, *q, *m; ++ unsigned indirect; ++ struct dx_entry *at, *entries; + struct dx_root *root; + struct buffer_head *bh; + struct dx_frame *frame = frame_in; + u32 hash; ++ unsigned int curidx; + + frame->bh = NULL; +- if (dentry) +- dir = dentry->d_parent->d_inode; ++ frame[1].bh = NULL; ++ + if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) + goto fail; + root = (struct dx_root *) bh->b_data; +@@ -344,8 +467,8 @@ dx_probe(struct dentry *dentry, struct i + } + hinfo->hash_version = root->info.hash_version; + hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed; +- if (dentry) +- ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); ++ if (name) ++ ext3fs_dirhash(name->name, name->len, hinfo); + hash = hinfo->hash; + + if (root->info.unused_flags & 1) { +@@ -357,7 +480,19 @@ dx_probe(struct dentry *dentry, struct i + goto fail; + } + ++repeat: ++ curidx = 0; ++ entries = (struct dx_entry *) (((char *)&root->info) + ++ root->info.info_length); ++ assert(dx_get_limit(entries) == dx_root_limit(dir, ++ root->info.info_length)); ++ dxtrace (printk("Look up %x", hash)); ++ dx_lock_bh(bh); ++ /* indirect must be initialized under bh lock because ++ * 2nd level creation procedure may change it and dx_probe() ++ * will suggest htree is still single-level -bzzz */ + if ((indirect = root->info.indirect_levels) > 1) { ++ dx_unlock_bh(bh); + ext3_warning(dir->i_sb, __FUNCTION__, + "Unimplemented inode hash depth: %#06x", + root->info.indirect_levels); +@@ -365,56 +500,46 @@ dx_probe(struct dentry *dentry, struct i + *err = ERR_BAD_DX_DIR; + goto fail; + } +- +- entries = (struct dx_entry *) (((char *)&root->info) + +- root->info.info_length); +- assert(dx_get_limit(entries) == dx_root_limit(dir, +- 
root->info.info_length)); +- dxtrace (printk("Look up %x", hash)); ++ + while (1) + { +- count = dx_get_count(entries); +- assert (count && count <= dx_get_limit(entries)); +- p = entries + 1; +- q = entries + count - 1; +- while (p <= q) +- { +- m = p + (q - p)/2; +- dxtrace(printk(".")); +- if (dx_get_hash(m) > hash) +- q = m - 1; +- else +- p = m + 1; +- } +- +- if (0) // linear search cross check +- { +- unsigned n = count - 1; +- at = entries; +- while (n--) +- { +- dxtrace(printk(",")); +- if (dx_get_hash(++at) > hash) +- { +- at--; +- break; +- } +- } +- assert (at == p - 1); +- } +- +- at = p - 1; +- dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); ++ at = dx_find_position(entries, hinfo->hash); ++ dxtrace(printk(" %x->%u\n", ++ at == entries? 0: dx_get_hash(at), ++ dx_get_block(at))); + frame->bh = bh; + frame->entries = entries; + frame->at = at; +- if (!indirect--) return frame; +- if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) ++ frame->curidx = curidx; ++ frame->leaf = dx_get_block(at); ++ if (!indirect--) { ++ dx_unlock_bh(bh); ++ return frame; ++ } ++ ++ /* step into next htree level */ ++ curidx = dx_get_block(at); ++ dx_unlock_bh(bh); ++ if (!(bh = ext3_bread (NULL,dir, frame->leaf, 0, err))) + goto fail2; ++ ++ dx_lock_bh(bh); ++ /* splitting may change root index block and move ++ * hash we're looking for into another index block ++ * so, we have to check this situation and repeat ++ * from beginning if path got changed -bzzz */ ++ if (!dx_check_path(frame, hash)) { ++ dx_unlock_bh(bh); ++ bh = frame->bh; ++ indirect++; ++ goto repeat; ++ } ++ + at = entries = ((struct dx_node *) bh->b_data)->entries; + assert (dx_get_limit(entries) == dx_node_limit (dir)); + frame++; + } ++ dx_unlock_bh(bh); + fail2: + while (frame >= frame_in) { + brelse(frame->bh); +@@ -428,8 +553,7 @@ static void dx_release (struct dx_frame + { + if (frames[0].bh == NULL) + return; +- +- if (((struct dx_root *)
frames[0].bh->b_data)->info.indirect_levels) ++ if (frames[1].bh != NULL) + brelse(frames[1].bh); + brelse(frames[0].bh); + } +@@ -471,8 +595,10 @@ static int ext3_htree_next_block(struct + * nodes need to be read. + */ + while (1) { +- if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ if (++(p->at) < p->entries + dx_get_count(p->entries)) { ++ p->leaf = dx_get_block(p->at); + break; ++ } + if (p == frames) + return 0; + num_frames++; +@@ -498,13 +624,17 @@ static int ext3_htree_next_block(struct + * block so no check is necessary + */ + while (num_frames--) { +- if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), +- 0, err))) ++ u32 idx; ++ ++ idx = p->leaf = dx_get_block(p->at); ++ if (!(bh = ext3_bread(NULL, dir, idx, 0, err))) + return -1; /* Failure */ + p++; + brelse (p->bh); + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; ++ p->curidx = idx; ++ p->leaf = dx_get_block(p->at); + } + return 1; + } +@@ -544,7 +674,7 @@ int ext3_htree_fill_tree(struct file *di + dir = dir_file->f_dentry->d_inode; + hinfo.hash = start_hash; + hinfo.minor_hash = 0; +- frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err); ++ frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err); + if (!frame) + return err; + +@@ -626,7 +756,8 @@ static int dx_make_map (struct ext3_dir_ + count++; + } + /* XXX: do we need to check rec_len == 0 case? 
-Chris */ +- de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); ++ de = (struct ext3_dir_entry_2 *)((char*)de + ++ le16_to_cpu(de->rec_len)); + } + return count; + } +@@ -659,7 +790,8 @@ static void dx_sort_map (struct dx_map_e + } while(more); + } + +-static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) ++static void dx_insert_block(struct inode *dir, struct dx_frame *frame, ++ u32 hash, u32 block, u32 idx) + { + struct dx_entry *entries = frame->entries; + struct dx_entry *old = frame->at, *new = old + 1; +@@ -671,6 +803,7 @@ static void dx_insert_block(struct dx_fr + dx_set_hash(new, hash); + dx_set_block(new, block); + dx_set_count(entries, count + 1); ++ + } + #endif + +@@ -753,7 +886,8 @@ static int inline search_dirblock(struct + + + static struct buffer_head * ext3_find_entry (struct dentry *dentry, +- struct ext3_dir_entry_2 ** res_dir) ++ struct ext3_dir_entry_2 ** res_dir, ++ int rwlock, void **lock) + { + struct super_block * sb; + struct buffer_head * bh_use[NAMEI_RA_SIZE]; +@@ -769,6 +903,7 @@ static struct buffer_head * ext3_find_en + int namelen; + const u8 *name; + unsigned blocksize; ++ int do_not_use_dx = 0; + + *res_dir = NULL; + sb = dir->i_sb; +@@ -777,9 +912,10 @@ static struct buffer_head * ext3_find_en + name = dentry->d_name.name; + if (namelen > EXT3_NAME_LEN) + return NULL; ++repeat: + #ifdef CONFIG_EXT3_INDEX + if (is_dx(dir)) { +- bh = ext3_dx_find_entry(dentry, res_dir, &err); ++ bh = ext3_dx_find_entry(dentry, res_dir, &err, rwlock, lock); + /* + * On success, or if the error was file not found, + * return. 
Otherwise, fall back to doing a search the +@@ -788,8 +924,14 @@ static struct buffer_head * ext3_find_en + if (bh || (err != ERR_BAD_DX_DIR)) + return bh; + dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); ++ do_not_use_dx = 1; + } + #endif ++ *lock = ext3_lock_htree(dir, 0, rwlock); ++ if (is_dx(dir) && !do_not_use_dx) { ++ ext3_unlock_htree(dir, *lock); ++ goto repeat; ++ } + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); + start = EXT3_I(dir)->i_dir_start_lookup; + if (start >= nblocks) +@@ -861,12 +1003,17 @@ cleanup_and_exit: + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse (bh_use[ra_ptr]); ++ if (!ret) { ++ ext3_unlock_htree(dir, *lock); ++ *lock = NULL; ++ } + return ret; + } + + #ifdef CONFIG_EXT3_INDEX + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, +- struct ext3_dir_entry_2 **res_dir, int *err) ++ struct ext3_dir_entry_2 **res_dir, int *err, ++ int rwlock, void **lock) + { + struct super_block * sb; + struct dx_hash_info hinfo; +@@ -881,11 +1028,22 @@ static struct buffer_head * ext3_dx_find + struct inode *dir = dentry->d_parent->d_inode; + + sb = dir->i_sb; +- if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err))) ++repeat: ++ if (!(frame = dx_probe (&dentry->d_name, dir, &hinfo, frames, err))) + return NULL; ++ ++ *lock = ext3_lock_htree(dir, frame->leaf, rwlock); ++ /* while locking leaf we just found may get split ++ * so, we need another leaf.
check this */ ++ if (!dx_check_full_path(frames, &hinfo)) { ++ ext3_unlock_htree(dir, *lock); ++ dx_release(frames); ++ goto repeat; ++ } ++ + hash = hinfo.hash; + do { +- block = dx_get_block(frame->at); ++ block = frame->leaf; + if (!(bh = ext3_bread (NULL,dir, block, 0, err))) + goto errout; + de = (struct ext3_dir_entry_2 *) bh->b_data; +@@ -919,6 +1077,8 @@ static struct buffer_head * ext3_dx_find + *err = -ENOENT; + errout: + dxtrace(printk("%s not found\n", name)); ++ ext3_unlock_htree(dir, *lock); ++ *lock = NULL; + dx_release (frames); + return NULL; + } +@@ -931,6 +1091,7 @@ static struct dentry *ext3_lookup(struct + struct ext3_dir_entry_2 * de; + struct buffer_head * bh; + struct dentry *alternate = NULL; ++ void *lock = NULL; + + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); +@@ -938,10 +1099,11 @@ static struct dentry *ext3_lookup(struct + if (ext3_check_for_iopen(dir, dentry)) + return NULL; + +- bh = ext3_find_entry(dentry, &de); ++ bh = ext3_find_entry(dentry, &de, 0, &lock); + inode = NULL; + if (bh) { + unsigned long ino = le32_to_cpu(de->inode); ++ ext3_unlock_htree(dir, lock); + brelse (bh); + inode = iget(dir->i_sb, ino); + +@@ -984,7 +1146,8 @@ dx_move_dirents(char *from, char *to, st + unsigned rec_len = 0; + + while (count--) { +- struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); ++ struct ext3_dir_entry_2 *de = ++ (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); + ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len; +@@ -997,7 +1160,8 @@ dx_move_dirents(char *from, char *to, st + + static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) + { +- struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; ++ struct ext3_dir_entry_2 *next, *to, *prev; ++ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) base; + unsigned rec_len = 0; + + prev = to = de; +@@ -1019,7 +1183,8 @@ 
static struct ext3_dir_entry_2* dx_pack_ + + static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + struct buffer_head **bh,struct dx_frame *frame, +- struct dx_hash_info *hinfo, int *error) ++ struct dx_hash_info *hinfo, void **target, ++ int *error) + { + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; +@@ -1066,23 +1231,30 @@ static struct ext3_dir_entry_2 *do_split + hash2 = map[split].hash; + continued = hash2 == map[split - 1].hash; + dxtrace(printk("Split block %i at %x, %i/%i\n", +- dx_get_block(frame->at), hash2, split, count-split)); +- ++ frame->leaf, hash2, split, count-split)); ++ + /* Fancy dance to stay within two buffers */ + de2 = dx_move_dirents(data1, data2, map + split, count - split); + de = dx_pack_dirents(data1,blocksize); + de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); + de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); +- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); +- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); ++ dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data1, blocksize, 1)); ++ dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data2, blocksize, 1)); + + /* Which block gets the new entry? 
*/ ++ *target = NULL; + if (hinfo->hash >= hash2) + { + swap(*bh, bh2); + de = de2; +- } +- dx_insert_block (frame, hash2 + continued, newblock); ++ ++ /* entry will be stored into new block ++ * we have to lock it before add_dirent_to_buf */ ++ *target = ext3_lock_htree(dir, newblock, 1); ++ } ++ dx_lock_bh(frame->bh); ++ dx_insert_block (dir, frame, hash2 + continued, newblock, frame->curidx); ++ dx_unlock_bh(frame->bh); + err = ext3_journal_dirty_metadata (handle, bh2); + if (err) + goto journal_error; +@@ -1156,7 +1328,8 @@ static int add_dirent_to_buf(handle_t *h + nlen = EXT3_DIR_REC_LEN(de->name_len); + rlen = le16_to_cpu(de->rec_len); + if (de->inode) { +- struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); ++ struct ext3_dir_entry_2 *de1 = ++ (struct ext3_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = cpu_to_le16(rlen - nlen); + de->rec_len = cpu_to_le16(nlen); + de = de1; +@@ -1214,7 +1387,8 @@ static int make_indexed_dir(handle_t *ha + unsigned blocksize; + struct dx_hash_info hinfo; + u32 block; +- ++ void *lock, *new_lock; ++ + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk("Creating index\n")); + retval = ext3_journal_get_write_access(handle, bh); +@@ -1225,7 +1399,6 @@ static int make_indexed_dir(handle_t *ha + } + root = (struct dx_root *) bh->b_data; + +- EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; + bh2 = ext3_append (handle, dir, &block, &retval); + if (!(bh2)) { + brelse(bh); +@@ -1233,6 +1406,8 @@ static int make_indexed_dir(handle_t *ha + } + data1 = bh2->b_data; + ++ lock = ext3_lock_htree(dir, block, 1); ++ + /* The 0th block becomes the root, move the dirents out */ + de = (struct ext3_dir_entry_2 *) &root->info; + len = ((char *) root) + blocksize - (char *) de; +@@ -1261,13 +1436,25 @@ static int make_indexed_dir(handle_t *ha + frame->entries = entries; + frame->at = entries; + frame->bh = bh; ++ frame->curidx = 0; ++ frame->leaf = 0; ++ frame[1].bh = NULL; + bh = bh2; +- de = do_split(handle,dir, &bh, 
frame, &hinfo, &retval); ++ de = do_split(handle,dir, &bh, frame, &hinfo, &new_lock, &retval); + dx_release (frames); + if (!(de)) +- return retval; ++ goto cleanup; ++ ++ retval = add_dirent_to_buf(handle, dentry, inode, de, bh); ++cleanup: ++ if (new_lock) ++ ext3_unlock_htree(dir, new_lock); ++ /* we mark directory indexed in order to ++ * avoid races while htree being created -bzzz */ ++ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; ++ ext3_unlock_htree(dir, lock); + +- return add_dirent_to_buf(handle, dentry, inode, de, bh); ++ return retval; + } + #endif + +@@ -1296,11 +1483,13 @@ static int ext3_add_entry (handle_t *han + unsigned blocksize; + unsigned nlen, rlen; + u32 block, blocks; ++ void *lock; + + sb = dir->i_sb; + blocksize = sb->s_blocksize; + if (!dentry->d_name.len) + return -EINVAL; ++repeat: + #ifdef CONFIG_EXT3_INDEX + if (is_dx(dir)) { + retval = ext3_dx_add_entry(handle, dentry, inode); +@@ -1311,36 +1500,53 @@ static int ext3_add_entry (handle_t *han + ext3_mark_inode_dirty(handle, dir); + } + #endif ++ lock = ext3_lock_htree(dir, 0, 1); ++ if (is_dx(dir)) { ++ /* we got lock for block 0 ++ * probably previous holder of the lock ++ * created htree -bzzz */ ++ ext3_unlock_htree(dir, lock); ++ goto repeat; ++ } ++ + blocks = dir->i_size >> sb->s_blocksize_bits; + for (block = 0, offset = 0; block < blocks; block++) { + bh = ext3_bread(handle, dir, block, 0, &retval); +- if(!bh) ++ if(!bh) { ++ ext3_unlock_htree(dir, lock); + return retval; ++ } + retval = add_dirent_to_buf(handle, dentry, inode, 0, bh); +- if (retval != -ENOSPC) ++ if (retval != -ENOSPC) { ++ ext3_unlock_htree(dir, lock); + return retval; ++ } + + #ifdef CONFIG_EXT3_INDEX + if (blocks == 1 && !dx_fallback && +- EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) +- return make_indexed_dir(handle, dentry, inode, bh); ++ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) { ++ retval = make_indexed_dir(handle, dentry, inode, bh); ++ ext3_unlock_htree(dir, lock); ++ 
return retval; ++ } + #endif + brelse(bh); + } + bh = ext3_append(handle, dir, &block, &retval); +- if (!bh) ++ if (!bh) { ++ ext3_unlock_htree(dir, lock); + return retval; ++ } + de = (struct ext3_dir_entry_2 *) bh->b_data; + de->inode = 0; + de->rec_len = cpu_to_le16(rlen = blocksize); + nlen = 0; +- return add_dirent_to_buf(handle, dentry, inode, de, bh); ++ retval = add_dirent_to_buf(handle, dentry, inode, de, bh); ++ ext3_unlock_htree(dir, lock); ++ return retval; + } + + #ifdef CONFIG_EXT3_INDEX +-/* +- * Returns 0 for success, or a negative error value +- */ + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) + { +@@ -1352,15 +1558,28 @@ static int ext3_dx_add_entry(handle_t *h + struct super_block * sb = dir->i_sb; + struct ext3_dir_entry_2 *de; + int err; ++ int curidx; ++ void *idx_lock, *leaf_lock, *newleaf_lock; + +- frame = dx_probe(dentry, 0, &hinfo, frames, &err); ++repeat: ++ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); + if (!frame) + return err; +- entries = frame->entries; +- at = frame->at; + +- if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) ++ /* we're going to change leaf, so lock it first */ ++ leaf_lock = ext3_lock_htree(dir, frame->leaf, 1); ++ ++ /* while locking leaf we just found may get split ++ * so we need to check this */ ++ if (!dx_check_full_path(frames, &hinfo)) { ++ ext3_unlock_htree(dir, leaf_lock); ++ dx_release(frames); ++ goto repeat; ++ } ++ if (!(bh = ext3_bread(handle,dir, frame->leaf, 0, &err))) { ++ printk("can't ext3_bread(%d) = %d\n", (int) frame->leaf, err); + goto cleanup; ++ } + + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); +@@ -1373,6 +1592,35 @@ static int ext3_dx_add_entry(handle_t *h + goto cleanup; + } + ++ /* our leaf does not have enough space. hence, we have to ++ * split it. 
so lock index for this leaf first */ ++ curidx = frame->curidx; ++ idx_lock = ext3_lock_htree(dir, curidx, 1); ++ ++ /* now check did path get changed? */ ++ dx_release(frames); ++ ++ frame = dx_probe(&dentry->d_name, dentry->d_parent->d_inode, ++ &hinfo, frames, &err); ++ if (!frame) { ++ /* FIXME: error handling here */ ++ brelse(bh); ++ ext3_unlock_htree(dir, idx_lock); ++ return err; ++ } ++ ++ if (frame->curidx != curidx) { ++ /* path has been changed. we have to drop old lock ++ * and repeat */ ++ brelse(bh); ++ ext3_unlock_htree(dir, idx_lock); ++ ext3_unlock_htree(dir, leaf_lock); ++ dx_release(frames); ++ goto repeat; ++ } ++ entries = frame->entries; ++ at = frame->at; ++ + /* Block full, should compress but for now just split */ + dxtrace(printk("using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); +@@ -1384,7 +1632,8 @@ static int ext3_dx_add_entry(handle_t *h + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; +- ++ void *nb_lock; ++ + if (levels && (dx_get_count(frames->entries) == + dx_get_limit(frames->entries))) { + ext3_warning(sb, __FUNCTION__, +@@ -1395,6 +1644,7 @@ static int ext3_dx_add_entry(handle_t *h + bh2 = ext3_append (handle, dir, &newblock, &err); + if (!(bh2)) + goto cleanup; ++ nb_lock = ext3_lock_htree(dir, newblock, 1); + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; + node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); +@@ -1406,27 +1656,73 @@ static int ext3_dx_add_entry(handle_t *h + if (levels) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); ++ void *ri_lock; ++ ++ /* we have to protect root htree index against ++ * another dx_add_entry() which would want to ++ * split it too -bzzz */ ++ ri_lock = ext3_lock_htree(dir, 0, 1); ++ ++ /* as root index block blocked we must repeat ++ * searching for current position of our 2nd index -bzzz */ ++ dx_lock_bh(frame->bh); ++ frames->at = 
dx_find_position(frames->entries, hinfo.hash); ++ dx_unlock_bh(frame->bh); ++ + dxtrace(printk("Split index %i/%i\n", icount1, icount2)); +- +- BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ ++ ++ BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, + frames[0].bh); + if (err) + goto journal_error; +- ++ ++ /* copy index into new one */ + memcpy ((char *) entries2, (char *) (entries + icount1), + icount2 * sizeof(struct dx_entry)); +- dx_set_count (entries, icount1); + dx_set_count (entries2, icount2); + dx_set_limit (entries2, dx_node_limit(dir)); + + /* Which index block gets the new entry? */ + if (at - entries >= icount1) { ++ /* unlock index we won't use */ ++ ext3_unlock_htree(dir, idx_lock); ++ idx_lock = nb_lock; + frame->at = at = at - entries - icount1 + entries2; +- frame->entries = entries = entries2; ++ frame->entries = entries2; ++ frame->curidx = curidx = newblock; + swap(frame->bh, bh2); ++ } else { ++ /* we'll use old index,so new one may be freed */ ++ ext3_unlock_htree(dir, nb_lock); + } +- dx_insert_block (frames + 0, hash2, newblock); ++ ++ /* NOTE: very subtle piece of code ++ * competing dx_probe() may find 2nd level index in root ++ * index, then we insert new index here and set new count ++ * in that 2nd level index. so, dx_probe() may see 2nd ++ * level index w/o hash it looks for. the solution is ++ * to check root index after we locked just founded 2nd ++ * level index -bzzz */ ++ dx_lock_bh(frames[0].bh); ++ dx_insert_block (dir, frames + 0, hash2, newblock, 0); ++ dx_unlock_bh(frames[0].bh); ++ ++ /* now old and new 2nd level index blocks contain ++ * all pointers, so dx_probe() may find it in the both. ++ * it's OK -bzzz */ ++ ++ dx_lock_bh(frame->bh); ++ dx_set_count(entries, icount1); ++ dx_unlock_bh(frame->bh); ++ ++ /* now old 2nd level index block points to first half ++ * of leafs. 
it's important that dx_probe() must ++ * check root index block for changes under ++ * dx_lock_bh(frame->bh) -bzzz */ ++ ++ ext3_unlock_htree(dir, ri_lock); ++ + dxtrace(dx_show_index ("node", frames[1].entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); +@@ -1435,38 +1731,61 @@ static int ext3_dx_add_entry(handle_t *h + goto journal_error; + brelse (bh2); + } else { ++ unsigned long leaf = frame->leaf; ++ + dxtrace(printk("Creating second level index...\n")); + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Set up root */ ++ dx_lock_bh(frames[0].bh); + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); + ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; ++ dx_unlock_bh(frames[0].bh); + + /* Add new access path frame */ + frame = frames + 1; + frame->at = at = at - entries + entries2; + frame->entries = entries = entries2; + frame->bh = bh2; ++ frame->curidx = newblock; ++ frame->leaf = leaf; + err = ext3_journal_get_write_access(handle, + frame->bh); + if (err) + goto journal_error; ++ ++ /* first level index was root. 
it's already initialized */ ++ /* we may unlock it now */ ++ ext3_unlock_htree(dir, idx_lock); ++ ++ /* current index is just created 2nd level index */ ++ curidx = newblock; ++ idx_lock = nb_lock; + } + ext3_journal_dirty_metadata(handle, frames[0].bh); + } +- de = do_split(handle, dir, &bh, frame, &hinfo, &err); ++ de = do_split(handle, dir, &bh, frame, &hinfo, &newleaf_lock, &err); + if (!de) + goto cleanup; ++ ++ /* index split */ ++ ext3_unlock_htree(dir, idx_lock); ++ + err = add_dirent_to_buf(handle, dentry, inode, de, bh); ++ ++ if (newleaf_lock) ++ ext3_unlock_htree(dir, newleaf_lock); ++ + bh = 0; + goto cleanup; + + journal_error: + ext3_std_error(dir->i_sb, err); + cleanup: ++ ext3_unlock_htree(dir, leaf_lock); + if (bh) + brelse(bh); + dx_release(frames); +@@ -1899,6 +2218,7 @@ static int ext3_rmdir (struct inode * di + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; ++ void *lock; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); + if (IS_ERR(handle)) { +@@ -1906,7 +2226,7 @@ static int ext3_rmdir (struct inode * di + } + + retval = -ENOENT; +- bh = ext3_find_entry (dentry, &de); ++ bh = ext3_find_entry (dentry, &de, 1, &lock); + if (!bh) + goto end_rmdir; + +@@ -1917,14 +2237,19 @@ static int ext3_rmdir (struct inode * di + DQUOT_INIT(inode); + + retval = -EIO; +- if (le32_to_cpu(de->inode) != inode->i_ino) ++ if (le32_to_cpu(de->inode) != inode->i_ino) { ++ ext3_unlock_htree(dir, lock); + goto end_rmdir; ++ } + + retval = -ENOTEMPTY; +- if (!empty_dir (inode)) ++ if (!empty_dir (inode)) { ++ ext3_unlock_htree(dir, lock); + goto end_rmdir; ++ } + + retval = ext3_delete_entry(handle, dir, de, bh); ++ ext3_unlock_htree(dir, lock); + if (retval) + goto end_rmdir; + if (inode->i_nlink != 2) +@@ -1957,6 +2282,7 @@ static int ext3_unlink(struct inode * di + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; ++ void *lock; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); + 
if (IS_ERR(handle)) { +@@ -1967,7 +2293,7 @@ static int ext3_unlink(struct inode * di + handle->h_sync = 1; + + retval = -ENOENT; +- bh = ext3_find_entry (dentry, &de); ++ bh = ext3_find_entry (dentry, &de, 1, &lock); + if (!bh) + goto end_unlink; + +@@ -1975,8 +2301,10 @@ static int ext3_unlink(struct inode * di + DQUOT_INIT(inode); + + retval = -EIO; +- if (le32_to_cpu(de->inode) != inode->i_ino) ++ if (le32_to_cpu(de->inode) != inode->i_ino) { ++ ext3_unlock_htree(dir, lock); + goto end_unlink; ++ } + + if (!inode->i_nlink) { + ext3_warning (inode->i_sb, "ext3_unlink", +@@ -1985,6 +2313,7 @@ static int ext3_unlink(struct inode * di + inode->i_nlink = 1; + } + retval = ext3_delete_entry(handle, dir, de, bh); ++ ext3_unlock_htree(dir, lock); + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; +@@ -2106,6 +2435,7 @@ static int ext3_rename (struct inode * o + struct buffer_head * old_bh, * new_bh, * dir_bh; + struct ext3_dir_entry_2 * old_de, * new_de; + int retval; ++ void *lock1 = NULL, *lock2 = NULL, *lock3 = NULL; + + old_bh = new_bh = dir_bh = NULL; + +@@ -2118,7 +2448,10 @@ static int ext3_rename (struct inode * o + if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) + handle->h_sync = 1; + +- old_bh = ext3_find_entry (old_dentry, &old_de); ++ if (old_dentry->d_parent == new_dentry->d_parent) ++ down(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem); ++ ++ old_bh = ext3_find_entry (old_dentry, &old_de, 1, &lock1 /* FIXME */); + /* + * Check for inode number is _not_ due to possible IO errors. 
+ * We might rmdir the source, keep it as pwd of some process +@@ -2131,7 +2464,7 @@ static int ext3_rename (struct inode * o + goto end_rename; + + new_inode = new_dentry->d_inode; +- new_bh = ext3_find_entry (new_dentry, &new_de); ++ new_bh = ext3_find_entry (new_dentry, &new_de, 1, &lock2 /* FIXME */); + if (new_bh) { + if (!new_inode) { + brelse (new_bh); +@@ -2194,7 +2527,7 @@ static int ext3_rename (struct inode * o + struct buffer_head *old_bh2; + struct ext3_dir_entry_2 *old_de2; + +- old_bh2 = ext3_find_entry(old_dentry, &old_de2); ++ old_bh2 = ext3_find_entry(old_dentry, &old_de2, 1, &lock3 /* FIXME */); + if (old_bh2) { + retval = ext3_delete_entry(handle, old_dir, + old_de2, old_bh2); +@@ -2237,6 +2570,14 @@ static int ext3_rename (struct inode * o + retval = 0; + + end_rename: ++ if (lock1) ++ ext3_unlock_htree(old_dentry->d_parent->d_inode, lock1); ++ if (lock2) ++ ext3_unlock_htree(new_dentry->d_parent->d_inode, lock2); ++ if (lock3) ++ ext3_unlock_htree(old_dentry->d_parent->d_inode, lock3); ++ if (old_dentry->d_parent == new_dentry->d_parent) ++ up(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem); + brelse (dir_bh); + brelse (old_bh); + brelse (new_bh); +@@ -2245,6 +2586,29 @@ end_rename: + } + + /* ++ * this locking primitives are used to protect parts ++ * of dir's htree. protection unit is block: leaf or index ++ */ ++static inline void *ext3_lock_htree(struct inode *dir, ++ unsigned long value, int rwlock) ++{ ++ void *lock; ++ ++ if (!test_opt(dir->i_sb, PDIROPS)) ++ return NULL; ++ lock = dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, 1, GFP_KERNEL); ++ return lock; ++} ++ ++static inline void ext3_unlock_htree(struct inode *dir, ++ void *lock) ++{ ++ if (!test_opt(dir->i_sb, PDIROPS) || !lock) ++ return; ++ dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lock); ++} ++ ++/* + * directories can handle most operations... 
+ */ + struct inode_operations ext3_dir_inode_operations = { +--- linux-2.4.18/fs/ext3/super.c~ext3-pdirops-2.4.18-chaos 2003-09-01 16:33:25.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/super.c 2003-09-01 16:36:16.000000000 +0400 +@@ -786,6 +786,8 @@ static int parse_options (char * options + return 0; + } + } ++ else if (!strcmp (this_char, "pdirops")) ++ set_opt (sbi->s_mount_opt, PDIROPS); + else if (!strcmp (this_char, "grpid") || + !strcmp (this_char, "bsdgroups")) + set_opt (*mount_options, GRPID); +@@ -969,6 +971,10 @@ static int ext3_setup_super(struct super + ext3_check_inodes_bitmap (sb); + } + #endif ++#ifdef S_PDIROPS ++ if (test_opt (sb, PDIROPS)) ++ sb->s_flags |= S_PDIROPS; ++#endif + setup_ro_after(sb); + return res; + } +--- linux-2.4.18/include/linux/ext3_fs.h~ext3-pdirops-2.4.18-chaos 2003-09-01 14:58:06.000000000 +0400 ++++ linux-2.4.18-alexey/include/linux/ext3_fs.h 2003-09-01 16:36:16.000000000 +0400 +@@ -310,6 +310,7 @@ struct ext3_inode { + /* + * Mount flags + */ ++#define EXT3_MOUNT_PDIROPS 0x800000/* Parallel dir operations */ + #define EXT3_MOUNT_CHECK 0x0001 /* Do mount-time checks */ + #define EXT3_MOUNT_GRPID 0x0004 /* Create files with directory's group */ + #define EXT3_MOUNT_DEBUG 0x0008 /* Some debugging messages */ +--- linux-2.4.18/include/linux/ext3_fs_i.h~ext3-pdirops-2.4.18-chaos 2003-08-29 11:57:30.000000000 +0400 ++++ linux-2.4.18-alexey/include/linux/ext3_fs_i.h 2003-09-01 16:36:16.000000000 +0400 +@@ -17,6 +17,7 @@ + #define _LINUX_EXT3_FS_I + + #include ++#include + + /* + * second extended file system inode data in memory +@@ -73,6 +74,11 @@ struct ext3_inode_info { + * by other means, so we have truncate_sem. 
+ */ + struct rw_semaphore truncate_sem; ++ ++ /* following fields for parallel directory operations -bzzz */ ++ struct dynlock i_htree_lock; ++ struct semaphore i_append_sem; ++ struct semaphore i_rename_sem; + }; + + #endif /* _LINUX_EXT3_FS_I */ +--- linux-2.4.18/fs/ext3/inode.c~ext3-pdirops-2.4.18-chaos 2003-09-01 16:33:25.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/inode.c 2003-09-01 17:53:33.000000000 +0400 +@@ -2454,6 +2454,8 @@ void ext3_read_inode(struct inode * inod + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; ++ dynlock_init(&EXT3_I(inode)->i_htree_lock); ++ sema_init(&EXT3_I(inode)->i_rename_sem, 1); + } else if (S_ISLNK(inode->i_mode)) { + if (ext3_inode_is_fast_symlink(inode)) + inode->i_op = &ext3_fast_symlink_inode_operations; +--- linux-2.4.18/fs/ext3/ialloc.c~ext3-pdirops-2.4.18-chaos 2003-09-01 14:58:05.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/ialloc.c 2003-09-01 17:55:45.000000000 +0400 +@@ -601,6 +601,8 @@ repeat: + return ERR_PTR(-EDQUOT); + } + ext3_debug ("allocating inode %lu\n", inode->i_ino); ++ dynlock_init(&EXT3_I(inode)->i_htree_lock); ++ sema_init(&EXT3_I(inode)->i_rename_sem, 1); + return inode; + + fail: + +_ diff --git a/lustre/kernel_patches/patches/iopen-2.4.18-2.patch b/lustre/kernel_patches/patches/iopen-2.4.18-2.patch new file mode 100644 index 0000000..3d9a864 --- /dev/null +++ b/lustre/kernel_patches/patches/iopen-2.4.18-2.patch @@ -0,0 +1,422 @@ + Documentation/filesystems/ext2.txt | 16 ++ + fs/ext3/Makefile | 2 + fs/ext3/inode.c | 4 + fs/ext3/iopen.c | 259 +++++++++++++++++++++++++++++++++++++ + fs/ext3/iopen.h | 13 + + fs/ext3/namei.c | 12 + + fs/ext3/super.c | 11 + + include/linux/ext3_fs.h | 2 + 8 files changed, 318 insertions(+), 1 deletion(-) + +--- linux-2.4.18-p4smp/Documentation/filesystems/ext2.txt~iopen-2.4.18 2003-07-09 12:17:30.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/Documentation/filesystems/ext2.txt 2003-07-09 
17:13:02.000000000 -0600 +@@ -35,6 +35,22 @@ resgid=n The group ID which may use th + + sb=n Use alternate superblock at this location. + ++iopen Makes an invisible pseudo-directory called ++ __iopen__ available in the root directory ++ of the filesystem. Allows open-by-inode- ++ number. i.e., inode 3145 can be accessed ++ via /mntpt/__iopen__/3145 ++ ++iopen_nopriv This option makes the iopen directory be ++ world-readable. This may be safer since it ++ allows daemons to run as an unprivileged user, ++ however it significantly changes the security ++ model of a Unix filesystem, since previously ++ all files under a mode 700 directory were not ++ generally available even if the ++ permissions on the file itself are ++ world-readable. ++ + grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. + + +--- linux-2.4.18-p4smp/fs/ext3/Makefile~iopen-2.4.18 2003-07-09 17:12:12.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/fs/ext3/Makefile 2003-07-09 17:13:15.000000000 -0600 +@@ -11,7 +11,7 @@ O_TARGET := ext3.o + + export-objs := super.o inode.o xattr.o ext3-exports.o + +-obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ++obj-y := balloc.o iopen.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o xattr.o hash.o ext3-exports.o + obj-m := $(O_TARGET) + +--- linux-2.4.18-p4smp/fs/ext3/inode.c~iopen-2.4.18 2003-07-09 17:11:19.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/fs/ext3/inode.c 2003-07-09 17:13:02.000000000 -0600 +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include "iopen.h" + + /* + * SEARCH_FROM_ZERO forces each block allocation to search from the start +@@ -2165,6 +2166,9 @@ void ext3_read_inode(struct inode * inod + struct buffer_head *bh; + int block; + ++ if (ext3_iopen_get_inode(inode)) ++ return; ++ + if(ext3_get_inode_loc(inode, &iloc)) + goto bad_inode; + bh = iloc.bh; +--- /dev/null 2003-01-30 03:24:37.000000000 -0700 ++++ linux-2.4.18-p4smp-braam/fs/ext3/iopen.c 2003-07-09 
17:13:02.000000000 -0600 +@@ -0,0 +1,259 @@ ++/* ++ * linux/fs/ext3/iopen.c ++ * ++ * Special support for open by inode number ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ * ++ * ++ * Invariants: ++ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias ++ * for an inode at one time. ++ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry ++ * aliases on an inode at the same time. ++ * ++ * If we have any connected dentry aliases for an inode, use one of those ++ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED ++ * dentry for this inode, which thereafter will be found by the dcache ++ * when looking up this inode number in __iopen__, so we don't return here ++ * until it is gone. ++ * ++ * If we get an inode via a regular name lookup, then we "rename" the ++ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures ++ * existing users of the disconnected dentry will continue to use the same ++ * dentry as the connected users, and there will never be both kinds of ++ * dentry aliases at one time. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "iopen.h" ++ ++#ifndef assert ++#define assert(test) J_ASSERT(test) ++#endif ++ ++#define IOPEN_NAME_LEN 32 ++ ++/* ++ * This implements looking up an inode by number. 
++ */ ++static struct dentry *iopen_lookup(struct inode *dir, struct dentry *dentry) ++{ ++ struct inode *inode; ++ unsigned long ino; ++ struct list_head *lp; ++ struct dentry *alternate; ++ char buf[IOPEN_NAME_LEN]; ++ ++ if (dentry->d_name.len >= IOPEN_NAME_LEN) ++ return ERR_PTR(-ENAMETOOLONG); ++ ++ memcpy(buf, dentry->d_name.name, dentry->d_name.len); ++ buf[dentry->d_name.len] = 0; ++ ++ if (strcmp(buf, ".") == 0) ++ ino = dir->i_ino; ++ else if (strcmp(buf, "..") == 0) ++ ino = EXT3_ROOT_INO; ++ else ++ ino = simple_strtoul(buf, 0, 0); ++ ++ if ((ino != EXT3_ROOT_INO && ++ //ino != EXT3_ACL_IDX_INO && ++ //ino != EXT3_ACL_DATA_INO && ++ ino < EXT3_FIRST_INO(dir->i_sb)) || ++ ino > le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count)) ++ return ERR_PTR(-ENOENT); ++ ++ inode = iget(dir->i_sb, ino); ++ if (!inode) ++ return ERR_PTR(-EACCES); ++ if (is_bad_inode(inode)) { ++ iput(inode); ++ return ERR_PTR(-ENOENT); ++ } ++ ++ /* preferrably return a connected dentry */ ++ spin_lock(&dcache_lock); ++ list_for_each(lp, &inode->i_dentry) { ++ alternate = list_entry(lp, struct dentry, d_alias); ++ assert(!(alternate->d_flags & DCACHE_NFSD_DISCONNECTED)); ++ } ++ ++ if (!list_empty(&inode->i_dentry)) { ++ alternate = list_entry(inode->i_dentry.next, ++ struct dentry, d_alias); ++ dget_locked(alternate); ++ alternate->d_vfs_flags |= DCACHE_REFERENCED; ++ iput(inode); ++ spin_unlock(&dcache_lock); ++ return alternate; ++ } ++ dentry->d_flags |= DCACHE_NFSD_DISCONNECTED; ++ spin_unlock(&dcache_lock); ++ ++ d_add(dentry, inode); ++ return NULL; ++} ++ ++#define do_switch(x,y) do { \ ++ __typeof__ (x) __tmp = x; \ ++ x = y; y = __tmp; } while (0) ++ ++static inline void switch_names(struct dentry *dentry, struct dentry *target) ++{ ++ const unsigned char *old_name, *new_name; ++ ++ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN); ++ old_name = target->d_name.name; ++ new_name = dentry->d_name.name; ++ if (old_name == target->d_iname) ++ old_name = 
dentry->d_iname; ++ if (new_name == dentry->d_iname) ++ new_name = target->d_iname; ++ target->d_name.name = new_name; ++ dentry->d_name.name = old_name; ++} ++ ++/* This function is spliced into ext3_lookup and does the move of a ++ * disconnected dentry (if it exists) to a connected dentry. ++ */ ++struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode) ++{ ++ struct dentry *tmp, *goal = NULL; ++ struct list_head *lp; ++ ++ /* preferrably return a connected dentry */ ++ spin_lock(&dcache_lock); ++ /* verify this dentry is really new */ ++ assert(!de->d_inode); ++ assert(list_empty(&de->d_subdirs)); ++ assert(list_empty(&de->d_alias)); ++ ++ ++ list_for_each(lp, &inode->i_dentry) { ++ tmp = list_entry(lp, struct dentry, d_alias); ++ if (tmp->d_flags & DCACHE_NFSD_DISCONNECTED) { ++ assert(tmp->d_alias.next == &inode->i_dentry); ++ assert(tmp->d_alias.prev == &inode->i_dentry); ++ goal = tmp; ++ dget_locked(goal); ++ break; ++ } ++ } ++ ++ if (!goal) { ++ spin_unlock(&dcache_lock); ++ return NULL; ++ } ++ ++ /* Move the goal to the de hash queue - like d_move() */ ++ goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED; ++ list_del(&goal->d_hash); ++ list_add(&goal->d_hash, &de->d_hash); ++ ++ list_del(&goal->d_child); ++ list_del(&de->d_child); ++ ++ /* Switch the parents and the names.. */ ++ switch_names(goal, de); ++ do_switch(goal->d_parent, de->d_parent); ++ do_switch(goal->d_name.len, de->d_name.len); ++ do_switch(goal->d_name.hash, de->d_name.hash); ++ ++ /* And add them back to the (new) parent lists */ ++ list_add(&goal->d_child, &goal->d_parent->d_subdirs); ++ list_add(&de->d_child, &de->d_parent->d_subdirs); ++ spin_unlock(&dcache_lock); ++ ++ return goal; ++} ++ ++/* ++ * These are the special structures for the iopen pseudo directory. 
++ */ ++ ++static struct inode_operations iopen_inode_operations = { ++ lookup: iopen_lookup, /* BKL held */ ++}; ++ ++static struct file_operations iopen_file_operations = { ++ read: generic_read_dir, ++}; ++ ++static int match_dentry(struct dentry *dentry, const char *name) ++{ ++ int len; ++ ++ len = strlen(name); ++ if (dentry->d_name.len != len) ++ return 0; ++ if (strncmp(dentry->d_name.name, name, len)) ++ return 0; ++ return 1; ++} ++ ++/* ++ * This function is spliced into ext3_lookup and returns 1 if the file ++ * name is __iopen__ and dentry has been filled in appropriately. ++ */ ++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry) ++{ ++ struct inode *inode; ++ ++ if (dir->i_ino != EXT3_ROOT_INO || ++ !test_opt(dir->i_sb, IOPEN) || ++ !match_dentry(dentry, "__iopen__")) ++ return 0; ++ ++ inode = iget(dir->i_sb, EXT3_BAD_INO); ++ ++ if (!inode) ++ return 0; ++ d_add(dentry, inode); ++ return 1; ++} ++ ++/* ++ * This function is spliced into read_inode; it returns 1 if inode ++ * number is the one for /__iopen__, in which case the inode is filled ++ * in appropriately. Otherwise, this function returns 0. 
++ */ ++int ext3_iopen_get_inode(struct inode *inode) ++{ ++ if (inode->i_ino != EXT3_BAD_INO) ++ return 0; ++ ++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; ++ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) ++ inode->i_mode |= 0777; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 1; ++ inode->i_size = 4096; ++ inode->i_atime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME; ++ inode->i_mtime = CURRENT_TIME; ++ inode->u.ext3_i.i_dtime = 0; ++ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size ++ * (for stat), not the fs block ++ * size */ ++ inode->i_blocks = 0; ++ inode->i_version = 1; ++ inode->i_generation = 0; ++ ++ inode->i_op = &iopen_inode_operations; ++ inode->i_fop = &iopen_file_operations; ++ inode->i_mapping->a_ops = 0; ++ ++ return 1; ++} +--- /dev/null 2003-01-30 03:24:37.000000000 -0700 ++++ linux-2.4.18-p4smp-braam/fs/ext3/iopen.h 2003-07-09 17:13:02.000000000 -0600 +@@ -0,0 +1,13 @@ ++/* ++ * iopen.h ++ * ++ * Special support for opening files by inode number. ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ */ ++ ++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); ++extern int ext3_iopen_get_inode(struct inode *inode); +--- linux-2.4.18-p4smp/fs/ext3/namei.c~iopen-2.4.18 2003-07-09 13:32:38.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/fs/ext3/namei.c 2003-07-09 17:13:02.000000000 -0600 +@@ -34,6 +34,7 @@ + #include + #include + #include ++#include "iopen.h" + + /* + * define how far ahead to read directories while searching them. 
+@@ -703,16 +704,21 @@ cleanup_and_exit: + return NULL; + } + #endif ++struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode); + + static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry) + { + struct inode * inode; + struct ext3_dir_entry_2 * de; + struct buffer_head * bh; ++ struct dentry *alternate = NULL; + + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + ++ if (ext3_check_for_iopen(dir, dentry)) ++ return NULL; ++ + bh = ext3_find_entry(dentry, &de); + inode = NULL; + if (bh) { +@@ -723,6 +729,12 @@ static struct dentry *ext3_lookup(struct + if (!inode) + return ERR_PTR(-EACCES); + } ++ ++ if (inode && (alternate = iopen_connect_dentry(dentry, inode))) { ++ iput(inode); ++ return alternate; ++ } ++ + d_add(dentry, inode); + return NULL; + } +--- linux-2.4.18-p4smp/fs/ext3/super.c~iopen-2.4.18 2003-07-09 13:32:38.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/fs/ext3/super.c 2003-07-09 17:13:02.000000000 -0600 +@@ -831,6 +831,17 @@ static int parse_options (char * options + || !strcmp (this_char, "quota") + || !strcmp (this_char, "usrquota")) + /* Don't do anything ;-) */ ; ++ else if (!strcmp (this_char, "iopen")) { ++ set_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ } else if (!strcmp (this_char, "noiopen")) { ++ clear_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ } ++ else if (!strcmp (this_char, "iopen_nopriv")) { ++ set_opt (sbi->s_mount_opt, IOPEN); ++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ } + else if (!strcmp (this_char, "journal")) { + /* @@@ FIXME */ + /* Eventually we will want to be able to create +--- linux-2.4.18-p4smp/include/linux/ext3_fs.h~iopen-2.4.18 2003-07-09 13:32:38.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/include/linux/ext3_fs.h 2003-07-09 17:13:02.000000000 -0600 +@@ -321,6 +321,8 @@ struct ext3_inode { + #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ + #define 
EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ + #define EXT3_MOUNT_INDEX 0x4000 /* Enable directory index */ ++#define EXT3_MOUNT_IOPEN 0x8000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x10000 /* Make iopen world-readable */ + #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + +_ diff --git a/lustre/kernel_patches/patches/linux-2.4.18ea-0.8.26-2.patch b/lustre/kernel_patches/patches/linux-2.4.18ea-0.8.26-2.patch new file mode 100644 index 0000000..c7d06a8 --- /dev/null +++ b/lustre/kernel_patches/patches/linux-2.4.18ea-0.8.26-2.patch @@ -0,0 +1,1775 @@ + fs/ext3/Makefile | 4 + fs/ext3/ext3-exports.c | 13 + fs/ext3/ialloc.c | 2 + fs/ext3/inode.c | 29 - + fs/ext3/namei.c | 8 + fs/ext3/super.c | 23 + fs/ext3/xattr.c | 1242 +++++++++++++++++++++++++++++++++++++++++++++ + include/linux/ext3_fs.h | 46 - + include/linux/ext3_jbd.h | 8 + include/linux/ext3_xattr.h | 155 +++++ + include/linux/xattr.h | 15 + 11 files changed, 1494 insertions(+), 51 deletions(-) + +--- /dev/null 2003-01-30 13:24:37.000000000 +0300 ++++ linux-2.4.18-alexey/fs/ext3/ext3-exports.c 2003-09-01 14:55:39.000000000 +0400 +@@ -0,0 +1,13 @@ ++#include ++#include ++#include ++#include ++#include ++ ++EXPORT_SYMBOL(ext3_force_commit); ++EXPORT_SYMBOL(ext3_bread); ++EXPORT_SYMBOL(ext3_xattr_register); ++EXPORT_SYMBOL(ext3_xattr_unregister); ++EXPORT_SYMBOL(ext3_xattr_get); ++EXPORT_SYMBOL(ext3_xattr_list); ++EXPORT_SYMBOL(ext3_xattr_set); +--- linux-2.4.18/fs/ext3/ialloc.c~linux-2.4.18ea-0.8.26-2 2003-07-28 17:52:04.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/ialloc.c 2003-09-01 14:55:39.000000000 +0400 +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -216,6 +217,7 @@ void ext3_free_inode (handle_t *handle, + * as writing the quota to disk may need the lock as well. 
+ */ + DQUOT_INIT(inode); ++ ext3_xattr_drop_inode(handle, inode); + DQUOT_FREE_INODE(inode); + DQUOT_DROP(inode); + +--- linux-2.4.18/fs/ext3/inode.c~linux-2.4.18ea-0.8.26-2 2003-07-28 17:52:04.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/inode.c 2003-09-01 14:55:39.000000000 +0400 +@@ -39,6 +39,18 @@ + */ + #undef SEARCH_FROM_ZERO + ++/* ++ * Test whether an inode is a fast symlink. ++ */ ++static inline int ext3_inode_is_fast_symlink(struct inode *inode) ++{ ++ int ea_blocks = EXT3_I(inode)->i_file_acl ? ++ (inode->i_sb->s_blocksize >> 9) : 0; ++ ++ return (S_ISLNK(inode->i_mode) && ++ inode->i_blocks - ea_blocks == 0); ++} ++ + /* The ext3 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. +@@ -48,7 +60,7 @@ + * still needs to be revoked. + */ + +-static int ext3_forget(handle_t *handle, int is_metadata, ++int ext3_forget(handle_t *handle, int is_metadata, + struct inode *inode, struct buffer_head *bh, + int blocknr) + { +@@ -164,9 +176,7 @@ void ext3_delete_inode (struct inode * i + { + handle_t *handle; + +- if (is_bad_inode(inode) || +- inode->i_ino == EXT3_ACL_IDX_INO || +- inode->i_ino == EXT3_ACL_DATA_INO) ++ if (is_bad_inode(inode)) + goto no_delete; + + lock_kernel(); +@@ -1877,6 +1887,8 @@ void ext3_truncate(struct inode * inode) + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; ++ if (ext3_inode_is_fast_symlink(inode)) ++ return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + +@@ -2038,8 +2050,6 @@ int ext3_get_inode_loc (struct inode *in + struct ext3_group_desc * gdp; + + if ((inode->i_ino != EXT3_ROOT_INO && +- inode->i_ino != EXT3_ACL_IDX_INO && +- inode->i_ino != EXT3_ACL_DATA_INO && + inode->i_ino != EXT3_JOURNAL_INO && + inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) || + inode->i_ino > le32_to_cpu( +@@ -2166,10 +2176,7 @@ void ext3_read_inode(struct inode * inod + + brelse 
(iloc.bh); + +- if (inode->i_ino == EXT3_ACL_IDX_INO || +- inode->i_ino == EXT3_ACL_DATA_INO) +- /* Nothing to do */ ; +- else if (S_ISREG(inode->i_mode)) { ++ if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + inode->i_mapping->a_ops = &ext3_aops; +@@ -2177,7 +2184,7 @@ void ext3_read_inode(struct inode * inod + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { +- if (!inode->i_blocks) ++ if (ext3_inode_is_fast_symlink(inode)) + inode->i_op = &ext3_fast_symlink_inode_operations; + else { + inode->i_op = &page_symlink_inode_operations; +--- linux-2.4.18/fs/ext3/Makefile~linux-2.4.18ea-0.8.26-2 2003-08-29 16:53:17.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/Makefile 2003-09-01 14:55:50.000000000 +0400 +@@ -9,10 +9,10 @@ + + O_TARGET := ext3.o + +-export-objs := super.o inode.o ++export-objs := ext3-exports.o + + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +- ioctl.o namei.o super.o symlink.o hash.o ++ ioctl.o namei.o super.o symlink.o xattr.o hash.o ext3-exports.o + obj-m := $(O_TARGET) + + include $(TOPDIR)/Rules.make +--- linux-2.4.18/fs/ext3/namei.c~linux-2.4.18ea-0.8.26-2 2003-09-01 11:50:59.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/namei.c 2003-09-01 14:55:39.000000000 +0400 +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1524,6 +1525,7 @@ static int ext3_add_nondir(handle_t *han + d_instantiate(dentry, inode); + return 0; + } ++ ext3_xattr_drop_inode(handle, inode); + ext3_dec_count(handle, inode); + iput(inode); + return err; +@@ -1612,7 +1614,7 @@ static int ext3_mkdir(struct inode * dir + if (IS_SYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, S_IFDIR); ++ inode = ext3_new_inode (handle, dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +@@ -1620,7 +1622,6 @@ static int 
ext3_mkdir(struct inode * dir + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; +- inode->i_blocks = 0; + dir_block = ext3_bread (handle, inode, 0, 1, &err); + if (!dir_block) { + inode->i_nlink--; /* is this nlink == 0? */ +@@ -1647,9 +1648,6 @@ static int ext3_mkdir(struct inode * dir + BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, dir_block); + brelse (dir_block); +- inode->i_mode = S_IFDIR | mode; +- if (dir->i_mode & S_ISGID) +- inode->i_mode |= S_ISGID; + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_entry (handle, dentry, inode); + if (err) { +--- linux-2.4.18/fs/ext3/super.c~linux-2.4.18ea-0.8.26-2 2003-08-29 16:53:17.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/super.c 2003-09-01 14:55:39.000000000 +0400 +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -406,6 +407,7 @@ void ext3_put_super (struct super_block + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + ++ ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); +@@ -1743,18 +1745,27 @@ int ext3_statfs (struct super_block * sb + + static DECLARE_FSTYPE_DEV(ext3_fs_type, "ext3", ext3_read_super); + +-static int __init init_ext3_fs(void) ++static void exit_ext3_fs(void) + { +- return register_filesystem(&ext3_fs_type); ++ unregister_filesystem(&ext3_fs_type); ++ exit_ext3_xattr_user(); ++ exit_ext3_xattr(); + } + +-static void __exit exit_ext3_fs(void) ++static int __init init_ext3_fs(void) + { +- unregister_filesystem(&ext3_fs_type); ++ int error = init_ext3_xattr(); ++ if (!error) ++ error = init_ext3_xattr_user(); ++ if (!error) ++ error = register_filesystem(&ext3_fs_type); ++ if (!error) ++ return 0; ++ ++ exit_ext3_fs(); ++ return error; + } + 
+-EXPORT_SYMBOL(ext3_force_commit);
+-EXPORT_SYMBOL(ext3_bread);
+ 
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+ MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
+--- /dev/null	2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.18-alexey/fs/ext3/xattr.c	2003-09-01 14:55:39.000000000 +0400
+@@ -0,0 +1,1242 @@
++/*
++ * linux/fs/ext3/xattr.c
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher,
++ *
++ * Fix by Harrison Xing .
++ * Ext3 code with a lot of help from Eric Jarman .
++ * Extended attributes for symlinks and special files added per
++ * suggestion of Luka Renko .
++ */
++
++/*
++ * Extended attributes are stored on disk blocks allocated outside of
++ * any inode. The i_file_acl field is then made to point to this allocated
++ * block. If all extended attributes of an inode are identical, these
++ * inodes may share the same extended attribute block. Such situations
++ * are automatically detected by keeping a cache of recent attribute block
++ * numbers and hashes over the block's contents in memory.
++ *
++ *
++ * Extended attribute block layout:
++ *
++ *   +------------------+
++ *   | header           |
++ *   | entry 1          | |
++ *   | entry 2          | | growing downwards
++ *   | entry 3          | v
++ *   | four null bytes  |
++ *   | . . .            |
++ *   | value 1          | ^
++ *   | value 3          | | growing upwards
++ *   | value 2          | |
++ *   +------------------+
++ *
++ * The block header is followed by multiple entry descriptors. These entry
++ * descriptors are variable in size, and aligned to EXT3_XATTR_PAD
++ * byte boundaries. The entry descriptors are sorted by attribute name,
++ * so that two extended attribute blocks can be compared efficiently.
++ *
++ * Attribute values are aligned to the end of the block, stored in
++ * no specific order. They are also padded to EXT3_XATTR_PAD byte
++ * boundaries. No additional gaps are left between them.
++ *
++ * Locking strategy
++ * ----------------
++ * The VFS already holds the BKL and the inode->i_sem semaphore when any of
++ * the xattr inode operations are called, so we are guaranteed that only one
++ * process accesses extended attributes of an inode at any time.
++ *
++ * For writing we also grab the ext3_xattr_sem semaphore. This ensures that
++ * only a single process is modifying an extended attribute block, even
++ * if the block is shared among inodes.
++ *
++ * Note for porting to 2.5
++ * -----------------------
++ * The BKL will no longer be held in the xattr inode operations.
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++#include
++#endif
++#include
++#include
++#include
++#include
++
++/* These symbols may be needed by a module. */
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1)
++#endif
++
++#define HDR(bh) ((struct ext3_xattr_header *)((bh)->b_data))
++#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr))
++#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1)
++#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
++
++#ifdef EXT3_XATTR_DEBUG
++# define ea_idebug(inode, f...) do { \
++		printk(KERN_DEBUG "inode %s:%ld: ", \
++			kdevname(inode->i_dev), inode->i_ino); \
++		printk(f); \
++		printk("\n"); \
++	} while (0)
++# define ea_bdebug(bh, f...) do { \
++		printk(KERN_DEBUG "block %s:%ld: ", \
++			kdevname(bh->b_dev), bh->b_blocknr); \
++		printk(f); \
++		printk("\n"); \
++	} while (0)
++#else
++# define ea_idebug(f...)
++# define ea_bdebug(f...)
++#endif ++ ++static int ext3_xattr_set2(handle_t *, struct inode *, struct buffer_head *, ++ struct ext3_xattr_header *); ++ ++#ifdef CONFIG_EXT3_FS_XATTR_SHARING ++ ++static int ext3_xattr_cache_insert(struct buffer_head *); ++static struct buffer_head *ext3_xattr_cache_find(struct inode *, ++ struct ext3_xattr_header *); ++static void ext3_xattr_cache_remove(struct buffer_head *); ++static void ext3_xattr_rehash(struct ext3_xattr_header *, ++ struct ext3_xattr_entry *); ++ ++static struct mb_cache *ext3_xattr_cache; ++ ++#else ++# define ext3_xattr_cache_insert(bh) 0 ++# define ext3_xattr_cache_find(inode, header) NULL ++# define ext3_xattr_cache_remove(bh) do {} while(0) ++# define ext3_xattr_rehash(header, entry) do {} while(0) ++#endif ++ ++/* ++ * If a file system does not share extended attributes among inodes, ++ * we should not need the ext3_xattr_sem semaphore. However, the ++ * filesystem may still contain shared blocks, so we always take ++ * the lock. ++ */ ++ ++DECLARE_MUTEX(ext3_xattr_sem); ++ ++static inline void ++ext3_xattr_lock(void) ++{ ++ down(&ext3_xattr_sem); ++} ++ ++static inline void ++ext3_xattr_unlock(void) ++{ ++ up(&ext3_xattr_sem); ++} ++ ++static inline int ++ext3_xattr_new_block(handle_t *handle, struct inode *inode, ++ int * errp, int force) ++{ ++ struct super_block *sb = inode->i_sb; ++ int goal = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + ++ EXT3_I(inode)->i_block_group * EXT3_BLOCKS_PER_GROUP(sb); ++ ++ /* How can we enforce the allocation? */ ++ int block = ext3_new_block(handle, inode, goal, 0, 0, errp); ++#ifdef OLD_QUOTAS ++ if (!*errp) ++ inode->i_blocks += inode->i_sb->s_blocksize >> 9; ++#endif ++ return block; ++} ++ ++static inline int ++ext3_xattr_quota_alloc(struct inode *inode, int force) ++{ ++ /* How can we enforce the allocation? 
*/
++#ifdef OLD_QUOTAS
++	int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1);
++	if (!error)
++		inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#else
++	int error = DQUOT_ALLOC_BLOCK(inode, 1);
++#endif
++	return error;
++}
++
++#ifdef OLD_QUOTAS
++
++static inline void
++ext3_xattr_quota_free(struct inode *inode)
++{
++	DQUOT_FREE_BLOCK(inode->i_sb, inode, 1);
++	inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++static inline void
++ext3_xattr_free_block(handle_t *handle, struct inode * inode,
++		      unsigned long block)
++{
++	ext3_free_blocks(handle, inode, block, 1);
++	inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++#else
++# define ext3_xattr_quota_free(inode) \
++	DQUOT_FREE_BLOCK(inode, 1)
++# define ext3_xattr_free_block(handle, inode, block) \
++	ext3_free_blocks(handle, inode, block, 1)
++#endif
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18)
++
++static inline struct buffer_head *
++sb_bread(struct super_block *sb, int block)
++{
++	return bread(sb->s_dev, block, sb->s_blocksize);
++}
++
++static inline struct buffer_head *
++sb_getblk(struct super_block *sb, int block)
++{
++	return getblk(sb->s_dev, block, sb->s_blocksize);
++}
++
++#endif
++
++struct ext3_xattr_handler *ext3_xattr_handlers[EXT3_XATTR_INDEX_MAX];
++rwlock_t ext3_handler_lock = RW_LOCK_UNLOCKED;
++
++int
++ext3_xattr_register(int name_index, struct ext3_xattr_handler *handler)
++{
++	int error = -EINVAL;
++
++	if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++		write_lock(&ext3_handler_lock);
++		if (!ext3_xattr_handlers[name_index-1]) {
++			ext3_xattr_handlers[name_index-1] = handler;
++			error = 0;
++		}
++		write_unlock(&ext3_handler_lock);
++	}
++	return error;
++}
++
++void
++ext3_xattr_unregister(int name_index, struct ext3_xattr_handler *handler)
++{
++	if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {	/* name_index is 1-based; ignore indices outside the handler table */
++		write_lock(&ext3_handler_lock);
++		ext3_xattr_handlers[name_index-1] = NULL;
++		write_unlock(&ext3_handler_lock);
++	}
++}
++
++static inline const char *
++strcmp_prefix(const char *a, const char *a_prefix)
++{
++	while (*a_prefix && *a == *a_prefix) {
++		a++;
++		a_prefix++;
++	}
++	return *a_prefix ? NULL : a;
++}
++
++/*
++ * Decode the extended attribute name, and translate it into
++ * the name_index and name suffix.
++ */
++static inline struct ext3_xattr_handler *
++ext3_xattr_resolve_name(const char **name)
++{
++	struct ext3_xattr_handler *handler = NULL;
++	int i;
++
++	if (!*name)
++		return NULL;
++	read_lock(&ext3_handler_lock);
++	for (i=0; i<EXT3_XATTR_INDEX_MAX; i++) {
++		if (ext3_xattr_handlers[i]) {
++			const char *n = strcmp_prefix(*name,
++				ext3_xattr_handlers[i]->prefix);
++			if (n) {
++				handler = ext3_xattr_handlers[i];
++				*name = n;
++				break;
++			}
++		}
++	}
++	read_unlock(&ext3_handler_lock);
++	return handler;
++}
++
++static inline struct ext3_xattr_handler *
++ext3_xattr_handler(int name_index)
++{
++	struct ext3_xattr_handler *handler = NULL;
++	if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++		read_lock(&ext3_handler_lock);
++		handler = ext3_xattr_handlers[name_index-1];
++		read_unlock(&ext3_handler_lock);
++	}
++	return handler;
++}
++
++/*
++ * Inode operation getxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext3_getxattr(struct dentry *dentry, const char *name,
++	      void *buffer, size_t size)
++{
++	struct ext3_xattr_handler *handler;
++	struct inode *inode = dentry->d_inode;
++
++	handler = ext3_xattr_resolve_name(&name);
++	if (!handler)
++		return -ENOTSUP;
++	return handler->get(inode, name, buffer, size);
++}
++
++/*
++ * Inode operation listxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
++{
++	return ext3_xattr_list(dentry->d_inode, buffer, size);
++}
++
++/*
++ * Inode operation setxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext3_setxattr(struct dentry *dentry, const char *name,
++	      void *value, size_t size, int flags)
++{
++	struct ext3_xattr_handler *handler;
++	struct
inode *inode = dentry->d_inode; ++ ++ if (size == 0) ++ value = ""; /* empty EA, do not remove */ ++ handler = ext3_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->set(inode, name, value, size, flags); ++} ++ ++/* ++ * Inode operation removexattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++int ++ext3_removexattr(struct dentry *dentry, const char *name) ++{ ++ struct ext3_xattr_handler *handler; ++ struct inode *inode = dentry->d_inode; ++ ++ handler = ext3_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->set(inode, name, NULL, 0, XATTR_REPLACE); ++} ++ ++/* ++ * ext3_xattr_get() ++ * ++ * Copy an extended attribute into the buffer ++ * provided, or compute the buffer size required. ++ * Buffer is NULL to compute the size of the buffer required. ++ * ++ * Returns a negative error number on failure, or the number of bytes ++ * used / required on success. ++ */ ++int ++ext3_xattr_get(struct inode *inode, int name_index, const char *name, ++ void *buffer, size_t buffer_size) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext3_xattr_entry *entry; ++ unsigned int block, size; ++ char *end; ++ int name_len, error; ++ ++ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", ++ name_index, name, buffer, (long)buffer_size); ++ ++ if (name == NULL) ++ return -EINVAL; ++ if (!EXT3_I(inode)->i_file_acl) ++ return -ENOATTR; ++ block = EXT3_I(inode)->i_file_acl; ++ ea_idebug(inode, "reading block %d", block); ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext3_error(inode->i_sb, "ext3_xattr_get", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* find 
named attribute */ ++ name_len = strlen(name); ++ ++ error = -ERANGE; ++ if (name_len > 255) ++ goto cleanup; ++ entry = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext3_xattr_entry *next = ++ EXT3_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (name_index == entry->e_name_index && ++ name_len == entry->e_name_len && ++ memcmp(name, entry->e_name, name_len) == 0) ++ goto found; ++ entry = next; ++ } ++ /* Check the remaining name entries */ ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext3_xattr_entry *next = ++ EXT3_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ entry = next; ++ } ++ if (ext3_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ error = -ENOATTR; ++ goto cleanup; ++found: ++ /* check the buffer size */ ++ if (entry->e_value_block != 0) ++ goto bad_block; ++ size = le32_to_cpu(entry->e_value_size); ++ if (size > inode->i_sb->s_blocksize || ++ le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) ++ goto bad_block; ++ ++ if (ext3_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ if (buffer) { ++ error = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ /* return value of attribute */ ++ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), ++ size); ++ } ++ error = size; ++ ++cleanup: ++ brelse(bh); ++ ++ return error; ++} ++ ++/* ++ * ext3_xattr_list() ++ * ++ * Copy a list of attribute names into the buffer ++ * provided, or compute the buffer size required. ++ * Buffer is NULL to compute the size of the buffer required. ++ * ++ * Returns a negative error number on failure, or the number of bytes ++ * used / required on success. 
++ */ ++int ++ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext3_xattr_entry *entry; ++ unsigned int block, size = 0; ++ char *buf, *end; ++ int error; ++ ++ ea_idebug(inode, "buffer=%p, buffer_size=%ld", ++ buffer, (long)buffer_size); ++ ++ if (!EXT3_I(inode)->i_file_acl) ++ return 0; ++ block = EXT3_I(inode)->i_file_acl; ++ ea_idebug(inode, "reading block %d", block); ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext3_error(inode->i_sb, "ext3_xattr_list", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* compute the size required for the list of attribute names */ ++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT3_XATTR_NEXT(entry)) { ++ struct ext3_xattr_handler *handler; ++ struct ext3_xattr_entry *next = ++ EXT3_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ ++ handler = ext3_xattr_handler(entry->e_name_index); ++ if (handler) { ++ size += handler->list(NULL, inode, entry->e_name, ++ entry->e_name_len) + 1; ++ } ++ } ++ ++ if (ext3_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ if (!buffer) { ++ error = size; ++ goto cleanup; ++ } else { ++ error = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ } ++ ++ /* list the attribute names */ ++ buf = buffer; ++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT3_XATTR_NEXT(entry)) { ++ struct ext3_xattr_handler *handler; ++ ++ handler = ext3_xattr_handler(entry->e_name_index); ++ if (handler) { ++ buf += handler->list(buf, inode, entry->e_name, ++ entry->e_name_len); ++ *buf++ = '\0'; ++ } ++ } ++ error = size; ++ ++cleanup: ++ 
brelse(bh); ++ ++ return error; ++} ++ ++/* ++ * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is ++ * not set, set it. ++ */ ++static void ext3_xattr_update_super_block(handle_t *handle, ++ struct super_block *sb) ++{ ++ if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR)) ++ return; ++ ++ lock_super(sb); ++ ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) ++ EXT3_SB(sb)->s_feature_compat |= EXT3_FEATURE_COMPAT_EXT_ATTR; ++#endif ++ EXT3_SB(sb)->s_es->s_feature_compat |= ++ cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR); ++ sb->s_dirt = 1; ++ ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); ++ unlock_super(sb); ++} ++ ++/* ++ * ext3_xattr_set() ++ * ++ * Create, replace or remove an extended attribute for this inode. Buffer ++ * is NULL to remove an existing extended attribute, and non-NULL to ++ * either replace an existing extended attribute, or create a new extended ++ * attribute. The flags XATTR_REPLACE and XATTR_CREATE ++ * specify that an extended attribute must exist and must not exist ++ * previous to the call, respectively. ++ * ++ * Returns 0, or a negative error number on failure. ++ */ ++int ++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index, ++ const char *name, void *value, size_t value_len, int flags) ++{ ++ struct super_block *sb = inode->i_sb; ++ struct buffer_head *bh = NULL; ++ struct ext3_xattr_header *header = NULL; ++ struct ext3_xattr_entry *here, *last; ++ unsigned int name_len; ++ int min_offs = sb->s_blocksize, not_found = 1, free, error; ++ char *end; ++ ++ /* ++ * header -- Points either into bh, or to a temporarily ++ * allocated buffer. ++ * here -- The named entry found, or the place for inserting, within ++ * the block pointed to by header. ++ * last -- Points right after the last named entry within the block ++ * pointed to by header. 
++ * min_offs -- The offset of the first value (values are aligned ++ * towards the end of the block). ++ * end -- Points right after the block pointed to by header. ++ */ ++ ++ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", ++ name_index, name, value, (long)value_len); ++ ++ if (IS_RDONLY(inode)) ++ return -EROFS; ++ if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) ++ return -EPERM; ++ if (value == NULL) ++ value_len = 0; ++ if (name == NULL) ++ return -EINVAL; ++ name_len = strlen(name); ++ if (name_len > 255 || value_len > sb->s_blocksize) ++ return -ERANGE; ++ ext3_xattr_lock(); ++ ++ if (EXT3_I(inode)->i_file_acl) { ++ /* The inode already has an extended attribute block. */ ++ int block = EXT3_I(inode)->i_file_acl; ++ ++ bh = sb_bread(sb, block); ++ error = -EIO; ++ if (!bh) ++ goto cleanup; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), ++ le32_to_cpu(HDR(bh)->h_refcount)); ++ header = HDR(bh); ++ end = bh->b_data + bh->b_size; ++ if (header->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ header->h_blocks != cpu_to_le32(1)) { ++bad_block: ext3_error(sb, "ext3_xattr_set", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* Find the named attribute. */ ++ here = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(here)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(here); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (!here->e_value_block && here->e_value_size) { ++ int offs = le16_to_cpu(here->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ not_found = name_index - here->e_name_index; ++ if (!not_found) ++ not_found = name_len - here->e_name_len; ++ if (!not_found) ++ not_found = memcmp(name, here->e_name,name_len); ++ if (not_found <= 0) ++ break; ++ here = next; ++ } ++ last = here; ++ /* We still need to compute min_offs and last. 
*/ ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (!last->e_value_block && last->e_value_size) { ++ int offs = le16_to_cpu(last->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ last = next; ++ } ++ ++ /* Check whether we have enough space left. */ ++ free = min_offs - ((char*)last - (char*)header) - sizeof(__u32); ++ } else { ++ /* We will use a new extended attribute block. */ ++ free = sb->s_blocksize - ++ sizeof(struct ext3_xattr_header) - sizeof(__u32); ++ here = last = NULL; /* avoid gcc uninitialized warning. */ ++ } ++ ++ if (not_found) { ++ /* Request to remove a nonexistent attribute? */ ++ error = -ENOATTR; ++ if (flags & XATTR_REPLACE) ++ goto cleanup; ++ error = 0; ++ if (value == NULL) ++ goto cleanup; ++ else ++ free -= EXT3_XATTR_LEN(name_len); ++ } else { ++ /* Request to create an existing attribute? */ ++ error = -EEXIST; ++ if (flags & XATTR_CREATE) ++ goto cleanup; ++ if (!here->e_value_block && here->e_value_size) { ++ unsigned int size = le32_to_cpu(here->e_value_size); ++ ++ if (le16_to_cpu(here->e_value_offs) + size > ++ sb->s_blocksize || size > sb->s_blocksize) ++ goto bad_block; ++ free += EXT3_XATTR_SIZE(size); ++ } ++ } ++ free -= EXT3_XATTR_SIZE(value_len); ++ error = -ENOSPC; ++ if (free < 0) ++ goto cleanup; ++ ++ /* Here we know that we can set the new attribute. 
*/ ++ ++ if (header) { ++ if (header->h_refcount == cpu_to_le32(1)) { ++ ea_bdebug(bh, "modifying in-place"); ++ ext3_xattr_cache_remove(bh); ++ error = ext3_journal_get_write_access(handle, bh); ++ if (error) ++ goto cleanup; ++ } else { ++ int offset; ++ ++ ea_bdebug(bh, "cloning"); ++ header = kmalloc(bh->b_size, GFP_KERNEL); ++ error = -ENOMEM; ++ if (header == NULL) ++ goto cleanup; ++ memcpy(header, HDR(bh), bh->b_size); ++ header->h_refcount = cpu_to_le32(1); ++ offset = (char *)header - bh->b_data; ++ here = ENTRY((char *)here + offset); ++ last = ENTRY((char *)last + offset); ++ } ++ } else { ++ /* Allocate a buffer where we construct the new block. */ ++ header = kmalloc(sb->s_blocksize, GFP_KERNEL); ++ error = -ENOMEM; ++ if (header == NULL) ++ goto cleanup; ++ memset(header, 0, sb->s_blocksize); ++ end = (char *)header + sb->s_blocksize; ++ header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); ++ header->h_blocks = header->h_refcount = cpu_to_le32(1); ++ last = here = ENTRY(header+1); ++ } ++ ++ if (not_found) { ++ /* Insert the new name. */ ++ int size = EXT3_XATTR_LEN(name_len); ++ int rest = (char *)last - (char *)here; ++ memmove((char *)here + size, here, rest); ++ memset(here, 0, size); ++ here->e_name_index = name_index; ++ here->e_name_len = name_len; ++ memcpy(here->e_name, name, name_len); ++ } else { ++ /* Remove the old value. */ ++ if (!here->e_value_block && here->e_value_size) { ++ char *first_val = (char *)header + min_offs; ++ int offs = le16_to_cpu(here->e_value_offs); ++ char *val = (char *)header + offs; ++ size_t size = EXT3_XATTR_SIZE( ++ le32_to_cpu(here->e_value_size)); ++ memmove(first_val + size, first_val, val - first_val); ++ memset(first_val, 0, size); ++ here->e_value_offs = 0; ++ min_offs += size; ++ ++ /* Adjust all value offsets. 
*/ ++ last = ENTRY(header+1); ++ while (!IS_LAST_ENTRY(last)) { ++ int o = le16_to_cpu(last->e_value_offs); ++ if (!last->e_value_block && o < offs) ++ last->e_value_offs = ++ cpu_to_le16(o + size); ++ last = EXT3_XATTR_NEXT(last); ++ } ++ } ++ if (value == NULL) { ++ /* Remove this attribute. */ ++ if (EXT3_XATTR_NEXT(ENTRY(header+1)) == last) { ++ /* This block is now empty. */ ++ error = ext3_xattr_set2(handle, inode, bh,NULL); ++ goto cleanup; ++ } else { ++ /* Remove the old name. */ ++ int size = EXT3_XATTR_LEN(name_len); ++ last = ENTRY((char *)last - size); ++ memmove(here, (char*)here + size, ++ (char*)last - (char*)here); ++ memset(last, 0, size); ++ } ++ } ++ } ++ ++ if (value != NULL) { ++ /* Insert the new value. */ ++ here->e_value_size = cpu_to_le32(value_len); ++ if (value_len) { ++ size_t size = EXT3_XATTR_SIZE(value_len); ++ char *val = (char *)header + min_offs - size; ++ here->e_value_offs = ++ cpu_to_le16((char *)val - (char *)header); ++ memset(val + size - EXT3_XATTR_PAD, 0, ++ EXT3_XATTR_PAD); /* Clear the pad bytes. */ ++ memcpy(val, value, value_len); ++ } ++ } ++ ext3_xattr_rehash(header, here); ++ ++ error = ext3_xattr_set2(handle, inode, bh, header); ++ ++cleanup: ++ brelse(bh); ++ if (!(bh && header == HDR(bh))) ++ kfree(header); ++ ext3_xattr_unlock(); ++ ++ return error; ++} ++ ++/* ++ * Second half of ext3_xattr_set(): Update the file system. ++ */ ++static int ++ext3_xattr_set2(handle_t *handle, struct inode *inode, ++ struct buffer_head *old_bh, struct ext3_xattr_header *header) ++{ ++ struct super_block *sb = inode->i_sb; ++ struct buffer_head *new_bh = NULL; ++ int error; ++ ++ if (header) { ++ new_bh = ext3_xattr_cache_find(inode, header); ++ if (new_bh) { ++ /* ++ * We found an identical block in the cache. ++ * The old block will be released after updating ++ * the inode. 
++ */ ++ ea_bdebug(old_bh, "reusing block %ld", ++ new_bh->b_blocknr); ++ ++ error = -EDQUOT; ++ if (ext3_xattr_quota_alloc(inode, 1)) ++ goto cleanup; ++ ++ error = ext3_journal_get_write_access(handle, new_bh); ++ if (error) ++ goto cleanup; ++ HDR(new_bh)->h_refcount = cpu_to_le32( ++ le32_to_cpu(HDR(new_bh)->h_refcount) + 1); ++ ea_bdebug(new_bh, "refcount now=%d", ++ le32_to_cpu(HDR(new_bh)->h_refcount)); ++ } else if (old_bh && header == HDR(old_bh)) { ++ /* Keep this block. */ ++ new_bh = old_bh; ++ (void)ext3_xattr_cache_insert(new_bh); ++ } else { ++ /* We need to allocate a new block */ ++ int force = EXT3_I(inode)->i_file_acl != 0; ++ int block = ext3_xattr_new_block(handle, inode, ++ &error, force); ++ if (error) ++ goto cleanup; ++ ea_idebug(inode, "creating block %d", block); ++ ++ new_bh = sb_getblk(sb, block); ++ if (!new_bh) { ++getblk_failed: ext3_xattr_free_block(handle, inode, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(new_bh); ++ error = ext3_journal_get_create_access(handle, new_bh); ++ if (error) { ++ unlock_buffer(new_bh); ++ goto getblk_failed; ++ } ++ memcpy(new_bh->b_data, header, new_bh->b_size); ++ mark_buffer_uptodate(new_bh, 1); ++ unlock_buffer(new_bh); ++ (void)ext3_xattr_cache_insert(new_bh); ++ ext3_xattr_update_super_block(handle, sb); ++ } ++ error = ext3_journal_dirty_metadata(handle, new_bh); ++ if (error) ++ goto cleanup; ++ } ++ ++ /* Update the inode. */ ++ EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; ++ inode->i_ctime = CURRENT_TIME; ++ ext3_mark_inode_dirty(handle, inode); ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ++ error = 0; ++ if (old_bh && old_bh != new_bh) { ++ /* ++ * If there was an old block, and we are not still using it, ++ * we now release the old block. ++ */ ++ unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount); ++ ++ error = ext3_journal_get_write_access(handle, old_bh); ++ if (error) ++ goto cleanup; ++ if (refcount == 1) { ++ /* Free the old block. 
*/ ++ ea_bdebug(old_bh, "freeing"); ++ ext3_xattr_free_block(handle, inode, old_bh->b_blocknr); ++ ++ /* ext3_forget() calls bforget() for us, but we ++ let our caller release old_bh, so we need to ++ duplicate the handle before. */ ++ get_bh(old_bh); ++ ext3_forget(handle, 1, inode, old_bh,old_bh->b_blocknr); ++ } else { ++ /* Decrement the refcount only. */ ++ refcount--; ++ HDR(old_bh)->h_refcount = cpu_to_le32(refcount); ++ ext3_xattr_quota_free(inode); ++ ext3_journal_dirty_metadata(handle, old_bh); ++ ea_bdebug(old_bh, "refcount now=%d", refcount); ++ } ++ } ++ ++cleanup: ++ if (old_bh != new_bh) ++ brelse(new_bh); ++ ++ return error; ++} ++ ++/* ++ * ext3_xattr_drop_inode() ++ * ++ * Free extended attribute resources associated with this inode. This ++ * is called immediately before an inode is freed. ++ */ ++void ++ext3_xattr_drop_inode(handle_t *handle, struct inode *inode) ++{ ++ struct buffer_head *bh; ++ unsigned int block = EXT3_I(inode)->i_file_acl; ++ ++ if (!block) ++ return; ++ ext3_xattr_lock(); ++ ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) { ++ ext3_error(inode->i_sb, "ext3_xattr_drop_inode", ++ "inode %ld: block %d read error", inode->i_ino, block); ++ goto cleanup; ++ } ++ ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++ ext3_error(inode->i_sb, "ext3_xattr_drop_inode", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ goto cleanup; ++ } ++ ext3_journal_get_write_access(handle, bh); ++ ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1); ++ if (HDR(bh)->h_refcount == cpu_to_le32(1)) { ++ ext3_xattr_cache_remove(bh); ++ ext3_xattr_free_block(handle, inode, block); ++ ext3_forget(handle, 1, inode, bh, block); ++ bh = NULL; ++ } else { ++ HDR(bh)->h_refcount = cpu_to_le32( ++ le32_to_cpu(HDR(bh)->h_refcount) - 1); ++ ext3_journal_dirty_metadata(handle, bh); ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ 
ext3_xattr_quota_free(inode); ++ } ++ EXT3_I(inode)->i_file_acl = 0; ++ ++cleanup: ++ brelse(bh); ++ ext3_xattr_unlock(); ++} ++ ++/* ++ * ext3_xattr_put_super() ++ * ++ * This is called when a file system is unmounted. ++ */ ++void ++ext3_xattr_put_super(struct super_block *sb) ++{ ++#ifdef CONFIG_EXT3_FS_XATTR_SHARING ++ mb_cache_shrink(ext3_xattr_cache, sb->s_dev); ++#endif ++} ++ ++#ifdef CONFIG_EXT3_FS_XATTR_SHARING ++ ++/* ++ * ext3_xattr_cache_insert() ++ * ++ * Create a new entry in the extended attribute cache, and insert ++ * it unless such an entry is already in the cache. ++ * ++ * Returns 0, or a negative error number on failure. ++ */ ++static int ++ext3_xattr_cache_insert(struct buffer_head *bh) ++{ ++ __u32 hash = le32_to_cpu(HDR(bh)->h_hash); ++ struct mb_cache_entry *ce; ++ int error; ++ ++ ce = mb_cache_entry_alloc(ext3_xattr_cache); ++ if (!ce) ++ return -ENOMEM; ++ error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash); ++ if (error) { ++ mb_cache_entry_free(ce); ++ if (error == -EBUSY) { ++ ea_bdebug(bh, "already in cache (%d cache entries)", ++ atomic_read(&ext3_xattr_cache->c_entry_count)); ++ error = 0; ++ } ++ } else { ++ ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, ++ atomic_read(&ext3_xattr_cache->c_entry_count)); ++ mb_cache_entry_release(ce); ++ } ++ return error; ++} ++ ++/* ++ * ext3_xattr_cmp() ++ * ++ * Compare two extended attribute blocks for equality. ++ * ++ * Returns 0 if the blocks are equal, 1 if they differ, and ++ * a negative error number on errors. 
++ */
++static int
++ext3_xattr_cmp(struct ext3_xattr_header *header1,
++	       struct ext3_xattr_header *header2)
++{
++	struct ext3_xattr_entry *entry1, *entry2;
++
++	entry1 = ENTRY(header1+1);
++	entry2 = ENTRY(header2+1);
++	while (!IS_LAST_ENTRY(entry1)) {
++		if (IS_LAST_ENTRY(entry2))
++			return 1;
++		if (entry1->e_hash != entry2->e_hash ||
++		    entry1->e_name_index != entry2->e_name_index ||
++		    entry1->e_name_len != entry2->e_name_len ||
++		    entry1->e_value_size != entry2->e_value_size ||
++		    memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
++			return 1;
++		if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
++			return -EIO;
++		if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
++			   (char *)header2 + le16_to_cpu(entry2->e_value_offs),
++			   le32_to_cpu(entry1->e_value_size)))
++			return 1;
++		entry1 = EXT3_XATTR_NEXT(entry1);
++		entry2 = EXT3_XATTR_NEXT(entry2);
++	}
++	if (!IS_LAST_ENTRY(entry2))
++		return 1;
++	return 0;
++}
++
++/*
++ * ext3_xattr_cache_find()
++ *
++ * Find an identical extended attribute block.
++ *
++ * Returns a pointer to the block found, or NULL if such a block was
++ * not found or an error occurred.
++ */
++static struct buffer_head *
++ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header)
++{
++	__u32 hash = le32_to_cpu(header->h_hash);
++	struct mb_cache_entry *ce;
++
++	if (!header->h_hash)
++		return NULL;  /* never share */
++	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
++	ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, inode->i_dev, hash);
++	while (ce) {
++		struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block);
++
++		if (!bh) {
++			ext3_error(inode->i_sb, "ext3_xattr_cache_find",
++				"inode %ld: block %ld read error",
++				inode->i_ino, ce->e_block);
++		} else if (le32_to_cpu(HDR(bh)->h_refcount) >=
++			   EXT3_XATTR_REFCOUNT_MAX) { /* caller adds one ref */
++			ea_idebug(inode, "block %ld refcount %d>=%d",ce->e_block,
++				le32_to_cpu(HDR(bh)->h_refcount),
++				EXT3_XATTR_REFCOUNT_MAX);
++		} else if (!ext3_xattr_cmp(header, HDR(bh))) {
++			ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count)));
++			mb_cache_entry_release(ce);
++			return bh;
++		}
++		brelse(bh);
++		ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash);
++	}
++	return NULL;
++}
++
++/*
++ * ext3_xattr_cache_remove()
++ *
++ * Remove the cache entry of a block from the cache. Called when a
++ * block becomes invalid.
++ */
++static void
++ext3_xattr_cache_remove(struct buffer_head *bh)
++{
++	struct mb_cache_entry *ce;
++
++	ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_dev, bh->b_blocknr);
++	if (ce) {
++		ea_bdebug(bh, "removing (%d cache entries remaining)",
++			  atomic_read(&ext3_xattr_cache->c_entry_count)-1);
++		mb_cache_entry_free(ce);
++	} else
++		ea_bdebug(bh, "no cache entry");
++}
++
++#define NAME_HASH_SHIFT 5
++#define VALUE_HASH_SHIFT 16
++
++/*
++ * ext3_xattr_hash_entry()
++ *
++ * Compute the hash of an extended attribute.
++ */ ++static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header, ++ struct ext3_xattr_entry *entry) ++{ ++ __u32 hash = 0; ++ char *name = entry->e_name; ++ int n; ++ ++ for (n=0; n < entry->e_name_len; n++) { ++ hash = (hash << NAME_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ ++ *name++; ++ } ++ ++ if (entry->e_value_block == 0 && entry->e_value_size != 0) { ++ __u32 *value = (__u32 *)((char *)header + ++ le16_to_cpu(entry->e_value_offs)); ++ for (n = (le32_to_cpu(entry->e_value_size) + ++ EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) { ++ hash = (hash << VALUE_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ ++ le32_to_cpu(*value++); ++ } ++ } ++ entry->e_hash = cpu_to_le32(hash); ++} ++ ++#undef NAME_HASH_SHIFT ++#undef VALUE_HASH_SHIFT ++ ++#define BLOCK_HASH_SHIFT 16 ++ ++/* ++ * ext3_xattr_rehash() ++ * ++ * Re-compute the extended attribute hash value after an entry has changed. ++ */ ++static void ext3_xattr_rehash(struct ext3_xattr_header *header, ++ struct ext3_xattr_entry *entry) ++{ ++ struct ext3_xattr_entry *here; ++ __u32 hash = 0; ++ ++ ext3_xattr_hash_entry(header, entry); ++ here = ENTRY(header+1); ++ while (!IS_LAST_ENTRY(here)) { ++ if (!here->e_hash) { ++ /* Block is not shared if an entry's hash value == 0 */ ++ hash = 0; ++ break; ++ } ++ hash = (hash << BLOCK_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ ++ le32_to_cpu(here->e_hash); ++ here = EXT3_XATTR_NEXT(here); ++ } ++ header->h_hash = cpu_to_le32(hash); ++} ++ ++#undef BLOCK_HASH_SHIFT ++ ++int __init ++init_ext3_xattr(void) ++{ ++ ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL, ++ sizeof(struct mb_cache_entry) + ++ sizeof(struct mb_cache_entry_index), 1, 61); ++ if (!ext3_xattr_cache) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++void ++exit_ext3_xattr(void) ++{ ++ if (ext3_xattr_cache) ++ mb_cache_destroy(ext3_xattr_cache); ++ ext3_xattr_cache = NULL; ++} ++ ++#else /* CONFIG_EXT3_FS_XATTR_SHARING */ ++ 
++int __init ++init_ext3_xattr(void) ++{ ++ return 0; ++} ++ ++void ++exit_ext3_xattr(void) ++{ ++} ++ ++#endif /* CONFIG_EXT3_FS_XATTR_SHARING */ +--- linux-2.4.18/include/linux/ext3_fs.h~linux-2.4.18ea-0.8.26-2 2003-09-01 11:51:00.000000000 +0400 ++++ linux-2.4.18-alexey/include/linux/ext3_fs.h 2003-09-01 14:55:39.000000000 +0400 +@@ -63,8 +63,6 @@ + */ + #define EXT3_BAD_INO 1 /* Bad blocks inode */ + #define EXT3_ROOT_INO 2 /* Root inode */ +-#define EXT3_ACL_IDX_INO 3 /* ACL inode */ +-#define EXT3_ACL_DATA_INO 4 /* ACL inode */ + #define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */ + #define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */ + #define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */ +@@ -94,7 +92,6 @@ + #else + # define EXT3_BLOCK_SIZE(s) (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size) + #endif +-#define EXT3_ACLE_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry)) + #define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32)) + #ifdef __KERNEL__ + # define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +@@ -129,28 +126,6 @@ + #endif + + /* +- * ACL structures +- */ +-struct ext3_acl_header /* Header of Access Control Lists */ +-{ +- __u32 aclh_size; +- __u32 aclh_file_count; +- __u32 aclh_acle_count; +- __u32 aclh_first_acle; +-}; +- +-struct ext3_acl_entry /* Access Control List Entry */ +-{ +- __u32 acle_size; +- __u16 acle_perms; /* Access permissions */ +- __u16 acle_type; /* Type of entry */ +- __u16 acle_tag; /* User or group identity */ +- __u16 acle_pad1; +- __u32 acle_next; /* Pointer on next entry for the */ +- /* same inode or on next free entry */ +-}; +- +-/* + * Structure of a blocks group descriptor + */ + struct ext3_group_desc +@@ -521,7 +496,7 @@ struct ext3_super_block { + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + +-#define EXT3_FEATURE_COMPAT_SUPP 0 ++#define EXT3_FEATURE_COMPAT_SUPP 
EXT3_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ +@@ -623,6 +598,24 @@ struct dx_hash_info + #define HASH_NB_ALWAYS 1 + + ++/* Defined for extended attributes */ ++#define CONFIG_EXT3_FS_XATTR y ++#ifndef ENOATTR ++#define ENOATTR ENODATA /* No such attribute */ ++#endif ++#ifndef ENOTSUP ++#define ENOTSUP EOPNOTSUPP /* Operation not supported */ ++#endif ++#ifndef XATTR_NAME_MAX ++#define XATTR_NAME_MAX 255 /* # chars in an extended attribute name */ ++#define XATTR_SIZE_MAX 65536 /* size of an extended attribute value (64k) */ ++#define XATTR_LIST_MAX 65536 /* size of extended attribute namelist (64k) */ ++#endif ++#ifndef XATTR_CREATE ++#define XATTR_CREATE 1 /* set value, fail if attr already exists */ ++#define XATTR_REPLACE 2 /* set value, fail if attr does not exist */ ++#endif ++ + /* + * Describe an inode's exact location on disk and in memory + */ +@@ -704,6 +697,7 @@ extern void ext3_check_inodes_bitmap (st + extern unsigned long ext3_count_free (struct buffer_head *, unsigned); + + /* inode.c */ ++extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); + +--- linux-2.4.18/include/linux/ext3_jbd.h~linux-2.4.18ea-0.8.26-2 2003-08-29 16:53:17.000000000 +0400 ++++ linux-2.4.18-alexey/include/linux/ext3_jbd.h 2003-09-01 14:55:39.000000000 +0400 +@@ -30,13 +30,19 @@ + + #define EXT3_SINGLEDATA_TRANS_BLOCKS 8 + ++/* Extended attributes may touch two data buffers, two bitmap buffers, ++ * and two group and summaries. */ ++ ++#define EXT3_XATTR_TRANS_BLOCKS 8 ++ + /* Define the minimum size for a transaction which modifies data. 
This + * needs to take into account the fact that we may end up modifying two + * quota files too (one for the group, one for the user quota). The + * superblock only gets updated once, of course, so don't bother + * counting that again for the quota updates. */ + +-#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2) ++#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS + \ ++ EXT3_XATTR_TRANS_BLOCKS - 2) + + extern int ext3_writepage_trans_blocks(struct inode *inode); + +--- /dev/null 2003-01-30 13:24:37.000000000 +0300 ++++ linux-2.4.18-alexey/include/linux/ext3_xattr.h 2003-09-01 14:55:39.000000000 +0400 +@@ -0,0 +1,155 @@ ++/* ++ File: linux/ext3_xattr.h ++ ++ On-disk format of extended attributes for the ext3 filesystem. ++ ++ (C) 2001 Andreas Gruenbacher, ++*/ ++ ++#include ++#include ++#include ++ ++/* Magic value in attribute blocks */ ++#define EXT3_XATTR_MAGIC 0xEA020000 ++ ++/* Maximum number of references to one attribute block */ ++#define EXT3_XATTR_REFCOUNT_MAX 1024 ++ ++/* Name indexes */ ++#define EXT3_XATTR_INDEX_MAX 10 ++#define EXT3_XATTR_INDEX_USER 1 ++ ++struct ext3_xattr_header { ++ __u32 h_magic; /* magic number for identification */ ++ __u32 h_refcount; /* reference count */ ++ __u32 h_blocks; /* number of disk blocks used */ ++ __u32 h_hash; /* hash value of all attributes */ ++ __u32 h_reserved[4]; /* zero right now */ ++}; ++ ++struct ext3_xattr_entry { ++ __u8 e_name_len; /* length of name */ ++ __u8 e_name_index; /* attribute name index */ ++ __u16 e_value_offs; /* offset in disk block of value */ ++ __u32 e_value_block; /* disk block attribute is stored on (n/i) */ ++ __u32 e_value_size; /* size of attribute value */ ++ __u32 e_hash; /* hash value of name and value */ ++ char e_name[0]; /* attribute name */ ++}; ++ ++#define EXT3_XATTR_PAD_BITS 2 ++#define EXT3_XATTR_PAD (1<e_name_len)) ) ++#define EXT3_XATTR_SIZE(size) \ ++ (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND) ++ ++#ifdef __KERNEL__ ++ ++# 
ifdef CONFIG_EXT3_FS_XATTR
++
++struct ext3_xattr_handler {
++	char *prefix;
++	size_t (*list)(char *list, struct inode *inode, const char *name,
++		       int name_len);
++	int (*get)(struct inode *inode, const char *name, void *buffer,
++		   size_t size);
++	int (*set)(struct inode *inode, const char *name, void *buffer,
++		   size_t size, int flags);
++};
++
++extern int ext3_xattr_register(int, struct ext3_xattr_handler *);
++extern void ext3_xattr_unregister(int, struct ext3_xattr_handler *);
++
++extern int ext3_setxattr(struct dentry *, const char *, void *, size_t, int);
++extern ssize_t ext3_getxattr(struct dentry *, const char *, void *, size_t);
++extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
++extern int ext3_removexattr(struct dentry *, const char *);
++
++extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t);
++extern int ext3_xattr_list(struct inode *, char *, size_t);
++extern int ext3_xattr_set(handle_t *handle, struct inode *, int, const char *, void *, size_t, int);
++
++extern void ext3_xattr_drop_inode(handle_t *, struct inode *);
++extern void ext3_xattr_put_super(struct super_block *);
++
++extern int init_ext3_xattr(void) __init;
++extern void exit_ext3_xattr(void);
++
++# else  /* CONFIG_EXT3_FS_XATTR */
++#  define ext3_setxattr		NULL
++#  define ext3_getxattr		NULL
++#  define ext3_listxattr	NULL
++#  define ext3_removexattr	NULL
++/* Stubs keep the prototypes declared above so callers build unchanged. */
++static inline int
++ext3_xattr_get(struct inode *inode, int name_index, const char *name,
++	       void *buffer, size_t size)
++{
++	return -ENOTSUP;
++}
++
++static inline int
++ext3_xattr_list(struct inode *inode, char *buffer, size_t size)
++{
++	return -ENOTSUP;
++}
++
++static inline int
++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index,
++	       const char *name, void *value, size_t size, int flags)
++{
++	return -ENOTSUP;
++}
++
++static inline void
++ext3_xattr_drop_inode(handle_t *handle, struct inode *inode)
++{
++}
++
++static inline void
++ext3_xattr_put_super(struct super_block *sb) ++{ ++} ++ ++static inline int ++init_ext3_xattr(void) ++{ ++ return 0; ++} ++ ++static inline void ++exit_ext3_xattr(void) ++{ ++} ++ ++# endif /* CONFIG_EXT3_FS_XATTR */ ++ ++# ifdef CONFIG_EXT3_FS_XATTR_USER ++ ++extern int init_ext3_xattr_user(void) __init; ++extern void exit_ext3_xattr_user(void); ++ ++# else /* CONFIG_EXT3_FS_XATTR_USER */ ++ ++static inline int ++init_ext3_xattr_user(void) ++{ ++ return 0; ++} ++ ++static inline void ++exit_ext3_xattr_user(void) ++{ ++} ++ ++#endif /* CONFIG_EXT3_FS_XATTR_USER */ ++ ++#endif /* __KERNEL__ */ ++ +--- /dev/null 2003-01-30 13:24:37.000000000 +0300 ++++ linux-2.4.18-alexey/include/linux/xattr.h 2003-09-01 14:55:39.000000000 +0400 +@@ -0,0 +1,15 @@ ++/* ++ File: linux/xattr.h ++ ++ Extended attributes handling. ++ ++ Copyright (C) 2001 by Andreas Gruenbacher ++ Copyright (C) 2001 SGI - Silicon Graphics, Inc ++*/ ++#ifndef _LINUX_XATTR_H ++#define _LINUX_XATTR_H ++ ++#define XATTR_CREATE 1 /* set value, fail if attr already exists */ ++#define XATTR_REPLACE 2 /* set value, fail if attr does not exist */ ++ ++#endif /* _LINUX_XATTR_H */ + +_ diff --git a/lustre/kernel_patches/patches/vfs-pdirops-2.4.18-chaos.patch b/lustre/kernel_patches/patches/vfs-pdirops-2.4.18-chaos.patch new file mode 100644 index 0000000..a9cc225 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-pdirops-2.4.18-chaos.patch @@ -0,0 +1,265 @@ + fs/inode.c | 1 + fs/namei.c | 66 ++++++++++++++++++++++++++++++++++++++--------------- + include/linux/fs.h | 11 ++++---- + 3 files changed, 54 insertions(+), 24 deletions(-) + +--- linux-2.4.18/fs/namei.c~vfs-pdirops-2.4.18-chaos 2003-09-01 14:58:03.000000000 +0400 ++++ linux-2.4.18-alexey/fs/namei.c 2003-09-01 17:56:10.000000000 +0400 +@@ -101,6 +101,36 @@ void intent_release(struct lookup_intent + + } + ++static void *lock_dir(struct inode *dir, struct qstr *name) ++{ ++ unsigned long hash; ++ ++ if (!IS_PDIROPS(dir)) { ++ down(&dir->i_sem); ++ return 
0; ++ } ++ ++ /* OK. fs understands parallel directory operations. ++ * so, we try to acquire lock for hash of requested ++ * filename in order to prevent any operations with ++ * same name in same time -bzzz */ ++ ++ /* calculate name hash */ ++ hash = full_name_hash(name->name, name->len); ++ ++ /* lock this hash */ ++ return dynlock_lock(&dir->i_dcache_lock, hash, 1, GFP_ATOMIC); ++} ++ ++static void unlock_dir(struct inode *dir, void *lock) ++{ ++ if (!IS_PDIROPS(dir)) { ++ up(&dir->i_sem); ++ return; ++ } ++ dynlock_unlock(&dir->i_dcache_lock, lock); ++} ++ + /* In order to reduce some races, while at the same time doing additional + * checking and hopefully speeding things up, we copy filenames to the + * kernel data space before using them.. +@@ -302,10 +332,10 @@ static struct dentry *real_lookup(struct + { + struct dentry * result; + struct inode *dir = parent->d_inode; ++ void *lock; + + again: +- +- down(&dir->i_sem); ++ lock = lock_dir(dir, name); + /* + * First re-do the cached lookup just in case it was created + * while we waited for the directory semaphore.. +@@ -329,7 +359,7 @@ again: + else + result = dentry; + } +- up(&dir->i_sem); ++ unlock_dir(dir, lock); + return result; + } + +@@ -337,7 +367,7 @@ again: + * Uhhuh! Nasty case: the cache was re-populated while + * we waited on the semaphore. Need to revalidate. 
+ */ +- up(&dir->i_sem); ++ unlock_dir(dir, lock); + if (result->d_op && result->d_op->d_revalidate) { + if (!result->d_op->d_revalidate(result, flags) && !d_invalidate(result)) { + dput(result); +@@ -1234,13 +1264,13 @@ struct file *filp_open(const char * path + goto exit; + + dir = nd.dentry; +- down(&dir->d_inode->i_sem); ++ nd.lock = lock_dir(dir->d_inode, &nd.last); + dentry = lookup_hash_it(&nd.last, nd.dentry, &it); + + do_last: + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) { +- up(&dir->d_inode->i_sem); ++ unlock_dir(dir->d_inode, nd.lock); + goto exit; + } + +@@ -1249,7 +1279,7 @@ do_last: + if (!dentry->d_inode) { + error = vfs_create_it(dir->d_inode, dentry, + mode & ~current->fs->umask, &it); +- up(&dir->d_inode->i_sem); ++ unlock_dir(dir->d_inode, nd.lock); + dput(nd.dentry); + nd.dentry = dentry; + if (error) +@@ -1264,7 +1294,7 @@ do_last: + /* + * It already exists. + */ +- up(&dir->d_inode->i_sem); ++ unlock_dir(dir->d_inode, nd.lock); + + error = -EEXIST; + if (flag & O_EXCL) +@@ -1344,7 +1374,7 @@ do_link: + goto exit; + } + dir = nd.dentry; +- down(&dir->d_inode->i_sem); ++ nd.lock = lock_dir(dir->d_inode, &nd.last); + dentry = lookup_hash_it(&nd.last, nd.dentry, &it); + putname(nd.last.name); + goto do_last; +@@ -1357,7 +1387,7 @@ static struct dentry *lookup_create(stru + { + struct dentry *dentry; + +- down(&nd->dentry->d_inode->i_sem); ++ nd->lock = lock_dir(nd->dentry->d_inode, &nd->last); + dentry = ERR_PTR(-EEXIST); + if (nd->last_type != LAST_NORM) + goto fail; +@@ -1446,7 +1476,7 @@ asmlinkage long sys_mknod(const char * f + } + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out2: + path_release(&nd); + out: +@@ -1509,7 +1539,7 @@ asmlinkage long sys_mkdir(const char * p + mode & ~current->fs->umask); + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out2: + path_release(&nd); + out: +@@ -1619,14 +1649,14 @@ asmlinkage long 
sys_rmdir(const char * p + if (error != -EOPNOTSUPP) + goto exit1; + } +- down(&nd.dentry->d_inode->i_sem); ++ nd.lock = lock_dir(nd.dentry->d_inode, &nd.last); + dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_rmdir(nd.dentry->d_inode, dentry); + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + exit1: + path_release(&nd); + exit: +@@ -1685,7 +1715,7 @@ asmlinkage long sys_unlink(const char * + if (error != -EOPNOTSUPP) + goto exit1; + } +- down(&nd.dentry->d_inode->i_sem); ++ nd.lock = lock_dir(nd.dentry->d_inode, &nd.last); + dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { +@@ -1696,7 +1726,7 @@ asmlinkage long sys_unlink(const char * + exit2: + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + exit1: + path_release(&nd); + exit: +@@ -1766,7 +1796,7 @@ asmlinkage long sys_symlink(const char * + error = vfs_symlink(nd.dentry->d_inode, dentry, from); + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out2: + path_release(&nd); + out: +@@ -1858,7 +1888,7 @@ asmlinkage long sys_link(const char * ol + error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); + dput(new_dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out_release: + path_release(&nd); + out: +--- linux-2.4.18/include/linux/fs.h~vfs-pdirops-2.4.18-chaos 2003-09-01 14:58:03.000000000 +0400 ++++ linux-2.4.18-alexey/include/linux/fs.h 2003-09-01 16:36:16.000000000 +0400 +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -136,6 +137,7 @@ extern int leases_enable, dir_notify_ena + #define S_IMMUTABLE 16 /* Immutable file */ + #define S_DEAD 32 /* removed, but still open directory */ + #define S_NOQUOTA 64 /* Inode is not counted to quota */ 
++#define S_PDIROPS 256 /* Parallel directory operations */ + + /* + * Note that nosuid etc flags are inode-specific: setting some file-system +@@ -162,6 +164,7 @@ extern int leases_enable, dir_notify_ena + #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) + #define IS_NOATIME(inode) (__IS_FLG(inode, MS_NOATIME) || ((inode)->i_flags & S_NOATIME)) + #define IS_NODIRATIME(inode) __IS_FLG(inode, MS_NODIRATIME) ++#define IS_PDIROPS(inode) __IS_FLG(inode, S_PDIROPS) + + #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) + +@@ -490,6 +493,7 @@ struct inode { + atomic_t i_writecount; + unsigned int i_attr_flags; + __u32 i_generation; ++ struct dynlock i_dcache_lock; /* for parallel directory ops */ + union { + struct minix_inode_info minix_i; + struct ext2_inode_info ext2_i; +@@ -713,6 +717,7 @@ struct nameidata { + unsigned int flags; + int last_type; + struct lookup_intent *intent; ++ void *lock; + }; + + #define DQUOT_USR_ENABLED 0x01 /* User diskquotas enabled */ +@@ -1610,12 +1615,6 @@ static inline struct dentry *get_parent( + return dget(dentry->d_parent); + } + +-static inline void unlock_dir(struct dentry *dir) +-{ +- up(&dir->d_inode->i_sem); +- dput(dir); +-} +- + /* + * Whee.. Deadlock country. Happily there are only two VFS + * operations that does this.. 
+--- linux-2.4.18/fs/inode.c~vfs-pdirops-2.4.18-chaos 2003-09-01 14:58:03.000000000 +0400 ++++ linux-2.4.18-alexey/fs/inode.c 2003-09-01 16:36:16.000000000 +0400 +@@ -119,6 +119,7 @@ static struct inode *alloc_inode(struct + mapping->host = inode; + mapping->gfp_mask = GFP_HIGHUSER; + inode->i_mapping = mapping; ++ dynlock_init(&inode->i_dcache_lock); + } + return inode; + } + +_ diff --git a/lustre/kernel_patches/pc/dynamic-locks-2.4.18-chaos.pc b/lustre/kernel_patches/pc/dynamic-locks-2.4.18-chaos.pc new file mode 100644 index 0000000..b626dcf --- /dev/null +++ b/lustre/kernel_patches/pc/dynamic-locks-2.4.18-chaos.pc @@ -0,0 +1,3 @@ +include/linux/dynlocks.h +lib/dynlocks.c +lib/Makefile diff --git a/lustre/kernel_patches/pc/ext-2.4-patch-5.pc b/lustre/kernel_patches/pc/ext-2.4-patch-5.pc new file mode 100644 index 0000000..7191405 --- /dev/null +++ b/lustre/kernel_patches/pc/ext-2.4-patch-5.pc @@ -0,0 +1 @@ +include/linux/ext3_fs.h diff --git a/lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro-2.pc b/lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro-2.pc new file mode 100644 index 0000000..bd89204 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro-2.pc @@ -0,0 +1,20 @@ +fs/ext3/balloc.c +fs/ext3/balloc.c.orig +fs/ext3/dir.c +fs/ext3/dir.c.orig +fs/ext3/ialloc.c +fs/ext3/ialloc.c.orig +fs/ext3/inode.c +fs/ext3/inode.c.orig +fs/ext3/ioctl.c +fs/ext3/ioctl.c.orig +fs/ext3/namei.c +fs/ext3/namei.c.orig +fs/ext3/super.c +fs/ext3/super.c.orig +fs/ext3/symlink.c +fs/ext3/symlink.c.orig +include/linux/ext3_fs.h +include/linux/ext3_fs.h.orig +include/linux/ext3_jbd.h +include/linux/ext3_jbd.h.orig diff --git a/lustre/kernel_patches/pc/ext3-compat-2.4.18-chaos.pc b/lustre/kernel_patches/pc/ext3-compat-2.4.18-chaos.pc new file mode 100644 index 0000000..9b16759 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-compat-2.4.18-chaos.pc @@ -0,0 +1 @@ +fs/ext3/namei.c diff --git a/lustre/kernel_patches/pc/ext3-delete_thread-2.4.18-2.pc 
b/lustre/kernel_patches/pc/ext3-delete_thread-2.4.18-2.pc new file mode 100644 index 0000000..42243c8 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-delete_thread-2.4.18-2.pc @@ -0,0 +1,6 @@ +fs/ext3/file.c +fs/ext3/file.c.orig +fs/ext3/inode.c +fs/ext3/super.c +include/linux/ext3_fs.h +include/linux/ext3_fs_sb.h diff --git a/lustre/kernel_patches/pc/ext3-pdirops-2.4.18-chaos.pc b/lustre/kernel_patches/pc/ext3-pdirops-2.4.18-chaos.pc new file mode 100644 index 0000000..2ad2584 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-pdirops-2.4.18-chaos.pc @@ -0,0 +1,6 @@ +fs/ext3/namei.c +fs/ext3/super.c +include/linux/ext3_fs.h +include/linux/ext3_fs_i.h +fs/ext3/inode.c +fs/ext3/ialloc.c diff --git a/lustre/kernel_patches/pc/iopen-2.4.18-2.pc b/lustre/kernel_patches/pc/iopen-2.4.18-2.pc new file mode 100644 index 0000000..308490e --- /dev/null +++ b/lustre/kernel_patches/pc/iopen-2.4.18-2.pc @@ -0,0 +1,8 @@ +Documentation/filesystems/ext2.txt +fs/ext3/inode.c +fs/ext3/iopen.c +fs/ext3/iopen.h +fs/ext3/Makefile +fs/ext3/namei.c +fs/ext3/super.c +include/linux/ext3_fs.h diff --git a/lustre/kernel_patches/pc/linux-2.4.18ea-0.8.26-2.pc b/lustre/kernel_patches/pc/linux-2.4.18ea-0.8.26-2.pc new file mode 100644 index 0000000..1078cb4 --- /dev/null +++ b/lustre/kernel_patches/pc/linux-2.4.18ea-0.8.26-2.pc @@ -0,0 +1,11 @@ +fs/ext3/ext3-exports.c +fs/ext3/ialloc.c +fs/ext3/inode.c +fs/ext3/Makefile +fs/ext3/namei.c +fs/ext3/super.c +fs/ext3/xattr.c +include/linux/ext3_fs.h +include/linux/ext3_jbd.h +include/linux/ext3_xattr.h +include/linux/xattr.h diff --git a/lustre/kernel_patches/pc/vfs-pdirops-2.4.18-chaos.pc b/lustre/kernel_patches/pc/vfs-pdirops-2.4.18-chaos.pc new file mode 100644 index 0000000..f244b84 --- /dev/null +++ b/lustre/kernel_patches/pc/vfs-pdirops-2.4.18-chaos.pc @@ -0,0 +1,3 @@ +fs/namei.c +include/linux/fs.h +fs/inode.c diff --git a/lustre/kernel_patches/series/chaos-2.4.18-pdirops b/lustre/kernel_patches/series/chaos-2.4.18-pdirops new file mode 100644 
index 0000000..a37519a --- /dev/null +++ b/lustre/kernel_patches/series/chaos-2.4.18-pdirops @@ -0,0 +1,34 @@ +dev_read_only.patch +exports.patch +kmem_cache_validate.patch +lustre_version.patch +vfs_intent-2.4.18-18-chaos65.patch +invalidate_show.patch +iod-rmap-exports.patch +export-truncate.patch +ext3-compat-2.4.18-chaos.patch +ext-2.4-patch-1.patch +ext-2.4-patch-2.patch +ext-2.4-patch-3.patch +ext-2.4-patch-4.patch +ext-2.4-patch-5.patch +linux-2.4.18ea-0.8.26-2.patch +ext3-2.4-ino_t.patch +ext3-2.4.18-ino_sb_macro-2.patch +ext3-orphan_lock.patch +ext3-delete_thread-2.4.18-2.patch +extN-misc-fixup.patch +extN-noread.patch +extN-wantedi.patch +ext3-san-2.4.20.patch +extN-2.4.18-ino_sb_fixup.patch +ext3-map_inode_page_2.4.18.patch +ext3-error-export.patch +iopen-2.4.18-2.patch +jbd-dont-account-blocks-twice.patch +jbd-commit-tricks.patch +ext3-o_direct-1-2.4.18-chaos.patch +ext3-no-write-super-chaos.patch +dynamic-locks-2.4.18-chaos.patch +vfs-pdirops-2.4.18-chaos.patch +ext3-pdirops-2.4.18-chaos.patch -- 1.8.3.1